Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Quagga: Dev

trivial quagga patches

 

 

Quagga dev RSS feed   Index | Next | Previous | View Threaded


shemminger at vyatta

Apr 6, 2011, 10:01 AM

Post #1 of 11 (1000 views)
Permalink
trivial quagga patches

The following changes since commit 74bd8495d0fc3c24e5fba09164093bd6d997771e:

Merge remote-tracking branch 'remotes/quagga/master' (2011-03-29 14:25:56 +0100)

are available in the git repository at:

git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/quagga.git for-paul

Stephen Hemminger (8):
smux: oid optimization
ospfd: set multicast sockopt when socket is created
bgpd: fix warning about missing prototype for nonblocking
ospf6d: fix warnings from missing prototypes
bgpd: make flag manipulation inline
bgpd: inline lock/unlock
bgpd: eliminate strict aliasing warnings
vty: remove unused vty_serv_socket_family

bgpd/bgp_advertise.h | 15 +++---------
bgpd/bgp_network.c | 1 +
bgpd/bgp_packet.c | 18 ++++++++++-----
bgpd/bgpd.c | 56 -------------------------------------------------
bgpd/bgpd.h | 57 +++++++++++++++++++++++++++++++++++++++++++++----
lib/smux.c | 12 +++++-----
lib/smux.h | 7 ++---
lib/vty.c | 2 +
ospf6d/ospf6_main.c | 8 +++++++
ospfd/ospf_network.c | 34 ++++++++++++++---------------
10 files changed, 104 insertions(+), 106 deletions(-)

diff --git a/bgpd/bgp_advertise.h b/bgpd/bgp_advertise.h
index 4ebde90..9e20c22 100644
--- a/bgpd/bgp_advertise.h
+++ b/bgpd/bgp_advertise.h
@@ -21,13 +21,6 @@ Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#ifndef _QUAGGA_BGP_ADVERTISE_H
#define _QUAGGA_BGP_ADVERTISE_H

-/* BGP advertise FIFO. */
-struct bgp_advertise_fifo
-{
- struct bgp_advertise *next;
- struct bgp_advertise *prev;
-};
-
/* BGP advertise attribute. */
struct bgp_advertise_attr
{
@@ -44,7 +37,7 @@ struct bgp_advertise_attr
struct bgp_advertise
{
/* FIFO for advertisement. */
- struct bgp_advertise_fifo fifo;
+ struct fifo fifo;

/* Link list for same attribute advertise. */
struct bgp_advertise *next;
@@ -97,9 +90,9 @@ struct bgp_adj_in
/* BGP advertisement list. */
struct bgp_synchronize
{
- struct bgp_advertise_fifo update;
- struct bgp_advertise_fifo withdraw;
- struct bgp_advertise_fifo withdraw_low;
+ struct fifo update;
+ struct fifo withdraw;
+ struct fifo withdraw_low;
};

/* BGP adjacency linked list. */
diff --git a/bgpd/bgp_network.c b/bgpd/bgp_network.c
index 502f567..570cc3b 100644
--- a/bgpd/bgp_network.c
+++ b/bgpd/bgp_network.c
@@ -30,6 +30,7 @@ Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#include "command.h"
#include "privs.h"
#include "linklist.h"
+#include "network.h"

#include "bgpd/bgpd.h"
#include "bgpd/bgp_fsm.h"
diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c
index 1d6b9ee..f159198 100644
--- a/bgpd/bgp_packet.c
+++ b/bgpd/bgp_packet.c
@@ -137,6 +137,12 @@ bgp_connect_check (struct peer *peer)
}
}

+static inline struct bgp_advertise *
+bgp_advertise_head (struct fifo *fifo)
+{
+ return (struct bgp_advertise *) FIFO_HEAD(fifo);
+}
+
/* Make BGP update packet. */
static struct stream *
bgp_update_packet (struct peer *peer, afi_t afi, safi_t safi)
@@ -154,7 +160,7 @@ bgp_update_packet (struct peer *peer, afi_t afi, safi_t safi)
s = peer->work;
stream_reset (s);

- adv = FIFO_HEAD (&peer->sync[afi][safi]->update);
+ adv = bgp_advertise_head (&peer->sync[afi][safi]->update);

while (adv)
{
@@ -290,7 +296,7 @@ bgp_withdraw_packet (struct peer *peer, afi_t afi, safi_t safi)
s = peer->work;
stream_reset (s);

- while ((adv = FIFO_HEAD (&peer->sync[afi][safi]->withdraw)) != NULL)
+ while ((adv = bgp_advertise_head (&peer->sync[afi][safi]->withdraw)) != NULL)
{
assert (adv->rn);
adj = adv->adj;
@@ -514,7 +520,7 @@ bgp_write_packet (struct peer *peer)
for (afi = AFI_IP; afi < AFI_MAX; afi++)
for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
{
- adv = FIFO_HEAD (&peer->sync[afi][safi]->withdraw);
+ adv = bgp_advertise_head (&peer->sync[afi][safi]->withdraw);
if (adv)
{
s = bgp_withdraw_packet (peer, afi, safi);
@@ -526,7 +532,7 @@ bgp_write_packet (struct peer *peer)
for (afi = AFI_IP; afi < AFI_MAX; afi++)
for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
{
- adv = FIFO_HEAD (&peer->sync[afi][safi]->update);
+ adv = bgp_advertise_head (&peer->sync[afi][safi]->update);
if (adv)
{
if (adv->binfo && adv->binfo->uptime < peer->synctime)
@@ -577,12 +583,12 @@ bgp_write_proceed (struct peer *peer)

for (afi = AFI_IP; afi < AFI_MAX; afi++)
for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
- if (FIFO_HEAD (&peer->sync[afi][safi]->withdraw))
+ if (bgp_advertise_head (&peer->sync[afi][safi]->withdraw))
return 1;

for (afi = AFI_IP; afi < AFI_MAX; afi++)
for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
- if ((adv = FIFO_HEAD (&peer->sync[afi][safi]->update)) != NULL)
+ if ((adv = bgp_advertise_head (&peer->sync[afi][safi]->update)) != NULL)
if (adv->binfo->uptime < peer->synctime)
return 1;

diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c
index ee0cc5d..cb43af1 100644
--- a/bgpd/bgpd.c
+++ b/bgpd/bgpd.c
@@ -114,46 +114,6 @@ bgp_option_check (int flag)
return CHECK_FLAG (bm->options, flag);
}

-/* BGP flag manipulation. */
-int
-bgp_flag_set (struct bgp *bgp, int flag)
-{
- SET_FLAG (bgp->flags, flag);
- return 0;
-}
-
-int
-bgp_flag_unset (struct bgp *bgp, int flag)
-{
- UNSET_FLAG (bgp->flags, flag);
- return 0;
-}
-
-int
-bgp_flag_check (struct bgp *bgp, int flag)
-{
- return CHECK_FLAG (bgp->flags, flag);
-}
-
-/* Internal function to set BGP structure configureation flag. */
-static void
-bgp_config_set (struct bgp *bgp, int config)
-{
- SET_FLAG (bgp->config, config);
-}
-
-static void
-bgp_config_unset (struct bgp *bgp, int config)
-{
- UNSET_FLAG (bgp->config, config);
-}
-
-static int
-bgp_config_check (struct bgp *bgp, int config)
-{
- return CHECK_FLAG (bgp->config, config);
-}
-
/* Set BGP router identifier. */
int
bgp_router_id_set (struct bgp *bgp, struct in_addr *id)
@@ -2108,23 +2068,7 @@ bgp_delete (struct bgp *bgp)
return 0;
}

-static void bgp_free (struct bgp *);
-
void
-bgp_lock (struct bgp *bgp)
-{
- ++bgp->lock;
-}
-
-void
-bgp_unlock(struct bgp *bgp)
-{
- assert(bgp->lock > 0);
- if (--bgp->lock == 0)
- bgp_free (bgp);
-}
-
-static void
bgp_free (struct bgp *bgp)
{
afi_t afi;
diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h
index 4da19e7..8c817c0 100644
--- a/bgpd/bgpd.h
+++ b/bgpd/bgpd.h
@@ -845,13 +845,60 @@ extern int bgp_option_check (int);

extern int bgp_get (struct bgp **, as_t *, const char *);
extern int bgp_delete (struct bgp *);
+extern void bgp_free (struct bgp *);

-extern int bgp_flag_set (struct bgp *, int);
-extern int bgp_flag_unset (struct bgp *, int);
-extern int bgp_flag_check (struct bgp *, int);
+/* BGP flag manipulation. */
+static inline void
+bgp_flag_set (struct bgp *bgp, int flag)
+{
+ SET_FLAG (bgp->flags, flag);
+}
+
+static inline void
+bgp_flag_unset (struct bgp *bgp, int flag)
+{
+ UNSET_FLAG (bgp->flags, flag);
+}
+
+static inline int
+bgp_flag_check (const struct bgp *bgp, int flag)
+{
+ return CHECK_FLAG (bgp->flags, flag);
+}
+
+/* Internal function to set BGP structure configureation flag. */
+static inline void
+bgp_config_set (struct bgp *bgp, int config)
+{
+ SET_FLAG (bgp->config, config);
+}

-extern void bgp_lock (struct bgp *);
-extern void bgp_unlock (struct bgp *);
+static inline void
+bgp_config_unset (struct bgp *bgp, int config)
+{
+ UNSET_FLAG (bgp->config, config);
+}
+
+static inline int
+bgp_config_check (const struct bgp *bgp, int config)
+{
+ return CHECK_FLAG (bgp->config, config);
+}
+
+
+static inline void
+bgp_lock (struct bgp *bgp)
+{
+ ++bgp->lock;
+}
+
+static inline void
+bgp_unlock (struct bgp *bgp)
+{
+ assert(bgp->lock > 0);
+ if (--bgp->lock == 0)
+ bgp_free (bgp);
+}

extern int bgp_router_id_set (struct bgp *, struct in_addr *);

diff --git a/lib/smux.c b/lib/smux.c
index 1941cf8..1adcf31 100644
--- a/lib/smux.c
+++ b/lib/smux.c
@@ -82,14 +82,14 @@ static struct cmd_node smux_node =
/* thread master */
static struct thread_master *master;

-void *
+static void *
oid_copy (void *dest, const void *src, size_t size)
{
return memcpy (dest, src, size * sizeof (oid));
}

void
-oid2in_addr (oid oid[], int len, struct in_addr *addr)
+oid2in_addr (const oid oid[], int len, struct in_addr *addr)
{
int i;
u_char *pnt;
@@ -104,10 +104,10 @@ oid2in_addr (oid oid[], int len, struct in_addr *addr)
}

void
-oid_copy_addr (oid oid[], struct in_addr *addr, int len)
+oid_copy_addr (oid oid[], const struct in_addr *addr, int len)
{
int i;
- u_char *pnt;
+ const u_char *pnt;

if (len == 0)
return;
@@ -118,8 +118,8 @@ oid_copy_addr (oid oid[], struct in_addr *addr, int len)
oid[i] = *pnt++;
}

-int
-oid_compare (oid *o1, int o1_len, oid *o2, int o2_len)
+static int
+oid_compare (const oid *o1, int o1_len, const oid *o2, int o2_len)
{
int i;

diff --git a/lib/smux.h b/lib/smux.h
index f5754ed..cbfedd8 100644
--- a/lib/smux.h
+++ b/lib/smux.h
@@ -153,9 +153,8 @@ extern int smux_header_generic (struct variable *, oid [], size_t *,
extern int smux_trap (const oid *, size_t, const oid *, size_t,
const struct trap_object *,
size_t, unsigned int, u_char);
-extern int oid_compare (oid *, int, oid *, int);
-extern void oid2in_addr (oid [], int, struct in_addr *);
-extern void *oid_copy (void *, const void *, size_t);
-extern void oid_copy_addr (oid [], struct in_addr *, int);
+
+extern void oid2in_addr (const oid [], int, struct in_addr *);
+extern void oid_copy_addr (oid [], const struct in_addr *, int);

#endif /* _ZEBRA_SNMP_H */
diff --git a/lib/vty.c b/lib/vty.c
index bd1dbac..f7e3741 100644
--- a/lib/vty.c
+++ b/lib/vty.c
@@ -1841,6 +1841,7 @@ vty_serv_sock_addrinfo (const char *hostname, unsigned short port)
}
#endif /* HAVE_IPV6 && ! NRL */

+#if defined(NRL) || !defined(HAVE_IPV6)
/* Make vty server socket. */
static void
vty_serv_sock_family (const char* addr, unsigned short port, int family)
@@ -1905,6 +1906,7 @@ vty_serv_sock_family (const char* addr, unsigned short port, int family)
/* Add vty server event. */
vty_event (VTY_SERV, accept_sock, NULL);
}
+#endif

#ifdef VTYSH
/* For sockaddr_un. */
diff --git a/ospf6d/ospf6_main.c b/ospf6d/ospf6_main.c
index 800fae4..69738d4 100644
--- a/ospf6d/ospf6_main.c
+++ b/ospf6d/ospf6_main.c
@@ -34,8 +34,16 @@
#include "plist.h"
#include "privs.h"
#include "sigevent.h"
+#include "zclient.h"

#include "ospf6d.h"
+#include "ospf6_proto.h"
+#include "ospf6_lsa.h"
+#include "ospf6_route.h"
+#include "ospf6_message.h"
+#include "ospf6_top.h"
+#include "ospf6_area.h"
+#include "ospf6_asbr.h"

/* Default configuration file name for ospf6d. */
#define OSPF6_DEFAULT_CONFIG "ospf6d.conf"
diff --git a/ospfd/ospf_network.c b/ospfd/ospf_network.c
index 1e2d44e..7e107e1 100644
--- a/ospfd/ospf_network.c
+++ b/ospfd/ospf_network.c
@@ -132,25 +132,8 @@ ospf_if_drop_alldrouters (struct ospf *top, struct prefix *p, unsigned int
int
ospf_if_ipmulticast (struct ospf *top, struct prefix *p, unsigned int ifindex)
{
- u_char val;
- int ret, len;
-
- val = 0;
- len = sizeof (val);
-
- /* Prevent receiving self-origined multicast packets. */
- ret = setsockopt (top->fd, IPPROTO_IP, IP_MULTICAST_LOOP, (void *)&val, len);
- if (ret < 0)
- zlog_warn ("can't setsockopt IP_MULTICAST_LOOP(0) for fd %d: %s",
- top->fd, safe_strerror(errno));
+ int ret;

- /* Explicitly set multicast ttl to 1 -- endo. */
- val = 1;
- ret = setsockopt (top->fd, IPPROTO_IP, IP_MULTICAST_TTL, (void *)&val, len);
- if (ret < 0)
- zlog_warn ("can't setsockopt IP_MULTICAST_TTL(1) for fd %d: %s",
- top->fd, safe_strerror (errno));
-
ret = setsockopt_multicast_ipv4 (top->fd, IP_MULTICAST_IF,
p->u.prefix4, 0, ifindex);
if (ret < 0)
@@ -166,6 +149,7 @@ ospf_sock_init (void)
{
int ospf_sock;
int ret, hincl = 1;
+ int val;

if ( ospfd_privs.change (ZPRIVS_RAISE) )
zlog_err ("ospf_sock_init: could not raise privs, %s",
@@ -225,6 +209,20 @@ ospf_sock_init (void)
safe_strerror (errno) );
}

+ /* Prevent receiving self-origined multicast packets. */
+ val = 0;
+ ret = setsockopt (ospf_sock, IPPROTO_IP, IP_MULTICAST_LOOP, (void *)&val, sizeof(val));
+ if (ret < 0)
+ zlog_warn ("can't setsockopt IP_MULTICAST_LOOP(0) for fd %d: %s",
+ ospf_sock, safe_strerror(errno));
+
+ /* Explicitly set multicast ttl to 1 -- endo. */
+ val = 1;
+ ret = setsockopt (ospf_sock, IPPROTO_IP, IP_MULTICAST_TTL, (void *)&val, sizeof(val));
+ if (ret < 0)
+ zlog_warn ("can't setsockopt IP_MULTICAST_TTL(1) for fd %d: %s",
+ ospf_sock, safe_strerror (errno));
+
return ospf_sock;
}

_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev


paul at jakma

Apr 7, 2011, 2:23 AM

Post #2 of 11 (972 views)
Permalink
Re: trivial quagga patches [In reply to]

On Wed, 6 Apr 2011, Stephen Hemminger wrote:

> bgpd: inline lock/unlock

> --- a/bgpd/bgpd.h
> +++ b/bgpd/bgpd.h

> +static inline void
> +bgp_flag_set (struct bgp *bgp, int flag)
> +{
> + SET_FLAG (bgp->flags, flag);
> +}

I have to say I don't like these inlining patches. Unless they come
with benchmarks across a diverse range of setups showing actual
performance improvements, there's as much reason to believe they hurt
performance as improve it - and I won't take them.

It'd be *much* better to supply patches to the build system to allow
multi-unit compiling. Inlines in headers can /never/ be uninlined,
but the compiler can always inline code if you tell it to or, better,
give it enough information to make intelligent decisions (e.g. using
previous-profile directed compilation).

Please, no cargo-cult inlining. ;)

regards,
--
Paul Jakma paul [at] jakma twitter: @pjakma PGP: 64A2FF6A
Fortune:
People seem to enjoy things more when they know a lot of other people
have been left out on the pleasure.
-- Russell Baker
_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev


shemminger at vyatta

Apr 7, 2011, 8:35 AM

Post #3 of 11 (965 views)
Permalink
Re: trivial quagga patches [In reply to]

On Thu, 7 Apr 2011 10:23:15 +0100 (BST)
paul [at] jakma wrote:

> On Wed, 6 Apr 2011, Stephen Hemminger wrote:
>
> > bgpd: inline lock/unlock
>
> > --- a/bgpd/bgpd.h
> > +++ b/bgpd/bgpd.h
>
> > +static inline void
> > +bgp_flag_set (struct bgp *bgp, int flag)
> > +{
> > + SET_FLAG (bgp->flags, flag);
> > +}
>
> I have to say I don't like these inlining patches. Unless they come
> with benchmarks across a diverse range of setups showing actual
> performance improvements, there's as much reason to believe they hurt
> performance as improve it - and I won't take them.

Smaller code is faster code,

> It'd be *much* better to supply patches to the build system to allow
> multi-unit compiling. Inlines in headers can /never/ be uninlined,
> but the compiler can always inline code if you tell it to or, better,
> give it enough information to make intelligent decisions (e.g. using
> previous-profile directed compilation).

Gcc multi-unit compiling is broken in many versions.

--
_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev


paul at jakma

Apr 7, 2011, 9:21 AM

Post #4 of 11 (969 views)
Permalink
Re: trivial quagga patches [In reply to]

On Thu, 7 Apr 2011, Stephen Hemminger wrote:

> Smaller code is faster code,

But inlining can make the total code size bigger, even where it
reduces code size at any specific call site. And runtime branch
prediction no longer can know it's the same code.

> Gcc multi-unit compiling is broken in many versions.

Ah, interesting. Presumably "in many versions" means that it's now
fixed? There's also link-time optimisation.

Point remains: these optimisations need to be done based on objective
measurements (by human, or through profile-directed compiler
optimisation).

regards,
--
Paul Jakma paul [at] jakma twitter: @pjakma PGP: 64A2FF6A
Fortune:
"For three days after death hair and fingernails continue to grow but
phone calls taper off."
-- Johnny Carson
_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev


shemminger at vyatta

Apr 7, 2011, 9:33 AM

Post #5 of 11 (966 views)
Permalink
Re: trivial quagga patches [In reply to]

On Thu, 7 Apr 2011 17:21:31 +0100 (BST)
paul [at] jakma wrote:

> On Thu, 7 Apr 2011, Stephen Hemminger wrote:
>
> > Smaller code is faster code,
>
> But inlining can make the total code size bigger, even where it
> reduces code size at any specific call site. And runtime branch
> prediction no longer can know it's the same code.
>
> > Gcc multi-unit compiling is broken in many versions.
>
> Ah, interesting. Presumably "in many versions" means that it's now
> fixed? There's also link-time optimisation.
>
> Point remains: these optimisations need to be done based on objective
> measurements (by human, or through profile-directed compiler
> optimisation).

My statement is based on measurements, not speculation.

Original code:
text data bss dec hex filename
654901 85536 888 741325 b4fcd bgpd

After inlining flag and lock
text data bss dec hex filename
654381 85536 888 740805 b4dc5 bgpd

Code is smaller by 520 bytes.
_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev


paul at jakma

Apr 7, 2011, 9:58 AM

Post #6 of 11 (965 views)
Permalink
Re: trivial quagga patches [In reply to]

On Thu, 7 Apr 2011, Stephen Hemminger wrote:

> My statement is based on measurements, not speculation.

> Code is smaller by 520 bytes.

Good that size is smaller, though what about run-time (ok, very very
likely that smaller code is faster, but it doesn't follow
automatically)? Still, we'd be much better off letting the compiler
do this for us...

regards,
--
Paul Jakma paul [at] jakma twitter: @pjakma PGP: 64A2FF6A
Fortune:
My advice to you, my violent friend, is to seek out gold and sit on it.
-- The Dragon to Grendel, in John Gardner's "Grendel"
_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev


joakim.tjernlund at transmode

Apr 7, 2011, 11:03 AM

Post #7 of 11 (965 views)
Permalink
Re: trivial quagga patches [In reply to]

>
> On Thu, 7 Apr 2011, Stephen Hemminger wrote:
>
> > My statement is based on measurements, not speculation.
>
> > Code is smaller by 520 bytes.
>
> Good that size is smaller, though what about run-time (ok, very very
> likely that smaller code is faster, but it doesn't follow
> automatically)? Still, we'd be much better off letting the compiler
> do this for us...

I really don't understand your objection to:
static inline void
bgp_flag_set (struct bgp *bgp, int flag)
{
SET_FLAG (bgp->flags, flag);
}
vs. using SET_FLAG() directly?

The inline function offers better abstraction and type checking
at virtually no cost.

Jocke

_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev


paul at jakma

Apr 8, 2011, 8:25 AM

Post #8 of 11 (961 views)
Permalink
Re: trivial quagga patches [In reply to]

On Thu, 7 Apr 2011, Stephen Hemminger wrote:

> Original code:
> text data bss dec hex filename
> 654901 85536 888 741325 b4fcd bgpd
>
> After inlining flag and lock
> text data bss dec hex filename
> 654381 85536 888 740805 b4dc5 bgpd
>
> Code is smaller by 520 bytes.

With 4.6.0-2.fc15:

default LTO -Os LTO -O2 LTOnone -O2 LTOnone -Os
inline: 651886 648053 742925 742869 648029
master: +440 +124 +128 +48 +88
notinline: +445 +129 +113 -11 +109


LTO == -flto
LTOnone == -flto -flto-partition=none

inline is today's master + "bgpd: inline lock/unlock" and "bgpd: make
flag manipulation inline".

notinline is today's master + attached patch to unline stuff.

Note that the compiler can sometimes make smaller code than with
explicit inlines, for the same compiler options. Also, LTO produces
smaller code regardless of the patches. Just compiling master with
LTO is a 10* gain over just adding inlines: a 4149 byte saving versus
440 bytes.

Further, for some reason, in the 'notinline' case it's always calling
prefix_bit, rather than inlining it - while prefix6_bit /does/ seem
to get inlined. So the compiler seems to think there's good reason to
*not* inline prefix_bit at least...

Finally, smaller code may generally be good, but the /smallest/ code
isn't always the fastest. Runtime measurement data is what should
drive optimisations (or, sound complexity analysis ;) ).

regards,
--
Paul Jakma paul [at] jakma twitter: @pjakma PGP: 64A2FF6A
Fortune:
Doubt is not a pleasant condition, but certainty is absurd.
- Voltaire
Attachments: uninline.diff (5.85 KB)


paul at jakma

Apr 8, 2011, 8:37 AM

Post #9 of 11 (959 views)
Permalink
Re: trivial quagga patches [In reply to]

On Thu, 7 Apr 2011, Joakim Tjernlund wrote:

> bgp_flag_set (struct bgp *bgp, int flag)
> {
> SET_FLAG (bgp->flags, flag);
> }
> vs. using SET_FLAG() directly?

> The inline function offers better abstraction and type checking
> at virtually no cost.

I'd no have objection to that at all, but the 2 patches I'm looking
at change a function from having a single common definition, to
having a header-inline definition. I'd really prefer that we leave
that kind of thing to the compiler.

I don't mean to pick on Stephen - I've seen a bunch of other patches
elsewhere doing this, but Stephen's one is what triggered my rant. ;)

regards,
--
Paul Jakma paul [at] jakma twitter: @pjakma PGP: 64A2FF6A
Fortune:
If someone says he will do something "without fail", he won't.
_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev


chris.hall.list at highwayman

Apr 10, 2011, 1:29 PM

Post #10 of 11 (935 views)
Permalink
Re: trivial quagga patches [In reply to]

Paul Jakma wrote (on Fri 08-Apr-2011 at 16:26):
....
> With 4.6.0-2.fc15:
>
> default LTO -Os LTO -O2 LTOnone -O2 LTOnone -Os
> inline: 651886 648053 742925 742869 648029
> master: +440 +124 +128 +48 +88
> notinline: +445 +129 +113 -11 +109
>
> LTO == -flto
> LTOnone == -flto -flto-partition=none
>
> inline is today's master + "bgpd: inline lock/unlock" and "bgpd:
> make flag manipulation inline".
>
> notinline is today's master + attached patch to unline stuff.
>
> Note that the compiler can sometimes make smaller code than with
> explicit inlines, for the same compiler options. Also, LTO produces
> smaller code regardless of the patches. Just compiling master with
> LTO is a 10* gain over just adding inlines: a 4149 byte saving
> versus 440 bytes.

So the default appears to be -Os. So LTO -Os does better than -Os on
its own.

Nevertheless, in all but one case, the inline version is the smallest
for the same compiler options.

Now, AFAICS the upsides of inlining a small function include:

* elimination of the call/return

it is argued that branch prediction may squeeze out much of
this -- and clever-clogs machines may even squeeze out
much of the cost of the implied push/pop.

* elimination of call set-up

moving arguments around into the right registers and such.

* elimination of stack-frame set-up and tear down

though for higher levels of optimisation, this will get
squeezed out for "leaf" functions.

* allowing the optimiser to range across the code inside the
function as part of the function that is calling it.

* where the function is in lib, I suppose one should count the
tiny additional cost of the dynamic linkage.

The downsides include:

* increasing size of program, or parts thereof, so that the
cache is less effective.

It may be more effective to execute more instructions to do
the same job if those instructions are in the cache.

* if the inline is of any size, adding complexity to the
function inlined into... thereby reducing the effectiveness
of optimisation, branch-prediction ...

So... on the face of it, for *small* functions, provided the total
code size does not increase, prodding the compiler to inline would
appear likely to improve things.

But, whether it will improve things by much... whether it's worth the
trouble... for that one needs some measurements.

I have tried three versions of a simple "set bit(s) in given data
structure" function. So, in set_bit.h I have:

struct set_bit_s
{
int padding[76] ;

uint32_t bits ;
} set_bit_data ;

#define set_bit_m(d, b) (d)->bits |= (b)

extern void
set_bit_f(struct set_bit_s* d, uint32_t b) ;

inline static void
set_bit_fi(struct set_bit_s* d, uint32_t b)
{
set_bit_m(d, b) ;
} ;

in set_bit.c:

extern void
set_bit_f(struct set_bit_s* d, uint32_t b)
{
set_bit_m(d, b) ;
} ;

in trial.c, the trials and timing structure:


/*--------------------------------------------------------------------
----------
* Trial functions
*/
typedef uint trial_func(uint, void*) ;

static volatile uint zzz ;

#define make_trial(name, function) \
static uint \
name(uint iteration, void* arg) \
{ \
uint bit = iteration ^ 0xAAAA ; \
zzz = bit ; \
\
function ; \
\
return iteration - 1 ; \
}

make_trial(trial_empty, ) ;

make_trial(trial_set_bit_m, set_bit_m((struct set_bit_s*)arg, bit))
;
make_trial(trial_set_bit_f, set_bit_f((struct set_bit_s*)arg, bit))
;
make_trial(trial_set_bit_fi, set_bit_fi((struct set_bit_s*)arg,
bit)) ;


/*--------------------------------------------------------------------
----------
* Timing loop
*/
static double
double_time(struct timespec* ts)
{
return ((double)ts->tv_sec * 1E9) + ts->tv_nsec ;
}

static double __attribute__((__noinline__))
time_this(trial_func* trial, uint32_t iterations, void* arg)
{
struct timespec cpu_start, cpu_end ;
uint32_t i ;

i = iterations ;

clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu_start) ;

do
i = trial(i, arg) ;
while (i != 0) ;

clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu_end) ;

return (double_time(&cpu_end) - double_time(&cpu_start))
/
(double)iterations ;
} ;

The trials are intended to be at least not be trivial. The volatile
"zzz" stops the optimiser from throwing away the setting of the uint
"bit" in the trial_empty.

After some experimentation, I found that running multiple trials of
1,000 iterations each, gave stable and repeatable results. I assume
that the odd interrupt get counted in to the CPU time for the process.
With a 1ns CPU clock resolution it's possible to time relatively small
numbers of iterations. I collect 100,001 trials, and discounting the
first, establish the minimum, discard anything < 99% of the median or
> 101%. The result is:

100000 + 1 trials of 1000 iterations, each
trial first min median mean max low high
value good
empty 16.5030 2.4500 2.4670 2.7800 118.9140 0 1072
2.4662 (98928)
set_f 15.7360 3.4350 3.4510 4.2510 176.8490 0 38677
3.4498 (61323)
set_i 11.2100 2.4520 2.4670 2.7918 131.5080 0 1069
2.4664 (98931)
set_m 11.1550 2.4500 2.4670 2.7774 139.9460 0 1051
2.4664 (98949)

The low/high columns show the number of trials excluded when
calculating the average "value". With this data the median is good,
and not that different to the minimum. (set_f is the trial_set_f,
set_i is trial_set_fi and set_m is trial_set_m.)

This is gcc 4.5.1, on Linux 2.6.35, on Phenom II X6 1090T. Compiling
-O2 -falign-functions=32 -falign-jumps=32.

What this appears to show is that the inline and the macro are
indistinguishable -- no surprise there. Those are also
indistinguishable from the empty loop ! The "set_f" trial, which
involves an actual subroutine and call -- forced here because the
set_bit_f() is extern, is a whole ~1ns longer ! But since the inline
version is being measured at 0ns, there is an infinitely large penalty
for not inlining !!

Looking at the code generated:

Dump of assembler code for function trial_empty:
<+0>: mov %edi,%eax
<+2>: xor $0xaaaa,%eax
<+7>: mov %eax,0x50e07b(%rip) # 0x90e6a8 <zzz>
<+13>: lea -0x1(%rdi),%eax
<+16>: retq

Dump of assembler code for function trial_set_bit_m:
<+0>: mov %edi,%eax
<+2>: xor $0xaaaa,%eax
<+7>: or %eax,0x130(%rsi)
<+13>: mov %eax,0x50e055(%rip) # 0x90e6a8 <zzz>
<+19>: lea -0x1(%rdi),%eax
<+22>: retq

Dump of assembler code for function trial_set_bit_fi:
<+0>: mov %edi,%eax
<+2>: xor $0xaaaa,%eax
<+7>: or %eax,0x130(%rsi)
<+13>: mov %eax,0x50e035(%rip) # 0x90e6a8 <zzz>
<+19>: lea -0x1(%rdi),%eax
<+22>: retq

Dump of assembler code for function trial_set_bit_f:
<+0>: push %rbx
<+1>: mov %edi,%ebx
<+3>: mov %rsi,%rdi
<+6>: mov %ebx,%esi
<+8>: xor $0xaaaa,%esi
<+14>: mov %esi,0x50dfd4(%rip) # 0x90e6a8 <zzz>
<+20>: callq 0x400be0 <set_bit_f>
<+25>: lea -0x1(%rbx),%eax
<+28>: pop %rbx
<+29>: retq

Dump of assembler code for function set_bit_f:
<+0>: or %esi,0x130(%rdi)
<+6>: retq

The actual code for setting bits boils down to a single instruction.
The overhead saved by inlining is:

<+0>: push %rbx
<+3>: mov %rsi,%rdi
<+6>: mov %ebx,%esi
<+20>: callq 0x400be0 <set_bit_f>
<+6>: retq -- in set_bit_f
<+28>: pop %rbx

So this processor can rattle through those in 1ns :-) If a dynamic
link were involved that would add an extra jump instruction.

For a small function like this the inlined code is less than the
overhead to call the function (in this case 6 bytes vs 13).

So it all looks as if inlining this class of function is a win. If
the compiler can do that automatically, that's fine.

> Further, for some reason, in the 'notinline' case it's always
> calling prefix_bit, rather than inlining it - while prefix6_bit
> /does/ seem to get inlined. So the compiler seems to think there's
> good reason to *not* inline prefix_bit at least...

I found that for 1 or 2 instances of prefix_bit() in a compilation
unit, it would inline. For 3 (or more, I assume) it would not. I
suspect that's -Os at work... but I haven't got time to test that
today.

> Finally, smaller code may generally be good, but the /smallest/ code
> isn't always the fastest. Runtime measurement data is what should
> drive optimisations (or, sound complexity analysis ;) ).

Depends on how the code is made smaller. If you make it smaller by
factoring out bits of code and creating little subroutines, then the
cost of call set-up, calls and rets etc. cannot be avoided entirely.
Today's processors have multiple execution units, and optimise for
common instructions. So, yes, you may be able to replace one
expensive instruction by some cheap ones, and even lose the cost of
those by moving them off the critical path.

Here, however, we are eliminating overhead -- instructions that do not
really need to be there at all. I fondly believe that removing
unnecessary instructions may make things run quicker, but at least
won't make things slower. The small amount of testing I've done seems
to support that.

The big fear about excessive inlining is that it makes the code
bigger, and hence possibly slower -- because it strains the cache.
So, I think the point about checking the total code size is: if the
result of encouraging inlining is smaller code, then at least it has
done no harm.

Caveat: in the timing above, I do not know why the set_f trials had so
many high results. That bothers me.

Caveat: the first run of a given trial always gave a high value. I do
not know why that is, which bothers me too. It seems very high for
training of branch prediction ? Perhaps 1.5% of the results are >102%
of the mean (except for set_f). So the consistency of the first value
doesn't seem like coincidence.

Caveat: just occasionally I got a trial run looking like this:

100000 + 1 trials of 1000 iterations, each
trial first min median mean max low high
value good
empty 16.5380 2.4330 2.4420 2.7660 175.4640 0 1168
2.4423 (98832)
set_f 15.5710 3.4160 4.1620 4.6134 142.6020 13234 1205
4.1619 (85561)
set_i 11.1720 2.4300 2.4420 2.7990 131.0320 0 1190
2.4425 (98810)
set_m 11.1760 2.4330 2.4420 2.7500 138.2330 0 1176
2.4426 (98824)

The empty, set_i and set_m are consistent across separate runs, but
set_f occasionally gives something like this, where the median is
rather higher.

Now, you could argue that the true figure is the minimum timing --
because that is most likely to be free of any extra cycles introduced
by interrupts -- except that one has to expect some timing errors.
Which is why I use the median, end the average of values median +/-2%.
(I started with +/-5%, but found that +/-20% discards a small number
of values, except for the set_f case.) I do not know what to make of
this. It gives 4.16ns instead of 3.45, so adds 60% to the 1ns extra
cost of the function call etc.

I'm not happy with this anomaly.

When I have a bit more time, I'll try out some slightly larger inline
functions.

Chris

[.With apologies for the length of this. Many moons ago I made a
living making software floating point run faster and better... so
making faster and smaller is a bit of an obsession.]

_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev


paul at jakma

Apr 11, 2011, 3:37 AM

Post #11 of 11 (933 views)
Permalink
Re: trivial quagga patches [In reply to]

On Sun, 10 Apr 2011, 'Chris Hall' wrote:

> Nevertheless, in all but one case, the inline version is the smallest
> for the same compiler options.

> Now, AFAICS the upsides of inlining a small function include:

The compiler can do these things when it's appropriate. Further, the
compiler can /precisely/ calculate where an inline saves instructions
and where it doesn't. A given inline might make sense on one arch,
but not another. If we header-inline it, we make it harder for the
compiler to exercise such flexibility.

There big picture is:

- Adding a compiler flag got an order(10) times *greater* saving than
the patch, for a *fraction* of the effort.

- Optimising code for the /compiler/ will, over time, give greater
performance benefits than manual inlining (with less code churn)

> But, whether it will improve things by much... whether it's worth the
> trouble... for that one needs some measurements.

I've done measurements in the past, of a macro versus a function for
(iirc) set_bit. And zebra, with a BGP workload, ended up being
/slower/ for me.

> 1,000 iterations each, gave stable and repeatable results. I
> assume that the odd interrupt get counted in to the CPU time for
> the process. With a 1ns CPU clock resolution it's possible to time
> relatively small numbers of iterations. I collect 100,001 trials,
> and discounting the first, establish the minimum, discard anything
> < 99% of the median or > 101%. The result is:

> 100000 + 1 trials of 1000 iterations, each
> trial first min median mean max low high value good
> empty 16.5030 2.4500 2.4670 2.7800 118.9140 0 1072 2.4662 (98928)
> set_f 15.7360 3.4350 3.4510 4.2510 176.8490 0 38677 3.4498 (61323)
> set_i 11.2100 2.4520 2.4670 2.7918 131.5080 0 1069 2.4664 (98931)
> set_m 11.1550 2.4500 2.4670 2.7774 139.9460 0 1051 2.4664 (98949)

> The low/high columns show the number of trials excluded when
> calculating the average "value". With this data the median is good,
> and not that different to the minimum. (set_f is the trial_set_f,
> set_i is trial_set_fi and set_m is trial_set_m.)

> This is gcc 4.5.1, on Linux 2.6.35, on Phenom II X6 1090T. Compiling
> -O2 -falign-functions=32 -falign-jumps=32.

> What this appears to show is that the inline and the macro are
> indistinguishable -- no surprise there. Those are also
> indistinguishable from the empty loop ! The "set_f" trial, which
> involves an actual subroutine and call -- forced here because the
> set_bit_f() is extern, is a whole ~1ns longer ! But since the inline
> version is being measured at 0ns, there is an infinitely large penalty
> for not inlining !!

> So this processor can rattle through those in 1ns :-) If a dynamic
> link were involved that would add an extra jump instruction.

> For a small function like this the inlined code is less than the
> overhead to call the function (in this case 6 bytes vs 13).

Note that overheads and the size of working code can differ between
arches - again, only the compiler has the flexibility to generally
get this right.

> So it all looks as if inlining this class of function is a win.
> If the compiler can do that automatically, that's fine.

Exactly.

>> Finally, smaller code may generally be good, but the /smallest/
>> code isn't always the fastest. Runtime measurement data is what
>> should drive optimisations (or, sound complexity analysis ;) ).

> Depends on how the code is made smaller. If you make it smaller by
> factoring out bits of code and creating little subroutines, then the
> cost of call set-up, calls and rets etc. cannot be avoided entirely.

C level functions can always be inlined by the compiler within the
same compilation unit - it's surely /decades/ since that wasn't true.
Modern compilers now can inline /across/ compile units, and even
across linking to shared libraries.

> Caveat: in the timing above, I do not know why the set_f trials had
> so many high results. That bothers me.

> Caveat: the first run of a given trial always gave a high value.
> I do not know why that is, which bothers me too.

Instruction cache load, at a guess. Modern CPUs have fairly
sophisticated performance counters perhaps can tell you more, e.g.
using oprofile on Linux.

> It seems very high for training of branch prediction ? Perhaps
> 1.5% of the results are >102% of the mean (except for set_f). So
> the consistency of the first value doesn't seem like coincidence.
>
> Caveat: just occasionally I got a trial run looking like this:
>
> 100000 + 1 trials of 1000 iterations, each
> trial first min median mean max low high value good
> empty 16.5380 2.4330 2.4420 2.7660 175.4640 0 1168 2.4423 (98832)
> set_f 15.5710 3.4160 4.1620 4.6134 142.6020 13234 1205 4.1619 (85561)
> set_i 11.1720 2.4300 2.4420 2.7990 131.0320 0 1190 2.4425 (98810)
> set_m 11.1760 2.4330 2.4420 2.7500 138.2330 0 1176 2.4426 (98824)

> The empty, set_i and set_m are consistent across separate runs, but
> set_f occasionally gives something like this, where the median is
> rather higher.

> Now, you could argue that the true figure is the minimum timing --
> because that is most likely to be free of any extra cycles introduced
> by interrupts -- except that one has to expect some timing errors.
> Which is why I use the median, end the average of values median +/-2%.
> (I started with +/-5%, but found that +/-20% discards a small number
> of values, except for the set_f case.) I do not know what to make of
> this. It gives 4.16ns instead of 3.45, so adds 60% to the 1ns extra
> cost of the function call etc.
>
> I'm not happy with this anomaly.
>
> When I have a bit more time, I'll try out some slightly larger inline
> functions.

Note that your benchmark is small enough that it perhaps fits
entirely in the L1 I-cache of your CPU (64KiB on yours apparently,
says wikilies), and almost certainly within the general L2 cache.

Also, note that what is true for a micro-benchmark need not be true
for actual useful code, which might not all fit in high-speed, on-die
cache. Even if it does, most will be in unified I+D caches, competing
with data for space.

regards,
--
Paul Jakma paul [at] jakma twitter: @pjakma PGP: 64A2FF6A
Fortune:
"In matters of principle, stand like a rock; in matters of taste, swim with
the current."
-- Thomas Jefferson
_______________________________________________
Quagga-dev mailing list
Quagga-dev [at] lists
http://lists.quagga.net/mailman/listinfo/quagga-dev

Quagga dev RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.