[PATCH v3] IPv6 ECMP support fixes for linux v3

Mikhail Sennikovskii mikhail.sennikovskii at profitbricks.com
Tue Aug 23 13:31:52 CEST 2016


The API for configuring ECMP for IPv6 on Linux is not symmetrical.
Routes can be set via the multipath structures, but the Linux kernel
splits them into separate routes internally.
As a result, ECMP routes are returned as separate, independent
routes when queried.
This patch works around the issue by making bird collect the
individual routes for the same destination into one multipath route.
It also implements deletion of a multipath route as a set of
delete operations, one for each route entry.
Asynchronous notifications are still not supported.
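
For illustration, here is a minimal, self-contained sketch of the collect
idea (a toy, not BIRD's actual data structures: struct nexthop, nh_insert()
and flush_route() are hypothetical stand-ins for the mpnh/rte handling in
sysdep/linux/netlink.c): kernel-dumped routes that share a destination are
folded back into one route whose nexthop list is kept sorted.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical, simplified stand-ins for BIRD's mpnh/rte structures. */
struct nexthop {
  const char *gw;               /* gateway address as text, for brevity */
  struct nexthop *next;
};

/* Insert a gateway into a sorted nexthop list, skipping duplicates. */
static void
nh_insert(struct nexthop **list, const char *gw)
{
  while (*list && strcmp((*list)->gw, gw) < 0)
    list = &(*list)->next;
  if (*list && strcmp((*list)->gw, gw) == 0)
    return;
  struct nexthop *n = malloc(sizeof(*n));
  n->gw = gw;
  n->next = *list;
  *list = n;
}

/* Emit one collected multipath route and free its nexthop list. */
static void
flush_route(const char *prefix, struct nexthop **list)
{
  if (!prefix)
    return;
  printf("route %s via", prefix);
  while (*list)
    {
      struct nexthop *n = *list;
      printf(" %s", n->gw);
      *list = n->next;
      free(n);
    }
  printf("\n");
}

int
main(void)
{
  /* An IPv6 ECMP route as the kernel dumps it: one entry per nexthop,
     all sharing the same destination prefix. */
  const char *dump[][2] = {
    { "2001:db8::/64",   "fe80::2" },
    { "2001:db8::/64",   "fe80::1" },
    { "2001:db8:1::/64", "fe80::3" },
  };

  const char *cur = NULL;
  struct nexthop *nhs = NULL;

  for (size_t i = 0; i < sizeof(dump) / sizeof(dump[0]); i++)
    {
      if (!cur || strcmp(cur, dump[i][0]))
        {
          flush_route(cur, &nhs);      /* destination changed: emit collected route */
          cur = dump[i][0];
        }
      nh_insert(&nhs, dump[i][1]);     /* keep the collected nexthops sorted */
    }
  flush_route(cur, &nhs);

  return 0;
}

The netlink scan code in this patch does the same per-destination grouping
in nl_parse_collect_rte(), keeping the collected nexthops sorted via
mpnh_merge().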

Signed-off-by: Mikhail Sennikovskii <mikhail.sennikovskii at profitbricks.com>
---
 nest/route.h           |  21 +++
 nest/rt-attr.c         | 163 ++++++++++++++++++---
 nest/rt-table.c        |  73 ++++++++--
 sysdep/linux/netlink.c | 382 +++++++++++++++++++++++++++++++++++++++++++------
 4 files changed, 567 insertions(+), 72 deletions(-)
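
The per-nexthop update path added below (krt_adjust_rte_multipath() with
mpnh_sub()) boils down to two subtractions of sorted nexthop lists: nexthops
present in the old route but not in the new one get individual RTM_DELROUTE
requests, and nexthops present only in the new route get individual
RTM_NEWROUTE requests. A minimal sketch of that set difference, assuming
plain sorted string arrays instead of mpnh lists (diff_sorted() is a
hypothetical helper, not part of BIRD):

#include <stdio.h>
#include <string.h>

/* Print entries of sorted array a[] not present in sorted array b[],
   i.e. the same walk mpnh_sub() performs on two sorted mpnh lists. */
static void
diff_sorted(const char *label, const char **a, int an, const char **b, int bn)
{
  int i = 0, j = 0;
  printf("%s:", label);
  while (i < an)
    {
      int cmp = (j < bn) ? strcmp(a[i], b[j]) : -1;
      if (cmp < 0)
        printf(" %s", a[i++]);        /* only in a */
      else if (cmp > 0)
        j++;                          /* only in b, skip */
      else
        { i++; j++; }                 /* in both, nothing to do */
    }
  printf("\n");
}

int
main(void)
{
  /* Old and new nexthop sets for the same destination, both sorted. */
  const char *old_nh[] = { "fe80::1", "fe80::2", "fe80::3" };
  const char *new_nh[] = { "fe80::2", "fe80::4" };

  diff_sorted("delete", old_nh, 3, new_nh, 2);  /* old - new */
  diff_sorted("add",    new_nh, 2, old_nh, 3);  /* new - old */
  return 0;
}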

diff --git a/nest/route.h b/nest/route.h
index 3969db6..b686950 100644
--- a/nest/route.h
+++ b/nest/route.h
@@ -506,6 +506,8 @@ int mpnh__same(struct mpnh *x, struct mpnh *y); /* Compare multipath nexthops */
 static inline int mpnh_same(struct mpnh *x, struct mpnh *y)
 { return (x == y) || mpnh__same(x, y); }
 struct mpnh *mpnh_merge(struct mpnh *x, struct mpnh *y, int rx, int ry, int max, linpool *lp);
+struct mpnh *mpnh_sub(struct mpnh *x, struct mpnh *y, linpool *lp);
+struct mpnh *mpnh_sort(struct mpnh *x, linpool *lp);
 
 void rta_init(void);
 rta *rta_lookup(rta *);			/* Get rta equivalent to this one, uc++ */
@@ -520,6 +522,25 @@ void rta_dump_all(void);
 void rta_show(struct cli *, rta *, ea_list *);
 void rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr *gw, ip_addr *ll);
 
+static inline int
+rta_same(rta *x, rta *y)
+{
+  return (x->src == y->src &&
+	  x->source == y->source &&
+	  x->scope == y->scope &&
+	  x->cast == y->cast &&
+	  x->dest == y->dest &&
+	  x->flags == y->flags &&
+	  x->igp_metric == y->igp_metric &&
+	  ipa_equal(x->gw, y->gw) &&
+	  ipa_equal(x->from, y->from) &&
+	  x->iface == y->iface &&
+	  x->hostentry == y->hostentry &&
+	  mpnh_same(x->nexthops, y->nexthops) &&
+	  ea_same(x->eattrs, y->eattrs));
+}
+
+
 /*
  * rta_set_recursive_next_hop() acquires hostentry from hostcache and fills
  * rta->hostentry field.  New hostentry has zero use count. Cached rta locks its
diff --git a/nest/rt-attr.c b/nest/rt-attr.c
index 7fa05d6..3d01614 100644
--- a/nest/rt-attr.c
+++ b/nest/rt-attr.c
@@ -302,6 +302,151 @@ mpnh_merge(struct mpnh *x, struct mpnh *y, int rx, int ry, int max, linpool *lp)
   return root;
 }
 
+/**
+ * mpnh_sub - subtract one nexthop list from another.
+ * Returns a list of the entries that exist in list 1 but do not
+ * exist in list 2.
+ * The input lists must be sorted and the
+ * result is sorted too.
+ *
+ * @x: list 1
+ * @y: list 2
+ * @lp: linpool; if not NULL, list 1 is left untouched and
+ *      new entries are allocated from this pool.
+ *      List 2 is never modified.
+ *
+ * The linpool argument determines whether list 1 is
+ * consumed by the function (i.e. its nodes reused in the resulting list).
+ * If NULL, list 1 is reused; otherwise the resulting list
+ * is populated with new entries allocated from the linpool.
+ * To avoid issues with deallocation of this list,
+ * the caller should use some form of bulk deallocation
+ * (e.g. stack or linpool) to free these nodes when the
+ * resulting list is no longer needed.
+ */
+struct mpnh *
+mpnh_sub(struct mpnh *x, struct mpnh *y, linpool *lp)
+{
+  struct mpnh *root = NULL;
+  struct mpnh **n = &root;
+
+  while (x || y)
+  {
+    int cmp = mpnh_compare_node(x, y);
+    if (cmp < 0)
+      {
+        *n = !lp ? x : mpnh_copy_node(x, lp);
+        x = x->next;
+        n = &((*n)->next);
+      }
+    else if (cmp > 0)
+      y = y->next;
+    else
+      {
+        x = x->next;
+        y = y->next;
+      }
+  }
+
+  *n = NULL;
+
+  return root;
+}
+
+/**
+ * mpnh_copy_lp copies a nexthop list using the given linpool
+ * (unlike mpnh_copy, which uses sl_alloc)
+ */
+static struct mpnh *
+mpnh_copy_lp(struct mpnh *o, linpool *lp)
+{
+  struct mpnh *first = NULL;
+  struct mpnh **last = &first;
+
+  for (; o; o = o->next)
+    {
+      struct mpnh *n = mpnh_copy_node(o, lp);
+      *last = n;
+      last = &(n->next);
+    }
+
+  return first;
+}
+
+/*
+ * mpnh_sort - sort a nexthop list
+ * @x: the list to be sorted
+ * @lp: if not NULL, the list is copied in case it needs to be reordered,
+ * so the given list always remains unchanged.
+ * If the list is already ordered, it is returned as-is
+ * and no copy is created.
+ * If lp is NULL, the given list is reordered in place.
+ */
+struct mpnh *
+mpnh_sort(struct mpnh *x, linpool *lp)
+{
+  struct mpnh *ret = x;
+  struct mpnh *cur;
+  struct mpnh *prev;
+  int copy_on_change = !!lp;
+
+  for (cur = ret->next, prev = ret; cur; prev = cur, cur = cur->next)
+    {
+      int cmp = mpnh_compare_node(prev, cur);
+      if (cmp <= 0)
+        continue;
+
+      if (copy_on_change)
+        {
+          /* the list needs to be copied, and prev and cur must be
+           * updated to point to the new list entries */
+
+          struct mpnh *old_prev, *new_prev;
+
+          ret = mpnh_copy_lp(x, lp);
+
+          for (old_prev = x, new_prev = ret;
+                    old_prev != prev;
+                    old_prev = old_prev->next, new_prev = new_prev->next);
+
+          prev = new_prev;
+          cur = new_prev->next;
+
+          copy_on_change = 0;
+        }
+
+      /* promote the entry */
+      struct mpnh *cur2;
+      struct mpnh **next2_ptr;
+
+      for (cur2 = ret, next2_ptr = &ret; ; next2_ptr = &cur2->next, cur2 = cur2->next)
+        {
+          cmp = mpnh_compare_node(cur2, cur);
+          if (cmp <= 0)
+            continue;
+
+          /*
+           * found the place where the entry should be inserted;
+           * move the entry there
+           */
+
+          /* 1. remove entry from the list */
+          prev->next = cur->next;
+
+          /* 2. now insert entry to the new place */
+          *next2_ptr = cur;
+          cur->next = cur2;
+
+          break;
+        }
+
+      /* now everything up to prev is sorted;
+       * set cur to prev and proceed with the cur->next loop */
+      cur = prev;
+    }
+
+  return ret;
+}
 
 static struct mpnh *
 mpnh_copy(struct mpnh *o)
@@ -1012,24 +1157,6 @@ rta_hash(rta *a)
 	  mpnh_hash(a->nexthops) ^ ea_hash(a->eattrs)) & 0xffff;
 }
 
-static inline int
-rta_same(rta *x, rta *y)
-{
-  return (x->src == y->src &&
-	  x->source == y->source &&
-	  x->scope == y->scope &&
-	  x->cast == y->cast &&
-	  x->dest == y->dest &&
-	  x->flags == y->flags &&
-	  x->igp_metric == y->igp_metric &&
-	  ipa_equal(x->gw, y->gw) &&
-	  ipa_equal(x->from, y->from) &&
-	  x->iface == y->iface &&
-	  x->hostentry == y->hostentry &&
-	  mpnh_same(x->nexthops, y->nexthops) &&
-	  ea_same(x->eattrs, y->eattrs));
-}
-
 static rta *
 rta_copy(rta *o)
 {
diff --git a/nest/rt-table.c b/nest/rt-table.c
index 57c8b8e..5a28d94 100644
--- a/nest/rt-table.c
+++ b/nest/rt-table.c
@@ -592,18 +592,37 @@ static struct mpnh *
 mpnh_merge_rta(struct mpnh *nhs, rta *a, int max)
 {
   struct mpnh nh = { .gw = a->gw, .iface = a->iface };
-  struct mpnh *nh2 = (a->dest == RTD_MULTIPATH) ? a->nexthops : &nh;
-  return mpnh_merge(nhs, nh2, 1, 0, max, rte_update_pool);
+  struct mpnh *nh2;
+  int r2 = 0;
+
+  if (a->dest == RTD_MULTIPATH)
+    {
+      /*
+       * mpnh_merge expects the nexthop list to be sorted,
+       * while the nexthops returned by the protocols
+       * (e.g. the "static" one) are not necessarily sorted.
+       * Ensure that nh2 is sorted.
+       */
+      nh2 = mpnh_sort(a->nexthops, rte_update_pool);
+      /*
+       * If the sort actually reordered the list, nh2 is already a copy,
+       * so there is no need to copy it again; set r2 to 1 in that case.
+       */
+      r2 = (nh2 != a->nexthops);
+    }
+  else
+    nh2 = &nh;
+  return mpnh_merge(nhs, nh2, 1, r2, max, rte_update_pool);
 }
 
-rte *
-rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tmpa, int silent)
+static rte *
+_rt_export_merged_entry(struct announce_hook *ah, rte *best0, rte **rt_free, ea_list **tmpa, int silent,
+                                                                         rte *add_entry, rte *del_entry)
 {
   // struct proto *p = ah->proto;
   struct mpnh *nhs = NULL;
-  rte *best0, *best, *rt0, *rt, *tmp;
+  rte *best, *rt0, *rt, *tmp;
 
-  best0 = net->routes;
   *rt_free = NULL;
 
   if (!rte_is_valid(best0))
@@ -620,17 +639,30 @@ rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tm
       continue;
 
     rt = export_filter(ah, rt0, &tmp, NULL, 1);
-
     if (!rt)
       continue;
 
+    if (add_entry &&
+            (add_entry == rt
+               || rta_same(add_entry->attrs, rt->attrs)))
+      add_entry = NULL; /* already in the list */
+
+    if (del_entry &&
+            (del_entry == rt
+               || rta_same(del_entry->attrs, rt->attrs)))
+       goto free;
+
     if (rte_is_reachable(rt))
       nhs = mpnh_merge_rta(nhs, rt->attrs, ah->proto->merge_limit);
 
+free:
     if (tmp)
       rte_free(tmp);
   }
 
+  if (add_entry && rte_is_reachable(add_entry))
+      nhs = mpnh_merge_rta(nhs, add_entry->attrs, ah->proto->merge_limit);
+
   if (nhs)
   {
     nhs = mpnh_merge_rta(nhs, best->attrs, ah->proto->merge_limit);
@@ -642,6 +674,24 @@ rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tm
       best->attrs->nexthops = nhs;
     }
   }
+  else if (best->attrs->dest == RTD_MULTIPATH)
+  {
+    /*
+     * mpnh_merge, mpnh_same and mpnh_sub expect the nexthop list
+     * to be sorted, while the nexthops returned by the protocols
+     * (e.g. the "static" one) are not necessarily sorted.
+     * This ensures the resulting entry has its nexthops sorted,
+     * and makes the behavior consistent regardless of
+     * the number of elements in the best0 entry list
+     * (i.e. the best0->next processing above).
+     */
+    nhs = mpnh_sort(best->attrs->nexthops, rte_update_pool);
+    if (nhs != best->attrs->nexthops)
+    {
+      best = rte_cow_rta(best, rte_update_pool);
+      best->attrs->nexthops = nhs;
+    }
+  }
 
   if (best != best0)
     *rt_free = best;
@@ -649,6 +699,12 @@ rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tm
   return best;
 }
 
+rte *
+rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tmpa, int silent)
+{
+  return _rt_export_merged_entry(ah, net->routes, rt_free, tmpa, silent, NULL, NULL);
+}
+
 
 static void
 rt_notify_merged(struct announce_hook *ah, net *net, rte *new_changed, rte *old_changed,
@@ -690,10 +746,9 @@ rt_notify_merged(struct announce_hook *ah, net *net, rte *new_changed, rte *old_
   if (new_best)
     new_best = rt_export_merged(ah, net, &new_best_free, &tmpa, 0);
 
-  /* Prepare old merged route (without proper merged next hops) */
   /* There are some issues with running filter on old route - see rt_notify_basic() */
   if (old_best && !refeed)
-    old_best = export_filter(ah, old_best, &old_best_free, NULL, 1);
+    old_best = _rt_export_merged_entry(ah, old_best, &old_best_free, &tmpa, 0, old_changed, new_changed);
 
   if (new_best || old_best)
     do_rt_notify(ah, net, new_best, old_best, tmpa, refeed);
diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index 1ffdff0..766e3da 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -19,7 +19,6 @@
 #include "nest/route.h"
 #include "nest/protocol.h"
 #include "nest/iface.h"
-#include "lib/alloca.h"
 #include "lib/timer.h"
 #include "lib/unix.h"
 #include "lib/krt.h"
@@ -46,6 +45,32 @@
 #define RTA_TABLE  15
 #endif
 
+/*
+ * Netlink route parsing context.
+ * Its duties are:
+ * 1. To maintain the entry collect state -
+ *      for IPv6 ECMP the netlink parsing logic needs to collect
+ *      separate individual entries representing the multipath
+ *      into one multipath entry.
+ * 2. To hold some temporary data used while parsing
+ *    (like a non-cached rta) on the stack.
+ *
+ *    Implementation note: the context maintains two rta entries:
+ *    one used for the rte currently being processed
+ *    (i.e. being created as a result of parsing the netlink data),
+ *    the other used for the rte currently being collected
+ *    (i.e. stored in collect_rte, for which multipath entries are being collected).
+ *    process_attrs holds the index of the attrs used for the rte being processed.
+ *    Once the rte being processed becomes the one being collected,
+ *    its attrs become the "collected" ones and the other attrs become the "processed" ones.
+ */
+typedef struct nl_parsectx
+{
+  struct krt_proto *collect_p; /* Protocol, for which entries are currently being processed */
+  rte *collect_rte; /* Entry, for which multipath entries are currently being collected */
+  int process_attrs; /* index in the attrs array for the entry to be used for the "processed" entry */
+  rta attrs[2];
+} nl_parsectx;
 
 /*
  *	Synchronous Netlink interface
@@ -62,6 +87,8 @@ struct nl_sock
 
 #define NL_RX_SIZE 8192
 
+static linpool *netlink_lp;
+
 static struct nl_sock nl_scan = {.fd = -1};	/* Netlink socket for synchronous scan */
 static struct nl_sock nl_req  = {.fd = -1};	/* Netlink socket for requests */
 
@@ -803,7 +830,7 @@ nh_bufsize(struct mpnh *nh)
 }
 
 static int
-nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new)
+nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new, int mp)
 {
   eattr *ea;
   net *net = e->net;
@@ -820,7 +847,8 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new)
   bzero(&r.r, sizeof(r.r));
   r.h.nlmsg_type = new ? RTM_NEWROUTE : RTM_DELROUTE;
   r.h.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
-  r.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | (new ? NLM_F_CREATE|NLM_F_EXCL : 0);
+  r.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK
+		  | (new ? NLM_F_CREATE | (!mp ? NLM_F_EXCL : 0) : 0);
 
   r.r.rtm_family = BIRD_AF;
   r.r.rtm_dst_len = net->n.pxlen;
@@ -835,8 +863,12 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new)
 
   /* For route delete, we do not specify route attributes */
   if (!new)
-    return nl_exchange(&r.h);
-
+    {
+      if (mp)
+        goto set_dest;
+      else
+        goto submit;
+    }
 
   if (ea = ea_find(eattrs, EA_KRT_METRIC))
     nl_add_attr_u32(&r.h, sizeof(r), RTA_PRIORITY, ea->u.data);
@@ -864,7 +896,7 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new)
 
 
   /* a->iface != NULL checked in krt_capable() for router and device routes */
-
+set_dest:
   switch (a->dest)
     {
     case RTD_ROUTER:
@@ -892,10 +924,104 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new)
     default:
       bug("krt_capable inconsistent with nl_send_route");
     }
-
+submit:
   return nl_exchange(&r.h);
 }
 
+/*
+ * This is just to unify the code between bird 1.x and bird 2.
+ * In bird 1.x it is just a define, resolving to 1
+ * for IPV6 and 0 for IPV4.
+ *
+ * In bird 2 it is a function, making the decision based
+ * on p->p.table->addr_type:
+ *
+ * static int
+ * trk_is_use_collect_mode(struct krt_proto *p);
+ */
+#ifdef IPV6
+#define trk_is_use_collect_mode(_p) 1
+#else
+#define trk_is_use_collect_mode(_p) 0
+#endif
+
+static struct mpnh *
+krt_mp_merge_rta(struct mpnh *nhs, rta *a, int max)
+{
+  struct mpnh nh = { .gw = a->gw, .iface = a->iface };
+  struct mpnh *nh2 = (a->dest == RTD_MULTIPATH) ? a->nexthops : &nh;
+  return mpnh_merge(nhs, nh2, 1, 0, max, netlink_lp);
+}
+
+static struct mpnh *
+krt_mp_sub_rte_rta(rta *ax, rta *ay)
+{
+  struct mpnh nhx = { .gw = ax->gw, .iface = ax->iface };
+  struct mpnh nhy = { .gw = ay->gw, .iface = ay->iface };
+  struct mpnh *nhpx = (ax->dest == RTD_MULTIPATH) ? ax->nexthops : &nhx;
+  struct mpnh *nhpy = (ay->dest == RTD_MULTIPATH) ? ay->nexthops : &nhy;
+  return mpnh_sub(nhpx, nhpy, netlink_lp);
+}
+
+static int
+krt_send_nh_multipath(struct krt_proto *p, rte *base, struct mpnh *nh, struct ea_list *eattrs, int new)
+{
+  rte *e;
+  int err = 0;
+  rta ra = {
+    .src= p->p.main_source,
+    .source = RTS_INHERIT,
+    .scope = SCOPE_UNIVERSE,
+    .cast = RTC_UNICAST
+  };
+
+  e = rte_get_temp(&ra);
+  e->net = base->net;
+  e->u.krt = base->u.krt;
+
+  for (; nh; nh = nh->next)
+    {
+      ra.gw = nh->gw;
+      ra.iface = nh->iface;
+
+      err = nl_send_route(p, e, eattrs, new, 1);
+      if (err < 0)
+        DBG("deleting route failed %d\n", err);
+    }
+
+  rte_free(e);
+
+  return err;
+}
+
+static int
+krt_adjust_rte_multipath(struct krt_proto *p, rte *new, rte *old, struct ea_list *eattrs)
+{
+  struct mpnh *nhold, *nhnew;
+  int err = 0;
+
+  nhold = krt_mp_sub_rte_rta(old->attrs, new->attrs);
+  nhnew = krt_mp_sub_rte_rta(new->attrs, old->attrs);
+
+  if (nhold)
+  {
+    if (old->attrs->dest == RTD_MULTIPATH)
+      err = krt_send_nh_multipath(p, old, nhold, NULL, 0);
+    else
+      err = nl_send_route(p, old, NULL, 0, 1);
+  }
+
+  if (nhnew)
+    {
+      if (new->attrs->dest == RTD_MULTIPATH)
+        err |= krt_send_nh_multipath(p, new, nhnew, eattrs, 1);
+      else
+        err |= nl_send_route(p, new, eattrs, 1, 1);
+    }
+
+  return err;
+}
+
 void
 krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs)
 {
@@ -909,10 +1035,38 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list
    */
 
   if (old)
-    nl_send_route(p, old, NULL, 0);
+    {
+      if (trk_is_use_collect_mode(p))
+        {
+          DBG("KRT: krt_replace_rte: in collect mode\n");
+          if (new && ( new->attrs->dest == RTD_MULTIPATH
+                        || old->attrs->dest == RTD_MULTIPATH))
+            {
+              DBG("KRT: krt_replace_rte: adjusting\n");
+              err = krt_adjust_rte_multipath(p, new, old, eattrs);
+              /* clear "new" to ensure the "if (new)" branch below is not triggered */
+              new = NULL;
+            }
+          else if (old->attrs->dest == RTD_MULTIPATH)
+            {
+              DBG("KRT: krt_replace_rte: old is multipath\n");
+              krt_send_nh_multipath(p, old, old->attrs->nexthops, NULL, 0);
+            }
+          else
+            {
+              DBG("KRT: krt_replace_rte: no multipath\n");
+              nl_send_route(p, old, NULL, 0, 0);
+            }
+        }
+      else
+        {
+          DBG("KRT: krt_replace_rte: NOT collect mode\n");
+          nl_send_route(p, old, NULL, 0, 0);
+        }
+    }
 
   if (new)
-    err = nl_send_route(p, new, eattrs, 1);
+    err = nl_send_route(p, new, eattrs, 1, 0);
 
   if (err < 0)
     n->n.flags |= KRF_SYNC_ERROR;
@@ -920,11 +1074,138 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list
     n->n.flags &= ~KRF_SYNC_ERROR;
 }
 
+static int
+krt_mp_is_collectable(struct krt_proto *p, rte *e)
+{
+  if (!trk_is_use_collect_mode(p))
+    return 0;
+
+  struct rta *a = e->attrs;
+
+  if (a->dest != RTD_ROUTER && a->dest != RTD_DEVICE)
+      return 0;
+
+  return 1;
+}
+
+static int
+krt_mp_is_mergable(struct krt_proto *p, rte *e1, rte *e2)
+{
+  if (e1->net != e2->net)
+    return 0;
+
+  if (!rte_is_valid(e1) || !rte_is_valid(e2))
+    return 0;
+
+  if (e1->pref != e2->pref)
+    return 0;
+
+  if (e1->attrs->src->proto->proto != e2->attrs->src->proto->proto)
+    return 0;
+
+  return 1;
+}
+
+static rte *
+krt_mp_collect_do_add(struct krt_proto *p, rte *mp_collect_rte, rte *e)
+{
+  struct rta *attrs = mp_collect_rte->attrs;
+
+  ASSERT(!rta_is_cached(attrs));
+
+  /* sanity check that our tmp attrs selection logic works correctly */
+  ASSERT(attrs != e->attrs);
+
+  if (attrs->dest != RTD_MULTIPATH)
+    {
+      attrs->nexthops = krt_mp_merge_rta(NULL, attrs, p->p.merge_limit);
+      attrs->dest = RTD_MULTIPATH;
+    }
+
+  attrs->nexthops = krt_mp_merge_rta(attrs->nexthops, e->attrs, p->p.merge_limit);
+
+  return mp_collect_rte;
+}
+
+static int
+krt_mp_can_collect(struct krt_proto *p, rte *mp_collect_rte, rte *e)
+{
+  if (!krt_mp_is_collectable(p, e))
+    return 0;
+
+  if (!krt_mp_is_mergable(p, mp_collect_rte, e))
+    return 0;
+
+  return 1;
+}
+
+static rta* nl_parse_get_tmp_rta(nl_parsectx *ctx)
+{
+  rta *a = &ctx->attrs[ctx->process_attrs];
+
+  memset(a, 0, sizeof(*a));
+  return a;
+}
+
+static void nl_parse_collect_complete(nl_parsectx *ctx)
+{
+  if (ctx->collect_p)
+    {
+      DBG("KRT: collected\n");
+      krt_got_route(ctx->collect_p, ctx->collect_rte);
+      ctx->collect_p = NULL;
+      ctx->collect_rte = NULL;
+      lp_flush(netlink_lp);
+    }
+}
+
+static void
+nl_parse_collect_rte(nl_parsectx *ctx, struct krt_proto *p, rte *e)
+{
+  if (ctx->collect_p)
+    {
+      ASSERT(ctx->collect_rte);
+      if (ctx->collect_p == p && krt_mp_can_collect(p, ctx->collect_rte, e))
+      {
+        ctx->collect_rte = krt_mp_collect_do_add(p, ctx->collect_rte, e);
+        DBG("KRT: collecting[add]\n");
+        return;
+      }
+
+      nl_parse_collect_complete(ctx);
+    }
+
+  ASSERT(!ctx->collect_p);
+  ASSERT(!ctx->collect_rte);
+
+  if (krt_mp_is_collectable(p, e))
+    {
+      ASSERT(e->attrs == &ctx->attrs[ctx->process_attrs]);
+      ASSERT(!rta_is_cached(e->attrs));
+      ctx->collect_p = p;
+      ctx->collect_rte = e;
+      ctx->process_attrs = (ctx->process_attrs + 1) % 2;
+      DBG("KRT: collecting\n");
+      return;
+    }
+
+  krt_got_route(p, e);
+}
+
+static void nl_parse_begin(nl_parsectx *ctx)
+{
+  memset(ctx, 0, sizeof (*ctx));
+}
+
+static void nl_parse_end(nl_parsectx *ctx)
+{
+  nl_parse_collect_complete(ctx);
+}
 
 #define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0)
 
 static void
-nl_parse_route(struct nlmsghdr *h, int scan)
+nl_parse_route(nl_parsectx *ctx, struct nlmsghdr *h, int scan)
 {
   struct krt_proto *p;
   struct rtmsg *i;
@@ -1022,12 +1303,12 @@ nl_parse_route(struct nlmsghdr *h, int scan)
 
   net *net = net_get(p->p.table, dst, i->rtm_dst_len);
 
-  rta ra = {
-    .src= p->p.main_source,
-    .source = RTS_INHERIT,
-    .scope = SCOPE_UNIVERSE,
-    .cast = RTC_UNICAST
-  };
+  rta *ra = nl_parse_get_tmp_rta(ctx);
+
+  ra->src= p->p.main_source,
+  ra->source = RTS_INHERIT,
+  ra->scope = SCOPE_UNIVERSE,
+  ra->cast = RTC_UNICAST;
 
   switch (i->rtm_type)
     {
@@ -1035,9 +1316,9 @@ nl_parse_route(struct nlmsghdr *h, int scan)
 
       if (a[RTA_MULTIPATH] && (i->rtm_family == AF_INET))
 	{
-	  ra.dest = RTD_MULTIPATH;
-	  ra.nexthops = nl_parse_multipath(p, a[RTA_MULTIPATH]);
-	  if (!ra.nexthops)
+	  ra->dest = RTD_MULTIPATH;
+	  ra->nexthops = nl_parse_multipath(p, a[RTA_MULTIPATH]);
+	  if (!ra->nexthops)
 	    {
 	      log(L_ERR "KRT: Received strange multipath route %I/%d",
 		  net->n.prefix, net->n.pxlen);
@@ -1047,8 +1328,8 @@ nl_parse_route(struct nlmsghdr *h, int scan)
 	  break;
 	}
 
-      ra.iface = if_find_by_index(oif);
-      if (!ra.iface)
+      ra->iface = if_find_by_index(oif);
+      if (!ra->iface)
 	{
 	  log(L_ERR "KRT: Received route %I/%d with unknown ifindex %u",
 	      net->n.prefix, net->n.pxlen, oif);
@@ -1058,39 +1339,39 @@ nl_parse_route(struct nlmsghdr *h, int scan)
       if (a[RTA_GATEWAY])
 	{
 	  neighbor *ng;
-	  ra.dest = RTD_ROUTER;
-	  memcpy(&ra.gw, RTA_DATA(a[RTA_GATEWAY]), sizeof(ra.gw));
-	  ipa_ntoh(ra.gw);
+	  ra->dest = RTD_ROUTER;
+	  memcpy(&ra->gw, RTA_DATA(a[RTA_GATEWAY]), sizeof(ra->gw));
+	  ipa_ntoh(ra->gw);
 
 #ifdef IPV6
 	  /* Silently skip strange 6to4 routes */
-	  if (ipa_in_net(ra.gw, IPA_NONE, 96))
+	  if (ipa_in_net(ra->gw, IPA_NONE, 96))
 	    return;
 #endif
 
-	  ng = neigh_find2(&p->p, &ra.gw, ra.iface,
+	  ng = neigh_find2(&p->p, &ra->gw, ra->iface,
 			   (i->rtm_flags & RTNH_F_ONLINK) ? NEF_ONLINK : 0);
 	  if (!ng || (ng->scope == SCOPE_HOST))
 	    {
 	      log(L_ERR "KRT: Received route %I/%d with strange next-hop %I",
-		  net->n.prefix, net->n.pxlen, ra.gw);
+		  net->n.prefix, net->n.pxlen, ra->gw);
 	      return;
 	    }
 	}
       else
 	{
-	  ra.dest = RTD_DEVICE;
+	  ra->dest = RTD_DEVICE;
 	}
 
       break;
     case RTN_BLACKHOLE:
-      ra.dest = RTD_BLACKHOLE;
+      ra->dest = RTD_BLACKHOLE;
       break;
     case RTN_UNREACHABLE:
-      ra.dest = RTD_UNREACHABLE;
+      ra->dest = RTD_UNREACHABLE;
       break;
     case RTN_PROHIBIT:
-      ra.dest = RTD_PROHIBIT;
+      ra->dest = RTD_PROHIBIT;
       break;
     /* FIXME: What about RTN_THROW? */
     default:
@@ -1098,7 +1379,7 @@ nl_parse_route(struct nlmsghdr *h, int scan)
       return;
     }
 
-  rte *e = rte_get_temp(&ra);
+  rte *e = rte_get_temp(ra);
   e->net = net;
   e->u.krt.src = src;
   e->u.krt.proto = i->rtm_protocol;
@@ -1115,24 +1396,24 @@ nl_parse_route(struct nlmsghdr *h, int scan)
       memcpy(&ps, RTA_DATA(a[RTA_PREFSRC]), sizeof(ps));
       ipa_ntoh(ps);
 
-      ea_list *ea = alloca(sizeof(ea_list) + sizeof(eattr));
-      ea->next = ra.eattrs;
-      ra.eattrs = ea;
+      ea_list *ea = lp_alloc(netlink_lp, sizeof(ea_list) + sizeof(eattr));
+      ea->next = ra->eattrs;
+      ra->eattrs = ea;
       ea->flags = EALF_SORTED;
       ea->count = 1;
       ea->attrs[0].id = EA_KRT_PREFSRC;
       ea->attrs[0].flags = 0;
       ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
-      ea->attrs[0].u.ptr = alloca(sizeof(struct adata) + sizeof(ps));
+      ea->attrs[0].u.ptr = lp_alloc(netlink_lp, sizeof(struct adata) + sizeof(ps));
       ea->attrs[0].u.ptr->length = sizeof(ps);
       memcpy(ea->attrs[0].u.ptr->data, &ps, sizeof(ps));
     }
 
   if (a[RTA_FLOW])
     {
-      ea_list *ea = alloca(sizeof(ea_list) + sizeof(eattr));
-      ea->next = ra.eattrs;
-      ra.eattrs = ea;
+      ea_list *ea = lp_alloc(netlink_lp, sizeof(ea_list) + sizeof(eattr));
+      ea->next = ra->eattrs;
+      ra->eattrs = ea;
       ea->flags = EALF_SORTED;
       ea->count = 1;
       ea->attrs[0].id = EA_KRT_REALM;
@@ -1144,7 +1425,7 @@ nl_parse_route(struct nlmsghdr *h, int scan)
   if (a[RTA_METRICS])
     {
       u32 metrics[KRT_METRICS_MAX];
-      ea_list *ea = alloca(sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
+      ea_list *ea = lp_alloc(netlink_lp, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
       int t, n = 0;
 
       if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
@@ -1166,15 +1447,15 @@ nl_parse_route(struct nlmsghdr *h, int scan)
 
       if (n > 0)
         {
-	  ea->next = ra.eattrs;
+	  ea->next = ra->eattrs;
 	  ea->flags = EALF_SORTED;
 	  ea->count = n;
-	  ra.eattrs = ea;
+	  ra->eattrs = ea;
 	}
     }
 
   if (scan)
-    krt_got_route(p, e);
+    nl_parse_collect_rte(ctx, p, e);
   else
     krt_got_route_async(p, e, new);
 }
@@ -1183,13 +1464,19 @@ void
 krt_do_scan(struct krt_proto *p UNUSED)	/* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
 {
   struct nlmsghdr *h;
+  nl_parsectx ctx;
 
   nl_request_dump(BIRD_AF, RTM_GETROUTE);
+
+  nl_parse_begin(&ctx);
+
   while (h = nl_get_scan())
     if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
-      nl_parse_route(h, 1);
+      nl_parse_route(&ctx, h, 1);
     else
       log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
+
+  nl_parse_end(&ctx);
 }
 
 /*
@@ -1202,12 +1489,16 @@ static byte *nl_async_rx_buffer;	/* Receive buffer */
 static void
 nl_async_msg(struct nlmsghdr *h)
 {
+  nl_parsectx ctx;
+
   switch (h->nlmsg_type)
     {
     case RTM_NEWROUTE:
     case RTM_DELROUTE:
       DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
-      nl_parse_route(h, 0);
+      nl_parse_begin(&ctx);
+      nl_parse_route(&ctx, h, 0);
+      nl_parse_end(&ctx);
       break;
     case RTM_NEWLINK:
     case RTM_DELLINK:
@@ -1326,6 +1617,7 @@ void
 krt_sys_io_init(void)
 {
   HASH_INIT(nl_table_map, krt_pool, 6);
+  netlink_lp = lp_new(krt_pool, 4080);
 }
 
 int
-- 
2.5.0