[PATCH] IPv6 ECMP support fixes for linux

Mikhail Sennikovskii mikhail.sennikovskii at profitbricks.com
Wed Feb 10 16:43:21 CET 2016


The API for configuring ECMP for IPv6 on Linux is not symmetrical.
Routes can be set via the multipath structures, but the Linux kernel
splits them up into separate routes internally.
As a result, ECMP routes are returned as separate, independent
routes when queried.
This patch works around this issue by making BIRD collect the
individual routes for the same destination into one multipath route.
It also implements deletion of multipath routes as a set of
delete operations for each route entry.
Learn mode is still not supported for now.

Signed-off-by: Mikhail Sennikovskii <mikhail.sennikovskii at profitbricks.com>
---
 nest/route.h           |   1 +
 nest/rt-table.c        |  29 ++++++++++
 sysdep/linux/netlink.c |  93 +++++++++++++++++++++++++++++++-
 sysdep/unix/krt.c      | 144 +++++++++++++++++++++++++++++++++++++++++++++++--
 sysdep/unix/krt.h      |   5 ++
 5 files changed, 268 insertions(+), 4 deletions(-)

diff --git a/nest/route.h b/nest/route.h
index eba3d9b..66d7213 100644
--- a/nest/route.h
+++ b/nest/route.h
@@ -284,6 +284,7 @@ static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2
 void rte_discard(rtable *tab, rte *old);
 int rt_examine(rtable *t, net_addr *a, struct proto *p, struct filter *filter);
 rte *rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, struct ea_list **tmpa, int silent);
+rte *rt_merge_list(struct announce_hook *ah, rte *e);
 void rt_refresh_begin(rtable *t, struct announce_hook *ah);
 void rt_refresh_end(rtable *t, struct announce_hook *ah);
 void rte_dump(rte *);
diff --git a/nest/rt-table.c b/nest/rt-table.c
index f164ecd..ccc5845 100644
--- a/nest/rt-table.c
+++ b/nest/rt-table.c
@@ -698,6 +698,35 @@ mpnh_merge_rta(struct mpnh *nhs, rta *a, int max)
 }
 
 rte *
+rt_merge_list(struct announce_hook *ah, rte *e)
+{
+  struct mpnh *nhs = NULL;
+  rte *cur = e, *next, *ret;
+
+  if (!e->next)
+      return e;
+
+  for (; cur; cur = next)
+    {
+      next = cur->next;
+      /* sanity */
+      cur->next = NULL;
+      nhs = mpnh_merge_rta(nhs, cur->attrs, ah->proto->merge_limit);
+      if (cur != e)
+          rte_free(cur);
+    }
+
+  ret = rte_cow_rta(e, rte_update_pool);
+  ret->attrs->dest = RTD_MULTIPATH;
+  ret->attrs->nexthops = nhs;
+
+  if (e != ret)
+    rte_free(e);
+
+  return ret;
+}
+
+rte *
 rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tmpa, int silent)
 {
   // struct proto *p = ah->proto;
diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index 530cb24..a0eda60 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -1003,6 +1003,48 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new)
   return nl_exchange(&r.h);
 }
 
+/* Delete a multipath route one next hop at a time: a single-path delete request
+ * is sent for every entry of old->attrs->nexthops. NOTE(review): ra.dest stays
+ * zero-initialized; confirm that nl_send_route() handles that as intended. */
+static void
+krt_del_rte_multipath(struct krt_proto *p, rte *old)
+{
+  struct mpnh *nh;
+  rta ra = {
+    .src= p->p.main_source,
+    .source = RTS_INHERIT,
+    .scope = SCOPE_UNIVERSE,
+    .cast = RTC_UNICAST
+  };
+  rte *e = rte_get_temp(&ra);
+
+  /* rte_get_temp() is given attributes only, so attach the destination here */
+  e->net = old->net;
+  for (nh = old->attrs->nexthops; nh; nh = nh->next)
+    {
+      ra.gw = nh->gw;
+      ra.iface = nh->iface;
+      /* Send e (assumed to reference ra), not old, which would repeat one request */
+      int err = nl_send_route(p, e, NULL, 0);
+      if (err < 0)
+        DBG("deleting route failed %d\n", err);
+    }
+  rte_free(e);
+}
+
+/* Report whether kernel routes scanned for p must be collected into multipath
+ * routes: 1 when p's table has address type NET_IP6, NET_VPN6 or NET_ROA6,
+ * 0 for any other address type. */
+static int trk_is_use_collect_mode(struct krt_proto *p)
+{
+  if ((p->p.table->addr_type == NET_IP6) ||
+      (p->p.table->addr_type == NET_VPN6) ||
+      (p->p.table->addr_type == NET_ROA6))
+    return 1;
+
+  return 0;
+}
+
 void
 krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs)
 {
@@ -1016,7 +1058,12 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list
    */
 
   if (old)
-    nl_send_route(p, old, NULL, 0);
+    {
+      if (trk_is_use_collect_mode(p) && old->attrs->dest == RTD_MULTIPATH)
+        krt_del_rte_multipath(p, old);
+      else
+        nl_send_route(p, old, NULL, 0);
+    }
 
   if (new)
     err = nl_send_route(p, new, eattrs, 1);
@@ -1277,6 +1324,45 @@ nl_parse_route(struct nlmsghdr *h, int scan)
     krt_got_route_async(p, e, new);
 }
 
+/* Enter collect mode before a scan: for p when given, else for every protocol
+ * walked in nl_table_map; only where trk_is_use_collect_mode() returns true. */
+static void
+krt_scan_notify_begin(struct krt_proto *p)
+{
+  if (p != NULL)
+    {
+      if (trk_is_use_collect_mode(p))
+        krt_got_route_begin(p);
+      return;
+    }
+  HASH_WALK(nl_table_map, sys.hash_next, cp)
+    {
+      if (trk_is_use_collect_mode(cp))
+        krt_got_route_begin(cp);
+    }
+  HASH_WALK_END;
+}
+
+/* Leave collect mode after a scan: for p when given, else for every protocol
+ * walked in nl_table_map; only where trk_is_use_collect_mode() returns true. */
+static void
+krt_scan_notify_end(struct krt_proto *p)
+{
+  if (p != NULL)
+    {
+      DBG("KRT: mp_collect: end: proto is specified (%s)\n", p->p.name);
+      if (trk_is_use_collect_mode(p))
+        krt_got_route_end(p);
+      return;
+    }
+  HASH_WALK(nl_table_map, sys.hash_next, cp)
+    {
+      if (trk_is_use_collect_mode(cp))
+        krt_got_route_end(cp);
+    }
+  HASH_WALK_END;
+}
+
 void
 krt_do_scan(struct krt_proto *p UNUSED)	/* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
 {
@@ -1290,11 +1376,16 @@ krt_do_scan(struct krt_proto *p UNUSED)	/* CONFIG_ALL_TABLES_AT_ONCE => p is NUL
       log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
 
   nl_request_dump(AF_INET6, RTM_GETROUTE);
+
+  krt_scan_notify_begin(p);
+
   while (h = nl_get_scan())
     if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
       nl_parse_route(h, 1);
     else
       log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
+
+  krt_scan_notify_end(p);
 }
 
 /*
diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c
index a15d00e..ae93ce4 100644
--- a/sysdep/unix/krt.c
+++ b/sysdep/unix/krt.c
@@ -658,8 +658,8 @@ krt_same_dest(rte *k, rte *e)
  *  We expect that the route is a temporary rte and its attributes are uncached.
  */
 
-void
-krt_got_route(struct krt_proto *p, rte *e)
+static void
+krt_got_route_collected(struct krt_proto *p, rte *e)
 {
   net *net = e->net;
   int verdict;
@@ -735,7 +735,8 @@ krt_got_route(struct krt_proto *p, rte *e)
       /* Get a cached copy of attributes and temporarily link the route */
       rta *a = e->attrs;
       a->source = RTS_DUMMY;
-      e->attrs = rta_lookup(a);
+      if (!rta_is_cached(a))
+          e->attrs = rta_lookup(a);
       e->next = net->routes;
       net->routes = e;
     }
@@ -743,6 +744,143 @@ krt_got_route(struct krt_proto *p, rte *e)
     rte_free(e);
 }
 
+/* Merge the rte list headed by e into one route via rt_merge_list(), using p's main_ahook */
+static rte *krt_mp_collect_postprocess(struct krt_proto *p, rte *e)
+{
+  return rt_merge_list(p->p.main_ahook, e);
+}
+
+/* Return 1 if e may take part in multipath collecting, i.e. its destination
+ * type is RTD_ROUTER or RTD_DEVICE; 0 otherwise. p is not consulted. */
+static int
+krt_mp_is_collectable(struct krt_proto *p, rte *e)
+{
+  if (e->attrs->dest == RTD_ROUTER || e->attrs->dest == RTD_DEVICE)
+    return 1;
+
+  return 0;
+}
+
+/* Return 1 when e1 and e2 may be merged into one multipath route: both pass
+ * rte_is_valid(), their pref values are equal and their attributes share the
+ * same src->proto->proto; 0 otherwise. p is not consulted. */
+static int
+krt_mp_is_mergable(struct krt_proto *p, rte *e1, rte *e2)
+{
+  if (!(rte_is_valid(e1) && rte_is_valid(e2)))
+    return 0;
+
+  if (e1->pref != e2->pref)
+    return 0;
+
+  return e1->attrs->src->proto->proto == e2->attrs->src->proto->proto;
+}
+
+/* Try to append e to the collected list headed by mp_collect_rte.
+ * Returns 0 when e was linked at the tail (its attributes are looked up in
+ * the cache first if they were not cached), -1 when e is for another net, is
+ * not collectable or cannot be merged with the head; e is then left as is. */
+static int
+krt_mp_collect_add(struct krt_proto *p, rte *mp_collect_rte, rte *e)
+{
+  struct rte *tail = mp_collect_rte;
+
+  if ((mp_collect_rte->net != e->net) ||
+      !krt_mp_is_collectable(p, e) ||
+      !krt_mp_is_mergable(p, mp_collect_rte, e))
+    return -1;
+
+  if (!rta_is_cached(e->attrs))
+    e->attrs = rta_lookup(e->attrs);
+
+  /* Walk to the last element of the list */
+  while (tail->next)
+    tail = tail->next;
+
+  tail->next = e;
+  e->next = NULL;
+
+  return 0;
+}
+
+/* Feed one scanned route into the multipath collector: e is queued while it
+ * fits the pending list; otherwise that list is merged and passed on first,
+ * and e then starts a new pending list or is passed on directly. */
+void
+krt_mp_collect(struct krt_proto *p, rte *e)
+{
+  if (p->mp_collect_rte)
+    {
+      if (!krt_mp_collect_add(p, p->mp_collect_rte, e))
+        {
+          krt_trace_in(p, e, "collecting[add]");
+          return;
+        }
+      rte *cur = krt_mp_collect_postprocess(p, p->mp_collect_rte);
+      p->mp_collect_rte = NULL;
+      krt_trace_in(p, cur, "collected");
+      krt_got_route_collected(p, cur);
+    }
+
+  ASSERT(!p->mp_collect_rte);
+  if (krt_mp_is_collectable(p, e))
+    {
+      if (!rta_is_cached(e->attrs))  /* same guard as krt_mp_collect_add() */
+        e->attrs = rta_lookup(e->attrs);
+      e->next = NULL;
+      p->mp_collect_rte = e;
+      krt_trace_in(p, e, "collecting");
+      return;
+    }
+  krt_got_route_collected(p, e);
+}
+
+void krt_got_route_begin(struct krt_proto *p)  /* Switch p into multipath collect mode */
+{
+  DBG("KRT: mp_collect: begin for proto (%s)\n", p->p.name);
+  ASSERT(!p->mp_collect_mode);  /* collect mode must not already be active */
+  p->mp_collect_mode = 1;  /* krt_got_route() checks this flag to divert routes to krt_mp_collect() */
+}
+
+/*
+ * Leave multipath collect mode for p. A list that is still pending is merged
+ * by krt_mp_collect_postprocess() and passed to krt_got_route_collected();
+ * with nothing pending only the mode flag is cleared.
+ */
+void krt_got_route_end(struct krt_proto *p)
+{
+  rte *pending, *merged;
+
+  DBG("KRT: mp_collect: end for proto (%s)\n", p->p.name);
+  ASSERT(p->mp_collect_mode);
+  p->mp_collect_mode = 0;
+
+  pending = p->mp_collect_rte;
+  if (pending == NULL)
+    {
+      DBG("KRT: mp_collect: no collected entry on end\n");
+      return;
+    }
+
+  p->mp_collect_rte = NULL;
+  merged = krt_mp_collect_postprocess(p, pending);
+  krt_trace_in(p, merged, "collected[end]");
+  krt_got_route_collected(p, merged);
+  DBG("KRT: mp_collect: route collected on end\n");
+}
+
+/* Dispatch route e for protocol p: while p->mp_collect_mode is set the route
+ * goes through krt_mp_collect(), otherwise it is processed immediately by
+ * krt_got_route_collected(). */
+void
+krt_got_route(struct krt_proto *p, rte *e)
+{
+  if (!p->mp_collect_mode)
+    krt_got_route_collected(p, e);
+  else
+    krt_mp_collect(p, e);
+}
+
 static void
 krt_prune(struct krt_proto *p)
 {
diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h
index f05dc37..3a1781f 100644
--- a/sysdep/unix/krt.h
+++ b/sysdep/unix/krt.h
@@ -69,6 +69,9 @@ struct krt_proto {
   byte ready;			/* Initial feed has been finished */
   byte initialized;		/* First scan has been finished */
   byte reload;			/* Next scan is doing reload */
+  byte mp_collect_mode;		/* Collecting multipath entries from single-path */
+
+  rte *mp_collect_rte;
 };
 
 extern pool *krt_pool;
@@ -82,6 +85,8 @@ extern pool *krt_pool;
 
 struct proto_config * kif_init_config(int class);
 void kif_request_scan(void);
+void krt_got_route_begin(struct krt_proto *p);
+void krt_got_route_end(struct krt_proto *p);
 void krt_got_route(struct krt_proto *p, struct rte *e);
 void krt_got_route_async(struct krt_proto *p, struct rte *e, int new);
 
-- 
2.5.0



More information about the Bird-users mailing list