[PATCH] IPv6 ECMP support fixes for linux

Mikhail Sennikovskii mikhail.sennikovskii at profitbricks.com
Wed Feb 10 16:38:04 CET 2016


The API for configuring ECMP for IPv6 on Linux is not symmetrical.
Routes can be set via the multipath structures, but Linux kernel
splits this up into separate routes internally.
As a result, ECMP routes are retorned as separate independent
routes when queried.
This patch works around this issue by making bird collect
individual routes for the same destination in one multipath route.
It also implements deletion of multipath routes as a set of
delete operations for each route entry.
Learn mode is still not supported for now.

Signed-off-by: Mikhail Sennikovskii <mikhail.sennikovskii at profitbricks.com>
---
 nest/route.h           |   1 +
 nest/rt-table.c        |  29 ++++++++++
 sysdep/linux/netlink.c |  79 ++++++++++++++++++++++++++-
 sysdep/unix/krt.c      | 144 +++++++++++++++++++++++++++++++++++++++++++++++--
 sysdep/unix/krt.h      |   5 ++
 5 files changed, 254 insertions(+), 4 deletions(-)

diff --git a/nest/route.h b/nest/route.h
index c435b9e..c24d7ac 100644
--- a/nest/route.h
+++ b/nest/route.h
@@ -271,6 +271,7 @@ static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2
 void rte_discard(rtable *tab, rte *old);
 int rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter *filter);
 rte *rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, struct ea_list **tmpa, int silent);
+rte *rt_merge_list(struct announce_hook *ah, rte *e);
 void rt_refresh_begin(rtable *t, struct announce_hook *ah);
 void rt_refresh_end(rtable *t, struct announce_hook *ah);
 void rte_dump(rte *);
diff --git a/nest/rt-table.c b/nest/rt-table.c
index 57c8b8e..a3437da 100644
--- a/nest/rt-table.c
+++ b/nest/rt-table.c
@@ -597,6 +597,35 @@ mpnh_merge_rta(struct mpnh *nhs, rta *a, int max)
 }
 
 rte *
+rt_merge_list(struct announce_hook *ah, rte *e)
+{
+  struct mpnh *nhs = NULL;
+  rte *cur = e, *next, *ret;
+
+  if (!e->next)
+      return e;
+
+  for (; cur; cur = next)
+    {
+      next = cur->next;
+      /* sanity */
+      cur->next = NULL;
+      nhs = mpnh_merge_rta(nhs, cur->attrs, ah->proto->merge_limit);
+      if (cur != e)
+          rte_free(cur);
+    }
+
+  ret = rte_cow_rta(e, rte_update_pool);
+  ret->attrs->dest = RTD_MULTIPATH;
+  ret->attrs->nexthops = nhs;
+
+  if (e != ret)
+    rte_free(e);
+
+  return ret;
+}
+
+rte *
 rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tmpa, int silent)
 {
   // struct proto *p = ah->proto;
diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index 640d187..31e833c 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -896,6 +896,37 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new)
   return nl_exchange(&r.h);
 }
 
+#ifdef IPV6
+static void
+krt_del_rte_multipath(struct krt_proto *p, rte *old)
+{
+  rta *a = old->attrs;
+  struct mpnh *nh;
+  rte *e;
+  int err;
+  rta ra = {
+    .src= p->p.main_source,
+    .source = RTS_INHERIT,
+    .scope = SCOPE_UNIVERSE,
+    .cast = RTC_UNICAST
+  };
+
+  e = rte_get_temp(&ra);
+
+  for (nh = a->nexthops; nh; nh = nh->next)
+    {
+      ra.gw = nh->gw;
+      ra.iface = nh->iface;
+
+      err = nl_send_route(p, old, NULL, 0);
+      if (err < 0)
+        DBG("deleting route failed %d\n", err);
+    }
+
+  rte_free(e);
+}
+#endif
+
 void
 krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs)
 {
@@ -909,7 +940,14 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list
    */
 
   if (old)
-    nl_send_route(p, old, NULL, 0);
+    {
+#ifdef IPV6
+      if (old->attrs->dest == RTD_MULTIPATH)
+        krt_del_rte_multipath(p, old);
+      else
+#endif
+        nl_send_route(p, old, NULL, 0);
+    }
 
   if (new)
     err = nl_send_route(p, new, eattrs, 1);
@@ -1178,17 +1216,56 @@ nl_parse_route(struct nlmsghdr *h, int scan)
     krt_got_route_async(p, e, new);
 }
 
+static void
+krt_scan_notify_begin(struct krt_proto *p)
+{
+#ifdef IPV6
+  if (p)
+    krt_got_route_begin(p);
+  else
+    {
+      HASH_WALK(nl_table_map, sys.hash_next, cp)
+        {
+          krt_got_route_begin(cp);
+        }
+      HASH_WALK_END;
+    }
+#endif
+}
+
+static void
+krt_scan_notify_end(struct krt_proto *p)
+{
+#ifdef IPV6
+  if (p)
+    krt_got_route_end(p);
+  else
+    {
+      HASH_WALK(nl_table_map, sys.hash_next, cp)
+        {
+          krt_got_route_end(cp);
+        }
+      HASH_WALK_END;
+    }
+#endif
+}
+
 void
 krt_do_scan(struct krt_proto *p UNUSED)	/* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
 {
   struct nlmsghdr *h;
 
   nl_request_dump(BIRD_AF, RTM_GETROUTE);
+
+  krt_scan_notify_begin(p);
+
   while (h = nl_get_scan())
     if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
       nl_parse_route(h, 1);
     else
       log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
+
+  krt_scan_notify_end(p);
 }
 
 /*
diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c
index 5e78586..1a6ec90 100644
--- a/sysdep/unix/krt.c
+++ b/sysdep/unix/krt.c
@@ -660,8 +660,8 @@ krt_same_dest(rte *k, rte *e)
  *  We expect that the route is a temporary rte and its attributes are uncached.
  */
 
-void
-krt_got_route(struct krt_proto *p, rte *e)
+static void
+krt_got_route_collected(struct krt_proto *p, rte *e)
 {
   net *net = e->net;
   int verdict;
@@ -737,7 +737,8 @@ krt_got_route(struct krt_proto *p, rte *e)
       /* Get a cached copy of attributes and temporarily link the route */
       rta *a = e->attrs;
       a->source = RTS_DUMMY;
-      e->attrs = rta_lookup(a);
+      if (!rta_is_cached(a))
+          e->attrs = rta_lookup(a);
       e->next = net->routes;
       net->routes = e;
     }
@@ -745,6 +746,143 @@ krt_got_route(struct krt_proto *p, rte *e)
     rte_free(e);
 }
 
+static rte *
+krt_mp_collect_postprocess(struct krt_proto *p, rte *e)
+{
+  return rt_merge_list(p->p.main_ahook, e);
+}
+
+static int
+krt_mp_is_collectable(struct krt_proto *p, rte *e)
+{
+  struct rta *a = e->attrs;
+
+  if (a->dest != RTD_ROUTER && a->dest != RTD_DEVICE)
+      return 0;
+
+  return 1;
+}
+
+static int
+krt_mp_is_mergable(struct krt_proto *p, rte *e1, rte *e2)
+{
+  if (!rte_is_valid(e1) || !rte_is_valid(e2))
+    return 0;
+
+  if (e1->pref != e2->pref)
+    return 0;
+
+  if (e1->attrs->src->proto->proto != e2->attrs->src->proto->proto)
+    return 0;
+
+  return 1;
+}
+
+static int
+krt_mp_collect_add(struct krt_proto *p, rte *mp_collect_rte, rte *e)
+{
+  struct rte *last;
+  if (mp_collect_rte->net != e->net)
+      return -1;
+
+  if (!krt_mp_is_collectable(p, e))
+      return -1;
+
+  if (!krt_mp_is_mergable(p, mp_collect_rte, e))
+      return -1;
+
+  rta *a = e->attrs;
+  if (!rta_is_cached(a))
+      e->attrs = rta_lookup(a);
+
+  last = mp_collect_rte;
+
+  for ( ; last->next; last = last->next);
+
+  last->next = e;
+  e->next = NULL;
+
+  return 0;
+}
+
+void
+krt_mp_collect(struct krt_proto *p, rte *e)
+{
+  if (p->mp_collect_rte)
+    {
+      if (!krt_mp_collect_add(p, p->mp_collect_rte, e))
+        {
+          krt_trace_in(p, e, "collecting[add]");
+          return;
+        }
+
+      rte *cur = NULL;
+
+      cur = krt_mp_collect_postprocess(p, p->mp_collect_rte);
+      p->mp_collect_rte = NULL;
+      krt_trace_in(p, cur, "collected");
+      krt_got_route_collected(p, cur);
+    }
+
+    ASSERT(!p->mp_collect_rte);
+    if (krt_mp_is_collectable(p, e))
+      {
+        e->attrs = rta_lookup(e->attrs);
+        e->next = NULL;
+        p->mp_collect_rte = e;
+        krt_trace_in(p, e, "collecting");
+        return;
+      }
+
+    krt_got_route_collected(p, e);
+}
+
+void krt_got_route_begin(struct krt_proto *p)
+{
+  DBG("KRT: mp_collect: begin for proto (%s)\n", p->p.name);
+  ASSERT(!p->mp_collect_mode);
+  p->mp_collect_mode = 1;
+}
+
+void krt_got_route_end(struct krt_proto *p)
+{
+  DBG("KRT: mp_collect: end for proto (%s)\n", p->p.name);
+
+  ASSERT(p->mp_collect_mode);
+
+  p->mp_collect_mode = 0;
+
+  rte *mp_collect_rte = p->mp_collect_rte;
+
+  if (!mp_collect_rte)
+    {
+      DBG("KRT: mp_collect: no collected entry on end\n");
+      return;
+    }
+
+  p->mp_collect_rte = NULL;
+
+  mp_collect_rte = krt_mp_collect_postprocess(p, mp_collect_rte);
+
+  krt_trace_in(p, mp_collect_rte, "collected[end]");
+
+  krt_got_route_collected(p, mp_collect_rte);
+
+  DBG("KRT: mp_collect: route collected on end\n");
+}
+
+void
+krt_got_route(struct krt_proto *p, rte *e)
+{
+  if (p->mp_collect_mode)
+    {
+      krt_mp_collect(p, e);
+      return;
+    }
+
+  krt_got_route_collected(p, e);
+}
+
 static void
 krt_prune(struct krt_proto *p)
 {
diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h
index d4a8717..7d46888 100644
--- a/sysdep/unix/krt.h
+++ b/sysdep/unix/krt.h
@@ -68,6 +68,9 @@ struct krt_proto {
   byte ready;			/* Initial feed has been finished */
   byte initialized;		/* First scan has been finished */
   byte reload;			/* Next scan is doing reload */
+  byte mp_collect_mode;		/* Collecting multipath entries from single-path */
+
+  rte *mp_collect_rte;
 };
 
 extern pool *krt_pool;
@@ -81,6 +84,8 @@ extern pool *krt_pool;
 
 struct proto_config * kif_init_config(int class);
 void kif_request_scan(void);
+void krt_got_route_begin(struct krt_proto *p);
+void krt_got_route_end(struct krt_proto *p);
 void krt_got_route(struct krt_proto *p, struct rte *e);
 void krt_got_route_async(struct krt_proto *p, struct rte *e, int new);
 
-- 
2.5.0



More information about the Bird-users mailing list