[PATCH v2 1/2] Netlink: Drop ECMP route splitting hacks

Daniel Gröber dxld at darkboxed.org
Thu May 26 14:16:34 CEST 2022


This removes the hacky IPv6 ECMP route merging/splitting code that was
needed to support older kernel versions. Consequently, the minimum required
Linux version is raised to 4.11 for reliable operation.
---
 sysdep/linux/netlink.c | 218 +++++++----------------------------------
 1 file changed, 33 insertions(+), 185 deletions(-)

diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index 29b744cb..eca681f9 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -78,49 +78,6 @@
 
 const int rt_default_ecmp = 16;
 
-/*
- * Structure nl_parse_state keeps state of received route processing. Ideally,
- * we could just independently parse received Netlink messages and immediately
- * propagate received routes to the rest of BIRD, but older Linux kernel (before
- * version 4.11) represents and announces IPv6 ECMP routes not as one route with
- * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of
- * routes with the same prefix. More recent kernels work as with IPv4.
- *
- * Therefore, BIRD keeps currently processed route in nl_parse_state structure
- * and postpones its propagation until we expect it to be final; i.e., when
- * non-matching route is received or when the scan ends. When another matching
- * route is received, it is merged with the already processed route to form an
- * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the
- * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6
- * routes with RTA_MULTIPATH set are just considered non-matching.
- *
- * This is ignored for asynchronous notifications (every notification is handled
- * as a separate route). It is not an issue for our routes, as we ignore such
- * notifications anyways. But importing alien IPv6 ECMP routes does not work
- * properly with older kernels.
- *
- * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes
- * for the same prefix.
- */
-
-struct nl_parse_state
-{
-  struct linpool *pool;
-  int scan;
-  int merge;
-
-  net *net;
-  rta *attrs;
-  struct krt_proto *proto;
-  s8 new;
-  s8 krt_src;
-  u8 krt_type;
-  u8 krt_proto;
-  u32 krt_metric;
-
-  u32 rta_flow;		/* Used during parsing */
-};
-
 /*
  *	Synchronous Netlink interface
  */
@@ -761,7 +718,7 @@ nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, e
 }
 
 static struct nexthop *
-nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src)
+nl_parse_multipath(struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src, u32 *rta_flow)
 {
   struct rtattr *a[BIRD_RTA_MAX];
   struct rtnexthop *nh = RTA_DATA(ra);
@@ -780,7 +737,7 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr
       if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD))
 	goto next;
 
-      *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE);
+      *last = rv = lp_allocz(nl_linpool, NEXTHOP_MAX_SIZE);
       last = &(rv->next);
 
       rv->weight = nh->rtnh_hops;
@@ -824,7 +781,7 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr
 	rv->gw = rta_get_ipa(a[RTA_GATEWAY]);
 
       if (a[RTA_FLOW])
-	s->rta_flow = rta_get_u32(a[RTA_FLOW]);
+	*rta_flow = rta_get_u32(a[RTA_FLOW]);
 
 #ifdef HAVE_MPLS_KERNEL
       if (a[RTA_VIA])
@@ -1480,36 +1437,13 @@ static inline int
 nl_add_rte(struct krt_proto *p, rte *e)
 {
   rta *a = e->attrs;
-  int err = 0;
-
-  if (krt_ecmp6(p) && a->nh.next)
-  {
-    struct nexthop *nh = &(a->nh);
-
-    err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh);
-    if (err < 0)
-      return err;
-
-    for (nh = nh->next; nh; nh = nh->next)
-      err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh);
-
-    return err;
-  }
-
   return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh));
 }
 
 static inline int
 nl_delete_rte(struct krt_proto *p, rte *e)
 {
-  int err = 0;
-
-  /* For IPv6, we just repeatedly request DELETE until we get error */
-  do
-    err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL);
-  while (krt_ecmp6(p) && !err);
-
-  return err;
+  return nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL);
 }
 
 static inline int
@@ -1559,67 +1493,11 @@ krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old)
   }
 }
 
-static int
-nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family)
-{
-  /* Route merging is used for IPv6 scans */
-  if (!s->scan || (rtm_family != AF_INET6))
-    return 0;
-
-  /* Saved and new route must have same network, proto/table, and priority */
-  if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority))
-    return 0;
-
-  /* Both must be regular unicast routes */
-  if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST))
-    return 0;
-
-  return 1;
-}
-
-static void
-nl_announce_route(struct nl_parse_state *s)
-{
-  rte *e = rte_get_temp(s->attrs);
-  e->net = s->net;
-  e->u.krt.src = s->krt_src;
-  e->u.krt.proto = s->krt_proto;
-  e->u.krt.seen = 0;
-  e->u.krt.best = 0;
-  e->u.krt.metric = s->krt_metric;
-
-  if (s->scan)
-    krt_got_route(s->proto, e);
-  else
-    krt_got_route_async(s->proto, e, s->new);
-
-  s->net = NULL;
-  s->attrs = NULL;
-  s->proto = NULL;
-  lp_flush(s->pool);
-}
-
-static inline void
-nl_parse_begin(struct nl_parse_state *s, int scan)
-{
-  memset(s, 0, sizeof (struct nl_parse_state));
-  s->pool = nl_linpool;
-  s->scan = scan;
-}
-
-static inline void
-nl_parse_end(struct nl_parse_state *s)
-{
-  if (s->net)
-    nl_announce_route(s);
-}
-
-
 #define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0)
 #define SKIP(ARG, ...)  do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0)
 
 static void
-nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
+nl_parse_route(struct nlmsghdr *h, int scan)
 {
   struct krt_proto *p;
   struct rtmsg *i;
@@ -1632,6 +1510,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
   u32 priority = 0;
   u32 def_scope = RT_SCOPE_UNIVERSE;
   int krt_src;
+  u32 rta_flow;
 
   if (!(i = nl_checkin(h, sizeof(*i))))
     return;
@@ -1707,7 +1586,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
   if (i->rtm_tos != 0)			/* We don't support TOS */
     SKIP("TOS %02x\n", i->rtm_tos);
 
-  if (s->scan && !new)
+  if (scan && !new)
     SKIP("RTM_DELROUTE in scan\n");
 
   if (a[RTA_PRIORITY])
@@ -1731,7 +1610,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
       return;
 
     case RTPROT_BIRD:
-      if (!s->scan)
+      if (!scan)
 	SKIP("echo\n");
       krt_src = KRT_SRC_BIRD;
       break;
@@ -1751,18 +1630,15 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
 
   net *net = net_get(p->p.main_channel->table, n);
 
-  if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family))
-    nl_announce_route(s);
-
-  rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
+  rta *ra = lp_allocz(nl_linpool, RTA_MAX_SIZE);
   ra->src = p->p.main_source;
   ra->source = RTS_INHERIT;
   ra->scope = SCOPE_UNIVERSE;
 
   if (a[RTA_FLOW])
-    s->rta_flow = rta_get_u32(a[RTA_FLOW]);
+    rta_flow = rta_get_u32(a[RTA_FLOW]);
   else
-    s->rta_flow = 0;
+    rta_flow = 0;
 
   switch (i->rtm_type)
     {
@@ -1771,7 +1647,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
 
       if (a[RTA_MULTIPATH])
         {
-	  struct nexthop *nh = nl_parse_multipath(s, p, n, a[RTA_MULTIPATH], i->rtm_family, krt_src);
+	  struct nexthop *nh = nl_parse_multipath(p, n, a[RTA_MULTIPATH], i->rtm_family, krt_src, &rta_flow);
 	  if (!nh)
 	    SKIP("strange RTA_MULTIPATH\n");
 
@@ -1859,7 +1735,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
 
   if (i->rtm_scope != def_scope)
     {
-      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
+      ea_list *ea = lp_alloc(nl_linpool, sizeof(ea_list) + sizeof(eattr));
       ea->next = ra->eattrs;
       ra->eattrs = ea;
       ea->flags = EALF_SORTED;
@@ -1874,7 +1750,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
     {
       ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
 
-      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
+      ea_list *ea = lp_alloc(nl_linpool, sizeof(ea_list) + sizeof(eattr));
       ea->next = ra->eattrs;
       ra->eattrs = ea;
       ea->flags = EALF_SORTED;
@@ -1883,7 +1759,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
       ea->attrs[0].flags = 0;
       ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
 
-      struct adata *ad = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps));
+      struct adata *ad = lp_alloc(nl_linpool, sizeof(struct adata) + sizeof(ps));
       ad->length = sizeof(ps);
       memcpy(ad->data, &ps, sizeof(ps));
 
@@ -1891,9 +1767,9 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
     }
 
   /* Can be set per-route or per-nexthop */
-  if (s->rta_flow)
+  if (rta_flow)
     {
-      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
+      ea_list *ea = lp_alloc(nl_linpool, sizeof(ea_list) + sizeof(eattr));
       ea->next = ra->eattrs;
       ra->eattrs = ea;
       ea->flags = EALF_SORTED;
@@ -1901,13 +1777,13 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
       ea->attrs[0].id = EA_KRT_REALM;
       ea->attrs[0].flags = 0;
       ea->attrs[0].type = EAF_TYPE_INT;
-      ea->attrs[0].u.data = s->rta_flow;
+      ea->attrs[0].u.data = rta_flow;
     }
 
   if (a[RTA_METRICS])
     {
       u32 metrics[KRT_METRICS_MAX];
-      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
+      ea_list *ea = lp_alloc(nl_linpool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
       int t, n = 0;
 
       if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
@@ -1936,59 +1812,35 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
     }
 
   /*
-   * Ideally, now we would send the received route to the rest of kernel code.
-   * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we
-   * postpone it and merge next hops until the end of the sequence. Note that
-   * when doing merging of next hops, we expect the new route to be unipath.
-   * Otherwise, we ignore additional next hops in nexthop_insert().
+   * Send the received route to the rest of kernel code.
    */
+  rte *e = rte_get_temp(ra);
+  e->net = net;
+  e->u.krt.src = krt_src;
+  e->u.krt.proto = i->rtm_protocol;
+  e->u.krt.seen = 0;
+  e->u.krt.best = 0;
+  e->u.krt.metric = priority;
 
-  if (!s->net)
-  {
-    /* Store the new route */
-    s->net = net;
-    s->attrs = ra;
-    s->proto = p;
-    s->new = new;
-    s->krt_src = krt_src;
-    s->krt_type = i->rtm_type;
-    s->krt_proto = i->rtm_protocol;
-    s->krt_metric = priority;
-  }
+  if (scan)
+    krt_got_route(p, e);
   else
-  {
-    /* Merge next hops with the stored route */
-    rta *oa = s->attrs;
-
-    struct nexthop *nhs = &oa->nh;
-    nexthop_insert(&nhs, &ra->nh);
-
-    /* Perhaps new nexthop is inserted at the first position */
-    if (nhs == &ra->nh)
-    {
-      /* Swap rtas */
-      s->attrs = ra;
+    krt_got_route_async(p, e, new);
 
-      /* Keep old eattrs */
-      ra->eattrs = oa->eattrs;
-    }
-  }
+  lp_flush(nl_linpool);
 }
 
 void
 krt_do_scan(struct krt_proto *p UNUSED)	/* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
 {
   struct nlmsghdr *h;
-  struct nl_parse_state s;
 
-  nl_parse_begin(&s, 1);
   nl_request_dump_route(AF_UNSPEC);
   while (h = nl_get_scan())
     if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
-      nl_parse_route(&s, h);
+      nl_parse_route(h, /*scan=*/1);
     else
       log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
-  nl_parse_end(&s);
 }
 
 /*
@@ -2003,16 +1855,12 @@ static struct config *nl_last_config;	/* For tracking changes to nl_async_bufsiz
 static void
 nl_async_msg(struct nlmsghdr *h)
 {
-  struct nl_parse_state s;
-
   switch (h->nlmsg_type)
     {
     case RTM_NEWROUTE:
     case RTM_DELROUTE:
       DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
-      nl_parse_begin(&s, 0);
-      nl_parse_route(&s, h);
-      nl_parse_end(&s);
+      nl_parse_route(h, /*scan=*/0);
       break;
     case RTM_NEWLINK:
     case RTM_DELLINK:
-- 
2.30.2



More information about the Bird-users mailing list