[PATCH 1/1] * Implement correct RFC4271 MED comparison
Alexander V. Chernikov
melifaro at ipfw.ru
Fri Nov 18 02:53:09 CET 2011
---
filter/filter.c | 6 +
nest/proto-hooks.c | 3 +-
nest/protocol.h | 6 +-
nest/route.h | 8 ++
nest/rt-table.c | 58 +++++++++--
proto/bgp/attrs.c | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++-
proto/bgp/bgp.c | 4 +-
proto/bgp/bgp.h | 9 ++-
proto/ospf/ospf.c | 4 +-
proto/rip/rip.c | 4 +-
10 files changed, 374 insertions(+), 17 deletions(-)
diff --git a/filter/filter.c b/filter/filter.c
index b3b17dc..77af39a 100644
--- a/filter/filter.c
+++ b/filter/filter.c
@@ -935,6 +935,12 @@ interpret(struct f_inst *what)
if (v1.type != T_PATH)
runtime( "Setting path attribute to non-path value" );
l->attrs[0].u.ptr = v1.val.ad;
+ /*
+ * Zero cache on AS-PATH change.
+ * XXX: This breaks COW concept but
+ * removing this flag is harmless
+ */
+ (*f_rte)->pflags &= ~0x01;
break;
case EAF_TYPE_INT_SET:
if (v1.type != T_CLIST)
diff --git a/nest/proto-hooks.c b/nest/proto-hooks.c
index f026192..1dec95c 100644
--- a/nest/proto-hooks.c
+++ b/nest/proto-hooks.c
@@ -271,6 +271,7 @@ int import_control(struct proto *p, rte **e, ea_list **attrs, struct linpool *po
* rte_better - compare metrics of two routes
* @new: the new route
* @old: the original route
+ * @flags: protocol-specific comparison flags
*
* This hook gets called when the routing table contains two routes
* for the same network which have originated from different instances
@@ -279,7 +280,7 @@ int import_control(struct proto *p, rte **e, ea_list **attrs, struct linpool *po
*
* Result: 1 if @new is better (more preferred) than @old, 0 otherwise.
*/
-int rte_better(rte *new, rte *old)
+int rte_better(rte *new, rte *old, u32 flags)
{ DUMMY; }
/**
diff --git a/nest/protocol.h b/nest/protocol.h
index a7518c2..02defb4 100644
--- a/nest/protocol.h
+++ b/nest/protocol.h
@@ -178,13 +178,15 @@ struct proto {
/*
* Routing entry hooks (called only for rte's belonging to this protocol):
*
- * rte_better Compare two rte's and decide which one is better (1=first, 0=second).
+ * rte_best Find best rte starting from given.
+ * rte_better Compare two rte's and decide if better (1=first, 0=second).
* rte_same Compare two rte's and decide whether they are identical (1=yes, 0=no).
* rte_insert Called whenever a rte is inserted to a routing table.
* rte_remove Called whenever a rte is removed from the routing table.
*/
- int (*rte_better)(struct rte *, struct rte *);
+ struct rte *(*rte_best)(struct rte *);
+ int (*rte_better)(struct rte *, struct rte *, u32 flags);
int (*rte_same)(struct rte *, struct rte *);
void (*rte_insert)(struct network *, struct rte *);
void (*rte_remove)(struct network *, struct rte *);
diff --git a/nest/route.h b/nest/route.h
index a4c0154..31a7a56 100644
--- a/nest/route.h
+++ b/nest/route.h
@@ -201,6 +201,12 @@ typedef struct rte {
u32 router_id; /* Router that originated this route */
} ospf;
#endif
+#ifdef CONFIG_BGP
+ struct {
+ u32 first_as; /* First as number in AS-PATH or local AS */
+ struct rte *next_item; /* Used in bgp_get_best */
+ } bgp;
+#endif
struct { /* Routes generated by krt sync (both temporary and inherited ones) */
s8 src; /* Alleged route source (see krt.h) */
u8 proto; /* Kernel source protocol ID */
@@ -212,6 +218,8 @@ typedef struct rte {
} rte;
#define REF_COW 1 /* Copy this rte on write */
+/* Protocol-specific flags */
+#define PFR_FLAG1 0x01 /* Definition of first possible flag */
/* Types of route announcement, also used as flags */
#define RA_OPTIMAL 1 /* Announcement of optimal route change */
diff --git a/nest/rt-table.c b/nest/rt-table.c
index e20d2f6..574711a 100644
--- a/nest/rt-table.c
+++ b/nest/rt-table.c
@@ -135,10 +135,23 @@ rte_do_cow(rte *r)
return e;
}
-static int /* Actually better or at least as good as */
+/**
+ * rte_better - returns 1 if new rte is better
+ * @new: new rte
+ * @old: old BEST rte
+ *
+ * Functions compares old and new rte returning 1 IFF new rte is better.
+ * Best rte in @old->net is required to be specified in @old if @old and @new are
+ * originated from same protocol type. Specifying different rte can produce
+ * incorrect calculations in some protocols (see bgp_rte_better)
+ *
+ * Returns 1 if @new is better, 0 overwise
+ */
+static int
rte_better(rte *new, rte *old)
{
- int (*better)(rte *, rte *);
+ int (*better)(rte *, rte *, u32);
+ struct proto *p_old, *p_new;
if (!old)
return 1;
@@ -146,17 +159,19 @@ rte_better(rte *new, rte *old)
return 1;
if (new->pref < old->pref)
return 0;
- if (new->attrs->proto->proto != old->attrs->proto->proto)
+ p_old = old->attrs->proto;
+ p_new = new->attrs->proto;
+ if (p_new->proto != p_old->proto)
{
/*
* If the user has configured protocol preferences, so that two different protocols
* have the same preference, try to break the tie by comparing addresses. Not too
* useful, but keeps the ordering of routes unambiguous.
*/
- return new->attrs->proto->proto > old->attrs->proto->proto;
+ return p_new->proto > p_old->proto;
}
- if (better = new->attrs->proto->rte_better)
- return better(new, old);
+ if (better = p_new->rte_better)
+ return better(new, old, 0);
return 0;
}
@@ -426,9 +441,11 @@ static void
rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte *new, ea_list *tmpa)
{
struct proto_stats *stats = &p->stats;
+ struct proto *rte_proto;
rte *old_best = net->routes;
rte *old = NULL;
- rte **k, *r, *s;
+ rte **k, *r, *s, *t;
+ u32 proto_mask, attr_class;
#ifdef CONFIG_PIPE
if (proto_is_pipe(p) && (p->table == table))
@@ -526,9 +543,36 @@ rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte
/* Find new optimal route */
r = NULL;
+ proto_mask = 0;
for (s=net->routes; s; s=s->next)
+ {
+ rte_proto = s->attrs->proto;
+ attr_class = rte_proto->proto->attr_class;
+ /* Protocol has non-zero attr_class and rte_best hook */
+ if (attr_class && rte_proto->rte_best)
+ {
+ /* Check if this is first hook invocation */
+ if (!((2 << attr_class) & proto_mask))
+ {
+ proto_mask |= 2 << attr_class;
+ /*
+ * Find best route from protocol point of view
+ * and compare it with current best route if exists
+ */
+ t = rte_proto->rte_best(s);
+ if (!r || rte_better(r, t))
+ r = t;
+ }
+ /*
+ * Protocol has already supplied its best route.
+ * Skip this rte
+ */
+ continue;
+ }
+
if (rte_better(s, r))
r = s;
+ }
/* Announce optimal route */
rte_announce(table, RA_OPTIMAL, net, r, old_best, tmpa);
diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c
index 2832f42..0b417df 100644
--- a/proto/bgp/attrs.c
+++ b/proto/bgp/attrs.c
@@ -1118,7 +1118,7 @@ rte_resolvable(rte *rt)
}
int
-bgp_rte_better(rte *new, rte *old)
+bgp_rte_better(rte *new, rte *old, u32 flags UNUSED)
{
struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->proto;
struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->proto;
@@ -1236,6 +1236,293 @@ bgp_rte_better(rte *new, rte *old)
return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0);
}
+
+
+static inline u32
+bgp_get_neighbor_as(rte *s)
+{
+ u32 first_as;
+
+ if (s->flags & PFR_BGP_ASCACHE)
+ return s->u.bgp.first_as;
+
+ first_as = bgp_get_neighbor(s);
+ s->u.bgp.first_as = first_as;
+ s->pflags |= PFR_BGP_ASCACHE;
+
+ return first_as;
+}
+
+/**
+ * bgp_rte_better_med - Determine best route
+ * @new: new rte
+ * @old: old BEST rte in @old->net
+ *
+ * Function implements RFC 4271 best route selection
+ * with so-called 'deterministic' MED comparison.
+ * If @new and @old are announced from different neighbours
+ * @old is required to be the best route.
+ *
+ * Returns: 1 if @new is better, 0 overwise
+ */
+int
+bgp_rte_better_med(rte *new, rte *old, u32 flags)
+{
+ struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->proto;
+ struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->proto;
+ struct bgp_config *new_config, *old_config;
+ ea_list *new_eattrs = new->attrs->eattrs;
+ ea_list *old_eattrs = old->attrs->eattrs;
+ eattr *x, *y;
+ u32 n, o;
+ u32 new_as, old_as, tmp_as;
+ rte *s, *r;
+
+ /* RFC 4271 9.1.2.1. Route resolvability test */
+ n = rte_resolvable(new);
+ o = rte_resolvable(old);
+ if (n > o)
+ return 1;
+ if (n < o)
+ return 0;
+
+ /* Start with local preferences */
+ x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
+ y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
+ n = x ? x->u.data : new_bgp->cf->default_local_pref;
+ o = y ? y->u.data : old_bgp->cf->default_local_pref;
+ if (n > o)
+ return 1;
+ if (n < o)
+ return 0;
+
+ /* RFC 4271 9.1.2.2. a) Use AS path lengths */
+ if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths)
+ {
+ x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
+ y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
+ n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
+ o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
+ if (n < o)
+ return 1;
+ if (n > o)
+ return 0;
+ }
+
+ /* RFC 4271 9.1.2.2. b) Use origins */
+ x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
+ y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
+ n = x ? x->u.data : ORIGIN_INCOMPLETE;
+ o = y ? y->u.data : ORIGIN_INCOMPLETE;
+ if (n < o)
+ return 1;
+ if (n > o)
+ return 0;
+
+ /* RFC 4271 9.1.2.2. c) Compare MED's */
+ /*
+ * This is a bit tricky.
+ *
+ * First we retrieve/update neighbor ASNs from rte cache.
+ * If @old and @new ASNs are the same, we can immediately
+ * compare MED. If not, we need to compare best routes
+ * from both ASNs. Since @old is the best route already nothing
+ * needs to be done with @old. However, we need to scan all
+ * routes for given network (another place where @old is required
+ * to be the best (and the first) route) to get best rte in
+ * @new neighbor ASN. This changes complexity from O(1) to O(N).
+ * If found rte is not @new, we simply returns 0
+ * since @old is the best rte of all routes. Overwise, we
+ * continue to compare @old and @new skippind MED comparison.
+ */
+ /* Get neighbor info first as && update cache if nesessary */
+ old_as = bgp_get_neighbor_as(old);
+ new_as = bgp_get_neighbor_as(new);
+
+ new_config = new_bgp->cf;
+ old_config = old_bgp->cf;
+
+ if (new_as == old_as)
+ {
+ /* We can compare MED */
+ x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
+ y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
+ n = x ? x->u.data : new_config->default_med;
+ o = y ? y->u.data : old_config->default_med;
+ if (n < o)
+ return 1;
+ if (n > o)
+ return 0;
+ }
+ else
+ {
+ /*
+ * routes came from different neighbor ASses.
+ * We need to find best route within routes
+ * from new_as
+ */
+ r = new;
+ for (s = old; s; s = s->next)
+ {
+ /* Get neighbor ASN */
+ tmp_as = bgp_get_neighbor_as(s);
+ if (tmp_as != new_as)
+ continue;
+
+ /*
+ * Okay, we've found another rte from the same neighbor.
+ * Let's check it.
+ */
+ if (bgp_rte_better_med(s, r, 0))
+ r = s;
+ }
+
+ /*
+ * Best route for neighbor routes is saved in r.
+ * If new is not the best route in the group this
+ * means that current best route remains best.
+ */
+ if (r != new)
+ return 0;
+ }
+
+ /* RFC 4271 9.1.2.2. d) Prefer external peers */
+ if (new_bgp->is_internal > old_bgp->is_internal)
+ return 0;
+ if (new_bgp->is_internal < old_bgp->is_internal)
+ return 1;
+
+ /* RFC 4271 9.1.2.2. e) Compare IGP metrics */
+ n = new_config->igp_metric ? new->attrs->igp_metric : 0;
+ o = old_config->igp_metric ? old->attrs->igp_metric : 0;
+ if (n < o)
+ return 1;
+ if (n > o)
+ return 0;
+
+ /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */
+ /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighor ID */
+ x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
+ y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
+ n = x ? x->u.data : new_bgp->remote_id;
+ o = y ? y->u.data : old_bgp->remote_id;
+
+ /* RFC 5004 - prefer older routes */
+ /* (if both are external and from different peer) */
+ if ((new_config->prefer_older || old_config->prefer_older) &&
+ !new_bgp->is_internal && n != o)
+ return 0;
+
+ /* rest of RFC 4271 9.1.2.2. f) */
+ if (n < o)
+ return 1;
+ if (n > o)
+ return 0;
+
+ /* RFC 4456 9. b) Compare cluster list lengths */
+ x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
+ y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
+ n = x ? int_set_get_size(x->u.ptr) : 0;
+ o = y ? int_set_get_size(y->u.ptr) : 0;
+ if (n < o)
+ return 1;
+ if (n > o)
+ return 0;
+
+ /* RFC 4271 9.1.2.2. g) Compare peer IP adresses */
+ return (ipa_compare(new_config->remote_ip, old_config->remote_ip) < 0);
+}
+
+#define LOCAL_HASH 16
+/**
+ * bgp_rte_best - returns the best rte in chain originated by BGP
+ *
+ */
+rte *
+bgp_rte_best(rte *old)
+{
+ rte **rgroups, *s, *ss, *best = NULL;
+ u32 local_as, best_as;
+ int i;
+
+ if (!old->next)
+ return old;
+
+ /* Allocate hash for ASn groups */
+ rgroups = alloca(LOCAL_HASH * sizeof(rte *));
+ memset(rgroups, 0, LOCAL_HASH * sizeof(rte *));
+
+ for (s = old; s; s = s->next)
+ {
+ local_as = bgp_get_neighbor_as(s);
+ best = rgroups[local_as & (LOCAL_HASH - 1)];
+ if (!best)
+ {
+ /* New group */
+ rgroups[local_as & (LOCAL_HASH - 1)] = s;
+ s->u.bgp.next_item = NULL;
+ continue;
+ }
+
+ ss = best;
+ best_as = bgp_get_neighbor_as(ss);
+ while (ss->u.bgp.next_item)
+ {
+ if ((best_as = bgp_get_neighbor_as(ss)) == local_as)
+ break;
+ /* Hash collision */
+ ss = ss->u.bgp.next_item;
+ }
+
+ /* List ended */
+ if (best_as == local_as)
+ {
+ /* Both rte are from the same group, let's compare */
+ if (bgp_rte_better_med(s, best, 0))
+ {
+ /* New best rte in group */
+ rgroups[local_as & (LOCAL_HASH - 1)] = s;
+ s->u.bgp.next_item = best->u.bgp.next_item;
+ }
+
+ continue;
+ }
+
+ /* New group */
+ ss->u.bgp.next_item = s;
+ s->u.bgp.next_item = NULL;
+ }
+
+ /* Cycle ended, now we have to compare all per-group best rtes */
+ for (i = 0, best = NULL; i < LOCAL_HASH; i++)
+ {
+ for (s = rgroups[i]; s; s = s->u.bgp.next_item)
+ {
+ if (!best)
+ {
+ best = s;
+ continue;
+ }
+
+ /* Do general preference checking */
+ if (best->pref > s->pref)
+ continue;
+ if (s->pref > best->pref)
+ {
+ best = s;
+ continue;
+ }
+
+ /* Do full bgp-specific checking without MED comparison */
+ if (bgp_rte_better_med(s, best, BGP_CFLAG_SKIPMED))
+ best = s;
+ }
+ }
+
+ return best;
+}
+#undef LOCAL_HASH
+
static struct adata *
bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool)
{
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index 675342d..a277680 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -904,7 +904,9 @@ bgp_init(struct proto_config *C)
P->accept_ra_types = RA_OPTIMAL;
P->rt_notify = bgp_rt_notify;
- P->rte_better = bgp_rte_better;
+ //P->rte_better = bgp_rte_better;
+ P->rte_better = bgp_rte_better_med;
+ P->rte_best = bgp_rte_best;
P->import_control = bgp_import_control;
P->neigh_notify = bgp_neigh_notify;
P->reload_routes = bgp_reload_routes;
diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h
index 437ba33..70e1ff9 100644
--- a/proto/bgp/bgp.h
+++ b/proto/bgp/bgp.h
@@ -184,7 +184,9 @@ void bgp_attach_attr(struct ea_list **to, struct linpool *pool, unsigned attr, u
byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned attr, unsigned len);
struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, unsigned int len, struct linpool *pool, int mandatory);
int bgp_get_attr(struct eattr *e, byte *buf, int buflen);
-int bgp_rte_better(struct rte *, struct rte *);
+int bgp_rte_better(struct rte *, struct rte *, u32);
+int bgp_rte_better_med(struct rte *, struct rte *, u32);
+rte *bgp_rte_best(rte *old);
void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs);
int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *);
void bgp_attr_init(struct bgp_proto *);
@@ -241,6 +243,11 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
#define BA_AS4_PATH 0x11 /* [RFC4893] */
#define BA_AS4_AGGREGATOR 0x12
+/* Protocol-specififc rte flags */
+#define PFR_BGP_ASCACHE PFR_FLAG1 /* 0x01 Set if bgp.first_as rte field is valid */
+/* Protocol-specific rte comparison flags */
+#define BGP_CFLAG_SKIPMED 0x01 /* Skip MED when comparing rte from different neigbour ASNs */
+
/* BGP connection states */
#define BS_IDLE 0
diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c
index ce7ad37..12b209f 100644
--- a/proto/ospf/ospf.c
+++ b/proto/ospf/ospf.c
@@ -105,7 +105,7 @@
static int ospf_reload_routes(struct proto *p);
static void ospf_rt_notify(struct proto *p, struct rtable *table UNUSED, net * n, rte * new, rte * old UNUSED, ea_list * attrs);
-static int ospf_rte_better(struct rte *new, struct rte *old);
+static int ospf_rte_better(struct rte *new, struct rte *old, u32 flags UNUSED);
static int ospf_rte_same(struct rte *new, struct rte *old);
static void ospf_disp(timer *timer);
@@ -314,7 +314,7 @@ ospf_init(struct proto_config *c)
/* If new is better return 1 */
static int
-ospf_rte_better(struct rte *new, struct rte *old)
+ospf_rte_better(struct rte *new, struct rte *old, u32 flags UNUSED)
{
if (new->u.ospf.metric1 == LSINFINITY)
return 0;
diff --git a/proto/rip/rip.c b/proto/rip/rip.c
index 543aa30..6967c68 100644
--- a/proto/rip/rip.c
+++ b/proto/rip/rip.c
@@ -265,7 +265,7 @@ rip_rte_update_if_better(rtable *tab, net *net, struct proto *p, rte *new)
rte *old;
old = rte_find(net, p);
- if (!old || p->rte_better(new, old) ||
+ if (!old || p->rte_better(new, old, 0) ||
(ipa_equal(old->attrs->from, new->attrs->from) &&
(old->u.rip.metric != new->u.rip.metric)) )
rte_update(tab, net, p, p, new);
@@ -909,7 +909,7 @@ rip_rte_same(struct rte *new, struct rte *old)
static int
-rip_rte_better(struct rte *new, struct rte *old)
+rip_rte_better(struct rte *new, struct rte *old, u32 flags UNUSED)
{
struct proto *p = new->attrs->proto;
--
1.7.3.2
--------------070507060406060400060500--
More information about the Bird-users
mailing list