[PATCH] RFC: Add netlink KRT dump filters on Linux

Ondrej Zajicek santiago at crfreenet.org
Fri Jan 14 23:17:08 CET 2022


On Mon, Jan 10, 2022 at 11:47:57PM +0100, Tomas Hlavacek wrote:
> Add netlink KRT dump filter on Linux to avoid PMTU cache records from FNHE
> table dump along with KRT.
> 
> Linux Kernel added FNHE table dump to the netlink API in patch
>  https://patchwork.ozlabs.org/project/netdev/patch/8d3b68cd37fb5fddc470904cdd6793fcf480c6c1.1561131177.git.sbrivio@redhat.com/
> 
> The filter mitigates the risk of receiving unknown and potentially large
> number of FNHE records that would block BIRD I/O in each sync. There is a
> known issue caused by the GRE tunnels on Linux that seems to be creating
> one FNHE record for each destination IP address that is routed through the
> tunnel, even when the PMTU equals to GRE interface MTU (tested with kernel
> 5.5 - 5.16-rc7).

Thanks, merged with some modifications:

https://gitlab.nic.cz/labs/bird/-/commit/e818f16448e918ed07633480291283f3449dd9e4

Instead of switching NETLINK_GET_STRICT_CHK on and off, I just used strict
checking for all dumps (including link and address).


Also, removed the SO_SNDBUF/SO_RCVBUF change. That seems unrelated and
has some issues:

1) Why these values? 32k for SO_SNDBUF is smaller than the default value
(208k), so it in fact makes the buffer smaller (which probably does not
matter), while 1M for SO_RCVBUF is bigger than the max value, so it is capped
at 416k.

2) It applies just to nl_scan and nl_req, and not to the async fd, where it
would make the most sense.

3) We may want a big rx buffer for the async fd; in that case we may consider
using SO_RCVBUFFORCE.

I am not sure which netlink socket operations are really synchronous or
subject to flow control, and therefore do not need a big buffer.



> ---
>  sysdep/linux/netlink.c | 44 +++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 41 insertions(+), 3 deletions(-)
> 
> diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
> index f85bcf35..79414122 100644
> --- a/sysdep/linux/netlink.c
> +++ b/sysdep/linux/netlink.c
> @@ -128,7 +128,7 @@ struct nl_sock
>    uint last_size;
>  };
>  
> -#define NL_RX_SIZE 8192
> +#define NL_RX_SIZE 32768
>  
>  #define NL_OP_DELETE	0
>  #define NL_OP_ADD	(NLM_F_CREATE|NLM_F_EXCL)
> @@ -143,11 +143,18 @@ static struct nl_sock nl_req  = {.fd = -1};	/* Netlink socket for requests */
>  static void
>  nl_open_sock(struct nl_sock *nl)
>  {
> +  int sndbuf = 32768;
> +  int rcvbuf = 1024*1024;
> +
>    if (nl->fd < 0)
>      {
> -      nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
> +      nl->fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
>        if (nl->fd < 0)
>  	die("Unable to open rtnetlink socket: %m");
> +
> +      setsockopt(nl->fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf));
> +      setsockopt(nl->fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
> +
>        nl->seq = (u32) (current_time() TO_S); /* Or perhaps random_u32() ? */
>        nl->rx_buffer = xmalloc(NL_RX_SIZE);
>        nl->last_hdr = NULL;
> @@ -155,6 +162,12 @@ nl_open_sock(struct nl_sock *nl)
>      }
>  }
>  
> +static void
> +nl_set_strict_dump(struct nl_sock *nl, int strict)
> +{
> +  setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
> +}
> +
>  static void
>  nl_open(void)
>  {
> @@ -192,6 +205,29 @@ nl_request_dump(int af, int cmd)
>    nl_send(&nl_scan, &req.nh);
>  }
>  
> +static void
> +nl_request_dump_rt(int af, int cmd)
> +{
> +  struct {
> +    struct nlmsghdr nh;
> +    struct rtmsg rtm;
> +    char buf[128];
> +  } req = {
> +    .nh.nlmsg_type = cmd,
> +    .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
> +    .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
> +    .nh.nlmsg_seq = ++(nl_scan.seq),
> +    .nh.nlmsg_pid = 0,
> +    .rtm.rtm_protocol = RTPROT_UNSPEC,
> +    .rtm.rtm_family = af
> +    /* .rtm.rtm_flags is defaults to zero, hence RTM_F_CLONED is not set */
> +  };
> +
> +  send(nl_scan.fd, &req, sizeof(req), 0);
> +  nl_scan.last_hdr = NULL;
> +}
> +
> +
>  static struct nlmsghdr *
>  nl_get_reply(struct nl_sock *nl)
>  {
> @@ -1864,13 +1900,15 @@ krt_do_scan(struct krt_proto *p UNUSED)	/* CONFIG_ALL_TABLES_AT_ONCE => p is NUL
>    struct nl_parse_state s;
>  
>    nl_parse_begin(&s, 1);
> -  nl_request_dump(AF_UNSPEC, RTM_GETROUTE);
> +  nl_set_strict_dump(&nl_scan, 1);
> +  nl_request_dump_rt(AF_UNSPEC, RTM_GETROUTE);
>    while (h = nl_get_scan())
>      if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
>        nl_parse_route(&s, h);
>      else
>        log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
>    nl_parse_end(&s);
> +  nl_set_strict_dump(&nl_scan, 0);
>  }
>  
>  /*
> -- 
> 2.25.1

-- 
Elen sila lumenn' omentielvo

Ondrej 'Santiago' Zajicek (email: santiago at crfreenet.org)
OpenPGP encrypted e-mails preferred (KeyID 0x11DEADC3, wwwkeys.pgp.net)
"To err is human -- to blame it on a computer is even more so."


More information about the Bird-users mailing list