[PATCH] RFC: Add netlink KRT dump filters on Linux
Ondrej Zajicek
santiago at crfreenet.org
Fri Jan 14 23:17:08 CET 2022
On Mon, Jan 10, 2022 at 11:47:57PM +0100, Tomas Hlavacek wrote:
> Add netlink KRT dump filter on Linux to avoid PMTU cache records from FNHE
> table dump along with KRT.
>
> Linux Kernel added FNHE table dump to the netlink API in patch
> https://patchwork.ozlabs.org/project/netdev/patch/8d3b68cd37fb5fddc470904cdd6793fcf480c6c1.1561131177.git.sbrivio@redhat.com/
>
> The filter mitigates the risk of receiving unknown and potentially large
> number of FNHE records that would block BIRD I/O in each sync. There is a
> known issue caused by the GRE tunnels on Linux that seems to be creating
> one FNHE record for each destination IP address that is routed through the
> tunnel, even when the PMTU equals to GRE interface MTU (tested with kernel
> 5.5 - 5.16-rc7).
Thanks, merged with some modifications:
https://gitlab.nic.cz/labs/bird/-/commit/e818f16448e918ed07633480291283f3449dd9e4
Instead of switching NETLINK_GET_STRICT_CHK on and off, I just enabled strict
checking for all dumps (including link and address dumps).
Also, removed the SO_SNDBUF/SO_RCVBUF change. That seems unrelated and
has some issues:
1) Why these values? 32k for SO_SNDBUF is smaller than the default value
(208k), so it in fact makes the buffer smaller (which probably does not
matter). Meanwhile, 1M for SO_RCVBUF is bigger than the max value, so it is
capped at 416k.
2) It applies only to nl_scan and nl_req, and not to the async fd, where it
would make the most sense.
3) We may want a big rx buffer for the async fd; in that case we may consider
using SO_RCVBUFFORCE.
I am not sure which netlink socket operations are really synchronous or
flow-controlled, i.e. for which of them a big buffer is not needed.
> ---
> sysdep/linux/netlink.c | 44 +++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 41 insertions(+), 3 deletions(-)
>
> diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
> index f85bcf35..79414122 100644
> --- a/sysdep/linux/netlink.c
> +++ b/sysdep/linux/netlink.c
> @@ -128,7 +128,7 @@ struct nl_sock
> uint last_size;
> };
>
> -#define NL_RX_SIZE 8192
> +#define NL_RX_SIZE 32768
>
> #define NL_OP_DELETE 0
> #define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL)
> @@ -143,11 +143,18 @@ static struct nl_sock nl_req = {.fd = -1}; /* Netlink socket for requests */
> static void
> nl_open_sock(struct nl_sock *nl)
> {
> + int sndbuf = 32768;
> + int rcvbuf = 1024*1024;
> +
> if (nl->fd < 0)
> {
> - nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
> + nl->fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
> if (nl->fd < 0)
> die("Unable to open rtnetlink socket: %m");
> +
> + setsockopt(nl->fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf));
> + setsockopt(nl->fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
> +
> nl->seq = (u32) (current_time() TO_S); /* Or perhaps random_u32() ? */
> nl->rx_buffer = xmalloc(NL_RX_SIZE);
> nl->last_hdr = NULL;
> @@ -155,6 +162,12 @@ nl_open_sock(struct nl_sock *nl)
> }
> }
>
> +static void
> +nl_set_strict_dump(struct nl_sock *nl, int strict)
> +{
> + setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
> +}
> +
> static void
> nl_open(void)
> {
> @@ -192,6 +205,29 @@ nl_request_dump(int af, int cmd)
> nl_send(&nl_scan, &req.nh);
> }
>
> +static void
> +nl_request_dump_rt(int af, int cmd)
> +{
> + struct {
> + struct nlmsghdr nh;
> + struct rtmsg rtm;
> + char buf[128];
> + } req = {
> + .nh.nlmsg_type = cmd,
> + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
> + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
> + .nh.nlmsg_seq = ++(nl_scan.seq),
> + .nh.nlmsg_pid = 0,
> + .rtm.rtm_protocol = RTPROT_UNSPEC,
> + .rtm.rtm_family = af
> + /* .rtm.rtm_flags is defaults to zero, hence RTM_F_CLONED is not set */
> + };
> +
> + send(nl_scan.fd, &req, sizeof(req), 0);
> + nl_scan.last_hdr = NULL;
> +}
> +
> +
> static struct nlmsghdr *
> nl_get_reply(struct nl_sock *nl)
> {
> @@ -1864,13 +1900,15 @@ krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NUL
> struct nl_parse_state s;
>
> nl_parse_begin(&s, 1);
> - nl_request_dump(AF_UNSPEC, RTM_GETROUTE);
> + nl_set_strict_dump(&nl_scan, 1);
> + nl_request_dump_rt(AF_UNSPEC, RTM_GETROUTE);
> while (h = nl_get_scan())
> if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
> nl_parse_route(&s, h);
> else
> log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
> nl_parse_end(&s);
> + nl_set_strict_dump(&nl_scan, 0);
> }
>
> /*
> --
> 2.25.1
--
Elen sila lumenn' omentielvo
Ondrej 'Santiago' Zajicek (email: santiago at crfreenet.org)
OpenPGP encrypted e-mails preferred (KeyID 0x11DEADC3, wwwkeys.pgp.net)
"To err is human -- to blame it on a computer is even more so."
More information about the Bird-users
mailing list