Message ID | 160277680864.157904.8719768977907736015.stgit@toke.dk (mailing list archive) |
---|---|
State | RFC |
Delegated to: | BPF |
Headers | show |
Series | bpf: Rework bpf_redirect_neigh() to allow supplying nexthop from caller | expand |
On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote: > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index bf5a99d803e4..980cc1363be8 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -3677,15 +3677,19 @@ union bpf_attr { > * Return > * The id is returned or 0 in case the id could not be retrieved. > * > - * long bpf_redirect_neigh(u32 ifindex, u64 flags) > + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) why not fold ifindex into params? with params and plen this should be extensible later if needed. A couple of nits below that caught me eye. > * Description > * Redirect the packet to another net device of index *ifindex* > * and fill in L2 addresses from neighboring subsystem. This helper > * is somewhat similar to **bpf_redirect**\ (), except that it > * populates L2 addresses as well, meaning, internally, the helper > - * performs a FIB lookup based on the skb's networking header to > - * get the address of the next hop and then relies on the neighbor > - * lookup for the L2 address of the nexthop. > + * relies on the neighbor lookup for the L2 address of the nexthop. > + * > + * The helper will perform a FIB lookup based on the skb's > + * networking header to get the address of the next hop, unless > + * this is supplied by the caller in the *params* argument. The > + * *plen* argument indicates the len of *params* and should be set > + * to 0 if *params* is NULL. > * > * The *flags* argument is reserved and must be 0. The helper is > * currently only supported for tc BPF program types, and enabled > @@ -4906,6 +4910,17 @@ struct bpf_fib_lookup { > __u8 dmac[6]; /* ETH_ALEN */ > }; > > +struct bpf_redir_neigh { > + /* network family for lookup (AF_INET, AF_INET6) > + */ second line for the comment is not needed. > + __u8 nh_family; > + /* network address of nexthop; skips fib lookup to find gateway */ > + union { > + __be32 ipv4_nh; > + __u32 ipv6_nh[4]; /* in6_addr; network order */ > + }; > +}; > + > enum bpf_task_fd_type { > BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ > BPF_FD_TYPE_TRACEPOINT, /* tp name */ > diff --git a/net/core/filter.c b/net/core/filter.c > index c5e2a1c5fd8d..d073031a3a61 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2165,12 +2165,11 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, > } > > #if IS_ENABLED(CONFIG_IPV6) > -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, > + struct net_device *dev, const struct in6_addr *nexthop) > { > - struct dst_entry *dst = skb_dst(skb); > - struct net_device *dev = dst->dev; > u32 hh_len = LL_RESERVED_SPACE(dev); > - const struct in6_addr *nexthop; > + struct dst_entry *dst = NULL; > struct neighbour *neigh; > > if (dev_xmit_recursion()) { > @@ -2196,8 +2195,11 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > } > > rcu_read_lock_bh(); > - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > - &ipv6_hdr(skb)->daddr); > + if (!nexthop) { > + dst = skb_dst(skb); > + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > + &ipv6_hdr(skb)->daddr); > + } > neigh = ip_neigh_gw6(dev, nexthop); > if (likely(!IS_ERR(neigh))) { > int ret; > @@ -2210,36 +2212,46 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > return ret; > } > rcu_read_unlock_bh(); > - IP6_INC_STATS(dev_net(dst->dev), > - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > + if (dst) > + IP6_INC_STATS(dev_net(dst->dev), > + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > out_drop: > kfree_skb(skb); > return -ENETDOWN; > } > > -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) > +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, > + struct bpf_nh_params *nh) > { > const struct ipv6hdr *ip6h = ipv6_hdr(skb); > + struct in6_addr *nexthop = NULL; > struct net *net = dev_net(dev); > int err, ret = NET_XMIT_DROP; > - struct dst_entry *dst; > - struct flowi6 fl6 = { > - .flowi6_flags = FLOWI_FLAG_ANYSRC, > - .flowi6_mark = skb->mark, > - .flowlabel = ip6_flowinfo(ip6h), > - .flowi6_oif = dev->ifindex, > - .flowi6_proto = ip6h->nexthdr, > - .daddr = ip6h->daddr, > - .saddr = ip6h->saddr, > - }; > > - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); > - if (IS_ERR(dst)) > - goto out_drop; > + if (!nh->nh_family) { > + struct dst_entry *dst; reverse xmas tree ordering > + struct flowi6 fl6 = { > + .flowi6_flags = FLOWI_FLAG_ANYSRC, > + .flowi6_mark = skb->mark, > + .flowlabel = ip6_flowinfo(ip6h), > + .flowi6_oif = dev->ifindex, > + .flowi6_proto = ip6h->nexthdr, > + .daddr = ip6h->daddr, > + .saddr = ip6h->saddr, > + }; > + > + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); > + if (IS_ERR(dst)) > + goto out_drop; > > - skb_dst_set(skb, dst); > + skb_dst_set(skb, dst); > + } else if (nh->nh_family == AF_INET6) { > + nexthop = &nh->ipv6_nh; > + } else { > + goto out_drop; > + } > > - err = bpf_out_neigh_v6(net, skb); > + err = bpf_out_neigh_v6(net, skb, dev, nexthop); > if (unlikely(net_xmit_eval(err))) > dev->stats.tx_errors++; > else
David Ahern <dsahern@gmail.com> writes: > On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote: >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index bf5a99d803e4..980cc1363be8 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -3677,15 +3677,19 @@ union bpf_attr { >> * Return >> * The id is returned or 0 in case the id could not be retrieved. >> * >> - * long bpf_redirect_neigh(u32 ifindex, u64 flags) >> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) > > why not fold ifindex into params? with params and plen this should be > extensible later if needed. Figured this way would make it easier to run *without* the params (like in the existing examples). But don't feel strongly about it, let's see what Daniel thinks. > A couple of nits below that caught me eye. Thanks, will fix; the kernel bot also found a sparse warning, so I guess I need to respin anyway (but waiting for Daniel's comments and/or instructions on what tree to properly submit this to). -Toke
On 10/15/20 9:34 PM, Toke Høiland-Jørgensen wrote: > David Ahern <dsahern@gmail.com> writes: >> On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote: >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>> index bf5a99d803e4..980cc1363be8 100644 >>> --- a/include/uapi/linux/bpf.h >>> +++ b/include/uapi/linux/bpf.h >>> @@ -3677,15 +3677,19 @@ union bpf_attr { >>> * Return >>> * The id is returned or 0 in case the id could not be retrieved. >>> * >>> - * long bpf_redirect_neigh(u32 ifindex, u64 flags) >>> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) >> >> why not fold ifindex into params? with params and plen this should be >> extensible later if needed. > > Figured this way would make it easier to run *without* the params (like > in the existing examples). But don't feel strongly about it, let's see > what Daniel thinks. My preference is what Toke has here, this simplifies use by just being able to call bpf_redirect_neigh(ifindex, NULL, 0, 0) when just single external facing device is used. >> A couple of nits below that caught me eye. > > Thanks, will fix; the kernel bot also found a sparse warning, so I guess > I need to respin anyway (but waiting for Daniel's comments and/or > instructions on what tree to properly submit this to). Given API change, lets do bpf. (Will review the rest later today.)
Daniel Borkmann <daniel@iogearbox.net> writes: > On 10/15/20 9:34 PM, Toke Høiland-Jørgensen wrote: >> David Ahern <dsahern@gmail.com> writes: >>> On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote: >>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>>> index bf5a99d803e4..980cc1363be8 100644 >>>> --- a/include/uapi/linux/bpf.h >>>> +++ b/include/uapi/linux/bpf.h >>>> @@ -3677,15 +3677,19 @@ union bpf_attr { >>>> * Return >>>> * The id is returned or 0 in case the id could not be retrieved. >>>> * >>>> - * long bpf_redirect_neigh(u32 ifindex, u64 flags) >>>> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) >>> >>> why not fold ifindex into params? with params and plen this should be >>> extensible later if needed. >> >> Figured this way would make it easier to run *without* the params (like >> in the existing examples). But don't feel strongly about it, let's see >> what Daniel thinks. > > My preference is what Toke has here, this simplifies use by just being able to > call bpf_redirect_neigh(ifindex, NULL, 0, 0) when just single external facing > device is used. > >>> A couple of nits below that caught me eye. >> >> Thanks, will fix; the kernel bot also found a sparse warning, so I guess >> I need to respin anyway (but waiting for Daniel's comments and/or >> instructions on what tree to properly submit this to). > > Given API change, lets do bpf. (Will review the rest later today.) Right, ACK. I'll wait for your review, then resubmit against the bpf tree :) -Toke
On 10/15/20 5:46 PM, Toke Høiland-Jørgensen wrote: > From: Toke Høiland-Jørgensen <toke@redhat.com> > > Based on the discussion in [0], update the bpf_redirect_neigh() helper to > accept an optional parameter specifying the nexthop information. This makes > it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without > incurring a duplicate FIB lookup - since the FIB lookup helper will return > the nexthop information even if no neighbour is present, this can simply be > passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns > BPF_FIB_LKUP_RET_NO_NEIGH. > > [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/ > > Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com> Overall looks good from what I can tell, just small nits below on top of David's feedback: [...] > -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) > +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, > + struct bpf_nh_params *nh) > { > const struct iphdr *ip4h = ip_hdr(skb); > struct net *net = dev_net(dev); > int err, ret = NET_XMIT_DROP; > - struct rtable *rt; > - struct flowi4 fl4 = { > - .flowi4_flags = FLOWI_FLAG_ANYSRC, > - .flowi4_mark = skb->mark, > - .flowi4_tos = RT_TOS(ip4h->tos), > - .flowi4_oif = dev->ifindex, > - .flowi4_proto = ip4h->protocol, > - .daddr = ip4h->daddr, > - .saddr = ip4h->saddr, > - }; > > - rt = ip_route_output_flow(net, &fl4, NULL); > - if (IS_ERR(rt)) > - goto out_drop; > - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { > - ip_rt_put(rt); > - goto out_drop; > - } > + if (!nh->nh_family) { > + struct rtable *rt; > + struct flowi4 fl4 = { > + .flowi4_flags = FLOWI_FLAG_ANYSRC, > + .flowi4_mark = skb->mark, > + .flowi4_tos = RT_TOS(ip4h->tos), > + .flowi4_oif = dev->ifindex, > + .flowi4_proto = ip4h->protocol, > + .daddr = ip4h->daddr, > + .saddr = ip4h->saddr, > + }; > + > + rt = ip_route_output_flow(net, &fl4, NULL); > + if (IS_ERR(rt)) > + goto out_drop; > + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { > + ip_rt_put(rt); > + goto out_drop; > + } > > - skb_dst_set(skb, &rt->dst); > + skb_dst_set(skb, &rt->dst); > + nh = NULL; > + } > > - err = bpf_out_neigh_v4(net, skb); > + err = bpf_out_neigh_v4(net, skb, dev, nh); > if (unlikely(net_xmit_eval(err))) > dev->stats.tx_errors++; > else > @@ -2355,7 +2383,8 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) > } > #endif /* CONFIG_INET */ > > -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) > +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, > + struct bpf_nh_params *nh) > { > struct ethhdr *ethh = eth_hdr(skb); > > @@ -2370,9 +2399,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) > skb_reset_network_header(skb); > > if (skb->protocol == htons(ETH_P_IP)) > - return __bpf_redirect_neigh_v4(skb, dev); > + return __bpf_redirect_neigh_v4(skb, dev, nh); > else if (skb->protocol == htons(ETH_P_IPV6)) > - return __bpf_redirect_neigh_v6(skb, dev); > + return __bpf_redirect_neigh_v6(skb, dev, nh); > out: > kfree_skb(skb); > return -ENOTSUPP; > @@ -2455,8 +2484,8 @@ int skb_do_redirect(struct sk_buff *skb) > return -EAGAIN; > } > return flags & BPF_F_NEIGH ? > - __bpf_redirect_neigh(skb, dev) : > - __bpf_redirect(skb, dev, flags); > + __bpf_redirect_neigh(skb, dev, &ri->nh) : > + __bpf_redirect(skb, dev, flags); > out_drop: > kfree_skb(skb); > return -EINVAL; > @@ -2504,16 +2533,23 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { > .arg2_type = ARG_ANYTHING, > }; > > -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) > +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, > + int, plen, u64, flags) > { > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > > - if (unlikely(flags)) > + if (unlikely((plen && plen < sizeof(*params)) || flags)) > return TC_ACT_SHOT; > > ri->flags = BPF_F_NEIGH; > ri->tgt_index = ifindex; > > + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); > + if (plen) > + memcpy(&ri->nh, params, sizeof(ri->nh)); > + else > + ri->nh.nh_family = 0; /* clear previous value */ I'd probably just add an internal flag and do ... ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); ... instead of above clearing, and skb_do_redirect() then becomes: __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) ... which would then also avoid this !nh->nh_family check where you later on set nh = NULL to pass it onwards. > return TC_ACT_REDIRECT; > } >
Daniel Borkmann <daniel@iogearbox.net> writes: > On 10/15/20 5:46 PM, Toke Høiland-Jørgensen wrote: >> From: Toke Høiland-Jørgensen <toke@redhat.com> >> >> Based on the discussion in [0], update the bpf_redirect_neigh() helper to >> accept an optional parameter specifying the nexthop information. This makes >> it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without >> incurring a duplicate FIB lookup - since the FIB lookup helper will return >> the nexthop information even if no neighbour is present, this can simply be >> passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns >> BPF_FIB_LKUP_RET_NO_NEIGH. >> >> [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/ >> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com> > > Overall looks good from what I can tell, just small nits below on top of > David's feedback: > > [...] >> -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) >> +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, >> + struct bpf_nh_params *nh) >> { >> const struct iphdr *ip4h = ip_hdr(skb); >> struct net *net = dev_net(dev); >> int err, ret = NET_XMIT_DROP; >> - struct rtable *rt; >> - struct flowi4 fl4 = { >> - .flowi4_flags = FLOWI_FLAG_ANYSRC, >> - .flowi4_mark = skb->mark, >> - .flowi4_tos = RT_TOS(ip4h->tos), >> - .flowi4_oif = dev->ifindex, >> - .flowi4_proto = ip4h->protocol, >> - .daddr = ip4h->daddr, >> - .saddr = ip4h->saddr, >> - }; >> >> - rt = ip_route_output_flow(net, &fl4, NULL); >> - if (IS_ERR(rt)) >> - goto out_drop; >> - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { >> - ip_rt_put(rt); >> - goto out_drop; >> - } >> + if (!nh->nh_family) { >> + struct rtable *rt; >> + struct flowi4 fl4 = { >> + .flowi4_flags = FLOWI_FLAG_ANYSRC, >> + .flowi4_mark = skb->mark, >> + .flowi4_tos = RT_TOS(ip4h->tos), >> + .flowi4_oif = dev->ifindex, >> + .flowi4_proto = ip4h->protocol, >> + .daddr = ip4h->daddr, >> + .saddr = ip4h->saddr, >> + }; >> + >> + rt = ip_route_output_flow(net, &fl4, NULL); >> + if (IS_ERR(rt)) >> + goto out_drop; >> + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { >> + ip_rt_put(rt); >> + goto out_drop; >> + } >> >> - skb_dst_set(skb, &rt->dst); >> + skb_dst_set(skb, &rt->dst); >> + nh = NULL; >> + } >> >> - err = bpf_out_neigh_v4(net, skb); >> + err = bpf_out_neigh_v4(net, skb, dev, nh); >> if (unlikely(net_xmit_eval(err))) >> dev->stats.tx_errors++; >> else >> @@ -2355,7 +2383,8 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) >> } >> #endif /* CONFIG_INET */ >> >> -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) >> +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, >> + struct bpf_nh_params *nh) >> { >> struct ethhdr *ethh = eth_hdr(skb); >> >> @@ -2370,9 +2399,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) >> skb_reset_network_header(skb); >> >> if (skb->protocol == htons(ETH_P_IP)) >> - return __bpf_redirect_neigh_v4(skb, dev); >> + return __bpf_redirect_neigh_v4(skb, dev, nh); >> else if (skb->protocol == htons(ETH_P_IPV6)) >> - return __bpf_redirect_neigh_v6(skb, dev); >> + return __bpf_redirect_neigh_v6(skb, dev, nh); >> out: >> kfree_skb(skb); >> return -ENOTSUPP; >> @@ -2455,8 +2484,8 @@ int skb_do_redirect(struct sk_buff *skb) >> return -EAGAIN; >> } >> return flags & BPF_F_NEIGH ? >> - __bpf_redirect_neigh(skb, dev) : >> - __bpf_redirect(skb, dev, flags); >> + __bpf_redirect_neigh(skb, dev, &ri->nh) : >> + __bpf_redirect(skb, dev, flags); >> out_drop: >> kfree_skb(skb); >> return -EINVAL; >> @@ -2504,16 +2533,23 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { >> .arg2_type = ARG_ANYTHING, >> }; >> >> -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) >> +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, >> + int, plen, u64, flags) >> { >> struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); >> >> - if (unlikely(flags)) >> + if (unlikely((plen && plen < sizeof(*params)) || flags)) >> return TC_ACT_SHOT; >> >> ri->flags = BPF_F_NEIGH; >> ri->tgt_index = ifindex; >> >> + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); >> + if (plen) >> + memcpy(&ri->nh, params, sizeof(ri->nh)); >> + else >> + ri->nh.nh_family = 0; /* clear previous value */ > > I'd probably just add an internal flag and do ... > > ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); > > ... instead of above clearing, and skb_do_redirect() then becomes: > > __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) > > ... which would then also avoid this !nh->nh_family check where you later on > set nh = NULL to pass it onwards. Ah yes, excellent idea! Will fix :) -Toke
On 10/15/20 5:46 PM, Toke Høiland-Jørgensen wrote: [...] > +struct bpf_redir_neigh { > + /* network family for lookup (AF_INET, AF_INET6) > + */ > + __u8 nh_family; > + /* network address of nexthop; skips fib lookup to find gateway */ > + union { > + __be32 ipv4_nh; > + __u32 ipv6_nh[4]; /* in6_addr; network order */ > + }; > +}; > + > enum bpf_task_fd_type { > BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ > BPF_FD_TYPE_TRACEPOINT, /* tp name */ > diff --git a/net/core/filter.c b/net/core/filter.c > index c5e2a1c5fd8d..d073031a3a61 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2165,12 +2165,11 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, > } > > #if IS_ENABLED(CONFIG_IPV6) > -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, > + struct net_device *dev, const struct in6_addr *nexthop) > { > - struct dst_entry *dst = skb_dst(skb); > - struct net_device *dev = dst->dev; > u32 hh_len = LL_RESERVED_SPACE(dev); > - const struct in6_addr *nexthop; > + struct dst_entry *dst = NULL; > struct neighbour *neigh; > > if (dev_xmit_recursion()) { > @@ -2196,8 +2195,11 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > } > > rcu_read_lock_bh(); > - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > - &ipv6_hdr(skb)->daddr); > + if (!nexthop) { > + dst = skb_dst(skb); > + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > + &ipv6_hdr(skb)->daddr); > + } > neigh = ip_neigh_gw6(dev, nexthop); > if (likely(!IS_ERR(neigh))) { > int ret; > @@ -2210,36 +2212,46 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > return ret; > } > rcu_read_unlock_bh(); > - IP6_INC_STATS(dev_net(dst->dev), > - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > + if (dst) > + IP6_INC_STATS(dev_net(dst->dev), > + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > out_drop: > kfree_skb(skb); > return -ENETDOWN; > } > > -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) > +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, > + struct bpf_nh_params *nh) > { > const struct ipv6hdr *ip6h = ipv6_hdr(skb); > + struct in6_addr *nexthop = NULL; > struct net *net = dev_net(dev); > int err, ret = NET_XMIT_DROP; > - struct dst_entry *dst; > - struct flowi6 fl6 = { > - .flowi6_flags = FLOWI_FLAG_ANYSRC, > - .flowi6_mark = skb->mark, > - .flowlabel = ip6_flowinfo(ip6h), > - .flowi6_oif = dev->ifindex, > - .flowi6_proto = ip6h->nexthdr, > - .daddr = ip6h->daddr, > - .saddr = ip6h->saddr, > - }; > > - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); > - if (IS_ERR(dst)) > - goto out_drop; > + if (!nh->nh_family) { > + struct dst_entry *dst; > + struct flowi6 fl6 = { > + .flowi6_flags = FLOWI_FLAG_ANYSRC, > + .flowi6_mark = skb->mark, > + .flowlabel = ip6_flowinfo(ip6h), > + .flowi6_oif = dev->ifindex, > + .flowi6_proto = ip6h->nexthdr, > + .daddr = ip6h->daddr, > + .saddr = ip6h->saddr, nit: Would be good for readability to keep the previous whitespace alignment intact. > + }; > + > + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); > + if (IS_ERR(dst)) > + goto out_drop; > > - skb_dst_set(skb, dst); > + skb_dst_set(skb, dst); > + } else if (nh->nh_family == AF_INET6) { > + nexthop = &nh->ipv6_nh; > + } else { > + goto out_drop; > + } > > - err = bpf_out_neigh_v6(net, skb); > + err = bpf_out_neigh_v6(net, skb, dev, nexthop); I'd probably model the bpf_out_neigh_v{4,6}() as close as possible similar to each other in terms of args we pass etc. In the v6 case you pass the nexthop in6_addr directly whereas v4 passes bpf_nh_params, I'd probably also stick to the latter for v6 to keep it symmetric. > if (unlikely(net_xmit_eval(err))) > dev->stats.tx_errors++; > else > @@ -2260,11 +2272,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) > #endif /* CONFIG_IPV6 */ > > #if IS_ENABLED(CONFIG_INET) > -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) > +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, > + struct net_device *dev, struct bpf_nh_params *nh) > { > - struct dst_entry *dst = skb_dst(skb); > - struct rtable *rt = container_of(dst, struct rtable, dst); > - struct net_device *dev = dst->dev; > u32 hh_len = LL_RESERVED_SPACE(dev); > struct neighbour *neigh; > bool is_v6gw = false; > @@ -2292,7 +2302,20 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) > } > > rcu_read_lock_bh(); > - neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); > + if (!nh) { > + struct dst_entry *dst = skb_dst(skb); > + struct rtable *rt = container_of(dst, struct rtable, dst); > + > + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); > + } else if (nh->nh_family == AF_INET6) { > + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); > + is_v6gw = true; > + } else if (nh->nh_family == AF_INET) { > + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); > + } else { > + goto out_drop; > + } > + > if (likely(!IS_ERR(neigh))) { > int ret; >
diff --git a/include/linux/filter.h b/include/linux/filter.h index 20fc24c9779a..ba9de7188cd0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -607,12 +607,21 @@ struct bpf_skb_data_end { void *data_end; }; +struct bpf_nh_params { + u8 nh_family; + union { + __u32 ipv4_nh; + struct in6_addr ipv6_nh; + }; +}; + struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; struct bpf_map *map; u32 kern_flags; + struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf5a99d803e4..980cc1363be8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,17 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) + */ + __u8 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/net/core/filter.c b/net/core/filter.c index c5e2a1c5fd8d..d073031a3a61 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2165,12 +2165,11 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, + struct net_device *dev, const struct in6_addr *nexthop) { - struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); - const struct in6_addr *nexthop; + struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { @@ -2196,8 +2195,11 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), - &ipv6_hdr(skb)->daddr); + if (!nexthop) { + dst = skb_dst(skb); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; @@ -2210,36 +2212,46 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) return ret; } rcu_read_unlock_bh(); - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + if (dst) + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct in6_addr *nexthop = NULL; struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct dst_entry *dst; - struct flowi6 fl6 = { - .flowi6_flags = FLOWI_FLAG_ANYSRC, - .flowi6_mark = skb->mark, - .flowlabel = ip6_flowinfo(ip6h), - .flowi6_oif = dev->ifindex, - .flowi6_proto = ip6h->nexthdr, - .daddr = ip6h->daddr, - .saddr = ip6h->saddr, - }; - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); - if (IS_ERR(dst)) - goto out_drop; + if (!nh->nh_family) { + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; - skb_dst_set(skb, dst); + skb_dst_set(skb, dst); + } else if (nh->nh_family == AF_INET6) { + nexthop = &nh->ipv6_nh; + } else { + goto out_drop; + } - err = bpf_out_neigh_v6(net, skb); + err = bpf_out_neigh_v6(net, skb, dev, nexthop); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2260,11 +2272,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; @@ -2292,7 +2302,20 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + if (!nh) { + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + } else if (nh->nh_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); + is_v6gw = true; + } else if (nh->nh_family == AF_INET) { + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); + } else { + goto out_drop; + } + if (likely(!IS_ERR(neigh))) { int ret; @@ -2309,33 +2332,38 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) return -ENETDOWN; } -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct rtable *rt; - struct flowi4 fl4 = { - .flowi4_flags = FLOWI_FLAG_ANYSRC, - .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_oif = dev->ifindex, - .flowi4_proto = ip4h->protocol, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, - }; - rt = ip_route_output_flow(net, &fl4, NULL); - if (IS_ERR(rt)) - goto out_drop; - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { - ip_rt_put(rt); - goto out_drop; - } + if (!nh->nh_family) { + struct rtable *rt; + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } - skb_dst_set(skb, &rt->dst); + skb_dst_set(skb, &rt->dst); + nh = NULL; + } - err = bpf_out_neigh_v4(net, skb); + err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2355,7 +2383,8 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) } #endif /* CONFIG_INET */ -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); @@ -2370,9 +2399,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) - return __bpf_redirect_neigh_v4(skb, dev); + return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) - return __bpf_redirect_neigh_v6(skb, dev); + return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; @@ -2455,8 +2484,8 @@ int skb_do_redirect(struct sk_buff *skb) return -EAGAIN; } return flags & BPF_F_NEIGH ? - __bpf_redirect_neigh(skb, dev) : - __bpf_redirect(skb, dev, flags); + __bpf_redirect_neigh(skb, dev, &ri->nh) : + __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); return -EINVAL; @@ -2504,16 +2533,23 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, + int, plen, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; ri->flags = BPF_F_NEIGH; ri->tgt_index = ifindex; + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); + if (plen) + memcpy(&ri->nh, params, sizeof(ri->nh)); + else + ri->nh.nh_family = 0; /* clear previous value */ + return TC_ACT_REDIRECT; } @@ -2522,7 +2558,9 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 7d86fdd190be..6769caae142f 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -453,6 +453,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf5a99d803e4..980cc1363be8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,17 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) + */ + __u8 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */