Message ID | 20230323042608.17573-1-vladimir@nikishkin.pw (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next,v5] vxlan: try to send a packet normally if local bypass fails | expand |
On Thu, Mar 23, 2023 at 12:26:08PM +0800, Vladimir Nikishkin wrote: > In vxlan_core, if an fdb entry is pointing to a local > address with some port, the system tries to get the packet to > deliver the packet to the vxlan directly, bypassing the network > stack. > > This patch makes it still try canonical delivery, if there is no > linux kernel vxlan listening on this port. This will be useful > for the cases when there is some userspace daemon expecting > vxlan packets for post-processing, or some other implementation > of vxlan. > > Signed-off-by: Vladimir Nikishkin <vladimir@nikishkin.pw> > --- > drivers/net/vxlan/vxlan_core.c | 34 ++++++++++++++++++++++++------ > include/net/vxlan.h | 4 +++- > include/uapi/linux/if_link.h | 1 + > tools/include/uapi/linux/if_link.h | 2 ++ > 4 files changed, 33 insertions(+), 8 deletions(-) > > diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c > index 561fe1b314f5..cef7a9aec24b 100644 > --- a/drivers/net/vxlan/vxlan_core.c > +++ b/drivers/net/vxlan/vxlan_core.c > @@ -2341,7 +2341,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, > union vxlan_addr *daddr, > __be16 dst_port, int dst_ifindex, __be32 vni, > struct dst_entry *dst, > - u32 rt_flags) > + u32 rt_flags, bool localbypass) > { > #if IS_ENABLED(CONFIG_IPV6) > /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of > @@ -2355,18 +2355,21 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, > !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { > struct vxlan_dev *dst_vxlan; > > - dst_release(dst); > dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, > daddr->sa.sa_family, dst_port, > vxlan->cfg.flags); > - if (!dst_vxlan) { > + if (!dst_vxlan && localbypass) { > + dst_release(dst); > dev->stats.tx_errors++; > vxlan_vnifilter_count(vxlan, vni, NULL, > VXLAN_VNI_STATS_TX_ERRORS, 0); > kfree_skb(skb); > > return -ENOENT; > + } else if (!dst_vxlan && !localbypass) { > + return 0; > } I'm a bit unsure about the logic around dst_release(). But assuming it is correct, perhaps this is a slightly nicer way to express the same logic: if (!dst_vxlan) { if (!localbypass) return 0; dst_release(dst); dev->stats.tx_errors++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); return -ENOENT; } > + dst_release(dst); > vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); > return 1; > } > @@ -2393,6 +2396,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, > int err; > u32 flags = vxlan->cfg.flags; > bool udp_sum = false; > + bool localbypass = true; Is there a need to initialise this here? Also, it would be good to move towards, rather than away from, reverse xmas tree - longest line to shortest - for local variable declarations. > bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); > __be32 vni = 0; > #if IS_ENABLED(CONFIG_IPV6) > @@ -2494,9 +2498,11 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, > > if (!info) { > /* Bypass encapsulation if the destination is local */ > + localbypass = !(flags & VXLAN_F_LOCALBYPASS); Extra space after '='. Also, the scope of localbypass could be reduced to this block. > err = encap_bypass_if_local(skb, dev, vxlan, dst, > dst_port, ifindex, vni, > - &rt->dst, rt->rt_flags); > + &rt->dst, rt->rt_flags, > + localbypass); > if (err) > goto out_unlock; > > @@ -2568,10 +2574,10 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, > > if (!info) { > u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; > - > + localbypass = !(flags & VXLAN_F_LOCALBYPASS); Ditto. > err = encap_bypass_if_local(skb, dev, vxlan, dst, > dst_port, ifindex, vni, > - ndst, rt6i_flags); > + ndst, rt6i_flags, localbypass); > if (err) > goto out_unlock; > } > @@ -3202,6 +3208,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { > [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, > [IFLA_VXLAN_DF] = { .type = NLA_U8 }, > [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, > + [IFLA_VXLAN_LOCALBYPASS] = { .type = NLA_U8 }, > }; > > static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], > @@ -4011,6 +4018,16 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], > conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; > } > > + if (data[IFLA_VXLAN_LOCALBYPASS]) { > + if (changelink) { > + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCALBYPASS], > + "Cannot change LOCALBYPASS flag"); > + return -EOPNOTSUPP; > + } > + if (!nla_get_u8(data[IFLA_VXLAN_LOCALBYPASS])) > + conf->flags |= VXLAN_F_LOCALBYPASS; > + } Can vxlan_nl2flag() be used here? > + > if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { > err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, > VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, > @@ -4232,6 +4249,7 @@ static size_t vxlan_get_size(const struct net_device *dev) > nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ > nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ > nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ > + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ > 0; > } > > @@ -4308,7 +4326,9 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) > nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, > !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || > nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, > - !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX))) > + !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || > + nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, > + !(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) It seems that the sense of VXLAN_F_LOCALBYPASS is the opposite of other flags. I think it would be good to reconcile that. > goto nla_put_failure; > > if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) > diff --git a/include/net/vxlan.h b/include/net/vxlan.h > index 20bd7d893e10..0be91ca78d3a 100644 > --- a/include/net/vxlan.h > +++ b/include/net/vxlan.h > @@ -328,6 +328,7 @@ struct vxlan_dev { > #define VXLAN_F_TTL_INHERIT 0x10000 > #define VXLAN_F_VNIFILTER 0x20000 > #define VXLAN_F_MDB 0x40000 > +#define VXLAN_F_LOCALBYPASS 0x80000 > > /* Flags that are used in the receive path. These flags must match in > * order for a socket to be shareable > @@ -348,7 +349,8 @@ struct vxlan_dev { > VXLAN_F_UDP_ZERO_CSUM6_TX | \ > VXLAN_F_UDP_ZERO_CSUM6_RX | \ > VXLAN_F_COLLECT_METADATA | \ > - VXLAN_F_VNIFILTER) > + VXLAN_F_VNIFILTER | \ > + VXLAN_F_LOCALBYPASS) > > struct net_device *vxlan_dev_create(struct net *net, const char *name, > u8 name_assign_type, struct vxlan_config *conf); > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > index 57ceb788250f..4e3a3d295056 100644 > --- a/include/uapi/linux/if_link.h > +++ b/include/uapi/linux/if_link.h > @@ -826,6 +826,7 @@ enum { > IFLA_VXLAN_TTL_INHERIT, > IFLA_VXLAN_DF, > IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ > + IFLA_VXLAN_LOCALBYPASS, > __IFLA_VXLAN_MAX > }; > #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) > diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h > index 901d98b865a1..3d9a1fd6f7e7 100644 > --- a/tools/include/uapi/linux/if_link.h > +++ b/tools/include/uapi/linux/if_link.h > @@ -747,6 +747,8 @@ enum { > IFLA_VXLAN_GPE, > IFLA_VXLAN_TTL_INHERIT, > IFLA_VXLAN_DF, > + IFLA_VXLAN_VNIFILTER, > + IFLA_VXLAN_LOCALBYPASS, > __IFLA_VXLAN_MAX > }; > #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) > -- > 2.35.7 > > -- > Fastmail. >
Simon Horman <simon.horman@corigine.com> writes: > I'm a bit unsure about the logic around dst_release(). > But assuming it is correct, perhaps this is a slightly Let me try to defend this logic. In the previous version, if the destination is local (the first "if"), then there is no need to keep the address in memory any more, hence the address was free()'d at the beginning of the "if" (and was not freed after the "if", because the address was still needed at the userspace part.) With this patch, the "localbypass" creates one more branch inside that "if", which is handing over the processing logic to the userspace (which has no free()). The older two branches _inside_ the "if" (vxlan found/vxlan not found) are still terminating, and therefore have one call to free() each. Ido Schimmel <idosch@idosch.org> writes: > Also, please add a selftest under tools/testing/selftests/net/. We > already have a bunch of VXLAN tests that you can use as a reference. I have added a file tools/testing/selftests/net/test_vxlan_nolocalbypass.sh , which is written on the basis of test_vxlan_mdb.sh Some tests do have a "testing framework" for enumerating and running tests (for example, those in ./forwarding/lib.sh), and some do not. I have used the simplest one.
On Wed, Apr 05, 2023 at 01:05:02PM +0800, Vladimir Nikishkin wrote: > > Simon Horman <simon.horman@corigine.com> writes: > > > I'm a bit unsure about the logic around dst_release(). > > But assuming it is correct, perhaps this is a slightly > > Let me try to defend this logic. > > In the previous version, if the destination is local (the first "if"), > then there is no need to keep the address in memory any more, hence the > address was free()'d at the beginning of the "if" (and was not freed > after the "if", because the address was still needed at the userspace > part.) > > With this patch, the "localbypass" creates one more branch inside that > "if", which is handing over the processing logic to the userspace (which > has no free()). The older two branches _inside_ the "if" (vxlan > found/vxlan not found) are still terminating, and therefore have one > call to free() each. Hi Vladimir, thanks for your response. I do still feel that the code I proposed is slightly nicer and in keeping with general kernel coding practices. But I do also concede that is a highly subjective position. I do agree that your code is correct, within the scope of what the patch seeks to achieve. And I do not object to you keeping it as is if that is your preference. ...
Simon Horman <simon.horman@corigine.com> writes: > On Wed, Apr 05, 2023 at 01:05:02PM +0800, Vladimir Nikishkin wrote: >> >> Simon Horman <simon.horman@corigine.com> writes: >> >> > I'm a bit unsure about the logic around dst_release(). >> > But assuming it is correct, perhaps this is a slightly >> >> Let me try to defend this logic. >> >> In the previous version, if the destination is local (the first "if"), >> then there is no need to keep the address in memory any more, hence the >> address was free()'d at the beginning of the "if" (and was not freed >> after the "if", because the address was still needed at the userspace >> part.) >> >> With this patch, the "localbypass" creates one more branch inside that >> "if", which is handing over the processing logic to the userspace (which >> has no free()). The older two branches _inside_ the "if" (vxlan >> found/vxlan not found) are still terminating, and therefore have one >> call to free() each. > > Hi Vladimir, > > thanks for your response. > > I do still feel that the code I proposed is slightly nicer > and in keeping with general kernel coding practices. > But I do also concede that is a highly subjective position. > > I do agree that your code is correct, within the scope of what the patch > seeks to achieve. And I do not object to you keeping it as is if that is > your preference. > > ... Hello, Simon I have integrated all of your suggestions in v6. This response was only to justify the correctness of the dst_release() call, which is not a self-evident thing.
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 561fe1b314f5..cef7a9aec24b 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2341,7 +2341,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, union vxlan_addr *daddr, __be16 dst_port, int dst_ifindex, __be32 vni, struct dst_entry *dst, - u32 rt_flags) + u32 rt_flags, bool localbypass) { #if IS_ENABLED(CONFIG_IPV6) /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of @@ -2355,18 +2355,21 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { struct vxlan_dev *dst_vxlan; - dst_release(dst); dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, daddr->sa.sa_family, dst_port, vxlan->cfg.flags); - if (!dst_vxlan) { + if (!dst_vxlan && localbypass) { + dst_release(dst); dev->stats.tx_errors++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); return -ENOENT; + } else if (!dst_vxlan && !localbypass) { + return 0; } + dst_release(dst); vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); return 1; } @@ -2393,6 +2396,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, int err; u32 flags = vxlan->cfg.flags; bool udp_sum = false; + bool localbypass = true; bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); __be32 vni = 0; #if IS_ENABLED(CONFIG_IPV6) @@ -2494,9 +2498,11 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (!info) { /* Bypass encapsulation if the destination is local */ + localbypass = !(flags & VXLAN_F_LOCALBYPASS); err = encap_bypass_if_local(skb, dev, vxlan, dst, dst_port, ifindex, vni, - &rt->dst, rt->rt_flags); + &rt->dst, rt->rt_flags, + localbypass); if (err) goto out_unlock; @@ -2568,10 +2574,10 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (!info) { u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; - + localbypass = !(flags & VXLAN_F_LOCALBYPASS); err = encap_bypass_if_local(skb, dev, vxlan, dst, dst_port, ifindex, vni, - ndst, rt6i_flags); + ndst, rt6i_flags, localbypass); if (err) goto out_unlock; } @@ -3202,6 +3208,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, + [IFLA_VXLAN_LOCALBYPASS] = { .type = NLA_U8 }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4011,6 +4018,16 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; } + if (data[IFLA_VXLAN_LOCALBYPASS]) { + if (changelink) { + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCALBYPASS], + "Cannot change LOCALBYPASS flag"); + return -EOPNOTSUPP; + } + if (!nla_get_u8(data[IFLA_VXLAN_LOCALBYPASS])) + conf->flags |= VXLAN_F_LOCALBYPASS; + } + if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, @@ -4232,6 +4249,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ 0; } @@ -4308,7 +4326,9 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, - !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX))) + !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || + nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, + !(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 20bd7d893e10..0be91ca78d3a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -328,6 +328,7 @@ struct vxlan_dev { #define VXLAN_F_TTL_INHERIT 0x10000 #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 +#define VXLAN_F_LOCALBYPASS 0x80000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -348,7 +349,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ - VXLAN_F_VNIFILTER) + VXLAN_F_VNIFILTER | \ + VXLAN_F_LOCALBYPASS) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 57ceb788250f..4e3a3d295056 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -826,6 +826,7 @@ enum { IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ + IFLA_VXLAN_LOCALBYPASS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 901d98b865a1..3d9a1fd6f7e7 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -747,6 +747,8 @@ enum { IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, + IFLA_VXLAN_VNIFILTER, + IFLA_VXLAN_LOCALBYPASS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
In vxlan_core, if an fdb entry is pointing to a local address with some port, the system tries to get the packet to deliver the packet to the vxlan directly, bypassing the network stack. This patch makes it still try canonical delivery, if there is no linux kernel vxlan listening on this port. This will be useful for the cases when there is some userspace daemon expecting vxlan packets for post-processing, or some other implementation of vxlan. Signed-off-by: Vladimir Nikishkin <vladimir@nikishkin.pw> --- drivers/net/vxlan/vxlan_core.c | 34 ++++++++++++++++++++++++------ include/net/vxlan.h | 4 +++- include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 2 ++ 4 files changed, 33 insertions(+), 8 deletions(-)