Message ID | 20230405050102.15612-1-vladimir@nikishkin.pw (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next,v6] vxlan: try to send a packet normally if local bypass fails | expand |
On Wed, 2023-04-05 at 13:01 +0800, Vladimir Nikishkin wrote: > In vxlan_core, if an fdb entry is pointing to a local > address with some port, the system tries to get the packet to > deliver the packet to the vxlan directly, bypassing the network The above sentence sounds confusing to me. Possibly: ... the system tries to deliver the packet to ... ? > stack. > > This patch makes it still try canonical delivery, if there is no > linux kernel vxlan listening on this port. This will be useful > for the cases when there is some userspace daemon expecting > vxlan packets for post-processing, or some other implementation > of vxlan. > > Signed-off-by: Vladimir Nikishkin <vladimir@nikishkin.pw> > --- > drivers/net/vxlan/vxlan_core.c | 29 +++-- > include/net/vxlan.h | 4 +- > include/uapi/linux/if_link.h | 1 + > tools/include/uapi/linux/if_link.h | 2 + > tools/testing/selftests/net/Makefile | 1 + > tools/testing/selftests/net/rtnetlink.sh | 3 + > .../selftests/net/test_vxlan_nolocalbypass.sh | 102 ++++++++++++++++++ > 7 files changed, 135 insertions(+), 7 deletions(-) > create mode 100755 tools/testing/selftests/net/test_vxlan_nolocalbypass.sh > > diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c > index 561fe1b314f5..f9dfb179af58 100644 > --- a/drivers/net/vxlan/vxlan_core.c > +++ b/drivers/net/vxlan/vxlan_core.c > @@ -2341,7 +2341,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, > union vxlan_addr *daddr, > __be16 dst_port, int dst_ifindex, __be32 vni, > struct dst_entry *dst, > - u32 rt_flags) > + u32 rt_flags, bool localbypass) No need to add an additional argument, you can use vxlan->cfg.flags instead > { > #if IS_ENABLED(CONFIG_IPV6) > /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of > @@ -2355,11 +2355,13 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, > !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { > struct vxlan_dev *dst_vxlan; > > - dst_release(dst); > dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, > daddr->sa.sa_family, dst_port, > vxlan->cfg.flags); > if (!dst_vxlan) { > + if (!localbypass) > + return 0; A space here would make the code more clear > + dst_release(dst); > dev->stats.tx_errors++; > vxlan_vnifilter_count(vxlan, vni, NULL, > VXLAN_VNI_STATS_TX_ERRORS, 0); > @@ -2367,6 +2369,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, > > return -ENOENT; > } > + dst_release(dst); > vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); > return 1; > } > @@ -2494,9 +2497,11 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, > > if (!info) { > /* Bypass encapsulation if the destination is local */ > + bool localbypass = flags & VXLAN_F_LOCALBYPASS; The new variable is not needed (see above). Anyway you need to add an empty line between variable declarations and code - btw strange that checkpatch did not complain. > err = encap_bypass_if_local(skb, dev, vxlan, dst, > dst_port, ifindex, vni, > - &rt->dst, rt->rt_flags); > + &rt->dst, rt->rt_flags, > + localbypass); > if (err) > goto out_unlock; > > @@ -2568,10 +2573,10 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, > > if (!info) { > u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; > - > + bool localbypass = flags & VXLAN_F_LOCALBYPASS; > err = encap_bypass_if_local(skb, dev, vxlan, dst, > dst_port, ifindex, vni, > - ndst, rt6i_flags); > + ndst, rt6i_flags, localbypass); > if (err) > goto out_unlock; > } > @@ -3202,6 +3207,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { > [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, > [IFLA_VXLAN_DF] = { .type = NLA_U8 }, > [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, > + [IFLA_VXLAN_LOCALBYPASS] = { .type = NLA_U8 }, > }; > > static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], > @@ -4011,6 +4017,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], > conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; > } > > + if (data[IFLA_VXLAN_LOCALBYPASS]) { > + err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS, > + VXLAN_F_LOCALBYPASS, changelink, > + false, extack); > + if (err) > + return err; > + } > + > if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { > err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, > VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, > @@ -4232,6 +4246,7 @@ static size_t vxlan_get_size(const struct net_device *dev) > nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ > nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ > nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ > + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ > 0; > } > > @@ -4308,7 +4323,9 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) > nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, > !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || > nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, > - !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX))) > + !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || > + nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, > + !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) > goto nla_put_failure; > > if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) > diff --git a/include/net/vxlan.h b/include/net/vxlan.h > index 20bd7d893e10..0be91ca78d3a 100644 > --- a/include/net/vxlan.h > +++ b/include/net/vxlan.h > @@ -328,6 +328,7 @@ struct vxlan_dev { > #define VXLAN_F_TTL_INHERIT 0x10000 > #define VXLAN_F_VNIFILTER 0x20000 > #define VXLAN_F_MDB 0x40000 > +#define VXLAN_F_LOCALBYPASS 0x80000 > > /* Flags that are used in the receive path. These flags must match in > * order for a socket to be shareable > @@ -348,7 +349,8 @@ struct vxlan_dev { > VXLAN_F_UDP_ZERO_CSUM6_TX | \ > VXLAN_F_UDP_ZERO_CSUM6_RX | \ > VXLAN_F_COLLECT_METADATA | \ > - VXLAN_F_VNIFILTER) > + VXLAN_F_VNIFILTER | \ > + VXLAN_F_LOCALBYPASS) > > struct net_device *vxlan_dev_create(struct net *net, const char *name, > u8 name_assign_type, struct vxlan_config *conf); > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > index 8d679688efe0..0fc56be5e19f 100644 > --- a/include/uapi/linux/if_link.h > +++ b/include/uapi/linux/if_link.h > @@ -827,6 +827,7 @@ enum { > IFLA_VXLAN_TTL_INHERIT, > IFLA_VXLAN_DF, > IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ > + IFLA_VXLAN_LOCALBYPASS, > __IFLA_VXLAN_MAX > }; > #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) > diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h > index 39e659c83cfd..1253bd0aa90e 100644 > --- a/tools/include/uapi/linux/if_link.h > +++ b/tools/include/uapi/linux/if_link.h > @@ -748,6 +748,8 @@ enum { > IFLA_VXLAN_GPE, > IFLA_VXLAN_TTL_INHERIT, > IFLA_VXLAN_DF, > + IFLA_VXLAN_VNIFILTER, > + IFLA_VXLAN_LOCALBYPASS, > __IFLA_VXLAN_MAX > }; > #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) > diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile > index 1de34ec99290..7a9cfd0c92db 100644 > --- a/tools/testing/selftests/net/Makefile > +++ b/tools/testing/selftests/net/Makefile > @@ -83,6 +83,7 @@ TEST_GEN_FILES += nat6to4.o > TEST_GEN_FILES += ip_local_port_range > TEST_GEN_FILES += bind_wildcard > TEST_PROGS += test_vxlan_mdb.sh > +TEST_PROGS += test_vxlan_nolocalbypass.sh > > TEST_FILES := settings > > diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh > index 383ac6fc037d..09a5ef4bd42b 100755 > --- a/tools/testing/selftests/net/rtnetlink.sh > +++ b/tools/testing/selftests/net/rtnetlink.sh > @@ -505,6 +505,9 @@ kci_test_encap_vxlan() > ip -netns "$testns" link set dev "$vxlan" type vxlan udpcsum 2>/dev/null > check_fail $? > > + ip -netns "$testns" link set dev "$vxlan" type vxlan nolocalbypass 2>/dev/null > + check_fail $? This will fail until a new version of 'ip' is deployed, right? I would wait before adding this test case (or will move it in the test_vxlan_nolocalbypass.sh, after the relevant check) > + > ip -netns "$testns" link set dev "$vxlan" type vxlan udp6zerocsumtx 2>/dev/null > check_fail $? > > diff --git a/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh > new file mode 100755 > index 000000000000..efa37af2da7b > --- /dev/null > +++ b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh > @@ -0,0 +1,102 @@ > +#!/bin/bash > +# SPDX-License-Identifier: GPL-2.0 > + > +# This file is testing that the [no]localbypass option for a vxlan device is > +# working. With the nolocalbypass option, packets to a local destination, which > +# have no corresponding vxlan in the kernel, will be delivered to userspace, for > +# any userspace process to process. In this test tcpdump plays the role of such a > +# process. This is what the test 1 is checking. > +# The test 2 checks that without the nolocalbypass (which is equivalent to the > +# localbypass option), the packets do not reach userspace. > + > +EXIT_FAIL=1 > +ksft_skip=4 > +EXIT_SUCCESS=0 > + > +if [ "$(id -u)" -ne 0 ];then > + echo "SKIP: Need root privileges" > + exit $ksft_skip; > +fi > + > +if [ ! -x "$(command -v ip)" ]; then > + echo "SKIP: Could not run test without ip tool" > + exit $ksft_skip > +fi > + > +if [ ! -x "$(command -v bridge)" ]; then > + echo "SKIP: Could not run test without bridge tool" > + exit $ksft_skip > +fi No need to check for the above tool presence. You can assume they are present, as most net self-tests unconditionally relay on them. > + > +if [ ! -x "$(command -v tcpdump)" ]; then > + echo "SKIP: Could not run test without tcpdump tool" > + exit $ksft_skip > +fi I personally would opt for adding nft rules and checking the counters, but no strong opinion on that. > + > +if [ ! -x "$(command -v grep)" ]; then > + echo "SKIP: Could not run test without grep tool" > + exit $ksft_skip > +fi Some self-tests check for 'grep' presence but most simply assume such command is available. I would opt for the latter. > + > +ip link help vxlan 2>&1 | grep -q "localbypass" > +if [ $? -ne 0 ]; then > + echo "SKIP: iproute2 bridge too old, missing VXLAN nolocalbypass support" > + exit $ksft_skip > +fi > + > + Additional empty line not needed > +packetfile=/tmp/packets-"$(uuidgen)" > + > +# test 1: packets going to userspace > +rm "$packetfile" > +ip link del dev testvxlan0 > +ip link add testvxlan0 type vxlan \ > + id 100 \ > + dstport 4789 \ > + srcport 4789 4790 \ > + nolearning noproxy \ > + nolocalbypass > +ip link set up dev testvxlan0 > +bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792 > +ip address add 172.16.100.1/24 dev testvxlan0 > +tcpdump -i lo 'udp and port 4792' > "$packetfile" & > +tcpdump_pid=$! > +timeout 5 ping -c 5 172.16.100.2 > +kill "$tcpdump_pid" > +ip link del dev testvxlan0 > + > +if grep -q "VXLAN" "$packetfile" ; then > + echo 'Positive test passed' > +else > + echo 'Positive test failed' > + exit $EXIT_FAIL > +fi > +rm "$packetfile" It's better if you additionally remove the generate file(s) in a cleanup function set as trap for exit: cleanup() { rm -f ${packetfile} } trap cleanup EXIT Cheers, Paolo
I agree with Paolo's comments. Some more below. On Wed, Apr 05, 2023 at 01:01:02PM +0800, Vladimir Nikishkin wrote: > In vxlan_core, if an fdb entry is pointing to a local > address with some port, the system tries to get the packet to > deliver the packet to the vxlan directly, bypassing the network > stack. > > This patch makes it still try canonical delivery, if there is no > linux kernel vxlan listening on this port. This will be useful > for the cases when there is some userspace daemon expecting > vxlan packets for post-processing, or some other implementation > of vxlan. Use "imperative mood" as described here: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes Something like this: " If a packet needs to be encapsulated towards a local destination IP and a VXLAN device that matches the destination port and VNI exists, then the packet will be injected into the Rx path as if it was received by the target VXLAN device without undergoing encapsulation. If such a device does not exist, the packet will be dropped. There are scenarios where we do not want to drop such packets and instead want to let them be encapsulated and locally received by a user space program that post-processes these VXLAN packets. To that end, add a new VXLAN device attribute that controls whether such packets are dropped or not. When set ("localbypass") these packets are dropped and when unset ("nolocalbypass") the packets are encapsulated and locally delivered to the listening user space application. Default to "localbypass" to maintain existing behavior. " Assuming you agree with this description, I think it reveals a bug. The current implementation does not actually default to "localbypass": # ip link add name vxlan0 up type vxlan id 1000 local 192.0.2.1 dstport 4789 # ip -d -j -p link show dev vxlan0 | jq '.[]["linkinfo"]["info_data"]["localbypass"]' false Fixed by this hunk: diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index f9dfb179af58..9c8135fd7be5 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -4023,6 +4023,9 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], false, extack); if (err) return err; + } else if (!changelink) { + /* default to local bypass on a new device */ + conf->flags |= VXLAN_F_LOCALBYPASS; } if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { > > Signed-off-by: Vladimir Nikishkin <vladimir@nikishkin.pw> Please add the selftest in a separate commit with its own description. [...] > @@ -3202,6 +3207,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { > [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, > [IFLA_VXLAN_DF] = { .type = NLA_U8 }, > [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, > + [IFLA_VXLAN_LOCALBYPASS] = { .type = NLA_U8 }, I suggest NLA_POLICY_MAX(NLA_U8, 1) to make sure values other than 0 and 1 are rejected in the unlikely case that we will want to utilize them in the future. Also, it's a good time to enable strict validation for new VXLAN attributes: diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 9c8135fd7be5..c37face0d021 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3177,6 +3177,7 @@ static void vxlan_raw_setup(struct net_device *dev) } static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { + [IFLA_VXLAN_UNSPEC] = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS }, [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, > }; > > static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], > @@ -4011,6 +4017,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], > conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; > } > > + if (data[IFLA_VXLAN_LOCALBYPASS]) { > + err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS, > + VXLAN_F_LOCALBYPASS, changelink, > + false, extack); What is the idea behind forbidding changing this attribute? It's not fundamental to the VXLAN device like "id" / "external" / "vnifilter". I suggest enabling it unless you have a good reason not to. It is useful for the selftest (see below). > + if (err) > + return err; > + } > + > if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { > err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, > VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, [...] > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > index 8d679688efe0..0fc56be5e19f 100644 > --- a/include/uapi/linux/if_link.h > +++ b/include/uapi/linux/if_link.h > @@ -827,6 +827,7 @@ enum { > IFLA_VXLAN_TTL_INHERIT, > IFLA_VXLAN_DF, > IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ > + IFLA_VXLAN_LOCALBYPASS, > __IFLA_VXLAN_MAX > }; > #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) > diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h > index 39e659c83cfd..1253bd0aa90e 100644 > --- a/tools/include/uapi/linux/if_link.h > +++ b/tools/include/uapi/linux/if_link.h > @@ -748,6 +748,8 @@ enum { > IFLA_VXLAN_GPE, > IFLA_VXLAN_TTL_INHERIT, > IFLA_VXLAN_DF, > + IFLA_VXLAN_VNIFILTER, > + IFLA_VXLAN_LOCALBYPASS, I wasn't aware of this file (looks like others weren't as well). Not sure you actually need to touch it (git history shows that it is synced by those who need it), but if you do, then make sure you also copy the comment next to the vnifilter attribute. > __IFLA_VXLAN_MAX > }; > #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) > diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile > index 1de34ec99290..7a9cfd0c92db 100644 > --- a/tools/testing/selftests/net/Makefile > +++ b/tools/testing/selftests/net/Makefile > @@ -83,6 +83,7 @@ TEST_GEN_FILES += nat6to4.o > TEST_GEN_FILES += ip_local_port_range > TEST_GEN_FILES += bind_wildcard > TEST_PROGS += test_vxlan_mdb.sh > +TEST_PROGS += test_vxlan_nolocalbypass.sh > > TEST_FILES := settings > > diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh > index 383ac6fc037d..09a5ef4bd42b 100755 > --- a/tools/testing/selftests/net/rtnetlink.sh > +++ b/tools/testing/selftests/net/rtnetlink.sh > @@ -505,6 +505,9 @@ kci_test_encap_vxlan() > ip -netns "$testns" link set dev "$vxlan" type vxlan udpcsum 2>/dev/null > check_fail $? > > + ip -netns "$testns" link set dev "$vxlan" type vxlan nolocalbypass 2>/dev/null > + check_fail $? > + This is going to fail if you are going to enable change link support like I suggested above. I suggest removing this test and instead testing changing this attribute in the dedicated test. > ip -netns "$testns" link set dev "$vxlan" type vxlan udp6zerocsumtx 2>/dev/null > check_fail $? > > diff --git a/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh > new file mode 100755 > index 000000000000..efa37af2da7b > --- /dev/null > +++ b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh > @@ -0,0 +1,102 @@ > +#!/bin/bash > +# SPDX-License-Identifier: GPL-2.0 > + > +# This file is testing that the [no]localbypass option for a vxlan device is > +# working. With the nolocalbypass option, packets to a local destination, which > +# have no corresponding vxlan in the kernel, will be delivered to userspace, for > +# any userspace process to process. In this test tcpdump plays the role of such a > +# process. This is what the test 1 is checking. > +# The test 2 checks that without the nolocalbypass (which is equivalent to the > +# localbypass option), the packets do not reach userspace. > + > +EXIT_FAIL=1 > +ksft_skip=4 > +EXIT_SUCCESS=0 > + > +if [ "$(id -u)" -ne 0 ];then > + echo "SKIP: Need root privileges" > + exit $ksft_skip; > +fi > + > +if [ ! -x "$(command -v ip)" ]; then > + echo "SKIP: Could not run test without ip tool" > + exit $ksft_skip > +fi > + > +if [ ! -x "$(command -v bridge)" ]; then > + echo "SKIP: Could not run test without bridge tool" > + exit $ksft_skip > +fi > + > +if [ ! -x "$(command -v tcpdump)" ]; then > + echo "SKIP: Could not run test without tcpdump tool" > + exit $ksft_skip > +fi > + > +if [ ! -x "$(command -v grep)" ]; then > + echo "SKIP: Could not run test without grep tool" > + exit $ksft_skip > +fi > + > +ip link help vxlan 2>&1 | grep -q "localbypass" > +if [ $? -ne 0 ]; then > + echo "SKIP: iproute2 bridge too old, missing VXLAN nolocalbypass support" s/bridge/ip/ > + exit $ksft_skip > +fi > + > + > +packetfile=/tmp/packets-"$(uuidgen)" > + > +# test 1: packets going to userspace > +rm "$packetfile" > +ip link del dev testvxlan0 Try using namespaces (like in test_vxlan_mdb.sh) to avoid interfering with the existing user environment. Also, like test_vxlan_mdb.sh, please use the general framework of log_test() and run_cmd(). I ran your test and although it passed, it emitted a lot of error messages (unlike other tests): # ./test_vxlan_nolocalbypass.sh rm: cannot remove '/tmp/packets-d614f8e4-549e-461e-9500-65bfea4d827e': No such file or directory Cannot find device "testvxlan0" PING 172.16.100.2 (172.16.100.2) 56(84) bytes of data. dropped privs to tcpdump tcpdump: verbose output suppressed, use -v[v]... for full protocol decode listening on lo, link-type EN10MB (Ethernet), snapshot length 262144 bytes From 172.16.100.1 icmp_seq=1 Destination Host Unreachable From 172.16.100.1 icmp_seq=2 Destination Host Unreachable From 172.16.100.1 icmp_seq=3 Destination Host Unreachable From 172.16.100.1 icmp_seq=4 Destination Host Unreachable 8 packets captured 16 packets received by filter 0 packets dropped by kernel Positive test passed Cannot find device "testvxlan0" PING 172.16.100.2 (172.16.100.2) 56(84) bytes of data. dropped privs to tcpdump tcpdump: verbose output suppressed, use -v[v]... for full protocol decode listening on lo, link-type EN10MB (Ethernet), snapshot length 262144 bytes From 172.16.100.1 icmp_seq=1 Destination Host Unreachable From 172.16.100.1 icmp_seq=2 Destination Host Unreachable From 172.16.100.1 icmp_seq=3 Destination Host Unreachable 0 packets captured 0 packets received by filter 0 packets dropped by kernel Negative test passed > +ip link add testvxlan0 type vxlan \ > + id 100 \ > + dstport 4789 \ > + srcport 4789 4790 \ > + nolearning noproxy \ > + nolocalbypass > +ip link set up dev testvxlan0 > +bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792 > +ip address add 172.16.100.1/24 dev testvxlan0 > +tcpdump -i lo 'udp and port 4792' > "$packetfile" & > +tcpdump_pid=$! > +timeout 5 ping -c 5 172.16.100.2 > +kill "$tcpdump_pid" > +ip link del dev testvxlan0 Instead of creating and deleting a VXLAN device. I suggest the following: 1. Create a VXLAN device without setting the new option. 2. Test that by default packets do not reach the listening user space application. 3. Set "nolocalbypass". This is possible if you enable change link support like I suggested above. 4. Test that packets do reach the listening user space application. 5. Set "localbypass". 6. Test that packets do not reach the listening user space application. Regarding the "listening user space application", you can try to use socat. See commit 8826218215de ("selftests: fib_tests: Add test cases for interaction with mangling") for reference. I think it's better than using tcpdump on the loopback device because you can actually test that packets are delivered to user space. Thanks > + > +if grep -q "VXLAN" "$packetfile" ; then > + echo 'Positive test passed' > +else > + echo 'Positive test failed' > + exit $EXIT_FAIL > +fi > +rm "$packetfile" > + > +# test 2: old behaviour preserved > +ip link del dev testvxlan0 > +ip link add testvxlan0 type vxlan \ > + id 100 \ > + dstport 4789 \ > + srcport 4789 4790 \ > + nolearning noproxy \ > + localbypass > +ip link set up dev testvxlan0 > +bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792 > +ip address add 172.16.100.1/24 dev testvxlan0 > +tcpdump -i lo 'udp and port 4792' > "$packetfile" & > +tcpdump_pid=$! > +timeout 5 ping -c 5 172.16.100.2 > +kill "$tcpdump_pid" > +ip link del dev testvxlan0 > + > +if grep -q "VXLAN" "$packetfile" ; then > + echo 'Negative test failed' > + exit $EXIT_FAIL > +else > + echo 'Negative test passed' > +fi > +rm "$packetfile" > + > +exit $EXIT_SUCCESS > + > -- > 2.35.7 > > -- > Fastmail. >
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 561fe1b314f5..f9dfb179af58 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2341,7 +2341,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, union vxlan_addr *daddr, __be16 dst_port, int dst_ifindex, __be32 vni, struct dst_entry *dst, - u32 rt_flags) + u32 rt_flags, bool localbypass) { #if IS_ENABLED(CONFIG_IPV6) /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of @@ -2355,11 +2355,13 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { struct vxlan_dev *dst_vxlan; - dst_release(dst); dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, daddr->sa.sa_family, dst_port, vxlan->cfg.flags); if (!dst_vxlan) { + if (!localbypass) + return 0; + dst_release(dst); dev->stats.tx_errors++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); @@ -2367,6 +2369,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, return -ENOENT; } + dst_release(dst); vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); return 1; } @@ -2494,9 +2497,11 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (!info) { /* Bypass encapsulation if the destination is local */ + bool localbypass = flags & VXLAN_F_LOCALBYPASS; err = encap_bypass_if_local(skb, dev, vxlan, dst, dst_port, ifindex, vni, - &rt->dst, rt->rt_flags); + &rt->dst, rt->rt_flags, + localbypass); if (err) goto out_unlock; @@ -2568,10 +2573,10 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (!info) { u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; - + bool localbypass = flags & VXLAN_F_LOCALBYPASS; err = encap_bypass_if_local(skb, dev, vxlan, dst, dst_port, ifindex, vni, - ndst, rt6i_flags); + ndst, rt6i_flags, localbypass); if (err) goto out_unlock; } @@ -3202,6 +3207,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, + [IFLA_VXLAN_LOCALBYPASS] = { .type = NLA_U8 }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4011,6 +4017,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; } + if (data[IFLA_VXLAN_LOCALBYPASS]) { + err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS, + VXLAN_F_LOCALBYPASS, changelink, + false, extack); + if (err) + return err; + } + if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, @@ -4232,6 +4246,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ 0; } @@ -4308,7 +4323,9 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, - !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX))) + !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || + nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, + !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 20bd7d893e10..0be91ca78d3a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -328,6 +328,7 @@ struct vxlan_dev { #define VXLAN_F_TTL_INHERIT 0x10000 #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 +#define VXLAN_F_LOCALBYPASS 0x80000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -348,7 +349,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ - VXLAN_F_VNIFILTER) + VXLAN_F_VNIFILTER | \ + VXLAN_F_LOCALBYPASS) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8d679688efe0..0fc56be5e19f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -827,6 +827,7 @@ enum { IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ + IFLA_VXLAN_LOCALBYPASS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 39e659c83cfd..1253bd0aa90e 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -748,6 +748,8 @@ enum { IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, + IFLA_VXLAN_VNIFILTER, + IFLA_VXLAN_LOCALBYPASS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 1de34ec99290..7a9cfd0c92db 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -83,6 +83,7 @@ TEST_GEN_FILES += nat6to4.o TEST_GEN_FILES += ip_local_port_range TEST_GEN_FILES += bind_wildcard TEST_PROGS += test_vxlan_mdb.sh +TEST_PROGS += test_vxlan_nolocalbypass.sh TEST_FILES := settings diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 383ac6fc037d..09a5ef4bd42b 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -505,6 +505,9 @@ kci_test_encap_vxlan() ip -netns "$testns" link set dev "$vxlan" type vxlan udpcsum 2>/dev/null check_fail $? + ip -netns "$testns" link set dev "$vxlan" type vxlan nolocalbypass 2>/dev/null + check_fail $? + ip -netns "$testns" link set dev "$vxlan" type vxlan udp6zerocsumtx 2>/dev/null check_fail $? diff --git a/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh new file mode 100755 index 000000000000..efa37af2da7b --- /dev/null +++ b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This file is testing that the [no]localbypass option for a vxlan device is +# working. With the nolocalbypass option, packets to a local destination, which +# have no corresponding vxlan in the kernel, will be delivered to userspace, for +# any userspace process to process. In this test tcpdump plays the role of such a +# process. This is what the test 1 is checking. +# The test 2 checks that without the nolocalbypass (which is equivalent to the +# localbypass option), the packets do not reach userspace. + +EXIT_FAIL=1 +ksft_skip=4 +EXIT_SUCCESS=0 + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip; +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +if [ ! -x "$(command -v bridge)" ]; then + echo "SKIP: Could not run test without bridge tool" + exit $ksft_skip +fi + +if [ ! -x "$(command -v tcpdump)" ]; then + echo "SKIP: Could not run test without tcpdump tool" + exit $ksft_skip +fi + +if [ ! -x "$(command -v grep)" ]; then + echo "SKIP: Could not run test without grep tool" + exit $ksft_skip +fi + +ip link help vxlan 2>&1 | grep -q "localbypass" +if [ $? -ne 0 ]; then + echo "SKIP: iproute2 bridge too old, missing VXLAN nolocalbypass support" + exit $ksft_skip +fi + + +packetfile=/tmp/packets-"$(uuidgen)" + +# test 1: packets going to userspace +rm "$packetfile" +ip link del dev testvxlan0 +ip link add testvxlan0 type vxlan \ + id 100 \ + dstport 4789 \ + srcport 4789 4790 \ + nolearning noproxy \ + nolocalbypass +ip link set up dev testvxlan0 +bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792 +ip address add 172.16.100.1/24 dev testvxlan0 +tcpdump -i lo 'udp and port 4792' > "$packetfile" & +tcpdump_pid=$! +timeout 5 ping -c 5 172.16.100.2 +kill "$tcpdump_pid" +ip link del dev testvxlan0 + +if grep -q "VXLAN" "$packetfile" ; then + echo 'Positive test passed' +else + echo 'Positive test failed' + exit $EXIT_FAIL +fi +rm "$packetfile" + +# test 2: old behaviour preserved +ip link del dev testvxlan0 +ip link add testvxlan0 type vxlan \ + id 100 \ + dstport 4789 \ + srcport 4789 4790 \ + nolearning noproxy \ + localbypass +ip link set up dev testvxlan0 +bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792 +ip address add 172.16.100.1/24 dev testvxlan0 +tcpdump -i lo 'udp and port 4792' > "$packetfile" & +tcpdump_pid=$! +timeout 5 ping -c 5 172.16.100.2 +kill "$tcpdump_pid" +ip link del dev testvxlan0 + +if grep -q "VXLAN" "$packetfile" ; then + echo 'Negative test failed' + exit $EXIT_FAIL +else + echo 'Negative test passed' +fi +rm "$packetfile" + +exit $EXIT_SUCCESS +
In vxlan_core, if an fdb entry is pointing to a local address with some port, the system tries to get the packet to deliver the packet to the vxlan directly, bypassing the network stack. This patch makes it still try canonical delivery, if there is no linux kernel vxlan listening on this port. This will be useful for the cases when there is some userspace daemon expecting vxlan packets for post-processing, or some other implementation of vxlan. Signed-off-by: Vladimir Nikishkin <vladimir@nikishkin.pw> --- drivers/net/vxlan/vxlan_core.c | 29 +++-- include/net/vxlan.h | 4 +- include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 2 + tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/rtnetlink.sh | 3 + .../selftests/net/test_vxlan_nolocalbypass.sh | 102 ++++++++++++++++++ 7 files changed, 135 insertions(+), 7 deletions(-) create mode 100755 tools/testing/selftests/net/test_vxlan_nolocalbypass.sh