Message ID | 20240805-upstream-bpf-next-20240506-mptcp-subflow-test-v4-2-2b4ca6994993@kernel.org (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | BPF |
Headers | show |
Series | selftests/bpf: new MPTCP subflow subtest | expand |
On 8/5/24 2:52 AM, Matthieu Baerts (NGI0) wrote: > +static int endpoint_init(char *flags) > +{ > + SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", NS_TEST); > + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1); > + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); > + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2); > + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); > + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, ADDR_2, flags)) { > + printf("'ip mptcp' not supported, skip this test.\n"); > + test__skip(); It is always a skip now in bpf CI: #171/3 mptcp/subflow:SKIP This test is a useful addition for the bpf CI selftest. It can't catch regression if it is always a skip in bpf CI though. iproute2 needs to be updated (cc: Daniel Xu and Manu, the outdated iproute2 is something that came up multiple times). Not sure when the iproute2 can be updated. In the mean time, your v3 is pretty close to getting pm_nl_ctl compiled. Is there other blocker on this? > + goto fail; > + } > + > + return 0; > +fail: > + return -1; > +} > + > +static int _ss_search(char *src, char *dst, char *port, char *keyword) > +{ > + return SYS_NOFAIL("ip netns exec %s ss -enita src %s dst %s %s %d | grep -q '%s'", > + NS_TEST, src, dst, port, PORT_1, keyword); > +} > + > +static int ss_search(char *src, char *keyword) > +{ > + return _ss_search(src, ADDR_1, "dport", keyword); > +} > + > +static void run_subflow(char *new) > +{ > + int server_fd, client_fd, err; > + char cc[TCP_CA_NAME_MAX]; > + socklen_t len = sizeof(cc); > + > + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0); > + if (!ASSERT_GE(server_fd, 0, "start_mptcp_server")) > + return; > + > + client_fd = connect_to_fd(server_fd, 0); > + if (!ASSERT_GE(client_fd, 0, "connect to fd")) > + goto fail; > + > + err = getsockopt(server_fd, SOL_TCP, TCP_CONGESTION, cc, &len); > + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) > + goto fail; > + > + send_byte(client_fd); > + > + ASSERT_OK(ss_search(ADDR_1, "fwmark:0x1"), "ss_search fwmark:0x1"); > + ASSERT_OK(ss_search(ADDR_2, "fwmark:0x2"), "ss_search fwmark:0x2"); > + ASSERT_OK(ss_search(ADDR_1, new), "ss_search new cc"); > + ASSERT_OK(ss_search(ADDR_2, cc), "ss_search default cc"); Is there a getsockopt way instead of ss + grep?
Hi Martin, Thank you for your reply! On 14/08/2024 03:12, Martin KaFai Lau wrote: > On 8/5/24 2:52 AM, Matthieu Baerts (NGI0) wrote: >> +static int endpoint_init(char *flags) >> +{ >> + SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", >> NS_TEST); >> + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1); >> + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); >> + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2); >> + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); >> + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, >> ADDR_2, flags)) { >> + printf("'ip mptcp' not supported, skip this test.\n"); >> + test__skip(); > > It is always a skip now in bpf CI: > > #171/3 mptcp/subflow:SKIP > > This test is a useful addition for the bpf CI selftest. > > It can't catch regression if it is always a skip in bpf CI though. Indeed, for the moment, this test is skipped in bpf CI. The MPTCP CI checks the MPTCP BPF selftests that are on top of net and net-next at least once a day. It is always running with the last stable version of iproute2, so this test is not skipped: #169/3 mptcp/subflow:OK https://github.com/multipath-tcp/mptcp_net-next/actions/runs/10384566794/job/28751869426#step:7:11080 > iproute2 needs to be updated (cc: Daniel Xu and Manu, the outdated > iproute2 is something that came up multiple times). > > Not sure when the iproute2 can be updated. In the mean time, your v3 is > pretty close to getting pm_nl_ctl compiled. Is there other blocker on this? I will try to find some time to check the modifications I suggested in the v3, but I don't know how long it will take to have them ready, as they might require some adaptations of the CI side as well, I need to check. On the other hand, I understood adding a duplicated version of the mptcp.h UAPI header is not an option either. So not to block this (already old) series, I thought it would help to first focus on this version using 'ip mptcp', while I'm looking at the selftests modifications. If these modifications are successful, I can always resend the patch 2/3 from the v3 later, and using 'pm_nl_ctl' instead of 'ip mptcp', to be able to work with IPRoute2 5.5. Do you think that could work like that? >> + goto fail; >> + } >> + >> + return 0; >> +fail: >> + return -1; >> +} >> + >> +static int _ss_search(char *src, char *dst, char *port, char *keyword) >> +{ >> + return SYS_NOFAIL("ip netns exec %s ss -enita src %s dst %s %s %d >> | grep -q '%s'", >> + NS_TEST, src, dst, port, PORT_1, keyword); >> +} >> + >> +static int ss_search(char *src, char *keyword) >> +{ >> + return _ss_search(src, ADDR_1, "dport", keyword); >> +} >> + >> +static void run_subflow(char *new) >> +{ >> + int server_fd, client_fd, err; >> + char cc[TCP_CA_NAME_MAX]; >> + socklen_t len = sizeof(cc); >> + >> + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0); >> + if (!ASSERT_GE(server_fd, 0, "start_mptcp_server")) >> + return; >> + >> + client_fd = connect_to_fd(server_fd, 0); >> + if (!ASSERT_GE(client_fd, 0, "connect to fd")) >> + goto fail; >> + >> + err = getsockopt(server_fd, SOL_TCP, TCP_CONGESTION, cc, &len); >> + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) >> + goto fail; >> + >> + send_byte(client_fd); >> + >> + ASSERT_OK(ss_search(ADDR_1, "fwmark:0x1"), "ss_search fwmark:0x1"); >> + ASSERT_OK(ss_search(ADDR_2, "fwmark:0x2"), "ss_search fwmark:0x2"); >> + ASSERT_OK(ss_search(ADDR_1, new), "ss_search new cc"); >> + ASSERT_OK(ss_search(ADDR_2, cc), "ss_search default cc"); > > Is there a getsockopt way instead of ss + grep? No there isn't: from the userspace, the app communicates with the MPTCP socket, which can have multiple paths (subflows, a TCP socket). To keep the compatibility with TCP, [gs]etsockopt() will look at/modify the whole MPTCP connection. For example, in some cases, a setsockopt() will propagate the option to all the subflows. Depending on the option, the modification might only apply to the first subflow, or to the user-facing socket. For advanced users who want to have different options set to the different subflows of an MPTCP connection, they can use BPF: that's what is being validated here. In other words, doing a 'getsockopt()' from the userspace program here will not show all the different marks and TCP CC that can be set per subflow with BPF. We can see that in the test: a getsockopt() is done on the MPTCP socket to retrieve the default TCP CC ('cc' which is certainly 'cubic'), but we expect to find another one ('new' which is 'reno'), set by the BPF program from patch 1/2. I guess we could use bpf to do a getsockopt() per subflow, but that's seems a bit cheated to have the BPF test program setting something and checking if it is set. Here, it is an external way. Because it is done from a dedicated netns, it sounds OK to do that, no? Cheers, Matt
On 8/5/24 2:52 AM, Matthieu Baerts (NGI0) wrote: > +static void run_subflow(char *new) > +{ > + int server_fd, client_fd, err; > + char cc[TCP_CA_NAME_MAX]; > + socklen_t len = sizeof(cc); > + > + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0); > + if (!ASSERT_GE(server_fd, 0, "start_mptcp_server")) > + return; > + > + client_fd = connect_to_fd(server_fd, 0); > + if (!ASSERT_GE(client_fd, 0, "connect to fd")) > + goto fail; > + > + err = getsockopt(server_fd, SOL_TCP, TCP_CONGESTION, cc, &len); > + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) > + goto fail; client_fd is leaked. > + > + send_byte(client_fd); > + > + ASSERT_OK(ss_search(ADDR_1, "fwmark:0x1"), "ss_search fwmark:0x1"); > + ASSERT_OK(ss_search(ADDR_2, "fwmark:0x2"), "ss_search fwmark:0x2"); > + ASSERT_OK(ss_search(ADDR_1, new), "ss_search new cc"); > + ASSERT_OK(ss_search(ADDR_2, cc), "ss_search default cc"); > + > + close(client_fd); > +fail: > + close(server_fd); > +}
On 8/14/24 3:04 AM, Matthieu Baerts wrote: > Hi Martin, > > Thank you for your reply! > > On 14/08/2024 03:12, Martin KaFai Lau wrote: >> On 8/5/24 2:52 AM, Matthieu Baerts (NGI0) wrote: >>> +static int endpoint_init(char *flags) >>> +{ >>> + SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", >>> NS_TEST); >>> + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1); >>> + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); >>> + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2); >>> + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); >>> + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, >>> ADDR_2, flags)) { >>> + printf("'ip mptcp' not supported, skip this test.\n"); >>> + test__skip(); >> >> It is always a skip now in bpf CI: >> >> #171/3 mptcp/subflow:SKIP >> >> This test is a useful addition for the bpf CI selftest. >> >> It can't catch regression if it is always a skip in bpf CI though. > > Indeed, for the moment, this test is skipped in bpf CI. > > The MPTCP CI checks the MPTCP BPF selftests that are on top of net and > net-next at least once a day. It is always running with the last stable > version of iproute2, so this test is not skipped: > > #169/3 mptcp/subflow:OK > > https://github.com/multipath-tcp/mptcp_net-next/actions/runs/10384566794/job/28751869426#step:7:11080 > >> iproute2 needs to be updated (cc: Daniel Xu and Manu, the outdated >> iproute2 is something that came up multiple times). >> >> Not sure when the iproute2 can be updated. In the mean time, your v3 is >> pretty close to getting pm_nl_ctl compiled. Is there other blocker on this? > > I will try to find some time to check the modifications I suggested in > the v3, but I don't know how long it will take to have them ready, as > they might require some adaptations of the CI side as well, I need to > check. On the other hand, I understood adding a duplicated version of > the mptcp.h UAPI header is not an option either. > > So not to block this (already old) series, I thought it would help to > first focus on this version using 'ip mptcp', while I'm looking at the > selftests modifications. If these modifications are successful, I can > always resend the patch 2/3 from the v3 later, and using 'pm_nl_ctl' > instead of 'ip mptcp', to be able to work with IPRoute2 5.5. > > Do you think that could work like that? If there is CI started covering it, staying with the 'ip mptcp' is fine. The bpf CI has to start testing it asap also. The iproute2 package will need to be updated on the bpf CI side. I think this has to be done regardless. It will be useful to avoid the uapi header dup on its own. The last one you have seems pretty close. > >>> + goto fail; >>> + } >>> + >>> + return 0; >>> +fail: >>> + return -1; >>> +} >>> + >>> +static int _ss_search(char *src, char *dst, char *port, char *keyword) >>> +{ >>> + return SYS_NOFAIL("ip netns exec %s ss -enita src %s dst %s %s %d >>> | grep -q '%s'", >>> + NS_TEST, src, dst, port, PORT_1, keyword); >>> +} >>> + >>> +static int ss_search(char *src, char *keyword) >>> +{ >>> + return _ss_search(src, ADDR_1, "dport", keyword); >>> +} >>> + >>> +static void run_subflow(char *new) >>> +{ >>> + int server_fd, client_fd, err; >>> + char cc[TCP_CA_NAME_MAX]; >>> + socklen_t len = sizeof(cc); >>> + >>> + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0); >>> + if (!ASSERT_GE(server_fd, 0, "start_mptcp_server")) >>> + return; >>> + >>> + client_fd = connect_to_fd(server_fd, 0); >>> + if (!ASSERT_GE(client_fd, 0, "connect to fd")) >>> + goto fail; >>> + >>> + err = getsockopt(server_fd, SOL_TCP, TCP_CONGESTION, cc, &len); >>> + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) >>> + goto fail; >>> + >>> + send_byte(client_fd); >>> + >>> + ASSERT_OK(ss_search(ADDR_1, "fwmark:0x1"), "ss_search fwmark:0x1"); >>> + ASSERT_OK(ss_search(ADDR_2, "fwmark:0x2"), "ss_search fwmark:0x2"); >>> + ASSERT_OK(ss_search(ADDR_1, new), "ss_search new cc"); >>> + ASSERT_OK(ss_search(ADDR_2, cc), "ss_search default cc"); >> >> Is there a getsockopt way instead of ss + grep? > > No there isn't: from the userspace, the app communicates with the MPTCP > socket, which can have multiple paths (subflows, a TCP socket). To keep > the compatibility with TCP, [gs]etsockopt() will look at/modify the > whole MPTCP connection. For example, in some cases, a setsockopt() will > propagate the option to all the subflows. Depending on the option, the > modification might only apply to the first subflow, or to the > user-facing socket. > > For advanced users who want to have different options set to the > different subflows of an MPTCP connection, they can use BPF: that's what > is being validated here. In other words, doing a 'getsockopt()' from the > userspace program here will not show all the different marks and TCP CC > that can be set per subflow with BPF. We can see that in the test: a > getsockopt() is done on the MPTCP socket to retrieve the default TCP CC > ('cc' which is certainly 'cubic'), but we expect to find another one > ('new' which is 'reno'), set by the BPF program from patch 1/2. I guess > we could use bpf to do a getsockopt() per subflow, but that's seems a > bit cheated to have the BPF test program setting something and checking > if it is set. Here, it is an external way. Because it is done from a I think the result is valid by having a bpf prog to inspect the value of a sock. Inspecting socket is an existing use case. There are many existing bpf tests covering this inspection use case to ensure the result is legit. A separate cgroup/getsockopt program should help here (more on this below). > dedicated netns, it sounds OK to do that, no? Thanks for the explanation. I was hoping there is a way to get to the underlying subflow fd. It seems impossible. In the netns does help here. It is not only about the ss iterating a lot of connections or not. My preference is not depending on external tool/shell-ing if possible, e.g. to avoid the package update discussion like the iproute2 here. The uapi from the testing kernel is always up-to-date. ss is another binary but arguably in the same iproute2 package. There is now another extra "grep" and pipe here. We had been bitten by different shell behaviors and some arch has different shells ...etc. I think it is ok to take this set as is if you (and Gelang?) are ok to followup a "cgroup/getsockopt" way to inspect the subflow as the very next patch to the mptcp selftest. It seems inspecting subflow will be a common test going forward for mptcp, so it will be beneficial to have a "cgroup/getsockopt" way to inspect the subflow directly. Take a look at a recent example [0]. The mptcp test is under a cgroup already and has the cgroup setup. An extra "cgroup/getsockopt" prog should be enough. That prog can walk the msk->conn_list and use bpf_rdonly_cast (or the bpf_core_cast macro in libbpf) to cast a pointer to tcp_sock for readonly. It will allow to inspect all the fields in a tcp_sock. Something needs to a fix in patch 2(replied separately), so a re-spin is needed. pw-bot: cr [0]: https://lore.kernel.org/all/20240808150558.1035626-3-alan.maguire@oracle.com/
Hi Martin, Thank you for your reply! On 15/08/2024 00:37, Martin KaFai Lau wrote: > On 8/14/24 3:04 AM, Matthieu Baerts wrote: >> Hi Martin, >> >> Thank you for your reply! >> >> On 14/08/2024 03:12, Martin KaFai Lau wrote: >>> On 8/5/24 2:52 AM, Matthieu Baerts (NGI0) wrote: >>>> +static int endpoint_init(char *flags) >>>> +{ >>>> + SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", >>>> NS_TEST); >>>> + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1); >>>> + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); >>>> + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2); >>>> + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); >>>> + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, >>>> ADDR_2, flags)) { >>>> + printf("'ip mptcp' not supported, skip this test.\n"); >>>> + test__skip(); >>> >>> It is always a skip now in bpf CI: >>> >>> #171/3 mptcp/subflow:SKIP >>> >>> This test is a useful addition for the bpf CI selftest. >>> >>> It can't catch regression if it is always a skip in bpf CI though. >> >> Indeed, for the moment, this test is skipped in bpf CI. >> >> The MPTCP CI checks the MPTCP BPF selftests that are on top of net and >> net-next at least once a day. It is always running with the last stable >> version of iproute2, so this test is not skipped: >> >> #169/3 mptcp/subflow:OK >> >> https://github.com/multipath-tcp/mptcp_net-next/actions/ >> runs/10384566794/job/28751869426#step:7:11080 >> >>> iproute2 needs to be updated (cc: Daniel Xu and Manu, the outdated >>> iproute2 is something that came up multiple times). >>> >>> Not sure when the iproute2 can be updated. In the mean time, your v3 is >>> pretty close to getting pm_nl_ctl compiled. Is there other blocker on >>> this? >> >> I will try to find some time to check the modifications I suggested in >> the v3, but I don't know how long it will take to have them ready, as >> they might require some adaptations of the CI side as well, I need to >> check. On the other hand, I understood adding a duplicated version of >> the mptcp.h UAPI header is not an option either. >> >> So not to block this (already old) series, I thought it would help to >> first focus on this version using 'ip mptcp', while I'm looking at the >> selftests modifications. If these modifications are successful, I can >> always resend the patch 2/3 from the v3 later, and using 'pm_nl_ctl' >> instead of 'ip mptcp', to be able to work with IPRoute2 5.5. >> >> Do you think that could work like that? > > If there is CI started covering it, staying with the 'ip mptcp' is fine. > > The bpf CI has to start testing it asap also. The iproute2 package will > need to be updated on the bpf CI side. I think this has to be done > regardless. > > It will be useful to avoid the uapi header dup on its own. The last one > you have seems pretty close. Thank you. Yes I will try to find time to look at that. >>>> + goto fail; >>>> + } >>>> + >>>> + return 0; >>>> +fail: >>>> + return -1; >>>> +} >>>> + >>>> +static int _ss_search(char *src, char *dst, char *port, char *keyword) >>>> +{ >>>> + return SYS_NOFAIL("ip netns exec %s ss -enita src %s dst %s %s %d >>>> | grep -q '%s'", >>>> + NS_TEST, src, dst, port, PORT_1, keyword); >>>> +} >>>> + >>>> +static int ss_search(char *src, char *keyword) >>>> +{ >>>> + return _ss_search(src, ADDR_1, "dport", keyword); >>>> +} >>>> + >>>> +static void run_subflow(char *new) >>>> +{ >>>> + int server_fd, client_fd, err; >>>> + char cc[TCP_CA_NAME_MAX]; >>>> + socklen_t len = sizeof(cc); >>>> + >>>> + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0); >>>> + if (!ASSERT_GE(server_fd, 0, "start_mptcp_server")) >>>> + return; >>>> + >>>> + client_fd = connect_to_fd(server_fd, 0); >>>> + if (!ASSERT_GE(client_fd, 0, "connect to fd")) >>>> + goto fail; >>>> + >>>> + err = getsockopt(server_fd, SOL_TCP, TCP_CONGESTION, cc, &len); >>>> + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) >>>> + goto fail; >>>> + >>>> + send_byte(client_fd); >>>> + >>>> + ASSERT_OK(ss_search(ADDR_1, "fwmark:0x1"), "ss_search >>>> fwmark:0x1"); >>>> + ASSERT_OK(ss_search(ADDR_2, "fwmark:0x2"), "ss_search >>>> fwmark:0x2"); >>>> + ASSERT_OK(ss_search(ADDR_1, new), "ss_search new cc"); >>>> + ASSERT_OK(ss_search(ADDR_2, cc), "ss_search default cc"); >>> >>> Is there a getsockopt way instead of ss + grep? >> >> No there isn't: from the userspace, the app communicates with the MPTCP >> socket, which can have multiple paths (subflows, a TCP socket). To keep >> the compatibility with TCP, [gs]etsockopt() will look at/modify the >> whole MPTCP connection. For example, in some cases, a setsockopt() will >> propagate the option to all the subflows. Depending on the option, the >> modification might only apply to the first subflow, or to the >> user-facing socket. >> >> For advanced users who want to have different options set to the >> different subflows of an MPTCP connection, they can use BPF: that's what >> is being validated here. In other words, doing a 'getsockopt()' from the >> userspace program here will not show all the different marks and TCP CC >> that can be set per subflow with BPF. We can see that in the test: a >> getsockopt() is done on the MPTCP socket to retrieve the default TCP CC >> ('cc' which is certainly 'cubic'), but we expect to find another one >> ('new' which is 'reno'), set by the BPF program from patch 1/2. I guess >> we could use bpf to do a getsockopt() per subflow, but that's seems a >> bit cheated to have the BPF test program setting something and checking >> if it is set. Here, it is an external way. Because it is done from a > > I think the result is valid by having a bpf prog to inspect the value of > a sock. Inspecting socket is an existing use case. There are many > existing bpf tests covering this inspection use case to ensure the > result is legit. A separate cgroup/getsockopt program should help here > (more on this below). I didn't consider a separate program. Indeed, should work. >> dedicated netns, it sounds OK to do that, no? > > Thanks for the explanation. I was hoping there is a way to get to the > underlying subflow fd. It seems impossible. > > In the netns does help here. It is not only about the ss iterating a lot > of connections or not. My preference is not depending on external tool/ > shell-ing if possible, e.g. to avoid the package update discussion like > the iproute2 here. The uapi from the testing kernel is always up-to- > date. ss is another binary but arguably in the same iproute2 package. > There is now another extra "grep" and pipe here. We had been bitten by > different shell behaviors and some arch has different shells ...etc. OK, I thought it was fine to use 'ss | grep' because it is used in other BPF selftests: test_tc_tunnel.sh & test_xdp_features.sh. > I think it is ok to take this set as is if you (and Gelang?) are ok to > followup a "cgroup/getsockopt" way to inspect the subflow as the very > next patch to the mptcp selftest. It seems inspecting subflow will be a > common test going forward for mptcp, so it will be beneficial to have a > "cgroup/getsockopt" way to inspect the subflow directly. > > Take a look at a recent example [0]. The mptcp test is under a cgroup > already and has the cgroup setup. An extra "cgroup/getsockopt" prog > should be enough. That prog can walk the msk->conn_list and use > bpf_rdonly_cast (or the bpf_core_cast macro in libbpf) to cast a pointer > to tcp_sock for readonly. It will allow to inspect all the fields in a > tcp_sock. It looks interesting to be able to inspect all the fields in a tcp_sock! I will check with Geliang what can be done. > Something needs to a fix in patch 2(replied separately), so a re-spin is > needed. Thank you. Will do in the next version! > pw-bot: cr > > [0]: https://lore.kernel.org/all/20240808150558.1035626-3- > alan.maguire@oracle.com/ Cheers, Matt
Hi Martin, Matt, On Wed, 2024-08-14 at 15:37 -0700, Martin KaFai Lau wrote: > On 8/14/24 3:04 AM, Matthieu Baerts wrote: > > Hi Martin, > > > > Thank you for your reply! > > > > On 14/08/2024 03:12, Martin KaFai Lau wrote: > > > On 8/5/24 2:52 AM, Matthieu Baerts (NGI0) wrote: > > > > +static int endpoint_init(char *flags) > > > > +{ > > > > + SYS(fail, "ip -net %s link add veth1 type veth peer name > > > > veth2", > > > > NS_TEST); > > > > + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, > > > > ADDR_1); > > > > + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); > > > > + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, > > > > ADDR_2); > > > > + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); > > > > + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", > > > > NS_TEST, > > > > ADDR_2, flags)) { > > > > + printf("'ip mptcp' not supported, skip this test.\n"); > > > > + test__skip(); > > > > > > It is always a skip now in bpf CI: > > > > > > #171/3 mptcp/subflow:SKIP > > > > > > This test is a useful addition for the bpf CI selftest. > > > > > > It can't catch regression if it is always a skip in bpf CI > > > though. > > > > Indeed, for the moment, this test is skipped in bpf CI. > > > > The MPTCP CI checks the MPTCP BPF selftests that are on top of net > > and > > net-next at least once a day. It is always running with the last > > stable > > version of iproute2, so this test is not skipped: > > > > #169/3 mptcp/subflow:OK > > > > https://github.com/multipath-tcp/mptcp_net-next/actions/runs/10384566794/job/28751869426#step:7:11080 > > > > > iproute2 needs to be updated (cc: Daniel Xu and Manu, the > > > outdated > > > iproute2 is something that came up multiple times). > > > > > > Not sure when the iproute2 can be updated. In the mean time, your > > > v3 is > > > pretty close to getting pm_nl_ctl compiled. Is there other > > > blocker on this? > > > > I will try to find some time to check the modifications I suggested > > in > > the v3, but I don't know how long it will take to have them ready, > > as > > they might require some adaptations of the CI side as well, I need > > to > > check. On the other hand, I understood adding a duplicated version > > of > > the mptcp.h UAPI header is not an option either. > > > > So not to block this (already old) series, I thought it would help > > to > > first focus on this version using 'ip mptcp', while I'm looking at > > the > > selftests modifications. If these modifications are successful, I > > can > > always resend the patch 2/3 from the v3 later, and using > > 'pm_nl_ctl' > > instead of 'ip mptcp', to be able to work with IPRoute2 5.5. > > > > Do you think that could work like that? > > If there is CI started covering it, staying with the 'ip mptcp' is > fine. > > The bpf CI has to start testing it asap also. The iproute2 package > will need to > be updated on the bpf CI side. I think this has to be done > regardless. > > It will be useful to avoid the uapi header dup on its own. The last > one you have > seems pretty close. > > > > > > > + goto fail; > > > > + } > > > > + > > > > + return 0; > > > > +fail: > > > > + return -1; > > > > +} > > > > + > > > > +static int _ss_search(char *src, char *dst, char *port, char > > > > *keyword) > > > > +{ > > > > + return SYS_NOFAIL("ip netns exec %s ss -enita src %s dst > > > > %s %s %d > > > > > grep -q '%s'", > > > > + NS_TEST, src, dst, port, PORT_1, keyword); > > > > +} > > > > + > > > > +static int ss_search(char *src, char *keyword) > > > > +{ > > > > + return _ss_search(src, ADDR_1, "dport", keyword); > > > > +} > > > > + > > > > +static void run_subflow(char *new) > > > > +{ > > > > + int server_fd, client_fd, err; > > > > + char cc[TCP_CA_NAME_MAX]; > > > > + socklen_t len = sizeof(cc); > > > > + > > > > + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, > > > > 0); > > > > + if (!ASSERT_GE(server_fd, 0, "start_mptcp_server")) > > > > + return; > > > > + > > > > + client_fd = connect_to_fd(server_fd, 0); > > > > + if (!ASSERT_GE(client_fd, 0, "connect to fd")) > > > > + goto fail; > > > > + > > > > + err = getsockopt(server_fd, SOL_TCP, TCP_CONGESTION, cc, > > > > &len); > > > > + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) > > > > + goto fail; > > > > + > > > > + send_byte(client_fd); > > > > + > > > > + ASSERT_OK(ss_search(ADDR_1, "fwmark:0x1"), "ss_search > > > > fwmark:0x1"); > > > > + ASSERT_OK(ss_search(ADDR_2, "fwmark:0x2"), "ss_search > > > > fwmark:0x2"); > > > > + ASSERT_OK(ss_search(ADDR_1, new), "ss_search new cc"); > > > > + ASSERT_OK(ss_search(ADDR_2, cc), "ss_search default cc"); > > > > > > Is there a getsockopt way instead of ss + grep? > > > > No there isn't: from the userspace, the app communicates with the > > MPTCP > > socket, which can have multiple paths (subflows, a TCP socket). To > > keep > > the compatibility with TCP, [gs]etsockopt() will look at/modify the > > whole MPTCP connection. For example, in some cases, a setsockopt() > > will > > propagate the option to all the subflows. Depending on the option, > > the > > modification might only apply to the first subflow, or to the > > user-facing socket. > > > > For advanced users who want to have different options set to the > > different subflows of an MPTCP connection, they can use BPF: that's > > what > > is being validated here. In other words, doing a 'getsockopt()' > > from the > > userspace program here will not show all the different marks and > > TCP CC > > that can be set per subflow with BPF. We can see that in the test: > > a > > getsockopt() is done on the MPTCP socket to retrieve the default > > TCP CC > > ('cc' which is certainly 'cubic'), but we expect to find another > > one > > ('new' which is 'reno'), set by the BPF program from patch 1/2. I > > guess > > we could use bpf to do a getsockopt() per subflow, but that's seems > > a > > bit cheated to have the BPF test program setting something and > > checking > > if it is set. Here, it is an external way. Because it is done from > > a > > I think the result is valid by having a bpf prog to inspect the value > of a sock. > Inspecting socket is an existing use case. There are many existing > bpf tests > covering this inspection use case to ensure the result is legit. A > separate > cgroup/getsockopt program should help here (more on this below). > > > dedicated netns, it sounds OK to do that, no? > > Thanks for the explanation. I was hoping there is a way to get to the > underlying > subflow fd. It seems impossible. > > In the netns does help here. It is not only about the ss iterating a > lot of > connections or not. My preference is not depending on external > tool/shell-ing if > possible, e.g. to avoid the package update discussion like the > iproute2 here. > The uapi from the testing kernel is always up-to-date. ss is another > binary but > arguably in the same iproute2 package. There is now another extra > "grep" and > pipe here. We had been bitten by different shell behaviors and some > arch has > different shells ...etc. > > I think it is ok to take this set as is if you (and Gelang?) are ok > to followup > a "cgroup/getsockopt" way to inspect the subflow as the very next > patch to the > mptcp selftest. It seems inspecting subflow will be a common test > going forward > for mptcp, so it will be beneficial to have a "cgroup/getsockopt" way > to inspect > the subflow directly. > > Take a look at a recent example [0]. The mptcp test is under a cgroup > already > and has the cgroup setup. An extra "cgroup/getsockopt" prog should be > enough. > That prog can walk the msk->conn_list and use bpf_rdonly_cast (or the I encountered some difficulties while walking the msk->conn_list in BPF. I added mptcp_for_iach_stubflow() and other helpers related to list_dentry into progs/mptcp_bpf.h: static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } #define list_entry(ptr, type, member) \ container_of(ptr, type, member) #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) #define list_entry_is_head(pos, head, member) \ list_is_head(&pos->member, (head)) #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) Then used them in progs/mptcp_subflow.c like this: SEC("cgroup/getsockopt") int _getsockopt(struct bpf_sockopt *ctx) { struct mptcp_sock *msk = bpf_core_cast(ctx->sk, struct mptcp_sock); struct mptcp_subflow_context *subflow; __u32 token = 0; if (!msk || ctx->level != SOL_TCP || ctx->optname != TCP_CONGESTION) return 1; subflow = list_first_entry(&msk->conn_list, struct mptcp_subflow_context, node); token = subflow->token; bpf_trace_printk(fmt, sizeof(fmt), msk, token); return 1; } And got some access errors: ; token = subflow->token; @ mptcp_subflow.c:92 13: (61) r4 = *(u32 *)(r1 +524) access beyond struct list_head at off 524 size 4 How can I resolve these errors? Shouldn't I walk the msk->conn_list like this? Or use bpf_iter instead? Please give me some advice? Thanks, -Geliang > bpf_core_cast macro in libbpf) to cast a pointer to tcp_sock for > readonly. It > will allow to inspect all the fields in a tcp_sock. > > Something needs to a fix in patch 2(replied separately), so a re-spin > is needed. > > pw-bot: cr > > [0]: > https://lore.kernel.org/all/20240808150558.1035626-3-alan.maguire@oracle.com/ > > >
On 8/17/24 7:13 PM, Geliang Tang wrote: >> Take a look at a recent example [0]. The mptcp test is under a cgroup >> already >> and has the cgroup setup. An extra "cgroup/getsockopt" prog should be >> enough. >> That prog can walk the msk->conn_list and use bpf_rdonly_cast (or the >> bpf_core_cast macro in libbpf) to cast a pointer to tcp_sock for [ ... ] > SEC("cgroup/getsockopt") > int _getsockopt(struct bpf_sockopt *ctx) > { > struct mptcp_sock *msk = bpf_core_cast(ctx->sk, struct > mptcp_sock); > struct mptcp_subflow_context *subflow; > __u32 token = 0; > > if (!msk || ctx->level != SOL_TCP || > ctx->optname != TCP_CONGESTION) > return 1; > > subflow = list_first_entry(&msk->conn_list, > struct mptcp_subflow_context, node); > token = subflow->token; > bpf_trace_printk(fmt, sizeof(fmt), msk, token); > > return 1; > } > > And got some access errors: > > ; token = subflow->token; @ mptcp_subflow.c:92 > 13: (61) r4 = *(u32 *)(r1 +524) > access beyond struct list_head at off 524 size 4 Similar to your bpf_core_cast() usage earlier that casts a sock ptr to mptcp_sock ptr. r1 is in list_head ptr type. It needs to cast to mptcp_subflow_context ptr. The same cast to tcp_sock is needed when you try to get the tcp_sock ptr from subflow->tcp_sock which is in "struct sock *" type in the kernel instead of "struct tcp_sock *".
> On Aug 13, 2024, at 6:12 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote: > > > > On 8/5/24 2:52 AM, Matthieu Baerts (NGI0) wrote: >> +static int endpoint_init(char *flags) >> +{ >> + SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", NS_TEST); >> + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1); >> + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); >> + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2); >> + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); >> + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, ADDR_2, flags)) { >> + printf("'ip mptcp' not supported, skip this test.\n"); >> + test__skip(); > > It is always a skip now in bpf CI: > > #171/3 mptcp/subflow:SKIP > > This test is a useful addition for the bpf CI selftest. > > It can't catch regression if it is always a skip in bpf CI though. > > iproute2 needs to be updated (cc: Daniel Xu and Manu, the outdated iproute2 is something that came up multiple times). > > Not sure when the iproute2 can be updated. In the mean time, your v3 is pretty close to getting pm_nl_ctl compiled. Is there other blocker on this? I have updated runners to Ubuntu 24.04 which comes with: root@1fdd5d75581b:/actions-runner# ip --json -V ip utility, iproute2-6.1.0, libbpf 1.3.0 root@1fdd5d75581b:/actions-runner# ip mptcp help Usage: ip mptcp endpoint add ADDRESS [ dev NAME ] [ id ID ] [ port NR ] [ FLAG-LIST ] ip mptcp endpoint delete id ID [ ADDRESS ] ip mptcp endpoint change [ id ID ] [ ADDRESS ] [ port NR ] CHANGE-OPT ip mptcp endpoint show [ id ID ] ip mptcp endpoint flush ip mptcp limits set [ subflows NR ] [ add_addr_accepted NR ] ip mptcp limits show ip mptcp monitor FLAG-LIST := [ FLAG-LIST ] FLAG FLAG := [ signal | subflow | backup | fullmesh ] CHANGE-OPT := [ backup | nobackup | fullmesh | nofullmesh ] Assuming I don’t need to revert back to old runners due to unrelated issue, you should now have `ip mptcp` available. Manu > >> + goto fail; >> + } >> + >> + return 0; >> +fail: >> + return -1; >> +} >> + >> +static int _ss_search(char *src, char *dst, char *port, char *keyword) >> +{ >> + return SYS_NOFAIL("ip netns exec %s ss -enita src %s dst %s %s %d | grep -q '%s'", >> + NS_TEST, src, dst, port, PORT_1, keyword); >> +} >> + >> +static int ss_search(char *src, char *keyword) >> +{ >> + return _ss_search(src, ADDR_1, "dport", keyword); >> +} >> + >> +static void run_subflow(char *new) >> +{ >> + int server_fd, client_fd, err; >> + char cc[TCP_CA_NAME_MAX]; >> + socklen_t len = sizeof(cc); >> + >> + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0); >> + if (!ASSERT_GE(server_fd, 0, "start_mptcp_server")) >> + return; >> + >> + client_fd = connect_to_fd(server_fd, 0); >> + if (!ASSERT_GE(client_fd, 0, "connect to fd")) >> + goto fail; >> + >> + err = getsockopt(server_fd, SOL_TCP, TCP_CONGESTION, cc, &len); >> + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) >> + goto fail; >> + >> + send_byte(client_fd); >> + >> + ASSERT_OK(ss_search(ADDR_1, "fwmark:0x1"), "ss_search fwmark:0x1"); >> + ASSERT_OK(ss_search(ADDR_2, "fwmark:0x2"), "ss_search fwmark:0x2"); >> + ASSERT_OK(ss_search(ADDR_1, new), "ss_search new cc"); >> + ASSERT_OK(ss_search(ADDR_2, cc), "ss_search default cc"); > > Is there a getsockopt way instead of ss + grep? >
Hi Manu, On 21/08/2024 22:32, Manu Bretelle wrote: > > >> On Aug 13, 2024, at 6:12 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >>> >> On 8/5/24 2:52 AM, Matthieu Baerts (NGI0) wrote: >>> +static int endpoint_init(char *flags) >>> +{ >>> + SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", NS_TEST); >>> + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1); >>> + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); >>> + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2); >>> + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); >>> + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, ADDR_2, flags)) { >>> + printf("'ip mptcp' not supported, skip this test.\n"); >>> + test__skip(); >> >> It is always a skip now in bpf CI: >> >> #171/3 mptcp/subflow:SKIP >> >> This test is a useful addition for the bpf CI selftest. >> >> It can't catch regression if it is always a skip in bpf CI though. >> >> iproute2 needs to be updated (cc: Daniel Xu and Manu, the outdated iproute2 is something that came up multiple times). >> >> Not sure when the iproute2 can be updated. In the mean time, your v3 is pretty close to getting pm_nl_ctl compiled. Is there other blocker on this? > > I have updated runners to Ubuntu 24.04 which comes with: > root@1fdd5d75581b:/actions-runner# ip --json -V > ip utility, iproute2-6.1.0, libbpf 1.3.0 > root@1fdd5d75581b:/actions-runner# ip mptcp help > Usage: ip mptcp endpoint add ADDRESS [ dev NAME ] [ id ID ] > [ port NR ] [ FLAG-LIST ] > ip mptcp endpoint delete id ID [ ADDRESS ] > ip mptcp endpoint change [ id ID ] [ ADDRESS ] [ port NR ] CHANGE-OPT > ip mptcp endpoint show [ id ID ] > ip mptcp endpoint flush > ip mptcp limits set [ subflows NR ] [ add_addr_accepted NR ] > ip mptcp limits show > ip mptcp monitor > FLAG-LIST := [ FLAG-LIST ] FLAG > FLAG := [ signal | subflow | backup | fullmesh ] > CHANGE-OPT := [ backup | nobackup | fullmesh | nofullmesh ] > > Assuming I don’t need to revert back to old runners due to unrelated issue, you should now have `ip mptcp` available. Great, thank you for this update, that will ease the inclusion of this series! (That's a shame Ubuntu 24.04 didn't come with IPRoute 6.8, same version as their default kernel [1]... but that's not blocking us here) [1] https://bugs.launchpad.net/ubuntu/+source/iproute2/+bug/2051672 Cheers, Matt
diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index d2ca32fa3b21..d06be03cc0f0 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -9,8 +9,12 @@ #include "network_helpers.h" #include "mptcp_sock.skel.h" #include "mptcpify.skel.h" +#include "mptcp_subflow.skel.h" #define NS_TEST "mptcp_ns" +#define ADDR_1 "10.0.1.1" +#define ADDR_2 "10.0.1.2" +#define PORT_1 10001 #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 @@ -335,10 +339,111 @@ static void test_mptcpify(void) close(cgroup_fd); } +static int endpoint_init(char *flags) +{ + SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", NS_TEST); + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1); + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2); + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, ADDR_2, flags)) { + printf("'ip mptcp' not supported, skip this test.\n"); + test__skip(); + goto fail; + } + + return 0; +fail: + return -1; +} + +static int _ss_search(char *src, char *dst, char *port, char *keyword) +{ + return SYS_NOFAIL("ip netns exec %s ss -enita src %s dst %s %s %d | grep -q '%s'", + NS_TEST, src, dst, port, PORT_1, keyword); +} + +static int ss_search(char *src, char *keyword) +{ + return _ss_search(src, ADDR_1, "dport", keyword); +} + +static void run_subflow(char *new) +{ + int server_fd, client_fd, err; + char cc[TCP_CA_NAME_MAX]; + socklen_t len = sizeof(cc); + + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0); + if (!ASSERT_GE(server_fd, 0, "start_mptcp_server")) + return; + + client_fd = connect_to_fd(server_fd, 0); + if (!ASSERT_GE(client_fd, 0, "connect to fd")) + goto fail; + + err = getsockopt(server_fd, SOL_TCP, TCP_CONGESTION, cc, &len); + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) + goto fail; + + send_byte(client_fd); + + ASSERT_OK(ss_search(ADDR_1, "fwmark:0x1"), "ss_search fwmark:0x1"); + ASSERT_OK(ss_search(ADDR_2, "fwmark:0x2"), "ss_search fwmark:0x2"); + ASSERT_OK(ss_search(ADDR_1, new), "ss_search new cc"); + ASSERT_OK(ss_search(ADDR_2, cc), "ss_search default cc"); + + close(client_fd); +fail: + close(server_fd); +} + +static void test_subflow(void) +{ + int cgroup_fd, prog_fd, err; + struct mptcp_subflow *skel; + struct nstoken *nstoken; + + cgroup_fd = test__join_cgroup("/mptcp_subflow"); + if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup: mptcp_subflow")) + return; + + skel = mptcp_subflow__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_subflow")) + goto close_cgroup; + + err = mptcp_subflow__attach(skel); + if (!ASSERT_OK(err, "skel_attach: mptcp_subflow")) + goto skel_destroy; + + prog_fd = bpf_program__fd(skel->progs.mptcp_subflow); + err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_SOCK_OPS, 0); + if (!ASSERT_OK(err, "prog_attach")) + goto skel_destroy; + + nstoken = create_netns(); + if (!ASSERT_OK_PTR(nstoken, "create_netns: mptcp_subflow")) + goto skel_destroy; + + if (endpoint_init("subflow") < 0) + goto close_netns; + + run_subflow(skel->data->cc); + +close_netns: + cleanup_netns(nstoken); +skel_destroy: + mptcp_subflow__destroy(skel); +close_cgroup: + close(cgroup_fd); +} + void test_mptcp(void) { if (test__start_subtest("base")) test_base(); if (test__start_subtest("mptcpify")) test_mptcpify(); + if (test__start_subtest("subflow")) + test_subflow(); }