diff mbox series

[bpf-next,v4,10/11] selftests/bpf: Add TX side to xdp_hw_metadata

Message ID 20231019174944.3376335-11-sdf@google.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series xsk: TX metadata | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 9 this patch: 9
netdev/cc_maintainers warning 5 maintainers not CCed: mykolal@fb.com yonghong.song@linux.dev davem@davemloft.net shuah@kernel.org linux-kselftest@vger.kernel.org
netdev/build_clang success Errors and warnings before: 9 this patch: 9
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 9 this patch: 9
netdev/checkpatch warning CHECK: Macro argument 'len' may be better as '(len)' to avoid precedence issues CHECK: spaces preferred around that '*' (ctx:VxV) WARNING: Too many leading tabs - consider code refactoring WARNING: Unknown commit id 'c3c9abc1d0c989e0be21d78cccd99076cc94ec44', maybe rebased or not pulled? WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 95 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for veristat

Commit Message

Stanislav Fomichev Oct. 19, 2023, 5:49 p.m. UTC
When we get a packet on port 9091, we swap src/dst and send it out.
At this point we also request the timestamp and checksum offloads.

Checksum offload is verified by looking at the tcpdump on the other side.
The tool prints pseudo-header csum and the final one it expects.
The final checksum actually matches the incoming packets checksum
because we only flip the src/dst and don't change the payload.

Some other related changes:
- switched to zerocopy mode by default; new flag can be used to force
  old behavior
- request fixed tx_metadata_len headroom
- some other small fixes (umem size, fill idx+i, etc)

mvbz3:~# ./xdp_hw_metadata eth3
...
xsk_ring_cons__peek: 1
0x19546f8: rx_desc[0]->addr=80100 addr=80100 comp_addr=80100
rx_hash: 0x80B7EA8B with RSS type:0x2A
rx_timestamp:  1697580171852147395 (sec:1697580171.8521)
HW RX-time:   1697580171852147395 (sec:1697580171.8521), delta to User RX-time sec:0.2797 (279673.082 usec)
XDP RX-time:   1697580172131699047 (sec:1697580172.1317), delta to User RX-time sec:0.0001 (121.430 usec)
0x19546f8: ping-pong with csum=3b8e (want d862) csum_start=54 csum_offset=6
0x19546f8: complete tx idx=0 addr=8
tx_timestamp:  1697580172056756493 (sec:1697580172.0568)
HW TX-complete-time:   1697580172056756493 (sec:1697580172.0568), delta to User TX-complete-time sec:0.0852 (85175.537 usec)
XDP RX-time:   1697580172131699047 (sec:1697580172.1317), delta to User TX-complete-time sec:0.0102 (10232.983 usec)
HW RX-time:   1697580171852147395 (sec:1697580171.8521), delta to HW TX-complete-time sec:0.2046 (204609.098 usec)
0x19546f8: complete rx idx=128 addr=80100

mvbz4:~# nc  -Nu -q1 ${MVBZ3_LINK_LOCAL_IP}%eth3 9091

mvbz4:~# tcpdump -vvx -i eth3 udp
        tcpdump: listening on eth3, link-type EN10MB (Ethernet), snapshot length 262144 bytes
12:26:09.301074 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1087.55807 > fe80::1270:fdff:fe48:1077.9091: [bad udp cksum 0x3b8e -> 0xde7e!] UDP, length 3
        0x0000:  6003 5fa5 000b 117f fe80 0000 0000 0000
        0x0010:  1270 fdff fe48 1087 fe80 0000 0000 0000
        0x0020:  1270 fdff fe48 1077 d9ff 2383 000b 3b8e
        0x0030:  7864 70
12:26:09.301976 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1077.9091 > fe80::1270:fdff:fe48:1087.55807: [udp sum ok] UDP, length 3
        0x0000:  6003 5fa5 000b 117f fe80 0000 0000 0000
        0x0010:  1270 fdff fe48 1077 fe80 0000 0000 0000
        0x0020:  1270 fdff fe48 1087 2383 d9ff 000b de7e
        0x0030:  7864 70

This reverts commit c3c9abc1d0c989e0be21d78cccd99076cc94ec44.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/testing/selftests/bpf/xdp_hw_metadata.c | 159 +++++++++++++++++-
 1 file changed, 157 insertions(+), 2 deletions(-)

Comments

Song Yoong Siang Oct. 24, 2023, 2:19 a.m. UTC | #1
On Friday, October 20, 2023 1:50 AM Stanislav Fomichev <sdf@google.com> wrote:
>When we get a packet on port 9091, we swap src/dst and send it out.
>At this point we also request the timestamp and checksum offloads.
>
>Checksum offload is verified by looking at the tcpdump on the other side.
>The tool prints pseudo-header csum and the final one it expects.
>The final checksum actually matches the incoming packets checksum
>because we only flip the src/dst and don't change the payload.
>
>Some other related changes:
>- switched to zerocopy mode by default; new flag can be used to force
>  old behavior
>- request fixed tx_metadata_len headroom
>- some other small fixes (umem size, fill idx+i, etc)
>
>mvbz3:~# ./xdp_hw_metadata eth3
>...
>xsk_ring_cons__peek: 1
>0x19546f8: rx_desc[0]->addr=80100 addr=80100 comp_addr=80100
>rx_hash: 0x80B7EA8B with RSS type:0x2A
>rx_timestamp:  1697580171852147395 (sec:1697580171.8521)
>HW RX-time:   1697580171852147395 (sec:1697580171.8521), delta to User RX-time sec:0.2797 (279673.082 usec)
>XDP RX-time:   1697580172131699047 (sec:1697580172.1317), delta to User RX-time sec:0.0001 (121.430 usec)
>0x19546f8: ping-pong with csum=3b8e (want d862) csum_start=54 csum_offset=6
>0x19546f8: complete tx idx=0 addr=8
>tx_timestamp:  1697580172056756493 (sec:1697580172.0568)

Hi Stanislav,

rx_timestamp is duplicating HW RX-time while tx_timestamp is duplicating HW TX-complete-time,
so, I think can remove printing of rx_timestamp and tx_timestamp to avoid confusion.

>HW TX-complete-time:   1697580172056756493 (sec:1697580172.0568), delta to User TX-complete-time sec:0.0852 (85175.537 usec)
>XDP RX-time:   1697580172131699047 (sec:1697580172.1317), delta to User TX-complete-time sec:0.0102 (10232.983 usec)
>HW RX-time:   1697580171852147395 (sec:1697580171.8521), delta to HW TX-complete-time sec:0.2046 (204609.098 usec)
>0x19546f8: complete rx idx=128 addr=80100
>
>mvbz4:~# nc  -Nu -q1 ${MVBZ3_LINK_LOCAL_IP}%eth3 9091
>
>mvbz4:~# tcpdump -vvx -i eth3 udp
>        tcpdump: listening on eth3, link-type EN10MB (Ethernet), snapshot length 262144
>bytes
>12:26:09.301074 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload
>length: 11) fe80::1270:fdff:fe48:1087.55807 > fe80::1270:fdff:fe48:1077.9091: [bad
>udp cksum 0x3b8e -> 0xde7e!] UDP, length 3
>        0x0000:  6003 5fa5 000b 117f fe80 0000 0000 0000
>        0x0010:  1270 fdff fe48 1087 fe80 0000 0000 0000
>        0x0020:  1270 fdff fe48 1077 d9ff 2383 000b 3b8e
>        0x0030:  7864 70
>12:26:09.301976 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload
>length: 11) fe80::1270:fdff:fe48:1077.9091 > fe80::1270:fdff:fe48:1087.55807: [udp
>sum ok] UDP, length 3
>        0x0000:  6003 5fa5 000b 117f fe80 0000 0000 0000
>        0x0010:  1270 fdff fe48 1077 fe80 0000 0000 0000
>        0x0020:  1270 fdff fe48 1087 2383 d9ff 000b de7e
>        0x0030:  7864 70
>
>This reverts commit c3c9abc1d0c989e0be21d78cccd99076cc94ec44.

It didn't looked like this patch is reverting something.
If this is not a mistake, can you add the commit title behind the ID?

Thanks & Regards
Siang

>
>Signed-off-by: Stanislav Fomichev <sdf@google.com>
>---
> tools/testing/selftests/bpf/xdp_hw_metadata.c | 159 +++++++++++++++++-
> 1 file changed, 157 insertions(+), 2 deletions(-)
>
>diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c
>b/tools/testing/selftests/bpf/xdp_hw_metadata.c
>index 057f7c145f62..d9421c5889f8 100644
>--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
>+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
>@@ -10,7 +10,9 @@
>  *   - rx_hash
>  *
>  * TX:
>- * - TBD
>+ * - UDP 9091 packets trigger TX reply
>+ * - TX HW timestamp is requested and reported back upon completion
>+ * - TX checksum is requested
>  */
>
> #include <test_progs.h>
>@@ -24,11 +26,14 @@
> #include <linux/net_tstamp.h>
> #include <linux/udp.h>
> #include <linux/sockios.h>
>+#include <linux/if_xdp.h>
> #include <sys/mman.h>
> #include <net/if.h>
> #include <ctype.h>
> #include <poll.h>
> #include <time.h>
>+#include <unistd.h>
>+#include <libgen.h>
>
> #include "xdp_metadata.h"
>
>@@ -53,6 +58,9 @@ struct xsk *rx_xsk;
> const char *ifname;
> int ifindex;
> int rxq;
>+bool skip_tx;
>+__u64 last_hw_rx_timestamp;
>+__u64 last_xdp_rx_timestamp;
>
> void test__fail(void) { /* for network_helpers.c */ }
>
>@@ -69,6 +77,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
> 		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
> 		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
> 		.flags = XSK_UMEM__DEFAULT_FLAGS,
>+		.tx_metadata_len = sizeof(struct xsk_tx_metadata),
> 	};
> 	__u32 idx;
> 	u64 addr;
>@@ -190,6 +199,10 @@ static void verify_xdp_metadata(void *data, clockid_t
>clock_id)
> 	if (meta->rx_timestamp) {
> 		__u64 ref_tstamp = gettime(clock_id);
>
>+		/* store received timestamps to calculate a delta at tx */
>+		last_hw_rx_timestamp = meta->rx_timestamp;
>+		last_xdp_rx_timestamp = meta->xdp_timestamp;
>+
> 		print_tstamp_delta("HW RX-time", "User RX-time",
> 				   meta->rx_timestamp, ref_tstamp);
> 		print_tstamp_delta("XDP RX-time", "User RX-time",
>@@ -242,6 +255,128 @@ static void verify_skb_metadata(int fd)
> 	printf("skb hwtstamp is not found!\n");
> }
>
>+static bool complete_tx(struct xsk *xsk, clockid_t clock_id)
>+{
>+	struct xsk_tx_metadata *meta;
>+	__u64 addr;
>+	void *data;
>+	__u32 idx;
>+
>+	if (!xsk_ring_cons__peek(&xsk->comp, 1, &idx))
>+		return false;
>+
>+	addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
>+	data = xsk_umem__get_data(xsk->umem_area, addr);
>+	meta = data - sizeof(struct xsk_tx_metadata);
>+
>+	printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
>+
>+	printf("tx_timestamp:  %llu (sec:%0.4f)\n", meta->completion.tx_timestamp,
>+	       (double)meta->completion.tx_timestamp / NANOSEC_PER_SEC);
>+	if (meta->completion.tx_timestamp) {
>+		__u64 ref_tstamp = gettime(clock_id);
>+
>+		print_tstamp_delta("HW TX-complete-time", "User TX-complete-
>time",
>+				   meta->completion.tx_timestamp, ref_tstamp);
>+		print_tstamp_delta("XDP RX-time", "User TX-complete-time",
>+				   last_xdp_rx_timestamp, ref_tstamp);
>+		print_tstamp_delta("HW RX-time", "HW TX-complete-time",
>+				   last_hw_rx_timestamp, meta-
>>completion.tx_timestamp);
>+	}
>+
>+	xsk_ring_cons__release(&xsk->comp, 1);
>+
>+	return true;
>+}
>+
>+#define swap(a, b, len) do { \
>+	for (int i = 0; i < len; i++) { \
>+		__u8 tmp = ((__u8 *)a)[i]; \
>+		((__u8 *)a)[i] = ((__u8 *)b)[i]; \
>+		((__u8 *)b)[i] = tmp; \
>+	} \
>+} while (0)
>+
>+static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id)
>+{
>+	struct xsk_tx_metadata *meta;
>+	struct ipv6hdr *ip6h = NULL;
>+	struct iphdr *iph = NULL;
>+	struct xdp_desc *tx_desc;
>+	struct udphdr *udph;
>+	struct ethhdr *eth;
>+	__sum16 want_csum;
>+	void *data;
>+	__u32 idx;
>+	int ret;
>+	int len;
>+
>+	ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
>+	if (ret != 1) {
>+		printf("%p: failed to reserve tx slot\n", xsk);
>+		return;
>+	}
>+
>+	tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
>+	tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE +
>sizeof(struct xsk_tx_metadata);
>+	data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
>+
>+	meta = data - sizeof(struct xsk_tx_metadata);
>+	memset(meta, 0, sizeof(*meta));
>+	meta->flags = XDP_TX_METADATA_TIMESTAMP;
>+
>+	eth = rx_packet;
>+
>+	if (eth->h_proto == htons(ETH_P_IP)) {
>+		iph = (void *)(eth + 1);
>+		udph = (void *)(iph + 1);
>+	} else if (eth->h_proto == htons(ETH_P_IPV6)) {
>+		ip6h = (void *)(eth + 1);
>+		udph = (void *)(ip6h + 1);
>+	} else {
>+		printf("%p: failed to detect IP version for ping pong %04x\n", xsk, eth-
>>h_proto);
>+		xsk_ring_prod__cancel(&xsk->tx, 1);
>+		return;
>+	}
>+
>+	len = ETH_HLEN;
>+	if (ip6h)
>+		len += sizeof(*ip6h) + ntohs(ip6h->payload_len);
>+	if (iph)
>+		len += ntohs(iph->tot_len);
>+
>+	swap(eth->h_dest, eth->h_source, ETH_ALEN);
>+	if (iph)
>+		swap(&iph->saddr, &iph->daddr, 4);
>+	else
>+		swap(&ip6h->saddr, &ip6h->daddr, 16);
>+	swap(&udph->source, &udph->dest, 2);
>+
>+	want_csum = udph->check;
>+	if (ip6h)
>+		udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
>+					       ntohs(udph->len), IPPROTO_UDP, 0);
>+	else
>+		udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
>+						 ntohs(udph->len), IPPROTO_UDP, 0);
>+
>+	meta->flags |= XDP_TX_METADATA_CHECKSUM;
>+	if (iph)
>+		meta->csum_start = sizeof(*eth) + sizeof(*iph);
>+	else
>+		meta->csum_start = sizeof(*eth) + sizeof(*ip6h);
>+	meta->csum_offset = offsetof(struct udphdr, check);
>+
>+	printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d
>csum_offset=%d\n",
>+	       xsk, ntohs(udph->check), ntohs(want_csum), meta->csum_start, meta-
>>csum_offset);
>+
>+	memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */
>+	tx_desc->options |= XDP_TX_METADATA;
>+	tx_desc->len = len;
>+
>+	xsk_ring_prod__submit(&xsk->tx, 1);
>+}
>+
> static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id)
> {
> 	const struct xdp_desc *rx_desc;
>@@ -307,6 +442,22 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int
>server_fd, clockid_t
> 				verify_xdp_metadata(xsk_umem__get_data(xsk-
>>umem_area, addr),
> 						    clock_id);
> 				first_seg = false;
>+
>+				if (!skip_tx) {
>+					/* mirror first chunk back */
>+					ping_pong(xsk, xsk_umem__get_data(xsk-
>>umem_area, addr),
>+						  clock_id);
>+
>+					ret = kick_tx(xsk);
>+					if (ret)
>+						printf("kick_tx ret=%d\n", ret);
>+
>+					for (int j = 0; j < 500; j++) {
>+						if (complete_tx(xsk, clock_id))
>+							break;
>+						usleep(10*1000);
>+					}
>+				}
> 			}
>
> 			xsk_ring_cons__release(&xsk->rx, 1);
>@@ -442,6 +593,7 @@ static void print_usage(void)
> 		"  -c    Run in copy mode (zerocopy is default)\n"
> 		"  -h    Display this help and exit\n\n"
> 		"  -m    Enable multi-buffer XDP for larger MTU\n"
>+		"  -r    Don't generate AF_XDP reply (rx metadata only)\n"
> 		"Generate test packets on the other machine with:\n"
> 		"  echo -n xdp | nc -u -q1 <dst_ip> 9091\n";
>
>@@ -452,7 +604,7 @@ static void read_args(int argc, char *argv[])
> {
> 	char opt;
>
>-	while ((opt = getopt(argc, argv, "chm")) != -1) {
>+	while ((opt = getopt(argc, argv, "chmr")) != -1) {
> 		switch (opt) {
> 		case 'c':
> 			bind_flags &= ~XDP_USE_NEED_WAKEUP;
>@@ -465,6 +617,9 @@ static void read_args(int argc, char *argv[])
> 		case 'm':
> 			bind_flags |= XDP_USE_SG;
> 			break;
>+		case 'r':
>+			skip_tx = true;
>+			break;
> 		case '?':
> 			if (isprint(optopt))
> 				fprintf(stderr, "Unknown option: -%c\n", optopt);
>--
>2.42.0.655.g421f12c284-goog
Stanislav Fomichev Oct. 24, 2023, 4:41 p.m. UTC | #2
On Mon, Oct 23, 2023 at 7:19 PM Song, Yoong Siang
<yoong.siang.song@intel.com> wrote:
>
> On Friday, October 20, 2023 1:50 AM Stanislav Fomichev <sdf@google.com> wrote:
> >When we get a packet on port 9091, we swap src/dst and send it out.
> >At this point we also request the timestamp and checksum offloads.
> >
> >Checksum offload is verified by looking at the tcpdump on the other side.
> >The tool prints pseudo-header csum and the final one it expects.
> >The final checksum actually matches the incoming packets checksum
> >because we only flip the src/dst and don't change the payload.
> >
> >Some other related changes:
> >- switched to zerocopy mode by default; new flag can be used to force
> >  old behavior
> >- request fixed tx_metadata_len headroom
> >- some other small fixes (umem size, fill idx+i, etc)
> >
> >mvbz3:~# ./xdp_hw_metadata eth3
> >...
> >xsk_ring_cons__peek: 1
> >0x19546f8: rx_desc[0]->addr=80100 addr=80100 comp_addr=80100
> >rx_hash: 0x80B7EA8B with RSS type:0x2A
> >rx_timestamp:  1697580171852147395 (sec:1697580171.8521)
> >HW RX-time:   1697580171852147395 (sec:1697580171.8521), delta to User RX-time sec:0.2797 (279673.082 usec)
> >XDP RX-time:   1697580172131699047 (sec:1697580172.1317), delta to User RX-time sec:0.0001 (121.430 usec)
> >0x19546f8: ping-pong with csum=3b8e (want d862) csum_start=54 csum_offset=6
> >0x19546f8: complete tx idx=0 addr=8
> >tx_timestamp:  1697580172056756493 (sec:1697580172.0568)
>
> Hi Stanislav,
>
> rx_timestamp is duplicating HW RX-time while tx_timestamp is duplicating HW TX-complete-time,
> so, I think can remove printing of rx_timestamp and tx_timestamp to avoid confusion.

That's fair, I think I'll do the following:

if (meta->rx_timestamp) {
  /* print all those reference points */
} else {
  printf("No rx_timestamp\n");
}

And the same for tx. So at least the users get a signal that the
timestamps weren't set.

> >HW TX-complete-time:   1697580172056756493 (sec:1697580172.0568), delta to User TX-complete-time sec:0.0852 (85175.537 usec)
> >XDP RX-time:   1697580172131699047 (sec:1697580172.1317), delta to User TX-complete-time sec:0.0102 (10232.983 usec)
> >HW RX-time:   1697580171852147395 (sec:1697580171.8521), delta to HW TX-complete-time sec:0.2046 (204609.098 usec)
> >0x19546f8: complete rx idx=128 addr=80100
> >
> >mvbz4:~# nc  -Nu -q1 ${MVBZ3_LINK_LOCAL_IP}%eth3 9091
> >
> >mvbz4:~# tcpdump -vvx -i eth3 udp
> >        tcpdump: listening on eth3, link-type EN10MB (Ethernet), snapshot length 262144
> >bytes
> >12:26:09.301074 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload
> >length: 11) fe80::1270:fdff:fe48:1087.55807 > fe80::1270:fdff:fe48:1077.9091: [bad
> >udp cksum 0x3b8e -> 0xde7e!] UDP, length 3
> >        0x0000:  6003 5fa5 000b 117f fe80 0000 0000 0000
> >        0x0010:  1270 fdff fe48 1087 fe80 0000 0000 0000
> >        0x0020:  1270 fdff fe48 1077 d9ff 2383 000b 3b8e
> >        0x0030:  7864 70
> >12:26:09.301976 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload
> >length: 11) fe80::1270:fdff:fe48:1077.9091 > fe80::1270:fdff:fe48:1087.55807: [udp
> >sum ok] UDP, length 3
> >        0x0000:  6003 5fa5 000b 117f fe80 0000 0000 0000
> >        0x0010:  1270 fdff fe48 1077 fe80 0000 0000 0000
> >        0x0020:  1270 fdff fe48 1087 2383 d9ff 000b de7e
> >        0x0030:  7864 70
> >
> >This reverts commit c3c9abc1d0c989e0be21d78cccd99076cc94ec44.
>
> It didn't looked like this patch is reverting something.
> If this is not a mistake, can you add the commit title behind the ID?

Ah, that's a leftover from my rebasing and reshuffling, will drop, thanks!
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 057f7c145f62..d9421c5889f8 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -10,7 +10,9 @@ 
  *   - rx_hash
  *
  * TX:
- * - TBD
+ * - UDP 9091 packets trigger TX reply
+ * - TX HW timestamp is requested and reported back upon completion
+ * - TX checksum is requested
  */
 
 #include <test_progs.h>
@@ -24,11 +26,14 @@ 
 #include <linux/net_tstamp.h>
 #include <linux/udp.h>
 #include <linux/sockios.h>
+#include <linux/if_xdp.h>
 #include <sys/mman.h>
 #include <net/if.h>
 #include <ctype.h>
 #include <poll.h>
 #include <time.h>
+#include <unistd.h>
+#include <libgen.h>
 
 #include "xdp_metadata.h"
 
@@ -53,6 +58,9 @@  struct xsk *rx_xsk;
 const char *ifname;
 int ifindex;
 int rxq;
+bool skip_tx;
+__u64 last_hw_rx_timestamp;
+__u64 last_xdp_rx_timestamp;
 
 void test__fail(void) { /* for network_helpers.c */ }
 
@@ -69,6 +77,7 @@  static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
 		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
 		.flags = XSK_UMEM__DEFAULT_FLAGS,
+		.tx_metadata_len = sizeof(struct xsk_tx_metadata),
 	};
 	__u32 idx;
 	u64 addr;
@@ -190,6 +199,10 @@  static void verify_xdp_metadata(void *data, clockid_t clock_id)
 	if (meta->rx_timestamp) {
 		__u64 ref_tstamp = gettime(clock_id);
 
+		/* store received timestamps to calculate a delta at tx */
+		last_hw_rx_timestamp = meta->rx_timestamp;
+		last_xdp_rx_timestamp = meta->xdp_timestamp;
+
 		print_tstamp_delta("HW RX-time", "User RX-time",
 				   meta->rx_timestamp, ref_tstamp);
 		print_tstamp_delta("XDP RX-time", "User RX-time",
@@ -242,6 +255,128 @@  static void verify_skb_metadata(int fd)
 	printf("skb hwtstamp is not found!\n");
 }
 
+static bool complete_tx(struct xsk *xsk, clockid_t clock_id)
+{
+	struct xsk_tx_metadata *meta;
+	__u64 addr;
+	void *data;
+	__u32 idx;
+
+	if (!xsk_ring_cons__peek(&xsk->comp, 1, &idx))
+		return false;
+
+	addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
+	data = xsk_umem__get_data(xsk->umem_area, addr);
+	meta = data - sizeof(struct xsk_tx_metadata);
+
+	printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
+
+	printf("tx_timestamp:  %llu (sec:%0.4f)\n", meta->completion.tx_timestamp,
+	       (double)meta->completion.tx_timestamp / NANOSEC_PER_SEC);
+	if (meta->completion.tx_timestamp) {
+		__u64 ref_tstamp = gettime(clock_id);
+
+		print_tstamp_delta("HW TX-complete-time", "User TX-complete-time",
+				   meta->completion.tx_timestamp, ref_tstamp);
+		print_tstamp_delta("XDP RX-time", "User TX-complete-time",
+				   last_xdp_rx_timestamp, ref_tstamp);
+		print_tstamp_delta("HW RX-time", "HW TX-complete-time",
+				   last_hw_rx_timestamp, meta->completion.tx_timestamp);
+	}
+
+	xsk_ring_cons__release(&xsk->comp, 1);
+
+	return true;
+}
+
+#define swap(a, b, len) do { \
+	for (int i = 0; i < len; i++) { \
+		__u8 tmp = ((__u8 *)a)[i]; \
+		((__u8 *)a)[i] = ((__u8 *)b)[i]; \
+		((__u8 *)b)[i] = tmp; \
+	} \
+} while (0)
+
+static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id)
+{
+	struct xsk_tx_metadata *meta;
+	struct ipv6hdr *ip6h = NULL;
+	struct iphdr *iph = NULL;
+	struct xdp_desc *tx_desc;
+	struct udphdr *udph;
+	struct ethhdr *eth;
+	__sum16 want_csum;
+	void *data;
+	__u32 idx;
+	int ret;
+	int len;
+
+	ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
+	if (ret != 1) {
+		printf("%p: failed to reserve tx slot\n", xsk);
+		return;
+	}
+
+	tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
+	tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata);
+	data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
+
+	meta = data - sizeof(struct xsk_tx_metadata);
+	memset(meta, 0, sizeof(*meta));
+	meta->flags = XDP_TX_METADATA_TIMESTAMP;
+
+	eth = rx_packet;
+
+	if (eth->h_proto == htons(ETH_P_IP)) {
+		iph = (void *)(eth + 1);
+		udph = (void *)(iph + 1);
+	} else if (eth->h_proto == htons(ETH_P_IPV6)) {
+		ip6h = (void *)(eth + 1);
+		udph = (void *)(ip6h + 1);
+	} else {
+		printf("%p: failed to detect IP version for ping pong %04x\n", xsk, eth->h_proto);
+		xsk_ring_prod__cancel(&xsk->tx, 1);
+		return;
+	}
+
+	len = ETH_HLEN;
+	if (ip6h)
+		len += sizeof(*ip6h) + ntohs(ip6h->payload_len);
+	if (iph)
+		len += ntohs(iph->tot_len);
+
+	swap(eth->h_dest, eth->h_source, ETH_ALEN);
+	if (iph)
+		swap(&iph->saddr, &iph->daddr, 4);
+	else
+		swap(&ip6h->saddr, &ip6h->daddr, 16);
+	swap(&udph->source, &udph->dest, 2);
+
+	want_csum = udph->check;
+	if (ip6h)
+		udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+					       ntohs(udph->len), IPPROTO_UDP, 0);
+	else
+		udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+						 ntohs(udph->len), IPPROTO_UDP, 0);
+
+	meta->flags |= XDP_TX_METADATA_CHECKSUM;
+	if (iph)
+		meta->csum_start = sizeof(*eth) + sizeof(*iph);
+	else
+		meta->csum_start = sizeof(*eth) + sizeof(*ip6h);
+	meta->csum_offset = offsetof(struct udphdr, check);
+
+	printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d csum_offset=%d\n",
+	       xsk, ntohs(udph->check), ntohs(want_csum), meta->csum_start, meta->csum_offset);
+
+	memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */
+	tx_desc->options |= XDP_TX_METADATA;
+	tx_desc->len = len;
+
+	xsk_ring_prod__submit(&xsk->tx, 1);
+}
+
 static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id)
 {
 	const struct xdp_desc *rx_desc;
@@ -307,6 +442,22 @@  static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t
 				verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr),
 						    clock_id);
 				first_seg = false;
+
+				if (!skip_tx) {
+					/* mirror first chunk back */
+					ping_pong(xsk, xsk_umem__get_data(xsk->umem_area, addr),
+						  clock_id);
+
+					ret = kick_tx(xsk);
+					if (ret)
+						printf("kick_tx ret=%d\n", ret);
+
+					for (int j = 0; j < 500; j++) {
+						if (complete_tx(xsk, clock_id))
+							break;
+						usleep(10*1000);
+					}
+				}
 			}
 
 			xsk_ring_cons__release(&xsk->rx, 1);
@@ -442,6 +593,7 @@  static void print_usage(void)
 		"  -c    Run in copy mode (zerocopy is default)\n"
 		"  -h    Display this help and exit\n\n"
 		"  -m    Enable multi-buffer XDP for larger MTU\n"
+		"  -r    Don't generate AF_XDP reply (rx metadata only)\n"
 		"Generate test packets on the other machine with:\n"
 		"  echo -n xdp | nc -u -q1 <dst_ip> 9091\n";
 
@@ -452,7 +604,7 @@  static void read_args(int argc, char *argv[])
 {
 	char opt;
 
-	while ((opt = getopt(argc, argv, "chm")) != -1) {
+	while ((opt = getopt(argc, argv, "chmr")) != -1) {
 		switch (opt) {
 		case 'c':
 			bind_flags &= ~XDP_USE_NEED_WAKEUP;
@@ -465,6 +617,9 @@  static void read_args(int argc, char *argv[])
 		case 'm':
 			bind_flags |= XDP_USE_SG;
 			break;
+		case 'r':
+			skip_tx = true;
+			break;
 		case '?':
 			if (isprint(optopt))
 				fprintf(stderr, "Unknown option: -%c\n", optopt);