diff mbox series

[RFC,net-next,v4,8/8] selftests/bpf: Add TX side to xdp_hw_metadata

Message ID 20230724235957.1953861-9-sdf@google.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series xsk: TX metadata | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 9 this patch: 9
netdev/cc_maintainers warning 4 maintainers not CCed: shuah@kernel.org mykolal@fb.com davem@davemloft.net linux-kselftest@vger.kernel.org
netdev/build_clang success Errors and warnings before: 9 this patch: 9
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 9 this patch: 9
netdev/checkpatch warning CHECK: Macro argument 'len' may be better as '(len)' to avoid precedence issues; CHECK: spaces preferred around that '*' (ctx:VxV); WARNING: Prefer using '"%s...", __func__' to using 'usage', this function's name, in a string; WARNING: line length of 81 exceeds 80 columns; WARNING: line length of 83 exceeds 80 columns; WARNING: line length of 88 exceeds 80 columns; WARNING: line length of 89 exceeds 80 columns; WARNING: line length of 96 exceeds 80 columns; WARNING: line length of 98 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Stanislav Fomichev July 24, 2023, 11:59 p.m. UTC
When we get a packet on port 9091, we swap src/dst and send it back out.
At this point, we also request the TX timestamp and plumb it back
to userspace. Userspace simply prints the timestamp.

Also print current UDP checksum, rewrite it with the pseudo-header
checksum and offload TX checksum calculation to devtx. Upon
completion, report TX checksum back (mlx5 doesn't put it back, so
I've used tcpdump to confirm that the checksum is correct).

Some other related changes:
- switched to zerocopy mode by default; new flag can be used to force
  old behavior
- request fixed TX_METADATA_LEN headroom
- some other small fixes (umem size, fill idx+i, etc)

mvbz3:~# ./xdp_hw_metadata eth3
attach rx bpf program...
...
0x206d298: rx_desc[0]->addr=80100 addr=80100 comp_addr=80100
rx_hash: 0x2BFB7FEC with RSS type:0x2A
rx_timestamp:  1690238278345877848 (sec:1690238278.3459)
XDP RX-time:   1690238278538397674 (sec:1690238278.5384) delta sec:0.1925 (192519.826 usec)
AF_XDP time:   1690238278538515250 (sec:1690238278.5385) delta sec:0.0001 (117.576 usec)
0x206d298: ping-pong with csum=8e3b (want 57c9) csum_start=54 csum_offset=6
0x206d298: complete tx idx=0 addr=10
0x206d298: tx_timestamp:  1690238278577008140 (sec:1690238278.5770)
0x206d298: complete rx idx=128 addr=80100

mvbz4:~# nc  -Nu -q1 ${MVBZ3_LINK_LOCAL_IP}%eth3 9091

mvbz4:~# tcpdump -vvx -i eth3 udp
tcpdump: listening on eth3, link-type EN10MB (Ethernet), snapshot length 262144 bytes
10:12:43.901436 IP6 (flowlabel 0x7a5d2, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1087.44339 > fe80::1270:fdff:fe48:1077.9091: [bad udp cksum 0x3b8e -> 0x0b4b!] UDP, length 3
        0x0000:  6007 a5d2 000b 117f fe80 0000 0000 0000
        0x0010:  1270 fdff fe48 1087 fe80 0000 0000 0000
        0x0020:  1270 fdff fe48 1077 ad33 2383 000b 3b8e
        0x0030:  7864 70
10:12:43.902125 IP6 (flowlabel 0x7a5d2, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1077.9091 > fe80::1270:fdff:fe48:1087.44339: [udp sum ok] UDP, length 3
        0x0000:  6007 a5d2 000b 117f fe80 0000 0000 0000
        0x0010:  1270 fdff fe48 1077 fe80 0000 0000 0000
        0x0020:  1270 fdff fe48 1087 2383 ad33 000b 0b4b
        0x0030:  7864 70

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/testing/selftests/bpf/xdp_hw_metadata.c | 201 +++++++++++++++++-
 1 file changed, 191 insertions(+), 10 deletions(-)

Comments

Willem de Bruijn July 25, 2023, 8:59 p.m. UTC | #1
Stanislav Fomichev wrote:
> When we get packets on port 9091, we swap src/dst and send it out.
> At this point, we also request the timestamp and plumb it back
> to the userspace. The userspace simply prints the timestamp.
> 
> Also print current UDP checksum, rewrite it with the pseudo-header
> checksum and offload TX checksum calculation to devtx. Upon
> completion, report TX checksum back (mlx5 doesn't put it back, so
> I've used tcpdump to confirm that the checksum is correct).
> 
> Some other related changes:
> - switched to zerocopy mode by default; new flag can be used to force
>   old behavior
> - request fixed TX_METADATA_LEN headroom
> - some other small fixes (umem size, fill idx+i, etc)
> 
> mvbz3:~# ./xdp_hw_metadata eth3 -c mlx5e_devtx_complete_xdp -s mlx5e_devtx_submit_xd
> attach rx bpf program...
> ...
> 0x206d298: rx_desc[0]->addr=80100 addr=80100 comp_addr=80100
> rx_hash: 0x2BFB7FEC with RSS type:0x2A
> rx_timestamp:  1690238278345877848 (sec:1690238278.3459)
> XDP RX-time:   1690238278538397674 (sec:1690238278.5384) delta sec:0.1925 (192519.826 usec)
> AF_XDP time:   1690238278538515250 (sec:1690238278.5385) delta sec:0.0001 (117.576 usec)
> 0x206d298: ping-pong with csum=8e3b (want 57c9) csum_start=54 csum_offset=6
> 0x206d298: complete tx idx=0 addr=10
> 0x206d298: tx_timestamp:  1690238278577008140 (sec:1690238278.5770)
> 0x206d298: complete rx idx=128 addr=80100
> 
> mvbz4:~# nc  -Nu -q1 ${MVBZ3_LINK_LOCAL_IP}%eth3 9091
> 
> mvbz4:~# tcpdump -vvx -i eth3 udp
> tcpdump: listening on eth3, link-type EN10MB (Ethernet), snapshot length 262144 bytes
> 10:12:43.901436 IP6 (flowlabel 0x7a5d2, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1087.44339 > fe80::1270:fdff:fe48:1077.9091: [bad udp cksum 0x3b8e -> 0x0b4b!] UDP, length 3
>         0x0000:  6007 a5d2 000b 117f fe80 0000 0000 0000
>         0x0010:  1270 fdff fe48 1087 fe80 0000 0000 0000
>         0x0020:  1270 fdff fe48 1077 ad33 2383 000b 3b8e
>         0x0030:  7864 70
> 10:12:43.902125 IP6 (flowlabel 0x7a5d2, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1077.9091 > fe80::1270:fdff:fe48:1087.44339: [udp sum ok] UDP, length 3
>         0x0000:  6007 a5d2 000b 117f fe80 0000 0000 0000
>         0x0010:  1270 fdff fe48 1077 fe80 0000 0000 0000
>         0x0020:  1270 fdff fe48 1087 2383 ad33 000b 0b4b
>         0x0030:  7864 70
> 
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  tools/testing/selftests/bpf/xdp_hw_metadata.c | 201 +++++++++++++++++-
>  1 file changed, 191 insertions(+), 10 deletions(-)
> 

> +static void usage(const char *prog)
> +{
> +	fprintf(stderr,
> +		"usage: %s [OPTS] <ifname>\n"
> +		"OPTS:\n"
> +		"    -T    don't generate AF_XDP reply (rx metadata only)\n"
> +		"    -Z    run in copy mode\n",

nit: makes more sense to call copy mode 'C', rather than 'Z'
Stanislav Fomichev July 25, 2023, 10:36 p.m. UTC | #2
On 07/25, Willem de Bruijn wrote:
> Stanislav Fomichev wrote:
> > When we get packets on port 9091, we swap src/dst and send it out.
> > At this point, we also request the timestamp and plumb it back
> > to the userspace. The userspace simply prints the timestamp.
> > 
> > Also print current UDP checksum, rewrite it with the pseudo-header
> > checksum and offload TX checksum calculation to devtx. Upon
> > completion, report TX checksum back (mlx5 doesn't put it back, so
> > I've used tcpdump to confirm that the checksum is correct).
> > 
> > Some other related changes:
> > - switched to zerocopy mode by default; new flag can be used to force
> >   old behavior
> > - request fixed TX_METADATA_LEN headroom
> > - some other small fixes (umem size, fill idx+i, etc)
> > 
> > mvbz3:~# ./xdp_hw_metadata eth3 -c mlx5e_devtx_complete_xdp -s mlx5e_devtx_submit_xd
> > attach rx bpf program...
> > ...
> > 0x206d298: rx_desc[0]->addr=80100 addr=80100 comp_addr=80100
> > rx_hash: 0x2BFB7FEC with RSS type:0x2A
> > rx_timestamp:  1690238278345877848 (sec:1690238278.3459)
> > XDP RX-time:   1690238278538397674 (sec:1690238278.5384) delta sec:0.1925 (192519.826 usec)
> > AF_XDP time:   1690238278538515250 (sec:1690238278.5385) delta sec:0.0001 (117.576 usec)
> > 0x206d298: ping-pong with csum=8e3b (want 57c9) csum_start=54 csum_offset=6
> > 0x206d298: complete tx idx=0 addr=10
> > 0x206d298: tx_timestamp:  1690238278577008140 (sec:1690238278.5770)
> > 0x206d298: complete rx idx=128 addr=80100
> > 
> > mvbz4:~# nc  -Nu -q1 ${MVBZ3_LINK_LOCAL_IP}%eth3 9091
> > 
> > mvbz4:~# tcpdump -vvx -i eth3 udp
> > tcpdump: listening on eth3, link-type EN10MB (Ethernet), snapshot length 262144 bytes
> > 10:12:43.901436 IP6 (flowlabel 0x7a5d2, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1087.44339 > fe80::1270:fdff:fe48:1077.9091: [bad udp cksum 0x3b8e -> 0x0b4b!] UDP, length 3
> >         0x0000:  6007 a5d2 000b 117f fe80 0000 0000 0000
> >         0x0010:  1270 fdff fe48 1087 fe80 0000 0000 0000
> >         0x0020:  1270 fdff fe48 1077 ad33 2383 000b 3b8e
> >         0x0030:  7864 70
> > 10:12:43.902125 IP6 (flowlabel 0x7a5d2, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1077.9091 > fe80::1270:fdff:fe48:1087.44339: [udp sum ok] UDP, length 3
> >         0x0000:  6007 a5d2 000b 117f fe80 0000 0000 0000
> >         0x0010:  1270 fdff fe48 1077 fe80 0000 0000 0000
> >         0x0020:  1270 fdff fe48 1087 2383 ad33 000b 0b4b
> >         0x0030:  7864 70
> > 
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  tools/testing/selftests/bpf/xdp_hw_metadata.c | 201 +++++++++++++++++-
> >  1 file changed, 191 insertions(+), 10 deletions(-)
> > 
> 
> > +static void usage(const char *prog)
> > +{
> > +	fprintf(stderr,
> > +		"usage: %s [OPTS] <ifname>\n"
> > +		"OPTS:\n"
> > +		"    -T    don't generate AF_XDP reply (rx metadata only)\n"
> > +		"    -Z    run in copy mode\n",
> 
> nit: makes more sense to call copy mode 'C', rather than 'Z'

Initially I had -c and -s for completion/submission bpf hooks. Now that
these are gone, can actually use -c. Capital letter here actually means
'not'. Z - not zerocopy. T - no tx.

I'll rename to:
-r - rx only
-c - copy mode

LMK if it doesn't make sense..
Willem de Bruijn July 25, 2023, 10:55 p.m. UTC | #3
Stanislav Fomichev wrote:
> On 07/25, Willem de Bruijn wrote:
> > Stanislav Fomichev wrote:
> > > When we get packets on port 9091, we swap src/dst and send it out.
> > > At this point, we also request the timestamp and plumb it back
> > > to the userspace. The userspace simply prints the timestamp.
> > > 
> > > Also print current UDP checksum, rewrite it with the pseudo-header
> > > checksum and offload TX checksum calculation to devtx. Upon
> > > completion, report TX checksum back (mlx5 doesn't put it back, so
> > > I've used tcpdump to confirm that the checksum is correct).
> > > 
> > > Some other related changes:
> > > - switched to zerocopy mode by default; new flag can be used to force
> > >   old behavior
> > > - request fixed TX_METADATA_LEN headroom
> > > - some other small fixes (umem size, fill idx+i, etc)
> > > 
> > > mvbz3:~# ./xdp_hw_metadata eth3 -c mlx5e_devtx_complete_xdp -s mlx5e_devtx_submit_xd
> > > attach rx bpf program...
> > > ...
> > > 0x206d298: rx_desc[0]->addr=80100 addr=80100 comp_addr=80100
> > > rx_hash: 0x2BFB7FEC with RSS type:0x2A
> > > rx_timestamp:  1690238278345877848 (sec:1690238278.3459)
> > > XDP RX-time:   1690238278538397674 (sec:1690238278.5384) delta sec:0.1925 (192519.826 usec)
> > > AF_XDP time:   1690238278538515250 (sec:1690238278.5385) delta sec:0.0001 (117.576 usec)
> > > 0x206d298: ping-pong with csum=8e3b (want 57c9) csum_start=54 csum_offset=6
> > > 0x206d298: complete tx idx=0 addr=10
> > > 0x206d298: tx_timestamp:  1690238278577008140 (sec:1690238278.5770)
> > > 0x206d298: complete rx idx=128 addr=80100
> > > 
> > > mvbz4:~# nc  -Nu -q1 ${MVBZ3_LINK_LOCAL_IP}%eth3 9091
> > > 
> > > mvbz4:~# tcpdump -vvx -i eth3 udp
> > > tcpdump: listening on eth3, link-type EN10MB (Ethernet), snapshot length 262144 bytes
> > > 10:12:43.901436 IP6 (flowlabel 0x7a5d2, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1087.44339 > fe80::1270:fdff:fe48:1077.9091: [bad udp cksum 0x3b8e -> 0x0b4b!] UDP, length 3
> > >         0x0000:  6007 a5d2 000b 117f fe80 0000 0000 0000
> > >         0x0010:  1270 fdff fe48 1087 fe80 0000 0000 0000
> > >         0x0020:  1270 fdff fe48 1077 ad33 2383 000b 3b8e
> > >         0x0030:  7864 70
> > > 10:12:43.902125 IP6 (flowlabel 0x7a5d2, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1077.9091 > fe80::1270:fdff:fe48:1087.44339: [udp sum ok] UDP, length 3
> > >         0x0000:  6007 a5d2 000b 117f fe80 0000 0000 0000
> > >         0x0010:  1270 fdff fe48 1077 fe80 0000 0000 0000
> > >         0x0020:  1270 fdff fe48 1087 2383 ad33 000b 0b4b
> > >         0x0030:  7864 70
> > > 
> > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > > ---
> > >  tools/testing/selftests/bpf/xdp_hw_metadata.c | 201 +++++++++++++++++-
> > >  1 file changed, 191 insertions(+), 10 deletions(-)
> > > 
> > 
> > > +static void usage(const char *prog)
> > > +{
> > > +	fprintf(stderr,
> > > +		"usage: %s [OPTS] <ifname>\n"
> > > +		"OPTS:\n"
> > > +		"    -T    don't generate AF_XDP reply (rx metadata only)\n"
> > > +		"    -Z    run in copy mode\n",
> > 
> > nit: makes more sense to call copy mode 'C', rather than 'Z'
> 
> Initially I had -c and -s for completion/submission bpf hooks. Now that
> these are gone, can actually use -c. Capital letter here actually means
> 'not'. Z - not zerocopy. T - no tx.
> 
> I'll rename to:
> -r - rx only
> -c - copy mode
> 
> LMK if it doesn't make sense..

Sounds great, thanks. I did not grasp the capitalization implies negation.
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 613321eb84c1..0bef79ffac7a 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -10,7 +10,9 @@ 
  *   - rx_hash
  *
  * TX:
- * - TBD
+ * - UDP 9091 packets trigger TX reply
+ * - TX HW timestamp is requested and reported back upon completion
+ * - TX checksum is requested
  */
 
 #include <test_progs.h>
@@ -24,14 +26,17 @@ 
 #include <linux/net_tstamp.h>
 #include <linux/udp.h>
 #include <linux/sockios.h>
+#include <linux/if_xdp.h>
 #include <sys/mman.h>
 #include <net/if.h>
 #include <poll.h>
 #include <time.h>
+#include <unistd.h>
+#include <libgen.h>
 
 #include "xdp_metadata.h"
 
-#define UMEM_NUM 16
+#define UMEM_NUM 256
 #define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
 #define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
 #define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE)
@@ -51,22 +56,24 @@  struct xsk *rx_xsk;
 const char *ifname;
 int ifindex;
 int rxq;
+bool skip_tx;
 
 void test__fail(void) { /* for network_helpers.c */ }
 
-static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
+static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id, int flags)
 {
 	int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
 	const struct xsk_socket_config socket_config = {
+		.tx_metadata_len = sizeof(struct xsk_tx_metadata),
 		.rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
-		.bind_flags = XDP_COPY,
+		.bind_flags = flags,
 	};
 	const struct xsk_umem_config umem_config = {
 		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
-		.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
+		.flags = XSK_UMEM__DEFAULT_FLAGS,
 	};
 	__u32 idx;
 	u64 addr;
@@ -108,7 +115,7 @@  static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
 	for (i = 0; i < UMEM_NUM / 2; i++) {
 		addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
 		printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
-		*xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
+		*xsk_ring_prod__fill_addr(&xsk->fill, idx + i) = addr;
 	}
 	xsk_ring_prod__submit(&xsk->fill, ret);
 
@@ -129,12 +136,22 @@  static void refill_rx(struct xsk *xsk, __u64 addr)
 	__u32 idx;
 
 	if (xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) {
-		printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr);
+		printf("%p: complete rx idx=%u addr=%llx\n", xsk, idx, addr);
 		*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
 		xsk_ring_prod__submit(&xsk->fill, 1);
 	}
 }
 
+static int kick_tx(struct xsk *xsk)
+{
+	return sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
+}
+
+static int kick_rx(struct xsk *xsk)
+{
+	return recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL);
+}
+
 #define NANOSEC_PER_SEC 1000000000 /* 10^9 */
 static __u64 gettime(clockid_t clock_id)
 {
@@ -228,6 +245,116 @@  static void verify_skb_metadata(int fd)
 	printf("skb hwtstamp is not found!\n");
 }
 
+static bool complete_tx(struct xsk *xsk)
+{
+	struct xsk_tx_metadata *meta;
+	__u64 addr;
+	void *data;
+	__u32 idx;
+
+	if (!xsk_ring_cons__peek(&xsk->comp, 1, &idx))
+		return false;
+
+	addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
+	data = xsk_umem__get_data(xsk->umem_area, addr);
+	meta = data - sizeof(struct xsk_tx_metadata);
+
+	printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
+	printf("%p: tx_timestamp:  %llu (sec:%0.4f)\n", xsk, meta->tx_timestamp,
+	       (double)meta->tx_timestamp / NANOSEC_PER_SEC);
+	xsk_ring_cons__release(&xsk->comp, 1);
+
+	return true;
+}
+
+#define swap(a, b, len) do { /* swap len bytes between a and b */ \
+	for (int i = 0; i < (len); i++) { \
+		__u8 tmp = ((__u8 *)(a))[i]; \
+		((__u8 *)(a))[i] = ((__u8 *)(b))[i]; \
+		((__u8 *)(b))[i] = tmp; \
+	} \
+} while (0)
+
+static void ping_pong(struct xsk *xsk, void *rx_packet)
+{
+	struct xsk_tx_metadata *meta;
+	struct ipv6hdr *ip6h = NULL;
+	struct iphdr *iph = NULL;
+	struct xdp_desc *tx_desc;
+	struct udphdr *udph;
+	struct ethhdr *eth;
+	__sum16 want_csum;
+	void *data;
+	__u32 idx;
+	int ret;
+	int len;
+
+	ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
+	if (ret != 1) {
+		printf("%p: failed to reserve tx slot\n", xsk);
+		return;
+	}
+
+	tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
+	tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata);
+	data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
+
+	meta = data - sizeof(struct xsk_tx_metadata);
+	memset(meta, 0, sizeof(*meta));
+	meta->flags = XDP_TX_METADATA_TIMESTAMP;
+
+	eth = rx_packet;
+
+	if (eth->h_proto == htons(ETH_P_IP)) {
+		iph = (void *)(eth + 1);
+		udph = (void *)(iph + 1);
+	} else if (eth->h_proto == htons(ETH_P_IPV6)) {
+		ip6h = (void *)(eth + 1);
+		udph = (void *)(ip6h + 1);
+	} else {
+		printf("%p: failed to detect IP version for ping pong %04x\n", xsk, eth->h_proto);
+		xsk_ring_prod__cancel(&xsk->tx, 1);
+		return;
+	}
+
+	len = ETH_HLEN;
+	if (ip6h)
+		len += sizeof(*ip6h) + ntohs(ip6h->payload_len);
+	if (iph)
+		len += ntohs(iph->tot_len);
+
+	swap(eth->h_dest, eth->h_source, ETH_ALEN);
+	if (iph)
+		swap(&iph->saddr, &iph->daddr, 4);
+	else
+		swap(&ip6h->saddr, &ip6h->daddr, 16);
+	swap(&udph->source, &udph->dest, 2);
+
+	want_csum = udph->check;
+	if (ip6h)
+		udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+					       ntohs(udph->len), IPPROTO_UDP, 0);
+	else
+		udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+						 ntohs(udph->len), IPPROTO_UDP, 0);
+
+	meta->flags |= XDP_TX_METADATA_CHECKSUM;
+	if (iph)
+		meta->csum_start = sizeof(*eth) + sizeof(*iph);
+	else
+		meta->csum_start = sizeof(*eth) + sizeof(*ip6h);
+	meta->csum_offset = offsetof(struct udphdr, check);
+
+	printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d csum_offset=%d\n",
+	       xsk, udph->check, want_csum, meta->csum_start, meta->csum_offset);
+
+	memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */
+	tx_desc->options |= XDP_TX_METADATA;
+	tx_desc->len = len;
+
+	xsk_ring_prod__submit(&xsk->tx, 1);
+}
+
 static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id)
 {
 	const struct xdp_desc *rx_desc;
@@ -250,6 +377,13 @@  static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t
 
 	while (true) {
 		errno = 0;
+
+		for (i = 0; i < rxq; i++) {
+			ret = kick_rx(&rx_xsk[i]);
+			if (ret)
+				printf("kick_rx ret=%d\n", ret);
+		}
+
 		ret = poll(fds, rxq + 1, 1000);
 		printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n",
 		       ret, errno, bpf_obj->bss->pkts_skip,
@@ -280,6 +414,22 @@  static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t
 			       xsk, idx, rx_desc->addr, addr, comp_addr);
 			verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr),
 					    clock_id);
+
+			if (!skip_tx) {
+				/* mirror the packet back */
+				ping_pong(xsk, xsk_umem__get_data(xsk->umem_area, addr));
+
+				ret = kick_tx(xsk);
+				if (ret)
+					printf("kick_tx ret=%d\n", ret);
+
+				for (int j = 0; j < 500; j++) {
+					if (complete_tx(xsk))
+						break;
+					usleep(10*1000);
+				}
+			}
+
 			xsk_ring_cons__release(&xsk->rx, 1);
 			refill_rx(xsk, comp_addr);
 		}
@@ -404,21 +554,52 @@  static void timestamping_enable(int fd, int val)
 		error(1, errno, "setsockopt(SO_TIMESTAMPING)");
 }
 
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"usage: %s [OPTS] <ifname>\n"
+		"OPTS:\n"
+		"    -T    don't generate AF_XDP reply (rx metadata only)\n"
+		"    -Z    run in copy mode\n",
+		prog);
+}
+
 int main(int argc, char *argv[])
 {
+	int bind_flags =  XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY;
 	clockid_t clock_id = CLOCK_TAI;
 	int server_fd = -1;
+	int opt;
 	int ret;
 	int i;
 
 	struct bpf_program *prog;
 
-	if (argc != 2) {
+	while ((opt = getopt(argc, argv, "TZ")) != -1) {
+		switch (opt) {
+		case 'T':
+			skip_tx = true;
+			break;
+		case 'Z':
+			bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY;
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (argc < 2) {
 		fprintf(stderr, "pass device name\n");
 		return -1;
 	}
 
-	ifname = argv[1];
+	if (optind >= argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	ifname = argv[optind];
 	ifindex = if_nametoindex(ifname);
 	rxq = rxq_num(ifname);
 
@@ -432,7 +613,7 @@  int main(int argc, char *argv[])
 
 	for (i = 0; i < rxq; i++) {
 		printf("open_xsk(%s, %p, %d)\n", ifname, &rx_xsk[i], i);
-		ret = open_xsk(ifindex, &rx_xsk[i], i);
+		ret = open_xsk(ifindex, &rx_xsk[i], i, bind_flags);
 		if (ret)
 			error(1, -ret, "open_xsk");