Message ID | 20231106024413.2801438-13-almasrymina@google.com (mailing list archive) |
---|---|
State | RFC |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | Device Memory TCP | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Guessing tree name failed - patch did not apply, async |
On Sun, 2023-11-05 at 18:44 -0800, Mina Almasry wrote: > @@ -91,6 +95,7 @@ TEST_PROGS += test_bridge_neigh_suppress.sh > TEST_PROGS += test_vxlan_nolocalbypass.sh > TEST_PROGS += test_bridge_backup_port.sh > TEST_PROGS += fdb_flush.sh > +TEST_GEN_FILES += ncdevmem I guess we want something added to TEST_PROGS, too ;) > TEST_FILES := settings > > diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c > new file mode 100644 > index 000000000000..78bc3ad767ca > --- /dev/null > +++ b/tools/testing/selftests/net/ncdevmem.c > @@ -0,0 +1,546 @@ > +// SPDX-License-Identifier: GPL-2.0 > +#define _GNU_SOURCE > +#define __EXPORTED_HEADERS__ > + > +#include <linux/uio.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <unistd.h> > +#include <stdbool.h> > +#include <string.h> > +#include <errno.h> > +#define __iovec_defined > +#include <fcntl.h> > +#include <malloc.h> > + > +#include <arpa/inet.h> > +#include <sys/socket.h> > +#include <sys/mman.h> > +#include <sys/ioctl.h> > +#include <sys/syscall.h> > + > +#include <linux/memfd.h> > +#include <linux/if.h> > +#include <linux/dma-buf.h> > +#include <linux/udmabuf.h> > +#include <libmnl/libmnl.h> > +#include <linux/types.h> > +#include <linux/netlink.h> > +#include <linux/genetlink.h> > +#include <linux/netdev.h> > +#include <time.h> > + > +#include "netdev-user.h" > +#include <ynl.h> > + > +#define PAGE_SHIFT 12 > +#define TEST_PREFIX "ncdevmem" > +#define NUM_PAGES 16000 > + > +#ifndef MSG_SOCK_DEVMEM > +#define MSG_SOCK_DEVMEM 0x2000000 > +#endif > + > +/* > + * tcpdevmem netcat. Works similarly to netcat but does device memory TCP > + * instead of regular TCP. Uses udmabuf to mock a dmabuf provider. 
> + * > + * Usage: > + * > + * * Without validation: > + * > + * On server: > + * ncdevmem -s <server IP> -c <client IP> -f eth1 -n 0000:06:00.0 -l \ > + * -p 5201 > + * > + * On client: > + * ncdevmem -s <server IP> -c <client IP> -f eth1 -n 0000:06:00.0 -p 5201 > + * > + * * With Validation: > + * On server: > + * ncdevmem -s <server IP> -c <client IP> -l -f eth1 -n 0000:06:00.0 \ > + * -p 5202 -v 1 > + * > + * On client: > + * ncdevmem -s <server IP> -c <client IP> -f eth1 -n 0000:06:00.0 -p 5202 \ > + * -v 100000 > + * > + * Note this is compatible with regular netcat. i.e. the sender or receiver can > + * be replaced with regular netcat to test the RX or TX path in isolation. > + */ > + > +static char *server_ip = "192.168.1.4"; > +static char *client_ip = "192.168.1.2"; > +static char *port = "5201"; > +static size_t do_validation; > +static int queue_num = 15; > +static char *ifname = "eth1"; > +static char *nic_pci_addr = "0000:06:00.0"; > +static unsigned int iterations; > + > +void print_bytes(void *ptr, size_t size) > +{ > + unsigned char *p = ptr; > + int i; > + > + for (i = 0; i < size; i++) { > + printf("%02hhX ", p[i]); > + } > + printf("\n"); > +} > + > +void print_nonzero_bytes(void *ptr, size_t size) > +{ > + unsigned char *p = ptr; > + unsigned int i; > + > + for (i = 0; i < size; i++) > + putchar(p[i]); > + printf("\n"); > +} > + > +void validate_buffer(void *line, size_t size) > +{ > + static unsigned char seed = 1; > + unsigned char *ptr = line; > + int errors = 0; > + size_t i; > + > + for (i = 0; i < size; i++) { > + if (ptr[i] != seed) { > + fprintf(stderr, > + "Failed validation: expected=%u, actual=%u, index=%lu\n", > + seed, ptr[i], i); > + errors++; > + if (errors > 20) > + exit(1); > + } > + seed++; > + if (seed == do_validation) > + seed = 0; > + } > + > + fprintf(stdout, "Validated buffer\n"); > +} > + > +static void reset_flow_steering(void) > +{ > + char command[256]; > + > + memset(command, 0, sizeof(command)); > + 
snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple off", > + "eth1"); > + system(command); > + > + memset(command, 0, sizeof(command)); > + snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple on", > + "eth1"); > + system(command); > +} > + > +static void configure_flow_steering(void) > +{ > + char command[256]; > + > + memset(command, 0, sizeof(command)); > + snprintf(command, sizeof(command), > + "sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d", > + ifname, client_ip, server_ip, port, port, queue_num); > + system(command); > +} > + > +/* Triggers a driver reset... > + * > + * The proper way to do this is probably 'ethtool --reset', but I don't have > + * that supported on my current test bed. I resort to changing this > + * configuration in the driver which also causes a driver reset... > + */ > +static void trigger_device_reset(void) > +{ > + char command[256]; > + > + memset(command, 0, sizeof(command)); > + snprintf(command, sizeof(command), > + "sudo ethtool --set-priv-flags %s enable-header-split off", > + ifname); > + system(command); > + > + memset(command, 0, sizeof(command)); > + snprintf(command, sizeof(command), > + "sudo ethtool --set-priv-flags %s enable-header-split on", > + ifname); > + system(command); > +} > + > +static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, > + __u32 *queue_idx, unsigned int n_queue_index, > + struct ynl_sock **ys) > +{ > + struct netdev_bind_rx_req *req = NULL; > + struct ynl_error yerr; > + int ret = 0; > + > + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); > + if (!*ys) { > + fprintf(stderr, "YNL: %s\n", yerr.msg); > + return -1; > + } > + > + if (ynl_subscribe(*ys, "mgmt")) > + goto err_close; > + > + req = netdev_bind_rx_req_alloc(); > + netdev_bind_rx_req_set_ifindex(req, ifindex); > + netdev_bind_rx_req_set_dmabuf_fd(req, dmabuf_fd); > + __netdev_bind_rx_req_set_queues(req, queue_idx, n_queue_index); > + > + ret = netdev_bind_rx(*ys, 
req); > + if (!ret) { > + perror("netdev_bind_rx"); > + goto err_close; > + } > + > + netdev_bind_rx_req_free(req); > + > + return 0; > + > +err_close: > + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); > + netdev_bind_rx_req_free(req); > + ynl_sock_destroy(*ys); > + return -1; > +} > + > +static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size) > +{ > + struct udmabuf_create create; > + int ret; > + > + *devfd = open("/dev/udmabuf", O_RDWR); > + if (*devfd < 0) { > + fprintf(stderr, > + "%s: [skip,no-udmabuf: Unable to access DMA " > + "buffer device file]\n", > + TEST_PREFIX); > + exit(70); > + } > + > + *memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); > + if (*memfd < 0) { > + printf("%s: [skip,no-memfd]\n", TEST_PREFIX); > + exit(72); > + } > + > + ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK); > + if (ret < 0) { > + printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); > + exit(73); > + } > + > + ret = ftruncate(*memfd, dmabuf_size); > + if (ret == -1) { > + printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); > + exit(74); > + } > + > + memset(&create, 0, sizeof(create)); > + > + create.memfd = *memfd; > + create.offset = 0; > + create.size = dmabuf_size; > + *buf = ioctl(*devfd, UDMABUF_CREATE, &create); > + if (*buf < 0) { > + printf("%s: [FAIL, create udmabuf]\n", TEST_PREFIX); > + exit(75); > + } > +} > + > +int do_server(void) > +{ > + char ctrl_data[sizeof(int) * 20000]; > + size_t non_page_aligned_frags = 0; > + struct sockaddr_in client_addr; > + struct sockaddr_in server_sin; > + size_t page_aligned_frags = 0; > + int devfd, memfd, buf, ret; > + size_t total_received = 0; > + bool is_devmem = false; > + char *buf_mem = NULL; > + struct ynl_sock *ys; > + size_t dmabuf_size; > + char iobuf[819200]; > + char buffer[256]; > + int socket_fd; > + int client_fd; > + size_t i = 0; > + int opt = 1; > + > + dmabuf_size = getpagesize() * NUM_PAGES; > + > + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); > + > + __u32 
*queue_idx = malloc(sizeof(__u32) * 2); > + > + queue_idx[0] = 14; > + queue_idx[1] = 15; > + if (bind_rx_queue(3 /* index for eth1 */, buf, queue_idx, 2, &ys)) { ^^^^^^^^^^^^^^^^^^^ I guess we want to explicitly fetch the "ifname" index. Side note: I'm wondering if we could extend some kind of virtual device to allow single host self-tests? e.g. veth, if that would not cause excessive bloat in the device driver? Cheers, Paolo
My brain is slightly fried after trying to catch up on the thread
for close to 2h. So forgive me if I'm missing something.
This applies to all emails I'm about to send :)
On Sun, 5 Nov 2023 18:44:11 -0800 Mina Almasry wrote:
> + trigger_device_reset();
The user space must not be responsible for the reset.
We can add some temporary "recreate page pools" ndo
until the queue API is ready.
But it should not be visible to the user in any way.
And then the kernel can issue the same reset when the netlink
socket dies to flush device free lists.
Maybe we should also add an "allow device/all-queues reload" flag
to the netlink API to differentiate drivers which can't implement
full queue API later on. We want to make sure the defaults work well
in our "target design", rather than at the first stage. And target
design will reload queues one by one.
On Sun, 5 Nov 2023 18:44:11 -0800 Mina Almasry wrote: > + if (ynl_subscribe(*ys, "mgmt")) > + goto err_close; Why? :)
On Fri, Nov 10, 2023 at 3:13 PM Jakub Kicinski <kuba@kernel.org> wrote: > > My brain is slightly fried after trying to catch up on the thread > for close to 2h. So forgive me if I'm missing something. > This applies to all emails I'm about to send :) > > On Sun, 5 Nov 2023 18:44:11 -0800 Mina Almasry wrote: > > + trigger_device_reset(); > > The user space must not be responsible for the reset. > We can add some temporary "recreate page pools" ndo > until the queue API is ready. > Thanks for the clear requirement. I clearly had something different in mind. Might be dumb suggestions, but instead of creating a new ndo that we maybe end up wanting to deprecate once the queue API is ready, how about we use either of those existing APIs? +void netdev_reset(struct net_device *dev) +{ + int flags = ETH_RESET_ALL; + int err; + +#if 1 + __dev_close(dev); + err = __dev_open(dev, NULL); +#else + err = dev->ethtool_ops->reset(dev, &flags); +#endif +} + I've tested both of these to work with GVE on both bind via the netlink API and unbind via the netlink socket close, but I'm not enough of an expert to tell if there is some bad side effect that can happen or something. > But it should not be visible to the user in any way. > > And then the kernel can issue the same reset when the netlink > socket dies to flush device free lists. > Sure thing, I can do that. > Maybe we should also add a "allow device/all-queues reload" flag > to the netlink API to differentiate drivers which can't implement > full queue API later on. We want to make sure the defaults work well > in our "target design", rather than at the first stage. And target > design will reload queues one by one. I can add a flag, yes.
On Fri, 10 Nov 2023 18:27:08 -0800 Mina Almasry wrote: > Thanks for the clear requirement. I clearly had something different in mind. > > Might be dumb suggestions, but instead of creating a new ndo that we > maybe end up wanting to deprecate once the queue API is ready, how > about we use either of those existing APIs? > > +void netdev_reset(struct net_device *dev) > +{ > + int flags = ETH_RESET_ALL; > + int err; > + > +#if 1 > + __dev_close(dev); > + err = __dev_open(dev, NULL); > +#else > + err = dev->ethtool_ops->reset(dev, &flags); > +#endif > +} > + > > I've tested both of these to work with GVE on both bind via the > netlink API and unbind via the netlink socket close, but I'm not > enough of an expert to tell if there is some bad side effect that can > happen or something. We generally don't accept drivers doing device reconfiguration with full close() + open() because if the open() fails your machine may be cut off. There are drivers which do it, but they are either old... or weren't reviewed hard enough. The driver should allocate memory and whatever else it can without stopping the queues first. Once it has all those, stop the queues, reconfigure with already allocated resources, start queues, free old. Even without the queue API in place, good drivers do full device reconfig this way. Hence my mind goes towards a new (temporary?) ndo. It will be replaced by the queue API, but whoever implements it for now has to follow this careful reconfig strategy...
On Fri, Nov 10, 2023 at 6:36 PM Jakub Kicinski <kuba@kernel.org> wrote: > > On Fri, 10 Nov 2023 18:27:08 -0800 Mina Almasry wrote: > > Thanks for the clear requirement. I clearly had something different in mind. > > > > Might be dumb suggestions, but instead of creating a new ndo that we > > maybe end up wanting to deprecate once the queue API is ready, how > > about we use either of those existing APIs? > > > > +void netdev_reset(struct net_device *dev) > > +{ > > + int flags = ETH_RESET_ALL; > > + int err; > > + > > +#if 1 > > + __dev_close(dev); > > + err = __dev_open(dev, NULL); > > +#else > > + err = dev->ethtool_ops->reset(dev, &flags); > > +#endif > > +} > > + > > > > I've tested both of these to work with GVE on both bind via the > > netlink API and unbind via the netlink socket close, but I'm not > > enough of an expert to tell if there is some bad side effect that can > > happen or something. > > We generally don't accept drivers doing device reconfiguration with > full close() + open() because if the open() fails your machine > may be cut off. > > There are drivers which do it, but they are either old... or weren't > reviewed hard enough. > > The driver should allocate memory and whether else it can without > stopping the queues first. Once it has all those, stop the queues, > reconfigure with already allocated resources, start queues, free old. > > Even without the queue API in place, good drivers do full device > reconfig this way. Hence my mind goes towards a new (temporary?) > ndo. It will be replaced by the queue API, but whoever implements > it for now has to follow this careful reconfig strategy... OK, thanks. I managed to get a POC (but only POC) of the queue API working with GVE. 
I still need to test it more thoroughly and get a review before I can conclude it's actually a viable path but it doesn't seem as grim as I originally thought: https://github.com/torvalds/linux/commit/21b8e108fa88d90870eef53be9320f136b96cca0 So, seems there are 2 paths forward: (a) implement a new 'reconfig' ndo carefully as you described above. (b) implement a minimal version of the queue API as you described here: https://lore.kernel.org/netdev/20230815171638.4c057dcd@kernel.org/ Some questions, sorry if basic: 1. For (b), would it be OK to implement a very minimal version of queue_[stop|start]/queue_mem_[alloc|free], which I use for the sole purpose of reposting buffers to an individual queue, and then later whoever picks up your queue API effort (maybe me) extends the implementation to do the rest of the things you described in your email? If not, what is the minimal queue API I can implement and use for devmem TCP? 2. Since this is adding ndo, do I need to implement the ndo for 2 drivers or is GVE sufficient? -- Thanks, Mina
On Sun, 12 Nov 2023 20:08:10 -0800 Mina Almasry wrote: > 1. For (b), would it be OK to implement a very minimal version of > queue_[stop|start]/queue_mem_[alloc|free], which I use for the sole > purpose of reposting buffers to an individual queue, and then later > whoever picks up your queue API effort (maybe me) extends the > implementation to do the rest of the things you described in your > email? If not, what is the minimal queue API I can implement and use > for devmem TCP? Any form of queue API is better than a temporary ndo. IIUC it will not bubble up into uAPI in any way so we can extend/change it later as needed. > 2. Since this is adding ndo, do I need to implement the ndo for 2 > drivers or is GVE sufficient? One driver is fine, especially if we're doing this instead of the reset hack.
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 2f9d378edec3..b644dbae58b7 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -17,6 +17,7 @@ ipv6_flowlabel ipv6_flowlabel_mgr log.txt msg_zerocopy +ncdevmem nettest psock_fanout psock_snd diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b9804ceb9494..6c6e53c70e99 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -5,6 +5,10 @@ CFLAGS = -Wall -Wl,--no-as-needed -O2 -g CFLAGS += -I../../../../usr/include/ $(KHDR_INCLUDES) # Additional include paths needed by kselftest.h CFLAGS += -I../ +CFLAGS += -I../../../net/ynl/generated/ +CFLAGS += -I../../../net/ynl/lib/ + +LDLIBS += ../../../net/ynl/lib/ynl.a ../../../net/ynl/generated/protos.a TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \ rtnetlink.sh xfrm_policy.sh test_blackhole_dev.sh @@ -91,6 +95,7 @@ TEST_PROGS += test_bridge_neigh_suppress.sh TEST_PROGS += test_vxlan_nolocalbypass.sh TEST_PROGS += test_bridge_backup_port.sh TEST_PROGS += fdb_flush.sh +TEST_GEN_FILES += ncdevmem TEST_FILES := settings diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c new file mode 100644 index 000000000000..78bc3ad767ca --- /dev/null +++ b/tools/testing/selftests/net/ncdevmem.c @@ -0,0 +1,546 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include <linux/uio.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <errno.h> +#define __iovec_defined +#include <fcntl.h> +#include <malloc.h> + +#include <arpa/inet.h> +#include <sys/socket.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> + +#include <linux/memfd.h> +#include <linux/if.h> +#include <linux/dma-buf.h> +#include <linux/udmabuf.h> +#include 
<libmnl/libmnl.h> +#include <linux/types.h> +#include <linux/netlink.h> +#include <linux/genetlink.h> +#include <linux/netdev.h> +#include <time.h> + +#include "netdev-user.h" +#include <ynl.h> + +#define PAGE_SHIFT 12 +#define TEST_PREFIX "ncdevmem" +#define NUM_PAGES 16000 + +#ifndef MSG_SOCK_DEVMEM +#define MSG_SOCK_DEVMEM 0x2000000 +#endif + +/* + * tcpdevmem netcat. Works similarly to netcat but does device memory TCP + * instead of regular TCP. Uses udmabuf to mock a dmabuf provider. + * + * Usage: + * + * * Without validation: + * + * On server: + * ncdevmem -s <server IP> -c <client IP> -f eth1 -n 0000:06:00.0 -l \ + * -p 5201 + * + * On client: + * ncdevmem -s <server IP> -c <client IP> -f eth1 -n 0000:06:00.0 -p 5201 + * + * * With Validation: + * On server: + * ncdevmem -s <server IP> -c <client IP> -l -f eth1 -n 0000:06:00.0 \ + * -p 5202 -v 1 + * + * On client: + * ncdevmem -s <server IP> -c <client IP> -f eth1 -n 0000:06:00.0 -p 5202 \ + * -v 100000 + * + * Note this is compatible with regular netcat. i.e. the sender or receiver can + * be replaced with regular netcat to test the RX or TX path in isolation. 
+ */ + +static char *server_ip = "192.168.1.4"; +static char *client_ip = "192.168.1.2"; +static char *port = "5201"; +static size_t do_validation; +static int queue_num = 15; +static char *ifname = "eth1"; +static char *nic_pci_addr = "0000:06:00.0"; +static unsigned int iterations; + +void print_bytes(void *ptr, size_t size) +{ + unsigned char *p = ptr; + int i; + + for (i = 0; i < size; i++) { + printf("%02hhX ", p[i]); + } + printf("\n"); +} + +void print_nonzero_bytes(void *ptr, size_t size) +{ + unsigned char *p = ptr; + unsigned int i; + + for (i = 0; i < size; i++) + putchar(p[i]); + printf("\n"); +} + +void validate_buffer(void *line, size_t size) +{ + static unsigned char seed = 1; + unsigned char *ptr = line; + int errors = 0; + size_t i; + + for (i = 0; i < size; i++) { + if (ptr[i] != seed) { + fprintf(stderr, + "Failed validation: expected=%u, actual=%u, index=%lu\n", + seed, ptr[i], i); + errors++; + if (errors > 20) + exit(1); + } + seed++; + if (seed == do_validation) + seed = 0; + } + + fprintf(stdout, "Validated buffer\n"); +} + +static void reset_flow_steering(void) +{ + char command[256]; + + memset(command, 0, sizeof(command)); + snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple off", + "eth1"); + system(command); + + memset(command, 0, sizeof(command)); + snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple on", + "eth1"); + system(command); +} + +static void configure_flow_steering(void) +{ + char command[256]; + + memset(command, 0, sizeof(command)); + snprintf(command, sizeof(command), + "sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d", + ifname, client_ip, server_ip, port, port, queue_num); + system(command); +} + +/* Triggers a driver reset... + * + * The proper way to do this is probably 'ethtool --reset', but I don't have + * that supported on my current test bed. I resort to changing this + * configuration in the driver which also causes a driver reset... 
+ */ +static void trigger_device_reset(void) +{ + char command[256]; + + memset(command, 0, sizeof(command)); + snprintf(command, sizeof(command), + "sudo ethtool --set-priv-flags %s enable-header-split off", + ifname); + system(command); + + memset(command, 0, sizeof(command)); + snprintf(command, sizeof(command), + "sudo ethtool --set-priv-flags %s enable-header-split on", + ifname); + system(command); +} + +static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, + __u32 *queue_idx, unsigned int n_queue_index, + struct ynl_sock **ys) +{ + struct netdev_bind_rx_req *req = NULL; + struct ynl_error yerr; + int ret = 0; + + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); + if (!*ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + if (ynl_subscribe(*ys, "mgmt")) + goto err_close; + + req = netdev_bind_rx_req_alloc(); + netdev_bind_rx_req_set_ifindex(req, ifindex); + netdev_bind_rx_req_set_dmabuf_fd(req, dmabuf_fd); + __netdev_bind_rx_req_set_queues(req, queue_idx, n_queue_index); + + ret = netdev_bind_rx(*ys, req); + if (!ret) { + perror("netdev_bind_rx"); + goto err_close; + } + + netdev_bind_rx_req_free(req); + + return 0; + +err_close: + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); + netdev_bind_rx_req_free(req); + ynl_sock_destroy(*ys); + return -1; +} + +static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size) +{ + struct udmabuf_create create; + int ret; + + *devfd = open("/dev/udmabuf", O_RDWR); + if (*devfd < 0) { + fprintf(stderr, + "%s: [skip,no-udmabuf: Unable to access DMA " + "buffer device file]\n", + TEST_PREFIX); + exit(70); + } + + *memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); + if (*memfd < 0) { + printf("%s: [skip,no-memfd]\n", TEST_PREFIX); + exit(72); + } + + ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK); + if (ret < 0) { + printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); + exit(73); + } + + ret = ftruncate(*memfd, dmabuf_size); + if (ret == -1) { + printf("%s: 
[FAIL,memfd-truncate]\n", TEST_PREFIX); + exit(74); + } + + memset(&create, 0, sizeof(create)); + + create.memfd = *memfd; + create.offset = 0; + create.size = dmabuf_size; + *buf = ioctl(*devfd, UDMABUF_CREATE, &create); + if (*buf < 0) { + printf("%s: [FAIL, create udmabuf]\n", TEST_PREFIX); + exit(75); + } +} + +int do_server(void) +{ + char ctrl_data[sizeof(int) * 20000]; + size_t non_page_aligned_frags = 0; + struct sockaddr_in client_addr; + struct sockaddr_in server_sin; + size_t page_aligned_frags = 0; + int devfd, memfd, buf, ret; + size_t total_received = 0; + bool is_devmem = false; + char *buf_mem = NULL; + struct ynl_sock *ys; + size_t dmabuf_size; + char iobuf[819200]; + char buffer[256]; + int socket_fd; + int client_fd; + size_t i = 0; + int opt = 1; + + dmabuf_size = getpagesize() * NUM_PAGES; + + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); + + __u32 *queue_idx = malloc(sizeof(__u32) * 2); + + queue_idx[0] = 14; + queue_idx[1] = 15; + if (bind_rx_queue(3 /* index for eth1 */, buf, queue_idx, 2, &ys)) { + fprintf(stderr, "Failed to bind\n"); + exit(1); + } + + buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED, + buf, 0); + if (buf_mem == MAP_FAILED) { + perror("mmap()"); + exit(1); + } + + /* Need to trigger the NIC to reallocate its RX pages, otherwise the + * bind doesn't take effect. 
+ */ + trigger_device_reset(); + + sleep(1); + + reset_flow_steering(); + configure_flow_steering(); + + server_sin.sin_family = AF_INET; + server_sin.sin_port = htons(atoi(port)); + + ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr); + if (socket < 0) { + printf("%s: [FAIL, create socket]\n", TEST_PREFIX); + exit(79); + } + + socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0); + if (socket < 0) { + printf("%s: [FAIL, create socket]\n", TEST_PREFIX); + exit(76); + } + + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt, + sizeof(opt)); + if (ret) { + printf("%s: [FAIL, set sock opt]: %s\n", TEST_PREFIX, + strerror(errno)); + exit(76); + } + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt, + sizeof(opt)); + if (ret) { + printf("%s: [FAIL, set sock opt]: %s\n", TEST_PREFIX, + strerror(errno)); + exit(76); + } + ret = setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, + sizeof(opt)); + if (ret) { + printf("%s: [FAIL, set sock opt]: %s\n", TEST_PREFIX, + strerror(errno)); + exit(76); + } + + printf("binding to address %s:%d\n", server_ip, + ntohs(server_sin.sin_port)); + + ret = bind(socket_fd, &server_sin, sizeof(server_sin)); + if (ret) { + printf("%s: [FAIL, bind]: %s\n", TEST_PREFIX, strerror(errno)); + exit(76); + } + + ret = listen(socket_fd, 1); + if (ret) { + printf("%s: [FAIL, listen]: %s\n", TEST_PREFIX, + strerror(errno)); + exit(76); + } + + socklen_t client_addr_len = sizeof(client_addr); + + inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer, + sizeof(buffer)); + printf("Waiting or connection on %s:%d\n", buffer, + ntohs(server_sin.sin_port)); + client_fd = accept(socket_fd, &client_addr, &client_addr_len); + + inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer, + sizeof(buffer)); + printf("Got connection from %s:%d\n", buffer, + ntohs(client_addr.sin_port)); + + while (1) { + struct iovec iov = { .iov_base = iobuf, + .iov_len = sizeof(iobuf) }; + struct cmsg_devmem *cmsg_devmem = 
NULL; + struct dma_buf_sync sync = { 0 }; + struct cmsghdr *cm = NULL; + struct msghdr msg = { 0 }; + struct devmemtoken token; + ssize_t ret; + + is_devmem = false; + printf("\n\n"); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM); + printf("recvmsg ret=%ld\n", ret); + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + continue; + } + if (ret < 0) { + perror("recvmsg"); + continue; + } + if (ret == 0) { + printf("client exited\n"); + goto cleanup; + } + + i++; + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + if (cm->cmsg_level != SOL_SOCKET || + (cm->cmsg_type != SCM_DEVMEM_OFFSET && + cm->cmsg_type != SCM_DEVMEM_HEADER)) { + fprintf(stdout, "skipping non-devmem cmsg\n"); + continue; + } + + cmsg_devmem = (struct cmsg_devmem *)CMSG_DATA(cm); + is_devmem = true; + + if (cm->cmsg_type == SCM_DEVMEM_HEADER) { + /* TODO: process data copied from skb's linear + * buffer. + */ + fprintf(stdout, + "SCM_DEVMEM_HEADER. 
" + "cmsg_devmem->frag_size=%u\n", + cmsg_devmem->frag_size); + + continue; + } + + token.token_start = cmsg_devmem->frag_token; + token.token_count = 1; + + total_received += cmsg_devmem->frag_size; + printf("received frag_page=%llu, in_page_offset=%llu," + " frag_offset=%llu, frag_size=%u, token=%u" + " total_received=%lu\n", + cmsg_devmem->frag_offset >> PAGE_SHIFT, + cmsg_devmem->frag_offset % getpagesize(), + cmsg_devmem->frag_offset, cmsg_devmem->frag_size, + cmsg_devmem->frag_token, total_received); + + if (cmsg_devmem->frag_size % getpagesize()) + non_page_aligned_frags++; + else + page_aligned_frags++; + + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + if (do_validation) + validate_buffer( + ((unsigned char *)buf_mem) + + cmsg_devmem->frag_offset, + cmsg_devmem->frag_size); + else + print_nonzero_bytes( + ((unsigned char *)buf_mem) + + cmsg_devmem->frag_offset, + cmsg_devmem->frag_size); + + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + ret = setsockopt(client_fd, SOL_SOCKET, + SO_DEVMEM_DONTNEED, &token, + sizeof(token)); + if (ret != 1) { + perror("SO_DEVMEM_DONTNEED not enough tokens"); + exit(1); + } + } + if (!is_devmem) + printf("flow steering error\n"); + + printf("total_received=%lu\n", total_received); + } + + fprintf(stdout, "%s: ok\n", TEST_PREFIX); + + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", + page_aligned_frags, non_page_aligned_frags); + + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", + page_aligned_frags, non_page_aligned_frags); + +cleanup: + + munmap(buf_mem, dmabuf_size); + close(client_fd); + close(socket_fd); + close(buf); + close(memfd); + close(devfd); + ynl_sock_destroy(ys); + trigger_device_reset(); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int is_server = 0, opt; + + while ((opt = getopt(argc, argv, "ls:c:p:v:q:f:n:i:")) != -1) { + switch (opt) { + case 'l': + 
is_server = 1; + break; + case 's': + server_ip = optarg; + break; + case 'c': + client_ip = optarg; + break; + case 'p': + port = optarg; + break; + case 'v': + do_validation = atoll(optarg); + break; + case 'q': + queue_num = atoi(optarg); + break; + case 'f': + ifname = optarg; + break; + case 'n': + nic_pci_addr = optarg; + break; + case 'i': + iterations = atoll(optarg); + break; + case '?': + printf("unknown option: %c\n", optopt); + break; + } + } + + for (; optind < argc; optind++) { + printf("extra arguments: %s\n", argv[optind]); + } + + if (is_server) + return do_server(); + + return 0; +}
ncdevmem is a devmem TCP netcat. It works similarly to netcat, but it sends and receives data using the devmem TCP APIs. It uses udmabuf as the dmabuf provider. It is compatible with a regular netcat running on a peer, or a ncdevmem running on a peer. In addition to normal netcat support, ncdevmem has a validation mode, where it sends a specific pattern and validates this pattern on the receiver side to ensure data integrity. Suggested-by: Stanislav Fomichev <sdf@google.com> Signed-off-by: Mina Almasry <almasrymina@google.com> --- RFC v2: - General cleanups (Willem). --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 5 + tools/testing/selftests/net/ncdevmem.c | 546 +++++++++++++++++++++++++ 3 files changed, 552 insertions(+) create mode 100644 tools/testing/selftests/net/ncdevmem.c