diff mbox series

[v1,07/15] io_uring: Allocate zctap device buffers and dma map them.

Message ID 20221108050521.3198458-8-jonathan.lemon@gmail.com (mailing list archive)
State New
Headers show
Series zero-copy RX for io_uring | expand

Commit Message

Jonathan Lemon Nov. 8, 2022, 5:05 a.m. UTC
The goal is to register a memory region with the device, and
later specify the desired packet buffer size.  The code currently
assumes each buffer is exactly one page (PAGE_SIZE).

Create the desired number of zctap buffers and DMA map them
to the target device, recording the dma address for later use.

Hold a page reference while the page is dma mapped.

Change the freelist from an array of page pointers to an index
into the device buffer list.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
---
 io_uring/zctap.c | 78 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 63 insertions(+), 15 deletions(-)

Comments

Christoph Hellwig Nov. 16, 2022, 8:15 a.m. UTC | #1
> +		dma_unmap_page_attrs(device, buf->dma, PAGE_SIZE,
> +				     DMA_BIDIRECTIONAL,
> +				     DMA_ATTR_SKIP_CPU_SYNC);

> +		addr = dma_map_page_attrs(device, page, 0, PAGE_SIZE,
> +					  DMA_BIDIRECTIONAL,
> +					  DMA_ATTR_SKIP_CPU_SYNC);

You can't just magically skip cpu syncs.  The flag is only valid
for mappings of already mapped memory when following a very careful
protocol.  I can't see any indications of that being true here,
but maybe I'm just missing something.
Jonathan Lemon Nov. 17, 2022, 8:51 p.m. UTC | #2
On 16 Nov 2022, at 0:15, Christoph Hellwig wrote:

>> +		dma_unmap_page_attrs(device, buf->dma, PAGE_SIZE,
>> +				     DMA_BIDIRECTIONAL,
>> +				     DMA_ATTR_SKIP_CPU_SYNC);
>
>> +		addr = dma_map_page_attrs(device, page, 0, PAGE_SIZE,
>> +					  DMA_BIDIRECTIONAL,
>> +					  DMA_ATTR_SKIP_CPU_SYNC);
>
> You can't just magically skip cpu syncs.  The flag is only valid
> for mappings of already mapped memory when following a very careful
> protocol.  I can't see any indications of that being true here,
> but maybe I'm just missing something.

This was copied from page_pool_dma_map().  The same logic applies -
io_uring pins the memory, zctap creates a pool that is dma mapped, and
the driver performs dma_sync_single_*() when receiving the packet.
—
Jonathan
diff mbox series

Patch

diff --git a/io_uring/zctap.c b/io_uring/zctap.c
index 69a04de87f8f..fe4bb3781636 100644
--- a/io_uring/zctap.c
+++ b/io_uring/zctap.c
@@ -18,11 +18,14 @@ 
 #define NR_ZCTAP_IFQS	1
 
 struct ifq_region {
+	struct io_zctap_ifq	*ifq;		/* only for delayed_work */
 	struct io_mapped_ubuf	*imu;
 	int			free_count;
 	int			nr_pages;
 	u16			id;
-	struct page		*freelist[];
+
+	struct io_zctap_buf	*buf;
+	u16			freelist[];
 };
 
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
@@ -60,49 +63,85 @@  bool io_zctap_put_page(struct io_zctap_ifq *ifq, struct page *page)
 }
 EXPORT_SYMBOL(io_zctap_put_page);
 
+static inline struct device *
+netdev2device(struct net_device *dev)
+{
+	return dev->dev.parent;			/* from SET_NETDEV_DEV() */
+}
+
 static void io_remove_ifq_region(struct ifq_region *ifr)
 {
-	struct io_mapped_ubuf *imu;
-	struct page *page;
+	struct device *device = netdev2device(ifr->ifq->dev);
+	struct io_zctap_buf *buf;
 	int i;
 
-	imu = ifr->imu;
 	for (i = 0; i < ifr->nr_pages; i++) {
-		page = imu->bvec[i].bv_page;
-
-		ClearPagePrivate(page);
-		set_page_private(page, 0);
+		buf = &ifr->buf[i];
+		set_page_private(buf->page, 0);
+		ClearPagePrivate(buf->page);
+		dma_unmap_page_attrs(device, buf->dma, PAGE_SIZE,
+				     DMA_BIDIRECTIONAL,
+				     DMA_ATTR_SKIP_CPU_SYNC);
+		put_page(buf->page);
 	}
 
+	kvfree(ifr->buf);
 	kvfree(ifr);
 }
 
-static int io_zctap_map_region(struct ifq_region *ifr)
+static int io_zctap_map_region(struct ifq_region *ifr, struct device *device)
 {
 	struct io_mapped_ubuf *imu;
+	struct io_zctap_buf *buf;
 	struct page *page;
+	dma_addr_t addr;
+	int i, err;
 	u64 info;
-	int i;
 
 	imu = ifr->imu;
 	for (i = 0; i < ifr->nr_pages; i++) {
 		page = imu->bvec[i].bv_page;
-		if (PagePrivate(page))
+
+		if (PagePrivate(page)) {
+			err = -EEXIST;
 			goto out;
+		}
+
 		SetPagePrivate(page);
 		info = zctap_mk_page_info(ifr->id, i);
 		zctap_set_page_info(page, info);
-		ifr->freelist[i] = page;
+
+		buf = &ifr->buf[i];
+		addr = dma_map_page_attrs(device, page, 0, PAGE_SIZE,
+					  DMA_BIDIRECTIONAL,
+					  DMA_ATTR_SKIP_CPU_SYNC);
+		if (dma_mapping_error(device, addr)) {
+			set_page_private(page, 0);
+			ClearPagePrivate(page);
+			err = -ENOMEM;
+			goto out;
+		}
+		buf->dma = addr;
+		buf->page = page;
+		atomic_set(&buf->refcount, 0);
+		get_page(page);
+
+		ifr->freelist[i] = i;
 	}
 	return 0;
 
 out:
 	while (i--) {
 		page = imu->bvec[i].bv_page;
-		ClearPagePrivate(page);
 		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		buf = &ifr->buf[i];
+		dma_unmap_page_attrs(device, buf->dma, PAGE_SIZE,
+				     DMA_BIDIRECTIONAL,
+				     DMA_ATTR_SKIP_CPU_SYNC);
+		put_page(page);
 	}
-	return -EEXIST;
+	return err;
 }
 
 int io_provide_ifq_region(struct io_zctap_ifq *ifq, u16 id)
@@ -131,13 +170,22 @@  int io_provide_ifq_region(struct io_zctap_ifq *ifq, u16 id)
 	if (!ifr)
 		return -ENOMEM;
 
+	ifr->buf = kvmalloc_array(nr_pages, sizeof(*ifr->buf), GFP_KERNEL);
+	if (!ifr->buf) {
+		kvfree(ifr);
+		return -ENOMEM;
+	}
+
 	ifr->nr_pages = nr_pages;
 	ifr->imu = imu;
 	ifr->free_count = nr_pages;
 	ifr->id = id;
 
-	err = io_zctap_map_region(ifr);
+	ifr->ifq = ifq;		/* XXX */
+
+	err = io_zctap_map_region(ifr, netdev2device(ifq->dev));
 	if (err) {
+		kvfree(ifr->buf);
 		kvfree(ifr);
 		return err;
 	}