diff mbox series

[v1,05/15] io_uring: mark pages in ifq region with zctap information.

Message ID 20221108050521.3198458-6-jonathan.lemon@gmail.com (mailing list archive)
State New
Headers show
Series zero-copy RX for io_uring | expand

Commit Message

Jonathan Lemon Nov. 8, 2022, 5:05 a.m. UTC
The network stack passes up pages, which must be mapped to
zctap device buffers in order to get the reference count and
other items.  Mark the page as private, and use the page_private
field to record the lookup and ownership information.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
---
 io_uring/zctap.c | 61 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 5 deletions(-)

Comments

Christoph Hellwig Nov. 16, 2022, 8:12 a.m. UTC | #1
On Mon, Nov 07, 2022 at 09:05:11PM -0800, Jonathan Lemon wrote:
> The network stack passes up pages, which must be mapped to
> zctap device buffers in order to get the reference count and
> other items.  Mark the page as private, and use the page_private
> field to record the lookup and ownership information.

Who coordinate ownership of page_private here?  What other parts
of the kernel could touch these pages?
Jonathan Lemon Nov. 17, 2022, 8:48 p.m. UTC | #2
On 16 Nov 2022, at 0:12, Christoph Hellwig wrote:

> On Mon, Nov 07, 2022 at 09:05:11PM -0800, Jonathan Lemon wrote:
>> The network stack passes up pages, which must be mapped to
>> zctap device buffers in order to get the reference count and
>> other items.  Mark the page as private, and use the page_private
>> field to record the lookup and ownership information.
>
> Who coordinate ownership of page_private here?  What other parts
> of the kernel could touch these pages?

This may be an issue.  The driver, network stack, and application
all utilize the pages, which have been pinned by io_uring.  If the
pages are passed to another subsystem which wants to use the private
area for its own purposes, there will be a conflict.

There doesn’t seem to be a good way to maintain long term information
in the current page structure.  A resolution for this would be using
an external lookup indexed by the page, or changing the information
passed up in the skb.
—
Jonathan
diff mbox series

Patch

diff --git a/io_uring/zctap.c b/io_uring/zctap.c
index 0705f5056d07..7426feee1e04 100644
--- a/io_uring/zctap.c
+++ b/io_uring/zctap.c
@@ -27,18 +27,68 @@  struct ifq_region {
 
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 
+static void zctap_set_page_info(struct page *page, u64 info)
+{
+	set_page_private(page, info);
+}
+
+static u64 zctap_mk_page_info(u16 region_id, u16 pgid)
+{
+	return (u64)0xface << 48 | (u64)region_id << 16 | (u64)pgid;
+}
+
 static void io_remove_ifq_region(struct ifq_region *ifr)
 {
+	struct io_mapped_ubuf *imu;
+	struct page *page;
+	int i;
+
+	imu = ifr->imu;
+	for (i = 0; i < ifr->nr_pages; i++) {
+		page = imu->bvec[i].bv_page;
+
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+	}
+
 	kvfree(ifr);
 }
 
+static int io_zctap_map_region(struct ifq_region *ifr)
+{
+	struct io_mapped_ubuf *imu;
+	struct page *page;
+	u64 info;
+	int i;
+
+	imu = ifr->imu;
+	for (i = 0; i < ifr->nr_pages; i++) {
+		page = imu->bvec[i].bv_page;
+		if (PagePrivate(page))
+			goto out;
+		SetPagePrivate(page);
+		info = zctap_mk_page_info(ifr->id, i);
+		zctap_set_page_info(page, info);
+		ifr->freelist[i] = page;
+	}
+	return 0;
+
+out:
+	while (i--) {
+		page = imu->bvec[i].bv_page;
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+	}
+	return -EEXIST;
+}
+
 int io_provide_ifq_region(struct io_zctap_ifq *ifq, u16 id)
 {
 	struct io_ring_ctx *ctx = ifq->ctx;
 	struct io_mapped_ubuf *imu;
 	struct ifq_region *ifr;
-	int i, nr_pages;
-	struct page *page;
+	int nr_pages;
+	int err;
 
 	/* XXX for now, only allow one region per ifq. */
 	if (ifq->region)
@@ -63,9 +113,10 @@  int io_provide_ifq_region(struct io_zctap_ifq *ifq, u16 id)
 	ifr->free_count = nr_pages;
 	ifr->id = id;
 
-	for (i = 0; i < nr_pages; i++) {
-		page = imu->bvec[i].bv_page;
-		ifr->freelist[i] = page;
+	err = io_zctap_map_region(ifr);
+	if (err) {
+		kvfree(ifr);
+		return err;
 	}
 
 	ifq->region = ifr;