[02/10] io_uring: get rid of remap_pfn_range() for mapping rings/sqes

Message ID 20240327191933.607220-3-axboe@kernel.dk (mailing list archive)
State New
Series Move away from remap_pfn_range()

Commit Message

Jens Axboe March 27, 2024, 7:13 p.m. UTC
Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.

If possible, allocate a single compound page that covers the range that
is needed. If that works, then we can just use page_address() on that
page. If we fail to get a compound page, allocate single pages and use
vmap() to map them into the kernel virtual address space.

This just covers the rings/sqes; the other remaining user of
remap_pfn_range() in the mmap path will be converted separately. Once that
is done, we can kill the old alloc/free code.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 134 +++++++++++++++++++++++++++++++++++++++++---
 io_uring/io_uring.h |   2 +
 2 files changed, 128 insertions(+), 8 deletions(-)
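
In short, the strategy described above amounts to roughly the following
condensed sketch. The function name io_pages_map_sketch is illustrative
only, and error handling plus the MAX_PAGE_ORDER check are trimmed; see
io_pages_map(), io_mem_alloc_compound() and io_mem_alloc_single() in the
patch at the bottom of this page for the real code.

static void *io_pages_map_sketch(struct page ***out_pages,
				 unsigned short *npages, size_t size)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
	int i, nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	int order = get_order(size);
	struct page **pages, *page;
	void *ret;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);

	/* first choice: a single (compound) allocation covering the range */
	page = alloc_pages(gfp | (order ? __GFP_COMP : 0), order);
	if (page) {
		for (i = 0; i < nr_pages; i++)
			pages[i] = page + i;
		ret = page_address(page);	/* directly addressable */
	} else {
		/* fallback: order-0 pages stitched together via vmap() */
		for (i = 0; i < nr_pages; i++)
			pages[i] = alloc_page(gfp);
		ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
	}

	/* the page array is what later gets fed to vm_insert_pages() */
	*out_pages = pages;
	*npages = nr_pages;
	return ret;
}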

Comments

Johannes Weiner March 28, 2024, 2:08 p.m. UTC | #1
On Wed, Mar 27, 2024 at 01:13:37PM -0600, Jens Axboe wrote:
> Rather than use remap_pfn_range() for this and manually free later,
> switch to using vm_insert_pages() and have it Just Work.
> 
> If possible, allocate a single compound page that covers the range that
> is needed. If that works, then we can just use page_address() on that
> page. If we fail to get a compound page, allocate single pages and use
> vmap() to map them into the kernel virtual address space.
> 
> This just covers the rings/sqes; the other remaining user of
> remap_pfn_range() in the mmap path will be converted separately. Once that
> is done, we can kill the old alloc/free code.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>

Overall this looks good to me.

Two comments below:

> @@ -2601,6 +2601,27 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
>  }
>  
> +static void io_pages_unmap(void *ptr, struct page ***pages,
> +			   unsigned short *npages)
> +{
> +	bool do_vunmap = false;
> +
> +	if (*npages) {
> +		struct page **to_free = *pages;
> +		int i;
> +
> +		/* only did vmap for non-compound and multiple pages */
> +		do_vunmap = !PageCompound(to_free[0]) && *npages > 1;
> +		for (i = 0; i < *npages; i++)
> +			put_page(to_free[i]);
> +	}
> +	if (do_vunmap)
> +		vunmap(ptr);
> +	kvfree(*pages);
> +	*pages = NULL;
> +	*npages = 0;
> +}
> +
>  void io_mem_free(void *ptr)
>  {
>  	if (!ptr)
> @@ -2701,8 +2722,8 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
>  static void io_rings_free(struct io_ring_ctx *ctx)
>  {
>  	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
> -		io_mem_free(ctx->rings);
> -		io_mem_free(ctx->sq_sqes);
> +		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
> +		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
>  	} else {
>  		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
>  		ctx->n_ring_pages = 0;
> @@ -2714,6 +2735,84 @@ static void io_rings_free(struct io_ring_ctx *ctx)
>  	ctx->sq_sqes = NULL;
>  }
>  
> +static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
> +				   size_t size, gfp_t gfp)
> +{
> +	struct page *page;
> +	int i, order;
> +
> +	order = get_order(size);
> +	if (order > MAX_PAGE_ORDER)
> +		return NULL;
> +	else if (order)
> +		gfp |= __GFP_COMP;
> +
> +	page = alloc_pages(gfp, order);
> +	if (!page)
> +		return NULL;
> +
> +	/* add pages, grab a ref to tail pages */
> +	for (i = 0; i < nr_pages; i++) {
> +		pages[i] = page + i;
> +		if (i)
> +			get_page(pages[i]);
> +	}

You don't need those extra refs.

__GFP_COMP makes a super page that acts like a single entity. The ref
returned by alloc_pages() keeps the whole thing alive; you can then do
a single put in io_pages_unmap() for the compound case as well.

[ vm_insert_pages() and munmap() still do gets and puts on the tail
  pages as they are individually mapped and unmapped, but those calls
  get implicitly redirected to the compound refcount maintained in the
  head page. IOW, an munmap() of an individual tail page won't free
  that tail as long as you hold the base ref from the alloc_pages(). ]
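
In code terms, the suggestion boils down to roughly this (a sketch, not
the posted patch):

	/* alloc: no per-tail get_page(), the base ref pins the whole thing */
	page = alloc_pages(gfp | __GFP_COMP, order);
	for (i = 0; i < nr_pages; i++)
		pages[i] = page + i;

	/* free: a single put on the head releases the whole compound page */
	if (PageCompound(pages[0]))
		put_page(pages[0]);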

> +
> +	return page_address(page);
> +}
> +
> +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
> +				 gfp_t gfp)
> +{
> +	void *ret;
> +	int i;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		pages[i] = alloc_page(gfp);
> +		if (!pages[i])
> +			goto err;
> +	}
> +
> +	ret = vmap(pages, nr_pages, VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);

You can kill the VM_ALLOW_HUGE_VMAP.

It's a no-op in vmap(), since you're passing an array of order-0
pages, which cannot be mapped by anything larger than PTEs.
Jens Axboe March 28, 2024, 2:49 p.m. UTC | #2
On 3/28/24 8:08 AM, Johannes Weiner wrote:
> On Wed, Mar 27, 2024 at 01:13:37PM -0600, Jens Axboe wrote:
>> Rather than use remap_pfn_range() for this and manually free later,
>> switch to using vm_insert_pages() and have it Just Work.
>>
>> If possible, allocate a single compound page that covers the range that
>> is needed. If that works, then we can just use page_address() on that
>> page. If we fail to get a compound page, allocate single pages and use
>> vmap() to map them into the kernel virtual address space.
>>
>> This just covers the rings/sqes; the other remaining user of
>> remap_pfn_range() in the mmap path will be converted separately. Once that
>> is done, we can kill the old alloc/free code.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> 
> Overall this looks good to me.
> 
> Two comments below:
> 
>> @@ -2601,6 +2601,27 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>>  	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
>>  }
>>  
>> +static void io_pages_unmap(void *ptr, struct page ***pages,
>> +			   unsigned short *npages)
>> +{
>> +	bool do_vunmap = false;
>> +
>> +	if (*npages) {
>> +		struct page **to_free = *pages;
>> +		int i;
>> +
>> +		/* only did vmap for non-compound and multiple pages */
>> +		do_vunmap = !PageCompound(to_free[0]) && *npages > 1;
>> +		for (i = 0; i < *npages; i++)
>> +			put_page(to_free[i]);
>> +	}
>> +	if (do_vunmap)
>> +		vunmap(ptr);
>> +	kvfree(*pages);
>> +	*pages = NULL;
>> +	*npages = 0;
>> +}
>> +
>>  void io_mem_free(void *ptr)
>>  {
>>  	if (!ptr)
>> @@ -2701,8 +2722,8 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
>>  static void io_rings_free(struct io_ring_ctx *ctx)
>>  {
>>  	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
>> -		io_mem_free(ctx->rings);
>> -		io_mem_free(ctx->sq_sqes);
>> +		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
>> +		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
>>  	} else {
>>  		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
>>  		ctx->n_ring_pages = 0;
>> @@ -2714,6 +2735,84 @@ static void io_rings_free(struct io_ring_ctx *ctx)
>>  	ctx->sq_sqes = NULL;
>>  }
>>  
>> +static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
>> +				   size_t size, gfp_t gfp)
>> +{
>> +	struct page *page;
>> +	int i, order;
>> +
>> +	order = get_order(size);
>> +	if (order > MAX_PAGE_ORDER)
>> +		return NULL;
>> +	else if (order)
>> +		gfp |= __GFP_COMP;
>> +
>> +	page = alloc_pages(gfp, order);
>> +	if (!page)
>> +		return NULL;
>> +
>> +	/* add pages, grab a ref to tail pages */
>> +	for (i = 0; i < nr_pages; i++) {
>> +		pages[i] = page + i;
>> +		if (i)
>> +			get_page(pages[i]);
>> +	}
> 
> You don't need those extra refs.
> 
> __GFP_COMP makes a super page that acts like a single entity. The ref
> returned by alloc_pages() keeps the whole thing alive; you can then do
> a single put in io_pages_unmap() for the compound case as well.
> 
> [ vm_insert_pages() and munmap() still do gets and puts on the tail
>   pages as they are individually mapped and unmapped, but those calls
>   get implicitly redirected to the compound refcount maintained in the
>   head page. IOW, an munmap() of an individual tail page won't free
>   that tail as long as you hold the base ref from the alloc_pages(). ]

OK then, so I can just do something ala:

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index bf1527055679..d168752c206f 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -29,12 +29,8 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
 	if (!page)
 		return NULL;
 
-	/* add pages, grab a ref to tail pages */
-	for (i = 0; i < nr_pages; i++) {
+	for (i = 0; i < nr_pages; i++)
 		pages[i] = page + i;
-		if (i)
-			get_page(pages[i]);
-	}
 
 	return page_address(page);
 }
@@ -100,8 +96,14 @@ void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
 		struct page **to_free = *pages;
 		int i;
 
-		/* only did vmap for non-compound and multiple pages */
-		do_vunmap = !PageCompound(to_free[0]) && *npages > 1;
+		/*
+		 * Only did vmap for the non-compound multiple page case.
+		 * For the compound page, we just need to put the head.
+		 */
+		if (PageCompound(to_free[0]))
+			*npages = 1;
+		else if (*npages > 1)
+			do_vunmap = true;
 		for (i = 0; i < *npages; i++)
 			put_page(to_free[i]);
 	}

and not need any extra refs. I wish the compound page were a bit more
integrated, e.g. that I could just do vm_insert_page() on a single compound
page and have it Just Work. But I have to treat it as separate pages there.

Thanks!
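
For reference, the mmap side of the patch does exactly that:
io_uring_mmap_pages() hands the per-page array (head plus tails in the
compound case) to vm_insert_pages(), which takes per-page refs that
resolve to the compound head:

	vm_flags_set(vma, VM_DONTEXPAND);
	return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);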


>> +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
>> +				 gfp_t gfp)
>> +{
>> +	void *ret;
>> +	int i;
>> +
>> +	for (i = 0; i < nr_pages; i++) {
>> +		pages[i] = alloc_page(gfp);
>> +		if (!pages[i])
>> +			goto err;
>> +	}
>> +
>> +	ret = vmap(pages, nr_pages, VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);
> 
> You can kill the VM_ALLOW_HUGE_VMAP.
> 
> It's a no-op in vmap(), since you're passing an array of order-0
> pages, which cannot be mapped by anything larger than PTEs.

Noted, will kill the VM_ALLOW_HUGE_VMAP.
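
i.e. the fallback mapping in io_mem_alloc_single() then becomes simply
(a sketch of the planned change, not part of the patch as posted below):

	/* order-0 pages can only be PTE-mapped, so plain VM_MAP suffices */
	ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);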

Patch

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 585fbc363eaf..29d0c1764aab 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2601,6 +2601,27 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
+static void io_pages_unmap(void *ptr, struct page ***pages,
+			   unsigned short *npages)
+{
+	bool do_vunmap = false;
+
+	if (*npages) {
+		struct page **to_free = *pages;
+		int i;
+
+		/* only did vmap for non-compound and multiple pages */
+		do_vunmap = !PageCompound(to_free[0]) && *npages > 1;
+		for (i = 0; i < *npages; i++)
+			put_page(to_free[i]);
+	}
+	if (do_vunmap)
+		vunmap(ptr);
+	kvfree(*pages);
+	*pages = NULL;
+	*npages = 0;
+}
+
 void io_mem_free(void *ptr)
 {
 	if (!ptr)
@@ -2701,8 +2722,8 @@  static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
 static void io_rings_free(struct io_ring_ctx *ctx)
 {
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
-		io_mem_free(ctx->rings);
-		io_mem_free(ctx->sq_sqes);
+		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
+		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
 	} else {
 		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
 		ctx->n_ring_pages = 0;
@@ -2714,6 +2735,84 @@  static void io_rings_free(struct io_ring_ctx *ctx)
 	ctx->sq_sqes = NULL;
 }
 
+static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
+				   size_t size, gfp_t gfp)
+{
+	struct page *page;
+	int i, order;
+
+	order = get_order(size);
+	if (order > MAX_PAGE_ORDER)
+		return NULL;
+	else if (order)
+		gfp |= __GFP_COMP;
+
+	page = alloc_pages(gfp, order);
+	if (!page)
+		return NULL;
+
+	/* add pages, grab a ref to tail pages */
+	for (i = 0; i < nr_pages; i++) {
+		pages[i] = page + i;
+		if (i)
+			get_page(pages[i]);
+	}
+
+	return page_address(page);
+}
+
+static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
+				 gfp_t gfp)
+{
+	void *ret;
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		pages[i] = alloc_page(gfp);
+		if (!pages[i])
+			goto err;
+	}
+
+	ret = vmap(pages, nr_pages, VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);
+	if (ret)
+		return ret;
+err:
+	while (i--)
+		put_page(pages[i]);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+			  size_t size)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+	struct page **pages;
+	int nr_pages;
+	void *ret;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
+	if (ret)
+		goto done;
+
+	ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
+	if (ret) {
+done:
+		*out_pages = pages;
+		*npages = nr_pages;
+		return ret;
+	}
+
+	kvfree(pages);
+	*out_pages = NULL;
+	*npages = 0;
+	return ERR_PTR(-ENOMEM);
+}
+
 void *io_mem_alloc(size_t size)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
@@ -3301,14 +3400,12 @@  static void *io_uring_validate_mmap_request(struct file *file,
 		/* Don't allow mmap if the ring was setup without it */
 		if (ctx->flags & IORING_SETUP_NO_MMAP)
 			return ERR_PTR(-EINVAL);
-		ptr = ctx->rings;
-		break;
+		return ctx->rings;
 	case IORING_OFF_SQES:
 		/* Don't allow mmap if the ring was setup without it */
 		if (ctx->flags & IORING_SETUP_NO_MMAP)
 			return ERR_PTR(-EINVAL);
-		ptr = ctx->sq_sqes;
-		break;
+		return ctx->sq_sqes;
 	case IORING_OFF_PBUF_RING: {
 		unsigned int bgid;
 
@@ -3331,11 +3428,22 @@  static void *io_uring_validate_mmap_request(struct file *file,
 	return ptr;
 }
 
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+			struct page **pages, int npages)
+{
+	unsigned long nr_pages = npages;
+
+	vm_flags_set(vma, VM_DONTEXPAND);
+	return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+}
+
 #ifdef CONFIG_MMU
 
 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct io_ring_ctx *ctx = file->private_data;
 	size_t sz = vma->vm_end - vma->vm_start;
+	long offset = vma->vm_pgoff << PAGE_SHIFT;
 	unsigned long pfn;
 	void *ptr;
 
@@ -3343,6 +3451,16 @@  static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
 
+	switch (offset & IORING_OFF_MMAP_MASK) {
+	case IORING_OFF_SQ_RING:
+	case IORING_OFF_CQ_RING:
+		return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
+						ctx->n_ring_pages);
+	case IORING_OFF_SQES:
+		return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
+						ctx->n_sqe_pages);
+	}
+
 	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 }
@@ -3632,7 +3750,7 @@  static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 		return -EOVERFLOW;
 
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
-		rings = io_mem_alloc(size);
+		rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
 	else
 		rings = io_rings_map(ctx, p->cq_off.user_addr, size);
 
@@ -3657,7 +3775,7 @@  static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	}
 
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
-		ptr = io_mem_alloc(size);
+		ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
 	else
 		ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 7654dfb34c2e..ac2a84542417 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -70,6 +70,8 @@  bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
 
 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+			struct page **pages, int npages);
 
 struct file *io_file_get_normal(struct io_kiocb *req, int fd);
 struct file *io_file_get_fixed(struct io_kiocb *req, int fd,