| Message ID | 20240327191933.607220-3-axboe@kernel.dk (mailing list archive) |
|---|---|
| State | New |
| Series | Move away from remap_pfn_range() |
On Wed, Mar 27, 2024 at 01:13:37PM -0600, Jens Axboe wrote:
> Rather than use remap_pfn_range() for this and manually free later,
> switch to using vm_insert_pages() and have it Just Work.
>
> If possible, allocate a single compound page that covers the range that
> is needed. If that works, then we can just use page_address() on that
> page. If we fail to get a compound page, allocate single pages and use
> vmap() to map them into the kernel virtual address space.
>
> This just covers the rings/sqes, the other remaining user of the mmap
> remap_pfn_range() user will be converted separately. Once that is done,
> we can kill the old alloc/free code.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>

Overall this looks good to me.

Two comments below:

> @@ -2601,6 +2601,27 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
>  }
>
> +static void io_pages_unmap(void *ptr, struct page ***pages,
> +			   unsigned short *npages)
> +{
> +	bool do_vunmap = false;
> +
> +	if (*npages) {
> +		struct page **to_free = *pages;
> +		int i;
> +
> +		/* only did vmap for non-compound and multiple pages */
> +		do_vunmap = !PageCompound(to_free[0]) && *npages > 1;
> +		for (i = 0; i < *npages; i++)
> +			put_page(to_free[i]);
> +	}
> +	if (do_vunmap)
> +		vunmap(ptr);
> +	kvfree(*pages);
> +	*pages = NULL;
> +	*npages = 0;
> +}
> +
>  void io_mem_free(void *ptr)
>  {
>  	if (!ptr)
> @@ -2701,8 +2722,8 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
>  static void io_rings_free(struct io_ring_ctx *ctx)
>  {
>  	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
> -		io_mem_free(ctx->rings);
> -		io_mem_free(ctx->sq_sqes);
> +		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
> +		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
>  	} else {
>  		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
>  		ctx->n_ring_pages = 0;
> @@ -2714,6 +2735,84 @@ static void io_rings_free(struct io_ring_ctx *ctx)
>  	ctx->sq_sqes = NULL;
>  }
>
> +static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
> +				   size_t size, gfp_t gfp)
> +{
> +	struct page *page;
> +	int i, order;
> +
> +	order = get_order(size);
> +	if (order > MAX_PAGE_ORDER)
> +		return NULL;
> +	else if (order)
> +		gfp |= __GFP_COMP;
> +
> +	page = alloc_pages(gfp, order);
> +	if (!page)
> +		return NULL;
> +
> +	/* add pages, grab a ref to tail pages */
> +	for (i = 0; i < nr_pages; i++) {
> +		pages[i] = page + i;
> +		if (i)
> +			get_page(pages[i]);
> +	}

You don't need those extra refs.

__GFP_COMP makes a super page that acts like a single entity. The ref
returned by alloc_pages() keeps the whole thing alive; you can then do
a single put in io_pages_unmap() for the compound case as well.

[ vm_insert_pages() and munmap() still do gets and puts on the tail
  pages as they are individually mapped and unmapped, but those calls
  get implicitly redirected to the compound refcount maintained in the
  head page. IOW, an munmap() of an individual tail page won't free
  that tail as long as you hold the base ref from the alloc_pages(). ]

> +
> +	return page_address(page);
> +}
> +
> +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
> +				 gfp_t gfp)
> +{
> +	void *ret;
> +	int i;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		pages[i] = alloc_page(gfp);
> +		if (!pages[i])
> +			goto err;
> +	}
> +
> +	ret = vmap(pages, nr_pages, VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);

You can kill the VM_ALLOW_HUGE_VMAP.

It's a no-op in vmap(), since you're passing an array of order-0
pages, which cannot be mapped by anything larger than PTEs.
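For reference, the refcounting model described above can be condensed into a short sketch (not part of the patch; the helper names are made up for illustration): the base reference returned by alloc_pages() with __GFP_COMP pins the entire compound page, so a single put_page() on the head releases it, regardless of any get/put done on the tail pages by vm_insert_pages() and munmap().

```c
/*
 * Illustrative sketch only, not io_uring code: helper names are
 * hypothetical. The base ref from alloc_pages(__GFP_COMP) keeps the
 * whole compound page alive; freeing is one put_page() on the head.
 */
#include <linux/gfp.h>
#include <linux/mm.h>

static void *ring_region_alloc(size_t size, struct page **head)
{
	unsigned int order = get_order(size);
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
	struct page *page;

	if (order)
		gfp |= __GFP_COMP;
	page = alloc_pages(gfp, order);
	if (!page)
		return NULL;
	*head = page;
	return page_address(page);
}

static void ring_region_free(struct page *head)
{
	/* drops the base ref; tail get/put from mmap/munmap redirect here */
	put_page(head);
}
```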
On 3/28/24 8:08 AM, Johannes Weiner wrote:
> On Wed, Mar 27, 2024 at 01:13:37PM -0600, Jens Axboe wrote:
>> Rather than use remap_pfn_range() for this and manually free later,
>> switch to using vm_insert_pages() and have it Just Work.
>>
>> If possible, allocate a single compound page that covers the range that
>> is needed. If that works, then we can just use page_address() on that
>> page. If we fail to get a compound page, allocate single pages and use
>> vmap() to map them into the kernel virtual address space.
>>
>> This just covers the rings/sqes, the other remaining user of the mmap
>> remap_pfn_range() user will be converted separately. Once that is done,
>> we can kill the old alloc/free code.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>
> Overall this looks good to me.
>
> Two comments below:
>
>> @@ -2601,6 +2601,27 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>>  	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
>>  }
>>
>> +static void io_pages_unmap(void *ptr, struct page ***pages,
>> +			   unsigned short *npages)
>> +{
>> +	bool do_vunmap = false;
>> +
>> +	if (*npages) {
>> +		struct page **to_free = *pages;
>> +		int i;
>> +
>> +		/* only did vmap for non-compound and multiple pages */
>> +		do_vunmap = !PageCompound(to_free[0]) && *npages > 1;
>> +		for (i = 0; i < *npages; i++)
>> +			put_page(to_free[i]);
>> +	}
>> +	if (do_vunmap)
>> +		vunmap(ptr);
>> +	kvfree(*pages);
>> +	*pages = NULL;
>> +	*npages = 0;
>> +}
>> +
>>  void io_mem_free(void *ptr)
>>  {
>>  	if (!ptr)
>> @@ -2701,8 +2722,8 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
>>  static void io_rings_free(struct io_ring_ctx *ctx)
>>  {
>>  	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
>> -		io_mem_free(ctx->rings);
>> -		io_mem_free(ctx->sq_sqes);
>> +		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
>> +		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
>>  	} else {
>>  		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
>>  		ctx->n_ring_pages = 0;
>> @@ -2714,6 +2735,84 @@ static void io_rings_free(struct io_ring_ctx *ctx)
>>  	ctx->sq_sqes = NULL;
>>  }
>>
>> +static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
>> +				   size_t size, gfp_t gfp)
>> +{
>> +	struct page *page;
>> +	int i, order;
>> +
>> +	order = get_order(size);
>> +	if (order > MAX_PAGE_ORDER)
>> +		return NULL;
>> +	else if (order)
>> +		gfp |= __GFP_COMP;
>> +
>> +	page = alloc_pages(gfp, order);
>> +	if (!page)
>> +		return NULL;
>> +
>> +	/* add pages, grab a ref to tail pages */
>> +	for (i = 0; i < nr_pages; i++) {
>> +		pages[i] = page + i;
>> +		if (i)
>> +			get_page(pages[i]);
>> +	}
>
> You don't need those extra refs.
>
> __GFP_COMP makes a super page that acts like a single entity. The ref
> returned by alloc_pages() keeps the whole thing alive; you can then do
> a single put in io_pages_unmap() for the compound case as well.
>
> [ vm_insert_pages() and munmap() still do gets and puts on the tail
>   pages as they are individually mapped and unmapped, but those calls
>   get implicitly redirected to the compound refcount maintained in the
>   head page. IOW, an munmap() of an individual tail page won't free
>   that tail as long as you hold the base ref from the alloc_pages(). ]

OK then, so I can just do something ala:

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index bf1527055679..d168752c206f 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -29,12 +29,8 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
 	if (!page)
 		return NULL;
 
-	/* add pages, grab a ref to tail pages */
-	for (i = 0; i < nr_pages; i++) {
+	for (i = 0; i < nr_pages; i++)
 		pages[i] = page + i;
-		if (i)
-			get_page(pages[i]);
-	}
 
 	return page_address(page);
 }
@@ -100,8 +96,14 @@ void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
 		struct page **to_free = *pages;
 		int i;
 
-		/* only did vmap for non-compound and multiple pages */
-		do_vunmap = !PageCompound(to_free[0]) && *npages > 1;
+		/*
+		 * Only did vmap for the non-compound multiple page case.
+		 * For the compound page, we just need to put the head.
+		 */
+		if (PageCompound(to_free[0]))
+			*npages = 1;
+		else if (*npages > 1)
+			do_vunmap = true;
 		for (i = 0; i < *npages; i++)
 			put_page(to_free[i]);
 	}

and not need any extra refs. I wish the compound page was a bit more
integrated, eg I could just do vm_insert_page() on a single compound page
and have it Just Work. But I have to treat it as separate pages there.

Thanks!

>> +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
>> +				 gfp_t gfp)
>> +{
>> +	void *ret;
>> +	int i;
>> +
>> +	for (i = 0; i < nr_pages; i++) {
>> +		pages[i] = alloc_page(gfp);
>> +		if (!pages[i])
>> +			goto err;
>> +	}
>> +
>> +	ret = vmap(pages, nr_pages, VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);
>
> You can kill the VM_ALLOW_HUGE_VMAP.
>
> It's a no-op in vmap(), since you're passing an array of order-0
> pages, which cannot be mapped by anything larger than PTEs.

Noted, will kill the VM_ALLOW_HUGE_VMAP.
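For reference, a sketch of what the order-0 fallback looks like with that review point applied, i.e. VM_ALLOW_HUGE_VMAP dropped (an assumed shape based on the posted patch, not the final committed code):

```c
/*
 * Sketch of the order-0 fallback with the flag removed: VM_MAP only,
 * since an array of order-0 pages can only be PTE-mapped anyway.
 * Assumes the same context as the posted patch; not the final code.
 */
static void *io_mem_alloc_single_sketch(struct page **pages, int nr_pages,
					gfp_t gfp)
{
	void *ret;
	int i;

	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(gfp);
		if (!pages[i])
			goto err;
	}

	ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
	if (ret)
		return ret;
err:
	while (i--)
		put_page(pages[i]);
	return ERR_PTR(-ENOMEM);
}
```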
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 585fbc363eaf..29d0c1764aab 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2601,6 +2601,27 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
+static void io_pages_unmap(void *ptr, struct page ***pages,
+			   unsigned short *npages)
+{
+	bool do_vunmap = false;
+
+	if (*npages) {
+		struct page **to_free = *pages;
+		int i;
+
+		/* only did vmap for non-compound and multiple pages */
+		do_vunmap = !PageCompound(to_free[0]) && *npages > 1;
+		for (i = 0; i < *npages; i++)
+			put_page(to_free[i]);
+	}
+	if (do_vunmap)
+		vunmap(ptr);
+	kvfree(*pages);
+	*pages = NULL;
+	*npages = 0;
+}
+
 void io_mem_free(void *ptr)
 {
 	if (!ptr)
@@ -2701,8 +2722,8 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
 static void io_rings_free(struct io_ring_ctx *ctx)
 {
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
-		io_mem_free(ctx->rings);
-		io_mem_free(ctx->sq_sqes);
+		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
+		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
 	} else {
 		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
 		ctx->n_ring_pages = 0;
@@ -2714,6 +2735,84 @@ static void io_rings_free(struct io_ring_ctx *ctx)
 	ctx->sq_sqes = NULL;
 }
 
+static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
+				   size_t size, gfp_t gfp)
+{
+	struct page *page;
+	int i, order;
+
+	order = get_order(size);
+	if (order > MAX_PAGE_ORDER)
+		return NULL;
+	else if (order)
+		gfp |= __GFP_COMP;
+
+	page = alloc_pages(gfp, order);
+	if (!page)
+		return NULL;
+
+	/* add pages, grab a ref to tail pages */
+	for (i = 0; i < nr_pages; i++) {
+		pages[i] = page + i;
+		if (i)
+			get_page(pages[i]);
+	}
+
+	return page_address(page);
+}
+
+static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
+				 gfp_t gfp)
+{
+	void *ret;
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		pages[i] = alloc_page(gfp);
+		if (!pages[i])
+			goto err;
+	}
+
+	ret = vmap(pages, nr_pages, VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);
+	if (ret)
+		return ret;
+err:
+	while (i--)
+		put_page(pages[i]);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+			  size_t size)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+	struct page **pages;
+	int nr_pages;
+	void *ret;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
+	if (ret)
+		goto done;
+
+	ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
+	if (ret) {
+done:
+		*out_pages = pages;
+		*npages = nr_pages;
+		return ret;
+	}
+
+	kvfree(pages);
+	*out_pages = NULL;
+	*npages = 0;
+	return ERR_PTR(-ENOMEM);
+}
+
 void *io_mem_alloc(size_t size)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
@@ -3301,14 +3400,12 @@ static void *io_uring_validate_mmap_request(struct file *file,
 		/* Don't allow mmap if the ring was setup without it */
 		if (ctx->flags & IORING_SETUP_NO_MMAP)
 			return ERR_PTR(-EINVAL);
-		ptr = ctx->rings;
-		break;
+		return ctx->rings;
 	case IORING_OFF_SQES:
 		/* Don't allow mmap if the ring was setup without it */
 		if (ctx->flags & IORING_SETUP_NO_MMAP)
 			return ERR_PTR(-EINVAL);
-		ptr = ctx->sq_sqes;
-		break;
+		return ctx->sq_sqes;
 	case IORING_OFF_PBUF_RING: {
 		unsigned int bgid;
 
@@ -3331,11 +3428,22 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	return ptr;
 }
 
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+			struct page **pages, int npages)
+{
+	unsigned long nr_pages = npages;
+
+	vm_flags_set(vma, VM_DONTEXPAND);
+	return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+}
+
 #ifdef CONFIG_MMU
 
 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct io_ring_ctx *ctx = file->private_data;
 	size_t sz = vma->vm_end - vma->vm_start;
+	long offset = vma->vm_pgoff << PAGE_SHIFT;
 	unsigned long pfn;
 	void *ptr;
 
@@ -3343,6 +3451,16 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
 
+	switch (offset & IORING_OFF_MMAP_MASK) {
+	case IORING_OFF_SQ_RING:
+	case IORING_OFF_CQ_RING:
+		return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
+					   ctx->n_ring_pages);
+	case IORING_OFF_SQES:
+		return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
+					   ctx->n_sqe_pages);
+	}
+
 	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 }
@@ -3632,7 +3750,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 		return -EOVERFLOW;
 
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
-		rings = io_mem_alloc(size);
+		rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
 	else
 		rings = io_rings_map(ctx, p->cq_off.user_addr, size);
 
@@ -3657,7 +3775,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	}
 
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
-		ptr = io_mem_alloc(size);
+		ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
 	else
 		ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 7654dfb34c2e..ac2a84542417 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -70,6 +70,8 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
 
 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+			struct page **pages, int npages);
 
 struct file *io_file_get_normal(struct io_kiocb *req, int fd);
 struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
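As an aside, the conversion pattern in this patch is not io_uring specific: any driver that previously remap_pfn_range()'d over pages it allocated itself can hand the same page array to vm_insert_pages(). A hedged, generic sketch of such an .mmap handler (the my_dev structure and its fields are hypothetical, not from io_uring):

```c
/*
 * Generic sketch of the conversion this series performs: back a VMA
 * with an existing page array via vm_insert_pages() instead of
 * remap_pfn_range(). 'struct my_dev' and its fields are hypothetical.
 */
#include <linux/fs.h>
#include <linux/mm.h>

struct my_dev {
	struct page **pages;
	unsigned long nr_pages;
};

static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_dev *dev = file->private_data;
	unsigned long nr = vma_pages(vma);

	if (nr > dev->nr_pages)
		return -EINVAL;

	/* refuse mremap() growth; there are no pages beyond the array */
	vm_flags_set(vma, VM_DONTEXPAND);
	return vm_insert_pages(vma, vma->vm_start, dev->pages, &nr);
}
```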
Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.

If possible, allocate a single compound page that covers the range that
is needed. If that works, then we can just use page_address() on that
page. If we fail to get a compound page, allocate single pages and use
vmap() to map them into the kernel virtual address space.

This just covers the rings/sqes, the other remaining user of the mmap
remap_pfn_range() user will be converted separately. Once that is done,
we can kill the old alloc/free code.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 134 +++++++++++++++++++++++++++++++++++++++++---
 io_uring/io_uring.h |   2 +
 2 files changed, 128 insertions(+), 8 deletions(-)
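The userspace side of the ABI is unchanged by this patch; for context, a minimal userspace sketch of the two mappings that now go through io_uring_mmap_pages()/vm_insert_pages() on the kernel side (standard io_uring_setup() plus mmap() usage from <linux/io_uring.h>, nothing specific to this series):

```c
/*
 * Userspace view of the rings whose kernel backing this patch changes:
 * the same mmap() offsets as before, now served by vm_insert_pages().
 * Minimal sketch, error handling kept to the bare minimum.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;
	void *sq_ring, *sqes;
	int fd;

	memset(&p, 0, sizeof(p));
	fd = syscall(__NR_io_uring_setup, 8, &p);
	if (fd < 0)
		return 1;

	/* SQ/CQ ring: lands in the ctx->ring_pages mapping path */
	sq_ring = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(unsigned),
		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		       fd, IORING_OFF_SQ_RING);

	/* SQE array: lands in the ctx->sqe_pages mapping path */
	sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		    fd, IORING_OFF_SQES);

	return (sq_ring == MAP_FAILED || sqes == MAP_FAILED);
}
```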