Message ID | 20240514075444.590910-4-cliang01.li@samsung.com (mailing list archive)
---|---
State | New
Series | io_uring/rsrc: coalescing multi-hugepage registered buffers
On 5/14/24 08:54, Chenliang Li wrote:
> Introduce two functions to separate the coalesced imu alloc and
> accounting path from the original one. This helps to keep the original
> code path clean.
>
> Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
> ---
>  io_uring/rsrc.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 89 insertions(+)
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 578d382ca9bc..53fac5f27bbf 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -871,6 +871,45 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>  	return ret;
>  }
>
> +static int io_coalesced_buffer_account_pin(struct io_ring_ctx *ctx,
> +					   struct page **pages,
> +					   struct io_mapped_ubuf *imu,
> +					   struct page **last_hpage,
> +					   struct io_imu_folio_data *data)
> +{
> +	int i, j, ret;
> +
> +	imu->acct_pages = 0;
> +	j = 0;
> +	for (i = 0; i < data->nr_folios; i++) {
> +		struct page *hpage = pages[j];
> +
> +		if (hpage == *last_hpage)
> +			continue;
> +		*last_hpage = hpage;
> +		/*
> +		 * Already checked the page array in try coalesce,
> +		 * so pass in nr_pages=0 here to waive that.
> +		 */
> +		if (headpage_already_acct(ctx, pages, 0, hpage))
> +			continue;
> +		imu->acct_pages += data->nr_pages_mid;
> +		if (i)
> +			j += data->nr_pages_mid;
> +		else
> +			j = data->nr_pages_head;

You should account an entire folio here, i.e. ->nr_pages_mid
in either case. Let's say the first page in the registration
is the last page of a huge page, you'd account 4K while it
actually pins the entire huge page size.

It seems like you can just call io_buffer_account_pin()
instead.

On that note, you shouldn't duplicate code in either case,
just treat the normal discontig pages case as folios of
shift=PAGE_SHIFT.

Either just plain reuse or adjust io_buffer_account_pin()
instead of io_coalesced_buffer_account_pin().
io_coalesced_imu_alloc() should also go away.

io_sqe_buffer_register() {
	struct io_imu_folio_data data;

	if (!io_sqe_buffer_try_coalesce(pages, folio_data)) {
		folio_data.shift = PAGE_SHIFT;
		...
	}

	io_buffer_account_pin(pages, &data);
	imu->data = uaddr;
	...
}

> +	}
> +
> +	if (!imu->acct_pages)
> +		return 0;
> +
> +	ret = io_account_mem(ctx, imu->acct_pages);
> +	if (!ret)
> +		return 0;
> +	imu->acct_pages = 0;
> +	return ret;
> +}
> +
>  static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>  					 struct io_imu_folio_data *data)
>  {
> @@ -949,6 +988,56 @@ static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>  	return true;
>  }
>
> +static int io_coalesced_imu_alloc(struct io_ring_ctx *ctx, struct iovec *iov,
> +				  struct io_mapped_ubuf **pimu,
> +				  struct page **last_hpage, struct page **pages,
> +				  struct io_imu_folio_data *data)
> +{
> +	struct io_mapped_ubuf *imu = NULL;
> +	unsigned long off;
> +	size_t size, vec_len;
> +	int ret, i, j;
> +
> +	ret = -ENOMEM;
> +	imu = kvmalloc(struct_size(imu, bvec, data->nr_folios), GFP_KERNEL);
> +	if (!imu)
> +		return ret;
> +
> +	ret = io_coalesced_buffer_account_pin(ctx, pages, imu, last_hpage,
> +					      data);
> +	if (ret) {
> +		unpin_user_page(pages[0]);
> +		j = data->nr_pages_head;
> +		for (i = 1; i < data->nr_folios; i++) {
> +			unpin_user_page(pages[j]);
> +			j += data->nr_pages_mid;
> +		}
> +		return ret;
> +	}
> +	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
> +	size = iov->iov_len;
> +	/* store original address for later verification */
> +	imu->ubuf = (unsigned long) iov->iov_base;
> +	imu->ubuf_end = imu->ubuf + iov->iov_len;
> +	imu->nr_bvecs = data->nr_folios;
> +	imu->folio_shift = data->folio_shift;
> +	imu->folio_mask = ~((1UL << data->folio_shift) - 1);
> +	*pimu = imu;
> +	ret = 0;
> +
> +	vec_len = min_t(size_t, size, PAGE_SIZE * data->nr_pages_head - off);
> +	bvec_set_page(&imu->bvec[0], pages[0], vec_len, off);
> +	size -= vec_len;
> +	j = data->nr_pages_head;
> +	for (i = 1; i < data->nr_folios; i++) {
> +		vec_len = min_t(size_t, size, data->folio_size);
> +		bvec_set_page(&imu->bvec[i], pages[j], vec_len, 0);
> +		size -= vec_len;
> +		j += data->nr_pages_mid;
> +	}
> +	return ret;
> +}
> +
>  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
>  				  struct io_mapped_ubuf **pimu,
>  				  struct page **last_hpage)
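[Editorial aside: a minimal userspace model of the unification suggested above. The struct and function names are hypothetical stand-ins, not the kernel's io_uring code. The point it illustrates: if a buffer that cannot be coalesced is described as folios of shift=PAGE_SHIFT, i.e. one page per folio, then a single folio-aware accounting loop degenerates to the existing per-page count, so one helper can serve both paths.]

#include <stdio.h>

/* Hypothetical stand-in for the folio description used in this series. */
struct folio_model {
	unsigned int nr_folios;     /* folios backing the registration */
	unsigned int nr_pages_mid;  /* 4K pages per folio */
};

/* Account one whole folio per distinct folio, as the coalesced path does. */
static unsigned long folio_acct_pages(const struct folio_model *m)
{
	unsigned long acct = 0;
	unsigned int i;

	for (i = 0; i < m->nr_folios; i++)
		acct += m->nr_pages_mid;
	return acct;
}

int main(void)
{
	/* A buffer spanning two 2MB hugepages: 2 folios, 512 pages each. */
	struct folio_model huge  = { .nr_folios = 2, .nr_pages_mid = 512 };
	/* 8 plain discontiguous pages treated as 8 one-page folios
	 * (shift = PAGE_SHIFT): the loop reduces to the per-page count. */
	struct folio_model plain = { .nr_folios = 8, .nr_pages_mid = 1 };

	printf("hugepage buffer accounts %lu pages\n", folio_acct_pages(&huge));
	printf("plain buffer accounts    %lu pages\n", folio_acct_pages(&plain));
	return 0;
}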
Actually here it does account an entire folio. The j is just
array index.

> It seems like you can just call io_buffer_account_pin()
> instead.
>
> On that note, you shouldn't duplicate code in either case,
> just treat the normal discontig pages case as folios of
> shift=PAGE_SHIFT.
>
> Either just plain reuse or adjust io_buffer_account_pin()
> instead of io_coalesced_buffer_account_pin().
> io_coalesced_imu_alloc() should also go away.
>
> io_sqe_buffer_register() {
> 	struct io_imu_folio_data data;
>
> 	if (!io_sqe_buffer_try_coalesce(pages, folio_data)) {
> 		folio_data.shift = PAGE_SHIFT;
> 		...
> 	}
>
> 	io_buffer_account_pin(pages, &data);
> 	imu->data = uaddr;
> 	...
> }

Will remove them.

Thanks,
Chenliang Li
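[Editorial aside: a small standalone sketch of the point being made about j, with made-up example values rather than kernel code. j starts at 0, jumps to nr_pages_head after the first folio, and then advances by nr_pages_mid, so pages[j] always points at a folio head page while a full folio's worth of pages is accounted on each iteration.]

#include <stdio.h>

int main(void)
{
	/* Example: registration starts mid-folio, so the first folio only
	 * contributes 3 pages; the remaining folios contribute 512 each. */
	unsigned int nr_folios = 4, nr_pages_head = 3, nr_pages_mid = 512;
	unsigned int i, j = 0;
	unsigned long acct = 0;

	for (i = 0; i < nr_folios; i++) {
		printf("folio %u: head page at pages[%u]\n", i, j);
		acct += nr_pages_mid;		/* the whole folio is pinned */
		if (i)
			j += nr_pages_mid;
		else
			j = nr_pages_head;
	}
	printf("accounted %lu pages in total\n", acct);
	return 0;
}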
On 6/17/24 04:16, Chenliang Li wrote:
> Actually here it does account an entire folio. The j is just
> array index.

Hmm, indeed. Is iteration the only difference here?

>> It seems like you can just call io_buffer_account_pin()
>> instead.
>>
>> On that note, you shouldn't duplicate code in either case,
>> just treat the normal discontig pages case as folios of
>> shift=PAGE_SHIFT.
>>
>> Either just plain reuse or adjust io_buffer_account_pin()
>> instead of io_coalesced_buffer_account_pin().
>> io_coalesced_imu_alloc() should also go away.
>>
>> io_sqe_buffer_register() {
>> 	struct io_imu_folio_data data;
>>
>> 	if (!io_sqe_buffer_try_coalesce(pages, folio_data)) {
>> 		folio_data.shift = PAGE_SHIFT;
>> 		...
>> 	}
>>
>> 	io_buffer_account_pin(pages, &data);
>> 	imu->data = uaddr;
>> 	...
>> }
>
> Will remove them.
On Mon, 17 Jun 2024 13:22:43 +0100, Pavel Begunkov wrote:
> On 6/17/24 04:16, Chenliang Li wrote:
>> Actually here it does account an entire folio. The j is just
>> array index.
>
> Hmm, indeed. Is iteration the only difference here?

Yes, but I'll try make this less confusing.

Thanks,
Chenliang Li
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 578d382ca9bc..53fac5f27bbf 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -871,6 +871,45 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
+static int io_coalesced_buffer_account_pin(struct io_ring_ctx *ctx,
+					   struct page **pages,
+					   struct io_mapped_ubuf *imu,
+					   struct page **last_hpage,
+					   struct io_imu_folio_data *data)
+{
+	int i, j, ret;
+
+	imu->acct_pages = 0;
+	j = 0;
+	for (i = 0; i < data->nr_folios; i++) {
+		struct page *hpage = pages[j];
+
+		if (hpage == *last_hpage)
+			continue;
+		*last_hpage = hpage;
+		/*
+		 * Already checked the page array in try coalesce,
+		 * so pass in nr_pages=0 here to waive that.
+		 */
+		if (headpage_already_acct(ctx, pages, 0, hpage))
+			continue;
+		imu->acct_pages += data->nr_pages_mid;
+		if (i)
+			j += data->nr_pages_mid;
+		else
+			j = data->nr_pages_head;
+	}
+
+	if (!imu->acct_pages)
+		return 0;
+
+	ret = io_account_mem(ctx, imu->acct_pages);
+	if (!ret)
+		return 0;
+	imu->acct_pages = 0;
+	return ret;
+}
+
 static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
 					 struct io_imu_folio_data *data)
 {
@@ -949,6 +988,56 @@ static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
 	return true;
 }
 
+static int io_coalesced_imu_alloc(struct io_ring_ctx *ctx, struct iovec *iov,
+				  struct io_mapped_ubuf **pimu,
+				  struct page **last_hpage, struct page **pages,
+				  struct io_imu_folio_data *data)
+{
+	struct io_mapped_ubuf *imu = NULL;
+	unsigned long off;
+	size_t size, vec_len;
+	int ret, i, j;
+
+	ret = -ENOMEM;
+	imu = kvmalloc(struct_size(imu, bvec, data->nr_folios), GFP_KERNEL);
+	if (!imu)
+		return ret;
+
+	ret = io_coalesced_buffer_account_pin(ctx, pages, imu, last_hpage,
+					      data);
+	if (ret) {
+		unpin_user_page(pages[0]);
+		j = data->nr_pages_head;
+		for (i = 1; i < data->nr_folios; i++) {
+			unpin_user_page(pages[j]);
+			j += data->nr_pages_mid;
+		}
+		return ret;
+	}
+	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
+	size = iov->iov_len;
+	/* store original address for later verification */
+	imu->ubuf = (unsigned long) iov->iov_base;
+	imu->ubuf_end = imu->ubuf + iov->iov_len;
+	imu->nr_bvecs = data->nr_folios;
+	imu->folio_shift = data->folio_shift;
+	imu->folio_mask = ~((1UL << data->folio_shift) - 1);
+	*pimu = imu;
+	ret = 0;
+
+	vec_len = min_t(size_t, size, PAGE_SIZE * data->nr_pages_head - off);
+	bvec_set_page(&imu->bvec[0], pages[0], vec_len, off);
+	size -= vec_len;
+	j = data->nr_pages_head;
+	for (i = 1; i < data->nr_folios; i++) {
+		vec_len = min_t(size_t, size, data->folio_size);
+		bvec_set_page(&imu->bvec[i], pages[j], vec_len, 0);
+		size -= vec_len;
+		j += data->nr_pages_mid;
+	}
+	return ret;
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
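[Editorial aside: a small userspace model of the bvec length arithmetic in io_coalesced_imu_alloc() above, assuming 4K pages and 2MB folios; all values and names are illustrative. The first bvec covers what remains of the head folio after the start offset, middle bvecs each cover a full folio, and the last one takes whatever is left, so the lengths add back up to iov_len.]

#include <stdio.h>
#include <stddef.h>

#define MODEL_PAGE_SIZE  4096UL
#define MODEL_FOLIO_SIZE (2UL * 1024 * 1024)	/* assume 2MB hugepage folios */

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

int main(void)
{
	/* Buffer starts 1K into the head folio's first registered page and
	 * spans 3 folios; the head folio contributes 512 pages here. */
	size_t off = 1024;
	size_t iov_len = 3 * MODEL_FOLIO_SIZE - off;
	unsigned int nr_folios = 3, nr_pages_head = 512;
	size_t size = iov_len, total = 0, vec_len;
	unsigned int i;

	/* bvec[0]: rest of the head folio, starting at 'off' */
	vec_len = min_sz(size, MODEL_PAGE_SIZE * nr_pages_head - off);
	printf("bvec[0]: len=%zu off=%zu\n", vec_len, off);
	size -= vec_len;
	total += vec_len;

	/* bvec[1..]: one full folio each, except possibly the last */
	for (i = 1; i < nr_folios; i++) {
		vec_len = min_sz(size, MODEL_FOLIO_SIZE);
		printf("bvec[%u]: len=%zu off=0\n", i, vec_len);
		size -= vec_len;
		total += vec_len;
	}
	printf("sum of bvec lengths = %zu, iov_len = %zu\n", total, iov_len);
	return 0;
}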
Introduce two functions to separate the coalesced imu alloc and
accounting path from the original one. This helps to keep the original
code path clean.

Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
---
 io_uring/rsrc.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)