
[v4,1/4] io_uring/rsrc: add hugepage buffer coalesce helpers

Message ID 20240514075444.590910-2-cliang01.li@samsung.com (mailing list archive)
State New
Series io_uring/rsrc: coalescing multi-hugepage registered buffers

Commit Message

Chenliang Li May 14, 2024, 7:54 a.m. UTC
Introduce helper functions to check whether a buffer can be
coalesced, and to gather folio data for later use.

Coalescing reduces the time and space overhead of mapping and
storing multi-hugepage fixed buffers.

A coalescable multi-hugepage buffer should fully cover its folios
(except potentially the first and last one), and these folios should
have the same size. These requirements make later processing easier;
we also need same-sized chunks in io_import_fixed for fast iov_iter
adjustment.

Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
---
 io_uring/rsrc.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
 io_uring/rsrc.h | 10 +++++++
 2 files changed, 88 insertions(+)
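
For context, a minimal userspace sketch (editor illustration, not part of the
patch) of the kind of buffer this series targets: a single fixed buffer backed
by several 2 MiB hugepages, registered through liburing's
io_uring_register_buffers(). The function name and sizes below are assumptions
for illustration only.

#include <liburing.h>
#include <sys/mman.h>

/* Register one fixed buffer spanning multiple hugepage folios. */
static int register_huge_fixed_buf(struct io_uring *ring, size_t len)
{
	struct iovec iov;
	void *buf;

	/* e.g. len = 4 MiB -> two 2 MiB hugepage folios */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	iov.iov_base = buf;
	iov.iov_len = len;
	/* Without coalescing the kernel pins and stores one bvec per 4 KiB
	 * page of this range; with it, one bvec per folio is enough. */
	return io_uring_register_buffers(ring, &iov, 1);
}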

Comments

Anuj gupta May 16, 2024, 2:07 p.m. UTC | #1
On Tue, May 14, 2024 at 1:28 PM Chenliang Li <cliang01.li@samsung.com> wrote:
>
> Introduce helper functions to check whether a buffer can
> be coalesced or not, and gather folio data for later use.
>
> The coalescing optimizes time and space consumption caused
> by mapping and storing multi-hugepage fixed buffers.
>
> A coalescable multi-hugepage buffer should fully cover its folios
> (except potentially the first and last one), and these folios should
> have the same size. These requirements are for easier later process,

Nit: for easier processing later

> also we need same size'd chunks in io_import_fixed for fast iov_iter
> adjust.
>
> Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
> ---
>  io_uring/rsrc.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
>  io_uring/rsrc.h | 10 +++++++
>  2 files changed, 88 insertions(+)
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 65417c9553b1..d08224c0c5b0 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -871,6 +871,84 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>         return ret;
>  }
>
> +static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
> +                                        struct io_imu_folio_data *data)
> +{
> +       struct folio *folio = page_folio(pages[0]);
> +       unsigned int count = 1;
> +       int i;
> +
> +       data->nr_pages_mid = folio_nr_pages(folio);
> +       if (data->nr_pages_mid == 1)
> +               return false;
> +
> +       data->folio_shift = folio_shift(folio);
> +       data->folio_size = folio_size(folio);
> +       data->nr_folios = 1;
> +       /*
> +        * Check if pages are contiguous inside a folio, and all folios have
> +        * the same page count except for the head and tail.
> +        */
> +       for (i = 1; i < nr_pages; i++) {
> +               if (page_folio(pages[i]) == folio &&
> +                       pages[i] == pages[i-1] + 1) {
> +                       count++;
> +                       continue;
> +               }
> +
> +               if (data->nr_folios == 1)
> +                       data->nr_pages_head = count;
> +               else if (count != data->nr_pages_mid)
> +                       return false;
> +
> +               folio = page_folio(pages[i]);
> +               if (folio_size(folio) != data->folio_size)
> +                       return false;
> +
> +               count = 1;
> +               data->nr_folios++;
> +       }
> +       if (data->nr_folios == 1)
> +               data->nr_pages_head = count;
> +
> +       return true;
> +}
> +
> +static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
> +                                      struct io_imu_folio_data *data)
> +{
> +       int i, j;
> +
> +       if (nr_pages <= 1 ||
> +               !__io_sqe_buffer_try_coalesce(pages, nr_pages, data))
> +               return false;
> +
> +       /*
> +        * The pages are bound to the folio, it doesn't
> +        * actually unpin them but drops all but one reference,
> +        * which is usually put down by io_buffer_unmap().
> +        * Note, needs a better helper.
> +        */
> +       if (data->nr_pages_head > 1)
> +               unpin_user_pages(&pages[1], data->nr_pages_head - 1);
> +
> +       j = data->nr_pages_head;
> +       nr_pages -= data->nr_pages_head;
> +       for (i = 1; i < data->nr_folios; i++) {
> +               unsigned int nr_unpin;
> +
> +               nr_unpin = min_t(unsigned int, nr_pages - 1,
> +                                       data->nr_pages_mid - 1);
> +               if (nr_unpin == 0)
> +                       break;
> +               unpin_user_pages(&pages[j+1], nr_unpin);
> +               j += data->nr_pages_mid;
> +               nr_pages -= data->nr_pages_mid;
> +       }
> +
> +       return true;
> +}
> +
>  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
>                                   struct io_mapped_ubuf **pimu,
>                                   struct page **last_hpage)
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index c032ca3436ca..b2a9d66b76dd 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -50,6 +50,16 @@ struct io_mapped_ubuf {
>         struct bio_vec  bvec[] __counted_by(nr_bvecs);
>  };
>
> +struct io_imu_folio_data {
> +       /* Head folio can be partially included in the fixed buf */
> +       unsigned int    nr_pages_head;
> +       /* For non-head/tail folios, has to be fully included */
> +       unsigned int    nr_pages_mid;
> +       unsigned int    nr_folios;
> +       unsigned int    folio_shift;
> +       size_t          folio_size;
> +};
> +
>  void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
>  void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
>  struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
> --
> 2.34.1
>
>
Looks good:
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
--
Anuj Gupta
Pavel Begunkov June 16, 2024, 6:04 p.m. UTC | #2
On 5/14/24 08:54, Chenliang Li wrote:
> Introduce helper functions to check whether a buffer can
> be coalesced or not, and gather folio data for later use.
> 
> The coalescing optimizes time and space consumption caused
> by mapping and storing multi-hugepage fixed buffers.
> 
> A coalescable multi-hugepage buffer should fully cover its folios
> (except potentially the first and last one), and these folios should
> have the same size. These requirements are for easier later process,
> also we need same size'd chunks in io_import_fixed for fast iov_iter
> adjust.
> 
> Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
> ---
>   io_uring/rsrc.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
>   io_uring/rsrc.h | 10 +++++++
>   2 files changed, 88 insertions(+)
> 
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 65417c9553b1..d08224c0c5b0 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -871,6 +871,84 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>   	return ret;
>   }
>   
> +static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
> +					 struct io_imu_folio_data *data)

io_can_coalesce_buffer(), you're not actually trying to
do it here.

> +{
> +	struct folio *folio = page_folio(pages[0]);
> +	unsigned int count = 1;
> +	int i;
> +
> +	data->nr_pages_mid = folio_nr_pages(folio);
> +	if (data->nr_pages_mid == 1)
> +		return false;
> +
> +	data->folio_shift = folio_shift(folio);
> +	data->folio_size = folio_size(folio);
> +	data->nr_folios = 1;
> +	/*
> +	 * Check if pages are contiguous inside a folio, and all folios have
> +	 * the same page count except for the head and tail.
> +	 */
> +	for (i = 1; i < nr_pages; i++) {
> +		if (page_folio(pages[i]) == folio &&
> +			pages[i] == pages[i-1] + 1) {
> +			count++;
> +			continue;
> +		}
> +
> +		if (data->nr_folios == 1)
> +			data->nr_pages_head = count;
> +		else if (count != data->nr_pages_mid)
> +			return false;
> +
> +		folio = page_folio(pages[i]);
> +		if (folio_size(folio) != data->folio_size)
> +			return false;
> +
> +		count = 1;
> +		data->nr_folios++;
> +	}
> +	if (data->nr_folios == 1)
> +		data->nr_pages_head = count;
> +
> +	return true;
> +}
> +
> +static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
> +				       struct io_imu_folio_data *data)
> +{
> +	int i, j;
> +
> +	if (nr_pages <= 1 ||
> +		!__io_sqe_buffer_try_coalesce(pages, nr_pages, data))
> +		return false;
> +
> +	/*
> +	 * The pages are bound to the folio, it doesn't
> +	 * actually unpin them but drops all but one reference,
> +	 * which is usually put down by io_buffer_unmap().
> +	 * Note, needs a better helper.
> +	 */
> +	if (data->nr_pages_head > 1)
> +		unpin_user_pages(&pages[1], data->nr_pages_head - 1);

Should be pages[0]. pages[1] can be in another folio, and even
though data->nr_pages_head > 1 protects against touching it,
it's still flimsy.

> +
> +	j = data->nr_pages_head;
> +	nr_pages -= data->nr_pages_head;
> +	for (i = 1; i < data->nr_folios; i++) {
> +		unsigned int nr_unpin;
> +
> +		nr_unpin = min_t(unsigned int, nr_pages - 1,
> +					data->nr_pages_mid - 1);
> +		if (nr_unpin == 0)
> +			break;
> +		unpin_user_pages(&pages[j+1], nr_unpin);

same

> +		j += data->nr_pages_mid;

And instead of duplicating this voodoo iteration later,
please just assemble a new compacted ->nr_folios sized
page array.

> +		nr_pages -= data->nr_pages_mid;
> +	}
> +
> +	return true;
> +}
> +
>   static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
>   				  struct io_mapped_ubuf **pimu,
>   				  struct page **last_hpage)
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index c032ca3436ca..b2a9d66b76dd 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -50,6 +50,16 @@ struct io_mapped_ubuf {
>   	struct bio_vec	bvec[] __counted_by(nr_bvecs);
>   };
>   
> +struct io_imu_folio_data {
> +	/* Head folio can be partially included in the fixed buf */
> +	unsigned int	nr_pages_head;
> +	/* For non-head/tail folios, has to be fully included */
> +	unsigned int	nr_pages_mid;
> +	unsigned int	nr_folios;
> +	unsigned int	folio_shift;
> +	size_t		folio_size;
> +};
> +
>   void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
>   void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
>   struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
Chenliang Li June 17, 2024, 3:12 a.m. UTC | #3
On Sun, 16 Jun 2024 19:04:38 +0100, Pavel Begunkov wrote:
> On 5/14/24 08:54, Chenliang Li wrote:
>> Introduce helper functions to check whether a buffer can
>> be coalesced or not, and gather folio data for later use.
>> 
>> The coalescing optimizes time and space consumption caused
>> by mapping and storing multi-hugepage fixed buffers.
>> 
>> A coalescable multi-hugepage buffer should fully cover its folios
>> (except potentially the first and last one), and these folios should
>> have the same size. These requirements are for easier later process,
>> also we need same size'd chunks in io_import_fixed for fast iov_iter
>> adjust.
>> 
>> Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
>> ---
>>   io_uring/rsrc.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   io_uring/rsrc.h | 10 +++++++
>>   2 files changed, 88 insertions(+)
>> 
>> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
>> index 65417c9553b1..d08224c0c5b0 100644
>> --- a/io_uring/rsrc.c
>> +++ b/io_uring/rsrc.c
>> @@ -871,6 +871,84 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>>   	return ret;
>>   }
>>   
>> +static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>> +					 struct io_imu_folio_data *data)
> io_can_coalesce_buffer(), you're not actually trying to
> do it here.

Will change it.

>> +static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>> +				       struct io_imu_folio_data *data)
>> +{
>> +	int i, j;
>> +
>> +	if (nr_pages <= 1 ||
>> +		!__io_sqe_buffer_try_coalesce(pages, nr_pages, data))
>> +		return false;
>> +
>> +	/*
>> +	 * The pages are bound to the folio, it doesn't
>> +	 * actually unpin them but drops all but one reference,
>> +	 * which is usually put down by io_buffer_unmap().
>> +	 * Note, needs a better helper.
>> +	 */
>> +	if (data->nr_pages_head > 1)
>> +		unpin_user_pages(&pages[1], data->nr_pages_head - 1);
> Should be pages[0]. page[1] can be in another folio, and even
> though data->nr_pages_head > 1 protects against touching it,
> it's still flimsy.

But here we are unpinning the tail pages inside the coalescable folios;
I think we only unpin pages[0] on failure, right? And in
__io_sqe_buffer_try_coalesce we have already ensured that
pages[1:nr_head_pages] are in the same folio and contiguous.

>> +
>> +	j = data->nr_pages_head;
>> +	nr_pages -= data->nr_pages_head;
>> +	for (i = 1; i < data->nr_folios; i++) {
>> +		unsigned int nr_unpin;
>> +
>> +		nr_unpin = min_t(unsigned int, nr_pages - 1,
>> +					data->nr_pages_mid - 1);
>> +		if (nr_unpin == 0)
>> +			break;
>> +		unpin_user_pages(&pages[j+1], nr_unpin);
> same
>> +		j += data->nr_pages_mid;
> And instead of duplicating this voodoo iteration later,
> please just assemble a new compacted ->nr_folios sized
> page array.

Indeed, a new page array would make things a lot easier.
If alloc overhead is not a concern here, then yeah I'll change it.

Thanks,
Chenliang Li
Pavel Begunkov June 17, 2024, 12:38 p.m. UTC | #4
On 6/17/24 04:12, Chenliang Li wrote:
> On Sun, 16 Jun 2024 19:04:38 +0100, Pavel Begunkov wrote:
>> On 5/14/24 08:54, Chenliang Li wrote:
>>> Introduce helper functions to check whether a buffer can
>>> be coalesced or not, and gather folio data for later use.
>>>
>>> The coalescing optimizes time and space consumption caused
>>> by mapping and storing multi-hugepage fixed buffers.
>>>
>>> A coalescable multi-hugepage buffer should fully cover its folios
>>> (except potentially the first and last one), and these folios should
>>> have the same size. These requirements are for easier later process,
>>> also we need same size'd chunks in io_import_fixed for fast iov_iter
>>> adjust.
>>>
>>> Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
>>> ---
>>>    io_uring/rsrc.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>    io_uring/rsrc.h | 10 +++++++
>>>    2 files changed, 88 insertions(+)
>>>
>>> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
>>> index 65417c9553b1..d08224c0c5b0 100644
>>> --- a/io_uring/rsrc.c
>>> +++ b/io_uring/rsrc.c
>>> @@ -871,6 +871,84 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>>>    	return ret;
>>>    }
>>>    
>>> +static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>>> +					 struct io_imu_folio_data *data)
>> io_can_coalesce_buffer(), you're not actually trying to
>> do it here.
> 
> Will change it.
> 
>>> +static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>>> +				       struct io_imu_folio_data *data)
>>> +{
>>> +	int i, j;
>>> +
>>> +	if (nr_pages <= 1 ||
>>> +		!__io_sqe_buffer_try_coalesce(pages, nr_pages, data))
>>> +		return false;
>>> +
>>> +	/*
>>> +	 * The pages are bound to the folio, it doesn't
>>> +	 * actually unpin them but drops all but one reference,
>>> +	 * which is usually put down by io_buffer_unmap().
>>> +	 * Note, needs a better helper.
>>> +	 */
>>> +	if (data->nr_pages_head > 1)
>>> +		unpin_user_pages(&pages[1], data->nr_pages_head - 1);
>> Should be pages[0]. page[1] can be in another folio, and even
>> though data->nr_pages_head > 1 protects against touching it,
>> it's still flimsy.
> 
> But here it is unpinning the tail pages inside those coalesceable folios,
> I think we only unpin pages[0] when failure, am I right? And in
> __io_sqe_buffer_try_coalesce we have ensured that pages[1:nr_head_pages] are
> in same folio and contiguous.

We want the entire folio to still be pinned, but don't want to
leave just one reference and not care down the line how many
refcounts / etc. you have to put down.

void unpin_user_page(struct page *page)
{
	sanity_check_pinned_pages(&page, 1);
	gup_put_folio(page_folio(page), 1, FOLL_PIN);
}

And all of that goes to the folio as a single object, so it doesn't
really matter which page you pass. Anyway, let's leave it as is then.
I wish there were an unpin_folio_nr(), but there is
unpin_user_page_range_dirty_lock() resembling it.

>>> +
>>> +	j = data->nr_pages_head;
>>> +	nr_pages -= data->nr_pages_head;
>>> +	for (i = 1; i < data->nr_folios; i++) {
>>> +		unsigned int nr_unpin;
>>> +
>>> +		nr_unpin = min_t(unsigned int, nr_pages - 1,
>>> +					data->nr_pages_mid - 1);
>>> +		if (nr_unpin == 0)
>>> +			break;
>>> +		unpin_user_pages(&pages[j+1], nr_unpin);
>> same
>>> +		j += data->nr_pages_mid;
>> And instead of duplicating this voodoo iteration later,
>> please just assemble a new compacted ->nr_folios sized
>> page array.
> 
> Indeed, a new page array would make things a lot easier.
> If alloc overhead is not a concern here, then yeah I'll change it.

It's not, and the upside is reducing memory footprint,
which would be noticeable with huge pages. It's also
kvmalloc'ed, so compacting also improves the TLB situation.
Chenliang Li June 18, 2024, 3:11 a.m. UTC | #5
On Sun, 16 Jun 2024 18:43:13 +0100, Pavel Begunkov wrote:
> On 6/17/24 04:12, Chenliang Li wrote:
>> On Sun, 16 Jun 2024 19:04:38 +0100, Pavel Begunkov wrote:
>>> On 5/14/24 08:54, Chenliang Li wrote:
>>>> Introduce helper functions to check whether a buffer can
>>>> be coalesced or not, and gather folio data for later use.
>>>>
>>>> The coalescing optimizes time and space consumption caused
>>>> by mapping and storing multi-hugepage fixed buffers.
>>>>
>>>> A coalescable multi-hugepage buffer should fully cover its folios
>>>> (except potentially the first and last one), and these folios should
>>>> have the same size. These requirements are for easier later process,
>>>> also we need same size'd chunks in io_import_fixed for fast iov_iter
>>>> adjust.
>>>>
>>>> Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
>>>> ---
>>>>    io_uring/rsrc.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    io_uring/rsrc.h | 10 +++++++
>>>>    2 files changed, 88 insertions(+)
>>>>
>>>> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
>>>> index 65417c9553b1..d08224c0c5b0 100644
>>>> --- a/io_uring/rsrc.c
>>>> +++ b/io_uring/rsrc.c
>>>> @@ -871,6 +871,84 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>>>>    	return ret;
>>>>    }
>>>>    
>>>> +static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>>>> +					 struct io_imu_folio_data *data)
>>> io_can_coalesce_buffer(), you're not actually trying to
>>> do it here.
>> 
>> Will change it.
>> 
>>>> +static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>>>> +				       struct io_imu_folio_data *data)
>>>> +{
>>>> +	int i, j;
>>>> +
>>>> +	if (nr_pages <= 1 ||
>>>> +		!__io_sqe_buffer_try_coalesce(pages, nr_pages, data))
>>>> +		return false;
>>>> +
>>>> +	/*
>>>> +	 * The pages are bound to the folio, it doesn't
>>>> +	 * actually unpin them but drops all but one reference,
>>>> +	 * which is usually put down by io_buffer_unmap().
>>>> +	 * Note, needs a better helper.
>>>> +	 */
>>>> +	if (data->nr_pages_head > 1)
>>>> +		unpin_user_pages(&pages[1], data->nr_pages_head - 1);
>>> Should be pages[0]. page[1] can be in another folio, and even
>>> though data->nr_pages_head > 1 protects against touching it,
>>> it's still flimsy.
>> 
>> But here it is unpinning the tail pages inside those coalesceable folios,
>> I think we only unpin pages[0] when failure, am I right? And in
>> __io_sqe_buffer_try_coalesce we have ensured that pages[1:nr_head_pages] are
>> in same folio and contiguous.
>
> We want the entire folio to still be pinned, but don't want to
> leave just one reference and not care down the line how many
> refcounts / etc. you have to put down.
> 
> void unpin_user_page(struct page *page)
> {
> 	sanity_check_pinned_pages(&page, 1);
> 	gup_put_folio(page_folio(page), 1, FOLL_PIN);
> }
>
> And all that goes to the folio as a single object, so doesn't
> really matter which page you pass. Anyway, let's then leave it
> as is then, I wish there would be unpin_folio_nr(), but there
> is unpin_user_page_range_dirty_lock() resembling it.

I see. Thanks for the explanation.

>>>> +
>>>> +	j = data->nr_pages_head;
>>>> +	nr_pages -= data->nr_pages_head;
>>>> +	for (i = 1; i < data->nr_folios; i++) {
>>>> +		unsigned int nr_unpin;
>>>> +
>>>> +		nr_unpin = min_t(unsigned int, nr_pages - 1,
>>>> +					data->nr_pages_mid - 1);
>>>> +		if (nr_unpin == 0)
>>>> +			break;
>>>> +		unpin_user_pages(&pages[j+1], nr_unpin);
>>> same
>>>> +		j += data->nr_pages_mid;
>>> And instead of duplicating this voodoo iteration later,
>>> please just assemble a new compacted ->nr_folios sized
>>> page array.
>> 
>> Indeed, a new page array would make things a lot easier.
>> If alloc overhead is not a concern here, then yeah I'll change it.
>
> It's not, and the upside is reducing memory footprint,
> which would be noticeable with huge pages. It's also
> kvmalloc'ed, so compacting also improves the TLB situation.

OK, will use a new page array.

Thanks,
Chenliang Li
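
For reference, a rough sketch (editor illustration, not part of the posted
patch) of the compacted page array Pavel is asking for: keep only one page
per folio, sized by ->nr_folios. The function name is hypothetical; the field
names follow the v4 io_imu_folio_data layout.

static struct page **io_alloc_coalesced_pages(struct page **pages,
					      struct io_imu_folio_data *data)
{
	struct page **new_pages;
	unsigned int i, j;

	new_pages = kvmalloc_array(data->nr_folios, sizeof(struct page *),
				   GFP_KERNEL);
	if (!new_pages)
		return NULL;

	/* First page of the buffer, then the first page of each later folio */
	new_pages[0] = pages[0];
	j = data->nr_pages_head;
	for (i = 1; i < data->nr_folios; i++) {
		new_pages[i] = pages[j];
		j += data->nr_pages_mid;
	}
	return new_pages;
}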

Patch

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 65417c9553b1..d08224c0c5b0 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -871,6 +871,84 @@  static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
+static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
+					 struct io_imu_folio_data *data)
+{
+	struct folio *folio = page_folio(pages[0]);
+	unsigned int count = 1;
+	int i;
+
+	data->nr_pages_mid = folio_nr_pages(folio);
+	if (data->nr_pages_mid == 1)
+		return false;
+
+	data->folio_shift = folio_shift(folio);
+	data->folio_size = folio_size(folio);
+	data->nr_folios = 1;
+	/*
+	 * Check if pages are contiguous inside a folio, and all folios have
+	 * the same page count except for the head and tail.
+	 */
+	for (i = 1; i < nr_pages; i++) {
+		if (page_folio(pages[i]) == folio &&
+			pages[i] == pages[i-1] + 1) {
+			count++;
+			continue;
+		}
+
+		if (data->nr_folios == 1)
+			data->nr_pages_head = count;
+		else if (count != data->nr_pages_mid)
+			return false;
+
+		folio = page_folio(pages[i]);
+		if (folio_size(folio) != data->folio_size)
+			return false;
+
+		count = 1;
+		data->nr_folios++;
+	}
+	if (data->nr_folios == 1)
+		data->nr_pages_head = count;
+
+	return true;
+}
+
+static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
+				       struct io_imu_folio_data *data)
+{
+	int i, j;
+
+	if (nr_pages <= 1 ||
+		!__io_sqe_buffer_try_coalesce(pages, nr_pages, data))
+		return false;
+
+	/*
+	 * The pages are bound to the folio, it doesn't
+	 * actually unpin them but drops all but one reference,
+	 * which is usually put down by io_buffer_unmap().
+	 * Note, needs a better helper.
+	 */
+	if (data->nr_pages_head > 1)
+		unpin_user_pages(&pages[1], data->nr_pages_head - 1);
+
+	j = data->nr_pages_head;
+	nr_pages -= data->nr_pages_head;
+	for (i = 1; i < data->nr_folios; i++) {
+		unsigned int nr_unpin;
+
+		nr_unpin = min_t(unsigned int, nr_pages - 1,
+					data->nr_pages_mid - 1);
+		if (nr_unpin == 0)
+			break;
+		unpin_user_pages(&pages[j+1], nr_unpin);
+		j += data->nr_pages_mid;
+		nr_pages -= data->nr_pages_mid;
+	}
+
+	return true;
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c032ca3436ca..b2a9d66b76dd 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,16 @@  struct io_mapped_ubuf {
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
+struct io_imu_folio_data {
+	/* Head folio can be partially included in the fixed buf */
+	unsigned int	nr_pages_head;
+	/* For non-head/tail folios, has to be fully included */
+	unsigned int	nr_pages_mid;
+	unsigned int	nr_folios;
+	unsigned int	folio_shift;
+	size_t		folio_size;
+};
+
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
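
As a worked example of how these fields get filled (hypothetical values,
assuming 4 KiB base pages and 2 MiB hugepage folios): a buffer that starts
1 MiB into its first folio and ends inside its third folio would be described
as

	struct io_imu_folio_data example = {
		.nr_pages_head	= 256,	/* last 1 MiB of the head folio: 256 * 4 KiB */
		.nr_pages_mid	= 512,	/* middle folios fully covered: 2 MiB / 4 KiB */
		.nr_folios	= 3,
		.folio_shift	= 21,	/* log2(2 MiB) */
		.folio_size	= SZ_2M,
	};

and would pass the __io_sqe_buffer_try_coalesce() checks as long as its pages
are contiguous within each folio.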