
[PATCHv4,5/5] io_uring: cache nodes and mapped buffers

Message ID 20250218224229.837848-6-kbusch@meta.com (mailing list archive)
State New
Series ublk zero-copy support

Commit Message

Keith Busch Feb. 18, 2025, 10:42 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

Frequent alloc/free cycles on these are pretty costly. Use an io cache to
reuse these buffers more efficiently.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/io_uring_types.h |  18 ++---
 io_uring/filetable.c           |   2 +-
 io_uring/rsrc.c                | 120 +++++++++++++++++++++++++--------
 io_uring/rsrc.h                |   2 +-
 4 files changed, 104 insertions(+), 38 deletions(-)
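
For reference, the caching pattern the patch adopts is a small per-ring array of
recycled objects: gets are served from the cache when possible, puts hand objects
back until the cache is full, and both paths fall back to the regular allocator.
Below is a minimal userspace sketch of that get/put fallback; the structure mirrors
the io_alloc_cache fields in the diff, but the helper names and the toy malloc()
backend are illustrative only, not the kernel API.

	#include <stdlib.h>

	/* Toy alloc cache: a fixed-size stack of recycled objects with a
	 * malloc()/free() fallback.  Illustrative names, not kernel code. */
	struct alloc_cache {
		void		**entries;
		unsigned int	nr_cached;
		unsigned int	max_cached;
		size_t		elem_size;
	};

	static int cache_init(struct alloc_cache *c, unsigned int max, size_t size)
	{
		c->entries = calloc(max, sizeof(void *));
		if (!c->entries)
			return -1;
		c->nr_cached = 0;
		c->max_cached = max;
		c->elem_size = size;
		return 0;
	}

	static void *cache_get(struct alloc_cache *c)
	{
		if (c->nr_cached)
			return c->entries[--c->nr_cached];	/* fast path: reuse */
		return malloc(c->elem_size);			/* slow path: fresh allocation */
	}

	static int cache_put(struct alloc_cache *c, void *obj)
	{
		if (c->nr_cached == c->max_cached)
			return 0;				/* cache full, caller frees */
		c->entries[c->nr_cached++] = obj;
		return 1;
	}

	static void cache_destroy(struct alloc_cache *c)
	{
		while (c->nr_cached)
			free(c->entries[--c->nr_cached]);
		free(c->entries);
	}

	int main(void)
	{
		struct alloc_cache c;
		void *a, *b;

		if (cache_init(&c, 4, 64))
			return 1;
		a = cache_get(&c);		/* cache empty: falls back to malloc() */
		if (!cache_put(&c, a))
			free(a);
		b = cache_get(&c);		/* served from the cache (b == a) */
		if (!cache_put(&c, b))
			free(b);
		cache_destroy(&c);
		return 0;
	}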

Comments

Caleb Sander Mateos Feb. 19, 2025, 4:22 a.m. UTC | #1
On Tue, Feb 18, 2025 at 2:43 PM Keith Busch <kbusch@meta.com> wrote:
>
> From: Keith Busch <kbusch@kernel.org>
>
> Frequent alloc/free cycles on these is pretty costly. Use an io cache to
> more efficiently reuse these buffers.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  include/linux/io_uring_types.h |  18 ++---
>  io_uring/filetable.c           |   2 +-
>  io_uring/rsrc.c                | 120 +++++++++++++++++++++++++--------
>  io_uring/rsrc.h                |   2 +-
>  4 files changed, 104 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 810d1dccd27b1..bbfb651506522 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -69,8 +69,18 @@ struct io_file_table {
>         unsigned int alloc_hint;
>  };
>
> +struct io_alloc_cache {
> +       void                    **entries;
> +       unsigned int            nr_cached;
> +       unsigned int            max_cached;
> +       size_t                  elem_size;

Is growing this field from unsigned to size_t really necessary? It
probably doesn't make sense to be caching allocations > 4 GB.

> +       unsigned int            init_clear;
> +};
> +
>  struct io_buf_table {
>         struct io_rsrc_data     data;
> +       struct io_alloc_cache   node_cache;
> +       struct io_alloc_cache   imu_cache;
>  };
>
>  struct io_hash_bucket {
> @@ -224,14 +234,6 @@ struct io_submit_state {
>         struct blk_plug         plug;
>  };
>
> -struct io_alloc_cache {
> -       void                    **entries;
> -       unsigned int            nr_cached;
> -       unsigned int            max_cached;
> -       unsigned int            elem_size;
> -       unsigned int            init_clear;
> -};
> -
>  struct io_ring_ctx {
>         /* const or read-mostly hot data */
>         struct {
> diff --git a/io_uring/filetable.c b/io_uring/filetable.c
> index dd8eeec97acf6..a21660e3145ab 100644
> --- a/io_uring/filetable.c
> +++ b/io_uring/filetable.c
> @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
>         if (slot_index >= ctx->file_table.data.nr)
>                 return -EINVAL;
>
> -       node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +       node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>         if (!node)
>                 return -ENOMEM;
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 261b5535f46c6..d5cac3a234316 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>  #define IORING_MAX_FIXED_FILES (1U << 20)
>  #define IORING_MAX_REG_BUFFERS (1U << 14)
>
> +#define IO_CACHED_BVECS_SEGS   32
> +
>  int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
>  {
>         unsigned long page_limit, cur_pages, new_pages;
> @@ -101,6 +103,22 @@ int io_buffer_validate(struct iovec *iov)
>         return 0;
>  }
>
> +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> +                                          int nr_bvecs)

Pass nr_bvecs as unsigned to avoid sign-extension in struct_size_t()?
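
(The point being that struct_size_t() computes in size_t, so a signed nr_bvecs is
implicitly converted before the multiply, and a stray negative value becomes an
enormous allocation size. A tiny standalone illustration of the conversion on an
LP64 system, just to make that concrete; not part of the patch:)

	#include <stdio.h>
	#include <stddef.h>

	int main(void)
	{
		int nr_bvecs = -1;		/* bogus signed count */
		size_t n = (size_t)nr_bvecs;	/* sign-extends, then wraps to SIZE_MAX */

		printf("%zu\n", n);		/* prints 18446744073709551615 on 64-bit */
		return 0;
	}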

> +{
> +       if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> +               return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> +       return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
> +                       GFP_KERNEL);
> +}
> +
> +static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
> +{
> +       if (imu->nr_bvecs > IO_CACHED_BVECS_SEGS ||
> +           !io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
> +               kvfree(imu);
> +}
> +
>  static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>  {
>         struct io_mapped_ubuf *imu = node->buf;
> @@ -119,22 +137,35 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>                         io_unaccount_mem(ctx, imu->acct_pages);
>         }
>
> -       kvfree(imu);
> +       io_free_imu(ctx, imu);
>  }
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
>  {
>         struct io_rsrc_node *node;
>
> -       node = kzalloc(sizeof(*node), GFP_KERNEL);
> +       if (type == IORING_RSRC_FILE)
> +               node = kmalloc(sizeof(*node), GFP_KERNEL);
> +       else
> +               node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
>         if (node) {
>                 node->type = type;
>                 node->refs = 1;
> +               node->tag = 0;
> +               node->file_ptr = 0;
>         }
>         return node;
>  }
>
> -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
> +static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
> +{
> +       kvfree(data->nodes);
> +       data->nodes = NULL;
> +       data->nr = 0;
> +}
> +
> +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
> +                             struct io_rsrc_data *data)
>  {
>         if (!data->nr)
>                 return;
> @@ -142,9 +173,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
>                 if (data->nodes[data->nr])
>                         io_put_rsrc_node(ctx, data->nodes[data->nr]);
>         }
> -       kvfree(data->nodes);
> -       data->nodes = NULL;
> -       data->nr = 0;
> +       __io_rsrc_data_free(data);
>  }
>
>  __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> @@ -158,6 +187,31 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
>         return -ENOMEM;
>  }
>
> +static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
> +{
> +       const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
> +                                                IO_CACHED_BVECS_SEGS);
> +       const int node_size = sizeof(struct io_rsrc_node);
> +       int ret;
> +
> +       ret = io_rsrc_data_alloc(&table->data, nr);
> +       if (ret)
> +               return ret;
> +
> +       if (io_alloc_cache_init(&table->node_cache, nr, node_size, 0))
> +               goto free_data;
> +
> +       if (io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0))
> +               goto free_cache;
> +
> +       return 0;
> +free_cache:
> +       io_alloc_cache_free(&table->node_cache, kfree);
> +free_data:
> +       __io_rsrc_data_free(&table->data);
> +       return -ENOMEM;
> +}
> +
>  static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>                                  struct io_uring_rsrc_update2 *up,
>                                  unsigned nr_args)
> @@ -207,7 +261,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>                                 err = -EBADF;
>                                 break;
>                         }
> -                       node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +                       node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>                         if (!node) {
>                                 err = -ENOMEM;
>                                 fput(file);
> @@ -459,6 +513,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>         case IORING_RSRC_BUFFER:
>                 if (node->buf)
>                         io_buffer_unmap(ctx, node);
> +               if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
> +                       return;
>                 break;
>         default:
>                 WARN_ON_ONCE(1);
> @@ -527,7 +583,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
>                         goto fail;
>                 }
>                 ret = -ENOMEM;
> -               node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +               node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>                 if (!node) {
>                         fput(file);
>                         goto fail;
> @@ -547,11 +603,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
>         return ret;
>  }
>
> +static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
> +                               struct io_buf_table *table)
> +{
> +       io_rsrc_data_free(ctx, &table->data);
> +       io_alloc_cache_free(&table->node_cache, kfree);
> +       io_alloc_cache_free(&table->imu_cache, kfree);
> +}
> +
>  int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
>  {
>         if (!ctx->buf_table.data.nr)
>                 return -ENXIO;
> -       io_rsrc_data_free(ctx, &ctx->buf_table.data);
> +       io_rsrc_buffer_free(ctx, &ctx->buf_table);
>         return 0;
>  }
>
> @@ -732,7 +796,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>         if (!iov->iov_base)
>                 return NULL;
>
> -       node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +       node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>         if (!node)
>                 return ERR_PTR(-ENOMEM);
>         node->buf = NULL;
> @@ -752,7 +816,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>                         coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
>         }
>
> -       imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
> +       imu = io_alloc_imu(ctx, nr_pages);
>         if (!imu)
>                 goto done;
>
> @@ -789,7 +853,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>         }
>  done:
>         if (ret) {
> -               kvfree(imu);
> +               io_free_imu(ctx, imu);
>                 if (node)
>                         io_put_rsrc_node(ctx, node);
>                 node = ERR_PTR(ret);
> @@ -802,9 +866,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                             unsigned int nr_args, u64 __user *tags)
>  {
>         struct page *last_hpage = NULL;
> -       struct io_rsrc_data data;
>         struct iovec fast_iov, *iov = &fast_iov;
>         const struct iovec __user *uvec;
> +       struct io_buf_table table;
>         int i, ret;
>
>         BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
> @@ -813,13 +877,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                 return -EBUSY;
>         if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
>                 return -EINVAL;
> -       ret = io_rsrc_data_alloc(&data, nr_args);
> +       ret = io_rsrc_buffer_alloc(&table, nr_args);
>         if (ret)
>                 return ret;
>
>         if (!arg)
>                 memset(iov, 0, sizeof(*iov));
>
> +       ctx->buf_table = table;
>         for (i = 0; i < nr_args; i++) {
>                 struct io_rsrc_node *node;
>                 u64 tag = 0;
> @@ -859,10 +924,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                         }
>                         node->tag = tag;
>                 }
> -               data.nodes[i] = node;
> +               table.data.nodes[i] = node;
>         }
> -
> -       ctx->buf_table.data = data;

Still don't see the need to move this assignment. Is there a reason
you prefer setting ctx->buf_table before initializing its nodes? I
find the existing code easier to follow, where the table is moved to
ctx->buf_table after filling it in. It's also consistent with
io_clone_buffers().

>         if (ret)
>                 io_sqe_buffers_unregister(ctx);
>         return ret;
> @@ -887,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
>                 goto unlock;
>         }
>
> -       node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +       node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>         if (!node) {
>                 ret = -ENOMEM;
>                 goto unlock;
>         }
>
>         nr_bvecs = blk_rq_nr_phys_segments(rq);
> -       imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> +
> +       imu = io_alloc_imu(ctx, nr_bvecs);

nit: probably don't need to add a blank line here

Best,
Caleb

>         if (!imu) {
>                 kfree(node);
>                 ret = -ENOMEM;
> @@ -1031,7 +1095,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
>  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
>                             struct io_uring_clone_buffers *arg)
>  {
> -       struct io_rsrc_data data;
> +       struct io_buf_table table;
>         int i, ret, off, nr;
>         unsigned int nbufs;
>
> @@ -1062,7 +1126,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>         if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
>                 return -EOVERFLOW;
>
> -       ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
> +       ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
>         if (ret)
>                 return ret;
>
> @@ -1071,7 +1135,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                 struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
>
>                 if (src_node) {
> -                       data.nodes[i] = src_node;
> +                       table.data.nodes[i] = src_node;
>                         src_node->refs++;
>                 }
>         }
> @@ -1101,7 +1165,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                 if (!src_node) {
>                         dst_node = NULL;
>                 } else {
> -                       dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +                       dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>                         if (!dst_node) {
>                                 ret = -ENOMEM;
>                                 goto out_free;
> @@ -1110,12 +1174,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                         refcount_inc(&src_node->buf->refs);
>                         dst_node->buf = src_node->buf;
>                 }
> -               data.nodes[off++] = dst_node;
> +               table.data.nodes[off++] = dst_node;
>                 i++;
>         }
>
>         /*
> -        * If asked for replace, put the old table. data->nodes[] holds both
> +        * If asked for replace, put the old table. table.data->nodes[] holds both
>          * old and new nodes at this point.
>          */
>         if (arg->flags & IORING_REGISTER_DST_REPLACE)
> @@ -1128,10 +1192,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>          * entry).
>          */
>         WARN_ON_ONCE(ctx->buf_table.data.nr);
> -       ctx->buf_table.data = data;
> +       ctx->buf_table = table;
>         return 0;
>  out_free:
> -       io_rsrc_data_free(ctx, &data);
> +       io_rsrc_buffer_free(ctx, &table);
>         return ret;
>  }
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 81c31b93d4f7b..69cc212ad0c7c 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -49,7 +49,7 @@ struct io_imu_folio_data {
>         unsigned int    nr_folios;
>  };
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type);
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
>  void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
>  void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
>  int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
> --
> 2.43.5
>
Pavel Begunkov Feb. 20, 2025, 11:08 a.m. UTC | #2
On 2/18/25 22:42, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Frequent alloc/free cycles on these is pretty costly. Use an io cache to
> more efficiently reuse these buffers.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   include/linux/io_uring_types.h |  18 ++---
>   io_uring/filetable.c           |   2 +-
>   io_uring/rsrc.c                | 120 +++++++++++++++++++++++++--------
>   io_uring/rsrc.h                |   2 +-
>   4 files changed, 104 insertions(+), 38 deletions(-)
> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 810d1dccd27b1..bbfb651506522 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -69,8 +69,18 @@ struct io_file_table {
>   	unsigned int alloc_hint;
...
> -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
>   {
>   	struct io_rsrc_node *node;
>   
> -	node = kzalloc(sizeof(*node), GFP_KERNEL);
> +	if (type == IORING_RSRC_FILE)
> +		node = kmalloc(sizeof(*node), GFP_KERNEL);
> +	else
> +		node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);

That's why node allocators shouldn't be a part of the buffer table.
Keith Busch Feb. 20, 2025, 3:24 p.m. UTC | #3
On Thu, Feb 20, 2025 at 11:08:25AM +0000, Pavel Begunkov wrote:
> On 2/18/25 22:42, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > Frequent alloc/free cycles on these is pretty costly. Use an io cache to
> > more efficiently reuse these buffers.
> > 
> > Signed-off-by: Keith Busch <kbusch@kernel.org>
> > ---
> >   include/linux/io_uring_types.h |  18 ++---
> >   io_uring/filetable.c           |   2 +-
> >   io_uring/rsrc.c                | 120 +++++++++++++++++++++++++--------
> >   io_uring/rsrc.h                |   2 +-
> >   4 files changed, 104 insertions(+), 38 deletions(-)
> > 
> > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> > index 810d1dccd27b1..bbfb651506522 100644
> > --- a/include/linux/io_uring_types.h
> > +++ b/include/linux/io_uring_types.h
> > @@ -69,8 +69,18 @@ struct io_file_table {
> >   	unsigned int alloc_hint;
> ...
> > -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> > +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
> >   {
> >   	struct io_rsrc_node *node;
> > -	node = kzalloc(sizeof(*node), GFP_KERNEL);
> > +	if (type == IORING_RSRC_FILE)
> > +		node = kmalloc(sizeof(*node), GFP_KERNEL);
> > +	else
> > +		node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
> 
> That's why node allocators shouldn't be a part of the buffer table.

Are you saying you want file nodes to also subscribe to the cache? The
two tables can be resized independently of each other, so we don't know
how many elements the cache needs to hold.
Pavel Begunkov Feb. 20, 2025, 4:06 p.m. UTC | #4
On 2/20/25 15:24, Keith Busch wrote:
> On Thu, Feb 20, 2025 at 11:08:25AM +0000, Pavel Begunkov wrote:
>> On 2/18/25 22:42, Keith Busch wrote:
>>> From: Keith Busch <kbusch@kernel.org>
>>>
>>> Frequent alloc/free cycles on these is pretty costly. Use an io cache to
>>> more efficiently reuse these buffers.
>>>
>>> Signed-off-by: Keith Busch <kbusch@kernel.org>
>>> ---
>>>    include/linux/io_uring_types.h |  18 ++---
>>>    io_uring/filetable.c           |   2 +-
>>>    io_uring/rsrc.c                | 120 +++++++++++++++++++++++++--------
>>>    io_uring/rsrc.h                |   2 +-
>>>    4 files changed, 104 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>>> index 810d1dccd27b1..bbfb651506522 100644
>>> --- a/include/linux/io_uring_types.h
>>> +++ b/include/linux/io_uring_types.h
>>> @@ -69,8 +69,18 @@ struct io_file_table {
>>>    	unsigned int alloc_hint;
>> ...
>>> -struct io_rsrc_node *io_rsrc_node_alloc(int type)
>>> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
>>>    {
>>>    	struct io_rsrc_node *node;
>>> -	node = kzalloc(sizeof(*node), GFP_KERNEL);
>>> +	if (type == IORING_RSRC_FILE)
>>> +		node = kmalloc(sizeof(*node), GFP_KERNEL);
>>> +	else
>>> +		node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
>>
>> That's why node allocators shouldn't be a part of the buffer table.
> 
> Are you saying you want file nodes to also subscribe to the cache? The

Yes, but it might be easier for you to focus on finalising the essential
parts, and then we can improve later.

> two tables can be resized independently of each other, we don't know how
> many elements the cache needs to hold.

I wouldn't try to correlate table sizes with desired cache sizes;
users can have quite different patterns, like allocating a barely used
huge table. And you care about the speed of node change, which at the
extremes is limited by CPU performance rather than by how large the
table is. And you can also reallocate it.

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 810d1dccd27b1..bbfb651506522 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -69,8 +69,18 @@  struct io_file_table {
 	unsigned int alloc_hint;
 };
 
+struct io_alloc_cache {
+	void			**entries;
+	unsigned int		nr_cached;
+	unsigned int		max_cached;
+	size_t			elem_size;
+	unsigned int		init_clear;
+};
+
 struct io_buf_table {
 	struct io_rsrc_data	data;
+	struct io_alloc_cache	node_cache;
+	struct io_alloc_cache	imu_cache;
 };
 
 struct io_hash_bucket {
@@ -224,14 +234,6 @@  struct io_submit_state {
 	struct blk_plug		plug;
 };
 
-struct io_alloc_cache {
-	void			**entries;
-	unsigned int		nr_cached;
-	unsigned int		max_cached;
-	unsigned int		elem_size;
-	unsigned int		init_clear;
-};
-
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index dd8eeec97acf6..a21660e3145ab 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -68,7 +68,7 @@  static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 	if (slot_index >= ctx->file_table.data.nr)
 		return -EINVAL;
 
-	node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 	if (!node)
 		return -ENOMEM;
 
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 261b5535f46c6..d5cac3a234316 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -32,6 +32,8 @@  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 #define IORING_MAX_FIXED_FILES	(1U << 20)
 #define IORING_MAX_REG_BUFFERS	(1U << 14)
 
+#define IO_CACHED_BVECS_SEGS	32
+
 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
 {
 	unsigned long page_limit, cur_pages, new_pages;
@@ -101,6 +103,22 @@  int io_buffer_validate(struct iovec *iov)
 	return 0;
 }
 
+static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
+					   int nr_bvecs)
+{
+	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
+		return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
+	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
+			GFP_KERNEL);
+}
+
+static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
+{
+	if (imu->nr_bvecs > IO_CACHED_BVECS_SEGS ||
+	    !io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
+		kvfree(imu);
+}
+
 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
 	struct io_mapped_ubuf *imu = node->buf;
@@ -119,22 +137,35 @@  static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 			io_unaccount_mem(ctx, imu->acct_pages);
 	}
 
-	kvfree(imu);
+	io_free_imu(ctx, imu);
 }
 
-struct io_rsrc_node *io_rsrc_node_alloc(int type)
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 {
 	struct io_rsrc_node *node;
 
-	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (type == IORING_RSRC_FILE)
+		node = kmalloc(sizeof(*node), GFP_KERNEL);
+	else
+		node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
 	if (node) {
 		node->type = type;
 		node->refs = 1;
+		node->tag = 0;
+		node->file_ptr = 0;
 	}
 	return node;
 }
 
-__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
+static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
+{
+	kvfree(data->nodes);
+	data->nodes = NULL;
+	data->nr = 0;
+}
+
+__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
+			      struct io_rsrc_data *data)
 {
 	if (!data->nr)
 		return;
@@ -142,9 +173,7 @@  __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
 		if (data->nodes[data->nr])
 			io_put_rsrc_node(ctx, data->nodes[data->nr]);
 	}
-	kvfree(data->nodes);
-	data->nodes = NULL;
-	data->nr = 0;
+	__io_rsrc_data_free(data);
 }
 
 __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
@@ -158,6 +187,31 @@  __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
 	return -ENOMEM;
 }
 
+static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
+{
+	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
+						 IO_CACHED_BVECS_SEGS);
+	const int node_size = sizeof(struct io_rsrc_node);
+	int ret;
+
+	ret = io_rsrc_data_alloc(&table->data, nr);
+	if (ret)
+		return ret;
+
+	if (io_alloc_cache_init(&table->node_cache, nr, node_size, 0))
+		goto free_data;
+
+	if (io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0))
+		goto free_cache;
+
+	return 0;
+free_cache:
+	io_alloc_cache_free(&table->node_cache, kfree);
+free_data:
+	__io_rsrc_data_free(&table->data);
+	return -ENOMEM;
+}
+
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_rsrc_update2 *up,
 				 unsigned nr_args)
@@ -207,7 +261,7 @@  static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 			if (!node) {
 				err = -ENOMEM;
 				fput(file);
@@ -459,6 +513,8 @@  void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 	case IORING_RSRC_BUFFER:
 		if (node->buf)
 			io_buffer_unmap(ctx, node);
+		if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
+			return;
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -527,7 +583,7 @@  int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			goto fail;
 		}
 		ret = -ENOMEM;
-		node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 		if (!node) {
 			fput(file);
 			goto fail;
@@ -547,11 +603,19 @@  int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
+				struct io_buf_table *table)
+{
+	io_rsrc_data_free(ctx, &table->data);
+	io_alloc_cache_free(&table->node_cache, kfree);
+	io_alloc_cache_free(&table->imu_cache, kfree);
+}
+
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
 	if (!ctx->buf_table.data.nr)
 		return -ENXIO;
-	io_rsrc_data_free(ctx, &ctx->buf_table.data);
+	io_rsrc_buffer_free(ctx, &ctx->buf_table);
 	return 0;
 }
 
@@ -732,7 +796,7 @@  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	if (!iov->iov_base)
 		return NULL;
 
-	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 	if (!node)
 		return ERR_PTR(-ENOMEM);
 	node->buf = NULL;
@@ -752,7 +816,7 @@  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
 	}
 
-	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+	imu = io_alloc_imu(ctx, nr_pages);
 	if (!imu)
 		goto done;
 
@@ -789,7 +853,7 @@  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	}
 done:
 	if (ret) {
-		kvfree(imu);
+		io_free_imu(ctx, imu);
 		if (node)
 			io_put_rsrc_node(ctx, node);
 		node = ERR_PTR(ret);
@@ -802,9 +866,9 @@  int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags)
 {
 	struct page *last_hpage = NULL;
-	struct io_rsrc_data data;
 	struct iovec fast_iov, *iov = &fast_iov;
 	const struct iovec __user *uvec;
+	struct io_buf_table table;
 	int i, ret;
 
 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
@@ -813,13 +877,14 @@  int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EBUSY;
 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
-	ret = io_rsrc_data_alloc(&data, nr_args);
+	ret = io_rsrc_buffer_alloc(&table, nr_args);
 	if (ret)
 		return ret;
 
 	if (!arg)
 		memset(iov, 0, sizeof(*iov));
 
+	ctx->buf_table = table;
 	for (i = 0; i < nr_args; i++) {
 		struct io_rsrc_node *node;
 		u64 tag = 0;
@@ -859,10 +924,8 @@  int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			}
 			node->tag = tag;
 		}
-		data.nodes[i] = node;
+		table.data.nodes[i] = node;
 	}
-
-	ctx->buf_table.data = data;
 	if (ret)
 		io_sqe_buffers_unregister(ctx);
 	return ret;
@@ -887,14 +950,15 @@  int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
 		goto unlock;
 	}
 
-	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 	if (!node) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
 
 	nr_bvecs = blk_rq_nr_phys_segments(rq);
-	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+
+	imu = io_alloc_imu(ctx, nr_bvecs);
 	if (!imu) {
 		kfree(node);
 		ret = -ENOMEM;
@@ -1031,7 +1095,7 @@  static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
 			    struct io_uring_clone_buffers *arg)
 {
-	struct io_rsrc_data data;
+	struct io_buf_table table;
 	int i, ret, off, nr;
 	unsigned int nbufs;
 
@@ -1062,7 +1126,7 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
 		return -EOVERFLOW;
 
-	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
+	ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
 	if (ret)
 		return ret;
 
@@ -1071,7 +1135,7 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
 
 		if (src_node) {
-			data.nodes[i] = src_node;
+			table.data.nodes[i] = src_node;
 			src_node->refs++;
 		}
 	}
@@ -1101,7 +1165,7 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		if (!src_node) {
 			dst_node = NULL;
 		} else {
-			dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 			if (!dst_node) {
 				ret = -ENOMEM;
 				goto out_free;
@@ -1110,12 +1174,12 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 			refcount_inc(&src_node->buf->refs);
 			dst_node->buf = src_node->buf;
 		}
-		data.nodes[off++] = dst_node;
+		table.data.nodes[off++] = dst_node;
 		i++;
 	}
 
 	/*
-	 * If asked for replace, put the old table. data->nodes[] holds both
+	 * If asked for replace, put the old table. table.data->nodes[] holds both
 	 * old and new nodes at this point.
 	 */
 	if (arg->flags & IORING_REGISTER_DST_REPLACE)
@@ -1128,10 +1192,10 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	 * entry).
 	 */
 	WARN_ON_ONCE(ctx->buf_table.data.nr);
-	ctx->buf_table.data = data;
+	ctx->buf_table = table;
 	return 0;
 out_free:
-	io_rsrc_data_free(ctx, &data);
+	io_rsrc_buffer_free(ctx, &table);
 	return ret;
 }
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 81c31b93d4f7b..69cc212ad0c7c 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -49,7 +49,7 @@  struct io_imu_folio_data {
 	unsigned int	nr_folios;
 };
 
-struct io_rsrc_node *io_rsrc_node_alloc(int type);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
 void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
 int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);