[PATCHv3,5/5] io_uring: cache nodes and mapped buffers

Message ID 20250214154348.2952692-6-kbusch@meta.com (mailing list archive)
State New
Series ublk zero-copy support

Commit Message

Keith Busch Feb. 14, 2025, 3:43 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

Frequent alloc/free cycles on these objects are pretty costly. Use an
io cache to reuse these buffers more efficiently.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/io_uring_types.h |  18 +++---
 io_uring/filetable.c           |   2 +-
 io_uring/rsrc.c                | 114 +++++++++++++++++++++++++--------
 io_uring/rsrc.h                |   2 +-
 4 files changed, 99 insertions(+), 37 deletions(-)

Comments

Caleb Sander Mateos Feb. 15, 2025, 2:22 a.m. UTC | #1
On Fri, Feb 14, 2025 at 7:46 AM Keith Busch <kbusch@meta.com> wrote:
>
> From: Keith Busch <kbusch@kernel.org>
>
> Frequent alloc/free cycles on these objects are pretty costly. Use an
> io cache to reuse these buffers more efficiently.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  include/linux/io_uring_types.h |  18 +++---
>  io_uring/filetable.c           |   2 +-
>  io_uring/rsrc.c                | 114 +++++++++++++++++++++++++--------
>  io_uring/rsrc.h                |   2 +-
>  4 files changed, 99 insertions(+), 37 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index d8d717cce427f..ebaaa1c7e210f 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -67,8 +67,18 @@ struct io_file_table {
>         unsigned int alloc_hint;
>  };
>
> +struct io_alloc_cache {
> +       void                    **entries;
> +       unsigned int            nr_cached;
> +       unsigned int            max_cached;
> +       size_t                  elem_size;
> +       unsigned int            init_clear;
> +};
> +
>  struct io_buf_table {
>         struct io_rsrc_data     data;
> +       struct io_alloc_cache   node_cache;
> +       struct io_alloc_cache   imu_cache;
>  };
>
>  struct io_hash_bucket {
> @@ -222,14 +232,6 @@ struct io_submit_state {
>         struct blk_plug         plug;
>  };
>
> -struct io_alloc_cache {
> -       void                    **entries;
> -       unsigned int            nr_cached;
> -       unsigned int            max_cached;
> -       unsigned int            elem_size;
> -       unsigned int            init_clear;
> -};
> -
>  struct io_ring_ctx {
>         /* const or read-mostly hot data */
>         struct {
> diff --git a/io_uring/filetable.c b/io_uring/filetable.c
> index dd8eeec97acf6..a21660e3145ab 100644
> --- a/io_uring/filetable.c
> +++ b/io_uring/filetable.c
> @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
>         if (slot_index >= ctx->file_table.data.nr)
>                 return -EINVAL;
>
> -       node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +       node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>         if (!node)
>                 return -ENOMEM;
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index fd7a1b04db8b7..26ff9b5851d94 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>  #define IORING_MAX_FIXED_FILES (1U << 20)
>  #define IORING_MAX_REG_BUFFERS (1U << 14)
>
> +#define IO_CACHED_BVECS_SEGS   32
> +
>  int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
>  {
>         unsigned long page_limit, cur_pages, new_pages;
> @@ -122,19 +124,33 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>         kvfree(imu);
>  }
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> +
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)

nit: extra blank line added here

>  {
>         struct io_rsrc_node *node;
>
> -       node = kzalloc(sizeof(*node), GFP_KERNEL);
> +       if (type == IORING_RSRC_FILE)
> +               node = kmalloc(sizeof(*node), GFP_KERNEL);
> +       else
> +               node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
>         if (node) {
>                 node->type = type;
>                 node->refs = 1;
> +               node->tag = 0;
> +               node->file_ptr = 0;
>         }
>         return node;
>  }
>
> -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
> +static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
> +{
> +       kvfree(data->nodes);
> +       data->nodes = NULL;
> +       data->nr = 0;
> +}
> +
> +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
> +                             struct io_rsrc_data *data)
>  {
>         if (!data->nr)
>                 return;
> @@ -142,9 +158,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
>                 if (data->nodes[data->nr])
>                         io_put_rsrc_node(ctx, data->nodes[data->nr]);
>         }
> -       kvfree(data->nodes);
> -       data->nodes = NULL;
> -       data->nr = 0;
> +       __io_rsrc_data_free(data);
>  }
>
>  __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> @@ -158,6 +172,33 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
>         return -ENOMEM;
>  }
>
> +static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
> +{
> +       const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
> +                                                IO_CACHED_BVECS_SEGS);
> +       int ret;
> +
> +       ret = io_rsrc_data_alloc(&table->data, nr);
> +       if (ret)
> +               return ret;
> +
> +       ret = io_alloc_cache_init(&table->node_cache, nr,
> +                                 sizeof(struct io_rsrc_node), 0);
> +       if (ret)
> +               goto out_1;
> +
> +       ret = io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0);
> +       if (ret)
> +               goto out_2;

io_alloc_cache_init() returns bool. Probably these cases should return
-ENOMEM instead of 1?
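
For example (untested):

	ret = io_alloc_cache_init(&table->node_cache, nr,
				  sizeof(struct io_rsrc_node), 0);
	if (ret) {
		ret = -ENOMEM;
		goto out_1;
	}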

> +
> +       return 0;
> +out_2:
> +       io_alloc_cache_free(&table->node_cache, kfree);
> +out_1:
> +       __io_rsrc_data_free(&table->data);
> +       return ret;
> +}
> +
>  static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>                                  struct io_uring_rsrc_update2 *up,
>                                  unsigned nr_args)
> @@ -207,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>                                 err = -EBADF;
>                                 break;
>                         }
> -                       node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +                       node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>                         if (!node) {
>                                 err = -ENOMEM;
>                                 fput(file);
> @@ -269,7 +310,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
>                         }
>                         node->tag = tag;
>                 }
> -               i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
> +               i = array_index_nospec(up->offset + done, ctx->buf_table.data.nr);

Looks like this change belongs in the prior patch "io_uring: add
abstraction for buf_table rsrc data"?

>                 io_reset_rsrc_node(ctx, &ctx->buf_table.data, i);
>                 ctx->buf_table.data.nodes[i] = node;
>                 if (ctx->compat)
> @@ -459,6 +500,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>         case IORING_RSRC_BUFFER:
>                 if (node->buf)
>                         io_buffer_unmap(ctx, node);
> +               if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
> +                       return;
>                 break;
>         default:
>                 WARN_ON_ONCE(1);
> @@ -527,7 +570,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
>                         goto fail;
>                 }
>                 ret = -ENOMEM;
> -               node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +               node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>                 if (!node) {
>                         fput(file);
>                         goto fail;
> @@ -547,11 +590,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
>         return ret;
>  }
>
> +static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
> +                               struct io_buf_table *table)
> +{
> +       io_rsrc_data_free(ctx, &table->data);
> +       io_alloc_cache_free(&table->node_cache, kfree);
> +       io_alloc_cache_free(&table->imu_cache, kfree);
> +}
> +
>  int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
>  {
>         if (!ctx->buf_table.data.nr)
>                 return -ENXIO;
> -       io_rsrc_data_free(ctx, &ctx->buf_table.data);
> +       io_rsrc_buffer_free(ctx, &ctx->buf_table);
>         return 0;
>  }
>
> @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
>         return true;
>  }
>
> +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> +                                          int nr_bvecs)
> +{
> +       if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> +               return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);

If there is no entry available in the cache, this will heap-allocate
one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
using io_alloc_cache_get() instead of io_cache_alloc(), so the
heap-allocated fallback uses the minimal size.
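
Something like this (untested) would do it:

	if (nr_bvecs <= IO_CACHED_BVECS_SEGS) {
		imu = io_alloc_cache_get(&ctx->buf_table.imu_cache);
		if (imu)
			return imu;
	}
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);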

Also, where are these allocations returned to the imu_cache? Looks
like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
needs to try io_alloc_cache_put() first.

Best,
Caleb

> +       return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
> +                       GFP_KERNEL);
> +}
> +
>  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>                                                    struct iovec *iov,
>                                                    struct page **last_hpage)
> @@ -732,7 +792,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>         if (!iov->iov_base)
>                 return NULL;
>
> -       node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +       node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>         if (!node)
>                 return ERR_PTR(-ENOMEM);
>         node->buf = NULL;
> @@ -752,7 +812,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>                         coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
>         }
>
> -       imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
> +       imu = io_alloc_imu(ctx, nr_pages);
>         if (!imu)
>                 goto done;
>
> @@ -800,9 +860,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                             unsigned int nr_args, u64 __user *tags)
>  {
>         struct page *last_hpage = NULL;
> -       struct io_rsrc_data data;
>         struct iovec fast_iov, *iov = &fast_iov;
>         const struct iovec __user *uvec;
> +       struct io_buf_table table;
>         int i, ret;
>
>         BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
> @@ -811,13 +871,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                 return -EBUSY;
>         if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
>                 return -EINVAL;
> -       ret = io_rsrc_data_alloc(&data, nr_args);
> +       ret = io_rsrc_buffer_alloc(&table, nr_args);
>         if (ret)
>                 return ret;
>
>         if (!arg)
>                 memset(iov, 0, sizeof(*iov));
>
> +       ctx->buf_table = table;
>         for (i = 0; i < nr_args; i++) {
>                 struct io_rsrc_node *node;
>                 u64 tag = 0;
> @@ -857,10 +918,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                         }
>                         node->tag = tag;
>                 }
> -               data.nodes[i] = node;
> +               table.data.nodes[i] = node;
>         }
> -
> -       ctx->buf_table.data = data;

Is it necessary to move this assignment? I found the existing location
easier to reason about, since the assignment of ctx->buf_table
represents a transfer of ownership from the local variable.

>         if (ret)
>                 io_sqe_buffers_unregister(ctx);
>         return ret;
> @@ -891,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
>                 goto unlock;
>         }
>
> -       node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +       node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>         if (!node) {
>                 ret = -ENOMEM;
>                 goto unlock;
>         }
>
>         nr_bvecs = blk_rq_nr_phys_segments(rq);
> -       imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> +
> +       imu = io_alloc_imu(ctx, nr_bvecs);
>         if (!imu) {
>                 kfree(node);
>                 ret = -ENOMEM;
> @@ -1028,7 +1088,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
>  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
>                             struct io_uring_clone_buffers *arg)
>  {
> -       struct io_rsrc_data data;
> +       struct io_buf_table table;
>         int i, ret, off, nr;
>         unsigned int nbufs;
>
> @@ -1059,7 +1119,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>         if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
>                 return -EOVERFLOW;
>
> -       ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
> +       ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
>         if (ret)
>                 return ret;
>
> @@ -1068,7 +1128,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                 struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
>
>                 if (src_node) {
> -                       data.nodes[i] = src_node;
> +                       table.data.nodes[i] = src_node;
>                         src_node->refs++;
>                 }
>         }
> @@ -1098,7 +1158,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                 if (!src_node) {
>                         dst_node = NULL;
>                 } else {
> -                       dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +                       dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>                         if (!dst_node) {
>                                 ret = -ENOMEM;
>                                 goto out_free;
> @@ -1107,12 +1167,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                         refcount_inc(&src_node->buf->refs);
>                         dst_node->buf = src_node->buf;
>                 }
> -               data.nodes[off++] = dst_node;
> +               table.data.nodes[off++] = dst_node;
>                 i++;
>         }
>
>         /*
> -        * If asked for replace, put the old table. data->nodes[] holds both
> +        * If asked for replace, put the old table. table.data->nodes[] holds both
>          * old and new nodes at this point.
>          */
>         if (arg->flags & IORING_REGISTER_DST_REPLACE)
> @@ -1125,10 +1185,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>          * entry).
>          */
>         WARN_ON_ONCE(ctx->buf_table.data.nr);
> -       ctx->buf_table.data = data;
> +       ctx->buf_table = table;
>         return 0;
>  out_free:
> -       io_rsrc_data_free(ctx, &data);
> +       io_rsrc_buffer_free(ctx, &table);
>         return ret;
>  }
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 2e8d1862caefc..c5bdac558a2b4 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -47,7 +47,7 @@ struct io_imu_folio_data {
>         unsigned int    nr_folios;
>  };
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type);
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
>  void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
>  void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
>  int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
> --
> 2.43.5
>
>
Caleb Sander Mateos Feb. 16, 2025, 10:43 p.m. UTC | #2
On Fri, Feb 14, 2025 at 6:22 PM Caleb Sander Mateos
<csander@purestorage.com> wrote:
>
[...]
> > @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> >         return true;
> >  }
> >
> > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > +                                          int nr_bvecs)
> > +{
> > +       if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > +               return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
>
> If there is no entry available in the cache, this will heap-allocate
> one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
> using io_alloc_cache_get() instead of io_cache_alloc(), so the
> heap-allocated fallback uses the minimal size.
>
> Also, where are these allocations returned to the imu_cache? Looks
> like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> needs to try io_alloc_cache_put() first.

Another issue I see is that io_alloc_cache elements are allocated with
kmalloc(), so they can't be freed with kvfree(). When the imu is
freed, we could check nr_bvecs <= IO_CACHED_BVECS_SEGS to tell whether
to call io_alloc_cache_put() (with a fallback to kfree()) or kvfree().
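
Something like this (untested) on the free side, then:

	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) {
		/* cache-sized; return it to the cache, kfree() if full */
		if (!io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
			kfree(imu);
	} else {
		kvfree(imu);
	}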

Keith Busch Feb. 18, 2025, 8:09 p.m. UTC | #3
On Fri, Feb 14, 2025 at 06:22:09PM -0800, Caleb Sander Mateos wrote:
> > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > +                                          int nr_bvecs)
> > +{
> > +       if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > +               return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> 
> If there is no entry available in the cache, this will heap-allocate
> one with enough space for all IO_CACHED_BVECS_SEGS bvecs. 

The cache can only hold fixed-size objects, so we have to pick some
kind of trade-off. The cache starts off empty and fills up on demand,
so whatever we allocate needs to match that cache's element size.

> Consider
> using io_alloc_cache_get() instead of io_cache_alloc(), so the
> heap-allocated fallback uses the minimal size.

We wouldn't be able to fill the cache with the new dynamic object if we
did that.

> Also, where are these allocations returned to the imu_cache? Looks
> like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> needs to try io_alloc_cache_put() first.

Oops. I kind of rushed this series last Friday as I needed to shut down
very early in the day.

Thanks for the comments. Will take my time for the next version.
Keith Busch Feb. 18, 2025, 8:12 p.m. UTC | #4
On Sun, Feb 16, 2025 at 02:43:40PM -0800, Caleb Sander Mateos wrote:
> > > +       io_alloc_cache_free(&table->imu_cache, kfree);
> > > +}
> > > +
> > >  int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> > >  {
> > >         if (!ctx->buf_table.data.nr)
> > >                 return -ENXIO;
> > > -       io_rsrc_data_free(ctx, &ctx->buf_table.data);
> > > +       io_rsrc_buffer_free(ctx, &ctx->buf_table);
> > >         return 0;
> > >  }
> > >
> > > @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> > >         return true;
> > >  }
> > >
> > > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > > +                                          int nr_bvecs)
> > > +{
> > > +       if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > > +               return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> >
> > If there is no entry available in the cache, this will heap-allocate
> > one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
> > using io_alloc_cache_get() instead of io_cache_alloc(), so the
> > heap-allocated fallback uses the minimal size.
> >
> > Also, where are these allocations returned to the imu_cache? Looks
> > like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> > needs to try io_alloc_cache_put() first.
> 
> Another issue I see is that io_alloc_cache elements are allocated with
> kmalloc(), so they can't be freed with kvfree(). 

You actually can kvfree(kmalloc()); Here's the kernel doc for it:

  kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc()

> When the imu is
> freed, we could check nr_bvecs <= IO_CACHED_BVECS_SEGS to tell whether
> to call io_alloc_cache_put() (with a fallback to kfree()) or kvfree().

But you're right, it shouldn't even hit this path because it's supposed
to try to insert the imu into the cache if that's where it was allocated
from.
Caleb Sander Mateos Feb. 18, 2025, 8:42 p.m. UTC | #5
On Tue, Feb 18, 2025 at 12:09 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Fri, Feb 14, 2025 at 06:22:09PM -0800, Caleb Sander Mateos wrote:
> > > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > > +                                          int nr_bvecs)
> > > +{
> > > +       if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > > +               return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> >
> > If there is no entry available in the cache, this will heap-allocate
> > one with enough space for all IO_CACHED_BVECS_SEGS bvecs.
>
> The cache can only have fixed size objects in them, so we have to pick
> some kind of trade off. The cache starts off empty and fills up on
> demand, so whatever we allocate needs to be of that cache's element
> size.
>
> > Consider
> > using io_alloc_cache_get() instead of io_cache_alloc(), so the
> > heap-allocated fallback uses the minimal size.
>
> We wouldn't be able to fill the cache with the new dynamic object if we
> did that.

Right, that's a good point that there's a tradeoff. I think always
allocating space for IO_CACHED_BVECS_SEGS bvecs is reasonable. Maybe
IO_CACHED_BVECS_SEGS should be slightly smaller so the allocation fits
nicely in a power of 2? Currently it looks to take up 560 bytes:
>>> 48 + 16 * 32
560

Using IO_CACHED_BVECS_SEGS = 29 instead would make it 512 bytes:
>>> 48 + 16 * 29
512
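
If it were lowered to 29, a compile-time check could keep it within a
512-byte kmalloc bucket, e.g. (untested):

	BUILD_BUG_ON(sizeof(struct io_mapped_ubuf) +
		     IO_CACHED_BVECS_SEGS * sizeof(struct bio_vec) > 512);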

Best,
Caleb

>
> > Also, where are these allocations returned to the imu_cache? Looks
> > like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> > needs to try io_alloc_cache_put() first.
>
> Oops. I kind of rushed this series last Friday as I needed to shut down
> very early in the day.
>
> Thanks for the comments. Will take my time for the next version.
Caleb Sander Mateos Feb. 18, 2025, 8:45 p.m. UTC | #6
On Tue, Feb 18, 2025 at 12:12 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Sun, Feb 16, 2025 at 02:43:40PM -0800, Caleb Sander Mateos wrote:
> > > > +       io_alloc_cache_free(&table->imu_cache, kfree);
> > > > +}
> > > > +
> > > >  int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> > > >  {
> > > >         if (!ctx->buf_table.data.nr)
> > > >                 return -ENXIO;
> > > > -       io_rsrc_data_free(ctx, &ctx->buf_table.data);
> > > > +       io_rsrc_buffer_free(ctx, &ctx->buf_table);
> > > >         return 0;
> > > >  }
> > > >
> > > > @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> > > >         return true;
> > > >  }
> > > >
> > > > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > > > +                                          int nr_bvecs)
> > > > +{
> > > > +       if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > > > +               return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> > >
> > > If there is no entry available in the cache, this will heap-allocate
> > > one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
> > > using io_alloc_cache_get() instead of io_cache_alloc(), so the
> > > heap-allocated fallback uses the minimal size.
> > >
> > > Also, where are these allocations returned to the imu_cache? Looks
> > > like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> > > needs to try io_alloc_cache_put() first.
> >
> > Another issue I see is that io_alloc_cache elements are allocated with
> > kmalloc(), so they can't be freed with kvfree().
>
> You actually can kvfree(kmalloc()); Here's the kernel doc for it:
>
>   kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc()

Good to know, thanks for the pointer! I guess it might be a bit more
efficient to call kfree() if we know based on nr_bvecs that the
allocation came from kmalloc(), but at least this isn't corrupting the
heap.

Best,
Caleb

>
> > When the imu is
> > freed, we could check nr_bvecs <= IO_CACHED_BVECS_SEGS to tell whether
> > to call io_alloc_cache_put() (with a fallback to kfree()) or kvfree().
>
> But you're right, it shouldn't even hit this path because it's supposed
> to try to insert the imu into the cache if that's where it was allocated
> from.
Keith Busch Feb. 18, 2025, 9:12 p.m. UTC | #7
On Tue, Feb 18, 2025 at 12:42:19PM -0800, Caleb Sander Mateos wrote:
> Right, that's a good point that there's a tradeoff. I think always
> allocating space for IO_CACHED_BVECS_SEGS bvecs is reasonable. Maybe
> IO_CACHED_BVECS_SEGS should be slightly smaller so the allocation fits
> nicely in a power of 2? Currently it looks to take up 560 bytes:
> >>> 48 + 16 * 32
> 560
> 
> Using IO_CACHED_BVECS_SEGS = 29 instead would make it 512 bytes:
> >>> 48 + 16 * 29
> 512

Right, and it's even smaller on 32-bit architectures.

I don't think it's worth optimizing the cached object size like this.
It's probably better to optimize for a particular transfer size. If the
buffer is physically discontiguous, 32 bvecs get you to 128k transfers
on a 4k page platform. That seems like a common transfer size for
benchmarking throughput. It is arbitrary at the end of the day, so I'm
not set on it if there's an argument for something different.

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d8d717cce427f..ebaaa1c7e210f 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -67,8 +67,18 @@  struct io_file_table {
 	unsigned int alloc_hint;
 };
 
+struct io_alloc_cache {
+	void			**entries;
+	unsigned int		nr_cached;
+	unsigned int		max_cached;
+	size_t			elem_size;
+	unsigned int		init_clear;
+};
+
 struct io_buf_table {
 	struct io_rsrc_data	data;
+	struct io_alloc_cache	node_cache;
+	struct io_alloc_cache	imu_cache;
 };
 
 struct io_hash_bucket {
@@ -222,14 +232,6 @@  struct io_submit_state {
 	struct blk_plug		plug;
 };
 
-struct io_alloc_cache {
-	void			**entries;
-	unsigned int		nr_cached;
-	unsigned int		max_cached;
-	unsigned int		elem_size;
-	unsigned int		init_clear;
-};
-
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index dd8eeec97acf6..a21660e3145ab 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -68,7 +68,7 @@  static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 	if (slot_index >= ctx->file_table.data.nr)
 		return -EINVAL;
 
-	node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 	if (!node)
 		return -ENOMEM;
 
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index fd7a1b04db8b7..26ff9b5851d94 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -32,6 +32,8 @@  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 #define IORING_MAX_FIXED_FILES	(1U << 20)
 #define IORING_MAX_REG_BUFFERS	(1U << 14)
 
+#define IO_CACHED_BVECS_SEGS	32
+
 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
 {
 	unsigned long page_limit, cur_pages, new_pages;
@@ -122,19 +124,33 @@  static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 	kvfree(imu);
 }
 
-struct io_rsrc_node *io_rsrc_node_alloc(int type)
+
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 {
 	struct io_rsrc_node *node;
 
-	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (type == IORING_RSRC_FILE)
+		node = kmalloc(sizeof(*node), GFP_KERNEL);
+	else
+		node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
 	if (node) {
 		node->type = type;
 		node->refs = 1;
+		node->tag = 0;
+		node->file_ptr = 0;
 	}
 	return node;
 }
 
-__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
+static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
+{
+	kvfree(data->nodes);
+	data->nodes = NULL;
+	data->nr = 0;
+}
+
+__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
+			      struct io_rsrc_data *data)
 {
 	if (!data->nr)
 		return;
@@ -142,9 +158,7 @@  __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
 		if (data->nodes[data->nr])
 			io_put_rsrc_node(ctx, data->nodes[data->nr]);
 	}
-	kvfree(data->nodes);
-	data->nodes = NULL;
-	data->nr = 0;
+	__io_rsrc_data_free(data);
 }
 
 __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
@@ -158,6 +172,33 @@  __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
 	return -ENOMEM;
 }
 
+static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
+{
+	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
+						 IO_CACHED_BVECS_SEGS);
+	int ret;
+
+	ret = io_rsrc_data_alloc(&table->data, nr);
+	if (ret)
+		return ret;
+
+	ret = io_alloc_cache_init(&table->node_cache, nr,
+				  sizeof(struct io_rsrc_node), 0);
+	if (ret)
+		goto out_1;
+
+	ret = io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0);
+	if (ret)
+		goto out_2;
+
+	return 0;
+out_2:
+	io_alloc_cache_free(&table->node_cache, kfree);
+out_1:
+	__io_rsrc_data_free(&table->data);
+	return ret;
+}
+
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_rsrc_update2 *up,
 				 unsigned nr_args)
@@ -207,7 +248,7 @@  static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 			if (!node) {
 				err = -ENOMEM;
 				fput(file);
@@ -269,7 +310,7 @@  static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			}
 			node->tag = tag;
 		}
-		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
+		i = array_index_nospec(up->offset + done, ctx->buf_table.data.nr);
 		io_reset_rsrc_node(ctx, &ctx->buf_table.data, i);
 		ctx->buf_table.data.nodes[i] = node;
 		if (ctx->compat)
@@ -459,6 +500,8 @@  void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 	case IORING_RSRC_BUFFER:
 		if (node->buf)
 			io_buffer_unmap(ctx, node);
+		if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
+			return;
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -527,7 +570,7 @@  int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			goto fail;
 		}
 		ret = -ENOMEM;
-		node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 		if (!node) {
 			fput(file);
 			goto fail;
@@ -547,11 +590,19 @@  int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
+				struct io_buf_table *table)
+{
+	io_rsrc_data_free(ctx, &table->data);
+	io_alloc_cache_free(&table->node_cache, kfree);
+	io_alloc_cache_free(&table->imu_cache, kfree);
+}
+
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
 	if (!ctx->buf_table.data.nr)
 		return -ENXIO;
-	io_rsrc_data_free(ctx, &ctx->buf_table.data);
+	io_rsrc_buffer_free(ctx, &ctx->buf_table);
 	return 0;
 }
 
@@ -716,6 +767,15 @@  bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
 	return true;
 }
 
+static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
+					   int nr_bvecs)
+{
+	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
+		return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
+	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
+			GFP_KERNEL);
+}
+
 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 						   struct iovec *iov,
 						   struct page **last_hpage)
@@ -732,7 +792,7 @@  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	if (!iov->iov_base)
 		return NULL;
 
-	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 	if (!node)
 		return ERR_PTR(-ENOMEM);
 	node->buf = NULL;
@@ -752,7 +812,7 @@  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
 	}
 
-	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+	imu = io_alloc_imu(ctx, nr_pages);
 	if (!imu)
 		goto done;
 
@@ -800,9 +860,9 @@  int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags)
 {
 	struct page *last_hpage = NULL;
-	struct io_rsrc_data data;
 	struct iovec fast_iov, *iov = &fast_iov;
 	const struct iovec __user *uvec;
+	struct io_buf_table table;
 	int i, ret;
 
 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
@@ -811,13 +871,14 @@  int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EBUSY;
 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
-	ret = io_rsrc_data_alloc(&data, nr_args);
+	ret = io_rsrc_buffer_alloc(&table, nr_args);
 	if (ret)
 		return ret;
 
 	if (!arg)
 		memset(iov, 0, sizeof(*iov));
 
+	ctx->buf_table = table;
 	for (i = 0; i < nr_args; i++) {
 		struct io_rsrc_node *node;
 		u64 tag = 0;
@@ -857,10 +918,8 @@  int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			}
 			node->tag = tag;
 		}
-		data.nodes[i] = node;
+		table.data.nodes[i] = node;
 	}
-
-	ctx->buf_table.data = data;
 	if (ret)
 		io_sqe_buffers_unregister(ctx);
 	return ret;
@@ -891,14 +950,15 @@  int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
 		goto unlock;
 	}
 
-	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 	if (!node) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
 
 	nr_bvecs = blk_rq_nr_phys_segments(rq);
-	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+
+	imu = io_alloc_imu(ctx, nr_bvecs);
 	if (!imu) {
 		kfree(node);
 		ret = -ENOMEM;
@@ -1028,7 +1088,7 @@  static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
 			    struct io_uring_clone_buffers *arg)
 {
-	struct io_rsrc_data data;
+	struct io_buf_table table;
 	int i, ret, off, nr;
 	unsigned int nbufs;
 
@@ -1059,7 +1119,7 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
 		return -EOVERFLOW;
 
-	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
+	ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
 	if (ret)
 		return ret;
 
@@ -1068,7 +1128,7 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
 
 		if (src_node) {
-			data.nodes[i] = src_node;
+			table.data.nodes[i] = src_node;
 			src_node->refs++;
 		}
 	}
@@ -1098,7 +1158,7 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		if (!src_node) {
 			dst_node = NULL;
 		} else {
-			dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 			if (!dst_node) {
 				ret = -ENOMEM;
 				goto out_free;
@@ -1107,12 +1167,12 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 			refcount_inc(&src_node->buf->refs);
 			dst_node->buf = src_node->buf;
 		}
-		data.nodes[off++] = dst_node;
+		table.data.nodes[off++] = dst_node;
 		i++;
 	}
 
 	/*
-	 * If asked for replace, put the old table. data->nodes[] holds both
+	 * If asked for replace, put the old table. table.data->nodes[] holds both
 	 * old and new nodes at this point.
 	 */
 	if (arg->flags & IORING_REGISTER_DST_REPLACE)
@@ -1125,10 +1185,10 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	 * entry).
 	 */
 	WARN_ON_ONCE(ctx->buf_table.data.nr);
-	ctx->buf_table.data = data;
+	ctx->buf_table = table;
 	return 0;
 out_free:
-	io_rsrc_data_free(ctx, &data);
+	io_rsrc_buffer_free(ctx, &table);
 	return ret;
 }
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 2e8d1862caefc..c5bdac558a2b4 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -47,7 +47,7 @@  struct io_imu_folio_data {
 	unsigned int	nr_folios;
 };
 
-struct io_rsrc_node *io_rsrc_node_alloc(int type);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
 void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
 int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);