
[GIT,PULL] io_uring updates for 6.14-rc1

Message ID: 1481b709-d47b-4346-8802-0222d8a79a7e@kernel.dk
State: New

Pull-request

git://git.kernel.dk/linux.git tags/for-6.14/io_uring-20250119

Commit Message

Jens Axboe Jan. 19, 2025, 3:05 p.m. UTC
Hi Linus,

Here are the io_uring updates for 6.14. Not a lot in terms of features
this time around, mostly just cleanups and code consolidation. In
detail, this pull request contains:

- Support for PI metadata read/write via io_uring, with NVMe and SCSI
  covered.

- Clean up the per-op structure caching, making it consistent across
  the various command types.

- Consolidate the various user-mapped features into a concept called
  regions, making its various users consistent.

- Various cleanups and fixes.

Note that this will throw a merge conflict, as these changes conflict
with a fix that went into mainline after 6.13-rc4. The
io_uring/register.c conflict is trivial, the io_uring/uring_cmd.c one
requires a small addition. Here's my resolution, done while merging it
into my for-next branch:

commit 3761b21b00320fc676aa8f5df8c9158046372b73
Merge: 65a64ecb3357 bab4b2cca027
Author: Jens Axboe <axboe@kernel.dk>
Date:   Wed Jan 15 08:52:03 2025 -0700

    Merge branch 'for-6.14/io_uring' into for-next
    
    * for-6.14/io_uring: (55 commits)
      io_uring: reuse io_should_terminate_tw() for cmds
      io_uring: Factor out a function to parse restrictions
      io_uring/rsrc: require cloned buffers to share accounting contexts
      io_uring: simplify the SQPOLL thread check when cancelling requests
      io_uring: expose read/write attribute capability
      io_uring/rw: don't gate retry on completion context
      io_uring/rw: handle -EAGAIN retry at IO completion time
      io_uring/rw: use io_rw_recycle() from cleanup path
      io_uring/rsrc: simplify the bvec iter count calculation
      io_uring: ensure io_queue_deferred() is out-of-line
      io_uring/rw: always clear ->bytes_done on io_async_rw setup
      io_uring/rw: use NULL for rw->free_iovec assigment
      io_uring/rw: don't mask in f_iocb_flags
      io_uring/msg_ring: Drop custom destructor
      io_uring: Move old async data allocation helper to header
      io_uring/rw: Allocate async data through helper
      io_uring/net: Allocate msghdr async data through helper
      io_uring/uring_cmd: Allocate async data through generic helper
      io_uring/poll: Allocate apoll with generic alloc_cache helper
      io_uring/futex: Allocate ifd with generic alloc_cache helper
      ...
    
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

+++ b/io_uring/register.c
@@@ -403,11 -396,11 +396,11 @@@ static void io_register_free_rings(stru
  
  static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
  {
+ 	struct io_uring_region_desc rd;
  	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
  	size_t size, sq_array_offset;
 +	unsigned i, tail, old_head;
  	struct io_uring_params p;
- 	void *ptr;
 -	unsigned i, tail;
  	int ret;
  
  	/* for single issuer, must be owner resizing */
@@@ -441,29 -434,26 +434,34 @@@
  	if (size == SIZE_MAX)
  		return -EOVERFLOW;
  
- 	if (!(p.flags & IORING_SETUP_NO_MMAP))
- 		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
- 	else
- 		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
- 						p.cq_off.user_addr, size);
- 	if (IS_ERR(n.rings))
- 		return PTR_ERR(n.rings);
+ 	memset(&rd, 0, sizeof(rd));
+ 	rd.size = PAGE_ALIGN(size);
+ 	if (p.flags & IORING_SETUP_NO_MMAP) {
+ 		rd.user_addr = p.cq_off.user_addr;
+ 		rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ 	}
+ 	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
+ 	if (ret) {
+ 		io_register_free_rings(ctx, &p, &n);
+ 		return ret;
+ 	}
+ 	n.rings = io_region_get_ptr(&n.ring_region);
  
 -	n.rings->sq_ring_mask = p.sq_entries - 1;
 -	n.rings->cq_ring_mask = p.cq_entries - 1;
 -	n.rings->sq_ring_entries = p.sq_entries;
 -	n.rings->cq_ring_entries = p.cq_entries;
 +	/*
 +	 * At this point n.rings is shared with userspace, just like o.rings
 +	 * is as well. While we don't expect userspace to modify it while
 +	 * a resize is in progress, and it's most likely that userspace will
 +	 * shoot itself in the foot if it does, we can't always assume good
 +	 * intent... Use read/write once helpers from here on to indicate the
 +	 * shared nature of it.
 +	 */
 +	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
 +	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
 +	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
 +	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
  
  	if (copy_to_user(arg, &p, sizeof(p))) {
- 		io_register_free_rings(&p, &n);
+ 		io_register_free_rings(ctx, &p, &n);
  		return -EFAULT;
  	}
  
@@@ -516,14 -508,12 +516,13 @@@
  	 * Now copy SQ and CQ entries, if any. If either of the destination
  	 * rings can't hold what is already there, then fail the operation.
  	 */
- 	n.sq_sqes = ptr;
- 	tail = READ_ONCE(o.rings->sq.tail);
+ 	tail = o.rings->sq.tail;
 -	if (tail - o.rings->sq.head > p.sq_entries)
 +	old_head = READ_ONCE(o.rings->sq.head);
 +	if (tail - old_head > p.sq_entries)
  		goto overflow;
 -	for (i = o.rings->sq.head; i < tail; i++) {
 +	for (i = old_head; i < tail; i++) {
  		unsigned src_head = i & (ctx->sq_entries - 1);
 -		unsigned dst_head = i & n.rings->sq_ring_mask;
 +		unsigned dst_head = i & (p.sq_entries - 1);
  
  		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
  	}
+++ b/io_uring/uring_cmd.c
@@@ -188,14 -163,14 +168,22 @@@ void io_uring_cmd_done(struct io_uring_
  }
  EXPORT_SYMBOL_GPL(io_uring_cmd_done);
  
++static void io_uring_cmd_init_once(void *obj)
++{
++	struct io_uring_cmd_data *data = obj;
++
++	data->op_data = NULL;
++}
++
  static int io_uring_cmd_prep_setup(struct io_kiocb *req,
  				   const struct io_uring_sqe *sqe)
  {
  	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 -	struct uring_cache *cache;
 +	struct io_uring_cmd_data *cache;
  
- 	cache = io_uring_async_get(req);
- 	if (unlikely(!cache))
 -	cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req, NULL);
++	cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req,
++						io_uring_cmd_init_once);
+ 	if (!cache)
  		return -ENOMEM;
  
  	if (!(req->flags & REQ_F_FORCE_ASYNC)) {


Please pull!


The following changes since commit 4bbf9020becbfd8fc2c3da790855b7042fad455b:

  Linux 6.13-rc4 (2024-12-22 13:22:21 -0800)

are available in the Git repository at:

  git://git.kernel.dk/linux.git tags/for-6.14/io_uring-20250119

for you to fetch changes up to 561e3a0c40dc7e3ab7b0b3647a2b89eca16215d9:

  io_uring/fdinfo: fix io_uring_show_fdinfo() misuse of ->d_iname (2025-01-19 07:28:37 -0700)

----------------------------------------------------------------
for-6.14/io_uring-20250119

----------------------------------------------------------------
Al Viro (1):
      io_uring/fdinfo: fix io_uring_show_fdinfo() misuse of ->d_iname

Anuj Gupta (8):
      block: define set of integrity flags to be inherited by cloned bip
      block: modify bio_integrity_map_user to accept iov_iter as argument
      fs, iov_iter: define meta io descriptor
      fs: introduce IOCB_HAS_METADATA for metadata
      io_uring: introduce attributes for read/write and PI support
      block: introduce BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags
      scsi: add support for user-meta interface
      io_uring: expose read/write attribute capability

Bui Quang Minh (2):
      io_uring/rsrc: simplify the bvec iter count calculation
      io_uring: simplify the SQPOLL thread check when cancelling requests

Christoph Hellwig (1):
      block: copy back bounce buffer to user-space correctly in case of split

Colin Ian King (1):
      io_uring/kbuf: fix unintentional sign extension on shift of reg.bgid

David Wei (1):
      io_uring: clean up io_prep_rw_setup()

Gabriel Krisman Bertazi (9):
      io_uring: Fold allocation into alloc_cache helper
      io_uring: Add generic helper to allocate async data
      io_uring/futex: Allocate ifd with generic alloc_cache helper
      io_uring/poll: Allocate apoll with generic alloc_cache helper
      io_uring/uring_cmd: Allocate async data through generic helper
      io_uring/net: Allocate msghdr async data through helper
      io_uring/rw: Allocate async data through helper
      io_uring: Move old async data allocation helper to header
      io_uring/msg_ring: Drop custom destructor

Jann Horn (1):
      io_uring/rsrc: require cloned buffers to share accounting contexts

Jens Axboe (8):
      block: make bio_integrity_map_user() static inline
      io_uring/rw: don't mask in f_iocb_flags
      io_uring/rw: use NULL for rw->free_iovec assigment
      io_uring/rw: always clear ->bytes_done on io_async_rw setup
      io_uring: ensure io_queue_deferred() is out-of-line
      io_uring/rw: use io_rw_recycle() from cleanup path
      io_uring/rw: handle -EAGAIN retry at IO completion time
      io_uring/rw: don't gate retry on completion context

Josh Triplett (1):
      io_uring: Factor out a function to parse restrictions

Kanchan Joshi (2):
      nvme: add support for passing on the application tag
      block: add support to pass user meta buffer

Pavel Begunkov (21):
      io_uring: rename ->resize_lock
      io_uring/rsrc: export io_check_coalesce_buffer
      io_uring/memmap: flag vmap'ed regions
      io_uring/memmap: flag regions with user pages
      io_uring/memmap: account memory before pinning
      io_uring/memmap: reuse io_free_region for failure path
      io_uring/memmap: optimise single folio regions
      io_uring/memmap: helper for pinning region pages
      io_uring/memmap: add IO_REGION_F_SINGLE_REF
      io_uring/memmap: implement kernel allocated regions
      io_uring/memmap: implement mmap for regions
      io_uring: pass ctx to io_register_free_rings
      io_uring: use region api for SQ
      io_uring: use region api for CQ
      io_uring/kbuf: use mmap_lock to sync with mmap
      io_uring/kbuf: remove pbuf ring refcounting
      io_uring/kbuf: use region api for pbuf rings
      io_uring/memmap: unify io_uring mmap'ing code
      io_uring: don't vmap single page regions
      io_uring: prevent reg-wait speculations
      io_uring: reuse io_should_terminate_tw() for cmds

 block/bio-integrity.c          |  84 +++++++--
 block/blk-integrity.c          |  10 +-
 block/fops.c                   |  45 +++--
 drivers/nvme/host/core.c       |  21 ++-
 drivers/scsi/sd.c              |   4 +-
 include/linux/bio-integrity.h  |  25 ++-
 include/linux/fs.h             |   1 +
 include/linux/io_uring_types.h |  26 +--
 include/linux/uio.h            |   9 +
 include/uapi/linux/fs.h        |   9 +
 include/uapi/linux/io_uring.h  |  17 ++
 io_uring/alloc_cache.h         |  13 ++
 io_uring/fdinfo.c              |   9 +-
 io_uring/futex.c               |  13 +-
 io_uring/io_uring.c            | 136 +++++++--------
 io_uring/io_uring.h            |  23 +++
 io_uring/kbuf.c                | 226 ++++++++-----------------
 io_uring/kbuf.h                |  20 +--
 io_uring/memmap.c              | 375 ++++++++++++++++++++---------------------
 io_uring/memmap.h              |  23 ++-
 io_uring/msg_ring.c            |   7 -
 io_uring/msg_ring.h            |   1 -
 io_uring/net.c                 |  35 ++--
 io_uring/poll.c                |  13 +-
 io_uring/register.c            | 155 +++++++++--------
 io_uring/rsrc.c                |  40 +++--
 io_uring/rsrc.h                |   4 +
 io_uring/rw.c                  | 212 +++++++++++++----------
 io_uring/rw.h                  |  14 +-
 io_uring/timeout.c             |   5 +-
 io_uring/uring_cmd.c           |  22 +--
 io_uring/waitid.c              |   4 +-
 32 files changed, 833 insertions(+), 768 deletions(-)

Comments

Linus Torvalds Jan. 21, 2025, 4:38 a.m. UTC | #1
On Sun, 19 Jan 2025 at 07:05, Jens Axboe <axboe@kernel.dk> wrote:
>
> Note that this will throw a merge conflict, as there's a conflict
> between a fix that went into mainline after 6.13-rc4. The
> io_uring/register.c one is trivial, the io_uring/uring_cmd.c requires a
> small addition. Here's my resolution [..]

Ok, so while doing this merge, I absolutely *hate* your resolution in
both files.

The READ_ONCE/WRITE_ONCE changes during resize operations may not
actually matter, but the way you wrote things, it does multiple
"READ_ONCE()" operations. Which is kind of against the whole *point*.

So in io_uring/register.c, after the loop that copies the old ring contents with

        for (i = old_head; i < tail; i++) {

I changed the

        WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
        WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));

to instead just *use* the original READ_ONCE() values, and thus do

        WRITE_ONCE(n.rings->sq.head, old_head);
        WRITE_ONCE(n.rings->sq.tail, tail);

instead (and same for the 'cq' head/tail logic)

Otherwise, what's the point of reading "once", when you then read again?
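
Spelled out, the SQ side then ends up looking roughly like the below -
this is just the copy loop from your resolution with the trailing
head/tail writes reusing the values that were already read, not a quote
of the actual merge commit, and the CQ side is the same pattern:

        tail = o.rings->sq.tail;
        old_head = READ_ONCE(o.rings->sq.head);
        if (tail - old_head > p.sq_entries)
                goto overflow;
        for (i = old_head; i < tail; i++) {
                unsigned src_head = i & (ctx->sq_entries - 1);
                unsigned dst_head = i & (p.sq_entries - 1);

                n.sq_sqes[dst_head] = o.sq_sqes[src_head];
        }
        /* reuse the values read above instead of re-reading the old ring */
        WRITE_ONCE(n.rings->sq.head, old_head);
        WRITE_ONCE(n.rings->sq.tail, tail);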

Now, presumably (and hopefully) this doesn't actually matter, and
nobody should even have access to the old ring when it gets resized,
but it really bothered me.

And it's also entirely possible that I have now screwed up royally,
and I actually messed up. Maybe I just misunderstood the code. But the
old code really looked nonsensical, and I felt I couldn't leave it
alone.

Now, the other conflict didn't look nonsensical, and I *did* leave it
alone, but I still do hate it even if I did it as you did. Because I
hate that pattern.

There are now three cases of 'use the init_once callback' for
io_uring_alloc_async_data(), and all of them just clear out a couple
of fields.

Is it really worth it?

Could we not get rid of that 'init_once' pattern completely, and
replace it with just always using 'kzalloc()' to clear the *whole*
allocation initially?

From what I can tell, all those things are fairly small structures.
Doing a simple 'memset()' is *cheaper* than calling an indirect
function pointer that then messes up the cache by setting just one or
two fields (and has to do a read-for-ownership in order to do so).

Are there cases where the allocations are so big that doing a
kmalloc() and then clearing one field (using an indirect function
pointer) really is worth it?
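
Something like the below is roughly what I have in mind - entirely
untested, just a sketch of the shape (the helper/field names are
assumed from the existing alloc cache code and may not match what is
actually in the tree), not a patch:

/* sketch only: zero fresh allocations instead of an init_once callback */
static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
					      struct io_kiocb *req)
{
	void *obj;

	/* fast path: reuse a cached object, nothing to initialize */
	obj = io_alloc_cache_get(cache);
	if (!obj) {
		/* slow path only: clear the whole (small) struct up front */
		obj = kzalloc(cache->elem_size, GFP_KERNEL);
		if (unlikely(!obj))
			return NULL;
	}
	req->async_data = obj;
	req->flags |= REQ_F_ASYNC_DATA;
	return obj;
}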

Anyway, I left that logic alone, because my hatred for it may run hot
and deep, but the pattern goes beyond just the conflict.

So please tell me why I'm wrong, and please take a look at the
WRITE_ONCE() changes I *did* do, to see if I might be confused there
too.

               Linus
pr-tracker-bot@kernel.org Jan. 21, 2025, 5:30 a.m. UTC | #2
The pull request you sent on Sun, 19 Jan 2025 08:05:26 -0700:

> git://git.kernel.dk/linux.git tags/for-6.14/io_uring-20250119

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/a312e1706ce6c124f04ec85ddece240f3bb2a696

Thank you!
Jens Axboe Jan. 22, 2025, 7:41 p.m. UTC | #3
On 1/20/25 9:38 PM, Linus Torvalds wrote:
> On Sun, 19 Jan 2025 at 07:05, Jens Axboe <axboe@kernel.dk> wrote:
>>
>> Note that this will throw a merge conflict, as there's a conflict
>> between a fix that went into mainline after 6.13-rc4. The
>> io_uring/register.c one is trivial, the io_uring/uring_cmd.c requires a
>> small addition. Here's my resolution [..]
> 
> Ok, so while doing this merge, I absolutely *hate* your resolution in
> both files.

Hah, noted!

> 
> The READ_ONCE/WRITE_ONCE changes during resize operations may not
> actually matter, but the way you wrote things, it does multiple
> "READ_ONCE()" operations. Which is kind of against the whole *point*.
> 
> So in io_uring/register.c, after the loop that copies the old ring contents with
> 
>         for (i = old_head; i < tail; i++) {
> 
> I changed the
> 
>         WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
>         WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));
> 
> to instead just *use* the original READ_ONCE() values, and thus do
> 
>         WRITE_ONCE(n.rings->sq.head, old_head);
>         WRITE_ONCE(n.rings->sq.tail, tail);
> 
> instead (and same for the 'cq' head/tail logic)
> 
> Otherwise, what's the point of reading "once", when you then read again?
> 
> Now, presumably (and hopefully) this doesn't actually matter, and
> nobody should even have access to the old ring when it gets resized,
> but it really bothered me.
> 
> And it's also entirely possible that I have now screwed up royally,
> and I actually messed up. Maybe I just misunderstood the code. But the
> old code really looked nonsensical, and I felt I couldn't leave it
> alone.

I do agree with you that it's nonsensical to use READ_ONCE when the
value is read multiple times, even if it is for documentation purposes.
Even things like the old_head don't matter - yes, userspace can screw
itself if it updates it after the initial read, but it won't cause any
harm. At the same time, if we've read the old_head, then we should just
use that going forward. So I think it all looks fine, thanks for
cleaning that up while merging.

> Now, the other conflict didn't look nonsensical, and I *did* leave it
> alone, but I still do hate it even if I did it as you did. Because I
> hate that pattern.
> 
> There are now three cases of 'use the init_once callback' for
> io_uring_alloc_async_data(), and all of them just clear out a couple
> of fields.
> 
> Is it really worth it?
> 
> Could we not get rid of that 'init_once' pattern completely, and
> replace it with just always using 'kzalloc()' to clear the *whole*
> allocation initially?
> 
> From what I can tell, all those things are fairly small structures.
> Doing a simple 'memset()' is *cheaper* than calling an indirect
> function pointer that then messes up the cache by setting just one or
> two fields (and has to do a read-for-ownership in order to do so).
> 
> Are there cases where the allocations are so big that doing a
> kmalloc() and then clearing one field (using an indirect function
> pointer) really is worth it?
> 
> Anyway, I left that logic alone, because my hatred for it may run hot
> and deep, but the pattern goes beyond just the conflict.

I'll take a look at this and see if we can't clean that up. The fast
path should be cached anyway.

> So please tell me why I'm wrong, and please take a look at the
> WRITE_ONCE() changes I *did* do, to see if I might be confused there
> too.

Looks good to me.

Patch

diff --cc io_uring/register.c
index 371aec87e078,4d507a0390e8..6efbeee734a9
--- a/io_uring/register.c
diff --cc io_uring/uring_cmd.c
index ce7726a04883,d235043db21e..7fb7c0a08996
--- a/io_uring/uring_cmd.c