diff mbox series

io_uring/rsrc: Simplify buffer cloning by locking both rings

Message ID 20250115-uring-clone-refactor-v1-1-b2d951577201@google.com (mailing list archive)
State New
Headers show
Series io_uring/rsrc: Simplify buffer cloning by locking both rings | expand

Commit Message

Jann Horn Jan. 15, 2025, 4:25 p.m. UTC
The locking in the buffer cloning code is somewhat complex because it goes
back and forth between locking the source ring and the destination ring.

Make it easier to reason about by locking both rings at the same time.
To avoid ABBA deadlocks, lock the rings in ascending kernel address order,
just like in lock_two_nondirectories().

Signed-off-by: Jann Horn <jannh@google.com>
---
Just an idea for how I think io_clone_buffers() could be changed so it
becomes slightly easier to reason about.
I left the out_unlock jump label with its current name for now, though
I guess that should probably be adjusted.
---
 io_uring/rsrc.c | 58 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 31 insertions(+), 27 deletions(-)


---
base-commit: c1c03ee7957ec178756cae09c39d77194e8cddb7
change-id: 20250115-uring-clone-refactor-06ddceddeb00

Comments

Jens Axboe Jan. 15, 2025, 5:18 p.m. UTC | #1
On 1/15/25 9:25 AM, Jann Horn wrote:
> The locking in the buffer cloning code is somewhat complex because it goes
> back and forth between locking the source ring and the destination ring.
> 
> Make it easier to reason about by locking both rings at the same time.
> To avoid ABBA deadlocks, lock the rings in ascending kernel address order,
> just like in lock_two_nondirectories().
> 
> Signed-off-by: Jann Horn <jannh@google.com>
> ---
> Just an idea for how I think io_clone_buffers() could be changed so it
> becomes slightly easier to reason about.
> I left the out_unlock jump label with its current name for now, though
> I guess that should probably be adjusted.

Looks pretty clean to me, and does make it easier to reason about. Only
thing that stuck out to me was:

> @@ -1067,7 +1060,18 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
>  	file = io_uring_register_get_file(buf.src_fd, registered_src);
>  	if (IS_ERR(file))
>  		return PTR_ERR(file);
> -	ret = io_clone_buffers(ctx, file->private_data, &buf);
> +	src_ctx = file->private_data;
> +	if (src_ctx == ctx) {
> +		ret = -ELOOP;
> +		goto out_put;
> +	}

which is a change, as previously it would've been legal to do something ala:

struct io_uring ring;
struct iovec vecs[2];

vecs[0] = real_buffer;
vecs[1] = sparse_buffer;

io_uring_register_buffers(&ring, vecs, 2);

io_uring_clone_buffers_offset(&ring, &ring, 1, 0, 1, IORING_REGISTER_DST_REPLACE);

and clone vecs[0] into slot 1. With the patch, that'll return -ELOOP instead.

Maybe something like the below incremental, to just make the unlock +
double lock depending on whether they are different or not? And also
cleaning up the label naming at the same time.


diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 4b030382ad03..a1c7c8db5545 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -938,6 +938,9 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	int i, ret, off, nr;
 	unsigned int nbufs;
 
+	lockdep_assert_held(&ctx->uring_lock);
+	lockdep_assert_held(&src_ctx->uring_lock);
+
 	/*
 	 * Accounting state is shared between the two rings; that only works if
 	 * both rings are accounted towards the same counters.
@@ -979,17 +982,17 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	ret = -ENXIO;
 	nbufs = src_ctx->buf_table.nr;
 	if (!nbufs)
-		goto out_unlock;
+		goto out_free;
 	ret = -EINVAL;
 	if (!arg->nr)
 		arg->nr = nbufs;
 	else if (arg->nr > nbufs)
-		goto out_unlock;
+		goto out_free;
 	ret = -EOVERFLOW;
 	if (check_add_overflow(arg->nr, arg->src_off, &off))
-		goto out_unlock;
+		goto out_free;
 	if (off > nbufs)
-		goto out_unlock;
+		goto out_free;
 
 	off = arg->dst_off;
 	i = arg->src_off;
@@ -1004,7 +1007,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 			if (!dst_node) {
 				ret = -ENOMEM;
-				goto out_unlock;
+				goto out_free;
 			}
 
 			refcount_inc(&src_node->buf->refs);
@@ -1027,11 +1030,11 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	 * copied to a ring that does not have buffers yet (checked at function
 	 * entry).
 	 */
-	WARN_ON(ctx->buf_table.nr);
+	WARN_ON_ONCE(ctx->buf_table.nr);
 	ctx->buf_table = data;
 	return 0;
 
-out_unlock:
+out_free:
 	io_rsrc_data_free(ctx, &data);
 	return ret;
 }
@@ -1064,18 +1067,18 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	file = io_uring_register_get_file(buf.src_fd, registered_src);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
+
 	src_ctx = file->private_data;
-	if (src_ctx == ctx) {
-		ret = -ELOOP;
-		goto out_put;
+	if (src_ctx != ctx) {
+		mutex_unlock(&ctx->uring_lock);
+		lock_two_rings(ctx, src_ctx);
 	}
 
-	mutex_unlock(&ctx->uring_lock);
-	lock_two_rings(ctx, src_ctx);
 	ret = io_clone_buffers(ctx, src_ctx, &buf);
-	mutex_unlock(&src_ctx->uring_lock);
 
-out_put:
+	if (src_ctx != ctx)
+		mutex_unlock(&src_ctx->uring_lock);
+
 	if (!registered_src)
 		fput(file);
 	return ret;
Jann Horn Jan. 15, 2025, 8:20 p.m. UTC | #2
On Wed, Jan 15, 2025 at 6:18 PM Jens Axboe <axboe@kernel.dk> wrote:
> On 1/15/25 9:25 AM, Jann Horn wrote:
> > The locking in the buffer cloning code is somewhat complex because it goes
> > back and forth between locking the source ring and the destination ring.
> >
> > Make it easier to reason about by locking both rings at the same time.
> > To avoid ABBA deadlocks, lock the rings in ascending kernel address order,
> > just like in lock_two_nondirectories().
> >
> > Signed-off-by: Jann Horn <jannh@google.com>
> > ---
> > Just an idea for how I think io_clone_buffers() could be changed so it
> > becomes slightly easier to reason about.
> > I left the out_unlock jump label with its current name for now, though
> > I guess that should probably be adjusted.
>
> Looks pretty clean to me, and does make it easier to reason about. Only
> thing that stuck out to me was:
>
> > @@ -1067,7 +1060,18 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
> >       file = io_uring_register_get_file(buf.src_fd, registered_src);
> >       if (IS_ERR(file))
> >               return PTR_ERR(file);
> > -     ret = io_clone_buffers(ctx, file->private_data, &buf);
> > +     src_ctx = file->private_data;
> > +     if (src_ctx == ctx) {
> > +             ret = -ELOOP;
> > +             goto out_put;
> > +     }
>
> which is a change, as previously it would've been legal to do something ala:
>
> struct io_uring ring;
> struct iovec vecs[2];
>
> vecs[0] = real_buffer;
> vecs[1] = sparse_buffer;
>
> io_uring_register_buffers(&ring, vecs, 2);
>
> io_uring_clone_buffers_offset(&ring, &ring, 1, 0, 1, IORING_REGISTER_DST_REPLACE);
>
> and clone vecs[0] into slot 1. With the patch, that'll return -ELOOP instead.
>
> Maybe something like the below incremental, to just make the unlock +
> double lock depending on whether they are different or not? And also
> cleaning up the label naming at the same time.

Yeah, looks good to me. If nobody else has review feedback, do you
want to fold that in locally? If there's more feedback, I'll fold that
incremental into my v2.
Jens Axboe Jan. 15, 2025, 8:22 p.m. UTC | #3
On 1/15/25 1:20 PM, Jann Horn wrote:
> On Wed, Jan 15, 2025 at 6:18 PM Jens Axboe <axboe@kernel.dk> wrote:
>> On 1/15/25 9:25 AM, Jann Horn wrote:
>>> The locking in the buffer cloning code is somewhat complex because it goes
>>> back and forth between locking the source ring and the destination ring.
>>>
>>> Make it easier to reason about by locking both rings at the same time.
>>> To avoid ABBA deadlocks, lock the rings in ascending kernel address order,
>>> just like in lock_two_nondirectories().
>>>
>>> Signed-off-by: Jann Horn <jannh@google.com>
>>> ---
>>> Just an idea for how I think io_clone_buffers() could be changed so it
>>> becomes slightly easier to reason about.
>>> I left the out_unlock jump label with its current name for now, though
>>> I guess that should probably be adjusted.
>>
>> Looks pretty clean to me, and does make it easier to reason about. Only
>> thing that stuck out to me was:
>>
>>> @@ -1067,7 +1060,18 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
>>>       file = io_uring_register_get_file(buf.src_fd, registered_src);
>>>       if (IS_ERR(file))
>>>               return PTR_ERR(file);
>>> -     ret = io_clone_buffers(ctx, file->private_data, &buf);
>>> +     src_ctx = file->private_data;
>>> +     if (src_ctx == ctx) {
>>> +             ret = -ELOOP;
>>> +             goto out_put;
>>> +     }
>>
>> which is a change, as previously it would've been legal to do something ala:
>>
>> struct io_uring ring;
>> struct iovec vecs[2];
>>
>> vecs[0] = real_buffer;
>> vecs[1] = sparse_buffer;
>>
>> io_uring_register_buffers(&ring, vecs, 2);
>>
>> io_uring_clone_buffers_offset(&ring, &ring, 1, 0, 1, IORING_REGISTER_DST_REPLACE);
>>
>> and clone vecs[0] into slot 1. With the patch, that'll return -ELOOP instead.
>>
>> Maybe something like the below incremental, to just make the unlock +
>> double lock depending on whether they are different or not? And also
>> cleaning up the label naming at the same time.
> 
> Yeah, looks good to me. If nobody else has review feedback, do you
> want to fold that in locally? If there's more feedback, I'll fold that
> incremental into my v2.

If you want to send off a v2, just fold it in. That would be the most
appropriate imho, rather than me modifying your patch :)
diff mbox series

Patch

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 69937d0c94f9518a2fd2cde1e5b2e7139078e1d3..8e26f78230eaaaf7910c1b84a50b40b6ab5fbf16 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -924,6 +924,16 @@  int io_import_fixed(int ddir, struct iov_iter *iter,
 	return 0;
 }
 
+/* Lock two rings at once. The rings must be different! */
+static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
+{
+	if (ctx1 > ctx2)
+		swap(ctx1, ctx2);
+	mutex_lock(&ctx1->uring_lock);
+	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
+}
+
+/* Both rings are locked by the caller. */
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
 			    struct io_uring_clone_buffers *arg)
 {
@@ -938,7 +948,7 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
 		return -EBUSY;
 
-	nbufs = READ_ONCE(src_ctx->buf_table.nr);
+	nbufs = src_ctx->buf_table.nr;
 	if (!arg->nr)
 		arg->nr = nbufs;
 	else if (arg->nr > nbufs)
@@ -962,13 +972,6 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		}
 	}
 
-	/*
-	 * Drop our own lock here. We'll setup the data we need and reference
-	 * the source buffers, then re-grab, check, and assign at the end.
-	 */
-	mutex_unlock(&ctx->uring_lock);
-
-	mutex_lock(&src_ctx->uring_lock);
 	ret = -ENXIO;
 	nbufs = src_ctx->buf_table.nr;
 	if (!nbufs)
@@ -1007,10 +1010,6 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		i++;
 	}
 
-	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
-	mutex_unlock(&src_ctx->uring_lock);
-	mutex_lock(&ctx->uring_lock);
-
 	/*
 	 * If asked for replace, put the old table. data->nodes[] holds both
 	 * old and new nodes at this point.
@@ -1019,24 +1018,17 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		io_rsrc_data_free(ctx, &ctx->buf_table);
 
 	/*
-	 * ctx->buf_table should be empty now - either the contents are being
-	 * replaced and we just freed the table, or someone raced setting up
-	 * a buffer table while the clone was happening. If not empty, fall
-	 * through to failure handling.
+	 * ctx->buf_table must be empty now - either the contents are being
+	 * replaced and we just freed the table, or the contents are being
+	 * copied to a ring that does not have buffers yet (checked at function
+	 * entry).
 	 */
-	if (!ctx->buf_table.nr) {
-		ctx->buf_table = data;
-		return 0;
-	}
+	WARN_ON(ctx->buf_table.nr);
+	ctx->buf_table = data;
+	return 0;
 
-	mutex_unlock(&ctx->uring_lock);
-	mutex_lock(&src_ctx->uring_lock);
-	/* someone raced setting up buffers, dump ours */
-	ret = -EBUSY;
 out_unlock:
 	io_rsrc_data_free(ctx, &data);
-	mutex_unlock(&src_ctx->uring_lock);
-	mutex_lock(&ctx->uring_lock);
 	return ret;
 }
 
@@ -1050,6 +1042,7 @@  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 {
 	struct io_uring_clone_buffers buf;
+	struct io_ring_ctx *src_ctx;
 	bool registered_src;
 	struct file *file;
 	int ret;
@@ -1067,7 +1060,18 @@  int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	file = io_uring_register_get_file(buf.src_fd, registered_src);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
-	ret = io_clone_buffers(ctx, file->private_data, &buf);
+	src_ctx = file->private_data;
+	if (src_ctx == ctx) {
+		ret = -ELOOP;
+		goto out_put;
+	}
+
+	mutex_unlock(&ctx->uring_lock);
+	lock_two_rings(ctx, src_ctx);
+	ret = io_clone_buffers(ctx, src_ctx, &buf);
+	mutex_unlock(&src_ctx->uring_lock);
+
+out_put:
 	if (!registered_src)
 		fput(file);
 	return ret;