diff mbox series

[for-next] io_uring: fix CQE reordering

Message ID ec3bc55687b0768bbe20fb62d7d06cfced7d7e70.1663892031.git.asml.silence@gmail.com (mailing list archive)
State New
Headers show
Series [for-next] io_uring: fix CQE reordering | expand

Commit Message

Pavel Begunkov Sept. 23, 2022, 1:53 p.m. UTC
Overflowing CQEs may result in reordering, which is buggy in case of
links, F_MORE and so on.

Reported-by: Dylan Yudaken <dylany@fb.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/io_uring.c | 12 ++++++++++--
 io_uring/io_uring.h | 12 +++++++++---
 2 files changed, 19 insertions(+), 5 deletions(-)

Comments

Jens Axboe Sept. 23, 2022, 2:19 p.m. UTC | #1
On 9/23/22 7:53 AM, Pavel Begunkov wrote:
> Overflowing CQEs may result in reordeing, which is buggy in case of
> links, F_MORE and so.
> 
> Reported-by: Dylan Yudaken <dylany@fb.com>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  io_uring/io_uring.c | 12 ++++++++++--
>  io_uring/io_uring.h | 12 +++++++++---
>  2 files changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index f359e24b46c3..62d1f55fde55 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -609,7 +609,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>  
>  	io_cq_lock(ctx);
>  	while (!list_empty(&ctx->cq_overflow_list)) {
> -		struct io_uring_cqe *cqe = io_get_cqe(ctx);
> +		struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
>  		struct io_overflow_cqe *ocqe;
>  
>  		if (!cqe && !force)
> @@ -736,12 +736,19 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
>   * control dependency is enough as we're using WRITE_ONCE to
>   * fill the cq entry
>   */
> -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
> +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
>  {
>  	struct io_rings *rings = ctx->rings;
>  	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
>  	unsigned int free, queued, len;
>  
> +	/*
> +	 * Posting into the CQ when there are pending overflowed CQEs may break
> +	 * ordering guarantees, which will affect links, F_MORE users and more.
> +	 * Force overflow the completion.
> +	 */
> +	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
> +		return NULL;

Rather than pass this bool around for the hot path, why not add a helper
for the case where 'overflow' isn't known? That can leave the regular
io_get_cqe() avoiding this altogether.
Pavel Begunkov Sept. 23, 2022, 2:26 p.m. UTC | #2
On 9/23/22 15:19, Jens Axboe wrote:
> On 9/23/22 7:53 AM, Pavel Begunkov wrote:
>> Overflowing CQEs may result in reordeing, which is buggy in case of
>> links, F_MORE and so.
>>
>> Reported-by: Dylan Yudaken <dylany@fb.com>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>>   io_uring/io_uring.c | 12 ++++++++++--
>>   io_uring/io_uring.h | 12 +++++++++---
>>   2 files changed, 19 insertions(+), 5 deletions(-)
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index f359e24b46c3..62d1f55fde55 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -609,7 +609,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>   
>>   	io_cq_lock(ctx);
>>   	while (!list_empty(&ctx->cq_overflow_list)) {
>> -		struct io_uring_cqe *cqe = io_get_cqe(ctx);
>> +		struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
>>   		struct io_overflow_cqe *ocqe;
>>   
>>   		if (!cqe && !force)
>> @@ -736,12 +736,19 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
>>    * control dependency is enough as we're using WRITE_ONCE to
>>    * fill the cq entry
>>    */
>> -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
>> +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
>>   {
>>   	struct io_rings *rings = ctx->rings;
>>   	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
>>   	unsigned int free, queued, len;
>>   
>> +	/*
>> +	 * Posting into the CQ when there are pending overflowed CQEs may break
>> +	 * ordering guarantees, which will affect links, F_MORE users and more.
>> +	 * Force overflow the completion.
>> +	 */
>> +	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
>> +		return NULL;
> 
> Rather than pass this bool around for the hot path, why not add a helper
> for the case where 'overflow' isn't known? That can leave the regular
> io_get_cqe() avoiding this altogether.

Was choosing from two ugly-ish solutions, but io_get_cqe() should be
inline and shouldn't really matter, but that's only the case in theory
though. If someone cleans up the CQE32 part and puts it into a separate
non-inline function, it'll be actually inlined.
Dylan Yudaken Sept. 23, 2022, 2:32 p.m. UTC | #3
On Fri, 2022-09-23 at 14:53 +0100, Pavel Begunkov wrote:
> Overflowing CQEs may result in reordeing, which is buggy in case of
> links, F_MORE and so.
> 

Maybe the commit message got cut off?


I think this is probably ok, the downside being that CQEs with no
ordering constraints will have ordering forced on them. An alternative
would be for each case (e.g. linked, zerocopy, multishot) to either pause
or force CQEs to be overflow ones. This wouldn't slow down the other
codepaths. I don't have an idea of how difficult this might be.

By the way, if you do go with this approach then I believe you can
revert these patches:
a2da676376fe ("io_uring: fix multishot poll on overflow")
cbd25748545c ("io_uring: fix multishot accept ordering")

and do something similar for multishot receive.


Dylan
Jens Axboe Sept. 23, 2022, 2:34 p.m. UTC | #4
On 9/23/22 8:32 AM, Dylan Yudaken wrote:
> On Fri, 2022-09-23 at 14:53 +0100, Pavel Begunkov wrote:
>> Overflowing CQEs may result in reordeing, which is buggy in case of
>> links, F_MORE and so.
>>
> 
> Maybe the commit message got cut off?
> 
> 
> I think this is probably ok, the downside being that CQE's with no
> ordering constraints will have ordering forced on them. An alternative
> would be for each case (eg linked, zerocopy, multishot) to either pause
> or force CQE's to be overflow ones. This wouldnt slow down the other
> codepaths. I don't have an idea for how difficult this might be.

I don't think this matters at all. If you hit overflow, things are
screwed and slow anyway. Doesn't make sense to optimize for that path,
so we may as well impose ordering for everything at that point.
Jens Axboe Sept. 23, 2022, 2:35 p.m. UTC | #5
On 9/23/22 8:26 AM, Pavel Begunkov wrote:
> On 9/23/22 15:19, Jens Axboe wrote:
>> On 9/23/22 7:53 AM, Pavel Begunkov wrote:
>>> Overflowing CQEs may result in reordeing, which is buggy in case of
>>> links, F_MORE and so.
>>>
>>> Reported-by: Dylan Yudaken <dylany@fb.com>
>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>> ---
>>>   io_uring/io_uring.c | 12 ++++++++++--
>>>   io_uring/io_uring.h | 12 +++++++++---
>>>   2 files changed, 19 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>> index f359e24b46c3..62d1f55fde55 100644
>>> --- a/io_uring/io_uring.c
>>> +++ b/io_uring/io_uring.c
>>> @@ -609,7 +609,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>>         io_cq_lock(ctx);
>>>       while (!list_empty(&ctx->cq_overflow_list)) {
>>> -        struct io_uring_cqe *cqe = io_get_cqe(ctx);
>>> +        struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
>>>           struct io_overflow_cqe *ocqe;
>>>             if (!cqe && !force)
>>> @@ -736,12 +736,19 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
>>>    * control dependency is enough as we're using WRITE_ONCE to
>>>    * fill the cq entry
>>>    */
>>> -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
>>> +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
>>>   {
>>>       struct io_rings *rings = ctx->rings;
>>>       unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
>>>       unsigned int free, queued, len;
>>>   +    /*
>>> +     * Posting into the CQ when there are pending overflowed CQEs may break
>>> +     * ordering guarantees, which will affect links, F_MORE users and more.
>>> +     * Force overflow the completion.
>>> +     */
>>> +    if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
>>> +        return NULL;
>>
>> Rather than pass this bool around for the hot path, why not add a helper
>> for the case where 'overflow' isn't known? That can leave the regular
>> io_get_cqe() avoiding this altogether.
> 
> Was choosing from two ugly-ish solutions, but io_get_cqe() should be
> inline and shouldn't really matter, but that's only the case in theory
> though. If someone cleans up the CQE32 part and puts it into a separate
> non-inline function, it'll be actually inlined.

Yes, in theory the current one will be fine as it's known at compile
time. In theory... Didn't check if practice agrees with that, would
prefer if we didn't leave this to the compiler. Fiddling some other
bits, will check in a bit if I have a better idea.
Pavel Begunkov Sept. 23, 2022, 2:43 p.m. UTC | #6
On 9/23/22 15:35, Jens Axboe wrote:
> On 9/23/22 8:26 AM, Pavel Begunkov wrote:
>> On 9/23/22 15:19, Jens Axboe wrote:
>>> On 9/23/22 7:53 AM, Pavel Begunkov wrote:
>>>> Overflowing CQEs may result in reordeing, which is buggy in case of
>>>> links, F_MORE and so.
>>>>
>>>> Reported-by: Dylan Yudaken <dylany@fb.com>
>>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>> ---
>>>>    io_uring/io_uring.c | 12 ++++++++++--
>>>>    io_uring/io_uring.h | 12 +++++++++---
>>>>    2 files changed, 19 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>> index f359e24b46c3..62d1f55fde55 100644
>>>> --- a/io_uring/io_uring.c
>>>> +++ b/io_uring/io_uring.c
>>>> @@ -609,7 +609,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>>>          io_cq_lock(ctx);
>>>>        while (!list_empty(&ctx->cq_overflow_list)) {
>>>> -        struct io_uring_cqe *cqe = io_get_cqe(ctx);
>>>> +        struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
>>>>            struct io_overflow_cqe *ocqe;
>>>>              if (!cqe && !force)
>>>> @@ -736,12 +736,19 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
>>>>     * control dependency is enough as we're using WRITE_ONCE to
>>>>     * fill the cq entry
>>>>     */
>>>> -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
>>>> +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
>>>>    {
>>>>        struct io_rings *rings = ctx->rings;
>>>>        unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
>>>>        unsigned int free, queued, len;
>>>>    +    /*
>>>> +     * Posting into the CQ when there are pending overflowed CQEs may break
>>>> +     * ordering guarantees, which will affect links, F_MORE users and more.
>>>> +     * Force overflow the completion.
>>>> +     */
>>>> +    if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
>>>> +        return NULL;
>>>
>>> Rather than pass this bool around for the hot path, why not add a helper
>>> for the case where 'overflow' isn't known? That can leave the regular
>>> io_get_cqe() avoiding this altogether.
>>
>> Was choosing from two ugly-ish solutions, but io_get_cqe() should be
>> inline and shouldn't really matter, but that's only the case in theory
>> though. If someone cleans up the CQE32 part and puts it into a separate
>> non-inline function, it'll be actually inlined.
> 
> Yes, in theory the current one will be fine as it's known at compile
> time. In theory... Didn't check if practice agrees with that, would
> prefer if we didn't leave this to the compiler. Fiddling some other
> bits, will check in a bit if I have a better idea.

Once inlined, constants are propagated to the point where they're needed;
no sane compiler will do otherwise, as that's one of the most basic
optimisations. I don't think it's sane not to rely on that.
Jens Axboe Sept. 23, 2022, 2:51 p.m. UTC | #7
On 9/23/22 8:43 AM, Pavel Begunkov wrote:
> On 9/23/22 15:35, Jens Axboe wrote:
>> On 9/23/22 8:26 AM, Pavel Begunkov wrote:
>>> On 9/23/22 15:19, Jens Axboe wrote:
>>>> On 9/23/22 7:53 AM, Pavel Begunkov wrote:
>>>>> Overflowing CQEs may result in reordeing, which is buggy in case of
>>>>> links, F_MORE and so.
>>>>>
>>>>> Reported-by: Dylan Yudaken <dylany@fb.com>
>>>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>>> ---
>>>>>    io_uring/io_uring.c | 12 ++++++++++--
>>>>>    io_uring/io_uring.h | 12 +++++++++---
>>>>>    2 files changed, 19 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>>> index f359e24b46c3..62d1f55fde55 100644
>>>>> --- a/io_uring/io_uring.c
>>>>> +++ b/io_uring/io_uring.c
>>>>> @@ -609,7 +609,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>>>>          io_cq_lock(ctx);
>>>>>        while (!list_empty(&ctx->cq_overflow_list)) {
>>>>> -        struct io_uring_cqe *cqe = io_get_cqe(ctx);
>>>>> +        struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
>>>>>            struct io_overflow_cqe *ocqe;
>>>>>              if (!cqe && !force)
>>>>> @@ -736,12 +736,19 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
>>>>>     * control dependency is enough as we're using WRITE_ONCE to
>>>>>     * fill the cq entry
>>>>>     */
>>>>> -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
>>>>> +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
>>>>>    {
>>>>>        struct io_rings *rings = ctx->rings;
>>>>>        unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
>>>>>        unsigned int free, queued, len;
>>>>>    +    /*
>>>>> +     * Posting into the CQ when there are pending overflowed CQEs may break
>>>>> +     * ordering guarantees, which will affect links, F_MORE users and more.
>>>>> +     * Force overflow the completion.
>>>>> +     */
>>>>> +    if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
>>>>> +        return NULL;
>>>>
>>>> Rather than pass this bool around for the hot path, why not add a helper
>>>> for the case where 'overflow' isn't known? That can leave the regular
>>>> io_get_cqe() avoiding this altogether.
>>>
>>> Was choosing from two ugly-ish solutions, but io_get_cqe() should be
>>> inline and shouldn't really matter, but that's only the case in theory
>>> though. If someone cleans up the CQE32 part and puts it into a separate
>>> non-inline function, it'll be actually inlined.
>>
>> Yes, in theory the current one will be fine as it's known at compile
>> time. In theory... Didn't check if practice agrees with that, would
>> prefer if we didn't leave this to the compiler. Fiddling some other
>> bits, will check in a bit if I have a better idea.
> 
> When inline constants are propagated to the moment they're needed,
> no sane compiler will do otherwise, that's one of the most basic
> optimisations. Don't think it's sane not relying on that.

Yeah it's probably fine as-is, I'd expect it to as well for sure.

-- 
Jens Axboe
Jens Axboe Sept. 23, 2022, 9:05 p.m. UTC | #8
On Fri, 23 Sep 2022 14:53:25 +0100, Pavel Begunkov wrote:
> Overflowing CQEs may result in reordeing, which is buggy in case of
> links, F_MORE and so.
> 
> 

Applied, thanks!

[1/1] io_uring: fix CQE reordering
      commit: aa1df3a360a0c50e0f0086a785d75c2785c29967

Best regards,
diff mbox series

Patch

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f359e24b46c3..62d1f55fde55 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -609,7 +609,7 @@  static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
-		struct io_uring_cqe *cqe = io_get_cqe(ctx);
+		struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
 		struct io_overflow_cqe *ocqe;
 
 		if (!cqe && !force)
@@ -736,12 +736,19 @@  bool io_req_cqe_overflow(struct io_kiocb *req)
  * control dependency is enough as we're using WRITE_ONCE to
  * fill the cq entry
  */
-struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
+struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
 	unsigned int free, queued, len;
 
+	/*
+	 * Posting into the CQ when there are pending overflowed CQEs may break
+	 * ordering guarantees, which will affect links, F_MORE users and more.
+	 * Force overflow the completion.
+	 */
+	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
+		return NULL;
 
 	/* userspace may cheat modifying the tail, be safe and do min */
 	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
@@ -2394,6 +2401,7 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		if (ret < 0)
 			return ret;
 		io_cqring_overflow_flush(ctx);
+
 		if (io_cqring_events(ctx) >= min_events)
 			return 0;
 	} while (ret > 0);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index d38173b9ac19..177bd55357d7 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -24,7 +24,7 @@  enum {
 	IOU_STOP_MULTISHOT	= -ECANCELED,
 };
 
-struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
+struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow);
 bool io_req_cqe_overflow(struct io_kiocb *req);
 int io_run_task_work_sig(struct io_ring_ctx *ctx);
 int __io_run_local_work(struct io_ring_ctx *ctx, bool locked);
@@ -93,7 +93,8 @@  static inline void io_cq_lock(struct io_ring_ctx *ctx)
 
 void io_cq_unlock_post(struct io_ring_ctx *ctx);
 
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
+						       bool overflow)
 {
 	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
 		struct io_uring_cqe *cqe = ctx->cqe_cached;
@@ -105,7 +106,12 @@  static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 		return cqe;
 	}
 
-	return __io_get_cqe(ctx);
+	return __io_get_cqe(ctx, overflow);
+}
+
+static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+{
+	return io_get_cqe_overflow(ctx, false);
 }
 
 static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,