diff mbox series

[RFC,v4,12/15] io_uring/cmd: let cmds to know about dying task

Message ID 20241016-fuse-uring-for-6-10-rfc4-v4-12-9739c753666e@ddn.com (mailing list archive)
State New
Headers show
Series fuse: fuse-over-io-uring | expand

Commit Message

Bernd Schubert Oct. 16, 2024, 12:05 a.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

When the taks that submitted a request is dying, a task work for that
request might get run by a kernel thread or even worse by a half
dismantled task. We can't just cancel the task work without running the
callback as the cmd might need to do some clean up, so pass a flag
instead. If set, it's not safe to access any task resources and the
callback is expected to cancel the cmd ASAP.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/io_uring_types.h | 1 +
 io_uring/uring_cmd.c           | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

Comments

Pavel Begunkov Nov. 4, 2024, 12:28 a.m. UTC | #1
On 10/16/24 01:05, Bernd Schubert wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>
> 
> When the taks that submitted a request is dying, a task work for that
> request might get run by a kernel thread or even worse by a half
> dismantled task. We can't just cancel the task work without running the
> callback as the cmd might need to do some clean up, so pass a flag
> instead. If set, it's not safe to access any task resources and the
> callback is expected to cancel the cmd ASAP.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>   include/linux/io_uring_types.h | 1 +
>   io_uring/uring_cmd.c           | 6 +++++-
>   2 files changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 7abdc09271245ff7de3fb9a905ca78b7561e37eb..869a81c63e4970576155043fce7fe656293d7f58 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -37,6 +37,7 @@ enum io_uring_cmd_flags {
>   	/* set when uring wants to cancel a previously issued command */
>   	IO_URING_F_CANCEL		= (1 << 11),
>   	IO_URING_F_COMPAT		= (1 << 12),
> +	IO_URING_F_TASK_DEAD		= (1 << 13),
>   };
>   
>   struct io_wq_work_node {
> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> index 21ac5fb2d5f087e1174d5c94815d580972db6e3f..82c6001cc0696bbcbebb92153e1461f2a9aeebc3 100644
> --- a/io_uring/uring_cmd.c
> +++ b/io_uring/uring_cmd.c
> @@ -119,9 +119,13 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
>   static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
>   {
>   	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> +	unsigned int flags = IO_URING_F_COMPLETE_DEFER;
> +
> +	if (req->task != current)
> +		flags |= IO_URING_F_TASK_DEAD;

Bernd, please don't change patches under my name without any
notice. This check is wrong, just stick to the original

https://lore.kernel.org/io-uring/d2528a1c-3d7c-4124-953c-02e8e415529e@gmail.com/

In general if you need to change something, either stick your
name, so that I know it might be a derivative, or reflect it in
the commit message, e.g.

Signed-off-by: initial author
[Person 2: changed this and that]
Signed-off-by: person 2

Also, a quick note that btrfs also need the patch, so it'll likely
get queued via either io_uring or btrfs trees for next.

>   	/* task_work executor checks the deffered list completion */
> -	ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER);
> +	ioucmd->task_work_cb(ioucmd, flags);
>   }
>   
>   void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
>
Bernd Schubert Nov. 4, 2024, 10:15 p.m. UTC | #2
On 11/4/24 01:28, Pavel Begunkov wrote:
> On 10/16/24 01:05, Bernd Schubert wrote:
>> From: Pavel Begunkov <asml.silence@gmail.com>
>>
>> When the taks that submitted a request is dying, a task work for that
>> request might get run by a kernel thread or even worse by a half
>> dismantled task. We can't just cancel the task work without running the
>> callback as the cmd might need to do some clean up, so pass a flag
>> instead. If set, it's not safe to access any task resources and the
>> callback is expected to cancel the cmd ASAP.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>>   include/linux/io_uring_types.h | 1 +
>>   io_uring/uring_cmd.c           | 6 +++++-
>>   2 files changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/io_uring_types.h
>> b/include/linux/io_uring_types.h
>> index
>> 7abdc09271245ff7de3fb9a905ca78b7561e37eb..869a81c63e4970576155043fce7fe656293d7f58 100644
>> --- a/include/linux/io_uring_types.h
>> +++ b/include/linux/io_uring_types.h
>> @@ -37,6 +37,7 @@ enum io_uring_cmd_flags {
>>       /* set when uring wants to cancel a previously issued command */
>>       IO_URING_F_CANCEL        = (1 << 11),
>>       IO_URING_F_COMPAT        = (1 << 12),
>> +    IO_URING_F_TASK_DEAD        = (1 << 13),
>>   };
>>     struct io_wq_work_node {
>> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
>> index
>> 21ac5fb2d5f087e1174d5c94815d580972db6e3f..82c6001cc0696bbcbebb92153e1461f2a9aeebc3 100644
>> --- a/io_uring/uring_cmd.c
>> +++ b/io_uring/uring_cmd.c
>> @@ -119,9 +119,13 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
>>   static void io_uring_cmd_work(struct io_kiocb *req, struct
>> io_tw_state *ts)
>>   {
>>       struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct
>> io_uring_cmd);
>> +    unsigned int flags = IO_URING_F_COMPLETE_DEFER;
>> +
>> +    if (req->task != current)
>> +        flags |= IO_URING_F_TASK_DEAD;
> 
> Bernd, please don't change patches under my name without any
> notice. This check is wrong, just stick to the original
> 
> https://lore.kernel.org/io-uring/d2528a1c-3d7c-4124-953c-02e8e415529e@gmail.com/
> 
> In general if you need to change something, either stick your
> name, so that I know it might be a derivative, or reflect it in
> the commit message, e.g.
> 
> Signed-off-by: initial author
> [Person 2: changed this and that]
> Signed-off-by: person 2

Oh sorry, for sure. I totally forgot to update the commit message.

Somehow the initial version didn't trigger. I need to double check to
see if there wasn't a testing issue on my side - going to check tomorrow.


> 
> Also, a quick note that btrfs also need the patch, so it'll likely
> get queued via either io_uring or btrfs trees for next.

Thanks, good to know, one patch less to carry :)



Thanks,
Bernd
Pavel Begunkov Nov. 5, 2024, 1:08 a.m. UTC | #3
On 11/4/24 22:15, Bernd Schubert wrote:
> On 11/4/24 01:28, Pavel Begunkov wrote:
...
>> In general if you need to change something, either stick your
>> name, so that I know it might be a derivative, or reflect it in
>> the commit message, e.g.
>>
>> Signed-off-by: initial author
>> [Person 2: changed this and that]
>> Signed-off-by: person 2
> 
> Oh sorry, for sure. I totally forgot to update the commit message.
> 
> Somehow the initial version didn't trigger. I need to double check to

"Didn't trigger" like in "kernel was still crashing"?

FWIW, the original version is how it's handled in several places
across io_uring, and the difference is a gap for !DEFER_TASKRUN
when a task_work is queued somewhere in between when a task is
started going through exit() but haven't got PF_EXITING set yet.
IOW, should be harder to hit.
Bernd Schubert Nov. 5, 2024, 11:02 p.m. UTC | #4
On 11/5/24 02:08, Pavel Begunkov wrote:
> On 11/4/24 22:15, Bernd Schubert wrote:
>> On 11/4/24 01:28, Pavel Begunkov wrote:
> ...
>>> In general if you need to change something, either stick your
>>> name, so that I know it might be a derivative, or reflect it in
>>> the commit message, e.g.
>>>
>>> Signed-off-by: initial author
>>> [Person 2: changed this and that]
>>> Signed-off-by: person 2
>>
>> Oh sorry, for sure. I totally forgot to update the commit message.
>>
>> Somehow the initial version didn't trigger. I need to double check to
> 
> "Didn't trigger" like in "kernel was still crashing"?

My initial problem was a crash in iov_iter_get_pages2() on process
kill. And when I tested your initial patch IO_URING_F_TASK_DEAD didn't
get set. Jens then asked to test with the version that I have in my
branch and that worked fine. Although in the mean time I wonder if
I made test mistake (like just fuse.ko reload instead of reboot with
new kernel). Just fixed a couple of issues in my branch (basically
ready for the next version send), will test the initial patch
again as first thing in the morning.


> 
> FWIW, the original version is how it's handled in several places
> across io_uring, and the difference is a gap for !DEFER_TASKRUN
> when a task_work is queued somewhere in between when a task is
> started going through exit() but haven't got PF_EXITING set yet.
> IOW, should be harder to hit.
> 

Does that mean that the test for PF_EXITING is racy and we cannot
entirely rely on it?


Thanks,
Bernd
Pavel Begunkov Nov. 6, 2024, 12:14 a.m. UTC | #5
On 11/5/24 23:02, Bernd Schubert wrote:
> On 11/5/24 02:08, Pavel Begunkov wrote:
>> On 11/4/24 22:15, Bernd Schubert wrote:
>>> On 11/4/24 01:28, Pavel Begunkov wrote:
>> ...
>>>> In general if you need to change something, either stick your
>>>> name, so that I know it might be a derivative, or reflect it in
>>>> the commit message, e.g.
>>>>
>>>> Signed-off-by: initial author
>>>> [Person 2: changed this and that]
>>>> Signed-off-by: person 2
>>>
>>> Oh sorry, for sure. I totally forgot to update the commit message.
>>>
>>> Somehow the initial version didn't trigger. I need to double check to
>>
>> "Didn't trigger" like in "kernel was still crashing"?
> 
> My initial problem was a crash in iov_iter_get_pages2() on process
> kill. And when I tested your initial patch IO_URING_F_TASK_DEAD didn't
> get set. Jens then asked to test with the version that I have in my
> branch and that worked fine. Although in the mean time I wonder if
> I made test mistake (like just fuse.ko reload instead of reboot with
> new kernel). Just fixed a couple of issues in my branch (basically
> ready for the next version send), will test the initial patch
> again as first thing in the morning.

I see. Please let know if it doesn't work, it's not specific
to fuse, if there is a problem it'd also affects other core
io_uring parts.

>> FWIW, the original version is how it's handled in several places
>> across io_uring, and the difference is a gap for !DEFER_TASKRUN
>> when a task_work is queued somewhere in between when a task is
>> started going through exit() but haven't got PF_EXITING set yet.
>> IOW, should be harder to hit.
>>
> 
> Does that mean that the test for PF_EXITING is racy and we cannot
> entirely rely on it?

No, the PF_EXITING check was fine, even though it'll look
different now for unrelated reasons. What I'm saying is that the
callback can get executed from the desired task, i.e.
req->task == current, but it can happen from a late exit(2)/etc.
path where the task is botched and likely doesn't have ->mm.
Ming Lei Nov. 6, 2024, 4:44 a.m. UTC | #6
On Wed, Nov 6, 2024 at 7:02 AM Bernd Schubert <bschubert@ddn.com> wrote:
>
>
>
> On 11/5/24 02:08, Pavel Begunkov wrote:
> > On 11/4/24 22:15, Bernd Schubert wrote:
> >> On 11/4/24 01:28, Pavel Begunkov wrote:
> > ...
> >>> In general if you need to change something, either stick your
> >>> name, so that I know it might be a derivative, or reflect it in
> >>> the commit message, e.g.
> >>>
> >>> Signed-off-by: initial author
> >>> [Person 2: changed this and that]
> >>> Signed-off-by: person 2
> >>
> >> Oh sorry, for sure. I totally forgot to update the commit message.
> >>
> >> Somehow the initial version didn't trigger. I need to double check to
> >
> > "Didn't trigger" like in "kernel was still crashing"?
>
> My initial problem was a crash in iov_iter_get_pages2() on process
> kill. And when I tested your initial patch IO_URING_F_TASK_DEAD didn't
> get set. Jens then asked to test with the version that I have in my
> branch and that worked fine. Although in the mean time I wonder if
> I made test mistake (like just fuse.ko reload instead of reboot with
> new kernel). Just fixed a couple of issues in my branch (basically
> ready for the next version send), will test the initial patch
> again as first thing in the morning.
>
>
> >
> > FWIW, the original version is how it's handled in several places
> > across io_uring, and the difference is a gap for !DEFER_TASKRUN
> > when a task_work is queued somewhere in between when a task is
> > started going through exit() but haven't got PF_EXITING set yet.
> > IOW, should be harder to hit.
> >
>
> Does that mean that the test for PF_EXITING is racy and we cannot
> entirely rely on it?

Another solution is to mark uring_cmd as io_uring_cmd_mark_cancelable(),
which provides a chance to cancel cmd in the current context.

Thanks,
Bernd Schubert Nov. 6, 2024, 7:28 p.m. UTC | #7
On 11/6/24 01:14, Pavel Begunkov wrote:
> On 11/5/24 23:02, Bernd Schubert wrote:
>> On 11/5/24 02:08, Pavel Begunkov wrote:
>>> On 11/4/24 22:15, Bernd Schubert wrote:
>>>> On 11/4/24 01:28, Pavel Begunkov wrote:
>>> ...
>>>>> In general if you need to change something, either stick your
>>>>> name, so that I know it might be a derivative, or reflect it in
>>>>> the commit message, e.g.
>>>>>
>>>>> Signed-off-by: initial author
>>>>> [Person 2: changed this and that]
>>>>> Signed-off-by: person 2
>>>>
>>>> Oh sorry, for sure. I totally forgot to update the commit message.
>>>>
>>>> Somehow the initial version didn't trigger. I need to double check to
>>>
>>> "Didn't trigger" like in "kernel was still crashing"?
>>
>> My initial problem was a crash in iov_iter_get_pages2() on process
>> kill. And when I tested your initial patch IO_URING_F_TASK_DEAD didn't
>> get set. Jens then asked to test with the version that I have in my
>> branch and that worked fine. Although in the mean time I wonder if
>> I made test mistake (like just fuse.ko reload instead of reboot with
>> new kernel). Just fixed a couple of issues in my branch (basically
>> ready for the next version send), will test the initial patch
>> again as first thing in the morning.
> 
> I see. Please let know if it doesn't work, it's not specific
> to fuse, if there is a problem it'd also affects other core
> io_uring parts.

In my current branch getting that situation is rather hard, but 
eventually got IO_URING_F_TASK_DEAD (>40 test rounds vs. every time
before...) - with the initial patch version - I think my testing was
flawed back that time.


> 
>>> FWIW, the original version is how it's handled in several places
>>> across io_uring, and the difference is a gap for !DEFER_TASKRUN
>>> when a task_work is queued somewhere in between when a task is
>>> started going through exit() but haven't got PF_EXITING set yet.
>>> IOW, should be harder to hit.
>>>
>>
>> Does that mean that the test for PF_EXITING is racy and we cannot
>> entirely rely on it?
> 
> No, the PF_EXITING check was fine, even though it'll look
> different now for unrelated reasons. What I'm saying is that the
> callback can get executed from the desired task, i.e.
> req->task == current, but it can happen from a late exit(2)/etc.
> path where the task is botched and likely doesn't have ->mm.
> 

Ah ok, thanks for the info!



Thanks,
Bernd
Bernd Schubert Nov. 6, 2024, 7:34 p.m. UTC | #8
On 11/6/24 05:44, Ming Lei wrote:
> On Wed, Nov 6, 2024 at 7:02 AM Bernd Schubert <bschubert@ddn.com> wrote:
>>
>>
>>
>> On 11/5/24 02:08, Pavel Begunkov wrote:
>>>
>>> FWIW, the original version is how it's handled in several places
>>> across io_uring, and the difference is a gap for !DEFER_TASKRUN
>>> when a task_work is queued somewhere in between when a task is
>>> started going through exit() but haven't got PF_EXITING set yet.
>>> IOW, should be harder to hit.
>>>
>>
>> Does that mean that the test for PF_EXITING is racy and we cannot
>> entirely rely on it?
> 
> Another solution is to mark uring_cmd as io_uring_cmd_mark_cancelable(),
> which provides a chance to cancel cmd in the current context.

Yeah, I have that, see 
[PATCH RFC v4 14/15] fuse: {io-uring} Prevent mount point hang on fuse-server termination

As I just wrote to Pavel, getting IO_URING_F_TASK_DEAD is rather hard
in my current branch.IO_URING_F_CANCEL didn't make a difference ,
I had especially tried to disable it - still neither 
IO_URING_F_TASK_DEAD nor the crash got easily triggered. So I 
reenabled IO_URING_F_CANCEL and then eventually
got IO_URING_F_TASK_DEAD - i.e. without checking the underlying code,
looks like we need both for safety measures.


Thanks,
Bernd
Pavel Begunkov Nov. 7, 2024, 4:11 p.m. UTC | #9
On 11/6/24 19:34, Bernd Schubert wrote:
> On 11/6/24 05:44, Ming Lei wrote:
>> On Wed, Nov 6, 2024 at 7:02 AM Bernd Schubert <bschubert@ddn.com> wrote:
>>>
>>>
>>>
>>> On 11/5/24 02:08, Pavel Begunkov wrote:
>>>>
>>>> FWIW, the original version is how it's handled in several places
>>>> across io_uring, and the difference is a gap for !DEFER_TASKRUN
>>>> when a task_work is queued somewhere in between when a task is
>>>> started going through exit() but haven't got PF_EXITING set yet.
>>>> IOW, should be harder to hit.
>>>>
>>>
>>> Does that mean that the test for PF_EXITING is racy and we cannot
>>> entirely rely on it?
>>
>> Another solution is to mark uring_cmd as io_uring_cmd_mark_cancelable(),
>> which provides a chance to cancel cmd in the current context.

In short, F_CANCEL not going to help, unfortunately.

The F_CANCEL path can and likely to be triggered from a kthread instead
of the original task. See call sites of io_uring_try_cancel_requests(),
where the task termination/exit path, i.e. io_uring_cancel_generic(), in
most cases will skip the call bc of the tctx_inflight() check.

Also, io_uring doesn't try to cancel queued task_work (the callback
is supposed to check if it need to fail the request), so if someone
queued up a task_work including via __io_uring_cmd_do_in_task() and
friends, even F_CANCEL won't be able to cancel it.


> Yeah, I have that, see
> [PATCH RFC v4 14/15] fuse: {io-uring} Prevent mount point hang on fuse-server termination
> 
> As I just wrote to Pavel, getting IO_URING_F_TASK_DEAD is rather hard
> in my current branch.IO_URING_F_CANCEL didn't make a difference ,
> I had especially tried to disable it - still neither
> IO_URING_F_TASK_DEAD nor the crash got easily triggered. So I
> reenabled IO_URING_F_CANCEL and then eventually
> got IO_URING_F_TASK_DEAD - i.e. without checking the underlying code,
> looks like we need both for safety measures.
diff mbox series

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 7abdc09271245ff7de3fb9a905ca78b7561e37eb..869a81c63e4970576155043fce7fe656293d7f58 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -37,6 +37,7 @@  enum io_uring_cmd_flags {
 	/* set when uring wants to cancel a previously issued command */
 	IO_URING_F_CANCEL		= (1 << 11),
 	IO_URING_F_COMPAT		= (1 << 12),
+	IO_URING_F_TASK_DEAD		= (1 << 13),
 };
 
 struct io_wq_work_node {
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 21ac5fb2d5f087e1174d5c94815d580972db6e3f..82c6001cc0696bbcbebb92153e1461f2a9aeebc3 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -119,9 +119,13 @@  EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
 static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+	unsigned int flags = IO_URING_F_COMPLETE_DEFER;
+
+	if (req->task != current)
+		flags |= IO_URING_F_TASK_DEAD;
 
 	/* task_work executor checks the deffered list completion */
-	ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER);
+	ioucmd->task_work_cb(ioucmd, flags);
 }
 
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,