diff mbox series

[2/5] io_uring: mark exit side kworkers as task_work capable

Message ID 20240604191314.454554-3-axboe@kernel.dk (mailing list archive)
State New
Headers show
Series Wait on cancelations at release time | expand

Commit Message

Jens Axboe June 4, 2024, 7:01 p.m. UTC
There are two types of work here:

1) Fallback work, if the task is exiting
2) The exit side cancelations

and both of them may do the final fput() of a file. When this happens,
fput() will schedule delayed work. This slows down exits when io_uring
needs to wait for that work to finish. It is possible to flush this via
flush_delayed_fput(), but that's a big hammer as other unrelated files
could be involved, and from other tasks as well.

Add two io_uring helpers to temporarily clear PF_NO_TASKWORK for the
worker threads, and run any queued task_work before setting the flag
again. Then we can ensure we only flush related items that received
their final fput as part of work cancelation and flushing.

For now these are io_uring private, but could obviously be made
generically available, should there be a need to do so.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

Comments

Pavel Begunkov June 5, 2024, 3:01 p.m. UTC | #1
On 6/4/24 20:01, Jens Axboe wrote:
> There are two types of work here:
> 
> 1) Fallback work, if the task is exiting
> 2) The exit side cancelations
> 
> and both of them may do the final fput() of a file. When this happens,
> fput() will schedule delayed work. This slows down exits when io_uring
> needs to wait for that work to finish. It is possible to flush this via
> flush_delayed_fput(), but that's a big hammer as other unrelated files
> could be involved, and from other tasks as well.
> 
> Add two io_uring helpers to temporarily clear PF_NO_TASKWORK for the
> worker threads, and run any queued task_work before setting the flag
> again. Then we can ensure we only flush related items that received
> their final fput as part of work cancelation and flushing.
> 
> For now these are io_uring private, but could obviously be made
> generically available, should there be a need to do so.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>   io_uring/io_uring.c | 21 +++++++++++++++++++++
>   1 file changed, 21 insertions(+)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 96f6da0bf5cd..3ad915262a45 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -234,6 +234,20 @@ static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx
>   	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
>   }
>   
> +static __cold void io_kworker_tw_start(void)
> +{
> +	if (WARN_ON_ONCE(!(current->flags & PF_NO_TASKWORK)))
> +		return;
> +	current->flags &= ~PF_NO_TASKWORK;
> +}
> +
> +static __cold void io_kworker_tw_end(void)
> +{
> +	while (task_work_pending(current))
> +		task_work_run();

Should this also clear TIF_NOTIFY_SIGNAL/TIF_NOTIFY_RESUME? Maybe even retry
task_work_run() afterwards, looping around if there are more items to execute.


> +	current->flags |= PF_NO_TASKWORK;
> +}
> +
>   static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
>   {
>   	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
> @@ -249,6 +263,8 @@ static __cold void io_fallback_req_func(struct work_struct *work)
>   	struct io_kiocb *req, *tmp;
>   	struct io_tw_state ts = {};
>   
> +	io_kworker_tw_start();
> +
>   	percpu_ref_get(&ctx->refs);
>   	mutex_lock(&ctx->uring_lock);
>   	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
> @@ -256,6 +272,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
>   	io_submit_flush_completions(ctx);
>   	mutex_unlock(&ctx->uring_lock);
>   	percpu_ref_put(&ctx->refs);
> +	io_kworker_tw_end();
>   }
>   
>   static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
> @@ -2720,6 +2737,8 @@ static __cold void io_ring_exit_work(struct work_struct *work)
>   	struct io_tctx_node *node;
>   	int ret;
>   
> +	io_kworker_tw_start();
> +
>   	/*
>   	 * If we're doing polled IO and end up having requests being
>   	 * submitted async (out-of-line), then completions can come in while
> @@ -2770,6 +2789,8 @@ static __cold void io_ring_exit_work(struct work_struct *work)
>   		 */
>   	} while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
>   
> +	io_kworker_tw_end();
> +
>   	init_completion(&exit.completion);
>   	init_task_work(&exit.task_work, io_tctx_exit_cb);
>   	exit.ctx = ctx;
Jens Axboe June 5, 2024, 6:08 p.m. UTC | #2
On 6/5/24 9:01 AM, Pavel Begunkov wrote:
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 96f6da0bf5cd..3ad915262a45 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -234,6 +234,20 @@ static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx
>>       wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
>>   }
>>   +static __cold void io_kworker_tw_start(void)
>> +{
>> +    if (WARN_ON_ONCE(!(current->flags & PF_NO_TASKWORK)))
>> +        return;
>> +    current->flags &= ~PF_NO_TASKWORK;
>> +}
>> +
>> +static __cold void io_kworker_tw_end(void)
>> +{
>> +    while (task_work_pending(current))
>> +        task_work_run();
> 
> Should this also clear TIF_NOTIFY_SIGNAL/TIF_NOTIFY_RESUME? Maybe even retry
> task_work_run() afterwards, looping around if there are more items to execute.

Yeah, good point, it should handle clearing the notifiers too. Will make
that change.
diff mbox series

Patch

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 96f6da0bf5cd..3ad915262a45 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -234,6 +234,20 @@  static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx
 	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
 }
 
+static __cold void io_kworker_tw_start(void)
+{
+	if (WARN_ON_ONCE(!(current->flags & PF_NO_TASKWORK)))
+		return;
+	current->flags &= ~PF_NO_TASKWORK;
+}
+
+static __cold void io_kworker_tw_end(void)
+{
+	while (task_work_pending(current))
+		task_work_run();
+	current->flags |= PF_NO_TASKWORK;
+}
+
 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
 {
 	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -249,6 +263,8 @@  static __cold void io_fallback_req_func(struct work_struct *work)
 	struct io_kiocb *req, *tmp;
 	struct io_tw_state ts = {};
 
+	io_kworker_tw_start();
+
 	percpu_ref_get(&ctx->refs);
 	mutex_lock(&ctx->uring_lock);
 	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
@@ -256,6 +272,7 @@  static __cold void io_fallback_req_func(struct work_struct *work)
 	io_submit_flush_completions(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	percpu_ref_put(&ctx->refs);
+	io_kworker_tw_end();
 }
 
 static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
@@ -2720,6 +2737,8 @@  static __cold void io_ring_exit_work(struct work_struct *work)
 	struct io_tctx_node *node;
 	int ret;
 
+	io_kworker_tw_start();
+
 	/*
 	 * If we're doing polled IO and end up having requests being
 	 * submitted async (out-of-line), then completions can come in while
@@ -2770,6 +2789,8 @@  static __cold void io_ring_exit_work(struct work_struct *work)
 		 */
 	} while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
 
+	io_kworker_tw_end();
+
 	init_completion(&exit.completion);
 	init_task_work(&exit.task_work, io_tctx_exit_cb);
 	exit.ctx = ctx;