io-wq: backoff when retrying worker creation

Message ID 20250206-wq_retry-v1-1-6d79bde1e69f@purestorage.com (mailing list archive)
State New
Series io-wq: backoff when retrying worker creation

Commit Message

Uday Shankar Feb. 6, 2025, 10:38 p.m. UTC
When io_uring submission goes async for the first time on a given task,
we'll try to create a worker thread to handle the submission. Creating
this worker thread can fail due to various transient conditions, such as
an outstanding signal in the forking thread, so we have retry logic with
a limit of 3 retries. However, this retry logic appears to be too
aggressive/fast - we've observed a thread blowing through the retry
limit while having the same outstanding signal the whole time. Here's an
excerpt of some tracing that demonstrates the issue:

First, signal 26 is generated for the process. It ends up getting routed
to thread 92942.

 0)   cbd-92284    /* signal_generate: sig=26 errno=0 code=-2 comm=psblkdASD pid=92934 grp=1 res=0 */

This causes create_io_thread in the signalled thread to fail with
ERESTARTNOINTR, and thus a retry is queued.
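
(For context, and not part of this change: the -ERESTARTNOINTR comes from the
pending-signal check near the end of copy_process(), which aborts the fork so
that the pending signal can be handled first. Roughly, from my reading of
kernel/fork.c:)

	/* copy_process(), under the parent's sighand lock */
	recalc_sigpending();
	if (task_sigpending(current)) {
		/* abort the fork; the caller sees ERR_PTR(-ERESTARTNOINTR) */
		retval = -ERESTARTNOINTR;
		goto bad_fork_cancel_cgroup;
	}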

13) task_th-92942  /* io_uring_queue_async_work: ring 000000007325c9ae, request 0000000080c96d8e, user_data 0x0, opcode URING_CMD, flags 0x8240001, normal queue, work 000000006e96dd3f */
13) task_th-92942  io_wq_enqueue() {
13) task_th-92942    _raw_spin_lock();
13) task_th-92942    io_wq_activate_free_worker();
13) task_th-92942    _raw_spin_lock();
13) task_th-92942    create_io_worker() {
13) task_th-92942      __kmalloc_cache_noprof();
13) task_th-92942      __init_swait_queue_head();
13) task_th-92942      kprobe_ftrace_handler() {
13) task_th-92942        get_kprobe();
13) task_th-92942        aggr_pre_handler() {
13) task_th-92942          pre_handler_kretprobe();
13) task_th-92942          /* create_enter: (create_io_thread+0x0/0x50) fn=0xffffffff8172c0e0 arg=0xffff888996bb69c0 node=-1 */
13) task_th-92942        } /* aggr_pre_handler */
...
13) task_th-92942        } /* copy_process */
13) task_th-92942      } /* create_io_thread */
13) task_th-92942      kretprobe_rethook_handler() {
13) task_th-92942        /* create_exit: (create_io_worker+0x8a/0x1a0 <- create_io_thread) arg1=0xfffffffffffffdff */
13) task_th-92942      } /* kretprobe_rethook_handler */
13) task_th-92942    queue_work_on() {
...

The CPU is then handed to a kworker to process the queued retry:

------------------------------------------
 13) task_th-92942  => kworker-54154
------------------------------------------
13) kworker-54154  io_workqueue_create() {
13) kworker-54154    io_queue_worker_create() {
13) kworker-54154      task_work_add() {
13) kworker-54154        wake_up_state() {
13) kworker-54154          try_to_wake_up() {
13) kworker-54154            _raw_spin_lock_irqsave();
13) kworker-54154            _raw_spin_unlock_irqrestore();
13) kworker-54154          } /* try_to_wake_up */
13) kworker-54154        } /* wake_up_state */
13) kworker-54154        kick_process();
13) kworker-54154      } /* task_work_add */
13) kworker-54154    } /* io_queue_worker_create */
13) kworker-54154  } /* io_workqueue_create */
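
(Also for context, not part of this change: io_queue_worker_create() hands the
retry back to the original task as task_work, which is why the next creation
attempt runs in that task's context again. Roughly, from my reading of
io_uring/io-wq.c:)

	/* arrange for create_worker_cont() to run in the submitting task */
	init_task_work(&worker->create_work, func);	/* func == create_worker_cont here */
	task_work_add(wq->task, &worker->create_work, TWA_SIGNAL);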

And then we immediately switch back to the original task to try creating
a worker again. This fails, because the original task still hasn't
handled its signal.

------------------------------------------
 13) kworker-54154  => task_th-92942
------------------------------------------
13) task_th-92942  create_worker_cont() {
13) task_th-92942    kprobe_ftrace_handler() {
13) task_th-92942      get_kprobe();
13) task_th-92942      aggr_pre_handler() {
13) task_th-92942        pre_handler_kretprobe();
13) task_th-92942        /* create_enter: (create_io_thread+0x0/0x50) fn=0xffffffff8172c0e0 arg=0xffff888996bb69c0 node=-1 */
13) task_th-92942      } /* aggr_pre_handler */
13) task_th-92942    } /* kprobe_ftrace_handler */
13) task_th-92942    create_io_thread() {
13) task_th-92942      copy_process() {
13) task_th-92942        task_active_pid_ns();
13) task_th-92942        _raw_spin_lock_irq();
13) task_th-92942        recalc_sigpending();
13) task_th-92942        _raw_spin_lock_irq();
13) task_th-92942      } /* copy_process */
13) task_th-92942    } /* create_io_thread */
13) task_th-92942    kretprobe_rethook_handler() {
13) task_th-92942      /* create_exit: (create_worker_cont+0x35/0x1b0 <- create_io_thread) arg1=0xfffffffffffffdff */
13) task_th-92942    } /* kretprobe_rethook_handler */
13) task_th-92942    io_worker_release();
13) task_th-92942    queue_work_on() {
13) task_th-92942      clear_pending_if_disabled();
13) task_th-92942      __queue_work() {
13) task_th-92942      } /* __queue_work */
13) task_th-92942    } /* queue_work_on */
13) task_th-92942  } /* create_worker_cont */

The pattern repeats another couple times until we blow through the retry
counter, at which point we give up. All outstanding work is canceled,
and the io_uring command which triggered all this is failed with
ECANCELED:

13) task_th-92942  io_acct_cancel_pending_work() {
...
13) task_th-92942  /* io_uring_complete: ring 000000007325c9ae, req 0000000080c96d8e, user_data 0x0, result -125, cflags 0x0 extra1 0 extra2 0  */

Finally, the task gets around to processing its outstanding signal 26,
but it's too late.

13) task_th-92942  /* signal_deliver: sig=26 errno=0 code=-2 sa_handler=59566a0 sa_flags=14000000 */

Try to address this issue by adding a small scaling delay when retrying
worker creation. This should give the forking thread time to handle its
signal in the above case. This isn't a particularly satisfying solution,
as sufficiently paradoxical scheduling would still have us hitting the
same issue, and I'm open to suggestions for something better. But this
is likely to prevent this (already rare) issue from hitting in practice.
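
(For reference, and assuming worker->init_retries has already been bumped by
io_should_retry_thread() by the time the retry is queued: with the existing
limit of 3 retries, the backoff below works out to msecs_to_jiffies(n * 5) for
n = 1, 2, 3, i.e. roughly 5, 10, and 15 ms ahead of the first, second, and
third retry, modulo jiffies rounding.)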

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
---
 io_uring/io-wq.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)


---
base-commit: ec4ef55172d4539abff470568a4369a6e1c317b8
change-id: 20250206-wq_retry-1cb7f5c1935d

Best regards,
--
Uday Shankar <ushankar@purestorage.com>

Comments

lizetao Feb. 7, 2025, 2:20 a.m. UTC | #1
Hi,

> -----Original Message-----
> From: Uday Shankar <ushankar@purestorage.com>
> Sent: Friday, February 7, 2025 6:39 AM
> To: Jens Axboe <axboe@kernel.dk>; Pavel Begunkov <asml.silence@gmail.com>
> Cc: io-uring@vger.kernel.org; Uday Shankar <ushankar@purestorage.com>
> Subject: [PATCH] io-wq: backoff when retrying worker creation
> 
> [... quoted commit message and earlier hunks trimmed ...]
>
> @@ -784,6 +784,18 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err)
>  	}
>  }
> 
> +static void queue_create_worker_retry(struct io_worker *worker)
> +{
> +	/*
> +	 * We only bother retrying because there's a chance that the
> +	 * failure to create a worker is due to some temporary condition
> +	 * in the forking task (e.g. outstanding signal); give the task
> +	 * some time to clear that condition.
> +	 */
> +	schedule_delayed_work(
> +		&worker->work, msecs_to_jiffies(worker->init_retries * 5));
> +}

Why choose (worker->init_retries * 5) as the delay time here? Is it the estimated time
to process the signal?

> [... remainder of the patch and sign-off trimmed ...]

---
Li Zetao
Jens Axboe Feb. 7, 2025, 6:24 p.m. UTC | #2
On 2/6/25 3:38 PM, Uday Shankar wrote:
>  io_uring/io-wq.c | 23 ++++++++++++++++++-----
>  1 file changed, 18 insertions(+), 5 deletions(-)
> 
> diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
> index f7d328feb7225d809601707e423c86a85ebb1c3c..173c77b70060bbbb2cd6009614c079095fab3e3c 100644
> --- a/io_uring/io-wq.c
> +++ b/io_uring/io-wq.c
> @@ -63,7 +63,7 @@ struct io_worker {
>  
>  	union {
>  		struct rcu_head rcu;
> -		struct work_struct work;
> +		struct delayed_work work;
>  	};
>  };
>  
> @@ -784,6 +784,18 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err)
>  	}
>  }
>  
> +static void queue_create_worker_retry(struct io_worker *worker)
> +{
> +	/*
> +	 * We only bother retrying because there's a chance that the
> +	 * failure to create a worker is due to some temporary condition
> +	 * in the forking task (e.g. outstanding signal); give the task
> +	 * some time to clear that condition.
> +	 */
> +	schedule_delayed_work(
> +		&worker->work, msecs_to_jiffies(worker->init_retries * 5));
> +}

This should be broken ala:

	schedule_delayed_work(&worker->work,
				msecs_to_jiffies(worker->init_retries * 5));

as that's more readable. But that's just a minor nit. I do think the
increased backoff makes sense, and while I agree that this isn't the
prettiest solution (or guaranteed foolproof), it'll most likely fix the
issue.


>  static void io_workqueue_create(struct work_struct *work)
>  {
> -	struct io_worker *worker = container_of(work, struct io_worker, work);
> +	struct io_worker *worker = container_of(
> +		work, struct io_worker, work.work);

Same here on line break.

I can make these edits while applying, or just send a v2. Thanks!

Patch

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index f7d328feb7225d809601707e423c86a85ebb1c3c..173c77b70060bbbb2cd6009614c079095fab3e3c 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -63,7 +63,7 @@  struct io_worker {
 
 	union {
 		struct rcu_head rcu;
-		struct work_struct work;
+		struct delayed_work work;
 	};
 };
 
@@ -784,6 +784,18 @@  static inline bool io_should_retry_thread(struct io_worker *worker, long err)
 	}
 }
 
+static void queue_create_worker_retry(struct io_worker *worker)
+{
+	/*
+	 * We only bother retrying because there's a chance that the
+	 * failure to create a worker is due to some temporary condition
+	 * in the forking task (e.g. outstanding signal); give the task
+	 * some time to clear that condition.
+	 */
+	schedule_delayed_work(
+		&worker->work, msecs_to_jiffies(worker->init_retries * 5));
+}
+
 static void create_worker_cont(struct callback_head *cb)
 {
 	struct io_worker *worker;
@@ -823,12 +835,13 @@  static void create_worker_cont(struct callback_head *cb)
 
 	/* re-create attempts grab a new worker ref, drop the existing one */
 	io_worker_release(worker);
-	schedule_work(&worker->work);
+	queue_create_worker_retry(worker);
 }
 
 static void io_workqueue_create(struct work_struct *work)
 {
-	struct io_worker *worker = container_of(work, struct io_worker, work);
+	struct io_worker *worker = container_of(
+		work, struct io_worker, work.work);
 	struct io_wq_acct *acct = io_wq_get_acct(worker);
 
 	if (!io_queue_worker_create(worker, acct, create_worker_cont))
@@ -866,8 +879,8 @@  static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct)
 		kfree(worker);
 		goto fail;
 	} else {
-		INIT_WORK(&worker->work, io_workqueue_create);
-		schedule_work(&worker->work);
+		INIT_DELAYED_WORK(&worker->work, io_workqueue_create);
+		queue_create_worker_retry(worker);
 	}
 
 	return true;