diff --git a/fs/coredump.c b/fs/coredump.c
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -42,6 +42,7 @@
#include <linux/path.h>
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
+#include <linux/sched/sysctl.h>
#include <linux/elf.h>
#include <linux/uaccess.h>
@@ -406,6 +407,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
int core_waiters = -EBUSY;
+ unsigned long hang_check = sysctl_hung_task_timeout_secs;
init_completion(&core_state->startup);
core_state->dumper.task = tsk;
@@ -415,8 +417,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
if (core_waiters > 0) {
struct core_thread *ptr;
- wait_for_completion_state(&core_state->startup,
- TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
+ wait_for_completion_state_timeout(&core_state->startup,
+ 		TASK_UNINTERRUPTIBLE|TASK_FREEZABLE, hang_check * (HZ/2));
/*
* Wait for all the threads to become inactive, so that
* all the thread context (extended register state, like
diff --git a/include/linux/completion.h b/include/linux/completion.h
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -104,6 +104,8 @@ extern void wait_for_completion_io(struct completion *);
extern int wait_for_completion_interruptible(struct completion *x);
extern int wait_for_completion_killable(struct completion *x);
extern int wait_for_completion_state(struct completion *x, unsigned int state);
+extern int wait_for_completion_state_timeout(struct completion *x, unsigned int state,
+ unsigned long timeout);
extern unsigned long wait_for_completion_timeout(struct completion *x,
unsigned long timeout);
extern unsigned long wait_for_completion_io_timeout(struct completion *x,
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -141,7 +141,6 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq,
struct io_wq_acct *acct,
struct io_cb_cancel_data *match);
static void create_worker_cb(struct callback_head *cb);
-static void io_wq_cancel_tw_create(struct io_wq *wq);
static bool io_worker_get(struct io_worker *worker)
{
@@ -1230,7 +1229,7 @@ void io_wq_exit_start(struct io_wq *wq)
set_bit(IO_WQ_BIT_EXIT, &wq->state);
}
-static void io_wq_cancel_tw_create(struct io_wq *wq)
+void io_wq_cancel_tw_create(struct io_wq *wq)
{
struct callback_head *cb;
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -43,6 +43,7 @@ struct io_wq_data {
free_work_fn *free_work;
};
+void io_wq_cancel_tw_create(struct io_wq *wq);
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -269,6 +269,17 @@ int __sched wait_for_completion_state(struct completion *x, unsigned int state)
}
EXPORT_SYMBOL(wait_for_completion_state);
+int __sched wait_for_completion_state_timeout(struct completion *x, unsigned int state,
+ unsigned long timeout)
+{
+ long t = wait_for_common(x, timeout, state);
+
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_state_timeout);
+
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6790,6 +6790,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
{
static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
unsigned int task_flags;
+ struct io_uring_task *io_uring = tsk->io_uring;
/*
* Establish LD_WAIT_CONFIG context to ensure none of the code called
@@ -6806,6 +6807,11 @@ static inline void sched_submit_work(struct task_struct *tsk)
wq_worker_sleeping(tsk);
else if (task_flags & PF_IO_WORKER)
io_wq_worker_sleeping(tsk);
+ else if ((task_flags & PF_DUMPCORE) && io_uring) {
+ struct io_wq *wq = io_uring->io_wq;
+
+ io_wq_cancel_tw_create(wq);
+ }
/*
* spinlock and rwlock must not flush block requests. This will
In our production environment, we found many hung tasks.

Thread A (exit_mm):

        ...
        if (core_state) {
                struct core_thread self;

                mmap_read_unlock(mm);

                self.task = current;
                if (self.task->flags & PF_SIGNALED)
                        self.next = xchg(&core_state->dumper.next, &self);
                else
                        self.task = NULL;
                /*
                 * Implies mb(), the result of xchg() must be visible
                 * to core_state->dumper.
                 */
                if (atomic_dec_and_test(&core_state->nr_threads))
                        complete(&core_state->startup);

                for (;;) {
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (!self.task) /* see coredump_finish() */
                                break;
                        freezable_schedule();
                }
                __set_current_state(TASK_RUNNING);
                mmap_read_lock(mm);
        }
        ...

Thread B (coredump_wait):

        ...
        if (core_waiters > 0) {
                struct core_thread *ptr;

                freezer_do_not_count();
                wait_for_completion(&core_state->startup);
                freezer_count();
                /*
                 * Wait for all the threads to become inactive, so that
                 * all the thread context (extended register state, like
                 * fpu etc) gets copied to the memory.
                 */
                ptr = core_state->dumper.next;
                while (ptr != NULL) {
                        wait_task_inactive(ptr->task, 0);
                        ptr = ptr->next;
                }
        ...

Thread C (io_worker_exit):

        ...
        if (refcount_dec_and_test(&worker->ref))
                complete(&worker->ref_done);
        wait_for_completion(&worker->ref_done);
        ...

Thread A is waiting for Thread B to finish the core dump, but Thread B finds that one thread still hasn't stepped into exit_mm() to decrement core_state->nr_threads. That thread is Thread C: it has submitted a task_work (create_worker_cb) to Thread B and is now waiting for Thread B to execute or cancel that work. This causes a deadlock between io_uring and core dump.

Our kernel version is stable 5.15.125, and commit 1d5f5ea7cb7d ("io-wq: remove worker to owner tw dependency") is included. When the last io worker exits, it doesn't find any callback. Once scheduled out, it invokes io_wq_worker_sleeping(), which submits a task_work to the master thread. So commit 1d5f5ea7cb7d ("io-wq: remove worker to owner tw dependency") doesn't help in this case.

For the core dump thread, we can set a timeout on the wait and then check whether the task_work callback exists. If needed, we cancel the task_work and wake up the io worker, so the deadlock is resolved.

Signed-off-by: Haifeng Xu <haifeng.xu@shopee.com>
---
 fs/coredump.c              |  6 ++++--
 include/linux/completion.h |  2 ++
 io_uring/io-wq.c           |  3 +--
 io_uring/io-wq.h           |  1 +
 kernel/sched/completion.c  | 11 +++++++++++
 kernel/sched/core.c        |  6 ++++++
 6 files changed, 25 insertions(+), 4 deletions(-)
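
The hang_check * (HZ/2) interval appears to follow the existing hang-check idiom in the block layer: blk_execute_rq() (as of v5.15) waits in slices of half the hung-task timeout, so the waiting task never sits in an uninterruptible sleep long enough for khungtaskd to flag it. A minimal sketch of that idiom, for comparison (adapted from blk_execute_rq(); wait is a DECLARE_COMPLETION_ONSTACK completion):

        /* Prevent hang_check timer from firing at us during very long I/O */
        unsigned long hang_check = sysctl_hung_task_timeout_secs;

        if (hang_check)
                /* re-wait in hang_check/2 slices until the completion fires */
                while (!wait_for_completion_io_timeout(&wait,
                                                       hang_check * (HZ/2)))
                        ;
        else
                /* hung-task detector disabled: wait without a timeout */
                wait_for_completion_io(&wait);

Note that this idiom re-waits on timeout and falls back to an untimed wait when the hung-task detector is disabled (sysctl_hung_task_timeout_secs == 0), whereas wait_for_completion_state_timeout() as added above returns 0 for both completion and timeout. The deadlock is instead broken on the dumping task's next pass through sched_submit_work(), where the new PF_DUMPCORE branch cancels the pending create-worker task_work.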