
[5/6] sched/isolation: Introduce isolated task work

Message ID 20250410152327.24504-6-frederic@kernel.org
State New
Series sched/mm: LRU drain flush on nohz_full

Commit Message

Frederic Weisbecker April 10, 2025, 3:23 p.m. UTC
Some asynchronous kernel work may be pending upon resume to userspace
and execute later on. On isolated workloads this becomes problematic once
the process is done with its preparatory work involving syscalls and wants
to run in userspace without being interrupted.

Provide an infrastructure to queue a work to be executed from the current
isolated task context right before resuming to userspace. This goes with
the assumption that isolated tasks are pinned to a single nohz_full CPU.
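
For illustration, a caller could then defer some flushing work to the
point where the isolated task resumes userspace, and fall back to doing
it synchronously whenever queueing is not possible (hypothetical sketch,
the caller below is not part of this series):

	/* Hypothetical caller: defer an LRU drain to return-to-userspace. */
	static void example_defer_drain(void)
	{
		if (isolated_task_work_queue())
			lru_add_drain();	/* queueing unsupported: drain now */
	}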

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/sched.h           |  1 +
 include/linux/sched/isolation.h | 17 +++++++++++++++++
 kernel/sched/core.c             |  1 +
 kernel/sched/isolation.c        | 31 +++++++++++++++++++++++++++++++
 kernel/sched/sched.h            |  1 +
 5 files changed, 51 insertions(+)

Comments

Oleg Nesterov April 11, 2025, 10:25 a.m. UTC | #1
I know nothing about this code so I can't review, but let me
ask anyway...

On 04/10, Frederic Weisbecker wrote:
>
> +int __isolated_task_work_queue(void)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	if (current->flags & PF_KTHREAD)
> +		return -EINVAL;

What about PF_USER_WORKER's ? IIUC, these (in fact kernel) threads
never return to userspace and never call task_work_run().

Or PF_IO_WORKER's, they too run only in kernel mode... But iirc they
do call task_work_run().

> +	local_irq_save(flags);
> +	if (task_work_queued(&current->nohz_full_work)) {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	ret = task_work_add(current, &current->nohz_full_work, TWA_RESUME);
> +out:
> +	local_irq_restore(flags);
> +	return ret;

Hmm, why not

	local_irq_save(flags);
	if (task_work_queued(...))
		ret = 0;
	else
		ret = task_work_add(...);

?

Oleg.
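
Combining both remarks, the queueing function might shrink to the sketch
below. Treating PF_USER_WORKER like PF_KTHREAD here is an assumption
drawn from the question above, not a conclusion of the thread:

	int __isolated_task_work_queue(void)
	{
		unsigned long flags;
		int ret;

		/*
		 * Kthreads and user workers never return to userspace, so a
		 * work queued with TWA_RESUME would never run. Whether
		 * PF_IO_WORKER needs separate treatment is left open above.
		 */
		if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
			return -EINVAL;

		local_irq_save(flags);
		if (task_work_queued(&current->nohz_full_work))
			ret = 0;
		else
			ret = task_work_add(current, &current->nohz_full_work,
					    TWA_RESUME);
		local_irq_restore(flags);

		return ret;
	}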
Frederic Weisbecker April 11, 2025, 10 p.m. UTC | #2
On Fri, Apr 11, 2025 at 12:25:56PM +0200, Oleg Nesterov wrote:
> I know nothing about this code so I can't review, but let me
> ask anyway...
> 
> On 04/10, Frederic Weisbecker wrote:
> >
> > +int __isolated_task_work_queue(void)
> > +{
> > +	unsigned long flags;
> > +	int ret;
> > +
> > +	if (current->flags & PF_KTHREAD)
> > +		return -EINVAL;
> 
> What about PF_USER_WORKER's ? IIUC, these (in fact kernel) threads
> never return to userspace and never call task_work_run().

Ah good catch! (though I'm having a hard time finding out what this is
about)...

> 
> Or PF_IO_WORKER's, they too run only in kernel mode... But iirc they
> do call task_work_run().

At least I see a lot of task_work usage in io_uring, and there are some
explicit calls to task_work_run() there...

> 
> > +	local_irq_save(flags);
> > +	if (task_work_queued(&current->nohz_full_work)) {
> > +		ret = 0;
> > +		goto out;
> > +	}
> > +
> > +	ret = task_work_add(current, &current->nohz_full_work, TWA_RESUME);
> > +out:
> > +	local_irq_restore(flags);
> > +	return ret;
> 
> Hmm, why not
> 
> 	local_irq_save(flags);
> 	if (task_work_queued(...))
> 		ret = 0;
> 	else
> 		ret = task_work_add(...);

Hehe, yes indeed!

Thanks!
K Prateek Nayak April 12, 2025, 5:12 a.m. UTC | #3
On 4/11/2025 3:55 PM, Oleg Nesterov wrote:
> 
>> +	local_irq_save(flags);
>> +	if (task_work_queued(&current->nohz_full_work)) {
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	ret = task_work_add(current, &current->nohz_full_work, TWA_RESUME);
>> +out:
>> +	local_irq_restore(flags);
>> +	return ret;
> 
> Hmm, why not
> 
> 	local_irq_save(flags);
> 	if (task_work_queued(...))
> 		ret = 0;
> 	else
> 		ret = task_work_add(...);
> 
> ?

Or use guard() and save on flags and ret:

	guard(irqsave)();

	if (task_work_queued(...))
		return 0;

	return task_work_add(...);
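
Applied to the whole function, that suggestion would read roughly as the
sketch below. guard(irqsave)() comes from the scope-based cleanup
helpers (<linux/cleanup.h>, with the irqsave guard defined in
<linux/irqflags.h>) and restores the interrupt state on every return
path:

	int __isolated_task_work_queue(void)
	{
		if (current->flags & PF_KTHREAD)
			return -EINVAL;

		/* IRQ state is restored when the guard goes out of scope. */
		guard(irqsave)();

		if (task_work_queued(&current->nohz_full_work))
			return 0;

		return task_work_add(current, &current->nohz_full_work, TWA_RESUME);
	}

This drops the flags and ret locals and the common exit path entirely.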

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b5ce76db6d75..4d764eb96e3e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1437,6 +1437,7 @@  struct task_struct {
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
+	struct callback_head		nohz_full_work;
 	atomic_t			tick_dep_mask;
 #endif
 
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index d8501f4709b5..74da4324b984 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -77,4 +77,21 @@  static inline bool cpu_is_isolated(int cpu)
 	       cpuset_cpu_is_isolated(cpu);
 }
 
+#if defined(CONFIG_NO_HZ_FULL)
+extern int __isolated_task_work_queue(void);
+
+static inline int isolated_task_work_queue(void)
+{
+	if (!housekeeping_cpu(raw_smp_processor_id(), HK_TYPE_KERNEL_NOISE))
+		return -ENOTSUPP;
+
+	return __isolated_task_work_queue();
+}
+
+extern void isolated_task_work_init(struct task_struct *tsk);
+#else
+static inline int isolated_task_work_queue(void) { return -ENOTSUPP; }
+static inline void isolated_task_work_init(struct task_struct *tsk) { }
+#endif /* CONFIG_NO_HZ_FULL */
+
 #endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index add41254b6e5..c8b8b61ac3a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4524,6 +4524,7 @@  static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->migration_pending = NULL;
 #endif
 	init_sched_mm_cid(p);
+	isolated_task_work_init(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 81bc8b329ef1..e246287de9fa 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -249,3 +249,34 @@  static int __init housekeeping_isolcpus_setup(char *str)
 	return housekeeping_setup(str, flags);
 }
 __setup("isolcpus=", housekeeping_isolcpus_setup);
+
+#if defined(CONFIG_NO_HZ_FULL)
+static void isolated_task_work(struct callback_head *head)
+{
+}
+
+int __isolated_task_work_queue(void)
+{
+	unsigned long flags;
+	int ret;
+
+	if (current->flags & PF_KTHREAD)
+		return -EINVAL;
+
+	local_irq_save(flags);
+	if (task_work_queued(&current->nohz_full_work)) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = task_work_add(current, &current->nohz_full_work, TWA_RESUME);
+out:
+	local_irq_restore(flags);
+	return ret;
+}
+
+void isolated_task_work_init(struct task_struct *tsk)
+{
+	init_task_work(&tsk->nohz_full_work, isolated_task_work);
+}
+#endif /* CONFIG_NO_HZ_FULL */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47972f34ea70..e7dc4ae5ccc1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -60,6 +60,7 @@ 
 #include <linux/stop_machine.h>
 #include <linux/syscalls_api.h>
 #include <linux/syscalls.h>
+#include <linux/task_work.h>
 #include <linux/tick.h>
 #include <linux/topology.h>
 #include <linux/types.h>