
[1/2] sched: Compact RSEQ concurrency IDs in batches

Message ID 20250217112317.258716-2-gmonaco@redhat.com (mailing list archive)
State New
Series [1/2] sched: Compact RSEQ concurrency IDs in batches

Commit Message

Gabriele Monaco Feb. 17, 2025, 11:23 a.m. UTC
Currently, the task_mm_cid_work function is called in a task work
triggered by a scheduler tick to frequently compact the mm_cids of each
process for each core. This can delay the execution of the corresponding
thread for the entire duration of the function, negatively affecting the
response time of real-time tasks. In practice, we observe
task_mm_cid_work increasing the latency by 30-35us on a 128-core
system; this order of magnitude is significant under PREEMPT_RT.

Run task_mm_cid_work in batches of up to CONFIG_RSEQ_CID_SCAN_BATCH
CPUs, which bounds the delay introduced by each scan.
Also shorten each scan by iterating over all present CPUs rather than
all possible ones.

task_mm_cid_work already contains a mechanism to avoid running more
frequently than every 100ms. Since the function runs at every tick,
assuming 1ms ticks (HZ=1000 is common on distros) and an unfavorable
scenario where task T receives only 1 tick in 10, the CIDs for task T
can be fully compacted in about 130ms with CONFIG_RSEQ_CID_SCAN_BATCH
set to 10 on a 128-core machine.
This value also drastically reduces the task work duration and yields a
more acceptable latency for the aforementioned machine.
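
For reference, the estimate breaks down roughly as follows (assuming
128 possible CPUs, a batch size of 10 and 1 usable tick out of 10):

	batches needed           = ceil(128 / 10) = 13
	interval between batches ~ 10ms (one usable tick)
	full compaction          ~ 13 * 10ms = 130ms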

Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/mm_types.h |  8 ++++++++
 init/Kconfig             | 12 ++++++++++++
 kernel/sched/core.c      | 27 ++++++++++++++++++++++++---
 3 files changed, 44 insertions(+), 3 deletions(-)

Comments

Mathieu Desnoyers Feb. 17, 2025, 7:46 p.m. UTC | #1
On 2025-02-17 06:23, Gabriele Monaco wrote:
> Currently, the task_mm_cid_work function is called in a task work
> triggered by a scheduler tick to frequently compact the mm_cids of each
> process for each core. This can delay the execution of the corresponding
> thread for the entire duration of the function, negatively affecting the
> response in case of real time tasks. In practice, we observe
> task_mm_cid_work increasing the latency of 30-35us on a 128 cores
> system, this order of magnitude is meaningful under PREEMPT_RT.
> 
> Run the task_mm_cid_work in batches of up to CONFIG_RSEQ_CID_SCAN_BATCH
> cpus, this contains the duration of the delay for each scan.
> Also improve the duration by iterating for all present cpus and not for
> all possible.

Iterating only on present cpus is not enough on CONFIG_HOTPLUG=y,
because ACPI can dynamically add/remove CPUs from the set. If we end
up iterating only on present cpus, then we need to add a cpu hotplug
callback to handle the removal case, and I'm not sure the added
complexity is worth it here.
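
For reference, handling removal would mean registering something along
these lines (purely illustrative sketch, the callback name and body are
made up, nothing like this is being proposed here):

	/* Hypothetical teardown hook, not part of this patch. */
	static int sched_mm_cid_cpu_offline(unsigned int cpu)
	{
		/*
		 * Would have to walk every mm and drop/recompact the cid
		 * state owned by the departing cpu.
		 */
		return 0;
	}

	/* Registered once at init time, e.g.: */
	cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN, "sched/mm_cid:offline",
				  NULL, sched_mm_cid_cpu_offline);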

> 
> The task_mm_cid_work already contains a mechanism to avoid running more
> frequently than every 100ms, considering the function runs at every
> tick, assuming ticks every 1ms (HZ=1000 is common on distros) and
> assuming an unfavorable scenario of 1/10 ticks during task T runtime, we
> can compact the CIDs for task T in about 130ms by setting
> CONFIG_RSEQ_CID_SCAN_BATCH to 10 on a 128 cores machine.
> This value also drastically reduces the task work duration and is a more
> acceptable latency for the aforementioned machine.
> 
> Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> ---
>   include/linux/mm_types.h |  8 ++++++++
>   init/Kconfig             | 12 ++++++++++++
>   kernel/sched/core.c      | 27 ++++++++++++++++++++++++---
>   3 files changed, 44 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 0234f14f2aa6b..1e0e491d2c5c2 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -867,6 +867,13 @@ struct mm_struct {
>   		 * When the next mm_cid scan is due (in jiffies).
>   		 */
>   		unsigned long mm_cid_next_scan;
> +		/*
> +		 * @mm_cid_scan_cpu: Which cpu to start from in the next scan

Other similar comments have a "." at end of line.

> +		 *
> +		 * Scan in batches of CONFIG_RSEQ_CID_SCAN_BATCH after each scan
> +		 * save the next cpu index here (or 0 if we are done)

Suggested rewording:

Scan in batches of CONFIG_RSEQ_CID_SCAN_BATCH. This field holds
the next cpu index after each scan, or 0 if all batches are
done.


> +		 */
> +		unsigned int mm_cid_scan_cpu;
>   		/**
>   		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
>   		 *
> @@ -1249,6 +1256,7 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
>   	raw_spin_lock_init(&mm->cpus_allowed_lock);
>   	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
>   	cpumask_clear(mm_cidmask(mm));
> +	mm->mm_cid_scan_cpu = 0;
>   }
>   
>   static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
> diff --git a/init/Kconfig b/init/Kconfig
> index d0d021b3fa3b3..39f1d4c7980c0 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1813,6 +1813,18 @@ config DEBUG_RSEQ
>   
>   	  If unsure, say N.
>   
> +config RSEQ_CID_SCAN_BATCH
> +	int "Number of CPUs to scan every time we attempt mm_cid compaction"

Reword without "we".

> +	range 1 NR_CPUS
> +	default 10
> +	depends on SCHED_MM_CID
> +	help
> +	  CPUs are scanned pseudo-periodically to compact the CID of each task,
> +	  this operation can take a longer amount of time on systems with many
> +	  CPUs, resulting in higher scheduling latency for the current task.
> +	  A higher value means the CID is compacted faster, but results in
> +	  higher scheduling latency.
> +
>   config CACHESTAT_SYSCALL
>   	bool "Enable cachestat() system call" if EXPERT
>   	default y
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9aecd914ac691..8d1cce4ed62c6 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10536,7 +10536,7 @@ static void task_mm_cid_work(struct callback_head *work)
>   	struct task_struct *t = current;
>   	struct cpumask *cidmask;
>   	struct mm_struct *mm;
> -	int weight, cpu;
> +	int weight, cpu, from_cpu, to_cpu;
>   
>   	SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
>   
> @@ -10546,6 +10546,15 @@ static void task_mm_cid_work(struct callback_head *work)
>   	mm = t->mm;
>   	if (!mm)
>   		return;
> +	cpu = from_cpu = READ_ONCE(mm->mm_cid_scan_cpu);
> +	to_cpu = from_cpu + CONFIG_RSEQ_CID_SCAN_BATCH;
> +	if (from_cpu > cpumask_last(cpu_present_mask)) {

See explanation about using possible rather than present.

> +		from_cpu = 0;
> +		to_cpu = CONFIG_RSEQ_CID_SCAN_BATCH;

If the cpu_possible_mask is sparsely populated, this will end
up doing batches that hit very few cpus. Instead, we should
count how many cpus are handled within each of the
for_each_cpu_from(cpu, cpu_possible_mask) loops below and break
when reaching CONFIG_RSEQ_CID_SCAN_BATCH.
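
Something like this (untested sketch of the counting approach, not what
the patch currently does):

	int nr_scanned = 0;

	cpu = from_cpu;
	for_each_cpu_from(cpu, cpu_possible_mask) {
		/* Stop once a full batch of existing cpus was handled. */
		if (nr_scanned++ == CONFIG_RSEQ_CID_SCAN_BATCH)
			break;
		sched_mm_cid_remote_clear_old(mm, cpu);
	}
	/*
	 * cpu now points at the first cpu of the next batch, or past the
	 * last possible cpu once the scan is complete.
	 */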

> +	}
> +	if (from_cpu != 0)
> +		/* Delay scan only if we are done with all cpus. */
> +		goto cid_compact;
>   	old_scan = READ_ONCE(mm->mm_cid_next_scan);
>   	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>   	if (!old_scan) {
> @@ -10561,17 +10570,29 @@ static void task_mm_cid_work(struct callback_head *work)
>   		return;
>   	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
>   		return;
> +
> +cid_compact:
> +	if (!try_cmpxchg(&mm->mm_cid_scan_cpu, &cpu, to_cpu))
> +		return;
>   	cidmask = mm_cidmask(mm);
>   	/* Clear cids that were not recently used. */
> -	for_each_possible_cpu(cpu)
> +	cpu = from_cpu;
> +	for_each_cpu_from(cpu, cpu_present_mask) {
> +		if (cpu == to_cpu)
> +			break;
>   		sched_mm_cid_remote_clear_old(mm, cpu);
> +	}
>   	weight = cpumask_weight(cidmask);
>   	/*
>   	 * Clear cids that are greater or equal to the cidmask weight to
>   	 * recompact it.
>   	 */
> -	for_each_possible_cpu(cpu)
> +	cpu = from_cpu;
> +	for_each_cpu_from(cpu, cpu_present_mask) {
> +		if (cpu == to_cpu)
> +			break;
>   		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
> +	}

Here, set mm->mm_cid_scan_cpu to the new next position, i.e. the value
of the iterator left by the "for each" loop.
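
In code that could be as simple as (sketch only, after the last loop):

	/* Resume the next batch from here; 0 restarts a full scan. */
	WRITE_ONCE(mm->mm_cid_scan_cpu, cpu < nr_cpu_ids ? cpu : 0);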

Thanks,

Mathieu

>   }
>   
>   void init_sched_mm_cid(struct task_struct *t)
Gabriele Monaco Feb. 18, 2025, 9:52 a.m. UTC | #2
On Mon, 2025-02-17 at 14:46 -0500, Mathieu Desnoyers wrote:
> On 2025-02-17 06:23, Gabriele Monaco wrote:
> > Currently, the task_mm_cid_work function is called in a task work
> > triggered by a scheduler tick to frequently compact the mm_cids of
> > each
> > process for each core. This can delay the execution of the
> > corresponding
> > thread for the entire duration of the function, negatively
> > affecting the
> > response in case of real time tasks. In practice, we observe
> > task_mm_cid_work increasing the latency of 30-35us on a 128 cores
> > system, this order of magnitude is meaningful under PREEMPT_RT.
> > 
> > Run the task_mm_cid_work in batches of up to
> > CONFIG_RSEQ_CID_SCAN_BATCH
> > cpus, this contains the duration of the delay for each scan.
> > Also improve the duration by iterating for all present cpus and not
> > for
> > all possible.
> 
> Iterating only on present cpus is not enough on CONFIG_HOTPLUG=y,
> because ACPI can dynamically add/remove CPUs from the set. If we end
> up iterating only on present cpus, then we need to add a cpu hotplug
> callback to handle the removal case, and I'm not sure the added
> complexity is worth it here.
> 

Got it, didn't think of that..

> > 
> > The task_mm_cid_work already contains a mechanism to avoid running
> > more
> > frequently than every 100ms, considering the function runs at every
> > tick, assuming ticks every 1ms (HZ=1000 is common on distros) and
> > assuming an unfavorable scenario of 1/10 ticks during task T
> > runtime, we
> > can compact the CIDs for task T in about 130ms by setting
> > CONFIG_RSEQ_CID_SCAN_BATCH to 10 on a 128 cores machine.
> > This value also drastically reduces the task work duration and is a
> > more
> > acceptable latency for the aforementioned machine.
> > 
> > Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced
> > by mm_cid")
> > Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> > ---
> >   include/linux/mm_types.h |  8 ++++++++
> >   init/Kconfig             | 12 ++++++++++++
> >   kernel/sched/core.c      | 27 ++++++++++++++++++++++++---
> >   3 files changed, 44 insertions(+), 3 deletions(-)
> >   
> > @@ -10546,6 +10546,15 @@ static void task_mm_cid_work(struct
> > callback_head *work)
> >   	mm = t->mm;
> >   	if (!mm)
> >   		return;
> > +	cpu = from_cpu = READ_ONCE(mm->mm_cid_scan_cpu);
> > +	to_cpu = from_cpu + CONFIG_RSEQ_CID_SCAN_BATCH;
> > +	if (from_cpu > cpumask_last(cpu_present_mask)) {
> 
> See explanation about using possible rather than present.
> 
> > +		from_cpu = 0;
> > +		to_cpu = CONFIG_RSEQ_CID_SCAN_BATCH;
> 
> If the cpu_possible_mask is sparsely populated, this will end
> up doing batches that hit very few cpus. Instead, we should
> count how many cpus are handled within each
> for_each_cpu_from(cpu, cpu_possible_mask) loops below and break
> when reaching CONFIG_RSEQ_CID_SCAN_BATCH.
> 
> > +	}
> > [...]
> > +	for_each_cpu_from(cpu, cpu_present_mask) {
> > +		if (cpu == to_cpu)
> > +			break;
> >   		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
> > +	}
> 
> Here set mm->mm_cid_scan_cpu to the new next position which is
> the result from the "for each" loop.
> 

Mmh, good point. I wonder, though, if we need to care about multiple
threads scanning the same mm concurrently. In my patch it shouldn't
happen (threads /book/ the range up to to_cpu by writing it before
scanning).
To do so, I'd probably need to build a map of N elements starting from
from_cpu and use that, or run a dry loop before actually scanning.
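
The dry loop would look roughly like this (rough sketch, keeping the
cmpxchg on mm_cid_scan_cpu as the claim mechanism):

	/*
	 * Walk the mask once without touching any cid to find where this
	 * batch would end, then claim [from_cpu, to_cpu) so that concurrent
	 * scanners skip it. Illustrative only.
	 */
	int nr = 0;

	to_cpu = from_cpu;
	for_each_cpu_from(to_cpu, cpu_possible_mask) {
		if (nr++ == CONFIG_RSEQ_CID_SCAN_BATCH)
			break;
	}
	if (!try_cmpxchg(&mm->mm_cid_scan_cpu, &from_cpu, to_cpu))
		/* Another thread booked this batch already. */
		return;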

Thanks,
Gabriele

Patch

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0234f14f2aa6b..1e0e491d2c5c2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -867,6 +867,13 @@  struct mm_struct {
 		 * When the next mm_cid scan is due (in jiffies).
 		 */
 		unsigned long mm_cid_next_scan;
+		/*
+		 * @mm_cid_scan_cpu: Which cpu to start from in the next scan
+		 *
+		 * Scan in batches of CONFIG_RSEQ_CID_SCAN_BATCH after each scan
+		 * save the next cpu index here (or 0 if we are done)
+		 */
+		unsigned int mm_cid_scan_cpu;
 		/**
 		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
 		 *
@@ -1249,6 +1256,7 @@  static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	raw_spin_lock_init(&mm->cpus_allowed_lock);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	cpumask_clear(mm_cidmask(mm));
+	mm->mm_cid_scan_cpu = 0;
 }
 
 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
diff --git a/init/Kconfig b/init/Kconfig
index d0d021b3fa3b3..39f1d4c7980c0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1813,6 +1813,18 @@  config DEBUG_RSEQ
 
 	  If unsure, say N.
 
+config RSEQ_CID_SCAN_BATCH
+	int "Number of CPUs to scan every time we attempt mm_cid compaction"
+	range 1 NR_CPUS
+	default 10
+	depends on SCHED_MM_CID
+	help
+	  CPUs are scanned pseudo-periodically to compact the CID of each task,
+	  this operation can take a longer amount of time on systems with many
+	  CPUs, resulting in higher scheduling latency for the current task.
+	  A higher value means the CID is compacted faster, but results in
+	  higher scheduling latency.
+
 config CACHESTAT_SYSCALL
 	bool "Enable cachestat() system call" if EXPERT
 	default y
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9aecd914ac691..8d1cce4ed62c6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10536,7 +10536,7 @@  static void task_mm_cid_work(struct callback_head *work)
 	struct task_struct *t = current;
 	struct cpumask *cidmask;
 	struct mm_struct *mm;
-	int weight, cpu;
+	int weight, cpu, from_cpu, to_cpu;
 
 	SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
 
@@ -10546,6 +10546,15 @@  static void task_mm_cid_work(struct callback_head *work)
 	mm = t->mm;
 	if (!mm)
 		return;
+	cpu = from_cpu = READ_ONCE(mm->mm_cid_scan_cpu);
+	to_cpu = from_cpu + CONFIG_RSEQ_CID_SCAN_BATCH;
+	if (from_cpu > cpumask_last(cpu_present_mask)) {
+		from_cpu = 0;
+		to_cpu = CONFIG_RSEQ_CID_SCAN_BATCH;
+	}
+	if (from_cpu != 0)
+		/* Delay scan only if we are done with all cpus. */
+		goto cid_compact;
 	old_scan = READ_ONCE(mm->mm_cid_next_scan);
 	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
 	if (!old_scan) {
@@ -10561,17 +10570,29 @@  static void task_mm_cid_work(struct callback_head *work)
 		return;
 	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
 		return;
+
+cid_compact:
+	if (!try_cmpxchg(&mm->mm_cid_scan_cpu, &cpu, to_cpu))
+		return;
 	cidmask = mm_cidmask(mm);
 	/* Clear cids that were not recently used. */
-	for_each_possible_cpu(cpu)
+	cpu = from_cpu;
+	for_each_cpu_from(cpu, cpu_present_mask) {
+		if (cpu == to_cpu)
+			break;
 		sched_mm_cid_remote_clear_old(mm, cpu);
+	}
 	weight = cpumask_weight(cidmask);
 	/*
 	 * Clear cids that are greater or equal to the cidmask weight to
 	 * recompact it.
 	 */
-	for_each_possible_cpu(cpu)
+	cpu = from_cpu;
+	for_each_cpu_from(cpu, cpu_present_mask) {
+		if (cpu == to_cpu)
+			break;
 		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
+	}
 }
 
 void init_sched_mm_cid(struct task_struct *t)