
fixup: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct

Message ID 20250410125030.215239-1-gmonaco@redhat.com
State New
Series fixup: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct

Commit Message

Gabriele Monaco April 10, 2025, 12:50 p.m. UTC
Thanks to you both for the comments; I tried to implement what Mathieu
suggested. This patch applies directly on top of 2/3, but I'm sending it
here first to get feedback.

Essentially, I refactored a bit to avoid adding more dependencies to
rseq: the rseq_tick is now called task_tick_mm_cid (as before the
series), and it does the two things you mentioned:
 * A) trigger the mm_cid recompaction
 * B) trigger an update of the task's rseq->mm_cid field at some point
   after recompaction, so it can get a mm_cid value closer to 0.

Now, A occurs only after the scan time has elapsed, which means it could
potentially run multiple times if the work is not scheduled before the
next tick. I'm not sure adding more checks to make sure it happens once
and only once really makes sense here.

B occurs after the work updates the last scan time, so we are in a
condition where the runtime is above the threshold but the (next) scan
time has not expired yet.
I tried to account for multiple threads updating the mm_cid (not
necessarily the long-running one, or in case more than one is long
running): for this I'm tracking the last time we updated the mm_cid; if
that occurred before the last mm_cid scan, we need to update (and
preempt).

Does this make sense to you?

Thanks,
Gabriele

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/rseq.h  | 14 +-------------
 include/linux/sched.h |  1 +
 kernel/sched/core.c   | 42 +++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h  |  3 +++
 4 files changed, 46 insertions(+), 14 deletions(-)


base-commit: c59c19fcfad857c96effa3b2e9eb6d934d2380d8

Comments

Mathieu Desnoyers April 10, 2025, 2:04 p.m. UTC | #1
On 2025-04-10 08:50, Gabriele Monaco wrote:
> Thanks both for the comments, I tried to implement what Mathieu
> suggested. This patch applies directly on 2/3 but I'm sending it here
> first to get feedback.
> 
> Essentially, I refactored a bit to avoid the need to add more
> dependencies to rseq, the rseq_tick is now called task_tick_mm_cid (as
> before the series) and it does the two things you mentioned:
>   * A) trigger the mm_cid recompaction
>   * B) trigger an update of the task's rseq->mm_cid field at some point
>     after recompaction, so it can get a mm_cid value closer to 0.
> 
> Now, A occurs only after the scan time elapsed, which means it could
> potentially run multiple times in case the work is not scheduled before
> the next tick, I'm not sure adding more checks to make sure it
> happens once and only once really makes sense here.

The scan is gated by two checks now:

> +	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> +		if (mm_cid_needs_scan(t->mm))

And likewise for the periodic check for preemption:

> +	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
[...]
> +		else if (time_after(jiffies, t->last_rseq_preempt +
> +				      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {

Those second levels of time checks would prevent adding significant
overhead on every tick after the threshold is reached.

> 
> B is occurring after the work updates the last scan time, so we are in a
> condition where the runtime is above threshold but the (next) scan time
> did not expire yet.
> I tried to account for multiple threads updating the mm_cid (not
> necessarily the long running one, or in case more are long running), for
> this I'm tracking the last time we updated the mm_cid, if that occurred
> before the last mm_cid scan, we need to update (and preempt).
> 
> Does this make sense to you?

It makes sense. Note that it adds overhead to rseq_preempt() (a store
to t->last_rseq_preempt), which is a fast path. I don't know if we
should care.

Also part of task_tick_mm_cid could be moved to a helper, e.g.:

static
void task_reset_mm_cid(struct rq *rq, struct task_struct *t)
{
         int old_cid = t->mm_cid;
         
         if (!t->mm_cid_active)
                 return;
         mm_cid_snapshot_time(rq, t->mm);
         mm_cid_put_lazy(t);
         t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
         if (old_cid == t->mm_cid)
                 return;
         rseq_preempt(t);
}
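
For reference, the tick path from the patch could then reduce to roughly
the following (an illustrative sketch reusing the patch's names, not a
tested implementation):

void task_tick_mm_cid(struct rq *rq, struct task_struct *t)
{
	u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;

	if (!t->mm || rtime <= RSEQ_UNPREEMPTED_THRESHOLD)
		return;
	if (mm_cid_needs_scan(t->mm))
		/* A: let the resume path schedule the compaction work. */
		rseq_set_notify_resume(t);
	else if (time_after(jiffies, t->last_rseq_preempt +
			    msecs_to_jiffies(MM_CID_SCAN_DELAY)))
		/* B: pick up a (possibly) more compact cid after the scan. */
		task_reset_mm_cid(rq, t);
}

Note that the patch above also refreshes t->last_rseq_preempt when the
cid does not change, so B is not retried on every subsequent tick; with
the early return in the helper, that refresh would have to move either
into the helper or to the call site.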

Thanks,

Mathieu

> 
> Thanks,
> Gabriele
> 
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> ---
>   include/linux/rseq.h  | 14 +-------------
>   include/linux/sched.h |  1 +
>   kernel/sched/core.c   | 42 +++++++++++++++++++++++++++++++++++++++++-
>   kernel/sched/sched.h  |  3 +++
>   4 files changed, 46 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/rseq.h b/include/linux/rseq.h
> index d20fd72f4c80d..7e3fa2ae9e7a4 100644
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -7,8 +7,6 @@
>   #include <linux/preempt.h>
>   #include <linux/sched.h>
>   
> -#define RSEQ_UNPREEMPTED_THRESHOLD	(100ULL * 1000000)	/* 100ms */
> -
>   /*
>    * Map the event mask on the user-space ABI enum rseq_cs_flags
>    * for direct mask checks.
> @@ -54,14 +52,7 @@ static inline void rseq_preempt(struct task_struct *t)
>   {
>   	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
>   	rseq_set_notify_resume(t);
> -}
> -
> -static inline void rseq_preempt_from_tick(struct task_struct *t)
> -{
> -	u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
> -
> -	if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
> -		rseq_preempt(t);
> +	t->last_rseq_preempt = jiffies;
>   }
>   
>   /* rseq_migrate() requires preemption to be disabled. */
> @@ -114,9 +105,6 @@ static inline void rseq_signal_deliver(struct ksignal *ksig,
>   static inline void rseq_preempt(struct task_struct *t)
>   {
>   }
> -static inline void rseq_preempt_from_tick(struct task_struct *t)
> -{
> -}
>   static inline void rseq_migrate(struct task_struct *t)
>   {
>   }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 851933e62bed3..5b057095d5dc0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1424,6 +1424,7 @@ struct task_struct {
>   	int				last_mm_cid;	/* Most recent cid in mm */
>   	int				migrate_from_cpu;
>   	int				mm_cid_active;	/* Whether cid bitmap is active */
> +	unsigned long			last_rseq_preempt; /* Time of last preempt in jiffies */
>   #endif
>   
>   	struct tlbflush_unmap_batch	tlb_ubc;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 52ad709094167..9f0c9cc284804 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5663,7 +5663,7 @@ void sched_tick(void)
>   		resched_latency = cpu_resched_latency(rq);
>   	calc_global_load_tick(rq);
>   	sched_core_tick(rq);
> -	rseq_preempt_from_tick(donor);
> +	task_tick_mm_cid(rq, donor);
>   	scx_tick(rq);
>   
>   	rq_unlock(rq, &rf);
> @@ -10618,6 +10618,46 @@ void init_sched_mm_cid(struct task_struct *t)
>   	}
>   }
>   
> +void task_tick_mm_cid(struct rq *rq, struct task_struct *t)
> +{
> +	u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
> +
> +	/*
> +	 * If a task is running unpreempted for a long time, it won't get its
> +	 * mm_cid compacted and won't update its mm_cid value after a
> +	 * compaction occurs.
> +	 * For such a task, this function does two things:
> +	 * A) trigger the mm_cid recompaction,
> +	 * B) trigger an update of the task's rseq->mm_cid field at some point
> +	 * after recompaction, so it can get a mm_cid value closer to 0.
> +	 * A change in the mm_cid triggers an rseq_preempt.
> +	 *
> +	 * A occurs only after the next scan time elapsed but before the
> +	 * compaction work is actually scheduled.
> +	 * B occurs once after the compaction work completes, that is when scan
> +	 * is no longer needed (it occurred for this mm) but the last rseq
> +	 * preempt was done before the last mm_cid scan.
> +	 */
> +	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> +		if (mm_cid_needs_scan(t->mm))
> +			rseq_set_notify_resume(t);
> +		else if (time_after(jiffies, t->last_rseq_preempt +
> +				      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> +			int old_cid = t->mm_cid;
> +
> +			if (!t->mm_cid_active)
> +				return;
> +			mm_cid_snapshot_time(rq, t->mm);
> +			mm_cid_put_lazy(t);
> +			t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
> +			if (old_cid == t->mm_cid)
> +				t->last_rseq_preempt = jiffies;
> +			else
> +				rseq_preempt(t);
> +		}
> +	}
> +}
> +
>   /* Call only when curr is a user thread. */
>   void task_queue_mm_cid(struct task_struct *curr)
>   {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 1703cd16d5433..7d104d12ed974 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3582,12 +3582,14 @@ extern const char *preempt_modes[];
>   
>   #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
>   #define MM_CID_SCAN_DELAY	100			/* 100ms */
> +#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
>   
>   extern raw_spinlock_t cid_lock;
>   extern int use_cid_lock;
>   
>   extern void sched_mm_cid_migrate_from(struct task_struct *t);
>   extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
> +extern void task_tick_mm_cid(struct rq *rq, struct task_struct *t);
>   extern void init_sched_mm_cid(struct task_struct *t);
>   
>   static inline void __mm_cid_put(struct mm_struct *mm, int cid)
> @@ -3856,6 +3858,7 @@ static inline void switch_mm_cid(struct rq *rq,
>   static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
>   static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
>   static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
> +static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *t) { }
>   static inline void init_sched_mm_cid(struct task_struct *t) { }
>   #endif /* !CONFIG_SCHED_MM_CID */
>   
> 
> base-commit: c59c19fcfad857c96effa3b2e9eb6d934d2380d8
Gabriele Monaco April 10, 2025, 2:36 p.m. UTC | #2
On Thu, 2025-04-10 at 10:04 -0400, Mathieu Desnoyers wrote:
> On 2025-04-10 08:50, Gabriele Monaco wrote:
> > Thanks both for the comments, I tried to implement what Mathieu
> > suggested. This patch applies directly on 2/3 but I'm sending it
> > here
> > first to get feedback.
> > 
> > Essentially, I refactored a bit to avoid the need to add more
> > dependencies to rseq, the rseq_tick is now called task_tick_mm_cid
> > (as
> > before the series) and it does the two things you mentioned:
> >   * A) trigger the mm_cid recompaction
> >   * B) trigger an update of the task's rseq->mm_cid field at some
> > point
> >     after recompaction, so it can get a mm_cid value closer to 0.
> > 
> > Now, A occurs only after the scan time elapsed, which means it
> > could
> > potentially run multiple times in case the work is not scheduled
> > before
> > the next tick, I'm not sure adding more checks to make sure it
> > happens once and only once really makes sense here.
> 
> The scan is gated by two checks now:
> 
> > +	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> > +		if (mm_cid_needs_scan(t->mm))
> 
> And likewise for the periodic check for preemption:
> 

Alright, I could add another flag here to prevent re-scheduling,

> > +	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> [...]
> > +		else if (time_after(jiffies, t->last_rseq_preempt +
> > +				      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> 
> Those second levels of time checks would prevent adding significant
> overhead on every tick after the threshold is reached.
> 

But I'm not sure what to do here: in my understanding, this second
branch can only run once for task t and may run even though the previous
path was skipped (for instance, two long-running threads share the mm,
the first thread schedules the work, and it completes before the second
thread meets a qualifying tick).
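
For the first branch, a minimal sketch of such a flag could look like
the following; the mm_cid_flags field and MM_CID_SCAN_PENDING bit are
only placeholders (they don't exist today), and the compaction work
would clear the bit once it completes:

	/* Illustrative only: arm the resume/work path once per scan. */
	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD &&
	    mm_cid_needs_scan(t->mm)) {
		if (!test_and_set_bit(MM_CID_SCAN_PENDING,
				      &t->mm->mm_cid_flags))
			rseq_set_notify_resume(t);
	}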

> > 
> > B is occurring after the work updates the last scan time, so we are
> > in a
> > condition where the runtime is above threshold but the (next) scan
> > time
> > did not expire yet.
> > I tried to account for multiple threads updating the mm_cid (not
> > necessarily the long running one, or in case more are long
> > running), for
> > this I'm tracking the last time we updated the mm_cid, if that
> > occurred
> > before the last mm_cid scan, we need to update (and preempt).
> > 
> > Does this make sense to you?
> 
> It makes sense. Note that it adds overhead to rseq_preempt() (a store
> to t->last_rseq_preempt), which is a fast path. I don't know if we
> should care.

Well, I'm trying to track the last time a reset occurred; that rather
happens in mm_cid_get, which doesn't look like a fast path to me.
I could then rename it to last_rseq_reset, since it won't be related to
preemption.

> 
> Also part of task_tick_mm_cid could be moved to a helper, e.g.:
> 
> static
> void task_reset_mm_cid(struct rq *rq, struct task_struct *t)
> {
>          int old_cid = t->mm_cid;
>          
>          if (!t->mm_cid_active)
>                  return;
>          mm_cid_snapshot_time(rq, t->mm);
>          mm_cid_put_lazy(t);
>          t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
>          if (old_cid == t->mm_cid)
>                  return;
>          rseq_preempt(t);
> }

Yeah good point
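
Combining that helper with moving the timestamp out of rseq_preempt(),
a rough sketch could be (last_mm_cid_reset is only a placeholder name):

static void task_reset_mm_cid(struct rq *rq, struct task_struct *t)
{
	int old_cid = t->mm_cid;

	if (!t->mm_cid_active)
		return;
	mm_cid_snapshot_time(rq, t->mm);
	mm_cid_put_lazy(t);
	t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
	/* Record the reset time here instead of in rseq_preempt(). */
	t->last_mm_cid_reset = jiffies;
	if (old_cid != t->mm_cid)
		rseq_preempt(t);
}

The tick check would then compare jiffies against
t->last_mm_cid_reset + msecs_to_jiffies(MM_CID_SCAN_DELAY) and leave
rseq_preempt() untouched.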

Thanks,
Gabriele

Patch

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index d20fd72f4c80d..7e3fa2ae9e7a4 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -7,8 +7,6 @@ 
 #include <linux/preempt.h>
 #include <linux/sched.h>
 
-#define RSEQ_UNPREEMPTED_THRESHOLD	(100ULL * 1000000)	/* 100ms */
-
 /*
  * Map the event mask on the user-space ABI enum rseq_cs_flags
  * for direct mask checks.
@@ -54,14 +52,7 @@  static inline void rseq_preempt(struct task_struct *t)
 {
 	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
 	rseq_set_notify_resume(t);
-}
-
-static inline void rseq_preempt_from_tick(struct task_struct *t)
-{
-	u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
-
-	if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
-		rseq_preempt(t);
+	t->last_rseq_preempt = jiffies;
 }
 
 /* rseq_migrate() requires preemption to be disabled. */
@@ -114,9 +105,6 @@  static inline void rseq_signal_deliver(struct ksignal *ksig,
 static inline void rseq_preempt(struct task_struct *t)
 {
 }
-static inline void rseq_preempt_from_tick(struct task_struct *t)
-{
-}
 static inline void rseq_migrate(struct task_struct *t)
 {
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 851933e62bed3..5b057095d5dc0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1424,6 +1424,7 @@  struct task_struct {
 	int				last_mm_cid;	/* Most recent cid in mm */
 	int				migrate_from_cpu;
 	int				mm_cid_active;	/* Whether cid bitmap is active */
+	unsigned long			last_rseq_preempt; /* Time of last preempt in jiffies */
 #endif
 
 	struct tlbflush_unmap_batch	tlb_ubc;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 52ad709094167..9f0c9cc284804 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5663,7 +5663,7 @@  void sched_tick(void)
 		resched_latency = cpu_resched_latency(rq);
 	calc_global_load_tick(rq);
 	sched_core_tick(rq);
-	rseq_preempt_from_tick(donor);
+	task_tick_mm_cid(rq, donor);
 	scx_tick(rq);
 
 	rq_unlock(rq, &rf);
@@ -10618,6 +10618,46 @@  void init_sched_mm_cid(struct task_struct *t)
 	}
 }
 
+void task_tick_mm_cid(struct rq *rq, struct task_struct *t)
+{
+	u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
+
+	/*
+	 * If a task is running unpreempted for a long time, it won't get its
+	 * mm_cid compacted and won't update its mm_cid value after a
+	 * compaction occurs.
+	 * For such a task, this function does two things:
+	 * A) trigger the mm_cid recompaction,
+	 * B) trigger an update of the task's rseq->mm_cid field at some point
+	 * after recompaction, so it can get a mm_cid value closer to 0.
+	 * A change in the mm_cid triggers an rseq_preempt.
+	 *
+	 * A occurs only after the next scan time elapsed but before the
+	 * compaction work is actually scheduled.
+	 * B occurs once after the compaction work completes, that is when scan
+	 * is no longer needed (it occurred for this mm) but the last rseq
+	 * preempt was done before the last mm_cid scan.
+	 */
+	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
+		if (mm_cid_needs_scan(t->mm))
+			rseq_set_notify_resume(t);
+		else if (time_after(jiffies, t->last_rseq_preempt +
+				      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
+			int old_cid = t->mm_cid;
+
+			if (!t->mm_cid_active)
+				return;
+			mm_cid_snapshot_time(rq, t->mm);
+			mm_cid_put_lazy(t);
+			t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
+			if (old_cid == t->mm_cid)
+				t->last_rseq_preempt = jiffies;
+			else
+				rseq_preempt(t);
+		}
+	}
+}
+
 /* Call only when curr is a user thread. */
 void task_queue_mm_cid(struct task_struct *curr)
 {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1703cd16d5433..7d104d12ed974 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3582,12 +3582,14 @@  extern const char *preempt_modes[];
 
 #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
 #define MM_CID_SCAN_DELAY	100			/* 100ms */
+#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
 
 extern raw_spinlock_t cid_lock;
 extern int use_cid_lock;
 
 extern void sched_mm_cid_migrate_from(struct task_struct *t);
 extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
+extern void task_tick_mm_cid(struct rq *rq, struct task_struct *t);
 extern void init_sched_mm_cid(struct task_struct *t);
 
 static inline void __mm_cid_put(struct mm_struct *mm, int cid)
@@ -3856,6 +3858,7 @@  static inline void switch_mm_cid(struct rq *rq,
 static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
 static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
 static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
+static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *t) { }
 static inline void init_sched_mm_cid(struct task_struct *t) { }
 #endif /* !CONFIG_SCHED_MM_CID */