diff mbox series

[05/39] sched: Add sched_class->switching_to() and expose check_class_changing/changed()

Message ID 20240501151312.635565-6-tj@kernel.org (mailing list archive)
State Not Applicable
Headers show
Series [01/39] cgroup: Implement cgroup_show_cftypes() | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch, async

Commit Message

Tejun Heo May 1, 2024, 3:09 p.m. UTC
When a task switches to a new sched_class, the prev and new classes are
notified through ->switched_from() and ->switched_to(), respectively, after
the switching is done.

A new BPF extensible sched_class will have callbacks that allow the BPF
scheduler to keep track of relevant task states (like priority and cpumask).
Those callbacks aren't called while a task is on a different sched_class.
When a task comes back, we wanna tell the BPF progs the up-to-date state
before the task gets enqueued, so we need a hook which is called before the
switching is committed.

This patch adds ->switching_to() which is called during sched_class switch
through check_class_changing() before the task is restored. Also, this patch
exposes check_class_changing/changed() in kernel/sched/sched.h. They will be
used by the new BPF extensible sched_class to implement implicit sched_class
switching which is used e.g. when falling back to CFS when the BPF scheduler
fails or unloads.

This is a prep patch and doesn't cause any behavior changes. The new
operation and exposed functions aren't used yet.

v2: Improve patch description w/ details on planned use.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 kernel/sched/core.c  | 19 ++++++++++++++++---
 kernel/sched/sched.h |  7 +++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

Comments

Peter Zijlstra June 24, 2024, 11:06 a.m. UTC | #1
On Wed, May 01, 2024 at 05:09:40AM -1000, Tejun Heo wrote:
> When a task switches to a new sched_class, the prev and new classes are
> notified through ->switched_from() and ->switched_to(), respectively, after
> the switching is done.
> 
> A new BPF extensible sched_class will have callbacks that allow the BPF
> scheduler to keep track of relevant task states (like priority and cpumask).
> Those callbacks aren't called while a task is on a different sched_class.
> When a task comes back, we wanna tell the BPF progs the up-to-date state
> before the task gets enqueued, so we need a hook which is called before the
> switching is committed.
> 
> This patch adds ->switching_to() which is called during sched_class switch
> through check_class_changing() before the task is restored. Also, this patch
> exposes check_class_changing/changed() in kernel/sched/sched.h. They will be
> used by the new BPF extensible sched_class to implement implicit sched_class
> switching which is used e.g. when falling back to CFS when the BPF scheduler
> fails or unloads.
> 
> This is a prep patch and doesn't cause any behavior changes. The new
> operation and exposed functions aren't used yet.
> 
> v2: Improve patch description w/ details on planned use.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reviewed-by: David Vernet <dvernet@meta.com>
> Acked-by: Josh Don <joshdon@google.com>
> Acked-by: Hao Luo <haoluo@google.com>
> Acked-by: Barret Rhoden <brho@google.com>

> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 8e23f19e8096..99e292368d11 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2301,6 +2301,7 @@ struct sched_class {
>  	 * cannot assume the switched_from/switched_to pair is serialized by
>  	 * rq->lock. They are however serialized by p->pi_lock.
>  	 */
> +	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
>  	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
>  	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
>  	void (*reweight_task)(struct rq *this_rq, struct task_struct *task,

So I *think* that I can handle all the current cases in
sched_class::{en,de}queue_task() if we add {EN,DE}QUEUE_CLASS flags.

Would that work for the BPF thing as well?

Something like the very much incomplete below... It would allow removing
all these switch{ed,ing}_{to,from}() things entirely, instead of
adding yet more.

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0935f9d4bb7b..da54c9f8f78d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6864,15 +6864,22 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);
 
-void __setscheduler_prio(struct task_struct *p, int prio)
+struct sched_class *__setscheduler_class(int prio)
 {
+	struct sched_class *class;
+
 	if (dl_prio(prio))
-		p->sched_class = &dl_sched_class;
+		class = &dl_sched_class;
 	else if (rt_prio(prio))
-		p->sched_class = &rt_sched_class;
+		class = &rt_sched_class;
 	else
-		p->sched_class = &fair_sched_class;
+		class = &fair_sched_class;
 
+	return class;
+}
+
+void __setscheduler_prio(struct task_struct *p, int prio)
+{
 	p->prio = prio;
 }
 
@@ -6919,7 +6926,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
 	int prio, oldprio, queued, running, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
-	const struct sched_class *prev_class;
+	const struct sched_class *prev_class, *class;
 	struct rq_flags rf;
 	struct rq *rq;
 
@@ -6977,6 +6984,10 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		queue_flag &= ~DEQUEUE_MOVE;
 
 	prev_class = p->sched_class;
+	class = __setscheduler_class(prio);
+	if (prev_class != class)
+		queue_flag |= DEQUEUE_CLASS;
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -7014,6 +7025,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 			p->rt.timeout = 0;
 	}
 
+	p->sched_class = class;
 	__setscheduler_prio(p, prio);
 
 	if (queued)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62fd8bc6fd08..a03995d81c75 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2251,6 +2251,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
 #define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */
 #define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_CLASS		0x200 /* Matches ENQUEUE_CLASS */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -2266,6 +2267,7 @@ extern const u32		sched_prio_to_wmult[40];
 #endif
 #define ENQUEUE_INITIAL		0x80
 #define ENQUEUE_MIGRATING	0x100
+#define ENQUEUE_CLASS		0x200
 
 #define RETRY_TASK		((void *)-1UL)
 
@@ -3603,6 +3605,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 
 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
+extern struct sched_class *__setscheduler_class(int prio);
 extern void __setscheduler_prio(struct task_struct *p, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index ae1b42775ef9..dc104d996204 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -612,7 +612,7 @@ int __sched_setscheduler(struct task_struct *p,
 {
 	int oldpolicy = -1, policy = attr->sched_policy;
 	int retval, oldprio, newprio, queued, running;
-	const struct sched_class *prev_class;
+	const struct sched_class *prev_class, *class;
 	struct balance_callback *head;
 	struct rq_flags rf;
 	int reset_on_fork;
@@ -783,6 +783,12 @@ int __sched_setscheduler(struct task_struct *p,
 			queue_flags &= ~DEQUEUE_MOVE;
 	}
 
+	class = prev_class = p->sched_class;
+	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS))
+		class = __setscheduler_class(newprio);
+	if (prev_class != class)
+		queue_flags |= DEQUEUE_CLASS;
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -790,10 +796,9 @@ int __sched_setscheduler(struct task_struct *p,
 	if (running)
 		put_prev_task(rq, p);
 
-	prev_class = p->sched_class;
-
 	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
 		__setscheduler_params(p, attr);
+		p->sched_class = class;
 		__setscheduler_prio(p, newprio);
 	}
 	__setscheduler_uclamp(p, attr);
Tejun Heo June 24, 2024, 10:18 p.m. UTC | #2
Hello, Peter.

On Mon, Jun 24, 2024 at 01:06:24PM +0200, Peter Zijlstra wrote:
...
> > +	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
> >  	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
> >  	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
> 
> So I *think* that I can handle all the current cases in
> sched_class::{en,de}queue_task() if we add {EN,DE}QUEUE_CLASS flags.
> 
> Would that work for the BPF thing as well?
>
> Something like the very much incomplete below... It would allow removing
> all these switch{ed,ing}_{to,from}() things entirely, instead of
> adding yet more.

Hmm... so, I tried to make it work for SCX but enqueue() and dequeue() are
only called if the task was queued at the time of sched_class change, right?
However, these callbacks expect to be called even when the task is not
currently queued. Maybe I'm misreading code but it looks like that'd break
other classes too. What am I missing?

Thanks.
Peter Zijlstra June 25, 2024, 8:16 a.m. UTC | #3
On Mon, Jun 24, 2024 at 12:18:07PM -1000, Tejun Heo wrote:
> Hello, Peter.
> 
> On Mon, Jun 24, 2024 at 01:06:24PM +0200, Peter Zijlstra wrote:
> ...
> > > +	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
> > >  	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
> > >  	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
> > 
> > So I *think* that I can handle all the current cases in
> > sched_class::{en,de}queue_task() if we add {EN,DE}QUEUE_CLASS flags.
> > 
> > Would that work for the BPF thing as well?
> >
> > Something like the very much incomplete below... It would allow removing
> > all these switch{ed,ing}_{to,from}() things entirely, instead of
> > adding yet more.
> 
> Hmm... so, I tried to make it work for SCX but enqueue() and dequeue() are
> only called if the task was queued at the time of sched_class change, right?
> However, these callbacks expect to be called even when the task is not
> currently queued. Maybe I'm misreading code but it looks like that'd break
> other classes too. What am I missing?

Ah,.. so I think the RT/DL ones can work (which is what I looked at),
they're only concerned with balancing tasks that are on the queue.
But yeah, I missed that the fair thing needs it regardless.

Bummer. I was hoping to reduce calls.
diff mbox series

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4b9cb2228b04..311efc00da63 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2214,6 +2214,17 @@  inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/*
+ * ->switching_to() is called with the pi_lock and rq_lock held and must not
+ * mess with locking.
+ */
+void check_class_changing(struct rq *rq, struct task_struct *p,
+			  const struct sched_class *prev_class)
+{
+	if (prev_class != p->sched_class && p->sched_class->switching_to)
+		p->sched_class->switching_to(rq, p);
+}
+
 /*
  * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
  * use the balance_callback list if you want balancing.
@@ -2221,9 +2232,9 @@  inline int task_curr(const struct task_struct *p)
  * this means any call to check_class_changed() must be followed by a call to
  * balance_callback().
  */
-static inline void check_class_changed(struct rq *rq, struct task_struct *p,
-				       const struct sched_class *prev_class,
-				       int oldprio)
+void check_class_changed(struct rq *rq, struct task_struct *p,
+			 const struct sched_class *prev_class,
+			 int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
@@ -7253,6 +7264,7 @@  void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 	}
 
 	__setscheduler_prio(p, prio);
+	check_class_changing(rq, p, prev_class);
 
 	if (queued)
 		enqueue_task(rq, p, queue_flag);
@@ -7898,6 +7910,7 @@  static int __sched_setscheduler(struct task_struct *p,
 		__setscheduler_prio(p, newprio);
 	}
 	__setscheduler_uclamp(p, attr);
+	check_class_changing(rq, p, prev_class);
 
 	if (queued) {
 		/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8e23f19e8096..99e292368d11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2301,6 +2301,7 @@  struct sched_class {
 	 * cannot assume the switched_from/switched_to pair is serialized by
 	 * rq->lock. They are however serialized by p->pi_lock.
 	 */
+	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
 	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
 	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
 	void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
@@ -2540,6 +2541,12 @@  static inline void sub_nr_running(struct rq *rq, unsigned count)
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
 extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
 
+extern void check_class_changing(struct rq *rq, struct task_struct *p,
+				 const struct sched_class *prev_class);
+extern void check_class_changed(struct rq *rq, struct task_struct *p,
+				const struct sched_class *prev_class,
+				int oldprio);
+
 extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
 
 #ifdef CONFIG_PREEMPT_RT