Message ID: 20240501151312.635565-6-tj@kernel.org (mailing list archive)
State:      Not Applicable
Series:     [01/39] cgroup: Implement cgroup_show_cftypes()
On Wed, May 01, 2024 at 05:09:40AM -1000, Tejun Heo wrote:
> When a task switches to a new sched_class, the prev and new classes are
> notified through ->switched_from() and ->switched_to(), respectively, after
> the switching is done.
>
> A new BPF extensible sched_class will have callbacks that allow the BPF
> scheduler to keep track of relevant task states (like priority and cpumask).
> Those callbacks aren't called while a task is on a different sched_class.
> When a task comes back, we wanna tell the BPF progs the up-to-date state
> before the task gets enqueued, so we need a hook which is called before the
> switching is committed.
>
> This patch adds ->switching_to() which is called during sched_class switch
> through check_class_changing() before the task is restored. Also, this patch
> exposes check_class_changing/changed() in kernel/sched/sched.h. They will be
> used by the new BPF extensible sched_class to implement implicit sched_class
> switching which is used e.g. when falling back to CFS when the BPF scheduler
> fails or unloads.
>
> This is a prep patch and doesn't cause any behavior changes. The new
> operation and exposed functions aren't used yet.
>
> v2: Improve patch description w/ details on planned use.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reviewed-by: David Vernet <dvernet@meta.com>
> Acked-by: Josh Don <joshdon@google.com>
> Acked-by: Hao Luo <haoluo@google.com>
> Acked-by: Barret Rhoden <brho@google.com>

> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 8e23f19e8096..99e292368d11 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2301,6 +2301,7 @@ struct sched_class {
>  	 * cannot assume the switched_from/switched_to pair is serialized by
>  	 * rq->lock. They are however serialized by p->pi_lock.
>  	 */
> +	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
>  	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
>  	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
>  	void (*reweight_task)(struct rq *this_rq, struct task_struct *task,

So I *think* that I can handle all the current cases in
sched_class::{en,de}queue_task() if we add {EN,DE}QUEUE_CLASS flags.

Would that work for the BPF thing as well?

Something like the very much incomplete below... It would allow removing
all these switch{ed,ing}_{to,from}() things entirely, instead of adding
yet more.
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0935f9d4bb7b..da54c9f8f78d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6864,15 +6864,22 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);
 
-void __setscheduler_prio(struct task_struct *p, int prio)
+struct sched_class *__setscheduler_class(int prio)
 {
+	struct sched_class *class;
+
 	if (dl_prio(prio))
-		p->sched_class = &dl_sched_class;
+		class = &dl_sched_class;
 	else if (rt_prio(prio))
-		p->sched_class = &rt_sched_class;
+		class = &rt_sched_class;
 	else
-		p->sched_class = &fair_sched_class;
+		class = &fair_sched_class;
+	return class;
+}
+
+void __setscheduler_prio(struct task_struct *p, int prio)
+{
 	p->prio = prio;
 }
 
@@ -6919,7 +6926,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
 	int prio, oldprio, queued, running, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
-	const struct sched_class *prev_class;
+	const struct sched_class *prev_class, *class;
 	struct rq_flags rf;
 	struct rq *rq;
 
@@ -6977,6 +6984,10 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		queue_flag &= ~DEQUEUE_MOVE;
 
 	prev_class = p->sched_class;
+	class = __setscheduler_class(prio);
+	if (prev_class != class)
+		queue_flags |= DEQUEUE_CLASS;
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -7014,6 +7025,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		p->rt.timeout = 0;
 	}
 
+	p->class = class;
 	__setscheduler_prio(p, prio);
 
 	if (queued)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62fd8bc6fd08..a03995d81c75 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2251,6 +2251,7 @@ extern const u32 sched_prio_to_wmult[40];
 #define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
 #define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */
 #define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_CLASS		0x200 /* Matches ENQUEUE_CLASS */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -2266,6 +2267,7 @@ extern const u32 sched_prio_to_wmult[40];
 #endif
 #define ENQUEUE_INITIAL		0x80
 #define ENQUEUE_MIGRATING	0x100
+#define ENQUEUE_CLASS		0x200
 
 #define RETRY_TASK		((void *)-1UL)
 
@@ -3603,6 +3605,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 
 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
+extern struct sched_class *__setscheduler_class(int prio);
 extern void __setscheduler_prio(struct task_struct *p, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index ae1b42775ef9..dc104d996204 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -612,7 +612,7 @@ int __sched_setscheduler(struct task_struct *p,
 {
 	int oldpolicy = -1, policy = attr->sched_policy;
 	int retval, oldprio, newprio, queued, running;
-	const struct sched_class *prev_class;
+	const struct sched_class *prev_class, *class;
 	struct balance_callback *head;
 	struct rq_flags rf;
 	int reset_on_fork;
@@ -783,6 +783,12 @@ int __sched_setscheduler(struct task_struct *p,
 		queue_flags &= ~DEQUEUE_MOVE;
 	}
 
+	class = prev_class = p->sched_class;
+	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS))
+		class = __setscheduler_class(newprio);
+	if (prev_class != class)
+		queue_flags |= DEQUEUE_CLASS;
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -790,10 +796,9 @@ int __sched_setscheduler(struct task_struct *p,
 	if (running)
 		put_prev_task(rq, p);
 
-	prev_class = p->sched_class;
-
 	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
 		__setscheduler_params(p, attr);
+		p->class = class;
 		__setscheduler_prio(p, newprio);
 	}
 	__setscheduler_uclamp(p, attr);
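
[To make the idea concrete: under this proposal, a class that today relies on
->switched_to() could fold that work into its enqueue path, keyed off the new
flag. A minimal sketch of the pattern; enqueue_task_example() and
example_attach_task() are invented names, not part of the posted diff:]

/*
 * Hypothetical sketch of the EN/DEQUEUE_CLASS scheme: the "task joined
 * this class" work moves from ->switched_to() into enqueue_task().
 */
static void enqueue_task_example(struct rq *rq, struct task_struct *p, int flags)
{
	if (flags & ENQUEUE_CLASS) {
		/* p is entering this class as part of this enqueue */
		example_attach_task(rq, p);
	}

	/* ... the class's normal enqueue work follows ... */
}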
Hello, Peter.

On Mon, Jun 24, 2024 at 01:06:24PM +0200, Peter Zijlstra wrote:
...
> > +	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
> >  	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
> >  	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
>
> So I *think* that I can handle all the current cases in
> sched_class::{en,de}queue_task() if we add {EN,DE}QUEUE_CLASS flags.
>
> Would that work for the BPF thing as well?
>
> Something like the very much incomplete below... It would allow removing
> all these switch{ed,ing}_{to,from}() things entirely, instead of adding
> yet more.

Hmm... so, I tried to make it work for SCX, but enqueue() and dequeue() are
only called if the task was queued at the time of the sched_class change,
right? However, these callbacks expect to be called even when the task is
not currently queued. Maybe I'm misreading the code, but it looks like
that'd break other classes too. What am I missing?

Thanks.
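
[The guard Tejun is describing is visible at both quoted call sites;
schematically (simplified from the diffs above, not literal kernel code):]

	queued = task_on_rq_queued(p);
	if (queued)
		dequeue_task(rq, p, queue_flags);	/* skipped for sleeping tasks */

	p->sched_class = new_class;			/* happens regardless */

	if (queued)
		enqueue_task(rq, p, queue_flags);	/* skipped for sleeping tasks */

[So a task that is asleep during the class change never passes through either
hook, while ->switched_{from,to}() fire unconditionally.]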
On Mon, Jun 24, 2024 at 12:18:07PM -1000, Tejun Heo wrote:
> Hello, Peter.
>
> On Mon, Jun 24, 2024 at 01:06:24PM +0200, Peter Zijlstra wrote:
> ...
> > > +	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
> > >  	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
> > >  	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
> >
> > So I *think* that I can handle all the current cases in
> > sched_class::{en,de}queue_task() if we add {EN,DE}QUEUE_CLASS flags.
> >
> > Would that work for the BPF thing as well?
> >
> > Something like the very much incomplete below... It would allow removing
> > all these switch{ed,ing}_{to,from}() things entirely, instead of adding
> > yet more.
>
> Hmm... so, I tried to make it work for SCX, but enqueue() and dequeue() are
> only called if the task was queued at the time of the sched_class change,
> right? However, these callbacks expect to be called even when the task is
> not currently queued. Maybe I'm misreading the code, but it looks like
> that'd break other classes too. What am I missing?

Ah,.. so I think the RT/DL ones can work (which is what I looked at),
they're only concerned with balancing tasks that are on the queue. But
yeah, I missed that the fair thing needs it regardless. Bummer. I was
hoping to reduce calls.
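
[For context, the fair-class callbacks Peter is conceding on look roughly
like this; paraphrased and simplified from kernel/sched/fair.c of that era,
not an exact quote:]

static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	/* Unwind cfs_rq accounting even if p is not currently queued. */
	detach_task_cfs_rq(p);
}

static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
	/* Re-establish cfs_rq accounting; also runs for sleeping tasks. */
	attach_task_cfs_rq(p);

	/* A preemption check follows only when p is actually queued. */
}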
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4b9cb2228b04..311efc00da63 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2214,6 +2214,17 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/*
+ * ->switching_to() is called with the pi_lock and rq_lock held and must not
+ * mess with locking.
+ */
+void check_class_changing(struct rq *rq, struct task_struct *p,
+			  const struct sched_class *prev_class)
+{
+	if (prev_class != p->sched_class && p->sched_class->switching_to)
+		p->sched_class->switching_to(rq, p);
+}
+
 /*
  * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
  * use the balance_callback list if you want balancing.
@@ -2221,9 +2232,9 @@ inline int task_curr(const struct task_struct *p)
  * this means any call to check_class_changed() must be followed by a call to
  * balance_callback().
  */
-static inline void check_class_changed(struct rq *rq, struct task_struct *p,
-				       const struct sched_class *prev_class,
-				       int oldprio)
+void check_class_changed(struct rq *rq, struct task_struct *p,
+			 const struct sched_class *prev_class,
+			 int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
@@ -7253,6 +7264,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 	}
 
 	__setscheduler_prio(p, prio);
+	check_class_changing(rq, p, prev_class);
 
 	if (queued)
 		enqueue_task(rq, p, queue_flag);
@@ -7898,6 +7910,7 @@ static int __sched_setscheduler(struct task_struct *p,
 		__setscheduler_prio(p, newprio);
 	}
 	__setscheduler_uclamp(p, attr);
+	check_class_changing(rq, p, prev_class);
 
 	if (queued) {
 		/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8e23f19e8096..99e292368d11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2301,6 +2301,7 @@ struct sched_class {
 	 * cannot assume the switched_from/switched_to pair is serialized by
 	 * rq->lock. They are however serialized by p->pi_lock.
 	 */
+	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
 	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
 	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
 	void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
@@ -2540,6 +2541,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
 extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
 
+extern void check_class_changing(struct rq *rq, struct task_struct *p,
+				 const struct sched_class *prev_class);
+extern void check_class_changed(struct rq *rq, struct task_struct *p,
+				const struct sched_class *prev_class,
+				int oldprio);
+
 extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
 
 #ifdef CONFIG_PREEMPT_RT
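
[As a rough sketch of the planned consumer: the BPF extensible class could
implement the new hook along these lines. This is hypothetical; names such
as SCX_HAS_OP/SCX_CALL_OP come from the sched_ext series and the details may
differ from what was eventually merged:]

static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
	/*
	 * set_cpus_allowed() is not routed to this class while p is on a
	 * different sched_class, so refresh the BPF scheduler's view of
	 * the cpumask before the switch is committed and p is re-enqueued.
	 */
	if (SCX_HAS_OP(set_cpumask))
		SCX_CALL_OP(SCX_KF_REST, set_cpumask, p,
			    (struct cpumask *)p->cpus_ptr);
}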