Message ID | 20181119141857.8625-10-quentin.perret@arm.com (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Series | Energy Aware Scheduling | expand |
On Mon, Nov 19, 2018 at 02:18:51PM +0000, Quentin Perret wrote: > +static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[]) > +{ > + /* > + * The conditions for EAS to start are checked during the creation of > + * root domains. If one of them meets all conditions, it will have a > + * non-null list of performance domains. > + */ > + while (ndoms_new) { > + if (cpu_rq(cpumask_first(doms_new[ndoms_new - 1]))->rd->pd) > + goto enable; > + ndoms_new--; > + } That seems quite ugly; can't you simply return a bool from build_perf_domains() ? Something like the below, but less fugly ;-) --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -299,7 +299,7 @@ static void destroy_perf_domain_rcu(stru #define EM_MAX_COMPLEXITY 2048 extern struct cpufreq_governor schedutil_gov; -static void build_perf_domains(const struct cpumask *cpu_map) +static bool build_perf_domains(const struct cpumask *cpu_map) { int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; @@ -365,7 +365,7 @@ static void build_perf_domains(const str if (tmp) call_rcu(&tmp->rcu, destroy_perf_domain_rcu); - return; + return !!pd; free: free_pd(pd); @@ -373,6 +373,8 @@ static void build_perf_domains(const str rcu_assign_pointer(rd->pd, NULL); if (tmp) call_rcu(&tmp->rcu, destroy_perf_domain_rcu); + + return false; } #else static void free_pd(struct perf_domain *pd) { } @@ -2173,6 +2175,9 @@ void partition_sched_domains(int ndoms_n } #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + bool has_eas = false; + /* Build perf. domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !sched_energy_update; j++) { @@ -2181,10 +2186,13 @@ void partition_sched_domains(int ndoms_n goto match3; } /* No match - add perf. domains for a new rd */ - build_perf_domains(doms_new[i]); + has_eas |= build_perf_domains(doms_new[i]); match3: ; } + + sched_energy_set(has_eas); + } #endif /* Remember the new sched domains: */
Hi Peter, On Wednesday 21 Nov 2018 at 14:08:22 (+0100), Peter Zijlstra wrote: > On Mon, Nov 19, 2018 at 02:18:51PM +0000, Quentin Perret wrote: > > > +static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[]) > > +{ > > + /* > > + * The conditions for EAS to start are checked during the creation of > > + * root domains. If one of them meets all conditions, it will have a > > + * non-null list of performance domains. > > + */ > > + while (ndoms_new) { > > + if (cpu_rq(cpumask_first(doms_new[ndoms_new - 1]))->rd->pd) > > + goto enable; > > + ndoms_new--; > > + } > > That seems quite ugly; can't you simply return a bool from > build_perf_domains() ? > > Something like the below, but less fugly ;-) > > --- a/kernel/sched/topology.c > +++ b/kernel/sched/topology.c > @@ -299,7 +299,7 @@ static void destroy_perf_domain_rcu(stru > #define EM_MAX_COMPLEXITY 2048 > > extern struct cpufreq_governor schedutil_gov; > -static void build_perf_domains(const struct cpumask *cpu_map) > +static bool build_perf_domains(const struct cpumask *cpu_map) > { > int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); > struct perf_domain *pd = NULL, *tmp; > @@ -365,7 +365,7 @@ static void build_perf_domains(const str > if (tmp) > call_rcu(&tmp->rcu, destroy_perf_domain_rcu); > > - return; > + return !!pd; > > free: > free_pd(pd); > @@ -373,6 +373,8 @@ static void build_perf_domains(const str > rcu_assign_pointer(rd->pd, NULL); > if (tmp) > call_rcu(&tmp->rcu, destroy_perf_domain_rcu); > + > + return false; > } > #else > static void free_pd(struct perf_domain *pd) { } > @@ -2173,6 +2175,9 @@ void partition_sched_domains(int ndoms_n > } > > #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) > + { > + bool has_eas = false; That one wants to be declared '__maybe_unused' at the top of partition_sched_domains() perhaps ? Just to avoid the { } section. Hopefully the compiler will be smart enough to not use stack space for it if not needed. 
> + > /* Build perf. domains: */ > for (i = 0; i < ndoms_new; i++) { > for (j = 0; j < n && !sched_energy_update; j++) { > @@ -2181,10 +2186,13 @@ void partition_sched_domains(int ndoms_n > goto match3; And we want to do 'has_eas = true' just before 'goto match3' here. Otherwise we'll end up disabling EAS if we hit 'goto match3' for all root domains. > } > /* No match - add perf. domains for a new rd */ > - build_perf_domains(doms_new[i]); > + has_eas |= build_perf_domains(doms_new[i]); > match3: > ; > } > + > + sched_energy_set(has_eas); > + } > #endif > > /* Remember the new sched domains: */ Other than that I guess that should work OK :-) Thanks, Quentin
On Wednesday 21 Nov 2018 at 15:14:45 (+0000), Quentin Perret wrote: > Hi Peter, > > On Wednesday 21 Nov 2018 at 14:08:22 (+0100), Peter Zijlstra wrote: > > On Mon, Nov 19, 2018 at 02:18:51PM +0000, Quentin Perret wrote: > > > > > +static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[]) > > > +{ > > > + /* > > > + * The conditions for EAS to start are checked during the creation of > > > + * root domains. If one of them meets all conditions, it will have a > > > + * non-null list of performance domains. > > > + */ > > > + while (ndoms_new) { > > > + if (cpu_rq(cpumask_first(doms_new[ndoms_new - 1]))->rd->pd) > > > + goto enable; > > > + ndoms_new--; > > > + } > > > > That seems quite ugly; can't you simply return a bool from > > build_perf_domains() ? > > > > Something like the below, but less fugly ;-) > > > > --- a/kernel/sched/topology.c > > +++ b/kernel/sched/topology.c > > @@ -299,7 +299,7 @@ static void destroy_perf_domain_rcu(stru > > #define EM_MAX_COMPLEXITY 2048 > > > > extern struct cpufreq_governor schedutil_gov; > > -static void build_perf_domains(const struct cpumask *cpu_map) > > +static bool build_perf_domains(const struct cpumask *cpu_map) > > { > > int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); > > struct perf_domain *pd = NULL, *tmp; > > @@ -365,7 +365,7 @@ static void build_perf_domains(const str > > if (tmp) > > call_rcu(&tmp->rcu, destroy_perf_domain_rcu); > > > > - return; > > + return !!pd; > > > > free: > > free_pd(pd); > > @@ -373,6 +373,8 @@ static void build_perf_domains(const str > > rcu_assign_pointer(rd->pd, NULL); > > if (tmp) > > call_rcu(&tmp->rcu, destroy_perf_domain_rcu); > > + > > + return false; > > } > > #else > > static void free_pd(struct perf_domain *pd) { } > > @@ -2173,6 +2175,9 @@ void partition_sched_domains(int ndoms_n > > } > > > > #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) > > + { > > + bool has_eas = false; > > That one wants to be declared 
'__maybe_unused' at the top of > partition_sched_domains() perhaps ? Just to avoid the { } section. > Hopefully the compiler will be smart enough to not use stack space for > it if not needed. > > > + > > /* Build perf. domains: */ > > for (i = 0; i < ndoms_new; i++) { > > for (j = 0; j < n && !sched_energy_update; j++) { > > @@ -2181,10 +2186,13 @@ void partition_sched_domains(int ndoms_n > > goto match3; > > And we want to do 'has_eas = true' just before 'goto match3' here. > Otherwise we'll end up disabling EAS if we hit 'goto match3' for all > root domains. > > > } > > /* No match - add perf. domains for a new rd */ > > - build_perf_domains(doms_new[i]); > > + has_eas |= build_perf_domains(doms_new[i]); > > match3: > > ; > > } > > + > > + sched_energy_set(has_eas); > > + } > > #endif > > > > /* Remember the new sched domains: */ > > Other than that I guess that should work OK :-) So I actually just came across your patch that does static_branch_{inc,dec} on sched_smt_present. Maybe I could do something similar here ? How about something like the below (totally untested): diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8f9dfea60344..10ac32a0dc2e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -298,6 +298,7 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) pd = container_of(rp, struct perf_domain, rcu); free_pd(pd); + static_branch_dec_cpuslocked(&sched_energy_present); } static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[]) @@ -421,6 +422,8 @@ static void build_perf_domains(const struct cpumask *cpu_map) /* Attach the new list of performance domains to the root domain. 
*/ tmp = rd->pd; rcu_assign_pointer(rd->pd, pd); + static_branch_inc_cpuslocked(&sched_energy_present); + if (tmp) call_rcu(&tmp->rcu, destroy_perf_domain_rcu); @@ -2246,7 +2249,6 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], match3: ; } - sched_energy_start(ndoms_new, doms_new); #endif /* Remember the new sched domains: */
On Thursday 22 Nov 2018 at 09:17:16 (+0000), Quentin Perret wrote: > On Wednesday 21 Nov 2018 at 15:14:45 (+0000), Quentin Perret wrote: > > Hi Peter, > > > > On Wednesday 21 Nov 2018 at 14:08:22 (+0100), Peter Zijlstra wrote: > > > On Mon, Nov 19, 2018 at 02:18:51PM +0000, Quentin Perret wrote: > > > > > > > +static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[]) > > > > +{ > > > > + /* > > > > + * The conditions for EAS to start are checked during the creation of > > > > + * root domains. If one of them meets all conditions, it will have a > > > > + * non-null list of performance domains. > > > > + */ > > > > + while (ndoms_new) { > > > > + if (cpu_rq(cpumask_first(doms_new[ndoms_new - 1]))->rd->pd) > > > > + goto enable; > > > > + ndoms_new--; > > > > + } > > > > > > That seems quite ugly; can't you simply return a bool from > > > build_perf_domains() ? > > > > > > Something like the below, but less fugly ;-) > > > > > > --- a/kernel/sched/topology.c > > > +++ b/kernel/sched/topology.c > > > @@ -299,7 +299,7 @@ static void destroy_perf_domain_rcu(stru > > > #define EM_MAX_COMPLEXITY 2048 > > > > > > extern struct cpufreq_governor schedutil_gov; > > > -static void build_perf_domains(const struct cpumask *cpu_map) > > > +static bool build_perf_domains(const struct cpumask *cpu_map) > > > { > > > int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); > > > struct perf_domain *pd = NULL, *tmp; > > > @@ -365,7 +365,7 @@ static void build_perf_domains(const str > > > if (tmp) > > > call_rcu(&tmp->rcu, destroy_perf_domain_rcu); > > > > > > - return; > > > + return !!pd; > > > > > > free: > > > free_pd(pd); > > > @@ -373,6 +373,8 @@ static void build_perf_domains(const str > > > rcu_assign_pointer(rd->pd, NULL); > > > if (tmp) > > > call_rcu(&tmp->rcu, destroy_perf_domain_rcu); > > > + > > > + return false; > > > } > > > #else > > > static void free_pd(struct perf_domain *pd) { } > > > @@ -2173,6 +2175,9 @@ void partition_sched_domains(int 
ndoms_n > > > } > > > > > > #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) > > > + { > > > + bool has_eas = false; > > > > That one wants to be declared '__maybe_unused' at the top of > > partition_sched_domains() perhaps ? Just to avoid the { } section. > > Hopefully the compiler will be smart enough to not use stack space for > > it if not needed. > > > > > + > > > /* Build perf. domains: */ > > > for (i = 0; i < ndoms_new; i++) { > > > for (j = 0; j < n && !sched_energy_update; j++) { > > > @@ -2181,10 +2186,13 @@ void partition_sched_domains(int ndoms_n > > > goto match3; > > > > And we want to do 'has_eas = true' just before 'goto match3' here. > > Otherwise we'll end up disabling EAS if we hit 'goto match3' for all > > root domains. > > > > > } > > > /* No match - add perf. domains for a new rd */ > > > - build_perf_domains(doms_new[i]); > > > + has_eas |= build_perf_domains(doms_new[i]); > > > match3: > > > ; > > > } > > > + > > > + sched_energy_set(has_eas); > > > + } > > > #endif > > > > > > /* Remember the new sched domains: */ > > > > Other than that I guess that should work OK :-) > > So I actually just came across your patch that does > static_branch_{inc,dec} on sched_smt_present. Maybe I could do something > similar here ? > > How about something like the below (totally untested): > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > index 8f9dfea60344..10ac32a0dc2e 100644 > --- a/kernel/sched/topology.c > +++ b/kernel/sched/topology.c > @@ -298,6 +298,7 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) > > pd = container_of(rp, struct perf_domain, rcu); > free_pd(pd); > + static_branch_dec_cpuslocked(&sched_energy_present); > } > > static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[]) > @@ -421,6 +422,8 @@ static void build_perf_domains(const struct cpumask *cpu_map) > /* Attach the new list of performance domains to the root domain. 
*/ > tmp = rd->pd; > rcu_assign_pointer(rd->pd, pd); > + static_branch_inc_cpuslocked(&sched_energy_present); > + > if (tmp) > call_rcu(&tmp->rcu, destroy_perf_domain_rcu); > > @@ -2246,7 +2249,6 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], > match3: > ; > } > - sched_energy_start(ndoms_new, doms_new); > #endif > > /* Remember the new sched domains: */ > Hmm, I went too fast, that's totally broken. But there's still something we can do with static_branch_{inc,dec} I think. I'll come back later with a better solution. Sorry for the noise :/
On Thu, Nov 22, 2018 at 09:32:39AM +0000, Quentin Perret wrote: > Hmm, I went too fast, that's totally broken. But there's still something > we can do with static_branch_{inc,dec} I think. I'll come back later > with a better solution. Right; if you count the rd's that have pd set, it should work-ish. Yes, much cleaner if you can get it to work.
On Thursday 22 Nov 2018 at 11:25:45 (+0100), Peter Zijlstra wrote: > On Thu, Nov 22, 2018 at 09:32:39AM +0000, Quentin Perret wrote: > > Hmm, I went too fast, that's totally broken. But there's still something > > we can do with static_branch_{inc,dec} I think. I'll come back later > > with a better solution. > > Right; if you count the rd's that have pd set, it should work-ish. Yes, > much cleaner if you can get it to work. So, I came up with the following code which seems to work OK. It's not as clean as I'd like, though. The fact that free_pd() can be called in softirq context is annoying to manipulate the static key ... An alternative to this work item workaround is to do static_branch_dec() from build_perf_domains() and next to the three call sites of free_rootdomain() in order to avoid the call_rcu() context. Not very pretty either. Or we can just stick with your original suggestion to carry a boolean around. Any preference ? Thanks, Quentin --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -232,7 +232,7 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, } #endif -static void free_pd(struct perf_domain *pd) +static void _free_pd(struct perf_domain *pd) { struct perf_domain *tmp; @@ -243,6 +243,24 @@ static void free_pd(struct perf_domain *pd) } } +static void dec_sched_energy_workfn(struct work_struct *work) +{ + static_branch_dec(&sched_energy_present); +} +static DECLARE_WORK(dec_sched_energy_work, dec_sched_energy_workfn); + +static void free_pd(struct perf_domain *pd) +{ + if (pd) { + /* + * XXX: The static key can't be modified from SOFTIRQ context, + * so do it using a work item. + */ + schedule_work(&dec_sched_energy_work); + _free_pd(pd); + } +} + static struct perf_domain *find_pd(struct perf_domain *pd, int cpu) { while (pd) { @@ -421,13 +410,14 @@ static void build_perf_domains(const struct cpumask *cpu_map) /* Attach the new list of performance domains to the root domain. 
*/ tmp = rd->pd; rcu_assign_pointer(rd->pd, pd); + static_branch_inc_cpuslocked(&sched_energy_present); if (tmp) call_rcu(&tmp->rcu, destroy_perf_domain_rcu); return; free: - free_pd(pd); + _free_pd(pd); tmp = rd->pd; rcu_assign_pointer(rd->pd, NULL); if (tmp) @@ -2246,7 +2236,6 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], match3: ; } - sched_energy_start(ndoms_new, doms_new); #endif /* Remember the new sched domains: */
On Thu, Nov 22, 2018 at 4:25 PM Quentin Perret <quentin.perret@arm.com> wrote: > > On Thursday 22 Nov 2018 at 11:25:45 (+0100), Peter Zijlstra wrote: > > On Thu, Nov 22, 2018 at 09:32:39AM +0000, Quentin Perret wrote: > > > Hmm, I went too fast, that's totally broken. But there's still something > > > we can do with static_branch_{inc,dec} I think. I'll come back later > > > with a better solution. > > > > Right; if you count the rd's that have pd set, it should work-ish. Yes, > > much cleaner if you can get it to work. > > So, I came up with the following code which seems to work OK. It's not > as I clean as I'd like, though. The fact that free_pd() can be called in > softirq context is annoying to manipulate the static key ... > > An alternative to this work item workaround is to do static_branch_dec() > from build_perf_domains() and next to the three call sites of > free_rootdomain() in order to avoid the call_rcu() context. Not very > pretty either. > > Or we can just stick with your original suggestion to carry a boolean > around. What's problematic with carrying a boolean around?
On Thursday 22 Nov 2018 at 16:51:41 (+0100), Rafael J. Wysocki wrote: > On Thu, Nov 22, 2018 at 4:25 PM Quentin Perret <quentin.perret@arm.com> wrote: > > > > On Thursday 22 Nov 2018 at 11:25:45 (+0100), Peter Zijlstra wrote: > > > On Thu, Nov 22, 2018 at 09:32:39AM +0000, Quentin Perret wrote: > > > > Hmm, I went too fast, that's totally broken. But there's still something > > > > we can do with static_branch_{inc,dec} I think. I'll come back later > > > > with a better solution. > > > > > > Right; if you count the rd's that have pd set, it should work-ish. Yes, > > > much cleaner if you can get it to work. > > > > So, I came up with the following code which seems to work OK. It's not > > as I clean as I'd like, though. The fact that free_pd() can be called in > > softirq context is annoying to manipulate the static key ... > > > > An alternative to this work item workaround is to do static_branch_dec() > > from build_perf_domains() and next to the three call sites of > > free_rootdomain() in order to avoid the call_rcu() context. Not very > > pretty either. > > > > Or we can just stick with your original suggestion to carry a boolean > > around. > > What's problematic with carrying a boolean around? Nothing, I was just trying to see if I could find a more elegant way to increment/decrement the static key as we create/destroy the perf domains. I would not describe the work item thing I came up with as particularly elegant, though :-) So, all in all, the boolean is fine by me if we all agree it is nicer. Thanks, Quentin
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0278b251c379..5b636a8aa900 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2298,3 +2298,7 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #else #define perf_domain_span(pd) NULL #endif + +#ifdef CONFIG_SMP +extern struct static_key_false sched_energy_present; +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 70492d4bfed6..1457ef80ee94 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,6 +201,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +DEFINE_STATIC_KEY_FALSE(sched_energy_present); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; @@ -273,6 +274,35 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) free_pd(pd); } +static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[]) +{ + /* + * The conditions for EAS to start are checked during the creation of + * root domains. If one of them meets all conditions, it will have a + * non-null list of performance domains. + */ + while (ndoms_new) { + if (cpu_rq(cpumask_first(doms_new[ndoms_new - 1]))->rd->pd) + goto enable; + ndoms_new--; + } + + if (static_branch_unlikely(&sched_energy_present)) { + if (sched_debug()) + pr_info("%s: stopping EAS\n", __func__); + static_branch_disable_cpuslocked(&sched_energy_present); + } + + return; + +enable: + if (!static_branch_unlikely(&sched_energy_present)) { + if (sched_debug()) + pr_info("%s: starting EAS\n", __func__); + static_branch_enable_cpuslocked(&sched_energy_present); + } +} + /* * EAS can be used on a root domain if it meets all the following conditions: * 1. 
an Energy Model (EM) is available; @@ -2187,6 +2217,7 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], match3: ; } + sched_energy_start(ndoms_new, doms_new); #endif /* Remember the new sched domains: */
In order to make sure Energy Aware Scheduling (EAS) will not impact systems where no Energy Model is available, introduce a static key guarding the access to EAS code. Since EAS is enabled on a per-root-domain basis, the static key is enabled when at least one root domain meets all conditions for EAS. Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Quentin Perret <quentin.perret@arm.com> --- kernel/sched/sched.h | 4 ++++ kernel/sched/topology.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+)