Message ID | 20231012121707.51368-3-yangyicong@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | sched/fair: Scan cluster before scanning LLC in wake-up path | expand |
On Thu, 12 Oct 2023 at 14:19, Yicong Yang <yangyicong@huawei.com> wrote: > > From: Barry Song <song.bao.hua@hisilicon.com> > > For platforms having clusters like Kunpeng920, CPUs within the same cluster > have lower latency when synchronizing and accessing shared resources like > cache. Thus, this patch tries to find an idle cpu within the cluster of the > target CPU before scanning the whole LLC to gain lower latency. This > will be implemented in 2 steps in select_idle_sibling(): > 1. When the prev_cpu/recent_used_cpu are good wakeup candidates, use them > if they're sharing cluster with the target CPU. Otherwise trying to > scan for an idle CPU in the target's cluster. > 2. Scanning the cluster prior to the LLC of the target CPU for an > idle CPU to wakeup. > > Testing has been done on Kunpeng920 by pinning tasks to one numa and two > numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs. > > With this patch, We noticed enhancement on tbench and netperf within one > numa or cross two numa on top of tip-sched-core commit > 9b46f1abc6d4 ("sched/debug: Print 'tgid' in sched_show_task()") > > tbench results (node 0): > baseline patched > 1: 327.2833 372.4623 ( 13.80%) > 4: 1320.5933 1479.8833 ( 12.06%) > 8: 2638.4867 2921.5267 ( 10.73%) > 16: 5282.7133 5891.5633 ( 11.53%) > 32: 9810.6733 9877.3400 ( 0.68%) > 64: 7408.9367 7447.9900 ( 0.53%) > 128: 6203.2600 6191.6500 ( -0.19%) > tbench results (node 0-1): > baseline patched > 1: 332.0433 372.7223 ( 12.25%) > 4: 1325.4667 1477.6733 ( 11.48%) > 8: 2622.9433 2897.9967 ( 10.49%) > 16: 5218.6100 5878.2967 ( 12.64%) > 32: 10211.7000 11494.4000 ( 12.56%) > 64: 13313.7333 16740.0333 ( 25.74%) > 128: 13959.1000 14533.9000 ( 4.12%) > > netperf results TCP_RR (node 0): > baseline patched > 1: 76546.5033 90649.9867 ( 18.42%) > 4: 77292.4450 90932.7175 ( 17.65%) > 8: 77367.7254 90882.3467 ( 17.47%) > 16: 78519.9048 90938.8344 ( 15.82%) > 32: 72169.5035 72851.6730 ( 0.95%) > 64: 25911.2457 25882.2315 ( -0.11%) > 128: 10752.6572 10768.6038 ( 0.15%) > > netperf results TCP_RR (node 0-1): > baseline patched > 1: 76857.6667 90892.2767 ( 18.26%) > 4: 78236.6475 90767.3017 ( 16.02%) > 8: 77929.6096 90684.1633 ( 16.37%) > 16: 77438.5873 90502.5787 ( 16.87%) > 32: 74205.6635 88301.5612 ( 19.00%) > 64: 69827.8535 71787.6706 ( 2.81%) > 128: 25281.4366 25771.3023 ( 1.94%) > > netperf results UDP_RR (node 0): > baseline patched > 1: 96869.8400 110800.8467 ( 14.38%) > 4: 97744.9750 109680.5425 ( 12.21%) > 8: 98783.9863 110409.9637 ( 11.77%) > 16: 99575.0235 110636.2435 ( 11.11%) > 32: 95044.7250 97622.8887 ( 2.71%) > 64: 32925.2146 32644.4991 ( -0.85%) > 128: 12859.2343 12824.0051 ( -0.27%) > > netperf results UDP_RR (node 0-1): > baseline patched > 1: 97202.4733 110190.1200 ( 13.36%) > 4: 95954.0558 106245.7258 ( 10.73%) > 8: 96277.1958 105206.5304 ( 9.27%) > 16: 97692.7810 107927.2125 ( 10.48%) > 32: 79999.6702 103550.2999 ( 29.44%) > 64: 80592.7413 87284.0856 ( 8.30%) > 128: 27701.5770 29914.5820 ( 7.99%) > > Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch > in the code has not been tested but it supposed to work. > > Chen Yu also noticed this will improve the performance of tbench and > netperf on a 24 CPUs Jacobsville machine, there are 4 CPUs in one > cluster sharing L2 Cache. > > Suggested-by: Peter Zijlstra <peterz@infradead.org> > [https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net] > Tested-by: Yicong Yang <yangyicong@hisilicon.com> > Signed-off-by: Barry Song <song.bao.hua@hisilicon.com> > Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> > Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com> > Reviewed-by: Chen Yu <yu.c.chen@intel.com> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org> > --- > kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++++---- > kernel/sched/sched.h | 1 + > kernel/sched/topology.c | 12 ++++++++++++ > 3 files changed, 50 insertions(+), 4 deletions(-) > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 666ece65987f..4039f9b348ec 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -7259,6 +7259,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > } > } > > + if (static_branch_unlikely(&sched_cluster_active)) { > + struct sched_group *sg = sd->groups; > + > + if (sg->flags & SD_CLUSTER) { > + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { > + if (!cpumask_test_cpu(cpu, cpus)) > + continue; > + > + if (has_idle_core) { > + i = select_idle_core(p, cpu, cpus, &idle_cpu); > + if ((unsigned int)i < nr_cpumask_bits) > + return i; > + } else { > + if (--nr <= 0) > + return -1; > + idle_cpu = __select_idle_cpu(cpu, p); > + if ((unsigned int)idle_cpu < nr_cpumask_bits) > + return idle_cpu; > + } > + } > + cpumask_andnot(cpus, cpus, sched_group_span(sg)); > + } > + } > + > for_each_cpu_wrap(cpu, cpus, target + 1) { > if (has_idle_core) { > i = select_idle_core(p, cpu, cpus, &idle_cpu); > @@ -7266,7 +7290,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > return i; > > } else { > - if (!--nr) > + if (--nr <= 0) > return -1; > idle_cpu = __select_idle_cpu(cpu, p); > if ((unsigned int)idle_cpu < nr_cpumask_bits) > @@ -7395,8 +7419,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) > */ > if (prev != target && cpus_share_cache(prev, target) && > (available_idle_cpu(prev) || sched_idle_cpu(prev)) && > - asym_fits_cpu(task_util, util_min, util_max, prev)) > - return prev; > + asym_fits_cpu(task_util, util_min, util_max, prev)) { > + if (!static_branch_unlikely(&sched_cluster_active)) > + return prev; > + > + if (cpus_share_resources(prev, target)) > + return prev; > + } > > /* > * Allow a per-cpu kthread to stack with the wakee if the > @@ -7423,7 +7452,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) > (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && > cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && > asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { > - return recent_used_cpu; > + if (!static_branch_unlikely(&sched_cluster_active)) > + return recent_used_cpu; > + > + if (cpus_share_resources(recent_used_cpu, target)) > + return recent_used_cpu; > } > > /* > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index 998f03d02de0..ef4fe7bcf740 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -1859,6 +1859,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); > DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); > DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); > extern struct static_key_false sched_asym_cpucapacity; > +extern struct static_key_false sched_cluster_active; > > static __always_inline bool sched_asym_cpucap_active(void) > { > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > index 48cd88350d18..925cd68abc8b 100644 > --- a/kernel/sched/topology.c > +++ b/kernel/sched/topology.c > @@ -673,7 +673,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); > + > DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); > +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); > > static void update_top_cache_domain(int cpu) > { > @@ -2386,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att > struct rq *rq = NULL; > int i, ret = -ENOMEM; > bool has_asym = false; > + bool has_cluster = false; > > if (WARN_ON(cpumask_empty(cpu_map))) > goto error; > @@ -2514,12 +2517,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att > WRITE_ONCE(d.rd->max_cpu_capacity, capacity); > > cpu_attach_domain(sd, d.rd, i); > + > + if (lowest_flag_domain(i, SD_CLUSTER)) > + has_cluster = true; > } > rcu_read_unlock(); > > if (has_asym) > static_branch_inc_cpuslocked(&sched_asym_cpucapacity); > > + if (has_cluster) > + static_branch_inc_cpuslocked(&sched_cluster_active); > + > if (rq && sched_debug_verbose) { > pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", > cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); > @@ -2619,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) > if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) > static_branch_dec_cpuslocked(&sched_asym_cpucapacity); > > + if (static_branch_unlikely(&sched_cluster_active)) > + static_branch_dec_cpuslocked(&sched_cluster_active); > + > rcu_read_lock(); > for_each_cpu(i, cpu_map) > cpu_attach_domain(NULL, &def_root_domain, i); > -- > 2.24.0 >
Hello Yicong, Barry, On Thu, Oct 12, 2023 at 08:17:06PM +0800, Yicong Yang wrote: > From: Barry Song <song.bao.hua@hisilicon.com> > > For platforms having clusters like Kunpeng920, CPUs within the same cluster > have lower latency when synchronizing and accessing shared resources like > cache. Thus, this patch tries to find an idle cpu within the cluster of the > target CPU before scanning the whole LLC to gain lower latency. This > will be implemented in 2 steps in select_idle_sibling(): > 1. When the prev_cpu/recent_used_cpu are good wakeup candidates, use them > if they're sharing cluster with the target CPU. Otherwise trying to > scan for an idle CPU in the target's cluster. > 2. Scanning the cluster prior to the LLC of the target CPU for an > idle CPU to wakeup. > > Testing has been done on Kunpeng920 by pinning tasks to one numa and two > numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs. > > With this patch, We noticed enhancement on tbench and netperf within one > numa or cross two numa on top of tip-sched-core commit > 9b46f1abc6d4 ("sched/debug: Print 'tgid' in sched_show_task()") > > tbench results (node 0): > baseline patched > 1: 327.2833 372.4623 ( 13.80%) > 4: 1320.5933 1479.8833 ( 12.06%) > 8: 2638.4867 2921.5267 ( 10.73%) > 16: 5282.7133 5891.5633 ( 11.53%) > 32: 9810.6733 9877.3400 ( 0.68%) > 64: 7408.9367 7447.9900 ( 0.53%) > 128: 6203.2600 6191.6500 ( -0.19%) > tbench results (node 0-1): > baseline patched > 1: 332.0433 372.7223 ( 12.25%) > 4: 1325.4667 1477.6733 ( 11.48%) > 8: 2622.9433 2897.9967 ( 10.49%) > 16: 5218.6100 5878.2967 ( 12.64%) > 32: 10211.7000 11494.4000 ( 12.56%) > 64: 13313.7333 16740.0333 ( 25.74%) > 128: 13959.1000 14533.9000 ( 4.12%) > > netperf results TCP_RR (node 0): > baseline patched > 1: 76546.5033 90649.9867 ( 18.42%) > 4: 77292.4450 90932.7175 ( 17.65%) > 8: 77367.7254 90882.3467 ( 17.47%) > 16: 78519.9048 90938.8344 ( 15.82%) > 32: 72169.5035 72851.6730 ( 0.95%) > 64: 25911.2457 25882.2315 ( -0.11%) > 128: 10752.6572 10768.6038 ( 0.15%) > > netperf results TCP_RR (node 0-1): > baseline patched > 1: 76857.6667 90892.2767 ( 18.26%) > 4: 78236.6475 90767.3017 ( 16.02%) > 8: 77929.6096 90684.1633 ( 16.37%) > 16: 77438.5873 90502.5787 ( 16.87%) > 32: 74205.6635 88301.5612 ( 19.00%) > 64: 69827.8535 71787.6706 ( 2.81%) > 128: 25281.4366 25771.3023 ( 1.94%) > > netperf results UDP_RR (node 0): > baseline patched > 1: 96869.8400 110800.8467 ( 14.38%) > 4: 97744.9750 109680.5425 ( 12.21%) > 8: 98783.9863 110409.9637 ( 11.77%) > 16: 99575.0235 110636.2435 ( 11.11%) > 32: 95044.7250 97622.8887 ( 2.71%) > 64: 32925.2146 32644.4991 ( -0.85%) > 128: 12859.2343 12824.0051 ( -0.27%) > > netperf results UDP_RR (node 0-1): > baseline patched > 1: 97202.4733 110190.1200 ( 13.36%) > 4: 95954.0558 106245.7258 ( 10.73%) > 8: 96277.1958 105206.5304 ( 9.27%) > 16: 97692.7810 107927.2125 ( 10.48%) > 32: 79999.6702 103550.2999 ( 29.44%) > 64: 80592.7413 87284.0856 ( 8.30%) > 128: 27701.5770 29914.5820 ( 7.99%) > > Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch > in the code has not been tested but it supposed to work. > > Chen Yu also noticed this will improve the performance of tbench and > netperf on a 24 CPUs Jacobsville machine, there are 4 CPUs in one > cluster sharing L2 Cache. > > Suggested-by: Peter Zijlstra <peterz@infradead.org> > [https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net] > Tested-by: Yicong Yang <yangyicong@hisilicon.com> > Signed-off-by: Barry Song <song.bao.hua@hisilicon.com> > Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> > Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com> > Reviewed-by: Chen Yu <yu.c.chen@intel.com> This version looks good to me. Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com> -- Thanks and Regards gautham. > --- > kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++++---- > kernel/sched/sched.h | 1 + > kernel/sched/topology.c | 12 ++++++++++++ > 3 files changed, 50 insertions(+), 4 deletions(-) > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 666ece65987f..4039f9b348ec 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -7259,6 +7259,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > } > } > > + if (static_branch_unlikely(&sched_cluster_active)) { > + struct sched_group *sg = sd->groups; > + > + if (sg->flags & SD_CLUSTER) { > + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { > + if (!cpumask_test_cpu(cpu, cpus)) > + continue; > + > + if (has_idle_core) { > + i = select_idle_core(p, cpu, cpus, &idle_cpu); > + if ((unsigned int)i < nr_cpumask_bits) > + return i; > + } else { > + if (--nr <= 0) > + return -1; > + idle_cpu = __select_idle_cpu(cpu, p); > + if ((unsigned int)idle_cpu < nr_cpumask_bits) > + return idle_cpu; > + } > + } > + cpumask_andnot(cpus, cpus, sched_group_span(sg)); > + } > + } > + > for_each_cpu_wrap(cpu, cpus, target + 1) { > if (has_idle_core) { > i = select_idle_core(p, cpu, cpus, &idle_cpu); > @@ -7266,7 +7290,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > return i; > > } else { > - if (!--nr) > + if (--nr <= 0) > return -1; > idle_cpu = __select_idle_cpu(cpu, p); > if ((unsigned int)idle_cpu < nr_cpumask_bits) > @@ -7395,8 +7419,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) > */ > if (prev != target && cpus_share_cache(prev, target) && > (available_idle_cpu(prev) || sched_idle_cpu(prev)) && > - asym_fits_cpu(task_util, util_min, util_max, prev)) > - return prev; > + asym_fits_cpu(task_util, util_min, util_max, prev)) { > + if (!static_branch_unlikely(&sched_cluster_active)) > + return prev; > + > + if (cpus_share_resources(prev, target)) > + return prev; > + } > > /* > * Allow a per-cpu kthread to stack with the wakee if the > @@ -7423,7 +7452,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) > (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && > cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && > asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { > - return recent_used_cpu; > + if (!static_branch_unlikely(&sched_cluster_active)) > + return recent_used_cpu; > + > + if (cpus_share_resources(recent_used_cpu, target)) > + return recent_used_cpu; > } > > /* > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index 998f03d02de0..ef4fe7bcf740 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -1859,6 +1859,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); > DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); > DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); > extern struct static_key_false sched_asym_cpucapacity; > +extern struct static_key_false sched_cluster_active; > > static __always_inline bool sched_asym_cpucap_active(void) > { > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > index 48cd88350d18..925cd68abc8b 100644 > --- a/kernel/sched/topology.c > +++ b/kernel/sched/topology.c > @@ -673,7 +673,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); > + > DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); > +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); > > static void update_top_cache_domain(int cpu) > { > @@ -2386,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att > struct rq *rq = NULL; > int i, ret = -ENOMEM; > bool has_asym = false; > + bool has_cluster = false; > > if (WARN_ON(cpumask_empty(cpu_map))) > goto error; > @@ -2514,12 +2517,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att > WRITE_ONCE(d.rd->max_cpu_capacity, capacity); > > cpu_attach_domain(sd, d.rd, i); > + > + if (lowest_flag_domain(i, SD_CLUSTER)) > + has_cluster = true; > } > rcu_read_unlock(); > > if (has_asym) > static_branch_inc_cpuslocked(&sched_asym_cpucapacity); > > + if (has_cluster) > + static_branch_inc_cpuslocked(&sched_cluster_active); > + > if (rq && sched_debug_verbose) { > pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", > cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); > @@ -2619,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) > if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) > static_branch_dec_cpuslocked(&sched_asym_cpucapacity); > > + if (static_branch_unlikely(&sched_cluster_active)) > + static_branch_dec_cpuslocked(&sched_cluster_active); > + > rcu_read_lock(); > for_each_cpu(i, cpu_map) > cpu_attach_domain(NULL, &def_root_domain, i); > -- > 2.24.0 >
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 666ece65987f..4039f9b348ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7259,6 +7259,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_group_span(sg)); + } + } + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -7266,7 +7290,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return i; } else { - if (!--nr) + if (--nr <= 0) return -1; idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) @@ -7395,8 +7419,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) - return prev; + asym_fits_cpu(task_util, util_min, util_max, prev)) { + if (!static_branch_unlikely(&sched_cluster_active)) + return prev; + + if (cpus_share_resources(prev, target)) + return prev; + } /* * Allow a per-cpu kthread to stack with the wakee if the @@ -7423,7 +7452,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { - return recent_used_cpu; + if (!static_branch_unlikely(&sched_cluster_active)) + return recent_used_cpu; + + if (cpus_share_resources(recent_used_cpu, target)) + return recent_used_cpu; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 998f03d02de0..ef4fe7bcf740 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1859,6 +1859,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; static __always_inline bool sched_asym_cpucap_active(void) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 48cd88350d18..925cd68abc8b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -673,7 +673,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -2386,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2514,12 +2517,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att WRITE_ONCE(d.rd->max_cpu_capacity, capacity); cpu_attach_domain(sd, d.rd, i); + + if (lowest_flag_domain(i, SD_CLUSTER)) + has_cluster = true; } rcu_read_unlock(); if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); @@ -2619,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (static_branch_unlikely(&sched_cluster_active)) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i);