Message ID | 20230530070253.33306-3-yangyicong@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | sched/fair: Scan cluster before scanning LLC in wake-up path | expand |
On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 373ff5f55884..b8c129ed8b47 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > } > } > > + if (static_branch_unlikely(&sched_cluster_active)) { > + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); > + > + if (sdc) { > + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { > + if (!cpumask_test_cpu(cpu, cpus)) > + continue; > + > + if (has_idle_core) { > + i = select_idle_core(p, cpu, cpus, &idle_cpu); > + if ((unsigned int)i < nr_cpumask_bits) > + return i; > + } else { > + if (--nr <= 0) > + return -1; > + idle_cpu = __select_idle_cpu(cpu, p); > + if ((unsigned int)idle_cpu < nr_cpumask_bits) > + return idle_cpu; > + } > + } > + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); > + } > + } Would not this: --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned)i < nr_cpumask_bits) + return 1; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_group_span(sg)); + } + } + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); also work? Then we can avoid the extra sd_cluster per-cpu variable.
On 2023/5/30 19:55, Peter Zijlstra wrote: > On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: > >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >> index 373ff5f55884..b8c129ed8b47 100644 >> --- a/kernel/sched/fair.c >> +++ b/kernel/sched/fair.c >> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >> } >> } >> >> + if (static_branch_unlikely(&sched_cluster_active)) { >> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); >> + >> + if (sdc) { >> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { >> + if (!cpumask_test_cpu(cpu, cpus)) >> + continue; >> + >> + if (has_idle_core) { >> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >> + if ((unsigned int)i < nr_cpumask_bits) >> + return i; >> + } else { >> + if (--nr <= 0) >> + return -1; >> + idle_cpu = __select_idle_cpu(cpu, p); >> + if ((unsigned int)idle_cpu < nr_cpumask_bits) >> + return idle_cpu; >> + } >> + } >> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); >> + } >> + } > > Would not this: > > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s > } > } > > + if (static_branch_unlikely(&sched_cluster_active)) { > + struct sched_group *sg = sd->groups; > + if (sg->flags & SD_CLUSTER) { > + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { > + if (!cpumask_test_cpu(cpu, cpus)) > + continue; > + > + if (has_idle_core) { > + i = select_idle_core(p, cpu, cpus, &idle_cpu); > + if ((unsigned)i < nr_cpumask_bits) > + return 1; > + } else { > + if (--nr <= 0) > + return -1; > + idle_cpu = __select_idle_cpu(cpu, p); > + if ((unsigned)idle_cpu < nr_cpumask_bits) > + return idle_cpu; > + } > + } > + cpumask_andnot(cpus, cpus, sched_group_span(sg)); > + } > + } > + > for_each_cpu_wrap(cpu, cpus, target + 1) { > if (has_idle_core) { > i = select_idle_core(p, cpu, cpus, &idle_cpu); > > also work? Then we can avoid the extra sd_cluster per-cpu variable. > I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 69968ed9ffb9..5c443b74abf5 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_span(group)); - printk(KERN_CONT " %d:{ span=%*pbl", - group->sgc->id, + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", cpumask_pr_args(sched_group_span(group))); if ((sd->flags & SD_OVERLAP) && Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked as cluster: [ 8.886099] CPU0 attaching sched-domain(s): [ 8.889539] domain-0: span=0,40 level=SMT [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } [ 8.897538] domain-1: span=0-19,40-59 level=MC [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } [ 8.905538] domain-2: span=0-79 level=NUMA [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. Thanks, Yicong
On 2023/5/30 22:39, Yicong Yang wrote: > On 2023/5/30 19:55, Peter Zijlstra wrote: >> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: >> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>> index 373ff5f55884..b8c129ed8b47 100644 >>> --- a/kernel/sched/fair.c >>> +++ b/kernel/sched/fair.c >>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>> } >>> } >>> >>> + if (static_branch_unlikely(&sched_cluster_active)) { >>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); >>> + >>> + if (sdc) { >>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { >>> + if (!cpumask_test_cpu(cpu, cpus)) >>> + continue; >>> + >>> + if (has_idle_core) { >>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >>> + if ((unsigned int)i < nr_cpumask_bits) >>> + return i; >>> + } else { >>> + if (--nr <= 0) >>> + return -1; >>> + idle_cpu = __select_idle_cpu(cpu, p); >>> + if ((unsigned int)idle_cpu < nr_cpumask_bits) >>> + return idle_cpu; >>> + } >>> + } >>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); >>> + } >>> + } >> >> Would not this: >> >> --- a/kernel/sched/fair.c >> +++ b/kernel/sched/fair.c >> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s >> } >> } >> >> + if (static_branch_unlikely(&sched_cluster_active)) { >> + struct sched_group *sg = sd->groups; >> + if (sg->flags & SD_CLUSTER) { >> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { >> + if (!cpumask_test_cpu(cpu, cpus)) >> + continue; >> + >> + if (has_idle_core) { >> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >> + if ((unsigned)i < nr_cpumask_bits) >> + return 1; >> + } else { >> + if (--nr <= 0) >> + return -1; >> + idle_cpu = __select_idle_cpu(cpu, p); >> + if ((unsigned)idle_cpu < nr_cpumask_bits) >> + return idle_cpu; >> + } >> + } >> + cpumask_andnot(cpus, cpus, sched_group_span(sg)); >> + } >> + } >> + >> for_each_cpu_wrap(cpu, cpus, target + 1) { >> if (has_idle_core) { >> i = select_idle_core(p, cpu, cpus, &idle_cpu); >> >> also work? Then we can avoid the extra sd_cluster per-cpu variable. >> > > I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. > Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > index 69968ed9ffb9..5c443b74abf5 100644 > --- a/kernel/sched/topology.c > +++ b/kernel/sched/topology.c > @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, > > cpumask_or(groupmask, groupmask, sched_group_span(group)); > > - printk(KERN_CONT " %d:{ span=%*pbl", > - group->sgc->id, > + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", > + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", > cpumask_pr_args(sched_group_span(group))); > > if ((sd->flags & SD_OVERLAP) && > > Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked > as cluster: > > [ 8.886099] CPU0 attaching sched-domain(s): > [ 8.889539] domain-0: span=0,40 level=SMT > [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } > [ 8.897538] domain-1: span=0-19,40-59 level=MC > [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } > [ 8.905538] domain-2: span=0-79 level=NUMA > [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } > > I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems > we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. > Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper domains' group won't be updated if degeneration happens. So we cannot use the suggested approach for cluster scanning and sd_cluster per-cpu variable is still needed. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749 Thanks.
On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote: > On 2023/5/30 22:39, Yicong Yang wrote: > > On 2023/5/30 19:55, Peter Zijlstra wrote: > >> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: > >> > >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > >>> index 373ff5f55884..b8c129ed8b47 100644 > >>> --- a/kernel/sched/fair.c > >>> +++ b/kernel/sched/fair.c > >>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > >>> } > >>> } > >>> > >>> + if (static_branch_unlikely(&sched_cluster_active)) { > >>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); > >>> + > >>> + if (sdc) { > >>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { > >>> + if (!cpumask_test_cpu(cpu, cpus)) > >>> + continue; > >>> + > >>> + if (has_idle_core) { > >>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>> + if ((unsigned int)i < nr_cpumask_bits) > >>> + return i; > >>> + } else { > >>> + if (--nr <= 0) > >>> + return -1; > >>> + idle_cpu = __select_idle_cpu(cpu, p); > >>> + if ((unsigned int)idle_cpu < nr_cpumask_bits) > >>> + return idle_cpu; > >>> + } > >>> + } > >>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); > >>> + } > >>> + } > >> > >> Would not this: > >> > >> --- a/kernel/sched/fair.c > >> +++ b/kernel/sched/fair.c > >> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s > >> } > >> } > >> > >> + if (static_branch_unlikely(&sched_cluster_active)) { > >> + struct sched_group *sg = sd->groups; > >> + if (sg->flags & SD_CLUSTER) { > >> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { > >> + if (!cpumask_test_cpu(cpu, cpus)) > >> + continue; > >> + > >> + if (has_idle_core) { > >> + i = select_idle_core(p, cpu, cpus, &idle_cpu); > >> + if ((unsigned)i < nr_cpumask_bits) > >> + return 1; > >> + } else { > >> + if (--nr <= 0) > >> + return -1; > >> + idle_cpu = __select_idle_cpu(cpu, p); > >> + if ((unsigned)idle_cpu < nr_cpumask_bits) > >> + return idle_cpu; > >> + } > >> + } > >> + cpumask_andnot(cpus, cpus, sched_group_span(sg)); > >> + } > >> + } > >> + > >> for_each_cpu_wrap(cpu, cpus, target + 1) { > >> if (has_idle_core) { > >> i = select_idle_core(p, cpu, cpus, &idle_cpu); > >> > >> also work? Then we can avoid the extra sd_cluster per-cpu variable. > >> > > > > I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. > > Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: > > > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > > index 69968ed9ffb9..5c443b74abf5 100644 > > --- a/kernel/sched/topology.c > > +++ b/kernel/sched/topology.c > > @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, > > > > cpumask_or(groupmask, groupmask, sched_group_span(group)); > > > > - printk(KERN_CONT " %d:{ span=%*pbl", > > - group->sgc->id, > > + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", > > + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", > > cpumask_pr_args(sched_group_span(group))); > > > > if ((sd->flags & SD_OVERLAP) && > > > > Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked > > as cluster: > > > > [ 8.886099] CPU0 attaching sched-domain(s): > > [ 8.889539] domain-0: span=0,40 level=SMT > > [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } > > [ 8.897538] domain-1: span=0-19,40-59 level=MC > > [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } > > [ 8.905538] domain-2: span=0-79 level=NUMA > > [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } > > > > I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems > > we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. > > > > Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper > domains' group won't be updated if degeneration happens. So we cannot use the suggested approach > for cluster scanning and sd_cluster per-cpu variable is still needed. > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749 > > Thanks. > > Is this an issue? Suppose sched domain A's parent domain is B, B's parent sched domain is C. When B degenerates, C's child domain pointer is adjusted to A. However, currently the code does not adjust C's sched groups' flags. Should we adjust C's sched groups flags to be the same as A to keep consistency? diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6198fa135176..fe3fd70f2313 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; + struct sched_domain *parent = tmp->parent, *pparent; if (!parent) break; if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; + pparent = parent->parent; + tmp->parent = pparent; /* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) */ if (parent->flags & SD_PREFER_SIBLING) tmp->flags |= SD_PREFER_SIBLING; + + if (pparent) { + struct sched_group *sg = pparent->groups; + + do { + sg->flags = tmp->flags; + sg = sg->next; + } while (sg != pparent->groups); + + pparent->child = tmp; + } + destroy_sched_domain(parent); } else tmp = tmp->parent;
On 2023/6/8 11:26, Chen Yu wrote: > On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote: >> On 2023/5/30 22:39, Yicong Yang wrote: >>> On 2023/5/30 19:55, Peter Zijlstra wrote: >>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: >>>> >>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>>> index 373ff5f55884..b8c129ed8b47 100644 >>>>> --- a/kernel/sched/fair.c >>>>> +++ b/kernel/sched/fair.c >>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>>>> } >>>>> } >>>>> >>>>> + if (static_branch_unlikely(&sched_cluster_active)) { >>>>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); >>>>> + >>>>> + if (sdc) { >>>>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { >>>>> + if (!cpumask_test_cpu(cpu, cpus)) >>>>> + continue; >>>>> + >>>>> + if (has_idle_core) { >>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>> + if ((unsigned int)i < nr_cpumask_bits) >>>>> + return i; >>>>> + } else { >>>>> + if (--nr <= 0) >>>>> + return -1; >>>>> + idle_cpu = __select_idle_cpu(cpu, p); >>>>> + if ((unsigned int)idle_cpu < nr_cpumask_bits) >>>>> + return idle_cpu; >>>>> + } >>>>> + } >>>>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); >>>>> + } >>>>> + } >>>> >>>> Would not this: >>>> >>>> --- a/kernel/sched/fair.c >>>> +++ b/kernel/sched/fair.c >>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s >>>> } >>>> } >>>> >>>> + if (static_branch_unlikely(&sched_cluster_active)) { >>>> + struct sched_group *sg = sd->groups; >>>> + if (sg->flags & SD_CLUSTER) { >>>> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { >>>> + if (!cpumask_test_cpu(cpu, cpus)) >>>> + continue; >>>> + >>>> + if (has_idle_core) { >>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>> + if ((unsigned)i < nr_cpumask_bits) >>>> + return 1; >>>> + } else { >>>> + if (--nr <= 0) >>>> + return -1; >>>> + idle_cpu = __select_idle_cpu(cpu, p); >>>> + if ((unsigned)idle_cpu < nr_cpumask_bits) >>>> + return idle_cpu; >>>> + } >>>> + } >>>> + cpumask_andnot(cpus, cpus, sched_group_span(sg)); >>>> + } >>>> + } >>>> + >>>> for_each_cpu_wrap(cpu, cpus, target + 1) { >>>> if (has_idle_core) { >>>> i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>> >>>> also work? Then we can avoid the extra sd_cluster per-cpu variable. >>>> >>> >>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. >>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: >>> >>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c >>> index 69968ed9ffb9..5c443b74abf5 100644 >>> --- a/kernel/sched/topology.c >>> +++ b/kernel/sched/topology.c >>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, >>> >>> cpumask_or(groupmask, groupmask, sched_group_span(group)); >>> >>> - printk(KERN_CONT " %d:{ span=%*pbl", >>> - group->sgc->id, >>> + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", >>> + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", >>> cpumask_pr_args(sched_group_span(group))); >>> >>> if ((sd->flags & SD_OVERLAP) && >>> >>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked >>> as cluster: >>> >>> [ 8.886099] CPU0 attaching sched-domain(s): >>> [ 8.889539] domain-0: span=0,40 level=SMT >>> [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } >>> [ 8.897538] domain-1: span=0-19,40-59 level=MC >>> [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } >>> [ 8.905538] domain-2: span=0-79 level=NUMA >>> [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } >>> >>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems >>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. >>> >> >> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper >> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach >> for cluster scanning and sd_cluster per-cpu variable is still needed. >> >> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749 >> >> Thanks. >> >> > Is this an issue? Suppose sched domain A's parent domain > is B, B's parent sched domain is C. When B degenerates, C's child domain > pointer is adjusted to A. However, currently the code does not adjust C's > sched groups' flags. Should we adjust C's sched groups flags to be the same > as A to keep consistency? It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses it within the SMT so I think update the lowest domain's group flag works. For correctness all the domain group's flag should derives from its real child. I tried to solve this at group building but seems hard to do, at that time we don't know whether a domain is going to degenerate or not since the groups it not built. > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > index 6198fa135176..fe3fd70f2313 100644 > --- a/kernel/sched/topology.c > +++ b/kernel/sched/topology.c > @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > > /* Remove the sched domains which do not contribute to scheduling. */ > for (tmp = sd; tmp; ) { > - struct sched_domain *parent = tmp->parent; > + struct sched_domain *parent = tmp->parent, *pparent; > if (!parent) > break; > > if (sd_parent_degenerate(tmp, parent)) { > - tmp->parent = parent->parent; > - if (parent->parent) > - parent->parent->child = tmp; > + pparent = parent->parent; > + tmp->parent = pparent; > /* > * Transfer SD_PREFER_SIBLING down in case of a > * degenerate parent; the spans match for this > @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > */ > if (parent->flags & SD_PREFER_SIBLING) > tmp->flags |= SD_PREFER_SIBLING; > + > + if (pparent) { > + struct sched_group *sg = pparent->groups; > + > + do { > + sg->flags = tmp->flags; May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from remote group have the same flags with @tmp? > + sg = sg->next; > + } while (sg != pparent->groups); > + > + pparent->child = tmp; > + } > + > destroy_sched_domain(parent); > } else > tmp = tmp->parent; > The code you pasted at last of your mail is correct I think, not sure it doesn't appear here when reply... Thanks.
On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote: > On 2023/6/8 11:26, Chen Yu wrote: > > On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote: > >> On 2023/5/30 22:39, Yicong Yang wrote: > >>> On 2023/5/30 19:55, Peter Zijlstra wrote: > >>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: > >>>> > >>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > >>>>> index 373ff5f55884..b8c129ed8b47 100644 > >>>>> --- a/kernel/sched/fair.c > >>>>> +++ b/kernel/sched/fair.c > >>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > >>>>> } > >>>>> } > >>>>> > >>>>> + if (static_branch_unlikely(&sched_cluster_active)) { > >>>>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); > >>>>> + > >>>>> + if (sdc) { > >>>>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { > >>>>> + if (!cpumask_test_cpu(cpu, cpus)) > >>>>> + continue; > >>>>> + > >>>>> + if (has_idle_core) { > >>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>>>> + if ((unsigned int)i < nr_cpumask_bits) > >>>>> + return i; > >>>>> + } else { > >>>>> + if (--nr <= 0) > >>>>> + return -1; > >>>>> + idle_cpu = __select_idle_cpu(cpu, p); > >>>>> + if ((unsigned int)idle_cpu < nr_cpumask_bits) > >>>>> + return idle_cpu; > >>>>> + } > >>>>> + } > >>>>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); > >>>>> + } > >>>>> + } > >>>> > >>>> Would not this: > >>>> > >>>> --- a/kernel/sched/fair.c > >>>> +++ b/kernel/sched/fair.c > >>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s > >>>> } > >>>> } > >>>> > >>>> + if (static_branch_unlikely(&sched_cluster_active)) { > >>>> + struct sched_group *sg = sd->groups; > >>>> + if (sg->flags & SD_CLUSTER) { > >>>> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { > >>>> + if (!cpumask_test_cpu(cpu, cpus)) > >>>> + continue; > >>>> + > >>>> + if (has_idle_core) { > >>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>>> + if ((unsigned)i < nr_cpumask_bits) > >>>> + return 1; > >>>> + } else { > >>>> + if (--nr <= 0) > >>>> + return -1; > >>>> + idle_cpu = __select_idle_cpu(cpu, p); > >>>> + if ((unsigned)idle_cpu < nr_cpumask_bits) > >>>> + return idle_cpu; > >>>> + } > >>>> + } > >>>> + cpumask_andnot(cpus, cpus, sched_group_span(sg)); > >>>> + } > >>>> + } > >>>> + > >>>> for_each_cpu_wrap(cpu, cpus, target + 1) { > >>>> if (has_idle_core) { > >>>> i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>>> > >>>> also work? Then we can avoid the extra sd_cluster per-cpu variable. > >>>> > >>> > >>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. > >>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: > >>> > >>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > >>> index 69968ed9ffb9..5c443b74abf5 100644 > >>> --- a/kernel/sched/topology.c > >>> +++ b/kernel/sched/topology.c > >>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, > >>> > >>> cpumask_or(groupmask, groupmask, sched_group_span(group)); > >>> > >>> - printk(KERN_CONT " %d:{ span=%*pbl", > >>> - group->sgc->id, > >>> + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", > >>> + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", > >>> cpumask_pr_args(sched_group_span(group))); > >>> > >>> if ((sd->flags & SD_OVERLAP) && > >>> > >>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked > >>> as cluster: > >>> > >>> [ 8.886099] CPU0 attaching sched-domain(s): > >>> [ 8.889539] domain-0: span=0,40 level=SMT > >>> [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } > >>> [ 8.897538] domain-1: span=0-19,40-59 level=MC > >>> [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } > >>> [ 8.905538] domain-2: span=0-79 level=NUMA > >>> [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } > >>> > >>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems > >>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. > >>> > >> > >> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper > >> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach > >> for cluster scanning and sd_cluster per-cpu variable is still needed. > >> > >> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749 > >> > >> Thanks. > >> > >> > > Is this an issue? Suppose sched domain A's parent domain > > is B, B's parent sched domain is C. When B degenerates, C's child domain > > pointer is adjusted to A. However, currently the code does not adjust C's > > sched groups' flags. Should we adjust C's sched groups flags to be the same > > as A to keep consistency? > > It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses > it within the SMT so I think update the lowest domain's group flag works. For correctness > all the domain group's flag should derives from its real child. I tried to solve this at group > building but seems hard to do, at that time we don't know whether a domain is going to degenerate > or not since the groups it not built. > > > > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > > index 6198fa135176..fe3fd70f2313 100644 > > --- a/kernel/sched/topology.c > > +++ b/kernel/sched/topology.c > > @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > > > > /* Remove the sched domains which do not contribute to scheduling. */ > > for (tmp = sd; tmp; ) { > > - struct sched_domain *parent = tmp->parent; > > + struct sched_domain *parent = tmp->parent, *pparent; > > if (!parent) > > break; > > > > if (sd_parent_degenerate(tmp, parent)) { > > - tmp->parent = parent->parent; > > - if (parent->parent) > > - parent->parent->child = tmp; > > + pparent = parent->parent; > > + tmp->parent = pparent; > > /* > > * Transfer SD_PREFER_SIBLING down in case of a > > * degenerate parent; the spans match for this > > @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > > */ > > if (parent->flags & SD_PREFER_SIBLING) > > tmp->flags |= SD_PREFER_SIBLING; > > + > > + if (pparent) { > > + struct sched_group *sg = pparent->groups; > > + > > + do { > > + sg->flags = tmp->flags; > > May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from > remote group have the same flags with @tmp? > Good question, for heterogenous platforms sched groups within the same domain might not always have the same flags, because if group1 and group2 sit in big/small-core child domain, they could have different balance flags in theory. Maybe only update the local group's flag is accurate. I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it should be in tip: https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/ We could adjust it based on his change to remove SD_CLUSTER, or we can replace groups->flag with tmp->flag directly, in case we have other flags to be adjusted in the future. thanks, Chenyu
On 2023/6/9 18:50, Chen Yu wrote: > On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote: >> On 2023/6/8 11:26, Chen Yu wrote: >>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote: >>>> On 2023/5/30 22:39, Yicong Yang wrote: >>>>> On 2023/5/30 19:55, Peter Zijlstra wrote: >>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: >>>>>> >>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>>>>> index 373ff5f55884..b8c129ed8b47 100644 >>>>>>> --- a/kernel/sched/fair.c >>>>>>> +++ b/kernel/sched/fair.c >>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>>>>>> } >>>>>>> } >>>>>>> >>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { >>>>>>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); >>>>>>> + >>>>>>> + if (sdc) { >>>>>>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { >>>>>>> + if (!cpumask_test_cpu(cpu, cpus)) >>>>>>> + continue; >>>>>>> + >>>>>>> + if (has_idle_core) { >>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>>>> + if ((unsigned int)i < nr_cpumask_bits) >>>>>>> + return i; >>>>>>> + } else { >>>>>>> + if (--nr <= 0) >>>>>>> + return -1; >>>>>>> + idle_cpu = __select_idle_cpu(cpu, p); >>>>>>> + if ((unsigned int)idle_cpu < nr_cpumask_bits) >>>>>>> + return idle_cpu; >>>>>>> + } >>>>>>> + } >>>>>>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); >>>>>>> + } >>>>>>> + } >>>>>> >>>>>> Would not this: >>>>>> >>>>>> --- a/kernel/sched/fair.c >>>>>> +++ b/kernel/sched/fair.c >>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s >>>>>> } >>>>>> } >>>>>> >>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { >>>>>> + struct sched_group *sg = sd->groups; >>>>>> + if (sg->flags & SD_CLUSTER) { >>>>>> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { >>>>>> + if (!cpumask_test_cpu(cpu, cpus)) >>>>>> + continue; >>>>>> + >>>>>> + if (has_idle_core) { >>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>>> + if ((unsigned)i < nr_cpumask_bits) >>>>>> + return 1; >>>>>> + } else { >>>>>> + if (--nr <= 0) >>>>>> + return -1; >>>>>> + idle_cpu = __select_idle_cpu(cpu, p); >>>>>> + if ((unsigned)idle_cpu < nr_cpumask_bits) >>>>>> + return idle_cpu; >>>>>> + } >>>>>> + } >>>>>> + cpumask_andnot(cpus, cpus, sched_group_span(sg)); >>>>>> + } >>>>>> + } >>>>>> + >>>>>> for_each_cpu_wrap(cpu, cpus, target + 1) { >>>>>> if (has_idle_core) { >>>>>> i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>>> >>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable. >>>>>> >>>>> >>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. >>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: >>>>> >>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c >>>>> index 69968ed9ffb9..5c443b74abf5 100644 >>>>> --- a/kernel/sched/topology.c >>>>> +++ b/kernel/sched/topology.c >>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, >>>>> >>>>> cpumask_or(groupmask, groupmask, sched_group_span(group)); >>>>> >>>>> - printk(KERN_CONT " %d:{ span=%*pbl", >>>>> - group->sgc->id, >>>>> + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", >>>>> + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", >>>>> cpumask_pr_args(sched_group_span(group))); >>>>> >>>>> if ((sd->flags & SD_OVERLAP) && >>>>> >>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked >>>>> as cluster: >>>>> >>>>> [ 8.886099] CPU0 attaching sched-domain(s): >>>>> [ 8.889539] domain-0: span=0,40 level=SMT >>>>> [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } >>>>> [ 8.897538] domain-1: span=0-19,40-59 level=MC >>>>> [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } >>>>> [ 8.905538] domain-2: span=0-79 level=NUMA >>>>> [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } >>>>> >>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems >>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. >>>>> >>>> >>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper >>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach >>>> for cluster scanning and sd_cluster per-cpu variable is still needed. >>>> >>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749 >>>> >>>> Thanks. >>>> >>>> >>> Is this an issue? Suppose sched domain A's parent domain >>> is B, B's parent sched domain is C. When B degenerates, C's child domain >>> pointer is adjusted to A. However, currently the code does not adjust C's >>> sched groups' flags. Should we adjust C's sched groups flags to be the same >>> as A to keep consistency? >> >> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses >> it within the SMT so I think update the lowest domain's group flag works. For correctness >> all the domain group's flag should derives from its real child. I tried to solve this at group >> building but seems hard to do, at that time we don't know whether a domain is going to degenerate >> or not since the groups it not built. >> >>> >>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c >>> index 6198fa135176..fe3fd70f2313 100644 >>> --- a/kernel/sched/topology.c >>> +++ b/kernel/sched/topology.c >>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) >>> >>> /* Remove the sched domains which do not contribute to scheduling. */ >>> for (tmp = sd; tmp; ) { >>> - struct sched_domain *parent = tmp->parent; >>> + struct sched_domain *parent = tmp->parent, *pparent; >>> if (!parent) >>> break; >>> >>> if (sd_parent_degenerate(tmp, parent)) { >>> - tmp->parent = parent->parent; >>> - if (parent->parent) >>> - parent->parent->child = tmp; >>> + pparent = parent->parent; >>> + tmp->parent = pparent; >>> /* >>> * Transfer SD_PREFER_SIBLING down in case of a >>> * degenerate parent; the spans match for this >>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) >>> */ >>> if (parent->flags & SD_PREFER_SIBLING) >>> tmp->flags |= SD_PREFER_SIBLING; >>> + >>> + if (pparent) { >>> + struct sched_group *sg = pparent->groups; >>> + >>> + do { >>> + sg->flags = tmp->flags; >> >> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from >> remote group have the same flags with @tmp? >> > Good question, for heterogenous platforms sched groups within the same domain might not always > have the same flags, because if group1 and group2 sit in big/small-core child domain, they could > have different balance flags in theory. Maybe only update the local group's flag is accurate. > > I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it > should be in tip: > https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/ > We could adjust it based on his change to remove SD_CLUSTER, or we can > replace groups->flag with tmp->flag directly, in case we have other flags to be > adjusted in the future. > Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update local groups flag should satisfy our needs. I tried to use the correct child domains to build the sched groups then all the groups will have the correct flags, but it'll be a bit more complex. Thanks.
On 2023/6/12 13:22, Chen Yu wrote: > On 2023-06-12 at 10:31:39 +0530, Gautham R. Shenoy wrote: >> Hello Yicong, >> >> >> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: >>> From: Barry Song <song.bao.hua@hisilicon.com> >> [..snip..] >> >>> @@ -7103,7 +7127,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) >>> bool has_idle_core = false; >>> struct sched_domain *sd; >>> unsigned long task_util, util_min, util_max; >>> - int i, recent_used_cpu; >>> + int i, recent_used_cpu, prev_aff = -1; >>> >>> /* >>> * On asymmetric system, update task utilization because we will check >>> @@ -7130,8 +7154,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) >>> */ >>> if (prev != target && cpus_share_cache(prev, target) && >>> (available_idle_cpu(prev) || sched_idle_cpu(prev)) && >>> - asym_fits_cpu(task_util, util_min, util_max, prev)) >>> - return prev; >>> + asym_fits_cpu(task_util, util_min, util_max, prev)) { >>> + if (cpus_share_lowest_cache(prev, target)) >> >> For platforms without the cluster domain, the cpus_share_lowest_cache >> check is a repetition of the cpus_share_cache(prev, target) check. Can >> we avoid this using a static branch check for cluster ? >> >> > Sounds good. >>> + return prev; >>> + prev_aff = prev; >>> + } >>> >>> /* >>> * Allow a per-cpu kthread to stack with the wakee if the >>> @@ -7158,7 +7185,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) >>> (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && >>> cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && >>> asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { >>> - return recent_used_cpu; >>> + if (cpus_share_lowest_cache(recent_used_cpu, target)) >> >> Same here. >> >>> + return recent_used_cpu; >>> + } else { >>> + recent_used_cpu = -1; >>> } >>> >>> /* >>> @@ -7199,6 +7229,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) >>> if ((unsigned)i < nr_cpumask_bits) >>> return i; >>> >>> + /* >>> + * For cluster machines which have lower sharing cache like L2 or >>> + * LLC Tag, we tend to find an idle CPU in the target's cluster >>> + * first. But prev_cpu or recent_used_cpu may also be a good candidate, >>> + * use them if possible when no idle CPU found in select_idle_cpu(). >>> + */ >>> + if ((unsigned int)prev_aff < nr_cpumask_bits) >>> + return prev_aff; >> >> Shouldn't we check if prev_aff (and the recent_used_cpu below) is >> still idle ? >> >> > When we reach here, the target is non-idle, and the prev_aff is idle. > Although there is a race condition that prev_aff becomes non-idle > and target becomes idle after select_idle_cpu(), this window might be > small IMO. > Yes. Races here but adding a check won't cost much, so it's ok for me to check it or not. Thanks.
On 2023/6/13 15:36, Yicong Yang wrote: > On 2023/6/9 18:50, Chen Yu wrote: >> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote: >>> On 2023/6/8 11:26, Chen Yu wrote: >>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote: >>>>> On 2023/5/30 22:39, Yicong Yang wrote: >>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote: >>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: >>>>>>> >>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>>>>>> index 373ff5f55884..b8c129ed8b47 100644 >>>>>>>> --- a/kernel/sched/fair.c >>>>>>>> +++ b/kernel/sched/fair.c >>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>>>>>>> } >>>>>>>> } >>>>>>>> >>>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { >>>>>>>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); >>>>>>>> + >>>>>>>> + if (sdc) { >>>>>>>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { >>>>>>>> + if (!cpumask_test_cpu(cpu, cpus)) >>>>>>>> + continue; >>>>>>>> + >>>>>>>> + if (has_idle_core) { >>>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>>>>> + if ((unsigned int)i < nr_cpumask_bits) >>>>>>>> + return i; >>>>>>>> + } else { >>>>>>>> + if (--nr <= 0) >>>>>>>> + return -1; >>>>>>>> + idle_cpu = __select_idle_cpu(cpu, p); >>>>>>>> + if ((unsigned int)idle_cpu < nr_cpumask_bits) >>>>>>>> + return idle_cpu; >>>>>>>> + } >>>>>>>> + } >>>>>>>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); >>>>>>>> + } >>>>>>>> + } >>>>>>> >>>>>>> Would not this: >>>>>>> >>>>>>> --- a/kernel/sched/fair.c >>>>>>> +++ b/kernel/sched/fair.c >>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s >>>>>>> } >>>>>>> } >>>>>>> >>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { >>>>>>> + struct sched_group *sg = sd->groups; >>>>>>> + if (sg->flags & SD_CLUSTER) { >>>>>>> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { >>>>>>> + if (!cpumask_test_cpu(cpu, cpus)) >>>>>>> + continue; >>>>>>> + >>>>>>> + if (has_idle_core) { >>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>>>> + if ((unsigned)i < nr_cpumask_bits) >>>>>>> + return 1; >>>>>>> + } else { >>>>>>> + if (--nr <= 0) >>>>>>> + return -1; >>>>>>> + idle_cpu = __select_idle_cpu(cpu, p); >>>>>>> + if ((unsigned)idle_cpu < nr_cpumask_bits) >>>>>>> + return idle_cpu; >>>>>>> + } >>>>>>> + } >>>>>>> + cpumask_andnot(cpus, cpus, sched_group_span(sg)); >>>>>>> + } >>>>>>> + } >>>>>>> + >>>>>>> for_each_cpu_wrap(cpu, cpus, target + 1) { >>>>>>> if (has_idle_core) { >>>>>>> i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>>>> >>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable. >>>>>>> >>>>>> >>>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. >>>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: >>>>>> >>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c >>>>>> index 69968ed9ffb9..5c443b74abf5 100644 >>>>>> --- a/kernel/sched/topology.c >>>>>> +++ b/kernel/sched/topology.c >>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, >>>>>> >>>>>> cpumask_or(groupmask, groupmask, sched_group_span(group)); >>>>>> >>>>>> - printk(KERN_CONT " %d:{ span=%*pbl", >>>>>> - group->sgc->id, >>>>>> + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", >>>>>> + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", >>>>>> cpumask_pr_args(sched_group_span(group))); >>>>>> >>>>>> if ((sd->flags & SD_OVERLAP) && >>>>>> >>>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked >>>>>> as cluster: >>>>>> >>>>>> [ 8.886099] CPU0 attaching sched-domain(s): >>>>>> [ 8.889539] domain-0: span=0,40 level=SMT >>>>>> [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } >>>>>> [ 8.897538] domain-1: span=0-19,40-59 level=MC >>>>>> [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } >>>>>> [ 8.905538] domain-2: span=0-79 level=NUMA >>>>>> [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } >>>>>> >>>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems >>>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. >>>>>> >>>>> >>>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper >>>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach >>>>> for cluster scanning and sd_cluster per-cpu variable is still needed. >>>>> >>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749 >>>>> >>>>> Thanks. >>>>> >>>>> >>>> Is this an issue? Suppose sched domain A's parent domain >>>> is B, B's parent sched domain is C. When B degenerates, C's child domain >>>> pointer is adjusted to A. However, currently the code does not adjust C's >>>> sched groups' flags. Should we adjust C's sched groups flags to be the same >>>> as A to keep consistency? >>> >>> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses >>> it within the SMT so I think update the lowest domain's group flag works. For correctness >>> all the domain group's flag should derives from its real child. I tried to solve this at group >>> building but seems hard to do, at that time we don't know whether a domain is going to degenerate >>> or not since the groups it not built. >>> >>>> >>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c >>>> index 6198fa135176..fe3fd70f2313 100644 >>>> --- a/kernel/sched/topology.c >>>> +++ b/kernel/sched/topology.c >>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) >>>> >>>> /* Remove the sched domains which do not contribute to scheduling. */ >>>> for (tmp = sd; tmp; ) { >>>> - struct sched_domain *parent = tmp->parent; >>>> + struct sched_domain *parent = tmp->parent, *pparent; >>>> if (!parent) >>>> break; >>>> >>>> if (sd_parent_degenerate(tmp, parent)) { >>>> - tmp->parent = parent->parent; >>>> - if (parent->parent) >>>> - parent->parent->child = tmp; >>>> + pparent = parent->parent; >>>> + tmp->parent = pparent; >>>> /* >>>> * Transfer SD_PREFER_SIBLING down in case of a >>>> * degenerate parent; the spans match for this >>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) >>>> */ >>>> if (parent->flags & SD_PREFER_SIBLING) >>>> tmp->flags |= SD_PREFER_SIBLING; >>>> + >>>> + if (pparent) { >>>> + struct sched_group *sg = pparent->groups; >>>> + >>>> + do { >>>> + sg->flags = tmp->flags; >>> >>> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from >>> remote group have the same flags with @tmp? >>> >> Good question, for heterogenous platforms sched groups within the same domain might not always >> have the same flags, because if group1 and group2 sit in big/small-core child domain, they could >> have different balance flags in theory. Maybe only update the local group's flag is accurate. >> >> I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it >> should be in tip: >> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/ >> We could adjust it based on his change to remove SD_CLUSTER, or we can >> replace groups->flag with tmp->flag directly, in case we have other flags to be >> adjusted in the future. >> > > Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update > local groups flag should satisfy our needs. I tried to use the correct child domains to build the > sched groups then all the groups will have the correct flags, but it'll be a bit more complex. > something like below, detect the sched domain degeneration first and try to rebuild the groups if necessary. Works on a non-cluster machine emulated with QEMU: qemu-system-aarch64 \ -kernel ${Image} \ -smp cores=16,threads=2 \ -cpu host --enable-kvm \ -m 32G \ -object memory-backend-ram,id=node0,size=8G \ -object memory-backend-ram,id=node1,size=8G \ -object memory-backend-ram,id=node2,size=8G \ -object memory-backend-ram,id=node3,size=8G \ -numa node,memdev=node0,cpus=0-7,nodeid=0 \ -numa node,memdev=node1,cpus=8-15,nodeid=1 \ -numa node,memdev=node2,cpus=16-23,nodeid=2 \ -numa node,memdev=node3,cpus=24-31,nodeid=3 \ -numa dist,src=0,dst=1,val=12 \ -numa dist,src=0,dst=2,val=20 \ -numa dist,src=0,dst=3,val=22 \ -numa dist,src=1,dst=2,val=22 \ -numa dist,src=1,dst=3,val=24 \ -numa dist,src=2,dst=3,val=12 \ -machine virt,iommu=smmuv3 \ -net none -initrd ${Rootfs} \ -nographic -bios $(pwd)/QEMU_EFI.fd \ -append "rdinit=/init console=ttyAMA0 earlycon=pl011,0x9000000 sched_verbose schedstats=enable loglevel=8" The flags looks correct: [ 4.379910] CPU0 attaching sched-domain(s): [ 4.380540] domain-0: span=0-1 level=SMT [ 4.381130] groups: 0:{ cluster: false span=0 cap=1023 }, 1:{ cluster: false span=1 } [ 4.382353] domain-1: span=0-7 level=MC [ 4.382950] groups: 0:{ cluster: false span=0-1 cap=2047 }, 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 } [ 4.385378] domain-2: span=0-15 level=NUMA [ 4.386047] groups: 0:{ cluster: false span=0-7 cap=8191 }, 8:{ cluster: false span=8-15 cap=8192 } [ 4.387542] domain-3: span=0-23 level=NUMA [ 4.388197] groups: 0:{ cluster: false span=0-15 cap=16383 }, 16:{ cluster: false span=16-23 cap=8192 } [ 4.389661] domain-4: span=0-31 level=NUMA [ 4.390358] groups: 0:{ cluster: false span=0-23 mask=0-7 cap=24575 }, 24:{ cluster: false span=16-31 mask=24-31 cap=16384 } diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index c0d21667ddf3..d3bf5a1c85bc 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -99,6 +99,7 @@ struct sched_domain { int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ int level; + int need_degeneration; /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. units in jiffies */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 69968ed9ffb9..561649ddcfe7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_span(group)); - printk(KERN_CONT " %d:{ span=%*pbl", - group->sgc->id, + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", cpumask_pr_args(sched_group_span(group))); if ((sd->flags & SD_OVERLAP) && @@ -623,6 +623,32 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } +/* + * Reset the sched groups for a rebuild. + */ +static void reset_sched_domain_groups(struct sched_domain *sd) +{ + struct sched_group *sg, *first, *next; + + if (!sd->groups) + return; + + sg = first = sd->groups; + do { + next = sg->next; + + if (sd->flags & SD_OVERLAP) { + if (atomic_dec_and_test(&sg->ref)) + kfree(sg); + } else { + atomic_dec(&sg->ref); + } + atomic_dec(&sg->sgc->ref); + + sg = next; + } while (sg != first); +} + static void destroy_sched_domain(struct sched_domain *sd) { /* @@ -717,6 +743,42 @@ static void update_top_cache_domain(int cpu) rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } +static bool probe_sched_domain_degeneration(struct sched_domain *sd, int cpu) +{ + bool has_degeneration = false; + struct sched_domain *tmp = sd, *parent; + + if (tmp) { + parent = tmp->parent; + while (parent) { + if (sd_parent_degenerate(tmp, parent)) { + parent->need_degeneration = 1; + has_degeneration = true; + parent = parent->parent; + } else { + tmp = parent; + parent = tmp->parent; + } + } + } + + if (sd && sd_degenerate(sd)) { + sd->need_degeneration = 1; + has_degeneration = true; + } + + /* + * On degeneration reset the sched group's refcount since we need to + * rebuild them. + */ + if (has_degeneration) { + for (tmp = sd; tmp; tmp = tmp->parent) + reset_sched_domain_groups(tmp); + } + + return has_degeneration; +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. @@ -1008,9 +1070,9 @@ find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) * The proper descendant would be the one whose child won't span out * of sd */ - while (sibling->child && + while (sibling->child && (sibling->need_degeneration || !cpumask_subset(sched_domain_span(sibling->child), - sched_domain_span(sd))) + sched_domain_span(sd)))) sibling = sibling->child; /* @@ -1018,9 +1080,9 @@ find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) * to go down to skip those sched_domains which don't contribute to * scheduling because they will be degenerated in cpu_attach_domain */ - while (sibling->child && + while (sibling->child && (sibling->need_degeneration || cpumask_equal(sched_domain_span(sibling->child), - sched_domain_span(sibling))) + sched_domain_span(sibling)))) sibling = sibling->child; return sibling; @@ -1199,6 +1261,9 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) struct sched_group *sg; bool already_visited; + while (child && child->need_degeneration) + child = child->child; + if (child) cpu = cpumask_first(sched_domain_span(child)); @@ -2366,6 +2431,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att int i, ret = -ENOMEM; bool has_asym = false; bool has_cluster = false; + bool rebuild = true, need_rebuild = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2398,6 +2464,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } +rebuild_groups: /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { @@ -2412,6 +2479,25 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * If some sched domains are going to be degenerated, the groups may not + * be built from the real child domains thus with incorrect flags or + * span. Detect the generation here and rebuild the sched groups if + * necessary. + */ + if (rebuild) { + rebuild = false; + + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d.sd, i); + + need_rebuild = probe_sched_domain_degeneration(sd, i); + } + + if (need_rebuild) + goto rebuild_groups; + } + /* * Calculate an allowed NUMA imbalance such that LLCs do not get * imbalanced.
On 2023-06-13 at 16:09:17 +0800, Yicong Yang wrote: > On 2023/6/13 15:36, Yicong Yang wrote: > > On 2023/6/9 18:50, Chen Yu wrote: > >> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote: > >>> On 2023/6/8 11:26, Chen Yu wrote: > >>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote: > >>>>> On 2023/5/30 22:39, Yicong Yang wrote: > >>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote: > >>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: > >>>>>>> > >>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > >>>>>>>> index 373ff5f55884..b8c129ed8b47 100644 > >>>>>>>> --- a/kernel/sched/fair.c > >>>>>>>> +++ b/kernel/sched/fair.c > >>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > >>>>>>>> } > >>>>>>>> } > >>>>>>>> > >>>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { > >>>>>>>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); > >>>>>>>> + > >>>>>>>> + if (sdc) { > >>>>>>>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { > >>>>>>>> + if (!cpumask_test_cpu(cpu, cpus)) > >>>>>>>> + continue; > >>>>>>>> + > >>>>>>>> + if (has_idle_core) { > >>>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>>>>>>> + if ((unsigned int)i < nr_cpumask_bits) > >>>>>>>> + return i; > >>>>>>>> + } else { > >>>>>>>> + if (--nr <= 0) > >>>>>>>> + return -1; > >>>>>>>> + idle_cpu = __select_idle_cpu(cpu, p); > >>>>>>>> + if ((unsigned int)idle_cpu < nr_cpumask_bits) > >>>>>>>> + return idle_cpu; > >>>>>>>> + } > >>>>>>>> + } > >>>>>>>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); > >>>>>>>> + } > >>>>>>>> + } > >>>>>>> > >>>>>>> Would not this: > >>>>>>> > >>>>>>> --- a/kernel/sched/fair.c > >>>>>>> +++ b/kernel/sched/fair.c > >>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s > >>>>>>> } > >>>>>>> } > >>>>>>> > >>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { > >>>>>>> + struct sched_group *sg = sd->groups; > >>>>>>> + if (sg->flags & SD_CLUSTER) { > >>>>>>> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { > >>>>>>> + if (!cpumask_test_cpu(cpu, cpus)) > >>>>>>> + continue; > >>>>>>> + > >>>>>>> + if (has_idle_core) { > >>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>>>>>> + if ((unsigned)i < nr_cpumask_bits) > >>>>>>> + return 1; > >>>>>>> + } else { > >>>>>>> + if (--nr <= 0) > >>>>>>> + return -1; > >>>>>>> + idle_cpu = __select_idle_cpu(cpu, p); > >>>>>>> + if ((unsigned)idle_cpu < nr_cpumask_bits) > >>>>>>> + return idle_cpu; > >>>>>>> + } > >>>>>>> + } > >>>>>>> + cpumask_andnot(cpus, cpus, sched_group_span(sg)); > >>>>>>> + } > >>>>>>> + } > >>>>>>> + > >>>>>>> for_each_cpu_wrap(cpu, cpus, target + 1) { > >>>>>>> if (has_idle_core) { > >>>>>>> i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>>>>>> > >>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable. > >>>>>>> > >>>>>> > >>>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. > >>>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: > >>>>>> > >>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > >>>>>> index 69968ed9ffb9..5c443b74abf5 100644 > >>>>>> --- a/kernel/sched/topology.c > >>>>>> +++ b/kernel/sched/topology.c > >>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, > >>>>>> > >>>>>> cpumask_or(groupmask, groupmask, sched_group_span(group)); > >>>>>> > >>>>>> - printk(KERN_CONT " %d:{ span=%*pbl", > >>>>>> - group->sgc->id, > >>>>>> + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", > >>>>>> + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", > >>>>>> cpumask_pr_args(sched_group_span(group))); > >>>>>> > >>>>>> if ((sd->flags & SD_OVERLAP) && > >>>>>> > >>>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked > >>>>>> as cluster: > >>>>>> > >>>>>> [ 8.886099] CPU0 attaching sched-domain(s): > >>>>>> [ 8.889539] domain-0: span=0,40 level=SMT > >>>>>> [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } > >>>>>> [ 8.897538] domain-1: span=0-19,40-59 level=MC > >>>>>> [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } > >>>>>> [ 8.905538] domain-2: span=0-79 level=NUMA > >>>>>> [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } > >>>>>> > >>>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems > >>>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. > >>>>>> > >>>>> > >>>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper > >>>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach > >>>>> for cluster scanning and sd_cluster per-cpu variable is still needed. > >>>>> > >>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749 > >>>>> > >>>>> Thanks. > >>>>> > >>>>> > >>>> Is this an issue? Suppose sched domain A's parent domain > >>>> is B, B's parent sched domain is C. When B degenerates, C's child domain > >>>> pointer is adjusted to A. However, currently the code does not adjust C's > >>>> sched groups' flags. Should we adjust C's sched groups flags to be the same > >>>> as A to keep consistency? > >>> > >>> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses > >>> it within the SMT so I think update the lowest domain's group flag works. For correctness > >>> all the domain group's flag should derives from its real child. I tried to solve this at group > >>> building but seems hard to do, at that time we don't know whether a domain is going to degenerate > >>> or not since the groups it not built. > >>> > >>>> > >>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > >>>> index 6198fa135176..fe3fd70f2313 100644 > >>>> --- a/kernel/sched/topology.c > >>>> +++ b/kernel/sched/topology.c > >>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > >>>> > >>>> /* Remove the sched domains which do not contribute to scheduling. */ > >>>> for (tmp = sd; tmp; ) { > >>>> - struct sched_domain *parent = tmp->parent; > >>>> + struct sched_domain *parent = tmp->parent, *pparent; > >>>> if (!parent) > >>>> break; > >>>> > >>>> if (sd_parent_degenerate(tmp, parent)) { > >>>> - tmp->parent = parent->parent; > >>>> - if (parent->parent) > >>>> - parent->parent->child = tmp; > >>>> + pparent = parent->parent; > >>>> + tmp->parent = pparent; > >>>> /* > >>>> * Transfer SD_PREFER_SIBLING down in case of a > >>>> * degenerate parent; the spans match for this > >>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > >>>> */ > >>>> if (parent->flags & SD_PREFER_SIBLING) > >>>> tmp->flags |= SD_PREFER_SIBLING; > >>>> + > >>>> + if (pparent) { > >>>> + struct sched_group *sg = pparent->groups; > >>>> + > >>>> + do { > >>>> + sg->flags = tmp->flags; > >>> > >>> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from > >>> remote group have the same flags with @tmp? > >>> > >> Good question, for heterogenous platforms sched groups within the same domain might not always > >> have the same flags, because if group1 and group2 sit in big/small-core child domain, they could > >> have different balance flags in theory. Maybe only update the local group's flag is accurate. > >> > >> I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it > >> should be in tip: > >> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/ > >> We could adjust it based on his change to remove SD_CLUSTER, or we can > >> replace groups->flag with tmp->flag directly, in case we have other flags to be > >> adjusted in the future. > >> > > > > Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update > > local groups flag should satisfy our needs. I tried to use the correct child domains to build the > > sched groups then all the groups will have the correct flags, but it'll be a bit more complex. > > > > something like below, detect the sched domain degeneration first and try to rebuild the groups if > necessary. Not sure if we need to rebuild the groups. With only if (parent->flags & SD_CLUSTER) parent->parent->groups->flags &= ~SD_CLUSTER; I see the correct flags. My understanding is that, although remote groups's flag might be incorrect, later when other sched domain degenerates, these remote groups becomes local groups for those sched domains, and the flags will be adjusted accordingly. > Works on a non-cluster machine emulated with QEMU: > > qemu-system-aarch64 \ > -kernel ${Image} \ > -smp cores=16,threads=2 \ > -cpu host --enable-kvm \ > -m 32G \ > -object memory-backend-ram,id=node0,size=8G \ > -object memory-backend-ram,id=node1,size=8G \ > -object memory-backend-ram,id=node2,size=8G \ > -object memory-backend-ram,id=node3,size=8G \ > -numa node,memdev=node0,cpus=0-7,nodeid=0 \ > -numa node,memdev=node1,cpus=8-15,nodeid=1 \ > -numa node,memdev=node2,cpus=16-23,nodeid=2 \ > -numa node,memdev=node3,cpus=24-31,nodeid=3 \ > -numa dist,src=0,dst=1,val=12 \ > -numa dist,src=0,dst=2,val=20 \ > -numa dist,src=0,dst=3,val=22 \ > -numa dist,src=1,dst=2,val=22 \ > -numa dist,src=1,dst=3,val=24 \ > -numa dist,src=2,dst=3,val=12 \ > -machine virt,iommu=smmuv3 \ > -net none -initrd ${Rootfs} \ > -nographic -bios $(pwd)/QEMU_EFI.fd \ > -append "rdinit=/init console=ttyAMA0 earlycon=pl011,0x9000000 sched_verbose schedstats=enable loglevel=8" > > The flags looks correct: > [ 4.379910] CPU0 attaching sched-domain(s): > [ 4.380540] domain-0: span=0-1 level=SMT > [ 4.381130] groups: 0:{ cluster: false span=0 cap=1023 }, 1:{ cluster: false span=1 } > [ 4.382353] domain-1: span=0-7 level=MC > [ 4.382950] groups: 0:{ cluster: false span=0-1 cap=2047 }, 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 } > [ 4.385378] domain-2: span=0-15 level=NUMA > [ 4.386047] groups: 0:{ cluster: false span=0-7 cap=8191 }, 8:{ cluster: false span=8-15 cap=8192 } > [ 4.387542] domain-3: span=0-23 level=NUMA > [ 4.388197] groups: 0:{ cluster: false span=0-15 cap=16383 }, 16:{ cluster: false span=16-23 cap=8192 } > [ 4.389661] domain-4: span=0-31 level=NUMA > [ 4.390358] groups: 0:{ cluster: false span=0-23 mask=0-7 cap=24575 }, 24:{ cluster: false span=16-31 mask=24-31 cap=16384 } > > I did similar test based on your config: qemu-system-x86_64 \ -m 2G \ -smp 16,threads=2,cores=4,sockets=2 \ -numa node,mem=1G,cpus=0-7,nodeid=0 \ -numa node,mem=1G,cpus=8-15,nodeid=1 \ -kernel bzImage \ -drive file=./ubuntu.raw,format=raw \ -append "console=ttyS0 root=/dev/sda5 earlyprintk=serial ignore_loglevel sched_verbose" \ -cpu host \ -enable-kvm \ -nographic and print the group address as well as the SD_CLUSTER flag: [ 0.208583] CPU0 attaching sched-domain(s): [ 0.208998] domain-0: span=0-1 level=SMT [ 0.209393] groups: 0:{ cluster: false group 0xffff9ed3c19e6140 span=0 }, 1:{ cluster: false group 0xffff9ed3c19e6440 span=1 } [ 0.212463] domain-1: span=0-7 level=MC [ 0.212856] groups: 0:{ cluster: false group 0xffff9ed3c1a8f3c0 span=0-1 cap=2048 }, 2:{ cluster: true group 0xffff9ed3c1a8fac0 span=2-3 cap=2048 }, Yeah, group 0xffff9ed3c1a8fac0 has SD_CLUSTER, but... 4:{ cluster: true group 0xffff9ed3c1a8fe40 span=4-5 cap=2048 }, 6:{ cluster: true group 0xffff9ed3c1a8f700 span=6-7 cap=2048 } [ 0.230214] CPU2 attaching sched-domain(s): [ 0.232462] domain-0: span=2-3 level=SMT [ 0.232855] groups: 2:{ cluster: false group 0xffff9ed3c19e66c0 span=2 }, 3:{ cluster: false group 0xffff9ed3c19e6400 span=3 } [ 0.233971] domain-1: span=0-7 level=MC [ 0.236463] groups: 2:{ cluster: false group 0xffff9ed3c1a8fac0 span=2-3 cap=2048 }, 4:{ cluster: true group 0xffff9ed3c1a8fe40 span=4-5 cap=2048 }, 6:{ cluster: true group 0xffff9ed3c1a8f700 span=6-7 cap=2048 }, 0:{ cluster: false group 0xffff9ed3c1a8f3c0 span=0-1 cap=2048 } group 0xffff9ed3c1a8fac0's SD_CLUSTER flag will be cleared here. thanks, Chenyu
On 2023/6/13 20:44, Chen Yu wrote: > On 2023-06-13 at 16:09:17 +0800, Yicong Yang wrote: >> On 2023/6/13 15:36, Yicong Yang wrote: >>> On 2023/6/9 18:50, Chen Yu wrote: >>>> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote: >>>>> On 2023/6/8 11:26, Chen Yu wrote: >>>>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote: >>>>>>> On 2023/5/30 22:39, Yicong Yang wrote: >>>>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote: >>>>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: >>>>>>>>> >>>>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>>>>>>>> index 373ff5f55884..b8c129ed8b47 100644 >>>>>>>>>> --- a/kernel/sched/fair.c >>>>>>>>>> +++ b/kernel/sched/fair.c >>>>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>>>>>>>>> } >>>>>>>>>> } >>>>>>>>>> >>>>>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { >>>>>>>>>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); >>>>>>>>>> + >>>>>>>>>> + if (sdc) { >>>>>>>>>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { >>>>>>>>>> + if (!cpumask_test_cpu(cpu, cpus)) >>>>>>>>>> + continue; >>>>>>>>>> + >>>>>>>>>> + if (has_idle_core) { >>>>>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>>>>>>> + if ((unsigned int)i < nr_cpumask_bits) >>>>>>>>>> + return i; >>>>>>>>>> + } else { >>>>>>>>>> + if (--nr <= 0) >>>>>>>>>> + return -1; >>>>>>>>>> + idle_cpu = __select_idle_cpu(cpu, p); >>>>>>>>>> + if ((unsigned int)idle_cpu < nr_cpumask_bits) >>>>>>>>>> + return idle_cpu; >>>>>>>>>> + } >>>>>>>>>> + } >>>>>>>>>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); >>>>>>>>>> + } >>>>>>>>>> + } >>>>>>>>> >>>>>>>>> Would not this: >>>>>>>>> >>>>>>>>> --- a/kernel/sched/fair.c >>>>>>>>> +++ b/kernel/sched/fair.c >>>>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s >>>>>>>>> } >>>>>>>>> } >>>>>>>>> >>>>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { >>>>>>>>> + struct sched_group *sg = sd->groups; >>>>>>>>> + if (sg->flags & SD_CLUSTER) { >>>>>>>>> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { >>>>>>>>> + if (!cpumask_test_cpu(cpu, cpus)) >>>>>>>>> + continue; >>>>>>>>> + >>>>>>>>> + if (has_idle_core) { >>>>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>>>>>> + if ((unsigned)i < nr_cpumask_bits) >>>>>>>>> + return 1; >>>>>>>>> + } else { >>>>>>>>> + if (--nr <= 0) >>>>>>>>> + return -1; >>>>>>>>> + idle_cpu = __select_idle_cpu(cpu, p); >>>>>>>>> + if ((unsigned)idle_cpu < nr_cpumask_bits) >>>>>>>>> + return idle_cpu; >>>>>>>>> + } >>>>>>>>> + } >>>>>>>>> + cpumask_andnot(cpus, cpus, sched_group_span(sg)); >>>>>>>>> + } >>>>>>>>> + } >>>>>>>>> + >>>>>>>>> for_each_cpu_wrap(cpu, cpus, target + 1) { >>>>>>>>> if (has_idle_core) { >>>>>>>>> i = select_idle_core(p, cpu, cpus, &idle_cpu); >>>>>>>>> >>>>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable. >>>>>>>>> >>>>>>>> >>>>>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. >>>>>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: >>>>>>>> >>>>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c >>>>>>>> index 69968ed9ffb9..5c443b74abf5 100644 >>>>>>>> --- a/kernel/sched/topology.c >>>>>>>> +++ b/kernel/sched/topology.c >>>>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, >>>>>>>> >>>>>>>> cpumask_or(groupmask, groupmask, sched_group_span(group)); >>>>>>>> >>>>>>>> - printk(KERN_CONT " %d:{ span=%*pbl", >>>>>>>> - group->sgc->id, >>>>>>>> + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", >>>>>>>> + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", >>>>>>>> cpumask_pr_args(sched_group_span(group))); >>>>>>>> >>>>>>>> if ((sd->flags & SD_OVERLAP) && >>>>>>>> >>>>>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked >>>>>>>> as cluster: >>>>>>>> >>>>>>>> [ 8.886099] CPU0 attaching sched-domain(s): >>>>>>>> [ 8.889539] domain-0: span=0,40 level=SMT >>>>>>>> [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } >>>>>>>> [ 8.897538] domain-1: span=0-19,40-59 level=MC >>>>>>>> [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } >>>>>>>> [ 8.905538] domain-2: span=0-79 level=NUMA >>>>>>>> [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } >>>>>>>> >>>>>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems >>>>>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. >>>>>>>> >>>>>>> >>>>>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper >>>>>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach >>>>>>> for cluster scanning and sd_cluster per-cpu variable is still needed. >>>>>>> >>>>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749 >>>>>>> >>>>>>> Thanks. >>>>>>> >>>>>>> >>>>>> Is this an issue? Suppose sched domain A's parent domain >>>>>> is B, B's parent sched domain is C. When B degenerates, C's child domain >>>>>> pointer is adjusted to A. However, currently the code does not adjust C's >>>>>> sched groups' flags. Should we adjust C's sched groups flags to be the same >>>>>> as A to keep consistency? >>>>> >>>>> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses >>>>> it within the SMT so I think update the lowest domain's group flag works. For correctness >>>>> all the domain group's flag should derives from its real child. I tried to solve this at group >>>>> building but seems hard to do, at that time we don't know whether a domain is going to degenerate >>>>> or not since the groups it not built. >>>>> >>>>>> >>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c >>>>>> index 6198fa135176..fe3fd70f2313 100644 >>>>>> --- a/kernel/sched/topology.c >>>>>> +++ b/kernel/sched/topology.c >>>>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) >>>>>> >>>>>> /* Remove the sched domains which do not contribute to scheduling. */ >>>>>> for (tmp = sd; tmp; ) { >>>>>> - struct sched_domain *parent = tmp->parent; >>>>>> + struct sched_domain *parent = tmp->parent, *pparent; >>>>>> if (!parent) >>>>>> break; >>>>>> >>>>>> if (sd_parent_degenerate(tmp, parent)) { >>>>>> - tmp->parent = parent->parent; >>>>>> - if (parent->parent) >>>>>> - parent->parent->child = tmp; >>>>>> + pparent = parent->parent; >>>>>> + tmp->parent = pparent; >>>>>> /* >>>>>> * Transfer SD_PREFER_SIBLING down in case of a >>>>>> * degenerate parent; the spans match for this >>>>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) >>>>>> */ >>>>>> if (parent->flags & SD_PREFER_SIBLING) >>>>>> tmp->flags |= SD_PREFER_SIBLING; >>>>>> + >>>>>> + if (pparent) { >>>>>> + struct sched_group *sg = pparent->groups; >>>>>> + >>>>>> + do { >>>>>> + sg->flags = tmp->flags; >>>>> >>>>> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from >>>>> remote group have the same flags with @tmp? >>>>> >>>> Good question, for heterogenous platforms sched groups within the same domain might not always >>>> have the same flags, because if group1 and group2 sit in big/small-core child domain, they could >>>> have different balance flags in theory. Maybe only update the local group's flag is accurate. >>>> >>>> I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it >>>> should be in tip: >>>> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/ >>>> We could adjust it based on his change to remove SD_CLUSTER, or we can >>>> replace groups->flag with tmp->flag directly, in case we have other flags to be >>>> adjusted in the future. >>>> >>> >>> Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update >>> local groups flag should satisfy our needs. I tried to use the correct child domains to build the >>> sched groups then all the groups will have the correct flags, but it'll be a bit more complex. >>> >> >> something like below, detect the sched domain degeneration first and try to rebuild the groups if >> necessary. > Not sure if we need to rebuild the groups. With only > > if (parent->flags & SD_CLUSTER) > parent->parent->groups->flags &= ~SD_CLUSTER; > > I see the correct flags. > > My understanding is that, although remote groups's flag might be incorrect, > later when other sched domain degenerates, these remote groups becomes local > groups for those sched domains, and the flags will be adjusted accordingly. Maybe worth a try to build the groups correctly at very beginning rather correct it later when needed. Considering we've used it in several places[1][2] and this time we're going to use it for cluster. [1] 16d364ba6ef2 ("sched/topology: Introduce sched_group::flags") [2] https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/ >> Works on a non-cluster machine emulated with QEMU: >> >> qemu-system-aarch64 \ >> -kernel ${Image} \ >> -smp cores=16,threads=2 \ >> -cpu host --enable-kvm \ >> -m 32G \ >> -object memory-backend-ram,id=node0,size=8G \ >> -object memory-backend-ram,id=node1,size=8G \ >> -object memory-backend-ram,id=node2,size=8G \ >> -object memory-backend-ram,id=node3,size=8G \ >> -numa node,memdev=node0,cpus=0-7,nodeid=0 \ >> -numa node,memdev=node1,cpus=8-15,nodeid=1 \ >> -numa node,memdev=node2,cpus=16-23,nodeid=2 \ >> -numa node,memdev=node3,cpus=24-31,nodeid=3 \ >> -numa dist,src=0,dst=1,val=12 \ >> -numa dist,src=0,dst=2,val=20 \ >> -numa dist,src=0,dst=3,val=22 \ >> -numa dist,src=1,dst=2,val=22 \ >> -numa dist,src=1,dst=3,val=24 \ >> -numa dist,src=2,dst=3,val=12 \ >> -machine virt,iommu=smmuv3 \ >> -net none -initrd ${Rootfs} \ >> -nographic -bios $(pwd)/QEMU_EFI.fd \ >> -append "rdinit=/init console=ttyAMA0 earlycon=pl011,0x9000000 sched_verbose schedstats=enable loglevel=8" >> >> The flags looks correct: >> [ 4.379910] CPU0 attaching sched-domain(s): >> [ 4.380540] domain-0: span=0-1 level=SMT >> [ 4.381130] groups: 0:{ cluster: false span=0 cap=1023 }, 1:{ cluster: false span=1 } >> [ 4.382353] domain-1: span=0-7 level=MC >> [ 4.382950] groups: 0:{ cluster: false span=0-1 cap=2047 }, 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 } >> [ 4.385378] domain-2: span=0-15 level=NUMA >> [ 4.386047] groups: 0:{ cluster: false span=0-7 cap=8191 }, 8:{ cluster: false span=8-15 cap=8192 } >> [ 4.387542] domain-3: span=0-23 level=NUMA >> [ 4.388197] groups: 0:{ cluster: false span=0-15 cap=16383 }, 16:{ cluster: false span=16-23 cap=8192 } >> [ 4.389661] domain-4: span=0-31 level=NUMA >> [ 4.390358] groups: 0:{ cluster: false span=0-23 mask=0-7 cap=24575 }, 24:{ cluster: false span=16-31 mask=24-31 cap=16384 } >> >> > > I did similar test based on your config: > qemu-system-x86_64 \ > -m 2G \ > -smp 16,threads=2,cores=4,sockets=2 \ > -numa node,mem=1G,cpus=0-7,nodeid=0 \ > -numa node,mem=1G,cpus=8-15,nodeid=1 \ > -kernel bzImage \ > -drive file=./ubuntu.raw,format=raw \ > -append "console=ttyS0 root=/dev/sda5 earlyprintk=serial ignore_loglevel sched_verbose" \ > -cpu host \ > -enable-kvm \ > -nographic > > and print the group address as well as the SD_CLUSTER flag: > > [ 0.208583] CPU0 attaching sched-domain(s): > [ 0.208998] domain-0: span=0-1 level=SMT > [ 0.209393] groups: 0:{ cluster: false group 0xffff9ed3c19e6140 span=0 }, 1:{ cluster: false group 0xffff9ed3c19e6440 span=1 } > [ 0.212463] domain-1: span=0-7 level=MC > [ 0.212856] groups: 0:{ cluster: false group 0xffff9ed3c1a8f3c0 span=0-1 cap=2048 }, 2:{ cluster: true group 0xffff9ed3c1a8fac0 span=2-3 cap=2048 }, > > Yeah, group 0xffff9ed3c1a8fac0 has SD_CLUSTER, but... > Something's wrong here. 0xffff9ed3c1a8fac0 shouldn't have SD_CLUSTER. Code above should works to successfully build MC's groups from SMT rather than CLS, Tested with qemu version 4.2.1 (from ubuntu 20.04) and mainline v8.0.0-1358-gac84b57b4d with your x86 topology, it looks like: [ 0.517077] CPU0 attaching sched-domain(s): [ 0.520858] domain-0: span=0-1 level=SMT [ 0.524858] groups: 0:{ cluster: false span=0 }, 1:{ cluster: false span=1 } [ 0.528858] domain-1: span=0-7 level=MC [ 0.532858] groups: 0:{ cluster: false span=0-1 cap=2048 }, 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 } [ 0.536858] domain-2: span=0-15 level=NUMA [ 0.540858] groups: 0:{ cluster: false span=0-7 cap=8192 }, 8:{ cluster: false span=8-15 cap=8192 } and for CPU 2: [ 0.572859] CPU2 attaching sched-domain(s): [ 0.576858] domain-0: span=2-3 level=SMT [ 0.580858] groups: 2:{ cluster: false span=2 }, 3:{ cluster: false span=3 } [ 0.584858] domain-1: span=0-7 level=MC [ 0.588858] groups: 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 }, 0:{ cluster: false span=0-1 cap=2048 } [ 0.592858] domain-2: span=0-15 level=NUMA [ 0.596858] groups: 0:{ cluster: false span=0-7 cap=8192 }, 8:{ cluster: false span=8-15 cap=8192 } Thanks.
On 2023-06-15 at 15:59:10 +0800, Yicong Yang wrote: > On 2023/6/13 20:44, Chen Yu wrote: > > On 2023-06-13 at 16:09:17 +0800, Yicong Yang wrote: > >> On 2023/6/13 15:36, Yicong Yang wrote: > >>> On 2023/6/9 18:50, Chen Yu wrote: > >>>> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote: > >>>>> On 2023/6/8 11:26, Chen Yu wrote: > >>>>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote: > >>>>>>> On 2023/5/30 22:39, Yicong Yang wrote: > >>>>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote: > >>>>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote: > >>>>>>>>> > >>>>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > >>>>>>>>>> index 373ff5f55884..b8c129ed8b47 100644 > >>>>>>>>>> --- a/kernel/sched/fair.c > >>>>>>>>>> +++ b/kernel/sched/fair.c > >>>>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > >>>>>>>>>> } > >>>>>>>>>> } > >>>>>>>>>> > >>>>>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { > >>>>>>>>>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); > >>>>>>>>>> + > >>>>>>>>>> + if (sdc) { > >>>>>>>>>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { > >>>>>>>>>> + if (!cpumask_test_cpu(cpu, cpus)) > >>>>>>>>>> + continue; > >>>>>>>>>> + > >>>>>>>>>> + if (has_idle_core) { > >>>>>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>>>>>>>>> + if ((unsigned int)i < nr_cpumask_bits) > >>>>>>>>>> + return i; > >>>>>>>>>> + } else { > >>>>>>>>>> + if (--nr <= 0) > >>>>>>>>>> + return -1; > >>>>>>>>>> + idle_cpu = __select_idle_cpu(cpu, p); > >>>>>>>>>> + if ((unsigned int)idle_cpu < nr_cpumask_bits) > >>>>>>>>>> + return idle_cpu; > >>>>>>>>>> + } > >>>>>>>>>> + } > >>>>>>>>>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); > >>>>>>>>>> + } > >>>>>>>>>> + } > >>>>>>>>> > >>>>>>>>> Would not this: > >>>>>>>>> > >>>>>>>>> --- a/kernel/sched/fair.c > >>>>>>>>> +++ b/kernel/sched/fair.c > >>>>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s > >>>>>>>>> } > >>>>>>>>> } > >>>>>>>>> > >>>>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) { > >>>>>>>>> + struct sched_group *sg = sd->groups; > >>>>>>>>> + if (sg->flags & SD_CLUSTER) { > >>>>>>>>> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) { > >>>>>>>>> + if (!cpumask_test_cpu(cpu, cpus)) > >>>>>>>>> + continue; > >>>>>>>>> + > >>>>>>>>> + if (has_idle_core) { > >>>>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>>>>>>>> + if ((unsigned)i < nr_cpumask_bits) > >>>>>>>>> + return 1; > >>>>>>>>> + } else { > >>>>>>>>> + if (--nr <= 0) > >>>>>>>>> + return -1; > >>>>>>>>> + idle_cpu = __select_idle_cpu(cpu, p); > >>>>>>>>> + if ((unsigned)idle_cpu < nr_cpumask_bits) > >>>>>>>>> + return idle_cpu; > >>>>>>>>> + } > >>>>>>>>> + } > >>>>>>>>> + cpumask_andnot(cpus, cpus, sched_group_span(sg)); > >>>>>>>>> + } > >>>>>>>>> + } > >>>>>>>>> + > >>>>>>>>> for_each_cpu_wrap(cpu, cpus, target + 1) { > >>>>>>>>> if (has_idle_core) { > >>>>>>>>> i = select_idle_core(p, cpu, cpus, &idle_cpu); > >>>>>>>>> > >>>>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable. > >>>>>>>>> > >>>>>>>> > >>>>>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't. > >>>>>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears: > >>>>>>>> > >>>>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > >>>>>>>> index 69968ed9ffb9..5c443b74abf5 100644 > >>>>>>>> --- a/kernel/sched/topology.c > >>>>>>>> +++ b/kernel/sched/topology.c > >>>>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, > >>>>>>>> > >>>>>>>> cpumask_or(groupmask, groupmask, sched_group_span(group)); > >>>>>>>> > >>>>>>>> - printk(KERN_CONT " %d:{ span=%*pbl", > >>>>>>>> - group->sgc->id, > >>>>>>>> + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl", > >>>>>>>> + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false", > >>>>>>>> cpumask_pr_args(sched_group_span(group))); > >>>>>>>> > >>>>>>>> if ((sd->flags & SD_OVERLAP) && > >>>>>>>> > >>>>>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked > >>>>>>>> as cluster: > >>>>>>>> > >>>>>>>> [ 8.886099] CPU0 attaching sched-domain(s): > >>>>>>>> [ 8.889539] domain-0: span=0,40 level=SMT > >>>>>>>> [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 } > >>>>>>>> [ 8.897538] domain-1: span=0-19,40-59 level=MC > >>>>>>>> [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 } > >>>>>>>> [ 8.905538] domain-2: span=0-79 level=NUMA > >>>>>>>> [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 } > >>>>>>>> > >>>>>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems > >>>>>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong. > >>>>>>>> > >>>>>>> > >>>>>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper > >>>>>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach > >>>>>>> for cluster scanning and sd_cluster per-cpu variable is still needed. > >>>>>>> > >>>>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749 > >>>>>>> > >>>>>>> Thanks. > >>>>>>> > >>>>>>> > >>>>>> Is this an issue? Suppose sched domain A's parent domain > >>>>>> is B, B's parent sched domain is C. When B degenerates, C's child domain > >>>>>> pointer is adjusted to A. However, currently the code does not adjust C's > >>>>>> sched groups' flags. Should we adjust C's sched groups flags to be the same > >>>>>> as A to keep consistency? > >>>>> > >>>>> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses > >>>>> it within the SMT so I think update the lowest domain's group flag works. For correctness > >>>>> all the domain group's flag should derives from its real child. I tried to solve this at group > >>>>> building but seems hard to do, at that time we don't know whether a domain is going to degenerate > >>>>> or not since the groups it not built. > >>>>> > >>>>>> > >>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > >>>>>> index 6198fa135176..fe3fd70f2313 100644 > >>>>>> --- a/kernel/sched/topology.c > >>>>>> +++ b/kernel/sched/topology.c > >>>>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > >>>>>> > >>>>>> /* Remove the sched domains which do not contribute to scheduling. */ > >>>>>> for (tmp = sd; tmp; ) { > >>>>>> - struct sched_domain *parent = tmp->parent; > >>>>>> + struct sched_domain *parent = tmp->parent, *pparent; > >>>>>> if (!parent) > >>>>>> break; > >>>>>> > >>>>>> if (sd_parent_degenerate(tmp, parent)) { > >>>>>> - tmp->parent = parent->parent; > >>>>>> - if (parent->parent) > >>>>>> - parent->parent->child = tmp; > >>>>>> + pparent = parent->parent; > >>>>>> + tmp->parent = pparent; > >>>>>> /* > >>>>>> * Transfer SD_PREFER_SIBLING down in case of a > >>>>>> * degenerate parent; the spans match for this > >>>>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > >>>>>> */ > >>>>>> if (parent->flags & SD_PREFER_SIBLING) > >>>>>> tmp->flags |= SD_PREFER_SIBLING; > >>>>>> + > >>>>>> + if (pparent) { > >>>>>> + struct sched_group *sg = pparent->groups; > >>>>>> + > >>>>>> + do { > >>>>>> + sg->flags = tmp->flags; > >>>>> > >>>>> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from > >>>>> remote group have the same flags with @tmp? > >>>>> > >>>> Good question, for heterogenous platforms sched groups within the same domain might not always > >>>> have the same flags, because if group1 and group2 sit in big/small-core child domain, they could > >>>> have different balance flags in theory. Maybe only update the local group's flag is accurate. > >>>> > >>>> I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it > >>>> should be in tip: > >>>> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/ > >>>> We could adjust it based on his change to remove SD_CLUSTER, or we can > >>>> replace groups->flag with tmp->flag directly, in case we have other flags to be > >>>> adjusted in the future. > >>>> > >>> > >>> Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update > >>> local groups flag should satisfy our needs. I tried to use the correct child domains to build the > >>> sched groups then all the groups will have the correct flags, but it'll be a bit more complex. > >>> > >> > >> something like below, detect the sched domain degeneration first and try to rebuild the groups if > >> necessary. > > Not sure if we need to rebuild the groups. With only > > > > if (parent->flags & SD_CLUSTER) > > parent->parent->groups->flags &= ~SD_CLUSTER; > > > > I see the correct flags. > > > > My understanding is that, although remote groups's flag might be incorrect, > > later when other sched domain degenerates, these remote groups becomes local > > groups for those sched domains, and the flags will be adjusted accordingly. > > Maybe worth a try to build the groups correctly at very beginning rather > correct it later when needed. Considering we've used it in several places[1][2] > and this time we're going to use it for cluster. > > [1] 16d364ba6ef2 ("sched/topology: Introduce sched_group::flags") > [2] https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/ > Do you mean clearing the SD_CLUSTER during degenerating? Yup, that could be enough for now. I guess you are going to send a new version with this SD_CLUSTER flag cleared + using group->flags to detect SD_CLUSTER rather than percpu cluster id. I'd be happy to test once you send them out. thanks, Chenyu
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 373ff5f55884..b8c129ed8b47 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); + + if (sdc) { + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); + } + } + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -7001,7 +7025,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return i; } else { - if (!--nr) + if (--nr <= 0) return -1; idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) @@ -7103,7 +7127,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) bool has_idle_core = false; struct sched_domain *sd; unsigned long task_util, util_min, util_max; - int i, recent_used_cpu; + int i, recent_used_cpu, prev_aff = -1; /* * On asymmetric system, update task utilization because we will check @@ -7130,8 +7154,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) - return prev; + asym_fits_cpu(task_util, util_min, util_max, prev)) { + if (cpus_share_lowest_cache(prev, target)) + return prev; + prev_aff = prev; + } /* * Allow a per-cpu kthread to stack with the wakee if the @@ -7158,7 +7185,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { - return recent_used_cpu; + if (cpus_share_lowest_cache(recent_used_cpu, target)) + return recent_used_cpu; + } else { + recent_used_cpu = -1; } /* @@ -7199,6 +7229,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). + */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 23dabfc3668b..5097f93b635f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1816,6 +1816,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; static __always_inline bool sched_asym_cpucap_active(void) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2c4cc6c95a9a..69968ed9ffb9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -672,7 +672,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -2363,6 +2365,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2384,6 +2387,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, i); has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; + has_cluster |= sd->flags & SD_CLUSTER; if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; @@ -2494,6 +2498,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); @@ -2593,6 +2600,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (rcu_access_pointer(per_cpu(sd_cluster, cpu))) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i);