diff mbox series

[v8,2/2] sched/fair: Scan cluster before scanning LLC in wake-up path

Message ID 20230530070253.33306-3-yangyicong@huawei.com (mailing list archive)
State New, archived
Headers show
Series sched/fair: Scan cluster before scanning LLC in wake-up path | expand

Commit Message

Yicong Yang May 30, 2023, 7:02 a.m. UTC
From: Barry Song <song.bao.hua@hisilicon.com>

For platforms having clusters like Kunpeng920, CPUs within the same cluster
have lower latency when synchronizing and accessing shared resources like
cache. Thus, this patch tries to find an idle cpu within the cluster of the
target CPU before scanning the whole LLC to gain lower latency. This
will be implemented in 3 steps in select_idle_sibling():
1. When the prev_cpu/recent_used_cpu are good wakeup candidates, use them
   if they're sharing cluster with the target CPU. Otherwise record them
   and do the scanning first.
2. Scanning the cluster prior to the LLC of the target CPU for an
   idle CPU to wakeup.
3. If no idle CPU found after scanning and the prev_cpu/recent_used_cpu
   can be used, use them.

Testing has been done on Kunpeng920 by pinning tasks to one numa and two
numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs.

With this patch, We noticed enhancement on tbench and netperf within one
numa or cross two numa on 6.4-rc4:
tbench results (node 0):
            baseline                     patched
  1:        326.2337        372.0611 (   14.05%)
  4:       1311.0912       1467.6606 (   11.94%)
  8:       2635.7595       2919.4199 (   10.76%)
 16:       5280.4710       5881.6082 (   11.38%)
 32:      10013.8106      10295.7659 (    2.82%)
 64:       7866.9267       7990.9609 (    1.58%)
128:       6643.0075       6773.0634 (    1.96%)
tbench results (node 0-1):
            baseline                     patched
  1:        328.2215        371.8220 (   13.28%)
  4:       1318.7803       1463.8069 (   11.00%)
  8:       2610.1637       2890.8220 (   10.75%)
 16:       5191.1229       5608.0970 (    8.03%)
 32:       9255.6653      10312.0177 (   11.41%)
 64:      16053.9385      17516.5449 (    9.11%)
128:      14145.9979      14190.7678 (    0.32%)
netperf results TCP_RR (node 0):
            baseline                     patched
  1:      77045.1699      92320.0580 (   19.83%)
  4:      78419.5796      92010.5521 (   17.33%)
  8:      79044.9299      92154.7030 (   16.59%)
 16:      80559.1244      92531.6847 (   14.86%)
 32:      78005.1397      79176.5900 (    1.50%)
 64:      29246.8246      29312.8208 (    0.23%)
128:      12098.8488      12169.5650 (    0.58%)
netperf results TCP_RR (node 0-1):
            baseline                     patched
  1:      77614.5377      92504.7655 (   19.18%)
  4:      79324.3967      91717.0429 (   15.62%)
  8:      79281.3608      91807.1218 (   15.80%)
 16:      79064.0960      92004.1390 (   16.37%)
 32:      78033.7068      86588.8343 (   10.96%)
 64:      75946.3002      76128.3367 (    0.24%)
128:      28518.5077      27985.0884 (   -1.87%)
netperf results UDP_RR (node 0):
            baseline                     patched
  1:      93981.2392     105321.3925 (   12.07%)
  4:      94939.0909     104816.2619 (   10.40%)
  8:      96025.7748     105125.4418 (    9.48%)
 16:      96218.2809     104576.4454 (    8.69%)
 32:      80740.3541      83242.5556 (    3.10%)
 64:      30622.1298      30805.0830 (    0.60%)
128:      12369.6187      12659.8038 (    2.35%)
netperf results UDP_RR (node 0-1):
            baseline                     patched
  1:      94372.8042     105957.8761 (   12.28%)
  4:      92867.0020     103963.9574 (   11.95%)
  8:      92832.1536     103722.3126 (   11.73%)
 16:      93171.2927     103496.3700 (   11.08%)
 32:      76859.0806      95176.8247 (   23.83%)
 64:      53131.3217      77129.8854 (   45.17%)
128:      24055.1642      30826.3553 (   28.15%)

Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch
in the code has not been tested but it supposed to work.

Chen Yu also noticed this will improve the performance of tbench and
netperf on a 24 CPUs Jacobsville machine, there are 4 CPUs in one
cluster sharing L2 Cache.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
[https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net]
Tested-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
---
 kernel/sched/fair.c     | 51 +++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h    |  1 +
 kernel/sched/topology.c | 10 ++++++++
 3 files changed, 57 insertions(+), 5 deletions(-)

Comments

Peter Zijlstra May 30, 2023, 11:55 a.m. UTC | #1
On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 373ff5f55884..b8c129ed8b47 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>  		}
>  	}
>  
> +	if (static_branch_unlikely(&sched_cluster_active)) {
> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
> +
> +		if (sdc) {
> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
> +				if (!cpumask_test_cpu(cpu, cpus))
> +					continue;
> +
> +				if (has_idle_core) {
> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> +					if ((unsigned int)i < nr_cpumask_bits)
> +						return i;
> +				} else {
> +					if (--nr <= 0)
> +						return -1;
> +					idle_cpu = __select_idle_cpu(cpu, p);
> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
> +						return idle_cpu;
> +				}
> +			}
> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
> +		}
> +	}

Would not this:

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
 		}
 	}
 
+	if (static_branch_unlikely(&sched_cluster_active)) {
+		struct sched_group *sg = sd->groups;
+		if (sg->flags & SD_CLUSTER) {
+			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
+				if (!cpumask_test_cpu(cpu, cpus))
+					continue;
+
+				if (has_idle_core) {
+					i = select_idle_core(p, cpu, cpus, &idle_cpu);
+					if ((unsigned)i < nr_cpumask_bits)
+						return 1;
+				} else {
+					if (--nr <= 0)
+						return -1;
+					idle_cpu = __select_idle_cpu(cpu, p);
+					if ((unsigned)idle_cpu < nr_cpumask_bits)
+						return idle_cpu;
+				}
+			}
+			cpumask_andnot(cpus, cpus, sched_group_span(sg));
+		}
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target + 1) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);

also work? Then we can avoid the extra sd_cluster per-cpu variable.
Yicong Yang May 30, 2023, 2:39 p.m. UTC | #2
On 2023/5/30 19:55, Peter Zijlstra wrote:
> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
> 
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 373ff5f55884..b8c129ed8b47 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>  		}
>>  	}
>>  
>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
>> +
>> +		if (sdc) {
>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
>> +				if (!cpumask_test_cpu(cpu, cpus))
>> +					continue;
>> +
>> +				if (has_idle_core) {
>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>> +					if ((unsigned int)i < nr_cpumask_bits)
>> +						return i;
>> +				} else {
>> +					if (--nr <= 0)
>> +						return -1;
>> +					idle_cpu = __select_idle_cpu(cpu, p);
>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
>> +						return idle_cpu;
>> +				}
>> +			}
>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
>> +		}
>> +	}
> 
> Would not this:
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
>  		}
>  	}
>  
> +	if (static_branch_unlikely(&sched_cluster_active)) {
> +		struct sched_group *sg = sd->groups;
> +		if (sg->flags & SD_CLUSTER) {
> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
> +				if (!cpumask_test_cpu(cpu, cpus))
> +					continue;
> +
> +				if (has_idle_core) {
> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> +					if ((unsigned)i < nr_cpumask_bits)
> +						return 1;
> +				} else {
> +					if (--nr <= 0)
> +						return -1;
> +					idle_cpu = __select_idle_cpu(cpu, p);
> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
> +						return idle_cpu;
> +				}
> +			}
> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
> +		}
> +	}
> +
>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
>  		if (has_idle_core) {
>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
> 
> also work? Then we can avoid the extra sd_cluster per-cpu variable.
> 

I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 69968ed9ffb9..5c443b74abf5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,

                cpumask_or(groupmask, groupmask, sched_group_span(group));

-               printk(KERN_CONT " %d:{ span=%*pbl",
-                               group->sgc->id,
+               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
+                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
                                cpumask_pr_args(sched_group_span(group)));

                if ((sd->flags & SD_OVERLAP) &&

Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
as cluster:

[    8.886099] CPU0 attaching sched-domain(s):
[    8.889539]  domain-0: span=0,40 level=SMT
[    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
[    8.897538]   domain-1: span=0-19,40-59 level=MC
[    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
[    8.905538]    domain-2: span=0-79 level=NUMA
[    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }

I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.

Thanks,
Yicong
Yicong Yang May 31, 2023, 8:21 a.m. UTC | #3
On 2023/5/30 22:39, Yicong Yang wrote:
> On 2023/5/30 19:55, Peter Zijlstra wrote:
>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 373ff5f55884..b8c129ed8b47 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>>  		}
>>>  	}
>>>  
>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
>>> +
>>> +		if (sdc) {
>>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
>>> +				if (!cpumask_test_cpu(cpu, cpus))
>>> +					continue;
>>> +
>>> +				if (has_idle_core) {
>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>> +					if ((unsigned int)i < nr_cpumask_bits)
>>> +						return i;
>>> +				} else {
>>> +					if (--nr <= 0)
>>> +						return -1;
>>> +					idle_cpu = __select_idle_cpu(cpu, p);
>>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
>>> +						return idle_cpu;
>>> +				}
>>> +			}
>>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
>>> +		}
>>> +	}
>>
>> Would not this:
>>
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
>>  		}
>>  	}
>>  
>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>> +		struct sched_group *sg = sd->groups;
>> +		if (sg->flags & SD_CLUSTER) {
>> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
>> +				if (!cpumask_test_cpu(cpu, cpus))
>> +					continue;
>> +
>> +				if (has_idle_core) {
>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>> +					if ((unsigned)i < nr_cpumask_bits)
>> +						return 1;
>> +				} else {
>> +					if (--nr <= 0)
>> +						return -1;
>> +					idle_cpu = __select_idle_cpu(cpu, p);
>> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
>> +						return idle_cpu;
>> +				}
>> +			}
>> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
>> +		}
>> +	}
>> +
>>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
>>  		if (has_idle_core) {
>>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>
>> also work? Then we can avoid the extra sd_cluster per-cpu variable.
>>
> 
> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:
> 
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 69968ed9ffb9..5c443b74abf5 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
> 
>                 cpumask_or(groupmask, groupmask, sched_group_span(group));
> 
> -               printk(KERN_CONT " %d:{ span=%*pbl",
> -                               group->sgc->id,
> +               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
> +                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
>                                 cpumask_pr_args(sched_group_span(group)));
> 
>                 if ((sd->flags & SD_OVERLAP) &&
> 
> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
> as cluster:
> 
> [    8.886099] CPU0 attaching sched-domain(s):
> [    8.889539]  domain-0: span=0,40 level=SMT
> [    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
> [    8.897538]   domain-1: span=0-19,40-59 level=MC
> [    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
> [    8.905538]    domain-2: span=0-79 level=NUMA
> [    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
> 
> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.
> 

Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper
domains' group won't be updated if degeneration happens. So we cannot use the suggested approach
for cluster scanning and sd_cluster per-cpu variable is still needed.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749

Thanks.
Chen Yu June 8, 2023, 3:26 a.m. UTC | #4
On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote:
> On 2023/5/30 22:39, Yicong Yang wrote:
> > On 2023/5/30 19:55, Peter Zijlstra wrote:
> >> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
> >>
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index 373ff5f55884..b8c129ed8b47 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> >>>  		}
> >>>  	}
> >>>  
> >>> +	if (static_branch_unlikely(&sched_cluster_active)) {
> >>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
> >>> +
> >>> +		if (sdc) {
> >>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
> >>> +				if (!cpumask_test_cpu(cpu, cpus))
> >>> +					continue;
> >>> +
> >>> +				if (has_idle_core) {
> >>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>> +					if ((unsigned int)i < nr_cpumask_bits)
> >>> +						return i;
> >>> +				} else {
> >>> +					if (--nr <= 0)
> >>> +						return -1;
> >>> +					idle_cpu = __select_idle_cpu(cpu, p);
> >>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
> >>> +						return idle_cpu;
> >>> +				}
> >>> +			}
> >>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
> >>> +		}
> >>> +	}
> >>
> >> Would not this:
> >>
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
> >>  		}
> >>  	}
> >>  
> >> +	if (static_branch_unlikely(&sched_cluster_active)) {
> >> +		struct sched_group *sg = sd->groups;
> >> +		if (sg->flags & SD_CLUSTER) {
> >> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
> >> +				if (!cpumask_test_cpu(cpu, cpus))
> >> +					continue;
> >> +
> >> +				if (has_idle_core) {
> >> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >> +					if ((unsigned)i < nr_cpumask_bits)
> >> +						return 1;
> >> +				} else {
> >> +					if (--nr <= 0)
> >> +						return -1;
> >> +					idle_cpu = __select_idle_cpu(cpu, p);
> >> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
> >> +						return idle_cpu;
> >> +				}
> >> +			}
> >> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
> >> +		}
> >> +	}
> >> +
> >>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
> >>  		if (has_idle_core) {
> >>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>
> >> also work? Then we can avoid the extra sd_cluster per-cpu variable.
> >>
> > 
> > I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
> > Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:
> > 
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index 69968ed9ffb9..5c443b74abf5 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
> > 
> >                 cpumask_or(groupmask, groupmask, sched_group_span(group));
> > 
> > -               printk(KERN_CONT " %d:{ span=%*pbl",
> > -                               group->sgc->id,
> > +               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
> > +                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
> >                                 cpumask_pr_args(sched_group_span(group)));
> > 
> >                 if ((sd->flags & SD_OVERLAP) &&
> > 
> > Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
> > as cluster:
> > 
> > [    8.886099] CPU0 attaching sched-domain(s):
> > [    8.889539]  domain-0: span=0,40 level=SMT
> > [    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
> > [    8.897538]   domain-1: span=0-19,40-59 level=MC
> > [    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
> > [    8.905538]    domain-2: span=0-79 level=NUMA
> > [    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
> > 
> > I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
> > we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.
> > 
> 
> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper
> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach
> for cluster scanning and sd_cluster per-cpu variable is still needed.
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749
> 
> Thanks.
> 
>
Is this an issue? Suppose sched domain A's parent domain
is B, B's parent sched domain is C. When B degenerates, C's child domain
pointer is adjusted to A. However, currently the code does not adjust C's
sched groups' flags. Should we adjust C's sched groups flags to be the same
as A to keep consistency?

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6198fa135176..fe3fd70f2313 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 
 	/* Remove the sched domains which do not contribute to scheduling. */
 	for (tmp = sd; tmp; ) {
-		struct sched_domain *parent = tmp->parent;
+		struct sched_domain *parent = tmp->parent, *pparent;
 		if (!parent)
 			break;
 
 		if (sd_parent_degenerate(tmp, parent)) {
-			tmp->parent = parent->parent;
-			if (parent->parent)
-				parent->parent->child = tmp;
+			pparent = parent->parent;
+			tmp->parent = pparent;
 			/*
 			 * Transfer SD_PREFER_SIBLING down in case of a
 			 * degenerate parent; the spans match for this
@@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 			 */
 			if (parent->flags & SD_PREFER_SIBLING)
 				tmp->flags |= SD_PREFER_SIBLING;
+
+			if (pparent) {
+				struct sched_group *sg = pparent->groups;
+
+				do {
+					sg->flags = tmp->flags;
+					sg = sg->next;
+				} while (sg != pparent->groups);
+
+				pparent->child = tmp;
+			}
+
 			destroy_sched_domain(parent);
 		} else
 			tmp = tmp->parent;
Yicong Yang June 8, 2023, 6:45 a.m. UTC | #5
On 2023/6/8 11:26, Chen Yu wrote:
> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote:
>> On 2023/5/30 22:39, Yicong Yang wrote:
>>> On 2023/5/30 19:55, Peter Zijlstra wrote:
>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index 373ff5f55884..b8c129ed8b47 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>>>>  		}
>>>>>  	}
>>>>>  
>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>>>>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
>>>>> +
>>>>> +		if (sdc) {
>>>>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
>>>>> +					continue;
>>>>> +
>>>>> +				if (has_idle_core) {
>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>> +					if ((unsigned int)i < nr_cpumask_bits)
>>>>> +						return i;
>>>>> +				} else {
>>>>> +					if (--nr <= 0)
>>>>> +						return -1;
>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
>>>>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
>>>>> +						return idle_cpu;
>>>>> +				}
>>>>> +			}
>>>>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
>>>>> +		}
>>>>> +	}
>>>>
>>>> Would not this:
>>>>
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
>>>>  		}
>>>>  	}
>>>>  
>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>>>> +		struct sched_group *sg = sd->groups;
>>>> +		if (sg->flags & SD_CLUSTER) {
>>>> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
>>>> +				if (!cpumask_test_cpu(cpu, cpus))
>>>> +					continue;
>>>> +
>>>> +				if (has_idle_core) {
>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>> +					if ((unsigned)i < nr_cpumask_bits)
>>>> +						return 1;
>>>> +				} else {
>>>> +					if (--nr <= 0)
>>>> +						return -1;
>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
>>>> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
>>>> +						return idle_cpu;
>>>> +				}
>>>> +			}
>>>> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
>>>> +		}
>>>> +	}
>>>> +
>>>>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
>>>>  		if (has_idle_core) {
>>>>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>
>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable.
>>>>
>>>
>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:
>>>
>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>> index 69968ed9ffb9..5c443b74abf5 100644
>>> --- a/kernel/sched/topology.c
>>> +++ b/kernel/sched/topology.c
>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
>>>
>>>                 cpumask_or(groupmask, groupmask, sched_group_span(group));
>>>
>>> -               printk(KERN_CONT " %d:{ span=%*pbl",
>>> -                               group->sgc->id,
>>> +               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
>>> +                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
>>>                                 cpumask_pr_args(sched_group_span(group)));
>>>
>>>                 if ((sd->flags & SD_OVERLAP) &&
>>>
>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
>>> as cluster:
>>>
>>> [    8.886099] CPU0 attaching sched-domain(s):
>>> [    8.889539]  domain-0: span=0,40 level=SMT
>>> [    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
>>> [    8.897538]   domain-1: span=0-19,40-59 level=MC
>>> [    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
>>> [    8.905538]    domain-2: span=0-79 level=NUMA
>>> [    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
>>>
>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.
>>>
>>
>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper
>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach
>> for cluster scanning and sd_cluster per-cpu variable is still needed.
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749
>>
>> Thanks.
>>
>>
> Is this an issue? Suppose sched domain A's parent domain
> is B, B's parent sched domain is C. When B degenerates, C's child domain
> pointer is adjusted to A. However, currently the code does not adjust C's
> sched groups' flags. Should we adjust C's sched groups flags to be the same
> as A to keep consistency?

It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses
it within the SMT so I think update the lowest domain's group flag works. For correctness
all the domain group's flag should derives from its real child. I tried to solve this at group
building but seems hard to do, at that time we don't know whether a domain is going to degenerate
or not since the groups it not built.

> 
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 6198fa135176..fe3fd70f2313 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>  
>  	/* Remove the sched domains which do not contribute to scheduling. */
>  	for (tmp = sd; tmp; ) {
> -		struct sched_domain *parent = tmp->parent;
> +		struct sched_domain *parent = tmp->parent, *pparent;
>  		if (!parent)
>  			break;
>  
>  		if (sd_parent_degenerate(tmp, parent)) {
> -			tmp->parent = parent->parent;
> -			if (parent->parent)
> -				parent->parent->child = tmp;
> +			pparent = parent->parent;
> +			tmp->parent = pparent;
>  			/*
>  			 * Transfer SD_PREFER_SIBLING down in case of a
>  			 * degenerate parent; the spans match for this
> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>  			 */
>  			if (parent->flags & SD_PREFER_SIBLING)
>  				tmp->flags |= SD_PREFER_SIBLING;
> +
> +			if (pparent) {
> +				struct sched_group *sg = pparent->groups;
> +
> +				do {
> +					sg->flags = tmp->flags;

May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from
remote group have the same flags with @tmp?

> +					sg = sg->next;
> +				} while (sg != pparent->groups);
> +
> +				pparent->child = tmp;
> +			}
> +
>  			destroy_sched_domain(parent);
>  		} else
>  			tmp = tmp->parent;
> 

The code you pasted at last of your mail is correct I think, not sure it doesn't appear here when reply...

Thanks.
Chen Yu June 9, 2023, 10:50 a.m. UTC | #6
On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote:
> On 2023/6/8 11:26, Chen Yu wrote:
> > On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote:
> >> On 2023/5/30 22:39, Yicong Yang wrote:
> >>> On 2023/5/30 19:55, Peter Zijlstra wrote:
> >>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
> >>>>
> >>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>>>> index 373ff5f55884..b8c129ed8b47 100644
> >>>>> --- a/kernel/sched/fair.c
> >>>>> +++ b/kernel/sched/fair.c
> >>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> >>>>>  		}
> >>>>>  	}
> >>>>>  
> >>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
> >>>>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
> >>>>> +
> >>>>> +		if (sdc) {
> >>>>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
> >>>>> +				if (!cpumask_test_cpu(cpu, cpus))
> >>>>> +					continue;
> >>>>> +
> >>>>> +				if (has_idle_core) {
> >>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>>>> +					if ((unsigned int)i < nr_cpumask_bits)
> >>>>> +						return i;
> >>>>> +				} else {
> >>>>> +					if (--nr <= 0)
> >>>>> +						return -1;
> >>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
> >>>>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
> >>>>> +						return idle_cpu;
> >>>>> +				}
> >>>>> +			}
> >>>>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
> >>>>> +		}
> >>>>> +	}
> >>>>
> >>>> Would not this:
> >>>>
> >>>> --- a/kernel/sched/fair.c
> >>>> +++ b/kernel/sched/fair.c
> >>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
> >>>>  		}
> >>>>  	}
> >>>>  
> >>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
> >>>> +		struct sched_group *sg = sd->groups;
> >>>> +		if (sg->flags & SD_CLUSTER) {
> >>>> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
> >>>> +				if (!cpumask_test_cpu(cpu, cpus))
> >>>> +					continue;
> >>>> +
> >>>> +				if (has_idle_core) {
> >>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>>> +					if ((unsigned)i < nr_cpumask_bits)
> >>>> +						return 1;
> >>>> +				} else {
> >>>> +					if (--nr <= 0)
> >>>> +						return -1;
> >>>> +					idle_cpu = __select_idle_cpu(cpu, p);
> >>>> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
> >>>> +						return idle_cpu;
> >>>> +				}
> >>>> +			}
> >>>> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
> >>>> +		}
> >>>> +	}
> >>>> +
> >>>>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
> >>>>  		if (has_idle_core) {
> >>>>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>>>
> >>>> also work? Then we can avoid the extra sd_cluster per-cpu variable.
> >>>>
> >>>
> >>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
> >>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:
> >>>
> >>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> >>> index 69968ed9ffb9..5c443b74abf5 100644
> >>> --- a/kernel/sched/topology.c
> >>> +++ b/kernel/sched/topology.c
> >>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
> >>>
> >>>                 cpumask_or(groupmask, groupmask, sched_group_span(group));
> >>>
> >>> -               printk(KERN_CONT " %d:{ span=%*pbl",
> >>> -                               group->sgc->id,
> >>> +               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
> >>> +                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
> >>>                                 cpumask_pr_args(sched_group_span(group)));
> >>>
> >>>                 if ((sd->flags & SD_OVERLAP) &&
> >>>
> >>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
> >>> as cluster:
> >>>
> >>> [    8.886099] CPU0 attaching sched-domain(s):
> >>> [    8.889539]  domain-0: span=0,40 level=SMT
> >>> [    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
> >>> [    8.897538]   domain-1: span=0-19,40-59 level=MC
> >>> [    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
> >>> [    8.905538]    domain-2: span=0-79 level=NUMA
> >>> [    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
> >>>
> >>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
> >>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.
> >>>
> >>
> >> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper
> >> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach
> >> for cluster scanning and sd_cluster per-cpu variable is still needed.
> >>
> >> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749
> >>
> >> Thanks.
> >>
> >>
> > Is this an issue? Suppose sched domain A's parent domain
> > is B, B's parent sched domain is C. When B degenerates, C's child domain
> > pointer is adjusted to A. However, currently the code does not adjust C's
> > sched groups' flags. Should we adjust C's sched groups flags to be the same
> > as A to keep consistency?
> 
> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses
> it within the SMT so I think update the lowest domain's group flag works. For correctness
> all the domain group's flag should derives from its real child. I tried to solve this at group
> building but seems hard to do, at that time we don't know whether a domain is going to degenerate
> or not since the groups it not built.
> 
> > 
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index 6198fa135176..fe3fd70f2313 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> >  
> >  	/* Remove the sched domains which do not contribute to scheduling. */
> >  	for (tmp = sd; tmp; ) {
> > -		struct sched_domain *parent = tmp->parent;
> > +		struct sched_domain *parent = tmp->parent, *pparent;
> >  		if (!parent)
> >  			break;
> >  
> >  		if (sd_parent_degenerate(tmp, parent)) {
> > -			tmp->parent = parent->parent;
> > -			if (parent->parent)
> > -				parent->parent->child = tmp;
> > +			pparent = parent->parent;
> > +			tmp->parent = pparent;
> >  			/*
> >  			 * Transfer SD_PREFER_SIBLING down in case of a
> >  			 * degenerate parent; the spans match for this
> > @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> >  			 */
> >  			if (parent->flags & SD_PREFER_SIBLING)
> >  				tmp->flags |= SD_PREFER_SIBLING;
> > +
> > +			if (pparent) {
> > +				struct sched_group *sg = pparent->groups;
> > +
> > +				do {
> > +					sg->flags = tmp->flags;
> 
> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from
> remote group have the same flags with @tmp?
>
Good question, for heterogenous platforms sched groups within the same domain might not always
have the same flags, because if group1 and group2 sit in big/small-core child domain, they could
have different balance flags in theory. Maybe only update the local group's flag is accurate.

I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it
should be in tip:
https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/
We could adjust it based on his change to remove SD_CLUSTER, or we can
replace groups->flag with tmp->flag directly, in case we have other flags to be
adjusted in the future.

thanks,
Chenyu
Yicong Yang June 13, 2023, 7:36 a.m. UTC | #7
On 2023/6/9 18:50, Chen Yu wrote:
> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote:
>> On 2023/6/8 11:26, Chen Yu wrote:
>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote:
>>>> On 2023/5/30 22:39, Yicong Yang wrote:
>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote:
>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
>>>>>>
>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>>> index 373ff5f55884..b8c129ed8b47 100644
>>>>>>> --- a/kernel/sched/fair.c
>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>>>>>>  		}
>>>>>>>  	}
>>>>>>>  
>>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>>>>>>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
>>>>>>> +
>>>>>>> +		if (sdc) {
>>>>>>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
>>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
>>>>>>> +					continue;
>>>>>>> +
>>>>>>> +				if (has_idle_core) {
>>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>> +					if ((unsigned int)i < nr_cpumask_bits)
>>>>>>> +						return i;
>>>>>>> +				} else {
>>>>>>> +					if (--nr <= 0)
>>>>>>> +						return -1;
>>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
>>>>>>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
>>>>>>> +						return idle_cpu;
>>>>>>> +				}
>>>>>>> +			}
>>>>>>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
>>>>>>> +		}
>>>>>>> +	}
>>>>>>
>>>>>> Would not this:
>>>>>>
>>>>>> --- a/kernel/sched/fair.c
>>>>>> +++ b/kernel/sched/fair.c
>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
>>>>>>  		}
>>>>>>  	}
>>>>>>  
>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>>>>>> +		struct sched_group *sg = sd->groups;
>>>>>> +		if (sg->flags & SD_CLUSTER) {
>>>>>> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
>>>>>> +					continue;
>>>>>> +
>>>>>> +				if (has_idle_core) {
>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>> +					if ((unsigned)i < nr_cpumask_bits)
>>>>>> +						return 1;
>>>>>> +				} else {
>>>>>> +					if (--nr <= 0)
>>>>>> +						return -1;
>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
>>>>>> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
>>>>>> +						return idle_cpu;
>>>>>> +				}
>>>>>> +			}
>>>>>> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
>>>>>>  		if (has_idle_core) {
>>>>>>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>
>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable.
>>>>>>
>>>>>
>>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
>>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:
>>>>>
>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>>>> index 69968ed9ffb9..5c443b74abf5 100644
>>>>> --- a/kernel/sched/topology.c
>>>>> +++ b/kernel/sched/topology.c
>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
>>>>>
>>>>>                 cpumask_or(groupmask, groupmask, sched_group_span(group));
>>>>>
>>>>> -               printk(KERN_CONT " %d:{ span=%*pbl",
>>>>> -                               group->sgc->id,
>>>>> +               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
>>>>> +                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
>>>>>                                 cpumask_pr_args(sched_group_span(group)));
>>>>>
>>>>>                 if ((sd->flags & SD_OVERLAP) &&
>>>>>
>>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
>>>>> as cluster:
>>>>>
>>>>> [    8.886099] CPU0 attaching sched-domain(s):
>>>>> [    8.889539]  domain-0: span=0,40 level=SMT
>>>>> [    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
>>>>> [    8.897538]   domain-1: span=0-19,40-59 level=MC
>>>>> [    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
>>>>> [    8.905538]    domain-2: span=0-79 level=NUMA
>>>>> [    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
>>>>>
>>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
>>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.
>>>>>
>>>>
>>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper
>>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach
>>>> for cluster scanning and sd_cluster per-cpu variable is still needed.
>>>>
>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749
>>>>
>>>> Thanks.
>>>>
>>>>
>>> Is this an issue? Suppose sched domain A's parent domain
>>> is B, B's parent sched domain is C. When B degenerates, C's child domain
>>> pointer is adjusted to A. However, currently the code does not adjust C's
>>> sched groups' flags. Should we adjust C's sched groups flags to be the same
>>> as A to keep consistency?
>>
>> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses
>> it within the SMT so I think update the lowest domain's group flag works. For correctness
>> all the domain group's flag should derives from its real child. I tried to solve this at group
>> building but seems hard to do, at that time we don't know whether a domain is going to degenerate
>> or not since the groups it not built.
>>
>>>
>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>> index 6198fa135176..fe3fd70f2313 100644
>>> --- a/kernel/sched/topology.c
>>> +++ b/kernel/sched/topology.c
>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>>  
>>>  	/* Remove the sched domains which do not contribute to scheduling. */
>>>  	for (tmp = sd; tmp; ) {
>>> -		struct sched_domain *parent = tmp->parent;
>>> +		struct sched_domain *parent = tmp->parent, *pparent;
>>>  		if (!parent)
>>>  			break;
>>>  
>>>  		if (sd_parent_degenerate(tmp, parent)) {
>>> -			tmp->parent = parent->parent;
>>> -			if (parent->parent)
>>> -				parent->parent->child = tmp;
>>> +			pparent = parent->parent;
>>> +			tmp->parent = pparent;
>>>  			/*
>>>  			 * Transfer SD_PREFER_SIBLING down in case of a
>>>  			 * degenerate parent; the spans match for this
>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>>  			 */
>>>  			if (parent->flags & SD_PREFER_SIBLING)
>>>  				tmp->flags |= SD_PREFER_SIBLING;
>>> +
>>> +			if (pparent) {
>>> +				struct sched_group *sg = pparent->groups;
>>> +
>>> +				do {
>>> +					sg->flags = tmp->flags;
>>
>> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from
>> remote group have the same flags with @tmp?
>>
> Good question, for heterogenous platforms sched groups within the same domain might not always
> have the same flags, because if group1 and group2 sit in big/small-core child domain, they could
> have different balance flags in theory. Maybe only update the local group's flag is accurate.
> 
> I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it
> should be in tip:
> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/
> We could adjust it based on his change to remove SD_CLUSTER, or we can
> replace groups->flag with tmp->flag directly, in case we have other flags to be
> adjusted in the future.
> 

Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update
local groups flag should satisfy our needs. I tried to use the correct child domains to build the
sched groups then all the groups will have the correct flags, but it'll be a bit more complex.

Thanks.
Yicong Yang June 13, 2023, 7:44 a.m. UTC | #8
On 2023/6/12 13:22, Chen Yu wrote:
> On 2023-06-12 at 10:31:39 +0530, Gautham R. Shenoy wrote:
>> Hello Yicong,
>>
>>
>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
>>> From: Barry Song <song.bao.hua@hisilicon.com>
>> [..snip..]
>>
>>> @@ -7103,7 +7127,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>>>  	bool has_idle_core = false;
>>>  	struct sched_domain *sd;
>>>  	unsigned long task_util, util_min, util_max;
>>> -	int i, recent_used_cpu;
>>> +	int i, recent_used_cpu, prev_aff = -1;
>>>  
>>>  	/*
>>>  	 * On asymmetric system, update task utilization because we will check
>>> @@ -7130,8 +7154,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>>>  	 */
>>>  	if (prev != target && cpus_share_cache(prev, target) &&
>>>  	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
>>> -	    asym_fits_cpu(task_util, util_min, util_max, prev))
>>> -		return prev;
>>> +	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
>>> +		if (cpus_share_lowest_cache(prev, target))
>>
>> For platforms without the cluster domain, the cpus_share_lowest_cache
>> check is a repetition of the cpus_share_cache(prev, target) check. Can
>> we avoid this using a static branch check for cluster ?
>>
>>
> Sounds good. 
>>> +			return prev;
>>> +		prev_aff = prev;
>>> +	}
>>>  
>>>  	/*
>>>  	 * Allow a per-cpu kthread to stack with the wakee if the
>>> @@ -7158,7 +7185,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>>>  	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
>>>  	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
>>>  	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
>>> -		return recent_used_cpu;
>>> +		if (cpus_share_lowest_cache(recent_used_cpu, target))
>>
>> Same here.
>>
>>> +			return recent_used_cpu;
>>> +	} else {
>>> +		recent_used_cpu = -1;
>>>  	}
>>>  
>>>  	/*
>>> @@ -7199,6 +7229,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>>>  	if ((unsigned)i < nr_cpumask_bits)
>>>  		return i;
>>>  
>>> +	/*
>>> +	 * For cluster machines which have lower sharing cache like L2 or
>>> +	 * LLC Tag, we tend to find an idle CPU in the target's cluster
>>> +	 * first. But prev_cpu or recent_used_cpu may also be a good candidate,
>>> +	 * use them if possible when no idle CPU found in select_idle_cpu().
>>> +	 */
>>> +	if ((unsigned int)prev_aff < nr_cpumask_bits)
>>> +		return prev_aff;
>>
>> Shouldn't we check if prev_aff (and the recent_used_cpu below) is
>> still idle ?
>>
>>
> When we reach here, the target is non-idle, and the prev_aff is idle.
> Although there is a race condition that prev_aff becomes non-idle
> and target becomes idle after select_idle_cpu(), this window might be
> small IMO.
> 

Yes. Races here but adding a check won't cost much, so it's ok for me
to check it or not.

Thanks.
Yicong Yang June 13, 2023, 8:09 a.m. UTC | #9
On 2023/6/13 15:36, Yicong Yang wrote:
> On 2023/6/9 18:50, Chen Yu wrote:
>> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote:
>>> On 2023/6/8 11:26, Chen Yu wrote:
>>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote:
>>>>> On 2023/5/30 22:39, Yicong Yang wrote:
>>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote:
>>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
>>>>>>>
>>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>>>> index 373ff5f55884..b8c129ed8b47 100644
>>>>>>>> --- a/kernel/sched/fair.c
>>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>>>>>>>  		}
>>>>>>>>  	}
>>>>>>>>  
>>>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>>>>>>>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
>>>>>>>> +
>>>>>>>> +		if (sdc) {
>>>>>>>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
>>>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
>>>>>>>> +					continue;
>>>>>>>> +
>>>>>>>> +				if (has_idle_core) {
>>>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>>> +					if ((unsigned int)i < nr_cpumask_bits)
>>>>>>>> +						return i;
>>>>>>>> +				} else {
>>>>>>>> +					if (--nr <= 0)
>>>>>>>> +						return -1;
>>>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
>>>>>>>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
>>>>>>>> +						return idle_cpu;
>>>>>>>> +				}
>>>>>>>> +			}
>>>>>>>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
>>>>>>>> +		}
>>>>>>>> +	}
>>>>>>>
>>>>>>> Would not this:
>>>>>>>
>>>>>>> --- a/kernel/sched/fair.c
>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
>>>>>>>  		}
>>>>>>>  	}
>>>>>>>  
>>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>>>>>>> +		struct sched_group *sg = sd->groups;
>>>>>>> +		if (sg->flags & SD_CLUSTER) {
>>>>>>> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
>>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
>>>>>>> +					continue;
>>>>>>> +
>>>>>>> +				if (has_idle_core) {
>>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>> +					if ((unsigned)i < nr_cpumask_bits)
>>>>>>> +						return 1;
>>>>>>> +				} else {
>>>>>>> +					if (--nr <= 0)
>>>>>>> +						return -1;
>>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
>>>>>>> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
>>>>>>> +						return idle_cpu;
>>>>>>> +				}
>>>>>>> +			}
>>>>>>> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
>>>>>>> +		}
>>>>>>> +	}
>>>>>>> +
>>>>>>>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
>>>>>>>  		if (has_idle_core) {
>>>>>>>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>>
>>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable.
>>>>>>>
>>>>>>
>>>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
>>>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:
>>>>>>
>>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>>>>> index 69968ed9ffb9..5c443b74abf5 100644
>>>>>> --- a/kernel/sched/topology.c
>>>>>> +++ b/kernel/sched/topology.c
>>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
>>>>>>
>>>>>>                 cpumask_or(groupmask, groupmask, sched_group_span(group));
>>>>>>
>>>>>> -               printk(KERN_CONT " %d:{ span=%*pbl",
>>>>>> -                               group->sgc->id,
>>>>>> +               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
>>>>>> +                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
>>>>>>                                 cpumask_pr_args(sched_group_span(group)));
>>>>>>
>>>>>>                 if ((sd->flags & SD_OVERLAP) &&
>>>>>>
>>>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
>>>>>> as cluster:
>>>>>>
>>>>>> [    8.886099] CPU0 attaching sched-domain(s):
>>>>>> [    8.889539]  domain-0: span=0,40 level=SMT
>>>>>> [    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
>>>>>> [    8.897538]   domain-1: span=0-19,40-59 level=MC
>>>>>> [    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
>>>>>> [    8.905538]    domain-2: span=0-79 level=NUMA
>>>>>> [    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
>>>>>>
>>>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
>>>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.
>>>>>>
>>>>>
>>>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper
>>>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach
>>>>> for cluster scanning and sd_cluster per-cpu variable is still needed.
>>>>>
>>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749
>>>>>
>>>>> Thanks.
>>>>>
>>>>>
>>>> Is this an issue? Suppose sched domain A's parent domain
>>>> is B, B's parent sched domain is C. When B degenerates, C's child domain
>>>> pointer is adjusted to A. However, currently the code does not adjust C's
>>>> sched groups' flags. Should we adjust C's sched groups flags to be the same
>>>> as A to keep consistency?
>>>
>>> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses
>>> it within the SMT so I think update the lowest domain's group flag works. For correctness
>>> all the domain group's flag should derives from its real child. I tried to solve this at group
>>> building but seems hard to do, at that time we don't know whether a domain is going to degenerate
>>> or not since the groups it not built.
>>>
>>>>
>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>>> index 6198fa135176..fe3fd70f2313 100644
>>>> --- a/kernel/sched/topology.c
>>>> +++ b/kernel/sched/topology.c
>>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>>>  
>>>>  	/* Remove the sched domains which do not contribute to scheduling. */
>>>>  	for (tmp = sd; tmp; ) {
>>>> -		struct sched_domain *parent = tmp->parent;
>>>> +		struct sched_domain *parent = tmp->parent, *pparent;
>>>>  		if (!parent)
>>>>  			break;
>>>>  
>>>>  		if (sd_parent_degenerate(tmp, parent)) {
>>>> -			tmp->parent = parent->parent;
>>>> -			if (parent->parent)
>>>> -				parent->parent->child = tmp;
>>>> +			pparent = parent->parent;
>>>> +			tmp->parent = pparent;
>>>>  			/*
>>>>  			 * Transfer SD_PREFER_SIBLING down in case of a
>>>>  			 * degenerate parent; the spans match for this
>>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>>>  			 */
>>>>  			if (parent->flags & SD_PREFER_SIBLING)
>>>>  				tmp->flags |= SD_PREFER_SIBLING;
>>>> +
>>>> +			if (pparent) {
>>>> +				struct sched_group *sg = pparent->groups;
>>>> +
>>>> +				do {
>>>> +					sg->flags = tmp->flags;
>>>
>>> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from
>>> remote group have the same flags with @tmp?
>>>
>> Good question, for heterogenous platforms sched groups within the same domain might not always
>> have the same flags, because if group1 and group2 sit in big/small-core child domain, they could
>> have different balance flags in theory. Maybe only update the local group's flag is accurate.
>>
>> I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it
>> should be in tip:
>> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/
>> We could adjust it based on his change to remove SD_CLUSTER, or we can
>> replace groups->flag with tmp->flag directly, in case we have other flags to be
>> adjusted in the future.
>>
> 
> Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update
> local groups flag should satisfy our needs. I tried to use the correct child domains to build the
> sched groups then all the groups will have the correct flags, but it'll be a bit more complex.
> 

something like below, detect the sched domain degeneration first and try to rebuild the groups if
necessary. Works on a non-cluster machine emulated with QEMU:

qemu-system-aarch64 \
        -kernel ${Image} \
        -smp cores=16,threads=2 \
        -cpu host --enable-kvm \
        -m 32G \
        -object memory-backend-ram,id=node0,size=8G \
        -object memory-backend-ram,id=node1,size=8G \
        -object memory-backend-ram,id=node2,size=8G \
        -object memory-backend-ram,id=node3,size=8G \
        -numa node,memdev=node0,cpus=0-7,nodeid=0 \
        -numa node,memdev=node1,cpus=8-15,nodeid=1 \
        -numa node,memdev=node2,cpus=16-23,nodeid=2 \
        -numa node,memdev=node3,cpus=24-31,nodeid=3 \
        -numa dist,src=0,dst=1,val=12 \
        -numa dist,src=0,dst=2,val=20 \
        -numa dist,src=0,dst=3,val=22 \
        -numa dist,src=1,dst=2,val=22 \
        -numa dist,src=1,dst=3,val=24 \
        -numa dist,src=2,dst=3,val=12 \
        -machine virt,iommu=smmuv3 \
        -net none -initrd ${Rootfs} \
        -nographic -bios $(pwd)/QEMU_EFI.fd \
        -append "rdinit=/init console=ttyAMA0 earlycon=pl011,0x9000000 sched_verbose schedstats=enable loglevel=8"

The flags looks correct:
[    4.379910] CPU0 attaching sched-domain(s):
[    4.380540]  domain-0: span=0-1 level=SMT
[    4.381130]   groups: 0:{ cluster: false span=0 cap=1023 }, 1:{ cluster: false span=1 }
[    4.382353]   domain-1: span=0-7 level=MC
[    4.382950]    groups: 0:{ cluster: false span=0-1 cap=2047 }, 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 }
[    4.385378]    domain-2: span=0-15 level=NUMA
[    4.386047]     groups: 0:{ cluster: false span=0-7 cap=8191 }, 8:{ cluster: false span=8-15 cap=8192 }
[    4.387542]     domain-3: span=0-23 level=NUMA
[    4.388197]      groups: 0:{ cluster: false span=0-15 cap=16383 }, 16:{ cluster: false span=16-23 cap=8192 }
[    4.389661]      domain-4: span=0-31 level=NUMA
[    4.390358]       groups: 0:{ cluster: false span=0-23 mask=0-7 cap=24575 }, 24:{ cluster: false span=16-31 mask=24-31 cap=16384 }


diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index c0d21667ddf3..d3bf5a1c85bc 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -99,6 +99,7 @@ struct sched_domain {
        int nohz_idle;                  /* NOHZ IDLE status */
        int flags;                      /* See SD_* */
        int level;
+       int need_degeneration;

        /* Runtime fields. */
        unsigned long last_balance;     /* init to jiffies. units in jiffies */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 69968ed9ffb9..561649ddcfe7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,

                cpumask_or(groupmask, groupmask, sched_group_span(group));

-               printk(KERN_CONT " %d:{ span=%*pbl",
-                               group->sgc->id,
+               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
+                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
                                cpumask_pr_args(sched_group_span(group)));

                if ((sd->flags & SD_OVERLAP) &&
@@ -623,6 +623,32 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
        } while (sg != first);
 }

+/*
+ * Reset the sched groups for a rebuild.
+ */
+static void reset_sched_domain_groups(struct sched_domain *sd)
+{
+       struct sched_group *sg, *first, *next;
+
+       if (!sd->groups)
+               return;
+
+       sg = first = sd->groups;
+       do {
+               next = sg->next;
+
+               if (sd->flags & SD_OVERLAP) {
+                       if (atomic_dec_and_test(&sg->ref))
+                               kfree(sg);
+               } else {
+                       atomic_dec(&sg->ref);
+               }
+               atomic_dec(&sg->sgc->ref);
+
+               sg = next;
+       } while (sg != first);
+}
+
 static void destroy_sched_domain(struct sched_domain *sd)
 {
        /*
@@ -717,6 +743,42 @@ static void update_top_cache_domain(int cpu)
        rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
 }

+static bool probe_sched_domain_degeneration(struct sched_domain *sd, int cpu)
+{
+       bool has_degeneration = false;
+       struct sched_domain *tmp = sd, *parent;
+
+       if (tmp) {
+               parent = tmp->parent;
+               while (parent) {
+                       if (sd_parent_degenerate(tmp, parent)) {
+                               parent->need_degeneration = 1;
+                               has_degeneration = true;
+                               parent = parent->parent;
+                       } else {
+                               tmp = parent;
+                               parent = tmp->parent;
+                       }
+               }
+       }
+
+       if (sd && sd_degenerate(sd)) {
+               sd->need_degeneration = 1;
+               has_degeneration = true;
+       }
+
+       /*
+        * On degeneration reset the sched group's refcount since we need to
+        * rebuild them.
+        */
+       if (has_degeneration) {
+               for (tmp = sd; tmp; tmp = tmp->parent)
+                       reset_sched_domain_groups(tmp);
+       }
+
+       return has_degeneration;
+}
+
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
@@ -1008,9 +1070,9 @@ find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
         * The proper descendant would be the one whose child won't span out
         * of sd
         */
-       while (sibling->child &&
+       while (sibling->child && (sibling->need_degeneration ||
               !cpumask_subset(sched_domain_span(sibling->child),
-                              sched_domain_span(sd)))
+                              sched_domain_span(sd))))
                sibling = sibling->child;

        /*
@@ -1018,9 +1080,9 @@ find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
         * to go down to skip those sched_domains which don't contribute to
         * scheduling because they will be degenerated in cpu_attach_domain
         */
-       while (sibling->child &&
+       while (sibling->child && (sibling->need_degeneration ||
               cpumask_equal(sched_domain_span(sibling->child),
-                            sched_domain_span(sibling)))
+                            sched_domain_span(sibling))))
                sibling = sibling->child;

        return sibling;
@@ -1199,6 +1261,9 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
        struct sched_group *sg;
        bool already_visited;

+       while (child && child->need_degeneration)
+               child = child->child;
+
        if (child)
                cpu = cpumask_first(sched_domain_span(child));

@@ -2366,6 +2431,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
        int i, ret = -ENOMEM;
        bool has_asym = false;
        bool has_cluster = false;
+       bool rebuild = true, need_rebuild = false;

        if (WARN_ON(cpumask_empty(cpu_map)))
                goto error;
@@ -2398,6 +2464,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
                }
        }

+rebuild_groups:
        /* Build the groups for the domains */
        for_each_cpu(i, cpu_map) {
                for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
@@ -2412,6 +2479,25 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
                }
        }

+       /*
+        * If some sched domains are going to be degenerated, the groups may not
+        * be built from the real child domains thus with incorrect flags or
+        * span. Detect the generation here and rebuild the sched groups if
+        * necessary.
+        */
+       if (rebuild) {
+               rebuild = false;
+
+               for_each_cpu(i, cpu_map) {
+                       sd = *per_cpu_ptr(d.sd, i);
+
+                       need_rebuild = probe_sched_domain_degeneration(sd, i);
+               }
+
+               if (need_rebuild)
+                       goto rebuild_groups;
+       }
+
        /*
         * Calculate an allowed NUMA imbalance such that LLCs do not get
         * imbalanced.
Chen Yu June 13, 2023, 12:44 p.m. UTC | #10
On 2023-06-13 at 16:09:17 +0800, Yicong Yang wrote:
> On 2023/6/13 15:36, Yicong Yang wrote:
> > On 2023/6/9 18:50, Chen Yu wrote:
> >> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote:
> >>> On 2023/6/8 11:26, Chen Yu wrote:
> >>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote:
> >>>>> On 2023/5/30 22:39, Yicong Yang wrote:
> >>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote:
> >>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
> >>>>>>>
> >>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>>>>>>> index 373ff5f55884..b8c129ed8b47 100644
> >>>>>>>> --- a/kernel/sched/fair.c
> >>>>>>>> +++ b/kernel/sched/fair.c
> >>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> >>>>>>>>  		}
> >>>>>>>>  	}
> >>>>>>>>  
> >>>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
> >>>>>>>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
> >>>>>>>> +
> >>>>>>>> +		if (sdc) {
> >>>>>>>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
> >>>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
> >>>>>>>> +					continue;
> >>>>>>>> +
> >>>>>>>> +				if (has_idle_core) {
> >>>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>>>>>>> +					if ((unsigned int)i < nr_cpumask_bits)
> >>>>>>>> +						return i;
> >>>>>>>> +				} else {
> >>>>>>>> +					if (--nr <= 0)
> >>>>>>>> +						return -1;
> >>>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
> >>>>>>>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
> >>>>>>>> +						return idle_cpu;
> >>>>>>>> +				}
> >>>>>>>> +			}
> >>>>>>>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
> >>>>>>>> +		}
> >>>>>>>> +	}
> >>>>>>>
> >>>>>>> Would not this:
> >>>>>>>
> >>>>>>> --- a/kernel/sched/fair.c
> >>>>>>> +++ b/kernel/sched/fair.c
> >>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
> >>>>>>>  		}
> >>>>>>>  	}
> >>>>>>>  
> >>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
> >>>>>>> +		struct sched_group *sg = sd->groups;
> >>>>>>> +		if (sg->flags & SD_CLUSTER) {
> >>>>>>> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
> >>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
> >>>>>>> +					continue;
> >>>>>>> +
> >>>>>>> +				if (has_idle_core) {
> >>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>>>>>> +					if ((unsigned)i < nr_cpumask_bits)
> >>>>>>> +						return 1;
> >>>>>>> +				} else {
> >>>>>>> +					if (--nr <= 0)
> >>>>>>> +						return -1;
> >>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
> >>>>>>> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
> >>>>>>> +						return idle_cpu;
> >>>>>>> +				}
> >>>>>>> +			}
> >>>>>>> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
> >>>>>>> +		}
> >>>>>>> +	}
> >>>>>>> +
> >>>>>>>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
> >>>>>>>  		if (has_idle_core) {
> >>>>>>>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>>>>>>
> >>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable.
> >>>>>>>
> >>>>>>
> >>>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
> >>>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:
> >>>>>>
> >>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> >>>>>> index 69968ed9ffb9..5c443b74abf5 100644
> >>>>>> --- a/kernel/sched/topology.c
> >>>>>> +++ b/kernel/sched/topology.c
> >>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
> >>>>>>
> >>>>>>                 cpumask_or(groupmask, groupmask, sched_group_span(group));
> >>>>>>
> >>>>>> -               printk(KERN_CONT " %d:{ span=%*pbl",
> >>>>>> -                               group->sgc->id,
> >>>>>> +               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
> >>>>>> +                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
> >>>>>>                                 cpumask_pr_args(sched_group_span(group)));
> >>>>>>
> >>>>>>                 if ((sd->flags & SD_OVERLAP) &&
> >>>>>>
> >>>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
> >>>>>> as cluster:
> >>>>>>
> >>>>>> [    8.886099] CPU0 attaching sched-domain(s):
> >>>>>> [    8.889539]  domain-0: span=0,40 level=SMT
> >>>>>> [    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
> >>>>>> [    8.897538]   domain-1: span=0-19,40-59 level=MC
> >>>>>> [    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
> >>>>>> [    8.905538]    domain-2: span=0-79 level=NUMA
> >>>>>> [    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
> >>>>>>
> >>>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
> >>>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.
> >>>>>>
> >>>>>
> >>>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper
> >>>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach
> >>>>> for cluster scanning and sd_cluster per-cpu variable is still needed.
> >>>>>
> >>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749
> >>>>>
> >>>>> Thanks.
> >>>>>
> >>>>>
> >>>> Is this an issue? Suppose sched domain A's parent domain
> >>>> is B, B's parent sched domain is C. When B degenerates, C's child domain
> >>>> pointer is adjusted to A. However, currently the code does not adjust C's
> >>>> sched groups' flags. Should we adjust C's sched groups flags to be the same
> >>>> as A to keep consistency?
> >>>
> >>> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses
> >>> it within the SMT so I think update the lowest domain's group flag works. For correctness
> >>> all the domain group's flag should derives from its real child. I tried to solve this at group
> >>> building but seems hard to do, at that time we don't know whether a domain is going to degenerate
> >>> or not since the groups it not built.
> >>>
> >>>>
> >>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> >>>> index 6198fa135176..fe3fd70f2313 100644
> >>>> --- a/kernel/sched/topology.c
> >>>> +++ b/kernel/sched/topology.c
> >>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> >>>>  
> >>>>  	/* Remove the sched domains which do not contribute to scheduling. */
> >>>>  	for (tmp = sd; tmp; ) {
> >>>> -		struct sched_domain *parent = tmp->parent;
> >>>> +		struct sched_domain *parent = tmp->parent, *pparent;
> >>>>  		if (!parent)
> >>>>  			break;
> >>>>  
> >>>>  		if (sd_parent_degenerate(tmp, parent)) {
> >>>> -			tmp->parent = parent->parent;
> >>>> -			if (parent->parent)
> >>>> -				parent->parent->child = tmp;
> >>>> +			pparent = parent->parent;
> >>>> +			tmp->parent = pparent;
> >>>>  			/*
> >>>>  			 * Transfer SD_PREFER_SIBLING down in case of a
> >>>>  			 * degenerate parent; the spans match for this
> >>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> >>>>  			 */
> >>>>  			if (parent->flags & SD_PREFER_SIBLING)
> >>>>  				tmp->flags |= SD_PREFER_SIBLING;
> >>>> +
> >>>> +			if (pparent) {
> >>>> +				struct sched_group *sg = pparent->groups;
> >>>> +
> >>>> +				do {
> >>>> +					sg->flags = tmp->flags;
> >>>
> >>> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from
> >>> remote group have the same flags with @tmp?
> >>>
> >> Good question, for heterogenous platforms sched groups within the same domain might not always
> >> have the same flags, because if group1 and group2 sit in big/small-core child domain, they could
> >> have different balance flags in theory. Maybe only update the local group's flag is accurate.
> >>
> >> I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it
> >> should be in tip:
> >> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/
> >> We could adjust it based on his change to remove SD_CLUSTER, or we can
> >> replace groups->flag with tmp->flag directly, in case we have other flags to be
> >> adjusted in the future.
> >>
> > 
> > Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update
> > local groups flag should satisfy our needs. I tried to use the correct child domains to build the
> > sched groups then all the groups will have the correct flags, but it'll be a bit more complex.
> > 
> 
> something like below, detect the sched domain degeneration first and try to rebuild the groups if
> necessary.
Not sure if we need to rebuild the groups. With only

if (parent->flags & SD_CLUSTER)
	parent->parent->groups->flags &= ~SD_CLUSTER;

I see the correct flags.

My understanding is that, although remote groups's flag might be incorrect,
later when other sched domain degenerates, these remote groups becomes local
groups for those sched domains, and the flags will be adjusted accordingly.
> Works on a non-cluster machine emulated with QEMU:
> 
> qemu-system-aarch64 \
>         -kernel ${Image} \
>         -smp cores=16,threads=2 \
>         -cpu host --enable-kvm \
>         -m 32G \
>         -object memory-backend-ram,id=node0,size=8G \
>         -object memory-backend-ram,id=node1,size=8G \
>         -object memory-backend-ram,id=node2,size=8G \
>         -object memory-backend-ram,id=node3,size=8G \
>         -numa node,memdev=node0,cpus=0-7,nodeid=0 \
>         -numa node,memdev=node1,cpus=8-15,nodeid=1 \
>         -numa node,memdev=node2,cpus=16-23,nodeid=2 \
>         -numa node,memdev=node3,cpus=24-31,nodeid=3 \
>         -numa dist,src=0,dst=1,val=12 \
>         -numa dist,src=0,dst=2,val=20 \
>         -numa dist,src=0,dst=3,val=22 \
>         -numa dist,src=1,dst=2,val=22 \
>         -numa dist,src=1,dst=3,val=24 \
>         -numa dist,src=2,dst=3,val=12 \
>         -machine virt,iommu=smmuv3 \
>         -net none -initrd ${Rootfs} \
>         -nographic -bios $(pwd)/QEMU_EFI.fd \
>         -append "rdinit=/init console=ttyAMA0 earlycon=pl011,0x9000000 sched_verbose schedstats=enable loglevel=8"
> 
> The flags looks correct:
> [    4.379910] CPU0 attaching sched-domain(s):
> [    4.380540]  domain-0: span=0-1 level=SMT
> [    4.381130]   groups: 0:{ cluster: false span=0 cap=1023 }, 1:{ cluster: false span=1 }
> [    4.382353]   domain-1: span=0-7 level=MC
> [    4.382950]    groups: 0:{ cluster: false span=0-1 cap=2047 }, 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 }
> [    4.385378]    domain-2: span=0-15 level=NUMA
> [    4.386047]     groups: 0:{ cluster: false span=0-7 cap=8191 }, 8:{ cluster: false span=8-15 cap=8192 }
> [    4.387542]     domain-3: span=0-23 level=NUMA
> [    4.388197]      groups: 0:{ cluster: false span=0-15 cap=16383 }, 16:{ cluster: false span=16-23 cap=8192 }
> [    4.389661]      domain-4: span=0-31 level=NUMA
> [    4.390358]       groups: 0:{ cluster: false span=0-23 mask=0-7 cap=24575 }, 24:{ cluster: false span=16-31 mask=24-31 cap=16384 }
> 
>

I did similar test based on your config:
qemu-system-x86_64 \
        -m 2G \
        -smp 16,threads=2,cores=4,sockets=2 \
        -numa node,mem=1G,cpus=0-7,nodeid=0 \
        -numa node,mem=1G,cpus=8-15,nodeid=1 \
        -kernel bzImage \
        -drive file=./ubuntu.raw,format=raw \
        -append "console=ttyS0 root=/dev/sda5 earlyprintk=serial ignore_loglevel sched_verbose" \
        -cpu host \
        -enable-kvm \
        -nographic

and print the group address as well as the SD_CLUSTER flag:

[    0.208583] CPU0 attaching sched-domain(s):
[    0.208998]  domain-0: span=0-1 level=SMT
[    0.209393]   groups: 0:{ cluster: false group 0xffff9ed3c19e6140 span=0 }, 1:{ cluster: false group 0xffff9ed3c19e6440 span=1 }
[    0.212463]   domain-1: span=0-7 level=MC
[    0.212856]    groups: 0:{ cluster: false group 0xffff9ed3c1a8f3c0 span=0-1 cap=2048 }, 2:{ cluster: true group 0xffff9ed3c1a8fac0 span=2-3 cap=2048 },

Yeah, group 0xffff9ed3c1a8fac0 has SD_CLUSTER, but...

                         4:{ cluster: true group 0xffff9ed3c1a8fe40 span=4-5 cap=2048 }, 6:{ cluster: true group 0xffff9ed3c1a8f700 span=6-7 cap=2048 }
[    0.230214] CPU2 attaching sched-domain(s):
[    0.232462]  domain-0: span=2-3 level=SMT
[    0.232855]   groups: 2:{ cluster: false group 0xffff9ed3c19e66c0 span=2 }, 3:{ cluster: false group 0xffff9ed3c19e6400 span=3 }
[    0.233971]   domain-1: span=0-7 level=MC
[    0.236463]    groups: 2:{ cluster: false group 0xffff9ed3c1a8fac0 span=2-3 cap=2048 }, 4:{ cluster: true group 0xffff9ed3c1a8fe40 span=4-5 cap=2048 },
                          6:{ cluster: true group 0xffff9ed3c1a8f700 span=6-7 cap=2048 }, 0:{ cluster: false group 0xffff9ed3c1a8f3c0 span=0-1 cap=2048 }

group 0xffff9ed3c1a8fac0's SD_CLUSTER flag will be cleared here.


thanks,
Chenyu
Yicong Yang June 15, 2023, 7:59 a.m. UTC | #11
On 2023/6/13 20:44, Chen Yu wrote:
> On 2023-06-13 at 16:09:17 +0800, Yicong Yang wrote:
>> On 2023/6/13 15:36, Yicong Yang wrote:
>>> On 2023/6/9 18:50, Chen Yu wrote:
>>>> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote:
>>>>> On 2023/6/8 11:26, Chen Yu wrote:
>>>>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote:
>>>>>>> On 2023/5/30 22:39, Yicong Yang wrote:
>>>>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote:
>>>>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
>>>>>>>>>
>>>>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>>>>>> index 373ff5f55884..b8c129ed8b47 100644
>>>>>>>>>> --- a/kernel/sched/fair.c
>>>>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>>>>>>>>>  		}
>>>>>>>>>>  	}
>>>>>>>>>>  
>>>>>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>>>>>>>>>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
>>>>>>>>>> +
>>>>>>>>>> +		if (sdc) {
>>>>>>>>>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
>>>>>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
>>>>>>>>>> +					continue;
>>>>>>>>>> +
>>>>>>>>>> +				if (has_idle_core) {
>>>>>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>>>>> +					if ((unsigned int)i < nr_cpumask_bits)
>>>>>>>>>> +						return i;
>>>>>>>>>> +				} else {
>>>>>>>>>> +					if (--nr <= 0)
>>>>>>>>>> +						return -1;
>>>>>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
>>>>>>>>>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
>>>>>>>>>> +						return idle_cpu;
>>>>>>>>>> +				}
>>>>>>>>>> +			}
>>>>>>>>>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
>>>>>>>>>> +		}
>>>>>>>>>> +	}
>>>>>>>>>
>>>>>>>>> Would not this:
>>>>>>>>>
>>>>>>>>> --- a/kernel/sched/fair.c
>>>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
>>>>>>>>>  		}
>>>>>>>>>  	}
>>>>>>>>>  
>>>>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
>>>>>>>>> +		struct sched_group *sg = sd->groups;
>>>>>>>>> +		if (sg->flags & SD_CLUSTER) {
>>>>>>>>> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
>>>>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
>>>>>>>>> +					continue;
>>>>>>>>> +
>>>>>>>>> +				if (has_idle_core) {
>>>>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>>>> +					if ((unsigned)i < nr_cpumask_bits)
>>>>>>>>> +						return 1;
>>>>>>>>> +				} else {
>>>>>>>>> +					if (--nr <= 0)
>>>>>>>>> +						return -1;
>>>>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
>>>>>>>>> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
>>>>>>>>> +						return idle_cpu;
>>>>>>>>> +				}
>>>>>>>>> +			}
>>>>>>>>> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
>>>>>>>>> +		}
>>>>>>>>> +	}
>>>>>>>>> +
>>>>>>>>>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
>>>>>>>>>  		if (has_idle_core) {
>>>>>>>>>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>>>>
>>>>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable.
>>>>>>>>>
>>>>>>>>
>>>>>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
>>>>>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:
>>>>>>>>
>>>>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>>>>>>> index 69968ed9ffb9..5c443b74abf5 100644
>>>>>>>> --- a/kernel/sched/topology.c
>>>>>>>> +++ b/kernel/sched/topology.c
>>>>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
>>>>>>>>
>>>>>>>>                 cpumask_or(groupmask, groupmask, sched_group_span(group));
>>>>>>>>
>>>>>>>> -               printk(KERN_CONT " %d:{ span=%*pbl",
>>>>>>>> -                               group->sgc->id,
>>>>>>>> +               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
>>>>>>>> +                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
>>>>>>>>                                 cpumask_pr_args(sched_group_span(group)));
>>>>>>>>
>>>>>>>>                 if ((sd->flags & SD_OVERLAP) &&
>>>>>>>>
>>>>>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
>>>>>>>> as cluster:
>>>>>>>>
>>>>>>>> [    8.886099] CPU0 attaching sched-domain(s):
>>>>>>>> [    8.889539]  domain-0: span=0,40 level=SMT
>>>>>>>> [    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
>>>>>>>> [    8.897538]   domain-1: span=0-19,40-59 level=MC
>>>>>>>> [    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
>>>>>>>> [    8.905538]    domain-2: span=0-79 level=NUMA
>>>>>>>> [    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
>>>>>>>>
>>>>>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
>>>>>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.
>>>>>>>>
>>>>>>>
>>>>>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper
>>>>>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach
>>>>>>> for cluster scanning and sd_cluster per-cpu variable is still needed.
>>>>>>>
>>>>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749
>>>>>>>
>>>>>>> Thanks.
>>>>>>>
>>>>>>>
>>>>>> Is this an issue? Suppose sched domain A's parent domain
>>>>>> is B, B's parent sched domain is C. When B degenerates, C's child domain
>>>>>> pointer is adjusted to A. However, currently the code does not adjust C's
>>>>>> sched groups' flags. Should we adjust C's sched groups flags to be the same
>>>>>> as A to keep consistency?
>>>>>
>>>>> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses
>>>>> it within the SMT so I think update the lowest domain's group flag works. For correctness
>>>>> all the domain group's flag should derives from its real child. I tried to solve this at group
>>>>> building but seems hard to do, at that time we don't know whether a domain is going to degenerate
>>>>> or not since the groups it not built.
>>>>>
>>>>>>
>>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>>>>> index 6198fa135176..fe3fd70f2313 100644
>>>>>> --- a/kernel/sched/topology.c
>>>>>> +++ b/kernel/sched/topology.c
>>>>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>>>>>  
>>>>>>  	/* Remove the sched domains which do not contribute to scheduling. */
>>>>>>  	for (tmp = sd; tmp; ) {
>>>>>> -		struct sched_domain *parent = tmp->parent;
>>>>>> +		struct sched_domain *parent = tmp->parent, *pparent;
>>>>>>  		if (!parent)
>>>>>>  			break;
>>>>>>  
>>>>>>  		if (sd_parent_degenerate(tmp, parent)) {
>>>>>> -			tmp->parent = parent->parent;
>>>>>> -			if (parent->parent)
>>>>>> -				parent->parent->child = tmp;
>>>>>> +			pparent = parent->parent;
>>>>>> +			tmp->parent = pparent;
>>>>>>  			/*
>>>>>>  			 * Transfer SD_PREFER_SIBLING down in case of a
>>>>>>  			 * degenerate parent; the spans match for this
>>>>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>>>>>  			 */
>>>>>>  			if (parent->flags & SD_PREFER_SIBLING)
>>>>>>  				tmp->flags |= SD_PREFER_SIBLING;
>>>>>> +
>>>>>> +			if (pparent) {
>>>>>> +				struct sched_group *sg = pparent->groups;
>>>>>> +
>>>>>> +				do {
>>>>>> +					sg->flags = tmp->flags;
>>>>>
>>>>> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from
>>>>> remote group have the same flags with @tmp?
>>>>>
>>>> Good question, for heterogenous platforms sched groups within the same domain might not always
>>>> have the same flags, because if group1 and group2 sit in big/small-core child domain, they could
>>>> have different balance flags in theory. Maybe only update the local group's flag is accurate.
>>>>
>>>> I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it
>>>> should be in tip:
>>>> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/
>>>> We could adjust it based on his change to remove SD_CLUSTER, or we can
>>>> replace groups->flag with tmp->flag directly, in case we have other flags to be
>>>> adjusted in the future.
>>>>
>>>
>>> Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update
>>> local groups flag should satisfy our needs. I tried to use the correct child domains to build the
>>> sched groups then all the groups will have the correct flags, but it'll be a bit more complex.
>>>
>>
>> something like below, detect the sched domain degeneration first and try to rebuild the groups if
>> necessary.
> Not sure if we need to rebuild the groups. With only
> 
> if (parent->flags & SD_CLUSTER)
> 	parent->parent->groups->flags &= ~SD_CLUSTER;
> 
> I see the correct flags.
> 
> My understanding is that, although remote groups's flag might be incorrect,
> later when other sched domain degenerates, these remote groups becomes local
> groups for those sched domains, and the flags will be adjusted accordingly.

Maybe worth a try to build the groups correctly at very beginning rather
correct it later when needed. Considering we've used it in several places[1][2]
and this time we're going to use it for cluster.

[1] 16d364ba6ef2 ("sched/topology: Introduce sched_group::flags")
[2] https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/

>> Works on a non-cluster machine emulated with QEMU:
>>
>> qemu-system-aarch64 \
>>         -kernel ${Image} \
>>         -smp cores=16,threads=2 \
>>         -cpu host --enable-kvm \
>>         -m 32G \
>>         -object memory-backend-ram,id=node0,size=8G \
>>         -object memory-backend-ram,id=node1,size=8G \
>>         -object memory-backend-ram,id=node2,size=8G \
>>         -object memory-backend-ram,id=node3,size=8G \
>>         -numa node,memdev=node0,cpus=0-7,nodeid=0 \
>>         -numa node,memdev=node1,cpus=8-15,nodeid=1 \
>>         -numa node,memdev=node2,cpus=16-23,nodeid=2 \
>>         -numa node,memdev=node3,cpus=24-31,nodeid=3 \
>>         -numa dist,src=0,dst=1,val=12 \
>>         -numa dist,src=0,dst=2,val=20 \
>>         -numa dist,src=0,dst=3,val=22 \
>>         -numa dist,src=1,dst=2,val=22 \
>>         -numa dist,src=1,dst=3,val=24 \
>>         -numa dist,src=2,dst=3,val=12 \
>>         -machine virt,iommu=smmuv3 \
>>         -net none -initrd ${Rootfs} \
>>         -nographic -bios $(pwd)/QEMU_EFI.fd \
>>         -append "rdinit=/init console=ttyAMA0 earlycon=pl011,0x9000000 sched_verbose schedstats=enable loglevel=8"
>>
>> The flags looks correct:
>> [    4.379910] CPU0 attaching sched-domain(s):
>> [    4.380540]  domain-0: span=0-1 level=SMT
>> [    4.381130]   groups: 0:{ cluster: false span=0 cap=1023 }, 1:{ cluster: false span=1 }
>> [    4.382353]   domain-1: span=0-7 level=MC
>> [    4.382950]    groups: 0:{ cluster: false span=0-1 cap=2047 }, 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 }
>> [    4.385378]    domain-2: span=0-15 level=NUMA
>> [    4.386047]     groups: 0:{ cluster: false span=0-7 cap=8191 }, 8:{ cluster: false span=8-15 cap=8192 }
>> [    4.387542]     domain-3: span=0-23 level=NUMA
>> [    4.388197]      groups: 0:{ cluster: false span=0-15 cap=16383 }, 16:{ cluster: false span=16-23 cap=8192 }
>> [    4.389661]      domain-4: span=0-31 level=NUMA
>> [    4.390358]       groups: 0:{ cluster: false span=0-23 mask=0-7 cap=24575 }, 24:{ cluster: false span=16-31 mask=24-31 cap=16384 }
>>
>>
> 
> I did similar test based on your config:
> qemu-system-x86_64 \
>         -m 2G \
>         -smp 16,threads=2,cores=4,sockets=2 \
>         -numa node,mem=1G,cpus=0-7,nodeid=0 \
>         -numa node,mem=1G,cpus=8-15,nodeid=1 \
>         -kernel bzImage \
>         -drive file=./ubuntu.raw,format=raw \
>         -append "console=ttyS0 root=/dev/sda5 earlyprintk=serial ignore_loglevel sched_verbose" \
>         -cpu host \
>         -enable-kvm \
>         -nographic
> 
> and print the group address as well as the SD_CLUSTER flag:
> 
> [    0.208583] CPU0 attaching sched-domain(s):
> [    0.208998]  domain-0: span=0-1 level=SMT
> [    0.209393]   groups: 0:{ cluster: false group 0xffff9ed3c19e6140 span=0 }, 1:{ cluster: false group 0xffff9ed3c19e6440 span=1 }
> [    0.212463]   domain-1: span=0-7 level=MC
> [    0.212856]    groups: 0:{ cluster: false group 0xffff9ed3c1a8f3c0 span=0-1 cap=2048 }, 2:{ cluster: true group 0xffff9ed3c1a8fac0 span=2-3 cap=2048 },
> 
> Yeah, group 0xffff9ed3c1a8fac0 has SD_CLUSTER, but...
> 

Something's wrong here. 0xffff9ed3c1a8fac0 shouldn't have SD_CLUSTER. Code above should works to successfully build
MC's groups from SMT rather than CLS, Tested with qemu version 4.2.1 (from ubuntu 20.04) and mainline v8.0.0-1358-gac84b57b4d
with your x86 topology, it looks like:

[    0.517077] CPU0 attaching sched-domain(s):
[    0.520858]  domain-0: span=0-1 level=SMT
[    0.524858]   groups: 0:{ cluster: false span=0 }, 1:{ cluster: false span=1 }
[    0.528858]   domain-1: span=0-7 level=MC
[    0.532858]    groups: 0:{ cluster: false span=0-1 cap=2048 }, 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 }
[    0.536858]    domain-2: span=0-15 level=NUMA
[    0.540858]     groups: 0:{ cluster: false span=0-7 cap=8192 }, 8:{ cluster: false span=8-15 cap=8192 }

and for CPU 2:

[    0.572859] CPU2 attaching sched-domain(s):
[    0.576858]  domain-0: span=2-3 level=SMT
[    0.580858]   groups: 2:{ cluster: false span=2 }, 3:{ cluster: false span=3 }
[    0.584858]   domain-1: span=0-7 level=MC
[    0.588858]    groups: 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 }, 0:{ cluster: false span=0-1 cap=2048 }
[    0.592858]    domain-2: span=0-15 level=NUMA
[    0.596858]     groups: 0:{ cluster: false span=0-7 cap=8192 }, 8:{ cluster: false span=8-15 cap=8192 }

Thanks.
Chen Yu June 16, 2023, 6 a.m. UTC | #12
On 2023-06-15 at 15:59:10 +0800, Yicong Yang wrote:
> On 2023/6/13 20:44, Chen Yu wrote:
> > On 2023-06-13 at 16:09:17 +0800, Yicong Yang wrote:
> >> On 2023/6/13 15:36, Yicong Yang wrote:
> >>> On 2023/6/9 18:50, Chen Yu wrote:
> >>>> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote:
> >>>>> On 2023/6/8 11:26, Chen Yu wrote:
> >>>>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote:
> >>>>>>> On 2023/5/30 22:39, Yicong Yang wrote:
> >>>>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote:
> >>>>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
> >>>>>>>>>
> >>>>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>>>>>>>>> index 373ff5f55884..b8c129ed8b47 100644
> >>>>>>>>>> --- a/kernel/sched/fair.c
> >>>>>>>>>> +++ b/kernel/sched/fair.c
> >>>>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> >>>>>>>>>>  		}
> >>>>>>>>>>  	}
> >>>>>>>>>>  
> >>>>>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
> >>>>>>>>>> +		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
> >>>>>>>>>> +
> >>>>>>>>>> +		if (sdc) {
> >>>>>>>>>> +			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
> >>>>>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
> >>>>>>>>>> +					continue;
> >>>>>>>>>> +
> >>>>>>>>>> +				if (has_idle_core) {
> >>>>>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>>>>>>>>> +					if ((unsigned int)i < nr_cpumask_bits)
> >>>>>>>>>> +						return i;
> >>>>>>>>>> +				} else {
> >>>>>>>>>> +					if (--nr <= 0)
> >>>>>>>>>> +						return -1;
> >>>>>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
> >>>>>>>>>> +					if ((unsigned int)idle_cpu < nr_cpumask_bits)
> >>>>>>>>>> +						return idle_cpu;
> >>>>>>>>>> +				}
> >>>>>>>>>> +			}
> >>>>>>>>>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
> >>>>>>>>>> +		}
> >>>>>>>>>> +	}
> >>>>>>>>>
> >>>>>>>>> Would not this:
> >>>>>>>>>
> >>>>>>>>> --- a/kernel/sched/fair.c
> >>>>>>>>> +++ b/kernel/sched/fair.c
> >>>>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
> >>>>>>>>>  		}
> >>>>>>>>>  	}
> >>>>>>>>>  
> >>>>>>>>> +	if (static_branch_unlikely(&sched_cluster_active)) {
> >>>>>>>>> +		struct sched_group *sg = sd->groups;
> >>>>>>>>> +		if (sg->flags & SD_CLUSTER) {
> >>>>>>>>> +			for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
> >>>>>>>>> +				if (!cpumask_test_cpu(cpu, cpus))
> >>>>>>>>> +					continue;
> >>>>>>>>> +
> >>>>>>>>> +				if (has_idle_core) {
> >>>>>>>>> +					i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>>>>>>>> +					if ((unsigned)i < nr_cpumask_bits)
> >>>>>>>>> +						return 1;
> >>>>>>>>> +				} else {
> >>>>>>>>> +					if (--nr <= 0)
> >>>>>>>>> +						return -1;
> >>>>>>>>> +					idle_cpu = __select_idle_cpu(cpu, p);
> >>>>>>>>> +					if ((unsigned)idle_cpu < nr_cpumask_bits)
> >>>>>>>>> +						return idle_cpu;
> >>>>>>>>> +				}
> >>>>>>>>> +			}
> >>>>>>>>> +			cpumask_andnot(cpus, cpus, sched_group_span(sg));
> >>>>>>>>> +		}
> >>>>>>>>> +	}
> >>>>>>>>> +
> >>>>>>>>>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
> >>>>>>>>>  		if (has_idle_core) {
> >>>>>>>>>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
> >>>>>>>>>
> >>>>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable.
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> I thought it will be fine since sg->flags is derived from the child domain. But practically it doesn't.
> >>>>>>>> Tested on a 2P Skylake server with no clusters, add some debug messages to see how sg->flags appears:
> >>>>>>>>
> >>>>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> >>>>>>>> index 69968ed9ffb9..5c443b74abf5 100644
> >>>>>>>> --- a/kernel/sched/topology.c
> >>>>>>>> +++ b/kernel/sched/topology.c
> >>>>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
> >>>>>>>>
> >>>>>>>>                 cpumask_or(groupmask, groupmask, sched_group_span(group));
> >>>>>>>>
> >>>>>>>> -               printk(KERN_CONT " %d:{ span=%*pbl",
> >>>>>>>> -                               group->sgc->id,
> >>>>>>>> +               printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
> >>>>>>>> +                               group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
> >>>>>>>>                                 cpumask_pr_args(sched_group_span(group)));
> >>>>>>>>
> >>>>>>>>                 if ((sd->flags & SD_OVERLAP) &&
> >>>>>>>>
> >>>>>>>> Unfortunately the result doesn't match what I expected, the MC domain's sg->flags still marked
> >>>>>>>> as cluster:
> >>>>>>>>
> >>>>>>>> [    8.886099] CPU0 attaching sched-domain(s):
> >>>>>>>> [    8.889539]  domain-0: span=0,40 level=SMT
> >>>>>>>> [    8.893538]   groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
> >>>>>>>> [    8.897538]   domain-1: span=0-19,40-59 level=MC
> >>>>>>>> [    8.901538]    groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
> >>>>>>>> [    8.905538]    domain-2: span=0-79 level=NUMA
> >>>>>>>> [    8.909538]     groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
> >>>>>>>>
> >>>>>>>> I assume we didn't handle the sg->flags correctly on the domain degeneration. Simply checked the code seems
> >>>>>>>> we've already make sg->flags = 0 on degeneration, maybe I need to check where's wrong.
> >>>>>>>>
> >>>>>>>
> >>>>>>> Currently we only update the groups' flags to 0 for the final lowest domain in [1]. The upper
> >>>>>>> domains' group won't be updated if degeneration happens. So we cannot use the suggested approach
> >>>>>>> for cluster scanning and sd_cluster per-cpu variable is still needed.
> >>>>>>>
> >>>>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749
> >>>>>>>
> >>>>>>> Thanks.
> >>>>>>>
> >>>>>>>
> >>>>>> Is this an issue? Suppose sched domain A's parent domain
> >>>>>> is B, B's parent sched domain is C. When B degenerates, C's child domain
> >>>>>> pointer is adjusted to A. However, currently the code does not adjust C's
> >>>>>> sched groups' flags. Should we adjust C's sched groups flags to be the same
> >>>>>> as A to keep consistency?
> >>>>>
> >>>>> It depends on whether we're going to use it. currently only asym_smt_can_pull_tasks() uses
> >>>>> it within the SMT so I think update the lowest domain's group flag works. For correctness
> >>>>> all the domain group's flag should derives from its real child. I tried to solve this at group
> >>>>> building but seems hard to do, at that time we don't know whether a domain is going to degenerate
> >>>>> or not since the groups it not built.
> >>>>>
> >>>>>>
> >>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> >>>>>> index 6198fa135176..fe3fd70f2313 100644
> >>>>>> --- a/kernel/sched/topology.c
> >>>>>> +++ b/kernel/sched/topology.c
> >>>>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> >>>>>>  
> >>>>>>  	/* Remove the sched domains which do not contribute to scheduling. */
> >>>>>>  	for (tmp = sd; tmp; ) {
> >>>>>> -		struct sched_domain *parent = tmp->parent;
> >>>>>> +		struct sched_domain *parent = tmp->parent, *pparent;
> >>>>>>  		if (!parent)
> >>>>>>  			break;
> >>>>>>  
> >>>>>>  		if (sd_parent_degenerate(tmp, parent)) {
> >>>>>> -			tmp->parent = parent->parent;
> >>>>>> -			if (parent->parent)
> >>>>>> -				parent->parent->child = tmp;
> >>>>>> +			pparent = parent->parent;
> >>>>>> +			tmp->parent = pparent;
> >>>>>>  			/*
> >>>>>>  			 * Transfer SD_PREFER_SIBLING down in case of a
> >>>>>>  			 * degenerate parent; the spans match for this
> >>>>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> >>>>>>  			 */
> >>>>>>  			if (parent->flags & SD_PREFER_SIBLING)
> >>>>>>  				tmp->flags |= SD_PREFER_SIBLING;
> >>>>>> +
> >>>>>> +			if (pparent) {
> >>>>>> +				struct sched_group *sg = pparent->groups;
> >>>>>> +
> >>>>>> +				do {
> >>>>>> +					sg->flags = tmp->flags;
> >>>>>
> >>>>> May need to test on some heterogeous platforms. Does it always stand that child domain of CPU from
> >>>>> remote group have the same flags with @tmp?
> >>>>>
> >>>> Good question, for heterogenous platforms sched groups within the same domain might not always
> >>>> have the same flags, because if group1 and group2 sit in big/small-core child domain, they could
> >>>> have different balance flags in theory. Maybe only update the local group's flag is accurate.
> >>>>
> >>>> I found Tim has proposed a fix for a similar scenario, and it is for SD_SHARE_CPUCAPACITY, and it
> >>>> should be in tip:
> >>>> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/
> >>>> We could adjust it based on his change to remove SD_CLUSTER, or we can
> >>>> replace groups->flag with tmp->flag directly, in case we have other flags to be
> >>>> adjusted in the future.
> >>>>
> >>>
> >>> Thanks for the reference. I think we can handle the SD_CLUSTER in the same way and only update
> >>> local groups flag should satisfy our needs. I tried to use the correct child domains to build the
> >>> sched groups then all the groups will have the correct flags, but it'll be a bit more complex.
> >>>
> >>
> >> something like below, detect the sched domain degeneration first and try to rebuild the groups if
> >> necessary.
> > Not sure if we need to rebuild the groups. With only
> > 
> > if (parent->flags & SD_CLUSTER)
> > 	parent->parent->groups->flags &= ~SD_CLUSTER;
> > 
> > I see the correct flags.
> > 
> > My understanding is that, although remote groups's flag might be incorrect,
> > later when other sched domain degenerates, these remote groups becomes local
> > groups for those sched domains, and the flags will be adjusted accordingly.
> 
> Maybe worth a try to build the groups correctly at very beginning rather
> correct it later when needed. Considering we've used it in several places[1][2]
> and this time we're going to use it for cluster.
> 
> [1] 16d364ba6ef2 ("sched/topology: Introduce sched_group::flags")
> [2] https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/
>
Do you mean clearing the SD_CLUSTER during degenerating? Yup, that could
be enough for now. I guess you are going to send a new version with this
SD_CLUSTER flag cleared  + using group->flags to detect SD_CLUSTER rather
than percpu cluster id. I'd be happy to test once you send them out.

thanks,
Chenyu
diff mbox series

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..b8c129ed8b47 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6994,6 +6994,30 @@  static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		}
 	}
 
+	if (static_branch_unlikely(&sched_cluster_active)) {
+		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
+
+		if (sdc) {
+			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
+				if (!cpumask_test_cpu(cpu, cpus))
+					continue;
+
+				if (has_idle_core) {
+					i = select_idle_core(p, cpu, cpus, &idle_cpu);
+					if ((unsigned int)i < nr_cpumask_bits)
+						return i;
+				} else {
+					if (--nr <= 0)
+						return -1;
+					idle_cpu = __select_idle_cpu(cpu, p);
+					if ((unsigned int)idle_cpu < nr_cpumask_bits)
+						return idle_cpu;
+				}
+			}
+			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
+		}
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target + 1) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -7001,7 +7025,7 @@  static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 				return i;
 
 		} else {
-			if (!--nr)
+			if (--nr <= 0)
 				return -1;
 			idle_cpu = __select_idle_cpu(cpu, p);
 			if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -7103,7 +7127,7 @@  static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	bool has_idle_core = false;
 	struct sched_domain *sd;
 	unsigned long task_util, util_min, util_max;
-	int i, recent_used_cpu;
+	int i, recent_used_cpu, prev_aff = -1;
 
 	/*
 	 * On asymmetric system, update task utilization because we will check
@@ -7130,8 +7154,11 @@  static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
-	    asym_fits_cpu(task_util, util_min, util_max, prev))
-		return prev;
+	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
+		if (cpus_share_lowest_cache(prev, target))
+			return prev;
+		prev_aff = prev;
+	}
 
 	/*
 	 * Allow a per-cpu kthread to stack with the wakee if the
@@ -7158,7 +7185,10 @@  static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
-		return recent_used_cpu;
+		if (cpus_share_lowest_cache(recent_used_cpu, target))
+			return recent_used_cpu;
+	} else {
+		recent_used_cpu = -1;
 	}
 
 	/*
@@ -7199,6 +7229,17 @@  static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
+	/*
+	 * For cluster machines which have lower sharing cache like L2 or
+	 * LLC Tag, we tend to find an idle CPU in the target's cluster
+	 * first. But prev_cpu or recent_used_cpu may also be a good candidate,
+	 * use them if possible when no idle CPU found in select_idle_cpu().
+	 */
+	if ((unsigned int)prev_aff < nr_cpumask_bits)
+		return prev_aff;
+	if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+		return recent_used_cpu;
+
 	return target;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 23dabfc3668b..5097f93b635f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1816,6 +1816,7 @@  DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
 
 static __always_inline bool sched_asym_cpucap_active(void)
 {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 2c4cc6c95a9a..69968ed9ffb9 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -672,7 +672,9 @@  DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -2363,6 +2365,7 @@  build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	bool has_asym = false;
+	bool has_cluster = false;
 
 	if (WARN_ON(cpumask_empty(cpu_map)))
 		goto error;
@@ -2384,6 +2387,7 @@  build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
 
 			has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
+			has_cluster |= sd->flags & SD_CLUSTER;
 
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
@@ -2494,6 +2498,9 @@  build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
 
+	if (has_cluster)
+		static_branch_inc_cpuslocked(&sched_cluster_active);
+
 	if (rq && sched_debug_verbose) {
 		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
@@ -2593,6 +2600,9 @@  static void detach_destroy_domains(const struct cpumask *cpu_map)
 	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
 		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
 
+	if (rcu_access_pointer(per_cpu(sd_cluster, cpu)))
+		static_branch_dec_cpuslocked(&sched_cluster_active);
+
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);