diff mbox series

[v5,3/4] arm64: topology: Support SMT control on ACPI based system

Message ID 20240806085320.63514-4-yangyicong@huawei.com (mailing list archive)
State New, archived
Headers show
Series Support SMT control on arm64 | expand

Commit Message

Yicong Yang Aug. 6, 2024, 8:53 a.m. UTC
From: Yicong Yang <yangyicong@hisilicon.com>

For ACPI we'll build the topology from PPTT and we cannot directly
get the SMT number of each core. Instead using a temporary xarray
to record the SMT number of each core when building the topology
and we can know the largest SMT number in the system. Then we can
enable the support of SMT control.

Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
---
 arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

Comments

Dietmar Eggemann Aug. 16, 2024, 3:55 p.m. UTC | #1
On 06/08/2024 10:53, Yicong Yang wrote:
> From: Yicong Yang <yangyicong@hisilicon.com>
> 
> For ACPI we'll build the topology from PPTT and we cannot directly
> get the SMT number of each core. Instead using a temporary xarray
> to record the SMT number of each core when building the topology
> and we can know the largest SMT number in the system. Then we can
> enable the support of SMT control.
> 
> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
> ---
>  arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++
>  1 file changed, 24 insertions(+)
> 
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 1a2c72f3e7f8..f72e1e55b05e 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -15,8 +15,10 @@
>  #include <linux/arch_topology.h>
>  #include <linux/cacheinfo.h>
>  #include <linux/cpufreq.h>
> +#include <linux/cpu_smt.h>
>  #include <linux/init.h>
>  #include <linux/percpu.h>
> +#include <linux/xarray.h>
>  
>  #include <asm/cpu.h>
>  #include <asm/cputype.h>
> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>   */
>  int __init parse_acpi_topology(void)
>  {
> +	int thread_num, max_smt_thread_num = 1;
> +	struct xarray core_threads;
>  	int cpu, topology_id;
> +	void *entry;
>  
>  	if (acpi_disabled)
>  		return 0;
>  
> +	xa_init(&core_threads);
> +
>  	for_each_possible_cpu(cpu) {
>  		topology_id = find_acpi_cpu_topology(cpu, 0);
>  		if (topology_id < 0)
> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void)
>  			cpu_topology[cpu].thread_id = topology_id;
>  			topology_id = find_acpi_cpu_topology(cpu, 1);
>  			cpu_topology[cpu].core_id   = topology_id;
> +
> +			entry = xa_load(&core_threads, topology_id);
> +			if (!entry) {
> +				xa_store(&core_threads, topology_id,
> +					 xa_mk_value(1), GFP_KERNEL);
> +			} else {
> +				thread_num = xa_to_value(entry);
> +				thread_num++;
> +				xa_store(&core_threads, topology_id,
> +					 xa_mk_value(thread_num), GFP_KERNEL);
> +
> +				if (thread_num > max_smt_thread_num)
> +					max_smt_thread_num = thread_num;
> +			}

So the xarray contains one element for each core_id with the information
how often the core_id occurs? I assume you have to iterate over all
possible CPUs since you don't know which logical CPUs belong to the same
core_id.

>  		} else {
>  			cpu_topology[cpu].thread_id  = -1;
>  			cpu_topology[cpu].core_id    = topology_id;
> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void)
>  		cpu_topology[cpu].package_id = topology_id;
>  	}
>  
> +	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
> +
> +	xa_destroy(&core_threads);
>  	return 0;
>  }
>  #endif

Tested on ThunderX2:

$ cat /proc/schedstat | head -6 | tail -4 | awk '{ print $1, $2 }'
cpu0 0
domain0 00000000,00000000,00000000,00000000,00000001,00000001,00000001,00000001
                                                   ^        ^        ^        ^
domain1 00000000,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,ffffffff
domain2 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff

detecting 'max_smt_thread_num = 4' correctly.
Yicong Yang Aug. 19, 2024, 7:03 a.m. UTC | #2
On 2024/8/16 23:55, Dietmar Eggemann wrote:
> On 06/08/2024 10:53, Yicong Yang wrote:
>> From: Yicong Yang <yangyicong@hisilicon.com>
>>
>> For ACPI we'll build the topology from PPTT and we cannot directly
>> get the SMT number of each core. Instead using a temporary xarray
>> to record the SMT number of each core when building the topology
>> and we can know the largest SMT number in the system. Then we can
>> enable the support of SMT control.
>>
>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>> ---
>>  arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++
>>  1 file changed, 24 insertions(+)
>>
>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>> index 1a2c72f3e7f8..f72e1e55b05e 100644
>> --- a/arch/arm64/kernel/topology.c
>> +++ b/arch/arm64/kernel/topology.c
>> @@ -15,8 +15,10 @@
>>  #include <linux/arch_topology.h>
>>  #include <linux/cacheinfo.h>
>>  #include <linux/cpufreq.h>
>> +#include <linux/cpu_smt.h>
>>  #include <linux/init.h>
>>  #include <linux/percpu.h>
>> +#include <linux/xarray.h>
>>  
>>  #include <asm/cpu.h>
>>  #include <asm/cputype.h>
>> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>>   */
>>  int __init parse_acpi_topology(void)
>>  {
>> +	int thread_num, max_smt_thread_num = 1;
>> +	struct xarray core_threads;
>>  	int cpu, topology_id;
>> +	void *entry;
>>  
>>  	if (acpi_disabled)
>>  		return 0;
>>  
>> +	xa_init(&core_threads);
>> +
>>  	for_each_possible_cpu(cpu) {
>>  		topology_id = find_acpi_cpu_topology(cpu, 0);
>>  		if (topology_id < 0)
>> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void)
>>  			cpu_topology[cpu].thread_id = topology_id;
>>  			topology_id = find_acpi_cpu_topology(cpu, 1);
>>  			cpu_topology[cpu].core_id   = topology_id;
>> +
>> +			entry = xa_load(&core_threads, topology_id);
>> +			if (!entry) {
>> +				xa_store(&core_threads, topology_id,
>> +					 xa_mk_value(1), GFP_KERNEL);
>> +			} else {
>> +				thread_num = xa_to_value(entry);
>> +				thread_num++;
>> +				xa_store(&core_threads, topology_id,
>> +					 xa_mk_value(thread_num), GFP_KERNEL);
>> +
>> +				if (thread_num > max_smt_thread_num)
>> +					max_smt_thread_num = thread_num;
>> +			}
> 
> So the xarray contains one element for each core_id with the information
> how often the core_id occurs? I assume you have to iterate over all
> possible CPUs since you don't know which logical CPUs belong to the same
> core_id.
> 

Each xarray element counts the thread number of a certain core id. so the logic is like below:
1. if the "core id" entry doesn't exists, then we're accessing this core for the 1st time. create
   one and make the thread number to 1
2. otherwise increment the thread number of "core id" this cpu belongs (PPTT already
   told us which core this CPU belongs to). Update the max_smt_thread_num if necessary.

Then we can know max_smt_thread_num by meanwhile iterating the PPTT table and
build the topology for all the possible CPUs.

Otherwise we need to do a second scan for the max thread number after built the
topology. This way is implemented in v1 and it's complained about the overhead on large
scale systems since we need to loop the CPUs twice.

>>  		} else {
>>  			cpu_topology[cpu].thread_id  = -1;
>>  			cpu_topology[cpu].core_id    = topology_id;
>> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void)
>>  		cpu_topology[cpu].package_id = topology_id;
>>  	}
>>  
>> +	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>> +
>> +	xa_destroy(&core_threads);
>>  	return 0;
>>  }
>>  #endif
> 
> Tested on ThunderX2:
> 
> $ cat /proc/schedstat | head -6 | tail -4 | awk '{ print $1, $2 }'
> cpu0 0
> domain0 00000000,00000000,00000000,00000000,00000001,00000001,00000001,00000001
>                                                    ^        ^        ^        ^
> domain1 00000000,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,ffffffff
> domain2 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff
> 
> detecting 'max_smt_thread_num = 4' correctly.
> 

Thanks for the testing. ok for a tag?

Thanks.
Dietmar Eggemann Aug. 22, 2024, 7:19 a.m. UTC | #3
On 19/08/2024 09:03, Yicong Yang wrote:
> On 2024/8/16 23:55, Dietmar Eggemann wrote:
>> On 06/08/2024 10:53, Yicong Yang wrote:
>>> From: Yicong Yang <yangyicong@hisilicon.com>

[...]

>> So the xarray contains one element for each core_id with the information
>> how often the core_id occurs? I assume you have to iterate over all
>> possible CPUs since you don't know which logical CPUs belong to the same
>> core_id.
>>
> 
> Each xarray element counts the thread number of a certain core id. so the logic is like below:
> 1. if the "core id" entry doesn't exists, then we're accessing this core for the 1st time. create
>    one and make the thread number to 1
> 2. otherwise increment the thread number of "core id" this cpu belongs (PPTT already
>    told us which core this CPU belongs to). Update the max_smt_thread_num if necessary.
> 
> Then we can know max_smt_thread_num by meanwhile iterating the PPTT table and
> build the topology for all the possible CPUs.
> 
> Otherwise we need to do a second scan for the max thread number after built the
> topology. This way is implemented in v1 and it's complained about the overhead on large
> scale systems since we need to loop the CPUs twice.

OK.

[...]

>> Tested on ThunderX2:
>>
>> $ cat /proc/schedstat | head -6 | tail -4 | awk '{ print $1, $2 }'
>> cpu0 0
>> domain0 00000000,00000000,00000000,00000000,00000001,00000001,00000001,00000001
>>                                                    ^        ^        ^        ^
>> domain1 00000000,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,ffffffff
>> domain2 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff
>>
>> detecting 'max_smt_thread_num = 4' correctly.
>>
> 
> Thanks for the testing. ok for a tag?

Yes, please go ahead.

[...]
Pierre Gondois Aug. 27, 2024, 3:40 p.m. UTC | #4
Hello Yicong,
IIUC we are looking for the maximum number of threads a CPU can have
in the platform. But is is actually possible to have a platform with
CPUs not having the same number of threads ?

For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check
that the number of threads is either 1 or cpu_smt_max_threads (as
CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However
a (hypothetical) platform with:
- CPU0: 2 threads
- CPU1: 4 threads
should not be able to set the number of threads to 2, as
1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()).

So if there is an assumption that all the CPUs have the same number of
threads, it should be sufficient to count the number of threads for one
CPU right ?

In the other (unlikely) case (i.e. CPUs can have various number of threads),
I think we should either:
- use your method and check that all the CPUs have the same number of threads
- get the number of threads for one CPU and check that all the CPUs are
    identical using the ACPI_PPTT_ACPI_IDENTICAL tag
- have a per-cpu cpu_smt_max_threads value ?

Same comment for the DT patch. If there is an assumption that all CPUs have
the same number of threads, then update_smt_num_threads() could only be called
once I suppose,

Regards,
Pierre


On 8/6/24 10:53, Yicong Yang wrote:
> From: Yicong Yang <yangyicong@hisilicon.com>
> 
> For ACPI we'll build the topology from PPTT and we cannot directly
> get the SMT number of each core. Instead using a temporary xarray
> to record the SMT number of each core when building the topology
> and we can know the largest SMT number in the system. Then we can
> enable the support of SMT control.
> 
> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
> ---
>   arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++
>   1 file changed, 24 insertions(+)
> 
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 1a2c72f3e7f8..f72e1e55b05e 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -15,8 +15,10 @@
>   #include <linux/arch_topology.h>
>   #include <linux/cacheinfo.h>
>   #include <linux/cpufreq.h>
> +#include <linux/cpu_smt.h>
>   #include <linux/init.h>
>   #include <linux/percpu.h>
> +#include <linux/xarray.h>
>   
>   #include <asm/cpu.h>
>   #include <asm/cputype.h>
> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>    */
>   int __init parse_acpi_topology(void)
>   {
> +	int thread_num, max_smt_thread_num = 1;
> +	struct xarray core_threads;
>   	int cpu, topology_id;
> +	void *entry;
>   
>   	if (acpi_disabled)
>   		return 0;
>   
> +	xa_init(&core_threads);
> +
>   	for_each_possible_cpu(cpu) {
>   		topology_id = find_acpi_cpu_topology(cpu, 0);
>   		if (topology_id < 0)
> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void)
>   			cpu_topology[cpu].thread_id = topology_id;
>   			topology_id = find_acpi_cpu_topology(cpu, 1);
>   			cpu_topology[cpu].core_id   = topology_id;
> +
> +			entry = xa_load(&core_threads, topology_id);
> +			if (!entry) {
> +				xa_store(&core_threads, topology_id,
> +					 xa_mk_value(1), GFP_KERNEL);
> +			} else {
> +				thread_num = xa_to_value(entry);
> +				thread_num++;
> +				xa_store(&core_threads, topology_id,
> +					 xa_mk_value(thread_num), GFP_KERNEL);
> +
> +				if (thread_num > max_smt_thread_num)
> +					max_smt_thread_num = thread_num;
> +			}
>   		} else {
>   			cpu_topology[cpu].thread_id  = -1;
>   			cpu_topology[cpu].core_id    = topology_id;
> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void)
>   		cpu_topology[cpu].package_id = topology_id;
>   	}
>   
> +	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
> +
> +	xa_destroy(&core_threads);
>   	return 0;
>   }
>   #endif
Yicong Yang Aug. 29, 2024, 7:40 a.m. UTC | #5
Hi Pierre,

On 2024/8/27 23:40, Pierre Gondois wrote:
> Hello Yicong,
> IIUC we are looking for the maximum number of threads a CPU can have
> in the platform. But is is actually possible to have a platform with
> CPUs not having the same number of threads ?
> 

I was thinking of the heterogenous platforms, for example part of the
cores have SMT and others don't (I don't have any though, but there
should be some such platforms for arm64).

> For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check
> that the number of threads is either 1 or cpu_smt_max_threads (as
> CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However
> a (hypothetical) platform with:
> - CPU0: 2 threads
> - CPU1: 4 threads
> should not be able to set the number of threads to 2, as
> 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()).
> 

It's a bit more complex. If we enable/disable the SMT using on/off control
then we won't have this problem. We'll online all the CPUs on enabling and
offline CPUs which is not primary thread and don't care about the thread
number of each core.

Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC
and only enabled on powerpc. I think this interface is not enough to handle
the hypothetical we assumed, since it assumes the homogenous platform that
all the CPUs have the same thread number. But this should be fine for
the platforms with SMT(with same thread number) and non-SMT cores, since it do
indicates the real thread number of the SMT cores.

> So if there is an assumption that all the CPUs have the same number of
> threads, it should be sufficient to count the number of threads for one
> CPU right ?
> 

Naturally and conveniently I may think use of the threads number of CPU0 (boot
cpu) in such a solution. But this will be wrong if CPU0 is non-SMT on a heterogenous
platform :(

> In the other (unlikely) case (i.e. CPUs can have various number of threads),
> I think we should either:
> - use your method and check that all the CPUs have the same number of threads
> - get the number of threads for one CPU and check that all the CPUs are
>    identical using the ACPI_PPTT_ACPI_IDENTICAL tag

I think this won't be simpler since we still need to iterate all the CPUs to see
if they have the same hetero_id (checked how we're using this ACPI tag in
arm_acpi_register_pmu_device()). We could already know if they're identical by
comparing the thread number and do the update if necessary.

> - have a per-cpu cpu_smt_max_threads value ?

This should be unnecessary in currently stage if there's no platforms
with several kind cores have different thread number like in your assumption
and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using
a global cpu_smt_max_threads to enable the SMT control should be enough.
Does it make sense?

Thanks,
Yicong

> 
> Same comment for the DT patch. If there is an assumption that all CPUs have
> the same number of threads, then update_smt_num_threads() could only be called
> once I suppose,
> 
> Regards,
> Pierre
> 
> 
> On 8/6/24 10:53, Yicong Yang wrote:
>> From: Yicong Yang <yangyicong@hisilicon.com>
>>
>> For ACPI we'll build the topology from PPTT and we cannot directly
>> get the SMT number of each core. Instead using a temporary xarray
>> to record the SMT number of each core when building the topology
>> and we can know the largest SMT number in the system. Then we can
>> enable the support of SMT control.
>>
>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>> ---
>>   arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++
>>   1 file changed, 24 insertions(+)
>>
>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>> index 1a2c72f3e7f8..f72e1e55b05e 100644
>> --- a/arch/arm64/kernel/topology.c
>> +++ b/arch/arm64/kernel/topology.c
>> @@ -15,8 +15,10 @@
>>   #include <linux/arch_topology.h>
>>   #include <linux/cacheinfo.h>
>>   #include <linux/cpufreq.h>
>> +#include <linux/cpu_smt.h>
>>   #include <linux/init.h>
>>   #include <linux/percpu.h>
>> +#include <linux/xarray.h>
>>     #include <asm/cpu.h>
>>   #include <asm/cputype.h>
>> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>>    */
>>   int __init parse_acpi_topology(void)
>>   {
>> +    int thread_num, max_smt_thread_num = 1;
>> +    struct xarray core_threads;
>>       int cpu, topology_id;
>> +    void *entry;
>>         if (acpi_disabled)
>>           return 0;
>>   +    xa_init(&core_threads);
>> +
>>       for_each_possible_cpu(cpu) {
>>           topology_id = find_acpi_cpu_topology(cpu, 0);
>>           if (topology_id < 0)
>> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void)
>>               cpu_topology[cpu].thread_id = topology_id;
>>               topology_id = find_acpi_cpu_topology(cpu, 1);
>>               cpu_topology[cpu].core_id   = topology_id;
>> +
>> +            entry = xa_load(&core_threads, topology_id);
>> +            if (!entry) {
>> +                xa_store(&core_threads, topology_id,
>> +                     xa_mk_value(1), GFP_KERNEL);
>> +            } else {
>> +                thread_num = xa_to_value(entry);
>> +                thread_num++;
>> +                xa_store(&core_threads, topology_id,
>> +                     xa_mk_value(thread_num), GFP_KERNEL);
>> +
>> +                if (thread_num > max_smt_thread_num)
>> +                    max_smt_thread_num = thread_num;
>> +            }
>>           } else {
>>               cpu_topology[cpu].thread_id  = -1;
>>               cpu_topology[cpu].core_id    = topology_id;
>> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void)
>>           cpu_topology[cpu].package_id = topology_id;
>>       }
>>   +    cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>> +
>> +    xa_destroy(&core_threads);
>>       return 0;
>>   }
>>   #endif
> 
> .
Pierre Gondois Aug. 29, 2024, 12:46 p.m. UTC | #6
Hello Yicong,

On 8/29/24 09:40, Yicong Yang wrote:
> Hi Pierre,
> 
> On 2024/8/27 23:40, Pierre Gondois wrote:
>> Hello Yicong,
>> IIUC we are looking for the maximum number of threads a CPU can have
>> in the platform. But is is actually possible to have a platform with
>> CPUs not having the same number of threads ?
>>
> 
> I was thinking of the heterogenous platforms, for example part of the
> cores have SMT and others don't (I don't have any though, but there
> should be some such platforms for arm64).
> 
>> For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check
>> that the number of threads is either 1 or cpu_smt_max_threads (as
>> CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However
>> a (hypothetical) platform with:
>> - CPU0: 2 threads
>> - CPU1: 4 threads
>> should not be able to set the number of threads to 2, as
>> 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()).
>>
> 
> It's a bit more complex. If we enable/disable the SMT using on/off control
> then we won't have this problem. We'll online all the CPUs on enabling and
> offline CPUs which is not primary thread and don't care about the thread
> number of each core.
> 
> Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC
> and only enabled on powerpc. I think this interface is not enough to handle
> the hypothetical we assumed, since it assumes the homogenous platform that
> all the CPUs have the same thread number. But this should be fine for
> the platforms with SMT(with same thread number) and non-SMT cores, since it do
> indicates the real thread number of the SMT cores.
> 
>> So if there is an assumption that all the CPUs have the same number of
>> threads, it should be sufficient to count the number of threads for one
>> CPU right ?
>>
> 
> Naturally and conveniently I may think use of the threads number of CPU0 (boot
> cpu) in such a solution. But this will be wrong if CPU0 is non-SMT on a heterogenous
> platform :(
> 
>> In the other (unlikely) case (i.e. CPUs can have various number of threads),
>> I think we should either:
>> - use your method and check that all the CPUs have the same number of threads
>> - get the number of threads for one CPU and check that all the CPUs are
>>     identical using the ACPI_PPTT_ACPI_IDENTICAL tag
> 
> I think this won't be simpler since we still need to iterate all the CPUs to see
> if they have the same hetero_id (checked how we're using this ACPI tag in
> arm_acpi_register_pmu_device()). We could already know if they're identical by
> comparing the thread number and do the update if necessary.
> 
>> - have a per-cpu cpu_smt_max_threads value ?
> 
> This should be unnecessary in currently stage if there's no platforms
> with several kind cores have different thread number like in your assumption
> and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using
> a global cpu_smt_max_threads to enable the SMT control should be enough.
> Does it make sense?

Yes, I agree with all the things you said:
- the current smt/control interface cannot handle asymmetric SMT platforms
- I am not aware if such platform exist so far

I think it would still be good to check all the CPUs have the same number of
threads. I tried to enable the SMT_NUM_THREADS_DYNAMIC Kconfig with the
patch attached (and to check CPUs have the same number of threads). Feel free
to take what you like/ignore what is inside the attached patch, or let me know
if I should submit a part in a separate patchset,

----------------------------

     [RFC] arm64: topology: Enable CONFIG_SMT_NUM_THREADS_DYNAMIC
     
     - On arm64 ACPI systems, change the thread_id assignment to have
       increasing values starting from 0. This is already the case for DT
       based systems. Doing so allows to uniformly identify the n-th
       thread of a given CPU.
     - Check that all CPUs have the same number of threads (for DT/ACPI)
     - Enable CONFIG_SMT_NUM_THREADS_DYNAMIC
     
     On a Tx2, with 256 CPUs, threads siblings being 0,32,64,96
     for socket0 and 128 + (0,32,64,96) for socket1:
     $ cd /sys/devices/system/cpu/smt/
     $ cat ../online
     0-255
     $ echo 2 > control
     $ cat ../online
     0-63,128-191
     $ echo 3 > control
     $ cat ../online
     0-95,128-223
     $ echo on > control
     $ cat ../online
     0-255

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index bd3bc2f5e0ec..1d8521483065 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -239,6 +239,7 @@ config ARM64
         select HAVE_GENERIC_VDSO
         select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
         select HOTPLUG_SMT if (SMP && HOTPLUG_CPU)
+       select SMT_NUM_THREADS_DYNAMIC if HOTPLUG_SMT
         select IRQ_DOMAIN
         select IRQ_FORCED_THREADING
         select KASAN_VMALLOC if KASAN
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 0f6ef432fb84..7dd211f81687 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -39,6 +39,14 @@ void update_freq_counters_refs(void);
  #define arch_scale_hw_pressure topology_get_hw_pressure
  #define arch_update_hw_pressure        topology_update_hw_pressure
  
+#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
+#include <linux/cpu_smt.h>
+static inline bool topology_smt_thread_allowed(unsigned int cpu)
+{
+       return topology_thread_id(cpu) < cpu_smt_num_threads;
+}
+#endif
+
  #include <asm-generic/topology.h>
  
  #endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index f72e1e55b05e..a83babe19972 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -47,7 +47,9 @@ int __init parse_acpi_topology(void)
  {
         int thread_num, max_smt_thread_num = 1;
         struct xarray core_threads;
+       bool have_thread = false;
         int cpu, topology_id;
+       unsigned long i;
         void *entry;
  
         if (acpi_disabled)
@@ -61,6 +63,8 @@ int __init parse_acpi_topology(void)
                         return topology_id;
  
                 if (acpi_cpu_is_threaded(cpu)) {
+                       have_thread = true;
+
                         cpu_topology[cpu].thread_id = topology_id;
                         topology_id = find_acpi_cpu_topology(cpu, 1);
                         cpu_topology[cpu].core_id   = topology_id;
@@ -69,9 +73,10 @@ int __init parse_acpi_topology(void)
                         if (!entry) {
                                 xa_store(&core_threads, topology_id,
                                          xa_mk_value(1), GFP_KERNEL);
+                               cpu_topology[cpu].thread_id = 0;
                         } else {
                                 thread_num = xa_to_value(entry);
-                               thread_num++;
+                               cpu_topology[cpu].thread_id = thread_num++;
                                 xa_store(&core_threads, topology_id,
                                          xa_mk_value(thread_num), GFP_KERNEL);
  
@@ -86,8 +91,17 @@ int __init parse_acpi_topology(void)
                 cpu_topology[cpu].cluster_id = topology_id;
                 topology_id = find_acpi_cpu_topology_package(cpu);
                 cpu_topology[cpu].package_id = topology_id;
+
+               pr_debug("CPU%u: package=0x%x cluster=0x%x core=0x%x thread=0x%x\n",
+                       cpu, cpu_topology[cpu].package_id, cpu_topology[cpu].cluster_id,
+                       cpu_topology[cpu].core_id, cpu_topology[cpu].thread_id);
         }
  
+       if (have_thread)
+               xa_for_each(&core_threads, i, entry)
+                       if (xa_to_value(entry) != max_smt_thread_num)
+                               pr_warn("Heterogeneous SMT topology not handled");
+
         cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
  
         xa_destroy(&core_threads);
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 95513abd664f..20d7f5b72ddd 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -532,13 +532,15 @@ static int __init get_cpu_for_node(struct device_node *node)
         return cpu;
  }
  
-static void __init update_smt_num_threads(unsigned int num_threads)
+static void __init update_smt_num_threads(int num_threads)
  {
-       static unsigned int max_smt_thread_num = 1;
+       static int max_smt_thread_num = -1;
  
-       if (num_threads > max_smt_thread_num) {
+       if (max_smt_thread_num < 0) {
                 max_smt_thread_num = num_threads;
                 cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
+       } else if (num_threads != max_smt_thread_num) {
+               pr_warn("Heterogeneous SMT topology not handled");
         }
  }
  
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index b721f360d759..afdfdc64a0a1 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -87,6 +87,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
  #define topology_cluster_id(cpu)       (cpu_topology[cpu].cluster_id)
  #define topology_core_id(cpu)          (cpu_topology[cpu].core_id)
+#define topology_thread_id(cpu)                (cpu_topology[cpu].thread_id)
  #define topology_core_cpumask(cpu)     (&cpu_topology[cpu].core_sibling)
  #define topology_sibling_cpumask(cpu)  (&cpu_topology[cpu].thread_sibling)
  #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)

----------------------------


Regards,
Pierre

> 
> Thanks,
> Yicong
> 
>>
>> Same comment for the DT patch. If there is an assumption that all CPUs have
>> the same number of threads, then update_smt_num_threads() could only be called
>> once I suppose,
>>
>> Regards,
>> Pierre
>>
>>
>> On 8/6/24 10:53, Yicong Yang wrote:
>>> From: Yicong Yang <yangyicong@hisilicon.com>
>>>
>>> For ACPI we'll build the topology from PPTT and we cannot directly
>>> get the SMT number of each core. Instead using a temporary xarray
>>> to record the SMT number of each core when building the topology
>>> and we can know the largest SMT number in the system. Then we can
>>> enable the support of SMT control.
>>>
>>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>>> ---
>>>    arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++
>>>    1 file changed, 24 insertions(+)
>>>
>>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>>> index 1a2c72f3e7f8..f72e1e55b05e 100644
>>> --- a/arch/arm64/kernel/topology.c
>>> +++ b/arch/arm64/kernel/topology.c
>>> @@ -15,8 +15,10 @@
>>>    #include <linux/arch_topology.h>
>>>    #include <linux/cacheinfo.h>
>>>    #include <linux/cpufreq.h>
>>> +#include <linux/cpu_smt.h>
>>>    #include <linux/init.h>
>>>    #include <linux/percpu.h>
>>> +#include <linux/xarray.h>
>>>      #include <asm/cpu.h>
>>>    #include <asm/cputype.h>
>>> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>>>     */
>>>    int __init parse_acpi_topology(void)
>>>    {
>>> +    int thread_num, max_smt_thread_num = 1;
>>> +    struct xarray core_threads;
>>>        int cpu, topology_id;
>>> +    void *entry;
>>>          if (acpi_disabled)
>>>            return 0;
>>>    +    xa_init(&core_threads);
>>> +
>>>        for_each_possible_cpu(cpu) {
>>>            topology_id = find_acpi_cpu_topology(cpu, 0);
>>>            if (topology_id < 0)
>>> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void)
>>>                cpu_topology[cpu].thread_id = topology_id;
>>>                topology_id = find_acpi_cpu_topology(cpu, 1);
>>>                cpu_topology[cpu].core_id   = topology_id;
>>> +
>>> +            entry = xa_load(&core_threads, topology_id);
>>> +            if (!entry) {
>>> +                xa_store(&core_threads, topology_id,
>>> +                     xa_mk_value(1), GFP_KERNEL);
>>> +            } else {
>>> +                thread_num = xa_to_value(entry);
>>> +                thread_num++;
>>> +                xa_store(&core_threads, topology_id,
>>> +                     xa_mk_value(thread_num), GFP_KERNEL);
>>> +
>>> +                if (thread_num > max_smt_thread_num)
>>> +                    max_smt_thread_num = thread_num;
>>> +            }
>>>            } else {
>>>                cpu_topology[cpu].thread_id  = -1;
>>>                cpu_topology[cpu].core_id    = topology_id;
>>> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void)
>>>            cpu_topology[cpu].package_id = topology_id;
>>>        }
>>>    +    cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>>> +
>>> +    xa_destroy(&core_threads);
>>>        return 0;
>>>    }
>>>    #endif
>>
>> .
Yicong Yang Aug. 30, 2024, 9:35 a.m. UTC | #7
On 2024/8/29 20:46, Pierre Gondois wrote:
> Hello Yicong,
> 
> On 8/29/24 09:40, Yicong Yang wrote:
>> Hi Pierre,
>>
>> On 2024/8/27 23:40, Pierre Gondois wrote:
>>> Hello Yicong,
>>> IIUC we are looking for the maximum number of threads a CPU can have
>>> in the platform. But is is actually possible to have a platform with
>>> CPUs not having the same number of threads ?
>>>
>>
>> I was thinking of the heterogenous platforms, for example part of the
>> cores have SMT and others don't (I don't have any though, but there
>> should be some such platforms for arm64).
>>
>>> For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check
>>> that the number of threads is either 1 or cpu_smt_max_threads (as
>>> CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However
>>> a (hypothetical) platform with:
>>> - CPU0: 2 threads
>>> - CPU1: 4 threads
>>> should not be able to set the number of threads to 2, as
>>> 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()).
>>>
>>
>> It's a bit more complex. If we enable/disable the SMT using on/off control
>> then we won't have this problem. We'll online all the CPUs on enabling and
>> offline CPUs which is not primary thread and don't care about the thread
>> number of each core.
>>
>> Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC
>> and only enabled on powerpc. I think this interface is not enough to handle
>> the hypothetical we assumed, since it assumes the homogenous platform that
>> all the CPUs have the same thread number. But this should be fine for
>> the platforms with SMT(with same thread number) and non-SMT cores, since it do
>> indicates the real thread number of the SMT cores.
>>
>>> So if there is an assumption that all the CPUs have the same number of
>>> threads, it should be sufficient to count the number of threads for one
>>> CPU right ?
>>>
>>
>> Naturally and conveniently I may think use of the threads number of CPU0 (boot
>> cpu) in such a solution. But this will be wrong if CPU0 is non-SMT on a heterogenous
>> platform :(
>>
>>> In the other (unlikely) case (i.e. CPUs can have various number of threads),
>>> I think we should either:
>>> - use your method and check that all the CPUs have the same number of threads
>>> - get the number of threads for one CPU and check that all the CPUs are
>>>     identical using the ACPI_PPTT_ACPI_IDENTICAL tag
>>
>> I think this won't be simpler since we still need to iterate all the CPUs to see
>> if they have the same hetero_id (checked how we're using this ACPI tag in
>> arm_acpi_register_pmu_device()). We could already know if they're identical by
>> comparing the thread number and do the update if necessary.
>>
>>> - have a per-cpu cpu_smt_max_threads value ?
>>
>> This should be unnecessary in currently stage if there's no platforms
>> with several kind cores have different thread number like in your assumption
>> and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using
>> a global cpu_smt_max_threads to enable the SMT control should be enough.
>> Does it make sense?
> 
> Yes, I agree with all the things you said:
> - the current smt/control interface cannot handle asymmetric SMT platforms
> - I am not aware if such platform exist so far
> 
> I think it would still be good to check all the CPUs have the same number of
> threads. I tried to enable the SMT_NUM_THREADS_DYNAMIC Kconfig with the
> patch attached (and to check CPUs have the same number of threads). Feel free
> to take what you like/ignore what is inside the attached patch, or let me know
> if I should submit a part in a separate patchset,
> 

Checked the changes, we can make this series as the basic support and a separate
series of yours as a further support of SMT control on arm64:
- support thread_id on ACPI based arm64 platform
- support partial SMT control by enable CONFIG_SMT_NUM_THREADS_DYNAMIC

some minor comments below.

> ----------------------------
> 
>     [RFC] arm64: topology: Enable CONFIG_SMT_NUM_THREADS_DYNAMIC
>         - On arm64 ACPI systems, change the thread_id assignment to have
>       increasing values starting from 0. This is already the case for DT
>       based systems. Doing so allows to uniformly identify the n-th
>       thread of a given CPU.
>     - Check that all CPUs have the same number of threads (for DT/ACPI)
>     - Enable CONFIG_SMT_NUM_THREADS_DYNAMIC
>         On a Tx2, with 256 CPUs, threads siblings being 0,32,64,96
>     for socket0 and 128 + (0,32,64,96) for socket1:
>     $ cd /sys/devices/system/cpu/smt/
>     $ cat ../online
>     0-255
>     $ echo 2 > control
>     $ cat ../online
>     0-63,128-191
>     $ echo 3 > control
>     $ cat ../online
>     0-95,128-223
>     $ echo on > control
>     $ cat ../online
>     0-255
> 

So it's a real SMT4 system, it does make sense to have this partial SMT control
support.

> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index bd3bc2f5e0ec..1d8521483065 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -239,6 +239,7 @@ config ARM64
>         select HAVE_GENERIC_VDSO
>         select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
>         select HOTPLUG_SMT if (SMP && HOTPLUG_CPU)
> +       select SMT_NUM_THREADS_DYNAMIC if HOTPLUG_SMT
>         select IRQ_DOMAIN
>         select IRQ_FORCED_THREADING
>         select KASAN_VMALLOC if KASAN
> diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
> index 0f6ef432fb84..7dd211f81687 100644
> --- a/arch/arm64/include/asm/topology.h
> +++ b/arch/arm64/include/asm/topology.h
> @@ -39,6 +39,14 @@ void update_freq_counters_refs(void);
>  #define arch_scale_hw_pressure topology_get_hw_pressure
>  #define arch_update_hw_pressure        topology_update_hw_pressure
>  
> +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
> +#include <linux/cpu_smt.h>
> +static inline bool topology_smt_thread_allowed(unsigned int cpu)
> +{
> +       return topology_thread_id(cpu) < cpu_smt_num_threads;
> +}
> +#endif
> +
>  #include <asm-generic/topology.h>
>  
>  #endif /* _ASM_ARM_TOPOLOGY_H */
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index f72e1e55b05e..a83babe19972 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -47,7 +47,9 @@ int __init parse_acpi_topology(void)
>  {
>         int thread_num, max_smt_thread_num = 1;
>         struct xarray core_threads;
> +       bool have_thread = false;
>         int cpu, topology_id;
> +       unsigned long i;
>         void *entry;
>  
>         if (acpi_disabled)
> @@ -61,6 +63,8 @@ int __init parse_acpi_topology(void)
>                         return topology_id;
>  
>                 if (acpi_cpu_is_threaded(cpu)) {
> +                       have_thread = true;
> +
>                         cpu_topology[cpu].thread_id = topology_id;
>                         topology_id = find_acpi_cpu_topology(cpu, 1);
>                         cpu_topology[cpu].core_id   = topology_id;
> @@ -69,9 +73,10 @@ int __init parse_acpi_topology(void)
>                         if (!entry) {
>                                 xa_store(&core_threads, topology_id,
>                                          xa_mk_value(1), GFP_KERNEL);
> +                               cpu_topology[cpu].thread_id = 0;
>                         } else {
>                                 thread_num = xa_to_value(entry);
> -                               thread_num++;
> +                               cpu_topology[cpu].thread_id = thread_num++;
>                                 xa_store(&core_threads, topology_id,
>                                          xa_mk_value(thread_num), GFP_KERNEL);
>  
> @@ -86,8 +91,17 @@ int __init parse_acpi_topology(void)
>                 cpu_topology[cpu].cluster_id = topology_id;
>                 topology_id = find_acpi_cpu_topology_package(cpu);
>                 cpu_topology[cpu].package_id = topology_id;
> +
> +               pr_debug("CPU%u: package=0x%x cluster=0x%x core=0x%x thread=0x%x\n",
> +                       cpu, cpu_topology[cpu].package_id, cpu_topology[cpu].cluster_id,
> +                       cpu_topology[cpu].core_id, cpu_topology[cpu].thread_id);
>         }
>  
> +       if (have_thread)

we could know this from max_smt_thread_num so have_thread maybe unnecessary.

> +               xa_for_each(&core_threads, i, entry)
> +                       if (xa_to_value(entry) != max_smt_thread_num)
> +                               pr_warn("Heterogeneous SMT topology not handled");\

Wondering if we can avoid this 2nd loop. Greg express the worries of looping twice on large scale
system in v1. Maybe we could use the hetero_id and get the necessary information in one loop, I need
further think.

Thanks.

> +
>         cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>  
>         xa_destroy(&core_threads);
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index 95513abd664f..20d7f5b72ddd 100644
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -532,13 +532,15 @@ static int __init get_cpu_for_node(struct device_node *node)
>         return cpu;
>  }
>  
> -static void __init update_smt_num_threads(unsigned int num_threads)
> +static void __init update_smt_num_threads(int num_threads)
>  {
> -       static unsigned int max_smt_thread_num = 1;
> +       static int max_smt_thread_num = -1;
>  
> -       if (num_threads > max_smt_thread_num) {
> +       if (max_smt_thread_num < 0) {
>                 max_smt_thread_num = num_threads;
>                 cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
> +       } else if (num_threads != max_smt_thread_num) {
> +               pr_warn("Heterogeneous SMT topology not handled");
>         }
>  }
>  
> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> index b721f360d759..afdfdc64a0a1 100644
> --- a/include/linux/arch_topology.h
> +++ b/include/linux/arch_topology.h
> @@ -87,6 +87,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
>  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
>  #define topology_cluster_id(cpu)       (cpu_topology[cpu].cluster_id)
>  #define topology_core_id(cpu)          (cpu_topology[cpu].core_id)
> +#define topology_thread_id(cpu)                (cpu_topology[cpu].thread_id)
>  #define topology_core_cpumask(cpu)     (&cpu_topology[cpu].core_sibling)
>  #define topology_sibling_cpumask(cpu)  (&cpu_topology[cpu].thread_sibling)
>  #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)
> 
> ----------------------------
> 
> 
> Regards,
> Pierre
> 
>>
>> Thanks,
>> Yicong
>>
>>>
>>> Same comment for the DT patch. If there is an assumption that all CPUs have
>>> the same number of threads, then update_smt_num_threads() could only be called
>>> once I suppose,
>>>
>>> Regards,
>>> Pierre
>>>
>>>
>>> On 8/6/24 10:53, Yicong Yang wrote:
>>>> From: Yicong Yang <yangyicong@hisilicon.com>
>>>>
>>>> For ACPI we'll build the topology from PPTT and we cannot directly
>>>> get the SMT number of each core. Instead using a temporary xarray
>>>> to record the SMT number of each core when building the topology
>>>> and we can know the largest SMT number in the system. Then we can
>>>> enable the support of SMT control.
>>>>
>>>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>>>> ---
>>>>    arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++
>>>>    1 file changed, 24 insertions(+)
>>>>
>>>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>>>> index 1a2c72f3e7f8..f72e1e55b05e 100644
>>>> --- a/arch/arm64/kernel/topology.c
>>>> +++ b/arch/arm64/kernel/topology.c
>>>> @@ -15,8 +15,10 @@
>>>>    #include <linux/arch_topology.h>
>>>>    #include <linux/cacheinfo.h>
>>>>    #include <linux/cpufreq.h>
>>>> +#include <linux/cpu_smt.h>
>>>>    #include <linux/init.h>
>>>>    #include <linux/percpu.h>
>>>> +#include <linux/xarray.h>
>>>>      #include <asm/cpu.h>
>>>>    #include <asm/cputype.h>
>>>> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>>>>     */
>>>>    int __init parse_acpi_topology(void)
>>>>    {
>>>> +    int thread_num, max_smt_thread_num = 1;
>>>> +    struct xarray core_threads;
>>>>        int cpu, topology_id;
>>>> +    void *entry;
>>>>          if (acpi_disabled)
>>>>            return 0;
>>>>    +    xa_init(&core_threads);
>>>> +
>>>>        for_each_possible_cpu(cpu) {
>>>>            topology_id = find_acpi_cpu_topology(cpu, 0);
>>>>            if (topology_id < 0)
>>>> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void)
>>>>                cpu_topology[cpu].thread_id = topology_id;
>>>>                topology_id = find_acpi_cpu_topology(cpu, 1);
>>>>                cpu_topology[cpu].core_id   = topology_id;
>>>> +
>>>> +            entry = xa_load(&core_threads, topology_id);
>>>> +            if (!entry) {
>>>> +                xa_store(&core_threads, topology_id,
>>>> +                     xa_mk_value(1), GFP_KERNEL);
>>>> +            } else {
>>>> +                thread_num = xa_to_value(entry);
>>>> +                thread_num++;
>>>> +                xa_store(&core_threads, topology_id,
>>>> +                     xa_mk_value(thread_num), GFP_KERNEL);
>>>> +
>>>> +                if (thread_num > max_smt_thread_num)
>>>> +                    max_smt_thread_num = thread_num;
>>>> +            }
>>>>            } else {
>>>>                cpu_topology[cpu].thread_id  = -1;
>>>>                cpu_topology[cpu].core_id    = topology_id;
>>>> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void)
>>>>            cpu_topology[cpu].package_id = topology_id;
>>>>        }
>>>>    +    cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>>>> +
>>>> +    xa_destroy(&core_threads);
>>>>        return 0;
>>>>    }
>>>>    #endif
>>>
>>> .
Pierre Gondois Sept. 2, 2024, 7:43 a.m. UTC | #8
Hello Yicong

On 8/30/24 11:35, Yicong Yang wrote:
> On 2024/8/29 20:46, Pierre Gondois wrote:
>> Hello Yicong,
>>
>> On 8/29/24 09:40, Yicong Yang wrote:
>>> Hi Pierre,
>>>
>>> On 2024/8/27 23:40, Pierre Gondois wrote:
>>>> Hello Yicong,
>>>> IIUC we are looking for the maximum number of threads a CPU can have
>>>> in the platform. But is is actually possible to have a platform with
>>>> CPUs not having the same number of threads ?
>>>>
>>>
>>> I was thinking of the heterogenous platforms, for example part of the
>>> cores have SMT and others don't (I don't have any though, but there
>>> should be some such platforms for arm64).
>>>
>>>> For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check
>>>> that the number of threads is either 1 or cpu_smt_max_threads (as
>>>> CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However
>>>> a (hypothetical) platform with:
>>>> - CPU0: 2 threads
>>>> - CPU1: 4 threads
>>>> should not be able to set the number of threads to 2, as
>>>> 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()).
>>>>
>>>
>>> It's a bit more complex. If we enable/disable the SMT using on/off control
>>> then we won't have this problem. We'll online all the CPUs on enabling and
>>> offline CPUs which is not primary thread and don't care about the thread
>>> number of each core.
>>>
>>> Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC
>>> and only enabled on powerpc. I think this interface is not enough to handle
>>> the hypothetical we assumed, since it assumes the homogenous platform that
>>> all the CPUs have the same thread number. But this should be fine for
>>> the platforms with SMT(with same thread number) and non-SMT cores, since it do
>>> indicates the real thread number of the SMT cores.
>>>
>>>> So if there is an assumption that all the CPUs have the same number of
>>>> threads, it should be sufficient to count the number of threads for one
>>>> CPU right ?
>>>>
>>>
>>> Naturally and conveniently I may think use of the threads number of CPU0 (boot
>>> cpu) in such a solution. But this will be wrong if CPU0 is non-SMT on a heterogenous
>>> platform :(
>>>
>>>> In the other (unlikely) case (i.e. CPUs can have various number of threads),
>>>> I think we should either:
>>>> - use your method and check that all the CPUs have the same number of threads
>>>> - get the number of threads for one CPU and check that all the CPUs are
>>>>      identical using the ACPI_PPTT_ACPI_IDENTICAL tag
>>>
>>> I think this won't be simpler since we still need to iterate all the CPUs to see
>>> if they have the same hetero_id (checked how we're using this ACPI tag in
>>> arm_acpi_register_pmu_device()). We could already know if they're identical by
>>> comparing the thread number and do the update if necessary.
>>>
>>>> - have a per-cpu cpu_smt_max_threads value ?
>>>
>>> This should be unnecessary in currently stage if there's no platforms
>>> with several kind cores have different thread number like in your assumption
>>> and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using
>>> a global cpu_smt_max_threads to enable the SMT control should be enough.
>>> Does it make sense?
>>
>> Yes, I agree with all the things you said:
>> - the current smt/control interface cannot handle asymmetric SMT platforms
>> - I am not aware if such platform exist so far
>>
>> I think it would still be good to check all the CPUs have the same number of
>> threads. I tried to enable the SMT_NUM_THREADS_DYNAMIC Kconfig with the
>> patch attached (and to check CPUs have the same number of threads). Feel free
>> to take what you like/ignore what is inside the attached patch, or let me know
>> if I should submit a part in a separate patchset,
>>
> 
> Checked the changes, we can make this series as the basic support and a separate
> series of yours as a further support of SMT control on arm64:
> - support thread_id on ACPI based arm64 platform
> - support partial SMT control by enable CONFIG_SMT_NUM_THREADS_DYNAMIC

I'm not sure I fully understand what you mean. I can send patches to
enable SMT_NUM_THREADS_DYNAMIC on top of a v6 of yours IIUC. I let you pick
the changes that you estimate to make more sense in your serie (if that makes
sense) ? Please let met know if that works for you (or not).

> 
> some minor comments below.
> 
>> ----------------------------
>>
>>      [RFC] arm64: topology: Enable CONFIG_SMT_NUM_THREADS_DYNAMIC
>>          - On arm64 ACPI systems, change the thread_id assignment to have
>>        increasing values starting from 0. This is already the case for DT
>>        based systems. Doing so allows to uniformly identify the n-th
>>        thread of a given CPU.
>>      - Check that all CPUs have the same number of threads (for DT/ACPI)
>>      - Enable CONFIG_SMT_NUM_THREADS_DYNAMIC
>>          On a Tx2, with 256 CPUs, threads siblings being 0,32,64,96
>>      for socket0 and 128 + (0,32,64,96) for socket1:
>>      $ cd /sys/devices/system/cpu/smt/
>>      $ cat ../online
>>      0-255
>>      $ echo 2 > control
>>      $ cat ../online
>>      0-63,128-191
>>      $ echo 3 > control
>>      $ cat ../online
>>      0-95,128-223
>>      $ echo on > control
>>      $ cat ../online
>>      0-255
>>
> 
> So it's a real SMT4 system, it does make sense to have this partial SMT control
> support.

Yes right

> 
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index bd3bc2f5e0ec..1d8521483065 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -239,6 +239,7 @@ config ARM64
>>          select HAVE_GENERIC_VDSO
>>          select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
>>          select HOTPLUG_SMT if (SMP && HOTPLUG_CPU)
>> +       select SMT_NUM_THREADS_DYNAMIC if HOTPLUG_SMT
>>          select IRQ_DOMAIN
>>          select IRQ_FORCED_THREADING
>>          select KASAN_VMALLOC if KASAN
>> diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
>> index 0f6ef432fb84..7dd211f81687 100644
>> --- a/arch/arm64/include/asm/topology.h
>> +++ b/arch/arm64/include/asm/topology.h
>> @@ -39,6 +39,14 @@ void update_freq_counters_refs(void);
>>   #define arch_scale_hw_pressure topology_get_hw_pressure
>>   #define arch_update_hw_pressure        topology_update_hw_pressure
>>   
>> +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
>> +#include <linux/cpu_smt.h>
>> +static inline bool topology_smt_thread_allowed(unsigned int cpu)
>> +{
>> +       return topology_thread_id(cpu) < cpu_smt_num_threads;
>> +}
>> +#endif
>> +
>>   #include <asm-generic/topology.h>
>>   
>>   #endif /* _ASM_ARM_TOPOLOGY_H */
>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>> index f72e1e55b05e..a83babe19972 100644
>> --- a/arch/arm64/kernel/topology.c
>> +++ b/arch/arm64/kernel/topology.c
>> @@ -47,7 +47,9 @@ int __init parse_acpi_topology(void)
>>   {
>>          int thread_num, max_smt_thread_num = 1;
>>          struct xarray core_threads;
>> +       bool have_thread = false;
>>          int cpu, topology_id;
>> +       unsigned long i;
>>          void *entry;
>>   
>>          if (acpi_disabled)
>> @@ -61,6 +63,8 @@ int __init parse_acpi_topology(void)
>>                          return topology_id;
>>   
>>                  if (acpi_cpu_is_threaded(cpu)) {
>> +                       have_thread = true;
>> +
>>                          cpu_topology[cpu].thread_id = topology_id;
>>                          topology_id = find_acpi_cpu_topology(cpu, 1);
>>                          cpu_topology[cpu].core_id   = topology_id;
>> @@ -69,9 +73,10 @@ int __init parse_acpi_topology(void)
>>                          if (!entry) {
>>                                  xa_store(&core_threads, topology_id,
>>                                           xa_mk_value(1), GFP_KERNEL);
>> +                               cpu_topology[cpu].thread_id = 0;
>>                          } else {
>>                                  thread_num = xa_to_value(entry);
>> -                               thread_num++;
>> +                               cpu_topology[cpu].thread_id = thread_num++;
>>                                  xa_store(&core_threads, topology_id,
>>                                           xa_mk_value(thread_num), GFP_KERNEL);
>>   
>> @@ -86,8 +91,17 @@ int __init parse_acpi_topology(void)
>>                  cpu_topology[cpu].cluster_id = topology_id;
>>                  topology_id = find_acpi_cpu_topology_package(cpu);
>>                  cpu_topology[cpu].package_id = topology_id;
>> +
>> +               pr_debug("CPU%u: package=0x%x cluster=0x%x core=0x%x thread=0x%x\n",
>> +                       cpu, cpu_topology[cpu].package_id, cpu_topology[cpu].cluster_id,
>> +                       cpu_topology[cpu].core_id, cpu_topology[cpu].thread_id);
>>          }
>>   
>> +       if (have_thread)
> 
> we could know this from max_smt_thread_num so have_thread maybe unnecessary.

Yes right, I will change that.

> 
>> +               xa_for_each(&core_threads, i, entry)
>> +                       if (xa_to_value(entry) != max_smt_thread_num)
>> +                               pr_warn("Heterogeneous SMT topology not handled");\
> 
> Wondering if we can avoid this 2nd loop. Greg express the worries of looping twice on large scale
> system in v1. Maybe we could use the hetero_id and get the necessary information in one loop, I need
> further think.

I found this comments (not sure this is what you are refering to):
- https://lore.kernel.org/linux-arm-kernel/20231011103303.00002d8f@Huawei.com/
- https://lore.kernel.org/all/20230921150333.c2zqigs3xxwcg4ln@bogus/T/#m406c4c16871ca7ae431beb20feccfb5e14498452

I don't see another way to do it right now. Also, I thing the complexity is in
O(2n), which should be better than the original O(n**2),

Regards,
Pierre

> 
> Thanks.
> 
>> +
>>          cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>>   
>>          xa_destroy(&core_threads);
>> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
>> index 95513abd664f..20d7f5b72ddd 100644
>> --- a/drivers/base/arch_topology.c
>> +++ b/drivers/base/arch_topology.c
>> @@ -532,13 +532,15 @@ static int __init get_cpu_for_node(struct device_node *node)
>>          return cpu;
>>   }
>>   
>> -static void __init update_smt_num_threads(unsigned int num_threads)
>> +static void __init update_smt_num_threads(int num_threads)
>>   {
>> -       static unsigned int max_smt_thread_num = 1;
>> +       static int max_smt_thread_num = -1;
>>   
>> -       if (num_threads > max_smt_thread_num) {
>> +       if (max_smt_thread_num < 0) {
>>                  max_smt_thread_num = num_threads;
>>                  cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>> +       } else if (num_threads != max_smt_thread_num) {
>> +               pr_warn("Heterogeneous SMT topology not handled");
>>          }
>>   }
>>   
>> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
>> index b721f360d759..afdfdc64a0a1 100644
>> --- a/include/linux/arch_topology.h
>> +++ b/include/linux/arch_topology.h
>> @@ -87,6 +87,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
>>   #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
>>   #define topology_cluster_id(cpu)       (cpu_topology[cpu].cluster_id)
>>   #define topology_core_id(cpu)          (cpu_topology[cpu].core_id)
>> +#define topology_thread_id(cpu)                (cpu_topology[cpu].thread_id)
>>   #define topology_core_cpumask(cpu)     (&cpu_topology[cpu].core_sibling)
>>   #define topology_sibling_cpumask(cpu)  (&cpu_topology[cpu].thread_sibling)
>>   #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)
>>
>> ----------------------------
>>
>>
>> Regards,
>> Pierre
>>
>>>
>>> Thanks,
>>> Yicong
>>>
>>>>
>>>> Same comment for the DT patch. If there is an assumption that all CPUs have
>>>> the same number of threads, then update_smt_num_threads() could only be called
>>>> once I suppose,
>>>>
>>>> Regards,
>>>> Pierre
>>>>
>>>>
>>>> On 8/6/24 10:53, Yicong Yang wrote:
>>>>> From: Yicong Yang <yangyicong@hisilicon.com>
>>>>>
>>>>> For ACPI we'll build the topology from PPTT and we cannot directly
>>>>> get the SMT number of each core. Instead using a temporary xarray
>>>>> to record the SMT number of each core when building the topology
>>>>> and we can know the largest SMT number in the system. Then we can
>>>>> enable the support of SMT control.
>>>>>
>>>>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>>>>> ---
>>>>>     arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++
>>>>>     1 file changed, 24 insertions(+)
>>>>>
>>>>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>>>>> index 1a2c72f3e7f8..f72e1e55b05e 100644
>>>>> --- a/arch/arm64/kernel/topology.c
>>>>> +++ b/arch/arm64/kernel/topology.c
>>>>> @@ -15,8 +15,10 @@
>>>>>     #include <linux/arch_topology.h>
>>>>>     #include <linux/cacheinfo.h>
>>>>>     #include <linux/cpufreq.h>
>>>>> +#include <linux/cpu_smt.h>
>>>>>     #include <linux/init.h>
>>>>>     #include <linux/percpu.h>
>>>>> +#include <linux/xarray.h>
>>>>>       #include <asm/cpu.h>
>>>>>     #include <asm/cputype.h>
>>>>> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>>>>>      */
>>>>>     int __init parse_acpi_topology(void)
>>>>>     {
>>>>> +    int thread_num, max_smt_thread_num = 1;
>>>>> +    struct xarray core_threads;
>>>>>         int cpu, topology_id;
>>>>> +    void *entry;
>>>>>           if (acpi_disabled)
>>>>>             return 0;
>>>>>     +    xa_init(&core_threads);
>>>>> +
>>>>>         for_each_possible_cpu(cpu) {
>>>>>             topology_id = find_acpi_cpu_topology(cpu, 0);
>>>>>             if (topology_id < 0)
>>>>> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void)
>>>>>                 cpu_topology[cpu].thread_id = topology_id;
>>>>>                 topology_id = find_acpi_cpu_topology(cpu, 1);
>>>>>                 cpu_topology[cpu].core_id   = topology_id;
>>>>> +
>>>>> +            entry = xa_load(&core_threads, topology_id);
>>>>> +            if (!entry) {
>>>>> +                xa_store(&core_threads, topology_id,
>>>>> +                     xa_mk_value(1), GFP_KERNEL);
>>>>> +            } else {
>>>>> +                thread_num = xa_to_value(entry);
>>>>> +                thread_num++;
>>>>> +                xa_store(&core_threads, topology_id,
>>>>> +                     xa_mk_value(thread_num), GFP_KERNEL);
>>>>> +
>>>>> +                if (thread_num > max_smt_thread_num)
>>>>> +                    max_smt_thread_num = thread_num;
>>>>> +            }
>>>>>             } else {
>>>>>                 cpu_topology[cpu].thread_id  = -1;
>>>>>                 cpu_topology[cpu].core_id    = topology_id;
>>>>> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void)
>>>>>             cpu_topology[cpu].package_id = topology_id;
>>>>>         }
>>>>>     +    cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>>>>> +
>>>>> +    xa_destroy(&core_threads);
>>>>>         return 0;
>>>>>     }
>>>>>     #endif
>>>>
>>>> .
Yicong Yang Sept. 3, 2024, 12:44 p.m. UTC | #9
On 2024/9/2 15:43, Pierre Gondois wrote:
> Hello Yicong
> 
> On 8/30/24 11:35, Yicong Yang wrote:
>> On 2024/8/29 20:46, Pierre Gondois wrote:
>>> Hello Yicong,
>>>
>>> On 8/29/24 09:40, Yicong Yang wrote:
>>>> Hi Pierre,
>>>>
>>>> On 2024/8/27 23:40, Pierre Gondois wrote:
>>>>> Hello Yicong,
>>>>> IIUC we are looking for the maximum number of threads a CPU can have
>>>>> in the platform. But is is actually possible to have a platform with
>>>>> CPUs not having the same number of threads ?
>>>>>
>>>>
>>>> I was thinking of the heterogenous platforms, for example part of the
>>>> cores have SMT and others don't (I don't have any though, but there
>>>> should be some such platforms for arm64).
>>>>
>>>>> For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check
>>>>> that the number of threads is either 1 or cpu_smt_max_threads (as
>>>>> CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However
>>>>> a (hypothetical) platform with:
>>>>> - CPU0: 2 threads
>>>>> - CPU1: 4 threads
>>>>> should not be able to set the number of threads to 2, as
>>>>> 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()).
>>>>>
>>>>
>>>> It's a bit more complex. If we enable/disable the SMT using on/off control
>>>> then we won't have this problem. We'll online all the CPUs on enabling and
>>>> offline CPUs which is not primary thread and don't care about the thread
>>>> number of each core.
>>>>
>>>> Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC
>>>> and only enabled on powerpc. I think this interface is not enough to handle
>>>> the hypothetical we assumed, since it assumes the homogenous platform that
>>>> all the CPUs have the same thread number. But this should be fine for
>>>> the platforms with SMT(with same thread number) and non-SMT cores, since it do
>>>> indicates the real thread number of the SMT cores.
>>>>
>>>>> So if there is an assumption that all the CPUs have the same number of
>>>>> threads, it should be sufficient to count the number of threads for one
>>>>> CPU right ?
>>>>>
>>>>
>>>> Naturally and conveniently I may think use of the threads number of CPU0 (boot
>>>> cpu) in such a solution. But this will be wrong if CPU0 is non-SMT on a heterogenous
>>>> platform :(
>>>>
>>>>> In the other (unlikely) case (i.e. CPUs can have various number of threads),
>>>>> I think we should either:
>>>>> - use your method and check that all the CPUs have the same number of threads
>>>>> - get the number of threads for one CPU and check that all the CPUs are
>>>>>      identical using the ACPI_PPTT_ACPI_IDENTICAL tag
>>>>
>>>> I think this won't be simpler since we still need to iterate all the CPUs to see
>>>> if they have the same hetero_id (checked how we're using this ACPI tag in
>>>> arm_acpi_register_pmu_device()). We could already know if they're identical by
>>>> comparing the thread number and do the update if necessary.
>>>>
>>>>> - have a per-cpu cpu_smt_max_threads value ?
>>>>
>>>> This should be unnecessary in currently stage if there's no platforms
>>>> with several kind cores have different thread number like in your assumption
>>>> and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using
>>>> a global cpu_smt_max_threads to enable the SMT control should be enough.
>>>> Does it make sense?
>>>
>>> Yes, I agree with all the things you said:
>>> - the current smt/control interface cannot handle asymmetric SMT platforms
>>> - I am not aware if such platform exist so far
>>>
>>> I think it would still be good to check all the CPUs have the same number of
>>> threads. I tried to enable the SMT_NUM_THREADS_DYNAMIC Kconfig with the
>>> patch attached (and to check CPUs have the same number of threads). Feel free
>>> to take what you like/ignore what is inside the attached patch, or let me know
>>> if I should submit a part in a separate patchset,
>>>
>>
>> Checked the changes, we can make this series as the basic support and a separate
>> series of yours as a further support of SMT control on arm64:
>> - support thread_id on ACPI based arm64 platform
>> - support partial SMT control by enable CONFIG_SMT_NUM_THREADS_DYNAMIC
> 
> I'm not sure I fully understand what you mean. I can send patches to
> enable SMT_NUM_THREADS_DYNAMIC on top of a v6 of yours IIUC. I let you pick
> the changes that you estimate to make more sense in your serie (if that makes
> sense) ? Please let met know if that works for you (or not).
> 

yes it works for me :)

>>
>> some minor comments below.
>>
>>> ----------------------------
>>>
>>>      [RFC] arm64: topology: Enable CONFIG_SMT_NUM_THREADS_DYNAMIC
>>>          - On arm64 ACPI systems, change the thread_id assignment to have
>>>        increasing values starting from 0. This is already the case for DT
>>>        based systems. Doing so allows to uniformly identify the n-th
>>>        thread of a given CPU.
>>>      - Check that all CPUs have the same number of threads (for DT/ACPI)
>>>      - Enable CONFIG_SMT_NUM_THREADS_DYNAMIC
>>>          On a Tx2, with 256 CPUs, threads siblings being 0,32,64,96
>>>      for socket0 and 128 + (0,32,64,96) for socket1:
>>>      $ cd /sys/devices/system/cpu/smt/
>>>      $ cat ../online
>>>      0-255
>>>      $ echo 2 > control
>>>      $ cat ../online
>>>      0-63,128-191
>>>      $ echo 3 > control
>>>      $ cat ../online
>>>      0-95,128-223
>>>      $ echo on > control
>>>      $ cat ../online
>>>      0-255
>>>
>>
>> So it's a real SMT4 system, it does make sense to have this partial SMT control
>> support.
> 
> Yes right
> 
>>
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index bd3bc2f5e0ec..1d8521483065 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -239,6 +239,7 @@ config ARM64
>>>          select HAVE_GENERIC_VDSO
>>>          select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
>>>          select HOTPLUG_SMT if (SMP && HOTPLUG_CPU)
>>> +       select SMT_NUM_THREADS_DYNAMIC if HOTPLUG_SMT
>>>          select IRQ_DOMAIN
>>>          select IRQ_FORCED_THREADING
>>>          select KASAN_VMALLOC if KASAN
>>> diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
>>> index 0f6ef432fb84..7dd211f81687 100644
>>> --- a/arch/arm64/include/asm/topology.h
>>> +++ b/arch/arm64/include/asm/topology.h
>>> @@ -39,6 +39,14 @@ void update_freq_counters_refs(void);
>>>   #define arch_scale_hw_pressure topology_get_hw_pressure
>>>   #define arch_update_hw_pressure        topology_update_hw_pressure
>>>   +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
>>> +#include <linux/cpu_smt.h>
>>> +static inline bool topology_smt_thread_allowed(unsigned int cpu)
>>> +{
>>> +       return topology_thread_id(cpu) < cpu_smt_num_threads;
>>> +}
>>> +#endif
>>> +
>>>   #include <asm-generic/topology.h>
>>>     #endif /* _ASM_ARM_TOPOLOGY_H */
>>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>>> index f72e1e55b05e..a83babe19972 100644
>>> --- a/arch/arm64/kernel/topology.c
>>> +++ b/arch/arm64/kernel/topology.c
>>> @@ -47,7 +47,9 @@ int __init parse_acpi_topology(void)
>>>   {
>>>          int thread_num, max_smt_thread_num = 1;
>>>          struct xarray core_threads;
>>> +       bool have_thread = false;
>>>          int cpu, topology_id;
>>> +       unsigned long i;
>>>          void *entry;
>>>            if (acpi_disabled)
>>> @@ -61,6 +63,8 @@ int __init parse_acpi_topology(void)
>>>                          return topology_id;
>>>                    if (acpi_cpu_is_threaded(cpu)) {
>>> +                       have_thread = true;
>>> +
>>>                          cpu_topology[cpu].thread_id = topology_id;
>>>                          topology_id = find_acpi_cpu_topology(cpu, 1);
>>>                          cpu_topology[cpu].core_id   = topology_id;
>>> @@ -69,9 +73,10 @@ int __init parse_acpi_topology(void)
>>>                          if (!entry) {
>>>                                  xa_store(&core_threads, topology_id,
>>>                                           xa_mk_value(1), GFP_KERNEL);
>>> +                               cpu_topology[cpu].thread_id = 0;
>>>                          } else {
>>>                                  thread_num = xa_to_value(entry);
>>> -                               thread_num++;
>>> +                               cpu_topology[cpu].thread_id = thread_num++;
>>>                                  xa_store(&core_threads, topology_id,
>>>                                           xa_mk_value(thread_num), GFP_KERNEL);
>>>   @@ -86,8 +91,17 @@ int __init parse_acpi_topology(void)
>>>                  cpu_topology[cpu].cluster_id = topology_id;
>>>                  topology_id = find_acpi_cpu_topology_package(cpu);
>>>                  cpu_topology[cpu].package_id = topology_id;
>>> +
>>> +               pr_debug("CPU%u: package=0x%x cluster=0x%x core=0x%x thread=0x%x\n",
>>> +                       cpu, cpu_topology[cpu].package_id, cpu_topology[cpu].cluster_id,
>>> +                       cpu_topology[cpu].core_id, cpu_topology[cpu].thread_id);
>>>          }
>>>   +       if (have_thread)
>>
>> we could know this from max_smt_thread_num so have_thread maybe unnecessary.
> 
> Yes right, I will change that.
> 
>>
>>> +               xa_for_each(&core_threads, i, entry)
>>> +                       if (xa_to_value(entry) != max_smt_thread_num)
>>> +                               pr_warn("Heterogeneous SMT topology not handled");\
>>
>> Wondering if we can avoid this 2nd loop. Greg express the worries of looping twice on large scale
>> system in v1. Maybe we could use the hetero_id and get the necessary information in one loop, I need
>> further think.
> 
> I found this comments (not sure this is what you are refering to):
> - https://lore.kernel.org/linux-arm-kernel/20231011103303.00002d8f@Huawei.com/
> - https://lore.kernel.org/all/20230921150333.c2zqigs3xxwcg4ln@bogus/T/#m406c4c16871ca7ae431beb20feccfb5e14498452
> 
> I don't see another way to do it right now. Also, I thing the complexity is in
> O(2n), which should be better than the original O(n**2),
> 

yes it's less complex. I'm wondering build up the xarray in another way then we can avoid the
long loops. What about below:

From 5ff5d0100435982764cd85566a6fe006e60ee98e Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Fri, 20 Oct 2023 15:38:38 +0800
Subject: [PATCH] arm64: topology: Support SMT control on ACPI based system

For ACPI we'll build the topology from PPTT and we cannot directly
get the SMT number of each core. Instead using a temporary xarray
to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL)
and SMT information of the first core in its heterogeneous CPU cluster
when building the topology. Then we can know the largest SMT number
in the system. Warn if heterogeneous SMT topology exists (multiple
heterogeneous CPU clusters with different SMT thread number) since the
SMT control cannot handle this well. Then enable the support of SMT
control.

Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
---
 arch/arm64/kernel/topology.c | 60 ++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 1a2c72f3e7f8..f6ec30fae70e 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -15,8 +15,10 @@
 #include <linux/arch_topology.h>
 #include <linux/cacheinfo.h>
 #include <linux/cpufreq.h>
+#include <linux/cpu_smt.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
+#include <linux/xarray.h>

 #include <asm/cpu.h>
 #include <asm/cputype.h>
@@ -37,17 +39,29 @@ static bool __init acpi_cpu_is_threaded(int cpu)
 	return !!is_threaded;
 }

+struct cpu_smt_info {
+	int thread_num;
+	int core_id;
+	int cpu;
+};
+
 /*
  * Propagate the topology information of the processor_topology_node tree to the
  * cpu_topology array.
  */
 int __init parse_acpi_topology(void)
 {
+	int max_smt_thread_num = 1;
+	struct cpu_smt_info *entry;
+	struct xarray hetero_cpu;
+	unsigned long hetero_id;
 	int cpu, topology_id;

 	if (acpi_disabled)
 		return 0;

+	xa_init(&hetero_cpu);
+
 	for_each_possible_cpu(cpu) {
 		topology_id = find_acpi_cpu_topology(cpu, 0);
 		if (topology_id < 0)
@@ -57,6 +71,30 @@ int __init parse_acpi_topology(void)
 			cpu_topology[cpu].thread_id = topology_id;
 			topology_id = find_acpi_cpu_topology(cpu, 1);
 			cpu_topology[cpu].core_id   = topology_id;
+
+			/*
+			 * Build up the XArray using the heterogeneous ID of
+			 * the CPU cluster. Store the CPU and SMT information
+			 * of the first appeared CPU in the CPU cluster of this
+			 * heterogeneous ID since the SMT information should be
+			 * the same in this CPU cluster. Then we can know the
+			 * SMT information of each heterogeneous CPUs in the
+			 * system.
+			 */
+			hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
+			entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id);
+			if (!entry) {
+				entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+				WARN_ON(!entry);
+
+				entry->cpu = cpu;
+				entry->core_id = topology_id;
+				entry->thread_num = 1;
+				xa_store(&hetero_cpu, hetero_id,
+					 entry, GFP_KERNEL);
+			} else if (entry->core_id == topology_id) {
+				entry->thread_num++;
+			}
 		} else {
 			cpu_topology[cpu].thread_id  = -1;
 			cpu_topology[cpu].core_id    = topology_id;
@@ -67,6 +105,28 @@ int __init parse_acpi_topology(void)
 		cpu_topology[cpu].package_id = topology_id;
 	}

+	/*
+	 * This should be a short loop depending on the number of heterogeneous
+	 * CPU clusters. Typically on a homogeneous system there's only one
+	 * entry in the XArray.
+	 */
+	xa_for_each(&hetero_cpu, hetero_id, entry) {
+		if (entry->thread_num == 1)
+			continue;
+
+		if (entry->thread_num != max_smt_thread_num &&
+		    max_smt_thread_num != 1)
+			pr_warn("Heterogeneous SMT topology not handled");
+
+		if (entry->thread_num > max_smt_thread_num)
+			max_smt_thread_num = entry->thread_num;
+
+		xa_erase(&hetero_cpu, hetero_id);
+		kfree(entry);
+	}
+
+	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
+	xa_destroy(&hetero_cpu);
 	return 0;
 }
 #endif
Pierre Gondois Sept. 5, 2024, 8:34 a.m. UTC | #10
Hello Yicong,

>>> Wondering if we can avoid this 2nd loop. Greg express the worries of looping twice on large scale
>>> system in v1. Maybe we could use the hetero_id and get the necessary information in one loop, I need
>>> further think.
>>
>> I found this comments (not sure this is what you are refering to):
>> - https://lore.kernel.org/linux-arm-kernel/20231011103303.00002d8f@Huawei.com/
>> - https://lore.kernel.org/all/20230921150333.c2zqigs3xxwcg4ln@bogus/T/#m406c4c16871ca7ae431beb20feccfb5e14498452
>>
>> I don't see another way to do it right now. Also, I thing the complexity is in
>> O(2n), which should be better than the original O(n**2),
>>
> 
> yes it's less complex. I'm wondering build up the xarray in another way then we can avoid the
> long loops. What about below:

I tried the patch on a ThunderX2 with 4 threads per CPU. PPTT topology describes
2 clusters of 128 cores each. Each cluster is described as independent (i.e. there
is no root node in the PPTT). The PPTT is of revision 1, so the IDENTICAL
flag is not available on the platform.

I also tried it on a faked SMT asymmetric platform and there might be a small
correction required.

> 
>  From 5ff5d0100435982764cd85566a6fe006e60ee98e Mon Sep 17 00:00:00 2001
> From: Yicong Yang <yangyicong@hisilicon.com>
> Date: Fri, 20 Oct 2023 15:38:38 +0800
> Subject: [PATCH] arm64: topology: Support SMT control on ACPI based system
> 
> For ACPI we'll build the topology from PPTT and we cannot directly
> get the SMT number of each core. Instead using a temporary xarray
> to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL)
> and SMT information of the first core in its heterogeneous CPU cluster
> when building the topology. Then we can know the largest SMT number
> in the system. Warn if heterogeneous SMT topology exists (multiple
> heterogeneous CPU clusters with different SMT thread number) since the
> SMT control cannot handle this well. Then enable the support of SMT
> control.
> 
> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
> ---
>   arch/arm64/kernel/topology.c | 60 ++++++++++++++++++++++++++++++++++++
>   1 file changed, 60 insertions(+)
> 
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 1a2c72f3e7f8..f6ec30fae70e 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -15,8 +15,10 @@
>   #include <linux/arch_topology.h>
>   #include <linux/cacheinfo.h>
>   #include <linux/cpufreq.h>
> +#include <linux/cpu_smt.h>
>   #include <linux/init.h>
>   #include <linux/percpu.h>
> +#include <linux/xarray.h>
> 
>   #include <asm/cpu.h>
>   #include <asm/cputype.h>
> @@ -37,17 +39,29 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>   	return !!is_threaded;
>   }
> 
> +struct cpu_smt_info {
> +	int thread_num;
> +	int core_id;
> +	int cpu;
> +};
> +
>   /*
>    * Propagate the topology information of the processor_topology_node tree to the
>    * cpu_topology array.
>    */
>   int __init parse_acpi_topology(void)
>   {
> +	int max_smt_thread_num = 1;
> +	struct cpu_smt_info *entry;
> +	struct xarray hetero_cpu;
> +	unsigned long hetero_id;
>   	int cpu, topology_id;
> 
>   	if (acpi_disabled)
>   		return 0;
> 
> +	xa_init(&hetero_cpu);
> +
>   	for_each_possible_cpu(cpu) {
>   		topology_id = find_acpi_cpu_topology(cpu, 0);
>   		if (topology_id < 0)
> @@ -57,6 +71,30 @@ int __init parse_acpi_topology(void)
>   			cpu_topology[cpu].thread_id = topology_id;
>   			topology_id = find_acpi_cpu_topology(cpu, 1);
>   			cpu_topology[cpu].core_id   = topology_id;
> +
> +			/*
> +			 * Build up the XArray using the heterogeneous ID of
> +			 * the CPU cluster. Store the CPU and SMT information
> +			 * of the first appeared CPU in the CPU cluster of this
> +			 * heterogeneous ID since the SMT information should be
> +			 * the same in this CPU cluster. Then we can know the
> +			 * SMT information of each heterogeneous CPUs in the
> +			 * system.
> +			 */
> +			hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
> +			entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id);
> +			if (!entry) {
> +				entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> +				WARN_ON(!entry);
> +
> +				entry->cpu = cpu;
> +				entry->core_id = topology_id;
> +				entry->thread_num = 1;
> +				xa_store(&hetero_cpu, hetero_id,
> +					 entry, GFP_KERNEL);
> +			} else if (entry->core_id == topology_id) {
> +				entry->thread_num++;
> +			}
>   		} else {
>   			cpu_topology[cpu].thread_id  = -1;
>   			cpu_topology[cpu].core_id    = topology_id;
> @@ -67,6 +105,28 @@ int __init parse_acpi_topology(void)
>   		cpu_topology[cpu].package_id = topology_id;
>   	}
> 
> +	/*
> +	 * This should be a short loop depending on the number of heterogeneous
> +	 * CPU clusters. Typically on a homogeneous system there's only one
> +	 * entry in the XArray.
> +	 */
> +	xa_for_each(&hetero_cpu, hetero_id, entry) {
> +		if (entry->thread_num == 1)
> +			continue;

If a platform has CPUs with:
- 1 thread
- X (!= 1) threads
Then I think that the asymmetry is not detected

> +
> +		if (entry->thread_num != max_smt_thread_num &&
> +		    max_smt_thread_num != 1)
> +			pr_warn("Heterogeneous SMT topology not handled");
> +
> +		if (entry->thread_num > max_smt_thread_num)
> +			max_smt_thread_num = entry->thread_num;
> +
> +		xa_erase(&hetero_cpu, hetero_id);
> +		kfree(entry);
> +	}
> +
> +	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
> +	xa_destroy(&hetero_cpu);
>   	return 0;
>   }
>   #endif
Yicong Yang Sept. 5, 2024, 12:02 p.m. UTC | #11
On 2024/9/5 16:34, Pierre Gondois wrote:
> Hello Yicong,
> 
>>>> Wondering if we can avoid this 2nd loop. Greg express the worries of looping twice on large scale
>>>> system in v1. Maybe we could use the hetero_id and get the necessary information in one loop, I need
>>>> further think.
>>>
>>> I found this comments (not sure this is what you are refering to):
>>> - https://lore.kernel.org/linux-arm-kernel/20231011103303.00002d8f@Huawei.com/
>>> - https://lore.kernel.org/all/20230921150333.c2zqigs3xxwcg4ln@bogus/T/#m406c4c16871ca7ae431beb20feccfb5e14498452
>>>
>>> I don't see another way to do it right now. Also, I thing the complexity is in
>>> O(2n), which should be better than the original O(n**2),
>>>
>>
>> yes it's less complex. I'm wondering build up the xarray in another way then we can avoid the
>> long loops. What about below:
> 
> I tried the patch on a ThunderX2 with 4 threads per CPU. PPTT topology describes
> 2 clusters of 128 cores each. Each cluster is described as independent (i.e. there
> is no root node in the PPTT). The PPTT is of revision 1, so the IDENTICAL
> flag is not available on the platform.
> 

I think the SMT detection should work on this case, does it? In this case there should be
2 entries for each cluster respectively in the hetero_cpu array. Since the thread number
are the same, we'll detect a symmetric SMT topology.

> I also tried it on a faked SMT asymmetric platform and there might be a small
> correction required.
> 
>>
>>  From 5ff5d0100435982764cd85566a6fe006e60ee98e Mon Sep 17 00:00:00 2001
>> From: Yicong Yang <yangyicong@hisilicon.com>
>> Date: Fri, 20 Oct 2023 15:38:38 +0800
>> Subject: [PATCH] arm64: topology: Support SMT control on ACPI based system
>>
>> For ACPI we'll build the topology from PPTT and we cannot directly
>> get the SMT number of each core. Instead using a temporary xarray
>> to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL)
>> and SMT information of the first core in its heterogeneous CPU cluster
>> when building the topology. Then we can know the largest SMT number
>> in the system. Warn if heterogeneous SMT topology exists (multiple
>> heterogeneous CPU clusters with different SMT thread number) since the
>> SMT control cannot handle this well. Then enable the support of SMT
>> control.
>>
>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>> ---
>>   arch/arm64/kernel/topology.c | 60 ++++++++++++++++++++++++++++++++++++
>>   1 file changed, 60 insertions(+)
>>
>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>> index 1a2c72f3e7f8..f6ec30fae70e 100644
>> --- a/arch/arm64/kernel/topology.c
>> +++ b/arch/arm64/kernel/topology.c
>> @@ -15,8 +15,10 @@
>>   #include <linux/arch_topology.h>
>>   #include <linux/cacheinfo.h>
>>   #include <linux/cpufreq.h>
>> +#include <linux/cpu_smt.h>
>>   #include <linux/init.h>
>>   #include <linux/percpu.h>
>> +#include <linux/xarray.h>
>>
>>   #include <asm/cpu.h>
>>   #include <asm/cputype.h>
>> @@ -37,17 +39,29 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>>       return !!is_threaded;
>>   }
>>
>> +struct cpu_smt_info {
>> +    int thread_num;
>> +    int core_id;
>> +    int cpu;
>> +};
>> +
>>   /*
>>    * Propagate the topology information of the processor_topology_node tree to the
>>    * cpu_topology array.
>>    */
>>   int __init parse_acpi_topology(void)
>>   {
>> +    int max_smt_thread_num = 1;
>> +    struct cpu_smt_info *entry;
>> +    struct xarray hetero_cpu;
>> +    unsigned long hetero_id;
>>       int cpu, topology_id;
>>
>>       if (acpi_disabled)
>>           return 0;
>>
>> +    xa_init(&hetero_cpu);
>> +
>>       for_each_possible_cpu(cpu) {
>>           topology_id = find_acpi_cpu_topology(cpu, 0);
>>           if (topology_id < 0)
>> @@ -57,6 +71,30 @@ int __init parse_acpi_topology(void)
>>               cpu_topology[cpu].thread_id = topology_id;
>>               topology_id = find_acpi_cpu_topology(cpu, 1);
>>               cpu_topology[cpu].core_id   = topology_id;
>> +
>> +            /*
>> +             * Build up the XArray using the heterogeneous ID of
>> +             * the CPU cluster. Store the CPU and SMT information
>> +             * of the first appeared CPU in the CPU cluster of this
>> +             * heterogeneous ID since the SMT information should be
>> +             * the same in this CPU cluster. Then we can know the
>> +             * SMT information of each heterogeneous CPUs in the
>> +             * system.
>> +             */
>> +            hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
>> +            entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id);
>> +            if (!entry) {
>> +                entry = kzalloc(sizeof(*entry), GFP_KERNEL);
>> +                WARN_ON(!entry);
>> +
>> +                entry->cpu = cpu;
>> +                entry->core_id = topology_id;
>> +                entry->thread_num = 1;
>> +                xa_store(&hetero_cpu, hetero_id,
>> +                     entry, GFP_KERNEL);
>> +            } else if (entry->core_id == topology_id) {
>> +                entry->thread_num++;
>> +            }
>>           } else {
>>               cpu_topology[cpu].thread_id  = -1;
>>               cpu_topology[cpu].core_id    = topology_id;
>> @@ -67,6 +105,28 @@ int __init parse_acpi_topology(void)
>>           cpu_topology[cpu].package_id = topology_id;
>>       }
>>
>> +    /*
>> +     * This should be a short loop depending on the number of heterogeneous
>> +     * CPU clusters. Typically on a homogeneous system there's only one
>> +     * entry in the XArray.
>> +     */
>> +    xa_for_each(&hetero_cpu, hetero_id, entry) {
>> +        if (entry->thread_num == 1)
>> +            continue;
> 
> If a platform has CPUs with:
> - 1 thread
> - X (!= 1) threads
> Then I think that the asymmetry is not detected

Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the
system. For this case I was thinking we don't need to handle this since there's only one kind
of SMT core in the system, control should works fine for the SMT CPU clusters and we may not
care about the CPUs with no SMT.

Below code should handle the case if we initialize the max_smt_thread_num to 0. I also
reword the warning messages to match the fact. For heterogeneous SMT topology we still
support control SMT by on/off toggle but not fully support setting the thread number.

	int max_smt_thread_num = 0;
	[...]
	/*
	 * This should be a short loop depending on the number of heterogeneous
	 * CPU clusters. Typically on a homogeneous system there's only one
	 * entry in the XArray.
	 */
	xa_for_each(&hetero_cpu, hetero_id, entry) {
		/*
		 * If max_smt_thread_num has been initialized and doesn't match
		 * the thread number of this entry, then the system has
		 * heterogeneous SMT topology.
		 */
		if (entry->thread_num != max_smt_thread_num && max_smt_thread_num)
			pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n");

		if (entry->thread_num > max_smt_thread_num)
			max_smt_thread_num = entry->thread_num;

		xa_erase(&hetero_cpu, hetero_id);
		kfree(entry);
	}

Thanks.

> 
>> +
>> +        if (entry->thread_num != max_smt_thread_num &&
>> +            max_smt_thread_num != 1)
>> +            pr_warn("Heterogeneous SMT topology not handled");
>> +
>> +        if (entry->thread_num > max_smt_thread_num)
>> +            max_smt_thread_num = entry->thread_num;
>> +
>> +        xa_erase(&hetero_cpu, hetero_id);
>> +        kfree(entry);
>> +    }
>> +
>> +    cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>> +    xa_destroy(&hetero_cpu);
>>       return 0;
>>   }
>>   #endif
> 
> .
Morten Rasmussen Sept. 6, 2024, 7:06 a.m. UTC | #12
Hi Yicong,

On Thu, Sep 05, 2024 at 08:02:20PM +0800, Yicong Yang wrote:
> On 2024/9/5 16:34, Pierre Gondois wrote:
> > Hello Yicong,
> > 
> > If a platform has CPUs with:
> > - 1 thread
> > - X (!= 1) threads
> > Then I think that the asymmetry is not detected
> 
> Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the
> system. For this case I was thinking we don't need to handle this since there's only one kind
> of SMT core in the system, control should works fine for the SMT CPU clusters and we may not
> care about the CPUs with no SMT.
> 
> Below code should handle the case if we initialize the max_smt_thread_num to 0. I also
> reword the warning messages to match the fact. For heterogeneous SMT topology we still
> support control SMT by on/off toggle but not fully support setting the thread number.
> 
> 	int max_smt_thread_num = 0;
> 	[...]
> 	/*
> 	 * This should be a short loop depending on the number of heterogeneous
> 	 * CPU clusters. Typically on a homogeneous system there's only one
> 	 * entry in the XArray.
> 	 */
> 	xa_for_each(&hetero_cpu, hetero_id, entry) {
> 		/*
> 		 * If max_smt_thread_num has been initialized and doesn't match
> 		 * the thread number of this entry, then the system has
> 		 * heterogeneous SMT topology.
> 		 */
> 		if (entry->thread_num != max_smt_thread_num && max_smt_thread_num)
> 			pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n");

What does 'partly supported' mean here?

If the SMT control doesn't work as intended for this topology, I don't
think it should be enabled for it.

Morten
Yicong Yang Sept. 6, 2024, 8:36 a.m. UTC | #13
On 2024/9/6 15:06, Morten Rasmussen wrote:
> Hi Yicong,
> 
> On Thu, Sep 05, 2024 at 08:02:20PM +0800, Yicong Yang wrote:
>> On 2024/9/5 16:34, Pierre Gondois wrote:
>>> Hello Yicong,
>>>
>>> If a platform has CPUs with:
>>> - 1 thread
>>> - X (!= 1) threads
>>> Then I think that the asymmetry is not detected
>>
>> Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the
>> system. For this case I was thinking we don't need to handle this since there's only one kind
>> of SMT core in the system, control should works fine for the SMT CPU clusters and we may not
>> care about the CPUs with no SMT.
>>
>> Below code should handle the case if we initialize the max_smt_thread_num to 0. I also
>> reword the warning messages to match the fact. For heterogeneous SMT topology we still
>> support control SMT by on/off toggle but not fully support setting the thread number.
>>
>> 	int max_smt_thread_num = 0;
>> 	[...]
>> 	/*
>> 	 * This should be a short loop depending on the number of heterogeneous
>> 	 * CPU clusters. Typically on a homogeneous system there's only one
>> 	 * entry in the XArray.
>> 	 */
>> 	xa_for_each(&hetero_cpu, hetero_id, entry) {
>> 		/*
>> 		 * If max_smt_thread_num has been initialized and doesn't match
>> 		 * the thread number of this entry, then the system has
>> 		 * heterogeneous SMT topology.
>> 		 */
>> 		if (entry->thread_num != max_smt_thread_num && max_smt_thread_num)
>> 			pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n");
> 
> What does 'partly supported' mean here?
> 
> If the SMT control doesn't work as intended for this topology, I don't
> think it should be enabled for it.
> 

The /sys/devices/system/cpu/smt/control supports 2 kind of controls [1]
(a) enable/disable SMT entirely by writing on/off
(b) enable/disable SMT partially by writing a valid thread number (CONFIG_SMT_NUM_THREADS_DYNAMIC)

We assume 3 kind of SMT topology:
1. homogeneous SMT topology, all the CPUs support SMT or not
2.1 heterogeneous SMT topology, part of CPU clusters have SMT and others not. Clusters support SMT
    have the same SMT thread number
2.2 heterogeneous SMT topology, part of CPU clusters have SMT and the thread number may differs
    (e.g. cluster 1 is of SMT 2 and cluster 2 is of SMT 4. not sure there's such a real system)

For any of above SMT topology, control (a) should work as expected. When enabling SMT by writing "on"
the SMT is disabled for those CPU clusters who have SMT. Same for disabling SMT by writing "off".
But control (b) may not work for case 2.2 since the SMT thread number is not the same across
the system.

For this series alone we won't met this since CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled.
So control (b) only supports write 1/max_thread which behaves same as write off/on and will
work as intended for all the 3 cases. As discussed Pierre will add support for
 CONFIG_SMT_NUM_THREADS_DYNAMIC since thunderX2 is a symmetric SMT4 machine and
CONFIG_SMT_NUM_THREADS_DYNAMIC would be useful. We thought a warning should be useful
if running on a system of case 2.2.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/ABI/testing/sysfs-devices-system-cpu#n542

Thanks.
Morten Rasmussen Sept. 12, 2024, 11:59 a.m. UTC | #14
On Fri, Sep 06, 2024 at 04:36:30PM +0800, Yicong Yang wrote:
> On 2024/9/6 15:06, Morten Rasmussen wrote:
> > Hi Yicong,
> > 
> > On Thu, Sep 05, 2024 at 08:02:20PM +0800, Yicong Yang wrote:
> >> On 2024/9/5 16:34, Pierre Gondois wrote:
> >>> Hello Yicong,
> >>>
> >>> If a platform has CPUs with:
> >>> - 1 thread
> >>> - X (!= 1) threads
> >>> Then I think that the asymmetry is not detected
> >>
> >> Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the
> >> system. For this case I was thinking we don't need to handle this since there's only one kind
> >> of SMT core in the system, control should works fine for the SMT CPU clusters and we may not
> >> care about the CPUs with no SMT.
> >>
> >> Below code should handle the case if we initialize the max_smt_thread_num to 0. I also
> >> reword the warning messages to match the fact. For heterogeneous SMT topology we still
> >> support control SMT by on/off toggle but not fully support setting the thread number.
> >>
> >> 	int max_smt_thread_num = 0;
> >> 	[...]
> >> 	/*
> >> 	 * This should be a short loop depending on the number of heterogeneous
> >> 	 * CPU clusters. Typically on a homogeneous system there's only one
> >> 	 * entry in the XArray.
> >> 	 */
> >> 	xa_for_each(&hetero_cpu, hetero_id, entry) {
> >> 		/*
> >> 		 * If max_smt_thread_num has been initialized and doesn't match
> >> 		 * the thread number of this entry, then the system has
> >> 		 * heterogeneous SMT topology.
> >> 		 */
> >> 		if (entry->thread_num != max_smt_thread_num && max_smt_thread_num)
> >> 			pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n");
> > 
> > What does 'partly supported' mean here?
> > 
> > If the SMT control doesn't work as intended for this topology, I don't
> > think it should be enabled for it.
> > 
> 
> The /sys/devices/system/cpu/smt/control supports 2 kind of controls [1]
> (a) enable/disable SMT entirely by writing on/off
> (b) enable/disable SMT partially by writing a valid thread number (CONFIG_SMT_NUM_THREADS_DYNAMIC)
> 
> We assume 3 kind of SMT topology:
> 1. homogeneous SMT topology, all the CPUs support SMT or not
> 2.1 heterogeneous SMT topology, part of CPU clusters have SMT and others not. Clusters support SMT
>     have the same SMT thread number
> 2.2 heterogeneous SMT topology, part of CPU clusters have SMT and the thread number may differs
>     (e.g. cluster 1 is of SMT 2 and cluster 2 is of SMT 4. not sure there's such a real system)
> 
> For any of above SMT topology, control (a) should work as expected. When enabling SMT by writing "on"
> the SMT is disabled for those CPU clusters who have SMT. Same for disabling SMT by writing "off".
> But control (b) may not work for case 2.2 since the SMT thread number is not the same across
> the system.
> 
> For this series alone we won't met this since CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled.
> So control (b) only supports write 1/max_thread which behaves same as write off/on and will
> work as intended for all the 3 cases. As discussed Pierre will add support for
>  CONFIG_SMT_NUM_THREADS_DYNAMIC since thunderX2 is a symmetric SMT4 machine and
> CONFIG_SMT_NUM_THREADS_DYNAMIC would be useful. We thought a warning should be useful
> if running on a system of case 2.2.

Thanks for explaining the situation.

So IIUC, for case 2.2 there will be _no_ failures if someone writes a
value different from 1 or max_threads?

The SMT control code can handle that max_threads isn't the correct
number of threads for all cores in the system?
Michal Suchanek Sept. 12, 2024, 12:53 p.m. UTC | #15
Hello,

On Thu, Sep 12, 2024 at 01:59:16PM +0200, Morten Rasmussen wrote:
> On Fri, Sep 06, 2024 at 04:36:30PM +0800, Yicong Yang wrote:
> > On 2024/9/6 15:06, Morten Rasmussen wrote:
> > > Hi Yicong,
> > > 
> > > On Thu, Sep 05, 2024 at 08:02:20PM +0800, Yicong Yang wrote:
> > >> On 2024/9/5 16:34, Pierre Gondois wrote:
> > >>> Hello Yicong,
> > >>>
> > >>> If a platform has CPUs with:
> > >>> - 1 thread
> > >>> - X (!= 1) threads
> > >>> Then I think that the asymmetry is not detected
> > >>
> > >> Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the
> > >> system. For this case I was thinking we don't need to handle this since there's only one kind
> > >> of SMT core in the system, control should works fine for the SMT CPU clusters and we may not
> > >> care about the CPUs with no SMT.
> > >>
> > >> Below code should handle the case if we initialize the max_smt_thread_num to 0. I also
> > >> reword the warning messages to match the fact. For heterogeneous SMT topology we still
> > >> support control SMT by on/off toggle but not fully support setting the thread number.
> > >>
> > >> 	int max_smt_thread_num = 0;
> > >> 	[...]
> > >> 	/*
> > >> 	 * This should be a short loop depending on the number of heterogeneous
> > >> 	 * CPU clusters. Typically on a homogeneous system there's only one
> > >> 	 * entry in the XArray.
> > >> 	 */
> > >> 	xa_for_each(&hetero_cpu, hetero_id, entry) {
> > >> 		/*
> > >> 		 * If max_smt_thread_num has been initialized and doesn't match
> > >> 		 * the thread number of this entry, then the system has
> > >> 		 * heterogeneous SMT topology.
> > >> 		 */
> > >> 		if (entry->thread_num != max_smt_thread_num && max_smt_thread_num)
> > >> 			pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n");
> > > 
> > > What does 'partly supported' mean here?
> > > 
> > > If the SMT control doesn't work as intended for this topology, I don't
> > > think it should be enabled for it.
> > > 
> > 
> > The /sys/devices/system/cpu/smt/control supports 2 kind of controls [1]
> > (a) enable/disable SMT entirely by writing on/off
> > (b) enable/disable SMT partially by writing a valid thread number (CONFIG_SMT_NUM_THREADS_DYNAMIC)
> > 
> > We assume 3 kind of SMT topology:
> > 1. homogeneous SMT topology, all the CPUs support SMT or not
> > 2.1 heterogeneous SMT topology, part of CPU clusters have SMT and others not. Clusters support SMT
> >     have the same SMT thread number
> > 2.2 heterogeneous SMT topology, part of CPU clusters have SMT and the thread number may differs
> >     (e.g. cluster 1 is of SMT 2 and cluster 2 is of SMT 4. not sure there's such a real system)
> > 
> > For any of above SMT topology, control (a) should work as expected. When enabling SMT by writing "on"
> > the SMT is disabled for those CPU clusters who have SMT. Same for disabling SMT by writing "off".
> > But control (b) may not work for case 2.2 since the SMT thread number is not the same across
> > the system.
> > 
> > For this series alone we won't met this since CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled.
> > So control (b) only supports write 1/max_thread which behaves same as write off/on and will
> > work as intended for all the 3 cases. As discussed Pierre will add support for
> >  CONFIG_SMT_NUM_THREADS_DYNAMIC since thunderX2 is a symmetric SMT4 machine and
> > CONFIG_SMT_NUM_THREADS_DYNAMIC would be useful. We thought a warning should be useful
> > if running on a system of case 2.2.
> 
> Thanks for explaining the situation.
> 
> So IIUC, for case 2.2 there will be _no_ failures if someone writes a
> value different from 1 or max_threads?
> 
> The SMT control code can handle that max_threads isn't the correct
> number of threads for all cores in the system?

Hello,

I suppose a number can be interpreted as 'up to this number'. If that's
what the user wanted is another question, on a hypothetical heterogenous
system with different number of threads in different CPUs it is
questionalble what this achieves.

Arguably once such system is found and the desired configurations
undestood this can be expanded to handle them.

Thanks

Michal
diff mbox series

Patch

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 1a2c72f3e7f8..f72e1e55b05e 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -15,8 +15,10 @@ 
 #include <linux/arch_topology.h>
 #include <linux/cacheinfo.h>
 #include <linux/cpufreq.h>
+#include <linux/cpu_smt.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
+#include <linux/xarray.h>
 
 #include <asm/cpu.h>
 #include <asm/cputype.h>
@@ -43,11 +45,16 @@  static bool __init acpi_cpu_is_threaded(int cpu)
  */
 int __init parse_acpi_topology(void)
 {
+	int thread_num, max_smt_thread_num = 1;
+	struct xarray core_threads;
 	int cpu, topology_id;
+	void *entry;
 
 	if (acpi_disabled)
 		return 0;
 
+	xa_init(&core_threads);
+
 	for_each_possible_cpu(cpu) {
 		topology_id = find_acpi_cpu_topology(cpu, 0);
 		if (topology_id < 0)
@@ -57,6 +64,20 @@  int __init parse_acpi_topology(void)
 			cpu_topology[cpu].thread_id = topology_id;
 			topology_id = find_acpi_cpu_topology(cpu, 1);
 			cpu_topology[cpu].core_id   = topology_id;
+
+			entry = xa_load(&core_threads, topology_id);
+			if (!entry) {
+				xa_store(&core_threads, topology_id,
+					 xa_mk_value(1), GFP_KERNEL);
+			} else {
+				thread_num = xa_to_value(entry);
+				thread_num++;
+				xa_store(&core_threads, topology_id,
+					 xa_mk_value(thread_num), GFP_KERNEL);
+
+				if (thread_num > max_smt_thread_num)
+					max_smt_thread_num = thread_num;
+			}
 		} else {
 			cpu_topology[cpu].thread_id  = -1;
 			cpu_topology[cpu].core_id    = topology_id;
@@ -67,6 +88,9 @@  int __init parse_acpi_topology(void)
 		cpu_topology[cpu].package_id = topology_id;
 	}
 
+	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
+
+	xa_destroy(&core_threads);
 	return 0;
 }
 #endif