Message ID | 20240806085320.63514-4-yangyicong@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Support SMT control on arm64 | expand |
On 06/08/2024 10:53, Yicong Yang wrote: > From: Yicong Yang <yangyicong@hisilicon.com> > > For ACPI we'll build the topology from PPTT and we cannot directly > get the SMT number of each core. Instead using a temporary xarray > to record the SMT number of each core when building the topology > and we can know the largest SMT number in the system. Then we can > enable the support of SMT control. > > Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> > --- > arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++ > 1 file changed, 24 insertions(+) > > diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c > index 1a2c72f3e7f8..f72e1e55b05e 100644 > --- a/arch/arm64/kernel/topology.c > +++ b/arch/arm64/kernel/topology.c > @@ -15,8 +15,10 @@ > #include <linux/arch_topology.h> > #include <linux/cacheinfo.h> > #include <linux/cpufreq.h> > +#include <linux/cpu_smt.h> > #include <linux/init.h> > #include <linux/percpu.h> > +#include <linux/xarray.h> > > #include <asm/cpu.h> > #include <asm/cputype.h> > @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu) > */ > int __init parse_acpi_topology(void) > { > + int thread_num, max_smt_thread_num = 1; > + struct xarray core_threads; > int cpu, topology_id; > + void *entry; > > if (acpi_disabled) > return 0; > > + xa_init(&core_threads); > + > for_each_possible_cpu(cpu) { > topology_id = find_acpi_cpu_topology(cpu, 0); > if (topology_id < 0) > @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void) > cpu_topology[cpu].thread_id = topology_id; > topology_id = find_acpi_cpu_topology(cpu, 1); > cpu_topology[cpu].core_id = topology_id; > + > + entry = xa_load(&core_threads, topology_id); > + if (!entry) { > + xa_store(&core_threads, topology_id, > + xa_mk_value(1), GFP_KERNEL); > + } else { > + thread_num = xa_to_value(entry); > + thread_num++; > + xa_store(&core_threads, topology_id, > + xa_mk_value(thread_num), GFP_KERNEL); > + > + if (thread_num > max_smt_thread_num) > + max_smt_thread_num = 
thread_num; > + } So the xarray contains one element for each core_id with the information how often the core_id occurs? I assume you have to iterate over all possible CPUs since you don't know which logical CPUs belong to the same core_id. > } else { > cpu_topology[cpu].thread_id = -1; > cpu_topology[cpu].core_id = topology_id; > @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void) > cpu_topology[cpu].package_id = topology_id; > } > > + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); > + > + xa_destroy(&core_threads); > return 0; > } > #endif Tested on ThunderX2: $ cat /proc/schedstat | head -6 | tail -4 | awk '{ print $1, $2 }' cpu0 0 domain0 00000000,00000000,00000000,00000000,00000001,00000001,00000001,00000001 ^ ^ ^ ^ domain1 00000000,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,ffffffff domain2 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff detecting 'max_smt_thread_num = 4' correctly.
On 2024/8/16 23:55, Dietmar Eggemann wrote: > On 06/08/2024 10:53, Yicong Yang wrote: >> From: Yicong Yang <yangyicong@hisilicon.com> >> >> For ACPI we'll build the topology from PPTT and we cannot directly >> get the SMT number of each core. Instead using a temporary xarray >> to record the SMT number of each core when building the topology >> and we can know the largest SMT number in the system. Then we can >> enable the support of SMT control. >> >> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> >> --- >> arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++ >> 1 file changed, 24 insertions(+) >> >> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c >> index 1a2c72f3e7f8..f72e1e55b05e 100644 >> --- a/arch/arm64/kernel/topology.c >> +++ b/arch/arm64/kernel/topology.c >> @@ -15,8 +15,10 @@ >> #include <linux/arch_topology.h> >> #include <linux/cacheinfo.h> >> #include <linux/cpufreq.h> >> +#include <linux/cpu_smt.h> >> #include <linux/init.h> >> #include <linux/percpu.h> >> +#include <linux/xarray.h> >> >> #include <asm/cpu.h> >> #include <asm/cputype.h> >> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu) >> */ >> int __init parse_acpi_topology(void) >> { >> + int thread_num, max_smt_thread_num = 1; >> + struct xarray core_threads; >> int cpu, topology_id; >> + void *entry; >> >> if (acpi_disabled) >> return 0; >> >> + xa_init(&core_threads); >> + >> for_each_possible_cpu(cpu) { >> topology_id = find_acpi_cpu_topology(cpu, 0); >> if (topology_id < 0) >> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void) >> cpu_topology[cpu].thread_id = topology_id; >> topology_id = find_acpi_cpu_topology(cpu, 1); >> cpu_topology[cpu].core_id = topology_id; >> + >> + entry = xa_load(&core_threads, topology_id); >> + if (!entry) { >> + xa_store(&core_threads, topology_id, >> + xa_mk_value(1), GFP_KERNEL); >> + } else { >> + thread_num = xa_to_value(entry); >> + thread_num++; >> + xa_store(&core_threads, topology_id, >> + 
xa_mk_value(thread_num), GFP_KERNEL); >> + >> + if (thread_num > max_smt_thread_num) >> + max_smt_thread_num = thread_num; >> + } > > So the xarray contains one element for each core_id with the information > how often the core_id occurs? I assume you have to iterate over all > possible CPUs since you don't know which logical CPUs belong to the same > core_id. > Each xarray element counts the thread number of a certain core id, so the logic is as below: 1. if the "core id" entry doesn't exist, then we're accessing this core for the 1st time. Create one and set the thread number to 1. 2. otherwise increment the thread number of the "core id" this CPU belongs to (PPTT already told us which core this CPU belongs to). Update the max_smt_thread_num if necessary. Then we can know max_smt_thread_num while iterating the PPTT table and building the topology for all the possible CPUs. Otherwise we need to do a second scan for the max thread number after building the topology. This approach was implemented in v1 and drew complaints about the overhead on large scale systems since we need to loop over the CPUs twice. >> } else { >> cpu_topology[cpu].thread_id = -1; >> cpu_topology[cpu].core_id = topology_id; >> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void) >> cpu_topology[cpu].package_id = topology_id; >> } >> >> + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); >> + >> + xa_destroy(&core_threads); >> return 0; >> } >> #endif > > Tested on ThunderX2: > > $ cat /proc/schedstat | head -6 | tail -4 | awk '{ print $1, $2 }' > cpu0 0 > domain0 00000000,00000000,00000000,00000000,00000001,00000001,00000001,00000001 > ^ ^ ^ ^ > domain1 00000000,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,ffffffff > domain2 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff > > detecting 'max_smt_thread_num = 4' correctly. > Thanks for the testing. OK for a tag? Thanks.
On 19/08/2024 09:03, Yicong Yang wrote: > On 2024/8/16 23:55, Dietmar Eggemann wrote: >> On 06/08/2024 10:53, Yicong Yang wrote: >>> From: Yicong Yang <yangyicong@hisilicon.com> [...] >> So the xarray contains one element for each core_id with the information >> how often the core_id occurs? I assume you have to iterate over all >> possible CPUs since you don't know which logical CPUs belong to the same >> core_id. >> > > Each xarray element counts the thread number of a certain core id. so the logic is like below: > 1. if the "core id" entry doesn't exists, then we're accessing this core for the 1st time. create > one and make the thread number to 1 > 2. otherwise increment the thread number of "core id" this cpu belongs (PPTT already > told us which core this CPU belongs to). Update the max_smt_thread_num if necessary. > > Then we can know max_smt_thread_num by meanwhile iterating the PPTT table and > build the topology for all the possible CPUs. > > Otherwise we need to do a second scan for the max thread number after built the > topology. This way is implemented in v1 and it's complained about the overhead on large > scale systems since we need to loop the CPUs twice. OK. [...] >> Tested on ThunderX2: >> >> $ cat /proc/schedstat | head -6 | tail -4 | awk '{ print $1, $2 }' >> cpu0 0 >> domain0 00000000,00000000,00000000,00000000,00000001,00000001,00000001,00000001 >> ^ ^ ^ ^ >> domain1 00000000,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,ffffffff >> domain2 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff >> >> detecting 'max_smt_thread_num = 4' correctly. >> > > Thanks for the testing. ok for a tag? Yes, please go ahead. [...]
Hello Yicong, IIUC we are looking for the maximum number of threads a CPU can have in the platform. But is it actually possible to have a platform with CPUs not having the same number of threads ? For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check that the number of threads is either 1 or cpu_smt_max_threads (as CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However a (hypothetical) platform with: - CPU0: 2 threads - CPU1: 4 threads should not be able to set the number of threads to 2, as 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()). So if there is an assumption that all the CPUs have the same number of threads, it should be sufficient to count the number of threads for one CPU right ? In the other (unlikely) case (i.e. CPUs can have different numbers of threads), I think we should either: - use your method and check that all the CPUs have the same number of threads - get the number of threads for one CPU and check that all the CPUs are identical using the ACPI_PPTT_ACPI_IDENTICAL tag - have a per-cpu cpu_smt_max_threads value ? Same comment for the DT patch. If there is an assumption that all CPUs have the same number of threads, then update_smt_num_threads() could only be called once I suppose, Regards, Pierre On 8/6/24 10:53, Yicong Yang wrote: > From: Yicong Yang <yangyicong@hisilicon.com> > > For ACPI we'll build the topology from PPTT and we cannot directly > get the SMT number of each core. Instead using a temporary xarray > to record the SMT number of each core when building the topology > and we can know the largest SMT number in the system. Then we can > enable the support of SMT control. 
> > Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> > --- > arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++ > 1 file changed, 24 insertions(+) > > diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c > index 1a2c72f3e7f8..f72e1e55b05e 100644 > --- a/arch/arm64/kernel/topology.c > +++ b/arch/arm64/kernel/topology.c > @@ -15,8 +15,10 @@ > #include <linux/arch_topology.h> > #include <linux/cacheinfo.h> > #include <linux/cpufreq.h> > +#include <linux/cpu_smt.h> > #include <linux/init.h> > #include <linux/percpu.h> > +#include <linux/xarray.h> > > #include <asm/cpu.h> > #include <asm/cputype.h> > @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu) > */ > int __init parse_acpi_topology(void) > { > + int thread_num, max_smt_thread_num = 1; > + struct xarray core_threads; > int cpu, topology_id; > + void *entry; > > if (acpi_disabled) > return 0; > > + xa_init(&core_threads); > + > for_each_possible_cpu(cpu) { > topology_id = find_acpi_cpu_topology(cpu, 0); > if (topology_id < 0) > @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void) > cpu_topology[cpu].thread_id = topology_id; > topology_id = find_acpi_cpu_topology(cpu, 1); > cpu_topology[cpu].core_id = topology_id; > + > + entry = xa_load(&core_threads, topology_id); > + if (!entry) { > + xa_store(&core_threads, topology_id, > + xa_mk_value(1), GFP_KERNEL); > + } else { > + thread_num = xa_to_value(entry); > + thread_num++; > + xa_store(&core_threads, topology_id, > + xa_mk_value(thread_num), GFP_KERNEL); > + > + if (thread_num > max_smt_thread_num) > + max_smt_thread_num = thread_num; > + } > } else { > cpu_topology[cpu].thread_id = -1; > cpu_topology[cpu].core_id = topology_id; > @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void) > cpu_topology[cpu].package_id = topology_id; > } > > + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); > + > + xa_destroy(&core_threads); > return 0; > } > #endif
Hi Pierre, On 2024/8/27 23:40, Pierre Gondois wrote: > Hello Yicong, > IIUC we are looking for the maximum number of threads a CPU can have > in the platform. But is is actually possible to have a platform with > CPUs not having the same number of threads ? > I was thinking of heterogeneous platforms, for example where part of the cores have SMT and others don't (I don't have any though, but there should be some such platforms for arm64). > For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check > that the number of threads is either 1 or cpu_smt_max_threads (as > CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However > a (hypothetical) platform with: > - CPU0: 2 threads > - CPU1: 4 threads > should not be able to set the number of threads to 2, as > 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()). > It's a bit more complex. If we enable/disable the SMT using on/off control then we won't have this problem. We'll online all the CPUs on enabling and offline the CPUs which are not primary threads, without caring about the thread number of each core. Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC and only enabled on powerpc. I think this interface is not enough to handle the hypothetical platform we assumed, since it assumes a homogeneous platform where all the CPUs have the same thread number. But this should be fine for the platforms with SMT (with same thread number) and non-SMT cores, since it does indicate the real thread number of the SMT cores. > So if there is an assumption that all the CPUs have the same number of > threads, it should be sufficient to count the number of threads for one > CPU right ? > Naturally and conveniently I would think of using the thread number of CPU0 (boot cpu) in such a solution. But this will be wrong if CPU0 is non-SMT on a heterogeneous platform :( > In the other (unlikely) case (i.e. 
CPUs can have various number of threads), > I think we should either: > - use your method and check that all the CPUs have the same number of threads > - get the number of threads for one CPU and check that all the CPUs are > identical using the ACPI_PPTT_ACPI_IDENTICAL tag I think this won't be simpler since we still need to iterate all the CPUs to see if they have the same hetero_id (checked how we're using this ACPI tag in arm_acpi_register_pmu_device()). We could already know if they're identical by comparing the thread number and do the update if necessary. > - have a per-cpu cpu_smt_max_threads value ? This should be unnecessary in currently stage if there's no platforms with several kind cores have different thread number like in your assumption and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using a global cpu_smt_max_threads to enable the SMT control should be enough. Does it make sense? Thanks, Yicong > > Same comment for the DT patch. If there is an assumption that all CPUs have > the same number of threads, then update_smt_num_threads() could only be called > once I suppose, > > Regards, > Pierre > > > On 8/6/24 10:53, Yicong Yang wrote: >> From: Yicong Yang <yangyicong@hisilicon.com> >> >> For ACPI we'll build the topology from PPTT and we cannot directly >> get the SMT number of each core. Instead using a temporary xarray >> to record the SMT number of each core when building the topology >> and we can know the largest SMT number in the system. Then we can >> enable the support of SMT control. 
>> >> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> >> --- >> arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++ >> 1 file changed, 24 insertions(+) >> >> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c >> index 1a2c72f3e7f8..f72e1e55b05e 100644 >> --- a/arch/arm64/kernel/topology.c >> +++ b/arch/arm64/kernel/topology.c >> @@ -15,8 +15,10 @@ >> #include <linux/arch_topology.h> >> #include <linux/cacheinfo.h> >> #include <linux/cpufreq.h> >> +#include <linux/cpu_smt.h> >> #include <linux/init.h> >> #include <linux/percpu.h> >> +#include <linux/xarray.h> >> #include <asm/cpu.h> >> #include <asm/cputype.h> >> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu) >> */ >> int __init parse_acpi_topology(void) >> { >> + int thread_num, max_smt_thread_num = 1; >> + struct xarray core_threads; >> int cpu, topology_id; >> + void *entry; >> if (acpi_disabled) >> return 0; >> + xa_init(&core_threads); >> + >> for_each_possible_cpu(cpu) { >> topology_id = find_acpi_cpu_topology(cpu, 0); >> if (topology_id < 0) >> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void) >> cpu_topology[cpu].thread_id = topology_id; >> topology_id = find_acpi_cpu_topology(cpu, 1); >> cpu_topology[cpu].core_id = topology_id; >> + >> + entry = xa_load(&core_threads, topology_id); >> + if (!entry) { >> + xa_store(&core_threads, topology_id, >> + xa_mk_value(1), GFP_KERNEL); >> + } else { >> + thread_num = xa_to_value(entry); >> + thread_num++; >> + xa_store(&core_threads, topology_id, >> + xa_mk_value(thread_num), GFP_KERNEL); >> + >> + if (thread_num > max_smt_thread_num) >> + max_smt_thread_num = thread_num; >> + } >> } else { >> cpu_topology[cpu].thread_id = -1; >> cpu_topology[cpu].core_id = topology_id; >> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void) >> cpu_topology[cpu].package_id = topology_id; >> } >> + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); >> + >> + xa_destroy(&core_threads); >> return 0; >> } >> 
#endif > > .
Hello Yicong, On 8/29/24 09:40, Yicong Yang wrote: > Hi Pierre, > > On 2024/8/27 23:40, Pierre Gondois wrote: >> Hello Yicong, >> IIUC we are looking for the maximum number of threads a CPU can have >> in the platform. But is is actually possible to have a platform with >> CPUs not having the same number of threads ? >> > > I was thinking of the heterogenous platforms, for example part of the > cores have SMT and others don't (I don't have any though, but there > should be some such platforms for arm64). > >> For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check >> that the number of threads is either 1 or cpu_smt_max_threads (as >> CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However >> a (hypothetical) platform with: >> - CPU0: 2 threads >> - CPU1: 4 threads >> should not be able to set the number of threads to 2, as >> 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()). >> > > It's a bit more complex. If we enable/disable the SMT using on/off control > then we won't have this problem. We'll online all the CPUs on enabling and > offline CPUs which is not primary thread and don't care about the thread > number of each core. > > Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC > and only enabled on powerpc. I think this interface is not enough to handle > the hypothetical we assumed, since it assumes the homogenous platform that > all the CPUs have the same thread number. But this should be fine for > the platforms with SMT(with same thread number) and non-SMT cores, since it do > indicates the real thread number of the SMT cores. > >> So if there is an assumption that all the CPUs have the same number of >> threads, it should be sufficient to count the number of threads for one >> CPU right ? >> > > Naturally and conveniently I may think use of the threads number of CPU0 (boot > cpu) in such a solution. 
But this will be wrong if CPU0 is non-SMT on a heterogenous > platform :( > >> In the other (unlikely) case (i.e. CPUs can have various number of threads), >> I think we should either: >> - use your method and check that all the CPUs have the same number of threads >> - get the number of threads for one CPU and check that all the CPUs are >> identical using the ACPI_PPTT_ACPI_IDENTICAL tag > > I think this won't be simpler since we still need to iterate all the CPUs to see > if they have the same hetero_id (checked how we're using this ACPI tag in > arm_acpi_register_pmu_device()). We could already know if they're identical by > comparing the thread number and do the update if necessary. > >> - have a per-cpu cpu_smt_max_threads value ? > > This should be unnecessary in currently stage if there's no platforms > with several kind cores have different thread number like in your assumption > and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using > a global cpu_smt_max_threads to enable the SMT control should be enough. > Does it make sense? Yes, I agree with all the things you said: - the current smt/control interface cannot handle asymmetric SMT platforms - I am not aware if such platform exist so far I think it would still be good to check all the CPUs have the same number of threads. I tried to enable the SMT_NUM_THREADS_DYNAMIC Kconfig with the patch attached (and to check CPUs have the same number of threads). Feel free to take what you like/ignore what is inside the attached patch, or let me know if I should submit a part in a separate patchset, ---------------------------- [RFC] arm64: topology: Enable CONFIG_SMT_NUM_THREADS_DYNAMIC - On arm64 ACPI systems, change the thread_id assignment to have increasing values starting from 0. This is already the case for DT based systems. Doing so allows to uniformly identify the n-th thread of a given CPU. 
- Check that all CPUs have the same number of threads (for DT/ACPI) - Enable CONFIG_SMT_NUM_THREADS_DYNAMIC On a Tx2, with 256 CPUs, threads siblings being 0,32,64,96 for socket0 and 128 + (0,32,64,96) for socket1: $ cd /sys/devices/system/cpu/smt/ $ cat ../online 0-255 $ echo 2 > control $ cat ../online 0-63,128-191 $ echo 3 > control $ cat ../online 0-95,128-223 $ echo on > control $ cat ../online 0-255 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bd3bc2f5e0ec..1d8521483065 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -239,6 +239,7 @@ config ARM64 select HAVE_GENERIC_VDSO select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU select HOTPLUG_SMT if (SMP && HOTPLUG_CPU) + select SMT_NUM_THREADS_DYNAMIC if HOTPLUG_SMT select IRQ_DOMAIN select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0f6ef432fb84..7dd211f81687 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -39,6 +39,14 @@ void update_freq_counters_refs(void); #define arch_scale_hw_pressure topology_get_hw_pressure #define arch_update_hw_pressure topology_update_hw_pressure +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC +#include <linux/cpu_smt.h> +static inline bool topology_smt_thread_allowed(unsigned int cpu) +{ + return topology_thread_id(cpu) < cpu_smt_num_threads; +} +#endif + #include <asm-generic/topology.h> #endif /* _ASM_ARM_TOPOLOGY_H */ diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index f72e1e55b05e..a83babe19972 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -47,7 +47,9 @@ int __init parse_acpi_topology(void) { int thread_num, max_smt_thread_num = 1; struct xarray core_threads; + bool have_thread = false; int cpu, topology_id; + unsigned long i; void *entry; if (acpi_disabled) @@ -61,6 +63,8 @@ int __init parse_acpi_topology(void) return topology_id; if (acpi_cpu_is_threaded(cpu)) { + 
have_thread = true; + cpu_topology[cpu].thread_id = topology_id; topology_id = find_acpi_cpu_topology(cpu, 1); cpu_topology[cpu].core_id = topology_id; @@ -69,9 +73,10 @@ int __init parse_acpi_topology(void) if (!entry) { xa_store(&core_threads, topology_id, xa_mk_value(1), GFP_KERNEL); + cpu_topology[cpu].thread_id = 0; } else { thread_num = xa_to_value(entry); - thread_num++; + cpu_topology[cpu].thread_id = thread_num++; xa_store(&core_threads, topology_id, xa_mk_value(thread_num), GFP_KERNEL); @@ -86,8 +91,17 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].cluster_id = topology_id; topology_id = find_acpi_cpu_topology_package(cpu); cpu_topology[cpu].package_id = topology_id; + + pr_debug("CPU%u: package=0x%x cluster=0x%x core=0x%x thread=0x%x\n", + cpu, cpu_topology[cpu].package_id, cpu_topology[cpu].cluster_id, + cpu_topology[cpu].core_id, cpu_topology[cpu].thread_id); } + if (have_thread) + xa_for_each(&core_threads, i, entry) + if (xa_to_value(entry) != max_smt_thread_num) + pr_warn("Heterogeneous SMT topology not handled"); + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); xa_destroy(&core_threads); diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 95513abd664f..20d7f5b72ddd 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -532,13 +532,15 @@ static int __init get_cpu_for_node(struct device_node *node) return cpu; } -static void __init update_smt_num_threads(unsigned int num_threads) +static void __init update_smt_num_threads(int num_threads) { - static unsigned int max_smt_thread_num = 1; + static int max_smt_thread_num = -1; - if (num_threads > max_smt_thread_num) { + if (max_smt_thread_num < 0) { max_smt_thread_num = num_threads; cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + } else if (num_threads != max_smt_thread_num) { + pr_warn("Heterogeneous SMT topology not handled"); } } diff --git a/include/linux/arch_topology.h 
b/include/linux/arch_topology.h index b721f360d759..afdfdc64a0a1 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -87,6 +87,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) #define topology_cluster_id(cpu) (cpu_topology[cpu].cluster_id) #define topology_core_id(cpu) (cpu_topology[cpu].core_id) +#define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) ---------------------------- Regards, Pierre > > Thanks, > Yicong > >> >> Same comment for the DT patch. If there is an assumption that all CPUs have >> the same number of threads, then update_smt_num_threads() could only be called >> once I suppose, >> >> Regards, >> Pierre >> >> >> On 8/6/24 10:53, Yicong Yang wrote: >>> From: Yicong Yang <yangyicong@hisilicon.com> >>> >>> For ACPI we'll build the topology from PPTT and we cannot directly >>> get the SMT number of each core. Instead using a temporary xarray >>> to record the SMT number of each core when building the topology >>> and we can know the largest SMT number in the system. Then we can >>> enable the support of SMT control. 
>>> >>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> >>> --- >>> arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++ >>> 1 file changed, 24 insertions(+) >>> >>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c >>> index 1a2c72f3e7f8..f72e1e55b05e 100644 >>> --- a/arch/arm64/kernel/topology.c >>> +++ b/arch/arm64/kernel/topology.c >>> @@ -15,8 +15,10 @@ >>> #include <linux/arch_topology.h> >>> #include <linux/cacheinfo.h> >>> #include <linux/cpufreq.h> >>> +#include <linux/cpu_smt.h> >>> #include <linux/init.h> >>> #include <linux/percpu.h> >>> +#include <linux/xarray.h> >>> #include <asm/cpu.h> >>> #include <asm/cputype.h> >>> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu) >>> */ >>> int __init parse_acpi_topology(void) >>> { >>> + int thread_num, max_smt_thread_num = 1; >>> + struct xarray core_threads; >>> int cpu, topology_id; >>> + void *entry; >>> if (acpi_disabled) >>> return 0; >>> + xa_init(&core_threads); >>> + >>> for_each_possible_cpu(cpu) { >>> topology_id = find_acpi_cpu_topology(cpu, 0); >>> if (topology_id < 0) >>> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void) >>> cpu_topology[cpu].thread_id = topology_id; >>> topology_id = find_acpi_cpu_topology(cpu, 1); >>> cpu_topology[cpu].core_id = topology_id; >>> + >>> + entry = xa_load(&core_threads, topology_id); >>> + if (!entry) { >>> + xa_store(&core_threads, topology_id, >>> + xa_mk_value(1), GFP_KERNEL); >>> + } else { >>> + thread_num = xa_to_value(entry); >>> + thread_num++; >>> + xa_store(&core_threads, topology_id, >>> + xa_mk_value(thread_num), GFP_KERNEL); >>> + >>> + if (thread_num > max_smt_thread_num) >>> + max_smt_thread_num = thread_num; >>> + } >>> } else { >>> cpu_topology[cpu].thread_id = -1; >>> cpu_topology[cpu].core_id = topology_id; >>> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void) >>> cpu_topology[cpu].package_id = topology_id; >>> } >>> + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); 
>>> + >>> + xa_destroy(&core_threads); >>> return 0; >>> } >>> #endif >> >> .
On 2024/8/29 20:46, Pierre Gondois wrote: > Hello Yicong, > > On 8/29/24 09:40, Yicong Yang wrote: >> Hi Pierre, >> >> On 2024/8/27 23:40, Pierre Gondois wrote: >>> Hello Yicong, >>> IIUC we are looking for the maximum number of threads a CPU can have >>> in the platform. But is is actually possible to have a platform with >>> CPUs not having the same number of threads ? >>> >> >> I was thinking of the heterogenous platforms, for example part of the >> cores have SMT and others don't (I don't have any though, but there >> should be some such platforms for arm64). >> >>> For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check >>> that the number of threads is either 1 or cpu_smt_max_threads (as >>> CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However >>> a (hypothetical) platform with: >>> - CPU0: 2 threads >>> - CPU1: 4 threads >>> should not be able to set the number of threads to 2, as >>> 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()). >>> >> >> It's a bit more complex. If we enable/disable the SMT using on/off control >> then we won't have this problem. We'll online all the CPUs on enabling and >> offline CPUs which is not primary thread and don't care about the thread >> number of each core. >> >> Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC >> and only enabled on powerpc. I think this interface is not enough to handle >> the hypothetical we assumed, since it assumes the homogenous platform that >> all the CPUs have the same thread number. But this should be fine for >> the platforms with SMT(with same thread number) and non-SMT cores, since it do >> indicates the real thread number of the SMT cores. >> >>> So if there is an assumption that all the CPUs have the same number of >>> threads, it should be sufficient to count the number of threads for one >>> CPU right ? >>> >> >> Naturally and conveniently I may think use of the threads number of CPU0 (boot >> cpu) in such a solution. 
But this will be wrong if CPU0 is non-SMT on a heterogenous >> platform :( >> >>> In the other (unlikely) case (i.e. CPUs can have various number of threads), >>> I think we should either: >>> - use your method and check that all the CPUs have the same number of threads >>> - get the number of threads for one CPU and check that all the CPUs are >>> identical using the ACPI_PPTT_ACPI_IDENTICAL tag >> >> I think this won't be simpler since we still need to iterate all the CPUs to see >> if they have the same hetero_id (checked how we're using this ACPI tag in >> arm_acpi_register_pmu_device()). We could already know if they're identical by >> comparing the thread number and do the update if necessary. >> >>> - have a per-cpu cpu_smt_max_threads value ? >> >> This should be unnecessary in currently stage if there's no platforms >> with several kind cores have different thread number like in your assumption >> and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using >> a global cpu_smt_max_threads to enable the SMT control should be enough. >> Does it make sense? > > Yes, I agree with all the things you said: > - the current smt/control interface cannot handle asymmetric SMT platforms > - I am not aware if such platform exist so far > > I think it would still be good to check all the CPUs have the same number of > threads. I tried to enable the SMT_NUM_THREADS_DYNAMIC Kconfig with the > patch attached (and to check CPUs have the same number of threads). Feel free > to take what you like/ignore what is inside the attached patch, or let me know > if I should submit a part in a separate patchset, > Checked the changes, we can make this series as the basic support and a separate series of yours as a further support of SMT control on arm64: - support thread_id on ACPI based arm64 platform - support partial SMT control by enable CONFIG_SMT_NUM_THREADS_DYNAMIC some minor comments below. 
> ---------------------------- > > [RFC] arm64: topology: Enable CONFIG_SMT_NUM_THREADS_DYNAMIC > - On arm64 ACPI systems, change the thread_id assignment to have > increasing values starting from 0. This is already the case for DT > based systems. Doing so allows to uniformly identify the n-th > thread of a given CPU. > - Check that all CPUs have the same number of threads (for DT/ACPI) > - Enable CONFIG_SMT_NUM_THREADS_DYNAMIC > On a Tx2, with 256 CPUs, threads siblings being 0,32,64,96 > for socket0 and 128 + (0,32,64,96) for socket1: > $ cd /sys/devices/system/cpu/smt/ > $ cat ../online > 0-255 > $ echo 2 > control > $ cat ../online > 0-63,128-191 > $ echo 3 > control > $ cat ../online > 0-95,128-223 > $ echo on > control > $ cat ../online > 0-255 > So it's a real SMT4 system, it does make sense to have this partial SMT control support. > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index bd3bc2f5e0ec..1d8521483065 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -239,6 +239,7 @@ config ARM64 > select HAVE_GENERIC_VDSO > select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU > select HOTPLUG_SMT if (SMP && HOTPLUG_CPU) > + select SMT_NUM_THREADS_DYNAMIC if HOTPLUG_SMT > select IRQ_DOMAIN > select IRQ_FORCED_THREADING > select KASAN_VMALLOC if KASAN > diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h > index 0f6ef432fb84..7dd211f81687 100644 > --- a/arch/arm64/include/asm/topology.h > +++ b/arch/arm64/include/asm/topology.h > @@ -39,6 +39,14 @@ void update_freq_counters_refs(void); > #define arch_scale_hw_pressure topology_get_hw_pressure > #define arch_update_hw_pressure topology_update_hw_pressure > > +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC > +#include <linux/cpu_smt.h> > +static inline bool topology_smt_thread_allowed(unsigned int cpu) > +{ > + return topology_thread_id(cpu) < cpu_smt_num_threads; > +} > +#endif > + > #include <asm-generic/topology.h> > > #endif /* _ASM_ARM_TOPOLOGY_H */ > diff --git 
a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c > index f72e1e55b05e..a83babe19972 100644 > --- a/arch/arm64/kernel/topology.c > +++ b/arch/arm64/kernel/topology.c > @@ -47,7 +47,9 @@ int __init parse_acpi_topology(void) > { > int thread_num, max_smt_thread_num = 1; > struct xarray core_threads; > + bool have_thread = false; > int cpu, topology_id; > + unsigned long i; > void *entry; > > if (acpi_disabled) > @@ -61,6 +63,8 @@ int __init parse_acpi_topology(void) > return topology_id; > > if (acpi_cpu_is_threaded(cpu)) { > + have_thread = true; > + > cpu_topology[cpu].thread_id = topology_id; > topology_id = find_acpi_cpu_topology(cpu, 1); > cpu_topology[cpu].core_id = topology_id; > @@ -69,9 +73,10 @@ int __init parse_acpi_topology(void) > if (!entry) { > xa_store(&core_threads, topology_id, > xa_mk_value(1), GFP_KERNEL); > + cpu_topology[cpu].thread_id = 0; > } else { > thread_num = xa_to_value(entry); > - thread_num++; > + cpu_topology[cpu].thread_id = thread_num++; > xa_store(&core_threads, topology_id, > xa_mk_value(thread_num), GFP_KERNEL); > > @@ -86,8 +91,17 @@ int __init parse_acpi_topology(void) > cpu_topology[cpu].cluster_id = topology_id; > topology_id = find_acpi_cpu_topology_package(cpu); > cpu_topology[cpu].package_id = topology_id; > + > + pr_debug("CPU%u: package=0x%x cluster=0x%x core=0x%x thread=0x%x\n", > + cpu, cpu_topology[cpu].package_id, cpu_topology[cpu].cluster_id, > + cpu_topology[cpu].core_id, cpu_topology[cpu].thread_id); > } > > + if (have_thread) We could know this from max_smt_thread_num, so have_thread may be unnecessary. > + xa_for_each(&core_threads, i, entry) > + if (xa_to_value(entry) != max_smt_thread_num) > + pr_warn("Heterogeneous SMT topology not handled");\ I'm wondering if we can avoid this second loop. Greg expressed concerns about looping twice on large-scale systems in v1. Maybe we could use the hetero_id and get the necessary information in one loop; I need to think about it further. Thanks.
> + > cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); > > xa_destroy(&core_threads); > diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c > index 95513abd664f..20d7f5b72ddd 100644 > --- a/drivers/base/arch_topology.c > +++ b/drivers/base/arch_topology.c > @@ -532,13 +532,15 @@ static int __init get_cpu_for_node(struct device_node *node) > return cpu; > } > > -static void __init update_smt_num_threads(unsigned int num_threads) > +static void __init update_smt_num_threads(int num_threads) > { > - static unsigned int max_smt_thread_num = 1; > + static int max_smt_thread_num = -1; > > - if (num_threads > max_smt_thread_num) { > + if (max_smt_thread_num < 0) { > max_smt_thread_num = num_threads; > cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); > + } else if (num_threads != max_smt_thread_num) { > + pr_warn("Heterogeneous SMT topology not handled"); > } > } > > diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h > index b721f360d759..afdfdc64a0a1 100644 > --- a/include/linux/arch_topology.h > +++ b/include/linux/arch_topology.h > @@ -87,6 +87,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; > #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) > #define topology_cluster_id(cpu) (cpu_topology[cpu].cluster_id) > #define topology_core_id(cpu) (cpu_topology[cpu].core_id) > +#define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) > #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) > #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) > #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) > > ---------------------------- > > > Regards, > Pierre > >> >> Thanks, >> Yicong >> >>> >>> Same comment for the DT patch. 
If there is an assumption that all CPUs have >>> the same number of threads, then update_smt_num_threads() could only be called >>> once I suppose, >>> >>> Regards, >>> Pierre >>> >>> >>> On 8/6/24 10:53, Yicong Yang wrote: >>>> From: Yicong Yang <yangyicong@hisilicon.com> >>>> >>>> For ACPI we'll build the topology from PPTT and we cannot directly >>>> get the SMT number of each core. Instead using a temporary xarray >>>> to record the SMT number of each core when building the topology >>>> and we can know the largest SMT number in the system. Then we can >>>> enable the support of SMT control. >>>> >>>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> >>>> --- >>>> arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++ >>>> 1 file changed, 24 insertions(+) >>>> >>>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c >>>> index 1a2c72f3e7f8..f72e1e55b05e 100644 >>>> --- a/arch/arm64/kernel/topology.c >>>> +++ b/arch/arm64/kernel/topology.c >>>> @@ -15,8 +15,10 @@ >>>> #include <linux/arch_topology.h> >>>> #include <linux/cacheinfo.h> >>>> #include <linux/cpufreq.h> >>>> +#include <linux/cpu_smt.h> >>>> #include <linux/init.h> >>>> #include <linux/percpu.h> >>>> +#include <linux/xarray.h> >>>> #include <asm/cpu.h> >>>> #include <asm/cputype.h> >>>> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu) >>>> */ >>>> int __init parse_acpi_topology(void) >>>> { >>>> + int thread_num, max_smt_thread_num = 1; >>>> + struct xarray core_threads; >>>> int cpu, topology_id; >>>> + void *entry; >>>> if (acpi_disabled) >>>> return 0; >>>> + xa_init(&core_threads); >>>> + >>>> for_each_possible_cpu(cpu) { >>>> topology_id = find_acpi_cpu_topology(cpu, 0); >>>> if (topology_id < 0) >>>> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void) >>>> cpu_topology[cpu].thread_id = topology_id; >>>> topology_id = find_acpi_cpu_topology(cpu, 1); >>>> cpu_topology[cpu].core_id = topology_id; >>>> + >>>> + entry = xa_load(&core_threads, 
topology_id); >>>> + if (!entry) { >>>> + xa_store(&core_threads, topology_id, >>>> + xa_mk_value(1), GFP_KERNEL); >>>> + } else { >>>> + thread_num = xa_to_value(entry); >>>> + thread_num++; >>>> + xa_store(&core_threads, topology_id, >>>> + xa_mk_value(thread_num), GFP_KERNEL); >>>> + >>>> + if (thread_num > max_smt_thread_num) >>>> + max_smt_thread_num = thread_num; >>>> + } >>>> } else { >>>> cpu_topology[cpu].thread_id = -1; >>>> cpu_topology[cpu].core_id = topology_id; >>>> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void) >>>> cpu_topology[cpu].package_id = topology_id; >>>> } >>>> + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); >>>> + >>>> + xa_destroy(&core_threads); >>>> return 0; >>>> } >>>> #endif >>> >>> .
Hello Yicong On 8/30/24 11:35, Yicong Yang wrote: > On 2024/8/29 20:46, Pierre Gondois wrote: >> Hello Yicong, >> >> On 8/29/24 09:40, Yicong Yang wrote: >>> Hi Pierre, >>> >>> On 2024/8/27 23:40, Pierre Gondois wrote: >>>> Hello Yicong, >>>> IIUC we are looking for the maximum number of threads a CPU can have >>>> in the platform. But is is actually possible to have a platform with >>>> CPUs not having the same number of threads ? >>>> >>> >>> I was thinking of the heterogenous platforms, for example part of the >>> cores have SMT and others don't (I don't have any though, but there >>> should be some such platforms for arm64). >>> >>>> For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check >>>> that the number of threads is either 1 or cpu_smt_max_threads (as >>>> CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However >>>> a (hypothetical) platform with: >>>> - CPU0: 2 threads >>>> - CPU1: 4 threads >>>> should not be able to set the number of threads to 2, as >>>> 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()). >>>> >>> >>> It's a bit more complex. If we enable/disable the SMT using on/off control >>> then we won't have this problem. We'll online all the CPUs on enabling and >>> offline CPUs which is not primary thread and don't care about the thread >>> number of each core. >>> >>> Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC >>> and only enabled on powerpc. I think this interface is not enough to handle >>> the hypothetical we assumed, since it assumes the homogenous platform that >>> all the CPUs have the same thread number. But this should be fine for >>> the platforms with SMT(with same thread number) and non-SMT cores, since it do >>> indicates the real thread number of the SMT cores. >>> >>>> So if there is an assumption that all the CPUs have the same number of >>>> threads, it should be sufficient to count the number of threads for one >>>> CPU right ? 
>>>> >>> >>> Naturally and conveniently I may think use of the threads number of CPU0 (boot >>> cpu) in such a solution. But this will be wrong if CPU0 is non-SMT on a heterogenous >>> platform :( >>> >>>> In the other (unlikely) case (i.e. CPUs can have various number of threads), >>>> I think we should either: >>>> - use your method and check that all the CPUs have the same number of threads >>>> - get the number of threads for one CPU and check that all the CPUs are >>>> identical using the ACPI_PPTT_ACPI_IDENTICAL tag >>> >>> I think this won't be simpler since we still need to iterate all the CPUs to see >>> if they have the same hetero_id (checked how we're using this ACPI tag in >>> arm_acpi_register_pmu_device()). We could already know if they're identical by >>> comparing the thread number and do the update if necessary. >>> >>>> - have a per-cpu cpu_smt_max_threads value ? >>> >>> This should be unnecessary in currently stage if there's no platforms >>> with several kind cores have different thread number like in your assumption >>> and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using >>> a global cpu_smt_max_threads to enable the SMT control should be enough. >>> Does it make sense? >> >> Yes, I agree with all the things you said: >> - the current smt/control interface cannot handle asymmetric SMT platforms >> - I am not aware if such platform exist so far >> >> I think it would still be good to check all the CPUs have the same number of >> threads. I tried to enable the SMT_NUM_THREADS_DYNAMIC Kconfig with the >> patch attached (and to check CPUs have the same number of threads). 
Feel free >> to take what you like/ignore what is inside the attached patch, or let me know >> if I should submit a part in a separate patchset, >> > > Checked the changes, we can make this series as the basic support and a separate > series of yours as a further support of SMT control on arm64: > - support thread_id on ACPI based arm64 platform > - support partial SMT control by enable CONFIG_SMT_NUM_THREADS_DYNAMIC I'm not sure I fully understand what you mean. IIUC, I can send patches to enable SMT_NUM_THREADS_DYNAMIC on top of a v6 of yours. I'll let you pick the changes that you think make the most sense for your series (if that makes sense)? Please let me know if that works for you (or not). > > some minor comments below. > >> ---------------------------- >> >> [RFC] arm64: topology: Enable CONFIG_SMT_NUM_THREADS_DYNAMIC >> - On arm64 ACPI systems, change the thread_id assignment to have >> increasing values starting from 0. This is already the case for DT >> based systems. Doing so allows to uniformly identify the n-th >> thread of a given CPU. >> - Check that all CPUs have the same number of threads (for DT/ACPI) >> - Enable CONFIG_SMT_NUM_THREADS_DYNAMIC >> On a Tx2, with 256 CPUs, threads siblings being 0,32,64,96 >> for socket0 and 128 + (0,32,64,96) for socket1: >> $ cd /sys/devices/system/cpu/smt/ >> $ cat ../online >> 0-255 >> $ echo 2 > control >> $ cat ../online >> 0-63,128-191 >> $ echo 3 > control >> $ cat ../online >> 0-95,128-223 >> $ echo on > control >> $ cat ../online >> 0-255 >> > > So it's a real SMT4 system, it does make sense to have this partial SMT control > support.
Yes right > >> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig >> index bd3bc2f5e0ec..1d8521483065 100644 >> --- a/arch/arm64/Kconfig >> +++ b/arch/arm64/Kconfig >> @@ -239,6 +239,7 @@ config ARM64 >> select HAVE_GENERIC_VDSO >> select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU >> select HOTPLUG_SMT if (SMP && HOTPLUG_CPU) >> + select SMT_NUM_THREADS_DYNAMIC if HOTPLUG_SMT >> select IRQ_DOMAIN >> select IRQ_FORCED_THREADING >> select KASAN_VMALLOC if KASAN >> diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h >> index 0f6ef432fb84..7dd211f81687 100644 >> --- a/arch/arm64/include/asm/topology.h >> +++ b/arch/arm64/include/asm/topology.h >> @@ -39,6 +39,14 @@ void update_freq_counters_refs(void); >> #define arch_scale_hw_pressure topology_get_hw_pressure >> #define arch_update_hw_pressure topology_update_hw_pressure >> >> +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC >> +#include <linux/cpu_smt.h> >> +static inline bool topology_smt_thread_allowed(unsigned int cpu) >> +{ >> + return topology_thread_id(cpu) < cpu_smt_num_threads; >> +} >> +#endif >> + >> #include <asm-generic/topology.h> >> >> #endif /* _ASM_ARM_TOPOLOGY_H */ >> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c >> index f72e1e55b05e..a83babe19972 100644 >> --- a/arch/arm64/kernel/topology.c >> +++ b/arch/arm64/kernel/topology.c >> @@ -47,7 +47,9 @@ int __init parse_acpi_topology(void) >> { >> int thread_num, max_smt_thread_num = 1; >> struct xarray core_threads; >> + bool have_thread = false; >> int cpu, topology_id; >> + unsigned long i; >> void *entry; >> >> if (acpi_disabled) >> @@ -61,6 +63,8 @@ int __init parse_acpi_topology(void) >> return topology_id; >> >> if (acpi_cpu_is_threaded(cpu)) { >> + have_thread = true; >> + >> cpu_topology[cpu].thread_id = topology_id; >> topology_id = find_acpi_cpu_topology(cpu, 1); >> cpu_topology[cpu].core_id = topology_id; >> @@ -69,9 +73,10 @@ int __init parse_acpi_topology(void) >> if (!entry) { >> 
xa_store(&core_threads, topology_id, >> xa_mk_value(1), GFP_KERNEL); >> + cpu_topology[cpu].thread_id = 0; >> } else { >> thread_num = xa_to_value(entry); >> - thread_num++; >> + cpu_topology[cpu].thread_id = thread_num++; >> xa_store(&core_threads, topology_id, >> xa_mk_value(thread_num), GFP_KERNEL); >> >> @@ -86,8 +91,17 @@ int __init parse_acpi_topology(void) >> cpu_topology[cpu].cluster_id = topology_id; >> topology_id = find_acpi_cpu_topology_package(cpu); >> cpu_topology[cpu].package_id = topology_id; >> + >> + pr_debug("CPU%u: package=0x%x cluster=0x%x core=0x%x thread=0x%x\n", >> + cpu, cpu_topology[cpu].package_id, cpu_topology[cpu].cluster_id, >> + cpu_topology[cpu].core_id, cpu_topology[cpu].thread_id); >> } >> >> + if (have_thread) > > we could know this from max_smt_thread_num so have_thread maybe unnecessary. Yes, right; I will change that. > >> + xa_for_each(&core_threads, i, entry) >> + if (xa_to_value(entry) != max_smt_thread_num) >> + pr_warn("Heterogeneous SMT topology not handled");\ > > Wondering if we can avoid this 2nd loop. Greg express the worries of looping twice on large scale > system in v1. Maybe we could use the hetero_id and get the necessary information in one loop, I need > further think. I found these comments (not sure this is what you are referring to): - https://lore.kernel.org/linux-arm-kernel/20231011103303.00002d8f@Huawei.com/ - https://lore.kernel.org/all/20230921150333.c2zqigs3xxwcg4ln@bogus/T/#m406c4c16871ca7ae431beb20feccfb5e14498452 I don't see another way to do it right now. Also, I think the complexity is O(2n), which should be better than the original O(n**2). Regards, Pierre > > Thanks.
> >> + >> cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); >> >> xa_destroy(&core_threads); >> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c >> index 95513abd664f..20d7f5b72ddd 100644 >> --- a/drivers/base/arch_topology.c >> +++ b/drivers/base/arch_topology.c >> @@ -532,13 +532,15 @@ static int __init get_cpu_for_node(struct device_node *node) >> return cpu; >> } >> >> -static void __init update_smt_num_threads(unsigned int num_threads) >> +static void __init update_smt_num_threads(int num_threads) >> { >> - static unsigned int max_smt_thread_num = 1; >> + static int max_smt_thread_num = -1; >> >> - if (num_threads > max_smt_thread_num) { >> + if (max_smt_thread_num < 0) { >> max_smt_thread_num = num_threads; >> cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); >> + } else if (num_threads != max_smt_thread_num) { >> + pr_warn("Heterogeneous SMT topology not handled"); >> } >> } >> >> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h >> index b721f360d759..afdfdc64a0a1 100644 >> --- a/include/linux/arch_topology.h >> +++ b/include/linux/arch_topology.h >> @@ -87,6 +87,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; >> #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) >> #define topology_cluster_id(cpu) (cpu_topology[cpu].cluster_id) >> #define topology_core_id(cpu) (cpu_topology[cpu].core_id) >> +#define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) >> #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) >> #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) >> #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) >> >> ---------------------------- >> >> >> Regards, >> Pierre >> >>> >>> Thanks, >>> Yicong >>> >>>> >>>> Same comment for the DT patch. 
If there is an assumption that all CPUs have >>>> the same number of threads, then update_smt_num_threads() could only be called >>>> once I suppose, >>>> >>>> Regards, >>>> Pierre >>>> >>>> >>>> On 8/6/24 10:53, Yicong Yang wrote: >>>>> From: Yicong Yang <yangyicong@hisilicon.com> >>>>> >>>>> For ACPI we'll build the topology from PPTT and we cannot directly >>>>> get the SMT number of each core. Instead using a temporary xarray >>>>> to record the SMT number of each core when building the topology >>>>> and we can know the largest SMT number in the system. Then we can >>>>> enable the support of SMT control. >>>>> >>>>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> >>>>> --- >>>>> arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++ >>>>> 1 file changed, 24 insertions(+) >>>>> >>>>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c >>>>> index 1a2c72f3e7f8..f72e1e55b05e 100644 >>>>> --- a/arch/arm64/kernel/topology.c >>>>> +++ b/arch/arm64/kernel/topology.c >>>>> @@ -15,8 +15,10 @@ >>>>> #include <linux/arch_topology.h> >>>>> #include <linux/cacheinfo.h> >>>>> #include <linux/cpufreq.h> >>>>> +#include <linux/cpu_smt.h> >>>>> #include <linux/init.h> >>>>> #include <linux/percpu.h> >>>>> +#include <linux/xarray.h> >>>>> #include <asm/cpu.h> >>>>> #include <asm/cputype.h> >>>>> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu) >>>>> */ >>>>> int __init parse_acpi_topology(void) >>>>> { >>>>> + int thread_num, max_smt_thread_num = 1; >>>>> + struct xarray core_threads; >>>>> int cpu, topology_id; >>>>> + void *entry; >>>>> if (acpi_disabled) >>>>> return 0; >>>>> + xa_init(&core_threads); >>>>> + >>>>> for_each_possible_cpu(cpu) { >>>>> topology_id = find_acpi_cpu_topology(cpu, 0); >>>>> if (topology_id < 0) >>>>> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void) >>>>> cpu_topology[cpu].thread_id = topology_id; >>>>> topology_id = find_acpi_cpu_topology(cpu, 1); >>>>> cpu_topology[cpu].core_id = 
topology_id; >>>>> + >>>>> + entry = xa_load(&core_threads, topology_id); >>>>> + if (!entry) { >>>>> + xa_store(&core_threads, topology_id, >>>>> + xa_mk_value(1), GFP_KERNEL); >>>>> + } else { >>>>> + thread_num = xa_to_value(entry); >>>>> + thread_num++; >>>>> + xa_store(&core_threads, topology_id, >>>>> + xa_mk_value(thread_num), GFP_KERNEL); >>>>> + >>>>> + if (thread_num > max_smt_thread_num) >>>>> + max_smt_thread_num = thread_num; >>>>> + } >>>>> } else { >>>>> cpu_topology[cpu].thread_id = -1; >>>>> cpu_topology[cpu].core_id = topology_id; >>>>> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void) >>>>> cpu_topology[cpu].package_id = topology_id; >>>>> } >>>>> + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); >>>>> + >>>>> + xa_destroy(&core_threads); >>>>> return 0; >>>>> } >>>>> #endif >>>> >>>> .
On 2024/9/2 15:43, Pierre Gondois wrote: > Hello Yicong > > On 8/30/24 11:35, Yicong Yang wrote: >> On 2024/8/29 20:46, Pierre Gondois wrote: >>> Hello Yicong, >>> >>> On 8/29/24 09:40, Yicong Yang wrote: >>>> Hi Pierre, >>>> >>>> On 2024/8/27 23:40, Pierre Gondois wrote: >>>>> Hello Yicong, >>>>> IIUC we are looking for the maximum number of threads a CPU can have >>>>> in the platform. But is is actually possible to have a platform with >>>>> CPUs not having the same number of threads ? >>>>> >>>> >>>> I was thinking of the heterogenous platforms, for example part of the >>>> cores have SMT and others don't (I don't have any though, but there >>>> should be some such platforms for arm64). >>>> >>>>> For instance kernel/cpu.c::cpu_smt_num_threads_valid() will check >>>>> that the number of threads is either 1 or cpu_smt_max_threads (as >>>>> CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled for arm64). However >>>>> a (hypothetical) platform with: >>>>> - CPU0: 2 threads >>>>> - CPU1: 4 threads >>>>> should not be able to set the number of threads to 2, as >>>>> 1 < 2 < cpu_smt_max_threads (cf. cpu_smt_num_threads_valid()). >>>>> >>>> >>>> It's a bit more complex. If we enable/disable the SMT using on/off control >>>> then we won't have this problem. We'll online all the CPUs on enabling and >>>> offline CPUs which is not primary thread and don't care about the thread >>>> number of each core. >>>> >>>> Control using thread number is introduced by CONFIG_SMT_NUM_THREADS_DYNAMIC >>>> and only enabled on powerpc. I think this interface is not enough to handle >>>> the hypothetical we assumed, since it assumes the homogenous platform that >>>> all the CPUs have the same thread number. But this should be fine for >>>> the platforms with SMT(with same thread number) and non-SMT cores, since it do >>>> indicates the real thread number of the SMT cores. 
>>>> >>>>> So if there is an assumption that all the CPUs have the same number of >>>>> threads, it should be sufficient to count the number of threads for one >>>>> CPU right ? >>>>> >>>> >>>> Naturally and conveniently I may think use of the threads number of CPU0 (boot >>>> cpu) in such a solution. But this will be wrong if CPU0 is non-SMT on a heterogenous >>>> platform :( >>>> >>>>> In the other (unlikely) case (i.e. CPUs can have various number of threads), >>>>> I think we should either: >>>>> - use your method and check that all the CPUs have the same number of threads >>>>> - get the number of threads for one CPU and check that all the CPUs are >>>>> identical using the ACPI_PPTT_ACPI_IDENTICAL tag >>>> >>>> I think this won't be simpler since we still need to iterate all the CPUs to see >>>> if they have the same hetero_id (checked how we're using this ACPI tag in >>>> arm_acpi_register_pmu_device()). We could already know if they're identical by >>>> comparing the thread number and do the update if necessary. >>>> >>>>> - have a per-cpu cpu_smt_max_threads value ? >>>> >>>> This should be unnecessary in currently stage if there's no platforms >>>> with several kind cores have different thread number like in your assumption >>>> and enable CONFIG_SMT_NUM_THREADS_DYNAMIC on such platforms. Otherwise using >>>> a global cpu_smt_max_threads to enable the SMT control should be enough. >>>> Does it make sense? >>> >>> Yes, I agree with all the things you said: >>> - the current smt/control interface cannot handle asymmetric SMT platforms >>> - I am not aware if such platform exist so far >>> >>> I think it would still be good to check all the CPUs have the same number of >>> threads. I tried to enable the SMT_NUM_THREADS_DYNAMIC Kconfig with the >>> patch attached (and to check CPUs have the same number of threads). 
Feel free >>> to take what you like/ignore what is inside the attached patch, or let me know >>> if I should submit a part in a separate patchset, >>> >> >> Checked the changes, we can make this series as the basic support and a separate >> series of yours as a further support of SMT control on arm64: >> - support thread_id on ACPI based arm64 platform >> - support partial SMT control by enable CONFIG_SMT_NUM_THREADS_DYNAMIC > > I'm not sure I fully understand what you mean. I can send patches to > enable SMT_NUM_THREADS_DYNAMIC on top of a v6 of yours IIUC. I let you pick > the changes that you estimate to make more sense in your serie (if that makes > sense) ? Please let met know if that works for you (or not). > yes it works for me :) >> >> some minor comments below. >> >>> ---------------------------- >>> >>> [RFC] arm64: topology: Enable CONFIG_SMT_NUM_THREADS_DYNAMIC >>> - On arm64 ACPI systems, change the thread_id assignment to have >>> increasing values starting from 0. This is already the case for DT >>> based systems. Doing so allows to uniformly identify the n-th >>> thread of a given CPU. >>> - Check that all CPUs have the same number of threads (for DT/ACPI) >>> - Enable CONFIG_SMT_NUM_THREADS_DYNAMIC >>> On a Tx2, with 256 CPUs, threads siblings being 0,32,64,96 >>> for socket0 and 128 + (0,32,64,96) for socket1: >>> $ cd /sys/devices/system/cpu/smt/ >>> $ cat ../online >>> 0-255 >>> $ echo 2 > control >>> $ cat ../online >>> 0-63,128-191 >>> $ echo 3 > control >>> $ cat ../online >>> 0-95,128-223 >>> $ echo on > control >>> $ cat ../online >>> 0-255 >>> >> >> So it's a real SMT4 system, it does make sense to have this partial SMT control >> support. 
> > Yes right > >> >>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig >>> index bd3bc2f5e0ec..1d8521483065 100644 >>> --- a/arch/arm64/Kconfig >>> +++ b/arch/arm64/Kconfig >>> @@ -239,6 +239,7 @@ config ARM64 >>> select HAVE_GENERIC_VDSO >>> select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU >>> select HOTPLUG_SMT if (SMP && HOTPLUG_CPU) >>> + select SMT_NUM_THREADS_DYNAMIC if HOTPLUG_SMT >>> select IRQ_DOMAIN >>> select IRQ_FORCED_THREADING >>> select KASAN_VMALLOC if KASAN >>> diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h >>> index 0f6ef432fb84..7dd211f81687 100644 >>> --- a/arch/arm64/include/asm/topology.h >>> +++ b/arch/arm64/include/asm/topology.h >>> @@ -39,6 +39,14 @@ void update_freq_counters_refs(void); >>> #define arch_scale_hw_pressure topology_get_hw_pressure >>> #define arch_update_hw_pressure topology_update_hw_pressure >>> +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC >>> +#include <linux/cpu_smt.h> >>> +static inline bool topology_smt_thread_allowed(unsigned int cpu) >>> +{ >>> + return topology_thread_id(cpu) < cpu_smt_num_threads; >>> +} >>> +#endif >>> + >>> #include <asm-generic/topology.h> >>> #endif /* _ASM_ARM_TOPOLOGY_H */ >>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c >>> index f72e1e55b05e..a83babe19972 100644 >>> --- a/arch/arm64/kernel/topology.c >>> +++ b/arch/arm64/kernel/topology.c >>> @@ -47,7 +47,9 @@ int __init parse_acpi_topology(void) >>> { >>> int thread_num, max_smt_thread_num = 1; >>> struct xarray core_threads; >>> + bool have_thread = false; >>> int cpu, topology_id; >>> + unsigned long i; >>> void *entry; >>> if (acpi_disabled) >>> @@ -61,6 +63,8 @@ int __init parse_acpi_topology(void) >>> return topology_id; >>> if (acpi_cpu_is_threaded(cpu)) { >>> + have_thread = true; >>> + >>> cpu_topology[cpu].thread_id = topology_id; >>> topology_id = find_acpi_cpu_topology(cpu, 1); >>> cpu_topology[cpu].core_id = topology_id; >>> @@ -69,9 +73,10 @@ int __init 
parse_acpi_topology(void) >>> if (!entry) { >>> xa_store(&core_threads, topology_id, >>> xa_mk_value(1), GFP_KERNEL); >>> + cpu_topology[cpu].thread_id = 0; >>> } else { >>> thread_num = xa_to_value(entry); >>> - thread_num++; >>> + cpu_topology[cpu].thread_id = thread_num++; >>> xa_store(&core_threads, topology_id, >>> xa_mk_value(thread_num), GFP_KERNEL); >>> @@ -86,8 +91,17 @@ int __init parse_acpi_topology(void) >>> cpu_topology[cpu].cluster_id = topology_id; >>> topology_id = find_acpi_cpu_topology_package(cpu); >>> cpu_topology[cpu].package_id = topology_id; >>> + >>> + pr_debug("CPU%u: package=0x%x cluster=0x%x core=0x%x thread=0x%x\n", >>> + cpu, cpu_topology[cpu].package_id, cpu_topology[cpu].cluster_id, >>> + cpu_topology[cpu].core_id, cpu_topology[cpu].thread_id); >>> } >>> + if (have_thread) >> >> we could know this from max_smt_thread_num so have_thread maybe unnecessary. > > Yes right, I will change that. > >> >>> + xa_for_each(&core_threads, i, entry) >>> + if (xa_to_value(entry) != max_smt_thread_num) >>> + pr_warn("Heterogeneous SMT topology not handled");\ >> >> Wondering if we can avoid this 2nd loop. Greg express the worries of looping twice on large scale >> system in v1. Maybe we could use the hetero_id and get the necessary information in one loop, I need >> further think. > > I found this comments (not sure this is what you are refering to): > - https://lore.kernel.org/linux-arm-kernel/20231011103303.00002d8f@Huawei.com/ > - https://lore.kernel.org/all/20230921150333.c2zqigs3xxwcg4ln@bogus/T/#m406c4c16871ca7ae431beb20feccfb5e14498452 > > I don't see another way to do it right now. Also, I thing the complexity is in > O(2n), which should be better than the original O(n**2), > Yes, it's less complex. I'm wondering whether we can build up the xarray in another way so that we can avoid the long loops.
What about the patch below: From 5ff5d0100435982764cd85566a6fe006e60ee98e Mon Sep 17 00:00:00 2001 From: Yicong Yang <yangyicong@hisilicon.com> Date: Fri, 20 Oct 2023 15:38:38 +0800 Subject: [PATCH] arm64: topology: Support SMT control on ACPI based system For ACPI we'll build the topology from PPTT and we cannot directly get the SMT number of each core. Instead, use a temporary xarray to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL) and SMT information of the first core in its heterogeneous CPU cluster when building the topology. Then we can know the largest SMT number in the system. Warn if heterogeneous SMT topology exists (multiple heterogeneous CPU clusters with different SMT thread numbers) since the SMT control cannot handle this well. Then enable the support of SMT control. Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> --- arch/arm64/kernel/topology.c | 60 ++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 1a2c72f3e7f8..f6ec30fae70e 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -15,8 +15,10 @@ #include <linux/arch_topology.h> #include <linux/cacheinfo.h> #include <linux/cpufreq.h> +#include <linux/cpu_smt.h> #include <linux/init.h> #include <linux/percpu.h> +#include <linux/xarray.h> #include <asm/cpu.h> #include <asm/cputype.h> @@ -37,17 +39,29 @@ static bool __init acpi_cpu_is_threaded(int cpu) return !!is_threaded; } +struct cpu_smt_info { + int thread_num; + int core_id; + int cpu; +}; + /* * Propagate the topology information of the processor_topology_node tree to the * cpu_topology array.
*/ int __init parse_acpi_topology(void) { + int max_smt_thread_num = 1; + struct cpu_smt_info *entry; + struct xarray hetero_cpu; + unsigned long hetero_id; int cpu, topology_id; if (acpi_disabled) return 0; + xa_init(&hetero_cpu); + for_each_possible_cpu(cpu) { topology_id = find_acpi_cpu_topology(cpu, 0); if (topology_id < 0) @@ -57,6 +71,30 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].thread_id = topology_id; topology_id = find_acpi_cpu_topology(cpu, 1); cpu_topology[cpu].core_id = topology_id; + + /* + * Build up the XArray using the heterogeneous ID of + * the CPU cluster. Store the CPU and SMT information + * of the first appeared CPU in the CPU cluster of this + * heterogeneous ID since the SMT information should be + * the same in this CPU cluster. Then we can know the + * SMT information of each heterogeneous CPUs in the + * system. + */ + hetero_id = find_acpi_cpu_topology_hetero_id(cpu); + entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id); + if (!entry) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + WARN_ON(!entry); + + entry->cpu = cpu; + entry->core_id = topology_id; + entry->thread_num = 1; + xa_store(&hetero_cpu, hetero_id, + entry, GFP_KERNEL); + } else if (entry->core_id == topology_id) { + entry->thread_num++; + } } else { cpu_topology[cpu].thread_id = -1; cpu_topology[cpu].core_id = topology_id; @@ -67,6 +105,28 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].package_id = topology_id; } + /* + * This should be a short loop depending on the number of heterogeneous + * CPU clusters. Typically on a homogeneous system there's only one + * entry in the XArray. 
+ */ + xa_for_each(&hetero_cpu, hetero_id, entry) { + if (entry->thread_num == 1) + continue; + + if (entry->thread_num != max_smt_thread_num && + max_smt_thread_num != 1) + pr_warn("Heterogeneous SMT topology not handled"); + + if (entry->thread_num > max_smt_thread_num) + max_smt_thread_num = entry->thread_num; + + xa_erase(&hetero_cpu, hetero_id); + kfree(entry); + } + + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + xa_destroy(&hetero_cpu); return 0; } #endif
Hello Yicong, >>> Wondering if we can avoid this 2nd loop. Greg express the worries of looping twice on large scale >>> system in v1. Maybe we could use the hetero_id and get the necessary information in one loop, I need >>> further think. >> >> I found this comments (not sure this is what you are refering to): >> - https://lore.kernel.org/linux-arm-kernel/20231011103303.00002d8f@Huawei.com/ >> - https://lore.kernel.org/all/20230921150333.c2zqigs3xxwcg4ln@bogus/T/#m406c4c16871ca7ae431beb20feccfb5e14498452 >> >> I don't see another way to do it right now. Also, I thing the complexity is in >> O(2n), which should be better than the original O(n**2), >> > > yes it's less complex. I'm wondering build up the xarray in another way then we can avoid the > long loops. What about below: I tried the patch on a ThunderX2 with 4 threads per CPU. PPTT topology describes 2 clusters of 128 cores each. Each cluster is described as independent (i.e. there is no root node in the PPTT). The PPTT is of revision 1, so the IDENTICAL flag is not available on the platform. I also tried it on a faked SMT asymmetric platform and there might be a small correction required. > > From 5ff5d0100435982764cd85566a6fe006e60ee98e Mon Sep 17 00:00:00 2001 > From: Yicong Yang <yangyicong@hisilicon.com> > Date: Fri, 20 Oct 2023 15:38:38 +0800 > Subject: [PATCH] arm64: topology: Support SMT control on ACPI based system > > For ACPI we'll build the topology from PPTT and we cannot directly > get the SMT number of each core. Instead using a temporary xarray > to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL) > and SMT information of the first core in its heterogeneous CPU cluster > when building the topology. Then we can know the largest SMT number > in the system. Warn if heterogeneous SMT topology exists (multiple > heterogeneous CPU clusters with different SMT thread number) since the > SMT control cannot handle this well. Then enable the support of SMT > control. 
> > Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> > --- > arch/arm64/kernel/topology.c | 60 ++++++++++++++++++++++++++++++++++++ > 1 file changed, 60 insertions(+) > > diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c > index 1a2c72f3e7f8..f6ec30fae70e 100644 > --- a/arch/arm64/kernel/topology.c > +++ b/arch/arm64/kernel/topology.c > @@ -15,8 +15,10 @@ > #include <linux/arch_topology.h> > #include <linux/cacheinfo.h> > #include <linux/cpufreq.h> > +#include <linux/cpu_smt.h> > #include <linux/init.h> > #include <linux/percpu.h> > +#include <linux/xarray.h> > > #include <asm/cpu.h> > #include <asm/cputype.h> > @@ -37,17 +39,29 @@ static bool __init acpi_cpu_is_threaded(int cpu) > return !!is_threaded; > } > > +struct cpu_smt_info { > + int thread_num; > + int core_id; > + int cpu; > +}; > + > /* > * Propagate the topology information of the processor_topology_node tree to the > * cpu_topology array. > */ > int __init parse_acpi_topology(void) > { > + int max_smt_thread_num = 1; > + struct cpu_smt_info *entry; > + struct xarray hetero_cpu; > + unsigned long hetero_id; > int cpu, topology_id; > > if (acpi_disabled) > return 0; > > + xa_init(&hetero_cpu); > + > for_each_possible_cpu(cpu) { > topology_id = find_acpi_cpu_topology(cpu, 0); > if (topology_id < 0) > @@ -57,6 +71,30 @@ int __init parse_acpi_topology(void) > cpu_topology[cpu].thread_id = topology_id; > topology_id = find_acpi_cpu_topology(cpu, 1); > cpu_topology[cpu].core_id = topology_id; > + > + /* > + * Build up the XArray using the heterogeneous ID of > + * the CPU cluster. Store the CPU and SMT information > + * of the first appeared CPU in the CPU cluster of this > + * heterogeneous ID since the SMT information should be > + * the same in this CPU cluster. Then we can know the > + * SMT information of each heterogeneous CPUs in the > + * system. 
> + */ > + hetero_id = find_acpi_cpu_topology_hetero_id(cpu); > + entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id); > + if (!entry) { > + entry = kzalloc(sizeof(*entry), GFP_KERNEL); > + WARN_ON(!entry); > + > + entry->cpu = cpu; > + entry->core_id = topology_id; > + entry->thread_num = 1; > + xa_store(&hetero_cpu, hetero_id, > + entry, GFP_KERNEL); > + } else if (entry->core_id == topology_id) { > + entry->thread_num++; > + } > } else { > cpu_topology[cpu].thread_id = -1; > cpu_topology[cpu].core_id = topology_id; > @@ -67,6 +105,28 @@ int __init parse_acpi_topology(void) > cpu_topology[cpu].package_id = topology_id; > } > > + /* > + * This should be a short loop depending on the number of heterogeneous > + * CPU clusters. Typically on a homogeneous system there's only one > + * entry in the XArray. > + */ > + xa_for_each(&hetero_cpu, hetero_id, entry) { > + if (entry->thread_num == 1) > + continue; If a platform has CPUs with: - 1 thread - X (!= 1) threads Then I think that the asymmetry is not detected > + > + if (entry->thread_num != max_smt_thread_num && > + max_smt_thread_num != 1) > + pr_warn("Heterogeneous SMT topology not handled"); > + > + if (entry->thread_num > max_smt_thread_num) > + max_smt_thread_num = entry->thread_num; > + > + xa_erase(&hetero_cpu, hetero_id); > + kfree(entry); > + } > + > + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); > + xa_destroy(&hetero_cpu); > return 0; > } > #endif
On 2024/9/5 16:34, Pierre Gondois wrote: > Hello Yicong, > >>>> Wondering if we can avoid this 2nd loop. Greg express the worries of looping twice on large scale >>>> system in v1. Maybe we could use the hetero_id and get the necessary information in one loop, I need >>>> further think. >>> >>> I found this comments (not sure this is what you are refering to): >>> - https://lore.kernel.org/linux-arm-kernel/20231011103303.00002d8f@Huawei.com/ >>> - https://lore.kernel.org/all/20230921150333.c2zqigs3xxwcg4ln@bogus/T/#m406c4c16871ca7ae431beb20feccfb5e14498452 >>> >>> I don't see another way to do it right now. Also, I thing the complexity is in >>> O(2n), which should be better than the original O(n**2), >>> >> >> yes it's less complex. I'm wondering build up the xarray in another way then we can avoid the >> long loops. What about below: > > I tried the patch on a ThunderX2 with 4 threads per CPU. PPTT topology describes > 2 clusters of 128 cores each. Each cluster is described as independent (i.e. there > is no root node in the PPTT). The PPTT is of revision 1, so the IDENTICAL > flag is not available on the platform. > I think the SMT detection should work on this case, does it? In this case there should be 2 entries for each cluster respectively in the hetero_cpu array. Since the thread number are the same, we'll detect a symmetric SMT topology. > I also tried it on a faked SMT asymmetric platform and there might be a small > correction required. > >> >> From 5ff5d0100435982764cd85566a6fe006e60ee98e Mon Sep 17 00:00:00 2001 >> From: Yicong Yang <yangyicong@hisilicon.com> >> Date: Fri, 20 Oct 2023 15:38:38 +0800 >> Subject: [PATCH] arm64: topology: Support SMT control on ACPI based system >> >> For ACPI we'll build the topology from PPTT and we cannot directly >> get the SMT number of each core. 
Instead using a temporary xarray >> to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL) >> and SMT information of the first core in its heterogeneous CPU cluster >> when building the topology. Then we can know the largest SMT number >> in the system. Warn if heterogeneous SMT topology exists (multiple >> heterogeneous CPU clusters with different SMT thread number) since the >> SMT control cannot handle this well. Then enable the support of SMT >> control. >> >> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> >> --- >> arch/arm64/kernel/topology.c | 60 ++++++++++++++++++++++++++++++++++++ >> 1 file changed, 60 insertions(+) >> >> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c >> index 1a2c72f3e7f8..f6ec30fae70e 100644 >> --- a/arch/arm64/kernel/topology.c >> +++ b/arch/arm64/kernel/topology.c >> @@ -15,8 +15,10 @@ >> #include <linux/arch_topology.h> >> #include <linux/cacheinfo.h> >> #include <linux/cpufreq.h> >> +#include <linux/cpu_smt.h> >> #include <linux/init.h> >> #include <linux/percpu.h> >> +#include <linux/xarray.h> >> >> #include <asm/cpu.h> >> #include <asm/cputype.h> >> @@ -37,17 +39,29 @@ static bool __init acpi_cpu_is_threaded(int cpu) >> return !!is_threaded; >> } >> >> +struct cpu_smt_info { >> + int thread_num; >> + int core_id; >> + int cpu; >> +}; >> + >> /* >> * Propagate the topology information of the processor_topology_node tree to the >> * cpu_topology array. 
>> */ >> int __init parse_acpi_topology(void) >> { >> + int max_smt_thread_num = 1; >> + struct cpu_smt_info *entry; >> + struct xarray hetero_cpu; >> + unsigned long hetero_id; >> int cpu, topology_id; >> >> if (acpi_disabled) >> return 0; >> >> + xa_init(&hetero_cpu); >> + >> for_each_possible_cpu(cpu) { >> topology_id = find_acpi_cpu_topology(cpu, 0); >> if (topology_id < 0) >> @@ -57,6 +71,30 @@ int __init parse_acpi_topology(void) >> cpu_topology[cpu].thread_id = topology_id; >> topology_id = find_acpi_cpu_topology(cpu, 1); >> cpu_topology[cpu].core_id = topology_id; >> + >> + /* >> + * Build up the XArray using the heterogeneous ID of >> + * the CPU cluster. Store the CPU and SMT information >> + * of the first appeared CPU in the CPU cluster of this >> + * heterogeneous ID since the SMT information should be >> + * the same in this CPU cluster. Then we can know the >> + * SMT information of each heterogeneous CPUs in the >> + * system. >> + */ >> + hetero_id = find_acpi_cpu_topology_hetero_id(cpu); >> + entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id); >> + if (!entry) { >> + entry = kzalloc(sizeof(*entry), GFP_KERNEL); >> + WARN_ON(!entry); >> + >> + entry->cpu = cpu; >> + entry->core_id = topology_id; >> + entry->thread_num = 1; >> + xa_store(&hetero_cpu, hetero_id, >> + entry, GFP_KERNEL); >> + } else if (entry->core_id == topology_id) { >> + entry->thread_num++; >> + } >> } else { >> cpu_topology[cpu].thread_id = -1; >> cpu_topology[cpu].core_id = topology_id; >> @@ -67,6 +105,28 @@ int __init parse_acpi_topology(void) >> cpu_topology[cpu].package_id = topology_id; >> } >> >> + /* >> + * This should be a short loop depending on the number of heterogeneous >> + * CPU clusters. Typically on a homogeneous system there's only one >> + * entry in the XArray. 
>> + */ >> + xa_for_each(&hetero_cpu, hetero_id, entry) { >> + if (entry->thread_num == 1) >> + continue; > > If a platform has CPUs with: > - 1 thread > - X (!= 1) threads > Then I think that the asymmetry is not detected Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the system. For this case I was thinking we don't need to handle this since there's only one kind of SMT core in the system, control should works fine for the SMT CPU clusters and we may not care about the CPUs with no SMT. Below code should handle the case if we initialize the max_smt_thread_num to 0. I also reword the warning messages to match the fact. For heterogeneous SMT topology we still support control SMT by on/off toggle but not fully support setting the thread number. int max_smt_thread_num = 0; [...] /* * This should be a short loop depending on the number of heterogeneous * CPU clusters. Typically on a homogeneous system there's only one * entry in the XArray. */ xa_for_each(&hetero_cpu, hetero_id, entry) { /* * If max_smt_thread_num has been initialized and doesn't match * the thread number of this entry, then the system has * heterogeneous SMT topology. */ if (entry->thread_num != max_smt_thread_num && max_smt_thread_num) pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n"); if (entry->thread_num > max_smt_thread_num) max_smt_thread_num = entry->thread_num; xa_erase(&hetero_cpu, hetero_id); kfree(entry); } Thanks. > >> + >> + if (entry->thread_num != max_smt_thread_num && >> + max_smt_thread_num != 1) >> + pr_warn("Heterogeneous SMT topology not handled"); >> + >> + if (entry->thread_num > max_smt_thread_num) >> + max_smt_thread_num = entry->thread_num; >> + >> + xa_erase(&hetero_cpu, hetero_id); >> + kfree(entry); >> + } >> + >> + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); >> + xa_destroy(&hetero_cpu); >> return 0; >> } >> #endif > > .
Hi Yicong, On Thu, Sep 05, 2024 at 08:02:20PM +0800, Yicong Yang wrote: > On 2024/9/5 16:34, Pierre Gondois wrote: > > Hello Yicong, > > > > If a platform has CPUs with: > > - 1 thread > > - X (!= 1) threads > > Then I think that the asymmetry is not detected > > Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the > system. For this case I was thinking we don't need to handle this since there's only one kind > of SMT core in the system, control should works fine for the SMT CPU clusters and we may not > care about the CPUs with no SMT. > > Below code should handle the case if we initialize the max_smt_thread_num to 0. I also > reword the warning messages to match the fact. For heterogeneous SMT topology we still > support control SMT by on/off toggle but not fully support setting the thread number. > > int max_smt_thread_num = 0; > [...] > /* > * This should be a short loop depending on the number of heterogeneous > * CPU clusters. Typically on a homogeneous system there's only one > * entry in the XArray. > */ > xa_for_each(&hetero_cpu, hetero_id, entry) { > /* > * If max_smt_thread_num has been initialized and doesn't match > * the thread number of this entry, then the system has > * heterogeneous SMT topology. > */ > if (entry->thread_num != max_smt_thread_num && max_smt_thread_num) > pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n"); What does 'partly supported' mean here? If the SMT control doesn't work as intended for this topology, I don't think it should be enabled for it. Morten
On 2024/9/6 15:06, Morten Rasmussen wrote: > Hi Yicong, > > On Thu, Sep 05, 2024 at 08:02:20PM +0800, Yicong Yang wrote: >> On 2024/9/5 16:34, Pierre Gondois wrote: >>> Hello Yicong, >>> >>> If a platform has CPUs with: >>> - 1 thread >>> - X (!= 1) threads >>> Then I think that the asymmetry is not detected >> >> Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the >> system. For this case I was thinking we don't need to handle this since there's only one kind >> of SMT core in the system, control should works fine for the SMT CPU clusters and we may not >> care about the CPUs with no SMT. >> >> Below code should handle the case if we initialize the max_smt_thread_num to 0. I also >> reword the warning messages to match the fact. For heterogeneous SMT topology we still >> support control SMT by on/off toggle but not fully support setting the thread number. >> >> int max_smt_thread_num = 0; >> [...] >> /* >> * This should be a short loop depending on the number of heterogeneous >> * CPU clusters. Typically on a homogeneous system there's only one >> * entry in the XArray. >> */ >> xa_for_each(&hetero_cpu, hetero_id, entry) { >> /* >> * If max_smt_thread_num has been initialized and doesn't match >> * the thread number of this entry, then the system has >> * heterogeneous SMT topology. >> */ >> if (entry->thread_num != max_smt_thread_num && max_smt_thread_num) >> pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n"); > > What does 'partly supported' mean here? > > If the SMT control doesn't work as intended for this topology, I don't > think it should be enabled for it. > The /sys/devices/system/cpu/smt/control supports 2 kind of controls [1] (a) enable/disable SMT entirely by writing on/off (b) enable/disable SMT partially by writing a valid thread number (CONFIG_SMT_NUM_THREADS_DYNAMIC) We assume 3 kind of SMT topology: 1. 
homogeneous SMT topology: either all the CPUs support SMT or none do 2.1 heterogeneous SMT topology: some CPU clusters have SMT and others do not. Clusters that support SMT have the same SMT thread number 2.2 heterogeneous SMT topology: some CPU clusters have SMT and the thread number may differ (e.g. cluster 1 is SMT 2 and cluster 2 is SMT 4; not sure there's such a real system) For any of the above SMT topologies, control (a) should work as expected. When enabling SMT by writing "on", SMT is enabled for those CPU clusters that have SMT. The same applies to disabling SMT by writing "off". But control (b) may not work for case 2.2 since the SMT thread number is not the same across the system. For this series alone we won't meet this since CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled. So control (b) only supports writing 1/max_thread, which behaves the same as writing off/on and will work as intended for all 3 cases. As discussed, Pierre will add support for CONFIG_SMT_NUM_THREADS_DYNAMIC since ThunderX2 is a symmetric SMT4 machine and CONFIG_SMT_NUM_THREADS_DYNAMIC would be useful. We thought a warning would be useful when running on a system of case 2.2. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/ABI/testing/sysfs-devices-system-cpu#n542 Thanks.
On Fri, Sep 06, 2024 at 04:36:30PM +0800, Yicong Yang wrote: > On 2024/9/6 15:06, Morten Rasmussen wrote: > > Hi Yicong, > > > > On Thu, Sep 05, 2024 at 08:02:20PM +0800, Yicong Yang wrote: > >> On 2024/9/5 16:34, Pierre Gondois wrote: > >>> Hello Yicong, > >>> > >>> If a platform has CPUs with: > >>> - 1 thread > >>> - X (!= 1) threads > >>> Then I think that the asymmetry is not detected > >> > >> Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the > >> system. For this case I was thinking we don't need to handle this since there's only one kind > >> of SMT core in the system, control should works fine for the SMT CPU clusters and we may not > >> care about the CPUs with no SMT. > >> > >> Below code should handle the case if we initialize the max_smt_thread_num to 0. I also > >> reword the warning messages to match the fact. For heterogeneous SMT topology we still > >> support control SMT by on/off toggle but not fully support setting the thread number. > >> > >> int max_smt_thread_num = 0; > >> [...] > >> /* > >> * This should be a short loop depending on the number of heterogeneous > >> * CPU clusters. Typically on a homogeneous system there's only one > >> * entry in the XArray. > >> */ > >> xa_for_each(&hetero_cpu, hetero_id, entry) { > >> /* > >> * If max_smt_thread_num has been initialized and doesn't match > >> * the thread number of this entry, then the system has > >> * heterogeneous SMT topology. > >> */ > >> if (entry->thread_num != max_smt_thread_num && max_smt_thread_num) > >> pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n"); > > > > What does 'partly supported' mean here? > > > > If the SMT control doesn't work as intended for this topology, I don't > > think it should be enabled for it. 
> > > > The /sys/devices/system/cpu/smt/control supports 2 kind of controls [1] > (a) enable/disable SMT entirely by writing on/off > (b) enable/disable SMT partially by writing a valid thread number (CONFIG_SMT_NUM_THREADS_DYNAMIC) > > We assume 3 kind of SMT topology: > 1. homogeneous SMT topology, all the CPUs support SMT or not > 2.1 heterogeneous SMT topology, part of CPU clusters have SMT and others not. Clusters support SMT > have the same SMT thread number > 2.2 heterogeneous SMT topology, part of CPU clusters have SMT and the thread number may differs > (e.g. cluster 1 is of SMT 2 and cluster 2 is of SMT 4. not sure there's such a real system) > > For any of above SMT topology, control (a) should work as expected. When enabling SMT by writing "on" > the SMT is disabled for those CPU clusters who have SMT. Same for disabling SMT by writing "off". > But control (b) may not work for case 2.2 since the SMT thread number is not the same across > the system. > > For this series alone we won't met this since CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled. > So control (b) only supports write 1/max_thread which behaves same as write off/on and will > work as intended for all the 3 cases. As discussed Pierre will add support for > CONFIG_SMT_NUM_THREADS_DYNAMIC since thunderX2 is a symmetric SMT4 machine and > CONFIG_SMT_NUM_THREADS_DYNAMIC would be useful. We thought a warning should be useful > if running on a system of case 2.2. Thanks for explaining the situation. So IIUC, for case 2.2 there will be _no_ failures if someone writes a value different from 1 or max_threads? The SMT control code can handle that max_threads isn't the correct number of threads for all cores in the system?
Hello, On Thu, Sep 12, 2024 at 01:59:16PM +0200, Morten Rasmussen wrote: > On Fri, Sep 06, 2024 at 04:36:30PM +0800, Yicong Yang wrote: > > On 2024/9/6 15:06, Morten Rasmussen wrote: > > > Hi Yicong, > > > > > > On Thu, Sep 05, 2024 at 08:02:20PM +0800, Yicong Yang wrote: > > >> On 2024/9/5 16:34, Pierre Gondois wrote: > > >>> Hello Yicong, > > >>> > > >>> If a platform has CPUs with: > > >>> - 1 thread > > >>> - X (!= 1) threads > > >>> Then I think that the asymmetry is not detected > > >> > > >> Ah ok, I only handle the case where there are several thread numbers except no SMT CPUs in the > > >> system. For this case I was thinking we don't need to handle this since there's only one kind > > >> of SMT core in the system, control should works fine for the SMT CPU clusters and we may not > > >> care about the CPUs with no SMT. > > >> > > >> Below code should handle the case if we initialize the max_smt_thread_num to 0. I also > > >> reword the warning messages to match the fact. For heterogeneous SMT topology we still > > >> support control SMT by on/off toggle but not fully support setting the thread number. > > >> > > >> int max_smt_thread_num = 0; > > >> [...] > > >> /* > > >> * This should be a short loop depending on the number of heterogeneous > > >> * CPU clusters. Typically on a homogeneous system there's only one > > >> * entry in the XArray. > > >> */ > > >> xa_for_each(&hetero_cpu, hetero_id, entry) { > > >> /* > > >> * If max_smt_thread_num has been initialized and doesn't match > > >> * the thread number of this entry, then the system has > > >> * heterogeneous SMT topology. > > >> */ > > >> if (entry->thread_num != max_smt_thread_num && max_smt_thread_num) > > >> pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n"); > > > > > > What does 'partly supported' mean here? > > > > > > If the SMT control doesn't work as intended for this topology, I don't > > > think it should be enabled for it. 
> > > > > > > The /sys/devices/system/cpu/smt/control supports 2 kind of controls [1] > > (a) enable/disable SMT entirely by writing on/off > > (b) enable/disable SMT partially by writing a valid thread number (CONFIG_SMT_NUM_THREADS_DYNAMIC) > > > > We assume 3 kind of SMT topology: > > 1. homogeneous SMT topology, all the CPUs support SMT or not > > 2.1 heterogeneous SMT topology, part of CPU clusters have SMT and others not. Clusters support SMT > > have the same SMT thread number > > 2.2 heterogeneous SMT topology, part of CPU clusters have SMT and the thread number may differs > > (e.g. cluster 1 is of SMT 2 and cluster 2 is of SMT 4. not sure there's such a real system) > > > > For any of above SMT topology, control (a) should work as expected. When enabling SMT by writing "on" > > the SMT is disabled for those CPU clusters who have SMT. Same for disabling SMT by writing "off". > > But control (b) may not work for case 2.2 since the SMT thread number is not the same across > > the system. > > > > For this series alone we won't met this since CONFIG_SMT_NUM_THREADS_DYNAMIC is not enabled. > > So control (b) only supports write 1/max_thread which behaves same as write off/on and will > > work as intended for all the 3 cases. As discussed Pierre will add support for > > CONFIG_SMT_NUM_THREADS_DYNAMIC since thunderX2 is a symmetric SMT4 machine and > > CONFIG_SMT_NUM_THREADS_DYNAMIC would be useful. We thought a warning should be useful > > if running on a system of case 2.2. > > Thanks for explaining the situation. > > So IIUC, for case 2.2 there will be _no_ failures if someone writes a > value different from 1 or max_threads? > > The SMT control code can handle that max_threads isn't the correct > number of threads for all cores in the system? Hello, I suppose a number can be interpreted as 'up to this number'. 
Whether that's what the user wanted is another question; on a hypothetical heterogeneous system with different numbers of threads in different CPUs it is questionable what this achieves. Arguably, once such a system is found and the desired configurations are understood, this can be expanded to handle them. Thanks Michal
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 1a2c72f3e7f8..f72e1e55b05e 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -15,8 +15,10 @@ #include <linux/arch_topology.h> #include <linux/cacheinfo.h> #include <linux/cpufreq.h> +#include <linux/cpu_smt.h> #include <linux/init.h> #include <linux/percpu.h> +#include <linux/xarray.h> #include <asm/cpu.h> #include <asm/cputype.h> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu) */ int __init parse_acpi_topology(void) { + int thread_num, max_smt_thread_num = 1; + struct xarray core_threads; int cpu, topology_id; + void *entry; if (acpi_disabled) return 0; + xa_init(&core_threads); + for_each_possible_cpu(cpu) { topology_id = find_acpi_cpu_topology(cpu, 0); if (topology_id < 0) @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].thread_id = topology_id; topology_id = find_acpi_cpu_topology(cpu, 1); cpu_topology[cpu].core_id = topology_id; + + entry = xa_load(&core_threads, topology_id); + if (!entry) { + xa_store(&core_threads, topology_id, + xa_mk_value(1), GFP_KERNEL); + } else { + thread_num = xa_to_value(entry); + thread_num++; + xa_store(&core_threads, topology_id, + xa_mk_value(thread_num), GFP_KERNEL); + + if (thread_num > max_smt_thread_num) + max_smt_thread_num = thread_num; + } } else { cpu_topology[cpu].thread_id = -1; cpu_topology[cpu].core_id = topology_id; @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].package_id = topology_id; } + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + + xa_destroy(&core_threads); return 0; } #endif