Message ID | 20230508185218.962208640@linutronix.de
---|---
State | Superseded
Series | cpu/hotplug, x86: Reworked parallel CPU bringup
On Mon, May 08, 2023 at 09:44:17PM +0200, Thomas Gleixner wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
>
> Make the primary thread tracking CPU mask based in preparation for simpler
> handling of parallel bootup.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Tested-by: Michael Kelley <mikelley@microsoft.com>
>
> ---
>  arch/x86/include/asm/apic.h     |  2 --
>  arch/x86/include/asm/topology.h | 19 +++++++++++++++----
>  arch/x86/kernel/apic/apic.c     | 20 +++++++++-----------
>  arch/x86/kernel/smpboot.c       | 12 +++---------
>  4 files changed, 27 insertions(+), 26 deletions(-)
> ---
> ...
> @@ -2386,20 +2386,16 @@ bool arch_match_cpu_phys_id(int cpu, u64
>  }
>
>  #ifdef CONFIG_SMP
> -/**
> - * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
> - * @apicid: APIC ID to check
> - */
> -bool apic_id_is_primary_thread(unsigned int apicid)
> +static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
>  {
> -	u32 mask;
> -
> -	if (smp_num_siblings == 1)
> -		return true;
>  	/* Isolate the SMT bit(s) in the APICID and check for 0 */
> -	mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
> -	return !(apicid & mask);
> +	u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
> +
> +	if (smp_num_siblings == 1 || !(apicid & mask))
> +		cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
>  }
> +#else
> +static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
>  #endif
>
>  /*

This patch causes boot regression on TDX guest. The guest crashes on SMP
bring up.

The change makes use of smp_num_siblings earlier than before: the mask
gets constructed in the acpi_boot_init() codepath. Later on,
smp_num_siblings gets updated in detect_ht().

In my setup with 16 vCPUs, smp_num_siblings is 16 before detect_ht() and
set to 1 in detect_ht().
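The arithmetic at the heart of this regression is compact enough to check in isolation. Below is a minimal user-space sketch of the same mask construction (the helper names are invented here, and fls() is open-coded since the kernel helper is not available outside the kernel); with the stale smp_num_siblings of 16 from the report, every APIC ID with a nonzero low nibble is wrongly classified as a secondary thread:

```c
#include <stdio.h>

/* Stand-in for the kernel's fls(): position of the last set bit, 1-based. */
static int fls(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

/* Same logic as cpu_mark_primary_thread() in the quoted patch. */
static int is_primary_thread(unsigned int apicid, unsigned int siblings)
{
	/* Isolate the SMT bit(s) in the APICID and check for 0 */
	unsigned int mask = (1U << (fls(siblings) - 1)) - 1;

	return siblings == 1 || !(apicid & mask);
}

int main(void)
{
	/*
	 * The reported TDX setup: smp_num_siblings reads 16 before
	 * detect_ht() corrects it to 1. With 16 the mask is 0xf, so of
	 * the 16 vCPUs only APIC ID 0 counts as a primary thread; with
	 * the corrected value every vCPU is a primary thread.
	 */
	for (unsigned int apicid = 0; apicid < 4; apicid++)
		printf("apicid %u: early=%d corrected=%d\n", apicid,
		       is_primary_thread(apicid, 16),
		       is_primary_thread(apicid, 1));
	return 0;
}
```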
On Wed, May 24 2023 at 23:48, Kirill A. Shutemov wrote:
> On Mon, May 08, 2023 at 09:44:17PM +0200, Thomas Gleixner wrote:
>>  #ifdef CONFIG_SMP
>> -/**
>> - * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
>> - * @apicid: APIC ID to check
>> - */
>> -bool apic_id_is_primary_thread(unsigned int apicid)
>> +static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
>>  {
>> -	u32 mask;
>> -
>> -	if (smp_num_siblings == 1)
>> -		return true;
>>  	/* Isolate the SMT bit(s) in the APICID and check for 0 */
>> -	mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
>> -	return !(apicid & mask);
>> +	u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
>> +
>> +	if (smp_num_siblings == 1 || !(apicid & mask))
>> +		cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
>>  }
>> +#else
>> +static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
>>  #endif
>>
>>  /*
>
> This patch causes boot regression on TDX guest. The guest crashes on SMP
> bring up.

I'd rather call it a security feature: It makes TDX unbreakably secure.

> The change makes use of smp_num_siblings earlier than before: the mask
> gets constructed in the acpi_boot_init() codepath. Later on,
> smp_num_siblings gets updated in detect_ht().
>
> In my setup with 16 vCPUs, smp_num_siblings is 16 before detect_ht() and
> set to 1 in detect_ht().

early_init_intel(c)
	if (detect_extended_topology_early(c) < 0)
		detect_ht_early(c);

acpi_boot_init()
	....

identify_boot_cpu(c)
	detect_ht(c);

Aaargh. That whole CPU identification code is a complete horrorshow.

I'll have a look....
On Fri, May 26 2023 at 12:14, Thomas Gleixner wrote:
> On Wed, May 24 2023 at 23:48, Kirill A. Shutemov wrote:
>> This patch causes boot regression on TDX guest. The guest crashes on SMP
>> bring up.

The below should fix that. Sigh...

Thanks,

        tglx
----
Subject: x86/smp: Initialize cpu_primary_thread_mask late
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 May 2023 21:38:47 +0200

Marking primary threads in the cpumask during early boot is only correct
in certain configurations, but broken e.g. for the legacy hyperthreading
detection.

This is due to the complete mess in the CPUID evaluation code which
initializes smp_num_siblings only half during early init and fixes it up
later when identify_boot_cpu() is invoked. So using smp_num_siblings
before identify_boot_cpu() leads to incorrect results.

Fixing the early CPU init code to provide the proper data is a larger
scale surgery as the code has dependencies on data structures which are
not initialized during early boot.

Move the initialization of cpu_primary_thread_mask, which depends on
smp_num_siblings being correct, to an early initcall so that it is set up
correctly before SMP bringup.

Fixes: f54d4434c281 ("x86/apic: Provide cpu_primary_thread mask")
Reported-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/apic.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2398,6 +2398,21 @@ static void cpu_mark_primary_thread(unsi
 	if (smp_num_siblings == 1 || !(apicid & mask))
 		cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
 }
+
+/*
+ * Due to the utter mess of CPUID evaluation smp_num_siblings is not valid
+ * during early boot. Initialize the primary thread mask before SMP
+ * bringup.
+ */
+static int __init smp_init_primary_thread_mask(void)
+{
+	unsigned int cpu;
+
+	for (cpu = 0; cpu < nr_logical_cpuids; cpu++)
+		cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]);
+	return 0;
+}
+early_initcall(smp_init_primary_thread_mask);
 #else
 static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
 #endif
@@ -2544,7 +2559,8 @@ int generic_processor_info(int apicid, i
 	set_cpu_present(cpu, true);
 	num_processors++;
 
-	cpu_mark_primary_thread(cpu, apicid);
+	if (system_state >= SYSTEM_BOOTING)
+		cpu_mark_primary_thread(cpu, apicid);
 
 	return cpu;
 }
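Why an early initcall is a safe place for this: early initcalls run well after the boot CPU has been fully identified, but before the secondary CPUs are started. A rough sketch of the relevant ordering (simplified from init/main.c of that era, not verbatim kernel source):

```c
/*
 * Simplified boot ordering relevant to the fix (sketch, not verbatim
 * kernel source):
 *
 * start_kernel()
 *   ...
 *   check_bugs()
 *     identify_boot_cpu()      - detect_ht() corrects smp_num_siblings
 *   ...
 *   rest_init() -> kernel_init() -> kernel_init_freeable()
 *     do_pre_smp_initcalls()   - early_initcall() callbacks run here,
 *                                including smp_init_primary_thread_mask()
 *     smp_init()               - secondary CPU bringup consumes the mask
 */
```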
On Sat, May 27, 2023 at 03:40:02PM +0200, Thomas Gleixner wrote:
> On Fri, May 26 2023 at 12:14, Thomas Gleixner wrote:
> > On Wed, May 24 2023 at 23:48, Kirill A. Shutemov wrote:
> >> This patch causes boot regression on TDX guest. The guest crashes on SMP
> >> bring up.
>
> The below should fix that. Sigh...

Okay, this fixes the boot for TDX guest:

Tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

But it gets broken again on "x86/smpboot: Implement a bit spinlock to
protect the realmode stack" with

[    0.554079] .... node  #0, CPUs:      #1  #2
[    0.738071] Callback from call_rcu_tasks() invoked.
[   10.562065] CPU2 failed to report alive state
[   10.566337]   #3
[   20.570066] CPU3 failed to report alive state
[   20.574268]   #4
...

Notably, CPU1 is missing from the "failed to report" list. So CPU1 takes
the lock fine, but seems to never unlock it.

Maybe trampoline_lock(%rip) in head_64.S somehow is not the same as
&tr_lock in trampoline_64.S. I dunno.

I haven't found the root cause yet. But bypassing the locking in
LOAD_REALMODE_ESP makes the issue go away.

I will look more into it.
On Mon, May 29 2023 at 05:39, Kirill A. Shutemov wrote:
> On Sat, May 27, 2023 at 03:40:02PM +0200, Thomas Gleixner wrote:
> But it gets broken again on "x86/smpboot: Implement a bit spinlock to
> protect the realmode stack" with
>
> [    0.554079] .... node  #0, CPUs:      #1  #2
> [    0.738071] Callback from call_rcu_tasks() invoked.
> [   10.562065] CPU2 failed to report alive state
> [   10.566337]   #3
> [   20.570066] CPU3 failed to report alive state
> [   20.574268]   #4
> ...
>
> Notably, CPU1 is missing from the "failed to report" list. So CPU1 takes
> the lock fine, but seems to never unlock it.
>
> Maybe trampoline_lock(%rip) in head_64.S somehow is not the same as
> &tr_lock in trampoline_64.S. I dunno.

It's definitely the same in the regular startup (16bit mode), but TDX
starts up via:

   trampoline_start64
     trampoline_compat
       LOAD_REALMODE_ESP <- lock

That place cannot work with that LOAD_REALMODE_ESP macro. The untested
below should cure it.

Thanks,

        tglx
---
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -37,12 +37,12 @@
 	.text
 	.code16
 
-.macro LOAD_REALMODE_ESP
+.macro LOAD_REALMODE_ESP lock:req
 	/*
 	 * Make sure only one CPU fiddles with the realmode stack
 	 */
 .Llock_rm\@:
-	lock btsl	$0, tr_lock
+	lock btsl	$0, \lock
 	jnc		2f
 	pause
 	jmp		.Llock_rm\@
@@ -63,7 +63,7 @@ SYM_CODE_START(trampoline_start)
 	mov	%ax, %es
 	mov	%ax, %ss
 
-	LOAD_REALMODE_ESP
+	LOAD_REALMODE_ESP tr_lock
 
 	call	verify_cpu		# Verify the cpu supports long mode
 	testl	%eax, %eax		# Check for return code
@@ -106,7 +106,7 @@ SYM_CODE_START(sev_es_trampoline_start)
 	mov	%ax, %es
 	mov	%ax, %ss
 
-	LOAD_REALMODE_ESP
+	LOAD_REALMODE_ESP tr_lock
 
 	jmp	.Lswitch_to_protected
 SYM_CODE_END(sev_es_trampoline_start)
@@ -189,7 +189,7 @@ SYM_CODE_START(pa_trampoline_compat)
 	 * In compatibility mode. Prep ESP and DX for startup_32, then disable
 	 * paging and complete the switch to legacy 32-bit mode.
 	 */
-	LOAD_REALMODE_ESP
+	LOAD_REALMODE_ESP pa_tr_lock
 	movw	$__KERNEL_DS, %dx
 	movl	$(CR0_STATE & ~X86_CR0_PG), %eax
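For reference, the bit spinlock the trampoline implements with `lock btsl`/`pause` corresponds roughly to the following user-space C sketch (the function names are invented here, and in the real series the unlock side lives elsewhere, once the CPU is off the shared realmode stack):

```c
#include <stdatomic.h>

/* lock btsl $0, <lock>: atomically set bit 0 and spin while it was
 * already set. The PAUSE instruction keeps the spin loop SMT-friendly. */
static void realmode_stack_lock(atomic_uint *lock)
{
	while (atomic_fetch_or_explicit(lock, 1, memory_order_acquire) & 1)
		__builtin_ia32_pause();
}

/* Clearing bit 0 releases the next waiter. */
static void realmode_stack_unlock(atomic_uint *lock)
{
	atomic_store_explicit(lock, 0, memory_order_release);
}

int main(void)
{
	static atomic_uint lock;

	realmode_stack_lock(&lock);	/* any other CPU would now spin */
	realmode_stack_unlock(&lock);
	return 0;
}
```

The macro change above does not alter these semantics; it only lets each entry path name the lock symbol it can actually address (tr_lock from the 16-bit code, pa_tr_lock from the 64-bit compat path TDX enters through).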
On Mon, May 29, 2023 at 09:27:13PM +0200, Thomas Gleixner wrote:
> On Mon, May 29 2023 at 05:39, Kirill A. Shutemov wrote:
> > On Sat, May 27, 2023 at 03:40:02PM +0200, Thomas Gleixner wrote:
> > But it gets broken again on "x86/smpboot: Implement a bit spinlock to
> > protect the realmode stack" with
> >
> > [    0.554079] .... node  #0, CPUs:      #1  #2
> > [    0.738071] Callback from call_rcu_tasks() invoked.
> > [   10.562065] CPU2 failed to report alive state
> > [   10.566337]   #3
> > [   20.570066] CPU3 failed to report alive state
> > [   20.574268]   #4
> > ...
> >
> > Notably, CPU1 is missing from the "failed to report" list. So CPU1 takes
> > the lock fine, but seems to never unlock it.
> >
> > Maybe trampoline_lock(%rip) in head_64.S somehow is not the same as
> > &tr_lock in trampoline_64.S. I dunno.
>
> It's definitely the same in the regular startup (16bit mode), but TDX
> starts up via:
>
>    trampoline_start64
>      trampoline_compat
>        LOAD_REALMODE_ESP <- lock
>
> That place cannot work with that LOAD_REALMODE_ESP macro. The untested
> below should cure it.

Yep, works for me.

Aaand the next patch that breaks TDX boot is... <drum roll>

  x86/smpboot/64: Implement arch_cpuhp_init_parallel_bringup() and enable it

Disabling parallel bringup helps. I didn't look closer yet. If you have
an idea, let me know.
On Mon, May 29, 2023 at 11:31:29PM +0300, Kirill A. Shutemov wrote:
> On Mon, May 29, 2023 at 09:27:13PM +0200, Thomas Gleixner wrote:
> > On Mon, May 29 2023 at 05:39, Kirill A. Shutemov wrote:
> > > On Sat, May 27, 2023 at 03:40:02PM +0200, Thomas Gleixner wrote:
> > > But it gets broken again on "x86/smpboot: Implement a bit spinlock to
> > > protect the realmode stack" with
> > >
> > > [    0.554079] .... node  #0, CPUs:      #1  #2
> > > [    0.738071] Callback from call_rcu_tasks() invoked.
> > > [   10.562065] CPU2 failed to report alive state
> > > [   10.566337]   #3
> > > [   20.570066] CPU3 failed to report alive state
> > > [   20.574268]   #4
> > > ...
> > >
> > > Notably, CPU1 is missing from the "failed to report" list. So CPU1 takes
> > > the lock fine, but seems to never unlock it.
> > >
> > > Maybe trampoline_lock(%rip) in head_64.S somehow is not the same as
> > > &tr_lock in trampoline_64.S. I dunno.
> >
> > It's definitely the same in the regular startup (16bit mode), but TDX
> > starts up via:
> >
> >    trampoline_start64
> >      trampoline_compat
> >        LOAD_REALMODE_ESP <- lock
> >
> > That place cannot work with that LOAD_REALMODE_ESP macro. The untested
> > below should cure it.
>
> Yep, works for me.
>
> Aaand the next patch that breaks TDX boot is... <drum roll>
>
>   x86/smpboot/64: Implement arch_cpuhp_init_parallel_bringup() and enable it
>
> Disabling parallel bringup helps. I didn't look closer yet. If you have
> an idea, let me know.

Okay, it crashes around .Lread_apicid due to touching MSRs that trigger #VE.

Looks like the patch had no intention to enable parallel bringup on TDX:

+	 * Intel-TDX has a secure RDMSR hypercall, but that needs to be
+	 * implemented separately in the low level startup ASM code.

But CC_ATTR_GUEST_STATE_ENCRYPT, which used to filter it out, is an
SEV-ES-specific thingy and doesn't cover TDX. I don't think we have an
attribute that fits nicely here.
On Mon, May 29 2023 at 23:31, Kirill A. Shutemov wrote:
> Aaand the next patch that breaks TDX boot is... <drum roll>
>
>   x86/smpboot/64: Implement arch_cpuhp_init_parallel_bringup() and enable it
>
> Disabling parallel bringup helps. I didn't look closer yet. If you have
> an idea, let me know.

So how does TDX end up with actual parallel bringup?

	if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
		pr_info("Parallel CPU startup disabled due to guest state encryption\n");
		return false;
	}

It should take that path, no?

Thanks,

        tglx
On Tue, May 30 2023 at 03:54, Kirill A. Shutemov wrote:
> On Mon, May 29, 2023 at 11:31:29PM +0300, Kirill A. Shutemov wrote:
>> Disabling parallel bringup helps. I didn't look closer yet. If you have
>> an idea, let me know.
>
> Okay, it crashes around .Lread_apicid due to touching MSRs that trigger #VE.
>
> Looks like the patch had no intention to enable parallel bringup on TDX:
>
> +	 * Intel-TDX has a secure RDMSR hypercall, but that needs to be
> +	 * implemented separately in the low level startup ASM code.
>
> But CC_ATTR_GUEST_STATE_ENCRYPT, which used to filter it out, is an
> SEV-ES-specific thingy and doesn't cover TDX. I don't think we have an
> attribute that fits nicely here.

Bah. That sucks.
On Tue, May 30 2023 at 11:26, Thomas Gleixner wrote:
> On Tue, May 30 2023 at 03:54, Kirill A. Shutemov wrote:
>> On Mon, May 29, 2023 at 11:31:29PM +0300, Kirill A. Shutemov wrote:
>>> Disabling parallel bringup helps. I didn't look closer yet. If you have
>>> an idea, let me know.
>>
>> Okay, it crashes around .Lread_apicid due to touching MSRs that trigger #VE.
>>
>> Looks like the patch had no intention to enable parallel bringup on TDX:
>>
>> +	 * Intel-TDX has a secure RDMSR hypercall, but that needs to be
>> +	 * implemented separately in the low level startup ASM code.
>>
>> But CC_ATTR_GUEST_STATE_ENCRYPT, which used to filter it out, is an
>> SEV-ES-specific thingy and doesn't cover TDX. I don't think we have an
>> attribute that fits nicely here.
>
> Bah. That sucks.

Can we have something consistent in this CC space, or does everything need
to be extra magic per CC variant?
On Tue, May 30, 2023 at 12:34:45PM +0200, Thomas Gleixner wrote:
> On Tue, May 30 2023 at 11:26, Thomas Gleixner wrote:
> > On Tue, May 30 2023 at 03:54, Kirill A. Shutemov wrote:
> >> On Mon, May 29, 2023 at 11:31:29PM +0300, Kirill A. Shutemov wrote:
> >>> Disabling parallel bringup helps. I didn't look closer yet. If you have
> >>> an idea, let me know.
> >>
> >> Okay, it crashes around .Lread_apicid due to touching MSRs that trigger #VE.
> >>
> >> Looks like the patch had no intention to enable parallel bringup on TDX:
> >>
> >> +	 * Intel-TDX has a secure RDMSR hypercall, but that needs to be
> >> +	 * implemented separately in the low level startup ASM code.
> >>
> >> But CC_ATTR_GUEST_STATE_ENCRYPT, which used to filter it out, is an
> >> SEV-ES-specific thingy and doesn't cover TDX. I don't think we have an
> >> attribute that fits nicely here.
> >
> > Bah. That sucks.
>
> Can we have something consistent in this CC space, or does everything need
> to be extra magic per CC variant?

IIUC, CC_ATTR_GUEST_MEM_ENCRYPT should cover all AMD SEV flavours and
Intel TDX. But the name is confusing in this context: memory encryption
has nothing to do with the APIC.
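The thread ends here without showing the final resolution. The direction Kirill sketches would amount to widening the quoted guard, roughly as below — a sketch only, under the assumption stated above that CC_ATTR_GUEST_MEM_ENCRYPT is set for all SEV flavours and for TDX; the wrapper function name is invented for illustration:

```c
#include <linux/cc_platform.h>
#include <linux/printk.h>

/*
 * Sketch: replace the SEV-ES-only CC_ATTR_GUEST_STATE_ENCRYPT test with
 * CC_ATTR_GUEST_MEM_ENCRYPT, which covers all AMD SEV flavours and Intel
 * TDX -- at the cost of the attribute's name saying "memory encryption"
 * when the real issue is that the APIC/MSR accesses in the early bringup
 * path trap (#VC/#VE) in such guests.
 */
static bool __init parallel_bringup_supported(void)
{
	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		pr_info("Parallel CPU startup disabled due to guest state encryption\n");
		return false;
	}
	return true;
}
```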
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -506,10 +506,8 @@ extern int default_check_phys_apicid_pre
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_SMP
-bool apic_id_is_primary_thread(unsigned int id);
 void apic_smt_update(void);
 #else
-static inline bool apic_id_is_primary_thread(unsigned int id) { return false; }
 static inline void apic_smt_update(void) { }
 #endif
 
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -31,9 +31,9 @@
  * CONFIG_NUMA.
  */
 #include <linux/numa.h>
+#include <linux/cpumask.h>
 
 #ifdef CONFIG_NUMA
-#include <linux/cpumask.h>
 
 #include <asm/mpspec.h>
 #include <asm/percpu.h>
@@ -139,9 +139,20 @@ static inline int topology_max_smt_threa
 int topology_update_package_map(unsigned int apicid, unsigned int cpu);
 int topology_update_die_map(unsigned int dieid, unsigned int cpu);
 int topology_phys_to_logical_pkg(unsigned int pkg);
-bool topology_is_primary_thread(unsigned int cpu);
 bool topology_smt_supported(void);
-#else
+
+extern struct cpumask __cpu_primary_thread_mask;
+#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask)
+
+/**
+ * topology_is_primary_thread - Check whether CPU is the primary SMT thread
+ * @cpu:	CPU to check
+ */
+static inline bool topology_is_primary_thread(unsigned int cpu)
+{
+	return cpumask_test_cpu(cpu, cpu_primary_thread_mask);
+}
+#else /* CONFIG_SMP */
 #define topology_max_packages()			(1)
 static inline int
 topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
@@ -152,7 +163,7 @@ static inline int topology_max_die_per_p
 static inline int topology_max_smt_threads(void) { return 1; }
 static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
 static inline bool topology_smt_supported(void) { return false; }
-#endif
+#endif /* !CONFIG_SMP */
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
 {
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2386,20 +2386,16 @@ bool arch_match_cpu_phys_id(int cpu, u64
 }
 
 #ifdef CONFIG_SMP
-/**
- * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
- * @apicid: APIC ID to check
- */
-bool apic_id_is_primary_thread(unsigned int apicid)
+static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
 {
-	u32 mask;
-
-	if (smp_num_siblings == 1)
-		return true;
 	/* Isolate the SMT bit(s) in the APICID and check for 0 */
-	mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
-	return !(apicid & mask);
+	u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
+
+	if (smp_num_siblings == 1 || !(apicid & mask))
+		cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
 }
+#else
+static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
 #endif
 
 /*
@@ -2544,6 +2540,8 @@ int generic_processor_info(int apicid, i
 	set_cpu_present(cpu, true);
 	num_processors++;
 
+	cpu_mark_primary_thread(cpu, apicid);
+
 	return cpu;
 }
 
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -102,6 +102,9 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
+/* CPUs which are the primary SMT threads */
+struct cpumask __cpu_primary_thread_mask __read_mostly;
+
 /* Representing CPUs for which sibling maps can be computed */
 static cpumask_var_t cpu_sibling_setup_mask;
 
@@ -283,15 +286,6 @@ static void notrace start_secondary(void
 }
 
 /**
- * topology_is_primary_thread - Check whether CPU is the primary SMT thread
- * @cpu:	CPU to check
- */
-bool topology_is_primary_thread(unsigned int cpu)
-{
-	return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
-}
-
-/**
  * topology_smt_supported - Check whether SMT is supported by the CPUs
  */
 bool topology_smt_supported(void)