Message ID | 20211215145633.5238-7-dwmw2@infradead.org
---|---
State | New, archived
Series | Parallel CPU bringup for x86_64
On 12/15/21 8:56 AM, David Woodhouse wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
> ...
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index d8b3ebd2bb85..0249212e23d2 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -25,6 +25,7 @@
>  #include <asm/export.h>
>  #include <asm/nospec-branch.h>
>  #include <asm/fixmap.h>
> +#include <asm/smp.h>
>  
>  /*
>   * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
> @@ -176,6 +177,64 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
>  1:
>  	UNWIND_HINT_EMPTY
>  
> +	/*
> +	 * Is this the boot CPU coming up? If so everything is available
> +	 * in initial_gs, initial_stack and early_gdt_descr.
> +	 */
> +	movl	smpboot_control(%rip), %eax
> +	testl	%eax, %eax
> +	jz	.Lsetup_cpu
> +
> +	/*
> +	 * Secondary CPUs find out the offsets via the APIC ID. For parallel
> +	 * boot the APIC ID is retrieved from CPUID, otherwise it's encoded
> +	 * in smpboot_control:
> +	 * Bit 0-15	APICID if STARTUP_USE_CPUID_0B is not set
> +	 * Bit 16	Secondary boot flag
> +	 * Bit 17	Parallel boot flag
> +	 */
> +	testl	$STARTUP_USE_CPUID_0B, %eax
> +	jz	.Lsetup_AP
> +
> +	mov	$0x0B, %eax
> +	xorl	%ecx, %ecx
> +	cpuid

This will break an SEV-ES guest because CPUID will generate a #VC and a
#VC handler has not been established yet.

I guess for now, you can probably just not enable parallel startup for
SEV-ES guests.

Thanks,
Tom

> +	mov	%edx, %eax
> +
> +.Lsetup_AP:
> +	/* EAX contains the APICID of the current CPU */
> +	andl	$0xFFFF, %eax
> +	xorl	%ecx, %ecx
> +	leaq	cpuid_to_apicid(%rip), %rbx
> +
> +.Lfind_cpunr:
> +	cmpl	(%rbx), %eax
> +	jz	.Linit_cpu_data
> +	addq	$4, %rbx
> +	addq	$8, %rcx
> +	jmp	.Lfind_cpunr
> +
> +.Linit_cpu_data:
> +	/* Get the per cpu offset */
> +	leaq	__per_cpu_offset(%rip), %rbx
> +	addq	%rcx, %rbx
> +	movq	(%rbx), %rbx
> +	/* Save it for GS BASE setup */
> +	movq	%rbx, initial_gs(%rip)
> +
> +	/* Calculate the GDT address */
> +	movq	$gdt_page, %rcx
> +	addq	%rbx, %rcx
> +	movq	%rcx, early_gdt_descr_base(%rip)
> +
> +	/* Find the idle task stack */
> +	movq	$idle_threads, %rcx
> +	addq	%rbx, %rcx
> +	movq	(%rcx), %rcx
> +	movq	TASK_threadsp(%rcx), %rcx
> +	movq	%rcx, initial_stack(%rip)
> +
> +.Lsetup_cpu:
>  	/*
>  	 * We must switch to a new descriptor in kernel space for the GDT
>  	 * because soon the kernel won't have access anymore to the userspace
> @@ -216,6 +275,14 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
>  	 */
>  	movq initial_stack(%rip), %rsp
>  
> +	/* Drop the realmode protection. For the boot CPU the pointer is NULL! */
> +	movq	trampoline_lock(%rip), %rax
> +	testq	%rax, %rax
> +	jz	.Lsetup_idt
> +	lock
> +	btrl	$0, (%rax)
> +
> +.Lsetup_idt:
>  	/* Setup and Load IDT */
>  	pushq	%rsi
>  	call	early_setup_idt
> @@ -347,6 +414,7 @@ SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
>   * reliably detect the end of the stack.
>   */
>  SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
> +SYM_DATA(trampoline_lock, .quad 0);
>  __FINITDATA
>  
>  	__INIT
> @@ -572,6 +640,9 @@ SYM_DATA_END(level1_fixmap_pgt)
>  SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
>  SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
>  
> +	.align 16
> +SYM_DATA(smpboot_control,	.long 0)
> +
>  	.align 16
>  /* This must match the first entry in level2_kernel_pgt */
>  SYM_DATA(phys_base, .quad 0x0)
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 7a763b84b6e5..1e38d44c3603 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1104,9 +1104,19 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
>  	unsigned long boot_error = 0;
>  
>  	idle->thread.sp = (unsigned long)task_pt_regs(idle);
> -	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
>  	initial_code = (unsigned long)start_secondary;
> -	initial_stack = idle->thread.sp;
> +
> +	if (IS_ENABLED(CONFIG_X86_32)) {
> +		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
> +		initial_stack = idle->thread.sp;
> +	} else if (boot_cpu_data.cpuid_level < 0x0B) {
> +		/* Anything with X2APIC should have CPUID leaf 0x0B */
> +		if (WARN_ON_ONCE(x2apic_mode) && apicid > 0xffff)
> +			return -EIO;
> +		smpboot_control = apicid | STARTUP_USE_APICID;
> +	} else {
> +		smpboot_control = STARTUP_USE_CPUID_0B;
> +	}
>  
>  	/* Enable the espfix hack for this CPU */
>  	init_espfix_ap(cpu);
> diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
> index 4a3da7592b99..7dc2e817bd02 100644
> --- a/arch/x86/realmode/init.c
> +++ b/arch/x86/realmode/init.c
> @@ -127,6 +127,9 @@ static void __init setup_real_mode(void)
>  
>  	trampoline_header->flags = 0;
>  
> +	trampoline_lock = &trampoline_header->lock;
> +	*trampoline_lock = 0;
> +
>  	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
>  	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
>  	trampoline_pgd[511] = init_top_pgt[511].pgd;
> diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
> index cc8391f86cdb..12a540904e80 100644
> --- a/arch/x86/realmode/rm/trampoline_64.S
> +++ b/arch/x86/realmode/rm/trampoline_64.S
> @@ -49,6 +49,19 @@ SYM_CODE_START(trampoline_start)
>  	mov	%ax, %es
>  	mov	%ax, %ss
>  
> +	/*
> +	 * Make sure only one CPU fiddles with the realmode stack
> +	 */
> +.Llock_rm:
> +	btl	$0, tr_lock
> +	jnc	2f
> +	pause
> +	jmp	.Llock_rm
> +2:
> +	lock
> +	btsl	$0, tr_lock
> +	jc	.Llock_rm
> +
>  	# Setup stack
>  	movl	$rm_stack_end, %esp
>  
> @@ -192,6 +205,7 @@ SYM_DATA_START(trampoline_header)
>  	SYM_DATA(tr_efer,		.space 8)
>  	SYM_DATA(tr_cr4,		.space 4)
>  	SYM_DATA(tr_flags,		.space 4)
> +	SYM_DATA(tr_lock,		.space 4)
>  SYM_DATA_END(trampoline_header)
>  
>  #include "trampoline_common.S"
> diff --git a/kernel/smpboot.c b/kernel/smpboot.c
> index f6bc0bc8a2aa..934e64ff4eed 100644
> --- a/kernel/smpboot.c
> +++ b/kernel/smpboot.c
> @@ -25,7 +25,7 @@
>   * For the hotplug case we keep the task structs around and reuse
>   * them.
>   */
> -static DEFINE_PER_CPU(struct task_struct *, idle_threads);
> +DEFINE_PER_CPU(struct task_struct *, idle_threads);
>  
>  struct task_struct *idle_thread_get(unsigned int cpu)
>  {
> 
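For reference, the AP path quoted above amounts to the following lookup, shown here as a rough C sketch. This is illustrative only: lookup_cpu_data() is a made-up name, and the real code runs in assembly before any C environment exists.

```c
/*
 * Rough C equivalent of the .Lsetup_AP/.Lfind_cpunr/.Linit_cpu_data
 * sequence in the patch above. Not real kernel code.
 */
static void lookup_cpu_data(unsigned int apicid)
{
	unsigned long per_cpu_base;
	unsigned int cpu;

	/* .Lfind_cpunr: linear scan of cpuid_to_apicid[] for our APIC ID */
	for (cpu = 0; cpuid_to_apicid[cpu] != apicid; cpu++)
		;

	/* .Linit_cpu_data: everything derives from the per-CPU offset */
	per_cpu_base = __per_cpu_offset[cpu];

	initial_gs = per_cpu_base;			/* GS BASE */
	early_gdt_descr_base = (unsigned long)&gdt_page + per_cpu_base;
	initial_stack = per_cpu(idle_threads, cpu)->thread.sp;
}
```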
On Thu, 2021-12-16 at 08:24 -0600, Tom Lendacky wrote:
> This will break an SEV-ES guest because CPUID will generate a #VC and a
> #VC handler has not been established yet.
>
> I guess for now, you can probably just not enable parallel startup for
> SEV-ES guests.

OK, thanks. I'll expand it to allow 24 bits of (physical) APIC ID then,
since it's no longer limited to CPUs without X2APIC. Then we can
refrain from doing parallel bringup for SEV-ES guests, as you suggest.

What precisely is the check I should be using for that?
On 12/16/21 12:24 PM, David Woodhouse wrote:
> On Thu, 2021-12-16 at 08:24 -0600, Tom Lendacky wrote:
>
>> This will break an SEV-ES guest because CPUID will generate a #VC and a
>> #VC handler has not been established yet.
>>
>> I guess for now, you can probably just not enable parallel startup for
>> SEV-ES guests.
>
> OK, thanks. I'll expand it to allow 24 bits of (physical) APIC ID then,
> since it's no longer limited to CPUs without X2APIC. Then we can
> refrain from doing parallel bringup for SEV-ES guests, as you suggest.
>
> What precisely is the check I should be using for that?

Calling cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT) will return true for
an SEV-ES guest.

Thanks,
Tom
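To make the failure mode concrete: the trampoline's CPUID path boils down to the query below, and under SEV-ES that very instruction raises a #VC before any handler exists. A sketch using the compiler's cpuid.h helper, purely for illustration (the kernel code is the inline assembly quoted earlier):

```c
#include <cpuid.h>	/* GCC/Clang helper; illustration only, not kernel code */

/* CPUID leaf 0x0B, subleaf 0: EDX returns the current CPU's x2APIC ID. */
static unsigned int read_x2apic_id(void)
{
	unsigned int eax, ebx, ecx, edx;

	__cpuid_count(0x0B, 0, eax, ebx, ecx, edx);
	return edx;	/* under SEV-ES, the CPUID itself traps with #VC */
}
```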
On Thu, 2021-12-16 at 13:00 -0600, Tom Lendacky wrote:
> On 12/16/21 12:24 PM, David Woodhouse wrote:
> > On Thu, 2021-12-16 at 08:24 -0600, Tom Lendacky wrote:
> >
> > > This will break an SEV-ES guest because CPUID will generate a #VC and a
> > > #VC handler has not been established yet.
> > >
> > > I guess for now, you can probably just not enable parallel startup for
> > > SEV-ES guests.
> >
> > OK, thanks. I'll expand it to allow 24 bits of (physical) APIC ID then,
> > since it's no longer limited to CPUs without X2APIC. Then we can
> > refrain from doing parallel bringup for SEV-ES guests, as you suggest.
> >
> > What precisely is the check I should be using for that?
>
> Calling cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT) will return true for
> an SEV-ES guest.

Thanks. Incremental patch (which I'll roll into Thomas's patch) looks a
bit like this. Testing it now...

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 0b6012fd3e55..1ac33ce1d60e 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -199,7 +199,6 @@ extern unsigned int smpboot_control;
 #endif /* !__ASSEMBLY__ */
 
 /* Control bits for startup_64 */
-#define STARTUP_USE_APICID	0x10000
-#define STARTUP_USE_CPUID_0B	0x20000
+#define STARTUP_PARALLEL	0x80000000
 
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0249212e23d2..3e4c3c416bce 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -189,11 +189,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	 * Secondary CPUs find out the offsets via the APIC ID. For parallel
 	 * boot the APIC ID is retrieved from CPUID, otherwise it's encoded
 	 * in smpboot_control:
-	 * Bit 0-15	APICID if STARTUP_USE_CPUID_0B is not set
-	 * Bit 16	Secondary boot flag
-	 * Bit 17	Parallel boot flag
+	 * Bit 0-30	APIC ID if STARTUP_PARALLEL is not set
+	 * Bit 31	Parallel boot flag (use CPUID leaf 0x0b for APIC ID).
 	 */
-	testl	$STARTUP_USE_CPUID_0B, %eax
+	testl	$STARTUP_PARALLEL, %eax
 	jz	.Lsetup_AP
 
 	mov	$0x0B, %eax
@@ -203,7 +202,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 
 .Lsetup_AP:
 	/* EAX contains the APICID of the current CPU */
-	andl	$0xFFFF, %eax
 	xorl	%ecx, %ecx
 	leaq	cpuid_to_apicid(%rip), %rbx
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 725fede281ac..acfb22ce8d4f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1125,13 +1125,10 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	if (IS_ENABLED(CONFIG_X86_32)) {
 		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 		initial_stack = idle->thread.sp;
-	} else if (boot_cpu_data.cpuid_level < 0x0B) {
-		/* Anything with X2APIC should have CPUID leaf 0x0B */
-		if (WARN_ON_ONCE(x2apic_mode) && apicid > 0xffff)
-			return -EIO;
-		smpboot_control = apicid | STARTUP_USE_APICID;
+	} else if (do_parallel_bringup) {
+		smpboot_control = STARTUP_PARALLEL;
 	} else {
-		smpboot_control = STARTUP_USE_CPUID_0B;
+		smpboot_control = apicid;
 	}
 
 	/* Enable the espfix hack for this CPU */
@@ -1553,9 +1550,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
 	/*
 	 * We can do 64-bit AP bringup in parallel if the CPU reports its
-	 * APIC ID in CPUID leaf 0x0B. Otherwise it's too hard.
+	 * APIC ID in CPUID leaf 0x0B. Otherwise it's too hard. And not
+	 * for SEV-ES guests because they can't use CPUID that early.
 	 */
-	if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 0x0B)
+	if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 0x0B ||
+	    cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
 		do_parallel_bringup = false;
 
 	if (do_parallel_bringup)
On Thu, 2021-12-16 at 19:20 +0000, David Woodhouse wrote:
> On Thu, 2021-12-16 at 13:00 -0600, Tom Lendacky wrote:
> > On 12/16/21 12:24 PM, David Woodhouse wrote:
> > > On Thu, 2021-12-16 at 08:24 -0600, Tom Lendacky wrote:
> > >
> > > > This will break an SEV-ES guest because CPUID will generate a #VC and a
> > > > #VC handler has not been established yet.
> > > >
> > > > I guess for now, you can probably just not enable parallel startup for
> > > > SEV-ES guests.
> > >
> > > OK, thanks. I'll expand it to allow 24 bits of (physical) APIC ID then,
> > > since it's no longer limited to CPUs without X2APIC. Then we can
> > > refrain from doing parallel bringup for SEV-ES guests, as you suggest.
> > >
> > > What precisely is the check I should be using for that?
> >
> > Calling cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT) will return true for
> > an SEV-ES guest.
>
> Thanks. Incremental patch (which I'll roll into Thomas's patch) looks a
> bit like this. Testing it now...

Further inspection shows I really did want a bit to indicate that this
is a secondary AP startup, which Thomas had documented as such in the
comments in head_64.S but then actually called STARTUP_USE_APICID.

Otherwise the special case of smpboot_control==zero for startup of the
BSP, which uses the pre-existing initial_gs etc., might also get
invoked in the rare case that an AP has APIC ID #0.

So we really do need Sean's fix to do the masking in the right place,
which I had 'fixed' by removing that mask altogether. And we also need
Sean's fix to stop scribbling on initial_gs when each AP will calculate
it for itself anyway.

I've rebased and pushed to
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-5.17

I'll do some more testing and repost the series during next week. The
win is slightly more modest than the original patch sets because it now
only parallelises x86/cpu:kick. I'm going to do more careful review and
testing before doing the same for x86/cpu:wait-init in a later series.
You can see that coming together in the git tree but I'm only going to
post up to the 'Serialise topology updates' patch again for now.

The only real change is this patch; perhaps now we've fixed it Thomas
will provide a Signed-off-by for it? :)

Now looks like this...

From 888741f787a2e59b1471f15177c1ba981d06ad04 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 12 Feb 2021 18:30:28 +0100
Subject: [PATCH v3.1 6/9] x86/smpboot: Support parallel startup of secondary CPUs

To allow for parallel AP bringup, we need to avoid the use of global
variables for passing information to the APs, as well as preventing them
from all trying to use the same real-mode stack simultaneously.

So, introduce a 'lock' field in struct trampoline_header to use as a
simple bit-spinlock for the real-mode stack. That lock also protects
the global variables initial_gs, initial_stack and early_gdt_descr,
which can now be calculated...

So how do we calculate those addresses? Well, they can all be found
from the per_cpu data for this CPU. Simples! Except... how does it know
what its CPU# is? OK, we export the cpuid_to_apicid[] array and it can
search it to find its APIC ID in there.

But now you whine at me that it doesn't even know its APIC ID? Well, if
it's a relatively modern CPU then the APIC ID is in CPUID leaf 0x0B so
we can use that. Otherwise... erm... OK, otherwise it can't have
parallel CPU bringup for now. We'll still use a global variable for
those CPUs and bring them up one at a time.

So add a global 'smpboot_control' field which either contains the APIC
ID, or a flag indicating that it can be found in CPUID.

This adds the 'do_parallel_bringup' flag in preparation but doesn't
actually enable parallel bringup yet.

[ dwmw2: Minor tweaks, write a commit message ]
[ seanc: Fix stray override of initial_gs in common_cpu_up() ]
Not-signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 arch/x86/include/asm/realmode.h      |  3 ++
 arch/x86/include/asm/smp.h           |  9 +++-
 arch/x86/kernel/acpi/sleep.c         |  1 +
 arch/x86/kernel/apic/apic.c          |  2 +-
 arch/x86/kernel/head_64.S            | 73 ++++++++++++++++++++++++++++
 arch/x86/kernel/smpboot.c            | 32 ++++++++++--
 arch/x86/realmode/init.c             |  3 ++
 arch/x86/realmode/rm/trampoline_64.S | 14 ++++++
 kernel/smpboot.c                     |  2 +-
 9 files changed, 132 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 331474b150f1..1693bc834163 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -51,6 +51,7 @@ struct trampoline_header {
 	u64 efer;
 	u32 cr4;
 	u32 flags;
+	u32 lock;
 #endif
 };
 
@@ -64,6 +65,8 @@ extern unsigned long initial_stack;
 extern unsigned long initial_vc_handler;
 #endif
 
+extern u32 *trampoline_lock;
+
 extern unsigned char real_mode_blob[];
 extern unsigned char real_mode_relocs[];
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81a0211a372d..4fe1320c2e8d 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -196,5 +196,12 @@ extern void nmi_selftest(void);
 #define nmi_selftest() do { } while (0)
 #endif
 
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
+/* Control bits for startup_64 */
+#define STARTUP_PARALLEL	0x80000000
+#define STARTUP_SECONDARY	0x40000000
+
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 1e97f944b47d..4f26cc9346ac 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -114,6 +114,7 @@ int x86_acpi_suspend_lowlevel(void)
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
 	initial_gs = per_cpu_offset(smp_processor_id());
+	smpboot_control = 0;
 #endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0L;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b70344bf6600..5b20e051d84c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2335,7 +2335,7 @@ static int nr_logical_cpuids = 1;
 /*
  * Used to store mapping between logical CPU IDs and APIC IDs.
  */
-static int cpuid_to_apicid[] = {
+int cpuid_to_apicid[] = {
 	[0 ... NR_CPUS - 1] = -1,
 };
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9c63fc5988cd..b0d8c9fffc73 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -25,6 +25,7 @@
 #include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/fixmap.h>
+#include <asm/smp.h>
 
 /*
  * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -193,6 +194,66 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 1:
 	UNWIND_HINT_EMPTY
 
+	/*
+	 * Is this the boot CPU coming up? If so everything is available
+	 * in initial_gs, initial_stack and early_gdt_descr.
+	 */
+	movl	smpboot_control(%rip), %eax
+	testl	%eax, %eax
+	jz	.Lsetup_cpu
+
+	/*
+	 * Secondary CPUs find out the offsets via the APIC ID. For parallel
+	 * boot the APIC ID is retrieved from CPUID, otherwise it's encoded
+	 * in smpboot_control:
+	 * Bit 0-29	APIC ID if STARTUP_PARALLEL flag is not set
+	 * Bit 30	STARTUP_SECONDARY flag
+	 * Bit 31	STARTUP_PARALLEL flag (use CPUID 0x0b for APIC ID)
+	 */
+	testl	$STARTUP_PARALLEL, %eax
+	jnz	.Luse_cpuid_0b
+	andl	$0x0FFFFFFF, %eax
+	jmp	.Lsetup_AP
+
+.Luse_cpuid_0b:
+	mov	$0x0B, %eax
+	xorl	%ecx, %ecx
+	cpuid
+	mov	%edx, %eax
+
+.Lsetup_AP:
+	/* EAX contains the APICID of the current CPU */
+	xorl	%ecx, %ecx
+	leaq	cpuid_to_apicid(%rip), %rbx
+
+.Lfind_cpunr:
+	cmpl	(%rbx), %eax
+	jz	.Linit_cpu_data
+	addq	$4, %rbx
+	addq	$8, %rcx
+	jmp	.Lfind_cpunr
+
+.Linit_cpu_data:
+	/* Get the per cpu offset */
+	leaq	__per_cpu_offset(%rip), %rbx
+	addq	%rcx, %rbx
+	movq	(%rbx), %rbx
+	/* Save it for GS BASE setup */
+	movq	%rbx, initial_gs(%rip)
+
+	/* Calculate the GDT address */
+	movq	$gdt_page, %rcx
+	addq	%rbx, %rcx
+	movq	%rcx, early_gdt_descr_base(%rip)
+
+	/* Find the idle task stack */
+	movq	$idle_threads, %rcx
+	addq	%rbx, %rcx
+	movq	(%rcx), %rcx
+	movq	TASK_threadsp(%rcx), %rcx
+	movq	%rcx, initial_stack(%rip)
+
+.Lsetup_cpu:
 	/*
 	 * We must switch to a new descriptor in kernel space for the GDT
 	 * because soon the kernel won't have access anymore to the userspace
@@ -233,6 +294,14 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	 */
 	movq initial_stack(%rip), %rsp
 
+	/* Drop the realmode protection. For the boot CPU the pointer is NULL! */
+	movq	trampoline_lock(%rip), %rax
+	testq	%rax, %rax
+	jz	.Lsetup_idt
+	lock
+	btrl	$0, (%rax)
+
+.Lsetup_idt:
 	/* Setup and Load IDT */
 	pushq	%rsi
 	call	early_setup_idt
@@ -364,6 +433,7 @@ SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
  * reliably detect the end of the stack.
  */
 SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
+SYM_DATA(trampoline_lock, .quad 0);
 __FINITDATA
 
 	__INIT
@@ -589,6 +659,9 @@ SYM_DATA_END(level1_fixmap_pgt)
 SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
 SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
 
+	.align 16
+SYM_DATA(smpboot_control,	.long 0)
+
 	.align 16
 /* This must match the first entry in level2_kernel_pgt */
 SYM_DATA(phys_base, .quad 0x0)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 38c5d65a568d..e060bbd79cc2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -808,6 +808,16 @@ static int __init cpu_init_udelay(char *str)
 }
 early_param("cpu_init_udelay", cpu_init_udelay);
 
+static bool do_parallel_bringup = true;
+
+static int __init no_parallel_bringup(char *str)
+{
+	do_parallel_bringup = false;
+
+	return 0;
+}
+early_param("no_parallel_bringup", no_parallel_bringup);
+
 static void __init smp_quirk_init_udelay(void)
 {
 	/* if cmdline changed it from default, leave it alone */
@@ -1095,8 +1105,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
-#else
-	initial_gs = per_cpu_offset(cpu);
 #endif
 	return 0;
 }
@@ -1115,9 +1123,16 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	unsigned long boot_error = 0;
 
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
-	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 	initial_code = (unsigned long)start_secondary;
-	initial_stack = idle->thread.sp;
+
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+		initial_stack = idle->thread.sp;
+	} else if (do_parallel_bringup) {
+		smpboot_control = STARTUP_SECONDARY | STARTUP_PARALLEL;
+	} else {
+		smpboot_control = STARTUP_SECONDARY | apicid;
+	}
 
 	/* Enable the espfix hack for this CPU */
 	init_espfix_ap(cpu);
@@ -1516,6 +1531,15 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
 	smp_quirk_init_udelay();
 	speculative_store_bypass_ht_init();
+
+	/*
+	 * We can do 64-bit AP bringup in parallel if the CPU reports its
+	 * APIC ID in CPUID leaf 0x0B. Otherwise it's too hard. And not
+	 * for SEV-ES guests because they can't use CPUID that early.
+	 */
+	if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 0x0B ||
+	    cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+		do_parallel_bringup = false;
 }
 
 void arch_thaw_secondary_cpus_begin(void)
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index c5e29db02a46..21b9e8b55618 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -154,6 +154,9 @@ static void __init setup_real_mode(void)
 
 	trampoline_header->flags = 0;
 
+	trampoline_lock = &trampoline_header->lock;
+	*trampoline_lock = 0;
+
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 
 	/* Map the real mode stub as virtual == physical */
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index cc8391f86cdb..12a540904e80 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -49,6 +49,19 @@ SYM_CODE_START(trampoline_start)
 	mov	%ax, %es
 	mov	%ax, %ss
 
+	/*
+	 * Make sure only one CPU fiddles with the realmode stack
+	 */
+.Llock_rm:
+	btl	$0, tr_lock
+	jnc	2f
+	pause
+	jmp	.Llock_rm
+2:
+	lock
+	btsl	$0, tr_lock
+	jc	.Llock_rm
+
 	# Setup stack
 	movl	$rm_stack_end, %esp
 
@@ -192,6 +205,7 @@ SYM_DATA_START(trampoline_header)
 	SYM_DATA(tr_efer,		.space 8)
 	SYM_DATA(tr_cr4,		.space 4)
 	SYM_DATA(tr_flags,		.space 4)
+	SYM_DATA(tr_lock,		.space 4)
 SYM_DATA_END(trampoline_header)
 
 #include "trampoline_common.S"
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f6bc0bc8a2aa..934e64ff4eed 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -25,7 +25,7 @@
  * For the hotplug case we keep the task structs around and reuse
  * them.
  */
-static DEFINE_PER_CPU(struct task_struct *, idle_threads);
+DEFINE_PER_CPU(struct task_struct *, idle_threads);
 
 struct task_struct *idle_thread_get(unsigned int cpu)
 {
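In C terms, the tr_lock handshake between the 16-bit trampoline and secondary_startup_64 in the patch above is a one-bit spinlock. A sketch of the protocol using C11 atomics, for readers who prefer it to the btl/btsl/btrl assembly (the function names here are invented; the real code is the asm in the patch):

```c
#include <stdatomic.h>

static _Atomic unsigned int tr_lock;	/* trampoline_header->lock */

/* trampoline_start, .Llock_rm: taken before touching the realmode stack */
static void realmode_stack_lock(void)
{
	for (;;) {
		/* cheap read-only test first, like the unlocked btl + pause */
		while (atomic_load(&tr_lock) & 1)
			;
		/* lock btsl: atomically set bit 0, old value decides success */
		if (!(atomic_fetch_or(&tr_lock, 1) & 1))
			return;
	}
}

/* secondary_startup_64: dropped once the AP runs on its own stack */
static void realmode_stack_unlock(_Atomic unsigned int *lock)
{
	if (lock)				/* NULL for the boot CPU */
		atomic_fetch_and(lock, ~1u);	/* lock btrl */
}
```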
On Sat, Jan 29, 2022 at 12:04:19PM +0000, David Woodhouse wrote:
> I've rebased and pushed to
> https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-5.17
>
> I'll do some more testing and repost the series during next week. The
> win is slightly more modest than the original patch sets because it now
> only parallelises x86/cpu:kick. I'm going to do more careful review and
> testing before doing the same for x86/cpu:wait-init in a later series.
> You can see that coming together in the git tree but I'm only going to
> post up to the 'Serialise topology updates' patch again for now.

Btw, Mr. Cooper points out a very important aspect and I don't know
whether you've verified this already or whether this is not affected
by your series ... yet. In any case it should be checked: microcode
loading.

See __reload_late() and all that dance we do to keep SMT siblings
doing nothing at the same time while updating microcode.

With the current boot order, the APs should all do nothing so they
won't need that sync for early loading - load_ucode_{ap,bsp} - but I
don't know if you're changing that order with the parallel startup.

If you do, you'll probably need such syncing for the early loading
too...
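For readers who haven't seen it, the "dance" is roughly a rendezvous in which one SMT sibling per core applies the update while the others spin quietly. A heavily simplified sketch of that idea; apply_on_this_core() and update_done() are placeholders, and this is not the literal __reload_late() code:

```c
static atomic_t rendezvous = ATOMIC_INIT(0);

/* Simplified sketch of the late-load rendezvous; illustration only. */
static int reload_late_on_cpu(void *unused)
{
	int cpu = smp_processor_id();

	/* wait until every online CPU has arrived here */
	atomic_inc(&rendezvous);
	while (atomic_read(&rendezvous) < num_online_cpus())
		cpu_relax();

	if (cpu == cpumask_first(topology_sibling_cpumask(cpu)))
		apply_on_this_core();	/* placeholder: write the microcode */
	else
		while (!update_done())	/* placeholder: sibling stays idle */
			cpu_relax();

	return 0;
}
```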
On Mon, 2022-01-31 at 14:59 +0100, Borislav Petkov wrote:
> On Sat, Jan 29, 2022 at 12:04:19PM +0000, David Woodhouse wrote:
> > I've rebased and pushed to
> > https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-5.17
> >
> > I'll do some more testing and repost the series during next week. The
> > win is slightly more modest than the original patch sets because it now
> > only parallelises x86/cpu:kick. I'm going to do more careful review and
> > testing before doing the same for x86/cpu:wait-init in a later series.
> > You can see that coming together in the git tree but I'm only going to
> > post up to the 'Serialise topology updates' patch again for now.
>
> Btw, Mr. Cooper points out a very important aspect and I don't know
> whether you've verified this already or whether this is not affected
> by your series ... yet. In any case it should be checked: microcode
> loading.
>
> See __reload_late() and all that dance we do to keep SMT siblings
> doing nothing at the same time while updating microcode.
>
> With the current boot order, the APs should all do nothing so they
> won't need that sync for early loading - load_ucode_{ap,bsp} - but I
> don't know if you're changing that order with the parallel startup.
>
> If you do, you'll probably need such syncing for the early loading
> too...

Thanks. It looks like that is only invoked after boot, with a write to
/sys/devices/system/cpu/microcode/reload.

My series is only parallelising the initial bringup at boot time, so it
shouldn't make any difference.

However... it does look like there's nothing preventing a sibling being
brought online *while* the dance you mention above is occurring.
Shouldn't __reload_late() take the device_hotplug_lock to prevent that?
On Tue, Feb 01, 2022 at 10:25:01AM +0000, David Woodhouse wrote:
> Thanks. It looks like that is only invoked after boot, with a write to
> /sys/devices/system/cpu/microcode/reload.
>
> My series is only parallelising the initial bringup at boot time, so it
> shouldn't make any difference.

No, I don't mean __reload_late() - I pointed you at that function to
show the dance we must do when updating microcode late.

The load_ucode_{ap,bsp}() routines are what is called when loading
ucode early.

So the question is, does the parallelizing change the order in which
APs are brought up and can it happen that a SMT sibling of a two-SMT
core executes *something* while the other SMT sibling is updating
microcode.

If so, that would be bad.

> However... it does look like there's nothing preventing a sibling being
> brought online *while* the dance you mention above is occurring.

Bottom line is: of the two SMT siblings, one needs to be updating
microcode while the other is idle. I.e., what __reload_late() does.

> Shouldn't __reload_late() take the device_hotplug_lock to prevent that?

See reload_store().
On Tue, 2022-02-01 at 11:56 +0100, Borislav Petkov wrote:
> On Tue, Feb 01, 2022 at 10:25:01AM +0000, David Woodhouse wrote:
> > Thanks. It looks like that is only invoked after boot, with a write to
> > /sys/devices/system/cpu/microcode/reload.
> >
> > My series is only parallelising the initial bringup at boot time, so it
> > shouldn't make any difference.
>
> No, I don't mean __reload_late() - I pointed you at that function to
> show the dance we must do when updating microcode late.
>
> The load_ucode_{ap,bsp}() routines are what is called when loading
> ucode early.
>
> So the question is, does the parallelizing change the order in which
> APs are brought up and can it happen that a SMT sibling of a two-SMT
> core executes *something* while the other SMT sibling is updating
> microcode.
>
> If so, that would be bad.

Right. So as you surmise, I haven't broken that... yet. At least not in
the patches I've posted :)

The call to ucode_cpu_init() is in cpu_init(), right after the call to
wait_for_master_cpu(), which sets this AP's bit in cpu_initialized_mask
and then waits for the BSP to set its bit in cpu_callout_mask.

That's a full synchronization point with do_wait_cpu_initialized() on
the BSP, which waits for the former and then sets the latter.

So... with the series I've posted, all APs end up waiting in
wait_for_master_cpu() until the final serialized bringup.

In the top of my git tree, you can see a half-baked 'parallel part 2'
commit which introduces a new x86/cpu:wait-init cpuhp state that would
invoke do_wait_cpu_initialized() for each CPU in turn, which *would*
release them all into load_ucode_bsp() at the same time and have
precisely the problem you're describing.

I'll commit a FIXME comment now so that it doesn't slip my mind. Thanks.

> > However... it does look like there's nothing preventing a sibling being
> > brought online *while* the dance you mention above is occurring.
>
> Bottom line is: of the two SMT siblings, one needs to be updating
> microcode while the other is idle. I.e., what __reload_late() does.
>
> > Shouldn't __reload_late() take the device_hotplug_lock to prevent that?
>
> See reload_store().

Hm, not sure I see how that's protecting itself from someone
simultaneously echoing 1 > /sys/devices/system/cpu/cpu${SIBLING}/online
On Tue, Feb 01, 2022 at 12:39:17PM +0000, David Woodhouse wrote:
> In the top of my git tree, you can see a half-baked 'parallel part 2'
> commit which introduces a new x86/cpu:wait-init cpuhp state that would
> invoke do_wait_cpu_initialized() for each CPU in turn, which *would*
> release them all into load_ucode_bsp() at the same time and have
> precisely the problem you're describing.

The load_ucode_bsp() is the variant that runs on the boot CPU but
yeah...

> I'll commit a FIXME comment now so that it doesn't slip my mind.

Yap, thank Cooper for pointing out that whole thing about how microcode
loading is special and can't always handle parallelism. :)

> Hm, not sure I see how that's protecting itself from someone
> simultaneously echoing 1 > /sys/devices/system/cpu/cpu${SIBLING}/online

So

  echo 1 > ../online

means onlining the sibling.

But reload_store() grabs the CPU hotplug lock *first* and *then* runs
check_online_cpus() to see if all CPUs are online. It doesn't do the
update if even one CPU is missing. You can't offline any CPU for the
duration of the update...

So I guess you'd need to explain in more detail what protection hole
you're seeing because I might be missing something here.

Thx.
On Tue, 2022-02-01 at 13:56 +0100, Borislav Petkov wrote:
> On Tue, Feb 01, 2022 at 12:39:17PM +0000, David Woodhouse wrote:
> > In the top of my git tree, you can see a half-baked 'parallel part 2'
> > commit which introduces a new x86/cpu:wait-init cpuhp state that would
> > invoke do_wait_cpu_initialized() for each CPU in turn, which *would*
> > release them all into load_ucode_bsp() at the same time and have
> > precisely the problem you're describing.
>
> The load_ucode_bsp() is the variant that runs on the boot CPU but
> yeah...

Right. Brain not fully online today. Sorry.

> > Hm, not sure I see how that's protecting itself from someone
> > simultaneously echoing 1 > /sys/devices/system/cpu/cpu${SIBLING}/online
>
> So
>
>   echo 1 > ../online
>
> means onlining the sibling.
>
> But reload_store() grabs the CPU hotplug lock *first* and *then* runs
> check_online_cpus() to see if all CPUs are online. It doesn't do the
> update if even one CPU is missing. You can't offline any CPU for the
> duration of the update...
>
> So I guess you'd need to explain in more detail what protection hole
> you're seeing because I might be missing something here.

No, I'd just missed cpus_read_lock() because I was looking for
something else. My fault; it looks fine. Thanks.
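The synchronization point discussed in this subthread boils down to a two-cpumask handshake between each AP and the BSP. Paraphrased in C below as a sketch of the protocol, assuming the mainline cpu_initialized_mask/cpu_callout_mask scheme; these are not the literal kernel function bodies:

```c
/* AP side, early in cpu_init(), just before ucode_cpu_init() */
static void wait_for_master_cpu(int cpu)
{
	cpumask_set_cpu(cpu, cpu_initialized_mask);	/* "I got this far" */
	while (!cpumask_test_cpu(cpu, cpu_callout_mask))
		cpu_relax();				/* park until released */
}

/* BSP side, one CPU at a time in the serialized bringup */
static void do_wait_cpu_initialized(int cpu)
{
	while (!cpumask_test_cpu(cpu, cpu_initialized_mask))
		cpu_relax();
	cpumask_set_cpu(cpu, cpu_callout_mask);		/* release this AP */
}
```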
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 5db5d083c873..e1cc4bc746bc 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -51,6 +51,7 @@ struct trampoline_header {
 	u64 efer;
 	u32 cr4;
 	u32 flags;
+	u32 lock;
 #endif
 };
 
@@ -64,6 +65,8 @@ extern unsigned long initial_stack;
 extern unsigned long initial_vc_handler;
 #endif
 
+extern u32 *trampoline_lock;
+
 extern unsigned char real_mode_blob[];
 extern unsigned char real_mode_relocs[];
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81a0211a372d..ca807c29dc34 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -196,5 +196,12 @@ extern void nmi_selftest(void);
 #define nmi_selftest() do { } while (0)
 #endif
 
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
+/* Control bits for startup_64 */
+#define STARTUP_USE_APICID	0x10000
+#define STARTUP_USE_CPUID_0B	0x20000
+
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3f85fcae450c..9598ebf4f9d6 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -114,6 +114,7 @@ int x86_acpi_suspend_lowlevel(void)
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
 	initial_gs = per_cpu_offset(smp_processor_id());
+	smpboot_control = 0;
 #endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0L;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b70344bf6600..5b20e051d84c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2335,7 +2335,7 @@ static int nr_logical_cpuids = 1;
 /*
  * Used to store mapping between logical CPU IDs and APIC IDs.
  */
-static int cpuid_to_apicid[] = {
+int cpuid_to_apicid[] = {
 	[0 ... NR_CPUS - 1] = -1,
 };
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d8b3ebd2bb85..0249212e23d2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -25,6 +25,7 @@
 #include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/fixmap.h>
+#include <asm/smp.h>
 
 /*
  * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -176,6 +177,64 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 1:
 	UNWIND_HINT_EMPTY
 
+	/*
+	 * Is this the boot CPU coming up? If so everything is available
+	 * in initial_gs, initial_stack and early_gdt_descr.
+	 */
+	movl	smpboot_control(%rip), %eax
+	testl	%eax, %eax
+	jz	.Lsetup_cpu
+
+	/*
+	 * Secondary CPUs find out the offsets via the APIC ID. For parallel
+	 * boot the APIC ID is retrieved from CPUID, otherwise it's encoded
+	 * in smpboot_control:
+	 * Bit 0-15	APICID if STARTUP_USE_CPUID_0B is not set
+	 * Bit 16	Secondary boot flag
+	 * Bit 17	Parallel boot flag
+	 */
+	testl	$STARTUP_USE_CPUID_0B, %eax
+	jz	.Lsetup_AP
+
+	mov	$0x0B, %eax
+	xorl	%ecx, %ecx
+	cpuid
+	mov	%edx, %eax
+
+.Lsetup_AP:
+	/* EAX contains the APICID of the current CPU */
+	andl	$0xFFFF, %eax
+	xorl	%ecx, %ecx
+	leaq	cpuid_to_apicid(%rip), %rbx
+
+.Lfind_cpunr:
+	cmpl	(%rbx), %eax
+	jz	.Linit_cpu_data
+	addq	$4, %rbx
+	addq	$8, %rcx
+	jmp	.Lfind_cpunr
+
+.Linit_cpu_data:
+	/* Get the per cpu offset */
+	leaq	__per_cpu_offset(%rip), %rbx
+	addq	%rcx, %rbx
+	movq	(%rbx), %rbx
+	/* Save it for GS BASE setup */
+	movq	%rbx, initial_gs(%rip)
+
+	/* Calculate the GDT address */
+	movq	$gdt_page, %rcx
+	addq	%rbx, %rcx
+	movq	%rcx, early_gdt_descr_base(%rip)
+
+	/* Find the idle task stack */
+	movq	$idle_threads, %rcx
+	addq	%rbx, %rcx
+	movq	(%rcx), %rcx
+	movq	TASK_threadsp(%rcx), %rcx
+	movq	%rcx, initial_stack(%rip)
+
+.Lsetup_cpu:
 	/*
 	 * We must switch to a new descriptor in kernel space for the GDT
 	 * because soon the kernel won't have access anymore to the userspace
@@ -216,6 +275,14 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	 */
 	movq initial_stack(%rip), %rsp
 
+	/* Drop the realmode protection. For the boot CPU the pointer is NULL! */
+	movq	trampoline_lock(%rip), %rax
+	testq	%rax, %rax
+	jz	.Lsetup_idt
+	lock
+	btrl	$0, (%rax)
+
+.Lsetup_idt:
 	/* Setup and Load IDT */
 	pushq	%rsi
 	call	early_setup_idt
@@ -347,6 +414,7 @@ SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
  * reliably detect the end of the stack.
  */
 SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
+SYM_DATA(trampoline_lock, .quad 0);
 __FINITDATA
 
 	__INIT
@@ -572,6 +640,9 @@ SYM_DATA_END(level1_fixmap_pgt)
 SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
 SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
 
+	.align 16
+SYM_DATA(smpboot_control,	.long 0)
+
 	.align 16
 /* This must match the first entry in level2_kernel_pgt */
 SYM_DATA(phys_base, .quad 0x0)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7a763b84b6e5..1e38d44c3603 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1104,9 +1104,19 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	unsigned long boot_error = 0;
 
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
-	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 	initial_code = (unsigned long)start_secondary;
-	initial_stack = idle->thread.sp;
+
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+		initial_stack = idle->thread.sp;
+	} else if (boot_cpu_data.cpuid_level < 0x0B) {
+		/* Anything with X2APIC should have CPUID leaf 0x0B */
+		if (WARN_ON_ONCE(x2apic_mode) && apicid > 0xffff)
+			return -EIO;
+		smpboot_control = apicid | STARTUP_USE_APICID;
+	} else {
+		smpboot_control = STARTUP_USE_CPUID_0B;
+	}
 
 	/* Enable the espfix hack for this CPU */
 	init_espfix_ap(cpu);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 4a3da7592b99..7dc2e817bd02 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -127,6 +127,9 @@ static void __init setup_real_mode(void)
 
 	trampoline_header->flags = 0;
 
+	trampoline_lock = &trampoline_header->lock;
+	*trampoline_lock = 0;
+
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
 	trampoline_pgd[511] = init_top_pgt[511].pgd;
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index cc8391f86cdb..12a540904e80 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -49,6 +49,19 @@ SYM_CODE_START(trampoline_start)
 	mov	%ax, %es
 	mov	%ax, %ss
 
+	/*
+	 * Make sure only one CPU fiddles with the realmode stack
+	 */
+.Llock_rm:
+	btl	$0, tr_lock
+	jnc	2f
+	pause
+	jmp	.Llock_rm
+2:
+	lock
+	btsl	$0, tr_lock
+	jc	.Llock_rm
+
 	# Setup stack
 	movl	$rm_stack_end, %esp
 
@@ -192,6 +205,7 @@ SYM_DATA_START(trampoline_header)
 	SYM_DATA(tr_efer,		.space 8)
 	SYM_DATA(tr_cr4,		.space 4)
 	SYM_DATA(tr_flags,		.space 4)
+	SYM_DATA(tr_lock,		.space 4)
 SYM_DATA_END(trampoline_header)
 
 #include "trampoline_common.S"
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f6bc0bc8a2aa..934e64ff4eed 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -25,7 +25,7 @@
  * For the hotplug case we keep the task structs around and reuse
  * them.
  */
-static DEFINE_PER_CPU(struct task_struct *, idle_threads);
+DEFINE_PER_CPU(struct task_struct *, idle_threads);
 
 struct task_struct *idle_thread_get(unsigned int cpu)
 {