
[v3,6/9] x86/smpboot: Support parallel startup of secondary CPUs

Message ID 20211215145633.5238-7-dwmw2@infradead.org
State New, archived
Series Parallel CPU bringup for x86_64

Commit Message

David Woodhouse Dec. 15, 2021, 2:56 p.m. UTC
From: Thomas Gleixner <tglx@linutronix.de>

To allow for parallel AP bringup, we need to avoid the use of global
variables for passing information to the APs, as well as preventing them
from all trying to use the same real-mode stack simultaneously.

So, introduce a 'lock' field in struct trampoline_header to use as a
simple bit-spinlock for the real-mode stack. That lock also protects
the global variables initial_gs, initial_stack and early_gdt_descr,
which can now be calculated...

So how do we calculate those addresses? Well, they can all be found
from the per_cpu data for this CPU. Simples! Except... how does it know
what its CPU# is? OK, we export the cpuid_to_apicid[] array and it can
search it to find its APIC ID in there.

But now you whine at me that it doesn't even know its APIC ID? Well, if
it's a relatively modern CPU then the APIC ID is in CPUID leaf 0x0B so
we can use that. Otherwise... erm... OK, otherwise it can't have parallel
CPU bringup for now. We'll still use a global variable for those CPUs and
bring them up one at a time.

So add a global 'smpboot_control' field which either contains the APIC
ID, or a flag indicating that it can be found in CPUID.
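
In C terms, the lookup that the new secondary_startup_64 code performs
is roughly the following. This is only an illustrative sketch (the real
code is assembly running before the normal kernel C environment exists,
and TASK_threadsp is the asm-offsets constant for thread.sp):

	/* Sketch of the AP-side calculation in head_64.S, not real kernel C */
	unsigned int apicid, cpu;
	unsigned long offset;

	if (smpboot_control & STARTUP_USE_CPUID_0B)
		apicid = cpuid_edx(0x0b);	/* (x2)APIC ID from CPUID leaf 0x0B */
	else
		apicid = smpboot_control & 0xffff;

	for (cpu = 0; cpuid_to_apicid[cpu] != apicid; cpu++)
		;				/* linear search for our CPU number */

	offset = __per_cpu_offset[cpu];
	initial_gs = offset;			/* becomes this CPU's GS base */
	early_gdt_descr_base = offset + (unsigned long)&gdt_page;
	initial_stack = per_cpu(idle_threads, cpu)->thread.sp;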

[ dwmw2: Minor tweaks, write a commit message ]
Not-signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 arch/x86/include/asm/realmode.h      |  3 ++
 arch/x86/include/asm/smp.h           |  9 +++-
 arch/x86/kernel/acpi/sleep.c         |  1 +
 arch/x86/kernel/apic/apic.c          |  2 +-
 arch/x86/kernel/head_64.S            | 71 ++++++++++++++++++++++++++++
 arch/x86/kernel/smpboot.c            | 14 +++++-
 arch/x86/realmode/init.c             |  3 ++
 arch/x86/realmode/rm/trampoline_64.S | 14 ++++++
 kernel/smpboot.c                     |  2 +-
 9 files changed, 114 insertions(+), 5 deletions(-)

Comments

Tom Lendacky Dec. 16, 2021, 2:24 p.m. UTC | #1
On 12/15/21 8:56 AM, David Woodhouse wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
> 
...
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index d8b3ebd2bb85..0249212e23d2 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -25,6 +25,7 @@
>   #include <asm/export.h>
>   #include <asm/nospec-branch.h>
>   #include <asm/fixmap.h>
> +#include <asm/smp.h>
>   
>   /*
>    * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
> @@ -176,6 +177,64 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
>   1:
>   	UNWIND_HINT_EMPTY
>   
> +	/*
> +	 * Is this the boot CPU coming up? If so everything is available
> +	 * in initial_gs, initial_stack and early_gdt_descr.
> +	 */
> +	movl	smpboot_control(%rip), %eax
> +	testl	%eax, %eax
> +	jz	.Lsetup_cpu
> +
> +	/*
> +	 * Secondary CPUs find out the offsets via the APIC ID. For parallel
> +	 * boot the APIC ID is retrieved from CPUID, otherwise it's encoded
> +	 * in smpboot_control:
> +	 * Bit 0-15	APICID if STARTUP_USE_CPUID_0B is not set
> +	 * Bit 16 	Secondary boot flag
> +	 * Bit 17	Parallel boot flag
> +	 */
> +	testl	$STARTUP_USE_CPUID_0B, %eax
> +	jz	.Lsetup_AP
> +
> +	mov	$0x0B, %eax
> +	xorl	%ecx, %ecx
> +	cpuid

This will break an SEV-ES guest because CPUID will generate a #VC and a 
#VC handler has not been established yet.

I guess for now, you can probably just not enable parallel startup for 
SEV-ES guests.

Thanks,
Tom


> +	mov	%edx, %eax
> +
> +.Lsetup_AP:
> +	/* EAX contains the APICID of the current CPU */
> +	andl	$0xFFFF, %eax
> +	xorl	%ecx, %ecx
> +	leaq	cpuid_to_apicid(%rip), %rbx
> +
> +.Lfind_cpunr:
> +	cmpl	(%rbx), %eax
> +	jz	.Linit_cpu_data
> +	addq	$4, %rbx
> +	addq	$8, %rcx
> +	jmp	.Lfind_cpunr
> +
> +.Linit_cpu_data:
> +	/* Get the per cpu offset */
> +	leaq	__per_cpu_offset(%rip), %rbx
> +	addq	%rcx, %rbx
> +	movq	(%rbx), %rbx
> +	/* Save it for GS BASE setup */
> +	movq	%rbx, initial_gs(%rip)
> +
> +	/* Calculate the GDT address */
> +	movq	$gdt_page, %rcx
> +	addq	%rbx, %rcx
> +	movq	%rcx, early_gdt_descr_base(%rip)
> +
> +	/* Find the idle task stack */
> +	movq	$idle_threads, %rcx
> +	addq	%rbx, %rcx
> +	movq	(%rcx), %rcx
> +	movq	TASK_threadsp(%rcx), %rcx
> +	movq	%rcx, initial_stack(%rip)
> +
> +.Lsetup_cpu:
>   	/*
>   	 * We must switch to a new descriptor in kernel space for the GDT
>   	 * because soon the kernel won't have access anymore to the userspace
> @@ -216,6 +275,14 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
>   	 */
>   	movq initial_stack(%rip), %rsp
>   
> +	/* Drop the realmode protection. For the boot CPU the pointer is NULL! */
> +	movq	trampoline_lock(%rip), %rax
> +	testq	%rax, %rax
> +	jz	.Lsetup_idt
> +	lock
> +	btrl	$0, (%rax)
> +
> +.Lsetup_idt:
>   	/* Setup and Load IDT */
>   	pushq	%rsi
>   	call	early_setup_idt
> @@ -347,6 +414,7 @@ SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
>    * reliably detect the end of the stack.
>    */
>   SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
> +SYM_DATA(trampoline_lock, .quad 0);
>   	__FINITDATA
>   
>   	__INIT
> @@ -572,6 +640,9 @@ SYM_DATA_END(level1_fixmap_pgt)
>   SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
>   SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
>   
> +	.align 16
> +SYM_DATA(smpboot_control,		.long 0)
> +
>   	.align 16
>   /* This must match the first entry in level2_kernel_pgt */
>   SYM_DATA(phys_base, .quad 0x0)
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 7a763b84b6e5..1e38d44c3603 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1104,9 +1104,19 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
>   	unsigned long boot_error = 0;
>   
>   	idle->thread.sp = (unsigned long)task_pt_regs(idle);
> -	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
>   	initial_code = (unsigned long)start_secondary;
> -	initial_stack  = idle->thread.sp;
> +
> +	if (IS_ENABLED(CONFIG_X86_32)) {
> +		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
> +		initial_stack  = idle->thread.sp;
> +	} else if (boot_cpu_data.cpuid_level < 0x0B) {
> +		/* Anything with X2APIC should have CPUID leaf 0x0B */
> +		if (WARN_ON_ONCE(x2apic_mode) && apicid > 0xffff)
> +			return -EIO;
> +		smpboot_control = apicid | STARTUP_USE_APICID;
> +	} else {
> +		smpboot_control = STARTUP_USE_CPUID_0B;
> +	}
>   
>   	/* Enable the espfix hack for this CPU */
>   	init_espfix_ap(cpu);
> diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
> index 4a3da7592b99..7dc2e817bd02 100644
> --- a/arch/x86/realmode/init.c
> +++ b/arch/x86/realmode/init.c
> @@ -127,6 +127,9 @@ static void __init setup_real_mode(void)
>   
>   	trampoline_header->flags = 0;
>   
> +	trampoline_lock = &trampoline_header->lock;
> +	*trampoline_lock = 0;
> +
>   	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
>   	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
>   	trampoline_pgd[511] = init_top_pgt[511].pgd;
> diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
> index cc8391f86cdb..12a540904e80 100644
> --- a/arch/x86/realmode/rm/trampoline_64.S
> +++ b/arch/x86/realmode/rm/trampoline_64.S
> @@ -49,6 +49,19 @@ SYM_CODE_START(trampoline_start)
>   	mov	%ax, %es
>   	mov	%ax, %ss
>   
> +	/*
> +	 * Make sure only one CPU fiddles with the realmode stack
> +	 */
> +.Llock_rm:
> +	btl	$0, tr_lock
> +	jnc	2f
> +	pause
> +	jmp	.Llock_rm
> +2:
> +	lock
> +	btsl	$0, tr_lock
> +	jc	.Llock_rm
> +
>   	# Setup stack
>   	movl	$rm_stack_end, %esp
>   
> @@ -192,6 +205,7 @@ SYM_DATA_START(trampoline_header)
>   	SYM_DATA(tr_efer,		.space 8)
>   	SYM_DATA(tr_cr4,		.space 4)
>   	SYM_DATA(tr_flags,		.space 4)
> +	SYM_DATA(tr_lock,		.space 4)
>   SYM_DATA_END(trampoline_header)
>   
>   #include "trampoline_common.S"
> diff --git a/kernel/smpboot.c b/kernel/smpboot.c
> index f6bc0bc8a2aa..934e64ff4eed 100644
> --- a/kernel/smpboot.c
> +++ b/kernel/smpboot.c
> @@ -25,7 +25,7 @@
>    * For the hotplug case we keep the task structs around and reuse
>    * them.
>    */
> -static DEFINE_PER_CPU(struct task_struct *, idle_threads);
> +DEFINE_PER_CPU(struct task_struct *, idle_threads);
>   
>   struct task_struct *idle_thread_get(unsigned int cpu)
>   {
>
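
The tr_lock sequence in the trampoline hunk above is a
test-and-test-and-set bit spinlock: spin read-only until the bit looks
clear, then attempt the locked set and retry on failure. In C it would
read roughly like this (illustrative only; the real code is 16-bit
assembly, and the cast is just for the sketch):

	/* Sketch of .Llock_rm in trampoline_64.S */
	static void rm_stack_lock(volatile u32 *lock)
	{
		for (;;) {
			while (*lock & 1)	/* btl + pause: read-only spin */
				cpu_relax();
			if (!test_and_set_bit(0, (unsigned long *)lock))
				return;		/* "lock btsl" won the race */
		}
	}

The unlock side is the "lock btrl $0, (%rax)" in head_64.S, taken once
the AP is safely off the shared real-mode stack.
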
David Woodhouse Dec. 16, 2021, 6:24 p.m. UTC | #2
On Thu, 2021-12-16 at 08:24 -0600, Tom Lendacky wrote:

> This will break an SEV-ES guest because CPUID will generate a #VC and a 
> #VC handler has not been established yet.
> 
> I guess for now, you can probably just not enable parallel startup for 
> SEV-ES guests.

OK, thanks. I'll expand it to allow 24 bits of (physical) APIC ID then,
since it's no longer limited to CPUs without X2APIC. Then we can
refrain from doing parallel bringup for SEV-ES guests, as you suggest.

What precisely is the check I should be using for that?
Tom Lendacky Dec. 16, 2021, 7 p.m. UTC | #3
On 12/16/21 12:24 PM, David Woodhouse wrote:
> On Thu, 2021-12-16 at 08:24 -0600, Tom Lendacky wrote:
> 
>> This will break an SEV-ES guest because CPUID will generate a #VC and a
>> #VC handler has not been established yet.
>>
>> I guess for now, you can probably just not enable parallel startup for
>> SEV-ES guests.
> 
> OK, thanks. I'll expand it to allow 24 bits of (physical) APIC ID then,
> since it's no longer limited to CPUs without X2APIC. Then we can
> refrain from doing parallel bringup for SEV-ES guests, as you suggest.
> 
> What precisely is the check I should be using for that?

Calling cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT) will return true for 
an SEV-ES guest.

Thanks,
Tom

>
David Woodhouse Dec. 16, 2021, 7:20 p.m. UTC | #4
On Thu, 2021-12-16 at 13:00 -0600, Tom Lendacky wrote:
> On 12/16/21 12:24 PM, David Woodhouse wrote:
> > On Thu, 2021-12-16 at 08:24 -0600, Tom Lendacky wrote:
> > 
> > > This will break an SEV-ES guest because CPUID will generate a #VC and a
> > > #VC handler has not been established yet.
> > > 
> > > I guess for now, you can probably just not enable parallel startup for
> > > SEV-ES guests.
> > 
> > OK, thanks. I'll expand it to allow 24 bits of (physical) APIC ID then,
> > since it's no longer limited to CPUs without X2APIC. Then we can
> > refrain from doing parallel bringup for SEV-ES guests, as you suggest.
> > 
> > What precisely is the check I should be using for that?
> 
> Calling cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT) will return true for 
> an SEV-ES guest.

Thanks. Incremental patch (which I'll roll into Thomas's patch) looks a
bit like this. Testing it now...


diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 0b6012fd3e55..1ac33ce1d60e 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -199,7 +199,6 @@ extern unsigned int smpboot_control;
 #endif /* !__ASSEMBLY__ */
 
 /* Control bits for startup_64 */
-#define	STARTUP_USE_APICID	0x10000
-#define	STARTUP_USE_CPUID_0B	0x20000
+#define	STARTUP_PARALLEL	0x80000000
 
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0249212e23d2..3e4c3c416bce 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -189,11 +189,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	 * Secondary CPUs find out the offsets via the APIC ID. For parallel
 	 * boot the APIC ID is retrieved from CPUID, otherwise it's encoded
 	 * in smpboot_control:
-	 * Bit 0-15	APICID if STARTUP_USE_CPUID_0B is not set
-	 * Bit 16 	Secondary boot flag
-	 * Bit 17	Parallel boot flag
+	 * Bit 0-30	APIC ID if STARTUP_PARALLEL is not set
+	 * Bit 31	Parallel boot flag (use CPUID leaf 0x0b for APIC ID).
 	 */
-	testl	$STARTUP_USE_CPUID_0B, %eax
+	testl	$STARTUP_PARALLEL, %eax
 	jz	.Lsetup_AP
 
 	mov	$0x0B, %eax
@@ -203,7 +202,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 
 .Lsetup_AP:
 	/* EAX contains the APICID of the current CPU */
-	andl	$0xFFFF, %eax
 	xorl	%ecx, %ecx
 	leaq	cpuid_to_apicid(%rip), %rbx
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 725fede281ac..acfb22ce8d4f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1125,13 +1125,10 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	if (IS_ENABLED(CONFIG_X86_32)) {
 		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 		initial_stack  = idle->thread.sp;
-	} else if (boot_cpu_data.cpuid_level < 0x0B) {
-		/* Anything with X2APIC should have CPUID leaf 0x0B */
-		if (WARN_ON_ONCE(x2apic_mode) && apicid > 0xffff)
-			return -EIO;
-		smpboot_control = apicid | STARTUP_USE_APICID;
+	} else if (do_parallel_bringup) {
+		smpboot_control = STARTUP_PARALLEL;
 	} else {
-		smpboot_control = STARTUP_USE_CPUID_0B;
+		smpboot_control = apicid;
 	}
 
 	/* Enable the espfix hack for this CPU */
@@ -1553,9 +1550,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
 	/*
 	 * We can do 64-bit AP bringup in parallel if the CPU reports its
-	 * APIC ID in CPUID leaf 0x0B. Otherwise it's too hard.
+	 * APIC ID in CPUID leaf 0x0B. Otherwise it's too hard. And not
+	 * for SEV-ES guests because they can't use CPUID that early.
 	 */
-	if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 0x0B)
+	if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 0x0B ||
+	    cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
 		do_parallel_bringup = false;
 
 	if (do_parallel_bringup)
David Woodhouse Jan. 29, 2022, 12:04 p.m. UTC | #5
On Thu, 2021-12-16 at 19:20 +0000, David Woodhouse wrote:
> On Thu, 2021-12-16 at 13:00 -0600, Tom Lendacky wrote:
> > On 12/16/21 12:24 PM, David Woodhouse wrote:
> > > On Thu, 2021-12-16 at 08:24 -0600, Tom Lendacky wrote:
> > > 
> > > > This will break an SEV-ES guest because CPUID will generate a #VC and a
> > > > #VC handler has not been established yet.
> > > > 
> > > > I guess for now, you can probably just not enable parallel startup for
> > > > SEV-ES guests.
> > > 
> > > OK, thanks. I'll expand it to allow 24 bits of (physical) APIC ID then,
> > > since it's no longer limited to CPUs without X2APIC. Then we can
> > > refrain from doing parallel bringup for SEV-ES guests, as you suggest.
> > > 
> > > What precisely is the check I should be using for that?
> > 
> > Calling cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT) will return true for 
> > an SEV-ES guest.
> 
> Thanks. Incremental patch (which I'll roll into Thomas's patch) looks a
> bit like this. Testing it now...

Further inspection shows I really did want a bit to indicate that this
is a secondary AP startup, which Thomas had documented as such in the
comments in head_64.S but then actually called STARTUP_USE_APICID. 

Otherwise the special case of smpboot_control==zero for startup of the
BSP, which uses the pre-existing initial_gs etc., might also get invoked
in the rare case that an AP has APIC ID #0.
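
Concretely, with the bit definitions from the updated patch below, the
encoding works out as (a worked example using the patch's own values):

	#define STARTUP_PARALLEL	0x80000000  /* get APIC ID from CPUID 0x0B */
	#define STARTUP_SECONDARY	0x40000000  /* this is an AP, not the BSP */

	/* Serial bringup of an AP whose physical APIC ID is 0: */
	smpboot_control = STARTUP_SECONDARY | 0;    /* 0x40000000, nonzero */

so the "testl %eax, %eax; jz .Lsetup_cpu" check in secondary_startup_64
takes the BSP path only when smpboot_control is genuinely zero.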

So we really do need Sean's fix to do the masking in the right place,
which I had 'fixed' by removing that mask altogether. And we also need
Sean's fix to stop scribbling on initial_gs when each AP will calculate
it for itself anyway.

I've rebased and pushed to
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-5.17

I'll do some more testing and repost the series during next week. The
win is slightly more modest than the original patch sets because it now
only parallelises x86/cpu:kick. I'm going to do more careful review and
testing before doing the same for x86/cpu:wait-init in a later series.
You can see that coming together in the git tree but I'm only going to
post up to the 'Serialise topology updates' patch again for now.

The only real change is this patch; perhaps now we've fixed it Thomas
will provide a Signed-off-by for it? :)

Now looks like this...

From 888741f787a2e59b1471f15177c1ba981d06ad04 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 12 Feb 2021 18:30:28 +0100
Subject: [PATCH v3.1 6/9] x86/smpboot: Support parallel startup of secondary CPUs

To allow for parallel AP bringup, we need to avoid the use of global
variables for passing information to the APs, as well as preventing them
from all trying to use the same real-mode stack simultaneously.

So, introduce a 'lock' field in struct trampoline_header to use as a
simple bit-spinlock for the real-mode stack. That lock also protects
the global variables initial_gs, initial_stack and early_gdt_descr,
which can now be calculated...

So how do we calculate those addresses? Well, they can all be found
from the per_cpu data for this CPU. Simples! Except... how does it know
what its CPU# is? OK, we export the cpuid_to_apicid[] array and it can
search it to find its APIC ID in there.

But now you whine at me that it doesn't even know its APIC ID? Well, if
it's a relatively modern CPU then the APIC ID is in CPUID leaf 0x0B so
we can use that. Otherwise... erm... OK, otherwise it can't have parallel
CPU bringup for now. We'll still use a global variable for those CPUs and
bring them up one at a time.

So add a global 'smpboot_control' field which either contains the APIC
ID, or a flag indicating that it can be found in CPUID.

This adds the 'do_parallel_bringup' flag in preparation but doesn't
actually enable parallel bringup yet.

[ dwmw2: Minor tweaks, write a commit message ]
[ seanc: Fix stray override of initial_gs in common_cpu_up() ]
Not-signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 arch/x86/include/asm/realmode.h      |  3 ++
 arch/x86/include/asm/smp.h           |  9 +++-
 arch/x86/kernel/acpi/sleep.c         |  1 +
 arch/x86/kernel/apic/apic.c          |  2 +-
 arch/x86/kernel/head_64.S            | 73 ++++++++++++++++++++++++++++
 arch/x86/kernel/smpboot.c            | 32 ++++++++++--
 arch/x86/realmode/init.c             |  3 ++
 arch/x86/realmode/rm/trampoline_64.S | 14 ++++++
 kernel/smpboot.c                     |  2 +-
 9 files changed, 132 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 331474b150f1..1693bc834163 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -51,6 +51,7 @@ struct trampoline_header {
 	u64 efer;
 	u32 cr4;
 	u32 flags;
+	u32 lock;
 #endif
 };
 
@@ -64,6 +65,8 @@ extern unsigned long initial_stack;
 extern unsigned long initial_vc_handler;
 #endif
 
+extern u32 *trampoline_lock;
+
 extern unsigned char real_mode_blob[];
 extern unsigned char real_mode_relocs[];
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81a0211a372d..4fe1320c2e8d 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -196,5 +196,12 @@ extern void nmi_selftest(void);
 #define nmi_selftest() do { } while (0)
 #endif
 
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
+/* Control bits for startup_64 */
+#define	STARTUP_PARALLEL	0x80000000
+#define	STARTUP_SECONDARY	0x40000000
+
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 1e97f944b47d..4f26cc9346ac 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -114,6 +114,7 @@ int x86_acpi_suspend_lowlevel(void)
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
 	initial_gs = per_cpu_offset(smp_processor_id());
+	smpboot_control = 0;
 #endif
 	initial_code = (unsigned long)wakeup_long64;
        saved_magic = 0x123456789abcdef0L;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b70344bf6600..5b20e051d84c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2335,7 +2335,7 @@ static int nr_logical_cpuids = 1;
 /*
  * Used to store mapping between logical CPU IDs and APIC IDs.
  */
-static int cpuid_to_apicid[] = {
+int cpuid_to_apicid[] = {
 	[0 ... NR_CPUS - 1] = -1,
 };
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9c63fc5988cd..b0d8c9fffc73 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -25,6 +25,7 @@
 #include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/fixmap.h>
+#include <asm/smp.h>
 
 /*
  * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -193,6 +194,66 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 1:
 	UNWIND_HINT_EMPTY
 
+	/*
+	 * Is this the boot CPU coming up? If so everything is available
+	 * in initial_gs, initial_stack and early_gdt_descr.
+	 */
+	movl	smpboot_control(%rip), %eax
+	testl	%eax, %eax
+	jz	.Lsetup_cpu
+
+	/*
+	 * Secondary CPUs find out the offsets via the APIC ID. For parallel
+	 * boot the APIC ID is retrieved from CPUID, otherwise it's encoded
+	 * in smpboot_control:
+	 * Bit 0-29	APIC ID if STARTUP_PARALLEL flag is not set
+	 * Bit 30	STARTUP_SECONDARY flag
+	 * Bit 31	STARTUP_PARALLEL flag (use CPUID 0x0b for APIC ID)
+	 */
+	testl	$STARTUP_PARALLEL, %eax
+	jnz	.Luse_cpuid_0b
+	andl	$0x0FFFFFFF, %eax
+	jmp	.Lsetup_AP
+
+.Luse_cpuid_0b:
+	mov	$0x0B, %eax
+	xorl	%ecx, %ecx
+	cpuid
+	mov	%edx, %eax
+
+.Lsetup_AP:
+	/* EAX contains the APICID of the current CPU */
+	xorl	%ecx, %ecx
+	leaq	cpuid_to_apicid(%rip), %rbx
+
+.Lfind_cpunr:
+	cmpl	(%rbx), %eax
+	jz	.Linit_cpu_data
+	addq	$4, %rbx
+	addq	$8, %rcx
+	jmp	.Lfind_cpunr
+
+.Linit_cpu_data:
+	/* Get the per cpu offset */
+	leaq	__per_cpu_offset(%rip), %rbx
+	addq	%rcx, %rbx
+	movq	(%rbx), %rbx
+	/* Save it for GS BASE setup */
+	movq	%rbx, initial_gs(%rip)
+
+	/* Calculate the GDT address */
+	movq	$gdt_page, %rcx
+	addq	%rbx, %rcx
+	movq	%rcx, early_gdt_descr_base(%rip)
+
+	/* Find the idle task stack */
+	movq	$idle_threads, %rcx
+	addq	%rbx, %rcx
+	movq	(%rcx), %rcx
+	movq	TASK_threadsp(%rcx), %rcx
+	movq	%rcx, initial_stack(%rip)
+
+.Lsetup_cpu:
 	/*
 	 * We must switch to a new descriptor in kernel space for the GDT
 	 * because soon the kernel won't have access anymore to the userspace
@@ -233,6 +294,14 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	 */
 	movq initial_stack(%rip), %rsp
 
+	/* Drop the realmode protection. For the boot CPU the pointer is NULL! */
+	movq	trampoline_lock(%rip), %rax
+	testq	%rax, %rax
+	jz	.Lsetup_idt
+	lock
+	btrl	$0, (%rax)
+
+.Lsetup_idt:
 	/* Setup and Load IDT */
 	pushq	%rsi
 	call	early_setup_idt
@@ -364,6 +433,7 @@ SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
  * reliably detect the end of the stack.
  */
 SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
+SYM_DATA(trampoline_lock, .quad 0);
 	__FINITDATA
 
 	__INIT
@@ -589,6 +659,9 @@ SYM_DATA_END(level1_fixmap_pgt)
 SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
 SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
 
+	.align 16
+SYM_DATA(smpboot_control,		.long 0)
+
 	.align 16
 /* This must match the first entry in level2_kernel_pgt */
 SYM_DATA(phys_base, .quad 0x0)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 38c5d65a568d..e060bbd79cc2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -808,6 +808,16 @@ static int __init cpu_init_udelay(char *str)
 }
 early_param("cpu_init_udelay", cpu_init_udelay);
 
+static bool do_parallel_bringup = true;
+
+static int __init no_parallel_bringup(char *str)
+{
+	do_parallel_bringup = false;
+
+	return 0;
+}
+early_param("no_parallel_bringup", no_parallel_bringup);
+
 static void __init smp_quirk_init_udelay(void)
 {
 	/* if cmdline changed it from default, leave it alone */
@@ -1095,8 +1105,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
-#else
-	initial_gs = per_cpu_offset(cpu);
 #endif
 	return 0;
 }
@@ -1115,9 +1123,16 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	unsigned long boot_error = 0;
 
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
-	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 	initial_code = (unsigned long)start_secondary;
-	initial_stack  = idle->thread.sp;
+
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+		initial_stack  = idle->thread.sp;
+	} else if (do_parallel_bringup) {
+		smpboot_control = STARTUP_SECONDARY | STARTUP_PARALLEL;
+	} else {
+		smpboot_control = STARTUP_SECONDARY | apicid;
+	}
 
 	/* Enable the espfix hack for this CPU */
 	init_espfix_ap(cpu);
@@ -1516,6 +1531,15 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	smp_quirk_init_udelay();
 
 	speculative_store_bypass_ht_init();
+
+	/*
+	 * We can do 64-bit AP bringup in parallel if the CPU reports its
+	 * APIC ID in CPUID leaf 0x0B. Otherwise it's too hard. And not
+	 * for SEV-ES guests because they can't use CPUID that early.
+	 */
+	if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 0x0B ||
+	    cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+		do_parallel_bringup = false;
 }
 
 void arch_thaw_secondary_cpus_begin(void)
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index c5e29db02a46..21b9e8b55618 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -154,6 +154,9 @@ static void __init setup_real_mode(void)
 
 	trampoline_header->flags = 0;
 
+	trampoline_lock = &trampoline_header->lock;
+	*trampoline_lock = 0;
+
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 
 	/* Map the real mode stub as virtual == physical */
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index cc8391f86cdb..12a540904e80 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -49,6 +49,19 @@ SYM_CODE_START(trampoline_start)
 	mov	%ax, %es
 	mov	%ax, %ss
 
+	/*
+	 * Make sure only one CPU fiddles with the realmode stack
+	 */
+.Llock_rm:
+	btl	$0, tr_lock
+	jnc	2f
+	pause
+	jmp	.Llock_rm
+2:
+	lock
+	btsl	$0, tr_lock
+	jc	.Llock_rm
+
 	# Setup stack
 	movl	$rm_stack_end, %esp
 
@@ -192,6 +205,7 @@ SYM_DATA_START(trampoline_header)
 	SYM_DATA(tr_efer,		.space 8)
 	SYM_DATA(tr_cr4,		.space 4)
 	SYM_DATA(tr_flags,		.space 4)
+	SYM_DATA(tr_lock,		.space 4)
 SYM_DATA_END(trampoline_header)
 
 #include "trampoline_common.S"
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f6bc0bc8a2aa..934e64ff4eed 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -25,7 +25,7 @@
  * For the hotplug case we keep the task structs around and reuse
  * them.
  */
-static DEFINE_PER_CPU(struct task_struct *, idle_threads);
+DEFINE_PER_CPU(struct task_struct *, idle_threads);
 
 struct task_struct *idle_thread_get(unsigned int cpu)
 {
Borislav Petkov Jan. 31, 2022, 1:59 p.m. UTC | #6
On Sat, Jan 29, 2022 at 12:04:19PM +0000, David Woodhouse wrote:
> I've rebased and pushed to
> https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-5.17
> 
> I'll do some more testing and repost the series during next week. The
> win is slightly more modest than the original patch sets because it now
> only parallelises x86/cpu:kick. I'm going to do more careful review and
> testing before doing the same for x86/cpu:wait-init in a later series.
> You can see that coming together in the git tree but I'm only going to
> post up to the 'Serialise topology updates' patch again for now.

Btw, Mr. Cooper points out a very important aspect and I don't know
whether you've verified this already or whether this is not affected
by your series ... yet. In any case it should be checked: microcode
loading.

See __reload_late() and all that dance we do to keep SMT siblings doing
nothing at the same time while updating microcode.

With the current boot order, the APs should all do nothing so they won't
need that sync for early loading - load_ucode_{ap,bsp} - but I don't
know if you're changing that order with the parallel startup.

If you do, you'll probably need such syncing for the early loading
too...
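
For reference, the late-loading dance in __reload_late() is roughly the
following (a paraphrased outline, not the verbatim kernel source): all
CPUs rendezvous under stop_machine(), one thread per core applies the
update while its SMT sibling sits idle, and everyone rendezvous again
before returning:

	/* Outline of __reload_late(), paraphrased */
	wait_for_all_cpus();				/* rendezvous in */
	if (cpu == cpumask_first(topology_sibling_cpumask(cpu)))
		apply_microcode_local(&err);		/* one sibling loads */
	/* the other sibling just waits in the rendezvous */
	wait_for_all_cpus();				/* rendezvous out */
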
David Woodhouse Feb. 1, 2022, 10:25 a.m. UTC | #7
On Mon, 2022-01-31 at 14:59 +0100, Borislav Petkov wrote:
> On Sat, Jan 29, 2022 at 12:04:19PM +0000, David Woodhouse wrote:
> > I've rebased and pushed to
> > https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-5.17
> > 
> > 
> > I'll do some more testing and repost the series during next week. The
> > win is slightly more modest than the original patch sets because it now
> > only parallelises x86/cpu:kick. I'm going to do more careful review and
> > testing before doing the same for x86/cpu:wait-init in a later series.
> > You can see that coming together in the git tree but I'm only going to
> > post up to the 'Serialise topology updates' patch again for now.
> 
> Btw, Mr. Cooper points out a very important aspect and I don't know
> whether you've verified this already or whether this is not affected
> by your series ... yet. In any case it should be checked: microcode
> loading.
> 
> See __reload_late() and all that dance we do to keep SMT siblings doing
> nothing at the same time while updating microcode.
> 
> With the current boot order, the APs should all do nothing so they won't
> need that sync for early loading - load_ucode_{ap,bsp} - but I don't
> know if you're changing that order with the parallel startup.
> 
> If you do, you'll probably need such syncing for the early loading
> too...

Thanks. It looks like that is only invoked after boot, with a write to
/sys/devices/system/cpu/microcode/reload.

My series is only parallelising the initial bringup at boot time, so it
shouldn't make any difference.

However... it does look like there's nothing preventing a sibling being
brought online *while* the dance you mention above is occurring.

Shouldn't __reload_late() take the device_hotplug_lock to prevent that?
Borislav Petkov Feb. 1, 2022, 10:56 a.m. UTC | #8
On Tue, Feb 01, 2022 at 10:25:01AM +0000, David Woodhouse wrote:
> Thanks. It looks like that is only invoked after boot, with a write to
> /sys/devices/system/cpu/microcode/reload.
>
> My series is only parallelising the initial bringup at boot time, so it
> shouldn't make any difference.

No, I don't mean __reload_late() - I pointed you at that function to
show the dance we must do when updating microcode late.

The load_ucode_{ap,bsp}() routines are what is called when loading ucode
early.

So the question is, does the parallelizing change the order in which APs
are brought up and can it happen that a SMT sibling of a two-SMT core
executes *something* while the other SMT sibling is updating microcode.

If so, that would be bad.

> However... it does look like there's nothing preventing a sibling being
> brought online *while* the dance you mention above is occurring.

Bottom line is: of the two SMT siblings, one needs to be updating
microcode while the other is idle. I.e., what __reload_late() does.

> Shouldn't __reload_late() take the device_hotplug_lock to prevent that?

See reload_store().
David Woodhouse Feb. 1, 2022, 12:39 p.m. UTC | #9
On Tue, 2022-02-01 at 11:56 +0100, Borislav Petkov wrote:
> On Tue, Feb 01, 2022 at 10:25:01AM +0000, David Woodhouse wrote:
> > Thanks. It looks like that is only invoked after boot, with a write to
> > /sys/devices/system/cpu/microcode/reload.
> > 
> > My series is only parallelising the initial bringup at boot time, so it
> > shouldn't make any difference.
> 
> No, I don't mean __reload_late() - I pointed you at that function to
> show the dance we must do when updating microcode late.
> 
> The load_ucode_{ap,bsp}() routines are what is called when loading ucode
> early.
> 
> So the question is, does the parallelizing change the order in which APs
> are brought up and can it happen that a SMT sibling of a two-SMT core
> executes *something* while the other SMT sibling is updating microcode.
> 
> If so, that would be bad.

Right. So as you surmise, I haven't broken that... yet. At least not in
the patches I've posted :)

The call to ucode_cpu_init() is in cpu_init(), right after the call to
wait_for_master_cpu(), which sets this AP's bit in cpu_initialized_mask and
then waits for the BSP to set its bit in cpu_callout_mask.

That's a full synchronization point with do_wait_cpu_initialized() on
the BSP, which waits for the former and then sets the latter.
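
That handshake, paraphrased from arch/x86/kernel/cpu/common.c (with
do_wait_cpu_initialized() being the BSP-side counterpart added by this
series), looks roughly like:

	/* AP side, via wait_for_master_cpu() in cpu_init() */
	cpumask_set_cpu(cpu, cpu_initialized_mask);
	while (!cpumask_test_cpu(cpu, cpu_callout_mask))
		cpu_relax();

	/* BSP side, sketch of do_wait_cpu_initialized() */
	while (!cpumask_test_cpu(cpu, cpu_initialized_mask))
		cpu_relax();
	cpumask_set_cpu(cpu, cpu_callout_mask);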

So... with the series I've posted, all APs end up waiting in
wait_for_master_cpu() until the final serialized bringup.

In the top of my git tree, you can see a half-baked 'parallel part 2'
commit which introduces a new x86/cpu:wait-init cpuhp state that would
invoke do_wait_cpu_initialized() for each CPU in turn, which *would*
release them all into load_ucode_bsp() at the same time and have
precisely the problem you're describing.

I'll commit a FIXME comment now so that it doesn't slip my mind.

Thanks.


> > However... it does look like there's nothing preventing a sibling being
> > brought online *while* the dance you mention above is occurring.
> 
> Bottom line is: of the two SMT siblings, one needs to be updating
> microcode while the other is idle. I.e., what __reload_late() does.
> 
> > Shouldn't __reload_late() take the device_hotplug_lock to prevent that?
> 
> See reload_store().

Hm, not sure I see how that's protecting itself from someone
simultaneously echoing 1 > /sys/devices/system/cpu/cpu${SIBLING}/online
Borislav Petkov Feb. 1, 2022, 12:56 p.m. UTC | #10
On Tue, Feb 01, 2022 at 12:39:17PM +0000, David Woodhouse wrote:
> In the top of my git tree, you can see a half-baked 'parallel part 2'
> commit which introduces a new x86/cpu:wait-init cpuhp state that would
> invoke do_wait_cpu_initialized() for each CPU in turn, which *would*
> release them all into load_ucode_bsp() at the same time and have
> precisely the problem you're describing.

The load_ucode_bsp() is the variant that runs on the boot CPU but
yeah...

> I'll commit a FIXME comment now so that it doesn't slip my mind.

Yap, thank Cooper for pointing out that whole thing about how microcode
loading is special and can't always handle parallelism. :)

> Hm, not sure I see how that's protecting itself from someone
> simultaneously echoing 1 > /sys/devices/system/cpu/cpu${SIBLING}/online

So

echo 1 > ../online

means onlining the sibling.

But reload_store() grabs the CPU hotplug lock *first* and *then* runs
check_online_cpus() to see if all CPUs are online. It doesn't do the
update if even one CPU is missing. You can't offline any CPU for the
duration of the update...
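
In outline, that protection looks like this (paraphrased, not the
verbatim source of reload_store()):

	cpus_read_lock();		/* no CPU can come or go meanwhile */
	if (check_online_cpus())	/* refuse if any present CPU is offline */
		goto out_unlock;
	/* ... rendezvous all CPUs and apply via __reload_late() ... */
out_unlock:
	cpus_read_unlock();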

So I guess you'd need to explain in more detail what protection hole
you're seeing because I might be missing something here.

Thx.
David Woodhouse Feb. 1, 2022, 1:02 p.m. UTC | #11
On Tue, 2022-02-01 at 13:56 +0100, Borislav Petkov wrote:
> On Tue, Feb 01, 2022 at 12:39:17PM +0000, David Woodhouse wrote:
> > In the top of my git tree, you can see a half-baked 'parallel part 2'
> > commit which introduces a new x86/cpu:wait-init cpuhp state that would
> > invoke do_wait_cpu_initialized() for each CPU in turn, which *would*
> > release them all into load_ucode_bsp() at the same time and have
> > precisely the problem you're describing.
> 
> The load_ucode_bsp() is the variant that runs on the boot CPU but
> yeah...

Right. Brain not fully online today.  Sorry.

> > Hm, not sure I see how that's protecting itself from someone
> > simultaneously echoing 1 > /sys/devices/system/cpu/cpu${SIBLING}/online
> 
> So
> 
> echo 1 > ../online
> 
> means onlining the sibling.
> 
> But reload_store() grabs the CPU hotplug lock *first* and *then* runs
> check_online_cpus() to see if all CPUs are online. It doesn't do the
> update if even one CPU is missing. You can't offline any CPU for the
> duration of the update...
> 
> So I guess you'd need to explain in more detail what protection hole
> you're seeing because I might be missing something here.

No, I'd just missed cpus_read_lock() because I was looking for
something else. My fault; it looks fine. Thanks.

Patch

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 5db5d083c873..e1cc4bc746bc 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -51,6 +51,7 @@  struct trampoline_header {
 	u64 efer;
 	u32 cr4;
 	u32 flags;
+	u32 lock;
 #endif
 };
 
@@ -64,6 +65,8 @@  extern unsigned long initial_stack;
 extern unsigned long initial_vc_handler;
 #endif
 
+extern u32 *trampoline_lock;
+
 extern unsigned char real_mode_blob[];
 extern unsigned char real_mode_relocs[];
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81a0211a372d..ca807c29dc34 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -196,5 +196,12 @@  extern void nmi_selftest(void);
 #define nmi_selftest() do { } while (0)
 #endif
 
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
+/* Control bits for startup_64 */
+#define	STARTUP_USE_APICID	0x10000
+#define	STARTUP_USE_CPUID_0B	0x20000
+
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3f85fcae450c..9598ebf4f9d6 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -114,6 +114,7 @@  int x86_acpi_suspend_lowlevel(void)
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
 	initial_gs = per_cpu_offset(smp_processor_id());
+	smpboot_control = 0;
 #endif
 	initial_code = (unsigned long)wakeup_long64;
        saved_magic = 0x123456789abcdef0L;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b70344bf6600..5b20e051d84c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2335,7 +2335,7 @@  static int nr_logical_cpuids = 1;
 /*
  * Used to store mapping between logical CPU IDs and APIC IDs.
  */
-static int cpuid_to_apicid[] = {
+int cpuid_to_apicid[] = {
 	[0 ... NR_CPUS - 1] = -1,
 };
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d8b3ebd2bb85..0249212e23d2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -25,6 +25,7 @@ 
 #include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/fixmap.h>
+#include <asm/smp.h>
 
 /*
  * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -176,6 +177,64 @@  SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 1:
 	UNWIND_HINT_EMPTY
 
+	/*
+	 * Is this the boot CPU coming up? If so everything is available
+	 * in initial_gs, initial_stack and early_gdt_descr.
+	 */
+	movl	smpboot_control(%rip), %eax
+	testl	%eax, %eax
+	jz	.Lsetup_cpu
+
+	/*
+	 * Secondary CPUs find out the offsets via the APIC ID. For parallel
+	 * boot the APIC ID is retrieved from CPUID, otherwise it's encoded
+	 * in smpboot_control:
+	 * Bit 0-15	APICID if STARTUP_USE_CPUID_0B is not set
+	 * Bit 16 	Secondary boot flag
+	 * Bit 17	Parallel boot flag
+	 */
+	testl	$STARTUP_USE_CPUID_0B, %eax
+	jz	.Lsetup_AP
+
+	mov	$0x0B, %eax
+	xorl	%ecx, %ecx
+	cpuid
+	mov	%edx, %eax
+
+.Lsetup_AP:
+	/* EAX contains the APICID of the current CPU */
+	andl	$0xFFFF, %eax
+	xorl	%ecx, %ecx
+	leaq	cpuid_to_apicid(%rip), %rbx
+
+.Lfind_cpunr:
+	cmpl	(%rbx), %eax
+	jz	.Linit_cpu_data
+	addq	$4, %rbx
+	addq	$8, %rcx
+	jmp	.Lfind_cpunr
+
+.Linit_cpu_data:
+	/* Get the per cpu offset */
+	leaq	__per_cpu_offset(%rip), %rbx
+	addq	%rcx, %rbx
+	movq	(%rbx), %rbx
+	/* Save it for GS BASE setup */
+	movq	%rbx, initial_gs(%rip)
+
+	/* Calculate the GDT address */
+	movq	$gdt_page, %rcx
+	addq	%rbx, %rcx
+	movq	%rcx, early_gdt_descr_base(%rip)
+
+	/* Find the idle task stack */
+	movq	$idle_threads, %rcx
+	addq	%rbx, %rcx
+	movq	(%rcx), %rcx
+	movq	TASK_threadsp(%rcx), %rcx
+	movq	%rcx, initial_stack(%rip)
+
+.Lsetup_cpu:
 	/*
 	 * We must switch to a new descriptor in kernel space for the GDT
 	 * because soon the kernel won't have access anymore to the userspace
@@ -216,6 +275,14 @@  SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	 */
 	movq initial_stack(%rip), %rsp
 
+	/* Drop the realmode protection. For the boot CPU the pointer is NULL! */
+	movq	trampoline_lock(%rip), %rax
+	testq	%rax, %rax
+	jz	.Lsetup_idt
+	lock
+	btrl	$0, (%rax)
+
+.Lsetup_idt:
 	/* Setup and Load IDT */
 	pushq	%rsi
 	call	early_setup_idt
@@ -347,6 +414,7 @@  SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
  * reliably detect the end of the stack.
  */
 SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
+SYM_DATA(trampoline_lock, .quad 0);
 	__FINITDATA
 
 	__INIT
@@ -572,6 +640,9 @@  SYM_DATA_END(level1_fixmap_pgt)
 SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
 SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
 
+	.align 16
+SYM_DATA(smpboot_control,		.long 0)
+
 	.align 16
 /* This must match the first entry in level2_kernel_pgt */
 SYM_DATA(phys_base, .quad 0x0)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7a763b84b6e5..1e38d44c3603 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1104,9 +1104,19 @@  static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	unsigned long boot_error = 0;
 
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
-	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 	initial_code = (unsigned long)start_secondary;
-	initial_stack  = idle->thread.sp;
+
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+		initial_stack  = idle->thread.sp;
+	} else if (boot_cpu_data.cpuid_level < 0x0B) {
+		/* Anything with X2APIC should have CPUID leaf 0x0B */
+		if (WARN_ON_ONCE(x2apic_mode) && apicid > 0xffff)
+			return -EIO;
+		smpboot_control = apicid | STARTUP_USE_APICID;
+	} else {
+		smpboot_control = STARTUP_USE_CPUID_0B;
+	}
 
 	/* Enable the espfix hack for this CPU */
 	init_espfix_ap(cpu);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 4a3da7592b99..7dc2e817bd02 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -127,6 +127,9 @@  static void __init setup_real_mode(void)
 
 	trampoline_header->flags = 0;
 
+	trampoline_lock = &trampoline_header->lock;
+	*trampoline_lock = 0;
+
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
 	trampoline_pgd[511] = init_top_pgt[511].pgd;
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index cc8391f86cdb..12a540904e80 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -49,6 +49,19 @@  SYM_CODE_START(trampoline_start)
 	mov	%ax, %es
 	mov	%ax, %ss
 
+	/*
+	 * Make sure only one CPU fiddles with the realmode stack
+	 */
+.Llock_rm:
+	btl	$0, tr_lock
+	jnc	2f
+	pause
+	jmp	.Llock_rm
+2:
+	lock
+	btsl	$0, tr_lock
+	jc	.Llock_rm
+
 	# Setup stack
 	movl	$rm_stack_end, %esp
 
@@ -192,6 +205,7 @@  SYM_DATA_START(trampoline_header)
 	SYM_DATA(tr_efer,		.space 8)
 	SYM_DATA(tr_cr4,		.space 4)
 	SYM_DATA(tr_flags,		.space 4)
+	SYM_DATA(tr_lock,		.space 4)
 SYM_DATA_END(trampoline_header)
 
 #include "trampoline_common.S"
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f6bc0bc8a2aa..934e64ff4eed 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -25,7 +25,7 @@ 
  * For the hotplug case we keep the task structs around and reuse
  * them.
  */
-static DEFINE_PER_CPU(struct task_struct *, idle_threads);
+DEFINE_PER_CPU(struct task_struct *, idle_threads);
 
 struct task_struct *idle_thread_get(unsigned int cpu)
 {