diff mbox series

[for,4.19?] x86/Intel: unlock CPUID earlier for the BSP

Message ID 82277592-ea96-47c8-a991-7afd97d7a7bc@suse.com (mailing list archive)
State New, archived
Headers show
Series [for,4.19?] x86/Intel: unlock CPUID earlier for the BSP | expand

Commit Message

Jan Beulich June 13, 2024, 8:19 a.m. UTC
Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If
this bit is set by the BIOS then CPUID evaluation does not work when
data from any leaf greater than two is needed; early_cpu_init() in
particular wants to collect leaf 7 data.

Cure this by unlocking CPUID right before evaluating anything which
depends on the maximum CPUID leaf being greater than two.

Inspired by (and description cloned from) Linux commit 0c2f6d04619e
("x86/topology/intel: Unlock CPUID before evaluating anything").

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
While I couldn't spot anything, it kind of feels as if I'm overlooking
further places where we might be inspecting in particular leaf 7 yet
earlier.

No Fixes: tag(s), as imo it would be too many that would want
enumerating.

Comments

Oleksii Kurochko June 13, 2024, 2:35 p.m. UTC | #1
On Thu, 2024-06-13 at 10:19 +0200, Jan Beulich wrote:
> Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If
> this bit is set by the BIOS then CPUID evaluation does not work when
> data from any leaf greater than two is needed; early_cpu_init() in
> particular wants to collect leaf 7 data.
> 
> Cure this by unlocking CPUID right before evaluating anything which
> depends on the maximum CPUID leaf being greater than two.
> 
> Inspired by (and description cloned from) Linux commit 0c2f6d04619e
> ("x86/topology/intel: Unlock CPUID before evaluating anything").
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
Release-Acked-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>

~ Oleksii
> ---
> While I couldn't spot anything, it kind of feels as if I'm
> overlooking
> further places where we might be inspecting in particular leaf 7 yet
> earlier.
> 
> No Fixes: tag(s), as imo it would be too many that would want
> enumerating.
> 
> --- a/xen/arch/x86/cpu/common.c
> +++ b/xen/arch/x86/cpu/common.c
> @@ -336,7 +336,8 @@ void __init early_cpu_init(bool verbose)
>  
>  	c->x86_vendor = x86_cpuid_lookup_vendor(ebx, ecx, edx);
>  	switch (c->x86_vendor) {
> -	case X86_VENDOR_INTEL:    actual_cpu = intel_cpu_dev;   
> break;
> +	case X86_VENDOR_INTEL:    intel_unlock_cpuid_leaves(c);
> +				  actual_cpu = intel_cpu_dev;   
> break;
>  	case X86_VENDOR_AMD:      actual_cpu = amd_cpu_dev;     
> break;
>  	case X86_VENDOR_CENTAUR:  actual_cpu = centaur_cpu_dev; 
> break;
>  	case X86_VENDOR_SHANGHAI: actual_cpu = shanghai_cpu_dev;
> break;
> --- a/xen/arch/x86/cpu/cpu.h
> +++ b/xen/arch/x86/cpu/cpu.h
> @@ -24,3 +24,5 @@ void amd_init_lfence(struct cpuinfo_x86
>  void amd_init_ssbd(const struct cpuinfo_x86 *c);
>  void amd_init_spectral_chicken(void);
>  void detect_zen2_null_seg_behaviour(void);
> +
> +void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c);
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -303,10 +303,24 @@ static void __init noinline intel_init_l
>  		ctxt_switch_masking = intel_ctxt_switch_masking;
>  }
>  
> -static void cf_check early_init_intel(struct cpuinfo_x86 *c)
> +/* Unmask CPUID levels if masked. */
> +void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c)
>  {
> -	u64 misc_enable, disable;
> +	uint64_t misc_enable, disable;
> +
> +	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
> +
> +	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
> +	if (disable) {
> +		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable &
> ~disable);
> +		bootsym(trampoline_misc_enable_off) |= disable;
> +		c->cpuid_level = cpuid_eax(0);
> +		printk(KERN_INFO "revised cpuid level: %u\n", c-
> >cpuid_level);
> +	}
> +}
>  
> +static void cf_check early_init_intel(struct cpuinfo_x86 *c)
> +{
>  	/* Netburst reports 64 bytes clflush size, but does IO in
> 128 bytes */
>  	if (c->x86 == 15 && c->x86_cache_alignment == 64)
>  		c->x86_cache_alignment = 128;
> @@ -315,16 +329,7 @@ static void cf_check early_init_intel(st
>  	    bootsym(trampoline_misc_enable_off) &
> MSR_IA32_MISC_ENABLE_XD_DISABLE)
>  		printk(KERN_INFO "re-enabled NX (Execute Disable)
> protection\n");
>  
> -	/* Unmask CPUID levels and NX if masked: */
> -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
> -
> -	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
> -	if (disable) {
> -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable &
> ~disable);
> -		bootsym(trampoline_misc_enable_off) |= disable;
> -		printk(KERN_INFO "revised cpuid level: %d\n",
> -		       cpuid_eax(0));
> -	}
> +	intel_unlock_cpuid_leaves(c);
>  
>  	/* CPUID workaround for Intel 0F33/0F34 CPU */
>  	if (boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 3
> &&
Roger Pau Monne June 13, 2024, 3:16 p.m. UTC | #2
On Thu, Jun 13, 2024 at 10:19:30AM +0200, Jan Beulich wrote:
> Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If
> this bit is set by the BIOS then CPUID evaluation does not work when
> data from any leaf greater than two is needed; early_cpu_init() in
> particular wants to collect leaf 7 data.
> 
> Cure this by unlocking CPUID right before evaluating anything which
> depends on the maximum CPUID leaf being greater than two.
> 
> Inspired by (and description cloned from) Linux commit 0c2f6d04619e
> ("x86/topology/intel: Unlock CPUID before evaluating anything").
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> While I couldn't spot anything, it kind of feels as if I'm overlooking
> further places where we might be inspecting in particular leaf 7 yet
> earlier.
> 
> No Fixes: tag(s), as imo it would be too many that would want
> enumerating.
> 
> --- a/xen/arch/x86/cpu/common.c
> +++ b/xen/arch/x86/cpu/common.c
> @@ -336,7 +336,8 @@ void __init early_cpu_init(bool verbose)
>  
>  	c->x86_vendor = x86_cpuid_lookup_vendor(ebx, ecx, edx);
>  	switch (c->x86_vendor) {
> -	case X86_VENDOR_INTEL:    actual_cpu = intel_cpu_dev;    break;
> +	case X86_VENDOR_INTEL:    intel_unlock_cpuid_leaves(c);
> +				  actual_cpu = intel_cpu_dev;    break;
>  	case X86_VENDOR_AMD:      actual_cpu = amd_cpu_dev;      break;
>  	case X86_VENDOR_CENTAUR:  actual_cpu = centaur_cpu_dev;  break;
>  	case X86_VENDOR_SHANGHAI: actual_cpu = shanghai_cpu_dev; break;
> --- a/xen/arch/x86/cpu/cpu.h
> +++ b/xen/arch/x86/cpu/cpu.h
> @@ -24,3 +24,5 @@ void amd_init_lfence(struct cpuinfo_x86
>  void amd_init_ssbd(const struct cpuinfo_x86 *c);
>  void amd_init_spectral_chicken(void);
>  void detect_zen2_null_seg_behaviour(void);
> +
> +void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c);
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -303,10 +303,24 @@ static void __init noinline intel_init_l
>  		ctxt_switch_masking = intel_ctxt_switch_masking;
>  }
>  
> -static void cf_check early_init_intel(struct cpuinfo_x86 *c)
> +/* Unmask CPUID levels if masked. */
> +void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c)
>  {
> -	u64 misc_enable, disable;
> +	uint64_t misc_enable, disable;
> +
> +	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
> +
> +	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
> +	if (disable) {
> +		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
> +		bootsym(trampoline_misc_enable_off) |= disable;
> +		c->cpuid_level = cpuid_eax(0);
> +		printk(KERN_INFO "revised cpuid level: %u\n", c->cpuid_level);
> +	}
> +}
>  
> +static void cf_check early_init_intel(struct cpuinfo_x86 *c)
> +{
>  	/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
>  	if (c->x86 == 15 && c->x86_cache_alignment == 64)
>  		c->x86_cache_alignment = 128;
> @@ -315,16 +329,7 @@ static void cf_check early_init_intel(st
>  	    bootsym(trampoline_misc_enable_off) & MSR_IA32_MISC_ENABLE_XD_DISABLE)
>  		printk(KERN_INFO "re-enabled NX (Execute Disable) protection\n");
>  
> -	/* Unmask CPUID levels and NX if masked: */
> -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
> -
> -	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
> -	if (disable) {
> -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
> -		bootsym(trampoline_misc_enable_off) |= disable;
> -		printk(KERN_INFO "revised cpuid level: %d\n",
> -		       cpuid_eax(0));
> -	}
> +	intel_unlock_cpuid_leaves(c);

Do you really need to call intel_unlock_cpuid_leaves() here?

For the BSP it will be taken care in early_cpu_init(), and for the APs
MSR_IA32_MISC_ENABLE it will be set as part of the trampoline with the
disables from trampoline_misc_enable_off.

Thanks, Roger.
Jan Beulich June 13, 2024, 3:53 p.m. UTC | #3
On 13.06.2024 17:16, Roger Pau Monné wrote:
> On Thu, Jun 13, 2024 at 10:19:30AM +0200, Jan Beulich wrote:
>> @@ -315,16 +329,7 @@ static void cf_check early_init_intel(st
>>  	    bootsym(trampoline_misc_enable_off) & MSR_IA32_MISC_ENABLE_XD_DISABLE)
>>  		printk(KERN_INFO "re-enabled NX (Execute Disable) protection\n");
>>  
>> -	/* Unmask CPUID levels and NX if masked: */
>> -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
>> -
>> -	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
>> -	if (disable) {
>> -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
>> -		bootsym(trampoline_misc_enable_off) |= disable;
>> -		printk(KERN_INFO "revised cpuid level: %d\n",
>> -		       cpuid_eax(0));
>> -	}
>> +	intel_unlock_cpuid_leaves(c);
> 
> Do you really need to call intel_unlock_cpuid_leaves() here?
> 
> For the BSP it will be taken care in early_cpu_init(), and for the APs
> MSR_IA32_MISC_ENABLE it will be set as part of the trampoline with the
> disables from trampoline_misc_enable_off.

The way the original code works, it allows to deal with the BSP having the
bit clear, but some or all of the APs having it set. I simply didn't want
to break that property.

Jan
Roger Pau Monne June 13, 2024, 4:04 p.m. UTC | #4
On Thu, Jun 13, 2024 at 05:53:02PM +0200, Jan Beulich wrote:
> On 13.06.2024 17:16, Roger Pau Monné wrote:
> > On Thu, Jun 13, 2024 at 10:19:30AM +0200, Jan Beulich wrote:
> >> @@ -315,16 +329,7 @@ static void cf_check early_init_intel(st
> >>  	    bootsym(trampoline_misc_enable_off) & MSR_IA32_MISC_ENABLE_XD_DISABLE)
> >>  		printk(KERN_INFO "re-enabled NX (Execute Disable) protection\n");
> >>  
> >> -	/* Unmask CPUID levels and NX if masked: */
> >> -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
> >> -
> >> -	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
> >> -	if (disable) {
> >> -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
> >> -		bootsym(trampoline_misc_enable_off) |= disable;
> >> -		printk(KERN_INFO "revised cpuid level: %d\n",
> >> -		       cpuid_eax(0));
> >> -	}
> >> +	intel_unlock_cpuid_leaves(c);
> > 
> > Do you really need to call intel_unlock_cpuid_leaves() here?
> > 
> > For the BSP it will be taken care in early_cpu_init(), and for the APs
> > MSR_IA32_MISC_ENABLE it will be set as part of the trampoline with the
> > disables from trampoline_misc_enable_off.
> 
> The way the original code works, it allows to deal with the BSP having the
> bit clear, but some or all of the APs having it set. I simply didn't want
> to break that property.

Oh, I see.  This looks like something we would unconditionally like to
set in trampoline_misc_enable_off?  Except that would trigger an
unnecessary write to the MSR if the CPU already has it disabled.

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

I think the printk could be made a printk_once, since it doesn't even
print the CPU where the bit has been seen as set, but anyway, that
would be further adjustments.

Thanks, Roger.
Jan Beulich June 13, 2024, 4:12 p.m. UTC | #5
On 13.06.2024 18:04, Roger Pau Monné wrote:
> On Thu, Jun 13, 2024 at 05:53:02PM +0200, Jan Beulich wrote:
>> On 13.06.2024 17:16, Roger Pau Monné wrote:
>>> On Thu, Jun 13, 2024 at 10:19:30AM +0200, Jan Beulich wrote:
>>>> @@ -315,16 +329,7 @@ static void cf_check early_init_intel(st
>>>>  	    bootsym(trampoline_misc_enable_off) & MSR_IA32_MISC_ENABLE_XD_DISABLE)
>>>>  		printk(KERN_INFO "re-enabled NX (Execute Disable) protection\n");
>>>>  
>>>> -	/* Unmask CPUID levels and NX if masked: */
>>>> -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
>>>> -
>>>> -	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
>>>> -	if (disable) {
>>>> -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
>>>> -		bootsym(trampoline_misc_enable_off) |= disable;
>>>> -		printk(KERN_INFO "revised cpuid level: %d\n",
>>>> -		       cpuid_eax(0));
>>>> -	}
>>>> +	intel_unlock_cpuid_leaves(c);
>>>
>>> Do you really need to call intel_unlock_cpuid_leaves() here?
>>>
>>> For the BSP it will be taken care in early_cpu_init(), and for the APs
>>> MSR_IA32_MISC_ENABLE it will be set as part of the trampoline with the
>>> disables from trampoline_misc_enable_off.
>>
>> The way the original code works, it allows to deal with the BSP having the
>> bit clear, but some or all of the APs having it set. I simply didn't want
>> to break that property.
> 
> Oh, I see.  This looks like something we would unconditionally like to
> set in trampoline_misc_enable_off?  Except that would trigger an
> unnecessary write to the MSR if the CPU already has it disabled.

Might be an option; the extra MSR access may not be _that_ harmful.

> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

> I think the printk could be made a printk_once, since it doesn't even
> print the CPU where the bit has been seen as set, but anyway, that
> would be further adjustments.

Well, yes and no. Having it the way it is now and seeing the message twice
in a log would be indicative of a problem.

Jan
Andrew Cooper June 13, 2024, 4:17 p.m. UTC | #6
On 13/06/2024 9:19 am, Jan Beulich wrote:
> Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If
> this bit is set by the BIOS then CPUID evaluation does not work when
> data from any leaf greater than two is needed; early_cpu_init() in
> particular wants to collect leaf 7 data.
>
> Cure this by unlocking CPUID right before evaluating anything which
> depends on the maximum CPUID leaf being greater than two.
>
> Inspired by (and description cloned from) Linux commit 0c2f6d04619e
> ("x86/topology/intel: Unlock CPUID before evaluating anything").
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> While I couldn't spot anything, it kind of feels as if I'm overlooking
> further places where we might be inspecting in particular leaf 7 yet
> earlier.
>
> No Fixes: tag(s), as imo it would be too many that would want
> enumerating.

I also saw that go by, but concluded that Xen doesn't need it, hence why
I left it alone.

The truth is that only the BSP needs it.  APs sort it out in the
trampoline via trampoline_misc_enable_off, because they need to clear
XD_DISABLE in prior to enabling paging, so we should be taking it out of
early_init_intel().

But, we don't have an early BSP-only early hook, and I'm not overwhelmed
at the idea of exporting it from intel.c

I was intending to leave it alone until I can burn this whole
infrastructure to the ground and make it work nicely with policies, but
that's not a job for this point in the release...

~Andrew
Jan Beulich June 14, 2024, 6:27 a.m. UTC | #7
On 13.06.2024 18:17, Andrew Cooper wrote:
> On 13/06/2024 9:19 am, Jan Beulich wrote:
>> Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If
>> this bit is set by the BIOS then CPUID evaluation does not work when
>> data from any leaf greater than two is needed; early_cpu_init() in
>> particular wants to collect leaf 7 data.
>>
>> Cure this by unlocking CPUID right before evaluating anything which
>> depends on the maximum CPUID leaf being greater than two.
>>
>> Inspired by (and description cloned from) Linux commit 0c2f6d04619e
>> ("x86/topology/intel: Unlock CPUID before evaluating anything").
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> While I couldn't spot anything, it kind of feels as if I'm overlooking
>> further places where we might be inspecting in particular leaf 7 yet
>> earlier.
>>
>> No Fixes: tag(s), as imo it would be too many that would want
>> enumerating.
> 
> I also saw that go by, but concluded that Xen doesn't need it, hence why
> I left it alone.
> 
> The truth is that only the BSP needs it.  APs sort it out in the
> trampoline via trampoline_misc_enable_off, because they need to clear
> XD_DISABLE in prior to enabling paging, so we should be taking it out of
> early_init_intel().

Except for the (odd) case also mentioned to Roger, where the BSP might have
the bit clear but some (or all) AP(s) have it set.

> But, we don't have an early BSP-only early hook, and I'm not overwhelmed
> at the idea of exporting it from intel.c
> 
> I was intending to leave it alone until I can burn this whole
> infrastructure to the ground and make it work nicely with policies, but
> that's not a job for this point in the release...

This last part reads like the rest of your reply isn't an objection to me
putting this in with Roger's R-b, but it would be nice if you could
confirm this understanding of mine. Without this last part it, especially
the 2nd from last paragraph, certainly reads a little like an objection.

Jan
Andrew Cooper June 14, 2024, 10:14 a.m. UTC | #8
On 14/06/2024 7:27 am, Jan Beulich wrote:
> On 13.06.2024 18:17, Andrew Cooper wrote:
>> On 13/06/2024 9:19 am, Jan Beulich wrote:
>>> Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If
>>> this bit is set by the BIOS then CPUID evaluation does not work when
>>> data from any leaf greater than two is needed; early_cpu_init() in
>>> particular wants to collect leaf 7 data.
>>>
>>> Cure this by unlocking CPUID right before evaluating anything which
>>> depends on the maximum CPUID leaf being greater than two.
>>>
>>> Inspired by (and description cloned from) Linux commit 0c2f6d04619e
>>> ("x86/topology/intel: Unlock CPUID before evaluating anything").
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> While I couldn't spot anything, it kind of feels as if I'm overlooking
>>> further places where we might be inspecting in particular leaf 7 yet
>>> earlier.
>>>
>>> No Fixes: tag(s), as imo it would be too many that would want
>>> enumerating.
>> I also saw that go by, but concluded that Xen doesn't need it, hence why
>> I left it alone.
>>
>> The truth is that only the BSP needs it.  APs sort it out in the
>> trampoline via trampoline_misc_enable_off, because they need to clear
>> XD_DISABLE in prior to enabling paging, so we should be taking it out of
>> early_init_intel().
> Except for the (odd) case also mentioned to Roger, where the BSP might have
> the bit clear but some (or all) AP(s) have it set.

Fine I suppose.  It's a single MSR adjustment once per CPU.

>
>> But, we don't have an early BSP-only early hook, and I'm not overwhelmed
>> at the idea of exporting it from intel.c
>>
>> I was intending to leave it alone until I can burn this whole
>> infrastructure to the ground and make it work nicely with policies, but
>> that's not a job for this point in the release...
> This last part reads like the rest of your reply isn't an objection to me
> putting this in with Roger's R-b, but it would be nice if you could
> confirm this understanding of mine. Without this last part it, especially
> the 2nd from last paragraph, certainly reads a little like an objection.

I'm -1 to this generally.  It's churn without fixing anything AFAICT,
and is moving in a direction which will need undoing in the future.

But, because it doesn't fix anything, I don't think it's appropriate to
be tagged as 4.19 even if you and roger feel strongly enough to put it
into 4.20.

~Andrew
Jan Beulich June 14, 2024, 11:12 a.m. UTC | #9
On 14.06.2024 12:14, Andrew Cooper wrote:
> On 14/06/2024 7:27 am, Jan Beulich wrote:
>> On 13.06.2024 18:17, Andrew Cooper wrote:
>>> On 13/06/2024 9:19 am, Jan Beulich wrote:
>>>> Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If
>>>> this bit is set by the BIOS then CPUID evaluation does not work when
>>>> data from any leaf greater than two is needed; early_cpu_init() in
>>>> particular wants to collect leaf 7 data.
>>>>
>>>> Cure this by unlocking CPUID right before evaluating anything which
>>>> depends on the maximum CPUID leaf being greater than two.
>>>>
>>>> Inspired by (and description cloned from) Linux commit 0c2f6d04619e
>>>> ("x86/topology/intel: Unlock CPUID before evaluating anything").
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> ---
>>>> While I couldn't spot anything, it kind of feels as if I'm overlooking
>>>> further places where we might be inspecting in particular leaf 7 yet
>>>> earlier.
>>>>
>>>> No Fixes: tag(s), as imo it would be too many that would want
>>>> enumerating.
>>> I also saw that go by, but concluded that Xen doesn't need it, hence why
>>> I left it alone.
>>>
>>> The truth is that only the BSP needs it.  APs sort it out in the
>>> trampoline via trampoline_misc_enable_off, because they need to clear
>>> XD_DISABLE in prior to enabling paging, so we should be taking it out of
>>> early_init_intel().
>> Except for the (odd) case also mentioned to Roger, where the BSP might have
>> the bit clear but some (or all) AP(s) have it set.
> 
> Fine I suppose.  It's a single MSR adjustment once per CPU.
> 
>>
>>> But, we don't have an early BSP-only early hook, and I'm not overwhelmed
>>> at the idea of exporting it from intel.c
>>>
>>> I was intending to leave it alone until I can burn this whole
>>> infrastructure to the ground and make it work nicely with policies, but
>>> that's not a job for this point in the release...
>> This last part reads like the rest of your reply isn't an objection to me
>> putting this in with Roger's R-b, but it would be nice if you could
>> confirm this understanding of mine. Without this last part it, especially
>> the 2nd from last paragraph, certainly reads a little like an objection.
> 
> I'm -1 to this generally.  It's churn without fixing anything AFAICT,

How that? We clearly do the adjustment too late right now for the BSP.
All the leaf-7 stuff added to early_cpu_init() (iirc in part in the course
of speculation work) is useless on a system where firmware set that flag.

Jan

> and is moving in a direction which will need undoing in the future.
> 
> But, because it doesn't fix anything, I don't think it's appropriate to
> be tagged as 4.19 even if you and roger feel strongly enough to put it
> into 4.20.
> 
> ~Andrew
Andrew Cooper June 18, 2024, 12:46 p.m. UTC | #10
On 14/06/2024 12:12 pm, Jan Beulich wrote:
> On 14.06.2024 12:14, Andrew Cooper wrote:
>> On 14/06/2024 7:27 am, Jan Beulich wrote:
>>> On 13.06.2024 18:17, Andrew Cooper wrote:
>>>> On 13/06/2024 9:19 am, Jan Beulich wrote:
>>>>> Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If
>>>>> this bit is set by the BIOS then CPUID evaluation does not work when
>>>>> data from any leaf greater than two is needed; early_cpu_init() in
>>>>> particular wants to collect leaf 7 data.
>>>>>
>>>>> Cure this by unlocking CPUID right before evaluating anything which
>>>>> depends on the maximum CPUID leaf being greater than two.
>>>>>
>>>>> Inspired by (and description cloned from) Linux commit 0c2f6d04619e
>>>>> ("x86/topology/intel: Unlock CPUID before evaluating anything").
>>>>>
>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>>> ---
>>>>> While I couldn't spot anything, it kind of feels as if I'm overlooking
>>>>> further places where we might be inspecting in particular leaf 7 yet
>>>>> earlier.
>>>>>
>>>>> No Fixes: tag(s), as imo it would be too many that would want
>>>>> enumerating.
>>>> I also saw that go by, but concluded that Xen doesn't need it, hence why
>>>> I left it alone.
>>>>
>>>> The truth is that only the BSP needs it.  APs sort it out in the
>>>> trampoline via trampoline_misc_enable_off, because they need to clear
>>>> XD_DISABLE in prior to enabling paging, so we should be taking it out of
>>>> early_init_intel().
>>> Except for the (odd) case also mentioned to Roger, where the BSP might have
>>> the bit clear but some (or all) AP(s) have it set.
>> Fine I suppose.  It's a single MSR adjustment once per CPU.
>>
>>>> But, we don't have an early BSP-only early hook, and I'm not overwhelmed
>>>> at the idea of exporting it from intel.c
>>>>
>>>> I was intending to leave it alone until I can burn this whole
>>>> infrastructure to the ground and make it work nicely with policies, but
>>>> that's not a job for this point in the release...
>>> This last part reads like the rest of your reply isn't an objection to me
>>> putting this in with Roger's R-b, but it would be nice if you could
>>> confirm this understanding of mine. Without this last part it, especially
>>> the 2nd from last paragraph, certainly reads a little like an objection.
>> I'm -1 to this generally.  It's churn without fixing anything AFAICT,
> How that? We clearly do the adjustment too late right now for the BSP.
> All the leaf-7 stuff added to early_cpu_init() (iirc in part in the course
> of speculation work) is useless on a system where firmware set that flag.

After discussing this at the x86 maintainers meeting, I agree that it is
fixing a potential bug.

So, while I really dislike the patch, it's the right approach in the
short term, and should go in.

~Andrew
diff mbox series

Patch

--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -336,7 +336,8 @@  void __init early_cpu_init(bool verbose)
 
 	c->x86_vendor = x86_cpuid_lookup_vendor(ebx, ecx, edx);
 	switch (c->x86_vendor) {
-	case X86_VENDOR_INTEL:    actual_cpu = intel_cpu_dev;    break;
+	case X86_VENDOR_INTEL:    intel_unlock_cpuid_leaves(c);
+				  actual_cpu = intel_cpu_dev;    break;
 	case X86_VENDOR_AMD:      actual_cpu = amd_cpu_dev;      break;
 	case X86_VENDOR_CENTAUR:  actual_cpu = centaur_cpu_dev;  break;
 	case X86_VENDOR_SHANGHAI: actual_cpu = shanghai_cpu_dev; break;
--- a/xen/arch/x86/cpu/cpu.h
+++ b/xen/arch/x86/cpu/cpu.h
@@ -24,3 +24,5 @@  void amd_init_lfence(struct cpuinfo_x86
 void amd_init_ssbd(const struct cpuinfo_x86 *c);
 void amd_init_spectral_chicken(void);
 void detect_zen2_null_seg_behaviour(void);
+
+void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c);
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -303,10 +303,24 @@  static void __init noinline intel_init_l
 		ctxt_switch_masking = intel_ctxt_switch_masking;
 }
 
-static void cf_check early_init_intel(struct cpuinfo_x86 *c)
+/* Unmask CPUID levels if masked. */
+void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c)
 {
-	u64 misc_enable, disable;
+	uint64_t misc_enable, disable;
+
+	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+
+	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
+	if (disable) {
+		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
+		bootsym(trampoline_misc_enable_off) |= disable;
+		c->cpuid_level = cpuid_eax(0);
+		printk(KERN_INFO "revised cpuid level: %u\n", c->cpuid_level);
+	}
+}
 
+static void cf_check early_init_intel(struct cpuinfo_x86 *c)
+{
 	/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
 	if (c->x86 == 15 && c->x86_cache_alignment == 64)
 		c->x86_cache_alignment = 128;
@@ -315,16 +329,7 @@  static void cf_check early_init_intel(st
 	    bootsym(trampoline_misc_enable_off) & MSR_IA32_MISC_ENABLE_XD_DISABLE)
 		printk(KERN_INFO "re-enabled NX (Execute Disable) protection\n");
 
-	/* Unmask CPUID levels and NX if masked: */
-	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
-	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
-	if (disable) {
-		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
-		bootsym(trampoline_misc_enable_off) |= disable;
-		printk(KERN_INFO "revised cpuid level: %d\n",
-		       cpuid_eax(0));
-	}
+	intel_unlock_cpuid_leaves(c);
 
 	/* CPUID workaround for Intel 0F33/0F34 CPU */
 	if (boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 3 &&