diff mbox series

[v10,24/27] KVM: x86: Enable CET virtualization for VMX and advertise to userspace

Message ID 20240219074733.122080-25-weijiang.yang@intel.com (mailing list archive)
State New, archived
Headers show
Series Enable CET Virtualization | expand

Commit Message

Yang, Weijiang Feb. 19, 2024, 7:47 a.m. UTC
Expose CET features to guest if KVM/host can support them, clear CPUID
feature bits if KVM/host cannot support.

Set CPUID feature bits so that CET features are available in guest CPUID.
Add CR4.CET bit support in order to allow guest set CET master control
bit.

Disable KVM CET feature if unrestricted_guest is unsupported/disabled as
KVM does not support emulating CET.

The CET load-bits in VM_ENTRY/VM_EXIT control fields should be set to make
guest CET xstates isolated from host's.

On platforms with VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error
code will fail, and if VMX_BASIC[bit56] == 1, #CP injection with or without
error code is allowed. Disable CET feature bits if the MSR bit is cleared
so that nested VMM can inject #CP if and only if VMX_BASIC[bit56] == 1.

Don't expose CET feature if either of {U,S}_CET xstate bits is cleared
in host XSS or if XSAVES isn't supported.

CET MSR contents after reset, power-up and INIT are set to 0s, clears the
guest fpstate fields so that the guest MSRs are reset to 0s after the events.

Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/include/asm/kvm_host.h  |  2 +-
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/kvm/cpuid.c             | 25 ++++++++++++++++++++-----
 arch/x86/kvm/vmx/capabilities.h  |  6 ++++++
 arch/x86/kvm/vmx/vmx.c           | 30 +++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/vmx.h           |  6 ++++--
 arch/x86/kvm/x86.c               | 26 ++++++++++++++++++++++++--
 arch/x86/kvm/x86.h               |  3 +++
 8 files changed, 88 insertions(+), 11 deletions(-)

Comments

Sean Christopherson May 1, 2024, 11:15 p.m. UTC | #1
On Sun, Feb 18, 2024, Yang Weijiang wrote:
> @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
>  		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
>  	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
>  		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
> +	/*
> +	 * Don't use boot_cpu_has() to check availability of IBT because the
> +	 * feature bit is cleared in boot_cpu_data when ibt=off is applied
> +	 * in host cmdline.

I'm not convinced this is a good reason to diverge from the host kernel.  E.g.
PCID and many other features honor the host setup, I don't see what makes IBT
special.

LA57 is special because it's entirely reasonable, likely even, for a host to
only want to use 48-bit virtual addresses, but still want to let the guest enable
LA57.

> @@ -4934,6 +4935,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>  
>  	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
>  
> +	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
> +		vmcs_writel(GUEST_SSP, 0);
> +		vmcs_writel(GUEST_S_CET, 0);
> +		vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
> +	} else if (kvm_cpu_cap_has(X86_FEATURE_IBT)) {
> +		vmcs_writel(GUEST_S_CET, 0);
> +	}

Similar to my comments about MSR interception, I think it would be better to
explicitly handle the "common" field.  At first glance, code like the above makes
it look like IBT is mutually exclusive with SHSTK, e.g. a reader that isn't
looking closely could easily miss that both paths write GUEST_S_CET.

	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
		vmcs_writel(GUEST_SSP, 0);
		vmcs_writel(GUEST_INTR_SSP_TABLE, 0); 
	}
	if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
	    kvm_cpu_cap_has(X86_FEATURE_SHSTK))
		vmcs_writel(GUEST_S_CET, 0);
Edgecombe, Rick P May 1, 2024, 11:24 p.m. UTC | #2
On Wed, 2024-05-01 at 16:15 -0700, Sean Christopherson wrote:
> On Sun, Feb 18, 2024, Yang Weijiang wrote:
> > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
> >                 kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
> >         if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
> >                 kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
> > +       /*
> > +        * Don't use boot_cpu_has() to check availability of IBT because the
> > +        * feature bit is cleared in boot_cpu_data when ibt=off is applied
> > +        * in host cmdline.
> 
> I'm not convinced this is a good reason to diverge from the host kernel.  E.g.
> PCID and many other features honor the host setup, I don't see what makes IBT
> special.
> 
> LA57 is special because it's entirely reasonable, likely even, for a host to
> only want to use 48-bit virtual addresses, but still want to let the guest
> enable
> LA57.

Definitely. I swear we (Weijiang and I) had a back and forth at some point where
we agreed to match the host support. Plus I think the CET FPU stuff triggers off
of host support for CET. So if the host doesn't have X86_FEATURE_SHSTK or
X86_FEATURE_IBT then... hopefully it's caught later. But then don't report it's
supported.
Sean Christopherson May 1, 2024, 11:34 p.m. UTC | #3
On Sun, Feb 18, 2024, Yang Weijiang wrote:
> @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void)
>  		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
>  		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
>  		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
> -		F(SGX_LC) | F(BUS_LOCK_DETECT)
> +		F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK)
>  	);
>  	/* Set LA57 based on hardware capability. */
>  	if (cpuid_ecx(7) & F(LA57))
> @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void)
>  		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
>  		F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
>  		F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
> -		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
> +		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) |
> +		F(IBT)
>  	);

...

> @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void)
>  
>  	if (cpu_has_vmx_waitpkg())
>  		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
> +
> +	/*
> +	 * Disable CET if unrestricted_guest is unsupported as KVM doesn't
> +	 * enforce CET HW behaviors in emulator. On platforms with
> +	 * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
> +	 * fails, so disable CET in this case too.
> +	 */
> +	if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
> +	    !cpu_has_vmx_basic_no_hw_errcode()) {
> +		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
> +		kvm_cpu_cap_clear(X86_FEATURE_IBT);
> +	}

Oh!  Almost missed it.  This patch should explicitly kvm_cpu_cap_clear()
X86_FEATURE_SHSTK and X86_FEATURE_IBT.  We *know* there are upcoming AMD CPUs
that support at least SHSTK, so enumerating support for common code would yield
a version of KVM that incorrectly advertises support for SHSTK.

I hope to land both Intel and AMD virtualization in the same kernel release, but
there are no guarantees that will happen.  And explicitly clearing both SHSTK and
IBT would guard against IBT showing up in some future AMD CPU in advance of KVM
gaining full support.
Yang, Weijiang May 6, 2024, 9:19 a.m. UTC | #4
On 5/2/2024 7:15 AM, Sean Christopherson wrote:
> On Sun, Feb 18, 2024, Yang Weijiang wrote:
>> @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
>>   		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
>>   	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
>>   		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
>> +	/*
>> +	 * Don't use boot_cpu_has() to check availability of IBT because the
>> +	 * feature bit is cleared in boot_cpu_data when ibt=off is applied
>> +	 * in host cmdline.
> I'm not convinced this is a good reason to diverge from the host kernel.  E.g.
> PCID and many other features honor the host setup, I don't see what makes IBT
> special.

This is mostly based on our user experience and the hypothesis for cloud computing:
When we evolve host kernels, we constantly encounter issues when kernel IBT is on,
so we have to disable kernel IBT by adding ibt=off. But we need to test the CET features
in VM, if we just simply refer to host boot cpuid data, then IBT cannot be enabled in
VM which makes CET features incomplete in guest.

I guess in cloud computing, it could run into similar dilemma. In this case, the tenant
cannot benefit the feature just because of host SW problem. I know currently KVM
except LA57 always honors host feature configurations, but in CET case, there could be
divergence wrt honoring host configuration as long as there's no quirk for the feature.

But I think the issue is still open for discussion...

>
> LA57 is special because it's entirely reasonable, likely even, for a host to
> only want to use 48-bit virtual addresses, but still want to let the guest enable
> LA57.
>
>> @@ -4934,6 +4935,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>>   
>>   	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
>>   
>> +	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
>> +		vmcs_writel(GUEST_SSP, 0);
>> +		vmcs_writel(GUEST_S_CET, 0);
>> +		vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
>> +	} else if (kvm_cpu_cap_has(X86_FEATURE_IBT)) {
>> +		vmcs_writel(GUEST_S_CET, 0);
>> +	}
> Similar to my comments about MSR interception, I think it would be better to
> explicitly handle the "common" field.  At first glance, code like the above makes
> it look like IBT is mutually exclusive with SHSTK, e.g. a reader that isn't
> looking closely could easily miss that both paths write GUEST_S_CET.

Sure,thanks!

>
> 	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
> 		vmcs_writel(GUEST_SSP, 0);
> 		vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
> 	}
> 	if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
> 	    kvm_cpu_cap_has(X86_FEATURE_SHSTK))
> 		vmcs_writel(GUEST_S_CET, 0);
Yang, Weijiang May 6, 2024, 9:41 a.m. UTC | #5
On 5/2/2024 7:34 AM, Sean Christopherson wrote:
> On Sun, Feb 18, 2024, Yang Weijiang wrote:
>> @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void)
>>   		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
>>   		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
>>   		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
>> -		F(SGX_LC) | F(BUS_LOCK_DETECT)
>> +		F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK)
>>   	);
>>   	/* Set LA57 based on hardware capability. */
>>   	if (cpuid_ecx(7) & F(LA57))
>> @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void)
>>   		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
>>   		F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
>>   		F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
>> -		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
>> +		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) |
>> +		F(IBT)
>>   	);
> ...
>
>> @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void)
>>   
>>   	if (cpu_has_vmx_waitpkg())
>>   		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
>> +
>> +	/*
>> +	 * Disable CET if unrestricted_guest is unsupported as KVM doesn't
>> +	 * enforce CET HW behaviors in emulator. On platforms with
>> +	 * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
>> +	 * fails, so disable CET in this case too.
>> +	 */
>> +	if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
>> +	    !cpu_has_vmx_basic_no_hw_errcode()) {
>> +		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
>> +		kvm_cpu_cap_clear(X86_FEATURE_IBT);
>> +	}
> Oh!  Almost missed it.  This patch should explicitly kvm_cpu_cap_clear()
> X86_FEATURE_SHSTK and X86_FEATURE_IBT.  We *know* there are upcoming AMD CPUs
> that support at least SHSTK, so enumerating support for common code would yield
> a version of KVM that incorrectly advertises support for SHSTK.
>
> I hope to land both Intel and AMD virtualization in the same kernel release, but
> there are no guarantees that will happen.  And explicitly clearing both SHSTK and
> IBT would guard against IBT showing up in some future AMD CPU in advance of KVM
> gaining full support.

Let me be clear on this, you want me to disable SHSTK/IBT with kvm_cpu_cap_clear() unconditionally
for now in this patch, and wait until both AMD's SVM patches and this series are ready for guest CET,
then remove the disabling code in this patch for final merge, am I right?
Sean Christopherson May 6, 2024, 4:54 p.m. UTC | #6
On Mon, May 06, 2024, Weijiang Yang wrote:
> On 5/2/2024 7:15 AM, Sean Christopherson wrote:
> > On Sun, Feb 18, 2024, Yang Weijiang wrote:
> > > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
> > >   		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
> > >   	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
> > >   		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
> > > +	/*
> > > +	 * Don't use boot_cpu_has() to check availability of IBT because the
> > > +	 * feature bit is cleared in boot_cpu_data when ibt=off is applied
> > > +	 * in host cmdline.
> > I'm not convinced this is a good reason to diverge from the host kernel.  E.g.
> > PCID and many other features honor the host setup, I don't see what makes IBT
> > special.
> 
> This is mostly based on our user experience and the hypothesis for cloud
> computing: When we evolve host kernels, we constantly encounter issues when
> kernel IBT is on, so we have to disable kernel IBT by adding ibt=off. But we
> need to test the CET features in VM, if we just simply refer to host boot
> cpuid data, then IBT cannot be enabled in VM which makes CET features
> incomplete in guest.
> 
> I guess in cloud computing, it could run into similar dilemma. In this case,
> the tenant cannot benefit the feature just because of host SW problem.

Hmm, but such issues should be found before deploying a kernel to production.

The one scenario that comes to mind where I can see someone wanting to disable
IBT would be running a out-of-tree and/or third party module.

> I know currently KVM except LA57 always honors host feature configurations,
> but in CET case, there could be divergence wrt honoring host configuration as
> long as there's no quirk for the feature.
> 
> But I think the issue is still open for discussion...

I'm not totally opposed to the idea.

Somewhat off-topic, the existing LA57 code upon which the IBT check is based is
flawed, as it doesn't account for the max supported CPUID leaf.  On Intel CPUs,
that could result in a false positive due CPUID (stupidly) returning the value
of the last implemented CPUID leaf, no zeros.  In practice, it doesn't cause
problems because CPUID.0x7 has been supported since forever, but it's still a
bug.

Hmm, actually, __kvm_cpu_cap_mask() has the exact same bug.  And that's much less
theoretical, e.g. kvm_cpu_cap_init_kvm_defined() in particular is likely to cause
problems at some point.

And I really don't like that KVM open codes calls to cpuid_<reg>() for these
"raw" features.  One option would be to and helpers to change this:

	if (cpuid_edx(7) & F(IBT))
		kvm_cpu_cap_set(X86_FEATURE_IBT);

to this:

	if (raw_cpuid_has(X86_FEATURE_IBT))
		kvm_cpu_cap_set(X86_FEATURE_IBT);

but I think we can do better, and harden the CPUID code in the process.  If we
do kvm_cpu_cap_set() _before_ kvm_cpu_cap_mask(), then incorporating the raw host
CPUID will happen automagically, as __kvm_cpu_cap_mask() will clear bits that
aren't in host CPUID.

The most obvious approach would be to simply call kvm_cpu_cap_set() before
kvm_cpu_cap_mask(), but that's more than a bit confusing, and would open the door
for potential bugs due to calling kvm_cpu_cap_set() after kvm_cpu_cap_mask().
And detecting such bugs would be difficult, because there are features that KVM
fully emulates, i.e. _must_ be stuffed after kvm_cpu_cap_mask().

Instead of calling kvm_cpu_cap_set() directly, we can take advantage of the fact
that the F() maskes are fed into kvm_cpu_cap_mask(), i.e. are naturally processed
before the corresponding kvm_cpu_cap_mask().

If we add an array to track which capabilities have been initialized, then F()
can WARN on improper usage.  That would allow detecting bad "raw" usage, *and*
would detect (some) scenarios where a F() is fed into the wrong leaf, e.g. if
we added F(LA57) to CPUID_7_EDX instead of CPUID_7_ECX.

#define F(name)								\
({									\
	u32 __leaf = __feature_leaf(X86_FEATURE_##name);		\
									\
	BUILD_BUG_ON(__leaf >= ARRAY_SIZE(kvm_cpu_cap_initialized));	\
	WARN_ON_ONCE(kvm_cpu_cap_initialized[__leaf]);			\
									\
	feature_bit(name);						\
})

/*
 * Raw Feature - For features that KVM supports based purely on raw host CPUID,
 * i.e. that KVM virtualizes even if the host kernel doesn't use the feature.
 * Simply force set the feature in KVM's capabilities, raw CPUID support will
 * be factored in by kvm_cpu_cap_mask().
 */
#define RAW_F(name)						\
({								\
	kvm_cpu_cap_set(X86_FEATURE_##name);			\
	F(name);						\
})

Assuming testing doesn't poke a hole in my idea, I'll post a small series.
Edgecombe, Rick P May 6, 2024, 5:05 p.m. UTC | #7
On Mon, 2024-05-06 at 17:19 +0800, Yang, Weijiang wrote:
> On 5/2/2024 7:15 AM, Sean Christopherson wrote:
> > On Sun, Feb 18, 2024, Yang Weijiang wrote:
> > > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
> > >                 kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
> > >         if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
> > >                 kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
> > > +       /*
> > > +        * Don't use boot_cpu_has() to check availability of IBT because
> > > the
> > > +        * feature bit is cleared in boot_cpu_data when ibt=off is applied
> > > +        * in host cmdline.
> > I'm not convinced this is a good reason to diverge from the host kernel. 
> > E.g.
> > PCID and many other features honor the host setup, I don't see what makes
> > IBT
> > special.
> 
> This is mostly based on our user experience and the hypothesis for cloud
> computing:
> When we evolve host kernels, we constantly encounter issues when kernel IBT is
> on,
> so we have to disable kernel IBT by adding ibt=off. But we need to test the
> CET features
> in VM, if we just simply refer to host boot cpuid data, then IBT cannot be
> enabled in
> VM which makes CET features incomplete in guest.
> 
> I guess in cloud computing, it could run into similar dilemma. In this case,
> the tenant
> cannot benefit the feature just because of host SW problem. I know currently
> KVM
> except LA57 always honors host feature configurations, but in CET case, there
> could be
> divergence wrt honoring host configuration as long as there's no quirk for the
> feature.
> 
> But I think the issue is still open for discussion...

I think the back and forth I remembered was actually around SGX IBT, but I did
find this thread:
https://lore.kernel.org/lkml/20231128085025.GA3818@noisy.programming.kicks-ass.net/

Disabling kernel IBT enforcement without disabling KVM IBT seems worthwhile. But
the solution is to not to not honor host support. It is to have kernel IBT not
clear the feature flag and instead clear something else. This can be done
independently of the KVM series.
Sean Christopherson May 6, 2024, 11:33 p.m. UTC | #8
On Mon, May 06, 2024, Rick P Edgecombe wrote:
> On Mon, 2024-05-06 at 17:19 +0800, Yang, Weijiang wrote:
> > On 5/2/2024 7:15 AM, Sean Christopherson wrote:
> > > On Sun, Feb 18, 2024, Yang Weijiang wrote:
> > > > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
> > > >                 kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
> > > >         if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
> > > >                 kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
> > > > +       /*
> > > > +        * Don't use boot_cpu_has() to check availability of IBT because
> > > > the
> > > > +        * feature bit is cleared in boot_cpu_data when ibt=off is applied
> > > > +        * in host cmdline.
> > > I'm not convinced this is a good reason to diverge from the host kernel. 
> > > E.g.  PCID and many other features honor the host setup, I don't see what
> > > makes IBT special.
> > 
> > This is mostly based on our user experience and the hypothesis for cloud
> > computing: When we evolve host kernels, we constantly encounter issues when
> > kernel IBT is on, so we have to disable kernel IBT by adding ibt=off. But
> > we need to test the CET features in VM, if we just simply refer to host
> > boot cpuid data, then IBT cannot be enabled in VM which makes CET features
> > incomplete in guest.
> > 
> > I guess in cloud computing, it could run into similar dilemma. In this
> > case, the tenant cannot benefit the feature just because of host SW
> > problem. I know currently KVM except LA57 always honors host feature
> > configurations, but in CET case, there could be divergence wrt honoring
> > host configuration as long as there's no quirk for the feature.
> > 
> > But I think the issue is still open for discussion...
> 
> I think the back and forth I remembered was actually around SGX IBT, but I did
> find this thread:
> https://lore.kernel.org/lkml/20231128085025.GA3818@noisy.programming.kicks-ass.net/
> 
> Disabling kernel IBT enforcement without disabling KVM IBT seems worthwhile. But
> the solution is to not to not honor host support. It is to have kernel IBT not
> clear the feature flag and instead clear something else. This can be done
> independently of the KVM series.

Hmm, I don't disagree, but I'm not sure it makes sense to wait on that discussion
to exempt IBT from the "it must be supported in the host" rule.  I suspect that
tweaking the handling X86_FEATURE_IBT of will open a much larger can of worms,
as overhauling feature flag handling is very much on the x86 maintainers todo
list.

IMO, the odds of there being a hardware bug that necessitates hard disabling IBT
are lower than the odds of KVM support for CET landing well before the feature
stuff is sorted out.
Edgecombe, Rick P May 6, 2024, 11:53 p.m. UTC | #9
On Mon, 2024-05-06 at 16:33 -0700, Sean Christopherson wrote:
> 
> Hmm, I don't disagree, but I'm not sure it makes sense to wait on that
> discussion
> to exempt IBT from the "it must be supported in the host" rule.  I suspect
> that
> tweaking the handling X86_FEATURE_IBT of will open a much larger can of worms,
> as overhauling feature flag handling is very much on the x86 maintainers todo
> list.
> 
> IMO, the odds of there being a hardware bug that necessitates hard disabling
> IBT
> are lower than the odds of KVM support for CET landing well before the feature
> stuff is sorted out.

I see a few reasons to tie to the host cpu feature:
1. Disabling it because of some HW concern. I agree with your assessment on the
chances.
2. Having the cpu feature be disabled by some kernel parameter, and having KVM
try to use it without the necessary FPU or other host support. It could cause
lots of problems, guest->host DOS, etc.
3. Confusion for the user about which CPU features are actually in use on the
system. There is a fair chance for compatibility issues to show up with CET.
Today there is clearcpuid. If a user was having issues with CET and just wanted
to turn it off, they might use clearcpuid or something else to just disable CET.
Then boot a VM and find it was still enabled. For shadow stack there is also
nousershstk.

So, my two cents, it's just all easier to reason about for everyone if you tie
it to host cpu features.

I don't immediately see what trouble will be in giving kernel IBT a disable
parameter that doesn't touch X86_FEATURE_IBT at some point in the future. Sorry
if I'm missing the point. So like, ibt=off disables all IBT including kernel
IBT, kernel_ibt=off disables kernel IBT enforcement via a global bool.
Yang, Weijiang May 7, 2024, 2:37 a.m. UTC | #10
On 5/7/2024 12:54 AM, Sean Christopherson wrote:
> On Mon, May 06, 2024, Weijiang Yang wrote:
>> On 5/2/2024 7:15 AM, Sean Christopherson wrote:
>>> On Sun, Feb 18, 2024, Yang Weijiang wrote:
>>>> @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
>>>>    		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
>>>>    	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
>>>>    		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
>>>> +	/*
>>>> +	 * Don't use boot_cpu_has() to check availability of IBT because the
>>>> +	 * feature bit is cleared in boot_cpu_data when ibt=off is applied
>>>> +	 * in host cmdline.
>>> I'm not convinced this is a good reason to diverge from the host kernel.  E.g.
>>> PCID and many other features honor the host setup, I don't see what makes IBT
>>> special.
>> This is mostly based on our user experience and the hypothesis for cloud
>> computing: When we evolve host kernels, we constantly encounter issues when
>> kernel IBT is on, so we have to disable kernel IBT by adding ibt=off. But we
>> need to test the CET features in VM, if we just simply refer to host boot
>> cpuid data, then IBT cannot be enabled in VM which makes CET features
>> incomplete in guest.
>>
>> I guess in cloud computing, it could run into similar dilemma. In this case,
>> the tenant cannot benefit the feature just because of host SW problem.
> Hmm, but such issues should be found before deploying a kernel to production.
>
> The one scenario that comes to mind where I can see someone wanting to disable
> IBT would be running a out-of-tree and/or third party module.

Yes, the developers may neglect IBT violations in modules/kernel components and deploy
them, in this case, host side has to either fix the issues or disable IBT.

>
>> I know currently KVM except LA57 always honors host feature configurations,
>> but in CET case, there could be divergence wrt honoring host configuration as
>> long as there's no quirk for the feature.
>>
>> But I think the issue is still open for discussion...
> I'm not totally opposed to the idea.
>
> Somewhat off-topic, the existing LA57 code upon which the IBT check is based is
> flawed, as it doesn't account for the max supported CPUID leaf.  On Intel CPUs,
> that could result in a false positive due CPUID (stupidly) returning the value
> of the last implemented CPUID leaf, no zeros.  In practice, it doesn't cause
> problems because CPUID.0x7 has been supported since forever, but it's still a
> bug.
>
> Hmm, actually, __kvm_cpu_cap_mask() has the exact same bug.  And that's much less
> theoretical, e.g. kvm_cpu_cap_init_kvm_defined() in particular is likely to cause
> problems at some point.
>
> And I really don't like that KVM open codes calls to cpuid_<reg>() for these
> "raw" features.  One option would be to and helpers to change this:
>
> 	if (cpuid_edx(7) & F(IBT))
> 		kvm_cpu_cap_set(X86_FEATURE_IBT);
>
> to this:
>
> 	if (raw_cpuid_has(X86_FEATURE_IBT))
> 		kvm_cpu_cap_set(X86_FEATURE_IBT);
>
> but I think we can do better, and harden the CPUID code in the process.  If we
> do kvm_cpu_cap_set() _before_ kvm_cpu_cap_mask(), then incorporating the raw host
> CPUID will happen automagically, as __kvm_cpu_cap_mask() will clear bits that
> aren't in host CPUID.
>
> The most obvious approach would be to simply call kvm_cpu_cap_set() before
> kvm_cpu_cap_mask(), but that's more than a bit confusing, and would open the door
> for potential bugs due to calling kvm_cpu_cap_set() after kvm_cpu_cap_mask().
> And detecting such bugs would be difficult, because there are features that KVM
> fully emulates, i.e. _must_ be stuffed after kvm_cpu_cap_mask().
>
> Instead of calling kvm_cpu_cap_set() directly, we can take advantage of the fact
> that the F() maskes are fed into kvm_cpu_cap_mask(), i.e. are naturally processed
> before the corresponding kvm_cpu_cap_mask().
>
> If we add an array to track which capabilities have been initialized, then F()
> can WARN on improper usage.  That would allow detecting bad "raw" usage, *and*
> would detect (some) scenarios where a F() is fed into the wrong leaf, e.g. if
> we added F(LA57) to CPUID_7_EDX instead of CPUID_7_ECX.
>
> #define F(name)								\
> ({									\
> 	u32 __leaf = __feature_leaf(X86_FEATURE_##name);		\
> 									\
> 	BUILD_BUG_ON(__leaf >= ARRAY_SIZE(kvm_cpu_cap_initialized));	\
> 	WARN_ON_ONCE(kvm_cpu_cap_initialized[__leaf]);			\
> 									\
> 	feature_bit(name);						\
> })
>
> /*
>   * Raw Feature - For features that KVM supports based purely on raw host CPUID,
>   * i.e. that KVM virtualizes even if the host kernel doesn't use the feature.
>   * Simply force set the feature in KVM's capabilities, raw CPUID support will
>   * be factored in by kvm_cpu_cap_mask().
>   */
> #define RAW_F(name)						\
> ({								\
> 	kvm_cpu_cap_set(X86_FEATURE_##name);			\
> 	F(name);						\
> })
>
> Assuming testing doesn't poke a hole in my idea, I'll post a small series.

Fancy enough! But I like the idea :-)

>
Sean Christopherson May 7, 2024, 2:21 p.m. UTC | #11
On Mon, May 06, 2024, Rick P Edgecombe wrote:
> I don't immediately see what trouble will be in giving kernel IBT a disable
> parameter that doesn't touch X86_FEATURE_IBT at some point in the future.

Keeping X86_FEATURE_IBT set will result in "ibt" being reported in /proc/cpuinfo,
i.e. will mislead userspace into thinking IBT is supported and fully enabled by
the kernel.  For a security feature, that's a pretty big issue.

To fudge around that, we could add a synthetic feature flag to let the kernel
tell KVM whether or not it's safe to virtualize IBT, but I don't see what value
that adds over KVM checking raw host CPUID.
Edgecombe, Rick P May 7, 2024, 2:45 p.m. UTC | #12
On Tue, 2024-05-07 at 07:21 -0700, Sean Christopherson wrote:
> 
> Keeping X86_FEATURE_IBT set will result in "ibt" being reported in
> /proc/cpuinfo,
> i.e. will mislead userspace into thinking IBT is supported and fully enabled
> by
> the kernel.  For a security feature, that's a pretty big issue.

Since the beginning, if you don't configure kernel IBT in Kconfig but the HW
supports it, "ibt" will appear in /proc/cpuinfo. It never was a reliable
indicator of kernel IBT enforcement. It is just an indicator of if the IBT
feature is usable. I think tying kernel IBT enforcement to the CPU feature is
wrong. But if you disable the HW feature, it makes sense that the enforcement
would be disabled.

CET is something that requires a fair amount of SW enablement. SW needs to do
things in special ways or things will go wrong. So whether IBT is in use and
whether it is supported by the HW are useful to maintain as separate concepts.

> 
> To fudge around that, we could add a synthetic feature flag to let the kernel
> tell KVM whether or not it's safe to virtualize IBT, but I don't see what
> value
> that adds over KVM checking raw host CPUID.

A synthetic feature flag for kernel IBT seems reasonable to me. It's what I
suggested on that thread I linked earlier. But Peterz was advocating for a bool.
How enforcement would be exposed, would just be dmesg I guess. Having a new
feature flag still makes sense to me. Maybe he could be convinced.
Sean Christopherson May 7, 2024, 3:08 p.m. UTC | #13
On Tue, May 07, 2024, Rick P Edgecombe wrote:
> On Tue, 2024-05-07 at 07:21 -0700, Sean Christopherson wrote:
> > 
> > Keeping X86_FEATURE_IBT set will result in "ibt" being reported in
> > /proc/cpuinfo,
> > i.e. will mislead userspace into thinking IBT is supported and fully enabled
> > by
> > the kernel.  For a security feature, that's a pretty big issue.
> 
> Since the beginning, if you don't configure kernel IBT in Kconfig but the HW
> supports it, "ibt" will appear in /proc/cpuinfo. It never was a reliable
> indicator of kernel IBT enforcement.

Ah, good to know.

> It is just an indicator of if the IBT feature is usable.

Does ibt=off make IBT unusable for userspace?  Huh.  Looking at the #CP handler,
I take it userspace support for IBT hasn't landed yet?

> I think tying kernel IBT enforcement to the CPU feature is wrong. But if you
> disable the HW feature, it makes sense that the enforcement would be
> disabled.
> 
> CET is something that requires a fair amount of SW enablement. SW needs to do
> things in special ways or things will go wrong. So whether IBT is in use and
> whether it is supported by the HW are useful to maintain as separate concepts.
> 
> > 
> > To fudge around that, we could add a synthetic feature flag to let the
> > kernel tell KVM whether or not it's safe to virtualize IBT, but I don't see
> > what value that adds over KVM checking raw host CPUID.
> 
> A synthetic feature flag for kernel IBT seems reasonable to me. It's what I
> suggested on that thread I linked earlier. But Peterz was advocating for a bool.
> How enforcement would be exposed, would just be dmesg I guess. Having a new
> feature flag still makes sense to me. Maybe he could be convinced.

If there's a need for IBT and KERNEL_IBT, I agree a synthetic flag makes sense.
But as above, it's not clear to me why both are needed, at least for KVM's sake.
Is the need more apparent when userspace IBT support comes along?
Edgecombe, Rick P May 7, 2024, 3:33 p.m. UTC | #14
On Tue, 2024-05-07 at 08:08 -0700, Sean Christopherson wrote:
> > It is just an indicator of if the IBT feature is usable.
> 
> Does ibt=off make IBT unusable for userspace?  Huh.  Looking at the #CP
> handler,
> I take it userspace support for IBT hasn't landed yet?

Sadly it remains an small internal dev branch that was preempted by the enormous
TDX ramp. It's my #1 thing I want to get back to when time reappears. Despite
being an old feature at this point, there is contingent of HW CFI fans that
still want to see it, including on the gcc/glibc and distro side. So I still
have hope.

> 
> > I think tying kernel IBT enforcement to the CPU feature is wrong. But if you
> > disable the HW feature, it makes sense that the enforcement would be
> > disabled.
> > 
> > CET is something that requires a fair amount of SW enablement. SW needs to
> > do
> > things in special ways or things will go wrong. So whether IBT is in use and
> > whether it is supported by the HW are useful to maintain as separate
> > concepts.
> > 
> > > 
> > > To fudge around that, we could add a synthetic feature flag to let the
> > > kernel tell KVM whether or not it's safe to virtualize IBT, but I don't
> > > see
> > > what value that adds over KVM checking raw host CPUID.
> > 
> > A synthetic feature flag for kernel IBT seems reasonable to me. It's what I
> > suggested on that thread I linked earlier. But Peterz was advocating for a
> > bool.
> > How enforcement would be exposed, would just be dmesg I guess. Having a new
> > feature flag still makes sense to me. Maybe he could be convinced.
> 
> If there's a need for IBT and KERNEL_IBT, I agree a synthetic flag makes
> sense.
> But as above, it's not clear to me why both are needed, at least for KVM's
> sake.
> Is the need more apparent when userspace IBT support comes along?

Isn't KVM CET kind of a second user though? It doesn't depends on CR4.CET like
the rest, but the same host FPU support. Let me try to ping peterz re the
synthetic flag.

For shadow stack we also have user_shstk. This was done because of the expected
introduction of the CET_SSS bit (the shadow stack fracturing busy thing bit). It
can be treated something like a supervisor shadow stack support bit. So for
guests, you might have user shadow stack support and not supervisor. At least
that was the idea.
Yang, Weijiang May 16, 2024, 7:13 a.m. UTC | #15
On 5/2/2024 7:15 AM, Sean Christopherson wrote:
> On Sun, Feb 18, 2024, Yang Weijiang wrote:
>> @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
>>   		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
>>   	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
>>   		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
>> +	/*
>> +	 * Don't use boot_cpu_has() to check availability of IBT because the
>> +	 * feature bit is cleared in boot_cpu_data when ibt=off is applied
>> +	 * in host cmdline.
> I'm not convinced this is a good reason to diverge from the host kernel.  E.g.
> PCID and many other features honor the host setup, I don't see what makes IBT
> special.
>
>
Hi, Sean,
We synced the issue internally, and got conclusion that KVM should honor host IBT config.
In this case IBT bit in boot_cpu_data should be honored.  With this policy, it can avoid CPUID
confusion to guest side due to host ibt=off config. Host side xstate support couldn't be an issue
because we already have below check in this patch:

+ if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER | + XFEATURE_MASK_CET_KERNEL)) != + (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER | + XFEATURE_MASK_CET_KERNEL); + }

What's your thoughts? Should I just remove the quirk here and keep everything normal and
peaceful?
Yang, Weijiang May 16, 2024, 7:20 a.m. UTC | #16
On 5/6/2024 5:41 PM, Yang, Weijiang wrote:
> On 5/2/2024 7:34 AM, Sean Christopherson wrote:
>> On Sun, Feb 18, 2024, Yang Weijiang wrote:
>>> @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void)
>>>           F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
>>>           F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
>>>           F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
>>> -        F(SGX_LC) | F(BUS_LOCK_DETECT)
>>> +        F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK)
>>>       );
>>>       /* Set LA57 based on hardware capability. */
>>>       if (cpuid_ecx(7) & F(LA57))
>>> @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void)
>>>           F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
>>>           F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
>>>           F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
>>> -        F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
>>> +        F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) |
>>> +        F(IBT)
>>>       );
>> ...
>>
>>> @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void)
>>>         if (cpu_has_vmx_waitpkg())
>>>           kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
>>> +
>>> +    /*
>>> +     * Disable CET if unrestricted_guest is unsupported as KVM doesn't
>>> +     * enforce CET HW behaviors in emulator. On platforms with
>>> +     * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
>>> +     * fails, so disable CET in this case too.
>>> +     */
>>> +    if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
>>> +        !cpu_has_vmx_basic_no_hw_errcode()) {
>>> +        kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
>>> +        kvm_cpu_cap_clear(X86_FEATURE_IBT);
>>> +    }
>> Oh!  Almost missed it.  This patch should explicitly kvm_cpu_cap_clear()
>> X86_FEATURE_SHSTK and X86_FEATURE_IBT.  We *know* there are upcoming AMD CPUs
>> that support at least SHSTK, so enumerating support for common code would yield
>> a version of KVM that incorrectly advertises support for SHSTK.
>>
>> I hope to land both Intel and AMD virtualization in the same kernel release, but
>> there are no guarantees that will happen.  And explicitly clearing both SHSTK and
>> IBT would guard against IBT showing up in some future AMD CPU in advance of KVM
>> gaining full support.
>
> Let me be clear on this, you want me to disable SHSTK/IBT with kvm_cpu_cap_clear() unconditionally
> for now in this patch, and wait until both AMD's SVM patches and this series are ready for guest CET,
> then remove the disabling code in this patch for final merge, am I right?
Hi, Sean,
I haven't got your reply on above question. Would like to get your confirmation.
Thanks!

>
>
Sean Christopherson May 16, 2024, 2:39 p.m. UTC | #17
On Thu, May 16, 2024, Weijiang Yang wrote:
> On 5/2/2024 7:15 AM, Sean Christopherson wrote:
> > On Sun, Feb 18, 2024, Yang Weijiang wrote:
> > > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
> > >   		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
> > >   	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
> > >   		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
> > > +	/*
> > > +	 * Don't use boot_cpu_has() to check availability of IBT because the
> > > +	 * feature bit is cleared in boot_cpu_data when ibt=off is applied
> > > +	 * in host cmdline.
> > I'm not convinced this is a good reason to diverge from the host kernel.  E.g.
> > PCID and many other features honor the host setup, I don't see what makes IBT
> > special.
> > 
> > 
> Hi, Sean,
> We synced the issue internally, and got conclusion that KVM should honor host
> IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
> this policy, it can avoid CPUID confusion to guest side due to host ibt=off
> config.

What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
like the guest has visibility into the host kernel, and raw CPUID will still show
IBT support in the host.

On the other hand, I can definitely see folks wanting to expose IBT to guests
when running non-complaint host kernels, especially when live migration is in
play, i.e. when hiding IBT from the guest will actively cause problems.
Sean Christopherson May 16, 2024, 2:43 p.m. UTC | #18
On Thu, May 16, 2024, Weijiang Yang wrote:
> On 5/6/2024 5:41 PM, Yang, Weijiang wrote:
> > On 5/2/2024 7:34 AM, Sean Christopherson wrote:
> > > On Sun, Feb 18, 2024, Yang Weijiang wrote:
> > > > @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void)
> > > >           F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
> > > >           F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
> > > >           F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
> > > > -        F(SGX_LC) | F(BUS_LOCK_DETECT)
> > > > +        F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK)
> > > >       );
> > > >       /* Set LA57 based on hardware capability. */
> > > >       if (cpuid_ecx(7) & F(LA57))
> > > > @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void)
> > > >           F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
> > > >           F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
> > > >           F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
> > > > -        F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
> > > > +        F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) |
> > > > +        F(IBT)
> > > >       );
> > > ...
> > > 
> > > > @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void)
> > > >         if (cpu_has_vmx_waitpkg())
> > > >           kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
> > > > +
> > > > +    /*
> > > > +     * Disable CET if unrestricted_guest is unsupported as KVM doesn't
> > > > +     * enforce CET HW behaviors in emulator. On platforms with
> > > > +     * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
> > > > +     * fails, so disable CET in this case too.
> > > > +     */
> > > > +    if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
> > > > +        !cpu_has_vmx_basic_no_hw_errcode()) {
> > > > +        kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
> > > > +        kvm_cpu_cap_clear(X86_FEATURE_IBT);
> > > > +    }
> > > Oh!  Almost missed it.  This patch should explicitly kvm_cpu_cap_clear()
> > > X86_FEATURE_SHSTK and X86_FEATURE_IBT.  We *know* there are upcoming AMD CPUs
> > > that support at least SHSTK, so enumerating support for common code would yield
> > > a version of KVM that incorrectly advertises support for SHSTK.
> > > 
> > > I hope to land both Intel and AMD virtualization in the same kernel release, but
> > > there are no guarantees that will happen.  And explicitly clearing both SHSTK and
> > > IBT would guard against IBT showing up in some future AMD CPU in advance of KVM
> > > gaining full support.
> > 
> > Let me be clear on this, you want me to disable SHSTK/IBT with
> > kvm_cpu_cap_clear() unconditionally for now in this patch, and wait until
> > both AMD's SVM patches and this series are ready for guest CET, then remove
> > the disabling code in this patch for final merge, am I right?

No, allow it to be enabled for VMX, but explicitly disable it for SVM, i.e.

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 4aaffbf22531..b3df12af4ee6 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5125,6 +5125,10 @@ static __init void svm_set_cpu_caps(void)
        kvm_caps.supported_perf_cap = 0;
        kvm_caps.supported_xss = 0;
 
+       /* KVM doesn't yet support CET virtualization for SVM. */
+       kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+       kvm_cpu_cap_clear(X86_FEATURE_IBT);
+
        /* CPUID 0x80000001 and 0x8000000A (SVM features) */
        if (nested) {
                kvm_cpu_cap_set(X86_FEATURE_SVM);

Then the SVM series can simply delete those lines when all is ready.
Dave Hansen May 16, 2024, 3:36 p.m. UTC | #19
On 5/16/24 07:39, Sean Christopherson wrote:
>> We synced the issue internally, and got conclusion that KVM should honor host
>> IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off
>> config.
> What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
> like the guest has visibility into the host kernel, and raw CPUID will still show
> IBT support in the host.

I'm basically arguing for the path of least resistance (at least to start).

We should just do what takes the least amount of code for now that
results in mostly sane behavior, then debate about making it perfect later.

In other words, let's say the place we'd *IDEALLY* end up is that guests
can have any random FPU state which is disconnected from the host.  But
the reality, for now, is that the host needs to have XFEATURE_CET_USER
set in order to pass it into the guest and that means keeping
X86_FEATURE_SHSTK set.

If you want guest XFEATURE_CET_USER, you must have host
X86_FEATURE_SHSTK ... for now.
Sean Christopherson May 16, 2024, 4:58 p.m. UTC | #20
On Thu, May 16, 2024, Dave Hansen wrote:
> On 5/16/24 07:39, Sean Christopherson wrote:
> >> We synced the issue internally, and got conclusion that KVM should honor host
> >> IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
> >> this policy, it can avoid CPUID confusion to guest side due to host ibt=off
> >> config.
> > What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
> > like the guest has visibility into the host kernel, and raw CPUID will still show
> > IBT support in the host.
> 
> I'm basically arguing for the path of least resistance (at least to start).
> 
> We should just do what takes the least amount of code for now that
> results in mostly sane behavior, then debate about making it perfect later.
> 
> In other words, let's say the place we'd *IDEALLY* end up is that guests
> can have any random FPU state which is disconnected from the host.  But
> the reality, for now, is that the host needs to have XFEATURE_CET_USER
> set in order to pass it into the guest and that means keeping
> X86_FEATURE_SHSTK set.
> 
> If you want guest XFEATURE_CET_USER, you must have host
> X86_FEATURE_SHSTK ... for now.

Ah, because fpu__init_system_xstate() will clear XFEATURE_CET_USER via the
X86_FEATURE_SHSTK connection in xsave_cpuid_features. 

Please put something to that effect in the changelog.  "this literally won't work
(without more changes)" is very different than us making a largely arbitrary
decision.
Yang, Weijiang May 17, 2024, 8:04 a.m. UTC | #21
On 5/16/2024 10:43 PM, Sean Christopherson wrote:
> On Thu, May 16, 2024, Weijiang Yang wrote:
>> On 5/6/2024 5:41 PM, Yang, Weijiang wrote:
>>> On 5/2/2024 7:34 AM, Sean Christopherson wrote:
>>>> On Sun, Feb 18, 2024, Yang Weijiang wrote:
>>>>> @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void)
>>>>>            F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
>>>>>            F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
>>>>>            F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
>>>>> -        F(SGX_LC) | F(BUS_LOCK_DETECT)
>>>>> +        F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK)
>>>>>        );
>>>>>        /* Set LA57 based on hardware capability. */
>>>>>        if (cpuid_ecx(7) & F(LA57))
>>>>> @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void)
>>>>>            F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
>>>>>            F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
>>>>>            F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
>>>>> -        F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
>>>>> +        F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) |
>>>>> +        F(IBT)
>>>>>        );
>>>> ...
>>>>
>>>>> @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void)
>>>>>          if (cpu_has_vmx_waitpkg())
>>>>>            kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
>>>>> +
>>>>> +    /*
>>>>> +     * Disable CET if unrestricted_guest is unsupported as KVM doesn't
>>>>> +     * enforce CET HW behaviors in emulator. On platforms with
>>>>> +     * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
>>>>> +     * fails, so disable CET in this case too.
>>>>> +     */
>>>>> +    if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
>>>>> +        !cpu_has_vmx_basic_no_hw_errcode()) {
>>>>> +        kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
>>>>> +        kvm_cpu_cap_clear(X86_FEATURE_IBT);
>>>>> +    }
>>>> Oh!  Almost missed it.  This patch should explicitly kvm_cpu_cap_clear()
>>>> X86_FEATURE_SHSTK and X86_FEATURE_IBT.  We *know* there are upcoming AMD CPUs
>>>> that support at least SHSTK, so enumerating support for common code would yield
>>>> a version of KVM that incorrectly advertises support for SHSTK.
>>>>
>>>> I hope to land both Intel and AMD virtualization in the same kernel release, but
>>>> there are no guarantees that will happen.  And explicitly clearing both SHSTK and
>>>> IBT would guard against IBT showing up in some future AMD CPU in advance of KVM
>>>> gaining full support.
>>> Let me be clear on this, you want me to disable SHSTK/IBT with
>>> kvm_cpu_cap_clear() unconditionally for now in this patch, and wait until
>>> both AMD's SVM patches and this series are ready for guest CET, then remove
>>> the disabling code in this patch for final merge, am I right?
> No, allow it to be enabled for VMX, but explicitly disable it for SVM, i.e.
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 4aaffbf22531..b3df12af4ee6 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -5125,6 +5125,10 @@ static __init void svm_set_cpu_caps(void)
>          kvm_caps.supported_perf_cap = 0;
>          kvm_caps.supported_xss = 0;
>   
> +       /* KVM doesn't yet support CET virtualization for SVM. */
> +       kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
> +       kvm_cpu_cap_clear(X86_FEATURE_IBT);
> +
>          /* CPUID 0x80000001 and 0x8000000A (SVM features) */
>          if (nested) {
>                  kvm_cpu_cap_set(X86_FEATURE_SVM);
>
> Then the SVM series can simply delete those lines when all is ready.

Understood, thanks!
Yang, Weijiang May 17, 2024, 8:27 a.m. UTC | #22
On 5/17/2024 12:58 AM, Sean Christopherson wrote:
> On Thu, May 16, 2024, Dave Hansen wrote:
>> On 5/16/24 07:39, Sean Christopherson wrote:
>>>> We synced the issue internally, and got conclusion that KVM should honor host
>>>> IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
>>>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off
>>>> config.
>>> What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
>>> like the guest has visibility into the host kernel, and raw CPUID will still show
>>> IBT support in the host.
>> I'm basically arguing for the path of least resistance (at least to start).
>>
>> We should just do what takes the least amount of code for now that
>> results in mostly sane behavior, then debate about making it perfect later.
>>
>> In other words, let's say the place we'd *IDEALLY* end up is that guests
>> can have any random FPU state which is disconnected from the host.  But
>> the reality, for now, is that the host needs to have XFEATURE_CET_USER
>> set in order to pass it into the guest and that means keeping
>> X86_FEATURE_SHSTK set.
>>
>> If you want guest XFEATURE_CET_USER, you must have host
>> X86_FEATURE_SHSTK ... for now.
> Ah, because fpu__init_system_xstate() will clear XFEATURE_CET_USER via the
> X86_FEATURE_SHSTK connection in xsave_cpuid_features.
>
> Please put something to that effect in the changelog.  "this literally won't work
> (without more changes)" is very different than us making a largely arbitrary
> decision.

So I need to remove the trick here for guest IBT, right?

Side topic:
When X86_FEATURE_SHSTK and X86_FEATURE_IBT (no ibt=off set) are available on host side, then host XFEATURE_CET_USER is enabled. In this case, we still *need* below patch: https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/ to correctly enable XFEATURE_CET_USER in *guest kernel*, because VMM userspace can enable IBT alone for guest by -cpu host -shstk, am I right?
Thomas Gleixner May 17, 2024, 8:57 a.m. UTC | #23
On Thu, May 16 2024 at 07:39, Sean Christopherson wrote:
> On Thu, May 16, 2024, Weijiang Yang wrote:
>> We synced the issue internally, and got conclusion that KVM should honor host
>> IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off
>> config.
>
> What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
> like the guest has visibility into the host kernel, and raw CPUID will still show
> IBT support in the host.
>
> On the other hand, I can definitely see folks wanting to expose IBT to guests
> when running non-complaint host kernels, especially when live migration is in
> play, i.e. when hiding IBT from the guest will actively cause problems.

I have to disagree here violently.

If the exposure of a CPUID bit to a guest requires host side support,
e.g. in xstate handling, then exposing it to a guest is simply not
possible.

Just because virtualization allows to do that does not mean that it's
correct in any way.

Thanks,

        tglx
Sean Christopherson May 17, 2024, 2:26 p.m. UTC | #24
On Fri, May 17, 2024, Thomas Gleixner wrote:
> On Thu, May 16 2024 at 07:39, Sean Christopherson wrote:
> > On Thu, May 16, 2024, Weijiang Yang wrote:
> >> We synced the issue internally, and got conclusion that KVM should honor host
> >> IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
> >> this policy, it can avoid CPUID confusion to guest side due to host ibt=off
> >> config.
> >
> > What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
> > like the guest has visibility into the host kernel, and raw CPUID will still show
> > IBT support in the host.
> >
> > On the other hand, I can definitely see folks wanting to expose IBT to guests
> > when running non-complaint host kernels, especially when live migration is in
> > play, i.e. when hiding IBT from the guest will actively cause problems.
> 
> I have to disagree here violently.
> 
> If the exposure of a CPUID bit to a guest requires host side support,
> e.g. in xstate handling, then exposing it to a guest is simply not
> possible.

Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the
supported xfeatures mask.
Yang, Weijiang May 20, 2024, 9:43 a.m. UTC | #25
On 5/17/2024 10:26 PM, Sean Christopherson wrote:
> On Fri, May 17, 2024, Thomas Gleixner wrote:
>> On Thu, May 16 2024 at 07:39, Sean Christopherson wrote:
>>> On Thu, May 16, 2024, Weijiang Yang wrote:
>>>> We synced the issue internally, and got conclusion that KVM should honor host
>>>> IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
>>>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off
>>>> config.
>>> What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
>>> like the guest has visibility into the host kernel, and raw CPUID will still show
>>> IBT support in the host.
>>>
>>> On the other hand, I can definitely see folks wanting to expose IBT to guests
>>> when running non-complaint host kernels, especially when live migration is in
>>> play, i.e. when hiding IBT from the guest will actively cause problems.
>> I have to disagree here violently.
>>
>> If the exposure of a CPUID bit to a guest requires host side support,
>> e.g. in xstate handling, then exposing it to a guest is simply not
>> possible.
> Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the
> supported xfeatures mask.

For host side support, fortunately,  this patch already has some checks for that. But for userspace
CPUID config, it allows IBT to be exposed alone.

IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be exposed as an independent
feature to guest without exposing SHSTK at the same time. If it is, then below patch is not needed
anymore:
https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/

I'd check and clear IBT bit from CPUID when userspace enables only IBT via KVM_SET_CPUID2.
And update related code of the series given the implicit dependency.

Thanks!
Sean Christopherson May 20, 2024, 5:09 p.m. UTC | #26
On Mon, May 20, 2024, Weijiang Yang wrote:
> On 5/17/2024 10:26 PM, Sean Christopherson wrote:
> > On Fri, May 17, 2024, Thomas Gleixner wrote:
> > > On Thu, May 16 2024 at 07:39, Sean Christopherson wrote:
> > > > On Thu, May 16, 2024, Weijiang Yang wrote:
> > > > > We synced the issue internally, and got conclusion that KVM should honor host
> > > > > IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
> > > > > this policy, it can avoid CPUID confusion to guest side due to host ibt=off
> > > > > config.
> > > > What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
> > > > like the guest has visibility into the host kernel, and raw CPUID will still show
> > > > IBT support in the host.
> > > > 
> > > > On the other hand, I can definitely see folks wanting to expose IBT to guests
> > > > when running non-complaint host kernels, especially when live migration is in
> > > > play, i.e. when hiding IBT from the guest will actively cause problems.
> > > I have to disagree here violently.
> > > 
> > > If the exposure of a CPUID bit to a guest requires host side support,
> > > e.g. in xstate handling, then exposing it to a guest is simply not
> > > possible.
> > Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the
> > supported xfeatures mask.
> 
> For host side support, fortunately,  this patch already has some checks for
> that. But for userspace CPUID config, it allows IBT to be exposed alone.
> 
> IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be
> exposed as an independent feature to guest without exposing SHSTK at the same
> time. If it is, then below patch is not needed anymore:
> https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/

That's a question for the x86 maintainers.  Specifically, do they want to allow
enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled.

I don't think it impacts KVM, at least not directly.  Regardless of what decision
the kernel makes, KVM needs to disable IBT and SHSTK if CET_USER _or_ CET_KERNEL
is missing, which KVM already does via:

	if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER |
	     XFEATURE_MASK_CET_KERNEL)) !=
	    (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) {
		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
		kvm_cpu_cap_clear(X86_FEATURE_IBT);
		kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER |
					    XFEATURE_MASK_CET_KERNEL);
	}

> I'd check and clear IBT bit from CPUID when userspace enables only IBT via
> KVM_SET_CPUID2.

No.  It is userspace's responsibility to provide a sane CPUID model for the guest.
KVM needs to ensure that *KVM* doesn't treat IBT as supported if the kernel doesn't
allow XFEATURE_CET_USER, but userspace can advertise whatever it wants to the guest
(and gets to keep the pieces if it does something funky).
Dave Hansen May 20, 2024, 5:15 p.m. UTC | #27
On 5/20/24 10:09, Sean Christopherson wrote:
>> IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be
>> exposed as an independent feature to guest without exposing SHSTK at the same
>> time. If it is, then below patch is not needed anymore:
>> https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/
> That's a question for the x86 maintainers.  Specifically, do they want to allow
> enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled.

I like the sound of "below patch is not needed anymore".

Unless removing the patch causes permanent issues or results in
something that's not functional, I say: jettison it with glee. If it's
that important, it can be considered on its own merits separately.
Yang, Weijiang May 22, 2024, 8:41 a.m. UTC | #28
On 5/21/2024 1:09 AM, Sean Christopherson wrote:
> On Mon, May 20, 2024, Weijiang Yang wrote:
>> On 5/17/2024 10:26 PM, Sean Christopherson wrote:
>>> On Fri, May 17, 2024, Thomas Gleixner wrote:
>>>> On Thu, May 16 2024 at 07:39, Sean Christopherson wrote:
>>>>> On Thu, May 16, 2024, Weijiang Yang wrote:
>>>>>> We synced the issue internally, and got conclusion that KVM should honor host
>>>>>> IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
>>>>>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off
>>>>>> config.
>>>>> What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
>>>>> like the guest has visibility into the host kernel, and raw CPUID will still show
>>>>> IBT support in the host.
>>>>>
>>>>> On the other hand, I can definitely see folks wanting to expose IBT to guests
>>>>> when running non-complaint host kernels, especially when live migration is in
>>>>> play, i.e. when hiding IBT from the guest will actively cause problems.
>>>> I have to disagree here violently.
>>>>
>>>> If the exposure of a CPUID bit to a guest requires host side support,
>>>> e.g. in xstate handling, then exposing it to a guest is simply not
>>>> possible.
>>> Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the
>>> supported xfeatures mask.
>> For host side support, fortunately,  this patch already has some checks for
>> that. But for userspace CPUID config, it allows IBT to be exposed alone.
>>
>> IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be
>> exposed as an independent feature to guest without exposing SHSTK at the same
>> time. If it is, then below patch is not needed anymore:
>> https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/
> That's a question for the x86 maintainers.  Specifically, do they want to allow
> enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled.
>
> I don't think it impacts KVM, at least not directly.  Regardless of what decision
> the kernel makes, KVM needs to disable IBT and SHSTK if CET_USER _or_ CET_KERNEL
> is missing, which KVM already does via:
>
> 	if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER |
> 	     XFEATURE_MASK_CET_KERNEL)) !=
> 	    (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) {
> 		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
> 		kvm_cpu_cap_clear(X86_FEATURE_IBT);
> 		kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER |
> 					    XFEATURE_MASK_CET_KERNEL);
> 	}
>
>> I'd check and clear IBT bit from CPUID when userspace enables only IBT via
>> KVM_SET_CPUID2.
> No.  It is userspace's responsibility to provide a sane CPUID model for the guest.
> KVM needs to ensure that *KVM* doesn't treat IBT as supported if the kernel doesn't
> allow XFEATURE_CET_USER, but userspace can advertise whatever it wants to the guest
> (and gets to keep the pieces if it does something funky).

OK, I think we can go ahead to keep KVM patches as-is given the fact user IBT is not enabled in Linux.
I only hope other OSes can enforce both SHSTK and IBT dependency on XFEATURE_CET_USER so
that user IBT can work well there.

Then IBT can be exposed to guest alone because guest *kernel* IBT only relies on S_CET MSR  which is
VMCS auto-saved/restored.

What's your thoughts?
Yang, Weijiang May 22, 2024, 9:03 a.m. UTC | #29
On 5/21/2024 1:15 AM, Dave Hansen wrote:
> On 5/20/24 10:09, Sean Christopherson wrote:
>>> IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be
>>> exposed as an independent feature to guest without exposing SHSTK at the same
>>> time. If it is, then below patch is not needed anymore:
>>> https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/
>> That's a question for the x86 maintainers.  Specifically, do they want to allow
>> enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled.
> I like the sound of "below patch is not needed anymore".
>
> Unless removing the patch causes permanent issues or results in
> something that's not functional, I say: jettison it with glee. If it's
> that important, it can be considered on its own merits separately.
I guess the existing dependency there is due to the fact that only user SHSTK is landed and there's
possibly no such kind of odd bare metal platform.

Side topic:  would it be reasonable to enforce IBT dependency on XFEATURE_CET_USER when *user* IBT
enabling patches are landing in kernel? Then guest kernel can play with user IBT alone if VMM
userspace just wants to enable IBT for guest. Or when SHSTK is disabled for whatever reason.
Edgecombe, Rick P May 22, 2024, 3:06 p.m. UTC | #30
On Wed, 2024-05-22 at 17:03 +0800, Yang, Weijiang wrote:
> Side topic:  would it be reasonable to enforce IBT dependency on
> XFEATURE_CET_USER when *user* IBT
> enabling patches are landing in kernel? Then guest kernel can play with user
> IBT alone if VMM
> userspace just wants to enable IBT for guest. Or when SHSTK is disabled for
> whatever reason.

I think earlier there was a comment that CET would be less likely to need to be
disabled for security reasons, so there would not be utility for a system wide
disable (that affects KVM). I recently remembered we actually already had a
reason come up.

The EDK2 SMI handler uses shadow stack and had a bug around saving and restoring
CET state. Using IBT in the kernel was causing systems to hang. The temporary
fix was to disable IBT.

So the point is, let's not try to find a narrow way to get away with enabling as
much as technically possible in KVM.

The simple obviously correct solution would be:
XFEATURE_CET_USER + XFEATURE_CET_KERNEL + X86_FEATURE_IBT = KVM IBT support
XFEATURE_CET_USER + XFEATURE_CET_KERNEL + X86_FEATURE_SHSTK = KVM SHSTK support

It should be correct both with and without that patch to enable
XFEATURE_CET_USER for X86_FEATURE_IBT.

Then the two missing changes to expand support would be:
1. Fixing that ibt=off disables X86_FEATURE_IBT. The fix is to move to bool as
peterz suggested.
2. Making XFEATURE_CET_USER also depend on X86_FEATURE_IBT (the patch in this
series)

We should do those, but in a later small series. Does it seem reasonable? Can we
just do the simple obvious solution above for now?
Yang, Weijiang May 23, 2024, 10:07 a.m. UTC | #31
On 5/22/2024 11:06 PM, Edgecombe, Rick P wrote:
> On Wed, 2024-05-22 at 17:03 +0800, Yang, Weijiang wrote:
>> Side topic:  would it be reasonable to enforce IBT dependency on
>> XFEATURE_CET_USER when *user* IBT
>> enabling patches are landing in kernel? Then guest kernel can play with user
>> IBT alone if VMM
>> userspace just wants to enable IBT for guest. Or when SHSTK is disabled for
>> whatever reason.
> I think earlier there was a comment that CET would be less likely to need to be
> disabled for security reasons, so there would not be utility for a system wide
> disable (that affects KVM). I recently remembered we actually already had a
> reason come up.
>
> The EDK2 SMI handler uses shadow stack and had a bug around saving and restoring
> CET state. Using IBT in the kernel was causing systems to hang. The temporary
> fix was to disable IBT.
>
> So the point is, let's not try to find a narrow way to get away with enabling as
> much as technically possible in KVM.
>
> The simple obviously correct solution would be:
> XFEATURE_CET_USER + XFEATURE_CET_KERNEL + X86_FEATURE_IBT = KVM IBT support
> XFEATURE_CET_USER + XFEATURE_CET_KERNEL + X86_FEATURE_SHSTK = KVM SHSTK support

Yes, I can easily achieve it by removing the raw cpuid check for KVM IBT. Host side CET xstate
support check is already there in this patch.

>
> It should be correct both with and without that patch to enable
> XFEATURE_CET_USER for X86_FEATURE_IBT.

IMHO, given the fact user IBT hasn't been enabled in kernel, it's not too bad just discarding the patch.
I can highlight the issue somewhere in this series.

>
> Then the two missing changes to expand support would be:
> 1. Fixing that ibt=off disables X86_FEATURE_IBT. The fix is to move to bool as
> peterz suggested.
> 2. Making XFEATURE_CET_USER also depend on X86_FEATURE_IBT (the patch in this
> series)
>
> We should do those, but in a later small series. Does it seem reasonable? Can we
> just do the simple obvious solution above for now?

It makes sense for me, but I want to hear x86 and KVM maintainers' voice for it.

Thanks!
Yang, Weijiang May 27, 2024, 9:05 a.m. UTC | #32
On 5/22/2024 4:41 PM, Yang, Weijiang wrote:
> On 5/21/2024 1:09 AM, Sean Christopherson wrote:
>> On Mon, May 20, 2024, Weijiang Yang wrote:
>>> On 5/17/2024 10:26 PM, Sean Christopherson wrote:
>>>> On Fri, May 17, 2024, Thomas Gleixner wrote:
>>>>> On Thu, May 16 2024 at 07:39, Sean Christopherson wrote:
>>>>>> On Thu, May 16, 2024, Weijiang Yang wrote:
>>>>>>> We synced the issue internally, and got conclusion that KVM should honor host
>>>>>>> IBT config.  In this case IBT bit in boot_cpu_data should be honored.  With
>>>>>>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off
>>>>>>> config.
>>>>>> What was the reasoning?  CPUID confusion is a weak justification, e.g. it's not
>>>>>> like the guest has visibility into the host kernel, and raw CPUID will still show
>>>>>> IBT support in the host.
>>>>>>
>>>>>> On the other hand, I can definitely see folks wanting to expose IBT to guests
>>>>>> when running non-complaint host kernels, especially when live migration is in
>>>>>> play, i.e. when hiding IBT from the guest will actively cause problems.
>>>>> I have to disagree here violently.
>>>>>
>>>>> If the exposure of a CPUID bit to a guest requires host side support,
>>>>> e.g. in xstate handling, then exposing it to a guest is simply not
>>>>> possible.
>>>> Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the
>>>> supported xfeatures mask.
>>> For host side support, fortunately,  this patch already has some checks for
>>> that. But for userspace CPUID config, it allows IBT to be exposed alone.
>>>
>>> IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be
>>> exposed as an independent feature to guest without exposing SHSTK at the same
>>> time. If it is, then below patch is not needed anymore:
>>> https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/
>> That's a question for the x86 maintainers.  Specifically, do they want to allow
>> enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled.
>>
>> I don't think it impacts KVM, at least not directly.  Regardless of what decision
>> the kernel makes, KVM needs to disable IBT and SHSTK if CET_USER _or_ CET_KERNEL
>> is missing, which KVM already does via:
>>
>>     if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER |
>>          XFEATURE_MASK_CET_KERNEL)) !=
>>         (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) {
>>         kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
>>         kvm_cpu_cap_clear(X86_FEATURE_IBT);
>>         kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER |
>>                         XFEATURE_MASK_CET_KERNEL);
>>     }
>>
>>> I'd check and clear IBT bit from CPUID when userspace enables only IBT via
>>> KVM_SET_CPUID2.
>> No.  It is userspace's responsibility to provide a sane CPUID model for the guest.
>> KVM needs to ensure that *KVM* doesn't treat IBT as supported if the kernel doesn't
>> allow XFEATURE_CET_USER, but userspace can advertise whatever it wants to the guest
>> (and gets to keep the pieces if it does something funky).
>
> OK, I think we can go ahead to keep KVM patches as-is given the fact user IBT is not enabled in Linux.
> I only hope other OSes can enforce both SHSTK and IBT dependency on XFEATURE_CET_USER so
> that user IBT can work well there.
>
> Then IBT can be exposed to guest alone because guest *kernel* IBT only relies on S_CET MSR  which is
> VMCS auto-saved/restored.
>
> What's your thoughts?

If there's objection I'll do below changes for this series:
1) Remove patch :
https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/
2) Remove reference to raw CPUID for KVM IBT CPUID, handling it the same as SHSTK.

Meanwhile still allow userspace to enable IBT feature alone because user IBT is not enabled
in kernel now, leave enforcement of user IBT dependency on XFEATURE_CET_USER in future.

>
>
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 79f7c18c487b..3b263fa171a1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -134,7 +134,7 @@ 
 			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
 			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
-			  | X86_CR4_LAM_SUP))
+			  | X86_CR4_LAM_SUP | X86_CR4_CET))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f1bd7b91b3c6..4aa9aaa295f0 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -1110,6 +1110,7 @@ 
 #define VMX_BASIC_MEM_TYPE_MASK	0x003c000000000000LLU
 #define VMX_BASIC_MEM_TYPE_WB	6LLU
 #define VMX_BASIC_INOUT		0x0040000000000000LLU
+#define VMX_BASIC_NO_HW_ERROR_CODE_CC	0x0100000000000000LLU
 
 /* Resctrl MSRs: */
 /* - Intel: */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c0e13040e35b..d37f41472043 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -150,14 +150,14 @@  static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 	}
 	/*
-	 * Prevent 32-bit guest launch if shadow stack is exposed as SSP
-	 * state is not defined for 32-bit SMRAM.
+	 * CET is not supported for 32-bit guest, prevent guest launch if
+	 * shadow stack or IBT is enabled for 32-bit guest.
 	 */
 	best = cpuid_entry2_find(entries, nent, 0x80000001,
 				 KVM_CPUID_INDEX_NOT_SIGNIFICANT);
 	if (best && !(best->edx & F(LM))) {
 		best = cpuid_entry2_find(entries, nent, 0x7, 0);
-		if (best && (best->ecx & F(SHSTK)))
+		if (best && ((best->ecx & F(SHSTK)) || (best->edx & F(IBT))))
 			return -EINVAL;
 	}
 
@@ -665,7 +665,7 @@  void kvm_set_cpu_caps(void)
 		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
 		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
 		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
-		F(SGX_LC) | F(BUS_LOCK_DETECT)
+		F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK)
 	);
 	/* Set LA57 based on hardware capability. */
 	if (cpuid_ecx(7) & F(LA57))
@@ -683,7 +683,8 @@  void kvm_set_cpu_caps(void)
 		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
 		F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
 		F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
-		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
+		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) |
+		F(IBT)
 	);
 
 	/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -696,6 +697,20 @@  void kvm_set_cpu_caps(void)
 		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
 	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
 		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
+	/*
+	 * Don't use boot_cpu_has() to check availability of IBT because the
+	 * feature bit is cleared in boot_cpu_data when ibt=off is applied
+	 * in host cmdline.
+	 *
+	 * As currently there's no HW bug which requires disabling IBT feature
+	 * while CPU can enumerate it, host cmdline option ibt=off is most
+	 * likely due to administrative reason on host side, so KVM refers to
+	 * CPU CPUID enumeration to enable the feature. In future if there's
+	 * actually some bug clobbered ibt=off option, then enforce additional
+	 * check here to disable the support in KVM.
+	 */
+	if (cpuid_edx(7) & F(IBT))
+		kvm_cpu_cap_set(X86_FEATURE_IBT);
 
 	kvm_cpu_cap_mask(CPUID_7_1_EAX,
 		F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) |
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index ee8938818c8a..e12bc233d88b 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -79,6 +79,12 @@  static inline bool cpu_has_vmx_basic_inout(void)
 	return	(((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
 }
 
+static inline bool cpu_has_vmx_basic_no_hw_errcode(void)
+{
+	return	((u64)vmcs_config.basic_cap << 32) &
+		 VMX_BASIC_NO_HW_ERROR_CODE_CC;
+}
+
 static inline bool cpu_has_virtual_nmis(void)
 {
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 342b5b94c892..9df25c9e80f5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2609,6 +2609,7 @@  static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 		{ VM_ENTRY_LOAD_IA32_EFER,		VM_EXIT_LOAD_IA32_EFER },
 		{ VM_ENTRY_LOAD_BNDCFGS,		VM_EXIT_CLEAR_BNDCFGS },
 		{ VM_ENTRY_LOAD_IA32_RTIT_CTL,		VM_EXIT_CLEAR_IA32_RTIT_CTL },
+		{ VM_ENTRY_LOAD_CET_STATE,		VM_EXIT_LOAD_CET_STATE },
 	};
 
 	memset(vmcs_conf, 0, sizeof(*vmcs_conf));
@@ -4934,6 +4935,14 @@  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
 
+	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+		vmcs_writel(GUEST_SSP, 0);
+		vmcs_writel(GUEST_S_CET, 0);
+		vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
+	} else if (kvm_cpu_cap_has(X86_FEATURE_IBT)) {
+		vmcs_writel(GUEST_S_CET, 0);
+	}
+
 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
 	vpid_sync_context(vmx->vpid);
@@ -6353,6 +6362,10 @@  void dump_vmcs(struct kvm_vcpu *vcpu)
 	if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
 		vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
 
+	if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
+		pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+		       vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
+		       vmcs_readl(GUEST_INTR_SSP_TABLE));
 	pr_err("*** Host State ***\n");
 	pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
 	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
@@ -6383,6 +6396,10 @@  void dump_vmcs(struct kvm_vcpu *vcpu)
 		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
 	if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
 		vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
+	if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
+		pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+		       vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
+		       vmcs_readl(HOST_INTR_SSP_TABLE));
 
 	pr_err("*** Control State ***\n");
 	pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
@@ -7965,7 +7982,6 @@  static __init void vmx_set_cpu_caps(void)
 		kvm_cpu_cap_set(X86_FEATURE_UMIP);
 
 	/* CPUID 0xD.1 */
-	kvm_caps.supported_xss = 0;
 	if (!cpu_has_vmx_xsaves())
 		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
 
@@ -7977,6 +7993,18 @@  static __init void vmx_set_cpu_caps(void)
 
 	if (cpu_has_vmx_waitpkg())
 		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+
+	/*
+	 * Disable CET if unrestricted_guest is unsupported as KVM doesn't
+	 * enforce CET HW behaviors in emulator. On platforms with
+	 * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
+	 * fails, so disable CET in this case too.
+	 */
+	if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
+	    !cpu_has_vmx_basic_no_hw_errcode()) {
+		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+		kvm_cpu_cap_clear(X86_FEATURE_IBT);
+	}
 }
 
 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index e3b0985bb74a..d0cad2624564 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -484,7 +484,8 @@  static inline u8 vmx_get_rvi(void)
 	 VM_ENTRY_LOAD_IA32_EFER |					\
 	 VM_ENTRY_LOAD_BNDCFGS |					\
 	 VM_ENTRY_PT_CONCEAL_PIP |					\
-	 VM_ENTRY_LOAD_IA32_RTIT_CTL)
+	 VM_ENTRY_LOAD_IA32_RTIT_CTL |					\
+	 VM_ENTRY_LOAD_CET_STATE)
 
 #define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS				\
 	(VM_EXIT_SAVE_DEBUG_CONTROLS |					\
@@ -506,7 +507,8 @@  static inline u8 vmx_get_rvi(void)
 	       VM_EXIT_LOAD_IA32_EFER |					\
 	       VM_EXIT_CLEAR_BNDCFGS |					\
 	       VM_EXIT_PT_CONCEAL_PIP |					\
-	       VM_EXIT_CLEAR_IA32_RTIT_CTL)
+	       VM_EXIT_CLEAR_IA32_RTIT_CTL |				\
+	       VM_EXIT_LOAD_CET_STATE)
 
 #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL			\
 	(PIN_BASED_EXT_INTR_MASK |					\
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 73a55d388dd9..cd656099fbfd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -231,7 +231,8 @@  static struct kvm_user_return_msrs __percpu *user_return_msrs;
 				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
 				| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
 
-#define KVM_SUPPORTED_XSS     0
+#define KVM_SUPPORTED_XSS	(XFEATURE_MASK_CET_USER | \
+				 XFEATURE_MASK_CET_KERNEL)
 
 u64 __read_mostly host_efer;
 EXPORT_SYMBOL_GPL(host_efer);
@@ -9943,6 +9944,20 @@  int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
 		kvm_caps.supported_xss = 0;
 
+	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
+		kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER |
+					    XFEATURE_MASK_CET_KERNEL);
+
+	if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER |
+	     XFEATURE_MASK_CET_KERNEL)) !=
+	    (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) {
+		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+		kvm_cpu_cap_clear(X86_FEATURE_IBT);
+		kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER |
+					    XFEATURE_MASK_CET_KERNEL);
+	}
+
 #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
 	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
 #undef __kvm_cpu_cap_has
@@ -12402,7 +12417,9 @@  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 }
 
 #define XSTATE_NEED_RESET_MASK	(XFEATURE_MASK_BNDREGS | \
-				 XFEATURE_MASK_BNDCSR)
+				 XFEATURE_MASK_BNDCSR | \
+				 XFEATURE_MASK_CET_USER | \
+				 XFEATURE_MASK_CET_KERNEL)
 
 static bool kvm_vcpu_has_xstate(unsigned long xfeature)
 {
@@ -12410,6 +12427,11 @@  static bool kvm_vcpu_has_xstate(unsigned long xfeature)
 	case XFEATURE_MASK_BNDREGS:
 	case XFEATURE_MASK_BNDCSR:
 		return kvm_cpu_cap_has(X86_FEATURE_MPX);
+	case XFEATURE_CET_USER:
+		return kvm_cpu_cap_has(X86_FEATURE_SHSTK) ||
+		       kvm_cpu_cap_has(X86_FEATURE_IBT);
+	case XFEATURE_CET_KERNEL:
+		return kvm_cpu_cap_has(X86_FEATURE_SHSTK);
 	default:
 		return false;
 	}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 656107e64c93..cc585051d24b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -533,6 +533,9 @@  bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
 		__reserved_bits |= X86_CR4_PCIDE;       \
 	if (!__cpu_has(__c, X86_FEATURE_LAM))           \
 		__reserved_bits |= X86_CR4_LAM_SUP;     \
+	if (!__cpu_has(__c, X86_FEATURE_SHSTK) &&       \
+	    !__cpu_has(__c, X86_FEATURE_IBT))           \
+		__reserved_bits |= X86_CR4_CET;         \
 	__reserved_bits;                                \
 })