Message ID | 20240219074733.122080-25-weijiang.yang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Enable CET Virtualization | expand |
On Sun, Feb 18, 2024, Yang Weijiang wrote: > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) > kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); > if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) > kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); > + /* > + * Don't use boot_cpu_has() to check availability of IBT because the > + * feature bit is cleared in boot_cpu_data when ibt=off is applied > + * in host cmdline. I'm not convinced this is a good reason to diverge from the host kernel. E.g. PCID and many other features honor the host setup, I don't see what makes IBT special. LA57 is special because it's entirely reasonable, likely even, for a host to only want to use 48-bit virtual addresses, but still want to let the guest enable LA57. > @@ -4934,6 +4935,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ > > + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { > + vmcs_writel(GUEST_SSP, 0); > + vmcs_writel(GUEST_S_CET, 0); > + vmcs_writel(GUEST_INTR_SSP_TABLE, 0); > + } else if (kvm_cpu_cap_has(X86_FEATURE_IBT)) { > + vmcs_writel(GUEST_S_CET, 0); > + } Similar to my comments about MSR interception, I think it would be better to explicitly handle the "common" field. At first glance, code like the above makes it look like IBT is mutually exclusive with SHSTK, e.g. a reader that isn't looking closely could easily miss that both paths write GUEST_S_CET. if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { vmcs_writel(GUEST_SSP, 0); vmcs_writel(GUEST_INTR_SSP_TABLE, 0); } if (kvm_cpu_cap_has(X86_FEATURE_IBT) || kvm_cpu_cap_has(X86_FEATURE_SHSTK)) vmcs_writel(GUEST_S_CET, 0);
On Wed, 2024-05-01 at 16:15 -0700, Sean Christopherson wrote: > On Sun, Feb 18, 2024, Yang Weijiang wrote: > > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) > > kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); > > if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) > > kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); > > + /* > > + * Don't use boot_cpu_has() to check availability of IBT because the > > + * feature bit is cleared in boot_cpu_data when ibt=off is applied > > + * in host cmdline. > > I'm not convinced this is a good reason to diverge from the host kernel. E.g. > PCID and many other features honor the host setup, I don't see what makes IBT > special. > > LA57 is special because it's entirely reasonable, likely even, for a host to > only want to use 48-bit virtual addresses, but still want to let the guest > enable > LA57. Definitely. I swear we (Weijiang and I) had a back and forth at some point where we agreed to match the host support. Plus I think the CET FPU stuff triggers off of host support for CET. So if the host doesn't have X86_FEATURE_SHSTK or X86_FEATURE_IBT then... hopefully it's caught later. But then don't report it's supported.
On Sun, Feb 18, 2024, Yang Weijiang wrote: > @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void) > F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | > F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | > F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | > - F(SGX_LC) | F(BUS_LOCK_DETECT) > + F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK) > ); > /* Set LA57 based on hardware capability. */ > if (cpuid_ecx(7) & F(LA57)) > @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void) > F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | > F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | > F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | > - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) > + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) | > + F(IBT) > ); ... > @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void) > > if (cpu_has_vmx_waitpkg()) > kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); > + > + /* > + * Disable CET if unrestricted_guest is unsupported as KVM doesn't > + * enforce CET HW behaviors in emulator. On platforms with > + * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code > + * fails, so disable CET in this case too. > + */ > + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || > + !cpu_has_vmx_basic_no_hw_errcode()) { > + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); > + kvm_cpu_cap_clear(X86_FEATURE_IBT); > + } Oh! Almost missed it. This patch should explicitly kvm_cpu_cap_clear() X86_FEATURE_SHSTK and X86_FEATURE_IBT. We *know* there are upcoming AMD CPUs that support at least SHSTK, so enumerating support for common code would yield a version of KVM that incorrectly advertises support for SHSTK. I hope to land both Intel and AMD virtualization in the same kernel release, but there are no guarantees that will happen. And explicitly clearing both SHSTK and IBT would guard against IBT showing up in some future AMD CPU in advance of KVM gaining full support.
On 5/2/2024 7:15 AM, Sean Christopherson wrote: > On Sun, Feb 18, 2024, Yang Weijiang wrote: >> @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) >> kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); >> if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) >> kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); >> + /* >> + * Don't use boot_cpu_has() to check availability of IBT because the >> + * feature bit is cleared in boot_cpu_data when ibt=off is applied >> + * in host cmdline. > I'm not convinced this is a good reason to diverge from the host kernel. E.g. > PCID and many other features honor the host setup, I don't see what makes IBT > special. This is mostly based on our user experience and the hypothesis for cloud computing: When we evolve host kernels, we constantly encounter issues when kernel IBT is on, so we have to disable kernel IBT by adding ibt=off. But we need to test the CET features in VM, if we just simply refer to host boot cpuid data, then IBT cannot be enabled in VM which makes CET features incomplete in guest. I guess in cloud computing, it could run into similar dilemma. In this case, the tenant cannot benefit the feature just because of host SW problem. I know currently KVM except LA57 always honors host feature configurations, but in CET case, there could be divergence wrt honoring host configuration as long as there's no quirk for the feature. But I think the issue is still open for discussion... > > LA57 is special because it's entirely reasonable, likely even, for a host to > only want to use 48-bit virtual addresses, but still want to let the guest enable > LA57. > >> @@ -4934,6 +4935,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) >> >> vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ >> >> + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { >> + vmcs_writel(GUEST_SSP, 0); >> + vmcs_writel(GUEST_S_CET, 0); >> + vmcs_writel(GUEST_INTR_SSP_TABLE, 0); >> + } else if (kvm_cpu_cap_has(X86_FEATURE_IBT)) { >> + vmcs_writel(GUEST_S_CET, 0); >> + } > Similar to my comments about MSR interception, I think it would be better to > explicitly handle the "common" field. At first glance, code like the above makes > it look like IBT is mutually exclusive with SHSTK, e.g. a reader that isn't > looking closely could easily miss that both paths write GUEST_S_CET. Sure,thanks! > > if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { > vmcs_writel(GUEST_SSP, 0); > vmcs_writel(GUEST_INTR_SSP_TABLE, 0); > } > if (kvm_cpu_cap_has(X86_FEATURE_IBT) || > kvm_cpu_cap_has(X86_FEATURE_SHSTK)) > vmcs_writel(GUEST_S_CET, 0);
On 5/2/2024 7:34 AM, Sean Christopherson wrote: > On Sun, Feb 18, 2024, Yang Weijiang wrote: >> @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void) >> F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | >> F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | >> F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | >> - F(SGX_LC) | F(BUS_LOCK_DETECT) >> + F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK) >> ); >> /* Set LA57 based on hardware capability. */ >> if (cpuid_ecx(7) & F(LA57)) >> @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void) >> F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | >> F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | >> F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | >> - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) >> + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) | >> + F(IBT) >> ); > ... > >> @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void) >> >> if (cpu_has_vmx_waitpkg()) >> kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); >> + >> + /* >> + * Disable CET if unrestricted_guest is unsupported as KVM doesn't >> + * enforce CET HW behaviors in emulator. On platforms with >> + * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code >> + * fails, so disable CET in this case too. >> + */ >> + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || >> + !cpu_has_vmx_basic_no_hw_errcode()) { >> + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); >> + kvm_cpu_cap_clear(X86_FEATURE_IBT); >> + } > Oh! Almost missed it. This patch should explicitly kvm_cpu_cap_clear() > X86_FEATURE_SHSTK and X86_FEATURE_IBT. We *know* there are upcoming AMD CPUs > that support at least SHSTK, so enumerating support for common code would yield > a version of KVM that incorrectly advertises support for SHSTK. > > I hope to land both Intel and AMD virtualization in the same kernel release, but > there are no guarantees that will happen. And explicitly clearing both SHSTK and > IBT would guard against IBT showing up in some future AMD CPU in advance of KVM > gaining full support. Let me be clear on this, you want me to disable SHSTK/IBT with kvm_cpu_cap_clear() unconditionally for now in this patch, and wait until both AMD's SVM patches and this series are ready for guest CET, then remove the disabling code in this patch for final merge, am I right?
On Mon, May 06, 2024, Weijiang Yang wrote: > On 5/2/2024 7:15 AM, Sean Christopherson wrote: > > On Sun, Feb 18, 2024, Yang Weijiang wrote: > > > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) > > > kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); > > > if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) > > > kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); > > > + /* > > > + * Don't use boot_cpu_has() to check availability of IBT because the > > > + * feature bit is cleared in boot_cpu_data when ibt=off is applied > > > + * in host cmdline. > > I'm not convinced this is a good reason to diverge from the host kernel. E.g. > > PCID and many other features honor the host setup, I don't see what makes IBT > > special. > > This is mostly based on our user experience and the hypothesis for cloud > computing: When we evolve host kernels, we constantly encounter issues when > kernel IBT is on, so we have to disable kernel IBT by adding ibt=off. But we > need to test the CET features in VM, if we just simply refer to host boot > cpuid data, then IBT cannot be enabled in VM which makes CET features > incomplete in guest. > > I guess in cloud computing, it could run into similar dilemma. In this case, > the tenant cannot benefit the feature just because of host SW problem. Hmm, but such issues should be found before deploying a kernel to production. The one scenario that comes to mind where I can see someone wanting to disable IBT would be running a out-of-tree and/or third party module. > I know currently KVM except LA57 always honors host feature configurations, > but in CET case, there could be divergence wrt honoring host configuration as > long as there's no quirk for the feature. > > But I think the issue is still open for discussion... I'm not totally opposed to the idea. Somewhat off-topic, the existing LA57 code upon which the IBT check is based is flawed, as it doesn't account for the max supported CPUID leaf. On Intel CPUs, that could result in a false positive due CPUID (stupidly) returning the value of the last implemented CPUID leaf, no zeros. In practice, it doesn't cause problems because CPUID.0x7 has been supported since forever, but it's still a bug. Hmm, actually, __kvm_cpu_cap_mask() has the exact same bug. And that's much less theoretical, e.g. kvm_cpu_cap_init_kvm_defined() in particular is likely to cause problems at some point. And I really don't like that KVM open codes calls to cpuid_<reg>() for these "raw" features. One option would be to and helpers to change this: if (cpuid_edx(7) & F(IBT)) kvm_cpu_cap_set(X86_FEATURE_IBT); to this: if (raw_cpuid_has(X86_FEATURE_IBT)) kvm_cpu_cap_set(X86_FEATURE_IBT); but I think we can do better, and harden the CPUID code in the process. If we do kvm_cpu_cap_set() _before_ kvm_cpu_cap_mask(), then incorporating the raw host CPUID will happen automagically, as __kvm_cpu_cap_mask() will clear bits that aren't in host CPUID. The most obvious approach would be to simply call kvm_cpu_cap_set() before kvm_cpu_cap_mask(), but that's more than a bit confusing, and would open the door for potential bugs due to calling kvm_cpu_cap_set() after kvm_cpu_cap_mask(). And detecting such bugs would be difficult, because there are features that KVM fully emulates, i.e. _must_ be stuffed after kvm_cpu_cap_mask(). Instead of calling kvm_cpu_cap_set() directly, we can take advantage of the fact that the F() maskes are fed into kvm_cpu_cap_mask(), i.e. are naturally processed before the corresponding kvm_cpu_cap_mask(). If we add an array to track which capabilities have been initialized, then F() can WARN on improper usage. That would allow detecting bad "raw" usage, *and* would detect (some) scenarios where a F() is fed into the wrong leaf, e.g. if we added F(LA57) to CPUID_7_EDX instead of CPUID_7_ECX. #define F(name) \ ({ \ u32 __leaf = __feature_leaf(X86_FEATURE_##name); \ \ BUILD_BUG_ON(__leaf >= ARRAY_SIZE(kvm_cpu_cap_initialized)); \ WARN_ON_ONCE(kvm_cpu_cap_initialized[__leaf]); \ \ feature_bit(name); \ }) /* * Raw Feature - For features that KVM supports based purely on raw host CPUID, * i.e. that KVM virtualizes even if the host kernel doesn't use the feature. * Simply force set the feature in KVM's capabilities, raw CPUID support will * be factored in by kvm_cpu_cap_mask(). */ #define RAW_F(name) \ ({ \ kvm_cpu_cap_set(X86_FEATURE_##name); \ F(name); \ }) Assuming testing doesn't poke a hole in my idea, I'll post a small series.
On Mon, 2024-05-06 at 17:19 +0800, Yang, Weijiang wrote: > On 5/2/2024 7:15 AM, Sean Christopherson wrote: > > On Sun, Feb 18, 2024, Yang Weijiang wrote: > > > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) > > > kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); > > > if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) > > > kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); > > > + /* > > > + * Don't use boot_cpu_has() to check availability of IBT because > > > the > > > + * feature bit is cleared in boot_cpu_data when ibt=off is applied > > > + * in host cmdline. > > I'm not convinced this is a good reason to diverge from the host kernel. > > E.g. > > PCID and many other features honor the host setup, I don't see what makes > > IBT > > special. > > This is mostly based on our user experience and the hypothesis for cloud > computing: > When we evolve host kernels, we constantly encounter issues when kernel IBT is > on, > so we have to disable kernel IBT by adding ibt=off. But we need to test the > CET features > in VM, if we just simply refer to host boot cpuid data, then IBT cannot be > enabled in > VM which makes CET features incomplete in guest. > > I guess in cloud computing, it could run into similar dilemma. In this case, > the tenant > cannot benefit the feature just because of host SW problem. I know currently > KVM > except LA57 always honors host feature configurations, but in CET case, there > could be > divergence wrt honoring host configuration as long as there's no quirk for the > feature. > > But I think the issue is still open for discussion... I think the back and forth I remembered was actually around SGX IBT, but I did find this thread: https://lore.kernel.org/lkml/20231128085025.GA3818@noisy.programming.kicks-ass.net/ Disabling kernel IBT enforcement without disabling KVM IBT seems worthwhile. But the solution is to not to not honor host support. It is to have kernel IBT not clear the feature flag and instead clear something else. This can be done independently of the KVM series.
On Mon, May 06, 2024, Rick P Edgecombe wrote: > On Mon, 2024-05-06 at 17:19 +0800, Yang, Weijiang wrote: > > On 5/2/2024 7:15 AM, Sean Christopherson wrote: > > > On Sun, Feb 18, 2024, Yang Weijiang wrote: > > > > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) > > > > kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); > > > > if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) > > > > kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); > > > > + /* > > > > + * Don't use boot_cpu_has() to check availability of IBT because > > > > the > > > > + * feature bit is cleared in boot_cpu_data when ibt=off is applied > > > > + * in host cmdline. > > > I'm not convinced this is a good reason to diverge from the host kernel. > > > E.g. PCID and many other features honor the host setup, I don't see what > > > makes IBT special. > > > > This is mostly based on our user experience and the hypothesis for cloud > > computing: When we evolve host kernels, we constantly encounter issues when > > kernel IBT is on, so we have to disable kernel IBT by adding ibt=off. But > > we need to test the CET features in VM, if we just simply refer to host > > boot cpuid data, then IBT cannot be enabled in VM which makes CET features > > incomplete in guest. > > > > I guess in cloud computing, it could run into similar dilemma. In this > > case, the tenant cannot benefit the feature just because of host SW > > problem. I know currently KVM except LA57 always honors host feature > > configurations, but in CET case, there could be divergence wrt honoring > > host configuration as long as there's no quirk for the feature. > > > > But I think the issue is still open for discussion... > > I think the back and forth I remembered was actually around SGX IBT, but I did > find this thread: > https://lore.kernel.org/lkml/20231128085025.GA3818@noisy.programming.kicks-ass.net/ > > Disabling kernel IBT enforcement without disabling KVM IBT seems worthwhile. But > the solution is to not to not honor host support. It is to have kernel IBT not > clear the feature flag and instead clear something else. This can be done > independently of the KVM series. Hmm, I don't disagree, but I'm not sure it makes sense to wait on that discussion to exempt IBT from the "it must be supported in the host" rule. I suspect that tweaking the handling X86_FEATURE_IBT of will open a much larger can of worms, as overhauling feature flag handling is very much on the x86 maintainers todo list. IMO, the odds of there being a hardware bug that necessitates hard disabling IBT are lower than the odds of KVM support for CET landing well before the feature stuff is sorted out.
On Mon, 2024-05-06 at 16:33 -0700, Sean Christopherson wrote: > > Hmm, I don't disagree, but I'm not sure it makes sense to wait on that > discussion > to exempt IBT from the "it must be supported in the host" rule. I suspect > that > tweaking the handling X86_FEATURE_IBT of will open a much larger can of worms, > as overhauling feature flag handling is very much on the x86 maintainers todo > list. > > IMO, the odds of there being a hardware bug that necessitates hard disabling > IBT > are lower than the odds of KVM support for CET landing well before the feature > stuff is sorted out. I see a few reasons to tie to the host cpu feature: 1. Disabling it because of some HW concern. I agree with your assessment on the chances. 2. Having the cpu feature be disabled by some kernel parameter, and having KVM try to use it without the necessary FPU or other host support. It could cause lots of problems, guest->host DOS, etc. 3. Confusion for the user about which CPU features are actually in use on the system. There is a fair chance for compatibility issues to show up with CET. Today there is clearcpuid. If a user was having issues with CET and just wanted to turn it off, they might use clearcpuid or something else to just disable CET. Then boot a VM and find it was still enabled. For shadow stack there is also nousershstk. So, my two cents, it's just all easier to reason about for everyone if you tie it to host cpu features. I don't immediately see what trouble will be in giving kernel IBT a disable parameter that doesn't touch X86_FEATURE_IBT at some point in the future. Sorry if I'm missing the point. So like, ibt=off disables all IBT including kernel IBT, kernel_ibt=off disables kernel IBT enforcement via a global bool.
On 5/7/2024 12:54 AM, Sean Christopherson wrote: > On Mon, May 06, 2024, Weijiang Yang wrote: >> On 5/2/2024 7:15 AM, Sean Christopherson wrote: >>> On Sun, Feb 18, 2024, Yang Weijiang wrote: >>>> @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) >>>> kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); >>>> if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) >>>> kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); >>>> + /* >>>> + * Don't use boot_cpu_has() to check availability of IBT because the >>>> + * feature bit is cleared in boot_cpu_data when ibt=off is applied >>>> + * in host cmdline. >>> I'm not convinced this is a good reason to diverge from the host kernel. E.g. >>> PCID and many other features honor the host setup, I don't see what makes IBT >>> special. >> This is mostly based on our user experience and the hypothesis for cloud >> computing: When we evolve host kernels, we constantly encounter issues when >> kernel IBT is on, so we have to disable kernel IBT by adding ibt=off. But we >> need to test the CET features in VM, if we just simply refer to host boot >> cpuid data, then IBT cannot be enabled in VM which makes CET features >> incomplete in guest. >> >> I guess in cloud computing, it could run into similar dilemma. In this case, >> the tenant cannot benefit the feature just because of host SW problem. > Hmm, but such issues should be found before deploying a kernel to production. > > The one scenario that comes to mind where I can see someone wanting to disable > IBT would be running a out-of-tree and/or third party module. Yes, the developers may neglect IBT violations in modules/kernel components and deploy them, in this case, host side has to either fix the issues or disable IBT. > >> I know currently KVM except LA57 always honors host feature configurations, >> but in CET case, there could be divergence wrt honoring host configuration as >> long as there's no quirk for the feature. >> >> But I think the issue is still open for discussion... > I'm not totally opposed to the idea. > > Somewhat off-topic, the existing LA57 code upon which the IBT check is based is > flawed, as it doesn't account for the max supported CPUID leaf. On Intel CPUs, > that could result in a false positive due CPUID (stupidly) returning the value > of the last implemented CPUID leaf, no zeros. In practice, it doesn't cause > problems because CPUID.0x7 has been supported since forever, but it's still a > bug. > > Hmm, actually, __kvm_cpu_cap_mask() has the exact same bug. And that's much less > theoretical, e.g. kvm_cpu_cap_init_kvm_defined() in particular is likely to cause > problems at some point. > > And I really don't like that KVM open codes calls to cpuid_<reg>() for these > "raw" features. One option would be to and helpers to change this: > > if (cpuid_edx(7) & F(IBT)) > kvm_cpu_cap_set(X86_FEATURE_IBT); > > to this: > > if (raw_cpuid_has(X86_FEATURE_IBT)) > kvm_cpu_cap_set(X86_FEATURE_IBT); > > but I think we can do better, and harden the CPUID code in the process. If we > do kvm_cpu_cap_set() _before_ kvm_cpu_cap_mask(), then incorporating the raw host > CPUID will happen automagically, as __kvm_cpu_cap_mask() will clear bits that > aren't in host CPUID. > > The most obvious approach would be to simply call kvm_cpu_cap_set() before > kvm_cpu_cap_mask(), but that's more than a bit confusing, and would open the door > for potential bugs due to calling kvm_cpu_cap_set() after kvm_cpu_cap_mask(). > And detecting such bugs would be difficult, because there are features that KVM > fully emulates, i.e. _must_ be stuffed after kvm_cpu_cap_mask(). > > Instead of calling kvm_cpu_cap_set() directly, we can take advantage of the fact > that the F() maskes are fed into kvm_cpu_cap_mask(), i.e. are naturally processed > before the corresponding kvm_cpu_cap_mask(). > > If we add an array to track which capabilities have been initialized, then F() > can WARN on improper usage. That would allow detecting bad "raw" usage, *and* > would detect (some) scenarios where a F() is fed into the wrong leaf, e.g. if > we added F(LA57) to CPUID_7_EDX instead of CPUID_7_ECX. > > #define F(name) \ > ({ \ > u32 __leaf = __feature_leaf(X86_FEATURE_##name); \ > \ > BUILD_BUG_ON(__leaf >= ARRAY_SIZE(kvm_cpu_cap_initialized)); \ > WARN_ON_ONCE(kvm_cpu_cap_initialized[__leaf]); \ > \ > feature_bit(name); \ > }) > > /* > * Raw Feature - For features that KVM supports based purely on raw host CPUID, > * i.e. that KVM virtualizes even if the host kernel doesn't use the feature. > * Simply force set the feature in KVM's capabilities, raw CPUID support will > * be factored in by kvm_cpu_cap_mask(). > */ > #define RAW_F(name) \ > ({ \ > kvm_cpu_cap_set(X86_FEATURE_##name); \ > F(name); \ > }) > > Assuming testing doesn't poke a hole in my idea, I'll post a small series. Fancy enough! But I like the idea :-) >
On Mon, May 06, 2024, Rick P Edgecombe wrote: > I don't immediately see what trouble will be in giving kernel IBT a disable > parameter that doesn't touch X86_FEATURE_IBT at some point in the future. Keeping X86_FEATURE_IBT set will result in "ibt" being reported in /proc/cpuinfo, i.e. will mislead userspace into thinking IBT is supported and fully enabled by the kernel. For a security feature, that's a pretty big issue. To fudge around that, we could add a synthetic feature flag to let the kernel tell KVM whether or not it's safe to virtualize IBT, but I don't see what value that adds over KVM checking raw host CPUID.
On Tue, 2024-05-07 at 07:21 -0700, Sean Christopherson wrote: > > Keeping X86_FEATURE_IBT set will result in "ibt" being reported in > /proc/cpuinfo, > i.e. will mislead userspace into thinking IBT is supported and fully enabled > by > the kernel. For a security feature, that's a pretty big issue. Since the beginning, if you don't configure kernel IBT in Kconfig but the HW supports it, "ibt" will appear in /proc/cpuinfo. It never was a reliable indicator of kernel IBT enforcement. It is just an indicator of if the IBT feature is usable. I think tying kernel IBT enforcement to the CPU feature is wrong. But if you disable the HW feature, it makes sense that the enforcement would be disabled. CET is something that requires a fair amount of SW enablement. SW needs to do things in special ways or things will go wrong. So whether IBT is in use and whether it is supported by the HW are useful to maintain as separate concepts. > > To fudge around that, we could add a synthetic feature flag to let the kernel > tell KVM whether or not it's safe to virtualize IBT, but I don't see what > value > that adds over KVM checking raw host CPUID. A synthetic feature flag for kernel IBT seems reasonable to me. It's what I suggested on that thread I linked earlier. But Peterz was advocating for a bool. How enforcement would be exposed, would just be dmesg I guess. Having a new feature flag still makes sense to me. Maybe he could be convinced.
On Tue, May 07, 2024, Rick P Edgecombe wrote: > On Tue, 2024-05-07 at 07:21 -0700, Sean Christopherson wrote: > > > > Keeping X86_FEATURE_IBT set will result in "ibt" being reported in > > /proc/cpuinfo, > > i.e. will mislead userspace into thinking IBT is supported and fully enabled > > by > > the kernel. For a security feature, that's a pretty big issue. > > Since the beginning, if you don't configure kernel IBT in Kconfig but the HW > supports it, "ibt" will appear in /proc/cpuinfo. It never was a reliable > indicator of kernel IBT enforcement. Ah, good to know. > It is just an indicator of if the IBT feature is usable. Does ibt=off make IBT unusable for userspace? Huh. Looking at the #CP handler, I take it userspace support for IBT hasn't landed yet? > I think tying kernel IBT enforcement to the CPU feature is wrong. But if you > disable the HW feature, it makes sense that the enforcement would be > disabled. > > CET is something that requires a fair amount of SW enablement. SW needs to do > things in special ways or things will go wrong. So whether IBT is in use and > whether it is supported by the HW are useful to maintain as separate concepts. > > > > > To fudge around that, we could add a synthetic feature flag to let the > > kernel tell KVM whether or not it's safe to virtualize IBT, but I don't see > > what value that adds over KVM checking raw host CPUID. > > A synthetic feature flag for kernel IBT seems reasonable to me. It's what I > suggested on that thread I linked earlier. But Peterz was advocating for a bool. > How enforcement would be exposed, would just be dmesg I guess. Having a new > feature flag still makes sense to me. Maybe he could be convinced. If there's a need for IBT and KERNEL_IBT, I agree a synthetic flag makes sense. But as above, it's not clear to me why both are needed, at least for KVM's sake. Is the need more apparent when userspace IBT support comes along?
On Tue, 2024-05-07 at 08:08 -0700, Sean Christopherson wrote: > > It is just an indicator of if the IBT feature is usable. > > Does ibt=off make IBT unusable for userspace? Huh. Looking at the #CP > handler, > I take it userspace support for IBT hasn't landed yet? Sadly it remains an small internal dev branch that was preempted by the enormous TDX ramp. It's my #1 thing I want to get back to when time reappears. Despite being an old feature at this point, there is contingent of HW CFI fans that still want to see it, including on the gcc/glibc and distro side. So I still have hope. > > > I think tying kernel IBT enforcement to the CPU feature is wrong. But if you > > disable the HW feature, it makes sense that the enforcement would be > > disabled. > > > > CET is something that requires a fair amount of SW enablement. SW needs to > > do > > things in special ways or things will go wrong. So whether IBT is in use and > > whether it is supported by the HW are useful to maintain as separate > > concepts. > > > > > > > > To fudge around that, we could add a synthetic feature flag to let the > > > kernel tell KVM whether or not it's safe to virtualize IBT, but I don't > > > see > > > what value that adds over KVM checking raw host CPUID. > > > > A synthetic feature flag for kernel IBT seems reasonable to me. It's what I > > suggested on that thread I linked earlier. But Peterz was advocating for a > > bool. > > How enforcement would be exposed, would just be dmesg I guess. Having a new > > feature flag still makes sense to me. Maybe he could be convinced. > > If there's a need for IBT and KERNEL_IBT, I agree a synthetic flag makes > sense. > But as above, it's not clear to me why both are needed, at least for KVM's > sake. > Is the need more apparent when userspace IBT support comes along? Isn't KVM CET kind of a second user though? It doesn't depends on CR4.CET like the rest, but the same host FPU support. Let me try to ping peterz re the synthetic flag. For shadow stack we also have user_shstk. This was done because of the expected introduction of the CET_SSS bit (the shadow stack fracturing busy thing bit). It can be treated something like a supervisor shadow stack support bit. So for guests, you might have user shadow stack support and not supervisor. At least that was the idea.
On 5/2/2024 7:15 AM, Sean Christopherson wrote: > On Sun, Feb 18, 2024, Yang Weijiang wrote: >> @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) >> kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); >> if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) >> kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); >> + /* >> + * Don't use boot_cpu_has() to check availability of IBT because the >> + * feature bit is cleared in boot_cpu_data when ibt=off is applied >> + * in host cmdline. > I'm not convinced this is a good reason to diverge from the host kernel. E.g. > PCID and many other features honor the host setup, I don't see what makes IBT > special. > > Hi, Sean, We synced the issue internally, and got conclusion that KVM should honor host IBT config. In this case IBT bit in boot_cpu_data should be honored. With this policy, it can avoid CPUID confusion to guest side due to host ibt=off config. Host side xstate support couldn't be an issue because we already have below check in this patch: + if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER | + XFEATURE_MASK_CET_KERNEL)) != + (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER | + XFEATURE_MASK_CET_KERNEL); + } What's your thoughts? Should I just remove the quirk here and keep everything normal and peaceful?
On 5/6/2024 5:41 PM, Yang, Weijiang wrote: > On 5/2/2024 7:34 AM, Sean Christopherson wrote: >> On Sun, Feb 18, 2024, Yang Weijiang wrote: >>> @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void) >>> F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | >>> F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | >>> F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | >>> - F(SGX_LC) | F(BUS_LOCK_DETECT) >>> + F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK) >>> ); >>> /* Set LA57 based on hardware capability. */ >>> if (cpuid_ecx(7) & F(LA57)) >>> @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void) >>> F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | >>> F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | >>> F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | >>> - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) >>> + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) | >>> + F(IBT) >>> ); >> ... >> >>> @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void) >>> if (cpu_has_vmx_waitpkg()) >>> kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); >>> + >>> + /* >>> + * Disable CET if unrestricted_guest is unsupported as KVM doesn't >>> + * enforce CET HW behaviors in emulator. On platforms with >>> + * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code >>> + * fails, so disable CET in this case too. >>> + */ >>> + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || >>> + !cpu_has_vmx_basic_no_hw_errcode()) { >>> + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); >>> + kvm_cpu_cap_clear(X86_FEATURE_IBT); >>> + } >> Oh! Almost missed it. This patch should explicitly kvm_cpu_cap_clear() >> X86_FEATURE_SHSTK and X86_FEATURE_IBT. We *know* there are upcoming AMD CPUs >> that support at least SHSTK, so enumerating support for common code would yield >> a version of KVM that incorrectly advertises support for SHSTK. >> >> I hope to land both Intel and AMD virtualization in the same kernel release, but >> there are no guarantees that will happen. And explicitly clearing both SHSTK and >> IBT would guard against IBT showing up in some future AMD CPU in advance of KVM >> gaining full support. > > Let me be clear on this, you want me to disable SHSTK/IBT with kvm_cpu_cap_clear() unconditionally > for now in this patch, and wait until both AMD's SVM patches and this series are ready for guest CET, > then remove the disabling code in this patch for final merge, am I right? Hi, Sean, I haven't got your reply on above question. Would like to get your confirmation. Thanks! > >
On Thu, May 16, 2024, Weijiang Yang wrote: > On 5/2/2024 7:15 AM, Sean Christopherson wrote: > > On Sun, Feb 18, 2024, Yang Weijiang wrote: > > > @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) > > > kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); > > > if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) > > > kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); > > > + /* > > > + * Don't use boot_cpu_has() to check availability of IBT because the > > > + * feature bit is cleared in boot_cpu_data when ibt=off is applied > > > + * in host cmdline. > > I'm not convinced this is a good reason to diverge from the host kernel. E.g. > > PCID and many other features honor the host setup, I don't see what makes IBT > > special. > > > > > Hi, Sean, > We synced the issue internally, and got conclusion that KVM should honor host > IBT config. In this case IBT bit in boot_cpu_data should be honored. With > this policy, it can avoid CPUID confusion to guest side due to host ibt=off > config. What was the reasoning? CPUID confusion is a weak justification, e.g. it's not like the guest has visibility into the host kernel, and raw CPUID will still show IBT support in the host. On the other hand, I can definitely see folks wanting to expose IBT to guests when running non-complaint host kernels, especially when live migration is in play, i.e. when hiding IBT from the guest will actively cause problems.
On Thu, May 16, 2024, Weijiang Yang wrote: > On 5/6/2024 5:41 PM, Yang, Weijiang wrote: > > On 5/2/2024 7:34 AM, Sean Christopherson wrote: > > > On Sun, Feb 18, 2024, Yang Weijiang wrote: > > > > @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void) > > > > F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | > > > > F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | > > > > F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | > > > > - F(SGX_LC) | F(BUS_LOCK_DETECT) > > > > + F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK) > > > > ); > > > > /* Set LA57 based on hardware capability. */ > > > > if (cpuid_ecx(7) & F(LA57)) > > > > @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void) > > > > F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | > > > > F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | > > > > F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | > > > > - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) > > > > + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) | > > > > + F(IBT) > > > > ); > > > ... > > > > > > > @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void) > > > > if (cpu_has_vmx_waitpkg()) > > > > kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); > > > > + > > > > + /* > > > > + * Disable CET if unrestricted_guest is unsupported as KVM doesn't > > > > + * enforce CET HW behaviors in emulator. On platforms with > > > > + * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code > > > > + * fails, so disable CET in this case too. > > > > + */ > > > > + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || > > > > + !cpu_has_vmx_basic_no_hw_errcode()) { > > > > + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); > > > > + kvm_cpu_cap_clear(X86_FEATURE_IBT); > > > > + } > > > Oh! Almost missed it. This patch should explicitly kvm_cpu_cap_clear() > > > X86_FEATURE_SHSTK and X86_FEATURE_IBT. We *know* there are upcoming AMD CPUs > > > that support at least SHSTK, so enumerating support for common code would yield > > > a version of KVM that incorrectly advertises support for SHSTK. > > > > > > I hope to land both Intel and AMD virtualization in the same kernel release, but > > > there are no guarantees that will happen. And explicitly clearing both SHSTK and > > > IBT would guard against IBT showing up in some future AMD CPU in advance of KVM > > > gaining full support. > > > > Let me be clear on this, you want me to disable SHSTK/IBT with > > kvm_cpu_cap_clear() unconditionally for now in this patch, and wait until > > both AMD's SVM patches and this series are ready for guest CET, then remove > > the disabling code in this patch for final merge, am I right? No, allow it to be enabled for VMX, but explicitly disable it for SVM, i.e. diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4aaffbf22531..b3df12af4ee6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5125,6 +5125,10 @@ static __init void svm_set_cpu_caps(void) kvm_caps.supported_perf_cap = 0; kvm_caps.supported_xss = 0; + /* KVM doesn't yet support CET virtualization for SVM. */ + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { kvm_cpu_cap_set(X86_FEATURE_SVM); Then the SVM series can simply delete those lines when all is ready.
On 5/16/24 07:39, Sean Christopherson wrote: >> We synced the issue internally, and got conclusion that KVM should honor host >> IBT config. In this case IBT bit in boot_cpu_data should be honored. With >> this policy, it can avoid CPUID confusion to guest side due to host ibt=off >> config. > What was the reasoning? CPUID confusion is a weak justification, e.g. it's not > like the guest has visibility into the host kernel, and raw CPUID will still show > IBT support in the host. I'm basically arguing for the path of least resistance (at least to start). We should just do what takes the least amount of code for now that results in mostly sane behavior, then debate about making it perfect later. In other words, let's say the place we'd *IDEALLY* end up is that guests can have any random FPU state which is disconnected from the host. But the reality, for now, is that the host needs to have XFEATURE_CET_USER set in order to pass it into the guest and that means keeping X86_FEATURE_SHSTK set. If you want guest XFEATURE_CET_USER, you must have host X86_FEATURE_SHSTK ... for now.
On Thu, May 16, 2024, Dave Hansen wrote: > On 5/16/24 07:39, Sean Christopherson wrote: > >> We synced the issue internally, and got conclusion that KVM should honor host > >> IBT config. In this case IBT bit in boot_cpu_data should be honored. With > >> this policy, it can avoid CPUID confusion to guest side due to host ibt=off > >> config. > > What was the reasoning? CPUID confusion is a weak justification, e.g. it's not > > like the guest has visibility into the host kernel, and raw CPUID will still show > > IBT support in the host. > > I'm basically arguing for the path of least resistance (at least to start). > > We should just do what takes the least amount of code for now that > results in mostly sane behavior, then debate about making it perfect later. > > In other words, let's say the place we'd *IDEALLY* end up is that guests > can have any random FPU state which is disconnected from the host. But > the reality, for now, is that the host needs to have XFEATURE_CET_USER > set in order to pass it into the guest and that means keeping > X86_FEATURE_SHSTK set. > > If you want guest XFEATURE_CET_USER, you must have host > X86_FEATURE_SHSTK ... for now. Ah, because fpu__init_system_xstate() will clear XFEATURE_CET_USER via the X86_FEATURE_SHSTK connection in xsave_cpuid_features. Please put something to that effect in the changelog. "this literally won't work (without more changes)" is very different than us making a largely arbitrary decision.
On 5/16/2024 10:43 PM, Sean Christopherson wrote: > On Thu, May 16, 2024, Weijiang Yang wrote: >> On 5/6/2024 5:41 PM, Yang, Weijiang wrote: >>> On 5/2/2024 7:34 AM, Sean Christopherson wrote: >>>> On Sun, Feb 18, 2024, Yang Weijiang wrote: >>>>> @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void) >>>>> F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | >>>>> F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | >>>>> F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | >>>>> - F(SGX_LC) | F(BUS_LOCK_DETECT) >>>>> + F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK) >>>>> ); >>>>> /* Set LA57 based on hardware capability. */ >>>>> if (cpuid_ecx(7) & F(LA57)) >>>>> @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void) >>>>> F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | >>>>> F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | >>>>> F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | >>>>> - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) >>>>> + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) | >>>>> + F(IBT) >>>>> ); >>>> ... >>>> >>>>> @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void) >>>>> if (cpu_has_vmx_waitpkg()) >>>>> kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); >>>>> + >>>>> + /* >>>>> + * Disable CET if unrestricted_guest is unsupported as KVM doesn't >>>>> + * enforce CET HW behaviors in emulator. On platforms with >>>>> + * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code >>>>> + * fails, so disable CET in this case too. >>>>> + */ >>>>> + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || >>>>> + !cpu_has_vmx_basic_no_hw_errcode()) { >>>>> + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); >>>>> + kvm_cpu_cap_clear(X86_FEATURE_IBT); >>>>> + } >>>> Oh! Almost missed it. This patch should explicitly kvm_cpu_cap_clear() >>>> X86_FEATURE_SHSTK and X86_FEATURE_IBT. We *know* there are upcoming AMD CPUs >>>> that support at least SHSTK, so enumerating support for common code would yield >>>> a version of KVM that incorrectly advertises support for SHSTK. >>>> >>>> I hope to land both Intel and AMD virtualization in the same kernel release, but >>>> there are no guarantees that will happen. And explicitly clearing both SHSTK and >>>> IBT would guard against IBT showing up in some future AMD CPU in advance of KVM >>>> gaining full support. >>> Let me be clear on this, you want me to disable SHSTK/IBT with >>> kvm_cpu_cap_clear() unconditionally for now in this patch, and wait until >>> both AMD's SVM patches and this series are ready for guest CET, then remove >>> the disabling code in this patch for final merge, am I right? > No, allow it to be enabled for VMX, but explicitly disable it for SVM, i.e. > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > index 4aaffbf22531..b3df12af4ee6 100644 > --- a/arch/x86/kvm/svm/svm.c > +++ b/arch/x86/kvm/svm/svm.c > @@ -5125,6 +5125,10 @@ static __init void svm_set_cpu_caps(void) > kvm_caps.supported_perf_cap = 0; > kvm_caps.supported_xss = 0; > > + /* KVM doesn't yet support CET virtualization for SVM. */ > + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); > + kvm_cpu_cap_clear(X86_FEATURE_IBT); > + > /* CPUID 0x80000001 and 0x8000000A (SVM features) */ > if (nested) { > kvm_cpu_cap_set(X86_FEATURE_SVM); > > Then the SVM series can simply delete those lines when all is ready. Understood, thanks!
On 5/17/2024 12:58 AM, Sean Christopherson wrote: > On Thu, May 16, 2024, Dave Hansen wrote: >> On 5/16/24 07:39, Sean Christopherson wrote: >>>> We synced the issue internally, and got conclusion that KVM should honor host >>>> IBT config. In this case IBT bit in boot_cpu_data should be honored. With >>>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off >>>> config. >>> What was the reasoning? CPUID confusion is a weak justification, e.g. it's not >>> like the guest has visibility into the host kernel, and raw CPUID will still show >>> IBT support in the host. >> I'm basically arguing for the path of least resistance (at least to start). >> >> We should just do what takes the least amount of code for now that >> results in mostly sane behavior, then debate about making it perfect later. >> >> In other words, let's say the place we'd *IDEALLY* end up is that guests >> can have any random FPU state which is disconnected from the host. But >> the reality, for now, is that the host needs to have XFEATURE_CET_USER >> set in order to pass it into the guest and that means keeping >> X86_FEATURE_SHSTK set. >> >> If you want guest XFEATURE_CET_USER, you must have host >> X86_FEATURE_SHSTK ... for now. > Ah, because fpu__init_system_xstate() will clear XFEATURE_CET_USER via the > X86_FEATURE_SHSTK connection in xsave_cpuid_features. > > Please put something to that effect in the changelog. "this literally won't work > (without more changes)" is very different than us making a largely arbitrary > decision. So I need to remove the trick here for guest IBT, right? Side topic: When X86_FEATURE_SHSTK and X86_FEATURE_IBT (no ibt=off set) are available on host side, then host XFEATURE_CET_USER is enabled. In this case, we still *need* below patch: https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/ to correctly enable XFEATURE_CET_USER in *guest kernel*, because VMM userspace can enable IBT alone for guest by -cpu host -shstk, am I right?
On Thu, May 16 2024 at 07:39, Sean Christopherson wrote: > On Thu, May 16, 2024, Weijiang Yang wrote: >> We synced the issue internally, and got conclusion that KVM should honor host >> IBT config. In this case IBT bit in boot_cpu_data should be honored. With >> this policy, it can avoid CPUID confusion to guest side due to host ibt=off >> config. > > What was the reasoning? CPUID confusion is a weak justification, e.g. it's not > like the guest has visibility into the host kernel, and raw CPUID will still show > IBT support in the host. > > On the other hand, I can definitely see folks wanting to expose IBT to guests > when running non-complaint host kernels, especially when live migration is in > play, i.e. when hiding IBT from the guest will actively cause problems. I have to disagree here violently. If the exposure of a CPUID bit to a guest requires host side support, e.g. in xstate handling, then exposing it to a guest is simply not possible. Just because virtualization allows to do that does not mean that it's correct in any way. Thanks, tglx
On Fri, May 17, 2024, Thomas Gleixner wrote: > On Thu, May 16 2024 at 07:39, Sean Christopherson wrote: > > On Thu, May 16, 2024, Weijiang Yang wrote: > >> We synced the issue internally, and got conclusion that KVM should honor host > >> IBT config. In this case IBT bit in boot_cpu_data should be honored. With > >> this policy, it can avoid CPUID confusion to guest side due to host ibt=off > >> config. > > > > What was the reasoning? CPUID confusion is a weak justification, e.g. it's not > > like the guest has visibility into the host kernel, and raw CPUID will still show > > IBT support in the host. > > > > On the other hand, I can definitely see folks wanting to expose IBT to guests > > when running non-complaint host kernels, especially when live migration is in > > play, i.e. when hiding IBT from the guest will actively cause problems. > > I have to disagree here violently. > > If the exposure of a CPUID bit to a guest requires host side support, > e.g. in xstate handling, then exposing it to a guest is simply not > possible. Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the supported xfeatures mask.
On 5/17/2024 10:26 PM, Sean Christopherson wrote: > On Fri, May 17, 2024, Thomas Gleixner wrote: >> On Thu, May 16 2024 at 07:39, Sean Christopherson wrote: >>> On Thu, May 16, 2024, Weijiang Yang wrote: >>>> We synced the issue internally, and got conclusion that KVM should honor host >>>> IBT config. In this case IBT bit in boot_cpu_data should be honored. With >>>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off >>>> config. >>> What was the reasoning? CPUID confusion is a weak justification, e.g. it's not >>> like the guest has visibility into the host kernel, and raw CPUID will still show >>> IBT support in the host. >>> >>> On the other hand, I can definitely see folks wanting to expose IBT to guests >>> when running non-complaint host kernels, especially when live migration is in >>> play, i.e. when hiding IBT from the guest will actively cause problems. >> I have to disagree here violently. >> >> If the exposure of a CPUID bit to a guest requires host side support, >> e.g. in xstate handling, then exposing it to a guest is simply not >> possible. > Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the > supported xfeatures mask. For host side support, fortunately, this patch already has some checks for that. But for userspace CPUID config, it allows IBT to be exposed alone. IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be exposed as an independent feature to guest without exposing SHSTK at the same time. If it is, then below patch is not needed anymore: https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/ I'd check and clear IBT bit from CPUID when userspace enables only IBT via KVM_SET_CPUID2. And update related code of the series given the implicit dependency. Thanks!
On Mon, May 20, 2024, Weijiang Yang wrote: > On 5/17/2024 10:26 PM, Sean Christopherson wrote: > > On Fri, May 17, 2024, Thomas Gleixner wrote: > > > On Thu, May 16 2024 at 07:39, Sean Christopherson wrote: > > > > On Thu, May 16, 2024, Weijiang Yang wrote: > > > > > We synced the issue internally, and got conclusion that KVM should honor host > > > > > IBT config. In this case IBT bit in boot_cpu_data should be honored. With > > > > > this policy, it can avoid CPUID confusion to guest side due to host ibt=off > > > > > config. > > > > What was the reasoning? CPUID confusion is a weak justification, e.g. it's not > > > > like the guest has visibility into the host kernel, and raw CPUID will still show > > > > IBT support in the host. > > > > > > > > On the other hand, I can definitely see folks wanting to expose IBT to guests > > > > when running non-complaint host kernels, especially when live migration is in > > > > play, i.e. when hiding IBT from the guest will actively cause problems. > > > I have to disagree here violently. > > > > > > If the exposure of a CPUID bit to a guest requires host side support, > > > e.g. in xstate handling, then exposing it to a guest is simply not > > > possible. > > Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the > > supported xfeatures mask. > > For host side support, fortunately, this patch already has some checks for > that. But for userspace CPUID config, it allows IBT to be exposed alone. > > IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be > exposed as an independent feature to guest without exposing SHSTK at the same > time. If it is, then below patch is not needed anymore: > https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/ That's a question for the x86 maintainers. Specifically, do they want to allow enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled. I don't think it impacts KVM, at least not directly. Regardless of what decision the kernel makes, KVM needs to disable IBT and SHSTK if CET_USER _or_ CET_KERNEL is missing, which KVM already does via: if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) != (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) { kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL); } > I'd check and clear IBT bit from CPUID when userspace enables only IBT via > KVM_SET_CPUID2. No. It is userspace's responsibility to provide a sane CPUID model for the guest. KVM needs to ensure that *KVM* doesn't treat IBT as supported if the kernel doesn't allow XFEATURE_CET_USER, but userspace can advertise whatever it wants to the guest (and gets to keep the pieces if it does something funky).
On 5/20/24 10:09, Sean Christopherson wrote: >> IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be >> exposed as an independent feature to guest without exposing SHSTK at the same >> time. If it is, then below patch is not needed anymore: >> https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/ > That's a question for the x86 maintainers. Specifically, do they want to allow > enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled. I like the sound of "below patch is not needed anymore". Unless removing the patch causes permanent issues or results in something that's not functional, I say: jettison it with glee. If it's that important, it can be considered on its own merits separately.
On 5/21/2024 1:09 AM, Sean Christopherson wrote: > On Mon, May 20, 2024, Weijiang Yang wrote: >> On 5/17/2024 10:26 PM, Sean Christopherson wrote: >>> On Fri, May 17, 2024, Thomas Gleixner wrote: >>>> On Thu, May 16 2024 at 07:39, Sean Christopherson wrote: >>>>> On Thu, May 16, 2024, Weijiang Yang wrote: >>>>>> We synced the issue internally, and got conclusion that KVM should honor host >>>>>> IBT config. In this case IBT bit in boot_cpu_data should be honored. With >>>>>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off >>>>>> config. >>>>> What was the reasoning? CPUID confusion is a weak justification, e.g. it's not >>>>> like the guest has visibility into the host kernel, and raw CPUID will still show >>>>> IBT support in the host. >>>>> >>>>> On the other hand, I can definitely see folks wanting to expose IBT to guests >>>>> when running non-complaint host kernels, especially when live migration is in >>>>> play, i.e. when hiding IBT from the guest will actively cause problems. >>>> I have to disagree here violently. >>>> >>>> If the exposure of a CPUID bit to a guest requires host side support, >>>> e.g. in xstate handling, then exposing it to a guest is simply not >>>> possible. >>> Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the >>> supported xfeatures mask. >> For host side support, fortunately, this patch already has some checks for >> that. But for userspace CPUID config, it allows IBT to be exposed alone. >> >> IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be >> exposed as an independent feature to guest without exposing SHSTK at the same >> time. If it is, then below patch is not needed anymore: >> https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/ > That's a question for the x86 maintainers. Specifically, do they want to allow > enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled. > > I don't think it impacts KVM, at least not directly. Regardless of what decision > the kernel makes, KVM needs to disable IBT and SHSTK if CET_USER _or_ CET_KERNEL > is missing, which KVM already does via: > > if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER | > XFEATURE_MASK_CET_KERNEL)) != > (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) { > kvm_cpu_cap_clear(X86_FEATURE_SHSTK); > kvm_cpu_cap_clear(X86_FEATURE_IBT); > kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER | > XFEATURE_MASK_CET_KERNEL); > } > >> I'd check and clear IBT bit from CPUID when userspace enables only IBT via >> KVM_SET_CPUID2. > No. It is userspace's responsibility to provide a sane CPUID model for the guest. > KVM needs to ensure that *KVM* doesn't treat IBT as supported if the kernel doesn't > allow XFEATURE_CET_USER, but userspace can advertise whatever it wants to the guest > (and gets to keep the pieces if it does something funky). OK, I think we can go ahead to keep KVM patches as-is given the fact user IBT is not enabled in Linux. I only hope other OSes can enforce both SHSTK and IBT dependency on XFEATURE_CET_USER so that user IBT can work well there. Then IBT can be exposed to guest alone because guest *kernel* IBT only relies on S_CET MSR which is VMCS auto-saved/restored. What's your thoughts?
On 5/21/2024 1:15 AM, Dave Hansen wrote: > On 5/20/24 10:09, Sean Christopherson wrote: >>> IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be >>> exposed as an independent feature to guest without exposing SHSTK at the same >>> time. If it is, then below patch is not needed anymore: >>> https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/ >> That's a question for the x86 maintainers. Specifically, do they want to allow >> enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled. > I like the sound of "below patch is not needed anymore". > > Unless removing the patch causes permanent issues or results in > something that's not functional, I say: jettison it with glee. If it's > that important, it can be considered on its own merits separately. I guess the existing dependency there is due to the fact that only user SHSTK is landed and there's possibly no such kind of odd bare metal platform. Side topic: would it be reasonable to enforce IBT dependency on XFEATURE_CET_USER when *user* IBT enabling patches are landing in kernel? Then guest kernel can play with user IBT alone if VMM userspace just wants to enable IBT for guest. Or when SHSTK is disabled for whatever reason.
On Wed, 2024-05-22 at 17:03 +0800, Yang, Weijiang wrote: > Side topic: would it be reasonable to enforce IBT dependency on > XFEATURE_CET_USER when *user* IBT > enabling patches are landing in kernel? Then guest kernel can play with user > IBT alone if VMM > userspace just wants to enable IBT for guest. Or when SHSTK is disabled for > whatever reason. I think earlier there was a comment that CET would be less likely to need to be disabled for security reasons, so there would not be utility for a system wide disable (that affects KVM). I recently remembered we actually already had a reason come up. The EDK2 SMI handler uses shadow stack and had a bug around saving and restoring CET state. Using IBT in the kernel was causing systems to hang. The temporary fix was to disable IBT. So the point is, let's not try to find a narrow way to get away with enabling as much as technically possible in KVM. The simple obviously correct solution would be: XFEATURE_CET_USER + XFEATURE_CET_KERNEL + X86_FEATURE_IBT = KVM IBT support XFEATURE_CET_USER + XFEATURE_CET_KERNEL + X86_FEATURE_SHSTK = KVM SHSTK support It should be correct both with and without that patch to enable XFEATURE_CET_USER for X86_FEATURE_IBT. Then the two missing changes to expand support would be: 1. Fixing that ibt=off disables X86_FEATURE_IBT. The fix is to move to bool as peterz suggested. 2. Making XFEATURE_CET_USER also depend on X86_FEATURE_IBT (the patch in this series) We should do those, but in a later small series. Does it seem reasonable? Can we just do the simple obvious solution above for now?
On 5/22/2024 11:06 PM, Edgecombe, Rick P wrote: > On Wed, 2024-05-22 at 17:03 +0800, Yang, Weijiang wrote: >> Side topic: would it be reasonable to enforce IBT dependency on >> XFEATURE_CET_USER when *user* IBT >> enabling patches are landing in kernel? Then guest kernel can play with user >> IBT alone if VMM >> userspace just wants to enable IBT for guest. Or when SHSTK is disabled for >> whatever reason. > I think earlier there was a comment that CET would be less likely to need to be > disabled for security reasons, so there would not be utility for a system wide > disable (that affects KVM). I recently remembered we actually already had a > reason come up. > > The EDK2 SMI handler uses shadow stack and had a bug around saving and restoring > CET state. Using IBT in the kernel was causing systems to hang. The temporary > fix was to disable IBT. > > So the point is, let's not try to find a narrow way to get away with enabling as > much as technically possible in KVM. > > The simple obviously correct solution would be: > XFEATURE_CET_USER + XFEATURE_CET_KERNEL + X86_FEATURE_IBT = KVM IBT support > XFEATURE_CET_USER + XFEATURE_CET_KERNEL + X86_FEATURE_SHSTK = KVM SHSTK support Yes, I can easily achieve it by removing the raw cpuid check for KVM IBT. Host side CET xstate support check is already there in this patch. > > It should be correct both with and without that patch to enable > XFEATURE_CET_USER for X86_FEATURE_IBT. IMHO, given the fact user IBT hasn't been enabled in kernel, it's not too bad just discarding the patch. I can highlight the issue somewhere in this series. > > Then the two missing changes to expand support would be: > 1. Fixing that ibt=off disables X86_FEATURE_IBT. The fix is to move to bool as > peterz suggested. > 2. Making XFEATURE_CET_USER also depend on X86_FEATURE_IBT (the patch in this > series) > > We should do those, but in a later small series. Does it seem reasonable? Can we > just do the simple obvious solution above for now? It makes sense for me, but I want to hear x86 and KVM maintainers' voice for it. Thanks!
On 5/22/2024 4:41 PM, Yang, Weijiang wrote: > On 5/21/2024 1:09 AM, Sean Christopherson wrote: >> On Mon, May 20, 2024, Weijiang Yang wrote: >>> On 5/17/2024 10:26 PM, Sean Christopherson wrote: >>>> On Fri, May 17, 2024, Thomas Gleixner wrote: >>>>> On Thu, May 16 2024 at 07:39, Sean Christopherson wrote: >>>>>> On Thu, May 16, 2024, Weijiang Yang wrote: >>>>>>> We synced the issue internally, and got conclusion that KVM should honor host >>>>>>> IBT config. In this case IBT bit in boot_cpu_data should be honored. With >>>>>>> this policy, it can avoid CPUID confusion to guest side due to host ibt=off >>>>>>> config. >>>>>> What was the reasoning? CPUID confusion is a weak justification, e.g. it's not >>>>>> like the guest has visibility into the host kernel, and raw CPUID will still show >>>>>> IBT support in the host. >>>>>> >>>>>> On the other hand, I can definitely see folks wanting to expose IBT to guests >>>>>> when running non-complaint host kernels, especially when live migration is in >>>>>> play, i.e. when hiding IBT from the guest will actively cause problems. >>>>> I have to disagree here violently. >>>>> >>>>> If the exposure of a CPUID bit to a guest requires host side support, >>>>> e.g. in xstate handling, then exposing it to a guest is simply not >>>>> possible. >>>> Ya, I don't disagree, I just didn't realize that CET_USER would be cleared in the >>>> supported xfeatures mask. >>> For host side support, fortunately, this patch already has some checks for >>> that. But for userspace CPUID config, it allows IBT to be exposed alone. >>> >>> IIUC, this series tries to tie IBT to SHSTK feature, i.e., IBT cannot be >>> exposed as an independent feature to guest without exposing SHSTK at the same >>> time. If it is, then below patch is not needed anymore: >>> https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/ >> That's a question for the x86 maintainers. Specifically, do they want to allow >> enabling XFEATURE_CET_USER even if userspace shadow stack support is disabled. >> >> I don't think it impacts KVM, at least not directly. Regardless of what decision >> the kernel makes, KVM needs to disable IBT and SHSTK if CET_USER _or_ CET_KERNEL >> is missing, which KVM already does via: >> >> if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER | >> XFEATURE_MASK_CET_KERNEL)) != >> (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) { >> kvm_cpu_cap_clear(X86_FEATURE_SHSTK); >> kvm_cpu_cap_clear(X86_FEATURE_IBT); >> kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER | >> XFEATURE_MASK_CET_KERNEL); >> } >> >>> I'd check and clear IBT bit from CPUID when userspace enables only IBT via >>> KVM_SET_CPUID2. >> No. It is userspace's responsibility to provide a sane CPUID model for the guest. >> KVM needs to ensure that *KVM* doesn't treat IBT as supported if the kernel doesn't >> allow XFEATURE_CET_USER, but userspace can advertise whatever it wants to the guest >> (and gets to keep the pieces if it does something funky). > > OK, I think we can go ahead to keep KVM patches as-is given the fact user IBT is not enabled in Linux. > I only hope other OSes can enforce both SHSTK and IBT dependency on XFEATURE_CET_USER so > that user IBT can work well there. > > Then IBT can be exposed to guest alone because guest *kernel* IBT only relies on S_CET MSR which is > VMCS auto-saved/restored. > > What's your thoughts? If there's objection I'll do below changes for this series: 1) Remove patch : https://lore.kernel.org/all/20240219074733.122080-3-weijiang.yang@intel.com/ 2) Remove reference to raw CPUID for KVM IBT CPUID, handling it the same as SHSTK. Meanwhile still allow userspace to enable IBT feature alone because user IBT is not enabled in kernel now, leave enforcement of user IBT dependency on XFEATURE_CET_USER in future. > >
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 79f7c18c487b..3b263fa171a1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -134,7 +134,7 @@ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ - | X86_CR4_LAM_SUP)) + | X86_CR4_LAM_SUP | X86_CR4_CET)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f1bd7b91b3c6..4aa9aaa295f0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1110,6 +1110,7 @@ #define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU #define VMX_BASIC_MEM_TYPE_WB 6LLU #define VMX_BASIC_INOUT 0x0040000000000000LLU +#define VMX_BASIC_NO_HW_ERROR_CODE_CC 0x0100000000000000LLU /* Resctrl MSRs: */ /* - Intel: */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c0e13040e35b..d37f41472043 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -150,14 +150,14 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, return -EINVAL; } /* - * Prevent 32-bit guest launch if shadow stack is exposed as SSP - * state is not defined for 32-bit SMRAM. + * CET is not supported for 32-bit guest, prevent guest launch if + * shadow stack or IBT is enabled for 32-bit guest. */ best = cpuid_entry2_find(entries, nent, 0x80000001, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best && !(best->edx & F(LM))) { best = cpuid_entry2_find(entries, nent, 0x7, 0); - if (best && (best->ecx & F(SHSTK))) + if (best && ((best->ecx & F(SHSTK)) || (best->edx & F(IBT)))) return -EINVAL; } @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void) F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | - F(SGX_LC) | F(BUS_LOCK_DETECT) + F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK) ); /* Set LA57 based on hardware capability. */ if (cpuid_ecx(7) & F(LA57)) @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void) F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) | + F(IBT) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); + /* + * Don't use boot_cpu_has() to check availability of IBT because the + * feature bit is cleared in boot_cpu_data when ibt=off is applied + * in host cmdline. + * + * As currently there's no HW bug which requires disabling IBT feature + * while CPU can enumerate it, host cmdline option ibt=off is most + * likely due to administrative reason on host side, so KVM refers to + * CPU CPUID enumeration to enable the feature. In future if there's + * actually some bug clobbered ibt=off option, then enforce additional + * check here to disable the support in KVM. + */ + if (cpuid_edx(7) & F(IBT)) + kvm_cpu_cap_set(X86_FEATURE_IBT); kvm_cpu_cap_mask(CPUID_7_1_EAX, F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index ee8938818c8a..e12bc233d88b 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -79,6 +79,12 @@ static inline bool cpu_has_vmx_basic_inout(void) return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); } +static inline bool cpu_has_vmx_basic_no_hw_errcode(void) +{ + return ((u64)vmcs_config.basic_cap << 32) & + VMX_BASIC_NO_HW_ERROR_CODE_CC; +} + static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS && diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 342b5b94c892..9df25c9e80f5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2609,6 +2609,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE }, }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); @@ -4934,6 +4935,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + vmcs_writel(GUEST_SSP, 0); + vmcs_writel(GUEST_S_CET, 0); + vmcs_writel(GUEST_INTR_SSP_TABLE, 0); + } else if (kvm_cpu_cap_has(X86_FEATURE_IBT)) { + vmcs_writel(GUEST_S_CET, 0); + } + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); @@ -6353,6 +6362,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", + vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP), + vmcs_readl(GUEST_INTR_SSP_TABLE)); pr_err("*** Host State ***\n"); pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); @@ -6383,6 +6396,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); + if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", + vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP), + vmcs_readl(HOST_INTR_SSP_TABLE)); pr_err("*** Control State ***\n"); pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", @@ -7965,7 +7982,6 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7977,6 +7993,18 @@ static __init void vmx_set_cpu_caps(void) if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); + + /* + * Disable CET if unrestricted_guest is unsupported as KVM doesn't + * enforce CET HW behaviors in emulator. On platforms with + * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code + * fails, so disable CET in this case too. + */ + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || + !cpu_has_vmx_basic_no_hw_errcode()) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + } } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e3b0985bb74a..d0cad2624564 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -484,7 +484,8 @@ static inline u8 vmx_get_rvi(void) VM_ENTRY_LOAD_IA32_EFER | \ VM_ENTRY_LOAD_BNDCFGS | \ VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) + VM_ENTRY_LOAD_IA32_RTIT_CTL | \ + VM_ENTRY_LOAD_CET_STATE) #define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ @@ -506,7 +507,8 @@ static inline u8 vmx_get_rvi(void) VM_EXIT_LOAD_IA32_EFER | \ VM_EXIT_CLEAR_BNDCFGS | \ VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) + VM_EXIT_CLEAR_IA32_RTIT_CTL | \ + VM_EXIT_LOAD_CET_STATE) #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ (PIN_BASED_EXT_INTR_MASK | \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 73a55d388dd9..cd656099fbfd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -231,7 +231,8 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) -#define KVM_SUPPORTED_XSS 0 +#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_USER | \ + XFEATURE_MASK_CET_KERNEL) u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); @@ -9943,6 +9944,20 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER | + XFEATURE_MASK_CET_KERNEL); + + if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER | + XFEATURE_MASK_CET_KERNEL)) != + (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER | + XFEATURE_MASK_CET_KERNEL); + } + #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); #undef __kvm_cpu_cap_has @@ -12402,7 +12417,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) } #define XSTATE_NEED_RESET_MASK (XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_CET_USER | \ + XFEATURE_MASK_CET_KERNEL) static bool kvm_vcpu_has_xstate(unsigned long xfeature) { @@ -12410,6 +12427,11 @@ static bool kvm_vcpu_has_xstate(unsigned long xfeature) case XFEATURE_MASK_BNDREGS: case XFEATURE_MASK_BNDCSR: return kvm_cpu_cap_has(X86_FEATURE_MPX); + case XFEATURE_CET_USER: + return kvm_cpu_cap_has(X86_FEATURE_SHSTK) || + kvm_cpu_cap_has(X86_FEATURE_IBT); + case XFEATURE_CET_KERNEL: + return kvm_cpu_cap_has(X86_FEATURE_SHSTK); default: return false; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 656107e64c93..cc585051d24b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -533,6 +533,9 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); __reserved_bits |= X86_CR4_PCIDE; \ if (!__cpu_has(__c, X86_FEATURE_LAM)) \ __reserved_bits |= X86_CR4_LAM_SUP; \ + if (!__cpu_has(__c, X86_FEATURE_SHSTK) && \ + !__cpu_has(__c, X86_FEATURE_IBT)) \ + __reserved_bits |= X86_CR4_CET; \ __reserved_bits; \ })