Message ID | 20230524061634.54141-2-chao.gao@intel.com (mailing list archive)
---|---
State | New, archived
Series | MSR_IA32_ARCH_CAPABILITIES cleanups
On 5/24/2023 2:16 PM, Chao Gao wrote:
> to avoid computing the supported value at runtime every time.
>
> Toggle the ARCH_CAP_SKIP_VMENTRY_L1DFLUSH bit when l1tf_vmx_mitigation
> is modified to achieve the same result as runtime computing.

It's not the same result.

In kvm_get_arch_capabilities(), host's value is honored. I.e., when host
supports ARCH_CAP_SKIP_VMENTRY_L1DFLUSH, l1tf_vmx_mitigation doesn't make
any difference to the result.

> Opportunistically, add a comment to document the problem of allowing
> changing the supported value of ARCH_CAPABILITIES and the reason why
> we don't fix it.
>
> No functional change intended.
>
> Link: https://lore.kernel.org/all/ZGZhW%2Fx5OWPmx1qD@google.com/
> Link: https://lore.kernel.org/all/ZGeU9sYTPxqNGSqI@google.com/
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> ---
> [...]
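For context: pre-patch, kvm_get_arch_capabilities() derives
ARCH_CAP_SKIP_VMENTRY_L1DFLUSH from two sources, which is what the exchange
below hinges on. A reconstructed sketch — the host-MSR read matches the code
removed in Sean's counter-proposal further down, while the
l1tf_vmx_mitigation check is an assumption based on the commit message's
description of "runtime computing", not code quoted in this thread:

        static u64 kvm_get_arch_capabilities(void)
        {
                u64 data = 0;

                /* Source 1: the host's own value is honored. */
                if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
                        rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
                        data &= KVM_SUPPORTED_ARCH_CAP;
                }

                /* ... other synthesized bits elided ... */

                /* Source 2: KVM sets the bit itself whenever it will do the
                 * L1D flush on L1's behalf, i.e. for any state but "never". */
                if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
                        data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;

                return data;
        }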
On Wed, May 24, 2023 at 04:14:10PM +0800, Xiaoyao Li wrote:
>On 5/24/2023 2:16 PM, Chao Gao wrote:
>> to avoid computing the supported value at runtime every time.
>>
>> Toggle the ARCH_CAP_SKIP_VMENTRY_L1DFLUSH bit when l1tf_vmx_mitigation
>> is modified to achieve the same result as runtime computing.
>
>It's not the same result.

it is because ...

>In kvm_get_arch_capabilities(), host's value is honored. I.e., when host
>supports ARCH_CAP_SKIP_VMENTRY_L1DFLUSH, l1tf_vmx_mitigation doesn't make any
>difference to the result.

... l1tf_vmx_mitigation should be VMENTER_L1D_FLUSH_NOT_REQUIRED in this
case. l1tf_vmx_mitigation cannot be VMENTER_L1D_FLUSH_NEVER.

>[...]
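Chao's rebuttal maps directly onto the early return in the pre-patch
vmx_setup_l1d_flush(), visible in the code Sean's counter-proposal below
removes; reproduced here with explanatory comments added:

        if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
                u64 msr;

                rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
                if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
                        /* The host (L0) already handles the flush, so the
                         * mitigation state is pinned to NOT_REQUIRED and the
                         * function returns before the module param is ever
                         * applied; VMENTER_L1D_FLUSH_NEVER is unreachable
                         * when the host advertises the bit. */
                        l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
                        return 0;
                }
        }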
On 5/24/2023 4:32 PM, Chao Gao wrote:
> On Wed, May 24, 2023 at 04:14:10PM +0800, Xiaoyao Li wrote:
>> On 5/24/2023 2:16 PM, Chao Gao wrote:
>>> to avoid computing the supported value at runtime every time.
>>>
>>> Toggle the ARCH_CAP_SKIP_VMENTRY_L1DFLUSH bit when l1tf_vmx_mitigation
>>> is modified to achieve the same result as runtime computing.
>>
>> It's not the same result.
>
> it is because ...
>
>> In kvm_get_arch_capabilities(), host's value is honored. I.e., when host
>> supports ARCH_CAP_SKIP_VMENTRY_L1DFLUSH, l1tf_vmx_mitigation doesn't make
>> any difference to the result.
>
> ... l1tf_vmx_mitigation should be VMENTER_L1D_FLUSH_NOT_REQUIRED in this
> case. l1tf_vmx_mitigation cannot be VMENTER_L1D_FLUSH_NEVER.

Yes, you are right. Maybe we can clarify it in the changelog.

Anyway, Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

>>> [...]
On Wed, May 24, 2023, Chao Gao wrote:
> @@ -9532,6 +9532,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
>  		kvm_caps.max_guest_tsc_khz = max;
>  	}
>  	kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
> +	kvm_caps.supported_arch_cap = kvm_get_arch_capabilities();
>  	kvm_init_msr_lists();
>  	return 0;
>
> @@ -11895,7 +11896,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>  	if (r)
>  		goto free_guest_fpu;
>
> -	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
> +	vcpu->arch.arch_capabilities = kvm_caps.supported_arch_cap;
>  	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
>  	kvm_xen_init_vcpu(vcpu);
>  	kvm_vcpu_mtrr_init(vcpu);
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index c544602d07a3..d3e524bcc169 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -29,6 +29,7 @@ struct kvm_caps {
>  	u64 supported_xcr0;
>  	u64 supported_xss;
>  	u64 supported_perf_cap;
> +	u64 supported_arch_cap;

Hrm, I take back my earlier vote about using a dynamic snapshot. "supported"
isn't quite right. KVM always "supports" advertising SKIP_VMENTRY_L1DFLUSH
to the guest. And KVM really does treat the MSR like a CPUID leaf, in that
KVM doesn't sanity check the value coming in from userspace. Whether or not
that's a good idea is debatable, but it is what it is.

The value is more like KVM's current default. Looking at all the uses of
both the default/supported value, and the host MSR, I think it makes more
sense to snapshot the host value than it does to snapshot and update the
default/supported value.

The default value is used only when a vCPU is created and when userspace
does a system-scoped KVM_GET_MSRS, i.e. avoiding the RDMSR is nice, but
making the read super fast isn't necessary, e.g. the overhead of the
boot_cpu_has() and boot_cpu_has_bug() checks is negligible. And if KVM
snapshots the MSR, the other usage of the host value can be cleaned up too.
I'm leaning towards doing this instead of patches [1/3] and [3/3]:

From: Sean Christopherson <seanjc@google.com>
Date: Tue, 6 Jun 2023 09:20:31 -0700
Subject: [PATCH 1/2] KVM: x86: Snapshot host's MSR_IA32_ARCH_CAPABILITIES

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/vmx.c | 22 ++++++----------------
 arch/x86/kvm/x86.c     | 13 +++++++------
 arch/x86/kvm/x86.h     |  1 +
 3 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2d9d155691a7..42d1148f933c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -255,14 +255,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 		return 0;
 	}

-	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
-		u64 msr;
-
-		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
-		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
-			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
-			return 0;
-		}
+	if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+		return 0;
 	}

 	/* If set to auto use the default l1tf mitigation method */
@@ -373,15 +368,10 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)

 static void vmx_setup_fb_clear_ctrl(void)
 {
-	u64 msr;
-
-	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
+	if ((host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
 	    !boot_cpu_has_bug(X86_BUG_MDS) &&
-	    !boot_cpu_has_bug(X86_BUG_TAA)) {
-		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
-		if (msr & ARCH_CAP_FB_CLEAR_CTRL)
-			vmx_fb_clear_ctrl_available = true;
-	}
+	    !boot_cpu_has_bug(X86_BUG_TAA))
+		vmx_fb_clear_ctrl_available = true;
 }

 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7c7be4815eaa..7c2e796fa460 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -237,6 +237,9 @@ EXPORT_SYMBOL_GPL(enable_apicv);
 u64 __read_mostly host_xss;
 EXPORT_SYMBOL_GPL(host_xss);

+u64 __read_mostly host_arch_capabilities;
+EXPORT_SYMBOL_GPL(host_arch_capabilities);
+
 const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
 	KVM_GENERIC_VM_STATS(),
 	STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -1612,12 +1615,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr)

 static u64 kvm_get_arch_capabilities(void)
 {
-	u64 data = 0;
-
-	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
-		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
-		data &= KVM_SUPPORTED_ARCH_CAP;
-	}
+	u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;

 	/*
 	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@ -9492,6 +9490,9 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)

 	kvm_init_pmu_capability(ops->pmu_ops);

+	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+
 	r = ops->hardware_setup();
 	if (r != 0)
 		goto out_mmu_exit;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 82e3dafc5453..1e7be1f6ab29 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -323,6 +323,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);

 extern u64 host_xcr0;
 extern u64 host_xss;
+extern u64 host_arch_capabilities;

 extern struct kvm_caps kvm_caps;

base-commit: 02f1b0b736606f9870595b3089d9c124f9da8be9
to avoid computing the supported value at runtime every time.

Toggle the ARCH_CAP_SKIP_VMENTRY_L1DFLUSH bit when l1tf_vmx_mitigation
is modified to achieve the same result as runtime computing.

Opportunistically, add a comment to document the problem of allowing
changing the supported value of ARCH_CAPABILITIES and the reason why
we don't fix it.

No functional change intended.

Link: https://lore.kernel.org/all/ZGZhW%2Fx5OWPmx1qD@google.com/
Link: https://lore.kernel.org/all/ZGeU9sYTPxqNGSqI@google.com/
Signed-off-by: Chao Gao <chao.gao@intel.com>
---
 arch/x86/kvm/vmx/vmx.c | 25 +++++++++++++++++++++++--
 arch/x86/kvm/x86.c     |  7 ++++---
 arch/x86/kvm/x86.h     |  1 +
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 44fb619803b8..8274ef5e89e5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -309,10 +309,31 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)

 	l1tf_vmx_mitigation = l1tf;

-	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+	/*
+	 * Update static keys and supported arch capabilities according to
+	 * the new mitigation state.
+	 *
+	 * ARCH_CAP_SKIP_VMENTRY_L1DFLUSH is toggled because if we do cache
+	 * flushes for L1 guests on (nested) vmlaunch/vmresume to L2, L1
+	 * guests can skip the flush and if we don't, then L1 guests need
+	 * to do a flush.
+	 *
+	 * Toggling ARCH_CAP_SKIP_VMENTRY_L1DFLUSH may present inconsistent
+	 * model to the guest, e.g., if userspace isn't careful, a VM can
+	 * have vCPUs with different values for ARCH_CAPABILITIES. But
+	 * there is almost no chance to fix the issue. Because, to present
+	 * a consistent model, KVM essentially needs to disallow changing
+	 * the module param after VMs/vCPUs have been created, but that
+	 * would prevent userspace from toggling the param while VMs are
+	 * running, e.g., in response to a new vulnerability.
+	 */
+	if (l1tf != VMENTER_L1D_FLUSH_NEVER) {
 		static_branch_enable(&vmx_l1d_should_flush);
-	else
+		kvm_caps.supported_arch_cap |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+	} else {
 		static_branch_disable(&vmx_l1d_should_flush);
+		kvm_caps.supported_arch_cap &= ~ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+	}

 	if (l1tf == VMENTER_L1D_FLUSH_COND)
 		static_branch_enable(&vmx_l1d_flush_cond);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c0778ca39650..2408b5f554b7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1672,7 +1672,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 {
 	switch (msr->index) {
 	case MSR_IA32_ARCH_CAPABILITIES:
-		msr->data = kvm_get_arch_capabilities();
+		msr->data = kvm_caps.supported_arch_cap;
 		break;
 	case MSR_IA32_PERF_CAPABILITIES:
 		msr->data = kvm_caps.supported_perf_cap;
@@ -7156,7 +7156,7 @@ static void kvm_probe_msr_to_save(u32 msr_index)
 			return;
 		break;
 	case MSR_IA32_TSX_CTRL:
-		if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
+		if (!(kvm_caps.supported_arch_cap & ARCH_CAP_TSX_CTRL_MSR))
 			return;
 		break;
 	default:
@@ -9532,6 +9532,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 		kvm_caps.max_guest_tsc_khz = max;
 	}
 	kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+	kvm_caps.supported_arch_cap = kvm_get_arch_capabilities();
 	kvm_init_msr_lists();
 	return 0;

@@ -11895,7 +11896,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	if (r)
 		goto free_guest_fpu;

-	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
+	vcpu->arch.arch_capabilities = kvm_caps.supported_arch_cap;
 	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
 	kvm_xen_init_vcpu(vcpu);
 	kvm_vcpu_mtrr_init(vcpu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c544602d07a3..d3e524bcc169 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -29,6 +29,7 @@ struct kvm_caps {
 	u64 supported_xcr0;
 	u64 supported_xss;
 	u64 supported_perf_cap;
+	u64 supported_arch_cap;
 };

 void kvm_spurious_fault(void);
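The inconsistency the new comment warns about, made concrete — a
hypothetical timeline (the vmentry_l1d_flush module param is real and
runtime-writable; the specific sequence is illustrative, not taken from the
thread, and assumes a host affected by L1TF so the early return sketched
above does not trigger):

        /*
         * t0: vmentry_l1d_flush=cond; vCPU0 created:
         *     vcpu0->arch.arch_capabilities defaults to a value with
         *     ARCH_CAP_SKIP_VMENTRY_L1DFLUSH set.
         *
         * t1: userspace switches the module param to "never";
         *     vmx_setup_l1d_flush() clears the bit in
         *     kvm_caps.supported_arch_cap.
         *
         * t2: vCPU1 created in the same VM:
         *     vcpu1->arch.arch_capabilities defaults to a value with the
         *     bit clear -- two vCPUs, two different ARCH_CAPABILITIES,
         *     unless userspace explicitly sets the MSR itself.
         */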