Message ID | 1479863680-117511-2-git-send-email-dmatlack@google.com (mailing list archive)
---|---
State | New, archived
On 23/11/2016 02:14, David Matlack wrote:
> The VMX capability MSRs advertise the set of features the KVM virtual
> CPU can support. [...]
>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
> [...]
>
>  	switch (msr_index) {
>  	case MSR_IA32_VMX_BASIC:
> +		return vmx_restore_vmx_basic(vmx, data);
> +	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
> +	case MSR_IA32_VMX_PINBASED_CTLS:
> +	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
> +	case MSR_IA32_VMX_PROCBASED_CTLS:
> +	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
> +	case MSR_IA32_VMX_EXIT_CTLS:
> +	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
> +	case MSR_IA32_VMX_ENTRY_CTLS:

PINBASED_CTLS, PROCBASED_CTLS, EXIT_CTLS and ENTRY_CTLS can be derived
from their "true" counterparts, so I think it's better to remove the
"non-true" ones from struct nested_vmx (and/or add the "true" ones when
missing) and make them entirely computed. But it can be done on top.

Paolo

> [...]
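To make "entirely computed" concrete: per the SDM, a default (non-true) control MSR is its "true" counterpart with the default1 class of bits reported as must-be-1 in the low word, the allowed-1 high word being identical. A hypothetical helper (a sketch, not part of this patch) could look like:

/*
 * Hypothetical sketch: compute a "default" (non-true) control MSR from
 * its "true" counterpart.  The low word additionally reports the
 * default1 bits (e.g. an "always on without true MSR" mask) as
 * must-be-1; the allowed-1 high word is unchanged.
 */
static u64 vmx_default_ctls(u64 true_ctls, u32 default1)
{
	u32 low  = (u32)true_ctls | default1;
	u32 high = (u32)(true_ctls >> 32);

	return low | ((u64)high << 32);
}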
On Wed, Nov 23, 2016 at 3:44 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 23/11/2016 02:14, David Matlack wrote:
>>  	switch (msr_index) {
>>  	case MSR_IA32_VMX_BASIC:
>> +		return vmx_restore_vmx_basic(vmx, data);
>> +	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
>> +	case MSR_IA32_VMX_PINBASED_CTLS:
>> +	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
>> +	case MSR_IA32_VMX_PROCBASED_CTLS:
>> +	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
>> +	case MSR_IA32_VMX_EXIT_CTLS:
>> +	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
>> +	case MSR_IA32_VMX_ENTRY_CTLS:
>
> PINBASED_CTLS, PROCBASED_CTLS, EXIT_CTLS and ENTRY_CTLS can be derived
> from their "true" counterparts, so I think it's better to remove the
> "non-true" ones from struct nested_vmx (and/or add the "true" ones when
> missing) and make them entirely computed. But it can be done on top.

Good point. And that would mean userspace does not need to restore the
non-true MSRs, right? KVM does not emulate MSR_IA32_VMX_BASIC[55]=0,
and will probably never want to.
On 28/11/2016 22:11, David Matlack wrote:
>> PINBASED_CTLS, PROCBASED_CTLS, EXIT_CTLS and ENTRY_CTLS can be derived
>> from their "true" counterparts, so I think it's better to remove the
>> "non-true" ones from struct nested_vmx (and/or add the "true" ones when
>> missing) and make them entirely computed. But it can be done on top.
>
> Good point. And that would mean userspace does not need to restore the
> non-true MSRs, right?

Yes, sorry for being a bit too concise. :)

> KVM does not emulate MSR_IA32_VMX_BASIC[55]=0,
> and will probably never want to.

That's a separate question, MSR_IA32_VMX_BASIC[55]=0 basically means
that the "true" capabilities are the same as the "default" capabilities.
If userspace wanted to set it that way, KVM right now would not hide
the "true" capability MSR, but on the other hand the nested hypervisor
should not even notice the difference.

Paolo
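For reference, bit 55 is VMX_BASIC_TRUE_CTLS in asm/vmx.h; a guest hypervisor is expected to use it to decide which capability MSRs to consult, along these lines (an illustrative sketch only, not code from KVM):

#define VMX_BASIC_TRUE_CTLS	(1ULL << 55)

/* Sketch: which pin-based capability MSR a guest hypervisor reads. */
static u32 pinbased_ctls_msr_index(u64 vmx_basic)
{
	return (vmx_basic & VMX_BASIC_TRUE_CTLS) ?
		MSR_IA32_VMX_TRUE_PINBASED_CTLS :
		MSR_IA32_VMX_PINBASED_CTLS;
}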
On Mon, Nov 28, 2016 at 2:48 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 28/11/2016 22:11, David Matlack wrote:
>>> PINBASED_CTLS, PROCBASED_CTLS, EXIT_CTLS and ENTRY_CTLS can be derived
>>> from their "true" counterparts, so I think it's better to remove the
>>> "non-true" ones from struct nested_vmx (and/or add the "true" ones when
>>> missing) and make them entirely computed. But it can be done on top.
>>
>> Good point. And that would mean userspace does not need to restore the
>> non-true MSRs, right?
>
> Yes, sorry for being a bit too concise. :)

I'll include this cleanup in the next version of the patchset since it
affects which MSRs userspace will restore. It looks like a pretty
simple patch.

>>> KVM does not emulate MSR_IA32_VMX_BASIC[55]=0,
>>> and will probably never want to.
>
> That's a separate question, MSR_IA32_VMX_BASIC[55]=0 basically means
> that the "true" capabilities are the same as the "default" capabilities.
> If userspace wanted to set it that way, KVM right now would not hide
> the "true" capability MSR, but on the other hand the nested hypervisor
> should not even notice the difference.

KVM would also need to use the non-true MSR in place of the true MSRs
when checking VMCS12 during VM-entry.
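For context, the VMCS12 checks David refers to follow roughly this pattern in nested_vmx_run() (a simplified sketch of the 4.9-era code, not the exact lines); supporting BASIC[55]=0 would mean swapping the non-true _low values into these calls:

	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
				vmx->nested.nested_vmx_true_procbased_ctls_low,
				vmx->nested.nested_vmx_procbased_ctls_high) ||
	    !vmx_control_verify(vmcs12->vm_exit_controls,
				vmx->nested.nested_vmx_true_exit_ctls_low,
				vmx->nested.nested_vmx_exit_ctls_high) ||
	    !vmx_control_verify(vmcs12->vm_entry_controls,
				vmx->nested.nested_vmx_true_entry_ctls_low,
				vmx->nested.nested_vmx_entry_ctls_high)) {
		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
		return 1;
	}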
> On Mon, Nov 28, 2016 at 2:48 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> > On 28/11/2016 22:11, David Matlack wrote:
> >> > PINBASED_CTLS, PROCBASED_CTLS, EXIT_CTLS and ENTRY_CTLS can be derived
> >> > from their "true" counterparts, so I think it's better to remove the
> >> > "non-true" ones from struct nested_vmx (and/or add the "true" ones when
> >> > missing) and make them entirely computed. But it can be done on top.
> >>
> >> Good point. And that would mean userspace does not need to restore the
> >> non-true MSRs, right?
> >
> > Yes, sorry for being a bit too concise. :)
>
> I'll include this cleanup in the next version of the patchset since it
> affects which MSRs userspace will restore. It looks like a pretty
> simple patch.

Don't bother removing the "non-true" registers from nested_vmx; you only
need to adjust the userspace API.

> >> KVM does not emulate MSR_IA32_VMX_BASIC[55]=0,
> >> and will probably never want to.
> >
> > That's a separate question, MSR_IA32_VMX_BASIC[55]=0 basically means
> > that the "true" capabilities are the same as the "default" capabilities.
> > If userspace wanted to set it that way, KVM right now would not hide
> > the "true" capability MSR, but on the other hand the nested hypervisor
> > should not even notice the difference.
>
> KVM would also need to use the non-true MSR in place of the true MSRs
> when checking VMCS12 during VM-entry.

It's not necessary, userspace would set the relevant bits to 1 in the true
MSRs, for both the low and high parts. If it doesn't, it's garbage in
garbage out.

Paolo
On Tue, Nov 29, 2016 at 12:01 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> On Mon, Nov 28, 2016 at 2:48 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> > On 28/11/2016 22:11, David Matlack wrote:
>> >> > PINBASED_CTLS, PROCBASED_CTLS, EXIT_CTLS and ENTRY_CTLS can be derived
>> >> > from their "true" counterparts, so I think it's better to remove the
>> >> > "non-true" ones from struct nested_vmx (and/or add the "true" ones when
>> >> > missing) and make them entirely computed. But it can be done on top.
>> >>
>> >> Good point. And that would mean userspace does not need to restore the
>> >> non-true MSRs, right?
>> >
>> > Yes, sorry for being a bit too concise. :)
>>
>> I'll include this cleanup in the next version of the patchset since it
>> affects which MSRs userspace will restore. It looks like a pretty
>> simple patch.
>
> Don't bother removing the "non-true" registers from nested_vmx; you only
> need to adjust the userspace API.

I already wrote the patch, so unless there's an argument against
removing them I'll include it in the next patchset. Thanks!

>> >> KVM does not emulate MSR_IA32_VMX_BASIC[55]=0,
>> >> and will probably never want to.
>> >
>> > That's a separate question, MSR_IA32_VMX_BASIC[55]=0 basically means
>> > that the "true" capabilities are the same as the "default" capabilities.
>> > If userspace wanted to set it that way, KVM right now would not hide
>> > the "true" capability MSR, but on the other hand the nested hypervisor
>> > should not even notice the difference.
>>
>> KVM would also need to use the non-true MSR in place of the true MSRs
>> when checking VMCS12 during VM-entry.
>
> It's not necessary, userspace would set the relevant bits to 1 in the true
> MSRs, for both the low and high parts. If it doesn't, it's garbage in
> garbage out.
>
> Paolo
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index a002b07..a4ca897 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,7 @@
 #define VMX_H
 
 
+#include <linux/bitops.h>
 #include <linux/types.h>
 #include <uapi/asm/vmx.h>
 
@@ -110,6 +111,36 @@
 #define VMX_MISC_SAVE_EFER_LMA			0x00000020
 #define VMX_MISC_ACTIVITY_HLT			0x00000040
 
+static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
+{
+	return vmx_basic & GENMASK_ULL(30, 0);
+}
+
+static inline u32 vmx_basic_vmcs_size(u64 vmx_basic)
+{
+	return (vmx_basic & GENMASK_ULL(44, 32)) >> 32;
+}
+
+static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
+{
+	return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+}
+
+static inline int vmx_misc_cr3_count(u64 vmx_misc)
+{
+	return (vmx_misc & GENMASK_ULL(24, 16)) >> 16;
+}
+
+static inline int vmx_misc_max_msr(u64 vmx_misc)
+{
+	return (vmx_misc & GENMASK_ULL(27, 25)) >> 25;
+}
+
+static inline int vmx_misc_mseg_revid(u64 vmx_misc)
+{
+	return (vmx_misc & GENMASK_ULL(63, 32)) >> 32;
+}
+
 /* VMCS Encodings */
 enum vmcs_field {
 	VIRTUAL_PROCESSOR_ID		= 0x00000000,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5382b82..6ec3832 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -463,6 +463,12 @@ struct nested_vmx {
 	u32 nested_vmx_misc_high;
 	u32 nested_vmx_ept_caps;
 	u32 nested_vmx_vpid_caps;
+	u64 nested_vmx_basic;
+	u64 nested_vmx_cr0_fixed0;
+	u64 nested_vmx_cr0_fixed1;
+	u64 nested_vmx_cr4_fixed0;
+	u64 nested_vmx_cr4_fixed1;
+	u64 nested_vmx_vmcs_enum;
 };
 
 #define POSTED_INTR_ON  0
@@ -2829,6 +2835,36 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
 		VMX_MISC_ACTIVITY_HLT;
 	vmx->nested.nested_vmx_misc_high = 0;
+
+	/*
+	 * This MSR reports some information about VMX support. We
+	 * should return information about the VMX we emulate for the
+	 * guest, and the VMCS structure we give it - not about the
+	 * VMX support of the underlying hardware.
+	 */
+	vmx->nested.nested_vmx_basic =
+		VMCS12_REVISION |
+		VMX_BASIC_TRUE_CTLS |
+		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+
+	if (cpu_has_vmx_basic_inout())
+		vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
+
+	/*
+	 * These MSRs specify bits which the guest must keep fixed (on or off)
+	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+	 * We picked the standard core2 setting.
+	 */
+#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
+	vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
+	vmx->nested.nested_vmx_cr0_fixed1 = -1ULL;
+	vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
+	vmx->nested.nested_vmx_cr4_fixed1 = -1ULL;
+
+	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
+	vmx->nested.nested_vmx_vmcs_enum = 0x2e;
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2844,24 +2880,260 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
 	return low | ((u64)high << 32);
 }
 
-/* Returns 0 on success, non-0 otherwise. */
-static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
+{
+	superset &= mask;
+	subset &= mask;
+
+	return (superset | subset) == superset;
+}
+
+static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
+{
+	const u64 feature_and_reserved =
+		/* feature */
+		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
+		/* reserved */
+		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+	u64 vmx_basic = vmx->nested.nested_vmx_basic;
+
+	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+		return -EINVAL;
+
+	/*
+	 * KVM does not emulate a version of VMX that constrains physical
+	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
+	 */
+	if (data & BIT_ULL(48))
+		return -EINVAL;
+
+	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
+	    vmx_basic_vmcs_revision_id(data))
+		return -EINVAL;
+
+	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
+		return -EINVAL;
+
+	vmx->nested.nested_vmx_basic = data;
+	return 0;
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+	u64 supported;
+	u32 *lowp, *highp;
+
+	switch (msr_index) {
+	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
+		highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
+		break;
+	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+		lowp = &vmx->nested.nested_vmx_true_procbased_ctls_low;
+		highp = &vmx->nested.nested_vmx_procbased_ctls_high;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+		lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
+		highp = &vmx->nested.nested_vmx_procbased_ctls_high;
+		break;
+	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+		lowp = &vmx->nested.nested_vmx_true_exit_ctls_low;
+		highp = &vmx->nested.nested_vmx_exit_ctls_high;
+		break;
+	case MSR_IA32_VMX_EXIT_CTLS:
+		lowp = &vmx->nested.nested_vmx_exit_ctls_low,
+		highp = &vmx->nested.nested_vmx_exit_ctls_high;
+		break;
+	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+		lowp = &vmx->nested.nested_vmx_true_entry_ctls_low;
+		highp = &vmx->nested.nested_vmx_entry_ctls_high;
+		break;
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		lowp = &vmx->nested.nested_vmx_entry_ctls_low;
+		highp = &vmx->nested.nested_vmx_entry_ctls_high;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
+		highp = &vmx->nested.nested_vmx_secondary_ctls_high;
+		break;
+	default:
+		BUG();
+	}
+
+	supported = vmx_control_msr(*lowp, *highp);
+
+	/* Check must-be-1 bits are still 1. */
+	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
+		return -EINVAL;
+
+	/* Check must-be-0 bits are still 0. */
+	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
+		return -EINVAL;
+
+	*lowp = data;
+	*highp = data >> 32;
+	return 0;
+}
+
+static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
+{
+	const u64 feature_and_reserved_bits =
+		/* feature */
+		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
+		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
+		/* reserved */
+		GENMASK_ULL(13, 9) | BIT_ULL(31);
+	u64 vmx_misc;
+
+	vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
+				   vmx->nested.nested_vmx_misc_high);
+
+	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+		return -EINVAL;
+
+	if ((vmx->nested.nested_vmx_pinbased_ctls_high &
+	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
+	    vmx_misc_preemption_timer_rate(data) !=
+	    vmx_misc_preemption_timer_rate(vmx_misc))
+		return -EINVAL;
+
+	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
+		return -EINVAL;
+
+	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
+		return -EINVAL;
+
+	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
+		return -EINVAL;
+
+	vmx->nested.nested_vmx_misc_low = data;
+	vmx->nested.nested_vmx_misc_high = data >> 32;
+	return 0;
+}
+
+static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
+{
+	u64 vmx_ept_vpid_cap;
+
+	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
+					   vmx->nested.nested_vmx_vpid_caps);
+
+	/* Every bit is either reserved or a feature bit. */
+	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
+		return -EINVAL;
+
+	vmx->nested.nested_vmx_ept_caps = data;
+	vmx->nested.nested_vmx_vpid_caps = data >> 32;
+	return 0;
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+	u64 *msr;
+
+	switch (msr_index) {
+	case MSR_IA32_VMX_CR0_FIXED0:
+		msr = &vmx->nested.nested_vmx_cr0_fixed0;
+		break;
+	case MSR_IA32_VMX_CR4_FIXED0:
+		msr = &vmx->nested.nested_vmx_cr4_fixed0;
+		break;
+	default:
+		BUG();
+	}
+
+	/*
+	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
+	 * must be 1 in the restored value.
+	 */
+	if (!is_bitwise_subset(data, *msr, -1ULL))
+		return -EINVAL;
+
+	*msr = data;
+	return 0;
+}
+
+static int vmx_restore_fixed1_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+	u64 *msr;
+
+	switch (msr_index) {
+	case MSR_IA32_VMX_CR0_FIXED1:
+		msr = &vmx->nested.nested_vmx_cr0_fixed1;
+		break;
+	case MSR_IA32_VMX_CR4_FIXED1:
+		msr = &vmx->nested.nested_vmx_cr4_fixed1;
+		break;
+	default:
+		BUG();
+	}
+
+	/*
+	 * 0 bits (which indicates bits which "must-be-0" during VMX operation)
+	 * must be 0 in the restored value.
+	 */
+	if (!is_bitwise_subset(*msr, data, -1ULL))
+		return -EINVAL;
+
+	*msr = data;
+	return 0;
+}
+
+/*
+ * Called when userspace is restoring VMX MSRs.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	switch (msr_index) {
 	case MSR_IA32_VMX_BASIC:
+		return vmx_restore_vmx_basic(vmx, data);
+	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+	case MSR_IA32_VMX_PINBASED_CTLS:
+	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+	case MSR_IA32_VMX_EXIT_CTLS:
+	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+	case MSR_IA32_VMX_ENTRY_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		return vmx_restore_control_msr(vmx, msr_index, data);
+	case MSR_IA32_VMX_MISC:
+		return vmx_restore_vmx_misc(vmx, data);
+	case MSR_IA32_VMX_CR0_FIXED0:
+	case MSR_IA32_VMX_CR4_FIXED0:
+		return vmx_restore_fixed0_msr(vmx, msr_index, data);
+	case MSR_IA32_VMX_CR0_FIXED1:
+	case MSR_IA32_VMX_CR4_FIXED1:
+		return vmx_restore_fixed1_msr(vmx, msr_index, data);
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
+	case MSR_IA32_VMX_VMCS_ENUM:
+		vmx->nested.nested_vmx_vmcs_enum = data;
+		return 0;
+	default:
 		/*
-		 * This MSR reports some information about VMX support. We
-		 * should return information about the VMX we emulate for the
-		 * guest, and the VMCS structure we give it - not about the
-		 * VMX support of the underlying hardware.
+		 * The rest of the VMX capability MSRs do not support restore.
 		 */
-		*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
-			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
-			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
-		if (cpu_has_vmx_basic_inout())
-			*pdata |= VMX_BASIC_INOUT;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Returns 0 on success, non-0 otherwise. */
+static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	switch (msr_index) {
+	case MSR_IA32_VMX_BASIC:
+		*pdata = vmx->nested.nested_vmx_basic;
 		break;
 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 	case MSR_IA32_VMX_PINBASED_CTLS:
@@ -2904,27 +3176,20 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 			vmx->nested.nested_vmx_misc_low,
 			vmx->nested.nested_vmx_misc_high);
 		break;
-	/*
-	 * These MSRs specify bits which the guest must keep fixed (on or off)
-	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
-	 * We picked the standard core2 setting.
-	 */
-#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
-#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
 	case MSR_IA32_VMX_CR0_FIXED0:
-		*pdata = VMXON_CR0_ALWAYSON;
+		*pdata = vmx->nested.nested_vmx_cr0_fixed0;
 		break;
 	case MSR_IA32_VMX_CR0_FIXED1:
-		*pdata = -1ULL;
+		*pdata = vmx->nested.nested_vmx_cr0_fixed1;
 		break;
 	case MSR_IA32_VMX_CR4_FIXED0:
-		*pdata = VMXON_CR4_ALWAYSON;
+		*pdata = vmx->nested.nested_vmx_cr4_fixed0;
 		break;
 	case MSR_IA32_VMX_CR4_FIXED1:
-		*pdata = -1ULL;
+		*pdata = vmx->nested.nested_vmx_cr4_fixed1;
 		break;
 	case MSR_IA32_VMX_VMCS_ENUM:
-		*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
+		*pdata = vmx->nested.nested_vmx_vmcs_enum;
 		break;
 	case MSR_IA32_VMX_PROCBASED_CTLS2:
 		*pdata = vmx_control_msr(
@@ -3107,7 +3372,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vmx_leave_nested(vcpu);
 		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
-		return 1; /* they are read-only */
+		if (!msr_info->host_initiated)
+			return 1; /* they are read-only */
+		if (!nested_vmx_allowed(vcpu))
+			return 1;
+		return vmx_set_vmx_msr(vcpu, msr_index, data);
 	case MSR_IA32_XSS:
 		if (!vmx_xsaves_supported())
 			return 1;
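The audits above all funnel through is_bitwise_subset(). A worked example of the control-MSR rule in vmx_restore_control_msr(), with invented values:

/* Invented example values, for illustration only. */
u64 supported = vmx_control_msr(0x00000016, 0x0000007f); /* KVM's caps */
u64 fewer     = vmx_control_msr(0x00000016, 0x0000001f); /* drops some allowed-1 bits */
u64 broken    = vmx_control_msr(0x00000002, 0x0000007f); /* clears a must-be-1 bit */

/* Must-be-1 (low word) bits may be added but never removed: */
is_bitwise_subset(fewer,  supported, GENMASK_ULL(31, 0)); /* true  -> accepted */
is_bitwise_subset(broken, supported, GENMASK_ULL(31, 0)); /* false -> -EINVAL  */

/* Allowed-1 (high word) bits may be removed but never added: */
is_bitwise_subset(supported, fewer, GENMASK_ULL(63, 32)); /* true  -> accepted */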
The VMX capability MSRs advertise the set of features the KVM virtual
CPU can support. This set of features varies across different host CPUs
and KVM versions. This patch aims to address both sources of
differences, allowing VMs to be migrated across CPUs and KVM versions
without guest-visible changes to these MSRs. Note that cross-KVM-version
migration is only supported from this point forward.

When the VMX capability MSRs are restored, they are audited to check
that the set of features advertised is a subset of what KVM and the
CPU support.

Since the VMX capability MSRs are read-only, they do not need to be on
the default MSR save/restore lists. The userspace hypervisor can set
the values of these MSRs or read them from KVM at VCPU creation time,
and restore the same value after every save/restore.

Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/include/asm/vmx.h |  31 +++++
 arch/x86/kvm/vmx.c         | 317 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 324 insertions(+), 24 deletions(-)
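The save/restore flow described in the last paragraph would look roughly like this in userspace (a sketch, not taken from any existing VMM; it assumes a nested-enabled VCPU fd and shows only two of the capability MSRs, with indices as defined in the SDM):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

#define MSR_IA32_VMX_BASIC	0x480	/* per SDM */
#define MSR_IA32_VMX_MISC	0x485	/* per SDM */

static struct {
	struct kvm_msrs hdr;
	struct kvm_msr_entry entries[2];
} vmx_caps;

/* At VCPU creation: read KVM's VMX capability MSRs. */
static void save_vmx_caps(int vcpu_fd)
{
	memset(&vmx_caps, 0, sizeof(vmx_caps));
	vmx_caps.hdr.nmsrs = 2;
	vmx_caps.entries[0].index = MSR_IA32_VMX_BASIC;
	vmx_caps.entries[1].index = MSR_IA32_VMX_MISC;
	ioctl(vcpu_fd, KVM_GET_MSRS, &vmx_caps);
}

/*
 * After every save/restore: write the same values back.  KVM audits
 * them and rejects anything that is not a subset of its own caps.
 */
static int restore_vmx_caps(int vcpu_fd)
{
	return ioctl(vcpu_fd, KVM_SET_MSRS, &vmx_caps);
}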