Message ID | 20160616060531.30028-4-haozhong.zhang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 16/06/2016 08:05, Haozhong Zhang wrote: > From: Ashok Raj <ashok.raj@intel.com> > > On Intel platforms, this patch adds LMCE to KVM MCE supported > capabilities and handles guest access to LMCE related MSRs. > > Signed-off-by: Ashok Raj <ashok.raj@intel.com> > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported > Only enable LMCE on Intel platform > Check MSR_IA32_FEATURE_CONTROL when handling guest > access to MSR_IA32_MCG_EXT_CTL] > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > --- > arch/x86/include/asm/kvm_host.h | 5 +++++ > arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++++++++++++++- > arch/x86/kvm/x86.c | 15 +++++++++------ > 3 files changed, 49 insertions(+), 7 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index e0fbe7e..75defa6 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -598,6 +598,7 @@ struct kvm_vcpu_arch { > u64 mcg_cap; > u64 mcg_status; > u64 mcg_ctl; > + u64 mcg_ext_ctl; > u64 *mce_banks; > > /* Cache MMIO info */ > @@ -1005,6 +1006,8 @@ struct kvm_x86_ops { > int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, > uint32_t guest_irq, bool set); > void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); > + > + void (*setup_mce)(struct kvm_vcpu *vcpu); > }; > > struct kvm_arch_async_pf { > @@ -1077,6 +1080,8 @@ extern u8 kvm_tsc_scaling_ratio_frac_bits; > /* maximum allowed value of TSC scaling ratio */ > extern u64 kvm_max_tsc_scaling_ratio; > > +extern u64 kvm_mce_cap_supported; > + > enum emulation_result { > EMULATE_DONE, /* no further processing */ > EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 1dc89c5..42db42e 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -638,7 +638,7 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) > * feature_control_valid_bits_add/del(), so it's not included here. > */ > #define FEATURE_CONTROL_MAX_VALID_BITS \ > - FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX > + (FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX | FEATURE_CONTROL_LMCE) > > static void feature_control_valid_bits_add(struct kvm_vcpu *vcpu, uint64_t bits) > { > @@ -2905,6 +2905,15 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, > return valid_bits && !(val & ~valid_bits); > } > > +static inline bool vmx_mcg_ext_ctrl_msr_present(struct kvm_vcpu *vcpu, > + bool host_initiated) > +{ > + return (vcpu->arch.mcg_cap & MCG_LMCE_P) && Checking MCG_LMCE_P is unnecessary, because you cannot set FEATURE_CONTROL_LMCE unless MCG_LMCE_P is present. You can just inline this function in the callers, it's simpler. > + (host_initiated || > + (to_vmx(vcpu)->msr_ia32_feature_control & > + FEATURE_CONTROL_LMCE)); > +} > + > /* > * Reads an msr value (of 'msr_index') into 'pdata'. > * Returns 0 on success, non-0 otherwise. > @@ -2946,6 +2955,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > return 1; > msr_info->data = vmcs_read64(GUEST_BNDCFGS); > break; > + case MSR_IA32_MCG_EXT_CTL: > + if (!vmx_mcg_ext_ctrl_msr_present(vcpu, > + msr_info->host_initiated)) > + return 1; > + msr_info->data = vcpu->arch.mcg_ext_ctl; > + break; > case MSR_IA32_FEATURE_CONTROL: > if (!vmx_feature_control_msr_valid(vcpu, 0)) > return 1; > @@ -3039,6 +3054,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > case MSR_IA32_TSC_ADJUST: > ret = kvm_set_msr_common(vcpu, msr_info); > break; > + case MSR_IA32_MCG_EXT_CTL: > + if (!vmx_mcg_ext_ctrl_msr_present(vcpu, > + msr_info->host_initiated) || > + (data & ~MCG_EXT_CTL_LMCE_EN)) > + return 1; > + vcpu->arch.mcg_ext_ctl = data; > + break; > case MSR_IA32_FEATURE_CONTROL: > if (!vmx_feature_control_msr_valid(vcpu, data) || > (to_vmx(vcpu)->msr_ia32_feature_control & > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void) > > kvm_set_posted_intr_wakeup_handler(wakeup_handler); > > + kvm_mce_cap_supported |= MCG_LMCE_P; Ah, so virtual LMCE is available on all processors! This is interesting, but it also makes it more complicated to handle in QEMU; a new QEMU generally doesn't require a new kernel. Eduardo, any ideas? Thanks, Paolo > return alloc_kvm_area(); > > out8: > @@ -10950,6 +10974,14 @@ out: > return ret; > } > > +static void vmx_setup_mce(struct kvm_vcpu *vcpu) > +{ > + if (vcpu->arch.mcg_cap & MCG_LMCE_P) > + feature_control_valid_bits_add(vcpu, FEATURE_CONTROL_LMCE); > + else > + feature_control_valid_bits_del(vcpu, FEATURE_CONTROL_LMCE); > +} > + > static struct kvm_x86_ops vmx_x86_ops = { > .cpu_has_kvm_support = cpu_has_kvm_support, > .disabled_by_bios = vmx_disabled_by_bios, > @@ -11074,6 +11106,8 @@ static struct kvm_x86_ops vmx_x86_ops = { > .pmu_ops = &intel_pmu_ops, > > .update_pi_irte = vmx_update_pi_irte, > + > + .setup_mce = vmx_setup_mce, > }; > > static int __init vmx_init(void) > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index bf22721..5bf76ab 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -70,7 +70,8 @@ > > #define MAX_IO_MSRS 256 > #define KVM_MAX_MCE_BANKS 32 > -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) > +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; > +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); > > #define emul_to_vcpu(ctxt) \ > container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) > @@ -983,6 +984,7 @@ static u32 emulated_msrs[] = { > MSR_IA32_MISC_ENABLE, > MSR_IA32_MCG_STATUS, > MSR_IA32_MCG_CTL, > + MSR_IA32_MCG_EXT_CTL, > MSR_IA32_SMBASE, > }; > > @@ -2684,11 +2686,9 @@ long kvm_arch_dev_ioctl(struct file *filp, > break; > } > case KVM_X86_GET_MCE_CAP_SUPPORTED: { > - u64 mce_cap; > - > - mce_cap = KVM_MCE_CAP_SUPPORTED; > r = -EFAULT; > - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) > + if (copy_to_user(argp, &kvm_mce_cap_supported, > + sizeof(kvm_mce_cap_supported))) > goto out; > r = 0; > break; > @@ -2866,7 +2866,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, > r = -EINVAL; > if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) > goto out; > - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) > + if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) > goto out; > r = 0; > vcpu->arch.mcg_cap = mcg_cap; > @@ -2876,6 +2876,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, > /* Init IA32_MCi_CTL to all 1s */ > for (bank = 0; bank < bank_num; bank++) > vcpu->arch.mce_banks[bank*4] = ~(u64)0; > + > + if (kvm_x86_ops->setup_mce) > + kvm_x86_ops->setup_mce(vcpu); > out: > return r; > } > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/16/16 12:04, Paolo Bonzini wrote: > > > On 16/06/2016 08:05, Haozhong Zhang wrote: > > From: Ashok Raj <ashok.raj@intel.com> > > > > On Intel platforms, this patch adds LMCE to KVM MCE supported > > capabilities and handles guest access to LMCE related MSRs. > > > > Signed-off-by: Ashok Raj <ashok.raj@intel.com> > > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported > > Only enable LMCE on Intel platform > > Check MSR_IA32_FEATURE_CONTROL when handling guest > > access to MSR_IA32_MCG_EXT_CTL] > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > --- > > arch/x86/include/asm/kvm_host.h | 5 +++++ > > arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++++++++++++++- > > arch/x86/kvm/x86.c | 15 +++++++++------ > > 3 files changed, 49 insertions(+), 7 deletions(-) > > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > > index e0fbe7e..75defa6 100644 > > --- a/arch/x86/include/asm/kvm_host.h > > +++ b/arch/x86/include/asm/kvm_host.h > > @@ -598,6 +598,7 @@ struct kvm_vcpu_arch { > > u64 mcg_cap; > > u64 mcg_status; > > u64 mcg_ctl; > > + u64 mcg_ext_ctl; > > u64 *mce_banks; > > > > /* Cache MMIO info */ > > @@ -1005,6 +1006,8 @@ struct kvm_x86_ops { > > int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, > > uint32_t guest_irq, bool set); > > void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); > > + > > + void (*setup_mce)(struct kvm_vcpu *vcpu); > > }; > > > > struct kvm_arch_async_pf { > > @@ -1077,6 +1080,8 @@ extern u8 kvm_tsc_scaling_ratio_frac_bits; > > /* maximum allowed value of TSC scaling ratio */ > > extern u64 kvm_max_tsc_scaling_ratio; > > > > +extern u64 kvm_mce_cap_supported; > > + > > enum emulation_result { > > EMULATE_DONE, /* no further processing */ > > EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > > index 1dc89c5..42db42e 100644 > > --- a/arch/x86/kvm/vmx.c > > +++ b/arch/x86/kvm/vmx.c > > @@ -638,7 +638,7 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) > > * feature_control_valid_bits_add/del(), so it's not included here. > > */ > > #define FEATURE_CONTROL_MAX_VALID_BITS \ > > - FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX > > + (FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX | FEATURE_CONTROL_LMCE) > > > > static void feature_control_valid_bits_add(struct kvm_vcpu *vcpu, uint64_t bits) > > { > > @@ -2905,6 +2905,15 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, > > return valid_bits && !(val & ~valid_bits); > > } > > > > +static inline bool vmx_mcg_ext_ctrl_msr_present(struct kvm_vcpu *vcpu, > > + bool host_initiated) > > +{ > > + return (vcpu->arch.mcg_cap & MCG_LMCE_P) && > > Checking MCG_LMCE_P is unnecessary, because you cannot set > FEATURE_CONTROL_LMCE unless MCG_LMCE_P is present. > > You can just inline this function in the callers, it's simpler. I'll remove the first line check and inline the other parts. > > > + (host_initiated || > > + (to_vmx(vcpu)->msr_ia32_feature_control & > > + FEATURE_CONTROL_LMCE)); > > +} > > + > > /* > > * Reads an msr value (of 'msr_index') into 'pdata'. > > * Returns 0 on success, non-0 otherwise. > > @@ -2946,6 +2955,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > > return 1; > > msr_info->data = vmcs_read64(GUEST_BNDCFGS); > > break; > > + case MSR_IA32_MCG_EXT_CTL: > > + if (!vmx_mcg_ext_ctrl_msr_present(vcpu, > > + msr_info->host_initiated)) > > + return 1; > > + msr_info->data = vcpu->arch.mcg_ext_ctl; > > + break; > > case MSR_IA32_FEATURE_CONTROL: > > if (!vmx_feature_control_msr_valid(vcpu, 0)) > > return 1; > > @@ -3039,6 +3054,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > > case MSR_IA32_TSC_ADJUST: > > ret = kvm_set_msr_common(vcpu, msr_info); > > break; > > + case MSR_IA32_MCG_EXT_CTL: > > + if (!vmx_mcg_ext_ctrl_msr_present(vcpu, > > + msr_info->host_initiated) || > > + (data & ~MCG_EXT_CTL_LMCE_EN)) > > + return 1; > > + vcpu->arch.mcg_ext_ctl = data; > > + break; > > case MSR_IA32_FEATURE_CONTROL: > > if (!vmx_feature_control_msr_valid(vcpu, data) || > > (to_vmx(vcpu)->msr_ia32_feature_control & > > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void) > > > > kvm_set_posted_intr_wakeup_handler(wakeup_handler); > > > > + kvm_mce_cap_supported |= MCG_LMCE_P; > > Ah, so virtual LMCE is available on all processors! This is > interesting, but it also makes it more complicated to handle in QEMU; a > new QEMU generally doesn't require a new kernel. > My QMEU patch checks KVM MCE capabilities before enabling LMCE (See lmce_supported() and mce_init() in QEMU Patch 1), so either new QEMU on old kernel or old QEMU on new kernel will work. Haozhong > Eduardo, any ideas? > > Thanks, > > Paolo > > > return alloc_kvm_area(); > > > > out8: > > @@ -10950,6 +10974,14 @@ out: > > return ret; > > } > > > > +static void vmx_setup_mce(struct kvm_vcpu *vcpu) > > +{ > > + if (vcpu->arch.mcg_cap & MCG_LMCE_P) > > + feature_control_valid_bits_add(vcpu, FEATURE_CONTROL_LMCE); > > + else > > + feature_control_valid_bits_del(vcpu, FEATURE_CONTROL_LMCE); > > +} > > + > > static struct kvm_x86_ops vmx_x86_ops = { > > .cpu_has_kvm_support = cpu_has_kvm_support, > > .disabled_by_bios = vmx_disabled_by_bios, > > @@ -11074,6 +11106,8 @@ static struct kvm_x86_ops vmx_x86_ops = { > > .pmu_ops = &intel_pmu_ops, > > > > .update_pi_irte = vmx_update_pi_irte, > > + > > + .setup_mce = vmx_setup_mce, > > }; > > > > static int __init vmx_init(void) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index bf22721..5bf76ab 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -70,7 +70,8 @@ > > > > #define MAX_IO_MSRS 256 > > #define KVM_MAX_MCE_BANKS 32 > > -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) > > +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; > > +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); > > > > #define emul_to_vcpu(ctxt) \ > > container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) > > @@ -983,6 +984,7 @@ static u32 emulated_msrs[] = { > > MSR_IA32_MISC_ENABLE, > > MSR_IA32_MCG_STATUS, > > MSR_IA32_MCG_CTL, > > + MSR_IA32_MCG_EXT_CTL, > > MSR_IA32_SMBASE, > > }; > > > > @@ -2684,11 +2686,9 @@ long kvm_arch_dev_ioctl(struct file *filp, > > break; > > } > > case KVM_X86_GET_MCE_CAP_SUPPORTED: { > > - u64 mce_cap; > > - > > - mce_cap = KVM_MCE_CAP_SUPPORTED; > > r = -EFAULT; > > - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) > > + if (copy_to_user(argp, &kvm_mce_cap_supported, > > + sizeof(kvm_mce_cap_supported))) > > goto out; > > r = 0; > > break; > > @@ -2866,7 +2866,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, > > r = -EINVAL; > > if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) > > goto out; > > - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) > > + if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) > > goto out; > > r = 0; > > vcpu->arch.mcg_cap = mcg_cap; > > @@ -2876,6 +2876,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, > > /* Init IA32_MCi_CTL to all 1s */ > > for (bank = 0; bank < bank_num; bank++) > > vcpu->arch.mce_banks[bank*4] = ~(u64)0; > > + > > + if (kvm_x86_ops->setup_mce) > > + kvm_x86_ops->setup_mce(vcpu); > > out: > > return r; > > } > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jun 16, 2016 at 12:04:50PM +0200, Paolo Bonzini wrote: > On 16/06/2016 08:05, Haozhong Zhang wrote: > > From: Ashok Raj <ashok.raj@intel.com> > > > > On Intel platforms, this patch adds LMCE to KVM MCE supported > > capabilities and handles guest access to LMCE related MSRs. > > > > Signed-off-by: Ashok Raj <ashok.raj@intel.com> > > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported > > Only enable LMCE on Intel platform > > Check MSR_IA32_FEATURE_CONTROL when handling guest > > access to MSR_IA32_MCG_EXT_CTL] > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> [...] > > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void) > > > > kvm_set_posted_intr_wakeup_handler(wakeup_handler); > > > > + kvm_mce_cap_supported |= MCG_LMCE_P; > > Ah, so virtual LMCE is available on all processors! This is > interesting, but it also makes it more complicated to handle in QEMU; a > new QEMU generally doesn't require a new kernel. > > Eduardo, any ideas? (CCing libvirt list) As we shouldn't make machine-type changes introduce new host requirements, it looks like we need to either add a new set of CPU models (unreasonable), or expect management software to explicitly enable LMCE after ensuring the host supports it. Or we could wait for a reasonable time after the feature is available in the kernel, and declare that QEMU as a whole requires a newer kernel. But how much time would be reasonable for that? Long term, I believe we should think of a better solution. I don't think it is reasonable to require new libvirt code to be written for every single low-level feature that requires a newer kernel or newer host hardware. Maybe new introspection interfaces that would allow us to drop the "no new requirements on machine-type changes" rule?
On 06/16/16 11:55, Eduardo Habkost wrote: > On Thu, Jun 16, 2016 at 12:04:50PM +0200, Paolo Bonzini wrote: > > On 16/06/2016 08:05, Haozhong Zhang wrote: > > > From: Ashok Raj <ashok.raj@intel.com> > > > > > > On Intel platforms, this patch adds LMCE to KVM MCE supported > > > capabilities and handles guest access to LMCE related MSRs. > > > > > > Signed-off-by: Ashok Raj <ashok.raj@intel.com> > > > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported > > > Only enable LMCE on Intel platform > > > Check MSR_IA32_FEATURE_CONTROL when handling guest > > > access to MSR_IA32_MCG_EXT_CTL] > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > [...] > > > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void) > > > > > > kvm_set_posted_intr_wakeup_handler(wakeup_handler); > > > > > > + kvm_mce_cap_supported |= MCG_LMCE_P; > > > > Ah, so virtual LMCE is available on all processors! This is > > interesting, but it also makes it more complicated to handle in QEMU; a > > new QEMU generally doesn't require a new kernel. > > > > Eduardo, any ideas? > > (CCing libvirt list) > > As we shouldn't make machine-type changes introduce new host > requirements, it looks like we need to either add a new set of > CPU models (unreasonable), or expect management software to > explicitly enable LMCE after ensuring the host supports it. > > Or we could wait for a reasonable time after the feature is > available in the kernel, and declare that QEMU as a whole > requires a newer kernel. But how much time would be reasonable > for that? > > Long term, I believe we should think of a better solution. I > don't think it is reasonable to require new libvirt code to be > written for every single low-level feature that requires a newer > kernel or newer host hardware. Maybe new introspection interfaces > that would allow us to drop the "no new requirements on > machine-type changes" rule? > Because new MSR (MSR_IA32_MCG_EXT_CTL) and new MSR bit (FEATURE_CONTROL_LMCE in MSR_IA32_FEATURE_CONTROL) are introduced by LMCE, QEMU requires new KVM which can handle those changes. I'm not familiar with libvirt. Does the requirement of new KVM capability bring any troubles to libvirt? Thanks, Haozhong -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jun 17, 2016 at 09:11:16AM +0800, Haozhong Zhang wrote: > On 06/16/16 11:55, Eduardo Habkost wrote: > > On Thu, Jun 16, 2016 at 12:04:50PM +0200, Paolo Bonzini wrote: > > > On 16/06/2016 08:05, Haozhong Zhang wrote: > > > > From: Ashok Raj <ashok.raj@intel.com> > > > > > > > > On Intel platforms, this patch adds LMCE to KVM MCE supported > > > > capabilities and handles guest access to LMCE related MSRs. > > > > > > > > Signed-off-by: Ashok Raj <ashok.raj@intel.com> > > > > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported > > > > Only enable LMCE on Intel platform > > > > Check MSR_IA32_FEATURE_CONTROL when handling guest > > > > access to MSR_IA32_MCG_EXT_CTL] > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > [...] > > > > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void) > > > > > > > > kvm_set_posted_intr_wakeup_handler(wakeup_handler); > > > > > > > > + kvm_mce_cap_supported |= MCG_LMCE_P; > > > > > > Ah, so virtual LMCE is available on all processors! This is > > > interesting, but it also makes it more complicated to handle in QEMU; a > > > new QEMU generally doesn't require a new kernel. > > > > > > Eduardo, any ideas? > > > > (CCing libvirt list) > > > > As we shouldn't make machine-type changes introduce new host > > requirements, it looks like we need to either add a new set of > > CPU models (unreasonable), or expect management software to > > explicitly enable LMCE after ensuring the host supports it. > > > > Or we could wait for a reasonable time after the feature is > > available in the kernel, and declare that QEMU as a whole > > requires a newer kernel. But how much time would be reasonable > > for that? > > > > Long term, I believe we should think of a better solution. I > > don't think it is reasonable to require new libvirt code to be > > written for every single low-level feature that requires a newer > > kernel or newer host hardware. Maybe new introspection interfaces > > that would allow us to drop the "no new requirements on > > machine-type changes" rule? > > > > Because new MSR (MSR_IA32_MCG_EXT_CTL) and new MSR bit > (FEATURE_CONTROL_LMCE in MSR_IA32_FEATURE_CONTROL) are introduced by > LMCE, QEMU requires new KVM which can handle those changes. If I understood correctly, you are describing the second option above (declaring that QEMU as a whole requires a newer kernel). > > I'm not familiar with libvirt. Does the requirement of new KVM > capability bring any troubles to libvirt? It does, assuming that we still support running QEMU under an older kernel where KVM doesn't LMCE. In this case, the pc-2.6 machine-type will run, but the pc-2.7 machine-type won't. The requirement of new KVM capabilities based on the machine-type is a problem for livirt. libvirt have some host-capabilities APIs to allow software to check if the VM can be run on (or migrated to) a host, but none of them are based on machine-type. This is not necessarily specific to libvirt: people may have their own configuration or scripts that use the default "pc" alias, and a QEMU upgrade shouldn't break their configuration.
On 06/17/16 14:15, Eduardo Habkost wrote: > On Fri, Jun 17, 2016 at 09:11:16AM +0800, Haozhong Zhang wrote: > > On 06/16/16 11:55, Eduardo Habkost wrote: > > > On Thu, Jun 16, 2016 at 12:04:50PM +0200, Paolo Bonzini wrote: > > > > On 16/06/2016 08:05, Haozhong Zhang wrote: > > > > > From: Ashok Raj <ashok.raj@intel.com> > > > > > > > > > > On Intel platforms, this patch adds LMCE to KVM MCE supported > > > > > capabilities and handles guest access to LMCE related MSRs. > > > > > > > > > > Signed-off-by: Ashok Raj <ashok.raj@intel.com> > > > > > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported > > > > > Only enable LMCE on Intel platform > > > > > Check MSR_IA32_FEATURE_CONTROL when handling guest > > > > > access to MSR_IA32_MCG_EXT_CTL] > > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > > [...] > > > > > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void) > > > > > > > > > > kvm_set_posted_intr_wakeup_handler(wakeup_handler); > > > > > > > > > > + kvm_mce_cap_supported |= MCG_LMCE_P; > > > > > > > > Ah, so virtual LMCE is available on all processors! This is > > > > interesting, but it also makes it more complicated to handle in QEMU; a > > > > new QEMU generally doesn't require a new kernel. > > > > > > > > Eduardo, any ideas? > > > > > > (CCing libvirt list) > > > > > > As we shouldn't make machine-type changes introduce new host > > > requirements, it looks like we need to either add a new set of > > > CPU models (unreasonable), or expect management software to > > > explicitly enable LMCE after ensuring the host supports it. > > > > > > Or we could wait for a reasonable time after the feature is > > > available in the kernel, and declare that QEMU as a whole > > > requires a newer kernel. But how much time would be reasonable > > > for that? > > > > > > Long term, I believe we should think of a better solution. I > > > don't think it is reasonable to require new libvirt code to be > > > written for every single low-level feature that requires a newer > > > kernel or newer host hardware. Maybe new introspection interfaces > > > that would allow us to drop the "no new requirements on > > > machine-type changes" rule? > > > > > > > Because new MSR (MSR_IA32_MCG_EXT_CTL) and new MSR bit > > (FEATURE_CONTROL_LMCE in MSR_IA32_FEATURE_CONTROL) are introduced by > > LMCE, QEMU requires new KVM which can handle those changes. > > If I understood correctly, you are describing the second option > above (declaring that QEMU as a whole requires a newer kernel). > > > > > I'm not familiar with libvirt. Does the requirement of new KVM > > capability bring any troubles to libvirt? > > It does, assuming that we still support running QEMU under an > older kernel where KVM doesn't LMCE. In this case, the pc-2.6 > machine-type will run, but the pc-2.7 machine-type won't. > > The requirement of new KVM capabilities based on the machine-type > is a problem for livirt. libvirt have some host-capabilities APIs > to allow software to check if the VM can be run on (or migrated > to) a host, but none of them are based on machine-type. > > This is not necessarily specific to libvirt: people may have > their own configuration or scripts that use the default "pc" > alias, and a QEMU upgrade shouldn't break their configuration. > Thanks for the explanation! If we disable LMCE in QEMU by default (even for -cpu host), will it still be a problem? That is, - pc-2.7 can continue to run on old kernels unless users explicitly require LMCE - existing libvirt VM configurations can continue to work on pc-2.7 because LMCE is not specified in those configurations and are disabled by default (i.e. no requirement for new kernels) - existing QEMU configurations/scripts using pc alias can continue to work on pc-27 for the same reason above. Thanks, Haozhong -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 20/06/2016 03:49, Haozhong Zhang wrote: > Thanks for the explanation! > > If we disable LMCE in QEMU by default (even for -cpu host), will it > still be a problem? That is, > > - pc-2.7 can continue to run on old kernels unless users explicitly > require LMCE > > - existing libvirt VM configurations can continue to work on pc-2.7 > because LMCE is not specified in those configurations and are > disabled by default (i.e. no requirement for new kernels) > > - existing QEMU configurations/scripts using pc alias can continue to > work on pc-27 for the same reason above. Yes, that would be fine. "-cpu host" can leave LMCE enabled if supported by the kernel. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e0fbe7e..75defa6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -598,6 +598,7 @@ struct kvm_vcpu_arch { u64 mcg_cap; u64 mcg_status; u64 mcg_ctl; + u64 mcg_ext_ctl; u64 *mce_banks; /* Cache MMIO info */ @@ -1005,6 +1006,8 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + + void (*setup_mce)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1077,6 +1080,8 @@ extern u8 kvm_tsc_scaling_ratio_frac_bits; /* maximum allowed value of TSC scaling ratio */ extern u64 kvm_max_tsc_scaling_ratio; +extern u64 kvm_mce_cap_supported; + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1dc89c5..42db42e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -638,7 +638,7 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) * feature_control_valid_bits_add/del(), so it's not included here. */ #define FEATURE_CONTROL_MAX_VALID_BITS \ - FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX + (FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX | FEATURE_CONTROL_LMCE) static void feature_control_valid_bits_add(struct kvm_vcpu *vcpu, uint64_t bits) { @@ -2905,6 +2905,15 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, return valid_bits && !(val & ~valid_bits); } +static inline bool vmx_mcg_ext_ctrl_msr_present(struct kvm_vcpu *vcpu, + bool host_initiated) +{ + return (vcpu->arch.mcg_cap & MCG_LMCE_P) && + (host_initiated || + (to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -2946,6 +2955,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; + case MSR_IA32_MCG_EXT_CTL: + if (!vmx_mcg_ext_ctrl_msr_present(vcpu, + msr_info->host_initiated)) + return 1; + msr_info->data = vcpu->arch.mcg_ext_ctl; + break; case MSR_IA32_FEATURE_CONTROL: if (!vmx_feature_control_msr_valid(vcpu, 0)) return 1; @@ -3039,6 +3054,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_MCG_EXT_CTL: + if (!vmx_mcg_ext_ctrl_msr_present(vcpu, + msr_info->host_initiated) || + (data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = data; + break; case MSR_IA32_FEATURE_CONTROL: if (!vmx_feature_control_msr_valid(vcpu, data) || (to_vmx(vcpu)->msr_ia32_feature_control & @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void) kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_mce_cap_supported |= MCG_LMCE_P; + return alloc_kvm_area(); out8: @@ -10950,6 +10974,14 @@ out: return ret; } +static void vmx_setup_mce(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + feature_control_valid_bits_add(vcpu, FEATURE_CONTROL_LMCE); + else + feature_control_valid_bits_del(vcpu, FEATURE_CONTROL_LMCE); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -11074,6 +11106,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, + + .setup_mce = vmx_setup_mce, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf22721..5bf76ab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -70,7 +70,8 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) @@ -983,6 +984,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, }; @@ -2684,11 +2686,9 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_X86_GET_MCE_CAP_SUPPORTED: { - u64 mce_cap; - - mce_cap = KVM_MCE_CAP_SUPPORTED; r = -EFAULT; - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + if (copy_to_user(argp, &kvm_mce_cap_supported, + sizeof(kvm_mce_cap_supported))) goto out; r = 0; break; @@ -2866,7 +2866,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; @@ -2876,6 +2876,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; + + if (kvm_x86_ops->setup_mce) + kvm_x86_ops->setup_mce(vcpu); out: return r; }