Message ID | 1368939152-11406-1-git-send-email-jun.nakajima@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Il 19/05/2013 06:52, Jun Nakajima ha scritto: > From: Nadav Har'El <nyh@il.ibm.com> > > Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 > switch the EFER MSR when EPT is used and the host and guest have different > NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2) > and want to be able to run recent KVM as L1, we need to allow L1 to use this > EFER switching feature. > > To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available, > and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds > support for the former (the latter is still unsupported). > > Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state, > respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all > that's left to do in this patch is to properly advertise this feature to L1. > > Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using > vmx_set_efer (which itself sets one of several vmcs02 fields), so we always > support this feature, regardless of whether the host supports it. > > Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > --- > arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- > 1 file changed, 16 insertions(+), 7 deletions(-) > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 260a919..fb9cae5 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > #else > nested_vmx_exit_ctls_high = 0; > #endif > - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; > + nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | > + VM_EXIT_LOAD_IA32_EFER); > > /* entry controls */ > rdmsr(MSR_IA32_VMX_ENTRY_CTLS, > @@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > nested_vmx_entry_ctls_high &= > VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; > - nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > - > + nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | > + VM_ENTRY_LOAD_IA32_EFER); > /* cpu-based controls */ > rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, > nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); > @@ -7492,10 +7493,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) > vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; > vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); > > - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ > - vmcs_write32(VM_EXIT_CONTROLS, > - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); > - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | > + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so > + * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER > + * bits are further modified by vmx_set_efer() below. > + */ > + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); > + > + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are > + * emulated by vmx_set_efer(), below. VM_ENTRY_LOAD_IA32_EFER is not emulated by vmx_set_efer, so: /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE * are emulated below. VM_ENTRY_IA32E_MODE is handled in * vmx_set_efer(). */ Paolo > + */ > + vmcs_write32(VM_ENTRY_CONTROLS, > + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & > + ~VM_ENTRY_IA32E_MODE) | > (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); > > if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Since this series is pending in mail list for long time. And it's really a big feature for Nested. Also, I doubt the original authors(Jun and Nahav)should not have enough time to continue it. So I will pick it up. :) See comments below: Paolo Bonzini wrote on 2013-05-20: > Il 19/05/2013 06:52, Jun Nakajima ha scritto: > > From: Nadav Har'El <nyh@il.ibm.com> > > > > Recent KVM, since > http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 > > switch the EFER MSR when EPT is used and the host and guest have different > > NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2) > > and want to be able to run recent KVM as L1, we need to allow L1 to use this > > EFER switching feature. > > > > To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if > available, > > and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds > > support for the former (the latter is still unsupported). > > > > Nested entry and exit emulation (prepare_vmcs_02 and > load_vmcs12_host_state, > > respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So > all > > that's left to do in this patch is to properly advertise this feature to L1. > > > > Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by > using > > vmx_set_efer (which itself sets one of several vmcs02 fields), so we always > > support this feature, regardless of whether the host supports it. > > > > Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > > Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > > Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > > --- > > arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- > > 1 file changed, 16 insertions(+), 7 deletions(-) > > > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > > index 260a919..fb9cae5 100644 > > --- a/arch/x86/kvm/vmx.c > > +++ b/arch/x86/kvm/vmx.c > > @@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > > #else > > nested_vmx_exit_ctls_high = 0; > > #endif > > - nested_vmx_exit_ctls_high |= > VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; > > + nested_vmx_exit_ctls_high |= > (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | > > + VM_EXIT_LOAD_IA32_EFER); > > > > /* entry controls */ > > rdmsr(MSR_IA32_VMX_ENTRY_CTLS, > > @@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > > nested_vmx_entry_ctls_low = > VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > > nested_vmx_entry_ctls_high &= > > VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; > > - nested_vmx_entry_ctls_high |= > VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > > - > > + nested_vmx_entry_ctls_high |= > (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | > > + VM_ENTRY_LOAD_IA32_EFER); > > /* cpu-based controls */ > > rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, > > nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); > > @@ -7492,10 +7493,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, > struct vmcs12 *vmcs12) > > vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; > > vmcs_writel(CR0_GUEST_HOST_MASK, > ~vcpu->arch.cr0_guest_owned_bits); > > > > - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer > below */ > > - vmcs_write32(VM_EXIT_CONTROLS, > > - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); > > - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | > > + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so > > + * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER > > + * bits are further modified by vmx_set_efer() below. > > + */ > > + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); This is wrong. We cannot use L0 exit control directly. LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, ACK_INTR_ON_EXIT should use host's exit control. But others, still need use (vmcs12|host). > > + > > + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE > are > > + * emulated by vmx_set_efer(), below. > > VM_ENTRY_LOAD_IA32_EFER is not emulated by vmx_set_efer, so: VM_ENTRY_LOAD_IA32_EFER is hanlded in setup_msrs(), and vmx_set_efer already call it. > > /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE > * are emulated below. VM_ENTRY_IA32E_MODE is handled in > * vmx_set_efer(). */ > > Paolo > > > + */ > > + vmcs_write32(VM_ENTRY_CONTROLS, > > + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & > > + ~VM_ENTRY_IA32E_MODE) | > > (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); > > > > if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) > > > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html Best regards, Yang -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jul 02, 2013 at 03:01:24AM +0000, Zhang, Yang Z wrote: > Since this series is pending in mail list for long time. And it's really a big feature for Nested. Also, I doubt the original authors(Jun and Nahav)should not have enough time to continue it. So I will pick it up. :) > > See comments below: > > Paolo Bonzini wrote on 2013-05-20: > > Il 19/05/2013 06:52, Jun Nakajima ha scritto: > > > From: Nadav Har'El <nyh@il.ibm.com> > > > > > > Recent KVM, since > > http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 > > > switch the EFER MSR when EPT is used and the host and guest have different > > > NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2) > > > and want to be able to run recent KVM as L1, we need to allow L1 to use this > > > EFER switching feature. > > > > > > To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if > > available, > > > and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds > > > support for the former (the latter is still unsupported). > > > > > > Nested entry and exit emulation (prepare_vmcs_02 and > > load_vmcs12_host_state, > > > respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So > > all > > > that's left to do in this patch is to properly advertise this feature to L1. > > > > > > Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by > > using > > > vmx_set_efer (which itself sets one of several vmcs02 fields), so we always > > > support this feature, regardless of whether the host supports it. > > > > > > Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > > > Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > > > Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > > > --- > > > arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- > > > 1 file changed, 16 insertions(+), 7 deletions(-) > > > > > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > > > index 260a919..fb9cae5 100644 > > > --- a/arch/x86/kvm/vmx.c > > > +++ b/arch/x86/kvm/vmx.c > > > @@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > > > #else > > > nested_vmx_exit_ctls_high = 0; > > > #endif > > > - nested_vmx_exit_ctls_high |= > > VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; > > > + nested_vmx_exit_ctls_high |= > > (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | > > > + VM_EXIT_LOAD_IA32_EFER); > > > > > > /* entry controls */ > > > rdmsr(MSR_IA32_VMX_ENTRY_CTLS, > > > @@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > > > nested_vmx_entry_ctls_low = > > VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > > > nested_vmx_entry_ctls_high &= > > > VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; > > > - nested_vmx_entry_ctls_high |= > > VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > > > - > > > + nested_vmx_entry_ctls_high |= > > (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | > > > + VM_ENTRY_LOAD_IA32_EFER); > > > /* cpu-based controls */ > > > rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, > > > nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); > > > @@ -7492,10 +7493,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, > > struct vmcs12 *vmcs12) > > > vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; > > > vmcs_writel(CR0_GUEST_HOST_MASK, > > ~vcpu->arch.cr0_guest_owned_bits); > > > > > > - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer > > below */ > > > - vmcs_write32(VM_EXIT_CONTROLS, > > > - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); > > > - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | > > > + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so > > > + * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER > > > + * bits are further modified by vmx_set_efer() below. > > > + */ > > > + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); > This is wrong. We cannot use L0 exit control directly. > LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, ACK_INTR_ON_EXIT should use host's exit control. But others, still need use (vmcs12|host). > I do not see why. We always intercept DR7/PAT/EFER, so save is emulated too. Host address space size always come from L0 and preemption timer is not supported for nested IIRC and when it will be host will have to save it on exit anyway for correct emulation. > > > + > > > + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE > > are > > > + * emulated by vmx_set_efer(), below. > > > > VM_ENTRY_LOAD_IA32_EFER is not emulated by vmx_set_efer, so: > VM_ENTRY_LOAD_IA32_EFER is hanlded in setup_msrs(), and vmx_set_efer already call it. > > > > > /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE > > * are emulated below. VM_ENTRY_IA32E_MODE is handled in > > * vmx_set_efer(). */ > > > > Paolo > > > > > + */ > > > + vmcs_write32(VM_ENTRY_CONTROLS, > > > + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & > > > + ~VM_ENTRY_IA32E_MODE) | > > > (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); > > > > > > if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) > > > > > > > -- > > To unsubscribe from this list: send the line "unsubscribe kvm" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > Best regards, > Yang > -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2013-07-02 15:59, Gleb Natapov wrote: > On Tue, Jul 02, 2013 at 03:01:24AM +0000, Zhang, Yang Z wrote: >> Since this series is pending in mail list for long time. And it's really a big feature for Nested. Also, I doubt the original authors(Jun and Nahav)should not have enough time to continue it. So I will pick it up. :) >> >> See comments below: >> >> Paolo Bonzini wrote on 2013-05-20: >>> Il 19/05/2013 06:52, Jun Nakajima ha scritto: >>>> From: Nadav Har'El <nyh@il.ibm.com> >>>> >>>> Recent KVM, since >>> http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 >>>> switch the EFER MSR when EPT is used and the host and guest have different >>>> NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2) >>>> and want to be able to run recent KVM as L1, we need to allow L1 to use this >>>> EFER switching feature. >>>> >>>> To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if >>> available, >>>> and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds >>>> support for the former (the latter is still unsupported). >>>> >>>> Nested entry and exit emulation (prepare_vmcs_02 and >>> load_vmcs12_host_state, >>>> respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So >>> all >>>> that's left to do in this patch is to properly advertise this feature to L1. >>>> >>>> Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by >>> using >>>> vmx_set_efer (which itself sets one of several vmcs02 fields), so we always >>>> support this feature, regardless of whether the host supports it. >>>> >>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> >>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> >>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> >>>> --- >>>> arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- >>>> 1 file changed, 16 insertions(+), 7 deletions(-) >>>> >>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >>>> index 260a919..fb9cae5 100644 >>>> --- a/arch/x86/kvm/vmx.c >>>> +++ b/arch/x86/kvm/vmx.c >>>> @@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) >>>> #else >>>> nested_vmx_exit_ctls_high = 0; >>>> #endif >>>> - nested_vmx_exit_ctls_high |= >>> VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; >>>> + nested_vmx_exit_ctls_high |= >>> (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | >>>> + VM_EXIT_LOAD_IA32_EFER); >>>> >>>> /* entry controls */ >>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS, >>>> @@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) >>>> nested_vmx_entry_ctls_low = >>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; >>>> nested_vmx_entry_ctls_high &= >>>> VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; >>>> - nested_vmx_entry_ctls_high |= >>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; >>>> - >>>> + nested_vmx_entry_ctls_high |= >>> (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | >>>> + VM_ENTRY_LOAD_IA32_EFER); >>>> /* cpu-based controls */ >>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, >>>> nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); >>>> @@ -7492,10 +7493,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, >>> struct vmcs12 *vmcs12) >>>> vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; >>>> vmcs_writel(CR0_GUEST_HOST_MASK, >>> ~vcpu->arch.cr0_guest_owned_bits); >>>> >>>> - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer >>> below */ >>>> - vmcs_write32(VM_EXIT_CONTROLS, >>>> - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); >>>> - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | >>>> + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so >>>> + * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER >>>> + * bits are further modified by vmx_set_efer() below. >>>> + */ >>>> + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); >> This is wrong. We cannot use L0 exit control directly. >> LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, ACK_INTR_ON_EXIT should use host's exit control. But others, still need use (vmcs12|host). >> > I do not see why. We always intercept DR7/PAT/EFER, so save is emulated > too. Host address space size always come from L0 and preemption timer is > not supported for nested IIRC and when it will be host will have to save > it on exit anyway for correct emulation. Preemption timer is already supported and works fine as far as I tested. KVM doesn't use it for L1, so we do not need to save/restore it - IIRC. Jan
On Tue, Jul 02, 2013 at 04:28:56PM +0200, Jan Kiszka wrote: > On 2013-07-02 15:59, Gleb Natapov wrote: > > On Tue, Jul 02, 2013 at 03:01:24AM +0000, Zhang, Yang Z wrote: > >> Since this series is pending in mail list for long time. And it's really a big feature for Nested. Also, I doubt the original authors(Jun and Nahav)should not have enough time to continue it. So I will pick it up. :) > >> > >> See comments below: > >> > >> Paolo Bonzini wrote on 2013-05-20: > >>> Il 19/05/2013 06:52, Jun Nakajima ha scritto: > >>>> From: Nadav Har'El <nyh@il.ibm.com> > >>>> > >>>> Recent KVM, since > >>> http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 > >>>> switch the EFER MSR when EPT is used and the host and guest have different > >>>> NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2) > >>>> and want to be able to run recent KVM as L1, we need to allow L1 to use this > >>>> EFER switching feature. > >>>> > >>>> To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if > >>> available, > >>>> and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds > >>>> support for the former (the latter is still unsupported). > >>>> > >>>> Nested entry and exit emulation (prepare_vmcs_02 and > >>> load_vmcs12_host_state, > >>>> respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So > >>> all > >>>> that's left to do in this patch is to properly advertise this feature to L1. > >>>> > >>>> Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by > >>> using > >>>> vmx_set_efer (which itself sets one of several vmcs02 fields), so we always > >>>> support this feature, regardless of whether the host supports it. > >>>> > >>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > >>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > >>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > >>>> --- > >>>> arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- > >>>> 1 file changed, 16 insertions(+), 7 deletions(-) > >>>> > >>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > >>>> index 260a919..fb9cae5 100644 > >>>> --- a/arch/x86/kvm/vmx.c > >>>> +++ b/arch/x86/kvm/vmx.c > >>>> @@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > >>>> #else > >>>> nested_vmx_exit_ctls_high = 0; > >>>> #endif > >>>> - nested_vmx_exit_ctls_high |= > >>> VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; > >>>> + nested_vmx_exit_ctls_high |= > >>> (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | > >>>> + VM_EXIT_LOAD_IA32_EFER); > >>>> > >>>> /* entry controls */ > >>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS, > >>>> @@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > >>>> nested_vmx_entry_ctls_low = > >>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > >>>> nested_vmx_entry_ctls_high &= > >>>> VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; > >>>> - nested_vmx_entry_ctls_high |= > >>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > >>>> - > >>>> + nested_vmx_entry_ctls_high |= > >>> (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | > >>>> + VM_ENTRY_LOAD_IA32_EFER); > >>>> /* cpu-based controls */ > >>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, > >>>> nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); > >>>> @@ -7492,10 +7493,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, > >>> struct vmcs12 *vmcs12) > >>>> vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; > >>>> vmcs_writel(CR0_GUEST_HOST_MASK, > >>> ~vcpu->arch.cr0_guest_owned_bits); > >>>> > >>>> - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer > >>> below */ > >>>> - vmcs_write32(VM_EXIT_CONTROLS, > >>>> - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); > >>>> - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | > >>>> + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so > >>>> + * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER > >>>> + * bits are further modified by vmx_set_efer() below. > >>>> + */ > >>>> + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); > >> This is wrong. We cannot use L0 exit control directly. > >> LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, ACK_INTR_ON_EXIT should use host's exit control. But others, still need use (vmcs12|host). > >> > > I do not see why. We always intercept DR7/PAT/EFER, so save is emulated > > too. Host address space size always come from L0 and preemption timer is > > not supported for nested IIRC and when it will be host will have to save > > it on exit anyway for correct emulation. > > Preemption timer is already supported and works fine as far as I tested. > KVM doesn't use it for L1, so we do not need to save/restore it - IIRC. > So what happens if L1 configures it to value X after X/2 ticks L0 exit happen and L0 gets back to L2 directly. The counter will be X again instead of X/2. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2013-07-02 17:15, Gleb Natapov wrote: > On Tue, Jul 02, 2013 at 04:28:56PM +0200, Jan Kiszka wrote: >> On 2013-07-02 15:59, Gleb Natapov wrote: >>> On Tue, Jul 02, 2013 at 03:01:24AM +0000, Zhang, Yang Z wrote: >>>> Since this series is pending in mail list for long time. And it's really a big feature for Nested. Also, I doubt the original authors(Jun and Nahav)should not have enough time to continue it. So I will pick it up. :) >>>> >>>> See comments below: >>>> >>>> Paolo Bonzini wrote on 2013-05-20: >>>>> Il 19/05/2013 06:52, Jun Nakajima ha scritto: >>>>>> From: Nadav Har'El <nyh@il.ibm.com> >>>>>> >>>>>> Recent KVM, since >>>>> http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 >>>>>> switch the EFER MSR when EPT is used and the host and guest have different >>>>>> NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2) >>>>>> and want to be able to run recent KVM as L1, we need to allow L1 to use this >>>>>> EFER switching feature. >>>>>> >>>>>> To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if >>>>> available, >>>>>> and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds >>>>>> support for the former (the latter is still unsupported). >>>>>> >>>>>> Nested entry and exit emulation (prepare_vmcs_02 and >>>>> load_vmcs12_host_state, >>>>>> respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So >>>>> all >>>>>> that's left to do in this patch is to properly advertise this feature to L1. >>>>>> >>>>>> Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by >>>>> using >>>>>> vmx_set_efer (which itself sets one of several vmcs02 fields), so we always >>>>>> support this feature, regardless of whether the host supports it. >>>>>> >>>>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> >>>>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> >>>>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> >>>>>> --- >>>>>> arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- >>>>>> 1 file changed, 16 insertions(+), 7 deletions(-) >>>>>> >>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >>>>>> index 260a919..fb9cae5 100644 >>>>>> --- a/arch/x86/kvm/vmx.c >>>>>> +++ b/arch/x86/kvm/vmx.c >>>>>> @@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) >>>>>> #else >>>>>> nested_vmx_exit_ctls_high = 0; >>>>>> #endif >>>>>> - nested_vmx_exit_ctls_high |= >>>>> VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; >>>>>> + nested_vmx_exit_ctls_high |= >>>>> (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | >>>>>> + VM_EXIT_LOAD_IA32_EFER); >>>>>> >>>>>> /* entry controls */ >>>>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS, >>>>>> @@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) >>>>>> nested_vmx_entry_ctls_low = >>>>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; >>>>>> nested_vmx_entry_ctls_high &= >>>>>> VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; >>>>>> - nested_vmx_entry_ctls_high |= >>>>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; >>>>>> - >>>>>> + nested_vmx_entry_ctls_high |= >>>>> (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | >>>>>> + VM_ENTRY_LOAD_IA32_EFER); >>>>>> /* cpu-based controls */ >>>>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, >>>>>> nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); >>>>>> @@ -7492,10 +7493,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, >>>>> struct vmcs12 *vmcs12) >>>>>> vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; >>>>>> vmcs_writel(CR0_GUEST_HOST_MASK, >>>>> ~vcpu->arch.cr0_guest_owned_bits); >>>>>> >>>>>> - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer >>>>> below */ >>>>>> - vmcs_write32(VM_EXIT_CONTROLS, >>>>>> - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); >>>>>> - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | >>>>>> + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so >>>>>> + * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER >>>>>> + * bits are further modified by vmx_set_efer() below. >>>>>> + */ >>>>>> + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); >>>> This is wrong. We cannot use L0 exit control directly. >>>> LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, ACK_INTR_ON_EXIT should use host's exit control. But others, still need use (vmcs12|host). >>>> >>> I do not see why. We always intercept DR7/PAT/EFER, so save is emulated >>> too. Host address space size always come from L0 and preemption timer is >>> not supported for nested IIRC and when it will be host will have to save >>> it on exit anyway for correct emulation. >> >> Preemption timer is already supported and works fine as far as I tested. >> KVM doesn't use it for L1, so we do not need to save/restore it - IIRC. >> > So what happens if L1 configures it to value X after X/2 ticks L0 exit > happen and L0 gets back to L2 directly. The counter will be X again > instead of X/2. Likely. Yes, we need to improve our emulation by setting "Save VMX-preemption timer value" or emulate this in software if the hardware lacks support for it (was this flag introduced after the preemption timer itself?). Jan
On Tue, Jul 02, 2013 at 05:34:56PM +0200, Jan Kiszka wrote: > On 2013-07-02 17:15, Gleb Natapov wrote: > > On Tue, Jul 02, 2013 at 04:28:56PM +0200, Jan Kiszka wrote: > >> On 2013-07-02 15:59, Gleb Natapov wrote: > >>> On Tue, Jul 02, 2013 at 03:01:24AM +0000, Zhang, Yang Z wrote: > >>>> Since this series is pending in mail list for long time. And it's really a big feature for Nested. Also, I doubt the original authors(Jun and Nahav)should not have enough time to continue it. So I will pick it up. :) > >>>> > >>>> See comments below: > >>>> > >>>> Paolo Bonzini wrote on 2013-05-20: > >>>>> Il 19/05/2013 06:52, Jun Nakajima ha scritto: > >>>>>> From: Nadav Har'El <nyh@il.ibm.com> > >>>>>> > >>>>>> Recent KVM, since > >>>>> http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 > >>>>>> switch the EFER MSR when EPT is used and the host and guest have different > >>>>>> NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2) > >>>>>> and want to be able to run recent KVM as L1, we need to allow L1 to use this > >>>>>> EFER switching feature. > >>>>>> > >>>>>> To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if > >>>>> available, > >>>>>> and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds > >>>>>> support for the former (the latter is still unsupported). > >>>>>> > >>>>>> Nested entry and exit emulation (prepare_vmcs_02 and > >>>>> load_vmcs12_host_state, > >>>>>> respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So > >>>>> all > >>>>>> that's left to do in this patch is to properly advertise this feature to L1. > >>>>>> > >>>>>> Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by > >>>>> using > >>>>>> vmx_set_efer (which itself sets one of several vmcs02 fields), so we always > >>>>>> support this feature, regardless of whether the host supports it. > >>>>>> > >>>>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > >>>>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > >>>>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > >>>>>> --- > >>>>>> arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- > >>>>>> 1 file changed, 16 insertions(+), 7 deletions(-) > >>>>>> > >>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > >>>>>> index 260a919..fb9cae5 100644 > >>>>>> --- a/arch/x86/kvm/vmx.c > >>>>>> +++ b/arch/x86/kvm/vmx.c > >>>>>> @@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > >>>>>> #else > >>>>>> nested_vmx_exit_ctls_high = 0; > >>>>>> #endif > >>>>>> - nested_vmx_exit_ctls_high |= > >>>>> VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; > >>>>>> + nested_vmx_exit_ctls_high |= > >>>>> (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | > >>>>>> + VM_EXIT_LOAD_IA32_EFER); > >>>>>> > >>>>>> /* entry controls */ > >>>>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS, > >>>>>> @@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) > >>>>>> nested_vmx_entry_ctls_low = > >>>>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > >>>>>> nested_vmx_entry_ctls_high &= > >>>>>> VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; > >>>>>> - nested_vmx_entry_ctls_high |= > >>>>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > >>>>>> - > >>>>>> + nested_vmx_entry_ctls_high |= > >>>>> (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | > >>>>>> + VM_ENTRY_LOAD_IA32_EFER); > >>>>>> /* cpu-based controls */ > >>>>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, > >>>>>> nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); > >>>>>> @@ -7492,10 +7493,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, > >>>>> struct vmcs12 *vmcs12) > >>>>>> vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; > >>>>>> vmcs_writel(CR0_GUEST_HOST_MASK, > >>>>> ~vcpu->arch.cr0_guest_owned_bits); > >>>>>> > >>>>>> - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer > >>>>> below */ > >>>>>> - vmcs_write32(VM_EXIT_CONTROLS, > >>>>>> - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); > >>>>>> - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | > >>>>>> + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so > >>>>>> + * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER > >>>>>> + * bits are further modified by vmx_set_efer() below. > >>>>>> + */ > >>>>>> + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); > >>>> This is wrong. We cannot use L0 exit control directly. > >>>> LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, ACK_INTR_ON_EXIT should use host's exit control. But others, still need use (vmcs12|host). > >>>> > >>> I do not see why. We always intercept DR7/PAT/EFER, so save is emulated > >>> too. Host address space size always come from L0 and preemption timer is > >>> not supported for nested IIRC and when it will be host will have to save > >>> it on exit anyway for correct emulation. > >> > >> Preemption timer is already supported and works fine as far as I tested. > >> KVM doesn't use it for L1, so we do not need to save/restore it - IIRC. > >> > > So what happens if L1 configures it to value X after X/2 ticks L0 exit > > happen and L0 gets back to L2 directly. The counter will be X again > > instead of X/2. > > Likely. Yes, we need to improve our emulation by setting "Save > VMX-preemption timer value" or emulate this in software if the hardware > lacks support for it (was this flag introduced after the preemption > timer itself?). > Not sure, but my point was that for correct emulation host needs to set "save preempt timer on vmexit" anyway so all VM_EXIT_CONTROLS are indeed emulated as far as I see. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote on 2013-07-02: > On Tue, Jul 02, 2013 at 05:34:56PM +0200, Jan Kiszka wrote: >> On 2013-07-02 17:15, Gleb Natapov wrote: >>> On Tue, Jul 02, 2013 at 04:28:56PM +0200, Jan Kiszka wrote: >>>> On 2013-07-02 15:59, Gleb Natapov wrote: >>>>> On Tue, Jul 02, 2013 at 03:01:24AM +0000, Zhang, Yang Z wrote: >>>>>> Since this series is pending in mail list for long time. And >>>>>> it's really a big feature for Nested. Also, I doubt the >>>>>> original authors(Jun and Nahav)should not have enough time to continue it. >>>>>> So I will pick it up. :) >>>>>> >>>>>> See comments below: >>>>>> >>>>>> Paolo Bonzini wrote on 2013-05-20: >>>>>>> Il 19/05/2013 06:52, Jun Nakajima ha scritto: >>>>>>>> From: Nadav Har'El <nyh@il.ibm.com> >>>>>>>> >>>>>>>> Recent KVM, since >>>>>>>> http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 >>>>>>>> switch the EFER MSR when EPT is used and the host and guest have >>>>>>>> different NX bits. So if we add support for nested EPT (L1 guest >>>>>>>> using EPT to run L2) and want to be able to run recent KVM as L1, >>>>>>>> we need to allow L1 to use this EFER switching feature. >>>>>>>> >>>>>>>> To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER >>>>>>>> if available, and if it isn't, it uses the generic >>>>>>>> VM_ENTRY/EXIT_MSR_LOAD. This patch adds support for the former >>>>>>>> (the latter is still unsupported). >>>>>>>> >>>>>>>> Nested entry and exit emulation (prepare_vmcs_02 and >>>>>>>> load_vmcs12_host_state, respectively) already handled >>>>>>>> VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all that's left to do >>>>>>>> in this patch is to properly advertise this feature to L1. >>>>>>>> >>>>>>>> Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by >>>>>>>> L0, by using vmx_set_efer (which itself sets one of several >>>>>>>> vmcs02 fields), so we always support this feature, regardless of >>>>>>>> whether the host supports it. >>>>>>>> >>>>>>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> >>>>>>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> >>>>>>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> >>>>>>>> --- >>>>>>>> arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- >>>>>>>> 1 file changed, 16 insertions(+), 7 deletions(-) >>>>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index >>>>>>>> 260a919..fb9cae5 100644 >>>>>>>> --- a/arch/x86/kvm/vmx.c >>>>>>>> +++ b/arch/x86/kvm/vmx.c >>>>>>>> @@ -2192,7 +2192,8 @@ static __init void >>>>>>>> nested_vmx_setup_ctls_msrs(void) #else >>>>>>>> nested_vmx_exit_ctls_high = 0; #endif >>>>>>>> - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; >>>>>>>> + nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR >>>>>>>> | + VM_EXIT_LOAD_IA32_EFER); >>>>>>>> >>>>>>>> /* entry controls */ >>>>>>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2201,8 +2202,8 > @@ static >>>>>>>> __init void nested_vmx_setup_ctls_msrs(void) >>>>>>>> nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; >>>>>>>> nested_vmx_entry_ctls_high &= VM_ENTRY_LOAD_IA32_PAT | >>>>>>>> VM_ENTRY_IA32E_MODE; >>>>>>>> - nested_vmx_entry_ctls_high |= >>>>>>>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - >>>>>>>> + nested_vmx_entry_ctls_high |= >>>>>>>> (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + >>>>>>>> VM_ENTRY_LOAD_IA32_EFER); >>>>>>>> /* cpu-based controls */ >>>>>>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, >>>>>>>> nested_vmx_procbased_ctls_low, >>>>>>>> nested_vmx_procbased_ctls_high); @@ -7492,10 +7493,18 @@ static >>>>>>>> void prepare_vmcs02(struct kvm_vcpu *vcpu, >>>>>>> struct vmcs12 *vmcs12) >>>>>>>> vcpu->arch.cr0_guest_owned_bits &= >>>>>>>> ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, >>>>>>> ~vcpu->arch.cr0_guest_owned_bits); >>>>>>>> >>>>>>>> - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by > vmx_set_efer >>>>>>> below */ >>>>>>>> - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | >>>>>>>> vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, >>>>>>>> vmcs12->vm_entry_controls | + /* L2->L1 exit controls are >>>>>>>> emulated - the hardware exit is +to L0 so + * we should use its >>>>>>>> exit controls. Note that IA32_MODE, LOAD_IA32_EFER + * bits are >>>>>>>> further modified by vmx_set_efer() below. + */ >>>>>>>> + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); >>>>>> This is wrong. We cannot use L0 exit control directly. >>>>>> LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, > ACK_INTR_ON_EXIT should use host's exit control. But others, still > need use (vmcs12|host). >>>>>> >>>>> I do not see why. We always intercept DR7/PAT/EFER, so save is >>>>> emulated too. Host address space size always come from L0 and >>>>> preemption timer is not supported for nested IIRC and when it >>>>> will be host will have to save it on exit anyway for correct emulation. >>>> >>>> Preemption timer is already supported and works fine as far as I tested. >>>> KVM doesn't use it for L1, so we do not need to save/restore it - IIRC. >>>> >>> So what happens if L1 configures it to value X after X/2 ticks L0 >>> exit happen and L0 gets back to L2 directly. The counter will be X >>> again instead of X/2. >> >> Likely. Yes, we need to improve our emulation by setting "Save >> VMX-preemption timer value" or emulate this in software if the >> hardware lacks support for it (was this flag introduced after the >> preemption timer itself?). >> > Not sure, but my point was that for correct emulation host needs to > set "save preempt timer on vmexit" anyway so all VM_EXIT_CONTROLS are > indeed emulated as far as I see. > Ok, here is my summary, please correct me if I am wrong: bit 2: Save debug controls, the first processor only support 1-setting on it, so just use host's setting is enough bit 9: Host address space size, it indicate the host's state, so must use host's setting. bit 12: Load IA32_PERF_GLOBAL_CTRL: same as above. bit 15 : Acknowledge interrupt on exit: same as above. bit 19: Load IA32_PAT: same as above. bit 20: Load IA32_EFER: same as above. bit 18: Save IA32_PAT, Didn't expose it to L1, so use host' setting is ok. bit 19: Save IA32_EFER, same as above. bit 22: Save VMXpreemption timer value, I don't see KVM expose it to L1, but Jan said it's working. Strange! And according gleb's suggestion, it better to always set it. So, currently, only use host' exit_control for L2 is enough. Best regards, Yang -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 04, 2013 at 08:42:53AM +0000, Zhang, Yang Z wrote: > Gleb Natapov wrote on 2013-07-02: > > On Tue, Jul 02, 2013 at 05:34:56PM +0200, Jan Kiszka wrote: > >> On 2013-07-02 17:15, Gleb Natapov wrote: > >>> On Tue, Jul 02, 2013 at 04:28:56PM +0200, Jan Kiszka wrote: > >>>> On 2013-07-02 15:59, Gleb Natapov wrote: > >>>>> On Tue, Jul 02, 2013 at 03:01:24AM +0000, Zhang, Yang Z wrote: > >>>>>> Since this series is pending in mail list for long time. And > >>>>>> it's really a big feature for Nested. Also, I doubt the > >>>>>> original authors(Jun and Nahav)should not have enough time to continue it. > >>>>>> So I will pick it up. :) > >>>>>> > >>>>>> See comments below: > >>>>>> > >>>>>> Paolo Bonzini wrote on 2013-05-20: > >>>>>>> Il 19/05/2013 06:52, Jun Nakajima ha scritto: > >>>>>>>> From: Nadav Har'El <nyh@il.ibm.com> > >>>>>>>> > >>>>>>>> Recent KVM, since > >>>>>>>> http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 > >>>>>>>> switch the EFER MSR when EPT is used and the host and guest have > >>>>>>>> different NX bits. So if we add support for nested EPT (L1 guest > >>>>>>>> using EPT to run L2) and want to be able to run recent KVM as L1, > >>>>>>>> we need to allow L1 to use this EFER switching feature. > >>>>>>>> > >>>>>>>> To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER > >>>>>>>> if available, and if it isn't, it uses the generic > >>>>>>>> VM_ENTRY/EXIT_MSR_LOAD. This patch adds support for the former > >>>>>>>> (the latter is still unsupported). > >>>>>>>> > >>>>>>>> Nested entry and exit emulation (prepare_vmcs_02 and > >>>>>>>> load_vmcs12_host_state, respectively) already handled > >>>>>>>> VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all that's left to do > >>>>>>>> in this patch is to properly advertise this feature to L1. > >>>>>>>> > >>>>>>>> Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by > >>>>>>>> L0, by using vmx_set_efer (which itself sets one of several > >>>>>>>> vmcs02 fields), so we always support this feature, regardless of > >>>>>>>> whether the host supports it. > >>>>>>>> > >>>>>>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > >>>>>>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > >>>>>>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > >>>>>>>> --- > >>>>>>>> arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- > >>>>>>>> 1 file changed, 16 insertions(+), 7 deletions(-) > >>>>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index > >>>>>>>> 260a919..fb9cae5 100644 > >>>>>>>> --- a/arch/x86/kvm/vmx.c > >>>>>>>> +++ b/arch/x86/kvm/vmx.c > >>>>>>>> @@ -2192,7 +2192,8 @@ static __init void > >>>>>>>> nested_vmx_setup_ctls_msrs(void) #else > >>>>>>>> nested_vmx_exit_ctls_high = 0; #endif > >>>>>>>> - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; > >>>>>>>> + nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR > >>>>>>>> | + VM_EXIT_LOAD_IA32_EFER); > >>>>>>>> > >>>>>>>> /* entry controls */ > >>>>>>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2201,8 +2202,8 > > @@ static > >>>>>>>> __init void nested_vmx_setup_ctls_msrs(void) > >>>>>>>> nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > >>>>>>>> nested_vmx_entry_ctls_high &= VM_ENTRY_LOAD_IA32_PAT | > >>>>>>>> VM_ENTRY_IA32E_MODE; > >>>>>>>> - nested_vmx_entry_ctls_high |= > >>>>>>>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - > >>>>>>>> + nested_vmx_entry_ctls_high |= > >>>>>>>> (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + > >>>>>>>> VM_ENTRY_LOAD_IA32_EFER); > >>>>>>>> /* cpu-based controls */ > >>>>>>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, > >>>>>>>> nested_vmx_procbased_ctls_low, > >>>>>>>> nested_vmx_procbased_ctls_high); @@ -7492,10 +7493,18 @@ static > >>>>>>>> void prepare_vmcs02(struct kvm_vcpu *vcpu, > >>>>>>> struct vmcs12 *vmcs12) > >>>>>>>> vcpu->arch.cr0_guest_owned_bits &= > >>>>>>>> ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, > >>>>>>> ~vcpu->arch.cr0_guest_owned_bits); > >>>>>>>> > >>>>>>>> - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by > > vmx_set_efer > >>>>>>> below */ > >>>>>>>> - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | > >>>>>>>> vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, > >>>>>>>> vmcs12->vm_entry_controls | + /* L2->L1 exit controls are > >>>>>>>> emulated - the hardware exit is +to L0 so + * we should use its > >>>>>>>> exit controls. Note that IA32_MODE, LOAD_IA32_EFER + * bits are > >>>>>>>> further modified by vmx_set_efer() below. + */ > >>>>>>>> + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); > >>>>>> This is wrong. We cannot use L0 exit control directly. > >>>>>> LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, > > ACK_INTR_ON_EXIT should use host's exit control. But others, still > > need use (vmcs12|host). > >>>>>> > >>>>> I do not see why. We always intercept DR7/PAT/EFER, so save is > >>>>> emulated too. Host address space size always come from L0 and > >>>>> preemption timer is not supported for nested IIRC and when it > >>>>> will be host will have to save it on exit anyway for correct emulation. > >>>> > >>>> Preemption timer is already supported and works fine as far as I tested. > >>>> KVM doesn't use it for L1, so we do not need to save/restore it - IIRC. > >>>> > >>> So what happens if L1 configures it to value X after X/2 ticks L0 > >>> exit happen and L0 gets back to L2 directly. The counter will be X > >>> again instead of X/2. > >> > >> Likely. Yes, we need to improve our emulation by setting "Save > >> VMX-preemption timer value" or emulate this in software if the > >> hardware lacks support for it (was this flag introduced after the > >> preemption timer itself?). > >> > > Not sure, but my point was that for correct emulation host needs to > > set "save preempt timer on vmexit" anyway so all VM_EXIT_CONTROLS are > > indeed emulated as far as I see. > > > Ok, here is my summary, please correct me if I am wrong: > bit 2: Save debug controls, the first processor only support 1-setting on it, so just use host's setting is enough Not because first processor only supported 1-setting, but because L0 intercepts all changes to DR7 and DEBUGCTL MSR, so L2 cannot change them behind L0 back. If L1 asks to save them in vmcs12 L0 can do it during vmexit emulation. > bit 9: Host address space size, it indicate the host's state, so must use host's setting. > bit 12: Load IA32_PERF_GLOBAL_CTRL: same as above. Not sure what "above" do you mean. It is fully emulated during vmexit emulation. We do not want PERF_GLOBAL_CTRL to be loaded during L2->L0 vmexit, we want it to be loaded in L1 after L0 emulates L2->L1 vmexit. But I think there is a bug in current emulation. The code looks like this: if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); But GUEST_IA32_PERF_GLOBAL_CTRL will not be loaded during L1 entry unless VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL is set. Why do we assume it is set here? > bit 15 : Acknowledge interrupt on exit: same as above. Same as bit 9. > bit 19: Load IA32_PAT: same as above. > bit 20: Load IA32_EFER: same as above. Those two are same as bit 12. > bit 18: Save IA32_PAT, Didn't expose it to L1, so use host' setting is ok. > bit 19: Save IA32_EFER, same as above. Those two are the same a bit 2. > bit 22: Save VMXpreemption timer value, I don't see KVM expose it to L1, but Jan said it's working. Strange! And according gleb's suggestion, it better to always set it. > It exposed it in nested_vmx_setup_ctls_msrs: nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | PIN_BASED_VMX_PREEMPTION_TIMER; According to Gleb's suggestion current emulation is broken and to fix it the bit will have to be set on each L2 entry if L1 is using preemption timer. > So, currently, only use host' exit_control for L2 is enough. > > Best regards, > Yang > -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: Gleb Natapov [mailto:gleb@redhat.com] > Sent: Monday, July 08, 2013 8:38 PM > To: Zhang, Yang Z > Cc: Jan Kiszka; Paolo Bonzini; Nakajima, Jun; kvm@vger.kernel.org > Subject: Re: [PATCH v3 01/13] nEPT: Support LOAD_IA32_EFER entry/exit > controls for L1 > > On Thu, Jul 04, 2013 at 08:42:53AM +0000, Zhang, Yang Z wrote: > > Gleb Natapov wrote on 2013-07-02: > > > On Tue, Jul 02, 2013 at 05:34:56PM +0200, Jan Kiszka wrote: > > >> On 2013-07-02 17:15, Gleb Natapov wrote: > > >>> On Tue, Jul 02, 2013 at 04:28:56PM +0200, Jan Kiszka wrote: > > >>>> On 2013-07-02 15:59, Gleb Natapov wrote: > > >>>>> On Tue, Jul 02, 2013 at 03:01:24AM +0000, Zhang, Yang Z wrote: > > >>>>>> Since this series is pending in mail list for long time. And > > >>>>>> it's really a big feature for Nested. Also, I doubt the > > >>>>>> original authors(Jun and Nahav)should not have enough time to > continue it. > > >>>>>> So I will pick it up. :) > > >>>>>> > > >>>>>> See comments below: > > >>>>>> > > >>>>>> Paolo Bonzini wrote on 2013-05-20: > > >>>>>>> Il 19/05/2013 06:52, Jun Nakajima ha scritto: > > >>>>>>>> From: Nadav Har'El <nyh@il.ibm.com> > > >>>>>>>> > > >>>>>>>> Recent KVM, since > > >>>>>>>> http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 > > >>>>>>>> switch the EFER MSR when EPT is used and the host and guest > have > > >>>>>>>> different NX bits. So if we add support for nested EPT (L1 guest > > >>>>>>>> using EPT to run L2) and want to be able to run recent KVM as L1, > > >>>>>>>> we need to allow L1 to use this EFER switching feature. > > >>>>>>>> > > >>>>>>>> To do this EFER switching, KVM uses > VM_ENTRY/EXIT_LOAD_IA32_EFER > > >>>>>>>> if available, and if it isn't, it uses the generic > > >>>>>>>> VM_ENTRY/EXIT_MSR_LOAD. This patch adds support for the > former > > >>>>>>>> (the latter is still unsupported). > > >>>>>>>> > > >>>>>>>> Nested entry and exit emulation (prepare_vmcs_02 and > > >>>>>>>> load_vmcs12_host_state, respectively) already handled > > >>>>>>>> VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all that's left to do > > >>>>>>>> in this patch is to properly advertise this feature to L1. > > >>>>>>>> > > >>>>>>>> Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are > emulated by > > >>>>>>>> L0, by using vmx_set_efer (which itself sets one of several > > >>>>>>>> vmcs02 fields), so we always support this feature, regardless of > > >>>>>>>> whether the host supports it. > > >>>>>>>> > > >>>>>>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > > >>>>>>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > > >>>>>>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > > >>>>>>>> --- > > >>>>>>>> arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- > > >>>>>>>> 1 file changed, 16 insertions(+), 7 deletions(-) > > >>>>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index > > >>>>>>>> 260a919..fb9cae5 100644 > > >>>>>>>> --- a/arch/x86/kvm/vmx.c > > >>>>>>>> +++ b/arch/x86/kvm/vmx.c > > >>>>>>>> @@ -2192,7 +2192,8 @@ static __init void > > >>>>>>>> nested_vmx_setup_ctls_msrs(void) #else > > >>>>>>>> nested_vmx_exit_ctls_high = 0; #endif > > >>>>>>>> - nested_vmx_exit_ctls_high |= > VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; > > >>>>>>>> + nested_vmx_exit_ctls_high |= > (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR > > >>>>>>>> | + VM_EXIT_LOAD_IA32_EFER); > > >>>>>>>> > > >>>>>>>> /* entry controls */ > > >>>>>>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2201,8 +2202,8 > > > @@ static > > >>>>>>>> __init void nested_vmx_setup_ctls_msrs(void) > > >>>>>>>> nested_vmx_entry_ctls_low = > VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > > >>>>>>>> nested_vmx_entry_ctls_high &= > VM_ENTRY_LOAD_IA32_PAT | > > >>>>>>>> VM_ENTRY_IA32E_MODE; > > >>>>>>>> - nested_vmx_entry_ctls_high |= > > >>>>>>>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - > > >>>>>>>> + nested_vmx_entry_ctls_high |= > > >>>>>>>> (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + > > >>>>>>>> VM_ENTRY_LOAD_IA32_EFER); > > >>>>>>>> /* cpu-based controls */ > > >>>>>>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, > > >>>>>>>> nested_vmx_procbased_ctls_low, > > >>>>>>>> nested_vmx_procbased_ctls_high); @@ -7492,10 +7493,18 @@ > static > > >>>>>>>> void prepare_vmcs02(struct kvm_vcpu *vcpu, > > >>>>>>> struct vmcs12 *vmcs12) > > >>>>>>>> vcpu->arch.cr0_guest_owned_bits &= > > >>>>>>>> ~vmcs12->cr0_guest_host_mask; > vmcs_writel(CR0_GUEST_HOST_MASK, > > >>>>>>> ~vcpu->arch.cr0_guest_owned_bits); > > >>>>>>>> > > >>>>>>>> - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by > > > vmx_set_efer > > >>>>>>> below */ > > >>>>>>>> - vmcs_write32(VM_EXIT_CONTROLS, - > vmcs12->vm_exit_controls | > > >>>>>>>> vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, > > >>>>>>>> vmcs12->vm_entry_controls | + /* L2->L1 exit controls are > > >>>>>>>> emulated - the hardware exit is +to L0 so + * we should use > its > > >>>>>>>> exit controls. Note that IA32_MODE, LOAD_IA32_EFER + * > bits are > > >>>>>>>> further modified by vmx_set_efer() below. + */ > > >>>>>>>> + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); > > >>>>>> This is wrong. We cannot use L0 exit control directly. > > >>>>>> LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, > > > ACK_INTR_ON_EXIT should use host's exit control. But others, still > > > need use (vmcs12|host). > > >>>>>> > > >>>>> I do not see why. We always intercept DR7/PAT/EFER, so save is > > >>>>> emulated too. Host address space size always come from L0 and > > >>>>> preemption timer is not supported for nested IIRC and when it > > >>>>> will be host will have to save it on exit anyway for correct emulation. > > >>>> > > >>>> Preemption timer is already supported and works fine as far as I tested. > > >>>> KVM doesn't use it for L1, so we do not need to save/restore it - IIRC. > > >>>> > > >>> So what happens if L1 configures it to value X after X/2 ticks L0 > > >>> exit happen and L0 gets back to L2 directly. The counter will be X > > >>> again instead of X/2. > > >> > > >> Likely. Yes, we need to improve our emulation by setting "Save > > >> VMX-preemption timer value" or emulate this in software if the > > >> hardware lacks support for it (was this flag introduced after the > > >> preemption timer itself?). > > >> > > > Not sure, but my point was that for correct emulation host needs to > > > set "save preempt timer on vmexit" anyway so all VM_EXIT_CONTROLS are > > > indeed emulated as far as I see. > > > > > Ok, here is my summary, please correct me if I am wrong: > > bit 2: Save debug controls, the first processor only support 1-setting on it, so > just use host's setting is enough > Not because first processor only supported 1-setting, but because L0 > intercepts all changes to DR7 and DEBUGCTL MSR, so L2 cannot change them > behind L0 back. If L1 asks to save them in vmcs12 L0 can do it during > vmexit emulation. > > > bit 9: Host address space size, it indicate the host's state, so must use host's > setting. > > bit 12: Load IA32_PERF_GLOBAL_CTRL: same as above. > Not sure what "above" do you mean. It is fully emulated during vmexit > emulation. We do not want PERF_GLOBAL_CTRL to be loaded during L2->L0 Not understand why PERF_GLOBAL_CTRL/ Load IA32_PAT/ Load IA32_EFER shouldn't be loaded during L2->L0? > vmexit, we want it to be loaded in L1 after L0 emulates L2->L1 vmexit. > But I think there is a bug in current emulation. The code looks like > this: > > if (vmcs12->vm_exit_controls & > VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) > vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, > vmcs12->host_ia32_perf_global_ctrl); > > But GUEST_IA32_PERF_GLOBAL_CTRL will not be loaded during L1 entry unless > VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL is set. Why do we assume it is > set > here? You are right. It seems a bug in current emulation. > > > bit 15 : Acknowledge interrupt on exit: same as above. > Same as bit 9. > > > bit 19: Load IA32_PAT: same as above. > > bit 20: Load IA32_EFER: same as above. > Those two are same as bit 12. > > > bit 18: Save IA32_PAT, Didn't expose it to L1, so use host' setting is ok. > > bit 19: Save IA32_EFER, same as above. > Those two are the same a bit 2. > > > bit 22: Save VMXpreemption timer value, I don't see KVM expose it to L1, but > Jan said it's working. Strange! And according gleb's suggestion, it better to > always set it. > > > It exposed it in nested_vmx_setup_ctls_msrs: > > nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | > PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | > PIN_BASED_VMX_PREEMPTION_TIMER; > According to Gleb's suggestion current emulation is broken and to fix it > the bit will have to be set on each L2 entry if L1 is using preemption timer. > > > So, currently, only use host' exit_control for L2 is enough. > > > > Best regards, > > Yang > > Best regards, Yang -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jul 08, 2013 at 02:28:15PM +0000, Zhang, Yang Z wrote: > > -----Original Message----- > > From: Gleb Natapov [mailto:gleb@redhat.com] > > Sent: Monday, July 08, 2013 8:38 PM > > To: Zhang, Yang Z > > Cc: Jan Kiszka; Paolo Bonzini; Nakajima, Jun; kvm@vger.kernel.org > > Subject: Re: [PATCH v3 01/13] nEPT: Support LOAD_IA32_EFER entry/exit > > controls for L1 > > > > On Thu, Jul 04, 2013 at 08:42:53AM +0000, Zhang, Yang Z wrote: > > > Gleb Natapov wrote on 2013-07-02: > > > > On Tue, Jul 02, 2013 at 05:34:56PM +0200, Jan Kiszka wrote: > > > >> On 2013-07-02 17:15, Gleb Natapov wrote: > > > >>> On Tue, Jul 02, 2013 at 04:28:56PM +0200, Jan Kiszka wrote: > > > >>>> On 2013-07-02 15:59, Gleb Natapov wrote: > > > >>>>> On Tue, Jul 02, 2013 at 03:01:24AM +0000, Zhang, Yang Z wrote: > > > >>>>>> Since this series is pending in mail list for long time. And > > > >>>>>> it's really a big feature for Nested. Also, I doubt the > > > >>>>>> original authors(Jun and Nahav)should not have enough time to > > continue it. > > > >>>>>> So I will pick it up. :) > > > >>>>>> > > > >>>>>> See comments below: > > > >>>>>> > > > >>>>>> Paolo Bonzini wrote on 2013-05-20: > > > >>>>>>> Il 19/05/2013 06:52, Jun Nakajima ha scritto: > > > >>>>>>>> From: Nadav Har'El <nyh@il.ibm.com> > > > >>>>>>>> > > > >>>>>>>> Recent KVM, since > > > >>>>>>>> http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 > > > >>>>>>>> switch the EFER MSR when EPT is used and the host and guest > > have > > > >>>>>>>> different NX bits. So if we add support for nested EPT (L1 guest > > > >>>>>>>> using EPT to run L2) and want to be able to run recent KVM as L1, > > > >>>>>>>> we need to allow L1 to use this EFER switching feature. > > > >>>>>>>> > > > >>>>>>>> To do this EFER switching, KVM uses > > VM_ENTRY/EXIT_LOAD_IA32_EFER > > > >>>>>>>> if available, and if it isn't, it uses the generic > > > >>>>>>>> VM_ENTRY/EXIT_MSR_LOAD. This patch adds support for the > > former > > > >>>>>>>> (the latter is still unsupported). > > > >>>>>>>> > > > >>>>>>>> Nested entry and exit emulation (prepare_vmcs_02 and > > > >>>>>>>> load_vmcs12_host_state, respectively) already handled > > > >>>>>>>> VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all that's left to do > > > >>>>>>>> in this patch is to properly advertise this feature to L1. > > > >>>>>>>> > > > >>>>>>>> Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are > > emulated by > > > >>>>>>>> L0, by using vmx_set_efer (which itself sets one of several > > > >>>>>>>> vmcs02 fields), so we always support this feature, regardless of > > > >>>>>>>> whether the host supports it. > > > >>>>>>>> > > > >>>>>>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > > > >>>>>>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > > > >>>>>>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > > > >>>>>>>> --- > > > >>>>>>>> arch/x86/kvm/vmx.c | 23 ++++++++++++++++------- > > > >>>>>>>> 1 file changed, 16 insertions(+), 7 deletions(-) > > > >>>>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index > > > >>>>>>>> 260a919..fb9cae5 100644 > > > >>>>>>>> --- a/arch/x86/kvm/vmx.c > > > >>>>>>>> +++ b/arch/x86/kvm/vmx.c > > > >>>>>>>> @@ -2192,7 +2192,8 @@ static __init void > > > >>>>>>>> nested_vmx_setup_ctls_msrs(void) #else > > > >>>>>>>> nested_vmx_exit_ctls_high = 0; #endif > > > >>>>>>>> - nested_vmx_exit_ctls_high |= > > VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; > > > >>>>>>>> + nested_vmx_exit_ctls_high |= > > (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR > > > >>>>>>>> | + VM_EXIT_LOAD_IA32_EFER); > > > >>>>>>>> > > > >>>>>>>> /* entry controls */ > > > >>>>>>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2201,8 +2202,8 > > > > @@ static > > > >>>>>>>> __init void nested_vmx_setup_ctls_msrs(void) > > > >>>>>>>> nested_vmx_entry_ctls_low = > > VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; > > > >>>>>>>> nested_vmx_entry_ctls_high &= > > VM_ENTRY_LOAD_IA32_PAT | > > > >>>>>>>> VM_ENTRY_IA32E_MODE; > > > >>>>>>>> - nested_vmx_entry_ctls_high |= > > > >>>>>>>> VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - > > > >>>>>>>> + nested_vmx_entry_ctls_high |= > > > >>>>>>>> (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + > > > >>>>>>>> VM_ENTRY_LOAD_IA32_EFER); > > > >>>>>>>> /* cpu-based controls */ > > > >>>>>>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, > > > >>>>>>>> nested_vmx_procbased_ctls_low, > > > >>>>>>>> nested_vmx_procbased_ctls_high); @@ -7492,10 +7493,18 @@ > > static > > > >>>>>>>> void prepare_vmcs02(struct kvm_vcpu *vcpu, > > > >>>>>>> struct vmcs12 *vmcs12) > > > >>>>>>>> vcpu->arch.cr0_guest_owned_bits &= > > > >>>>>>>> ~vmcs12->cr0_guest_host_mask; > > vmcs_writel(CR0_GUEST_HOST_MASK, > > > >>>>>>> ~vcpu->arch.cr0_guest_owned_bits); > > > >>>>>>>> > > > >>>>>>>> - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by > > > > vmx_set_efer > > > >>>>>>> below */ > > > >>>>>>>> - vmcs_write32(VM_EXIT_CONTROLS, - > > vmcs12->vm_exit_controls | > > > >>>>>>>> vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, > > > >>>>>>>> vmcs12->vm_entry_controls | + /* L2->L1 exit controls are > > > >>>>>>>> emulated - the hardware exit is +to L0 so + * we should use > > its > > > >>>>>>>> exit controls. Note that IA32_MODE, LOAD_IA32_EFER + * > > bits are > > > >>>>>>>> further modified by vmx_set_efer() below. + */ > > > >>>>>>>> + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); > > > >>>>>> This is wrong. We cannot use L0 exit control directly. > > > >>>>>> LOAD_PERF_GLOBAL_CTRL, LOAD_HOST_EFE, LOAD_HOST_PAT, > > > > ACK_INTR_ON_EXIT should use host's exit control. But others, still > > > > need use (vmcs12|host). > > > >>>>>> > > > >>>>> I do not see why. We always intercept DR7/PAT/EFER, so save is > > > >>>>> emulated too. Host address space size always come from L0 and > > > >>>>> preemption timer is not supported for nested IIRC and when it > > > >>>>> will be host will have to save it on exit anyway for correct emulation. > > > >>>> > > > >>>> Preemption timer is already supported and works fine as far as I tested. > > > >>>> KVM doesn't use it for L1, so we do not need to save/restore it - IIRC. > > > >>>> > > > >>> So what happens if L1 configures it to value X after X/2 ticks L0 > > > >>> exit happen and L0 gets back to L2 directly. The counter will be X > > > >>> again instead of X/2. > > > >> > > > >> Likely. Yes, we need to improve our emulation by setting "Save > > > >> VMX-preemption timer value" or emulate this in software if the > > > >> hardware lacks support for it (was this flag introduced after the > > > >> preemption timer itself?). > > > >> > > > > Not sure, but my point was that for correct emulation host needs to > > > > set "save preempt timer on vmexit" anyway so all VM_EXIT_CONTROLS are > > > > indeed emulated as far as I see. > > > > > > > Ok, here is my summary, please correct me if I am wrong: > > > bit 2: Save debug controls, the first processor only support 1-setting on it, so > > just use host's setting is enough > > Not because first processor only supported 1-setting, but because L0 > > intercepts all changes to DR7 and DEBUGCTL MSR, so L2 cannot change them > > behind L0 back. If L1 asks to save them in vmcs12 L0 can do it during > > vmexit emulation. > > > > > bit 9: Host address space size, it indicate the host's state, so must use host's > > setting. > > > bit 12: Load IA32_PERF_GLOBAL_CTRL: same as above. > > Not sure what "above" do you mean. It is fully emulated during vmexit > > emulation. We do not want PERF_GLOBAL_CTRL to be loaded during L2->L0 > Not understand why PERF_GLOBAL_CTRL/ Load IA32_PAT/ Load IA32_EFER shouldn't be loaded during L2->L0? > Because L0 didn't ask them to be loaded and values that would be loaded are L1 values, not L0's. If L0 requests loading then they should be loaded, but we should be careful to put L0's values in vmcs02. > > vmexit, we want it to be loaded in L1 after L0 emulates L2->L1 vmexit. > > But I think there is a bug in current emulation. The code looks like > > this: > > > > if (vmcs12->vm_exit_controls & > > VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) > > vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, > > vmcs12->host_ia32_perf_global_ctrl); > > > > But GUEST_IA32_PERF_GLOBAL_CTRL will not be loaded during L1 entry unless > > VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL is set. Why do we assume it is > > set > > here? > You are right. It seems a bug in current emulation. > > > > > > bit 15 : Acknowledge interrupt on exit: same as above. > > Same as bit 9. > > > > > bit 19: Load IA32_PAT: same as above. > > > bit 20: Load IA32_EFER: same as above. > > Those two are same as bit 12. > > > > > bit 18: Save IA32_PAT, Didn't expose it to L1, so use host' setting is ok. > > > bit 19: Save IA32_EFER, same as above. > > Those two are the same a bit 2. > > > > > bit 22: Save VMXpreemption timer value, I don't see KVM expose it to L1, but > > Jan said it's working. Strange! And according gleb's suggestion, it better to > > always set it. > > > > > It exposed it in nested_vmx_setup_ctls_msrs: > > > > nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | > > PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | > > PIN_BASED_VMX_PREEMPTION_TIMER; > > According to Gleb's suggestion current emulation is broken and to fix it > > the bit will have to be set on each L2 entry if L1 is using preemption timer. > > > > > So, currently, only use host' exit_control for L2 is enough. > > > > > > Best regards, > > > Yang > > > > > Best regards, > Yang -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..fb9cae5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) #else nested_vmx_exit_ctls_high = 0; #endif - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; - nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - + nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + VM_ENTRY_LOAD_IA32_EFER); /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); @@ -7492,10 +7493,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER + * bits are further modified by vmx_set_efer() below. + */ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are + * emulated by vmx_set_efer(), below. + */ + vmcs_write32(VM_ENTRY_CONTROLS, + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & + ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)