
[v2,10/18] KVM: nVMX: split pieces of prepare_vmcs02() to prepare_vmcs02_early()

Message ID 20180828160459.14093-11-sean.j.christopherson@intel.com (mailing list archive)
State New, archived
Series KVM: nVMX: add option to perform early consistency checks via H/W

Commit Message

Sean Christopherson Aug. 28, 2018, 4:04 p.m. UTC
Add prepare_vmcs02_early() and move pieces of prepare_vmcs02() to the
new function.  prepare_vmcs02_early() writes the bits of vmcs02 that
a) must be in place to pass the VMFail consistency checks (assuming
vmcs12 is valid) and b) are needed to recover from a VMExit, e.g. host
state that is loaded on VMExit.  Splitting the functionality will
enable KVM to leverage hardware to do VMFail consistency checks via
a dry run of VMEnter and recover from a potential VMExit without
having to fully initialize vmcs02.

Add prepare_vmcs02_first_run() to handle writing vmcs02 state that
comes from vmcs01 and never changes, i.e. we don't need to rewrite
any of the vmcs02 that is effectively constant once defined.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/kvm/vmx.c | 410 +++++++++++++++++++++++++--------------------
 1 file changed, 225 insertions(+), 185 deletions(-)
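
For reference, the new setup is layered: prepare_vmcs02_early() writes the
per-entry state and calls prepare_vmcs02_early_full() only when vmcs12 is
dirty, which in turn calls prepare_vmcs02_first_run() only the first time
vmcs02 is used.  A condensed sketch of the layering (details elided; see the
full diff below for the actual code):

  static void prepare_vmcs02_first_run(struct vcpu_vmx *vmx)
  {
          if (vmx->nested.vmcs02.launched)
                  return;

          /* vmcs02 state that is constant once written, e.g. the MSR
           * load/store list addresses and constant host state. */
  }

  static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
                                        struct vmcs12 *vmcs12)
  {
          prepare_vmcs02_first_run(vmx);

          /* State that only changes when vmcs12 is dirty, e.g.
           * VMCS_LINK_POINTER and the VPID. */
  }

  static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
  {
          if (vmx->nested.dirty_vmcs12)
                  prepare_vmcs02_early_full(vmx, vmcs12);

          /* Per-entry state: pin/exec/secondary/entry/exit controls and
           * event injection fields. */
  }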

Comments

Jim Mattson Sept. 20, 2018, 7:22 p.m. UTC | #1
On Tue, Aug 28, 2018 at 9:04 AM, Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
> Add prepare_vmcs02_early() and move pieces of prepare_vmcs02() to the
> new function.  prepare_vmcs02_early() writes the bits of vmcs02 that
> a) must be in place to pass the VMFail consistency checks (assuming
> vmcs12 is valid) and b) are needed to recover from a VMExit, e.g. host
> state that is loaded on VMExit.  Splitting the functionality will
> enable KVM to leverage hardware to do VMFail consistency checks via
> a dry run of VMEnter and recover from a potential VMExit without
> having to fully initialize vmcs02.
>
> Add prepare_vmcs02_first_run() to handle writing vmcs02 state that
> comes from vmcs01 and never changes, i.e. we don't need to rewrite
> any of the vmcs02 that is effectively constant once defined.
>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> ---
>  arch/x86/kvm/vmx.c | 410 +++++++++++++++++++++++++--------------------
>  1 file changed, 225 insertions(+), 185 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index cb8df73e9b49..492dc154c31e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -11904,10 +11904,229 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
>                 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
>  }
>
> -static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> +static void prepare_vmcs02_first_run(struct vcpu_vmx *vmx)
>  {
> -       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       /*
> +        * If we have never launched vmcs02, set the constant vmcs02 state
> +        * according to L0's settings (vmcs12 is irrelevant here).  Host
> +        * fields that come from L0 and are not constant, e.g. HOST_CR3,
> +        * will be set as needed prior to VMLAUNCH/VMRESUME.
> +        */
> +       if (vmx->nested.vmcs02.launched)
> +               return;

The launched flag gets cleared on logical CPU migration, but we don't
actually have to execute this code again. Should we introduce a new
boolean?

> +       /* All VMFUNCs are currently emulated through L0 vmexits.  */
> +       if (cpu_has_vmx_vmfunc())
> +               vmcs_write64(VM_FUNCTION_CONTROL, 0);
> +
> +       if (cpu_has_vmx_posted_intr())
> +               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
> +
> +       if (cpu_has_vmx_msr_bitmap())
> +               vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
> +
> +       if (enable_pml)
> +               vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));

Was the ASSERT(vmx->pml_pg) deliberately dropped?

> +       /*
> +        * Set the MSR load/store lists to match L0's settings.  Only the
> +        * addresses are constant (for vmcs02), the counts can change based
> +        * on L2's behavior, e.g. switching to/from long mode.
> +        */
> +       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
> +       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
> +       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
> +
> +       vmx_set_constant_host_state(vmx);
> +}
> +
> +static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
> +                                     struct vmcs12 *vmcs12)
> +{
> +       prepare_vmcs02_first_run(vmx);
> +
> +       vmcs_write64(VMCS_LINK_POINTER, -1ull);
> +
> +       if (enable_vpid) {
> +               if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
> +                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
> +               else
> +                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
> +       }
> +}
> +
> +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
> +{
> +       u32 exec_control, vmcs12_exec_ctrl;
> +       u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
> +
> +       if (vmx->nested.dirty_vmcs12)
> +               prepare_vmcs02_early_full(vmx, vmcs12);
> +
> +       /*
> +        * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
> +        * entry, but only if the current (host) sp changed from the value
> +        * we wrote last (vmx->host_rsp).  This cache is no longer relevant
> +        * if we switch vmcs, and rather than hold a separate cache per vmcs,
> +        * here we just force the write to happen on entry.
> +        */
> +       vmx->host_rsp = 0;
> +
> +       /*
> +        * PIN CONTROLS
> +        */
> +       exec_control = vmcs12->pin_based_vm_exec_control;
> +
> +       /* Preemption timer setting is only taken from vmcs01.  */
> +       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
> +       exec_control |= vmcs_config.pin_based_exec_ctrl;
> +       if (vmx->hv_deadline_tsc == -1)
> +               exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
> +
> +       /* Posted interrupts setting is only taken from vmcs12.  */
> +       if (nested_cpu_has_posted_intr(vmcs12)) {
> +               vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
> +               vmx->nested.pi_pending = false;
> +       } else {
> +               exec_control &= ~PIN_BASED_POSTED_INTR;
> +       }
> +       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
> +
> +       /*
> +        * EXEC CONTROLS
> +        */
> +       exec_control = vmx_exec_control(vmx); /* L0's desires */
> +       exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +       exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> +       exec_control &= ~CPU_BASED_TPR_SHADOW;
> +       exec_control |= vmcs12->cpu_based_vm_exec_control;
> +
> +       /*
> +        * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
> +        * nested_get_vmcs12_pages can't fix it up, the illegal value
> +        * will result in a VM entry failure.
> +        */
> +       if (exec_control & CPU_BASED_TPR_SHADOW) {
> +               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
> +               vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
> +       } else {
> +#ifdef CONFIG_X86_64
> +               exec_control |= CPU_BASED_CR8_LOAD_EXITING |
> +                               CPU_BASED_CR8_STORE_EXITING;
> +#endif
> +       }
> +
> +       /*
> +        * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
> +        * for I/O port accesses.
> +        */
> +       exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
> +       exec_control |= CPU_BASED_UNCOND_IO_EXITING;
> +       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
> +
> +       /*
> +        * SECONDARY EXEC CONTROLS
> +        */
> +       if (cpu_has_secondary_exec_ctrls()) {
> +               exec_control = vmx->secondary_exec_control;
> +
> +               /* Take the following fields only from vmcs12 */
> +               exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> +                                 SECONDARY_EXEC_ENABLE_INVPCID |
> +                                 SECONDARY_EXEC_RDTSCP |
> +                                 SECONDARY_EXEC_XSAVES |
> +                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
> +                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
> +                                 SECONDARY_EXEC_ENABLE_VMFUNC);
> +               if (nested_cpu_has(vmcs12,
> +                                  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
> +                       vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
> +                               ~SECONDARY_EXEC_ENABLE_PML;
> +                       exec_control |= vmcs12_exec_ctrl;
> +               }
> +
> +               /* VMCS shadowing for L2 is emulated for now */
> +               exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
> +
> +               if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
> +                       vmcs_write16(GUEST_INTR_STATUS,
> +                               vmcs12->guest_intr_status);
> +
> +               /*
> +                * Write an illegal value to APIC_ACCESS_ADDR. Later,
> +                * nested_get_vmcs12_pages will either fix it up or
> +                * remove the VM execution control.
> +                */
> +               if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
> +                       vmcs_write64(APIC_ACCESS_ADDR, -1ull);
> +
> +               if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
> +                       vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
> +
> +               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> +       }
> +
> +       /*
> +        * ENTRY CONTROLS
> +        *
> +        * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
> +        * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
> +        * on the related bits (if supported by the CPU) in the hope that
> +        * we can avoid VMWrites during vmx_set_efer().
> +        */
> +       exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
> +                       ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
> +       if (cpu_has_load_ia32_efer) {
> +               if (guest_efer & EFER_LMA)
> +                       exec_control |= VM_ENTRY_IA32E_MODE;
> +               if (guest_efer != host_efer)
> +                       exec_control |= VM_ENTRY_LOAD_IA32_EFER;
> +       }
> +       vm_entry_controls_init(vmx, exec_control);
> +
> +       /*
> +        * EXIT CONTROLS
> +        *
> +        * L2->L1 exit controls are emulated - the hardware exit is to L0 so
> +        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
> +        * bits may be modified by vmx_set_efer() in prepare_vmcs02().
> +        */
> +       exec_control = vmcs_config.vmexit_ctrl;

> +       if (cpu_has_load_ia32_efer && guest_efer != host_efer)
> +               exec_control |= VM_EXIT_LOAD_IA32_EFER;

This code looks new. I think it's fine, but it doesn't appear to be
part of the refactoring.

Reviewed-by: Jim Mattson <jmattson@google.com>
Sean Christopherson Sept. 25, 2018, 4:56 p.m. UTC | #2
On Thu, 2018-09-20 at 12:22 -0700, Jim Mattson wrote:
> On Tue, Aug 28, 2018 at 9:04 AM, Sean Christopherson
> <sean.j.christopherson@intel.com> wrote:
> > 
> > Add prepare_vmcs02_early() and move pieces of prepare_vmcs02() to the
> > new function.  prepare_vmcs02_early() writes the bits of vmcs02 that
> > a) must be in place to pass the VMFail consistency checks (assuming
> > vmcs12 is valid) and b) are needed to recover from a VMExit, e.g. host
> > state that is loaded on VMExit.  Splitting the functionality will
> > enable KVM to leverage hardware to do VMFail consistency checks via
> > a dry run of VMEnter and recover from a potential VMExit without
> > having to fully initialize vmcs02.
> > 
> > Add prepare_vmcs02_first_run() to handle writing vmcs02 state that
> > comes from vmcs01 and never changes, i.e. we don't need to rewrite
> > any of the vmcs02 that is effectively constant once defined.
> > 
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > ---
> >  arch/x86/kvm/vmx.c | 410 +++++++++++++++++++++++++--------------------
> >  1 file changed, 225 insertions(+), 185 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index cb8df73e9b49..492dc154c31e 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -11904,10 +11904,229 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
> >                 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
> >  }
> > 
> > -static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> > +static void prepare_vmcs02_first_run(struct vcpu_vmx *vmx)
> >  {
> > -       struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +       /*
> > +        * If we have never launched vmcs02, set the constant vmcs02 state
> > +        * according to L0's settings (vmcs12 is irrelevant here).  Host
> > +        * fields that come from L0 and are not constant, e.g. HOST_CR3,
> > +        * will be set as needed prior to VMLAUNCH/VMRESUME.
> > +        */
> > +       if (vmx->nested.vmcs02.launched)
> > +               return;
> The launched flag gets cleared on logical CPU migration, but we don't
> actually have to execute this code again. Should we introduce a new
> boolean?

Oooh, good point.  I think I'll add that optimization in a separate
patch to make it easier to bisect in case there's something wrong
with the optimization.
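
A rough sketch of what such a flag could look like (hypothetical names, not
part of this series), tracking one-time initialization separately from the
launched flag so that logical CPU migration doesn't force a rewrite:

  static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
  {
          /* Hypothetical flag: set once the constant vmcs02 state has been
           * written; unlike vmcs02.launched it would never be cleared on
           * logical CPU migration. */
          if (vmx->nested.vmcs02_initialized)
                  return;
          vmx->nested.vmcs02_initialized = true;

          /* ... write the constant vmcs02 state as in
           * prepare_vmcs02_first_run() above ... */
  }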

> > 
> > +       /* All VMFUNCs are currently emulated through L0 vmexits.  */
> > +       if (cpu_has_vmx_vmfunc())
> > +               vmcs_write64(VM_FUNCTION_CONTROL, 0);
> > +
> > +       if (cpu_has_vmx_posted_intr())
> > +               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
> > +
> > +       if (cpu_has_vmx_msr_bitmap())
> > +               vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
> > +
> > +       if (enable_pml)
> > +               vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
> Was the ASSERT(vmx->pml_pg) deliberately dropped?

Yes.  I'll move it to a separate patch and drop the other instance
of the assert from vmx_vcpu_setup().
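
For illustration only (hypothetical placement, assuming the separate patch
keeps a single assertion next to the one remaining PML_ADDRESS write):

  if (enable_pml) {
          ASSERT(vmx->pml_pg);
          vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
  }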

> > 
> > +       /*
> > +        * Set the MSR load/store lists to match L0's settings.  Only the
> > +        * addresses are constant (for vmcs02), the counts can change based
> > +        * on L2's behavior, e.g. switching to/from long mode.
> > +        */
> > +       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
> > +       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
> > +       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
> > +
> > +       vmx_set_constant_host_state(vmx);
> > +}
> > +
> > +static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
> > +                                     struct vmcs12 *vmcs12)
> > +{
> > +       prepare_vmcs02_first_run(vmx);
> > +
> > +       vmcs_write64(VMCS_LINK_POINTER, -1ull);
> > +
> > +       if (enable_vpid) {
> > +               if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
> > +                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
> > +               else
> > +                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
> > +       }
> > +}
> > +
> > +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
> > +{
> > +       u32 exec_control, vmcs12_exec_ctrl;
> > +       u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
> > +
> > +       if (vmx->nested.dirty_vmcs12)
> > +               prepare_vmcs02_early_full(vmx, vmcs12);
> > +
> > +       /*
> > +        * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
> > +        * entry, but only if the current (host) sp changed from the value
> > +        * we wrote last (vmx->host_rsp).  This cache is no longer relevant
> > +        * if we switch vmcs, and rather than hold a separate cache per vmcs,
> > +        * here we just force the write to happen on entry.
> > +        */
> > +       vmx->host_rsp = 0;
> > +
> > +       /*
> > +        * PIN CONTROLS
> > +        */
> > +       exec_control = vmcs12->pin_based_vm_exec_control;
> > +
> > +       /* Preemption timer setting is only taken from vmcs01.  */
> > +       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
> > +       exec_control |= vmcs_config.pin_based_exec_ctrl;
> > +       if (vmx->hv_deadline_tsc == -1)
> > +               exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
> > +
> > +       /* Posted interrupts setting is only taken from vmcs12.  */
> > +       if (nested_cpu_has_posted_intr(vmcs12)) {
> > +               vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
> > +               vmx->nested.pi_pending = false;
> > +       } else {
> > +               exec_control &= ~PIN_BASED_POSTED_INTR;
> > +       }
> > +       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
> > +
> > +       /*
> > +        * EXEC CONTROLS
> > +        */
> > +       exec_control = vmx_exec_control(vmx); /* L0's desires */
> > +       exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> > +       exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> > +       exec_control &= ~CPU_BASED_TPR_SHADOW;
> > +       exec_control |= vmcs12->cpu_based_vm_exec_control;
> > +
> > +       /*
> > +        * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
> > +        * nested_get_vmcs12_pages can't fix it up, the illegal value
> > +        * will result in a VM entry failure.
> > +        */
> > +       if (exec_control & CPU_BASED_TPR_SHADOW) {
> > +               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
> > +               vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
> > +       } else {
> > +#ifdef CONFIG_X86_64
> > +               exec_control |= CPU_BASED_CR8_LOAD_EXITING |
> > +                               CPU_BASED_CR8_STORE_EXITING;
> > +#endif
> > +       }
> > +
> > +       /*
> > +        * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
> > +        * for I/O port accesses.
> > +        */
> > +       exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
> > +       exec_control |= CPU_BASED_UNCOND_IO_EXITING;
> > +       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
> > +
> > +       /*
> > +        * SECONDARY EXEC CONTROLS
> > +        */
> > +       if (cpu_has_secondary_exec_ctrls()) {
> > +               exec_control = vmx->secondary_exec_control;
> > +
> > +               /* Take the following fields only from vmcs12 */
> > +               exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> > +                                 SECONDARY_EXEC_ENABLE_INVPCID |
> > +                                 SECONDARY_EXEC_RDTSCP |
> > +                                 SECONDARY_EXEC_XSAVES |
> > +                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
> > +                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
> > +                                 SECONDARY_EXEC_ENABLE_VMFUNC);
> > +               if (nested_cpu_has(vmcs12,
> > +                                  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
> > +                       vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
> > +                               ~SECONDARY_EXEC_ENABLE_PML;
> > +                       exec_control |= vmcs12_exec_ctrl;
> > +               }
> > +
> > +               /* VMCS shadowing for L2 is emulated for now */
> > +               exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
> > +
> > +               if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
> > +                       vmcs_write16(GUEST_INTR_STATUS,
> > +                               vmcs12->guest_intr_status);
> > +
> > +               /*
> > +                * Write an illegal value to APIC_ACCESS_ADDR. Later,
> > +                * nested_get_vmcs12_pages will either fix it up or
> > +                * remove the VM execution control.
> > +                */
> > +               if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
> > +                       vmcs_write64(APIC_ACCESS_ADDR, -1ull);
> > +
> > +               if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
> > +                       vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
> > +
> > +               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> > +       }
> > +
> > +       /*
> > +        * ENTRY CONTROLS
> > +        *
> > +        * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
> > +        * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
> > +        * on the related bits (if supported by the CPU) in the hope that
> > +        * we can avoid VMWrites during vmx_set_efer().
> > +        */
> > +       exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
> > +                       ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
> > +       if (cpu_has_load_ia32_efer) {
> > +               if (guest_efer & EFER_LMA)
> > +                       exec_control |= VM_ENTRY_IA32E_MODE;
> > +               if (guest_efer != host_efer)
> > +                       exec_control |= VM_ENTRY_LOAD_IA32_EFER;
> > +       }
> > +       vm_entry_controls_init(vmx, exec_control);
> > +
> > +       /*
> > +        * EXIT CONTROLS
> > +        *
> > +        * L2->L1 exit controls are emulated - the hardware exit is to L0 so
> > +        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
> > +        * bits may be modified by vmx_set_efer() in prepare_vmcs02().
> > +        */
> > +       exec_control = vmcs_config.vmexit_ctrl;
> > 
> > +       if (cpu_has_load_ia32_efer && guest_efer != host_efer)
> > +               exec_control |= VM_EXIT_LOAD_IA32_EFER;
> This code looks new. I think it's fine, but it doesn't appear to be
> part of the refactoring.

Doh, this should have been introduced in patch 6/18, "KVM: nVMX: try to set
EFER bits correctly when init'ing entry controls".

> Reviewed-by: Jim Mattson <jmattson@google.com>

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cb8df73e9b49..492dc154c31e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11904,10 +11904,229 @@  static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
 }
 
-static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static void prepare_vmcs02_first_run(struct vcpu_vmx *vmx)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	/*
+	 * If we have never launched vmcs02, set the constant vmcs02 state
+	 * according to L0's settings (vmcs12 is irrelevant here).  Host
+	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
+	 * will be set as needed prior to VMLAUNCH/VMRESUME.
+	 */
+	if (vmx->nested.vmcs02.launched)
+		return;
 
+	/* All VMFUNCs are currently emulated through L0 vmexits.  */
+	if (cpu_has_vmx_vmfunc())
+		vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+	if (cpu_has_vmx_posted_intr())
+		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+
+	if (enable_pml)
+		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+
+	/*
+	 * Set the MSR load/store lists to match L0's settings.  Only the
+	 * addresses are constant (for vmcs02), the counts can change based
+	 * on L2's behavior, e.g. switching to/from long mode.
+	 */
+	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
+
+	vmx_set_constant_host_state(vmx);
+}
+
+static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
+				      struct vmcs12 *vmcs12)
+{
+	prepare_vmcs02_first_run(vmx);
+
+	vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+	if (enable_vpid) {
+		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
+			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+		else
+			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+	}
+}
+
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+	u32 exec_control, vmcs12_exec_ctrl;
+	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
+
+	if (vmx->nested.dirty_vmcs12)
+		prepare_vmcs02_early_full(vmx, vmcs12);
+
+	/*
+	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+	 * entry, but only if the current (host) sp changed from the value
+	 * we wrote last (vmx->host_rsp).  This cache is no longer relevant
+	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
+	 * here we just force the write to happen on entry.
+	 */
+	vmx->host_rsp = 0;
+
+	/*
+	 * PIN CONTROLS
+	 */
+	exec_control = vmcs12->pin_based_vm_exec_control;
+
+	/* Preemption timer setting is only taken from vmcs01.  */
+	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+	exec_control |= vmcs_config.pin_based_exec_ctrl;
+	if (vmx->hv_deadline_tsc == -1)
+		exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+	/* Posted interrupts setting is only taken from vmcs12.  */
+	if (nested_cpu_has_posted_intr(vmcs12)) {
+		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+		vmx->nested.pi_pending = false;
+	} else {
+		exec_control &= ~PIN_BASED_POSTED_INTR;
+	}
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
+
+	/*
+	 * EXEC CONTROLS
+	 */
+	exec_control = vmx_exec_control(vmx); /* L0's desires */
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+	exec_control |= vmcs12->cpu_based_vm_exec_control;
+
+	/*
+	 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+	 * nested_get_vmcs12_pages can't fix it up, the illegal value
+	 * will result in a VM entry failure.
+	 */
+	if (exec_control & CPU_BASED_TPR_SHADOW) {
+		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+	} else {
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+				CPU_BASED_CR8_STORE_EXITING;
+#endif
+	}
+
+	/*
+	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+	 * for I/O port accesses.
+	 */
+	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	/*
+	 * SECONDARY EXEC CONTROLS
+	 */
+	if (cpu_has_secondary_exec_ctrls()) {
+		exec_control = vmx->secondary_exec_control;
+
+		/* Take the following fields only from vmcs12 */
+		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+				  SECONDARY_EXEC_ENABLE_INVPCID |
+				  SECONDARY_EXEC_RDTSCP |
+				  SECONDARY_EXEC_XSAVES |
+				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				  SECONDARY_EXEC_ENABLE_VMFUNC);
+		if (nested_cpu_has(vmcs12,
+				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
+			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
+				~SECONDARY_EXEC_ENABLE_PML;
+			exec_control |= vmcs12_exec_ctrl;
+		}
+
+		/* VMCS shadowing for L2 is emulated for now */
+		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+			vmcs_write16(GUEST_INTR_STATUS,
+				vmcs12->guest_intr_status);
+
+		/*
+		 * Write an illegal value to APIC_ACCESS_ADDR. Later,
+		 * nested_get_vmcs12_pages will either fix it up or
+		 * remove the VM execution control.
+		 */
+		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+			vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+
+		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+			vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+	}
+
+	/*
+	 * ENTRY CONTROLS
+	 *
+	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
+	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
+	 * on the related bits (if supported by the CPU) in the hope that
+	 * we can avoid VMWrites during vmx_set_efer().
+	 */
+	exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
+			~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+	if (cpu_has_load_ia32_efer) {
+		if (guest_efer & EFER_LMA)
+			exec_control |= VM_ENTRY_IA32E_MODE;
+		if (guest_efer != host_efer)
+			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
+	}
+	vm_entry_controls_init(vmx, exec_control);
+
+	/*
+	 * EXIT CONTROLS
+	 *
+	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
+	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
+	 */
+	exec_control = vmcs_config.vmexit_ctrl;
+	if (cpu_has_load_ia32_efer && guest_efer != host_efer)
+		exec_control |= VM_EXIT_LOAD_IA32_EFER;
+	vm_exit_controls_init(vmx, exec_control);
+
+	/*
+	 * Conceptually we want to copy the PML address and index from
+	 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+	 * since we always flush the log on each vmexit and never change
+	 * the PML address (once set), this happens to be equivalent to
+	 * simply resetting the index in vmcs02.
+	 */
+	if (enable_pml)
+		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+
+	/*
+	 * Interrupt/Exception Fields
+	 */
+	if (vmx->nested.nested_run_pending) {
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			     vmcs12->vm_entry_intr_info_field);
+		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+			     vmcs12->vm_entry_exception_error_code);
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmcs12->vm_entry_instruction_len);
+		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+			     vmcs12->guest_interruptibility_info);
+		vmx->loaded_vmcs->nmi_known_unmasked =
+			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
+	} else {
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+	}
+}
+
+static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
 	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
@@ -11948,10 +12167,6 @@  static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	if (nested_cpu_has_xsaves(vmcs12))
 		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
-	vmcs_write64(VMCS_LINK_POINTER, -1ull);
-
-	if (cpu_has_vmx_posted_intr())
-		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
 
 	/*
 	 * Whether page-faults are trapped is determined by a combination of
@@ -11972,10 +12187,6 @@  static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
 		enable_ept ? vmcs12->page_fault_error_code_match : 0);
 
-	/* All VMFUNCs are currently emulated through L0 vmexits.  */
-	if (cpu_has_vmx_vmfunc())
-		vmcs_write64(VM_FUNCTION_CONTROL, 0);
-
 	if (cpu_has_vmx_apicv()) {
 		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
 		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
@@ -11983,36 +12194,14 @@  static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
 	}
 
-	/*
-	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
-	 * Some constant fields are set here by vmx_set_constant_host_state().
-	 * Other fields are different per CPU, and will be set later when
-	 * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
-	 * is called.
-	 */
-	vmx_set_constant_host_state(vmx);
-
-	/*
-	 * Set the MSR load/store lists to match L0's settings.
-	 */
-	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
-	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
-	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
 	set_cr4_guest_host_mask(vmx);
 
 	if (vmx_mpx_supported())
 		vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
 
-	if (enable_vpid) {
-		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
-			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
-		else
-			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-	}
-
 	/*
 	 * L1 may access the L2's PDPTR, so save them to construct vmcs12
 	 */
@@ -12022,9 +12211,6 @@  static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
 		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
 	}
-
-	if (cpu_has_vmx_msr_bitmap())
-		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
 }
 
 /*
@@ -12042,11 +12228,9 @@  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			  u32 *entry_failure_code)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 exec_control, vmcs12_exec_ctrl;
-	u64 guest_efer;
 
 	if (vmx->nested.dirty_vmcs12) {
-		prepare_vmcs02_full(vcpu, vmcs12);
+		prepare_vmcs02_full(vmx, vmcs12);
 		vmx->nested.dirty_vmcs12 = false;
 	}
 
@@ -12069,122 +12253,12 @@  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
 	}
-	if (vmx->nested.nested_run_pending) {
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			     vmcs12->vm_entry_intr_info_field);
-		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-			     vmcs12->vm_entry_exception_error_code);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-			     vmcs12->vm_entry_instruction_len);
-		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-			     vmcs12->guest_interruptibility_info);
-		vmx->loaded_vmcs->nmi_known_unmasked =
-			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
-	} else {
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
-	}
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 
-	exec_control = vmcs12->pin_based_vm_exec_control;
-
-	/* Preemption timer setting is only taken from vmcs01.  */
-	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-	exec_control |= vmcs_config.pin_based_exec_ctrl;
-	if (vmx->hv_deadline_tsc == -1)
-		exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-
-	/* Posted interrupts setting is only taken from vmcs12.  */
-	if (nested_cpu_has_posted_intr(vmcs12)) {
-		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
-		vmx->nested.pi_pending = false;
-	} else {
-		exec_control &= ~PIN_BASED_POSTED_INTR;
-	}
-
-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
-
 	vmx->nested.preemption_timer_expired = false;
 	if (nested_cpu_has_preemption_timer(vmcs12))
 		vmx_start_preemption_timer(vcpu);
 
-	if (cpu_has_secondary_exec_ctrls()) {
-		exec_control = vmx->secondary_exec_control;
-
-		/* Take the following fields only from vmcs12 */
-		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-				  SECONDARY_EXEC_ENABLE_INVPCID |
-				  SECONDARY_EXEC_RDTSCP |
-				  SECONDARY_EXEC_XSAVES |
-				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
-				  SECONDARY_EXEC_ENABLE_VMFUNC);
-		if (nested_cpu_has(vmcs12,
-				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
-			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
-				~SECONDARY_EXEC_ENABLE_PML;
-			exec_control |= vmcs12_exec_ctrl;
-		}
-
-		/* VMCS shadowing for L2 is emulated for now */
-		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
-
-		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
-			vmcs_write16(GUEST_INTR_STATUS,
-				vmcs12->guest_intr_status);
-
-		/*
-		 * Write an illegal value to APIC_ACCESS_ADDR. Later,
-		 * nested_get_vmcs12_pages will either fix it up or
-		 * remove the VM execution control.
-		 */
-		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
-			vmcs_write64(APIC_ACCESS_ADDR, -1ull);
-
-		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
-			vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
-
-		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
-	}
-
-	/*
-	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
-	 * entry, but only if the current (host) sp changed from the value
-	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
-	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
-	 * here we just force the write to happen on entry.
-	 */
-	vmx->host_rsp = 0;
-
-	exec_control = vmx_exec_control(vmx); /* L0's desires */
-	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
-	exec_control &= ~CPU_BASED_TPR_SHADOW;
-	exec_control |= vmcs12->cpu_based_vm_exec_control;
-
-	/*
-	 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
-	 * nested_get_vmcs12_pages can't fix it up, the illegal value
-	 * will result in a VM entry failure.
-	 */
-	if (exec_control & CPU_BASED_TPR_SHADOW) {
-		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
-		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
-	} else {
-#ifdef CONFIG_X86_64
-		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
-				CPU_BASED_CR8_STORE_EXITING;
-#endif
-	}
-
-	/*
-	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
-	 * for I/O port accesses.
-	 */
-	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
-	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
-
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
-
 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
 	 * bitwise-or of what L1 wants to trap for L2, and what we want to
 	 * trap. Note that CR0.TS also needs updating - we do this later.
@@ -12193,30 +12267,6 @@  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-	/* L2->L1 exit controls are emulated - the hardware exit is to L0 so
-	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
-	 * bits are further modified by vmx_set_efer() below.
-	 */
-	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
-
-	/*
-	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
-	 * are emulated by vmx_set_efer(), below, but speculate on the
-	 * related bits (if supported by the CPU) in the hope that we can
-	 * avoid VMWrites during vmx_set_efer().
-	 */
-	exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
-			~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
-
-	guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
-	if (cpu_has_load_ia32_efer) {
-		if (guest_efer & EFER_LMA)
-			exec_control |= VM_ENTRY_IA32E_MODE;
-		if (guest_efer != host_efer)
-			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
-	}
-	vm_entry_controls_init(vmx, exec_control);
-
 	if (vmx->nested.nested_run_pending &&
 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
@@ -12249,18 +12299,6 @@  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		}
 	}
 
-	if (enable_pml) {
-		/*
-		 * Conceptually we want to copy the PML address and index from
-		 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
-		 * since we always flush the log on each vmexit, this happens
-		 * to be equivalent to simply resetting the fields in vmcs02.
-		 */
-		ASSERT(vmx->pml_pg);
-		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
-		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
-	}
-
 	if (nested_cpu_has_ept(vmcs12))
 		nested_ept_init_mmu_context(vcpu);
 	else if (nested_cpu_has2(vmcs12,
@@ -12281,7 +12319,7 @@  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
 	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-	vcpu->arch.efer = guest_efer;
+	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
 	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
 	vmx_set_efer(vcpu, vcpu->arch.efer);
 
@@ -12573,6 +12611,8 @@  static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
 
+	prepare_vmcs02_early(vmx, vmcs12);
+
 	if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
 		goto fail;