@@ -5217,6 +5217,8 @@ static void __vmx_complete_interrupts(st
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
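+ /*
+ * In guest mode, any idt_vectoring_info from an exit while running L2
+ * is passed on to L1 via vmcs12->idt_vectoring_info_field (see
+ * prepare_vmcs12()) instead of being re-injected here.
+ */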
+ if (is_guest_mode(&vmx->vcpu))
+ return;
__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
VM_EXIT_INSTRUCTION_LEN,
IDT_VECTORING_ERROR_CODE);
@@ -6137,6 +6139,252 @@ static int nested_vmx_run(struct kvm_vcp
return 1;
}
+/*
+ * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
+ * because L2 may have changed some cr0 bits directly (see CR0_GUEST_HOST_MASK)
+ * without L0 trapping the change and updating vmcs12.
+ * This function returns the value we should put in vmcs12.guest_cr0. It's not
+ * enough to just return the current (vmcs02) GUEST_CR0. This may not be the
+ * guest cr0 that L1 thought it was giving its L2 guest - it is possible that
+ * L1 wished to allow its guest to set a cr0 bit directly, but we (L0) asked
+ * to trap this change and instead set just the read shadow. If this is the
+ * case, we need to copy these read-shadow bits back to vmcs12.guest_cr0, where
+ * L1 believes they already are.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs_fields *vmcs12)
+{
+ unsigned long guest_cr0_bits =
+ vcpu->arch.cr0_guest_owned_bits | vmcs12->cr0_guest_host_mask;
+ return (vmcs_readl(GUEST_CR0) & guest_cr0_bits) |
+ (vmcs_readl(CR0_READ_SHADOW) & ~guest_cr0_bits);
+}
+
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs_fields *vmcs12)
+{
+ unsigned long guest_cr4_bits =
+ vcpu->arch.cr4_guest_owned_bits | vmcs12->cr4_guest_host_mask;
+ return (vmcs_readl(GUEST_CR4) & guest_cr4_bits) |
+ (vmcs_readl(CR4_READ_SHADOW) & ~guest_cr4_bits);
+}
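+
+/*
+ * A concrete (illustrative) case of the above: when !fpu_active, L0 owns
+ * CR0.TS (TS is not in vcpu->arch.cr0_guest_owned_bits) even if L1 let L2
+ * own it (TS clear in vmcs12->cr0_guest_host_mask). An L2 write to TS then
+ * traps to L0, which updates only vmcs02's CR0_READ_SHADOW, so the TS value
+ * L1 must see comes from the read shadow, not from GUEST_CR0.
+ * vmcs12_guest_cr4() is symmetric.
+ */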
+
+/*
+ * prepare_vmcs12 is called when the nested L2 guest exits and we want to
+ * prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), and this
+ * function updates it to reflect the changes to the guest state while L2 was
+ * running (which may have included exits handled directly by L0 without
+ * going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy all VMCS fields here, just those that
+ * could have been changed by the L2 guest or the exit - i.e., only the
+ * guest-state and exit-information fields. Other fields are modified by
+ * L1 with VMWRITE, which already writes to vmcs12 directly.
+ */
+void prepare_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vmcs_fields *vmcs12 = get_vmcs12_fields(vcpu);
+
+ /* update guest state fields: */
+ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+ vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+ vmcs12->guest_dr7 = vmcs_readl(GUEST_DR7);
+ vmcs12->guest_rsp = vmcs_readl(GUEST_RSP);
+ vmcs12->guest_rip = vmcs_readl(GUEST_RIP);
+ vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+ vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+ vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+ vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+ vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+ vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+ vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+ vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+ vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+ vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+ vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+ vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+ vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+ vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+ vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+ vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+ vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+ vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+ vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+ vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+ vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+ vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+ vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+ vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+ vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+ vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+ vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+ vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+ vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+ vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+ vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+ vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+ vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+ vmcs12->guest_interruptibility_info =
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ vmcs12->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+ vmcs12->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+
+ /*
+ * TODO: These cannot have changed unless we have MSR bitmaps and
+ * the relevant bit asks not to trap the change.
+ */
+ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_SAVE_IA32_PAT)
+ vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+ vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+ vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+ vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+
+ /* update exit information fields: */
+
+ vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+ vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ vmcs12->idt_vectoring_info_field =
+ vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ vmcs12->idt_vectoring_error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+ /* clear vm-entry fields which are to be cleared on exit */
+ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+}
+
+/*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there as
+ * if L2 were its real guest. Must only be called when in L2, i.e., when
+ * is_guest_mode() is true.
+ */
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, bool is_interrupt)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs_fields *vmcs01 = vmx->nested.vmcs01_fields;
+ int cpu;
+ struct saved_vmcs *saved_vmcs02;
+
+ leave_guest_mode(vcpu);
+
+ sync_cached_regs_to_vmcs(vcpu);
+
+ prepare_vmcs12(vcpu);
+
+ if (is_interrupt)
+ get_vmcs12_fields(vcpu)->vm_exit_reason =
+ EXIT_REASON_EXTERNAL_INTERRUPT;
+
+ /*
+ * Switch from L2's VMCS to L1's VMCS. Remember on which CPU the L2
+ * VMCS was last loaded and whether it was launched (we will need both
+ * the next time we run L2), and restore the values they had for L1's
+ * VMCS.
+ */
+ saved_vmcs02 = nested_get_current_vmcs(vmx);
+ if (saved_vmcs02) {
+ saved_vmcs02->cpu = vcpu->cpu;
+ saved_vmcs02->launched = vmx->launched;
+ }
+ vmx->vmcs = vmx->nested.saved_vmcs01.vmcs;
+ vcpu->cpu = vmx->nested.saved_vmcs01.cpu;
+ vmx->launched = vmx->nested.saved_vmcs01.launched;
+
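+ /*
+ * The vcpu_put/vcpu_load pair below makes the just-restored vmcs01
+ * current on this cpu; vmx_vcpu_load() is what actually loads
+ * vmx->vmcs on the cpu we end up on.
+ */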
+ vmx_vcpu_put(vcpu);
+ cpu = get_cpu();
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+ put_cpu();
+
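+ /*
+ * Emulate the "host address-space size" exit control: if L1 set it,
+ * L1 expects to be in 64-bit mode after the exit, so EFER.LMA/LME
+ * must be set; otherwise they must be clear (mirroring the SDM's
+ * VM-exit behavior).
+ */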
+ if (get_vmcs12_fields(vcpu)->vm_exit_controls &
+ VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ /*
+ * L2 may have switched to real mode and set vmx->rmode, but we are back
+ * in L1 now, and since L1 is running VMX, it cannot be in real mode.
+ */
+ vmx->rmode.vm86_active = 0;
+
+ /*
+ * If L1 set the HOST_* fields in the VMCS, when exiting from L2 to L1
+ * we need to return those, not L1's old values. Stashing host_cr0 in
+ * vmcs01's cr0 read shadow lets guest_readable_cr0(vmcs01) below pick
+ * up the cr0 value L1's host expects.
+ */
+ vmcs_writel(GUEST_RIP, get_vmcs12_fields(vcpu)->host_rip);
+ vmcs_writel(GUEST_RSP, get_vmcs12_fields(vcpu)->host_rsp);
+ vmcs01->cr0_read_shadow = get_vmcs12_fields(vcpu)->host_cr0;
+
+ /*
+ * We're running a regular L1 guest again, so we do the regular KVM
+ * thing: run vmx_set_cr0 with the cr0 bits the guest thinks it has.
+ * vmx_set_cr0 might use slightly different bits in the new guest_cr0
+ * it sets, e.g., add TS when !fpu_active.
+ * Note that vmx_set_cr0 relies on the rmode and efer values set above.
+ */
+ vmx_set_cr0(vcpu, guest_readable_cr0(vmcs01));
+ /*
+ * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
+ * to apply the same changes to L1's vmcs. We just set cr0 correctly,
+ * but now we also need to update cr0_guest_host_mask and
+ * exception_bitmap.
+ */
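+ /*
+ * The updates below follow L0's lazy-FPU convention: when !fpu_active,
+ * L0 traps #NM and owns CR0.TS so it can restore the FPU on demand;
+ * when fpu_active, neither the #NM trap nor TS ownership is needed.
+ */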
+ vmcs_write32(EXCEPTION_BITMAP,
+ (vmcs01->exception_bitmap & ~(1u<<NM_VECTOR)) |
+ (vcpu->fpu_active ? 0 : (1u<<NM_VECTOR)));
+ vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+ vmx_set_cr4(vcpu, guest_readable_cr4(vmcs01));
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs01->cr4_guest_host_mask;
+
+ if (enable_ept) {
+ /* L2 ran on shadow page tables over EPT; L1's cr3 (and PDPTRs) can be loaded directly: */
+ set_cr3_and_pdptrs(vcpu, get_vmcs12_fields(vcpu)->host_cr3);
+ } else {
+ /* L2 ran on shadow page tables over shadow page tables; rebuild L1's shadow MMU: */
+ kvm_set_cr3(vcpu, get_vmcs12_fields(vcpu)->host_cr3);
+ kvm_mmu_reset_context(vcpu);
+ kvm_mmu_load(vcpu);
+ }
+ if (enable_vpid) {
+ /*
+ * Trivially support vpid by letting L2s share their parent
+ * L1's vpid. TODO: move to a more elaborate solution, giving
+ * each L2 its own vpid and exposing the vpid feature to L1.
+ */
+ vmx_flush_tlb(vcpu);
+ }
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs01->guest_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs01->guest_rip);
+
+ if (unlikely(vmx->fail)) {
+ /*
+ * When L1 launches L2 and we (L0) then fail to launch L2, we
+ * do a nested_vmx_vmexit back to L1, but must now let L1 know
+ * that the VMLAUNCH failed, with the same error that we got
+ * when launching L2.
+ */
+ vmx->fail = 0;
+ nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+ } else
+ nested_vmx_succeed(vcpu);
+}
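+
+/*
+ * Illustrative sketch, not part of this patch: a typical caller is the L0
+ * exit handler, once it decides that an exit taken while running L2 must
+ * be reflected to L1, e.g.:
+ *
+ *     if (is_guest_mode(vcpu) && exit_belongs_to_l1(vcpu))
+ *             nested_vmx_vmexit(vcpu, false);
+ *
+ * where exit_belongs_to_l1() is a hypothetical predicate. An external
+ * interrupt arriving while in L2 passes is_interrupt == true instead,
+ * which forces vmcs12->vm_exit_reason to EXIT_REASON_EXTERNAL_INTERRUPT.
+ */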
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,