@@ -5080,9 +5080,13 @@ static void vmx_complete_interrupts(stru
int type;
bool idtv_info_valid;
+ vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
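+ /* In nested mode the nested exit handling code takes care of this
+ * exit (and of reflecting it to L1 when needed), so skip the event
+ * re-queueing below.
+ */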
+ if (vmx->nested.nested_mode)
+ return;
+
exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
/* Handle machine checks before interrupts are enabled */
if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -5978,6 +5982,278 @@ static int nested_vmx_run(struct kvm_vcp
return 1;
}
+/* prepare_vmcs_12 is called when the nested L2 guest exits and we want to
+ * prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), and this
+ * function updates it to reflect the guest state at the time of the exit,
+ * and also changes that happened while L2 was running (and perhaps caused
+ * exits which L0 handled directly, without going back to L1).
+ */
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+ struct shadow_vmcs *vmcs12 = get_shadow_vmcs(vcpu);
+
+ vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+ vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+ vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+ vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+ vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+ vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+ vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+ vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+ vmcs12->tsc_offset = vmcs_read64(TSC_OFFSET);
+ vmcs12->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ vmcs12->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+ vmcs12->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+ vmcs12->vm_entry_intr_info_field =
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ vmcs12->vm_entry_exception_error_code =
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ vmcs12->vm_entry_instruction_len =
+ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+ vmcs12->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+ vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+ vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ vmcs12->idt_vectoring_info_field =
+ vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ vmcs12->idt_vectoring_error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+ vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+ vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+ vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+ vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+ vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+ vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+ vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+ vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+ vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+ vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+ vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+ vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+ vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmcs12->guest_interruptibility_info =
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+ vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+ vmcs12->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+ vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
+ /* If any of the CR0_GUEST_HOST_MASK bits are off, the L2 guest may
+ * have changed some cr0 bits without us ever saving them in the shadow
+ * vmcs. So we need to save these changes now.
+ * In the current code, the only GHM bit which can be off is TS (it
+ * is off when fpu_active is true and L1 also left it off).
+ */
+ vmcs12->guest_cr0 = vmcs_readl(GUEST_CR0);
+
+ /* But this may not be the guest_cr0 that the L1 guest hypervisor
+ * actually thought it was giving its L2 guest. It is possible that
+ * L1 wished to allow its guest to set a cr0 bit directly, but we (L0)
+ * captured this attempt and instead set just the read shadow. If this
+ * is the case, we need to copy these read-shadow bits back to guest_cr0,
+ * where L1 believes they already are. Note that we must read the
+ * actual CR0_READ_SHADOW (which is what L0 may have changed), not
+ * vmcs12->cr0_read_shadow (which L1 defined, and we don't
+ * change without being told by L1). Currently, the only bit where
+ * this can happen is TS.
+ */
+ if (!(vcpu->arch.cr0_guest_owned_bits & X86_CR0_TS)
+ && !(vmcs12->cr0_guest_host_mask & X86_CR0_TS))
+ vmcs12->guest_cr0 =
+ (vmcs12->guest_cr0 & ~X86_CR0_TS) |
+ (vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS);
+
+ vmcs12->guest_cr4 = vmcs_readl(GUEST_CR4);
+ vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+ vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+ vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+ vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+ vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+ vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+ vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+ vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+ vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+ vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+ vmcs12->guest_dr7 = vmcs_readl(GUEST_DR7);
+ vmcs12->guest_rsp = vmcs_readl(GUEST_RSP);
+ vmcs12->guest_rip = vmcs_readl(GUEST_RIP);
+ vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+ vmcs12->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+ vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+ vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+}
+
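+/* switch_back_vmcs restores, into the vmcs now being used for L1, the control
+ * fields and host state that were saved in l1_shadow_vmcs while L2 was set up
+ * and run.
+ */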
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_shadow_vmcs;
+
+ if (enable_vpid && src->virtual_processor_id != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+ vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+ vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+ if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+ vmcs_write64(APIC_ACCESS_ADDR,
+ src->apic_access_addr);
+
+ if (enable_ept) {
+ vmcs_write64(EPT_POINTER, src->ept_pointer);
+ vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+ }
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+ vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+ src->page_fault_error_code_mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+ src->page_fault_error_code_match);
+ vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+ vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ src->secondary_vm_exec_control);
+
+ load_vmcs_common(src);
+
+ load_vmcs_host_state(to_vmx(vcpu)->nested.l1_shadow_vmcs);
+
+ return 0;
+}
+
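+/* nested_vmx_vmexit emulates an exit from L2 to L1: it saves L2's state into
+ * vmcs12 (prepare_vmcs_12), switches the hardware back to L1's vmcs, and
+ * restores L1's state. is_interrupt is true when the exit is caused by an
+ * external interrupt arriving while L2 was running, in which case the exit
+ * reason reported to L1 is set to EXIT_REASON_EXTERNAL_INTERRUPT.
+ */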
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+ bool is_interrupt)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int efer_offset;
+
+ if (!vmx->nested.nested_mode) {
+ printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+ __func__);
+ return 0;
+ }
+
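+ /* Write any cached guest registers back to the current (L2) vmcs, so
+ * that prepare_vmcs_12() below reads up-to-date values.
+ */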
+ sync_cached_regs_to_vmcs(vcpu);
+
+ if (!nested_map_current(vcpu)) {
+ printk(KERN_INFO "Error mapping shadow vmcs\n");
+ set_rflags_to_vmx_fail_valid(vcpu);
+ return 1;
+ }
+
+ prepare_vmcs_12(vcpu);
+ if (is_interrupt)
+ get_shadow_vmcs(vcpu)->vm_exit_reason =
+ EXIT_REASON_EXTERNAL_INTERRUPT;
+
+ vmx->nested.current_l2_page->launched = vmx->launched;
+ vmx->nested.current_l2_page->cpu = vcpu->cpu;
+
+ nested_unmap_current(vcpu);
+
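+ /* Switch the hardware back to L1's vmcs and reload it on this cpu. */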
+ vmx->vmcs = vmx->nested.l1_vmcs;
+ vcpu->cpu = vmx->nested.l1_state.cpu;
+ vmx->launched = vmx->nested.l1_state.launched;
+
+ vmx_vcpu_load(vcpu, get_cpu());
+ put_cpu();
+
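+ /* Restore the EFER value that L1 was running with. */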
+ vcpu->arch.efer = vmx->nested.l1_state.efer;
+ if ((vcpu->arch.efer & EFER_LMA) &&
+ !(vcpu->arch.efer & EFER_SCE))
+ vcpu->arch.efer |= EFER_SCE;
+
+ efer_offset = __find_msr_index(vmx, MSR_EFER);
+ if (update_transition_efer(vmx, efer_offset))
+ wrmsrl(MSR_EFER, vmx->guest_msrs[efer_offset].data);
+
+ /* We're running a regular L1 guest again, so we do the regular KVM
+ * thing: call vmx_set_cr0 with the cr0 bits the guest thinks it has
+ * (figured out by combining its old guest_cr0 and cr0_read_shadow,
+ * using the cr0_guest_host_mask). vmx_set_cr0 might set slightly
+ * different bits in the new guest_cr0, e.g., add TS when !fpu_active.
+ */
+ vmx_set_cr0(vcpu,
+ (vmx->nested.l1_shadow_vmcs->cr0_guest_host_mask &
+ vmx->nested.l1_shadow_vmcs->cr0_read_shadow) |
+ (~vmx->nested.l1_shadow_vmcs->cr0_guest_host_mask &
+ vmx->nested.l1_shadow_vmcs->guest_cr0));
+
+ vmx_set_cr4(vcpu, vmx->nested.l1_state.cr4);
+
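+ /* Restore L1's cr3: with EPT it can go straight into the vmcs;
+ * without EPT, go through kvm_set_cr3() so the shadow page tables
+ * are rebuilt for it.
+ */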
+ if (enable_ept) {
+ vcpu->arch.cr3 = vmx->nested.l1_shadow_vmcs->guest_cr3;
+ vmcs_writel(GUEST_CR3, vmx->nested.l1_shadow_vmcs->guest_cr3);
+ } else {
+ kvm_set_cr3(vcpu, vmx->nested.l1_state.cr3);
+ }
+
+ if (!nested_map_current(vcpu)) {
+ printk(KERN_INFO "Error mapping shadow vmcs\n");
+ set_rflags_to_vmx_fail_valid(vcpu);
+ return 1;
+ }
+
+ switch_back_vmcs(vcpu);
+
+ nested_unmap_current(vcpu);
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP,
+ vmx->nested.l1_shadow_vmcs->guest_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP,
+ vmx->nested.l1_shadow_vmcs->guest_rip);
+
+ vmx->nested.nested_mode = 0;
+
+ /* If we did fpu_activate()/fpu_deactivate() while L2 was running, we
+ * need to apply the same changes when running L1 as well. We don't
+ * need to change cr0 here - we already did that above - just the
+ * cr0_guest_host_mask and exception bitmap.
+ */
+ vmcs_write32(EXCEPTION_BITMAP,
+ (vmx->nested.l1_shadow_vmcs->exception_bitmap &
+ ~(1u<<NM_VECTOR)) |
+ (vcpu->fpu_active ? 0 : (1u<<NM_VECTOR)));
+ vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+ kvm_mmu_reset_context(vcpu);
+ kvm_mmu_load(vcpu);
+
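+ /* Report the result of L1's VMLAUNCH/VMRESUME in L1's rflags,
+ * following the VMX instruction convention: CF and ZF clear on
+ * success, or VMfailValid on failure.
+ */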
+ if (unlikely(vmx->fail)) {
+ vmx->fail = 0;
+ set_rflags_to_vmx_fail_valid(vcpu);
+ } else
+ clear_rflags_cf_zf(vcpu);
+
+ return 0;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,