[18/24] Exiting from L2 to L1

Message ID 201006131231.o5DCVlKB013102@rice.haifa.ibm.com
State New, archived

Commit Message

Nadav Har'El June 13, 2010, 12:31 p.m. UTC

Patch

--- .before/arch/x86/kvm/vmx.c	2010-06-13 15:01:30.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2010-06-13 15:01:30.000000000 +0300
@@ -5080,9 +5080,13 @@  static void vmx_complete_interrupts(stru
 	int type;
 	bool idtv_info_valid;
 
+	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+	if (vmx->nested.nested_mode)
+		return;
+
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
-	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -5978,6 +5982,287 @@  static int nested_vmx_run(struct kvm_vcp
 	return 1;
 }
 
+/* prepare_vmcs_12 is called when the nested L2 guest exits and we want to
+ * prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), and this
+ * function updates it to reflect the state of the registers during the exit,
+ * and to reflect some changes that happened while L2 was running (and perhaps
+ * took some exits that were handled directly by L0 without going back to L1).
+ */
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *vmcs12 = get_shadow_vmcs(vcpu);
+
+	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+	vmcs12->tsc_offset = vmcs_read64(TSC_OFFSET);
+	vmcs12->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	vmcs12->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	vmcs12->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	vmcs12->vm_entry_intr_info_field =
+		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	vmcs12->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	vmcs12->vm_entry_instruction_len =
+		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	vmcs12->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	vmcs12->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	vmcs12->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	vmcs12->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+	vmcs12->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
+	/* If any of the CR0_GUEST_HOST_MASK bits are off, the L2 guest may
+	 * have changed some cr0 bits without us ever saving them in the shadow
+	 * vmcs. So we need to save these changes now.
+	 * In the current code, the only GHM bit which can be off is TS (it
+	 * will be off when fpu_active and L1 has also turned it off).
+	 */
+	vmcs12->guest_cr0 = vmcs_readl(GUEST_CR0);
+
+	/* But this may not be the guest_cr0 that the L1 guest hypervisor
+	 * actually thought it was giving its L2 guest. It is possible that
+	 * L1 wished to allow its guest to set a cr0 bit directly, but we (L0)
+	 * captured this attempt and instead set just the read shadow. If this
+	 * is the case, we must copy these read-shadow bits back to guest_cr0,
+	 * where L1 believes they already are. Note that we must read the
+	 * actual CR0_READ_SHADOW (which is what L0 may have changed), not
+	 * vmcs12->cr0_read_shadow (which L1 defined, and we don't
+	 * change without being told by L1). Currently, the only bit where
+	 * this can happen is TS.
+	 */
+	if (!(vcpu->arch.cr0_guest_owned_bits & X86_CR0_TS)
+			&& !(vmcs12->cr0_guest_host_mask & X86_CR0_TS))
+		vmcs12->guest_cr0 =
+			(vmcs12->guest_cr0 & ~X86_CR0_TS) |
+			(vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS);
+
+	vmcs12->guest_cr4 = vmcs_readl(GUEST_CR4);
+	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	vmcs12->guest_dr7 = vmcs_readl(GUEST_DR7);
+	vmcs12->guest_rsp = vmcs_readl(GUEST_RSP);
+	vmcs12->guest_rip = vmcs_readl(GUEST_RIP);
+	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	vmcs12->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+}
+
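+/* switch_back_vmcs loads back into the current (L1) vmcs the control fields
+ * and host state that were saved for L1 in l1_shadow_vmcs.
+ */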
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_shadow_vmcs;
+
+	if (enable_vpid && src->virtual_processor_id != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+	vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+	vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+		vmcs_write64(APIC_ACCESS_ADDR,
+			     src->apic_access_addr);
+
+	if (enable_ept) {
+		vmcs_write64(EPT_POINTER, src->ept_pointer);
+		vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+	vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     src->page_fault_error_code_mask);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     src->page_fault_error_code_match);
+	vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+	vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+
+	if (cpu_has_secondary_exec_ctrls())
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     src->secondary_vm_exec_control);
+
+	load_vmcs_common(src);
+
+	load_vmcs_host_state(to_vmx(vcpu)->nested.l1_shadow_vmcs);
+
+	return 0;
+}
+
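+/* nested_vmx_vmexit emulates an exit from L2 to L1: it saves L2's state into
+ * vmcs12 (prepare_vmcs_12), switches back to L1's vmcs, and restores the
+ * state that L1 had before it entered L2.
+ */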
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int efer_offset;
+
+	if (!vmx->nested.nested_mode) {
+		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+		       __func__);
+		return 0;
+	}
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	prepare_vmcs_12(vcpu);
+	if (is_interrupt)
+		get_shadow_vmcs(vcpu)->vm_exit_reason =
+			EXIT_REASON_EXTERNAL_INTERRUPT;
+
+	vmx->nested.current_l2_page->launched = vmx->launched;
+	vmx->nested.current_l2_page->cpu = vcpu->cpu;
+
+	nested_unmap_current(vcpu);
+
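+	/* Switch the hardware vmcs back to L1's, and reload it on this cpu */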
+	vmx->vmcs = vmx->nested.l1_vmcs;
+	vcpu->cpu = vmx->nested.l1_state.cpu;
+	vmx->launched = vmx->nested.l1_state.launched;
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
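+	/* Restore L1's saved EFER */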
+	vcpu->arch.efer = vmx->nested.l1_state.efer;
+	if ((vcpu->arch.efer & EFER_LMA) &&
+	    !(vcpu->arch.efer & EFER_SCE))
+		vcpu->arch.efer |= EFER_SCE;
+
+	efer_offset = __find_msr_index(vmx, MSR_EFER);
+	if (update_transition_efer(vmx, efer_offset))
+		wrmsrl(MSR_EFER, vmx->guest_msrs[efer_offset].data);
+
+	/* We're running a regular L1 guest again, so we do the regular KVM
+	 * thing: run vmx_set_cr0 with the cr0 bits the guest thinks it has
+	 * (this can be figured out by combining its old guest_cr0 and
+	 * cr0_read_shadow, using the cr0_guest_host_mask). vmx_set_cr0 might
+	 * use slightly different bits on the new guest_cr0 it sets, e.g.,
+	 * add TS when !fpu_active.
+	 */
+	vmx_set_cr0(vcpu,
+		(vmx->nested.l1_shadow_vmcs->cr0_guest_host_mask &
+		vmx->nested.l1_shadow_vmcs->cr0_read_shadow) |
+		(~vmx->nested.l1_shadow_vmcs->cr0_guest_host_mask &
+		vmx->nested.l1_shadow_vmcs->guest_cr0));
+
+	vmx_set_cr4(vcpu, vmx->nested.l1_state.cr4);
+
+	if (enable_ept) {
+		vcpu->arch.cr3 = vmx->nested.l1_shadow_vmcs->guest_cr3;
+		vmcs_writel(GUEST_CR3, vmx->nested.l1_shadow_vmcs->guest_cr3);
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l1_state.cr3);
+	}
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	switch_back_vmcs(vcpu);
+
+	nested_unmap_current(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l1_shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l1_shadow_vmcs->guest_rip);
+
+	vmx->nested.nested_mode = 0;
+
+	/* If we did fpu_activate()/fpu_deactivate() during L2's run, we need
+	 * to apply the same changes also when running L1. We don't need to
+	 * change cr0 here - we already did this above - just the
+	 * cr0_guest_host_mask and the exception bitmap.
+	 */
+	vmcs_write32(EXCEPTION_BITMAP,
+		(vmx->nested.l1_shadow_vmcs->exception_bitmap &
+			~(1u<<NM_VECTOR)) |
+			(vcpu->fpu_active ? 0 : (1u<<NM_VECTOR)));
+	vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+	kvm_mmu_reset_context(vcpu);
+	kvm_mmu_load(vcpu);
+
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		set_rflags_to_vmx_fail_valid(vcpu);
+	} else
+		clear_rflags_cf_zf(vcpu);
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,