From patchwork Sun Jun 13 12:31:47 2010
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Nadav Har'El
X-Patchwork-Id: 105797
Date: Sun, 13 Jun 2010 15:31:47 +0300
Message-Id: <201006131231.o5DCVlKB013102@rice.haifa.ibm.com>
From: "Nadav Har'El"
To: avi@redhat.com
Cc: kvm@vger.kernel.org
References: <1276431753-nyh@il.ibm.com>
Subject: [PATCH 18/24] Exiting from L2 to L1
X-Mailing-List: kvm@vger.kernel.org

--- .before/arch/x86/kvm/vmx.c	2010-06-13 15:01:30.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2010-06-13 15:01:30.000000000 +0300
@@ -5080,9 +5080,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	u32 exit_intr_info;
 	int type;
 	bool idtv_info_valid;
 
+	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+	if (vmx->nested.nested_mode)
+		return;
+
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -5978,6 +5982,278 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/* prepare_vmcs_12 is called when the nested L2 guest exits and we want to
+ * prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), and this
+ * function updates it to reflect the state of the registers during the exit,
+ * and to reflect some changes that happened while L2 was running (and perhaps
+ * made some exits which were handled directly by L0 without going back to L1).
+ */
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *vmcs12 = get_shadow_vmcs(vcpu);
+
+	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+	vmcs12->tsc_offset = vmcs_read64(TSC_OFFSET);
+	vmcs12->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	vmcs12->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	vmcs12->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	vmcs12->vm_entry_intr_info_field =
+		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	vmcs12->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	vmcs12->vm_entry_instruction_len =
+		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	vmcs12->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	vmcs12->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	vmcs12->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	vmcs12->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+	vmcs12->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
+	/* If any of the CR0_GUEST_HOST_MASK bits are off, the L2 guest may
+	 * have changed some cr0 bits without us ever saving them in the shadow
+	 * vmcs. So we need to save these changes now.
+	 * In the current code, the only GHM (guest/host mask) bit which can
+	 * be off is TS (it will be off when fpu_active and L1 also set it to
+	 * off).
+	 */
+	vmcs12->guest_cr0 = vmcs_readl(GUEST_CR0);
+
+	/* But this may not be the guest_cr0 that the L1 guest hypervisor
+	 * actually thought it was giving its L2 guest. It is possible that
+	 * L1 wished to allow its guest to set a cr0 bit directly, but we (L0)
+	 * captured this attempt and instead set just the read shadow. If this
+	 * is the case, we need to copy these read-shadow bits back to
+	 * guest_cr0, where L1 believes they already are. Note that we must
+	 * read the actual CR0_READ_SHADOW (which is what L0 may have changed),
+	 * not vmcs12->cr0_read_shadow (which L1 defined, and we don't change
+	 * without being told by L1). Currently, the only bit where this can
+	 * happen is TS.
+	 */
+	if (!(vcpu->arch.cr0_guest_owned_bits & X86_CR0_TS)
+	    && !(vmcs12->cr0_guest_host_mask & X86_CR0_TS))
+		vmcs12->guest_cr0 =
+			(vmcs12->guest_cr0 & ~X86_CR0_TS) |
+			(vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS);
+
+	vmcs12->guest_cr4 = vmcs_readl(GUEST_CR4);
+	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	vmcs12->guest_dr7 = vmcs_readl(GUEST_DR7);
+	vmcs12->guest_rsp = vmcs_readl(GUEST_RSP);
+	vmcs12->guest_rip = vmcs_readl(GUEST_RIP);
+	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	vmcs12->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+}
+
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_shadow_vmcs;
+
+	if (enable_vpid && src->virtual_processor_id != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+	vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+	vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+		vmcs_write64(APIC_ACCESS_ADDR,
+			     src->apic_access_addr);
+
+	if (enable_ept) {
+		vmcs_write64(EPT_POINTER, src->ept_pointer);
+		vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+	vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     src->page_fault_error_code_mask);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     src->page_fault_error_code_match);
+	vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+	vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+
+	if (cpu_has_secondary_exec_ctrls())
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     src->secondary_vm_exec_control);
+
+	load_vmcs_common(src);
+
+	load_vmcs_host_state(to_vmx(vcpu)->nested.l1_shadow_vmcs);
+
+	return 0;
+}
+
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int efer_offset;
+
+	if (!vmx->nested.nested_mode) {
+		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+		       __func__);
+		return 0;
+	}
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	prepare_vmcs_12(vcpu);
+	if (is_interrupt)
+		get_shadow_vmcs(vcpu)->vm_exit_reason =
+			EXIT_REASON_EXTERNAL_INTERRUPT;
+
+	vmx->nested.current_l2_page->launched = vmx->launched;
+	vmx->nested.current_l2_page->cpu = vcpu->cpu;
+
+	nested_unmap_current(vcpu);
+
+	vmx->vmcs = vmx->nested.l1_vmcs;
+	vcpu->cpu = vmx->nested.l1_state.cpu;
+	vmx->launched = vmx->nested.l1_state.launched;
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	vcpu->arch.efer = vmx->nested.l1_state.efer;
+	if ((vcpu->arch.efer & EFER_LMA) &&
+	    !(vcpu->arch.efer & EFER_SCE))
+		vcpu->arch.efer |= EFER_SCE;
+
+	efer_offset = __find_msr_index(vmx, MSR_EFER);
+	if (update_transition_efer(vmx, efer_offset))
+		wrmsrl(MSR_EFER, vmx->guest_msrs[efer_offset].data);
+
+	/* We're running a regular L1 guest again, so we do the regular KVM
+	 * thing: run vmx_set_cr0 with the cr0 bits the guest thinks it has
+	 * (this can be figured out by combining its old guest_cr0 and
+	 * cr0_read_shadow, using the cr0_guest_host_mask). vmx_set_cr0 might
+	 * use slightly different bits on the new guest_cr0 it sets, e.g.,
+	 * add TS when !fpu_active.
+	 */
+	vmx_set_cr0(vcpu,
+		(vmx->nested.l1_shadow_vmcs->cr0_guest_host_mask &
+		 vmx->nested.l1_shadow_vmcs->cr0_read_shadow) |
+		(~vmx->nested.l1_shadow_vmcs->cr0_guest_host_mask &
+		 vmx->nested.l1_shadow_vmcs->guest_cr0));
+
+	vmx_set_cr4(vcpu, vmx->nested.l1_state.cr4);
+
+	if (enable_ept) {
+		vcpu->arch.cr3 = vmx->nested.l1_shadow_vmcs->guest_cr3;
+		vmcs_write32(GUEST_CR3, vmx->nested.l1_shadow_vmcs->guest_cr3);
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l1_state.cr3);
+	}
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	switch_back_vmcs(vcpu);
+
+	nested_unmap_current(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l1_shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l1_shadow_vmcs->guest_rip);
+
+	vmx->nested.nested_mode = 0;
+
+	/* If we did fpu_activate()/fpu_deactivate() during L2's run, we need
+	 * to apply the same changes also when running L1. We don't need to
+	 * change cr0 here - we already did this above - just the
+	 * cr0_guest_host_mask and exception bitmap.
+	 */
+	vmcs_write32(EXCEPTION_BITMAP,
+		(vmx->nested.l1_shadow_vmcs->exception_bitmap &
+			~(1u<<NM_VECTOR)) |
+		(vcpu->fpu_active ? 0 : (1u<<NM_VECTOR)));
+
+	vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+	kvm_mmu_reset_context(vcpu);
+	kvm_mmu_load(vcpu);
+
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		set_rflags_to_vmx_fail_valid(vcpu);
+	} else
+		clear_rflags_cf_zf(vcpu);
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
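
A note on the cr0 handling above, for anyone following the bit arithmetic: at
each nesting level, the cr0 value a guest observes is a combination of the
read shadow (for the bits its hypervisor trapped) and the real guest_cr0 (for
the bits the guest owns), selected by the cr0 guest/host mask. Both the TS
copy-back in prepare_vmcs_12() and the vmx_set_cr0() call in
nested_vmx_vmexit() are instances of this. The standalone sketch below only
illustrates that combination; it is not part of the patch, and the helper
name and the sample numbers are made up for the illustration:

#include <stdio.h>

/* Illustrative only: reconstruct the cr0 value a hypervisor believes its
 * guest has. For every bit set in the guest/host mask the hypervisor traps
 * writes, so the guest sees the read shadow; for every bit clear in the
 * mask the guest owns the bit, so the real guest_cr0 value is what counts.
 * This mirrors the expression passed to vmx_set_cr0() above.
 */
static unsigned long guest_visible_cr0(unsigned long guest_cr0,
				       unsigned long read_shadow,
				       unsigned long guest_host_mask)
{
	return (guest_host_mask & read_shadow) |
	       (~guest_host_mask & guest_cr0);
}

int main(void)
{
	/* Hypothetical numbers: the mask traps only TS (X86_CR0_TS, 0x8).
	 * The real guest_cr0 has TS set (e.g. because !fpu_active), but the
	 * read shadow still has TS clear, so the guest keeps seeing TS clear.
	 */
	unsigned long mask = 0x8, shadow = 0x80000031, real_cr0 = 0x80000039;

	printf("guest-visible cr0 = %#lx\n",
	       guest_visible_cr0(real_cr0, shadow, mask));
	return 0;
}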