From patchwork Sun Oct 17 10:13:18 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Nadav Har'El X-Patchwork-Id: 259901 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter1.kernel.org (8.14.4/8.14.3) with ESMTP id o9HADPEu018696 for ; Sun, 17 Oct 2010 10:13:25 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932374Ab0JQKNX (ORCPT ); Sun, 17 Oct 2010 06:13:23 -0400 Received: from mtagate3.de.ibm.com ([195.212.17.163]:34820 "EHLO mtagate3.de.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932234Ab0JQKNW (ORCPT ); Sun, 17 Oct 2010 06:13:22 -0400 Received: from d12nrmr1607.megacenter.de.ibm.com (d12nrmr1607.megacenter.de.ibm.com [9.149.167.49]) by mtagate3.de.ibm.com (8.13.1/8.13.1) with ESMTP id o9HADLST017413 for ; Sun, 17 Oct 2010 10:13:21 GMT Received: from d12av04.megacenter.de.ibm.com (d12av04.megacenter.de.ibm.com [9.149.165.229]) by d12nrmr1607.megacenter.de.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id o9HADLMW4034702 for ; Sun, 17 Oct 2010 12:13:21 +0200 Received: from d12av04.megacenter.de.ibm.com (loopback [127.0.0.1]) by d12av04.megacenter.de.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id o9HADKoS001525 for ; Sun, 17 Oct 2010 12:13:21 +0200 Received: from rice.haifa.ibm.com (rice.haifa.ibm.com [9.148.8.112]) by d12av04.megacenter.de.ibm.com (8.12.11.20060308/8.12.11) with ESMTP id o9HADJPe001306 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=NO); Sun, 17 Oct 2010 12:13:20 +0200 Received: from rice.haifa.ibm.com (lnx-nyh.haifa.ibm.com [127.0.0.1]) by rice.haifa.ibm.com (8.14.4/8.14.4) with ESMTP id o9HADJJ5029528; Sun, 17 Oct 2010 12:13:19 +0200 Received: (from nyh@localhost) by rice.haifa.ibm.com (8.14.4/8.14.4/Submit) id o9HADI1v029526; Sun, 17 Oct 2010 12:13:18 +0200 Date: Sun, 17 Oct 2010 12:13:18 +0200 Message-Id: <201010171013.o9HADI1v029526@rice.haifa.ibm.com> X-Authentication-Warning: rice.haifa.ibm.com: nyh set sender to "Nadav Har'El" using -f Cc: gleb@redhat.com, avi@redhat.com To: kvm@vger.kernel.org From: "Nadav Har'El" References: <1287309814-nyh@il.ibm.com> Subject: [PATCH 19/27] nVMX: Exiting from L2 to L1 Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter1.kernel.org [140.211.167.41]); Sun, 17 Oct 2010 10:13:25 +0000 (UTC) --- .before/arch/x86/kvm/vmx.c 2010-10-17 11:52:02.000000000 +0200 +++ .after/arch/x86/kvm/vmx.c 2010-10-17 11:52:02.000000000 +0200 @@ -5085,6 +5085,8 @@ static void __vmx_complete_interrupts(st static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { + if (vmx->nested.nested_mode) + return; __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); @@ -5981,6 +5983,239 @@ static int nested_vmx_run(struct kvm_vcp return 1; } +/* + * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date + * because L2 may have changed some cr0 bits directly (see CRO_GUEST_HOST_MASK) + * without L0 trapping the change and updating vmcs12. + * This function returns the value we should put in vmcs12.guest_cr0. It's not + * enough to just return the current (vmcs02) GUEST_CR0. This may not be the + * guest cr0 that L1 thought it was giving its L2 guest - it is possible that + * L1 wished to allow its guest to set a cr0 bit directly, but we (L0) asked + * to trap this change and instead set just the read shadow. If this is the + * case, we need to copy these read-shadow bits back to vmcs12.guest_cr0, where + * L1 believes they already are. + */ +static inline unsigned long +vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs_fields *vmcs12) +{ + unsigned long guest_cr0_bits = + vcpu->arch.cr0_guest_owned_bits | vmcs12->cr0_guest_host_mask; + return (vmcs_readl(GUEST_CR0) & guest_cr0_bits) | + (vmcs_readl(CR0_READ_SHADOW) & ~guest_cr0_bits); +} + +static inline unsigned long +vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs_fields *vmcs12) +{ + unsigned long guest_cr4_bits = + vcpu->arch.cr4_guest_owned_bits | vmcs12->cr4_guest_host_mask; + return (vmcs_readl(GUEST_CR4) & guest_cr4_bits) | + (vmcs_readl(CR4_READ_SHADOW) & ~guest_cr4_bits); +} + +/* + * prepare_vmcs12 is called when the nested L2 guest exits and we want to + * prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), and this + * function updates it to reflect the changes to the guest state while L2 was + * running (and perhaps made some exits which were handled directly by L0 + * without going back to L1), and to reflect the exit reason. + * Note that we do not have to copy here all VMCS fields, just those that + * could have changed by the L2 guest or the exit - i.e., the guest-state and + * exit-information fields only. Other fields are modified by L1 with VMWRITE, + * which already writes to vmcs12 directly. + */ +void prepare_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vmcs_fields *vmcs12 = get_vmcs12_fields(vcpu); + + /* update guest state fields: */ + vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); + vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); + + vmcs12->guest_dr7 = vmcs_readl(GUEST_DR7); + vmcs12->guest_rsp = vmcs_readl(GUEST_RSP); + vmcs12->guest_rip = vmcs_readl(GUEST_RIP); + vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + + vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); + vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); + vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); + vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); + vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); + vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); + vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); + vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); + vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); + vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); + vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); + vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); + vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); + vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); + vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); + vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); + vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); + vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); + vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); + vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); + vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); + vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); + vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); + vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); + vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); + vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); + vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); + vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); + vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); + vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); + vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); + vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); + vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + + /* TODO: These cannot have changed unless we have MSR bitmaps and + * the relevant bit asks not to trap the change */ + vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (vmcs_config.vmentry_ctrl & VM_EXIT_SAVE_IA32_PAT) + vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); + vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); + vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + + vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); + vmcs12->guest_interruptibility_info = + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + vmcs12->guest_pending_dbg_exceptions = + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + vmcs12->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER); + + /* update exit information fields: */ + + vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); + vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + if(enable_ept){ + vmcs12->guest_physical_address = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + } + + vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + vmcs12->idt_vectoring_info_field = + vmcs_read32(IDT_VECTORING_INFO_FIELD); + vmcs12->idt_vectoring_error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + + /* clear vm-entry fields which are to be cleared on exit */ + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; +} + +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu, bool is_interrupt) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int efer_offset; + struct vmcs_fields *vmcs01 = vmx->nested.vmcs01_fields; + + if (!vmx->nested.nested_mode) { + printk(KERN_INFO "WARNING: %s called but not in nested mode\n", + __func__); + return 0; + } + + sync_cached_regs_to_vmcs(vcpu); + + prepare_vmcs12(vcpu); + if (is_interrupt) + get_vmcs12_fields(vcpu)->vm_exit_reason = + EXIT_REASON_EXTERNAL_INTERRUPT; + + vmx->nested.current_vmcs12->launched = vmx->launched; + vmx->nested.current_vmcs12->cpu = vcpu->cpu; + + vmx->vmcs = vmx->nested.vmcs01; + vcpu->cpu = vmx->nested.l1_state.cpu; + vmx->launched = vmx->nested.l1_state.launched; + + vmx->nested.nested_mode = false; + + vmx_vcpu_load(vcpu, get_cpu()); + put_cpu(); + + vcpu->arch.efer = vmx->nested.l1_state.efer; + if ((vcpu->arch.efer & EFER_LMA) && + !(vcpu->arch.efer & EFER_SCE)) + vcpu->arch.efer |= EFER_SCE; + + efer_offset = __find_msr_index(vmx, MSR_EFER); + if (update_transition_efer(vmx, efer_offset)) + wrmsrl(MSR_EFER, vmx->guest_msrs[efer_offset].data); + + /* + * L2 perhaps switched to real mode and set vmx->rmode, but we're back + * in L1 and as it is running VMX, it can't be in real mode. + */ + vmx->rmode.vm86_active = 0; + + /* + * We're running a regular L1 guest again, so we do the regular KVM + * thing: run vmx_set_cr0 with the cr0 bits the guest thinks it has. + * vmx_set_cr0 might use slightly different bits on the new guest_cr0 + * it sets, e.g., add TS when !fpu_active. + * Note that vmx_set_cr0 refers to rmode and efer set above. + */ + vmx_set_cr0(vcpu, guest_readable_cr0(vmcs01)); + /* + * If we did fpu_activate()/fpu_deactive() during l2's run, we need to + * apply the same changes to l1's vmcs. We just set cr0 correctly, but + * now we need to also update cr0_guest_host_mask and exception_bitmap. + */ + vmcs_write32(EXCEPTION_BITMAP, + (vmcs01->exception_bitmap & ~(1u<fpu_active ? 0 : (1u<arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + + + vmx_set_cr4(vcpu, vmx->nested.l1_state.cr4); + + if (enable_ept) { + vcpu->arch.cr3 = vmcs01->guest_cr3; + vmcs_write32(GUEST_CR3, vmcs01->guest_cr3); + vmcs_write64(EPT_POINTER, vmcs01->ept_pointer); + vmcs_write64(GUEST_PDPTR0, vmcs01->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs01->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs01->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs01->guest_pdptr3); + } else { + kvm_set_cr3(vcpu, vmx->nested.l1_state.cr3); + } + + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs01->guest_rsp); + kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs01->guest_rip); + + kvm_mmu_reset_context(vcpu); + kvm_mmu_load(vcpu); + + if (unlikely(vmx->fail)) { + /* + * When L1 launches L2 and then we (L0) fail to launch L2, + * we nested_vmx_vmexit back to L1, but now should let it know + * that the VMLAUNCH failed - with the same error that we + * got when launching L2. + */ + vmx->fail = 0; + nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + } else + nested_vmx_succeed(vcpu); + + return 0; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios,