From patchwork Tue Sep 14 13:07:50 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Nadav Har'El X-Patchwork-Id: 179162 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter1.kernel.org (8.14.4/8.14.3) with ESMTP id o8ED85sf001025 for ; Tue, 14 Sep 2010 13:08:06 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753807Ab0INNHz (ORCPT ); Tue, 14 Sep 2010 09:07:55 -0400 Received: from mailgw12.technion.ac.il ([132.68.225.12]:28971 "EHLO mailgw12.technion.ac.il" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753596Ab0INNHz (ORCPT ); Tue, 14 Sep 2010 09:07:55 -0400 X-IronPort-Anti-Spam-Filtered: true X-IronPort-Anti-Spam-Result: AvsEAI4Qj0yERHMG/2dsb2JhbAChT3HCYwKFPgSCAogl X-IronPort-AV: E=Sophos;i="4.56,364,1280696400"; d="scan'208";a="5037364" Received: from fermat.math.technion.ac.il ([132.68.115.6]) by mailgw12.technion.ac.il with ESMTP; 14 Sep 2010 15:07:51 +0200 Received: from fermat.math.technion.ac.il (localhost [127.0.0.1]) by fermat.math.technion.ac.il (8.12.10/8.12.10) with ESMTP id o8ED7oBk006211; Tue, 14 Sep 2010 15:07:50 +0200 (IST) Received: (from nyh@localhost) by fermat.math.technion.ac.il (8.12.10/8.12.10/Submit) id o8ED7ohf006210; Tue, 14 Sep 2010 15:07:50 +0200 (IST) X-Authentication-Warning: fermat.math.technion.ac.il: nyh set sender to nyh@math.technion.ac.il using -f Date: Tue, 14 Sep 2010 15:07:50 +0200 From: "Nadav Har'El" To: Avi Kivity Cc: kvm@vger.kernel.org Subject: Re: [PATCH 18/24] Exiting from L2 to L1 Message-ID: <20100914130750.GB26346@fermat.math.technion.ac.il> References: <1276431753-nyh@il.ibm.com> <201006131231.o5DCVlKB013102@rice.haifa.ibm.com> <4C161AB8.4060905@redhat.com> Mime-Version: 1.0 Content-Disposition: inline In-Reply-To: <4C161AB8.4060905@redhat.com> User-Agent: Mutt/1.4.2.2i Hebrew-Date: 4 Tishri 5771 Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter1.kernel.org [140.211.167.41]); Tue, 14 Sep 2010 13:08:06 +0000 (UTC) --- .before/arch/x86/kvm/vmx.c 2010-09-14 15:02:37.000000000 +0200 +++ .after/arch/x86/kvm/vmx.c 2010-09-14 15:02:37.000000000 +0200 @@ -4970,9 +4970,13 @@ static void vmx_complete_interrupts(stru int type; bool idtv_info_valid; + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + + if (vmx->nested.nested_mode) + return; + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); /* Handle machine checks before interrupts are enabled */ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) @@ -5961,6 +5965,242 @@ static int nested_vmx_run(struct kvm_vcp return 1; } +/* + * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date + * because L2 may have changed some cr0 bits directly (see CRO_GUEST_HOST_MASK) + * without L0 trapping the change and updating vmcs12. + * This function returns the value we should put in vmcs12.guest_cr0. It's not + * enough to just return the current (vmcs02) GUEST_CR0. This may not be the + * guest cr0 that L1 thought it was giving its L2 guest - it is possible that + * L1 wished to allow its guest to set a cr0 bit directly, but we (L0) asked + * to trap this change and instead set just the read shadow. If this is the + * case, we need to copy these read-shadow bits back to vmcs12.guest_cr0, where + * L1 believes they already are. + */ +static inline unsigned long +vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs_fields *vmcs12) +{ + unsigned long guest_cr0_bits = + vcpu->arch.cr0_guest_owned_bits | vmcs12->cr0_guest_host_mask; + return (vmcs_readl(GUEST_CR0) & guest_cr0_bits) | + (vmcs_readl(CR0_READ_SHADOW) & ~guest_cr0_bits); +} + +static inline unsigned long +vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs_fields *vmcs12) +{ + unsigned long guest_cr4_bits = + vcpu->arch.cr4_guest_owned_bits | vmcs12->cr4_guest_host_mask; + return (vmcs_readl(GUEST_CR4) & guest_cr4_bits) | + (vmcs_readl(CR4_READ_SHADOW) & ~guest_cr4_bits); +} + +/* + * prepare_vmcs12 is called when the nested L2 guest exits and we want to + * prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), and this + * function updates it to reflect the changes to the guest state while L2 was + * running (and perhaps made some exits which were handled directly by L0 + * without going back to L1), and to reflect the exit reason. + * Note that we do not have to copy here all VMCS fields, just those that + * could have changed by the L2 guest or the exit - i.e., the guest-state and + * exit-information fields only. Other fields are modified by L1 with VMWRITE, + * which already writes to vmcs12 directly. + */ +void prepare_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vmcs_fields *vmcs12 = get_vmcs12_fields(vcpu); + + /* update guest state fields: */ + vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); + vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); + + vmcs12->guest_dr7 = vmcs_readl(GUEST_DR7); + vmcs12->guest_rsp = vmcs_readl(GUEST_RSP); + vmcs12->guest_rip = vmcs_readl(GUEST_RIP); + vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + + vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); + vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); + vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); + vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); + vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); + vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); + vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); + vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); + vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); + vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); + vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); + vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); + vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); + vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); + vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); + vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); + vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); + vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); + vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); + vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); + vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); + vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); + vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); + vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); + vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); + vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); + vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); + vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); + vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); + vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); + vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); + vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); + vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + + /* TODO: These cannot have changed unless we have MSR bitmaps and + * the relevant bit asks not to trap the change */ + vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (vmcs_config.vmentry_ctrl & VM_EXIT_SAVE_IA32_PAT) + vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); + vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); + vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + + vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); + vmcs12->guest_interruptibility_info = + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + vmcs12->guest_pending_dbg_exceptions = + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + vmcs12->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER); + + /* update exit information fields: */ + + vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); + vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + if(enable_ept){ + vmcs12->guest_physical_address = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + } + + vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + vmcs12->idt_vectoring_info_field = + vmcs_read32(IDT_VECTORING_INFO_FIELD); + vmcs12->idt_vectoring_error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + + /* clear vm-entry fields which are to be cleared on exit */ + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; +} + +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu, bool is_interrupt) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int efer_offset; + struct vmcs_fields *vmcs01 = vmx->nested.vmcs01_fields; + + if (!vmx->nested.nested_mode) { + printk(KERN_INFO "WARNING: %s called but not in nested mode\n", + __func__); + return 0; + } + + sync_cached_regs_to_vmcs(vcpu); + + prepare_vmcs12(vcpu); + if (is_interrupt) + get_vmcs12_fields(vcpu)->vm_exit_reason = + EXIT_REASON_EXTERNAL_INTERRUPT; + + vmx->nested.current_vmcs12->launched = vmx->launched; + vmx->nested.current_vmcs12->cpu = vcpu->cpu; + + vmx->vmcs = vmx->nested.vmcs01; + vcpu->cpu = vmx->nested.l1_state.cpu; + vmx->launched = vmx->nested.l1_state.launched; + + vmx_vcpu_load(vcpu, get_cpu()); + put_cpu(); + + vcpu->arch.efer = vmx->nested.l1_state.efer; + if ((vcpu->arch.efer & EFER_LMA) && + !(vcpu->arch.efer & EFER_SCE)) + vcpu->arch.efer |= EFER_SCE; + + efer_offset = __find_msr_index(vmx, MSR_EFER); + if (update_transition_efer(vmx, efer_offset)) + wrmsrl(MSR_EFER, vmx->guest_msrs[efer_offset].data); + + /* + * L2 perhaps switched to real mode and set vmx->rmode, but we're back + * in L1 and as it is running VMX, it can't be in real mode. + */ + vmx->rmode.vm86_active = 0; + + /* + * We're running a regular L1 guest again, so we do the regular KVM + * thing: run vmx_set_cr0 with the cr0 bits the guest thinks it has. + * vmx_set_cr0 might use slightly different bits on the new guest_cr0 + * it sets, e.g., add TS when !fpu_active. + * Note that vmx_set_cr0 refers to rmode and efer set above. + */ + vmx_set_cr0(vcpu, guest_readable_cr0(vmcs01)); + /* + * If we did fpu_activate()/fpu_deactive() during l2's run, we need to + * apply the same changes to l1's vmcs. We just set cr0 correctly, but + * now we need to also update cr0_guest_host_mask and exception_bitmap. + */ + vmcs_write32(EXCEPTION_BITMAP, + (vmx->nested.vmcs01_fields->exception_bitmap & + ~(1u<fpu_active ? 0 : (1u<arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + + + vmx_set_cr4(vcpu, vmx->nested.l1_state.cr4); + + if (enable_ept) { + vcpu->arch.cr3 = vmcs01->guest_cr3; + vmcs_write32(GUEST_CR3, vmcs01->guest_cr3); + vmcs_write64(EPT_POINTER, vmcs01->ept_pointer); + vmcs_write64(GUEST_PDPTR0, vmcs01->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs01->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs01->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs01->guest_pdptr3); + } else { + kvm_set_cr3(vcpu, vmx->nested.l1_state.cr3); + } + + kvm_register_write(vcpu, VCPU_REGS_RSP, + vmx->nested.vmcs01_fields->guest_rsp); + kvm_register_write(vcpu, VCPU_REGS_RIP, + vmx->nested.vmcs01_fields->guest_rip); + + vmx->nested.nested_mode = 0; + + kvm_mmu_reset_context(vcpu); + kvm_mmu_load(vcpu); + + if (unlikely(vmx->fail)) { + /* + * When L1 launches L2 and then we (L0) fail to launch L2, + * we nested_vmx_vmexit back to L1, but now should let it know + * that the VMLAUNCH failed - with the same error that we + * got when launching L2. + */ + vmx->fail = 0; + nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + } else + nested_vmx_succeed(vcpu); + + return 0; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios,