From patchwork Wed Dec 8 17:07:38 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Nadav Har'El X-Patchwork-Id: 391182 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter1.kernel.org (8.14.4/8.14.3) with ESMTP id oB8H7k7w019950 for ; Wed, 8 Dec 2010 17:07:46 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753993Ab0LHRHn (ORCPT ); Wed, 8 Dec 2010 12:07:43 -0500 Received: from mtagate5.uk.ibm.com ([194.196.100.165]:50313 "EHLO mtagate5.uk.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752460Ab0LHRHm (ORCPT ); Wed, 8 Dec 2010 12:07:42 -0500 Received: from d06nrmr1707.portsmouth.uk.ibm.com (d06nrmr1707.portsmouth.uk.ibm.com [9.149.39.225]) by mtagate5.uk.ibm.com (8.13.1/8.13.1) with ESMTP id oB8H7f8B032379 for ; Wed, 8 Dec 2010 17:07:41 GMT Received: from d06av02.portsmouth.uk.ibm.com (d06av02.portsmouth.uk.ibm.com [9.149.37.228]) by d06nrmr1707.portsmouth.uk.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id oB8H7g2I3576020 for ; Wed, 8 Dec 2010 17:07:42 GMT Received: from d06av02.portsmouth.uk.ibm.com (loopback [127.0.0.1]) by d06av02.portsmouth.uk.ibm.com (8.14.4/8.13.1/NCO v10.0 AVout) with ESMTP id oB8H7evE006391 for ; Wed, 8 Dec 2010 10:07:41 -0700 Received: from rice.haifa.ibm.com (rice.haifa.ibm.com [9.148.8.217]) by d06av02.portsmouth.uk.ibm.com (8.14.4/8.13.1/NCO v10.0 AVin) with ESMTP id oB8H7diV006388 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=NO); Wed, 8 Dec 2010 10:07:40 -0700 Received: from rice.haifa.ibm.com (lnx-nyh.haifa.ibm.com [127.0.0.1]) by rice.haifa.ibm.com (8.14.4/8.14.4) with ESMTP id oB8H7dxm008718; Wed, 8 Dec 2010 19:07:39 +0200 Received: (from nyh@localhost) by rice.haifa.ibm.com (8.14.4/8.14.4/Submit) id oB8H7cV4008716; Wed, 8 Dec 2010 19:07:38 +0200 Date: Wed, 8 Dec 2010 19:07:38 +0200 Message-Id: <201012081707.oB8H7cV4008716@rice.haifa.ibm.com> X-Authentication-Warning: 
rice.haifa.ibm.com: nyh set sender to "Nadav Har'El" using -f Cc: gleb@redhat.com, avi@redhat.com To: kvm@vger.kernel.org From: "Nadav Har'El" References: <1291827596-nyh@il.ibm.com> Subject: [PATCH 15/28] nVMX: Prepare vmcs02 from vmcs01 and vmcs12 Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter1.kernel.org [140.211.167.41]); Wed, 08 Dec 2010 17:07:46 +0000 (UTC) --- .before/arch/x86/kvm/vmx.c 2010-12-08 18:56:50.000000000 +0200 +++ .after/arch/x86/kvm/vmx.c 2010-12-08 18:56:50.000000000 +0200 @@ -805,6 +805,28 @@ static inline bool report_flexpriority(v return flexpriority_enabled; } +static inline bool nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu) +{ + return cpu_has_vmx_tpr_shadow() && + get_vmcs12_fields(vcpu)->cpu_based_vm_exec_control & + CPU_BASED_TPR_SHADOW; +} + +static inline bool nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu) +{ + return cpu_has_secondary_exec_ctrls() && + get_vmcs12_fields(vcpu)->cpu_based_vm_exec_control & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +} + +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu + *vcpu) +{ + return nested_cpu_has_secondary_exec_ctrls(vcpu) && + (get_vmcs12_fields(vcpu)->secondary_vm_exec_control & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -1253,6 +1275,37 @@ static void vmx_load_host_state(struct v preempt_enable(); } +int load_vmcs_host_state(struct vmcs_fields *src) +{ + vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector); + vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector); + vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector); + vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector); + vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector); + vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector); + vmcs_write16(HOST_TR_SELECTOR, 
src->host_tr_selector); + + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) + vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat); + + vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs); + + vmcs_writel(HOST_CR0, src->host_cr0); + vmcs_writel(HOST_CR3, src->host_cr3); + vmcs_writel(HOST_CR4, src->host_cr4); + vmcs_writel(HOST_FS_BASE, src->host_fs_base); + vmcs_writel(HOST_GS_BASE, src->host_gs_base); + vmcs_writel(HOST_TR_BASE, src->host_tr_base); + vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base); + vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base); + vmcs_writel(HOST_RSP, src->host_rsp); + vmcs_writel(HOST_RIP, src->host_rip); + vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp); + vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip); + + return 0; +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -5365,6 +5418,362 @@ static void vmx_set_supported_cpuid(u32 entry->ecx |= bit(X86_FEATURE_VMX); } +/* + * Make a copy of the current VMCS to ordinary memory. This is needed because + * in VMX you cannot read and write to two VMCS at the same time, so when we + * want to do this (in prepare_vmcs02, which needs to read from vmcs01 while + * preparing vmcs02), we need to first save a copy of one VMCS's fields in + * memory, and then use that copy. 
+ */ +void save_vmcs(struct vmcs_fields *dst) +{ + dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); + dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); + dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); + dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); + dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); + dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); + dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); + dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); + dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR); + dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR); + dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR); + dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR); + dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR); + dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR); + dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR); + dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A); + dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B); + if (cpu_has_vmx_msr_bitmap()) + dst->msr_bitmap = vmcs_read64(MSR_BITMAP); + dst->tsc_offset = vmcs_read64(TSC_OFFSET); + dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR); + dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR); + dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER); + dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + if (enable_ept) { + /* shadow page tables on EPT */ + dst->ept_pointer = vmcs_read64(EPT_POINTER); + dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + dst->cpu_based_vm_exec_control = 
vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP); + dst->page_fault_error_code_mask = + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK); + dst->page_fault_error_code_match = + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH); + dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT); + dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS); + dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS); + dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); + dst->vm_entry_exception_error_code = + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); + dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN); + dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD); + dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + if (enable_vpid && dst->secondary_vm_exec_control & + SECONDARY_EXEC_ENABLE_VPID) + dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID); + dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR); + dst->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); + dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); + dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE); + dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); + dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); + dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); + dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); + dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); + dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); + dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); + dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); + dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); + dst->guest_idtr_limit = 
vmcs_read32(GUEST_IDTR_LIMIT); + dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); + dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); + dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); + dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); + dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); + dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); + dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); + dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); + dst->guest_interruptibility_info = + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); + dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); + dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS); + dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK); + dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK); + dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW); + dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW); + dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0); + dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1); + dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2); + dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3); + dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + dst->guest_cr0 = vmcs_readl(GUEST_CR0); + dst->guest_cr3 = vmcs_readl(GUEST_CR3); + dst->guest_cr4 = vmcs_readl(GUEST_CR4); + dst->guest_es_base = vmcs_readl(GUEST_ES_BASE); + dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE); + dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE); + dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE); + dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE); + dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE); + dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); + dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE); + dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); + dst->guest_idtr_base = 
vmcs_readl(GUEST_IDTR_BASE); + dst->guest_dr7 = vmcs_readl(GUEST_DR7); + dst->guest_rsp = vmcs_readl(GUEST_RSP); + dst->guest_rip = vmcs_readl(GUEST_RIP); + dst->guest_rflags = vmcs_readl(GUEST_RFLAGS); + dst->guest_pending_dbg_exceptions = + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); + dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + dst->host_cr0 = vmcs_readl(HOST_CR0); + dst->host_cr3 = vmcs_readl(HOST_CR3); + dst->host_cr4 = vmcs_readl(HOST_CR4); + dst->host_fs_base = vmcs_readl(HOST_FS_BASE); + dst->host_gs_base = vmcs_readl(HOST_GS_BASE); + dst->host_tr_base = vmcs_readl(HOST_TR_BASE); + dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE); + dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE); + dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP); + dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP); + dst->host_rsp = vmcs_readl(HOST_RSP); + dst->host_rip = vmcs_readl(HOST_RIP); + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) + dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT); +} + +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's wishes for its guest (vmcs01), so we can run the L2 guest in a + * way that is appropriate both to L1's requests and to our needs. 
+ */ +int prepare_vmcs02(struct kvm_vcpu *vcpu, + struct vmcs_fields *vmcs12, struct vmcs_fields *vmcs01) +{ + u32 exec_control; + + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); + vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); + vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); + vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); + vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); + vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); + + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + + vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); + vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); + vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); + vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); + vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); + vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); + vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); + vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); + vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); + vmcs_write32(GUEST_GS_AR_BYTES, 
vmcs12->guest_gs_ar_bytes); + vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); + vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); + vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); + vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); + vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); + vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); + vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); + vmcs_writel(GUEST_RSP, vmcs12->guest_rsp); + vmcs_writel(GUEST_RIP, vmcs12->guest_rip); + vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + vmcs_write64(VMCS_LINK_POINTER, vmcs12->vmcs_link_pointer); + vmcs_write64(IO_BITMAP_A, vmcs01->io_bitmap_a); + vmcs_write64(IO_BITMAP_B, vmcs01->io_bitmap_b); + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, vmcs01->msr_bitmap); + + if (vmcs12->vm_entry_msr_load_count > 0 || + vmcs12->vm_exit_msr_load_count > 0 || + vmcs12->vm_exit_msr_store_count > 0) { + printk(KERN_WARNING + "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__); + } + + if (nested_cpu_has_vmx_tpr_shadow(vcpu)) { + struct page *page = + nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); + if (!page) + return 1; + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page)); + 
kvm_release_page_clean(page); + } + + if (nested_vm_need_virtualize_apic_accesses(vcpu)) { + struct page *page = + nested_get_page(vcpu, vmcs12->apic_access_addr); + if (!page) + return 1; + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); + kvm_release_page_clean(page); + } + + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, + (vmcs01->pin_based_vm_exec_control | + vmcs12->pin_based_vm_exec_control)); + + + /* + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. + * If enable_ept, L0 doesn't care about page faults and we should + * set all of these to L1's desires. However, if !enable_ept, L0 does + * care about (at least some) page faults, and because it is not easy + * (if at all possible?) to merge L0 and L1's desires, we simply ask + * to exit on each and every L2 page fault. This is done by setting + * MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. + * + * A problem with this approach (when !enable_ept) is that L1 may be + * injected with more page faults than it asked for. This could have + * caused problems, but in practice existing hypervisors don't care. + * To fix this, we will need to emulate the PFEC checking (on the L1 + * page tables), using walk_addr(), when injecting PFs to L1. + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + enable_ept ? vmcs12->page_fault_error_code_mask : 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + enable_ept ? 
vmcs12->page_fault_error_code_match : 0); + + if (cpu_has_secondary_exec_ctrls()) { + u32 exec_control = vmcs01->secondary_vm_exec_control; + if (nested_cpu_has_secondary_exec_ctrls(vcpu)) { + exec_control |= vmcs12->secondary_vm_exec_control; + if (!vm_need_virtualize_apic_accesses(vcpu->kvm) || + !nested_vm_need_virtualize_apic_accesses(vcpu)) + exec_control &= + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + } + + load_vmcs_host_state(vmcs01); + + if (vm_need_tpr_shadow(vcpu->kvm) && + nested_cpu_has_vmx_tpr_shadow(vcpu)) + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + + exec_control = vmcs01->cpu_based_vm_exec_control; + exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + exec_control &= ~CPU_BASED_TPR_SHADOW; + exec_control |= vmcs12->cpu_based_vm_exec_control; + if (!vm_need_tpr_shadow(vcpu->kvm) || + vmcs12->virtual_apic_page_addr == 0) { + exec_control &= ~CPU_BASED_TPR_SHADOW; +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + } else if (exec_control & CPU_BASED_TPR_SHADOW) { +#ifdef CONFIG_X86_64 + exec_control &= ~CPU_BASED_CR8_STORE_EXITING; + exec_control &= ~CPU_BASED_CR8_LOAD_EXITING; +#endif + } + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + + /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the + * bitwise-or of what L1 wants to trap for L2, and what we want to + * trap. However, vmx_fpu_activate/deactivate may have happened after + * we saved vmcs01, so we shouldn't trust its TS and NM_VECTOR bits + * and need to base them again on fpu_active. Note that CR0.TS also + * needs updating - we do this after this function returns (in + * nested_vmx_run). + */ + vmcs_write32(EXCEPTION_BITMAP, + ((vmcs01->exception_bitmap&~(1u<<NM_VECTOR)) | + (vcpu->fpu_active ? 0 : (1u<<NM_VECTOR)) | + vmcs12->exception_bitmap)); + vmcs_writel(CR0_GUEST_HOST_MASK, vmcs12->cr0_guest_host_mask | + (vcpu->fpu_active ? 
0 : X86_CR0_TS)); + vcpu->arch.cr0_guest_owned_bits = ~(vmcs12->cr0_guest_host_mask | + (vcpu->fpu_active ? 0 : X86_CR0_TS)); + + vmcs_write32(VM_EXIT_CONTROLS, + (vmcs01->vm_exit_controls & + (~(VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT))) + | vmcs12->vm_exit_controls); + + vmcs_write32(VM_ENTRY_CONTROLS, + (vmcs01->vm_entry_controls & + (~(VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE))) + | vmcs12->vm_entry_controls); + + vmcs_writel(CR4_GUEST_HOST_MASK, + (vmcs01->cr4_guest_host_mask | + vmcs12->cr4_guest_host_mask)); + vcpu->arch.cr4_guest_owned_bits = ~(vmcs01->cr4_guest_host_mask | + vmcs12->cr4_guest_host_mask); + + vmcs_write64(TSC_OFFSET, vmcs01->tsc_offset + vmcs12->tsc_offset); + + if (enable_ept) { + /* shadow page tables on EPT */ + vmcs_write64(EPT_POINTER, vmcs01->ept_pointer); + } + return 0; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios,