@@ -389,7 +389,8 @@ struct kvm_arch{
unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
unsigned int n_alloc_mmu_pages;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+ struct hlist_head _mmu_page_hash[KVM_NUM_MMU_PAGES];
+ struct hlist_head *mmu_page_hash;
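+	/*
+	 * mmu_page_hash points at the hash table currently in use
+	 * (_mmu_page_hash by default); the nested EPT code keeps a
+	 * per-level pointer in struct level_state as well.
+	 */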
/*
* Hash table of struct kvm_mmu_page.
*/
@@ -552,6 +553,8 @@ void kvm_mmu_zap_all(struct kvm *kvm);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+int nested_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa2, gpa_t ept12);
+
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -253,6 +253,7 @@ enum vmcs_field {
#define EXIT_REASON_APIC_ACCESS 44
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_WBINVD 54
/*
@@ -19,6 +19,7 @@
#include "mmu.h"
#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -2042,7 +2043,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
if (tdp_enabled)
direct = 1;
- if (mmu_check_root(vcpu, root_gfn))
+ if (!is_nested_tdp() && mmu_check_root(vcpu, root_gfn))
return 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, direct,
@@ -2137,10 +2138,9 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
error_code & PFERR_WRITE_MASK, gfn);
}
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+static int __tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, pfn_t pfn,
u32 error_code)
{
- pfn_t pfn;
int r;
int level;
gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -2159,11 +2159,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
@@ -2180,6 +2175,30 @@ out_unlock:
return 0;
}
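+/*
+ * Regular TDP fault path: look up the pfn for the faulting guest frame and
+ * hand it to __tdp_page_fault(), which performs the actual mapping.
+ */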
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+ u32 error_code)
+{
+ pfn_t pfn;
+ int r;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+
+ ASSERT(vcpu);
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
+
+ return __tdp_page_fault(vcpu, gpa, pfn, error_code);
+}
+
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
@@ -2418,6 +2437,45 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
}
}
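+/*
+ * Handle an EPT violation taken while running an L2 guest: translate the L2
+ * guest-physical address (gpa2) through the EPT tables provided by L1
+ * (ept12) into an L1 guest-physical address, then map it via the regular
+ * TDP fault path.
+ */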
+int nested_tdp_page_fault(struct kvm_vcpu *vcpu,
+ gpa_t gpa2,
+ gpa_t ept12)
+{
+ gpa_t gpa1;
+ pfn_t pfn;
+ int r;
+ u64 data = 0;
+
+ ASSERT(vcpu);
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ gpa1 = paging64_nested_ept_walk(vcpu, gpa2, ept12);
+
+ if (gpa1 == UNMAPPED_GVA)
+ return 1;
+
+ kvm_read_guest(vcpu->kvm, gpa1, &data, sizeof(data));
+
+ pfn = gfn_to_pfn(vcpu->kvm, gpa1 >> PAGE_SHIFT);
+
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
+
+ r = __tdp_page_fault(vcpu, gpa2 & PAGE_MASK, pfn, 0);
+ if (r)
+ return r;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nested_tdp_page_fault);
+
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
destroy_kvm_mmu(vcpu);
@@ -603,6 +603,62 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return !nr_present;
}
+#if PTTYPE == 64
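+/*
+ * Walk the EPT tables supplied by the L1 guest (rooted at ept12) and
+ * translate an L2 guest-physical address into an L1 guest-physical address.
+ * Returns UNMAPPED_GVA if no translation exists.
+ */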
+static gpa_t paging64_nested_ept_walk(struct kvm_vcpu *vcpu, gpa_t addr,
+ gpa_t ept12)
+{
+ pt_element_t pte;
+ gfn_t table_gfn;
+ unsigned index;
+ gpa_t pte_gpa;
+ gpa_t gpa1 = UNMAPPED_GVA;
+
+ struct guest_walker walk;
+ struct guest_walker *walker = &walk;
+
+	walker->level = vcpu->arch.mmu.shadow_root_level;
+ pte = ept12;
+
+ for (;;) {
+ index = PT_INDEX(addr, walker->level);
+
+ table_gfn = gpte_to_gfn(pte);
+ pte_gpa = gfn_to_gpa(table_gfn);
+ pte_gpa += index * sizeof(pt_element_t);
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ walker->pte_gpa[walker->level - 1] = pte_gpa;
+
+ kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+
+ if (pte == shadow_trap_nonpresent_pte)
+ return UNMAPPED_GVA;
+
+ walker->ptes[walker->level - 1] = pte;
+
+ if (walker->level == PT_PAGE_TABLE_LEVEL) {
+ walker->gfn = gpte_to_gfn(pte);
+ break;
+ }
+
+ if (walker->level == PT_DIRECTORY_LEVEL
+ && (pte & PT_PAGE_SIZE_MASK)
+ && (PTTYPE == 64 || is_pse(vcpu))) {
+ walker->gfn = gpte_to_gfn_lvl(pte, PT_DIRECTORY_LEVEL);
+ walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+ if (PTTYPE == 32 && is_cpuid_PSE36())
+ walker->gfn += pse36_gfn_delta(pte);
+ break;
+ }
+
+ --walker->level;
+ }
+
+ gpa1 = gfn_to_gpa(walker->gfn);
+
+ return gpa1;
+}
+#endif
+
#undef pt_element_t
#undef guest_walker
#undef FNAME
@@ -125,9 +125,6 @@ static int npt = 1;
module_param(npt, int, S_IRUGO);
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -61,6 +61,174 @@ module_param_named(unrestricted_guest,
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
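+/*
+ * Software copy of every VMCS field.  Used to save/restore the VMCS state of
+ * a given level and to emulate vmread/vmwrite for the L1 guest.
+ */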
+struct __attribute__ ((__packed__)) shadow_vmcs {
+ uint16_t virtual_processor_id;
+ uint16_t guest_es_selector;
+ uint16_t guest_cs_selector;
+ uint16_t guest_ss_selector;
+ uint16_t guest_ds_selector;
+ uint16_t guest_fs_selector;
+ uint16_t guest_gs_selector;
+ uint16_t guest_ldtr_selector;
+ uint16_t guest_tr_selector;
+ uint16_t host_es_selector;
+ uint16_t host_cs_selector;
+ uint16_t host_ss_selector;
+ uint16_t host_ds_selector;
+ uint16_t host_fs_selector;
+ uint16_t host_gs_selector;
+ uint16_t host_tr_selector;
+ uint64_t io_bitmap_a;
+ uint64_t io_bitmap_b;
+ uint64_t msr_bitmap;
+ uint64_t vm_exit_msr_store_addr;
+ uint64_t vm_exit_msr_load_addr;
+ uint64_t vm_entry_msr_load_addr;
+ uint64_t tsc_offset;
+ uint64_t virtual_apic_page_addr;
+ uint64_t apic_access_addr;
+ uint64_t ept_pointer;
+ uint64_t guest_physical_address;
+ uint64_t vmcs_link_pointer;
+ uint64_t guest_ia32_debugctl;
+ uint64_t guest_ia32_pat;
+ uint64_t guest_pdptr0;
+ uint64_t guest_pdptr1;
+ uint64_t guest_pdptr2;
+ uint64_t guest_pdptr3;
+ uint64_t host_ia32_pat;
+ uint32_t pin_based_vm_exec_control;
+ uint32_t cpu_based_vm_exec_control;
+ uint32_t exception_bitmap;
+ uint32_t page_fault_error_code_mask;
+ uint32_t page_fault_error_code_match;
+ uint32_t cr3_target_count;
+ uint32_t vm_exit_controls;
+ uint32_t vm_exit_msr_store_count;
+ uint32_t vm_exit_msr_load_count;
+ uint32_t vm_entry_controls;
+ uint32_t vm_entry_msr_load_count;
+ uint32_t vm_entry_intr_info_field;
+ uint32_t vm_entry_exception_error_code;
+ uint32_t vm_entry_instruction_len;
+ uint32_t tpr_threshold;
+ uint32_t secondary_vm_exec_control;
+ uint32_t vm_instruction_error;
+ uint32_t vm_exit_reason;
+ uint32_t vm_exit_intr_info;
+ uint32_t vm_exit_intr_error_code;
+ uint32_t idt_vectoring_info_field;
+ uint32_t idt_vectoring_error_code;
+ uint32_t vm_exit_instruction_len;
+ uint32_t vmx_instruction_info;
+ uint32_t guest_es_limit;
+ uint32_t guest_cs_limit;
+ uint32_t guest_ss_limit;
+ uint32_t guest_ds_limit;
+ uint32_t guest_fs_limit;
+ uint32_t guest_gs_limit;
+ uint32_t guest_ldtr_limit;
+ uint32_t guest_tr_limit;
+ uint32_t guest_gdtr_limit;
+ uint32_t guest_idtr_limit;
+ uint32_t guest_es_ar_bytes;
+ uint32_t guest_cs_ar_bytes;
+ uint32_t guest_ss_ar_bytes;
+ uint32_t guest_ds_ar_bytes;
+ uint32_t guest_fs_ar_bytes;
+ uint32_t guest_gs_ar_bytes;
+ uint32_t guest_ldtr_ar_bytes;
+ uint32_t guest_tr_ar_bytes;
+ uint32_t guest_interruptibility_info;
+ uint32_t guest_activity_state;
+ uint32_t guest_sysenter_cs;
+ uint32_t host_ia32_sysenter_cs;
+ unsigned long cr0_guest_host_mask;
+ unsigned long cr4_guest_host_mask;
+ unsigned long cr0_read_shadow;
+ unsigned long cr4_read_shadow;
+ unsigned long cr3_target_value0;
+ unsigned long cr3_target_value1;
+ unsigned long cr3_target_value2;
+ unsigned long cr3_target_value3;
+ unsigned long exit_qualification;
+ unsigned long guest_linear_address;
+ unsigned long guest_cr0;
+ unsigned long guest_cr3;
+ unsigned long guest_cr4;
+ unsigned long guest_es_base;
+ unsigned long guest_cs_base;
+ unsigned long guest_ss_base;
+ unsigned long guest_ds_base;
+ unsigned long guest_fs_base;
+ unsigned long guest_gs_base;
+ unsigned long guest_ldtr_base;
+ unsigned long guest_tr_base;
+ unsigned long guest_gdtr_base;
+ unsigned long guest_idtr_base;
+ unsigned long guest_dr7;
+ unsigned long guest_rsp;
+ unsigned long guest_rip;
+ unsigned long guest_rflags;
+ unsigned long guest_pending_dbg_exceptions;
+ unsigned long guest_sysenter_esp;
+ unsigned long guest_sysenter_eip;
+ unsigned long host_cr0;
+ unsigned long host_cr3;
+ unsigned long host_cr4;
+ unsigned long host_fs_base;
+ unsigned long host_gs_base;
+ unsigned long host_tr_base;
+ unsigned long host_gdtr_base;
+ unsigned long host_idtr_base;
+ unsigned long host_ia32_sysenter_esp;
+ unsigned long host_ia32_sysenter_eip;
+ unsigned long host_rsp;
+ unsigned long host_rip;
+};
+
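+/*
+ * Per-level (L1 or L2) state that must be preserved across nested
+ * transitions: the shadow VMCS, a few control registers and bitmaps,
+ * and the hardware VMCS used while this level runs.
+ */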
+struct __attribute__ ((__packed__)) level_state {
+ struct shadow_vmcs *shadow_vmcs;
+
+ u16 vpid;
+ u64 shadow_efer;
+ unsigned long cr2;
+ unsigned long cr3;
+ unsigned long cr4;
+ unsigned long cr8;
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+
+ struct vmcs *vmcs;
+ int cpu;
+ int launched;
+
+ u64 ept_pointer;
+ struct hlist_head *mmu_page_hash;
+};
+
+enum vmcs_field_type {
+ VMCS_FIELD_TYPE_U16 = 0,
+ VMCS_FIELD_TYPE_U64 = 1,
+ VMCS_FIELD_TYPE_U32 = 2,
+ VMCS_FIELD_TYPE_ULONG = 3
+};
+
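+/* Bits 14:13 of a VMCS field encoding give the field width. */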
+#define VMCS_FIELD_LENGTH_OFFSET 13
+#define VMCS_FIELD_LENGTH_MASK 0x6000
+
+static inline int vmcs_field_length(unsigned long field)
+{
+	return (VMCS_FIELD_LENGTH_MASK & field) >> VMCS_FIELD_LENGTH_OFFSET;
+}
+
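+/*
+ * Masks used to strip the IA32_PAT and IA32e-mode bits from the exit/entry
+ * controls requested by L1.
+ */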
+#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_PAT))
+#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_IA32E_MODE))
+
struct vmcs {
u32 revision_id;
u32 abort;
@@ -114,6 +282,34 @@ struct vcpu_vmx {
ktime_t entry_time;
s64 vnmi_blocked_time;
u32 exit_reason;
+
+ /* Nested vmx */
+
+	/* Has the L1 guest executed VMXON? */
+	int vmon;
+	/* Has the L1 guest executed VMCLEAR? */
+	int vmclear;
+
+	/* Are we currently running a nested (L2) guest? */
+ int nested_mode;
+
+ /* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
+ int nested_run_pending;
+
+	/* Is valid IDT vectoring info pending after an exit from L2? */
+ int nested_pending_valid_idt;
+
+	/* Location (as an L1 guest-physical address) of the VMCS L1 keeps for L2 */
+ u64 l1_cur_vmcs;
+
+	/*
+	 * Level 2 state: includes the VMCS, registers and
+	 * a copy of vmcs12 for vmread/vmwrite
+	 */
+ struct level_state *l2_state;
+
+ /* Level 1 state for switching to level 2 and back */
+ struct level_state *l1_state;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -121,6 +317,460 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
+static inline bool is_nested(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested_mode;
+}
+
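+/* Map a VMCS field encoding to its offset inside struct shadow_vmcs. */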
+static inline int vmcs_field_to_offset(unsigned long field)
+{
+ switch (field) {
+ case VIRTUAL_PROCESSOR_ID:
+ return offsetof(struct shadow_vmcs, virtual_processor_id);
+ case GUEST_ES_SELECTOR:
+ return offsetof(struct shadow_vmcs, guest_es_selector);
+ case GUEST_CS_SELECTOR:
+ return offsetof(struct shadow_vmcs, guest_cs_selector);
+ case GUEST_SS_SELECTOR:
+ return offsetof(struct shadow_vmcs, guest_ss_selector);
+ case GUEST_DS_SELECTOR:
+ return offsetof(struct shadow_vmcs, guest_ds_selector);
+ case GUEST_FS_SELECTOR:
+ return offsetof(struct shadow_vmcs, guest_fs_selector);
+ case GUEST_GS_SELECTOR:
+ return offsetof(struct shadow_vmcs, guest_gs_selector);
+ case GUEST_LDTR_SELECTOR:
+ return offsetof(struct shadow_vmcs, guest_ldtr_selector);
+ case GUEST_TR_SELECTOR:
+ return offsetof(struct shadow_vmcs, guest_tr_selector);
+ case HOST_ES_SELECTOR:
+ return offsetof(struct shadow_vmcs, host_es_selector);
+ case HOST_CS_SELECTOR:
+ return offsetof(struct shadow_vmcs, host_cs_selector);
+ case HOST_SS_SELECTOR:
+ return offsetof(struct shadow_vmcs, host_ss_selector);
+ case HOST_DS_SELECTOR:
+ return offsetof(struct shadow_vmcs, host_ds_selector);
+ case HOST_FS_SELECTOR:
+ return offsetof(struct shadow_vmcs, host_fs_selector);
+ case HOST_GS_SELECTOR:
+ return offsetof(struct shadow_vmcs, host_gs_selector);
+ case HOST_TR_SELECTOR:
+ return offsetof(struct shadow_vmcs, host_tr_selector);
+ case IO_BITMAP_A:
+ return offsetof(struct shadow_vmcs, io_bitmap_a);
+ case IO_BITMAP_A_HIGH:
+ return offsetof(struct shadow_vmcs, io_bitmap_a)+4;
+ case IO_BITMAP_B:
+ return offsetof(struct shadow_vmcs, io_bitmap_b);
+ case IO_BITMAP_B_HIGH:
+ return offsetof(struct shadow_vmcs, io_bitmap_b)+4;
+ case MSR_BITMAP:
+ return offsetof(struct shadow_vmcs, msr_bitmap);
+ case MSR_BITMAP_HIGH:
+ return offsetof(struct shadow_vmcs, msr_bitmap)+4;
+ case VM_EXIT_MSR_STORE_ADDR:
+ return offsetof(struct shadow_vmcs, vm_exit_msr_store_addr);
+ case VM_EXIT_MSR_STORE_ADDR_HIGH:
+ return offsetof(struct shadow_vmcs, vm_exit_msr_store_addr)+4;
+ case VM_EXIT_MSR_LOAD_ADDR:
+ return offsetof(struct shadow_vmcs, vm_exit_msr_load_addr);
+ case VM_EXIT_MSR_LOAD_ADDR_HIGH:
+ return offsetof(struct shadow_vmcs, vm_exit_msr_load_addr)+4;
+ case VM_ENTRY_MSR_LOAD_ADDR:
+ return offsetof(struct shadow_vmcs, vm_entry_msr_load_addr);
+ case VM_ENTRY_MSR_LOAD_ADDR_HIGH:
+ return offsetof(struct shadow_vmcs, vm_entry_msr_load_addr)+4;
+ case TSC_OFFSET:
+ return offsetof(struct shadow_vmcs, tsc_offset);
+ case TSC_OFFSET_HIGH:
+ return offsetof(struct shadow_vmcs, tsc_offset)+4;
+ case VIRTUAL_APIC_PAGE_ADDR:
+ return offsetof(struct shadow_vmcs, virtual_apic_page_addr);
+ case VIRTUAL_APIC_PAGE_ADDR_HIGH:
+ return offsetof(struct shadow_vmcs, virtual_apic_page_addr)+4;
+ case APIC_ACCESS_ADDR:
+ return offsetof(struct shadow_vmcs, apic_access_addr);
+ case APIC_ACCESS_ADDR_HIGH:
+ return offsetof(struct shadow_vmcs, apic_access_addr)+4;
+ case EPT_POINTER:
+ return offsetof(struct shadow_vmcs, ept_pointer);
+ case EPT_POINTER_HIGH:
+ return offsetof(struct shadow_vmcs, ept_pointer)+4;
+ case GUEST_PHYSICAL_ADDRESS:
+ return offsetof(struct shadow_vmcs, guest_physical_address);
+ case GUEST_PHYSICAL_ADDRESS_HIGH:
+ return offsetof(struct shadow_vmcs, guest_physical_address)+4;
+ case VMCS_LINK_POINTER:
+ return offsetof(struct shadow_vmcs, vmcs_link_pointer);
+ case VMCS_LINK_POINTER_HIGH:
+ return offsetof(struct shadow_vmcs, vmcs_link_pointer)+4;
+ case GUEST_IA32_DEBUGCTL:
+ return offsetof(struct shadow_vmcs, guest_ia32_debugctl);
+ case GUEST_IA32_DEBUGCTL_HIGH:
+ return offsetof(struct shadow_vmcs, guest_ia32_debugctl)+4;
+ case GUEST_IA32_PAT:
+ return offsetof(struct shadow_vmcs, guest_ia32_pat);
+ case GUEST_IA32_PAT_HIGH:
+ return offsetof(struct shadow_vmcs, guest_ia32_pat)+4;
+ case GUEST_PDPTR0:
+ return offsetof(struct shadow_vmcs, guest_pdptr0);
+ case GUEST_PDPTR0_HIGH:
+ return offsetof(struct shadow_vmcs, guest_pdptr0)+4;
+ case GUEST_PDPTR1:
+ return offsetof(struct shadow_vmcs, guest_pdptr1);
+ case GUEST_PDPTR1_HIGH:
+ return offsetof(struct shadow_vmcs, guest_pdptr1)+4;
+ case GUEST_PDPTR2:
+ return offsetof(struct shadow_vmcs, guest_pdptr2);
+ case GUEST_PDPTR2_HIGH:
+ return offsetof(struct shadow_vmcs, guest_pdptr2)+4;
+ case GUEST_PDPTR3:
+ return offsetof(struct shadow_vmcs, guest_pdptr3);
+ case GUEST_PDPTR3_HIGH:
+ return offsetof(struct shadow_vmcs, guest_pdptr3)+4;
+ case HOST_IA32_PAT:
+ return offsetof(struct shadow_vmcs, host_ia32_pat);
+ case HOST_IA32_PAT_HIGH:
+ return offsetof(struct shadow_vmcs, host_ia32_pat)+4;
+ case PIN_BASED_VM_EXEC_CONTROL:
+ return offsetof(struct shadow_vmcs, pin_based_vm_exec_control);
+ case CPU_BASED_VM_EXEC_CONTROL:
+ return offsetof(struct shadow_vmcs, cpu_based_vm_exec_control);
+ case EXCEPTION_BITMAP:
+ return offsetof(struct shadow_vmcs, exception_bitmap);
+ case PAGE_FAULT_ERROR_CODE_MASK:
+ return offsetof(struct shadow_vmcs, page_fault_error_code_mask);
+ case PAGE_FAULT_ERROR_CODE_MATCH:
+ return offsetof(struct shadow_vmcs,
+ page_fault_error_code_match);
+ case CR3_TARGET_COUNT:
+ return offsetof(struct shadow_vmcs, cr3_target_count);
+ case VM_EXIT_CONTROLS:
+ return offsetof(struct shadow_vmcs, vm_exit_controls);
+ case VM_EXIT_MSR_STORE_COUNT:
+ return offsetof(struct shadow_vmcs, vm_exit_msr_store_count);
+ case VM_EXIT_MSR_LOAD_COUNT:
+ return offsetof(struct shadow_vmcs, vm_exit_msr_load_count);
+ case VM_ENTRY_CONTROLS:
+ return offsetof(struct shadow_vmcs, vm_entry_controls);
+ case VM_ENTRY_MSR_LOAD_COUNT:
+ return offsetof(struct shadow_vmcs, vm_entry_msr_load_count);
+ case VM_ENTRY_INTR_INFO_FIELD:
+ return offsetof(struct shadow_vmcs, vm_entry_intr_info_field);
+ case VM_ENTRY_EXCEPTION_ERROR_CODE:
+ return offsetof(struct shadow_vmcs,
+ vm_entry_exception_error_code);
+ case VM_ENTRY_INSTRUCTION_LEN:
+ return offsetof(struct shadow_vmcs, vm_entry_instruction_len);
+ case TPR_THRESHOLD:
+ return offsetof(struct shadow_vmcs, tpr_threshold);
+ case SECONDARY_VM_EXEC_CONTROL:
+ return offsetof(struct shadow_vmcs, secondary_vm_exec_control);
+ case VM_INSTRUCTION_ERROR:
+ return offsetof(struct shadow_vmcs, vm_instruction_error);
+ case VM_EXIT_REASON:
+ return offsetof(struct shadow_vmcs, vm_exit_reason);
+ case VM_EXIT_INTR_INFO:
+ return offsetof(struct shadow_vmcs, vm_exit_intr_info);
+ case VM_EXIT_INTR_ERROR_CODE:
+ return offsetof(struct shadow_vmcs, vm_exit_intr_error_code);
+ case IDT_VECTORING_INFO_FIELD:
+ return offsetof(struct shadow_vmcs, idt_vectoring_info_field);
+ case IDT_VECTORING_ERROR_CODE:
+ return offsetof(struct shadow_vmcs, idt_vectoring_error_code);
+ case VM_EXIT_INSTRUCTION_LEN:
+ return offsetof(struct shadow_vmcs, vm_exit_instruction_len);
+ case VMX_INSTRUCTION_INFO:
+ return offsetof(struct shadow_vmcs, vmx_instruction_info);
+ case GUEST_ES_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_es_limit);
+ case GUEST_CS_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_cs_limit);
+ case GUEST_SS_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_ss_limit);
+ case GUEST_DS_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_ds_limit);
+ case GUEST_FS_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_fs_limit);
+ case GUEST_GS_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_gs_limit);
+ case GUEST_LDTR_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_ldtr_limit);
+ case GUEST_TR_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_tr_limit);
+ case GUEST_GDTR_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_gdtr_limit);
+ case GUEST_IDTR_LIMIT:
+ return offsetof(struct shadow_vmcs, guest_idtr_limit);
+ case GUEST_ES_AR_BYTES:
+ return offsetof(struct shadow_vmcs, guest_es_ar_bytes);
+ case GUEST_CS_AR_BYTES:
+ return offsetof(struct shadow_vmcs, guest_cs_ar_bytes);
+ case GUEST_SS_AR_BYTES:
+ return offsetof(struct shadow_vmcs, guest_ss_ar_bytes);
+ case GUEST_DS_AR_BYTES:
+ return offsetof(struct shadow_vmcs, guest_ds_ar_bytes);
+ case GUEST_FS_AR_BYTES:
+ return offsetof(struct shadow_vmcs, guest_fs_ar_bytes);
+ case GUEST_GS_AR_BYTES:
+ return offsetof(struct shadow_vmcs, guest_gs_ar_bytes);
+ case GUEST_LDTR_AR_BYTES:
+ return offsetof(struct shadow_vmcs, guest_ldtr_ar_bytes);
+ case GUEST_TR_AR_BYTES:
+ return offsetof(struct shadow_vmcs, guest_tr_ar_bytes);
+ case GUEST_INTERRUPTIBILITY_INFO:
+ return offsetof(struct shadow_vmcs,
+ guest_interruptibility_info);
+ case GUEST_ACTIVITY_STATE:
+ return offsetof(struct shadow_vmcs, guest_activity_state);
+ case GUEST_SYSENTER_CS:
+ return offsetof(struct shadow_vmcs, guest_sysenter_cs);
+ case HOST_IA32_SYSENTER_CS:
+ return offsetof(struct shadow_vmcs, host_ia32_sysenter_cs);
+ case CR0_GUEST_HOST_MASK:
+ return offsetof(struct shadow_vmcs, cr0_guest_host_mask);
+ case CR4_GUEST_HOST_MASK:
+ return offsetof(struct shadow_vmcs, cr4_guest_host_mask);
+ case CR0_READ_SHADOW:
+ return offsetof(struct shadow_vmcs, cr0_read_shadow);
+ case CR4_READ_SHADOW:
+ return offsetof(struct shadow_vmcs, cr4_read_shadow);
+ case CR3_TARGET_VALUE0:
+ return offsetof(struct shadow_vmcs, cr3_target_value0);
+ case CR3_TARGET_VALUE1:
+ return offsetof(struct shadow_vmcs, cr3_target_value1);
+ case CR3_TARGET_VALUE2:
+ return offsetof(struct shadow_vmcs, cr3_target_value2);
+ case CR3_TARGET_VALUE3:
+ return offsetof(struct shadow_vmcs, cr3_target_value3);
+ case EXIT_QUALIFICATION:
+ return offsetof(struct shadow_vmcs, exit_qualification);
+ case GUEST_LINEAR_ADDRESS:
+ return offsetof(struct shadow_vmcs, guest_linear_address);
+ case GUEST_CR0:
+ return offsetof(struct shadow_vmcs, guest_cr0);
+ case GUEST_CR3:
+ return offsetof(struct shadow_vmcs, guest_cr3);
+ case GUEST_CR4:
+ return offsetof(struct shadow_vmcs, guest_cr4);
+ case GUEST_ES_BASE:
+ return offsetof(struct shadow_vmcs, guest_es_base);
+ case GUEST_CS_BASE:
+ return offsetof(struct shadow_vmcs, guest_cs_base);
+ case GUEST_SS_BASE:
+ return offsetof(struct shadow_vmcs, guest_ss_base);
+ case GUEST_DS_BASE:
+ return offsetof(struct shadow_vmcs, guest_ds_base);
+ case GUEST_FS_BASE:
+ return offsetof(struct shadow_vmcs, guest_fs_base);
+ case GUEST_GS_BASE:
+ return offsetof(struct shadow_vmcs, guest_gs_base);
+ case GUEST_LDTR_BASE:
+ return offsetof(struct shadow_vmcs, guest_ldtr_base);
+ case GUEST_TR_BASE:
+ return offsetof(struct shadow_vmcs, guest_tr_base);
+ case GUEST_GDTR_BASE:
+ return offsetof(struct shadow_vmcs, guest_gdtr_base);
+ case GUEST_IDTR_BASE:
+ return offsetof(struct shadow_vmcs, guest_idtr_base);
+ case GUEST_DR7:
+ return offsetof(struct shadow_vmcs, guest_dr7);
+ case GUEST_RSP:
+ return offsetof(struct shadow_vmcs, guest_rsp);
+ case GUEST_RIP:
+ return offsetof(struct shadow_vmcs, guest_rip);
+ case GUEST_RFLAGS:
+ return offsetof(struct shadow_vmcs, guest_rflags);
+ case GUEST_PENDING_DBG_EXCEPTIONS:
+ return offsetof(struct shadow_vmcs,
+ guest_pending_dbg_exceptions);
+ case GUEST_SYSENTER_ESP:
+ return offsetof(struct shadow_vmcs, guest_sysenter_esp);
+ case GUEST_SYSENTER_EIP:
+ return offsetof(struct shadow_vmcs, guest_sysenter_eip);
+ case HOST_CR0:
+ return offsetof(struct shadow_vmcs, host_cr0);
+ case HOST_CR3:
+ return offsetof(struct shadow_vmcs, host_cr3);
+ case HOST_CR4:
+ return offsetof(struct shadow_vmcs, host_cr4);
+ case HOST_FS_BASE:
+ return offsetof(struct shadow_vmcs, host_fs_base);
+ case HOST_GS_BASE:
+ return offsetof(struct shadow_vmcs, host_gs_base);
+ case HOST_TR_BASE:
+ return offsetof(struct shadow_vmcs, host_tr_base);
+ case HOST_GDTR_BASE:
+ return offsetof(struct shadow_vmcs, host_gdtr_base);
+ case HOST_IDTR_BASE:
+ return offsetof(struct shadow_vmcs, host_idtr_base);
+ case HOST_IA32_SYSENTER_ESP:
+ return offsetof(struct shadow_vmcs, host_ia32_sysenter_esp);
+ case HOST_IA32_SYSENTER_EIP:
+ return offsetof(struct shadow_vmcs, host_ia32_sysenter_eip);
+ case HOST_RSP:
+ return offsetof(struct shadow_vmcs, host_rsp);
+ case HOST_RIP:
+ return offsetof(struct shadow_vmcs, host_rip);
+ default:
+ printk(KERN_ERR "invalid vmcs encoding 0x%lx\n", field);
+ return -1;
+ }
+}
+
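+/*
+ * Accessors for the software shadow VMCS of the L2 guest, mirroring the
+ * vmcs_readl/vmcs_writel helpers that operate on the hardware VMCS.
+ */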
+static inline unsigned long nested_vmcs_readl(struct kvm_vcpu *vcpu,
+ unsigned long field)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long entry = (unsigned long)(vmx->l2_state->shadow_vmcs);
+ if (!vmx->l2_state->shadow_vmcs) {
+ printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+ return -1;
+ }
+
+ entry += vmcs_field_to_offset(field);
+ return *(unsigned long *)(entry);
+}
+
+static inline u16 nested_vmcs_read16(struct kvm_vcpu *vcpu,
+ unsigned long field)
+{
+ return nested_vmcs_readl(vcpu, field);
+}
+
+static inline u32 nested_vmcs_read32(struct kvm_vcpu *vcpu, unsigned long field)
+{
+ return nested_vmcs_readl(vcpu, field);
+}
+
+static inline u64 nested_vmcs_read64(struct kvm_vcpu *vcpu, unsigned long field)
+{
+#ifdef CONFIG_X86_64
+ return nested_vmcs_readl(vcpu, field);
+#else /* nested: 32 bit not actually tested */
+ return nested_vmcs_readl(vcpu, field) |
+ ((u64)nested_vmcs_readl(vcpu, field+1) << 32);
+#endif
+}
+
+static inline void nested_vmcs_writel(struct kvm_vcpu *vcpu,
+ unsigned long field, unsigned long value)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long entry = (unsigned long)(vmx->l2_state->shadow_vmcs);
+
+ if (!vmx->l2_state->shadow_vmcs) {
+ printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+ return;
+ }
+ entry += vmcs_field_to_offset(field);
+ *(unsigned long *)entry = value;
+}
+
+static inline void nested_vmcs_write16(struct kvm_vcpu *vcpu,
+ unsigned long field, u16 value)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long entry = (unsigned long)(vmx->l2_state->shadow_vmcs);
+
+ if (!vmx->l2_state->shadow_vmcs) {
+ printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+ return;
+ }
+ entry += vmcs_field_to_offset(field);
+ *(u16 *)entry = value;
+}
+
+static inline void nested_vmcs_write32(struct kvm_vcpu *vcpu,
+ unsigned long field, u32 value)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long entry = (unsigned long)(vmx->l2_state->shadow_vmcs);
+
+ if (!vmx->l2_state->shadow_vmcs) {
+ printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+ return;
+ }
+ entry += vmcs_field_to_offset(field);
+ *(u32 *)entry = value;
+}
+
+static inline void nested_vmcs_write64(struct kvm_vcpu *vcpu,
+ unsigned long field, u64 value)
+{
+#ifdef CONFIG_X86_64
+ nested_vmcs_writel(vcpu, field, value);
+#else /* nested: 32 bit not actually tested */
+ nested_vmcs_writel(vcpu, field, value);
+ nested_vmcs_writel(vcpu, field+1, value >> 32);
+#endif
+}
+static inline int nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->l2_state->shadow_vmcs->
+ cpu_based_vm_exec_control & CPU_BASED_TPR_SHADOW;
+}
+
+static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->l2_state->shadow_vmcs->cpu_based_vm_exec_control &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
+ *vcpu)
+{
+ struct shadow_vmcs *shadow = to_vmx(vcpu)->l2_state->shadow_vmcs;
+
+ return (shadow->secondary_vm_exec_control &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ to_vmx(vcpu)->l2_state->shadow_vmcs->apic_access_addr != 0;
+}
+
+static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->l2_state->shadow_vmcs->
+ secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->l2_state->shadow_vmcs->secondary_vm_exec_control &
+ SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->l2_state->shadow_vmcs->vm_entry_controls &
+ VM_ENTRY_LOAD_IA32_PAT;
+}
+
+static inline int nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->l2_state->shadow_vmcs->cpu_based_vm_exec_control &
+ CPU_BASED_USE_MSR_BITMAPS;
+}
+
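+/* Map a guest-physical address (e.g. the VMCS pointer given by L1) to a page. */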
+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+ u64 vmcs_addr)
+{
+ struct page *vmcs_page = NULL;
+
+	down_read(&current->mm->mmap_sem);
+	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+ if (is_error_page(vmcs_page)) {
+		printk(KERN_ERR "%s error getting guest page\n", __func__);
+ kvm_release_page_clean(vmcs_page);
+ return NULL;
+ }
+
+	return vmcs_page;
+}
+
static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
@@ -177,6 +827,20 @@ static struct kvm_vmx_segment_field {
};
static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+ bool has_error_code, u32 error_code);
+static int nested_vmx_intr(struct kvm_vcpu *vcpu);
+static int create_l1_state(struct kvm_vcpu *vcpu);
+static int create_l2_state(struct kvm_vcpu *vcpu);
+static int launch_guest(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override);
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+ bool is_interrupt);
+static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+static int shadow_vmcs_load(struct kvm_vcpu *vcpu);
/*
* Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
@@ -293,6 +957,18 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
}
+static inline int is_exception(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_nmi(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
static inline int cpu_has_vmx_invept_individual_addr(void)
{
return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -836,6 +1512,50 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
}
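+/* Load the VMCS at host-physical address phys_addr on the current cpu. */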
+static int vmptrld(struct kvm_vcpu *vcpu,
+ u64 phys_addr)
+{
+ u8 error;
+
+ asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+ : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "cc");
+ if (error) {
+ printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
+ __func__, phys_addr);
+ return 1;
+ }
+
+ return 0;
+}
+
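+/*
+ * VMX instructions report status through RFLAGS: VMsucceed clears CF and ZF,
+ * VMfailInvalid sets CF and VMfailValid sets ZF (with the error code in
+ * VM_INSTRUCTION_ERROR).
+ */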
+static void clear_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+ rflags = vmx_get_rflags(vcpu);
+ rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
+ vmx_set_rflags(vcpu, rflags);
+}
+
+static void vmfailInvalid_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+ rflags = vmx_get_rflags(vcpu);
+ rflags |= X86_EFLAGS_CF;
+	rflags &= ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+		    X86_EFLAGS_SF | X86_EFLAGS_OF);
+ vmx_set_rflags(vcpu, rflags);
+}
+
+static void vmfailValid_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+ rflags = vmx_get_rflags(vcpu);
+ rflags |= X86_EFLAGS_ZF;
+	rflags &= ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_CF |
+		    X86_EFLAGS_SF | X86_EFLAGS_OF);
+ vmx_set_rflags(vcpu, rflags);
+}
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code)
@@ -843,6 +1563,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info = nr | INTR_INFO_VALID_MASK;
+ if (nested_vmx_check_exception(vmx, nr, has_error_code, error_code))
+ return;
+
if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -967,6 +1690,73 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
}
/*
+ * Handles MSR reads for nested virtualization.
+ */
+static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
+ u64 *pdata)
+{
+ u32 vmx_msr_low = 0, vmx_msr_high = 0;
+
+ switch (msr_index) {
+ case MSR_IA32_FEATURE_CONTROL:
+ *pdata = 0;
+ break;
+ case MSR_IA32_VMX_BASIC:
+ rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+ *pdata = vmx_msr_low | ((u64)vmx_msr_high << 32);
+ break;
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ *pdata = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ *pdata = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+ CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING |
+#endif
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_USE_IO_BITMAPS |
+ CPU_BASED_MOV_DR_EXITING |
+ CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_INVLPG_EXITING;
+
+ if (cpu_has_secondary_exec_ctrls())
+ *pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+ if (vm_need_tpr_shadow(vcpu->kvm))
+ *pdata |= CPU_BASED_TPR_SHADOW;
+ break;
+ case MSR_IA32_VMX_EXIT_CTLS:
+ *pdata = 0;
+#ifdef CONFIG_X86_64
+ *pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+ break;
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ *pdata = 0;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ *pdata = 0;
+ if (enable_ept)
+ *pdata |= SECONDARY_EXEC_ENABLE_EPT;
+ if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+ *pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ break;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ *pdata = 0;
+ if (enable_ept)
+ *pdata |= vmx_capability.ept;
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
@@ -1005,6 +1795,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
default:
+ if (nested &&
+ !nested_vmx_get_msr(vcpu, msr_index, &data))
+ break;
vmx_load_host_state(to_vmx(vcpu));
msr = find_msr_entry(to_vmx(vcpu), msr_index);
if (msr) {
@@ -1018,6 +1811,23 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
return 0;
}
+
+/*
+ * Writes msr value for nested virtualization
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+ switch (msr_index) {
+ case MSR_IA32_FEATURE_CONTROL:
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -1064,6 +1874,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
}
/* Otherwise falls through to kvm_set_msr_common */
default:
+ if (nested &&
+ !nested_vmx_set_msr(vcpu, msr_index, data))
+ break;
vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -1605,6 +2418,9 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_dirty))
return;
+ if (is_nested(vcpu) && nested_cpu_has_vmx_ept(vcpu))
+ return;
+
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
@@ -2534,6 +3350,14 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
+ if (is_nested(vcpu)) {
+ if (kvm_cpu_has_interrupt(vcpu)) {
+ if (nested_vmx_intr(vcpu))
+ return;
+ }
+ return;
+ }
+
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2543,6 +3367,9 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
+ if (is_nested(vcpu))
+ return;
+
if (!cpu_has_virtual_nmis()) {
enable_irq_window(vcpu);
return;
@@ -2585,10 +3412,25 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
}
+static void vmx_set_irq(struct kvm_vcpu *vcpu)
+{
+ if (is_nested(vcpu))
+ return;
+
+ if (nested_vmx_intr(vcpu))
+ return;
+
+ vmx_inject_irq(vcpu);
+}
+
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+	if (is_nested(vcpu))
+		return;
+
if (!cpu_has_virtual_nmis()) {
/*
* Tracking the NMI-blocked state in software is built upon
@@ -2630,6 +3472,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
+ if (is_nested(vcpu)) {
+ if (kvm_cpu_has_interrupt(vcpu)) {
+ if (!nested_vmx_intr(vcpu))
+ return 0;
+ }
+ }
+
return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3101,9 +3950,331 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
-static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+/*
+ * Check whether the vcpu is allowed to execute a VMX instruction.
+ * If not, inject the corresponding exception.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception(vcpu, UD_VECTOR);
+ struct kvm_segment cs;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_msr_entry *msr;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+ if (!vmx->vmon) {
+ printk(KERN_DEBUG "%s: vmx not on\n", __func__);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ msr = find_msr_entry(vmx, MSR_EFER);
+
+ if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+ ((msr->data & EFER_LMA) && !cs.l)) {
+ printk(KERN_DEBUG "%s: invalid mode cs.l %d lma %llu\n",
+ __func__, cs.l, msr->data & EFER_LMA);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 0;
+ }
+
+ return 1;
+}
+
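+/*
+ * Read the 64-bit VMCS pointer operand from guest memory; RAX holds its
+ * guest-virtual address.
+ */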
+int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
+{
+ gpa_t gpa;
+ int r = 0;
+
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+
+ r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
+ if (r) {
+ printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
+ __func__, gpa, r);
+ goto out;
+ }
+
+ if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+ printk(KERN_DEBUG "%s addr %llx not aligned\n",
+ __func__, *gentry);
+ return 1;
+ }
+
+out:
+ return r;
+}
+
+static int handle_vmclear(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ to_vmx(vcpu)->vmclear = 1;
+
+ skip_emulated_instruction(vcpu);
+ clear_rflags(vcpu);
+
+ return 1;
+}
+
+static int handle_vmptrld(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *vmcs_page;
+ u64 guest_vmcs_addr;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
+ return 1;
+
+ if (create_l1_state(vcpu)) {
+ printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+ return 1;
+ }
+
+ if (create_l2_state(vcpu)) {
+ printk(KERN_ERR "%s create_l2_state failed\n", __func__);
+ return 1;
+ }
+
+ vmx->l2_state->vmcs = alloc_vmcs();
+ if (!vmx->l2_state->vmcs) {
+ printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
+ return 1;
+ }
+
+ if (vmx->l1_cur_vmcs != guest_vmcs_addr) {
+ vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
+ if (vmcs_page == NULL)
+ return 1;
+
+		/* load the guest (L2) VMCS into the processor */
+		if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
+			printk(KERN_INFO "%s error in vmptrld\n",
+			       __func__);
+ kvm_release_page_clean(vmcs_page);
+ return 1;
+ }
+
+ /* save nested vmcs in the shadow vmcs */
+ if (shadow_vmcs_load(vcpu)) {
+ kvm_release_page_clean(vmcs_page);
+ return 1;
+ }
+
+ vmx->l1_cur_vmcs = guest_vmcs_addr;
+
+		/* switch back to the VMCS used to run this vcpu */
+ if (vmptrld(vcpu, __pa(to_vmx(vcpu)->vmcs))) {
+ kvm_release_page_clean(vmcs_page);
+ return 1;
+ }
+
+ kvm_release_page_clean(vmcs_page);
+ }
+ clear_rflags(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+
+static int handle_vmptrst(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->l1_cur_vmcs;
+
+ clear_rflags(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_vmlaunch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!to_vmx(vcpu)->vmclear)
+ return 1;
+
+ return launch_guest(vcpu);
+}
+
+static int handle_vmresume(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (to_vmx(vcpu)->vmclear)
+ return 1;
+
+ return launch_guest(vcpu);
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+#ifndef CONFIG_X86_64
+ u64 value;
+#endif
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!vmx->l2_state->shadow_vmcs) {
+ printk(KERN_INFO "%s no shadow vmcs\n", __func__);
+ vmfailInvalid_rflags(vcpu);
+ return 1;
+ }
+
+ switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
+ case VMCS_FIELD_TYPE_U16:
+ vcpu->arch.regs[VCPU_REGS_RAX] =
+ nested_vmcs_read16(vcpu,
+ vcpu->arch.regs[VCPU_REGS_RDX]);
+ break;
+ case VMCS_FIELD_TYPE_U32:
+ vcpu->arch.regs[VCPU_REGS_RAX] =
+ nested_vmcs_read32(vcpu,
+ vcpu->arch.regs[VCPU_REGS_RDX]);
+ break;
+ case VMCS_FIELD_TYPE_U64:
+#ifdef CONFIG_X86_64
+ vcpu->arch.regs[VCPU_REGS_RAX] =
+ nested_vmcs_read64(vcpu,
+ vcpu->arch.regs[VCPU_REGS_RDX]);
+#else /* nested: 32 bit not actually tested */
+ value = nested_vmcs_read64(vcpu,
+ vcpu->arch.regs[VCPU_REGS_RDX]);
+ vcpu->arch.regs[VCPU_REGS_RAX] = value;
+ vcpu->arch.regs[VCPU_REGS_RBX] = value >> 32;
+#endif
+ break;
+ case VMCS_FIELD_TYPE_ULONG:
+ vcpu->arch.regs[VCPU_REGS_RAX] =
+ nested_vmcs_readl(vcpu,
+ vcpu->arch.regs[VCPU_REGS_RDX]);
+ break;
+ default:
+ printk(KERN_INFO "%s invalid field\n", __func__);
+ vmfailValid_rflags(vcpu);
+ vmcs_write32(VM_INSTRUCTION_ERROR, 12);
+ return 1;
+ }
+
+ clear_rflags(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+#ifndef CONFIG_X86_64
+	u64 value;
+#endif
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!vmx->l2_state->shadow_vmcs) {
+ printk(KERN_INFO "%s no shadow vmcs\n", __func__);
+ vmfailInvalid_rflags(vcpu);
+ return 1;
+ }
+
+ switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
+ case VMCS_FIELD_TYPE_U16:
+ nested_vmcs_write16(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+ vcpu->arch.regs[VCPU_REGS_RAX]);
+ break;
+ case VMCS_FIELD_TYPE_U32:
+ nested_vmcs_write32(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+ vcpu->arch.regs[VCPU_REGS_RAX]);
+ break;
+ case VMCS_FIELD_TYPE_U64:
+#ifdef CONFIG_X86_64
+ nested_vmcs_write64(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+ vcpu->arch.regs[VCPU_REGS_RAX]);
+#else /* nested: 32 bit not actually tested */
+		value = vcpu->arch.regs[VCPU_REGS_RAX] |
+			((u64)vcpu->arch.regs[VCPU_REGS_RBX] << 32);
+ nested_vmcs_write64(vcpu,
+ vcpu->arch.regs[VCPU_REGS_RDX], value);
+#endif
+ break;
+ case VMCS_FIELD_TYPE_ULONG:
+ nested_vmcs_writel(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+ vcpu->arch.regs[VCPU_REGS_RAX]);
+ break;
+ default:
+ printk(KERN_INFO "%s invalid field\n", __func__);
+ vmfailValid_rflags(vcpu);
+ vmcs_write32(VM_INSTRUCTION_ERROR, 12);
+ return 1;
+ }
+
+ clear_rflags(vcpu);
+
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_vmoff(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx->vmon = 0;
+
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_vmon(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ struct kvm_segment cs;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!nested) {
+ printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+ if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
+ !(vcpu->arch.cr0 & X86_CR0_PE) ||
+ (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+ ((find_msr_entry(to_vmx(vcpu),
+ MSR_EFER)->data & EFER_LMA) && !cs.l)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ printk(KERN_INFO "%s invalid register state\n", __func__);
+ return 1;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ printk(KERN_INFO "%s no permission\n", __func__);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmx->vmon = 1;
+
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -3202,6 +4373,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
unsigned long exit_qualification;
gpa_t gpa;
int gla_validity;
+ int r;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3218,14 +4390,89 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vmcs_readl(GUEST_LINEAR_ADDRESS));
printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
(long unsigned int)exit_qualification);
+ if (is_nested(vcpu) && nested_cpu_has_vmx_ept(vcpu)) {
+ nested_vmx_vmexit(vcpu, false);
+ return 1;
+ }
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
return 0;
}
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+
+ if (is_nested(vcpu) && nested_cpu_has_vmx_ept(vcpu)) {
+		/* translate the L2 guest-physical address through L1's EPT (EPT12) */
+ r = nested_tdp_page_fault(vcpu, gpa,
+ to_vmx(vcpu)->
+ l2_state->shadow_vmcs->ept_pointer);
+
+ if (r < 0) {
+ printk(KERN_ERR "EPT: Not enough memory!\n");
+ return -ENOMEM;
+ } else if (r) {
+ nested_vmx_vmexit(vcpu, false);
+ return 1;
+ }
+ return 1;
+ } else {
+ trace_kvm_page_fault(gpa, exit_qualification);
+ return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+ }
+}
+
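+/* Emulate the INVEPT instruction executed by L1. */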
+static int handle_invept(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ unsigned long type = kvm_register_read(vcpu, VCPU_REGS_RCX);
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ switch (type) {
+ case VMX_EPT_EXTENT_INDIVIDUAL_ADDR:
+ if (!cpu_has_vmx_invept_individual_addr())
+ vmfailValid_rflags(vcpu);
+ else {
+ struct {
+ u64 eptp, gpa;
+ } operand;
+ kvm_read_guest(vcpu->kvm,
+ kvm_register_read(vcpu, VCPU_REGS_RAX),
+ &operand, sizeof(operand));
+ if (vmx->l2_state->ept_pointer)
+ ept_sync_individual_addr(vmx->l2_state->
+ ept_pointer,
+ operand.gpa);
+ else
+ ept_sync_global();
+ }
+ break;
+ case VMX_EPT_EXTENT_CONTEXT:
+ if (!cpu_has_vmx_invept_context())
+ vmfailValid_rflags(vcpu);
+ else {
+ if (vmx->l2_state->ept_pointer)
+ ept_sync_context(vmx->l2_state->ept_pointer);
+ else
+ ept_sync_global();
+ }
+ break;
+ case VMX_EPT_EXTENT_GLOBAL:
+ if (!cpu_has_vmx_invept_global())
+ vmfailValid_rflags(vcpu);
+ else
+ ept_sync_global();
+ break;
+ default:
+ vmfailValid_rflags(vcpu);
+ break;
+ }
+
+ skip_emulated_instruction(vcpu);
+
+ return 1;
}
static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3377,15 +4624,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
- [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
- [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
- [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
- [EXIT_REASON_VMPTRST] = handle_vmx_insn,
- [EXIT_REASON_VMREAD] = handle_vmx_insn,
- [EXIT_REASON_VMRESUME] = handle_vmx_insn,
- [EXIT_REASON_VMWRITE] = handle_vmx_insn,
- [EXIT_REASON_VMOFF] = handle_vmx_insn,
- [EXIT_REASON_VMON] = handle_vmx_insn,
+ [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
+ [EXIT_REASON_VMRESUME] = handle_vmresume,
+ [EXIT_REASON_VMCLEAR] = handle_vmclear,
+ [EXIT_REASON_VMPTRLD] = handle_vmptrld,
+ [EXIT_REASON_VMPTRST] = handle_vmptrst,
+ [EXIT_REASON_VMREAD] = handle_vmread,
+ [EXIT_REASON_VMWRITE] = handle_vmwrite,
+ [EXIT_REASON_VMOFF] = handle_vmoff,
+ [EXIT_REASON_VMON] = handle_vmon,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
@@ -3393,6 +4640,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
+ [EXIT_REASON_INVEPT] = handle_invept,
};
static const int kvm_vmx_max_exit_handlers =
@@ -3410,6 +4658,17 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
+ if (exit_reason == EXIT_REASON_VMLAUNCH ||
+ exit_reason == EXIT_REASON_VMRESUME)
+ vmx->nested_run_pending = 1;
+ else
+ vmx->nested_run_pending = 0;
+
+ if (is_nested(vcpu) && nested_vmx_exit_handled(vcpu, true)) {
+ nested_vmx_vmexit(vcpu, false);
+ return 1;
+ }
+
/* If we need to emulate an MMIO from handle_invalid_guest_state
* we just return 0 */
if (vmx->emulation_required && emulate_invalid_guest_state) {
@@ -3438,7 +4697,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
"(0x%x) and exit reason is 0x%x\n",
__func__, vectoring_info, exit_reason);
- if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+ if (!is_nested(vcpu) && unlikely(!cpu_has_virtual_nmis() &&
+ vmx->soft_vnmi_blocked)) {
if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -3485,10 +4745,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
int type;
bool idtv_info_valid;
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+ if (is_nested(&vmx->vcpu))
+ return;
+
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
/* Handle machine checks before interrupts are enabled */
if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
|| (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
@@ -3549,11 +4812,14 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
/* fall through */
case INTR_TYPE_HARD_EXCEPTION:
- if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
- kvm_queue_exception_e(&vmx->vcpu, vector, err);
- } else
- kvm_queue_exception(&vmx->vcpu, vector);
+ if (!is_nested(&vmx->vcpu)) {
+ if (idt_vectoring_info &
+ VECTORING_INFO_DELIVER_CODE_MASK) {
+ u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ kvm_queue_exception_e(&vmx->vcpu, vector, err);
+ } else
+ kvm_queue_exception(&vmx->vcpu, vector);
+ }
break;
case INTR_TYPE_SOFT_INTR:
vmx->vcpu.arch.event_exit_inst_len =
@@ -3591,6 +4857,62 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
| vmx->rmode.irq.vector;
}
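+/*
+ * If the exit from L2 left valid IDT vectoring info, re-inject the event on
+ * the next entry, provided the corresponding interrupt/NMI window is open.
+ */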
+static int nested_handle_pending_idt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int irq;
+ int type;
+ int errCodeValid;
+ u32 idt_vectoring_info;
+ u32 guest_intr;
+ bool nmi_window_open;
+ bool interrupt_window_open;
+
+ if (is_nested(vcpu) && vmx->nested_pending_valid_idt) {
+ idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ irq = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+ errCodeValid = idt_vectoring_info &
+ VECTORING_INFO_DELIVER_CODE_MASK;
+
+ guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ nmi_window_open =
+ !(guest_intr & (GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_MOV_SS |
+ GUEST_INTR_STATE_NMI));
+
+ interrupt_window_open =
+ ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ !(guest_intr & (GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_MOV_SS)));
+
+ if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
+ printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
+ return 0;
+ }
+
+ if (type == INTR_TYPE_NMI_INTR && !nmi_window_open) {
+ printk(KERN_INFO "IDT ignored, l2 nmi window closed!\n");
+ return 0;
+ }
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ irq | type | INTR_INFO_VALID_MASK | errCodeValid);
+
+
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+
+ if (errCodeValid)
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs_read32(IDT_VECTORING_ERROR_CODE));
+
+ return 1;
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
@@ -3602,6 +4924,16 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 intr_info;
+
+ nested_handle_pending_idt(vcpu);
+
+ if (is_nested(vcpu)) {
+ vmcs_writel(GUEST_CR0, vmx->l2_state->shadow_vmcs->guest_cr0);
+		vmcs_write32(EXCEPTION_BITMAP,
+			     vmx->l2_state->shadow_vmcs->exception_bitmap |
+			     vmx->l1_state->shadow_vmcs->exception_bitmap);
+ }
if (enable_ept && is_paging(vcpu)) {
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
@@ -3740,12 +5072,27 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
get_debugreg(vcpu->arch.dr6, 6);
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+ vmx->nested_pending_valid_idt = is_nested(vcpu) &&
+ (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
+ if (is_nested(vcpu))
+ vmx->vmclear = 0;
+
+ intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
+ (intr_info & INTR_INFO_VALID_MASK)) {
+ printk(KERN_INFO "%s:%d: injecting NMI\n", __func__, __LINE__);
+ asm("int $2");
+ }
+
vmx_complete_interrupts(vmx);
}
@@ -3828,6 +5175,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
+ vmx->l1_cur_vmcs = 0;
+
+ vmx->l1_state = NULL;
+ vmx->l2_state = NULL;
+
return &vmx->vcpu;
free_vmcs:
@@ -3918,6 +5270,1157 @@ static bool vmx_gb_page_enable(void)
return false;
}
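+/* Copy the contents of the current hardware VMCS into a shadow_vmcs. */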
+void save_vmcs(struct shadow_vmcs *dst)
+{
+ dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+ dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+ dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+ dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+ dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+ dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+ dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+ dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+ dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+ dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+ dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+ dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+ dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+ dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+ dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+ dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+ dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+ if (cpu_has_vmx_msr_bitmap())
+ dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
+
+ dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
+ dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
+ dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
+ dst->tsc_offset = vmcs_read64(TSC_OFFSET);
+ dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
+ dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
+ if (enable_ept)
+ dst->ept_pointer = vmcs_read64(EPT_POINTER);
+
+ dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+ dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+ if (enable_ept) {
+ dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+ dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+ dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+ dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ }
+ dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
+ dst->page_fault_error_code_mask =
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
+ dst->page_fault_error_code_match =
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
+ dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+ dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
+ dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
+ dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
+ dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
+ dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
+ dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ dst->vm_entry_exception_error_code =
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+ dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
+ dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ if (enable_vpid && dst->secondary_vm_exec_control &
+ SECONDARY_EXEC_ENABLE_VPID)
+ dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
+ dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+ dst->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+ dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+ dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+ dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+ dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+ dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+ dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+ dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+ dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+ dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+ dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+ dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+ dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+ dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+ dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+ dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+ dst->guest_interruptibility_info =
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+ dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+ dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
+ dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
+ dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
+ dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+ dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+ dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
+ dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
+ dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
+ dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
+ dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+ dst->guest_cr0 = vmcs_readl(GUEST_CR0);
+ dst->guest_cr3 = vmcs_readl(GUEST_CR3);
+ dst->guest_cr4 = vmcs_readl(GUEST_CR4);
+ dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+ dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+ dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+ dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+ dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+ dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+ dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+ dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+ dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+ dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+ dst->guest_dr7 = vmcs_readl(GUEST_DR7);
+ dst->guest_rsp = vmcs_readl(GUEST_RSP);
+ dst->guest_rip = vmcs_readl(GUEST_RIP);
+ dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+ dst->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+ dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+ dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+ dst->host_cr0 = vmcs_readl(HOST_CR0);
+ dst->host_cr3 = vmcs_readl(HOST_CR3);
+ dst->host_cr4 = vmcs_readl(HOST_CR4);
+ dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
+ dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
+ dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
+ dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+ dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+ dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
+ dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
+ dst->host_rsp = vmcs_readl(HOST_RSP);
+ dst->host_rip = vmcs_readl(HOST_RIP);
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+ dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
+}
+
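+/*
+ * Allocate the shadow VMCS for the nested (L2) guest on first use and
+ * snapshot the currently loaded hardware VMCS into it.
+ */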
+static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->l2_state->shadow_vmcs) {
+ vmx->l2_state->shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!vmx->l2_state->shadow_vmcs) {
+ printk(KERN_INFO "%s error creating nested vmcs\n",
+ __func__);
+ return -ENOMEM;
+ }
+ }
+
+ save_vmcs(vmx->l2_state->shadow_vmcs);
+
+ return 0;
+}
+
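+/*
+ * Called when switching from L2 back to L1: copy the L2 guest state out of
+ * the hardware VMCS into the L2 shadow VMCS (the VMCS L1 sees), and the
+ * host fields into the L1 shadow VMCS.
+ */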
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+ struct shadow_vmcs *l2_shadow_vmcs =
+ to_vmx(vcpu)->l2_state->shadow_vmcs;
+ struct shadow_vmcs *l1_shadow_vmcs =
+ to_vmx(vcpu)->l1_state->shadow_vmcs;
+
+ l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+ l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+ l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+ l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+ l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+ l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+ l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+ l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+ l1_shadow_vmcs->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+ l1_shadow_vmcs->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+ l1_shadow_vmcs->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+ l1_shadow_vmcs->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+ l1_shadow_vmcs->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+ l1_shadow_vmcs->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+ l1_shadow_vmcs->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+
+ l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
+ l2_shadow_vmcs->guest_physical_address =
+ vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+ l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+ l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+ l2_shadow_vmcs->vm_entry_intr_info_field =
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ l2_shadow_vmcs->vm_entry_exception_error_code =
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ l2_shadow_vmcs->vm_entry_instruction_len =
+ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+ l2_shadow_vmcs->vm_instruction_error =
+ vmcs_read32(VM_INSTRUCTION_ERROR);
+ l2_shadow_vmcs->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+ l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ l2_shadow_vmcs->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ l2_shadow_vmcs->idt_vectoring_info_field =
+ vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ l2_shadow_vmcs->idt_vectoring_error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ l2_shadow_vmcs->vm_exit_instruction_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ l2_shadow_vmcs->vmx_instruction_info =
+ vmcs_read32(VMX_INSTRUCTION_INFO);
+ l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+ l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+ l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+ l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+ l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+ l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+ l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+ l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+ l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+ l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+ l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+ l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+ l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+ l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+ l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+ l2_shadow_vmcs->guest_interruptibility_info =
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ l2_shadow_vmcs->guest_activity_state =
+ vmcs_read32(GUEST_ACTIVITY_STATE);
+ l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+ l1_shadow_vmcs->host_ia32_sysenter_cs =
+ vmcs_read32(HOST_IA32_SYSENTER_CS);
+
+ l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+ l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+ l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+ l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
+
+ if (nested_cpu_has_vmx_ept(vcpu))
+ l2_shadow_vmcs->guest_cr3 = vmcs_readl(GUEST_CR3);
+
+ l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
+ l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+ l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+ l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+ l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+ l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+ l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+ l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+ l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+ l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+ l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+ l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
+ l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
+ l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
+ l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+ l2_shadow_vmcs->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+ l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+ l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+
+ l1_shadow_vmcs->host_cr0 = vmcs_readl(HOST_CR0);
+ l1_shadow_vmcs->host_cr3 = vmcs_readl(HOST_CR3);
+ l1_shadow_vmcs->host_cr4 = vmcs_readl(HOST_CR4);
+ l1_shadow_vmcs->host_fs_base = vmcs_readl(HOST_FS_BASE);
+ l1_shadow_vmcs->host_gs_base = vmcs_readl(HOST_GS_BASE);
+ l1_shadow_vmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
+ l1_shadow_vmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+ l1_shadow_vmcs->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+ l1_shadow_vmcs->host_ia32_sysenter_esp =
+ vmcs_readl(HOST_IA32_SYSENTER_ESP);
+ l1_shadow_vmcs->host_ia32_sysenter_eip =
+ vmcs_readl(HOST_IA32_SYSENTER_EIP);
+ l1_shadow_vmcs->host_rsp = vmcs_readl(HOST_RSP);
+ l1_shadow_vmcs->host_rip = vmcs_readl(HOST_RIP);
+}
+
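+/*
+ * Load the guest-state fields of a shadow VMCS into the hardware VMCS;
+ * used both when entering L2 and when switching back to L1.
+ */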
+int load_vmcs_common(struct shadow_vmcs *src)
+{
+ vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
+ vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
+ vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
+ vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
+ vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
+ vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
+ vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
+ vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
+
+ vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
+
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
+
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ src->vm_entry_exception_error_code);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
+
+ vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
+ vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
+ vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
+ vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
+ vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
+ vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
+ vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
+ vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
+ vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
+ vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
+ vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
+ vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
+ vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
+ vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
+ vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
+ vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
+ vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+ src->guest_interruptibility_info);
+ vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
+ vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
+
+ vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
+ vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
+ vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
+ vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
+ vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
+ vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
+ vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
+ vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
+ vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
+ vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
+ vmcs_writel(GUEST_DR7, src->guest_dr7);
+ vmcs_writel(GUEST_RSP, src->guest_rsp);
+ vmcs_writel(GUEST_RIP, src->guest_rip);
+ vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ src->guest_pending_dbg_exceptions);
+ vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
+
+ return 0;
+}
+
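+/* Load the host-state fields of a shadow VMCS into the hardware VMCS. */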
+int load_vmcs_host_state(struct shadow_vmcs *src)
+{
+ vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
+ vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
+ vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
+ vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
+ vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
+ vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
+ vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);
+
+ vmcs_write64(TSC_OFFSET, src->tsc_offset);
+
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+ vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
+
+ vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
+
+ vmcs_writel(HOST_CR0, src->host_cr0);
+ vmcs_writel(HOST_CR3, src->host_cr3);
+ vmcs_writel(HOST_CR4, src->host_cr4);
+ vmcs_writel(HOST_FS_BASE, src->host_fs_base);
+ vmcs_writel(HOST_GS_BASE, src->host_gs_base);
+ vmcs_writel(HOST_TR_BASE, src->host_tr_base);
+ vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);
+ vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);
+ vmcs_writel(HOST_RSP, src->host_rsp);
+ vmcs_writel(HOST_RIP, src->host_rip);
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
+ vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);
+
+ return 0;
+}
+
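+/* Allocate a level_state together with its shadow VMCS page. */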
+struct level_state *create_state(void)
+{
+ struct level_state *state = NULL;
+
+ state = kzalloc(sizeof(struct level_state), GFP_KERNEL);
+ if (!state) {
+ printk(KERN_INFO "Error create level state\n");
+ return NULL;
+ }
+ state->shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!state->shadow_vmcs) {
+ printk(KERN_INFO "%s error creating shadow vmcs\n",
+ __func__);
+ kfree(state);
+ return NULL;
+ }
+ return state;
+}
+
+int create_l1_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->l1_state) {
+ vmx->l1_state = create_state();
+ if (!vmx->l1_state)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
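+/*
+ * Allocate the L2 state on first use, record the MSR/IO bitmaps currently
+ * programmed in the VMCS and set up a private MMU page hash for the nested
+ * guest's shadow page tables.
+ */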
+int create_l2_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->l2_state) {
+ vmx->l2_state = create_state();
+ if (!vmx->l2_state)
+ return -ENOMEM;
+ }
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx->l2_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+ else
+ vmx->l2_state->msr_bitmap = 0;
+
+ vmx->l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+ vmx->l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+
+ vmx->l2_state->mmu_page_hash =
+ kzalloc(sizeof(struct hlist_head)*KVM_NUM_MMU_PAGES,
+ GFP_KERNEL);
+ if (!vmx->l2_state->mmu_page_hash)
+ return -ENOMEM;
+
+ return 0;
+}
+
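+/*
+ * Merge the controls L1 set up for L2 (the L2 shadow VMCS) with L1's own
+ * settings into the hardware VMCS that will actually run L2.
+ */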
+int prepare_vmcs_02(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shadow_vmcs *src = vmx->l2_state->shadow_vmcs;
+ u32 exec_control;
+
+ if (!src) {
+ printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
+ return 1;
+ }
+
+ load_vmcs_common(src);
+
+ if (cpu_has_vmx_vpid() && vmx->l2_state->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->l2_state->vpid);
+
+ if (vmx->l2_state->io_bitmap_a)
+ vmcs_write64(IO_BITMAP_A, vmx->l2_state->io_bitmap_a);
+
+ if (vmx->l2_state->io_bitmap_b)
+ vmcs_write64(IO_BITMAP_B, vmx->l2_state->io_bitmap_b);
+
+ if (vmx->l2_state->msr_bitmap)
+ vmcs_write64(MSR_BITMAP, vmx->l2_state->msr_bitmap);
+
+ if (src->vm_entry_msr_load_count > 0) {
+ struct page *page;
+
+ page = nested_get_page(vcpu,
+ src->vm_entry_msr_load_addr);
+ if (!page)
+ return 1;
+
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
+
+ kvm_release_page_clean(page);
+ }
+
+ if (src->virtual_apic_page_addr != 0) {
+ struct page *page;
+
+ page = nested_get_page(vcpu,
+ src->virtual_apic_page_addr);
+ if (!page)
+ return 1;
+
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
+
+ kvm_release_page_clean(page);
+ } else {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ src->virtual_apic_page_addr);
+ }
+
+ if (vm_need_virtualize_apic_accesses(vcpu->kvm)) {
+ if (src->apic_access_addr != 0) {
+ struct page *page =
+ nested_get_page(vcpu, src->apic_access_addr);
+ if (!page)
+ return 1;
+
+ vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+ kvm_release_page_clean(page);
+ } else {
+ vmcs_write64(APIC_ACCESS_ADDR, 0);
+ }
+ }
+
+ if (vm_need_tpr_shadow(vcpu->kvm) &&
+ nested_cpu_has_vmx_tpr_shadow(vcpu))
+ vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
+
+ if (enable_ept) {
+ if (!nested_cpu_has_vmx_ept(vcpu)) {
+ vmcs_write64(EPT_POINTER,
+ vmx->l1_state->shadow_vmcs->ept_pointer);
+ vmcs_write64(GUEST_PDPTR0,
+ vmx->l1_state->shadow_vmcs->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1,
+ vmx->l1_state->shadow_vmcs->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2,
+ vmx->l1_state->shadow_vmcs->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3,
+ vmx->l1_state->shadow_vmcs->guest_pdptr3);
+ } else {
+ vmcs_write64(GUEST_PDPTR0,
+ vmx->l2_state->shadow_vmcs->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1,
+ vmx->l2_state->shadow_vmcs->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2,
+ vmx->l2_state->shadow_vmcs->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3,
+ vmx->l2_state->shadow_vmcs->guest_pdptr3);
+ }
+
+ }
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+ (vmx->l1_state->shadow_vmcs->pin_based_vm_exec_control |
+ src->pin_based_vm_exec_control));
+
+ exec_control = vmx->l1_state->shadow_vmcs->cpu_based_vm_exec_control;
+
+ exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+
+ exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+
+ exec_control |= src->cpu_based_vm_exec_control;
+
+ if (!vm_need_tpr_shadow(vcpu->kvm) ||
+ src->virtual_apic_page_addr == 0) {
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_STORE_EXITING |
+ CPU_BASED_CR8_LOAD_EXITING;
+#endif
+ } else if (exec_control & CPU_BASED_TPR_SHADOW) {
+
+#ifdef CONFIG_X86_64
+ exec_control &= ~CPU_BASED_CR8_STORE_EXITING;
+ exec_control &= ~CPU_BASED_CR8_LOAD_EXITING;
+#endif
+ }
+
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+ vmcs_write32(EXCEPTION_BITMAP,
+ (vmx->l1_state->shadow_vmcs->exception_bitmap |
+ src->exception_bitmap));
+
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+ (vmx->l1_state->shadow_vmcs->page_fault_error_code_mask &
+ src->page_fault_error_code_mask));
+
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+ (vmx->l1_state->shadow_vmcs->page_fault_error_code_match &
+ src->page_fault_error_code_match));
+
+ vmcs_write32(VM_EXIT_CONTROLS,
+ ((vmx->l1_state->shadow_vmcs->vm_exit_controls &
+ NESTED_VM_EXIT_CONTROLS_MASK) | src->vm_exit_controls));
+
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ (vmx->l1_state->shadow_vmcs->vm_entry_controls &
+ NESTED_VM_ENTRY_CONTROLS_MASK) | src->vm_entry_controls);
+
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+
+ exec_control =
+ vmx->l1_state->shadow_vmcs->secondary_vm_exec_control;
+
+ if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
+
+ exec_control |= src->secondary_vm_exec_control;
+
+ if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
+ !nested_vm_need_virtualize_apic_accesses(vcpu))
+ exec_control &=
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ }
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ }
+
+ vmcs_writel(CR0_GUEST_HOST_MASK,
+ (vmx->l1_state->shadow_vmcs->cr0_guest_host_mask &
+ src->cr0_guest_host_mask));
+ vmcs_writel(CR4_GUEST_HOST_MASK,
+ (vmx->l1_state->shadow_vmcs->cr4_guest_host_mask &
+ src->cr4_guest_host_mask));
+
+ load_vmcs_host_state(vmx->l1_state->shadow_vmcs);
+
+ return 0;
+}
+
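+/*
+ * Restore L1's controls, bitmaps and host state into the hardware VMCS
+ * after a nested VM exit, undoing prepare_vmcs_02().
+ */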
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct shadow_vmcs *src = to_vmx(vcpu)->l1_state->shadow_vmcs;
+
+ if (enable_vpid && src->virtual_processor_id != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+ vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+ vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+ if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+ vmcs_write64(APIC_ACCESS_ADDR,
+ src->apic_access_addr);
+
+ if (enable_ept) {
+ vmcs_write64(EPT_POINTER, src->ept_pointer);
+ vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+ }
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+ vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+ src->page_fault_error_code_mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+ src->page_fault_error_code_match);
+ vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+ vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ src->secondary_vm_exec_control);
+
+ load_vmcs_common(src);
+
+ load_vmcs_host_state(to_vmx(vcpu)->l1_state->shadow_vmcs);
+
+ return 0;
+}
+
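+/*
+ * Flush the cached RSP/RIP back into the VMCS before switching VMCSs;
+ * any other dirty cached register at this point is a bug.
+ */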
+void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
+{
+ unsigned long mask;
+
+ if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+ mask = ~((1 << VCPU_REGS_RSP) | (1 << VCPU_REGS_RIP));
+
+ if (vcpu->arch.regs_dirty & mask) {
+ printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n",
+ vcpu->arch.regs_dirty, mask);
+ WARN_ON(1);
+ }
+
+ vcpu->arch.regs_dirty = 0;
+}
+
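+/*
+ * Switch the vcpu from running L1 to running L2: save L1's state, load the
+ * L2 VMCS, merge the controls via prepare_vmcs_02() and install L2's
+ * CR0/CR3/CR4, RSP and RIP so the next hardware VM entry runs L2.
+ */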
+static int nested_vmx_run(struct kvm_vcpu *vcpu)
+{
+ /* verify that l1 has done vmptrld for l2 earlier */
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int initial_fpu_active = vcpu->fpu_active;
+ int r = 0;
+
+ if (is_nested(vcpu)) {
+ printk(KERN_INFO "Nested guest already running\n");
+ vmfailValid_rflags(vcpu);
+ return 1;
+ }
+
+ vmx->nested_mode = 1;
+
+ vcpu->arch.exception.pending = false;
+
+ sync_cached_regs_to_vmcs(vcpu);
+
+ save_vmcs(vmx->l1_state->shadow_vmcs);
+
+ vmx->l1_state->shadow_efer = vcpu->arch.shadow_efer;
+ if (!enable_ept)
+ vmx->l1_state->cr3 = vcpu->arch.cr3;
+ vmx->l1_state->cr4 = vcpu->arch.cr4;
+
+ if (enable_vpid) {
+ if (vmx->l2_state->vpid == 0) {
+ allocate_vpid(vmx);
+ vmx->l2_state->vpid = vmx->vpid;
+ }
+ }
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx->l1_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+ else
+ vmx->l1_state->msr_bitmap = 0;
+
+ vmx->l1_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+ vmx->l1_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+ vmx->l1_state->vmcs = vmx->vmcs;
+ vmx->l1_state->cpu = vcpu->cpu;
+ vmx->l1_state->launched = vmx->launched;
+
+ vmx->vmcs = vmx->l2_state->vmcs;
+ vcpu->cpu = vmx->l2_state->cpu;
+ vmx->launched = vmx->l2_state->launched;
+
+ if (nested_cpu_has_vmx_ept(vcpu)) {
+ vmx->l1_state->mmu_page_hash =
+ vcpu->kvm->arch.mmu_page_hash;
+ vcpu->kvm->arch.mmu_page_hash =
+ vmx->l2_state->mmu_page_hash;
+ }
+
+ if (vmx->vmclear || !vmx->launched) {
+ vmcs_clear(vmx->vmcs);
+ vmx->launched = 0;
+ }
+
+ vmx_vcpu_load(vcpu, get_cpu());
+ put_cpu();
+
+ prepare_vmcs_02(vcpu);
+
+ if (vmx->l2_state->shadow_vmcs->vm_entry_controls &
+ VM_ENTRY_IA32E_MODE) {
+ if (!((vcpu->arch.shadow_efer & EFER_LMA) &&
+ (vcpu->arch.shadow_efer & EFER_LME)))
+ vcpu->arch.shadow_efer |= (EFER_LMA | EFER_LME);
+ } else {
+ if ((vcpu->arch.shadow_efer & EFER_LMA) ||
+ (vcpu->arch.shadow_efer & EFER_LME))
+ vcpu->arch.shadow_efer = 0;
+ }
+
+ vmx_set_cr0(vcpu, vmx->l2_state->shadow_vmcs->guest_cr0);
+ vmcs_writel(CR0_READ_SHADOW,
+ vmx->l2_state->shadow_vmcs->cr0_read_shadow);
+ vmx_set_cr4(vcpu, vmx->l2_state->shadow_vmcs->guest_cr4);
+ vmcs_writel(CR4_READ_SHADOW,
+ vmx->l2_state->shadow_vmcs->cr4_read_shadow);
+
+ vcpu->arch.cr0 |= X86_CR0_PG;
+
+ if (nested_cpu_has_vmx_ept(vcpu))
+ nested_tdp = 1;
+
+ if (cpu_has_vmx_ept() && !nested_cpu_has_vmx_ept(vcpu)) {
+ vmcs_writel(GUEST_CR3, vmx->l2_state->shadow_vmcs->guest_cr3);
+ vmx->vcpu.arch.cr3 = vmx->l2_state->shadow_vmcs->guest_cr3;
+ } else {
+ kvm_set_cr3(vcpu, vmx->l2_state->shadow_vmcs->guest_cr3);
+ kvm_mmu_reset_context(vcpu);
+
+ r = kvm_mmu_load(vcpu);
+ if (unlikely(r)) {
+ printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+ nested_vmx_vmexit(vcpu, false);
+ vmfailValid_rflags(vcpu);
+ return 1;
+ }
+
+ if (nested_cpu_has_vmx_ept(vcpu))
+ vmx->l2_state->ept_pointer = vmcs_read64(EPT_POINTER);
+ }
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP,
+ vmx->l2_state->shadow_vmcs->guest_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP,
+ vmx->l2_state->shadow_vmcs->guest_rip);
+
+ vmcs_write32(EXCEPTION_BITMAP,
+ (vmx->l1_state->shadow_vmcs->exception_bitmap |
+ vmx->l2_state->shadow_vmcs->exception_bitmap));
+
+ if (initial_fpu_active)
+ vmx_fpu_activate(vcpu);
+
+ return 1;
+}
+
+static int launch_guest(struct kvm_vcpu *vcpu)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ skip_emulated_instruction(vcpu);
+
+ nested_vmx_run(vcpu);
+
+ return 1;
+}
+
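+/*
+ * Emulate a VM exit from L2 to L1: snapshot L2's state with
+ * prepare_vmcs_12(), switch back to L1's VMCS and restore L1's control
+ * registers, paging context and RSP/RIP.
+ */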
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+ bool is_interrupt)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int initial_fpu_active = vcpu->fpu_active;
+
+ if (!is_nested(vcpu)) {
+ printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+ __func__);
+ return 0;
+ }
+
+ save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+
+ sync_cached_regs_to_vmcs(vcpu);
+
+ prepare_vmcs_12(vcpu);
+ if (is_interrupt)
+ vmx->l2_state->shadow_vmcs->vm_exit_reason =
+ EXIT_REASON_EXTERNAL_INTERRUPT;
+
+ vmx->l2_state->launched = vmx->launched;
+ vmx->l2_state->cpu = vcpu->cpu;
+
+ vmx->vmcs = vmx->l1_state->vmcs;
+ vcpu->cpu = vmx->l1_state->cpu;
+ vmx->launched = vmx->l1_state->launched;
+
+ if (enable_ept && nested_cpu_has_vmx_ept(vcpu)) {
+ vmx->l2_state->ept_pointer = vmcs_read64(EPT_POINTER);
+ vcpu->kvm->arch.mmu_page_hash =
+ vmx->l1_state->mmu_page_hash;
+ nested_tdp = 0;
+ }
+
+ vmx_vcpu_load(vcpu, get_cpu());
+ put_cpu();
+
+ vcpu->arch.exception.pending = false;
+
+ vcpu->arch.shadow_efer = vmx->l1_state->shadow_efer;
+ vmx_set_cr0(vcpu, vmx->l1_state->shadow_vmcs->cr0_read_shadow);
+ vmx_set_cr4(vcpu, vmx->l1_state->cr4);
+
+ if (enable_ept) {
+ vcpu->arch.cr3 = vmx->l1_state->shadow_vmcs->guest_cr3;
+ vmcs_writel(GUEST_CR3, vmx->l1_state->shadow_vmcs->guest_cr3);
+ } else {
+ kvm_set_cr3(vcpu, vmx->l1_state->cr3);
+ }
+
+ switch_back_vmcs(vcpu);
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP,
+ vmx->l1_state->shadow_vmcs->guest_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP,
+ vmx->l1_state->shadow_vmcs->guest_rip);
+
+ vmx->nested_mode = 0;
+
+ kvm_mmu_reset_context(vcpu);
+ kvm_mmu_load(vcpu);
+
+ if (unlikely(vmx->fail)) {
+ vmx->fail = 0;
+ vmfailValid_rflags(vcpu);
+ } else
+ clear_rflags(vcpu);
+
+ if (initial_fpu_active)
+ vmx_fpu_activate(vcpu);
+
+ return 0;
+}
+
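+/*
+ * For RDMSR/WRMSR exits while L2 is running, consult the MSR bitmap L1
+ * configured for L2 to decide whether the exit must be forwarded to L1
+ * (return 1) or can be handled by L0 (return 0).
+ */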
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+ if (is_nested(vcpu)) {
+ struct page *msr_page = NULL;
+ void *msr_bitmap;
+ u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+ struct shadow_vmcs *l2svmcs =
+ to_vmx(vcpu)->l2_state->shadow_vmcs;
+ int ret = 0;
+
+ if (!cpu_has_vmx_msr_bitmap()
+ || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+ return 1;
+
+ msr_page = nested_get_page(vcpu,
+ l2svmcs->msr_bitmap);
+
+ if (!msr_page) {
+ printk(KERN_INFO "%s error in nested_get_page\n",
+ __func__);
+ return 0;
+ }
+
+ /*
+ * The page holds four 1KB bitmaps: read-low, read-high,
+ * write-low, write-high.  Map it before testing bits and
+ * drop the page reference when done.
+ */
+ msr_bitmap = page_address(msr_page);
+
+ switch (exit_code) {
+ case EXIT_REASON_MSR_READ:
+ if (msr_index <= 0x1fff) {
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_bitmap +
+ 0x000)))
+ ret = 1;
+ } else if ((msr_index >= 0xc0000000) &&
+ (msr_index <= 0xc0001fff)) {
+ msr_index &= 0x1fff;
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_bitmap +
+ 0x400)))
+ ret = 1;
+ }
+ break;
+ case EXIT_REASON_MSR_WRITE:
+ if (msr_index <= 0x1fff) {
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_bitmap +
+ 0x800)))
+ ret = 1;
+ } else if ((msr_index >= 0xc0000000) &&
+ (msr_index <= 0xc0001fff)) {
+ msr_index &= 0x1fff;
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_bitmap +
+ 0xc00)))
+ ret = 1;
+ }
+ break;
+ }
+
+ kvm_release_page_clean(msr_page);
+ return ret;
+ }
+
+ return 0;
+}
+
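+/*
+ * Decide whether an exit that occurred while running L2 should be handled
+ * by L0 (return 0) or reflected to the L1 hypervisor (return 1), based on
+ * the controls L1 programmed for L2.
+ */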
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
+{
+ u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ struct shadow_vmcs *l2svmcs = vmx->l2_state->shadow_vmcs;
+
+ if (vmx->nested_run_pending)
+ return 0;
+
+ if (unlikely(vmx->fail)) {
+ printk(KERN_INFO "%s failed vm entry %x\n",
+ __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+ return 1;
+ }
+
+ if (kvm_override) {
+ switch (exit_code) {
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return 0;
+ case EXIT_REASON_EXCEPTION_NMI:
+ if (!is_exception(intr_info))
+ return 0;
+
+ if (is_page_fault(intr_info) && (!enable_ept))
+ return 0;
+
+ break;
+ case EXIT_REASON_EPT_VIOLATION:
+ if (enable_ept)
+ return 0;
+
+ break;
+ }
+ }
+
+ switch (exit_code) {
+ case EXIT_REASON_INVLPG:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_INVLPG_EXITING)
+ return 1;
+
+ break;
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ return nested_vmx_exit_handled_msr(vcpu);
+ case EXIT_REASON_CR_ACCESS: {
+ unsigned long exit_qualification =
+ vmcs_readl(EXIT_QUALIFICATION);
+ int cr = exit_qualification & 15;
+ int reg = (exit_qualification >> 8) & 15;
+ unsigned long val = kvm_register_read(vcpu, reg);
+
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ switch (cr) {
+ case 0:
+ if (l2svmcs->cr0_guest_host_mask &
+ (val ^ l2svmcs->cr0_read_shadow))
+ return 1;
+ break;
+ case 3:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_LOAD_EXITING)
+ return 1;
+ break;
+ case 4:
+ if (l2svmcs->cr4_guest_host_mask &
+ (l2svmcs->cr4_read_shadow ^ val))
+ return 1;
+ break;
+ case 8:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_LOAD_EXITING)
+ return 1;
+ break;
+ }
+ break;
+ case 2: /* clts */
+ if (l2svmcs->cr0_guest_host_mask &
+ (val ^ l2svmcs->cr0_read_shadow))
+ return 1;
+ break;
+ case 1: /*mov from cr*/
+ switch (cr) {
+ case 0:
+ return 1;
+ case 3:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_STORE_EXITING)
+ return 1;
+ break;
+ case 4:
+ return 1;
+ case 8:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_STORE_EXITING)
+ return 1;
+ break;
+ }
+ break;
+ case 3: /* lmsw */
+ if (l2svmcs->cr0_guest_host_mask &
+ (val ^ l2svmcs->cr0_read_shadow))
+ return 1;
+ break;
+ }
+ break;
+ }
+ case EXIT_REASON_DR_ACCESS: {
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_MOV_DR_EXITING)
+ return 1;
+ break;
+ }
+
+ case EXIT_REASON_EXCEPTION_NMI: {
+
+ if (is_external_interrupt(intr_info) &&
+ (l2svmcs->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK))
+ return 1;
+
+ if (is_nmi(intr_info) &&
+ (l2svmcs->pin_based_vm_exec_control &
+ PIN_BASED_NMI_EXITING))
+ return 1;
+
+ if (is_exception(intr_info) &&
+ (l2svmcs->exception_bitmap &
+ (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+ return 1;
+
+ if (is_page_fault(intr_info))
+ return 1;
+
+ break;
+ }
+
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ if (l2svmcs->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK)
+ return 1;
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
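+/*
+ * If L1 intercepts the exception being queued for L2, emulate a VM exit to
+ * L1 and record the exception in the L2 shadow VMCS instead of injecting
+ * it directly into L2.
+ */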
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+ bool has_error_code, u32 error_code)
+{
+ if (is_nested(&vmx->vcpu)) {
+ if (nested_vmx_exit_handled(&vmx->vcpu, false)) {
+ nested_vmx_vmexit(&vmx->vcpu, false);
+ vmx->l2_state->shadow_vmcs->vm_exit_reason =
+ EXIT_REASON_EXCEPTION_NMI;
+ vmx->l2_state->shadow_vmcs->vm_exit_intr_info =
+ (nr | INTR_TYPE_HARD_EXCEPTION
+ | (has_error_code ?
+ INTR_INFO_DELIVER_CODE_MASK : 0)
+ | INTR_INFO_VALID_MASK);
+
+ if (has_error_code)
+ vmx->l2_state->shadow_vmcs->
+ vm_exit_intr_error_code = error_code;
+ return 1;
+ }
+ }
+ return 0;
+}
+
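+/*
+ * On an external interrupt while L2 is running: if L1 requested
+ * external-interrupt exiting, force a nested VM exit so L1 handles it.
+ */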
+static int nested_vmx_intr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (is_nested(vcpu)) {
+ if (vmx->l2_state->shadow_vmcs->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK) {
+ if (vmx->nested_run_pending)
+ return 0;
+
+ nested_vmx_vmexit(vcpu, true);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -3965,7 +6468,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_interrupt_shadow = vmx_set_interrupt_shadow,
.get_interrupt_shadow = vmx_get_interrupt_shadow,
.patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
+ .set_irq = vmx_set_irq,
.set_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
.interrupt_allowed = vmx_interrupt_allowed,
@@ -48,6 +48,9 @@
#include <asm/mtrr.h>
#include <asm/mce.h>
+int nested_tdp;
+EXPORT_SYMBOL_GPL(nested_tdp);
+
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -87,6 +90,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+int nested = 1;
+EXPORT_SYMBOL_GPL(nested);
+module_param(nested, int, S_IRUGO);
+
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -373,7 +380,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return;
}
- if (cr4 & X86_CR4_VMXE) {
+ if (cr4 & X86_CR4_VMXE && !nested) {
printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
kvm_inject_gp(vcpu, 0);
return;
@@ -4733,6 +4740,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
+ vcpu->kvm->arch.mmu_page_hash = vcpu->kvm->arch._mmu_page_hash;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -34,5 +34,13 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
+extern int nested_tdp;
+
+static inline bool is_nested_tdp(void)
+{
+ return nested_tdp;
+}
+
+extern int nested;
#endif
@@ -26,6 +26,8 @@
#include <asm/kvm_host.h>
+extern int nested;
+
/*
* vcpu->requests bit members
*/