Message ID | 201105161955.p4GJtB4O001985@rice.haifa.ibm.com (mailing list archive)
State      | New, archived
> From: Nadav Har'El
> Sent: Tuesday, May 17, 2011 3:55 AM
>
> This patch contains the logic of whether an L2 exit should be handled by L0
> and then L2 should be resumed, or whether L1 should be run to handle this
> exit (using the nested_vmx_vmexit() function of the previous patch).
>
> The basic idea is to let L1 handle the exit only if it actually asked to
> trap this sort of event. For example, when L2 exits on a change to CR0, we
> check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any bit
> which changed; If it did, we exit to L1. But if it didn't it means that it
> is we (L0) that wished to trap this event, so we handle it ourselves.
>
> The next two patches add additional logic of what to do when an interrupt
> or exception is injected: Does L0 need to do it, should we exit to L1 to
> do it, or should we resume L2 and keep the exception to be injected later.
>
> We keep a new flag, "nested_run_pending", which can override the decision
> of which should run next, L1 or L2. nested_run_pending=1 means that we
> *must* run L2 next, not L1. This is necessary in particular when L1 did a
> VMLAUNCH of L2 and therefore expects L2 to be run (and perhaps be injected
> with an event it specified, etc.). Nested_run_pending is especially
> intended to avoid switching to L1 in the injection decision-point
> described above.
>
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> ---
>  arch/x86/kvm/vmx.c |  256 ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 255 insertions(+), 1 deletion(-)
>
> --- .before/arch/x86/kvm/vmx.c	2011-05-16 22:36:49.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c	2011-05-16 22:36:49.000000000 +0300
> @@ -351,6 +351,8 @@ struct nested_vmx {
>  	/* Saving the VMCS that we used for running L1 */
>  	struct saved_vmcs saved_vmcs01;
>  	u64 vmcs01_tsc_offset;
> +	/* L2 must run next, and mustn't decide to exit to L1. */
> +	bool nested_run_pending;
>  	/*
>  	 * Guest pages referred to in vmcs02 with host-physical pointers, so
>  	 * we must keep them pinned while L2 runs.
> @@ -870,6 +872,20 @@ static inline bool nested_cpu_has2(struc
>  		(vmcs12->secondary_vm_exec_control & bit);
>  }
>
> +static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
> +{
> +	return is_guest_mode(vcpu) &&
> +		(get_vmcs12(vcpu)->pin_based_vm_exec_control &
> +			PIN_BASED_VIRTUAL_NMIS);
> +}

any reason to add guest mode check here? I didn't see such check in your
earlier nested_cpu_has_xxx. It would be clearer to use existing nested_cpu_has_xxx
along with is_guest_mode explicitly which makes such usage consistent.

> +
> +static inline bool is_exception(u32 intr_info)
> +{
> +	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> +		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
> +}
> +
> +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
>  static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
>  			struct vmcs12 *vmcs12,
>  			u32 reason, unsigned long qualification);
> @@ -5281,6 +5297,232 @@ static int (*kvm_vmx_exit_handlers[])(st
>  static const int kvm_vmx_max_exit_handlers =
>  	ARRAY_SIZE(kvm_vmx_exit_handlers);
>
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
> + * rather than handle it ourselves in L0. I.e., check whether L1 expressed
> + * disinterest in the current event (read or write a specific MSR) by using an
> + * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
> + */
> +static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
> +	struct vmcs12 *vmcs12, u32 exit_reason)
> +{
> +	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
> +	gpa_t bitmap;
> +
> +	if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
> +		return 1;
> +
> +	/*
> +	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
> +	 * for the four combinations of read/write and low/high MSR numbers.
> +	 * First we need to figure out which of the four to use:
> +	 */
> +	bitmap = vmcs12->msr_bitmap;
> +	if (exit_reason == EXIT_REASON_MSR_WRITE)
> +		bitmap += 2048;
> +	if (msr_index >= 0xc0000000) {
> +		msr_index -= 0xc0000000;
> +		bitmap += 1024;
> +	}
> +
> +	/* Then read the msr_index'th bit from this bitmap: */
> +	if (msr_index < 1024*8) {
> +		unsigned char b;
> +		kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
> +		return 1 & (b >> (msr_index & 7));
> +	} else
> +		return 1; /* let L1 handle the wrong parameter */
> +}
> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
> + * rather than handle it ourselves in L0. I.e., check if L1 wanted to
> + * intercept (via guest_host_mask etc.) the current event.
> + */
> +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
> +	struct vmcs12 *vmcs12)
> +{
> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	int cr = exit_qualification & 15;
> +	int reg = (exit_qualification >> 8) & 15;
> +	unsigned long val = kvm_register_read(vcpu, reg);
> +
> +	switch ((exit_qualification >> 4) & 3) {
> +	case 0: /* mov to cr */
> +		switch (cr) {
> +		case 0:
> +			if (vmcs12->cr0_guest_host_mask &
> +			    (val ^ vmcs12->cr0_read_shadow))
> +				return 1;
> +			break;
> +		case 3:
> +			if ((vmcs12->cr3_target_count >= 1 &&
> +					vmcs12->cr3_target_value0 == val) ||
> +				(vmcs12->cr3_target_count >= 2 &&
> +					vmcs12->cr3_target_value1 == val) ||
> +				(vmcs12->cr3_target_count >= 3 &&
> +					vmcs12->cr3_target_value2 == val) ||
> +				(vmcs12->cr3_target_count >= 4 &&
> +					vmcs12->cr3_target_value3 == val))
> +				return 0;
> +			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
> +				return 1;
> +			break;
> +		case 4:
> +			if (vmcs12->cr4_guest_host_mask &
> +			    (vmcs12->cr4_read_shadow ^ val))
> +				return 1;
> +			break;
> +		case 8:
> +			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
> +				return 1;
> +			break;
> +		}
> +		break;
> +	case 2: /* clts */
> +		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
> +		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
> +			return 1;
> +		break;
> +	case 1: /* mov from cr */
> +		switch (cr) {
> +		case 3:
> +			if (vmcs12->cpu_based_vm_exec_control &
> +			    CPU_BASED_CR3_STORE_EXITING)
> +				return 1;
> +			break;
> +		case 8:
> +			if (vmcs12->cpu_based_vm_exec_control &
> +			    CPU_BASED_CR8_STORE_EXITING)
> +				return 1;
> +			break;
> +		}
> +		break;
> +	case 3: /* lmsw */
> +		/*
> +		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
> +		 * cr0. Other attempted changes are ignored, with no exit.
> +		 */
> +		if (vmcs12->cr0_guest_host_mask & 0xe &
> +		    (val ^ vmcs12->cr0_read_shadow))
> +			return 1;
> +		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
> +		    !(vmcs12->cr0_read_shadow & 0x1) &&
> +		    (val & 0x1))
> +			return 1;
> +		break;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
> + * should handle it ourselves in L0 (and then continue L2). Only call this
> + * when in is_guest_mode (L2).
> + */
> +static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
> +{
> +	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
> +	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +	if (vmx->nested.nested_run_pending)
> +		return 0;
> +
> +	if (unlikely(vmx->fail)) {
> +		printk(KERN_INFO "%s failed vm entry %x\n",
> +		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> +		return 1;
> +	}
> +
> +	switch (exit_reason) {
> +	case EXIT_REASON_EXCEPTION_NMI:
> +		if (!is_exception(intr_info))
> +			return 0;
> +		else if (is_page_fault(intr_info))
> +			return enable_ept;
> +		return vmcs12->exception_bitmap &
> +				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
> +	case EXIT_REASON_EXTERNAL_INTERRUPT:
> +		return 0;
> +	case EXIT_REASON_TRIPLE_FAULT:
> +		return 1;
> +	case EXIT_REASON_PENDING_INTERRUPT:
> +	case EXIT_REASON_NMI_WINDOW:
> +		/*
> +		 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
> +		 * (aka Interrupt Window Exiting) only when L1 turned it on,
> +		 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
> +		 * Same for NMI Window Exiting.
> +		 */
> +		return 1;
> +	case EXIT_REASON_TASK_SWITCH:
> +		return 1;
> +	case EXIT_REASON_CPUID:
> +		return 1;
> +	case EXIT_REASON_HLT:
> +		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
> +	case EXIT_REASON_INVD:
> +		return 1;
> +	case EXIT_REASON_INVLPG:
> +		return vmcs12->cpu_based_vm_exec_control &
> +				CPU_BASED_INVLPG_EXITING;

use nested_cpu_has.

> +	case EXIT_REASON_RDPMC:
> +		return vmcs12->cpu_based_vm_exec_control &
> +				CPU_BASED_RDPMC_EXITING;
> +	case EXIT_REASON_RDTSC:
> +		return vmcs12->cpu_based_vm_exec_control &
> +				CPU_BASED_RDTSC_EXITING;

ditto

> +	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
> +	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
> +	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
> +	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
> +	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
> +		/*
> +		 * VMX instructions trap unconditionally. This allows L1 to
> +		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
> +		 */
> +		return 1;
> +	case EXIT_REASON_CR_ACCESS:
> +		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
> +	case EXIT_REASON_DR_ACCESS:
> +		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
> +	case EXIT_REASON_IO_INSTRUCTION:
> +		/* TODO: support IO bitmaps */
> +		return 1;
> +	case EXIT_REASON_MSR_READ:
> +	case EXIT_REASON_MSR_WRITE:
> +		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
> +	case EXIT_REASON_INVALID_STATE:
> +		return 1;
> +	case EXIT_REASON_MWAIT_INSTRUCTION:
> +		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
> +	case EXIT_REASON_MONITOR_INSTRUCTION:
> +		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
> +	case EXIT_REASON_PAUSE_INSTRUCTION:
> +		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
> +			nested_cpu_has2(vmcs12,
> +				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
> +	case EXIT_REASON_MCE_DURING_VMENTRY:
> +		return 0;
> +	case EXIT_REASON_TPR_BELOW_THRESHOLD:
> +		return 1;
> +	case EXIT_REASON_APIC_ACCESS:
> +		return nested_cpu_has2(vmcs12,
> +			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
> +	case EXIT_REASON_EPT_VIOLATION:
> +	case EXIT_REASON_EPT_MISCONFIG:
> +		return 0;
> +	case EXIT_REASON_WBINVD:
> +		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
> +	case EXIT_REASON_XSETBV:
> +		return 1;
> +	default:
> +		return 1;
> +	}
> +}
> +
>  static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>  {
>  	*info1 = vmcs_readl(EXIT_QUALIFICATION);
> @@ -5303,6 +5545,17 @@ static int vmx_handle_exit(struct kvm_vc
>  	if (vmx->emulation_required && emulate_invalid_guest_state)
>  		return handle_invalid_guest_state(vcpu);
>
> +	if (exit_reason == EXIT_REASON_VMLAUNCH ||
> +	    exit_reason == EXIT_REASON_VMRESUME)
> +		vmx->nested.nested_run_pending = 1;
> +	else
> +		vmx->nested.nested_run_pending = 0;

what about VMLAUNCH invoked from L2? In such case I think you expect
L1 to run instead of L2.

On the other hand, isn't just guest mode check enough to differentiate
pending nested run? When L1 invokes VMLAUNCH/VMRESUME, guest mode
hasn't been set yet, and below check will fail. All other operations will then
be checked by nested_vmx_exit_handled...

Do I miss anything here?

> +
> +	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> +		nested_vmx_vmexit(vcpu);
> +		return 1;
> +	}
> +
>  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>  		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
>  		vcpu->run->fail_entry.hardware_entry_failure_reason
> @@ -5325,7 +5578,8 @@ static int vmx_handle_exit(struct kvm_vc
>  		       "(0x%x) and exit reason is 0x%x\n",
>  		       __func__, vectoring_info, exit_reason);
>
> -	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
> +	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
> +	    !nested_cpu_has_virtual_nmis(vcpu))) {

Would L0 want to control vNMI for L2 guest? Otherwise we just use is_guest_mode
here for the condition check?

>  		if (vmx_interrupt_allowed(vcpu)) {
>  			vmx->soft_vnmi_blocked = 0;
>  		} else if (vmx->vnmi_blocked_time > 1000000000LL &&

Thanks,
Kevin
On Wed, May 25, 2011, Tian, Kevin wrote about "RE: [PATCH 22/31] nVMX: Deciding if L0 or L1 should handle an L2 exit":
> > +static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
> > +{
> > +	return is_guest_mode(vcpu) &&
> > +		(get_vmcs12(vcpu)->pin_based_vm_exec_control &
> > +			PIN_BASED_VIRTUAL_NMIS);
> > +}
>
> any reason to add guest mode check here? I didn't see such check in your
> earlier nested_cpu_has_xxx. It would be clearer to use existing nested_cpu_has_xxx
> along with is_guest_mode explicitly which makes such usage consistent.

The nested_cpu_has function is for procbased controls, not pinbased
controls... But you're right, it's strange that only this function has
is_guest_mode() in it. Moving it outside. The call site is now really
ugly ;-)

> > +	case EXIT_REASON_INVLPG:
> > +		return vmcs12->cpu_based_vm_exec_control &
> > +				CPU_BASED_INVLPG_EXITING;
>
> use nested_cpu_has.

Right ;-)

> > +	if (exit_reason == EXIT_REASON_VMLAUNCH ||
> > +	    exit_reason == EXIT_REASON_VMRESUME)
> > +		vmx->nested.nested_run_pending = 1;
> > +	else
> > +		vmx->nested.nested_run_pending = 0;
>
> what about VMLAUNCH invoked from L2? In such case I think you expect
> L1 to run instead of L2.

Wow, a good catch! According to the theory (see our paper :-)), our
implementations should work with any number of nesting levels, not just
two. But in practice, we've always had a bug in running L3, and we never
had the time to figure out why. This is a good lead - I'll have to
investigate. But not now.

> On the other hand, isn't just guest mode check enough to differentiate
> pending nested run? When L1 invokes VMLAUNCH/VMRESUME, guest mode
> hasn't been set yet, and below check will fail. All other operations will then
> be checked by nested_vmx_exit_handled...
>
> Do I miss anything here?

As we discussed in another thread, nested_run_pending is important later,
at the injection path.

> > -	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
> > +	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
> > +	    !nested_cpu_has_virtual_nmis(vcpu))) {
>
> Would L0 want to control vNMI for L2 guest? Otherwise we just use is_guest_mode
> here for the condition check?

I don't remember the details here, but this if() used to be different,
until Avi Kivity asked me to change it to this state.

Nadav.
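[Editor's note] For readers following the MSR discussion above, the sketch below is a small standalone illustration (not kernel code, and not part of the thread) of the quadrant lookup that the patch's nested_vmx_exit_handled_msr() performs on L1's 4 KiB MSR bitmap. The MSR number (EFER, 0xc0000080) and the bitmap contents are chosen only for the example; the point is how the read/write and low/high quadrants select the byte and bit that decide whether the exit is reflected to L1.

```c
/*
 * Illustration of the four-quadrant MSR bitmap lookup used by the patch:
 * bytes 0-1023 = reads of low MSRs, 1024-2047 = reads of high MSRs,
 * 2048-3071 = writes of low MSRs, 3072-4095 = writes of high MSRs.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static int l1_wants_msr_exit(const uint8_t *bitmap_page, int is_write,
			     uint32_t msr_index)
{
	uint32_t offset = 0;

	if (is_write)
		offset += 2048;			/* write quadrants */
	if (msr_index >= 0xc0000000) {
		msr_index -= 0xc0000000;	/* high-MSR quadrant */
		offset += 1024;
	}
	if (msr_index >= 1024 * 8)
		return 1;			/* out of range: let L1 handle it */

	return (bitmap_page[offset + msr_index / 8] >> (msr_index & 7)) & 1;
}

int main(void)
{
	uint8_t page[4096];

	memset(page, 0, sizeof(page));
	/* Pretend L1 asked to intercept only *writes* to MSR 0xc0000080 (EFER). */
	page[2048 + 1024 + 0x80 / 8] |= 1 << (0x80 & 7);

	printf("write EFER -> %s\n", l1_wants_msr_exit(page, 1, 0xc0000080) ?
	       "exit to L1" : "handled by L0");
	printf("read EFER  -> %s\n", l1_wants_msr_exit(page, 0, 0xc0000080) ?
	       "exit to L1" : "handled by L0");
	return 0;
}
```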
This patch contains the logic of whether an L2 exit should be handled by L0
and then L2 should be resumed, or whether L1 should be run to handle this
exit (using the nested_vmx_vmexit() function of the previous patch).

The basic idea is to let L1 handle the exit only if it actually asked to
trap this sort of event. For example, when L2 exits on a change to CR0, we
check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any bit
which changed; If it did, we exit to L1. But if it didn't it means that it
is we (L0) that wished to trap this event, so we handle it ourselves.

The next two patches add additional logic of what to do when an interrupt
or exception is injected: Does L0 need to do it, should we exit to L1 to do
it, or should we resume L2 and keep the exception to be injected later.

We keep a new flag, "nested_run_pending", which can override the decision
of which should run next, L1 or L2. nested_run_pending=1 means that we
*must* run L2 next, not L1. This is necessary in particular when L1 did a
VMLAUNCH of L2 and therefore expects L2 to be run (and perhaps be injected
with an event it specified, etc.). Nested_run_pending is especially
intended to avoid switching to L1 in the injection decision-point described
above.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/vmx.c |  256 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 255 insertions(+), 1 deletion(-)
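[Editor's note] As a closing illustration of the CR0 rule described in the commit message, the sketch below is a minimal userspace example (not kernel code; the mask and values are invented): L1 gets the exit only when a bit it owns through cr0_guest_host_mask actually changes relative to its read shadow, otherwise L0 handles the exit itself and resumes L2.

```c
/*
 * Mirrors the "mov to cr0" check in the patch's nested_vmx_exit_handled_cr():
 * exit to L1 iff a changed bit is covered by L1's CR0 guest/host mask.
 */
#include <stdio.h>
#include <stdint.h>

#define X86_CR0_MP (1UL << 1)
#define X86_CR0_TS (1UL << 3)

static int exit_to_l1_on_cr0_write(uint64_t guest_host_mask,
				   uint64_t read_shadow, uint64_t new_val)
{
	return (guest_host_mask & (new_val ^ read_shadow)) != 0;
}

int main(void)
{
	uint64_t mask = X86_CR0_TS;	/* example: L1 owns only CR0.TS */
	uint64_t shadow = 0;		/* L1 last saw TS = 0 */

	/* L2 sets TS: a bit L1 owns changed, so reflect the exit to L1. */
	printf("set TS -> %s\n",
	       exit_to_l1_on_cr0_write(mask, shadow, X86_CR0_TS) ?
	       "exit to L1" : "handled by L0");

	/* L2 sets only MP: not owned by L1, so L0 handles it and resumes L2. */
	printf("set MP -> %s\n",
	       exit_to_l1_on_cr0_write(mask, shadow, X86_CR0_MP) ?
	       "exit to L1" : "handled by L0");
	return 0;
}
```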