diff mbox

[22/30] nVMX: Correct handling of interrupt injection

Message ID 201105080826.p488QUXA018307@rice.haifa.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Nadav Har'El May 8, 2011, 8:26 a.m. UTC
When KVM wants to inject an interrupt, the guest should think a real interrupt
has happened. Normally (in the non-nested case) this means checking that the
guest doesn't block interrupts (and if it does, inject when it doesn't - using
the "interrupt window" VMX mechanism), and setting up the appropriate VMCS
fields for the guest to receive the interrupt.

However, when we are running a nested guest (L2) and its hypervisor (L1)
requested exits on interrupts (as most hypervisors do), the most efficient
thing to do is to exit L2, telling L1 that the exit was caused by an
interrupt, the one we were injecting. Only when L1 has asked not to be notified
of interrupts should we inject directly into the running L2 guest (i.e.,
the normal code path).

However, properly doing what is described above requires invasive changes to
the flow of the existing code, which we elected not to do in this stage.
Instead we do something more simplistic and less efficient: we modify
vmx_interrupt_allowed(), which kvm calls to see if it can inject the interrupt
now, to exit from L2 to L1 before continuing the normal code. The normal kvm
code then notices that L1 is blocking interrupts, and sets the interrupt
window to inject the interrupt later to L1. Shortly after, L1 gets the
interrupt while it is itself running, not as an exit from L2. The cost is an
extra L1 exit (the interrupt window).

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/vmx.c |   35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Avi Kivity May 9, 2011, 10:57 a.m. UTC | #1
On 05/08/2011 11:26 AM, Nadav Har'El wrote:
> When KVM wants to inject an interrupt, the guest should think a real interrupt
> has happened. Normally (in the non-nested case) this means checking that the
> guest doesn't block interrupts (and if it does, inject when it doesn't - using
> the "interrupt window" VMX mechanism), and setting up the appropriate VMCS
> fields for the guest to receive the interrupt.
>
> However, when we are running a nested guest (L2) and its hypervisor (L1)
> requested exits on interrupts (as most hypervisors do), the most efficient
> thing to do is to exit L2, telling L1 that the exit was caused by an
> interrupt, the one we were injecting; Only when L1 asked not to be notified
> of interrupts, we should inject directly to the running L2 guest (i.e.,
> the normal code path).
>
> However, properly doing what is described above requires invasive changes to
> the flow of the existing code, which we elected not to do in this stage.
> Instead we do something more simplistic and less efficient: we modify
> vmx_interrupt_allowed(), which kvm calls to see if it can inject the interrupt
> now, to exit from L2 to L1 before continuing the normal code. The normal kvm
> code then notices that L1 is blocking interrupts, and sets the interrupt
> window to inject the interrupt later to L1. Shortly after, L1 gets the
> interrupt while it is itself running, not as an exit from L2. The cost is an
> extra L1 exit (the interrupt window).
>
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> ---
>   arch/x86/kvm/vmx.c |   35 +++++++++++++++++++++++++++++++++++
>   1 file changed, 35 insertions(+)
>
> --- .before/arch/x86/kvm/vmx.c	2011-05-08 10:43:20.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c	2011-05-08 10:43:20.000000000 +0300
> @@ -3675,9 +3675,25 @@ out:
>   	return ret;
>   }
>
> +/*
> + * In nested virtualization, check if L1 asked to exit on external interrupts.
> + * For most existing hypervisors, this will always return true.
> + */
> +static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
> +{
> +	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
> +		PIN_BASED_EXT_INTR_MASK;
> +}
> +
>   static void enable_irq_window(struct kvm_vcpu *vcpu)
>   {
>   	u32 cpu_based_vm_exec_control;
> +	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
> +		/* We can get here when nested_run_pending caused
> +		 * vmx_interrupt_allowed() to return false. In this case, do
> +		 * nothing - the interrupt will be injected later.
> +		 */
> +		return;

Why not do (or schedule) the nested vmexit here?  It's more natural than 
in vmx_interrupt_allowed() which from its name you'd expect to only read 
stuff.

I guess it can live for now if there's some unexpected complexity there.

>
>   	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
>   	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
> @@ -3800,6 +3816,13 @@ static void vmx_set_nmi_mask(struct kvm_
>
>   static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
>   {
> +	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
> +		if (to_vmx(vcpu)->nested.nested_run_pending)
> +			return 0;
> +		nested_vmx_vmexit(vcpu, true);
> +		/* fall through to normal code, but now in L1, not L2 */
> +	}
> +
>   	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
>   		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
>   			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
> @@ -5463,6 +5486,14 @@ static int vmx_handle_exit(struct kvm_vc
>   	if (vmx->emulation_required && emulate_invalid_guest_state)
>   		return handle_invalid_guest_state(vcpu);
>
> +	/*
> +	 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
> +	 * we did not inject a still-pending event to L1 now because of
> +	 * nested_run_pending, we need to re-enable this bit.
> +	 */
> +	if (vmx->nested.nested_run_pending)
> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
> +
>   	if (exit_reason == EXIT_REASON_VMLAUNCH ||
>   	    exit_reason == EXIT_REASON_VMRESUME)
>   		vmx->nested.nested_run_pending = 1;
> @@ -5660,6 +5691,8 @@ static void __vmx_complete_interrupts(st
>
>   static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>   {
> +	if (is_guest_mode(&vmx->vcpu))
> +		return;
>   	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
>   				  VM_EXIT_INSTRUCTION_LEN,
>   				  IDT_VECTORING_ERROR_CODE);
> @@ -5667,6 +5700,8 @@ static void vmx_complete_interrupts(stru
>
>   static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
>   {
> +	if (is_guest_mode(vcpu))
> +		return;

Hmm.  What if L0 injected something into L2?
diff mbox

Patch

--- .before/arch/x86/kvm/vmx.c	2011-05-08 10:43:20.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2011-05-08 10:43:20.000000000 +0300
@@ -3675,9 +3675,25 @@  out:
 	return ret;
 }
 
+/*
+ * In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+		PIN_BASED_EXT_INTR_MASK;
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+		/* We can get here when nested_run_pending caused
+		 * vmx_interrupt_allowed() to return false. In this case, do
+		 * nothing - the interrupt will be injected later.
+		 */
+		return;
 
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3800,6 +3816,13 @@  static void vmx_set_nmi_mask(struct kvm_
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+		if (to_vmx(vcpu)->nested.nested_run_pending)
+			return 0;
+		nested_vmx_vmexit(vcpu, true);
+		/* fall through to normal code, but now in L1, not L2 */
+	}
+
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -5463,6 +5486,14 @@  static int vmx_handle_exit(struct kvm_vc
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
 
+	/*
+	 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
+	 * we did not inject a still-pending event to L1 now because of
+	 * nested_run_pending, we need to re-enable this bit.
+	 */
+	if (vmx->nested.nested_run_pending)
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	if (exit_reason == EXIT_REASON_VMLAUNCH ||
 	    exit_reason == EXIT_REASON_VMRESUME)
 		vmx->nested.nested_run_pending = 1;
@@ -5660,6 +5691,8 @@  static void __vmx_complete_interrupts(st
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
+	if (is_guest_mode(&vmx->vcpu))
+		return;
 	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
 				  VM_EXIT_INSTRUCTION_LEN,
 				  IDT_VECTORING_ERROR_CODE);
@@ -5667,6 +5700,8 @@  static void vmx_complete_interrupts(stru
 
 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
+	if (is_guest_mode(vcpu))
+		return;
 	__vmx_complete_interrupts(to_vmx(vcpu),
 				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
 				  VM_ENTRY_INSTRUCTION_LEN,