diff mbox series

[v2,29/43] KVM: VMX: Handle PI wakeup shenanigans during vcpu_put/load

Message ID 20211009021236.4122790-30-seanjc@google.com (mailing list archive)
State Not Applicable
Headers show
Series KVM: Halt-polling and x86 APICv overhaul | expand

Commit Message

Sean Christopherson Oct. 9, 2021, 2:12 a.m. UTC
Move the posted interrupt pre/post_block logic into vcpu_put/load
respectively, using the kvm_vcpu_is_blocking() to determining whether or
not the wakeup handler needs to be set (and unset).  This avoids updating
the PI descriptor if halt-polling is successful, reduces the number of
touchpoints for updating the descriptor, and eliminates the confusing
behavior of intentionally leaving a "stale" PI.NDST when a blocking vCPU
is scheduled back in after preemption.

The downside is that KVM will do the PID update twice if the vCPU is
preempted after prepare_to_rcuwait() but before schedule(), but that's a
rare case (and non-existent on !PREEMPT kernels).

The notable wart is the need to send a self-IPI on the wakeup vector if
an outstanding notification is pending after configuring the wakeup
vector.  Ideally, KVM would just do a kvm_vcpu_wake_up() in this case,
but the scheduler doesn't support waking a task from its preemption
notifier callback, i.e. while the task is smack dab in the middle of
being scheduled out.

Note, setting the wakeup vector before halt-polling is not necessary as
the pending IRQ will be recorded in the PIR and detected as a blocking-
breaking condition by kvm_vcpu_has_events() -> vmx_sync_pir_to_irr().

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/posted_intr.c | 162 ++++++++++++++-------------------
 arch/x86/kvm/vmx/posted_intr.h |   8 +-
 arch/x86/kvm/vmx/vmx.c         |   5 -
 3 files changed, 75 insertions(+), 100 deletions(-)

Comments

Maxim Levitsky Oct. 28, 2021, 3:14 p.m. UTC | #1
On Fri, 2021-10-08 at 19:12 -0700, Sean Christopherson wrote:
> Move the posted interrupt pre/post_block logic into vcpu_put/load
> respectively, using the kvm_vcpu_is_blocking() to determining whether or
> not the wakeup handler needs to be set (and unset).  This avoids updating
> the PI descriptor if halt-polling is successful, reduces the number of
> touchpoints for updating the descriptor, and eliminates the confusing
> behavior of intentionally leaving a "stale" PI.NDST when a blocking vCPU
> is scheduled back in after preemption.
> 
> The downside is that KVM will do the PID update twice if the vCPU is
> preempted after prepare_to_rcuwait() but before schedule(), but that's a
> rare case (and non-existent on !PREEMPT kernels).
> 
> The notable wart is the need to send a self-IPI on the wakeup vector if
> an outstanding notification is pending after configuring the wakeup
> vector.  Ideally, KVM would just do a kvm_vcpu_wake_up() in this case,
> but the scheduler doesn't support waking a task from its preemption
> notifier callback, i.e. while the task is smack dab in the middle of
> being scheduled out.
> 
> Note, setting the wakeup vector before halt-polling is not necessary as
> the pending IRQ will be recorded in the PIR and detected as a blocking-
> breaking condition by kvm_vcpu_has_events() -> vmx_sync_pir_to_irr().
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/vmx/posted_intr.c | 162 ++++++++++++++-------------------
>  arch/x86/kvm/vmx/posted_intr.h |   8 +-
>  arch/x86/kvm/vmx/vmx.c         |   5 -
>  3 files changed, 75 insertions(+), 100 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
> index 901b7a5f7777..d2b3d75c57d1 100644
> --- a/arch/x86/kvm/vmx/posted_intr.c
> +++ b/arch/x86/kvm/vmx/posted_intr.c
> @@ -37,33 +37,45 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
>  {
>  	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
>  	struct pi_desc old, new;
> +	unsigned long flags;
>  	unsigned int dest;
>  
>  	/*
> -	 * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
> -	 * PI.SN up-to-date even if there is no assigned device or if APICv is
> +	 * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST
> +	 * up-to-date even if there is no assigned device or if APICv is
>  	 * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC.
>  	 */
>  	if (!enable_apicv || !lapic_in_kernel(vcpu))
>  		return;
>  
> -	/* Nothing to do if PI.SN==0 and the vCPU isn't being migrated. */
> -	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
> +	/*
> +	 * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
> +	 * full update can be skipped as neither the vector nor the destination
> +	 * needs to be changed.
> +	 */
> +	if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
> +		/*
> +		 * Clear SN if it was set due to being preempted.  Again, do
> +		 * this even if there is no assigned device for simplicity.
> +		 */
> +		if (pi_test_and_clear_sn(pi_desc))
> +			goto after_clear_sn;
>  		return;
> +	}
> +
> +	local_irq_save(flags);
>  
>  	/*
> -	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
> -	 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
> -	 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
> -	 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
> -	 * correctly.
> +	 * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
> +	 * list of the _previous_ pCPU, which will not be the same as the
> +	 * current pCPU if the task was migrated.
>  	 */
> -	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
> -		pi_clear_sn(pi_desc);
> -		goto after_clear_sn;
> +	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
> +		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
> +		list_del(&vcpu->blocked_vcpu_list);
> +		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
>  	}
>  
> -	/* The full case.  Set the new destination and clear SN. */
>  	dest = cpu_physical_id(cpu);
>  	if (!x2apic_mode)
>  		dest = (dest << 8) & 0xFF00;
> @@ -71,11 +83,23 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
>  	do {
>  		old.control = new.control = READ_ONCE(pi_desc->control);
>  
> +		/*
> +		 * Clear SN (as above) and refresh the destination APIC ID to
> +		 * handle task migration (@cpu != vcpu->cpu).
> +		 */
>  		new.ndst = dest;
>  		new.sn = 0;
> +
> +		/*
> +		 * Restore the notification vector; in the blocking case, the
> +		 * descriptor was modified on "put" to use the wakeup vector.
> +		 */
> +		new.nv = POSTED_INTR_VECTOR;
>  	} while (cmpxchg64(&pi_desc->control, old.control,
>  			   new.control) != old.control);
>  
> +	local_irq_restore(flags);
> +
>  after_clear_sn:
>  
>  	/*
> @@ -90,88 +114,24 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
>  		pi_set_on(pi_desc);
>  }
>  
> -void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
> -{
> -	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> -
> -	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> -		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
> -		!kvm_vcpu_apicv_active(vcpu))
> -		return;
> -
> -	/* Set SN when the vCPU is preempted */
> -	if (vcpu->preempted)
> -		pi_set_sn(pi_desc);
> -}
> -
> -static void __pi_post_block(struct kvm_vcpu *vcpu)
> -{
> -	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> -	struct pi_desc old, new;
> -	unsigned int dest;
> -
> -	WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
> -	     "Wakeup handler not enabled while the vCPU was blocking");
> -
> -	/*
> -	 * Remove the vCPU from the wakeup list of the _previous_ pCPU, which
> -	 * will not be the same as the current pCPU if the task was migrated.
> -	 */
> -	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> -	list_del(&vcpu->blocked_vcpu_list);
> -	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> -
> -	dest = cpu_physical_id(vcpu->cpu);
> -	if (!x2apic_mode)
> -		dest = (dest << 8) & 0xFF00;
> -
> -	do {
> -		old.control = new.control = READ_ONCE(pi_desc->control);
> -
> -		new.ndst = dest;
> -
> -		/* set 'NV' to 'notification vector' */
> -		new.nv = POSTED_INTR_VECTOR;
> -	} while (cmpxchg64(&pi_desc->control, old.control,
> -			   new.control) != old.control);
> -
> -	vcpu->pre_pcpu = -1;
> -}
> -
>  /*
> - * This routine does the following things for vCPU which is going
> - * to be blocked if VT-d PI is enabled.
> - * - Store the vCPU to the wakeup list, so when interrupts happen
> - *   we can find the right vCPU to wake up.
> - * - Change the Posted-interrupt descriptor as below:
> - *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
> - * - If 'ON' is set during this process, which means at least one
> - *   interrupt is posted for this vCPU, we cannot block it, in
> - *   this case, return 1, otherwise, return 0.
> - *
> + * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
> + * WAKEUP as the notification vector in the PI descriptor.
>   */
> -int pi_pre_block(struct kvm_vcpu *vcpu)
> +static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
>  {
> -	struct pi_desc old, new;
>  	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> +	struct pi_desc old, new;
>  	unsigned long flags;
>  
> -	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> -	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
> -	    !kvm_vcpu_apicv_active(vcpu) ||
> -	    vmx_interrupt_blocked(vcpu))
> -		return 0;
> -
>  	local_irq_save(flags);
>  
> -	vcpu->pre_pcpu = vcpu->cpu;
>  	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
>  	list_add_tail(&vcpu->blocked_vcpu_list,
>  		      &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu));
>  	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
>  
> -	WARN(pi_desc->sn == 1,
> -	     "Posted Interrupt Suppress Notification set before blocking");
> +	WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
>  
>  	do {
>  		old.control = new.control = READ_ONCE(pi_desc->control);
> @@ -181,24 +141,40 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
>  	} while (cmpxchg64(&pi_desc->control, old.control,
>  			   new.control) != old.control);
>  
> -	/* We should not block the vCPU if an interrupt is posted for it.  */
> -	if (pi_test_on(pi_desc))
> -		__pi_post_block(vcpu);
> +	/*
> +	 * Send a wakeup IPI to this CPU if an interrupt may have been posted
> +	 * before the notification vector was updated, in which case the IRQ
> +	 * will arrive on the non-wakeup vector.  An IPI is needed as calling
> +	 * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
> +	 * enabled until it is safe to call try_to_wake_up() on the task being
> +	 * scheduled out).
> +	 */
> +	if (pi_test_on(&new))
> +		apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
>  
>  	local_irq_restore(flags);
> -	return (vcpu->pre_pcpu == -1);
>  }
>  
> -void pi_post_block(struct kvm_vcpu *vcpu)
> +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
>  {
> -	unsigned long flags;
> +	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
>  
> -	if (vcpu->pre_pcpu == -1)
> +	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> +	    !irq_remapping_cap(IRQ_POSTING_CAP)  ||
> +	    !kvm_vcpu_apicv_active(vcpu))
>  		return;
>  
> -	local_irq_save(flags);
> -	__pi_post_block(vcpu);
> -	local_irq_restore(flags);
> +	if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
> +		pi_enable_wakeup_handler(vcpu);
> +
> +	/*
> +	 * Set SN when the vCPU is preempted.  Note, the vCPU can both be seen
> +	 * as blocking and preempted, e.g. if it's preempted between setting
> +	 * its wait state and manually scheduling out.  In that case, KVM will
> +	 * update
> +	 */
> +	if (vcpu->preempted)
> +		pi_set_sn(pi_desc);
>  }
>  
>  /*
> @@ -239,7 +215,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
>   * Bail out of the block loop if the VM has an assigned
>   * device, but the blocking vCPU didn't reconfigure the
>   * PI.NV to the wakeup vector, i.e. the assigned device
> - * came along after the initial check in pi_pre_block().
> + * came along after the initial check in vmx_vcpu_pi_put().
>   */
>  void vmx_pi_start_assignment(struct kvm *kvm)
>  {
> diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
> index 36ae035f14aa..eb14e76b84ef 100644
> --- a/arch/x86/kvm/vmx/posted_intr.h
> +++ b/arch/x86/kvm/vmx/posted_intr.h
> @@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
>  			(unsigned long *)&pi_desc->control);
>  }
>  
> +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
> +{
> +	return test_and_clear_bit(POSTED_INTR_SN,
> +			(unsigned long *)&pi_desc->control);
> +}
> +
>  static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
>  {
>  	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
> @@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
>  
>  void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
>  void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
> -int pi_pre_block(struct kvm_vcpu *vcpu);
> -void pi_post_block(struct kvm_vcpu *vcpu);
>  void pi_wakeup_handler(void);
>  void __init pi_init_cpu(int cpu);
>  bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 5517893f12fc..26ed8cd1a1f2 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7462,9 +7462,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
>  
>  static int vmx_pre_block(struct kvm_vcpu *vcpu)
>  {
> -	if (pi_pre_block(vcpu))
> -		return 1;
> -
>  	if (kvm_lapic_hv_timer_in_use(vcpu))
>  		kvm_lapic_switch_to_sw_timer(vcpu);
>  
> @@ -7475,8 +7472,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
>  {
>  	if (kvm_x86_ops.set_hv_timer)
>  		kvm_lapic_switch_to_hv_timer(vcpu);
> -
> -	pi_post_block(vcpu);
>  }
>  
>  static void vmx_setup_mce(struct kvm_vcpu *vcpu)


Looks OK to me, and IMHO is a very good step in direction to simplify that code,
but the logic is far from beeing simple so I might have missed something.

Especially, this should be tested with nested APICv, which I don't yet know well
enough to know if this can break it or not.


Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>

Best regards,
	Maxim Levitsky
diff mbox series

Patch

diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 901b7a5f7777..d2b3d75c57d1 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -37,33 +37,45 @@  void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 	struct pi_desc old, new;
+	unsigned long flags;
 	unsigned int dest;
 
 	/*
-	 * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
-	 * PI.SN up-to-date even if there is no assigned device or if APICv is
+	 * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST
+	 * up-to-date even if there is no assigned device or if APICv is
 	 * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC.
 	 */
 	if (!enable_apicv || !lapic_in_kernel(vcpu))
 		return;
 
-	/* Nothing to do if PI.SN==0 and the vCPU isn't being migrated. */
-	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+	/*
+	 * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
+	 * full update can be skipped as neither the vector nor the destination
+	 * needs to be changed.
+	 */
+	if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
+		/*
+		 * Clear SN if it was set due to being preempted.  Again, do
+		 * this even if there is no assigned device for simplicity.
+		 */
+		if (pi_test_and_clear_sn(pi_desc))
+			goto after_clear_sn;
 		return;
+	}
+
+	local_irq_save(flags);
 
 	/*
-	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
-	 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
-	 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
-	 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
-	 * correctly.
+	 * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
+	 * list of the _previous_ pCPU, which will not be the same as the
+	 * current pCPU if the task was migrated.
 	 */
-	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
-		pi_clear_sn(pi_desc);
-		goto after_clear_sn;
+	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
+		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
+		list_del(&vcpu->blocked_vcpu_list);
+		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
 	}
 
-	/* The full case.  Set the new destination and clear SN. */
 	dest = cpu_physical_id(cpu);
 	if (!x2apic_mode)
 		dest = (dest << 8) & 0xFF00;
@@ -71,11 +83,23 @@  void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	do {
 		old.control = new.control = READ_ONCE(pi_desc->control);
 
+		/*
+		 * Clear SN (as above) and refresh the destination APIC ID to
+		 * handle task migration (@cpu != vcpu->cpu).
+		 */
 		new.ndst = dest;
 		new.sn = 0;
+
+		/*
+		 * Restore the notification vector; in the blocking case, the
+		 * descriptor was modified on "put" to use the wakeup vector.
+		 */
+		new.nv = POSTED_INTR_VECTOR;
 	} while (cmpxchg64(&pi_desc->control, old.control,
 			   new.control) != old.control);
 
+	local_irq_restore(flags);
+
 after_clear_sn:
 
 	/*
@@ -90,88 +114,24 @@  void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 		pi_set_on(pi_desc);
 }
 
-void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
-		!kvm_vcpu_apicv_active(vcpu))
-		return;
-
-	/* Set SN when the vCPU is preempted */
-	if (vcpu->preempted)
-		pi_set_sn(pi_desc);
-}
-
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-	struct pi_desc old, new;
-	unsigned int dest;
-
-	WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
-	     "Wakeup handler not enabled while the vCPU was blocking");
-
-	/*
-	 * Remove the vCPU from the wakeup list of the _previous_ pCPU, which
-	 * will not be the same as the current pCPU if the task was migrated.
-	 */
-	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-	list_del(&vcpu->blocked_vcpu_list);
-	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-
-	dest = cpu_physical_id(vcpu->cpu);
-	if (!x2apic_mode)
-		dest = (dest << 8) & 0xFF00;
-
-	do {
-		old.control = new.control = READ_ONCE(pi_desc->control);
-
-		new.ndst = dest;
-
-		/* set 'NV' to 'notification vector' */
-		new.nv = POSTED_INTR_VECTOR;
-	} while (cmpxchg64(&pi_desc->control, old.control,
-			   new.control) != old.control);
-
-	vcpu->pre_pcpu = -1;
-}
-
 /*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- *   we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- *   interrupt is posted for this vCPU, we cannot block it, in
- *   this case, return 1, otherwise, return 0.
- *
+ * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
+ * WAKEUP as the notification vector in the PI descriptor.
  */
-int pi_pre_block(struct kvm_vcpu *vcpu)
+static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 {
-	struct pi_desc old, new;
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+	struct pi_desc old, new;
 	unsigned long flags;
 
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
-	    !kvm_vcpu_apicv_active(vcpu) ||
-	    vmx_interrupt_blocked(vcpu))
-		return 0;
-
 	local_irq_save(flags);
 
-	vcpu->pre_pcpu = vcpu->cpu;
 	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
 	list_add_tail(&vcpu->blocked_vcpu_list,
 		      &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu));
 	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
 
-	WARN(pi_desc->sn == 1,
-	     "Posted Interrupt Suppress Notification set before blocking");
+	WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
 
 	do {
 		old.control = new.control = READ_ONCE(pi_desc->control);
@@ -181,24 +141,40 @@  int pi_pre_block(struct kvm_vcpu *vcpu)
 	} while (cmpxchg64(&pi_desc->control, old.control,
 			   new.control) != old.control);
 
-	/* We should not block the vCPU if an interrupt is posted for it.  */
-	if (pi_test_on(pi_desc))
-		__pi_post_block(vcpu);
+	/*
+	 * Send a wakeup IPI to this CPU if an interrupt may have been posted
+	 * before the notification vector was updated, in which case the IRQ
+	 * will arrive on the non-wakeup vector.  An IPI is needed as calling
+	 * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
+	 * enabled until it is safe to call try_to_wake_up() on the task being
+	 * scheduled out).
+	 */
+	if (pi_test_on(&new))
+		apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
 
 	local_irq_restore(flags);
-	return (vcpu->pre_pcpu == -1);
 }
 
-void pi_post_block(struct kvm_vcpu *vcpu)
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 {
-	unsigned long flags;
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 
-	if (vcpu->pre_pcpu == -1)
+	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+	    !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+	    !kvm_vcpu_apicv_active(vcpu))
 		return;
 
-	local_irq_save(flags);
-	__pi_post_block(vcpu);
-	local_irq_restore(flags);
+	if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
+		pi_enable_wakeup_handler(vcpu);
+
+	/*
+	 * Set SN when the vCPU is preempted.  Note, the vCPU can both be seen
+	 * as blocking and preempted, e.g. if it's preempted between setting
+	 * its wait state and manually scheduling out.  In that case, KVM will
+	 * update
+	 */
+	if (vcpu->preempted)
+		pi_set_sn(pi_desc);
 }
 
 /*
@@ -239,7 +215,7 @@  bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
  * Bail out of the block loop if the VM has an assigned
  * device, but the blocking vCPU didn't reconfigure the
  * PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in pi_pre_block().
+ * came along after the initial check in vmx_vcpu_pi_put().
  */
 void vmx_pi_start_assignment(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 36ae035f14aa..eb14e76b84ef 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -40,6 +40,12 @@  static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
 			(unsigned long *)&pi_desc->control);
 }
 
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+	return test_and_clear_bit(POSTED_INTR_SN,
+			(unsigned long *)&pi_desc->control);
+}
+
 static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
 {
 	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
@@ -88,8 +94,6 @@  static inline bool pi_test_sn(struct pi_desc *pi_desc)
 
 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
 void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
-int pi_pre_block(struct kvm_vcpu *vcpu);
-void pi_post_block(struct kvm_vcpu *vcpu);
 void pi_wakeup_handler(void);
 void __init pi_init_cpu(int cpu);
 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5517893f12fc..26ed8cd1a1f2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7462,9 +7462,6 @@  void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
 
 static int vmx_pre_block(struct kvm_vcpu *vcpu)
 {
-	if (pi_pre_block(vcpu))
-		return 1;
-
 	if (kvm_lapic_hv_timer_in_use(vcpu))
 		kvm_lapic_switch_to_sw_timer(vcpu);
 
@@ -7475,8 +7472,6 @@  static void vmx_post_block(struct kvm_vcpu *vcpu)
 {
 	if (kvm_x86_ops.set_hv_timer)
 		kvm_lapic_switch_to_hv_timer(vcpu);
-
-	pi_post_block(vcpu);
 }
 
 static void vmx_setup_mce(struct kvm_vcpu *vcpu)