diff mbox series

[02/67] KVM: x86: Reset IRTE to host control if *new* route isn't postable

Message ID 20250404193923.1413163-3-seanjc@google.com (mailing list archive)
State New
Headers show
Series KVM: iommu: Overhaul device posted IRQs support | expand

Commit Message

Sean Christopherson April 4, 2025, 7:38 p.m. UTC
Restore an IRTE back to host control (remapped or posted MSI mode) if the
*new* GSI route prevents posting the IRQ directly to a vCPU, regardless of
the GSI routing type.  Updating the IRTE if and only if the new GSI is an
MSI results in KVM leaving an IRTE posting to a vCPU.

The dangling IRTE can result in interrupts being incorrectly delivered to
the guest, and in the worst case scenario can result in use-after-free,
e.g. if the VM is torn down, but the underlying host IRQ isn't freed.

Fixes: efc644048ecd ("KVM: x86: Update IRTE for posted-interrupts")
Fixes: 411b44ba80ab ("svm: Implements update_pi_irte hook to setup posted interrupt")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c        | 61 ++++++++++++++++++----------------
 arch/x86/kvm/vmx/posted_intr.c | 28 ++++++----------
 2 files changed, 43 insertions(+), 46 deletions(-)

Comments

Sairaj Kodilkar April 11, 2025, 8:08 a.m. UTC | #1
On 4/5/2025 1:08 AM, Sean Christopherson wrote:
> Restore an IRTE back to host control (remapped or posted MSI mode) if the
> *new* GSI route prevents posting the IRQ directly to a vCPU, regardless of
> the GSI routing type.  Updating the IRTE if and only if the new GSI is an
> MSI results in KVM leaving an IRTE posting to a vCPU.
> 
> The dangling IRTE can result in interrupts being incorrectly delivered to
> the guest, and in the worst case scenario can result in use-after-free,
> e.g. if the VM is torn down, but the underlying host IRQ isn't freed.
> 
> Fixes: efc644048ecd ("KVM: x86: Update IRTE for posted-interrupts")
> Fixes: 411b44ba80ab ("svm: Implements update_pi_irte hook to setup posted interrupt")
> Cc: stable@vger.kernel.org
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   arch/x86/kvm/svm/avic.c        | 61 ++++++++++++++++++----------------
>   arch/x86/kvm/vmx/posted_intr.c | 28 ++++++----------
>   2 files changed, 43 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> index a961e6e67050..ef08356fdb1c 100644
> --- a/arch/x86/kvm/svm/avic.c
> +++ b/arch/x86/kvm/svm/avic.c
> @@ -896,6 +896,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
>   {
>   	struct kvm_kernel_irq_routing_entry *e;
>   	struct kvm_irq_routing_table *irq_rt;
> +	bool enable_remapped_mode = true;
>   	int idx, ret = 0;
>   
>   	if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
> @@ -932,6 +933,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
>   		    kvm_vcpu_apicv_active(&svm->vcpu)) {
>   			struct amd_iommu_pi_data pi;
>   
> +			enable_remapped_mode = false;
> +
>   			/* Try to enable guest_mode in IRTE */
>   			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
>   					    AVIC_HPA_MASK);
> @@ -950,33 +953,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
>   			 */
>   			if (!ret && pi.is_guest_mode)
>   				svm_ir_list_add(svm, &pi);
> -		} else {
> -			/* Use legacy mode in IRTE */
> -			struct amd_iommu_pi_data pi;
> -
> -			/**
> -			 * Here, pi is used to:
> -			 * - Tell IOMMU to use legacy mode for this interrupt.
> -			 * - Retrieve ga_tag of prior interrupt remapping data.
> -			 */
> -			pi.prev_ga_tag = 0;
> -			pi.is_guest_mode = false;
> -			ret = irq_set_vcpu_affinity(host_irq, &pi);
> -
> -			/**
> -			 * Check if the posted interrupt was previously
> -			 * setup with the guest_mode by checking if the ga_tag
> -			 * was cached. If so, we need to clean up the per-vcpu
> -			 * ir_list.
> -			 */
> -			if (!ret && pi.prev_ga_tag) {
> -				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
> -				struct kvm_vcpu *vcpu;
> -
> -				vcpu = kvm_get_vcpu_by_id(kvm, id);
> -				if (vcpu)
> -					svm_ir_list_del(to_svm(vcpu), &pi);
> -			}
>   		}
>   
>   		if (!ret && svm) {
> @@ -991,7 +967,36 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
>   		}
>   	}
>   
> -	ret = 0;
> +	if (enable_remapped_mode) {
> +		/* Use legacy mode in IRTE */
> +		struct amd_iommu_pi_data pi;
> +
> +		/**
> +		 * Here, pi is used to:
> +		 * - Tell IOMMU to use legacy mode for this interrupt.
> +		 * - Retrieve ga_tag of prior interrupt remapping data.
> +		 */
> +		pi.prev_ga_tag = 0;
> +		pi.is_guest_mode = false;
> +		ret = irq_set_vcpu_affinity(host_irq, &pi);
> +
> +		/**
> +		 * Check if the posted interrupt was previously
> +		 * setup with the guest_mode by checking if the ga_tag
> +		 * was cached. If so, we need to clean up the per-vcpu
> +		 * ir_list.
> +		 */
> +		if (!ret && pi.prev_ga_tag) {
> +			int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
> +			struct kvm_vcpu *vcpu;
> +
> +			vcpu = kvm_get_vcpu_by_id(kvm, id);
> +			if (vcpu)
> +				svm_ir_list_del(to_svm(vcpu), &pi);
> +		}
> +	} else {
> +		ret = 0;
> +	}

Hi Sean,
I think you can remove this else and "ret = 0". Because Code will come 
to this point when irq_set_vcpu_affinity() is successful, ensuring that 
ret is 0.

.../...

Regards
Sairaj Kodilkar
Sean Christopherson April 11, 2025, 2:16 p.m. UTC | #2
On Fri, Apr 11, 2025, Sairaj Kodilkar wrote:
> On 4/5/2025 1:08 AM, Sean Christopherson wrote:
> > @@ -991,7 +967,36 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
> >   		}
> >   	}
> > -	ret = 0;
> > +	if (enable_remapped_mode) {
> > +		/* Use legacy mode in IRTE */
> > +		struct amd_iommu_pi_data pi;
> > +
> > +		/**
> > +		 * Here, pi is used to:
> > +		 * - Tell IOMMU to use legacy mode for this interrupt.
> > +		 * - Retrieve ga_tag of prior interrupt remapping data.
> > +		 */
> > +		pi.prev_ga_tag = 0;
> > +		pi.is_guest_mode = false;
> > +		ret = irq_set_vcpu_affinity(host_irq, &pi);
> > +
> > +		/**
> > +		 * Check if the posted interrupt was previously
> > +		 * setup with the guest_mode by checking if the ga_tag
> > +		 * was cached. If so, we need to clean up the per-vcpu
> > +		 * ir_list.
> > +		 */
> > +		if (!ret && pi.prev_ga_tag) {
> > +			int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
> > +			struct kvm_vcpu *vcpu;
> > +
> > +			vcpu = kvm_get_vcpu_by_id(kvm, id);
> > +			if (vcpu)
> > +				svm_ir_list_del(to_svm(vcpu), &pi);
> > +		}
> > +	} else {
> > +		ret = 0;
> > +	}
> 
> Hi Sean,
> I think you can remove this else and "ret = 0". Because Code will come to
> this point when irq_set_vcpu_affinity() is successful, ensuring that ret is
> 0.

Ah, nice, because of this:

		if (ret < 0) {
			pr_err("%s: failed to update PI IRTE\n", __func__);
			goto out;
		}

However, looking at this again, I'm very tempted to simply leave the "ret = 0;"
that's already there so as to minimize the change.  It'll get cleaned up later on
no matter what, so safety for LTS kernels is the driving factor as of this patch.

Paolo, any preference?
Paolo Bonzini April 15, 2025, 11:36 a.m. UTC | #3
On 4/11/25 16:16, Sean Christopherson wrote:
> On Fri, Apr 11, 2025, Sairaj Kodilkar wrote:
>> On 4/5/2025 1:08 AM, Sean Christopherson wrote:
>>> @@ -991,7 +967,36 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
>>>    		}
>>>    	}
>>> -	ret = 0;
>>> +	if (enable_remapped_mode) {
>>> +		/* Use legacy mode in IRTE */
>>> +		struct amd_iommu_pi_data pi;
>>> +
>>> +		/**
>>> +		 * Here, pi is used to:
>>> +		 * - Tell IOMMU to use legacy mode for this interrupt.
>>> +		 * - Retrieve ga_tag of prior interrupt remapping data.
>>> +		 */
>>> +		pi.prev_ga_tag = 0;
>>> +		pi.is_guest_mode = false;
>>> +		ret = irq_set_vcpu_affinity(host_irq, &pi);
>>> +
>>> +		/**
>>> +		 * Check if the posted interrupt was previously
>>> +		 * setup with the guest_mode by checking if the ga_tag
>>> +		 * was cached. If so, we need to clean up the per-vcpu
>>> +		 * ir_list.
>>> +		 */
>>> +		if (!ret && pi.prev_ga_tag) {
>>> +			int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
>>> +			struct kvm_vcpu *vcpu;
>>> +
>>> +			vcpu = kvm_get_vcpu_by_id(kvm, id);
>>> +			if (vcpu)
>>> +				svm_ir_list_del(to_svm(vcpu), &pi);
>>> +		}
>>> +	} else {
>>> +		ret = 0;
>>> +	}
>>
>> Hi Sean,
>> I think you can remove this else and "ret = 0". Because Code will come to
>> this point when irq_set_vcpu_affinity() is successful, ensuring that ret is
>> 0.
> 
> Ah, nice, because of this:
> 
> 		if (ret < 0) {
> 			pr_err("%s: failed to update PI IRTE\n", __func__);
> 			goto out;
> 		}
> 
> However, looking at this again, I'm very tempted to simply leave the "ret = 0;"
> that's already there so as to minimize the change.  It'll get cleaned up later on
> no matter what, so safety for LTS kernels is the driving factor as of this patch.
> 
> Paolo, any preference?

If you mean squashing this in:

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index ef08356fdb1c..8e09f6ae98fd 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -967,6 +967,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
                 }
         }
  
+       ret = 0;
         if (enable_remapped_mode) {
                 /* Use legacy mode in IRTE */
                 struct amd_iommu_pi_data pi;
@@ -994,8 +995,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
                         if (vcpu)
                                 svm_ir_list_del(to_svm(vcpu), &pi);
                 }
-       } else {
-               ret = 0;
         }
  out:
         srcu_read_unlock(&kvm->irq_srcu, idx);

to undo the moving of "ret = 0", that's a good idea yes.

Paolo
diff mbox series

Patch

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index a961e6e67050..ef08356fdb1c 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -896,6 +896,7 @@  int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_routing_table *irq_rt;
+	bool enable_remapped_mode = true;
 	int idx, ret = 0;
 
 	if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
@@ -932,6 +933,8 @@  int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		    kvm_vcpu_apicv_active(&svm->vcpu)) {
 			struct amd_iommu_pi_data pi;
 
+			enable_remapped_mode = false;
+
 			/* Try to enable guest_mode in IRTE */
 			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
 					    AVIC_HPA_MASK);
@@ -950,33 +953,6 @@  int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 			 */
 			if (!ret && pi.is_guest_mode)
 				svm_ir_list_add(svm, &pi);
-		} else {
-			/* Use legacy mode in IRTE */
-			struct amd_iommu_pi_data pi;
-
-			/**
-			 * Here, pi is used to:
-			 * - Tell IOMMU to use legacy mode for this interrupt.
-			 * - Retrieve ga_tag of prior interrupt remapping data.
-			 */
-			pi.prev_ga_tag = 0;
-			pi.is_guest_mode = false;
-			ret = irq_set_vcpu_affinity(host_irq, &pi);
-
-			/**
-			 * Check if the posted interrupt was previously
-			 * setup with the guest_mode by checking if the ga_tag
-			 * was cached. If so, we need to clean up the per-vcpu
-			 * ir_list.
-			 */
-			if (!ret && pi.prev_ga_tag) {
-				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
-				struct kvm_vcpu *vcpu;
-
-				vcpu = kvm_get_vcpu_by_id(kvm, id);
-				if (vcpu)
-					svm_ir_list_del(to_svm(vcpu), &pi);
-			}
 		}
 
 		if (!ret && svm) {
@@ -991,7 +967,36 @@  int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		}
 	}
 
-	ret = 0;
+	if (enable_remapped_mode) {
+		/* Use legacy mode in IRTE */
+		struct amd_iommu_pi_data pi;
+
+		/**
+		 * Here, pi is used to:
+		 * - Tell IOMMU to use legacy mode for this interrupt.
+		 * - Retrieve ga_tag of prior interrupt remapping data.
+		 */
+		pi.prev_ga_tag = 0;
+		pi.is_guest_mode = false;
+		ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+		/**
+		 * Check if the posted interrupt was previously
+		 * setup with the guest_mode by checking if the ga_tag
+		 * was cached. If so, we need to clean up the per-vcpu
+		 * ir_list.
+		 */
+		if (!ret && pi.prev_ga_tag) {
+			int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+			struct kvm_vcpu *vcpu;
+
+			vcpu = kvm_get_vcpu_by_id(kvm, id);
+			if (vcpu)
+				svm_ir_list_del(to_svm(vcpu), &pi);
+		}
+	} else {
+		ret = 0;
+	}
 out:
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 	return ret;
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 16121d29dfd9..78ba3d638fe8 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -273,6 +273,7 @@  int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_routing_table *irq_rt;
+	bool enable_remapped_mode = true;
 	struct kvm_lapic_irq irq;
 	struct kvm_vcpu *vcpu;
 	struct vcpu_data vcpu_info;
@@ -311,21 +312,8 @@  int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 
 		kvm_set_msi_irq(kvm, e, &irq);
 		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
-		    !kvm_irq_is_postable(&irq)) {
-			/*
-			 * Make sure the IRTE is in remapped mode if
-			 * we don't handle it in posted mode.
-			 */
-			ret = irq_set_vcpu_affinity(host_irq, NULL);
-			if (ret < 0) {
-				printk(KERN_INFO
-				   "failed to back to remapped mode, irq: %u\n",
-				   host_irq);
-				goto out;
-			}
-
+		    !kvm_irq_is_postable(&irq))
 			continue;
-		}
 
 		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
 		vcpu_info.vector = irq.vector;
@@ -333,11 +321,12 @@  int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
 				vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
-		if (set)
-			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
-		else
-			ret = irq_set_vcpu_affinity(host_irq, NULL);
+		if (!set)
+			continue;
 
+		enable_remapped_mode = false;
+
+		ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
 		if (ret < 0) {
 			printk(KERN_INFO "%s: failed to update PI IRTE\n",
 					__func__);
@@ -345,6 +334,9 @@  int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		}
 	}
 
+	if (enable_remapped_mode)
+		ret = irq_set_vcpu_affinity(host_irq, NULL);
+
 	ret = 0;
 out:
 	srcu_read_unlock(&kvm->irq_srcu, idx);