diff mbox series

[5/5] x86: KVM: SVM: workaround for AVIC's errata #1235

Message ID 20230928150428.199929-6-mlevitsk@redhat.com (mailing list archive)
State New, archived
Headers show
Series AVIC bugfixes and workarounds | expand

Commit Message

Maxim Levitsky Sept. 28, 2023, 3:04 p.m. UTC
On Zen2 (and likely on Zen1 as well), AVIC doesn't reliably detect a change
in the 'is_running' bit during ICR write emulation and might skip a
VM exit, if that bit was recently cleared.

The absence of the VM exit, leads to the KVM not waking up / triggering
nested vm exit on the target(s) of the IPI which can, in some cases,
lead to an unbounded delays in the guest execution.

As I recently discovered, a reasonable workaround exists: make the KVM
never set the is_running bit.

This workaround ensures that (*) all ICR writes always cause a VM exit
and therefore correctly emulated, in expense of never enjoying VM exit-less
ICR emulation.

This workaround does carry a performance penalty but according to my
benchmarks is still much better than not using AVIC at all,
because AVIC is still used for the receiving end of the IPIs, and for the
posted interrupts.

If the user is aware of the errata and it doesn't affect his workload,
the user can disable the workaround with 'avic_zen2_errata_workaround=0'

(*) More correctly all ICR writes except when 'Self' shorthand is used:

In this case AVIC skips reading physid table and just sets bits in IRR
of local APIC. Thankfully in this case, the errata is not possible,
therefore an extra workaround for this case is not needed.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/avic.c | 50 +++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 14 deletions(-)

Comments

Sean Christopherson Sept. 28, 2023, 3:37 p.m. UTC | #1
KVM: SVM: for the shortlog scope (applies to all relevant patches in this series)

On Thu, Sep 28, 2023, Maxim Levitsky wrote:
> On Zen2 (and likely on Zen1 as well), AVIC doesn't reliably detect a change
> in the 'is_running' bit during ICR write emulation and might skip a
> VM exit, if that bit was recently cleared.
> 
> The absence of the VM exit, leads to the KVM not waking up / triggering
> nested vm exit on the target(s) of the IPI which can, in some cases,
> lead to an unbounded delays in the guest execution.
> 
> As I recently discovered, a reasonable workaround exists: make the KVM

Nit, please just write "KVM", not "the KVM".  KVM is a proper noun when used in
this way, e.g. saying "the KVM" is like saying "the Sean" or "the Maxim".

> never set the is_running bit.
> 
> This workaround ensures that (*) all ICR writes always cause a VM exit
> and therefore correctly emulated, in expense of never enjoying VM exit-less
> ICR emulation.

This breaks svm_ir_list_add(), which relies on the vCPU's entry being up-to-date
and marked running to detect that IOMMU needs to be immediately pointed at the
current pCPU.

	/*
	 * Update the target pCPU for IOMMU doorbells if the vCPU is running.
	 * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
	 * will update the pCPU info when the vCPU awkened and/or scheduled in.
	 * See also avic_vcpu_load().
	 */
	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
		amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
				    true, pi->ir_data);

> This workaround does carry a performance penalty but according to my
> benchmarks is still much better than not using AVIC at all,
> because AVIC is still used for the receiving end of the IPIs, and for the
> posted interrupts.

I really, really don't like the idea of carrying a workaround like this in
perpetuity.  If there is a customer that is determined to enable AVIC on Zen1/Zen2,
then *maybe* it's something to consider, but I don't think we should carry this
if the only anticipated beneficiary is one-off users and KVM developers.  IMO, the
AVIC code is complex enough as it is.
diff mbox series

Patch

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index c44b65af494e3ff..df9efa428f86aa9 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -62,6 +62,9 @@  static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
 static bool force_avic;
 module_param_unsafe(force_avic, bool, 0444);
 
+static int avic_zen2_errata_workaround = -1;
+module_param(avic_zen2_errata_workaround, int, 0444);
+
 /* Note:
  * This hash table is used to map VM_ID to a struct kvm_svm,
  * when handling AMD IOMMU GALOG notification to schedule in
@@ -1027,7 +1030,6 @@  avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 
 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	u64 entry;
 	int h_physical_id = kvm_cpu_get_apicid(cpu);
 	struct vcpu_svm *svm = to_svm(vcpu);
 	unsigned long flags;
@@ -1056,14 +1058,18 @@  void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	 */
 	spin_lock_irqsave(&svm->ir_list_lock, flags);
 
-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
-	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+	if (!avic_zen2_errata_workaround) {
+		u64 entry = READ_ONCE(*(svm->avic_physical_id_cache));
 
-	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
-	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
-	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+		WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+		entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+		entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+		WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	}
 
-	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
 
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
@@ -1071,7 +1077,7 @@  void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 void avic_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	u64 entry;
+	u64 entry = 0;
 	struct vcpu_svm *svm = to_svm(vcpu);
 	unsigned long flags;
 
@@ -1084,11 +1090,13 @@  void avic_vcpu_put(struct kvm_vcpu *vcpu)
 	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
 	 * recursively.
 	 */
-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
 
-	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
-	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
-		return;
+	if (!avic_zen2_errata_workaround) {
+		/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
+		entry = READ_ONCE(*(svm->avic_physical_id_cache));
+		if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+			return;
+	}
 
 	/*
 	 * Take and hold the per-vCPU interrupt remapping lock while updating
@@ -1102,8 +1110,10 @@  void avic_vcpu_put(struct kvm_vcpu *vcpu)
 
 	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
 
-	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	if (!avic_zen2_errata_workaround) {
+		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+		WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	}
 
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 
@@ -1217,5 +1227,17 @@  bool avic_hardware_setup(void)
 
 	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
 
+	if (avic_zen2_errata_workaround == -1) {
+
+		/* Assume that Zen1 and Zen2 have errata #1235 */
+		if (boot_cpu_data.x86 == 0x17)
+			avic_zen2_errata_workaround = 1;
+		else
+			avic_zen2_errata_workaround = 0;
+	}
+
+	if (avic_zen2_errata_workaround)
+		pr_info("Workaround for AVIC errata #1235 is enabled\n");
+
 	return true;
 }