@@ -62,6 +62,9 @@ static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
+static int enable_ipiv = -1;
+module_param(enable_ipiv, int, 0444);
+
/* Note:
* This hash table is used to map VM_ID to a struct kvm_svm,
* when handling AMD IOMMU GALOG notification to schedule in
@@ -1024,7 +1027,6 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- u64 entry;
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
@@ -1053,14 +1055,22 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+ /*
+ * Do not update the actual physical id table entry, if the IPI
+ * virtualization portion of AVIC is not enabled.
+ * In this case all ICR writes except Self IPIs will be intercepted.
+ */
+
+ if (enable_ipiv) {
+ u64 entry = READ_ONCE(*svm->avic_physical_id_cache);
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
- entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
@@ -1068,7 +1078,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
- u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
@@ -1093,11 +1102,17 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+ /*
+ * Do not update the actual physical id table entry if the IPI
+ * virtualization is disabled. See explanation in avic_vcpu_load().
+ */
+ if (enable_ipiv) {
+ u64 entry = READ_ONCE(*svm->avic_physical_id_cache);
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
@@ -1211,5 +1226,17 @@ bool avic_hardware_setup(void)
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ if (enable_ipiv == -1) {
+ enable_ipiv = 1;
+ /* Assume that Zen1 and Zen2 have errata #1235 */
+ if (boot_cpu_data.x86 == 0x17) {
+ pr_info("AVIC's IPI virtualization disabled due to errata #1235\n");
+ enable_ipiv = 0;
+ }
+ }
+
+ if (enable_ipiv)
+ pr_info("AVIC's IPI virtualization enabled\n");
+
return true;
}
On Zen2 (and likely on Zen1 as well), AVIC doesn't reliably detect a change in the 'is_running' bit during ICR write emulation and might skip a VM exit if that bit was recently cleared. The absence of the VM exit leads to KVM not waking up / not triggering a nested VM exit on the target(s) of the IPI, which can, in some cases, lead to unbounded delays in guest execution. As I recently discovered, a reasonable workaround exists: make KVM never set the is_running bit, which in essence disables the IPI virtualization portion of AVIC, making it equivalent to APICv without IPI virtualization. This workaround ensures that (*) all ICR writes always cause a VM exit and are therefore correctly emulated, at the expense of never enjoying VM-exit-less ICR write emulation. To let the user control the workaround, a new kvm_amd module parameter is added: 'enable_ipiv', using the same name as the IPI virtualization parameter of VMX. However, unlike VMX, this parameter is tri-state: 0, 1, -1. The default value, -1, instructs KVM to choose the setting based on the CPU model. (*) More precisely, all ICR writes except those that use the 'Self' shorthand: in that case AVIC skips reading the physid table and just sets bits in the IRR of the local APIC. Thankfully, the erratum cannot occur in this case, so no extra workaround is needed. Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> --- arch/x86/kvm/svm/avic.c | 51 +++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 12 deletions(-)