@@ -8146,6 +8146,12 @@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if
guest CPUID on writes to MISC_ENABLE if
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
disabled.
+
+KVM_X86_QUIRK_SLOT_ZAP_ALL By default, KVM invalidates all SPTEs in
+ fast way when a memslot is deleted.
+ When this quirk is disabled, KVM zaps only
+ leaf SPTEs that are within the range of the
+ memslot being deleted.
=================================== ============================================
7.32 KVM_CAP_MAX_VCPU_ID
@@ -2383,7 +2383,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
- KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
+ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
+ KVM_X86_QUIRK_SLOT_ZAP_ALL)
/*
* KVM previously used a u32 field in kvm_run to indicate the hypercall was
@@ -438,6 +438,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
+#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
@@ -6987,10 +6987,44 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
kvm_mmu_zap_all(kvm);
}
+/*
+ * Zapping leaf SPTEs with memslot range when a memslot is moved/deleted.
+ *
+ * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
+ * case scenario we'll have unused shadow pages lying around until they
+ * are recycled due to age or when the VM is destroyed.
+ */
+static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ struct kvm_gfn_range range = {
+ .slot = slot,
+ .start = slot->base_gfn,
+ .end = slot->base_gfn + slot->npages,
+ .may_block = true,
+ };
+ bool flush = false;
+
+ write_lock(&kvm->mmu_lock);
+
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = kvm_handle_gfn_range(kvm, &range, kvm_zap_rmap);
+
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush);
+
+ if (flush)
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
+
+ write_unlock(&kvm->mmu_lock);
+}
+
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_mmu_zap_all_fast(kvm);
+ if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL))
+ kvm_mmu_zap_all_fast(kvm);
+ else
+ kvm_mmu_zap_memslot_leafs(kvm, slot);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
Introduce a quirk KVM_X86_QUIRK_SLOT_ZAP_ALL to allow users to choose between enabling the quirk to invalidate/zap all SPTEs when a memslot is moved/deleted and disabling the quirk to zap precisely/slowly of only leaf SPTEs that are within the range of moving/deleting memslot. The quirk is turned on by default. This is to work around a bug [1] where the changing the zapping behavior of memslot move/deletion would cause VM instability for VMs with an Nvidia GPU assigned. Users have the option to deactivate the quirk for specific VMs that are unaffected by this bug. Turning off the quirk enables a more precise zapping of SPTEs within the memory slot range, enhancing performance for certain scenarios [2] and meeting the functional requirements for TDX. In TDX, it is crucial that the root page of the private page table remains unchanged, with leaf entries being zapped before non-leaf entries. Additionally, any pages dropped in TDX would necessitate their re-acceptance by the guest. Previously, an attempt was made to introduce a per-VM capability [3] as a workaround for the bug. However, this approach was not preferred because the root cause of the bug remained unidentified. An alternative solution involving a per-memslot flag [4] was also considered but ultimately rejected. Sean and Paolo thereafter recommended the implementation of this quirk and explained that it's the least bad option [5]. For the quirk disabled case, add a new function kvm_mmu_zap_memslot_leafs() to zap leaf SPTEs within a memslot when moving/deleting a memslot. Rather than further calling kvm_unmap_gfn_range() for the actual zapping, this function bypasses the special handling to APIC_ACCESS_PAGE_PRIVATE_MEMSLOT. This is based on the considerations that 1) The APIC_ACCESS_PAGE_PRIVATE_MEMSLOT cannot be created by users, nor can it be moved. It is only deleted by KVM when APICv is permanently inhibited. 2) kvm_vcpu_reload_apic_access_page() effectively does nothing when APIC_ACCESS_PAGE_PRIVATE_MEMSLOT is deleted. 3) Avoid making all cpus request of KVM_REQ_APIC_PAGE_RELOAD can save on costly IPIs. Suggested-by: Kai Huang <kai.huang@intel.com> Suggested-by: Sean Christopherson <seanjc@google.com> Suggested-by: Paolo Bonzini <pbonzini@redhat.com> Cc: Rick Edgecombe <rick.p.edgecombe@intel.com> Link: https://patchwork.kernel.org/project/kvm/patch/20190205210137.1377-11-sean.j.christopherson@intel.com [1] Link: https://patchwork.kernel.org/project/kvm/patch/20190205210137.1377-11-sean.j.christopherson@intel.com/#25054908 [2] Link: https://lore.kernel.org/kvm/20200713190649.GE29725@linux.intel.com/T/#mabc0119583dacf621025e9d873c85f4fbaa66d5c [3] Link: https://lore.kernel.org/all/20240515005952.3410568-3-rick.p.edgecombe@intel.com [4] Link: https://lore.kernel.org/all/7df9032d-83e4-46a1-ab29-6c7973a2ab0b@redhat.com [5] Signed-off-by: Yan Zhao <yan.y.zhao@intel.com> --- Documentation/virt/kvm/api.rst | 6 ++++++ arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/mmu/mmu.c | 36 ++++++++++++++++++++++++++++++++- 4 files changed, 44 insertions(+), 2 deletions(-)