[RFC,v2,8/8] KVM: arm64: Start up SW/HW combined dirty log

Message ID 20230825093528.1637-9-shameerali.kolothum.thodi@huawei.com (mailing list archive)
State New, archived
Series KVM: arm64: Implement SW/HW combined dirty log

Commit Message

Shameerali Kolothum Thodi Aug. 25, 2023, 9:35 a.m. UTC
From: Keqian Zhu <zhukeqian1@huawei.com>

When a user has enabled HW DBM support for live migration, set the
HW DBM bit for the nearby pages (64 pages) around a write-faulting page.
We track the DBM-set pages in a separate bitmap and use that bitmap when
syncing the dirty log, avoiding a full scan of the PTEs.

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 arch/arm64/include/asm/kvm_host.h |   6 ++
 arch/arm64/kvm/arm.c              | 125 ++++++++++++++++++++++++++++++
 arch/arm64/kvm/hyp/pgtable.c      |  10 +--
 arch/arm64/kvm/mmu.c              |  11 ++-
 4 files changed, 144 insertions(+), 8 deletions(-)
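
For reference, a minimal user-space sketch (not part of the patch) of the bitmap
arithmetic described in the commit message: how a write-faulting gfn maps to a bit
in the per-memslot hwdbm_bitmap and which 64-page range that bit covers. The memslot
base, size and faulting gfn below are hypothetical values.

#include <stdio.h>

#define HWDBM_GRANULE_SHIFT 6	/* 64 pages per bitmap bit, as in the patch */
#define LONG_BITS (8 * sizeof(unsigned long))

int main(void)
{
	/* Hypothetical memslot: 1G of 4K pages starting at gfn 0x80000 */
	unsigned long base_gfn = 0x80000;
	unsigned long npages   = 0x40000;
	unsigned long gfn      = 0x80123;	/* hypothetical write-faulting gfn */

	/* One bit per 64 pages, rounded up to whole longs (cf. kvm_hwdbm_bitmap_bytes) */
	unsigned long nbits = (npages + (1UL << HWDBM_GRANULE_SHIFT) - 1) >> HWDBM_GRANULE_SHIFT;
	unsigned long bytes = ((nbits + LONG_BITS - 1) / LONG_BITS) * sizeof(unsigned long);

	/* The granule a faulting gfn falls into (cf. kvm_arm_enable_nearby_hwdbm) */
	unsigned long rel_gfn    = gfn - base_gfn;
	unsigned long dbm_idx    = rel_gfn >> HWDBM_GRANULE_SHIFT;
	unsigned long start_page = dbm_idx << HWDBM_GRANULE_SHIFT;
	unsigned long set_pages  = 1UL << HWDBM_GRANULE_SHIFT;

	if (set_pages > npages - start_page)	/* do not run past the memslot */
		set_pages = npages - start_page;

	printf("bitmap: %lu bits, %lu bytes per copy\n", nbits, bytes);
	printf("gfn 0x%lx -> bit %lu, DBM set on pages [%lu, %lu)\n",
	       gfn, dbm_idx, start_page, start_page + set_pages);
	return 0;
}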

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 17ac53150a1d..5f0be57eebc4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -181,6 +181,8 @@  struct kvm_s2_mmu {
 };
 
 struct kvm_arch_memory_slot {
+	#define HWDBM_GRANULE_SHIFT 6  /* 64 pages per bit */
+	unsigned long *hwdbm_bitmap;
 };
 
 /**
@@ -901,6 +903,10 @@  struct kvm_vcpu_stat {
 	u64 exits;
 };
 
+int kvm_arm_init_hwdbm_bitmap(struct kvm *kvm, struct kvm_memory_slot *memslot);
+void kvm_arm_destroy_hwdbm_bitmap(struct kvm_memory_slot *memslot);
+void kvm_arm_enable_nearby_hwdbm(struct kvm *kvm, gfn_t gfn);
+
 void kvm_vcpu_preferred_target(struct kvm_vcpu_init *init);
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 0dbf2cda40d7..ab1e2da3bf0d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1540,9 +1540,134 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 	return r;
 }
 
+static unsigned long kvm_hwdbm_bitmap_bytes(struct kvm_memory_slot *memslot)
+{
+	unsigned long nbits = DIV_ROUND_UP(memslot->npages, 1 << HWDBM_GRANULE_SHIFT);
+
+	return ALIGN(nbits, BITS_PER_LONG) / 8;
+}
+
+static unsigned long *kvm_second_hwdbm_bitmap(struct kvm_memory_slot *memslot)
+{
+	unsigned long len = kvm_hwdbm_bitmap_bytes(memslot);
+
+	return (void *)memslot->arch.hwdbm_bitmap + len;
+}
+
+/*
+ * Allocate twice the space. Refer to kvm_arch_sync_dirty_log() to see why
+ * the second half is needed.
+ */
+int kvm_arm_init_hwdbm_bitmap(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+	unsigned long bytes = 2 * kvm_hwdbm_bitmap_bytes(memslot);
+
+	if (!kvm->arch.mmu.hwdbm_enabled)
+		return 0;
+
+	if (memslot->arch.hwdbm_bitmap) {
+		/* Inherited from old memslot */
+		bitmap_clear(memslot->arch.hwdbm_bitmap, 0, bytes * 8);
+	} else {
+		memslot->arch.hwdbm_bitmap = kvzalloc(bytes, GFP_KERNEL_ACCOUNT);
+		if (!memslot->arch.hwdbm_bitmap)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void kvm_arm_destroy_hwdbm_bitmap(struct kvm_memory_slot *memslot)
+{
+	if (!memslot->arch.hwdbm_bitmap)
+		return;
+
+	kvfree(memslot->arch.hwdbm_bitmap);
+	memslot->arch.hwdbm_bitmap = NULL;
+}
+
+/* Set DBM on nearby page tables, but do not cross the memslot boundary */
+void kvm_arm_enable_nearby_hwdbm(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *memslot;
+
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (memslot && kvm_slot_dirty_track_enabled(memslot) &&
+	    memslot->arch.hwdbm_bitmap) {
+		unsigned long rel_gfn = gfn - memslot->base_gfn;
+		unsigned long dbm_idx = rel_gfn >> HWDBM_GRANULE_SHIFT;
+		unsigned long start_page, npages;
+
+		if (!test_and_set_bit(dbm_idx, memslot->arch.hwdbm_bitmap)) {
+			start_page = dbm_idx << HWDBM_GRANULE_SHIFT;
+			npages = 1 << HWDBM_GRANULE_SHIFT;
+			npages = min(memslot->npages - start_page, npages);
+			kvm_stage2_set_dbm(kvm, memslot, start_page, npages);
+		}
+	}
+}
+
+/*
+ * We have to find a place to clear hwdbm_bitmap, and clearing it means
+ * clearing the DBM bit of all related pgtables. Note that between clearing
+ * the DBM bits and flushing the TLB, HW dirty logging may still occur, so we
+ * must scan all related pgtables again after the TLB flush. Given the above,
+ * the best place to clear hwdbm_bitmap is before syncing the HW dirty log.
+ */
 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
+	unsigned long *second_bitmap = kvm_second_hwdbm_bitmap(memslot);
+	unsigned long start_page, npages;
+	unsigned int end, rs, re;
+	bool has_hwdbm = false;
+
+	if (!memslot->arch.hwdbm_bitmap)
+		return;
+
+	end = kvm_hwdbm_bitmap_bytes(memslot) * 8;
+	bitmap_clear(second_bitmap, 0, end);
+
+	write_lock(&kvm->mmu_lock);
+	for_each_set_bitrange(rs, re, memslot->arch.hwdbm_bitmap, end) {
+		has_hwdbm = true;
 
+		/*
+		 * Must clear the bitmap before clearing the DBM bits. While we
+		 * clear DBM (the mmu lock is released periodically), SW dirty
+		 * tracking may set DBM again on ranges overlapping what we are
+		 * clearing. If we cleared the bitmap after clearing DBM, the
+		 * bitmap could end up clear while DBM bits are left set, and
+		 * we would never scan those pgtables again.
+		 */
+		bitmap_clear(memslot->arch.hwdbm_bitmap, rs, re - rs);
+
+		/* Record the bits we just cleared */
+		bitmap_set(second_bitmap, rs, re - rs);
+
+		start_page = rs << HWDBM_GRANULE_SHIFT;
+		npages = (re - rs) << HWDBM_GRANULE_SHIFT;
+		npages = min(memslot->npages - start_page, npages);
+		kvm_stage2_clear_dbm(kvm, memslot, start_page, npages);
+	}
+	write_unlock(&kvm->mmu_lock);
+
+	if (!has_hwdbm)
+		return;
+
+	/*
+	 * Ensure that vcpu writes which occur after we clear hwdbm_bitmap are
+	 * caught by the guest memory abort handler.
+	 */
+	kvm_flush_remote_tlbs_memslot(kvm, memslot);
+
+	read_lock(&kvm->mmu_lock);
+	for_each_set_bitrange(rs, re, second_bitmap, end) {
+		start_page = rs << HWDBM_GRANULE_SHIFT;
+		npages = (re - rs) << HWDBM_GRANULE_SHIFT;
+		npages = min(memslot->npages - start_page, npages);
+		kvm_stage2_sync_dirty(kvm, memslot, start_page, npages);
+	}
+	read_unlock(&kvm->mmu_lock);
 }
 
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 4552bfb1f274..330912d647c7 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -651,10 +651,10 @@  u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
 
 #ifdef CONFIG_ARM64_HW_AFDBM
 	/*
-	 * Enable the Hardware Access Flag management, unconditionally
-	 * on all CPUs. In systems that have asymmetric support for the feature
-	 * this allows KVM to leverage hardware support on the subset of cores
-	 * that implement the feature.
+	 * Enable the Hardware Access Flag management and Dirty State management,
+	 * unconditionally on all CPUs. In systems that have asymmetric support for
+	 * the feature this allows KVM to leverage hardware support on the subset of
+	 * cores that implement the feature.
 	 *
 	 * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
 	 * hardware) on implementations that do not advertise support for the
@@ -663,7 +663,7 @@  u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
 	 * HAFDBS. Here be dragons.
 	 */
 	if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
-		vtcr |= VTCR_EL2_HA;
+		vtcr |= VTCR_EL2_HA | VTCR_EL2_HD;
 #endif /* CONFIG_ARM64_HW_AFDBM */
 
 	/* Set the vmid bits */
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 34251932560e..b2fdcd762d70 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1569,14 +1569,18 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
 	 * kvm_pgtable_stage2_map() should be called to change block size.
 	 */
-	if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule)
+	if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) {
 		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
-	else
+		/* Try to enable HW DBM for nearby pages */
+		if (!ret && vma_pagesize == PAGE_SIZE && writable)
+			kvm_arm_enable_nearby_hwdbm(kvm, gfn);
+	} else {
 		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
 					     __pfn_to_phys(pfn), prot,
 					     memcache,
 					     KVM_PGTABLE_WALK_HANDLE_FAULT |
 					     KVM_PGTABLE_WALK_SHARED);
+	}
 
 	/* Mark the page dirty only if the fault is handled successfully */
 	if (writable && !ret) {
@@ -2046,11 +2050,12 @@  int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	} while (hva < reg_end);
 
 	mmap_read_unlock(current->mm);
-	return ret;
+	return ret ? : kvm_arm_init_hwdbm_bitmap(kvm, new);
 }
 
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
+	kvm_arm_destroy_hwdbm_bitmap(slot);
 }
 
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
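
As a side note on the ordering that kvm_arch_sync_dirty_log() relies on, below is a
stand-alone user-space toy (not kernel code; the array sizes and names are made up)
that mimics the two-bitmap handoff: the tracking bitmap is cleared and recorded into
a second bitmap before the DBM bits would be cleared, and only the ranges recorded in
the second bitmap are scanned after the (simulated) TLB flush.

#include <stdio.h>
#include <string.h>

#define NGRANULES 32			/* hypothetical number of 64-page granules */

static unsigned char hwdbm[NGRANULES];	/* granules that currently have DBM set */
static unsigned char second[NGRANULES];	/* granules cleared during this sync */

static void sync_dirty_log_sketch(void)
{
	memset(second, 0, sizeof(second));

	/*
	 * Pass 1 (under the mmu lock in the real code): clear the tracking
	 * bitmap first and remember what was cleared, then clear the DBM bits
	 * of the covered PTEs. Because the bitmap is cleared first, any DBM
	 * bit set concurrently by the fault path is recorded again and gets
	 * handled by a later sync instead of being lost.
	 */
	for (int i = 0; i < NGRANULES; i++) {
		if (hwdbm[i]) {
			hwdbm[i] = 0;
			second[i] = 1;
			/* ...kvm_stage2_clear_dbm() on granule i here... */
		}
	}

	/* ...kvm_flush_remote_tlbs_memslot() here... */

	/*
	 * Pass 2: scan only the granules recorded in the second bitmap for
	 * pages the hardware marked dirty before the flush.
	 */
	for (int i = 0; i < NGRANULES; i++)
		if (second[i])
			printf("scan granule %d for hardware-dirtied pages\n", i);
}

int main(void)
{
	hwdbm[3] = hwdbm[4] = hwdbm[17] = 1;	/* pretend these granules saw write faults */
	sync_dirty_log_sketch();
	return 0;
}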