@@ -181,6 +181,8 @@ struct kvm_s2_mmu {
};
struct kvm_arch_memory_slot {
+ #define HWDBM_GRANULE_SHIFT 6 /* 64 pages per bit */
+ unsigned long *hwdbm_bitmap; /* one bit per 2^HWDBM_GRANULE_SHIFT pages */
};
/**
@@ -901,6 +903,10 @@ struct kvm_vcpu_stat {
u64 exits;
};
+int kvm_arm_init_hwdbm_bitmap(struct kvm *kvm, struct kvm_memory_slot *memslot);
+void kvm_arm_destroy_hwdbm_bitmap(struct kvm_memory_slot *memslot);
+void kvm_arm_enable_nearby_hwdbm(struct kvm *kvm, gfn_t gfn);
+
void kvm_vcpu_preferred_target(struct kvm_vcpu_init *init);
unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
@@ -1540,9 +1540,134 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
return r;
}
+static unsigned long kvm_hwdbm_bitmap_bytes(struct kvm_memory_slot *memslot)
+{
+ unsigned long nbits = DIV_ROUND_UP(memslot->npages, 1 << HWDBM_GRANULE_SHIFT);
+
+ return ALIGN(nbits, BITS_PER_LONG) / 8;
+}
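+/*
+ * Sizing example for kvm_hwdbm_bitmap_bytes() above (illustrative, assuming
+ * 4K pages and BITS_PER_LONG == 64): a 1 GiB memslot has 262144 pages, i.e.
+ * 4096 granules of 64 pages, so one bitmap takes ALIGN(4096, 64) / 8 = 512
+ * bytes and each set bit covers 256 KiB of guest IPA space.
+ */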
+
+static unsigned long *kvm_second_hwdbm_bitmap(struct kvm_memory_slot *memslot)
+{
+ unsigned long len = kvm_hwdbm_bitmap_bytes(memslot);
+
+ return (void *)memslot->arch.hwdbm_bitmap + len;
+}
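+/*
+ * Note: kvm_hwdbm_bitmap_bytes() rounds the bit count up to a whole number of
+ * longs, so the scratch bitmap returned above stays long-aligned for the
+ * bitmap helpers (presumably the intent of the rounding).
+ */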
+
+/*
+ * Allocate twice the bitmap size: the second half is used as scratch space
+ * by kvm_arch_sync_dirty_log(). The bitmap is set up from
+ * kvm_arch_prepare_memory_region() and freed from kvm_arch_free_memslot().
+ */
+int kvm_arm_init_hwdbm_bitmap(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+ unsigned long bytes = 2 * kvm_hwdbm_bitmap_bytes(memslot);
+
+ if (!kvm->arch.mmu.hwdbm_enabled)
+ return 0;
+
+ if (memslot->arch.hwdbm_bitmap) {
+ /* Inherited from old memslot */
+ bitmap_clear(memslot->arch.hwdbm_bitmap, 0, bytes * 8);
+ } else {
+ memslot->arch.hwdbm_bitmap = kvzalloc(bytes, GFP_KERNEL_ACCOUNT);
+ if (!memslot->arch.hwdbm_bitmap)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void kvm_arm_destroy_hwdbm_bitmap(struct kvm_memory_slot *memslot)
+{
+ if (!memslot->arch.hwdbm_bitmap)
+ return;
+
+ kvfree(memslot->arch.hwdbm_bitmap);
+ memslot->arch.hwdbm_bitmap = NULL;
+}
+
+/* Enable DBM for nearby page tables, without crossing the memslot boundary */
+void kvm_arm_enable_nearby_hwdbm(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *memslot;
+
+ memslot = gfn_to_memslot(kvm, gfn);
+ if (memslot && kvm_slot_dirty_track_enabled(memslot) &&
+ memslot->arch.hwdbm_bitmap) {
+ unsigned long rel_gfn = gfn - memslot->base_gfn;
+ unsigned long dbm_idx = rel_gfn >> HWDBM_GRANULE_SHIFT;
+ unsigned long start_page, npages;
+
+ if (!test_and_set_bit(dbm_idx, memslot->arch.hwdbm_bitmap)) {
+ start_page = dbm_idx << HWDBM_GRANULE_SHIFT;
+ npages = 1 << HWDBM_GRANULE_SHIFT;
+ npages = min(memslot->npages - start_page, npages);
+ kvm_stage2_set_dbm(kvm, memslot, start_page, npages);
+ }
+ }
+}
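+/*
+ * Illustrative walk-through of kvm_arm_enable_nearby_hwdbm() above, assuming
+ * 4K pages and a hypothetical memslot with base_gfn 0x80000: a write fault on
+ * gfn 0x80041 gives rel_gfn 0x41 and dbm_idx 1; the first such fault in that
+ * window sets bit 1 and enables DBM on slot pages [64, 128), so later writes
+ * to those pages are tracked by hardware without faulting again.
+ */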
+
+/*
+ * We need somewhere to clear hwdbm_bitmap, and clearing hwdbm_bitmap implies
+ * clearing the DBM bit of all related page tables. Note that between clearing
+ * the DBM bits and flushing the TLB, hardware may still record dirty state,
+ * so all related page tables must be scanned again after the TLB flush. Given
+ * the above, the best place to clear hwdbm_bitmap is right before syncing the
+ * HW dirty log: clear the bitmap and the DBM bits (recording what was cleared
+ * in the scratch bitmap), flush the TLB, then sync the recorded ranges into
+ * the dirty log.
+ */
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
+ unsigned long *second_bitmap = kvm_second_hwdbm_bitmap(memslot);
+ unsigned long start_page, npages;
+ unsigned int end, rs, re;
+ bool has_hwdbm = false;
+
+ if (!memslot->arch.hwdbm_bitmap)
+ return;
+
+ end = kvm_hwdbm_bitmap_bytes(memslot) * 8;
+ bitmap_clear(second_bitmap, 0, end);
+
+ write_lock(&kvm->mmu_lock);
+ for_each_set_bitrange(rs, re, memslot->arch.hwdbm_bitmap, end) {
+ has_hwdbm = true;
+ /*
+ * The bitmap must be cleared before the DBM bits. While the DBM bits are
+ * being cleared (the mmu lock is dropped periodically), SW dirty tracking
+ * may set DBM bits that overlap the range being cleared. If the bitmap
+ * were cleared after the DBM bits, we could end up with the bitmap cleared
+ * but stale DBM bits left set, and those page tables would never be
+ * scanned again.
+ */
+ bitmap_clear(memslot->arch.hwdbm_bitmap, rs, re - rs);
+
+ /* Record which bits were cleared so the range is scanned after the TLB flush */
+ bitmap_set(second_bitmap, rs, re - rs);
+
+ start_page = rs << HWDBM_GRANULE_SHIFT;
+ npages = (re - rs) << HWDBM_GRANULE_SHIFT;
+ npages = min(memslot->npages - start_page, npages);
+ kvm_stage2_clear_dbm(kvm, memslot, start_page, npages);
+ }
+ write_unlock(&kvm->mmu_lock);
+
+ if (!has_hwdbm)
+ return;
+
+ /*
+ * Ensure that vCPU writes occurring after hwdbm_bitmap was cleared are
+ * caught by the guest memory abort handler.
+ */
+ kvm_flush_remote_tlbs_memslot(kvm, memslot);
+
+ read_lock(&kvm->mmu_lock);
+ for_each_set_bitrange(rs, re, second_bitmap, end) {
+ start_page = rs << HWDBM_GRANULE_SHIFT;
+ npages = (re - rs) << HWDBM_GRANULE_SHIFT;
+ npages = min(memslot->npages - start_page, npages);
+ kvm_stage2_sync_dirty(kvm, memslot, start_page, npages);
+ }
+ read_unlock(&kvm->mmu_lock);
}
static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
@@ -651,10 +651,10 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
#ifdef CONFIG_ARM64_HW_AFDBM
/*
- * Enable the Hardware Access Flag management, unconditionally
- * on all CPUs. In systems that have asymmetric support for the feature
- * this allows KVM to leverage hardware support on the subset of cores
- * that implement the feature.
+ * Enable the Hardware Access Flag and Dirty State management, unconditionally
+ * on all CPUs. In systems that have asymmetric support for these features,
+ * this allows KVM to leverage hardware support on the subset of cores that
+ * implement them.
*
* The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
* hardware) on implementations that do not advertise support for the
@@ -663,7 +663,7 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
* HAFDBS. Here be dragons.
*/
if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
- vtcr |= VTCR_EL2_HA;
+ vtcr |= VTCR_EL2_HA | VTCR_EL2_HD;
#endif /* CONFIG_ARM64_HW_AFDBM */
/* Set the vmid bits */
@@ -1569,14 +1569,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* permissions only if vma_pagesize equals fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule)
+ if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) {
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
- else
+ /* Try to enable HW DBM for nearby pages */
+ if (!ret && vma_pagesize == PAGE_SIZE && writable)
+ kvm_arm_enable_nearby_hwdbm(kvm, gfn);
+ } else {
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
memcache,
KVM_PGTABLE_WALK_HANDLE_FAULT |
KVM_PGTABLE_WALK_SHARED);
+ }
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret) {
@@ -2046,11 +2050,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
} while (hva < reg_end);
mmap_read_unlock(current->mm);
- return ret;
+ return ret ? : kvm_arm_init_hwdbm_bitmap(kvm, new);
}
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
+ kvm_arm_destroy_hwdbm_bitmap(slot);
}
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)