@@ -340,4 +340,19 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *cache);
+/*
+ * Choose the NUMA node to use for page tables that map @pfn.
+ *
+ * Returns NUMA_NO_NODE when kvm->arch.numa_aware_page_table is not set.
+ * Otherwise returns the node of the page that kvm_pfn_to_refcounted_page()
+ * reports for @pfn, or numa_mem_id() when no such page is returned.
+ */
+static inline int kvm_pfn_to_page_table_nid(struct kvm *kvm, kvm_pfn_t pfn)
+{
+	struct page *backing_page;
+
+	if (!kvm->arch.numa_aware_page_table)
+		return NUMA_NO_NODE;
+
+	backing_page = kvm_pfn_to_refcounted_page(pfn);
+
+	return backing_page ? page_to_nid(backing_page) : numa_mem_id();
+}
+
#endif /* __KVM_X86_MMU_INTERNAL_H */
@@ -1412,7 +1412,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
return spte_set;
}
-static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp, int nid)
{
struct kvm_mmu_page *sp;
@@ -1422,7 +1422,7 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
if (!sp)
return NULL;
- sp->spt = (void *)__get_free_page(gfp);
+ sp->spt = kvm_mmu_get_free_page(gfp, nid);
if (!sp->spt) {
kmem_cache_free(mmu_page_header_cache, sp);
return NULL;
@@ -1435,6 +1435,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
struct tdp_iter *iter,
bool shared)
{
+ int nid = kvm_pfn_to_page_table_nid(kvm, spte_to_pfn(iter->old_spte));
struct kvm_mmu_page *sp;
/*
@@ -1446,7 +1447,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
* If this allocation fails we drop the lock and retry with reclaim
* allowed.
*/
- sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT, nid);
if (sp)
return sp;
@@ -1458,7 +1459,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
iter->yielded = true;
- sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT, nid);
if (shared)
read_lock(&kvm->mmu_lock);
@@ -1358,6 +1358,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode);
void kvm_flush_remote_tlbs(struct kvm *kvm);
#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
+void *kvm_mmu_get_free_page(gfp_t gfp, int nid);
int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
@@ -377,6 +377,22 @@ static void kvm_flush_shadow_all(struct kvm *kvm)
}
#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
+
+/*
+ * Allocate a single page and return its kernel address.
+ *
+ * @gfp: allocation flags passed through to the page allocator.
+ * @nid: NUMA node to allocate from, or NUMA_NO_NODE for no node preference.
+ *
+ * When CONFIG_NUMA is enabled and @nid is not NUMA_NO_NODE, the page is
+ * requested from node @nid via alloc_pages_node(); NULL is returned if that
+ * allocation fails.  In every other case the request is forwarded to
+ * __get_free_page() and its result is returned as-is.
+ */
+void *kvm_mmu_get_free_page(gfp_t gfp, int nid)
+{
+#ifdef CONFIG_NUMA
+	if (nid != NUMA_NO_NODE) {
+		struct page *page = alloc_pages_node(nid, gfp, 0);
+
+		/* Report allocation failure to the caller as NULL. */
+		return page ? page_address(page) : NULL;
+	}
+#endif /* CONFIG_NUMA */
+	return (void *)__get_free_page(gfp);
+}
+
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
gfp_t gfp_flags)
{
When splitting a huge page, try to allocate the new lower-level page tables
on the same NUMA node as the huge page. Only do NUMA-aware page splits if KVM
has enabled NUMA-aware page tables for the VM; otherwise fall back to the
default method of using the current thread's mempolicy.

When huge pages are split for dirty logging, new page tables are created based
on the current thread's mempolicy, which by default is the NUMA node of the
pCPU executing the thread. If the thread enabling dirty logging runs on a
different NUMA node than the one holding the huge page, it will create all the
page tables mapping the 4KiB pages of that huge page on the remote node. This
reduces the performance of vCPUs which are NUMA bound and only access local
NUMA memory, because they have to walk page tables on a remote NUMA node in
order to reach memory on their local NUMA node.

Tested this feature with a synthetic read-write-heavy workload in a 416 vCPU
VM on an 8 NUMA node host. The workload creates multiple threads, partitions
its data into equal sizes and assigns one partition to each thread. Each
thread iterates over its own data in strides, reading and writing values in
its partition. While executing, the workload continuously outputs the combined
rate at which it is performing operations.

When dirty logging is enabled in WRPROT mode, the workload's performance:
- Without NUMA-aware page tables drops by ~75%
- With NUMA-aware page tables drops by ~20%

Raw data from one example run:
1. Without NUMA-aware page tables
   Before dirty log: ~2750000 accesses/sec
   After dirty log:   ~700000 accesses/sec

2. With NUMA-aware page tables
   Before dirty log: ~2750000 accesses/sec
   After dirty log:  ~2250000 accesses/sec

NUMA-aware page tables improved performance by more than 200%.

Signed-off-by: Vipin Sharma <vipinsh@google.com>
---
 arch/x86/kvm/mmu/mmu_internal.h | 15 +++++++++++++++
 arch/x86/kvm/mmu/tdp_mmu.c      |  9 +++++----
 include/linux/kvm_host.h        |  1 +
 virt/kvm/kvm_main.c             | 16 ++++++++++++++++
 4 files changed, 37 insertions(+), 4 deletions(-)