@@ -265,6 +265,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order);
struct folio *folio_alloc(gfp_t gfp, unsigned order);
struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, bool hugepage);
+struct page *alloc_mcpages(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr);
#else
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
@@ -276,7 +278,10 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
}
#define vma_alloc_folio(gfp, order, vma, addr, hugepage) \
folio_alloc(gfp, order)
+#define alloc_mcpages(gfp, order, vma, addr) \
+ alloc_pages(gfp, order)
#endif
+
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
static inline struct page *alloc_page_vma(gfp_t gfp,
struct vm_area_struct *vma, unsigned long addr)
new file mode 100644
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MCPAGE_MM_H
+#define _LINUX_MCPAGE_MM_H
+
+#include <linux/mm_types.h>
+
+#ifdef CONFIG_MCPAGE_ORDER
+
+static inline bool allow_mcpage(struct vm_area_struct *vma,
+ unsigned long addr, unsigned int order)
+{
+ unsigned int mcpage_size = 1 << (order + PAGE_SHIFT);
+ unsigned long haddr = ALIGN_DOWN(addr, mcpage_size);
+
+ return range_in_vma(vma, haddr, haddr + mcpage_size); /* whole aligned mcpage must fit inside the VMA */
+}
+
+extern vm_fault_t do_anonymous_mcpages(struct vm_fault *vmf,
+ unsigned int order);
+
+#else
+static inline bool allow_mcpage(struct vm_area_struct *vma,
+ unsigned long addr, unsigned int order)
+{
+ return false; /* mcpage support compiled out: never use mcpages */
+}
+
+static inline vm_fault_t do_anonymous_mcpages(struct vm_fault *vmf,
+ unsigned int order)
+{
+ return VM_FAULT_FALLBACK; /* let the caller take the normal 4K fault path */
+}
+#endif /* CONFIG_MCPAGE_ORDER */
+
+#endif /* _LINUX_MCPAGE_MM_H */
@@ -96,6 +96,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
+obj-$(CONFIG_MCPAGE) += mcpage_memory.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
ifdef CONFIG_SWAP
new file mode 100644
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright(c) 2022 Intel Corporation. All rights reserved.
+ */
+
+#include <linux/gfp.h>
+#include <linux/page_owner.h>
+#include <linux/pgtable.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/rmap.h>
+#include <linux/oom.h>
+#include <linux/vm_event_item.h>
+#include <linux/userfaultfd_k.h>
+
+#include "internal.h"
+
+#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
+static inline struct page *
+alloc_zeroed_mcpages(int order, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page = alloc_mcpages(GFP_HIGHUSER_MOVABLE, order,
+ vma, addr);
+
+ if (page) {
+ int i;
+ struct page *it = page;
+
+ for (i = 0; i < (1 << order); i++, it++) {
+ clear_user_highpage(it, addr + i * PAGE_SIZE); /* vaddr of sub-page i; matters on aliasing (VIVT) caches */
+ cond_resched();
+ }
+ }
+
+ return page;
+}
+#else
+static inline struct page *
+alloc_zeroed_mcpages(int order, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ return alloc_mcpages(GFP_HIGHUSER_MOVABLE | __GFP_ZERO,
+ order, vma, addr);
+}
+#endif
+
+static vm_fault_t do_anonymous_mcpage(struct vm_fault *vmf,
+ struct page *page, unsigned long addr)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = 0;
+ pte_t entry;
+
+ if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) { /* charge this sub-page to the memcg before mapping */
+ ret = VM_FAULT_OOM;
+ goto oom;
+ }
+
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
+ __SetPageUptodate(page); /* page content (zeroes) is valid before it becomes visible */
+
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+
+ if (!pte_none(*vmf->pte)) { /* raced with another fault: PTE already populated, caller frees the page */
+ ret = VM_FAULT_FALLBACK;
+ update_mmu_cache(vma, addr, vmf->pte); /* NOTE(review): do_anonymous_page() uses update_mmu_tlb() here -- confirm */
+ goto release;
+ }
+
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret) {
+ ret = VM_FAULT_FALLBACK;
+ goto release;
+ }
+
+ if (userfaultfd_missing(vma)) { /* defer missing-page handling to userspace */
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
+ }
+
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr);
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ set_pte_at(vma->vm_mm, addr, vmf->pte, entry); /* publish the PTE last, under the ptl */
+ update_mmu_cache(vma, addr, vmf->pte);
+release:
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+oom:
+ return ret;
+}
+
+vm_fault_t do_anonymous_mcpages(struct vm_fault *vmf, unsigned int order)
+{
+ int i, nr = 1 << order;
+ unsigned int mcpage_size = nr * PAGE_SIZE;
+ vm_fault_t ret = 0, real_ret = 0;
+ bool handled = false;
+ struct page *page;
+ unsigned long haddr = ALIGN_DOWN(vmf->address, mcpage_size); /* mcpage-aligned base of the faulting range */
+
+ page = alloc_zeroed_mcpages(order, vmf->vma, haddr);
+ if (!page)
+ return VM_FAULT_FALLBACK;
+
+ split_page(page, order); /* split up front so each 4K sub-page has its own refcount/mapcount */
+ for (i = 0; i < nr; i++, haddr += PAGE_SIZE) {
+ ret = do_anonymous_mcpage(vmf, &page[i], haddr);
+ if (haddr == PAGE_ALIGN_DOWN(vmf->address)) { /* this iteration maps the faulting address itself */
+ real_ret = ret;
+ handled = true;
+ }
+ if (ret)
+ break;
+ }
+
+ while (i < nr)
+ put_page(&page[i++]); /* free sub-pages left unmapped after an early break */
+
+ /*
+ * If the fault address is not handled, fallback to handle
+ * fault address with normal page.
+ */
+ if (!handled)
+ return VM_FAULT_FALLBACK;
+ else
+ return real_ret;
+}
@@ -77,6 +77,7 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/mcpage_mm.h>
#include <trace/events/kmem.h>
@@ -4071,6 +4072,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
+
+ if (allow_mcpage(vma, vmf->address, MCPAGE_ORDER)) {
+ ret = do_anonymous_mcpages(vmf, MCPAGE_ORDER);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+
+ ret = 0;
+ }
+
page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
@@ -2251,6 +2251,57 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
}
EXPORT_SYMBOL(vma_alloc_folio);
+/**
+ * alloc_mcpages - Allocate a mcpage for a VMA.
+ * @gfp: GFP flags.
+ * @order: Order of the mcpage.
+ * @vma: Pointer to VMA or NULL if not available.
+ * @addr: Virtual address of the allocation. Must be inside @vma.
+ *
+ * Allocate a mcpage for a specific address in @vma, using the
+ * appropriate NUMA policy. When @vma is not NULL the caller must hold the
+ * mmap_lock of the mm_struct of the VMA to prevent it from going away.
+ * Should be used for all allocations for pages that will be mapped into
+ * user space.
+ *
+ * Return: The page on success or NULL if allocation fails.
+ */
+struct page *alloc_mcpages(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct mempolicy *pol;
+ int node = numa_node_id();
+ struct page *page;
+ int preferred_nid;
+ nodemask_t *nmask;
+
+ pol = get_vma_policy(vma, addr);
+
+ if (pol->mode == MPOL_INTERLEAVE) {
+ unsigned int nid;
+
+ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); /* interleave at whole-mcpage granularity */
+ mpol_cond_put(pol);
+ page = alloc_page_interleave(gfp, order, nid);
+ goto out;
+ }
+
+ if (pol->mode == MPOL_PREFERRED_MANY) {
+ node = policy_node(gfp, pol, node);
+ page = alloc_pages_preferred_many(gfp, order, node, pol);
+ mpol_cond_put(pol);
+ goto out;
+ }
+
+ nmask = policy_nodemask(gfp, pol); /* remaining policies: preferred node + optional nodemask */
+ preferred_nid = policy_node(gfp, pol, node);
+ page = __alloc_pages(gfp, order, preferred_nid, nmask);
+ mpol_cond_put(pol);
+out:
+ return page;
+}
+EXPORT_SYMBOL(alloc_mcpages);
+
/**
* alloc_pages - Allocate pages.
* @gfp: GFP flags.
If mcpage is in the range of the VMA, try to allocate a mcpage and set it up for anonymous mapping. Try our best to populate all the surrounding page table entries. The benefit is that the page fault number will be reduced. Split the mcpage to allow each sub-page to be managed as a normal 4K page. Doing the split before setting up page table entries avoids the complicated page lock, mapcount and refcount handling. It's expected that the change will impact the memory consumption, page fault number, zone lock and lru lock directly. The memory consumption and system performance impact are evaluated as follows. Some system performance data were collected with 16K mcpage size: =============================================================================== v6.1-rc4-no-thp v6.1-rc4-thp mcpage will-it-scale/malloc1 (higher is better) 100% 2% 17% will-it-scale/page_fault1 (higher is better) 100% 238% 115% redis.set_avg_throughput (higher is better) 100% 99% 102% redis.get_avg_throughput (higher is better) 100% 99% 100% kernel build (lower is better) 100% 98% 97% * v6.1-rc4-no-thp: 6.1-rc4 with THP disabled in Kconfig * v6.1-rc4-thp: 6.1-rc4 with THP enabled as always in Kconfig * mcpage: 6.1-rc4 + 16KB mcpage The test results are normalized to config "v6.1-rc4-no-thp" The perf data between v6.1-rc4-no-thp and mcpage are collected: For kernel build, perf showed a 56% minor_page_fault drop and a 1.3% clear_page increase: v6.1-rc4-no-thp mcpage 5.939e+08 -56.0% 2.61e+08 kbuild.time.minor_page_faults 0.00 +2.2 2.20 perf-profile.calltrace.cycles-pp.clear_page_erms.get_page_from_freelist.__alloc_pages.alloc_mcpages.do_anonymous_mcpages 0.72 -0.7 0.00 perf-profile.calltrace.cycles-pp.clear_page_erms.get_page_from_freelist.__alloc_pages.vma_alloc_folio.do_anonymous_page For redis, perf showed a 74.6% minor_page_fault drop and a 0.11% zone lock drop. 
v6.1-rc4-no-thp mcpage 401414 -74.6% 102134 redis.time.minor_page_faults 0.00 +0.1 0.11 perf-profile.calltrace.cycles-pp.rmqueue.get_page_from_freelist.__alloc_pages.alloc_mcpages.do_anonymous_mcpages 0.22 -0.2 0.00 perf-profile.calltrace.cycles-pp.rmqueue_bulk.rmqueue.get_page_from_freelist.__alloc_pages.vma_alloc_folio For will-it-scale/page_fault1, perf showed a 12.8% minor_page_fault drop, a 15.97% zone lock drop and a 27% lru lock increase. v6.1-rc4-no-thp mcpage 7239 -12.8% 6312 will-it-scale.time.minor_page_faults 52.15 -34.4 17.75 perf-profile.calltrace.cycles-pp._raw_spin_lock.rmqueue_bulk.rmqueue.get_page_from_freelist.__alloc_pages 3.29 +27.0 30.29 perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock_irqsave.folio_lruvec_lock_irqsave.release_pages.tlb_batch_pages_flush 4.14 -4.1 0.00 perf-profile.calltrace.cycles-pp.clear_page_erms.get_page_from_freelist.__alloc_pages.vma_alloc_folio.do_anonymous_page 0.00 +13.2 13.20 perf-profile.calltrace.cycles-pp.clear_page_erms.get_page_from_freelist.__alloc_pages.alloc_mcpages.do_anonymous_mcpages 0.00 +18.4 18.43 perf-profile.calltrace.cycles-pp.rmqueue_bulk.rmqueue.get_page_from_freelist.__alloc_pages.alloc_mcpages For will-it-scale/malloc1, the test result is surprising. The regression is much bigger than expected. perf showed a 12.3% minor_page_fault drop and a 43.6% zone lock increase: v6.1-rc4-no-thp mcpage 2978027 -82.2% 530847 will-it-scale.128.processes 7249 -12.3% 6360 will-it-scale.time.minor_page_faults 0.00 +43.6 43.62 perf-profile.calltrace.cycles-pp.rmqueue.get_page_from_freelist.__alloc_pages.pte_alloc_one.__pte_alloc 0.00 +45.4 45.39 perf-profile.calltrace.cycles-pp._raw_spin_lock.free_pcppages_bulk.free_unref_page_list.release_pages.tlb_batch_pages_flush It turned out the mcpage allocation/free pattern hit a corner case (high zone lock contention is triggered, which impacts pte_alloc) which the current pcp list bulk free can't handle very well. 
Will address the pcp list bulk free issue separately. After fixing the pcp list bulk-free corner case, the result of will-it-scale/malloc1 is restored to 56% of v6.1-rc4-no-thp. =============================================================================== For tail latency of page allocation, use the following testing setup: - alloc_page() with order 0, 2 and 9 are called 2097152, 2097152 and 32768 times in kernel - non-fragmented and fragmented entire memory - w/o __GFP_ZERO flag to identify pure compaction latency and user visible latency And the result is as follows: no page zeroing: 4K page: none fragment: fragment: Number of test: 2097152 Number of test: 2097152 max latency: 26us max latency: 27us 90% tail latency: 1us (1887436th) 90% tail latency: 1us (1887436th) 95% tail latency: 1us (1992294th) 95% tail latency: 1us (1992294th) 99% tail latency: 2us (2076180th) 99% tail latency: 3us (2076180th) 16K mcpage none fragment: fragment: Number of test: 2097152 Number of test: 2097152 max latency: 26us max latency: 9862us 90% tail latency: 1us (1887436th) 90% tail latency: 1us (1887436th) 95% tail latency: 1us (1992294th) 95% tail latency: 1us (1992294th) 99% tail latency: 1us (2076180th) 99% tail latency: 3us (2076180th) 2M THP: none fragment: fragment: Number of test: 32768 Number of test: 32768 max latency: 40us max latency: 12149us 90% tail latency: 8us (29491th) 90% tail latency: 864us (29491th) 95% tail latency: 10us (31129th) 95% tail latency: 943us (31129th) 99% tail latency: 13us (32440th) 99% tail latency: 1067us (32440th) page zeroing: 4K page: none fragment: fragment: Number of test: 2097152 Number of test: 2097152 max latency: 18us max latency: 46us 90% tail latency: 1us (1887436th) 90% tail latency: 1us (1887436th) 95% tail latency: 1us (1992294th) 95% tail latency: 1us (1992294th) 99% tail latency: 2us (2076180th) 99% tail latency: 4us (2076180th) 16K mcpage none fragment: fragment: Number of test: 2097152 Number of test: 2097152 max latency: 31us max latency: 
5740us 90% tail latency: 3us (1887436th) 90% tail latency: 3us (1887436th) 95% tail latency: 3us (1992294th) 95% tail latency: 4us (1992294th) 99% tail latency: 4us (2076180th) 99% tail latency: 5us (2076180th) 2M THP: none fragment: fragment: Number of test: 32768 Number of test: 32768 max latency: 530us max latency: 10494us 90% tail latency: 366us (29491th) 90% tail latency: 1114us (29491th) 95% tail latency: 373us (31129th) 95% tail latency: 1263us (31129th) 99% tail latency: 391us (32440th) 99% tail latency: 1808us (32440th) With 16K mcpage, the tail latency for page allocation is good while 2M THP has a much worse result in the memory-fragmented case. =============================================================================== For the performance of NUMA interleaving on base page, mcpage and THP, memory latency from https://github.com/torvalds/test-tlb is used. On a Cascade Lake box with 96 cores + 258G memory with two NUMA nodes: node distances: node 0 1 0: 10 20 1: 20 10 With memory policy set to MPOL_INTERLEAVE and 1G memory mapping with 128 bytes (2X cache line) stride, the memory access latency (less is better): random access with 4K page: 142.32 ns random access with 16K mcpage: 141.21 ns (+0.8%) random access with 2M THP: 116.56 ns (+18.2%) sequential access with 4K page: 21.28 ns sequential access with 16K mcpage: 20.52 ns (+3.6%) sequential access with 2M THP: 20.36 ns (+4.3%) mcpage brings a minor memory access latency improvement compared to 4K page, but less than the improvement brought by 2M THP. =============================================================================== The memory consumption is checked by using firefox to access the "www.lwn.net" website and collecting the RSS of firefox with 16K mcpage size: 6.1-rc7: RSS of firefox is 285300 KB 6.1-rc7 + 16K mcpage: RSS of firefox is 295536 KB 3.59% more memory consumption with 16K mcpage. 
=============================================================================== In this RFC patch, the non-batched update to page table entries is used to show the idea. Batch mode will be chosen when making this an official patch in the future. Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> --- include/linux/gfp.h | 5 ++ include/linux/mcpage_mm.h | 35 ++++++++++ mm/Makefile | 1 + mm/mcpage_memory.c | 134 ++++++++++++++++++++++++++++++++++++++ mm/memory.c | 11 ++++ mm/mempolicy.c | 51 +++++++++++++++ 6 files changed, 237 insertions(+) create mode 100644 include/linux/mcpage_mm.h create mode 100644 mm/mcpage_memory.c