diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -839,6 +839,10 @@ struct mm_struct {
#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
+ /* Next mm_pgtable scan (in jiffies) */
+ unsigned long mm_pgtable_next_scan;
+ /* Restart point for scanning and freeing empty user PTE pages */
+ unsigned long mm_pgtable_scan_offset;
#endif
int map_count; /* number of VMAs */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1589,6 +1589,20 @@ static inline unsigned long my_zero_pfn(unsigned long addr)
}
#endif /* CONFIG_MMU */
+#ifdef CONFIG_MMU
+#define MM_PGTABLE_SCAN_DELAY 100 /* 100ms */
+#define MM_PGTABLE_SCAN_SIZE 256 /* 256MB */
+void init_mm_pgtable_work(struct task_struct *p);
+void task_tick_mm_pgtable(struct task_struct *curr);
+#else
+static inline void init_mm_pgtable_work(struct task_struct *p)
+{
+}
+static inline void task_tick_mm_pgtable(struct task_struct *curr)
+{
+}
+#endif
+
#ifdef CONFIG_MMU
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1485,6 +1485,7 @@ struct task_struct {
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
struct timer_list oom_reaper_timer;
+ struct callback_head pgtable_work;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4359,6 +4359,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->migration_pending = NULL;
#endif
init_sched_mm_cid(p);
+ init_mm_pgtable_work(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12696,6 +12696,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+ task_tick_mm_pgtable(curr);
+
update_misfit_status(curr, rq);
check_update_overutilized_status(task_rq(curr));
diff --git a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -54,7 +54,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o show_mem.o shmem_quota.o\
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o mmap_lock.o $(mmu-y)
+ debug.o gup.o mmap_lock.o freept.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
diff --git a/mm/freept.c b/mm/freept.c
new file mode 100644
--- /dev/null
+++ b/mm/freept.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/userfaultfd_k.h>
+#include <linux/pagewalk.h>
+#include <linux/task_work.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
+
+void task_tick_mm_pgtable(struct task_struct *curr)
+{
+ struct callback_head *work = &curr->pgtable_work;
+ unsigned long now = jiffies;
+
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
+ work->next != work)
+ return;
+
+ if (time_before(now, READ_ONCE(curr->mm->mm_pgtable_next_scan)))
+ return;
+
+ task_work_add(curr, work, TWA_RESUME);
+}
+
+/*
+ * Locking:
+ * - the mmap read lock is already held to traverse the vma tree and pgtable
+ * - take the pmd lock to clear the pmd entry
+ * - take the pte lock to check for an empty PTE page, and release it only
+ *   after clearing the pmd entry, so that pte_offset_map_lock() etc. will
+ *   observe the changed pmd once this pte lock is taken. Thanks to this, we
+ *   don't need to hold the rmap-related locks.
+ * - users of pte_offset_map_lock() etc. all expect the PTE page to be kept
+ *   stable by RCU, so use pte_free_defer() to free PTE pages.
+ */
+static int freept_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ struct mmu_notifier_range range;
+ struct mm_struct *mm = walk->mm;
+ pte_t *start_pte, *pte;
+ pmd_t pmdval;
+ spinlock_t *pml = NULL, *ptl;
+ unsigned long haddr = addr;
+ int i;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ haddr, haddr + PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ start_pte = pte_offset_map_nolock(mm, pmd, &pmdval, haddr, &ptl);
+ if (!start_pte)
+ goto out;
+
+ pml = pmd_lock(mm, pmd);
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd))))
+ goto out_ptl;
+
+	/* Check if it is an empty PTE page */
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
+ if (!pte_none(ptep_get(pte)))
+ goto out_ptl;
+ }
+ pte_unmap(start_pte);
+
+ pmd_clear(pmd);
+ flush_tlb_range(walk->vma, haddr, haddr + PMD_SIZE);
+ pmdp_get_lockless_sync();
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ mm_dec_nr_ptes(mm);
+ pte_free_defer(mm, pmd_pgtable(pmdval));
+
+ return 0;
+
+out_ptl:
+ pte_unmap_unlock(start_pte, ptl);
+ if (pml != ptl)
+ spin_unlock(pml);
+out:
+ mmu_notifier_invalidate_range_end(&range);
+
+ return 0;
+}
+
+static const struct mm_walk_ops mm_pgtable_walk_ops = {
+ .pmd_entry = freept_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static void task_mm_pgtable_work(struct callback_head *work)
+{
+ unsigned long now = jiffies, old_scan, next_scan;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ struct vm_area_struct *vma;
+ unsigned long start, end;
+ struct vma_iterator vmi;
+
+ work->next = work; /* Prevent double-add */
+ if (p->flags & PF_EXITING)
+ return;
+
+ if (!mm->mm_pgtable_next_scan) {
+ mm->mm_pgtable_next_scan = now + msecs_to_jiffies(MM_PGTABLE_SCAN_DELAY);
+ return;
+ }
+
+ old_scan = mm->mm_pgtable_next_scan;
+ if (time_before(now, old_scan))
+ return;
+
+ next_scan = now + msecs_to_jiffies(MM_PGTABLE_SCAN_DELAY);
+ if (!try_cmpxchg(&mm->mm_pgtable_next_scan, &old_scan, next_scan))
+ return;
+
+ if (!mmap_read_trylock(mm))
+ return;
+
+ start = mm->mm_pgtable_scan_offset;
+ vma_iter_init(&vmi, mm, start);
+ vma = vma_next(&vmi);
+ if (!vma) {
+ mm->mm_pgtable_scan_offset = 0;
+ start = 0;
+ vma_iter_set(&vmi, start);
+ vma = vma_next(&vmi);
+ }
+
+ do {
+ /* Skip hugetlb case */
+ if (is_vm_hugetlb_page(vma))
+ continue;
+
+ /* Leave this to the THP path to handle */
+ if (vma->vm_flags & VM_HUGEPAGE)
+ continue;
+
+ /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
+ if (userfaultfd_wp(vma))
+ continue;
+
+ /* Only consider PTE pages that do not cross vmas */
+ start = ALIGN(vma->vm_start, PMD_SIZE);
+ end = ALIGN_DOWN(vma->vm_end, PMD_SIZE);
+ if (end - start < PMD_SIZE)
+ continue;
+
+ walk_page_range_vma(vma, start, end, &mm_pgtable_walk_ops, NULL);
+
+ if (end - mm->mm_pgtable_scan_offset >= (MM_PGTABLE_SCAN_SIZE << 20))
+ goto out;
+
+ cond_resched();
+ } for_each_vma(vmi, vma);
+
+out:
+ mm->mm_pgtable_scan_offset = vma ? end : 0;
+ mmap_read_unlock(mm);
+}
+
+void init_mm_pgtable_work(struct task_struct *p)
+{
+ struct mm_struct *mm = p->mm;
+ int mm_users = 0;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1)
+ mm->mm_pgtable_next_scan = jiffies + msecs_to_jiffies(MM_PGTABLE_SCAN_DELAY);
+ }
+ p->pgtable_work.next = &p->pgtable_work; /* Protect against double add */
+ init_task_work(&p->pgtable_work, task_mm_pgtable_work);
+}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1581,7 +1581,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
pml = pmd_lock(mm, pmd);
- start_pte = pte_offset_map_nolock(mm, pmd, NULL, haddr, &ptl);
+ start_pte = pte_offset_map_nolock(mm, pmd, &pgt_pmd, haddr, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
goto abort;
if (!pml)
@@ -1589,6 +1589,10 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
else if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ /* pmd entry may be changed by others */
+ if (unlikely(!pml && !pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+ goto abort;
+
/* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1636,6 +1640,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
pml = pmd_lock(mm, pmd);
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+ spin_unlock(ptl);
+ goto unlock;
+ }
}
pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
pmdp_get_lockless_sync();
@@ -1663,6 +1672,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
}
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
+unlock:
if (pml && pml != ptl)
spin_unlock(pml);
if (notified)
@@ -1722,6 +1732,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
mmu_notifier_invalidate_range_start(&range);
pml = pmd_lock(mm, pmd);
+ /* check if the pmd is still valid */
+ if (check_pmd_still_valid(mm, addr, pmd) != SCAN_SUCCEED) {
+ spin_unlock(pml);
+ mmu_notifier_invalidate_range_end(&range);
+ continue;
+ }
ptl = pte_lockptr(mm, pmd);
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
Nowadays, in order to pursue high performance, applications mostly use some
high-performance user-mode memory allocators, such as jemalloc or tcmalloc.
These memory allocators use madvise(MADV_DONTNEED or MADV_FREE) to release
physical memory, but neither MADV_DONTNEED nor MADV_FREE releases page table
memory, which may cause huge page table memory usage.

The following is a memory usage snapshot of one process, which we actually
observed on one of our servers:

        VIRT:  55t
        RES:   590g
        VmPTE: 110g

In this case, most of the page table entries are empty. For a PTE page where
all entries are empty, we can actually free it back to the system for others
to use.

Similar to numa_balancing, this commit adds a task_work to scan the address
space of the user process when it returns to user space. If a suitable empty
PTE page is found, it will be released.

The following test case shows the effect of this optimization:

        mmap 50G
        while (1) {
                for (; i < 1024 * 25; i++) {
                        touch 2M memory
                        madvise MADV_DONTNEED 2M
                }
        }

As we can see, the memory usage of VmPTE is reduced:

                        before          after
        VIRT          50.0 GB         50.0 GB
        RES            3.1 MB          3.1 MB
        VmPTE       102640 kB          756 kB

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/mm_types.h |   4 +
 include/linux/pgtable.h  |  14 +++
 include/linux/sched.h    |   1 +
 kernel/sched/core.c      |   1 +
 kernel/sched/fair.c      |   2 +
 mm/Makefile              |   2 +-
 mm/freept.c              | 180 +++++++++++++++++++++++++++++++++++++++
 mm/khugepaged.c          |  18 +++-
 8 files changed, 220 insertions(+), 2 deletions(-)
 create mode 100644 mm/freept.c
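
For reference, the pseudocode in the test case above corresponds to roughly
the following user-space reproducer (a minimal sketch, not part of the patch:
the 50 GB mapping and 2 MB chunk size follow the snippet in the commit
message, error handling is kept minimal, and a 64-bit system with 4 KB pages
is assumed):

/*
 * Minimal reproducer sketch for the test case above: map 50 GB of
 * anonymous memory, then repeatedly touch and MADV_DONTNEED each 2 MB
 * chunk. Watch VmPTE in /proc/<pid>/status while it runs.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define MAP_SIZE	(50UL << 30)	/* 50 GB */
#define CHUNK_SIZE	(2UL << 20)	/* 2 MB  */

int main(void)
{
	char *buf = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long i;

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	while (1) {
		for (i = 0; i < 1024 * 25; i++) {
			char *chunk = buf + i * CHUNK_SIZE;

			/* Populate the PTEs for this 2 MB range */
			memset(chunk, 0, CHUNK_SIZE);
			/*
			 * Drop the pages; without this patch, the now-empty
			 * PTE pages are left behind.
			 */
			madvise(chunk, CHUNK_SIZE, MADV_DONTNEED);
		}
	}
	return 0;
}

With such a workload, VmPTE (as reported by /proc/<pid>/status) should stay
small once empty PTE pages are scanned and freed, instead of growing toward
the ~100 MB figure shown in the table above.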