diff --git a/arch/Kconfig b/arch/Kconfig
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1426,6 +1426,9 @@ config DYNAMIC_SIGFRAME
config HAVE_ARCH_NODE_DEV_GROUP
bool
+config LRU_TASK_PAGE_AGING
+ bool
+
config ARCH_HAS_NONLEAF_PMD_YOUNG
bool
help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -219,6 +219,7 @@ config ARM64
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
select KASAN_VMALLOC if KASAN
+ select LRU_TASK_PAGE_AGING
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -274,6 +274,7 @@ config X86
select HAVE_GENERIC_VDSO
select HOTPLUG_SMT if SMP
select IRQ_FORCED_THREADING
+ select LRU_TASK_PAGE_AGING
select NEED_PER_CPU_EMBED_FIRST_CHUNK
select NEED_PER_CPU_PAGE_FIRST_CHUNK
select NEED_SG_DMA_LENGTH
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -320,7 +320,7 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -796,7 +796,7 @@ struct mm_struct {
*/
unsigned long ksm_rmap_items;
#endif
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
struct {
/* this mm_struct is on lru_gen_mm_list */
struct list_head list;
@@ -811,7 +811,7 @@ struct mm_struct {
struct mem_cgroup *memcg;
#endif
} lru_gen;
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_TASK_PAGE_AGING */
} __randomize_layout;
/*
@@ -839,7 +839,7 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
}
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
struct lru_gen_mm_list {
/* mm_struct list for page table walkers */
@@ -873,7 +873,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
WRITE_ONCE(mm->lru_gen.bitmap, -1);
}
-#else /* !CONFIG_LRU_GEN */
+#else /* !CONFIG_LRU_TASK_PAGE_AGING */
static inline void lru_gen_add_mm(struct mm_struct *mm)
{
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -428,6 +428,7 @@ struct lru_gen_struct {
bool enabled;
};
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
enum {
MM_LEAF_TOTAL, /* total leaf entries */
MM_LEAF_OLD, /* old leaf entries */
@@ -474,6 +475,7 @@ struct lru_gen_mm_walk {
bool can_swap;
bool force_scan;
};
+#endif
void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
@@ -525,8 +527,14 @@ struct lruvec {
#ifdef CONFIG_LRU_GEN
/* evictable pages divided into generations */
struct lru_gen_struct lrugen;
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
+#else
+ /* for concurrent update of max_seq without holding lru_lock */
+ struct wait_queue_head seq_update_wait;
+ bool seq_update_progress;
+#endif
#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
@@ -1240,7 +1248,7 @@ typedef struct pglist_data {
unsigned long flags;
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -154,7 +154,7 @@ union swap_header {
*/
struct reclaim_state {
unsigned long reclaimed_slab;
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/* per-thread mm walk data */
struct lru_gen_mm_walk *mm_walk;
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2708,7 +2708,7 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
- if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ if (IS_ENABLED(CONFIG_LRU_TASK_PAGE_AGING) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6305,7 +6305,7 @@ static void mem_cgroup_move_task(void)
}
#endif
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
diff --git a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3225,6 +3225,7 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
}
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/******************************************************************************
* mm_struct list
******************************************************************************/
@@ -3586,6 +3587,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
return success;
}
+#endif
/******************************************************************************
* refault feedback loop
@@ -3778,6 +3780,7 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
return folio;
}
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
int old_gen, int new_gen)
{
@@ -4235,7 +4238,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
} while (err == -EAGAIN);
}
-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
+static void *set_mm_walk(struct pglist_data *pgdat)
{
struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
@@ -4266,6 +4269,18 @@ static void clear_mm_walk(void)
if (!current_is_kswapd())
kfree(walk);
}
+#else
+
+static inline void *set_mm_walk(struct pglist_data *pgdat)
+{
+ return NULL;
+}
+
+static inline void clear_mm_walk(void)
+{
+}
+
+#endif
static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
@@ -4399,11 +4414,14 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+#ifndef CONFIG_LRU_TASK_PAGE_AGING
+ lruvec->seq_update_progress = false;
+#endif
spin_unlock_irq(&lruvec->lru_lock);
}
-
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, bool force_scan)
+ int scan_priority, bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -4429,7 +4447,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
goto done;
}
- walk = set_mm_walk(NULL);
+ walk = (struct lru_gen_mm_walk *)set_mm_walk(NULL);
if (!walk) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
@@ -4449,7 +4467,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
} while (mm);
done:
if (!success) {
- if (sc->priority <= DEF_PRIORITY - 2)
+ if (scan_priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
@@ -4465,6 +4483,61 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
return true;
}
+#else
+
+/*
+ * inc_max_seq() can drop lru_lock while it runs, so use seq_update_progress and a
+ * waitqueue to let one caller update max_seq while concurrent callers wait for it.
+ */
+bool __try_to_inc_max_seq(struct lruvec *lruvec,
+ unsigned long max_seq, int scan_priority,
+ bool can_swap, bool force_scan)
+{
+ bool success = false;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+
+ /* see the comment in iterate_mm_list() */
+ if (lruvec->seq_update_progress)
+ success = false;
+ else {
+ spin_lock_irq(&lruvec->lru_lock);
+
+ if (max_seq != lrugen->max_seq)
+ goto done;
+
+ if (lruvec->seq_update_progress)
+ goto done;
+
+ success = true;
+ lruvec->seq_update_progress = true;
+done:
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+ if (!success) {
+ if (scan_priority <= DEF_PRIORITY - 2)
+ wait_event_killable(lruvec->seq_update_wait,
+ max_seq < READ_ONCE(lrugen->max_seq));
+
+ return max_seq < READ_ONCE(lrugen->max_seq);
+ }
+
+ VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+ inc_max_seq(lruvec, can_swap, force_scan);
+ /* either this sees any waiters or they will see updated max_seq */
+ if (wq_has_sleeper(&lruvec->seq_update_wait))
+ wake_up_all(&lruvec->seq_update_wait);
+
+ return success;
+}
+
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ int scan_priority, bool can_swap, bool force_scan)
+{
+ return __try_to_inc_max_seq(lruvec, max_seq, scan_priority, can_swap, force_scan);
+}
+#endif
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
@@ -4554,8 +4627,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned
}
if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
-
+ try_to_inc_max_seq(lruvec, max_seq, sc->priority, swappiness, false);
return true;
}
@@ -4617,6 +4689,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
}
}
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/*
* This function exploits spatial locality when shrink_folio_list() walks the
* rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
@@ -4744,6 +4817,115 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
mem_cgroup_unlock_pages();
}
+#else
+/*
+ * This function exploits spatial locality when shrink_folio_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long addr;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ struct folio *folio = pfn_folio(pvmw->pfn);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+
+ if (spin_is_contended(pvmw->ptl))
+ return;
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+ rcu_read_lock();
+ arch_enter_lazy_mmu_mode();
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i]))
+ continue;
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, true);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ __set_bit(i, bitmap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ rcu_read_unlock();
+
+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ folio_activate(folio);
+ }
+ return;
+ }
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ spin_lock_irq(&lruvec->lru_lock);
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ if (folio_memcg_rcu(folio) != memcg)
+ continue;
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen < 0 || old_gen == new_gen)
+ continue;
+
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_unlock_pages();
+}
+#endif
/******************************************************************************
* the eviction
@@ -5026,7 +5208,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
struct folio *next;
enum vm_event_item item;
struct reclaim_stat stat;
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
struct lru_gen_mm_walk *walk;
+#endif
bool skip_retry = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -5081,9 +5265,11 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
move_folios_to_lru(lruvec, &list);
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
walk = current->reclaim_state->mm_walk;
if (walk && walk->batched)
reset_batch_size(lruvec, walk);
+#endif
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
@@ -5140,8 +5326,9 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
if (current_is_kswapd())
return 0;
- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
+ if (try_to_inc_max_seq(lruvec, max_seq, sc->priority, can_swap, false))
return nr_to_scan;
+
done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}
@@ -5610,6 +5797,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_putc(m, '\n');
}
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
const char *s = " ";
@@ -5626,6 +5814,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_printf(m, " %10lu%c", n, s[i]);
}
seq_putc(m, '\n');
+#endif
}
/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
@@ -5707,7 +5896,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_contr
if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
return -ERANGE;
- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+ try_to_inc_max_seq(lruvec, max_seq, sc->priority, can_swap, force_scan);
return 0;
}
@@ -5898,21 +6087,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
-
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
lruvec->mm_state.seq = MIN_NR_GENS;
init_waitqueue_head(&lruvec->mm_state.wait);
+#else
+ lruvec->seq_update_progress = false;
+ init_waitqueue_head(&lruvec->seq_update_wait);
+#endif
}
#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
INIT_LIST_HEAD(&memcg->mm_list.fifo);
spin_lock_init(&memcg->mm_list.lock);
+#endif
}
void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
- int i;
int nid;
for_each_node(nid) {
@@ -5920,11 +6114,12 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
sizeof(lruvec->lrugen.nr_pages)));
-
- for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
+ for (int i = 0; i < NR_BLOOM_FILTERS; i++) {
bitmap_free(lruvec->mm_state.filters[i]);
lruvec->mm_state.filters[i] = NULL;
}
+#endif
}
}
#endif
Not all architectures support hardware atomic updates of the access bits. On
such architectures we do not use page table walks to classify pages into
generations. Add a kernel config option so that the page table walk code is
not built on those architectures. lru_gen_look_around() is duplicated because
lru_gen_mm_walk is not always available.

The patch also results in some improvement on powerpc because it removes all
the additional code that is not used for page classification.

memcached:

 patch details     Total Ops/sec
 mglru                    160821
 PATCH 2                  164572

mongodb:

 Patch details     Throughput (Ops/sec)
 mglru                           92987
 PATCH 2                         93740

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/Kconfig               |   3 +
 arch/arm64/Kconfig         |   1 +
 arch/x86/Kconfig           |   1 +
 include/linux/memcontrol.h |   2 +-
 include/linux/mm_types.h   |   8 +-
 include/linux/mmzone.h     |  10 +-
 include/linux/swap.h       |   2 +-
 kernel/fork.c              |   2 +-
 mm/memcontrol.c            |   2 +-
 mm/vmscan.c                | 221 ++++++++++++++++++++++++++++++++++---
 10 files changed, 230 insertions(+), 22 deletions(-)
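
As an aside for reviewers (not part of the patch): the !CONFIG_LRU_TASK_PAGE_AGING
path added above boils down to a "one updater, everyone else waits" handshake
around max_seq. A minimal standalone sketch of that pattern follows; the demo_*
names are hypothetical and only <linux/spinlock.h> and <linux/wait.h> are assumed.

#include <linux/spinlock.h>
#include <linux/wait.h>

struct demo_seq {
	spinlock_t lock;
	unsigned long seq;		/* published sequence number */
	bool in_progress;		/* an update is currently being performed */
	struct wait_queue_head wait;	/* losers sleep here */
};

static void demo_seq_init(struct demo_seq *d)
{
	spin_lock_init(&d->lock);
	d->seq = 0;
	d->in_progress = false;
	init_waitqueue_head(&d->wait);
}

/* @seen_seq is the value the caller last observed, like max_seq in the patch */
static bool demo_try_to_inc_seq(struct demo_seq *d, unsigned long seen_seq)
{
	bool winner = false;

	spin_lock_irq(&d->lock);
	/* only one updater at a time, and only for the sequence we observed */
	if (seen_seq == d->seq && !d->in_progress) {
		d->in_progress = true;
		winner = true;
	}
	spin_unlock_irq(&d->lock);

	if (!winner) {
		/* losers sleep until the winner publishes a newer sequence */
		wait_event_killable(d->wait, seen_seq < READ_ONCE(d->seq));
		return seen_seq < READ_ONCE(d->seq);
	}

	spin_lock_irq(&d->lock);
	WRITE_ONCE(d->seq, d->seq + 1);	/* the patch calls inc_max_seq() here */
	d->in_progress = false;
	spin_unlock_irq(&d->lock);

	/* either this sees the sleepers or the sleepers see the new sequence */
	if (wq_has_sleeper(&d->wait))
		wake_up_all(&d->wait);

	return true;
}

The real __try_to_inc_max_seq() additionally has an unlocked fast-path check of
seq_update_progress and only waits when scan_priority <= DEF_PRIORITY - 2, but
the ordering argument is the same: the updater either observes the sleepers on
the waitqueue or the sleepers observe the updated max_seq.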