@@ -19,6 +19,7 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/mm_inline.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -1720,7 +1721,7 @@ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
if (PageSwapCache(page))
md->swapcache += nr_pages;
- if (PageActive(page) || PageUnevictable(page))
+ if (PageUnevictable(page) || page_is_active(compound_head(page), NULL))
md->active += nr_pages;
if (PageWriteback(page))
@@ -116,6 +116,49 @@ static inline int page_lru_gen(struct page *page)
return ((READ_ONCE(page->flags) & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
+/*
+ * Tell whether @page is active. This function works regardless of whether
+ * multigenerational lru is enabled.
+ *
+ * @page must not be a tail page. When page_lru_gen() is negative the answer
+ * is simply PageActive(). Otherwise the generation number is handed to
+ * lru_gen_is_active(), using @lruvec when the caller supplies one (its
+ * lru_lock must be held) or, when @lruvec is NULL, the lruvec looked up from
+ * the page's memcg and node under rcu_read_lock().
+ */
+static inline bool page_is_active(struct page *page, struct lruvec *lruvec)
+{
+ struct mem_cgroup *memcg;
+ int gen = page_lru_gen(page);
+ bool active = false;
+
+ VM_BUG_ON_PAGE(PageTail(page), page); /* passing a tail page is a caller bug */
+
+ if (gen < 0) /* no generation number stored in page->flags */
+ return PageActive(page);
+
+ if (lruvec) {
+ VM_BUG_ON_PAGE(PageUnevictable(page), page); /* a page with a generation is expected to have neither flag set */
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ lockdep_assert_held(&lruvec->lru_lock);
+
+ return lru_gen_is_active(lruvec, gen);
+ }
+
+ rcu_read_lock(); /* covers page_memcg_rcu() and the use of the lruvec derived from it */
+
+ memcg = page_memcg_rcu(page);
+ lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
+ active = lru_gen_is_active(lruvec, gen);
+
+ rcu_read_unlock();
+
+ return active;
+}
+
+/*
+ * Activate a page from page cache or swap cache after it's mapped.
+ *
+ * Does nothing when multigenerational lru is disabled, when @page already has
+ * PageActive() set, or when @vma has any of VM_LOCKED, VM_SPECIAL or
+ * VM_HUGETLB set; otherwise it calls activate_page() on @page.
+ */
+static inline void lru_gen_activate_page(struct page *page, struct vm_area_struct *vma)
+{
+ if (!lru_gen_enabled() || PageActive(page))
+ return;
+
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_HUGETLB)) /* pages mapped by such vmas are left alone */
+ return;
+
+ activate_page(page);
+}
+
/* Update multigenerational lru sizes in addition to active/inactive lru sizes. */
static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
int old_gen, int new_gen)
@@ -252,6 +295,15 @@ static inline bool lru_gen_enabled(void)
return false;
}
+static inline bool page_is_active(struct page *page, struct lruvec *lruvec)
+{
+ return PageActive(page); /* @lruvec is ignored here; NOTE(review): presumably the variant built without multigenerational lru - confirm */
+}
+
+static inline void lru_gen_activate_page(struct page *page, struct vm_area_struct *vma)
+{ /* intentionally empty: this variant ignores both arguments; NOTE(review): presumably built without multigenerational lru - confirm */
+}
+
static inline bool page_set_lru_gen(struct page *page, struct lruvec *lruvec, bool front)
{
return false;
@@ -292,6 +292,7 @@ enum lruvec_flags {
};
struct lruvec;
+struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
@@ -328,6 +329,7 @@ struct lru_gen {
void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_set_state(bool enable, bool main, bool swap);
+void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw);
#else /* CONFIG_LRU_GEN */
@@ -339,6 +341,10 @@ static inline void lru_gen_set_state(bool enable, bool main, bool swap)
{
}
+static inline void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
+{ /* intentionally empty: no-op placed before the "#endif" of CONFIG_LRU_GEN, so callers need no #ifdef */
+}
+
#endif /* CONFIG_LRU_GEN */
struct lruvec {
@@ -350,8 +350,8 @@ extern void deactivate_page(struct page *page);
extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);
-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma);
+extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+ bool faulting);
/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ lru_cache_add_page_vma(new_page, vma, false);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
@@ -637,7 +637,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
@@ -1199,7 +1199,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ lru_cache_add_page_vma(new_page, vma, true);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
@@ -73,6 +73,7 @@
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
+#include <linux/mm_inline.h>
#include <trace/events/kmem.h>
@@ -845,7 +846,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
page_add_new_anon_rmap(new_page, dst_vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
+ lru_cache_add_page_vma(new_page, dst_vma, false);
rss[mm_counter(new_page)]++;
/* All done, just insert the new page copy in the child */
@@ -2913,7 +2914,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ lru_cache_add_page_vma(new_page, vma, true);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -3436,9 +3437,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
+ lru_gen_activate_page(page, vma);
}
swap_free(entry);
@@ -3582,7 +3584,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -3707,6 +3709,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
page_add_file_rmap(page, true);
+ lru_gen_activate_page(page, vma);
/*
* deposit and withdraw with pmd lock held
*/
@@ -3750,10 +3753,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
+ lru_gen_activate_page(page, vma);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
@@ -3004,7 +3004,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
if (!is_zone_device_page(page))
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, false);
get_page(page);
if (flush) {
@@ -72,6 +72,7 @@
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
#include <asm/tlbflush.h>
@@ -792,6 +793,11 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
}
if (pvmw.pte) {
+ /* multigenerational lru exploits spatial locality */
+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
+ lru_gen_scan_around(&pvmw);
+ referenced++;
+ }
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
@@ -310,7 +310,7 @@ void lru_note_cost_page(struct page *page)
static void __activate_page(struct page *page, struct lruvec *lruvec)
{
- if (!PageActive(page) && !PageUnevictable(page)) {
+ if (!PageUnevictable(page) && !page_is_active(page, lruvec)) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
@@ -341,7 +341,7 @@ static bool need_activate_page_drain(int cpu)
static void activate_page_on_lru(struct page *page)
{
page = compound_head(page);
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ if (PageLRU(page) && !PageUnevictable(page) && !page_is_active(page, NULL)) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
@@ -435,7 +435,7 @@ void mark_page_accessed(struct page *page)
* this list is never rotated or maintained, so marking an
* evictable page accessed has no effect.
*/
- } else if (!PageActive(page)) {
+ } else if (!page_is_active(page, NULL)) {
activate_page(page);
ClearPageReferenced(page);
workingset_activation(page);
@@ -471,15 +471,14 @@ void lru_cache_add(struct page *page)
EXPORT_SYMBOL(lru_cache_add);
/**
- * lru_cache_add_inactive_or_unevictable
+ * lru_cache_add_page_vma
* @page: the page to be added to LRU
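* @vma: vma in which page is mapped for determining reclaimability
+ * @faulting: true when called from the page fault path; with multigenerational lru enabled, an evictable @page is then marked PageActive() before it is added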
*
- * Place @page on the inactive or unevictable LRU list, depending on its
- * evictability.
+ * Place @page on an LRU list, depending on its evictability.
*/
-void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma)
+void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+ bool faulting)
{
bool unevictable;
@@ -496,6 +495,11 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
+
+ /* multigenerational lru uses PageActive() to track page faults */
+ if (lru_gen_enabled() && !unevictable && faulting)
+ SetPageActive(page);
+
lru_cache_add(page);
}
@@ -522,7 +526,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
*/
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
{
- bool active = PageActive(page);
+ bool active = page_is_active(page, lruvec);
int nr_pages = thp_nr_pages(page);
if (PageUnevictable(page))
@@ -562,7 +566,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
- if (PageActive(page) && !PageUnevictable(page)) {
+ if (!PageUnevictable(page) && page_is_active(page, lruvec)) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
@@ -676,7 +680,7 @@ void deactivate_file_page(struct page *page)
*/
void deactivate_page(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ if (PageLRU(page) && !PageUnevictable(page) && page_is_active(page, NULL)) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
@@ -1936,7 +1936,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
page_add_anon_rmap(page, vma, addr, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, false);
}
swap_free(entry);
out:
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
inc_mm_counter(dst_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
+ lru_cache_add_page_vma(page, dst_vma, true);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -1876,7 +1876,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
nr_moved += nr_pages;
- if (PageActive(page))
+ if (page_is_active(page, lruvec))
workingset_age_nonresident(lruvec, nr_pages);
}
@@ -4688,6 +4688,57 @@ static int page_update_lru_gen(struct page *page, int new_gen)
return old_gen;
}
+void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
+{ /*
+ * Walk the PTEs that share a PMD range with pvmw->pte (clamped to the
+ * vma) and, for every young PTE whose page is on the same node and in
+ * the same memcg as pvmw->page, call page_update_lru_gen() with the
+ * generation derived from lruvec->evictable.max_seq. The caller must
+ * hold pvmw->ptl.
+ */
+ pte_t *pte;
+ unsigned long start, end;
+ int old_gen, new_gen;
+ unsigned long flags;
+ struct lruvec *lruvec;
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat = page_pgdat(pvmw->page);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_BUG_ON_VMA(pvmw->address < pvmw->vma->vm_start, pvmw->vma);
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); /* start of the PMD range, but not below the vma start */
+ end = pmd_addr_end(pvmw->address, pvmw->vma->vm_end); /* end of the PMD range, bounded by the vma end */
+ pte = pvmw->pte - ((pvmw->address - start) >> PAGE_SHIFT); /* step back from pvmw->pte to the PTE slot for start */
+
+ memcg = lock_page_memcg(pvmw->page); /* memcg and lruvec of pvmw->page stay locked for the whole scan */
+ lruvec = lock_page_lruvec_irqsave(pvmw->page, &flags);
+
+ new_gen = lru_gen_from_seq(lruvec->evictable.max_seq);
+
+ for (; start != end; pte++, start += PAGE_SIZE) {
+ struct page *page;
+ unsigned long pfn = pte_pfn(*pte); /* only used after the pte_present() check below */
+
+ if (!pte_present(*pte) || !pte_young(*pte) || is_zero_pfn(pfn))
+ continue;
+
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) /* pfn lies outside the span of pvmw->page's node */
+ continue;
+
+ page = compound_head(pte_page(*pte));
+ if (page_to_nid(page) != pgdat->node_id)
+ continue;
+ if (page_memcg_rcu(page) != memcg) /* NOTE(review): presumably skipped because only pvmw->page's lruvec is locked - confirm */
+ continue;
+ /*
+ * We may be holding many locks. So try to finish as fast as
+ * possible and leave the accessed and the dirty bits to page
+ * table walk.
+ */
+ old_gen = page_update_lru_gen(page, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen) /* fix up the size counters only if the page had a generation and it changed */
+ lru_gen_update_size(page, lruvec, old_gen, new_gen);
+ }
+
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ unlock_page_memcg(pvmw->page);
+}
+
struct mm_walk_args {
struct mem_cgroup *memcg;
unsigned long max_seq;
In the page fault path, we want to add pages to the per-zone lists indexed by max_seq as they cannot be evicted without going through the aging first. For anon pages, we rename lru_cache_add_inactive_or_unevictable() to lru_cache_add_page_vma() and add a new parameter, which is set to true in the page fault path, to indicate whether they should be added to the per-zone lists indexed by max_seq. For page/swap cache, since we cannot differentiate the page fault path from the readahead path at the time we call lru_cache_add() in add_to_page_cache_lru() and __read_swap_cache_async(), we have to add a new function lru_gen_activate_page(), which is essentially activate_page(), to move pages to the per-zone lists indexed by max_seq at a later time. Hopefully we will find the pages we want to activate in lru_pvecs.lru_add and simply set PageActive() on them without having to actually move them. In the reclaim path, pages mapped around a referenced PTE may also have been referenced due to spatial locality. We add a new function lru_gen_scan_around() to scan the vicinity of such a PTE. In addition, we add a new function page_is_active() to tell whether a page is active. We cannot use PageActive() because it is only set on active pages while they are not on the multigenerational lru. It is cleared while pages are on the multigenerational lru, in order to spare the aging the trouble of clearing it when an active generation becomes inactive. Internally, page_is_active() compares the generation number of a page with max_seq and max_seq-1, which are the active generations and are protected from eviction. Other generations, which may or may not exist, are inactive.
Signed-off-by: Yu Zhao <yuzhao@google.com> --- fs/proc/task_mmu.c | 3 ++- include/linux/mm_inline.h | 52 ++++++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 6 +++++ include/linux/swap.h | 4 +-- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 14 +++++++---- mm/migrate.c | 2 +- mm/rmap.c | 6 +++++ mm/swap.c | 26 +++++++++++-------- mm/swapfile.c | 2 +- mm/userfaultfd.c | 2 +- mm/vmscan.c | 53 ++++++++++++++++++++++++++++++++++++++- 14 files changed, 150 insertions(+), 26 deletions(-)