@@ -103,6 +103,12 @@ static inline int lru_gen_from_seq(unsigned long seq)
return seq % MAX_NR_GENS;
}
+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
+static inline int lru_tier_from_usage(int usage)
+{
+ return order_base_2(usage + 1);
+}
+
/* Return a proper index regardless whether we keep a full history of stats. */
static inline int hist_from_seq_or_gen(int seq_or_gen)
{
@@ -245,6 +251,36 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
return true;
}
+/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */
+static inline int page_tier_usage(struct page *page)
+{
+ unsigned long flags = READ_ONCE(page->flags);
+
+ return flags & BIT(PG_workingset) ?
+ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0;
+}
+
+/* Increment the usage counter after a page is accessed via file descriptors. */
+static inline void page_inc_usage(struct page *page)
+{
+ unsigned long usage;
+ unsigned long old_flags, new_flags;
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+
+ if (!(old_flags & BIT(PG_workingset))) {
+ new_flags = old_flags | BIT(PG_workingset);
+ continue;
+ }
+
+ usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF);
+
+ new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
#else /* CONFIG_LRU_GEN */
static inline bool lru_gen_enabled(void)
@@ -262,6 +298,10 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
return false;
}
+static inline void page_inc_usage(struct page *page)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static __always_inline void add_page_to_lru_list(struct page *page,
@@ -365,8 +365,8 @@ extern void deactivate_page(struct page *page);
extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);
-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma);
+extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+ bool faulting);
/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ lru_cache_add_page_vma(new_page, vma, false);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
@@ -636,7 +636,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
@@ -1198,7 +1198,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ lru_cache_add_page_vma(new_page, vma, true);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
@@ -839,7 +839,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
page_add_new_anon_rmap(new_page, dst_vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
+ lru_cache_add_page_vma(new_page, dst_vma, false);
rss[mm_counter(new_page)]++;
/* All done, just insert the new page copy in the child */
@@ -2950,7 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ lru_cache_add_page_vma(new_page, vma, true);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -3479,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
}
@@ -3625,7 +3625,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -3793,7 +3793,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -2967,7 +2967,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
if (!is_zone_device_page(page))
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, false);
get_page(page);
if (flush) {
@@ -433,6 +433,8 @@ void mark_page_accessed(struct page *page)
* this list is never rotated or maintained, so marking an
* evictable page accessed has no effect.
*/
+ } else if (lru_gen_enabled()) {
+ page_inc_usage(page);
} else if (!PageActive(page)) {
/*
* If the page is on the LRU, queue it for activation via
@@ -478,15 +480,14 @@ void lru_cache_add(struct page *page)
EXPORT_SYMBOL(lru_cache_add);
/**
- * lru_cache_add_inactive_or_unevictable
+ * lru_cache_add_page_vma
* @page: the page to be added to LRU
* @vma: vma in which page is mapped for determining reclaimability
*
- * Place @page on the inactive or unevictable LRU list, depending on its
- * evictability.
+ * Place @page on an LRU list, depending on its evictability.
*/
-void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma)
+void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+ bool faulting)
{
bool unevictable;
@@ -503,6 +504,11 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
+
+ /* tell the multigenerational lru that the page is being faulted in */
+ if (lru_gen_enabled() && !unevictable && faulting)
+ SetPageActive(page);
+
lru_cache_add(page);
}
@@ -529,7 +535,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
*/
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
{
- bool active = PageActive(page);
+ bool active = PageActive(page) || lru_gen_enabled();
int nr_pages = thp_nr_pages(page);
if (PageUnevictable(page))
@@ -569,7 +575,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
- if (PageActive(page) && !PageUnevictable(page)) {
+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
@@ -684,7 +690,7 @@ void deactivate_file_page(struct page *page)
*/
void deactivate_page(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
@@ -1936,7 +1936,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
page_add_anon_rmap(page, vma, addr, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, false);
}
swap_free(entry);
out:
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
inc_mm_counter(dst_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
+ lru_cache_add_page_vma(page, dst_vma, true);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
- mem_cgroup_swapout(page, swap);
+
+ /* get a shadow entry before page_memcg() is cleared */
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg);
+ mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page, swap, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
@@ -2780,6 +2782,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
}
+/******************************************************************************
+ * refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop modeled after the PID controller. Currently supports the
+ * proportional (P) and the integral (I) terms; the derivative (D) term can be
+ * added if necessary. The setpoint (SP) is the desired position; the process
+ * variable (PV) is the measured position. The error is the difference between
+ * the SP and the PV. A positive error results in a positive control output
+ * correction, which, in our case, is to allow eviction.
+ *
+ * The P term is the current refault rate refaulted/(evicted+activated), which
+ * has a weight of 1. The I term is the arithmetic mean of the last N refault
+ * rates, weighted by geometric series 1/2, 1/4, ..., 1/(1<<N).
+ *
+ * Our goal is to make sure upper tiers have similar refault rates as the base
+ * tier. That is we try to be fair to all tiers by maintaining similar refault
+ * rates across them.
+ */
+struct controller_pos {
+ unsigned long refaulted;
+ unsigned long total;
+ int gain;
+};
+
+static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec,
+ int type, int tier, int gain)
+{
+ struct lrugen *lrugen = &lruvec->evictable;
+ int hist = hist_from_seq_or_gen(lrugen->min_seq[type]);
+
+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ pos->total = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ pos->total += lrugen->activated[hist][type][tier - 1];
+ pos->gain = gain;
+}
+
+static void reset_controller_pos(struct lruvec *lruvec, int gen, int type)
+{
+ int tier;
+ int hist = hist_from_seq_or_gen(gen);
+ struct lrugen *lrugen = &lruvec->evictable;
+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (!carryover && NR_STAT_GENS == 1)
+ return;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ if (carryover) {
+ unsigned long sum;
+
+ sum = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+ sum = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ sum += lrugen->activated[hist][type][tier - 1];
+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+
+ if (NR_STAT_GENS > 1)
+ continue;
+ }
+
+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+ if (tier)
+ WRITE_ONCE(lrugen->activated[hist][type][tier - 1], 0);
+ }
+}
+
+static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv)
+{
+ /*
+ * Allow eviction if the PV has a limited number of refaulted pages or a
+ * lower refault rate than the SP.
+ */
+ return pv->refaulted < SWAP_CLUSTER_MAX ||
+ pv->refaulted * max(sp->total, 1UL) * sp->gain <=
+ sp->refaulted * max(pv->total, 1UL) * pv->gain;
+}
+
/******************************************************************************
* state change
******************************************************************************/
@@ -201,6 +201,110 @@ static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_da
return val >> MEM_CGROUP_ID_SHIFT;
}
+#ifdef CONFIG_LRU_GEN
+
+#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT
+#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations"
+#endif
+
+static void page_set_usage(struct page *page, int usage)
+{
+ unsigned long old_flags, new_flags;
+
+ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH));
+
+ if (!usage)
+ return;
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+ new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS |
+ ((usage - 1UL) << LRU_USAGE_PGOFF);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
+/* Return a token to be stored in the shadow entry of a page being evicted. */
+static void *lru_gen_eviction(struct page *page)
+{
+ int hist, tier;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ int type = page_is_file_lru(page);
+ int usage = page_tier_usage(page);
+ struct mem_cgroup *memcg = page_memcg(page);
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ token = (min_seq << LRU_USAGE_SHIFT) | usage;
+
+ hist = hist_from_seq_or_gen(min_seq);
+ tier = lru_tier_from_usage(usage);
+ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]);
+
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token);
+}
+
+/* Account a refaulted page based on the token stored in its shadow entry. */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+ int hist, tier, usage;
+ int memcg_id;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ int type = page_is_file_lru(page);
+
+ token = unpack_shadow(shadow, &memcg_id, &pgdat);
+ if (page_pgdat(page) != pgdat)
+ return;
+
+ rcu_read_lock();
+ memcg = page_memcg_rcu(page);
+ if (mem_cgroup_id(memcg) != memcg_id)
+ goto unlock;
+
+ usage = token & (BIT(LRU_USAGE_SHIFT) - 1);
+ token >>= LRU_USAGE_SHIFT;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT)))
+ goto unlock;
+
+ page_set_usage(page, usage);
+
+ hist = hist_from_seq_or_gen(min_seq);
+ tier = lru_tier_from_usage(usage);
+ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type);
+ if (tier)
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type);
+unlock:
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct page *page)
+{
+ return NULL;
+}
+
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
@@ -249,6 +353,9 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (lru_gen_enabled())
+ return lru_gen_eviction(page);
+
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
@@ -283,6 +390,11 @@ void workingset_refault(struct page *page, void *shadow)
bool workingset;
int memcgid;
+ if (lru_gen_enabled()) {
+ lru_gen_refault(page, shadow);
+ return;
+ }
+
eviction = unpack_shadow(shadow, &memcgid, &pgdat);
rcu_read_lock();