@@ -3730,17 +3730,21 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
if (carryover) {
- unsigned long sum;
+ unsigned long refaulted, total;
- sum = atomic_long_read(&lrugen->avg_refaulted[type][tier]) +
- atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- atomic_long_set(&lrugen->avg_refaulted[type][tier], sum / 2);
+ refaulted = atomic_long_read(&lrugen->avg_refaulted[type][tier]) +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- sum = atomic_long_read(&lrugen->avg_total[type][tier]) +
- atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ total = atomic_long_read(&lrugen->avg_total[type][tier]) +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
if (tier)
- sum += lrugen->protected[hist][type][tier - 1];
- atomic_long_set(&lrugen->avg_total[type][tier], sum / 2);
+ total += lrugen->protected[hist][type][tier - 1];
+
+ /* total could be less than refaulted, see lru_gen_refault */
+ total = max(total, refaulted);
+
+ atomic_long_set(&lrugen->avg_refaulted[type][tier], refaulted / 2);
+ atomic_long_set(&lrugen->avg_total[type][tier], total / 2);
}
if (clear) {
@@ -175,6 +175,7 @@
MEM_CGROUP_ID_SHIFT)
#define EVICTION_BITS (BITS_PER_LONG - (EVICTION_SHIFT))
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
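+/*
+ * For MGLRU, the shadow entry also has to carry min_seq and the folio's
+ * LRU refs, so only the remaining low bits are left for the bucketed
+ * eviction timestamp.
+ */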
+#define LRU_GEN_EVICTION_BITS (EVICTION_BITS - LRU_REFS_WIDTH - LRU_GEN_WIDTH)
/*
* Eviction timestamps need to be able to cover the full range of
@@ -185,6 +186,7 @@
* evictions into coarser buckets by shaving off lower timestamp bits.
*/
static unsigned int bucket_order __read_mostly;
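+/* same role as bucket_order, but for the narrower LRU_GEN_EVICTION_BITS field */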
+static unsigned int lru_gen_bucket_order __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
@@ -290,6 +292,34 @@ static inline bool lru_test_refault(struct mem_cgroup *memcg,
(file ? inactive_anon : inactive_file);
}
+/**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @lruvec: the lruvec that was aged
+ * @nr_pages: the number of pages to count
+ *
+ * As in-memory pages are aged, non-resident pages need to be aged as
+ * well, in order for the refault distances later on to be comparable
+ * to the in-memory dimensions. This function allows reclaim and LRU
+ * operations to drive the non-resident aging along in parallel.
+ */
+void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
+{
+ /*
+ * Reclaiming a cgroup means reclaiming all its children in a
+ * round-robin fashion. That means that each cgroup has an LRU
+ * order that is composed of the LRU orders of its child
+ * cgroups; and every page has an LRU position not just in the
+ * cgroup that owns it, but in all of that group's ancestors.
+ *
+ * So when the physical inactive list of a leaf cgroup ages,
+ * the virtual inactive lists of all its parents, including
+ * the root cgroup's, age as well.
+ */
+ do {
+ atomic_long_add(nr_pages, &lruvec->nonresident_age);
+ } while ((lruvec = parent_lruvec(lruvec)));
+}
+
#ifdef CONFIG_LRU_GEN
static void *lru_gen_eviction(struct folio *folio)
@@ -311,10 +341,14 @@ static void *lru_gen_eviction(struct folio *folio)
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
min_seq = READ_ONCE(lrugen->min_seq[type]);
+
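+ /*
+ * Pack min_seq and the LRU refs into the high bits and the bucketed
+ * eviction timestamp into the low LRU_GEN_EVICTION_BITS.
+ */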
token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
+ token <<= LRU_GEN_EVICTION_BITS;
+ token |= lru_eviction(lruvec, LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
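+ /* age the non-resident entries so refault distances stay comparable */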
+ workingset_age_nonresident(lruvec, folio_nr_pages(folio));
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
}
@@ -329,15 +363,17 @@ static bool lru_gen_test_recent(struct lruvec *lruvec, bool file,
unsigned long min_seq;
min_seq = READ_ONCE(lruvec->lrugen.min_seq[file]);
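+ /* strip the eviction timestamp to recover the packed min_seq and refs */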
+ token >>= LRU_GEN_EVICTION_BITS;
- return (token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
+ return (token >> LRU_REFS_WIDTH) ==
+        (min_seq & (EVICTION_MASK >> (LRU_GEN_EVICTION_BITS + LRU_REFS_WIDTH)));
}
static void lru_gen_refault(struct folio *folio, void *shadow)
{
int memcgid;
- bool recent;
+ bool refault;
bool workingset;
unsigned long token;
+ bool recent = false;
int hist, tier, refs;
struct lruvec *lruvec;
struct pglist_data *pgdat;
@@ -345,28 +381,36 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
- rcu_read_lock();
-
unpack_shadow(shadow, &memcgid, &pgdat, &token, &workingset);
lruvec = mem_cgroup_lruvec(mem_cgroup_from_id(memcgid), pgdat);
if (lruvec != folio_lruvec(folio))
- goto unlock;
+ return;
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
-
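+ /*
+ * Check both whether the refault distance is within the workingset
+ * size (refault) and whether the folio was evicted from a generation
+ * that is still being tracked (recent).
+ */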
+ refault = lru_test_refault(lruvec_memcg(lruvec), lruvec, token, type,
+ LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
recent = lru_gen_test_recent(lruvec, type, token);
- if (!recent)
- goto unlock;
+ if (!recent && !refault)
+ return;
lrugen = &lruvec->lrugen;
-
hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
/* see the comment in folio_lru_refs() */
+ token >>= LRU_GEN_EVICTION_BITS;
refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
tier = lru_tier_from_refs(refs);
- atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+ if (refault) {
+ if (refs)
+ folio_set_active(folio);
+ /*
+ * Account the refault one tier higher so the folio is better
+ * protected, making it easier to stay in a stable working set
+ * and to avoid refaulting again.
+ */
+ if (refs != BIT(LRU_REFS_WIDTH))
+ tier = lru_tier_from_refs(refs + 1);
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+ }
/*
* Count the following two cases as stalls:
@@ -375,12 +419,25 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
* 2. For pages accessed multiple times through file descriptors,
* numbers of accesses might have been out of the range.
*/
- if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
- folio_set_workingset(folio);
+ if (refault || lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
+ folio_set_workingset(folio);
+ }
+
+ /*
+ * If recent is false, the generation the folio was evicted from is
+ * already gone, so account the refault directly in the averaged
+ * counters used by the PID controller.
+ */
+ if (recent) {
+ /*
+ * The tier may have been raised above, which can make refaulted
+ * larger than evicted; reset_ctrl_pos() resets and accounts for this.
+ */
+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
+ } else {
+ atomic_long_add(delta, &lrugen->avg_total[type][tier]);
+ atomic_long_add(delta, &lrugen->avg_refaulted[type][tier]);
}
-unlock:
- rcu_read_unlock();
}
#else /* !CONFIG_LRU_GEN */
@@ -402,34 +459,6 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
#endif /* CONFIG_LRU_GEN */
-/**
- * workingset_age_nonresident - age non-resident entries as LRU ages
- * @lruvec: the lruvec that was aged
- * @nr_pages: the number of pages to count
- *
- * As in-memory pages are aged, non-resident pages need to be aged as
- * well, in order for the refault distances later on to be comparable
- * to the in-memory dimensions. This function allows reclaim and LRU
- * operations to drive the non-resident aging along in parallel.
- */
-void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
-{
- /*
- * Reclaiming a cgroup means reclaiming all its children in a
- * round-robin fashion. That means that each cgroup has an LRU
- * order that is composed of the LRU orders of its child
- * cgroups; and every page has an LRU position not just in the
- * cgroup that owns it, but in all of that group's ancestors.
- *
- * So when the physical inactive list of a leaf cgroup ages,
- * the virtual inactive lists of all its parents, including
- * the root cgroup's, age as well.
- */
- do {
- atomic_long_add(nr_pages, &lruvec->nonresident_age);
- } while ((lruvec = parent_lruvec(lruvec)));
-}
-
/**
* workingset_eviction - note the eviction of a folio from memory
* @target_memcg: the cgroup that is causing the reclaim
@@ -529,16 +558,16 @@ void workingset_refault(struct folio *folio, void *shadow)
bool workingset;
long nr;
- if (lru_gen_enabled()) {
- lru_gen_refault(folio, shadow);
- return;
- }
-
/* Flush stats (and potentially sleep) before holding RCU read lock */
mem_cgroup_flush_stats_ratelimited();
rcu_read_lock();
+ if (lru_gen_enabled()) {
+ lru_gen_refault(folio, shadow);
+ goto out;
+ }
+
/*
* The activation decision for this folio is made at the level
* where the eviction occurred, as that is where the LRU order
@@ -785,6 +814,13 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
EVICTION_BITS, max_order, bucket_order);
+#ifdef CONFIG_LRU_GEN
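+ /* as above, but bucket timestamps against the narrower MGLRU field */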
+ if (max_order > LRU_GEN_EVICTION_BITS)
+ lru_gen_bucket_order = max_order - LRU_GEN_EVICTION_BITS;
+ pr_info("workingset: lru_gen_timestamp_bits=%d lru_gen_bucket_order=%u\n",
+ LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
+#endif
+
ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
if (ret)
goto err;