@@ -135,10 +135,9 @@ struct scan_control {
unsigned int no_demotion:1;
#ifdef CONFIG_LRU_GEN
- /* help make better choices when multiple memcgs are available */
+ /* help kswapd make better choices among multiple memcgs */
unsigned int memcgs_need_aging:1;
- unsigned int memcgs_need_swapping:1;
- unsigned int memcgs_avoid_swapping:1;
+ unsigned long last_reclaimed;
#endif
/* Allocation order */
@@ -4524,22 +4523,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
VM_WARN_ON_ONCE(!current_is_kswapd());
+ sc->last_reclaimed = sc->nr_reclaimed;
+
/*
- * To reduce the chance of going into the aging path or swapping, which
- * can be costly, optimistically skip them unless their corresponding
- * flags were cleared in the eviction path. This improves the overall
- * performance when multiple memcgs are available.
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
*/
if (!sc->memcgs_need_aging) {
sc->memcgs_need_aging = true;
- sc->memcgs_avoid_swapping = !sc->memcgs_need_swapping;
- sc->memcgs_need_swapping = true;
return;
}
- sc->memcgs_need_swapping = true;
- sc->memcgs_avoid_swapping = true;
-
set_mm_walk(pgdat);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -5035,7 +5031,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
sc->nr_reclaimed += reclaimed;
- if (type == LRU_GEN_ANON && need_swapping)
+ if (need_swapping && type == LRU_GEN_ANON)
*need_swapping = true;
return scanned;
@@ -5047,19 +5043,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap, unsigned long reclaimed, bool *need_aging)
+ bool can_swap, bool *need_aging)
{
- int priority;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- if (fatal_signal_pending(current)) {
- sc->nr_reclaimed += MIN_LRU_BATCH;
- return 0;
- }
-
if (mem_cgroup_below_min(memcg) ||
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;
@@ -5068,15 +5058,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
if (!nr_to_scan)
return 0;
- /* adjust priority if memcg is offline or the target is met */
- if (!mem_cgroup_online(memcg))
- priority = 0;
- else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim)
- priority = DEF_PRIORITY;
- else
- priority = sc->priority;
-
- nr_to_scan >>= priority;
+ nr_to_scan >>= mem_cgroup_online(memcg) ? sc->priority : 0;
if (!nr_to_scan)
return 0;
@@ -5084,7 +5066,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
return nr_to_scan;
/* skip the aging path at the default priority */
- if (priority == DEF_PRIORITY)
+ if (sc->priority == DEF_PRIORITY)
goto done;
/* leave the work to lru_gen_age_node() */
@@ -5097,6 +5079,60 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the target isn't
+ * met, and yet the allocation may still succeed, since kswapd may have
+ * caught up. In either case, it's better to stop now, and restart if
+ * necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
@@ -5104,6 +5140,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
bool need_swapping = false;
unsigned long scanned = 0;
unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);
lru_add_drain();
@@ -5123,7 +5160,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
else
swappiness = 0;
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed, &need_aging);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
if (!nr_to_scan)
goto done;
@@ -5135,17 +5172,15 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
if (scanned >= nr_to_scan)
break;
- if (sc->memcgs_avoid_swapping && swappiness < 200 && need_swapping)
+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
break;
cond_resched();
}
/* see the comment in lru_gen_age_node() */
- if (!need_aging)
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
sc->memcgs_need_aging = false;
- if (!need_swapping)
- sc->memcgs_need_swapping = false;
done:
clear_mm_walk();
Long-tailed direct reclaim latency was seen on high-memory (TBs)
machines: MGLRU is better at the 99th percentile but worse at the
99.9th. It turned out the old direct reclaim backoff, which tries to
enforce a minimum fairness among all eligible memcgs, over-swapped by
about (total_mem>>DEF_PRIORITY)-nr_to_reclaim:

	/* adjust priority if memcg is offline or the target is met */
	if (!mem_cgroup_online(memcg))
		priority = 0;
	else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim)
		priority = DEF_PRIORITY;
	else
		priority = sc->priority;

The new backoff, which pulls the plug on swapping once the target is
met, trades some fairness for curtailed latency. Specifically, in
should_abort_scan():

	/* over-swapping can increase allocation latency */
	if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
		return true;

The fundamental problem is that the backoff requires a sophisticated
model and the previous one was oversimplified. The new one may still
be, but at least it can handle a couple more corner cases on top of
the above:

	/* age each memcg once to ensure fairness */
	if (max_seq - seq > 1)
		return true;

and the NR_FREE_PAGES check at the bottom of should_abort_scan().

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 mm/vmscan.c | 105 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 70 insertions(+), 35 deletions(-)
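
(Not part of the patch.) A back-of-the-envelope sketch of the overshoot
estimate above, with hypothetical numbers: a 1 TiB machine, 4 KiB pages,
DEF_PRIORITY of 12 and the default direct reclaim target of
SWAP_CLUSTER_MAX (32) pages.

	/*
	 * Rough userspace estimate of the old backoff's over-swap, using the
	 * formula (total_mem >> DEF_PRIORITY) - nr_to_reclaim from the
	 * changelog. All values are illustrative assumptions.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long page_size = 4096;		/* 4 KiB pages */
		unsigned long long total_pages = (1ULL << 40) / page_size; /* 1 TiB */
		unsigned int def_priority = 12;			/* DEF_PRIORITY */
		unsigned long long nr_to_reclaim = 32;		/* SWAP_CLUSTER_MAX */
		unsigned long long over_swap;

		over_swap = (total_pages >> def_priority) - nr_to_reclaim;

		/* ~65504 pages, i.e. roughly 256 MiB against a 128 KiB target */
		printf("over-swap: %llu pages (~%llu MiB)\n",
		       over_swap, over_swap * page_size >> 20);
		return 0;
	}

On a multi-TiB machine that is hundreds of MiB of swap-out for a 128 KiB
request, which lines up with the 99.9th-percentile latency described
above; with the new backoff, a direct reclaimer stops swapping roughly
at sc->nr_to_reclaim once need_swapping is set.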