@@ -511,6 +511,8 @@ struct lru_gen_mm_walk {
unsigned long seq;
/* the next address within an mm to scan */
unsigned long next_addr;
+ /* called for each accessed pte/pmd */
+ int (*accessed_cb)(pfn_t pfn);
/* to batch promoted pages */
int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* to batch the mm stats */
@@ -518,6 +520,9 @@ struct lru_gen_mm_walk {
/* total batched items */
int batched;
int swappiness;
+	/* young/total pte counts for the pmd being scanned */
+ int nr_young_pte;
+ int nr_total_pte;
bool force_scan;
};
@@ -476,6 +476,10 @@ extern unsigned long highest_memmap_pfn;
bool folio_isolate_lru(struct folio *folio);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
+void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs);
+void lru_gen_scan_lruvec(struct lruvec *lruvec, unsigned long seq,
+ int (*accessed_cb)(pfn_t), void (*flush_cb)(void));
/*
* in mm/rmap.c:
@@ -57,6 +57,7 @@
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <linux/mmu_notifier.h>
+#include <linux/pfn_t.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -271,7 +272,7 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
}
#endif
-static void set_task_reclaim_state(struct task_struct *task,
+void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
/* Check for an overwrite */
@@ -3023,7 +3024,7 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **ite
VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq);
- if (walk->seq <= mm_state->seq)
+ if (!walk->accessed_cb && walk->seq <= mm_state->seq)
goto done;
if (!mm_state->head)
@@ -3452,16 +3453,14 @@ static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
}
}
-static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
- struct mm_walk *args)
+static int walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
+ struct mm_walk *args, bool *suitable)
{
- int i;
+ int i, err = 0;
bool dirty;
pte_t *pte;
spinlock_t *ptl;
unsigned long addr;
- int total = 0;
- int young = 0;
struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
@@ -3471,17 +3470,21 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
pmd_t pmdval;
pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
- if (!pte)
- return false;
+ if (!pte) {
+ *suitable = false;
+ return 0;
+ }
if (!spin_trylock(ptl)) {
pte_unmap(pte);
- return true;
+ *suitable = true;
+ return 0;
}
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
pte_unmap_unlock(pte, ptl);
- return false;
+ *suitable = false;
+ return 0;
}
arch_enter_lazy_mmu_mode();
@@ -3491,7 +3494,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct folio *folio;
pte_t ptent = ptep_get(pte + i);
- total++;
+ walk->nr_total_pte++;
walk->mm_stats[MM_LEAF_TOTAL]++;
pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
@@ -3515,23 +3518,34 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
if (pte_dirty(ptent))
dirty = true;
- young++;
+ walk->nr_young_pte++;
walk->mm_stats[MM_LEAF_YOUNG]++;
+
+ if (!walk->accessed_cb)
+ continue;
+
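+		/* a nonzero return stops the walk; walk->next_addr is where to resume */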
+ err = walk->accessed_cb(pfn_to_pfn_t(pfn));
+ if (err) {
+ walk->next_addr = addr + PAGE_SIZE;
+ break;
+ }
}
walk_update_folio(walk, last, gen, dirty);
last = NULL;
- if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
+ if (!err && i < PTRS_PER_PTE &&
+ get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte, ptl);
- return suitable_to_scan(total, young);
+ *suitable = suitable_to_scan(walk->nr_total_pte, walk->nr_young_pte);
+ return err;
}
-static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
+static int walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
int i;
@@ -3544,6 +3558,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
int gen = lru_gen_from_seq(max_seq);
+ int err = 0;
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3551,13 +3566,13 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
if (*first == -1) {
*first = addr;
bitmap_zero(bitmap, MIN_LRU_BATCH);
- return;
+ return 0;
}
i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first);
if (i && i <= MIN_LRU_BATCH) {
__set_bit(i - 1, bitmap);
- return;
+ return 0;
}
pmd = pmd_offset(pud, *first);
@@ -3607,6 +3622,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
dirty = true;
walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (!walk->accessed_cb)
+ goto next;
+
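+		/* on a nonzero return, stop and resume from the next batched pmd */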
+ err = walk->accessed_cb(pfn_to_pfn_t(pfn));
+ if (err) {
+ i = find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
+
+ walk->next_addr = (*first & PMD_MASK) + i * PMD_SIZE;
+ break;
+ }
next:
i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
} while (i <= MIN_LRU_BATCH);
@@ -3617,9 +3642,10 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
spin_unlock(ptl);
done:
*first = -1;
+ return err;
}
-static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
+static int walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
struct mm_walk *args)
{
int i;
@@ -3631,6 +3657,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
unsigned long first = -1;
struct lru_gen_mm_walk *walk = args->private;
struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
+ int err = 0;
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3644,6 +3671,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
/* walk_pte_range() may call get_next_vma() */
vma = args->vma;
for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
+ bool suitable;
pmd_t val = pmdp_get_lockless(pmd + i);
next = pmd_addr_end(addr, end);
@@ -3660,7 +3688,10 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
walk->mm_stats[MM_LEAF_TOTAL]++;
if (pfn != -1)
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+ err = walk_pmd_range_locked(pud, addr, vma, args,
+ bitmap, &first);
+ if (err)
+ return err;
continue;
}
@@ -3669,33 +3700,50 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
if (!pmd_young(val))
continue;
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+ err = walk_pmd_range_locked(pud, addr, vma, args,
+ bitmap, &first);
+ if (err)
+ return err;
}
if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i))
continue;
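+		/*
+		 * The callback may stop the pte walk; if so, resume from the
+		 * start of any pending pmd batch, otherwise from walk->next_addr.
+		 */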
+ err = walk_pte_range(&val, addr, next, args, &suitable);
+ if (err && walk->next_addr < next && first == -1)
+ return err;
+
+ walk->nr_total_pte = 0;
+ walk->nr_young_pte = 0;
+
walk->mm_stats[MM_NONLEAF_FOUND]++;
- if (!walk_pte_range(&val, addr, next, args))
- continue;
+ if (!suitable)
+ goto next;
walk->mm_stats[MM_NONLEAF_ADDED]++;
/* carry over to the next generation */
update_bloom_filter(mm_state, walk->seq + 1, pmd + i);
+next:
+ if (err) {
+ walk->next_addr = first;
+ return err;
+ }
}
- walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
+ err = walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
- if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
+ if (!err && i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
goto restart;
+
+ return err;
}
static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
struct mm_walk *args)
{
- int i;
+ int i, err;
pud_t *pud;
unsigned long addr;
unsigned long next;
@@ -3713,7 +3761,9 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
continue;
- walk_pmd_range(&val, addr, next, args);
+ err = walk_pmd_range(&val, addr, next, args);
+ if (err)
+ return err;
if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
end = (addr | ~PUD_MASK) + 1;
@@ -3734,40 +3784,48 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
return -EAGAIN;
}
-static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
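+/*
+ * Walk one mm from walk->next_addr; a return value of -EAGAIN means the
+ * walk was interrupted and should be resumed from walk->next_addr.
+ */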
+static int try_walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
{
+ int err;
static const struct mm_walk_ops mm_walk_ops = {
.test_walk = should_skip_vma,
.p4d_entry = walk_pud_range,
.walk_lock = PGWALK_RDLOCK,
};
- int err;
struct lruvec *lruvec = walk->lruvec;
- walk->next_addr = FIRST_USER_ADDRESS;
+ DEFINE_MAX_SEQ(lruvec);
- do {
- DEFINE_MAX_SEQ(lruvec);
+ err = -EBUSY;
- err = -EBUSY;
+ /* another thread might have called inc_max_seq() */
+ if (walk->seq != max_seq)
+ return err;
- /* another thread might have called inc_max_seq() */
- if (walk->seq != max_seq)
- break;
+ /* the caller might be holding the lock for write */
+ if (mmap_read_trylock(mm)) {
+ err = walk_page_range(mm, walk->next_addr, ULONG_MAX,
+ &mm_walk_ops, walk);
- /* the caller might be holding the lock for write */
- if (mmap_read_trylock(mm)) {
- err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
+ mmap_read_unlock(mm);
+ }
- mmap_read_unlock(mm);
- }
+ if (walk->batched) {
+ spin_lock_irq(&lruvec->lru_lock);
+ reset_batch_size(walk);
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
- if (walk->batched) {
- spin_lock_irq(&lruvec->lru_lock);
- reset_batch_size(walk);
- spin_unlock_irq(&lruvec->lru_lock);
- }
+ return err;
+}
+
+static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+	int err;
+
+ walk->next_addr = FIRST_USER_ADDRESS;
+ do {
+ err = try_walk_mm(mm, walk);
cond_resched();
} while (err == -EAGAIN);
}
@@ -3964,6 +4022,33 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness
return success;
}
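+/*
+ * Scan the page tables of every mm attached to @lruvec at generation @seq,
+ * calling @accessed_cb for each young pte/pmd found. @flush_cb runs after
+ * each walk attempt so the caller can drain whatever @accessed_cb batched.
+ * The caller must provide a walk buffer via current->reclaim_state->mm_walk.
+ */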
+void lru_gen_scan_lruvec(struct lruvec *lruvec, unsigned long seq,
+ int (*accessed_cb)(pfn_t), void (*flush_cb)(void))
+{
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+ struct mm_struct *mm = NULL;
+
+ walk->lruvec = lruvec;
+ walk->seq = seq;
+ walk->accessed_cb = accessed_cb;
+ walk->swappiness = MAX_SWAPPINESS;
+
+ do {
+ int err = -EBUSY;
+
+ iterate_mm_list(walk, &mm);
+ if (!mm)
+ break;
+
+ walk->next_addr = FIRST_USER_ADDRESS;
+ do {
+ err = try_walk_mm(mm, walk);
+ cond_resched();
+ flush_cb();
+ } while (err == -EAGAIN);
+ } while (mm);
+}
+
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
int swappiness, bool force_scan)
{