diff mbox series

[v10,08/14] mm: multi-gen LRU: support page table walks

Message ID 20220407031525.2368067-9-yuzhao@google.com (mailing list archive)
State New, archived
Headers show
Series Multi-Gen LRU Framework | expand

Commit Message

Yu Zhao April 7, 2022, 3:15 a.m. UTC
To further exploit spatial locality, the aging prefers to walk page
tables to search for young PTEs and promote hot pages. A kill switch
will be added in the next patch to disable this behavior. When
disabled, the aging relies on the rmap only.

NB: this behavior has nothing in common with the page table scanning in
the 2.4 kernel [1], which searches page tables for old PTEs, adds cold
pages to swapcache and unmaps them.

To avoid confusion, the term "iteration" specifically means the
traversal of an entire mm_struct list; the term "walk" will be applied
to page tables and the rmap, as usual.

An mm_struct list is maintained for each memcg, and an mm_struct
follows its owner task to the new memcg when this task is migrated.
Given an lruvec, the aging iterates lruvec_memcg()->mm_list and calls
walk_page_range() with each mm_struct on this list to promote hot
pages before it increments max_seq.

When multiple page table walkers iterate the same list, each of them
gets a unique mm_struct; therefore they can run concurrently. Page
table walkers ignore any misplaced pages, e.g., if an mm_struct was
migrated, pages it left in the previous memcg will not be promoted
when its current memcg is under reclaim. Similarly, page table walkers
will not promote pages from nodes other than the one under reclaim.

This patch uses the following optimizations when walking page tables:
1. It tracks the usage of mm_struct's between context switches so that
   page table walkers can skip processes that have been sleeping since
   the last iteration.
2. It uses generational Bloom filters to record populated branches so
   that page table walkers can reduce their search space based on the
   query results, e.g., to skip page tables containing mostly holes or
   misplaced pages.
3. It takes advantage of the accessed bit in non-leaf PMD entries when
   CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
4. It does not zigzag between a PGD table and the same PMD table
   spanning multiple VMAs. IOW, it finishes all the VMAs within the
   range of the same PMD table before it returns to a PGD table. This
   improves the cache performance for workloads that have large
   numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.

Server benchmark results:
  Single workload:
    fio (buffered I/O): no change

  Single workload:
    memcached (anon): +[8, 10]%
                Ops/sec      KB/sec
      patch1-7: 1193918.93   46438.15
      patch1-8: 1301954.44   50640.27

  Configurations:
    no change

Client benchmark results:
  kswapd profiles:
    patch1-7
      45.90%  lzo1x_1_do_compress (real work)
       9.14%  page_vma_mapped_walk
       6.81%  _raw_spin_unlock_irq
       2.80%  ptep_clear_flush
       2.34%  __zram_bvec_write
       2.29%  do_raw_spin_lock
       1.84%  lru_gen_look_around
       1.78%  memmove
       1.74%  obj_malloc
       1.50%  free_unref_page_list

    patch1-8
      46.96%  lzo1x_1_do_compress (real work)
       7.55%  page_vma_mapped_walk
       5.89%  _raw_spin_unlock_irq
       3.33%  walk_pte_range
       2.65%  ptep_clear_flush
       2.23%  __zram_bvec_write
       2.08%  do_raw_spin_lock
       1.83%  memmove
       1.65%  obj_malloc
       1.47%  free_unref_page_list

  Configurations:
    no change

[1] https://lwn.net/Articles/23732/
[2] https://source.android.com/devices/tech/debug/scudo

Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
---
 fs/exec.c                  |   2 +
 include/linux/memcontrol.h |   5 +
 include/linux/mm_types.h   |  78 +++
 include/linux/mmzone.h     |  59 +++
 include/linux/swap.h       |   4 +
 kernel/exit.c              |   1 +
 kernel/fork.c              |   9 +
 kernel/sched/core.c        |   1 +
 mm/memcontrol.c            |  24 +
 mm/vmscan.c                | 975 ++++++++++++++++++++++++++++++++++++-
 10 files changed, 1144 insertions(+), 14 deletions(-)

Comments

Andrew Morton April 12, 2022, 2:16 a.m. UTC | #1
On Wed,  6 Apr 2022 21:15:20 -0600 Yu Zhao <yuzhao@google.com> wrote:

>
> ...
>
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6083,6 +6083,29 @@ static void mem_cgroup_move_task(void)
>  }
>  #endif
>  
> +#ifdef CONFIG_LRU_GEN
> +static void mem_cgroup_attach(struct cgroup_taskset *tset)
> +{
> +	struct cgroup_subsys_state *css;
> +	struct task_struct *task = NULL;
> +
> +	cgroup_taskset_for_each_leader(task, css, tset)
> +		break;

Does this actually do anything?

> +	if (!task)
> +		return;
> +
> +	task_lock(task);
> +	if (task->mm && task->mm->owner == task)
> +		lru_gen_migrate_mm(task->mm);
> +	task_unlock(task);
> +}
>  
> ...
>
> +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
> +			      int old_gen, int new_gen)
> +{
> +	int type = folio_is_file_lru(folio);
> +	int zone = folio_zonenum(folio);
> +	int delta = folio_nr_pages(folio);
> +
> +	VM_BUG_ON(old_gen >= MAX_NR_GENS);
> +	VM_BUG_ON(new_gen >= MAX_NR_GENS);

General rule: don't add new BUG_ONs, because they crash the kernel.
It's better to use WARN_ON or WARN_ON_ONCE then try to figure out a way
to keep the kernel limping along.  At least so the poor user can gather logs.

> +	walk->batched++;
> +
> +	walk->nr_pages[old_gen][type][zone] -= delta;
> +	walk->nr_pages[new_gen][type][zone] += delta;
> +}
> +
Peter Zijlstra April 12, 2022, 7:10 a.m. UTC | #2
On Mon, Apr 11, 2022 at 07:16:21PM -0700, Andrew Morton wrote:
> On Wed,  6 Apr 2022 21:15:20 -0600 Yu Zhao <yuzhao@google.com> wrote:

> > +#ifdef CONFIG_LRU_GEN
> > +static void mem_cgroup_attach(struct cgroup_taskset *tset)
> > +{
> > +	struct cgroup_subsys_state *css;
> > +	struct task_struct *task = NULL;
> > +
> > +	cgroup_taskset_for_each_leader(task, css, tset)
> > +		break;
> 
> Does this actually do anything?

Yeah, it finds the first leader if there is any, but this is indeed
quite terrible coding style.
Yu Zhao April 15, 2022, 1:14 a.m. UTC | #3
On Mon, Apr 11, 2022 at 8:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Wed,  6 Apr 2022 21:15:20 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
> > +                           int old_gen, int new_gen)
> > +{
> > +     int type = folio_is_file_lru(folio);
> > +     int zone = folio_zonenum(folio);
> > +     int delta = folio_nr_pages(folio);
> > +
> > +     VM_BUG_ON(old_gen >= MAX_NR_GENS);
> > +     VM_BUG_ON(new_gen >= MAX_NR_GENS);
>
> General rule: don't add new BUG_ONs, because they crash the kenrel.
> It's better to use WARN_ON or WARN_ON_ONCE then try to figure out a way
> to keep the kernel limping along.  At least so the poor user can gather logs.

These are VM_BUG_ONs, which are BUILD_BUG_ONs except for (mostly MM) developers.
Andrew Morton April 15, 2022, 1:56 a.m. UTC | #4
On Thu, 14 Apr 2022 19:14:54 -0600 Yu Zhao <yuzhao@google.com> wrote:

> On Mon, Apr 11, 2022 at 8:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Wed,  6 Apr 2022 21:15:20 -0600 Yu Zhao <yuzhao@google.com> wrote:
> >
> > > +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
> > > +                           int old_gen, int new_gen)
> > > +{
> > > +     int type = folio_is_file_lru(folio);
> > > +     int zone = folio_zonenum(folio);
> > > +     int delta = folio_nr_pages(folio);
> > > +
> > > +     VM_BUG_ON(old_gen >= MAX_NR_GENS);
> > > +     VM_BUG_ON(new_gen >= MAX_NR_GENS);
> >
> > General rule: don't add new BUG_ONs, because they crash the kenrel.
> > It's better to use WARN_ON or WARN_ON_ONCE then try to figure out a way
> > to keep the kernel limping along.  At least so the poor user can gather logs.
> 
> These are VM_BUG_ONs, which are BUILD_BUG_ONs except for (mostly MM) developers.

I'm told that many production builds enable runtime VM_BUG_ONning.
Yu Zhao April 15, 2022, 5:30 a.m. UTC | #5
On Tue, Apr 12, 2022 at 1:10 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Apr 11, 2022 at 07:16:21PM -0700, Andrew Morton wrote:
> > On Wed,  6 Apr 2022 21:15:20 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > > +#ifdef CONFIG_LRU_GEN
> > > +static void mem_cgroup_attach(struct cgroup_taskset *tset)
> > > +{
> > > +   struct cgroup_subsys_state *css;
> > > +   struct task_struct *task = NULL;
> > > +
> > > +   cgroup_taskset_for_each_leader(task, css, tset)
> > > +           break;
> >
> > Does this actually do anything?
>
> Yeah, it finds the first leader if there is any, but this is indeed
> quite terrible coding style.

I've added a one liner comment "find the first leader if there is
any", for now. I'm open to other suggestions.
Yu Zhao April 15, 2022, 6:25 a.m. UTC | #6
On Thu, Apr 14, 2022 at 7:57 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Thu, 14 Apr 2022 19:14:54 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > On Mon, Apr 11, 2022 at 8:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > >
> > > On Wed,  6 Apr 2022 21:15:20 -0600 Yu Zhao <yuzhao@google.com> wrote:
> > >
> > > > +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
> > > > +                           int old_gen, int new_gen)
> > > > +{
> > > > +     int type = folio_is_file_lru(folio);
> > > > +     int zone = folio_zonenum(folio);
> > > > +     int delta = folio_nr_pages(folio);
> > > > +
> > > > +     VM_BUG_ON(old_gen >= MAX_NR_GENS);
> > > > +     VM_BUG_ON(new_gen >= MAX_NR_GENS);
> > >
> > > General rule: don't add new BUG_ONs, because they crash the kenrel.
> > > It's better to use WARN_ON or WARN_ON_ONCE then try to figure out a way
> > > to keep the kernel limping along.  At least so the poor user can gather logs.
> >
> > These are VM_BUG_ONs, which are BUILD_BUG_ONs except for (mostly MM) developers.
>
> I'm told that many production builds enable runtime VM_BUG_ONning.

Nobody wants to debug VM in production. Some distros that offer both
the latest/LTS kernels do enable CONFIG_DEBUG_VM in the former so the
latter can have better test coverage when it becomes available. Do
people use the former in production? Absolutely, otherwise we won't
have enough test coverage. Are we supposed to avoid CONFIG_DEBUG_VM? I
don't think so, because it defeats the purpose of those distros
enabling it in the first place.

The bottom line is that none of RHEL 8.5, SLES 15, Debian 11 enables
CONFIG_DEBUG_VM.
Andrew Morton April 15, 2022, 7:15 p.m. UTC | #7
On Fri, 15 Apr 2022 00:25:45 -0600 Yu Zhao <yuzhao@google.com> wrote:

> On Thu, Apr 14, 2022 at 7:57 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Thu, 14 Apr 2022 19:14:54 -0600 Yu Zhao <yuzhao@google.com> wrote:
> >
> > > On Mon, Apr 11, 2022 at 8:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > > >
> > > > On Wed,  6 Apr 2022 21:15:20 -0600 Yu Zhao <yuzhao@google.com> wrote:
> > > >
> > > > > +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
> > > > > +                           int old_gen, int new_gen)
> > > > > +{
> > > > > +     int type = folio_is_file_lru(folio);
> > > > > +     int zone = folio_zonenum(folio);
> > > > > +     int delta = folio_nr_pages(folio);
> > > > > +
> > > > > +     VM_BUG_ON(old_gen >= MAX_NR_GENS);
> > > > > +     VM_BUG_ON(new_gen >= MAX_NR_GENS);
> > > >
> > > > General rule: don't add new BUG_ONs, because they crash the kenrel.
> > > > It's better to use WARN_ON or WARN_ON_ONCE then try to figure out a way
> > > > to keep the kernel limping along.  At least so the poor user can gather logs.
> > >
> > > These are VM_BUG_ONs, which are BUILD_BUG_ONs except for (mostly MM) developers.
> >
> > I'm told that many production builds enable runtime VM_BUG_ONning.
> 
> Nobody wants to debug VM in production. Some distros that offer both
> the latest/LTS kernels do enable CONFIG_DEBUG_VM in the former so the
> latter can have better test coverage when it becomes available. Do
> people use the former in production? Absolutely, otherwise we won't
> have enough test coverage. Are we supposed to avoid CONFIG_DEBUG_VM? I
> don't think so, because it defeats the purpose of those distros
> enabling it in the first place.
> 
> The bottomline is that none of RHEL 8.5, SLES 15, Debian 11 enables
> CONFIG_DEBUG_VM.

I grabbed
https://kojipkgs.fedoraproject.org//packages/kernel/5.18.0/0.rc2.23.fc37/src/kernel-5.18.0-0.rc2.23.fc37.src.rpm
and 

hp2:/home/akpm/yy> grep "DEBUG_VM[= ]" *.config 
kernel-aarch64-debug-fedora.config:CONFIG_DEBUG_VM=y
kernel-aarch64-debug-rhel.config:# CONFIG_DEBUG_VM is not set
kernel-aarch64-fedora.config:CONFIG_DEBUG_VM=y
kernel-aarch64-rhel.config:# CONFIG_DEBUG_VM is not set
kernel-armv7hl-debug-fedora.config:CONFIG_DEBUG_VM=y
kernel-armv7hl-fedora.config:CONFIG_DEBUG_VM=y
kernel-armv7hl-lpae-debug-fedora.config:CONFIG_DEBUG_VM=y
kernel-armv7hl-lpae-fedora.config:CONFIG_DEBUG_VM=y
kernel-ppc64le-debug-fedora.config:CONFIG_DEBUG_VM=y
kernel-ppc64le-debug-rhel.config:CONFIG_DEBUG_VM=y
kernel-ppc64le-fedora.config:CONFIG_DEBUG_VM=y
kernel-ppc64le-rhel.config:# CONFIG_DEBUG_VM is not set
kernel-s390x-debug-fedora.config:CONFIG_DEBUG_VM=y
kernel-s390x-debug-rhel.config:CONFIG_DEBUG_VM=y
kernel-s390x-fedora.config:CONFIG_DEBUG_VM=y
kernel-s390x-rhel.config:# CONFIG_DEBUG_VM is not set
kernel-s390x-zfcpdump-rhel.config:# CONFIG_DEBUG_VM is not set
kernel-x86_64-debug-fedora.config:CONFIG_DEBUG_VM=y
kernel-x86_64-debug-rhel.config:CONFIG_DEBUG_VM=y
kernel-x86_64-fedora.config:CONFIG_DEBUG_VM=y
kernel-x86_64-rhel.config:# CONFIG_DEBUG_VM is not set
Yu Zhao April 15, 2022, 8:11 p.m. UTC | #8
On Fri, Apr 15, 2022 at 1:15 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Fri, 15 Apr 2022 00:25:45 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > On Thu, Apr 14, 2022 at 7:57 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > >
> > > On Thu, 14 Apr 2022 19:14:54 -0600 Yu Zhao <yuzhao@google.com> wrote:
> > >
> > > > On Mon, Apr 11, 2022 at 8:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > > > >
> > > > > On Wed,  6 Apr 2022 21:15:20 -0600 Yu Zhao <yuzhao@google.com> wrote:
> > > > >
> > > > > > +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
> > > > > > +                           int old_gen, int new_gen)
> > > > > > +{
> > > > > > +     int type = folio_is_file_lru(folio);
> > > > > > +     int zone = folio_zonenum(folio);
> > > > > > +     int delta = folio_nr_pages(folio);
> > > > > > +
> > > > > > +     VM_BUG_ON(old_gen >= MAX_NR_GENS);
> > > > > > +     VM_BUG_ON(new_gen >= MAX_NR_GENS);
> > > > >
> > > > > General rule: don't add new BUG_ONs, because they crash the kenrel.
> > > > > It's better to use WARN_ON or WARN_ON_ONCE then try to figure out a way
> > > > > to keep the kernel limping along.  At least so the poor user can gather logs.
> > > >
> > > > These are VM_BUG_ONs, which are BUILD_BUG_ONs except for (mostly MM) developers.
> > >
> > > I'm told that many production builds enable runtime VM_BUG_ONning.
> >
> > Nobody wants to debug VM in production. Some distros that offer both
> > the latest/LTS kernels do enable CONFIG_DEBUG_VM in the former so the
> > latter can have better test coverage when it becomes available. Do
> > people use the former in production? Absolutely, otherwise we won't
> > have enough test coverage. Are we supposed to avoid CONFIG_DEBUG_VM? I
> > don't think so, because it defeats the purpose of those distros
> > enabling it in the first place.
> >
> > The bottomline is that none of RHEL 8.5, SLES 15, Debian 11 enables
> > CONFIG_DEBUG_VM.
>
> I grabbed
> https://kojipkgs.fedoraproject.org//packages/kernel/5.18.0/0.rc2.23.fc37/src/kernel-5.18.0-0.rc2.23.fc37.src.rpm
> and

Yes, Fedora/RHEL is one concrete example of the model I mentioned
above (experimental/stable). I added Justin, the Fedora kernel
maintainer, and he can further clarify.

If we don't want more VM_BUG_ONs, I'll remove them. But (let me
reiterate) it seems to me that just defeats the purpose of having
CONFIG_DEBUG_VM.

> hp2:/home/akpm/yy> grep "DEBUG_VM[= ]" *.config
> kernel-aarch64-debug-fedora.config:CONFIG_DEBUG_VM=y
> kernel-aarch64-debug-rhel.config:# CONFIG_DEBUG_VM is not set
> kernel-aarch64-fedora.config:CONFIG_DEBUG_VM=y
> kernel-aarch64-rhel.config:# CONFIG_DEBUG_VM is not set
> kernel-armv7hl-debug-fedora.config:CONFIG_DEBUG_VM=y
> kernel-armv7hl-fedora.config:CONFIG_DEBUG_VM=y
> kernel-armv7hl-lpae-debug-fedora.config:CONFIG_DEBUG_VM=y
> kernel-armv7hl-lpae-fedora.config:CONFIG_DEBUG_VM=y
> kernel-ppc64le-debug-fedora.config:CONFIG_DEBUG_VM=y
> kernel-ppc64le-debug-rhel.config:CONFIG_DEBUG_VM=y
> kernel-ppc64le-fedora.config:CONFIG_DEBUG_VM=y
> kernel-ppc64le-rhel.config:# CONFIG_DEBUG_VM is not set
> kernel-s390x-debug-fedora.config:CONFIG_DEBUG_VM=y
> kernel-s390x-debug-rhel.config:CONFIG_DEBUG_VM=y
> kernel-s390x-fedora.config:CONFIG_DEBUG_VM=y
> kernel-s390x-rhel.config:# CONFIG_DEBUG_VM is not set
> kernel-s390x-zfcpdump-rhel.config:# CONFIG_DEBUG_VM is not set
> kernel-x86_64-debug-fedora.config:CONFIG_DEBUG_VM=y
> kernel-x86_64-debug-rhel.config:CONFIG_DEBUG_VM=y
> kernel-x86_64-fedora.config:CONFIG_DEBUG_VM=y
> kernel-x86_64-rhel.config:# CONFIG_DEBUG_VM is not set
Andrew Morton April 15, 2022, 9:32 p.m. UTC | #9
On Fri, 15 Apr 2022 14:11:32 -0600 Yu Zhao <yuzhao@google.com> wrote:

> >
> > I grabbed
> > https://kojipkgs.fedoraproject.org//packages/kernel/5.18.0/0.rc2.23.fc37/src/kernel-5.18.0-0.rc2.23.fc37.src.rpm
> > and
> 
> Yes, Fedora/RHEL is one concrete example of the model I mentioned
> above (experimental/stable). I added Justin, the Fedora kernel
> maintainer, and he can further clarify.
> 
> If we don't want more VM_BUG_ONs, I'll remove them. But (let me
> reiterate) it seems to me that just defeats the purpose of having
> CONFIG_DEBUG_VM.
> 

Well, I feel your pain.  It was never expected that VM_BUG_ON() would
get subverted in this fashion.

We could create a new MM-developer-only assertion.  Might even call it
MM_BUG_ON().  With compile-time enablement but perhaps not a runtime
switch.

With nice simple semantics, please.  Like "it returns void" and "if you
pass an expression with side-effects then you lose".  And "if you send
a patch which produces warnings when CONFIG_MM_BUG_ON=n then you get to
switch to windows95 for a month".

Let's leave the mglru assertions in place for now and let's think about
creating something more suitable, with a view to switching mglru over
to that at a later time.



But really, none of this addresses the core problem: *_BUG_ON() often
kills the kernel.  So guess what we just did?  We killed the user's
kernel at the exact time when we least wished to do so: when they have
a bug to report to us.  So the thing is self-defeating.

It's much much better to WARN and to attempt to continue.  This makes
it much more likely that we'll get to hear about the kernel flaw.
Linus Torvalds April 15, 2022, 9:36 p.m. UTC | #10
On Fri, Apr 15, 2022 at 2:32 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> We could create a new MM-developer-only assertion.  Might even call it
> MM_BUG_ON().  With compile-time enablement but perhaps not a runtime
> switch.

.. or VM_BUG_ON() could just become a WARN_ON_ONCE().

Which it should be anyway - since the code has to be written to
continue after that BUG_ON() anyway.

There is absolutely _zero_ advantage to killing the machine. If you
want to be notified about "this must not happen", then WARN_ON_ONCE()
is the right thing to use.

BUG_ON() is basically always the wrong thing to do.

                Linus
Yu Zhao April 15, 2022, 10:57 p.m. UTC | #11
On Fri, Apr 15, 2022 at 3:43 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Fri, Apr 15, 2022 at 2:32 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > We could create a new MM-developer-only assertion.  Might even call it
> > MM_BUG_ON().  With compile-time enablement but perhaps not a runtime
> > switch.
>
> .. or VM_BUG_ON() could just become a WARN_ON_ONCE().
>
> Which it should be anyway - since the code has to be written to
> continue after that BUG_ON() anyway.
>
> There is absolutely _zero_ advantage to killing the machine. If you
> want to be notified about "this must not happen", then WARN_ON_ONCE()
> is the right thing to use.
>
> BUG_ON() is basically always the wrong thing to do.

Not trying to start a meta discussion, just my two cents:

This is a typical trolley problem: for the greater good, do we want to
inflict more pain on a small group of users running experimental
kernels so that they'd come back and yell at us quicker and louder?
BUG_ONs are harmful but problems that trigger them would be
presumably less penetrating to the user base; on the other hand,
from my experience working with some testers (ordinary users), they
ignore WARN_ON_ONCEs until the kernel crashes.

I'll let Justin chime in on Fedora's take on CONFIG_DEBUG_VM. I bet
it's intended to crash the kernel.
Linus Torvalds April 15, 2022, 11:03 p.m. UTC | #12
On Fri, Apr 15, 2022 at 3:58 PM Yu Zhao <yuzhao@google.com> wrote:
>
> BUG_ONs are harmful but problems that trigger them would be
> presummingly less penetrating to the user base; on the other hand,
> from my experience working with some testers (ordinary users), they
> ignore WARN_ON_ONCEs until the kernel crashes.

I don't understand your argument.

First you say that VM_BUG_ON() is only for VM developers.

Then you say "some testers (ordinary users) ignore WARN_ON_ONCEs until
the kernel crashes".

So which is it?

VM developers, or ordinary users?

Honestly, if a VM developer is ignoring a WARN_ON_ONCE() from the VM
subsystem, I don't even know what to say.

And for ordinary users, a WARN_ON_ONCE() is about a million times
better, because:

 - the machine will hopefully continue working, so they can report the warning

 - even when they don't notice them, distros tend to have automated
reporting infrastructure

That's why I absolutely *DETEST* those stupid BUG_ON() cases - they
will often kill the machine with nasty locks held, resulting in a
completely undebuggable thing that never gets reported.

Yes, you can be careful and only put BUG_ON() in places where recovery
is possible. But even then, they have no actual _advantages_ over just
a WARN_ON_ONCE.

                  Linus
Jesse Barnes April 15, 2022, 11:24 p.m. UTC | #13
On Fri, Apr 15, 2022 at 4:04 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Fri, Apr 15, 2022 at 3:58 PM Yu Zhao <yuzhao@google.com> wrote:
> >
> > BUG_ONs are harmful but problems that trigger them would be
> > presummingly less penetrating to the user base; on the other hand,
> > from my experience working with some testers (ordinary users), they
> > ignore WARN_ON_ONCEs until the kernel crashes.
>
> I don't understand your argument.
>
> First you say that VM_BUG_ON() is only for VM developers.
>
> Then you say "some testers (ordinary users) ignore WARN_ON_ONCEs until
> the kernel crashes".
>
> So which is it?
>
> VM developers, or ordinary users?
>
> Honestly, if a VM developer is ignoring a WARN_ON_ONCE() from the VM
> subsystem, I don't even know what to say.
>
> And for ordinary users, a WARN_ON_ONCE() is about a million times
> better, becasue:
>
>  - the machine will hopefully continue working, so they can report the warning
>
>  - even when they don't notice them, distros tend to have automated
> reporting infrastructure
>
> That's why I absolutely *DETEST* those stupid BUG_ON() cases - they
> will often kill the machine with nasty locks held, resulting in a
> completely undebuggable thing that never gets reported.
>
> Yes, you can be careful and only put BUG_ON() in places where recovery
> is possible. But even then, they have no actual _advantages_ over just
> a WARN_ON_ONCE.

Generally agreed, and not to belabor this relatively small issue, but in some
environments like cloud or managed client deployments, a crash can actually
be preferable so we can get a dump, reboot the machine, and get things going
again for the application or user, then debug offline.  So having the
flexibility to
do that in those situations is helpful.  And there, a full crash dump is better
than just a log report with the WARN info, since debugging may be easier with
all the kernel memory.

Jesse
Matthew Wilcox (Oracle) April 15, 2022, 11:31 p.m. UTC | #14
On Fri, Apr 15, 2022 at 04:24:14PM -0700, Jesse Barnes wrote:
> On Fri, Apr 15, 2022 at 4:04 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> > And for ordinary users, a WARN_ON_ONCE() is about a million times
> > better, becasue:
> >
> >  - the machine will hopefully continue working, so they can report the warning
> >
> >  - even when they don't notice them, distros tend to have automated
> > reporting infrastructure
> >
> > That's why I absolutely *DETEST* those stupid BUG_ON() cases - they
> > will often kill the machine with nasty locks held, resulting in a
> > completely undebuggable thing that never gets reported.
> >
> > Yes, you can be careful and only put BUG_ON() in places where recovery
> > is possible. But even then, they have no actual _advantages_ over just
> > a WARN_ON_ONCE.
> 
> Generally agreed, and not to belabor this relatively small issue, but in some
> environments like cloud or managed client deployments, a crash can actually
> be preferable so we can get a dump, reboot the machine, and get things going
> again for the application or user, then debug offline.  So having the
> flexibility to
> do that in those situations is helpful.  And there, a full crash dump is better
> than just a log report with the WARN info, since debugging may be easier with
> all the kernel memory.

But for those situations, don't you set panic_on_warn anyway?
Jesse Barnes April 15, 2022, 11:37 p.m. UTC | #15
On Fri, Apr 15, 2022, 4:31 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, Apr 15, 2022 at 04:24:14PM -0700, Jesse Barnes wrote:
> > On Fri, Apr 15, 2022 at 4:04 PM Linus Torvalds
> > <torvalds@linux-foundation.org> wrote:
> > > And for ordinary users, a WARN_ON_ONCE() is about a million times
> > > better, becasue:
> > >
> > >  - the machine will hopefully continue working, so they can report the warning
> > >
> > >  - even when they don't notice them, distros tend to have automated
> > > reporting infrastructure
> > >
> > > That's why I absolutely *DETEST* those stupid BUG_ON() cases - they
> > > will often kill the machine with nasty locks held, resulting in a
> > > completely undebuggable thing that never gets reported.
> > >
> > > Yes, you can be careful and only put BUG_ON() in places where recovery
> > > is possible. But even then, they have no actual _advantages_ over just
> > > a WARN_ON_ONCE.
> >
> > Generally agreed, and not to belabor this relatively small issue, but in some
> > environments like cloud or managed client deployments, a crash can actually
> > be preferable so we can get a dump, reboot the machine, and get things going
> > again for the application or user, then debug offline.  So having the
> > flexibility to
> > do that in those situations is helpful.  And there, a full crash dump is better
> > than just a log report with the WARN info, since debugging may be easier with
> > all the kernel memory.
>
> But for those situations, don't you set panic_on_warn anyway?

Yes ignore me.

Jesse "returning to his cave of ignorance" Barnes
Yu Zhao April 15, 2022, 11:49 p.m. UTC | #16
On Fri, Apr 15, 2022 at 5:03 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Fri, Apr 15, 2022 at 3:58 PM Yu Zhao <yuzhao@google.com> wrote:
> >
> > BUG_ONs are harmful but problems that trigger them would be
> > presummingly less penetrating to the user base; on the other hand,
> > from my experience working with some testers (ordinary users), they
> > ignore WARN_ON_ONCEs until the kernel crashes.
>
> I don't understand your argument.
>
> First you say that VM_BUG_ON() is only for VM developers.

I did? Probably I implied CONFIG_DEBUG_VM=y is meant for MM developers.

> Then you say "some testers (ordinary users) ignore WARN_ON_ONCEs until
> the kernel crashes".
>
> So which is it?
>
> VM developers, or ordinary users?

Ordinary users.

> Honestly, if a VM developer is ignoring a WARN_ON_ONCE() from the VM
> subsystem, I don't even know what to say.

Same here. I wasn't worried about kernel developers ignoring any warnings.

> And for ordinary users, a WARN_ON_ONCE() is about a million times
> better, becasue:
>
>  - the machine will hopefully continue working, so they can report the warning
>
>  - even when they don't notice them, distros tend to have automated
> reporting infrastructure
>
> That's why I absolutely *DETEST* those stupid BUG_ON() cases - they
> will often kill the machine with nasty locks held, resulting in a
> completely undebuggable thing that never gets reported.
>
> Yes, you can be careful and only put BUG_ON() in places where recovery
> is possible. But even then, they have no actual _advantages_ over just
> a WARN_ON_ONCE.

I hear you, and I wasn't arguing about anything, just sharing my two cents.
Justin Forbes April 16, 2022, 4:32 p.m. UTC | #17
On Fri, Apr 15, 2022 at 4:33 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Fri, 15 Apr 2022 14:11:32 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > >
> > > I grabbed
> > > https://kojipkgs.fedoraproject.org//packages/kernel/5.18.0/0.rc2.23.fc37/src/kernel-5.18.0-0.rc2.23.fc37.src.rpm
> > > and
> >
> > Yes, Fedora/RHEL is one concrete example of the model I mentioned
> > above (experimental/stable). I added Justin, the Fedora kernel
> > maintainer, and he can further clarify.

We almost split into 3 scenarios. In rawhide we run a standard Fedora
config for rcX releases and .0, but git snapshots are built with debug
configs only. The trade off is that we can't turn on certain options
which kill performance, but we do get more users running these kernels
which expose real bugs.  The rawhide kernel follows Linus' tree and is
rebuilt most weekdays.  Stable Fedora is not a full debug config, but
in cases where we can keep a debug feature on without it getting much
in the way of performance, as is the case with CONFIG_DEBUG_VM, I
think there is value in keeping those on, until there is not.  And of
course RHEL is a much more conservative config, and a much more
conservative rebase/backport codebase.

> > If we don't want more VM_BUG_ONs, I'll remove them. But (let me
> > reiterate) it seems to me that just defeats the purpose of having
> > CONFIG_DEBUG_VM.
> >
>
> Well, I feel your pain.  It was never expected that VM_BUG_ON() would
> get subverted in this fashion.

Fedora is not trying to subvert anything.  If keeping the option on
becomes problematic, we can simply turn it off.   Fedora certainly has
a more diverse installed base than typical enterprise distributions,
and much more diverse than most QA pools.  Both in the array of
hardware, and in the use patterns, so things do get uncovered that
would not be seen otherwise.

> We could create a new MM-developer-only assertion.  Might even call it
> MM_BUG_ON().  With compile-time enablement but perhaps not a runtime
> switch.
>
> With nice simple semantics, please.  Like "it returns void" and "if you
> pass an expression with side-effects then you lose".  And "if you send
> a patch which produces warnings when CONFIG_MM_BUG_ON=n then you get to
> switch to windows95 for a month".
>
> Let's leave the mglru assertions in place for now and let's think about
> creating something more suitable, with a view to switching mglru over
> to that at a later time.
>
>
>
> But really, none of this addresses the core problem: *_BUG_ON() often
> kills the kernel.  So guess what we just did?  We killed the user's
> kernel at the exact time when we least wished to do so: when they have
> a bug to report to us.  So the thing is self-defeating.
>
> It's much much better to WARN and to attempt to continue.  This makes
> it much more likely that we'll get to hear about the kernel flaw.

I agree very much with this. We hear about warnings from users, they
don't go unnoticed, and several of these users are willing to spend
time to help get to the bottom of an issue. They may not know the
code, but plenty are willing to test various patches or scenarios.

Justin
Yu Zhao April 19, 2022, 10:32 p.m. UTC | #18
On Sat, Apr 16, 2022 at 10:32 AM Justin Forbes
<jforbes@fedoraproject.org> wrote:
>
> On Fri, Apr 15, 2022 at 4:33 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Fri, 15 Apr 2022 14:11:32 -0600 Yu Zhao <yuzhao@google.com> wrote:
> >
> > > >
> > > > I grabbed
> > > > https://kojipkgs.fedoraproject.org//packages/kernel/5.18.0/0.rc2.23.fc37/src/kernel-5.18.0-0.rc2.23.fc37.src.rpm
> > > > and
> > >
> > > Yes, Fedora/RHEL is one concrete example of the model I mentioned
> > > above (experimental/stable). I added Justin, the Fedora kernel
> > > maintainer, and he can further clarify.
>
> We almost split into 3 scenarios. In rawhide we run a standard Fedora
> config for rcX releases and .0, but git snapshots are built with debug
> configs only. The trade off is that we can't turn on certain options
> which kill performance, but we do get more users running these kernels
> which expose real bugs.  The rawhide kernel follows Linus' tree and is
> rebuilt most weekdays.  Stable Fedora is not a full debug config, but
> in cases where we can keep a debug feature on without it much getting
> in the way of performance, as is the case with CONFIG_DEBUG_VM, I
> think there is value in keeping those on, until there is not.  And of
> course RHEL is a much more conservative config, and a much more
> conservative rebase/backport codebase.
>
> > > If we don't want more VM_BUG_ONs, I'll remove them. But (let me
> > > reiterate) it seems to me that just defeats the purpose of having
> > > CONFIG_DEBUG_VM.
> > >
> >
> > Well, I feel your pain.  It was never expected that VM_BUG_ON() would
> > get subverted in this fashion.
>
> Fedora is not trying to subvert anything.  If keeping the option on
> becomes problematic, we can simply turn it off.   Fedora certainly has
> a more diverse installed base than typical enterprise distributions,
> and much more diverse than most QA pools.  Both in the array of
> hardware, and in the use patterns, so things do get uncovered that
> would not be seen otherwise.
>
> > We could create a new MM-developer-only assertion.  Might even call it
> > MM_BUG_ON().  With compile-time enablement but perhaps not a runtime
> > switch.
> >
> > With nice simple semantics, please.  Like "it returns void" and "if you
> > pass an expression with side-effects then you lose".  And "if you send
> > a patch which produces warnings when CONFIG_MM_BUG_ON=n then you get to
> > switch to windows95 for a month".
> >
> > Let's leave the mglru assertions in place for now and let's think about
> > creating something more suitable, with a view to switching mglru over
> > to that at a later time.
> >
> >
> >
> > But really, none of this addresses the core problem: *_BUG_ON() often
> > kills the kernel.  So guess what we just did?  We killed the user's
> > kernel at the exact time when we least wished to do so: when they have
> > a bug to report to us.  So the thing is self-defeating.
> >
> > It's much much better to WARN and to attempt to continue.  This makes
> > it much more likely that we'll get to hear about the kernel flaw.
>
> I agree very much with this. We hear about warnings from users, they
> don't go unnoticed, and several of these users are willing to spend
> time to help get to the bottom of an issue. They may not know the
> code, but plenty are willing to test various patches or scenarios.

Thanks, Justin. Glad to hear warnings are collected from the field.

Based on all the feedback, my action item is to replace all VM_BUG_ONs
with VM_WARN_ON_ONCEs.
zhong jiang April 29, 2022, 2:10 p.m. UTC | #19
On 2022/4/7 11:15 上午, Yu Zhao wrote:
> To further exploit spatial locality, the aging prefers to walk page
> tables to search for young PTEs and promote hot pages. A kill switch
> will be added in the next patch to disable this behavior. When
> disabled, the aging relies on the rmap only.
>
> NB: this behavior has nothing similar with the page table scanning in
> the 2.4 kernel [1], which searches page tables for old PTEs, adds cold
> pages to swapcache and unmaps them.
>
> To avoid confusion, the term "iteration" specifically means the
> traversal of an entire mm_struct list; the term "walk" will be applied
> to page tables and the rmap, as usual.
>
> An mm_struct list is maintained for each memcg, and an mm_struct
> follows its owner task to the new memcg when this task is migrated.
> Given an lruvec, the aging iterates lruvec_memcg()->mm_list and calls
> walk_page_range() with each mm_struct on this list to promote hot
> pages before it increments max_seq.
>
> When multiple page table walkers iterate the same list, each of them
> gets a unique mm_struct; therefore they can run concurrently. Page
> table walkers ignore any misplaced pages, e.g., if an mm_struct was
> migrated, pages it left in the previous memcg will not be promoted
> when its current memcg is under reclaim. Similarly, page table walkers
> will not promote pages from nodes other than the one under reclaim.
>
> This patch uses the following optimizations when walking page tables:
> 1. It tracks the usage of mm_struct's between context switches so that
>     page table walkers can skip processes that have been sleeping since
>     the last iteration.
> 2. It uses generational Bloom filters to record populated branches so
>     that page table walkers can reduce their search space based on the
>     query results, e.g., to skip page tables containing mostly holes or
>     misplaced pages.
> 3. It takes advantage of the accessed bit in non-leaf PMD entries when
>     CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
> 4. It does not zigzag between a PGD table and the same PMD table
>     spanning multiple VMAs. IOW, it finishes all the VMAs within the
>     range of the same PMD table before it returns to a PGD table. This
>     improves the cache performance for workloads that have large
>     numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.
>
> Server benchmark results:
>    Single workload:
>      fio (buffered I/O): no change
>
>    Single workload:
>      memcached (anon): +[8, 10]%
>                  Ops/sec      KB/sec
>        patch1-7: 1193918.93   46438.15
>        patch1-8: 1301954.44   50640.27
>
>    Configurations:
>      no change
>
> Client benchmark results:
>    kswapd profiles:
>      patch1-7
>        45.90%  lzo1x_1_do_compress (real work)
>         9.14%  page_vma_mapped_walk
>         6.81%  _raw_spin_unlock_irq
>         2.80%  ptep_clear_flush
>         2.34%  __zram_bvec_write
>         2.29%  do_raw_spin_lock
>         1.84%  lru_gen_look_around
>         1.78%  memmove
>         1.74%  obj_malloc
>         1.50%  free_unref_page_list
>
>      patch1-8
>        46.96%  lzo1x_1_do_compress (real work)
>         7.55%  page_vma_mapped_walk
>         5.89%  _raw_spin_unlock_irq
>         3.33%  walk_pte_range
>         2.65%  ptep_clear_flush
>         2.23%  __zram_bvec_write
>         2.08%  do_raw_spin_lock
>         1.83%  memmove
>         1.65%  obj_malloc
>         1.47%  free_unref_page_list
>
>    Configurations:
>      no change
>
> [1] https://lwn.net/Articles/23732/
> [2] https://source.android.com/devices/tech/debug/scudo
>
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> Acked-by: Brian Geffon <bgeffon@google.com>
> Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
> Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
> Acked-by: Steven Barrett <steven@liquorix.net>
> Acked-by: Suleiman Souhlal <suleiman@google.com>
> Tested-by: Daniel Byrne <djbyrne@mtu.edu>
> Tested-by: Donald Carr <d@chaos-reins.com>
> Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
> Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
> Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
> Tested-by: Sofia Trinh <sofia.trinh@edi.works>
> Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> ---
>   fs/exec.c                  |   2 +
>   include/linux/memcontrol.h |   5 +
>   include/linux/mm_types.h   |  78 +++
>   include/linux/mmzone.h     |  59 +++
>   include/linux/swap.h       |   4 +
>   kernel/exit.c              |   1 +
>   kernel/fork.c              |   9 +
>   kernel/sched/core.c        |   1 +
>   mm/memcontrol.c            |  24 +
>   mm/vmscan.c                | 975 ++++++++++++++++++++++++++++++++++++-
>   10 files changed, 1144 insertions(+), 14 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index e3e55d5e0be1..bba8fc44926f 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1011,6 +1011,7 @@ static int exec_mmap(struct mm_struct *mm)
>   	active_mm = tsk->active_mm;
>   	tsk->active_mm = mm;
>   	tsk->mm = mm;
> +	lru_gen_add_mm(mm);
>   	/*
>   	 * This prevents preemption while active_mm is being loaded and
>   	 * it and mm are being updated, which could cause problems for
> @@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *mm)
>   	activate_mm(active_mm, mm);
>   	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
>   		local_irq_enable();
> +	lru_gen_use_mm(mm);
>   	tsk->mm->vmacache_seqnum = 0;
>   	vmacache_flush(tsk);
>   	task_unlock(tsk);
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 463168fa1670..954c54652736 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -344,6 +344,11 @@ struct mem_cgroup {
>   	struct deferred_split deferred_split_queue;
>   #endif
>   
> +#ifdef CONFIG_LRU_GEN
> +	/* per-memcg mm_struct list */
> +	struct lru_gen_mm_list mm_list;
> +#endif
> +
>   	struct mem_cgroup_per_node *nodeinfo[];
>   };
>   
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 8834e38c06a4..eee29f700fab 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -3,6 +3,7 @@
>   #define _LINUX_MM_TYPES_H
>   
>   #include <linux/mm_types_task.h>
> +#include <linux/sched.h>
>   
>   #include <linux/auxvec.h>
>   #include <linux/kref.h>
> @@ -17,6 +18,8 @@
>   #include <linux/page-flags-layout.h>
>   #include <linux/workqueue.h>
>   #include <linux/seqlock.h>
> +#include <linux/nodemask.h>
> +#include <linux/mmdebug.h>
>   
>   #include <asm/mmu.h>
>   
> @@ -655,6 +658,22 @@ struct mm_struct {
>   #ifdef CONFIG_IOMMU_SVA
>   		u32 pasid;
>   #endif
> +#ifdef CONFIG_LRU_GEN
> +		struct {
> +			/* this mm_struct is on lru_gen_mm_list */
> +			struct list_head list;
> +#ifdef CONFIG_MEMCG
> +			/* points to the memcg of "owner" above */
> +			struct mem_cgroup *memcg;
> +#endif
> +			/*
> +			 * Set when switching to this mm_struct, as a hint of
> +			 * whether it has been used since the last time per-node
> +			 * page table walkers cleared the corresponding bits.
> +			 */
> +			nodemask_t nodes;
> +		} lru_gen;
> +#endif /* CONFIG_LRU_GEN */
>   	} __randomize_layout;
>   
>   	/*
> @@ -681,6 +700,65 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
>   	return (struct cpumask *)&mm->cpu_bitmap;
>   }
>   
> +#ifdef CONFIG_LRU_GEN
> +
> +struct lru_gen_mm_list {
> +	/* mm_struct list for page table walkers */
> +	struct list_head fifo;
> +	/* protects the list above */
> +	spinlock_t lock;
> +};
> +
> +void lru_gen_add_mm(struct mm_struct *mm);
> +void lru_gen_del_mm(struct mm_struct *mm);
> +#ifdef CONFIG_MEMCG
> +void lru_gen_migrate_mm(struct mm_struct *mm);
> +#endif
> +
> +static inline void lru_gen_init_mm(struct mm_struct *mm)
> +{
> +	INIT_LIST_HEAD(&mm->lru_gen.list);
> +#ifdef CONFIG_MEMCG
> +	mm->lru_gen.memcg = NULL;
> +#endif
> +	nodes_clear(mm->lru_gen.nodes);
> +}
> +
> +static inline void lru_gen_use_mm(struct mm_struct *mm)
> +{
> +	/* unlikely but not a bug when racing with lru_gen_migrate_mm() */
> +	VM_WARN_ON(list_empty(&mm->lru_gen.list));
> +
> +	if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lru_gen.nodes))
> +		nodes_setall(mm->lru_gen.nodes);
> +}
> +
> +#else /* !CONFIG_LRU_GEN */
> +
> +static inline void lru_gen_add_mm(struct mm_struct *mm)
> +{
> +}
> +
> +static inline void lru_gen_del_mm(struct mm_struct *mm)
> +{
> +}
> +
> +#ifdef CONFIG_MEMCG
> +static inline void lru_gen_migrate_mm(struct mm_struct *mm)
> +{
> +}
> +#endif
> +
> +static inline void lru_gen_init_mm(struct mm_struct *mm)
> +{
> +}
> +
> +static inline void lru_gen_use_mm(struct mm_struct *mm)
> +{
> +}
> +
> +#endif /* CONFIG_LRU_GEN */
> +
>   struct mmu_gather;
>   extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
>   extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index c69589ad2b05..a1a99971ff9c 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -424,6 +424,58 @@ struct lru_gen_struct {
>   	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
>   };
>   
> +enum {
> +	MM_PTE_TOTAL,	/* total leaf entries */
> +	MM_PTE_OLD,	/* old leaf entries */
> +	MM_PTE_YOUNG,	/* young leaf entries */
> +	MM_PMD_TOTAL,	/* total non-leaf entries */
> +	MM_PMD_FOUND,	/* non-leaf entries found in Bloom filters */
> +	MM_PMD_ADDED,	/* non-leaf entries added to Bloom filters */
> +	NR_MM_STATS
> +};
> +
> +/* mnemonic codes for the mm stats above */
> +#define MM_STAT_CODES		"toydfa"
> +
> +/* double-buffering Bloom filters */
> +#define NR_BLOOM_FILTERS	2
> +
> +struct lru_gen_mm_state {
> +	/* set to max_seq after each iteration */
> +	unsigned long seq;
> +	/* where the current iteration starts (inclusive) */
> +	struct list_head *head;
> +	/* where the last iteration ends (exclusive) */
> +	struct list_head *tail;
> +	/* to wait for the last page table walker to finish */
> +	struct wait_queue_head wait;
> +	/* Bloom filters flip after each iteration */
> +	unsigned long *filters[NR_BLOOM_FILTERS];
> +	/* the mm stats for debugging */
> +	unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
> +	/* the number of concurrent page table walkers */
> +	int nr_walkers;
> +};
> +
> +struct lru_gen_mm_walk {
> +	/* the lruvec under reclaim */
> +	struct lruvec *lruvec;
> +	/* unstable max_seq from lru_gen_struct */
> +	unsigned long max_seq;
> +	/* the next address within an mm to scan */
> +	unsigned long next_addr;
> +	/* to batch page table entries */
> +	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
> +	/* to batch promoted pages */
> +	int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
> +	/* to batch the mm stats */
> +	int mm_stats[NR_MM_STATS];
> +	/* total batched items */
> +	int batched;
> +	bool can_swap;
> +	bool full_scan;
> +};
> +
>   void lru_gen_init_lruvec(struct lruvec *lruvec);
>   void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
>   
> @@ -474,6 +526,8 @@ struct lruvec {
>   #ifdef CONFIG_LRU_GEN
>   	/* evictable pages divided into generations */
>   	struct lru_gen_struct		lrugen;
> +	/* to concurrently iterate lru_gen_mm_list */
> +	struct lru_gen_mm_state		mm_state;
>   #endif
>   #ifdef CONFIG_MEMCG
>   	struct pglist_data *pgdat;
> @@ -1067,6 +1121,11 @@ typedef struct pglist_data {
>   
>   	unsigned long		flags;
>   
> +#ifdef CONFIG_LRU_GEN
> +	/* kswap mm walk data */
> +	struct lru_gen_mm_walk	mm_walk;
> +#endif
> +
>   	ZONE_PADDING(_pad2_)
>   
>   	/* Per-node vmstats */
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 27093b477c5f..7bdd7bcb135d 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -137,6 +137,10 @@ union swap_header {
>    */
>   struct reclaim_state {
>   	unsigned long reclaimed_slab;
> +#ifdef CONFIG_LRU_GEN
> +	/* per-thread mm walk data */
> +	struct lru_gen_mm_walk *mm_walk;
> +#endif
>   };
>   
>   #ifdef __KERNEL__
> diff --git a/kernel/exit.c b/kernel/exit.c
> index f072959fcab7..f2d4d48ea790 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -466,6 +466,7 @@ void mm_update_next_owner(struct mm_struct *mm)
>   		goto retry;
>   	}
>   	WRITE_ONCE(mm->owner, c);
> +	lru_gen_migrate_mm(mm);
>   	task_unlock(c);
>   	put_task_struct(c);
>   }
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9796897560ab..d14297ce1151 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1148,6 +1148,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>   		goto fail_nocontext;
>   
>   	mm->user_ns = get_user_ns(user_ns);
> +	lru_gen_init_mm(mm);
>   	return mm;
>   
>   fail_nocontext:
> @@ -1191,6 +1192,7 @@ static inline void __mmput(struct mm_struct *mm)
>   	if (mm->binfmt)
>   		module_put(mm->binfmt->module);
>   	mm_pasid_drop(mm);
> +	lru_gen_del_mm(mm);
>   	mmdrop(mm);
>   }
>   
> @@ -2660,6 +2662,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
>   		get_task_struct(p);
>   	}
>   
> +	if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
> +		/* lock the task to synchronize with memcg migration */
> +		task_lock(p);
> +		lru_gen_add_mm(p->mm);
> +		task_unlock(p);
> +	}
> +
>   	wake_up_new_task(p);
>   
>   	/* forking complete and child started to run, tell ptracer */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d575b4914925..88193a0f6d2b 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5057,6 +5057,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
>   		 * finish_task_switch()'s mmdrop().
>   		 */
>   		switch_mm_irqs_off(prev->active_mm, next->mm, next);
> +		lru_gen_use_mm(next->mm);
>   
>   		if (!prev->mm) {                        // from kernel
>   			/* will mmdrop() in finish_task_switch(). */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8069b58f2422..6a76152614c5 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6083,6 +6083,29 @@ static void mem_cgroup_move_task(void)
>   }
>   #endif
>   
> +#ifdef CONFIG_LRU_GEN
> +static void mem_cgroup_attach(struct cgroup_taskset *tset)
> +{
> +	struct cgroup_subsys_state *css;
> +	struct task_struct *task = NULL;
> +
> +	cgroup_taskset_for_each_leader(task, css, tset)
> +		break;
> +
> +	if (!task)
> +		return;
> +
> +	task_lock(task);
> +	if (task->mm && task->mm->owner == task)
> +		lru_gen_migrate_mm(task->mm);
> +	task_unlock(task);
> +}
> +#else
> +static void mem_cgroup_attach(struct cgroup_taskset *tset)
> +{
> +}
> +#endif /* CONFIG_LRU_GEN */
> +
>   static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
>   {
>   	if (value == PAGE_COUNTER_MAX)
> @@ -6428,6 +6451,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
>   	.css_reset = mem_cgroup_css_reset,
>   	.css_rstat_flush = mem_cgroup_css_rstat_flush,
>   	.can_attach = mem_cgroup_can_attach,
> +	.attach = mem_cgroup_attach,
>   	.cancel_attach = mem_cgroup_cancel_attach,
>   	.post_attach = mem_cgroup_move_task,
>   	.dfl_cftypes = memory_files,
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index f4dd3c3c589b..9e2810a230a4 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -50,6 +50,8 @@
>   #include <linux/printk.h>
>   #include <linux/dax.h>
>   #include <linux/psi.h>
> +#include <linux/pagewalk.h>
> +#include <linux/shmem_fs.h>
>   
>   #include <asm/tlbflush.h>
>   #include <asm/div64.h>
> @@ -3016,7 +3018,7 @@ static int folio_lru_tier(struct folio *folio)
>   	return lru_tier_from_refs(refs);
>   }
>   
> -static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
> +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
>   {
>   	struct pglist_data *pgdat = NODE_DATA(nid);
>   
> @@ -3061,6 +3063,374 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
>   	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
>   }
>   
> +/******************************************************************************
> + *                          mm_struct list
> + ******************************************************************************/
> +
> +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
> +{
> +	static struct lru_gen_mm_list mm_list = {
> +		.fifo = LIST_HEAD_INIT(mm_list.fifo),
> +		.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
> +	};
> +
> +#ifdef CONFIG_MEMCG
> +	if (memcg)
> +		return &memcg->mm_list;
> +#endif
> +	VM_BUG_ON(!mem_cgroup_disabled());
> +
> +	return &mm_list;
> +}
> +
> +void lru_gen_add_mm(struct mm_struct *mm)
> +{
> +	int nid;
> +	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
> +	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
> +
> +	VM_BUG_ON_MM(!list_empty(&mm->lru_gen.list), mm);
> +#ifdef CONFIG_MEMCG
> +	VM_BUG_ON_MM(mm->lru_gen.memcg, mm);
> +	mm->lru_gen.memcg = memcg;
> +#endif
> +	spin_lock(&mm_list->lock);
> +
> +	for_each_node_state(nid, N_MEMORY) {
> +		struct lruvec *lruvec = get_lruvec(memcg, nid);
> +
> +		if (!lruvec)
> +			continue;
> +
> +		if (lruvec->mm_state.tail == &mm_list->fifo)
> +			lruvec->mm_state.tail = &mm->lru_gen.list;
> +	}
> +
> +	list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
> +
> +	spin_unlock(&mm_list->lock);
> +}
> +
> +void lru_gen_del_mm(struct mm_struct *mm)
> +{
> +	int nid;
> +	struct lru_gen_mm_list *mm_list;
> +	struct mem_cgroup *memcg = NULL;
> +
> +	if (list_empty(&mm->lru_gen.list))
> +		return;
> +
> +#ifdef CONFIG_MEMCG
> +	memcg = mm->lru_gen.memcg;
> +#endif
> +	mm_list = get_mm_list(memcg);
> +
> +	spin_lock(&mm_list->lock);
> +
> +	for_each_node(nid) {
> +		struct lruvec *lruvec = get_lruvec(memcg, nid);
> +
> +		if (!lruvec)
> +			continue;
> +
> +		if (lruvec->mm_state.tail == &mm->lru_gen.list)
> +			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
> +
> +		if (lruvec->mm_state.head != &mm->lru_gen.list)
> +			continue;
> +
> +		lruvec->mm_state.head = lruvec->mm_state.head->next;
> +		if (lruvec->mm_state.head == &mm_list->fifo)
> +			WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
> +	}
> +
> +	list_del_init(&mm->lru_gen.list);
> +
> +	spin_unlock(&mm_list->lock);
> +
> +#ifdef CONFIG_MEMCG
> +	mem_cgroup_put(mm->lru_gen.memcg);
> +	mm->lru_gen.memcg = NULL;
> +#endif
> +}
> +
> +#ifdef CONFIG_MEMCG
> +void lru_gen_migrate_mm(struct mm_struct *mm)
> +{
> +	struct mem_cgroup *memcg;
> +
> +	lockdep_assert_held(&mm->owner->alloc_lock);
> +
> +	/* for mm_update_next_owner() */
> +	if (mem_cgroup_disabled())
> +		return;
> +
> +	rcu_read_lock();
> +	memcg = mem_cgroup_from_task(mm->owner);
> +	rcu_read_unlock();
> +	if (memcg == mm->lru_gen.memcg)
> +		return;
> +
> +	VM_BUG_ON_MM(!mm->lru_gen.memcg, mm);
> +	VM_BUG_ON_MM(list_empty(&mm->lru_gen.list), mm);
> +
> +	lru_gen_del_mm(mm);
> +	lru_gen_add_mm(mm);
> +}
> +#endif
> +
> +/*
> + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
> + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
> + * bits in a bitmap, k is the number of hash functions and n is the number of
> + * inserted items.
> + *
> + * Page table walkers use one of the two filters to reduce their search space.
> + * To get rid of non-leaf entries that no longer have enough leaf entries, the
> + * aging uses the double-buffering technique to flip to the other filter each
> + * time it produces a new generation. For non-leaf entries that have enough
> + * leaf entries, the aging carries them over to the next generation in
> + * walk_pmd_range(); the eviction also reports them when walking the rmap
> + * in lru_gen_look_around().
> + *
> + * For future optimizations:
> + * 1. It's not necessary to keep both filters all the time. The spare one can be
> + *    freed after the RCU grace period and reallocated if needed again.
> + * 2. And when reallocating, it's worth scaling its size according to the number
> + *    of inserted entries in the other filter, to reduce the memory overhead on
> + *    small systems and false positives on large systems.
> + * 3. Jenkins' hash function is an alternative to Knuth's.
> + */
> +#define BLOOM_FILTER_SHIFT	15
> +
> +static inline int filter_gen_from_seq(unsigned long seq)
> +{
> +	return seq % NR_BLOOM_FILTERS;
> +}
> +
> +static void get_item_key(void *item, int *key)
> +{
> +	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
> +
> +	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
> +
> +	key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
> +	key[1] = hash >> BLOOM_FILTER_SHIFT;
> +}
> +
> +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
> +{
> +	unsigned long *filter;
> +	int gen = filter_gen_from_seq(seq);
> +
> +	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
> +
> +	filter = lruvec->mm_state.filters[gen];
> +	if (filter) {
> +		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
> +		return;
> +	}
> +
> +	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC);
> +	WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
> +}
> +
> +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
> +{
> +	int key[2];
> +	unsigned long *filter;
> +	int gen = filter_gen_from_seq(seq);
> +
> +	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
> +	if (!filter)
> +		return;
> +
> +	get_item_key(item, key);
> +
> +	if (!test_bit(key[0], filter))
> +		set_bit(key[0], filter);
> +	if (!test_bit(key[1], filter))
> +		set_bit(key[1], filter);
> +}
> +
> +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
> +{
> +	int key[2];
> +	unsigned long *filter;
> +	int gen = filter_gen_from_seq(seq);
> +
> +	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
> +	if (!filter)
> +		return true;
> +
> +	get_item_key(item, key);
> +
> +	return test_bit(key[0], filter) && test_bit(key[1], filter);
> +}
> +
> +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
> +{
> +	int i;
> +	int hist;
> +
> +	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
> +
> +	if (walk) {
> +		hist = lru_hist_from_seq(walk->max_seq);
> +
> +		for (i = 0; i < NR_MM_STATS; i++) {
> +			WRITE_ONCE(lruvec->mm_state.stats[hist][i],
> +				   lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
> +			walk->mm_stats[i] = 0;
> +		}
> +	}
> +
> +	if (NR_HIST_GENS > 1 && last) {
> +		hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
> +
> +		for (i = 0; i < NR_MM_STATS; i++)
> +			WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
> +	}
> +}
> +
> +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
> +{
> +	int type;
> +	unsigned long size = 0;
> +	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
> +
> +	if (!walk->full_scan && cpumask_empty(mm_cpumask(mm)) &&
> +	    !node_isset(pgdat->node_id, mm->lru_gen.nodes))
> +		return true;
> +
> +	node_clear(pgdat->node_id, mm->lru_gen.nodes);
> +
> +	for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
> +		size += type ? get_mm_counter(mm, MM_FILEPAGES) :
> +			       get_mm_counter(mm, MM_ANONPAGES) +
> +			       get_mm_counter(mm, MM_SHMEMPAGES);
> +	}
> +
> +	if (size < MIN_LRU_BATCH)
> +		return true;
> +
> +	if (mm_is_oom_victim(mm))
> +		return true;
> +
> +	return !mmget_not_zero(mm);
> +}
> +
> +static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
> +			    struct mm_struct **iter)
> +{
> +	bool first = false;
> +	bool last = true;
> +	struct mm_struct *mm = NULL;
> +	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> +	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
> +	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
> +
> +	/*
> +	 * There are four interesting cases for this page table walker:
> +	 * 1. It tries to start a new iteration of mm_list with a stale max_seq;
> +	 *    there is nothing to be done.
> +	 * 2. It's the first of the current generation, and it needs to reset
> +	 *    the Bloom filter for the next generation.
> +	 * 3. It reaches the end of mm_list, and it needs to increment
> +	 *    mm_state->seq; the iteration is done.
> +	 * 4. It's the last of the current generation, and it needs to reset the
> +	 *    mm stats counters for the next generation.
> +	 */
> +	if (*iter)
> +		mmput_async(*iter);
> +	else if (walk->max_seq <= READ_ONCE(mm_state->seq))
> +		return false;
> +
> +	spin_lock(&mm_list->lock);
> +
> +	VM_BUG_ON(mm_state->seq + 1 < walk->max_seq);
> +	VM_BUG_ON(*iter && mm_state->seq > walk->max_seq);
> +	VM_BUG_ON(*iter && !mm_state->nr_walkers);
> +
> +	if (walk->max_seq <= mm_state->seq) {
> +		if (!*iter)
> +			last = false;
> +		goto done;
> +	}
> +
> +	if (!mm_state->nr_walkers) {
> +		VM_BUG_ON(mm_state->head && mm_state->head != &mm_list->fifo);
> +
> +		mm_state->head = mm_list->fifo.next;
> +		first = true;
> +	}
> +
> +	while (!mm && mm_state->head != &mm_list->fifo) {
> +		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
> +
> +		mm_state->head = mm_state->head->next;
> +
> +		/* full scan for those added after the last iteration */
> +		if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
> +			mm_state->tail = mm_state->head;
> +			walk->full_scan = true;
> +		}
> +

The full_scan seems to always be true, because mm_state->tail points to
the first item in mm_list; hence the walker's condition
mm_state->tail == &mm->lru_gen.list always holds.  Am I missing
something?


Thanks,

> +		if (should_skip_mm(mm, walk))
> +			mm = NULL;
> +	}
> +
> +	if (mm_state->head == &mm_list->fifo)
> +		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
> +done:
> +	if (*iter && !mm)
> +		mm_state->nr_walkers--;
> +	if (!*iter && mm)
> +		mm_state->nr_walkers++;
> +
> +	if (mm_state->nr_walkers)
> +		last = false;
> +
> +	if (mm && first)
> +		reset_bloom_filter(lruvec, walk->max_seq + 1);
> +
> +	if (*iter || last)
> +		reset_mm_stats(lruvec, walk, last);
> +
> +	spin_unlock(&mm_list->lock);
> +
> +	*iter = mm;
> +
> +	return last;
> +}
> +
> +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
> +{
> +	bool success = false;
> +	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> +	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
> +	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
> +
> +	if (max_seq <= READ_ONCE(mm_state->seq))
> +		return false;
> +
> +	spin_lock(&mm_list->lock);
> +
> +	VM_BUG_ON(mm_state->seq + 1 < max_seq);
> +
> +	if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
> +		VM_BUG_ON(mm_state->head && mm_state->head != &mm_list->fifo);
> +
> +		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
> +		reset_mm_stats(lruvec, NULL, true);
> +		success = true;
> +	}
> +
> +	spin_unlock(&mm_list->lock);
> +
> +	return success;
> +}
> +
>   /******************************************************************************
>    *                          refault feedback loop
>    ******************************************************************************/
> @@ -3214,6 +3584,476 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
>   	return new_gen;
>   }
>   
> +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
> +			      int old_gen, int new_gen)
> +{
> +	int type = folio_is_file_lru(folio);
> +	int zone = folio_zonenum(folio);
> +	int delta = folio_nr_pages(folio);
> +
> +	VM_BUG_ON(old_gen >= MAX_NR_GENS);
> +	VM_BUG_ON(new_gen >= MAX_NR_GENS);
> +
> +	walk->batched++;
> +
> +	walk->nr_pages[old_gen][type][zone] -= delta;
> +	walk->nr_pages[new_gen][type][zone] += delta;
> +}
> +
> +static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
> +{
> +	int gen, type, zone;
> +	struct lru_gen_struct *lrugen = &lruvec->lrugen;
> +
> +	walk->batched = 0;
> +
> +	for_each_gen_type_zone(gen, type, zone) {
> +		enum lru_list lru = type * LRU_INACTIVE_FILE;
> +		int delta = walk->nr_pages[gen][type][zone];
> +
> +		if (!delta)
> +			continue;
> +
> +		walk->nr_pages[gen][type][zone] = 0;
> +		WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
> +			   lrugen->nr_pages[gen][type][zone] + delta);
> +
> +		if (lru_gen_is_active(lruvec, gen))
> +			lru += LRU_ACTIVE;
> +		__update_lru_size(lruvec, lru, zone, delta);
> +	}
> +}
> +
> +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
> +{
> +	struct address_space *mapping;
> +	struct vm_area_struct *vma = walk->vma;
> +	struct lru_gen_mm_walk *priv = walk->private;
> +
> +	if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) ||
> +	    (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) ||
> +	    vma == get_gate_vma(vma->vm_mm))
> +		return true;
> +
> +	if (vma_is_anonymous(vma))
> +		return !priv->can_swap;
> +
> +	if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
> +		return true;
> +
> +	mapping = vma->vm_file->f_mapping;
> +	if (mapping_unevictable(mapping))
> +		return true;
> +
> +	/* check readpage to exclude special mappings like dax, etc. */
> +	return shmem_mapping(mapping) ? !priv->can_swap : !mapping->a_ops->readpage;
> +}
> +
> +/*
> + * Some userspace memory allocators map many single-page VMAs. Instead of
> + * returning back to the PGD table for each of such VMAs, finish an entire PMD
> + * table to reduce zigzags and improve cache performance.
> + */
> +static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size,
> +			 unsigned long *start, unsigned long *end)
> +{
> +	unsigned long next = round_up(*end, size);
> +
> +	VM_BUG_ON(mask & size);
> +	VM_BUG_ON(*start >= *end);
> +	VM_BUG_ON((next & mask) != (*start & mask));
> +
> +	while (walk->vma) {
> +		if (next >= walk->vma->vm_end) {
> +			walk->vma = walk->vma->vm_next;
> +			continue;
> +		}
> +
> +		if ((next & mask) != (walk->vma->vm_start & mask))
> +			return false;
> +
> +		if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) {
> +			walk->vma = walk->vma->vm_next;
> +			continue;
> +		}
> +
> +		*start = max(next, walk->vma->vm_start);
> +		next = (next | ~mask) + 1;
> +		/* rounded-up boundaries can wrap to 0 */
> +		*end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end;
> +
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static bool suitable_to_scan(int total, int young)
> +{
> +	int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
> +
> +	/* suitable if the average number of young PTEs per cacheline is >=1 */
> +	return young * n >= total;
> +}
> +
> +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
> +			   struct mm_walk *walk)
> +{
> +	int i;
> +	pte_t *pte;
> +	spinlock_t *ptl;
> +	unsigned long addr;
> +	int total = 0;
> +	int young = 0;
> +	struct lru_gen_mm_walk *priv = walk->private;
> +	struct mem_cgroup *memcg = lruvec_memcg(priv->lruvec);
> +	struct pglist_data *pgdat = lruvec_pgdat(priv->lruvec);
> +	int old_gen, new_gen = lru_gen_from_seq(priv->max_seq);
> +
> +	VM_BUG_ON(pmd_leaf(*pmd));
> +
> +	ptl = pte_lockptr(walk->mm, pmd);
> +	if (!spin_trylock(ptl))
> +		return false;
> +
> +	arch_enter_lazy_mmu_mode();
> +
> +	pte = pte_offset_map(pmd, start & PMD_MASK);
> +restart:
> +	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
> +		struct folio *folio;
> +		unsigned long pfn = pte_pfn(pte[i]);
> +
> +		VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end);
> +
> +		total++;
> +		priv->mm_stats[MM_PTE_TOTAL]++;
> +
> +		if (!pte_present(pte[i]) || is_zero_pfn(pfn))
> +			continue;
> +
> +		if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
> +			continue;
> +
> +		if (!pte_young(pte[i])) {
> +			priv->mm_stats[MM_PTE_OLD]++;
> +			continue;
> +		}
> +
> +		VM_BUG_ON(!pfn_valid(pfn));
> +		if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
> +			continue;
> +
> +		folio = pfn_folio(pfn);
> +		if (folio_nid(folio) != pgdat->node_id)
> +			continue;
> +
> +		if (folio_memcg_rcu(folio) != memcg)
> +			continue;
> +
> +		if (!ptep_test_and_clear_young(walk->vma, addr, pte + i))
> +			continue;
> +
> +		young++;
> +		priv->mm_stats[MM_PTE_YOUNG]++;
> +
> +		if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
> +		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
> +		      !folio_test_swapcache(folio)))
> +			folio_mark_dirty(folio);
> +
> +		old_gen = folio_update_gen(folio, new_gen);
> +		if (old_gen >= 0 && old_gen != new_gen)
> +			update_batch_size(priv, folio, old_gen, new_gen);
> +	}
> +
> +	if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end))
> +		goto restart;
> +
> +	pte_unmap(pte);
> +
> +	arch_leave_lazy_mmu_mode();
> +	spin_unlock(ptl);
> +
> +	return suitable_to_scan(total, young);
> +}
> +
> +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
> +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
> +				  struct mm_walk *walk, unsigned long *start)
> +{
> +	int i;
> +	pmd_t *pmd;
> +	spinlock_t *ptl;
> +	struct lru_gen_mm_walk *priv = walk->private;
> +	struct mem_cgroup *memcg = lruvec_memcg(priv->lruvec);
> +	struct pglist_data *pgdat = lruvec_pgdat(priv->lruvec);
> +	int old_gen, new_gen = lru_gen_from_seq(priv->max_seq);
> +
> +	VM_BUG_ON(pud_leaf(*pud));
> +
> +	/* try to batch at most 1+MIN_LRU_BATCH+1 entries */
> +	if (*start == -1) {
> +		*start = next;
> +		return;
> +	}
> +
> +	i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
> +	if (i && i <= MIN_LRU_BATCH) {
> +		__set_bit(i - 1, priv->bitmap);
> +		return;
> +	}
> +
> +	pmd = pmd_offset(pud, *start);
> +
> +	ptl = pmd_lockptr(walk->mm, pmd);
> +	if (!spin_trylock(ptl))
> +		goto done;
> +
> +	arch_enter_lazy_mmu_mode();
> +
> +	do {
> +		struct folio *folio;
> +		unsigned long pfn = pmd_pfn(pmd[i]);
> +		unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
> +
> +		VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end);
> +
> +		if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i]))
> +			goto next;
> +
> +		if (WARN_ON_ONCE(pmd_devmap(pmd[i])))
> +			goto next;
> +
> +		if (!pmd_trans_huge(pmd[i])) {
> +			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
> +				pmdp_test_and_clear_young(vma, addr, pmd + i);
> +			goto next;
> +		}
> +
> +		VM_BUG_ON(!pfn_valid(pfn));
> +		if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
> +			goto next;
> +
> +		folio = pfn_folio(pfn);
> +		if (folio_nid(folio) != pgdat->node_id)
> +			goto next;
> +
> +		if (folio_memcg_rcu(folio) != memcg)
> +			goto next;
> +
> +		if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
> +			goto next;
> +
> +		priv->mm_stats[MM_PTE_YOUNG]++;
> +
> +		if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
> +		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
> +		      !folio_test_swapcache(folio)))
> +			folio_mark_dirty(folio);
> +
> +		old_gen = folio_update_gen(folio, new_gen);
> +		if (old_gen >= 0 && old_gen != new_gen)
> +			update_batch_size(priv, folio, old_gen, new_gen);
> +next:
> +		i = i > MIN_LRU_BATCH ? 0 :
> +		    find_next_bit(priv->bitmap, MIN_LRU_BATCH, i) + 1;
> +	} while (i <= MIN_LRU_BATCH);
> +
> +	arch_leave_lazy_mmu_mode();
> +	spin_unlock(ptl);
> +done:
> +	*start = -1;
> +	bitmap_zero(priv->bitmap, MIN_LRU_BATCH);
> +}
> +#else
> +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
> +				  struct mm_walk *walk, unsigned long *start)
> +{
> +}
> +#endif
> +
> +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
> +			   struct mm_walk *walk)
> +{
> +	int i;
> +	pmd_t *pmd;
> +	unsigned long next;
> +	unsigned long addr;
> +	struct vm_area_struct *vma;
> +	unsigned long pos = -1;
> +	struct lru_gen_mm_walk *priv = walk->private;
> +
> +	VM_BUG_ON(pud_leaf(*pud));
> +
> +	/*
> +	 * Finish an entire PMD in two passes: the first only reaches to PTE
> +	 * tables to avoid taking the PMD lock; the second, if necessary, takes
> +	 * the PMD lock to clear the accessed bit in PMD entries.
> +	 */
> +	pmd = pmd_offset(pud, start & PUD_MASK);
> +restart:
> +	/* walk_pte_range() may call get_next_vma() */
> +	vma = walk->vma;
> +	for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
> +		pmd_t val = pmd_read_atomic(pmd + i);
> +
> +		/* for pmd_read_atomic() */
> +		barrier();
> +
> +		next = pmd_addr_end(addr, end);
> +
> +		if (!pmd_present(val)) {
> +			priv->mm_stats[MM_PTE_TOTAL]++;
> +			continue;
> +		}
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +		if (pmd_trans_huge(val)) {
> +			unsigned long pfn = pmd_pfn(val);
> +			struct pglist_data *pgdat = lruvec_pgdat(priv->lruvec);
> +
> +			priv->mm_stats[MM_PTE_TOTAL]++;
> +
> +			if (is_huge_zero_pmd(val))
> +				continue;
> +
> +			if (!pmd_young(val)) {
> +				priv->mm_stats[MM_PTE_OLD]++;
> +				continue;
> +			}
> +
> +			if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
> +				continue;
> +
> +			walk_pmd_range_locked(pud, addr, vma, walk, &pos);
> +			continue;
> +		}
> +#endif
> +		priv->mm_stats[MM_PMD_TOTAL]++;
> +
> +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
> +		if (!pmd_young(val))
> +			continue;
> +
> +		walk_pmd_range_locked(pud, addr, vma, walk, &pos);
> +#endif
> +		if (!priv->full_scan && !test_bloom_filter(priv->lruvec, priv->max_seq, pmd + i))
> +			continue;
> +
> +		priv->mm_stats[MM_PMD_FOUND]++;
> +
> +		if (!walk_pte_range(&val, addr, next, walk))
> +			continue;
> +
> +		priv->mm_stats[MM_PMD_ADDED]++;
> +
> +		/* carry over to the next generation */
> +		update_bloom_filter(priv->lruvec, priv->max_seq + 1, pmd + i);
> +	}
> +
> +	walk_pmd_range_locked(pud, -1, vma, walk, &pos);
> +
> +	if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end))
> +		goto restart;
> +}
> +
> +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> +			  struct mm_walk *walk)
> +{
> +	int i;
> +	pud_t *pud;
> +	unsigned long addr;
> +	unsigned long next;
> +	struct lru_gen_mm_walk *priv = walk->private;
> +
> +	VM_BUG_ON(p4d_leaf(*p4d));
> +
> +	pud = pud_offset(p4d, start & P4D_MASK);
> +restart:
> +	for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
> +		pud_t val = READ_ONCE(pud[i]);
> +
> +		next = pud_addr_end(addr, end);
> +
> +		if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
> +			continue;
> +
> +		walk_pmd_range(&val, addr, next, walk);
> +
> +		if (priv->batched >= MAX_LRU_BATCH) {
> +			end = (addr | ~PUD_MASK) + 1;
> +			goto done;
> +		}
> +	}
> +
> +	if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end))
> +		goto restart;
> +
> +	end = round_up(end, P4D_SIZE);
> +done:
> +	/* rounded-up boundaries can wrap to 0 */
> +	priv->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0;
> +
> +	return -EAGAIN;
> +}
> +
> +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
> +{
> +	static const struct mm_walk_ops mm_walk_ops = {
> +		.test_walk = should_skip_vma,
> +		.p4d_entry = walk_pud_range,
> +	};
> +
> +	int err;
> +	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> +
> +	walk->next_addr = FIRST_USER_ADDRESS;
> +
> +	do {
> +		err = -EBUSY;
> +
> +		/* folio_update_gen() requires stable folio_memcg() */
> +		if (!mem_cgroup_trylock_pages(memcg))
> +			break;
> +
> +		/* the caller might be holding the lock for write */
> +		if (mmap_read_trylock(mm)) {
> +			unsigned long start = walk->next_addr;
> +			unsigned long end = mm->highest_vm_end;
> +
> +			err = walk_page_range(mm, start, end, &mm_walk_ops, walk);
> +
> +			mmap_read_unlock(mm);
> +
> +			if (walk->batched) {
> +				spin_lock_irq(&lruvec->lru_lock);
> +				reset_batch_size(lruvec, walk);
> +				spin_unlock_irq(&lruvec->lru_lock);
> +			}
> +		}
> +
> +		mem_cgroup_unlock_pages();
> +
> +		cond_resched();
> +	} while (err == -EAGAIN && walk->next_addr && !mm_is_oom_victim(mm));
> +}
> +
> +static struct lru_gen_mm_walk *alloc_mm_walk(void)
> +{
> +	if (current->reclaim_state && current->reclaim_state->mm_walk)
> +		return current->reclaim_state->mm_walk;
> +
> +	return kzalloc(sizeof(struct lru_gen_mm_walk),
> +		       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
> +}
> +
> +static void free_mm_walk(struct lru_gen_mm_walk *walk)
> +{
> +	if (!current->reclaim_state || !current->reclaim_state->mm_walk)
> +		kfree(walk);
> +}
> +
>   static void inc_min_seq(struct lruvec *lruvec)
>   {
>   	int type;
> @@ -3272,7 +4112,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
>   	return success;
>   }
>   
> -static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
> +static void inc_max_seq(struct lruvec *lruvec)
>   {
>   	int prev, next;
>   	int type, zone;
> @@ -3282,9 +4122,6 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
>   
>   	VM_BUG_ON(!seq_is_valid(lruvec));
>   
> -	if (max_seq != lrugen->max_seq)
> -		goto unlock;
> -
>   	inc_min_seq(lruvec);
>   
>   	/*
> @@ -3316,10 +4153,72 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
>   
>   	/* make sure preceding modifications appear */
>   	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
> -unlock:
> +
>   	spin_unlock_irq(&lruvec->lru_lock);
>   }
>   
> +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
> +			       struct scan_control *sc, bool can_swap, bool full_scan)
> +{
> +	bool success;
> +	struct lru_gen_mm_walk *walk;
> +	struct mm_struct *mm = NULL;
> +	struct lru_gen_struct *lrugen = &lruvec->lrugen;
> +
> +	VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq));
> +
> +	/*
> +	 * If the hardware doesn't automatically set the accessed bit, fallback
> +	 * to lru_gen_look_around(), which only clears the accessed bit in a
> +	 * handful of PTEs. Spreading the work out over a period of time usually
> +	 * is less efficient, but it avoids bursty page faults.
> +	 */
> +	if (!full_scan && !arch_has_hw_pte_young()) {
> +		success = iterate_mm_list_nowalk(lruvec, max_seq);
> +		goto done;
> +	}
> +
> +	walk = alloc_mm_walk();
> +	if (!walk) {
> +		success = iterate_mm_list_nowalk(lruvec, max_seq);
> +		goto done;
> +	}
> +
> +	walk->lruvec = lruvec;
> +	walk->max_seq = max_seq;
> +	walk->can_swap = can_swap;
> +	walk->full_scan = full_scan;
> +
> +	do {
> +		success = iterate_mm_list(lruvec, walk, &mm);
> +		if (mm)
> +			walk_mm(lruvec, mm, walk);
> +
> +		cond_resched();
> +	} while (mm);
> +
> +	free_mm_walk(walk);
> +done:
> +	if (!success) {
> +		if (!current_is_kswapd() && !sc->priority)
> +			wait_event_killable(lruvec->mm_state.wait,
> +					    max_seq < READ_ONCE(lrugen->max_seq));
> +
> +		return max_seq < READ_ONCE(lrugen->max_seq);
> +	}
> +
> +	VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq));
> +
> +	inc_max_seq(lruvec);
> +	/* either this sees any waiters or they will see updated max_seq */
> +	if (wq_has_sleeper(&lruvec->mm_state.wait))
> +		wake_up_all(&lruvec->mm_state.wait);
> +
> +	wakeup_flusher_threads(WB_REASON_VMSCAN);
> +
> +	return true;
> +}
> +
>   static long get_nr_evictable(struct lruvec *lruvec, unsigned long max_seq,
>   			     unsigned long *min_seq, bool can_swap, bool *need_aging)
>   {
> @@ -3401,7 +4300,7 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>   		nr_to_scan++;
>   
>   	if (nr_to_scan && need_aging && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
> -		inc_max_seq(lruvec, max_seq);
> +		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
>   }
>   
>   static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
> @@ -3410,6 +4309,8 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>   
>   	VM_BUG_ON(!current_is_kswapd());
>   
> +	current->reclaim_state->mm_walk = &pgdat->mm_walk;
> +
>   	memcg = mem_cgroup_iter(NULL, NULL, NULL);
>   	do {
>   		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
> @@ -3418,11 +4319,16 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>   
>   		cond_resched();
>   	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
> +
> +	current->reclaim_state->mm_walk = NULL;
>   }
>   
>   /*
>    * This function exploits spatial locality when shrink_page_list() walks the
>    * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
> + * If the scan was done cacheline efficiently, it adds the PMD entry pointing
> + * to the PTE table to the Bloom filter. This process is a feedback loop from
> + * the eviction to the aging.
>    */
>   void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>   {
> @@ -3431,6 +4337,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>   	unsigned long start;
>   	unsigned long end;
>   	unsigned long addr;
> +	struct lru_gen_mm_walk *walk;
> +	int young = 0;
>   	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
>   	struct folio *folio = pfn_folio(pvmw->pfn);
>   	struct mem_cgroup *memcg = folio_memcg(folio);
> @@ -3492,6 +4400,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>   		if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
>   			continue;
>   
> +		young++;
> +
>   		if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
>   		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
>   		      !folio_test_swapcache(folio)))
> @@ -3507,7 +4417,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>   	arch_leave_lazy_mmu_mode();
>   	rcu_read_unlock();
>   
> -	if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
> +	/* feedback from rmap walkers to page table walkers */
> +	if (suitable_to_scan(i, young))
> +		update_bloom_filter(lruvec, max_seq, pvmw->pmd);
> +
> +	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
> +
> +	if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
>   		for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
>   			folio = pfn_folio(pte_pfn(pte[i]));
>   			folio_activate(folio);
> @@ -3519,8 +4435,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>   	if (!mem_cgroup_trylock_pages(memcg))
>   		return;
>   
> -	spin_lock_irq(&lruvec->lru_lock);
> -	new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
> +	if (!walk) {
> +		spin_lock_irq(&lruvec->lru_lock);
> +		new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
> +	}
>   
>   	for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
>   		folio = pfn_folio(pte_pfn(pte[i]));
> @@ -3531,10 +4449,14 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>   		if (old_gen < 0 || old_gen == new_gen)
>   			continue;
>   
> -		lru_gen_update_size(lruvec, folio, old_gen, new_gen);
> +		if (walk)
> +			update_batch_size(walk, folio, old_gen, new_gen);
> +		else
> +			lru_gen_update_size(lruvec, folio, old_gen, new_gen);
>   	}
>   
> -	spin_unlock_irq(&lruvec->lru_lock);
> +	if (!walk)
> +		spin_unlock_irq(&lruvec->lru_lock);
>   
>   	mem_cgroup_unlock_pages();
>   }
> @@ -3801,6 +4723,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>   	struct folio *folio;
>   	enum vm_event_item item;
>   	struct reclaim_stat stat;
> +	struct lru_gen_mm_walk *walk;
>   	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>   	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>   
> @@ -3840,6 +4763,10 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>   
>   	move_pages_to_lru(lruvec, &list);
>   
> +	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
> +	if (walk && walk->batched)
> +		reset_batch_size(lruvec, walk);
> +
>   	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
>   	if (!cgroup_reclaim(sc))
>   		__count_vm_events(item, reclaimed);
> @@ -3894,20 +4821,25 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
>   		return 0;
>   	}
>   
> -	inc_max_seq(lruvec, max_seq);
> +	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
> +		return nr_to_scan;
>   
> -	return nr_to_scan;
> +	return min_seq[LRU_GEN_FILE] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
>   }
>   
>   static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>   {
>   	struct blk_plug plug;
>   	long scanned = 0;
> +	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>   
>   	lru_add_drain();
>   
>   	blk_start_plug(&plug);
>   
> +	if (current_is_kswapd())
> +		current->reclaim_state->mm_walk = &pgdat->mm_walk;
> +
>   	while (true) {
>   		int delta;
>   		int swappiness;
> @@ -3935,6 +4867,9 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
>   		cond_resched();
>   	}
>   
> +	if (current_is_kswapd())
> +		current->reclaim_state->mm_walk = NULL;
> +
>   	blk_finish_plug(&plug);
>   }
>   
> @@ -3951,15 +4886,21 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
>   
>   	for_each_gen_type_zone(gen, type, zone)
>   		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
> +
> +	lruvec->mm_state.seq = MIN_NR_GENS;
> +	init_waitqueue_head(&lruvec->mm_state.wait);
>   }
>   
>   #ifdef CONFIG_MEMCG
>   void lru_gen_init_memcg(struct mem_cgroup *memcg)
>   {
> +	INIT_LIST_HEAD(&memcg->mm_list.fifo);
> +	spin_lock_init(&memcg->mm_list.lock);
>   }
>   
>   void lru_gen_exit_memcg(struct mem_cgroup *memcg)
>   {
> +	int i;
>   	int nid;
>   
>   	for_each_node(nid) {
> @@ -3967,6 +4908,11 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
>   
>   		VM_BUG_ON(memchr_inv(lruvec->lrugen.nr_pages, 0,
>   				     sizeof(lruvec->lrugen.nr_pages)));
> +
> +		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
> +			bitmap_free(lruvec->mm_state.filters[i]);
> +			lruvec->mm_state.filters[i] = NULL;
> +		}
>   	}
>   }
>   #endif
> @@ -3975,6 +4921,7 @@ static int __init init_lru_gen(void)
>   {
>   	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
>   	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
> +	BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
>   
>   	return 0;
>   };
Yu Zhao April 30, 2022, 8:34 a.m. UTC | #20
On Fri, Apr 29, 2022 at 7:10 AM zhong jiang
<zhongjiang-ali@linux.alibaba.com> wrote:
>
> On 2022/4/7 11:15 上午, Yu Zhao wrote:
...
> > +     while (!mm && mm_state->head != &mm_list->fifo) {
> > +             mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
> > +
> > +             mm_state->head = mm_state->head->next;
> > +
> > +             /* full scan for those added after the last iteration */
> > +             if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
> > +                     mm_state->tail = mm_state->head;
> > +                     walk->full_scan = true;
> > +             }
> > +
>
> The full_scan seems to be always true, because mm_state->tail points to
> the first item in mm_list; hence the walker's
>
> condition mm_state->tail == &mm->lru_gen.list always holds.  Am I missing
> something?

mm_state->tail points to the first item *added after the last
iteration*. If there are no new items, mm_state->tail parks at mm_list->fifo.
diff mbox series

Patch

diff --git a/fs/exec.c b/fs/exec.c
index e3e55d5e0be1..bba8fc44926f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1011,6 +1011,7 @@  static int exec_mmap(struct mm_struct *mm)
 	active_mm = tsk->active_mm;
 	tsk->active_mm = mm;
 	tsk->mm = mm;
+	lru_gen_add_mm(mm);
 	/*
 	 * This prevents preemption while active_mm is being loaded and
 	 * it and mm are being updated, which could cause problems for
@@ -1023,6 +1024,7 @@  static int exec_mmap(struct mm_struct *mm)
 	activate_mm(active_mm, mm);
 	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
 		local_irq_enable();
+	lru_gen_use_mm(mm);
 	tsk->mm->vmacache_seqnum = 0;
 	vmacache_flush(tsk);
 	task_unlock(tsk);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 463168fa1670..954c54652736 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -344,6 +344,11 @@  struct mem_cgroup {
 	struct deferred_split deferred_split_queue;
 #endif
 
+#ifdef CONFIG_LRU_GEN
+	/* per-memcg mm_struct list */
+	struct lru_gen_mm_list mm_list;
+#endif
+
 	struct mem_cgroup_per_node *nodeinfo[];
 };
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8834e38c06a4..eee29f700fab 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -3,6 +3,7 @@ 
 #define _LINUX_MM_TYPES_H
 
 #include <linux/mm_types_task.h>
+#include <linux/sched.h>
 
 #include <linux/auxvec.h>
 #include <linux/kref.h>
@@ -17,6 +18,8 @@ 
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
+#include <linux/nodemask.h>
+#include <linux/mmdebug.h>
 
 #include <asm/mmu.h>
 
@@ -655,6 +658,22 @@  struct mm_struct {
 #ifdef CONFIG_IOMMU_SVA
 		u32 pasid;
 #endif
+#ifdef CONFIG_LRU_GEN
+		struct {
+			/* this mm_struct is on lru_gen_mm_list */
+			struct list_head list;
+#ifdef CONFIG_MEMCG
+			/* points to the memcg of "owner" above */
+			struct mem_cgroup *memcg;
+#endif
+			/*
+			 * Set when switching to this mm_struct, as a hint of
+			 * whether it has been used since the last time per-node
+			 * page table walkers cleared the corresponding bits.
+			 */
+			nodemask_t nodes;
+		} lru_gen;
+#endif /* CONFIG_LRU_GEN */
 	} __randomize_layout;
 
 	/*
@@ -681,6 +700,65 @@  static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return (struct cpumask *)&mm->cpu_bitmap;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+struct lru_gen_mm_list {
+	/* mm_struct list for page table walkers */
+	struct list_head fifo;
+	/* protects the list above */
+	spinlock_t lock;
+};
+
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+	INIT_LIST_HEAD(&mm->lru_gen.list);
+#ifdef CONFIG_MEMCG
+	mm->lru_gen.memcg = NULL;
+#endif
+	nodes_clear(mm->lru_gen.nodes);
+}
+
+static inline void lru_gen_use_mm(struct mm_struct *mm)
+{
+	/* unlikely but not a bug when racing with lru_gen_migrate_mm() */
+	VM_WARN_ON(list_empty(&mm->lru_gen.list));
+
+	if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lru_gen.nodes))
+		nodes_setall(mm->lru_gen.nodes);
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_use_mm(struct mm_struct *mm)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c69589ad2b05..a1a99971ff9c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -424,6 +424,58 @@  struct lru_gen_struct {
 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 };
 
+enum {
+	MM_PTE_TOTAL,	/* total leaf entries */
+	MM_PTE_OLD,	/* old leaf entries */
+	MM_PTE_YOUNG,	/* young leaf entries */
+	MM_PMD_TOTAL,	/* total non-leaf entries */
+	MM_PMD_FOUND,	/* non-leaf entries found in Bloom filters */
+	MM_PMD_ADDED,	/* non-leaf entries added to Bloom filters */
+	NR_MM_STATS
+};
+
+/* mnemonic codes for the mm stats above */
+#define MM_STAT_CODES		"toydfa"
+
+/* double-buffering Bloom filters */
+#define NR_BLOOM_FILTERS	2
+
+struct lru_gen_mm_state {
+	/* set to max_seq after each iteration */
+	unsigned long seq;
+	/* where the current iteration starts (inclusive) */
+	struct list_head *head;
+	/* where the last iteration ends (exclusive) */
+	struct list_head *tail;
+	/* to wait for the last page table walker to finish */
+	struct wait_queue_head wait;
+	/* Bloom filters flip after each iteration */
+	unsigned long *filters[NR_BLOOM_FILTERS];
+	/* the mm stats for debugging */
+	unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
+	/* the number of concurrent page table walkers */
+	int nr_walkers;
+};
+
+struct lru_gen_mm_walk {
+	/* the lruvec under reclaim */
+	struct lruvec *lruvec;
+	/* unstable max_seq from lru_gen_struct */
+	unsigned long max_seq;
+	/* the next address within an mm to scan */
+	unsigned long next_addr;
+	/* to batch page table entries */
+	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+	/* to batch promoted pages */
+	int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+	/* to batch the mm stats */
+	int mm_stats[NR_MM_STATS];
+	/* total batched items */
+	int batched;
+	bool can_swap;
+	bool full_scan;
+};
+
 void lru_gen_init_lruvec(struct lruvec *lruvec);
 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
@@ -474,6 +526,8 @@  struct lruvec {
 #ifdef CONFIG_LRU_GEN
 	/* evictable pages divided into generations */
 	struct lru_gen_struct		lrugen;
+	/* to concurrently iterate lru_gen_mm_list */
+	struct lru_gen_mm_state		mm_state;
 #endif
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
@@ -1067,6 +1121,11 @@  typedef struct pglist_data {
 
 	unsigned long		flags;
 
+#ifdef CONFIG_LRU_GEN
+	/* kswapd mm walk data */
+	struct lru_gen_mm_walk	mm_walk;
+#endif
+
 	ZONE_PADDING(_pad2_)
 
 	/* Per-node vmstats */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 27093b477c5f..7bdd7bcb135d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -137,6 +137,10 @@  union swap_header {
  */
 struct reclaim_state {
 	unsigned long reclaimed_slab;
+#ifdef CONFIG_LRU_GEN
+	/* per-thread mm walk data */
+	struct lru_gen_mm_walk *mm_walk;
+#endif
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/exit.c b/kernel/exit.c
index f072959fcab7..f2d4d48ea790 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -466,6 +466,7 @@  void mm_update_next_owner(struct mm_struct *mm)
 		goto retry;
 	}
 	WRITE_ONCE(mm->owner, c);
+	lru_gen_migrate_mm(mm);
 	task_unlock(c);
 	put_task_struct(c);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897560ab..d14297ce1151 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1148,6 +1148,7 @@  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 		goto fail_nocontext;
 
 	mm->user_ns = get_user_ns(user_ns);
+	lru_gen_init_mm(mm);
 	return mm;
 
 fail_nocontext:
@@ -1191,6 +1192,7 @@  static inline void __mmput(struct mm_struct *mm)
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
 	mm_pasid_drop(mm);
+	lru_gen_del_mm(mm);
 	mmdrop(mm);
 }
 
@@ -2660,6 +2662,13 @@  pid_t kernel_clone(struct kernel_clone_args *args)
 		get_task_struct(p);
 	}
 
+	if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+		/* lock the task to synchronize with memcg migration */
+		task_lock(p);
+		lru_gen_add_mm(p->mm);
+		task_unlock(p);
+	}
+
 	wake_up_new_task(p);
 
 	/* forking complete and child started to run, tell ptracer */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d575b4914925..88193a0f6d2b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5057,6 +5057,7 @@  context_switch(struct rq *rq, struct task_struct *prev,
 		 * finish_task_switch()'s mmdrop().
 		 */
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
+		lru_gen_use_mm(next->mm);
 
 		if (!prev->mm) {                        // from kernel
 			/* will mmdrop() in finish_task_switch(). */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8069b58f2422..6a76152614c5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6083,6 +6083,29 @@  static void mem_cgroup_move_task(void)
 }
 #endif
 
+#ifdef CONFIG_LRU_GEN
+/*
+ * cgroup ->attach() callback: pick the first thread-group leader in @tset, if
+ * any, and let the multi-gen LRU move its mm_struct to the new memcg, provided
+ * that leader is the owner of the mm_struct.
+ */
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+	struct task_struct *leader = NULL;
+	struct cgroup_subsys_state *css;
+
+	/* stop at the first leader the iterator yields */
+	cgroup_taskset_for_each_leader(leader, css, tset)
+		break;
+
+	if (leader) {
+		struct mm_struct *mm;
+
+		/* lru_gen_migrate_mm() expects the owner task to be locked */
+		task_lock(leader);
+		mm = leader->mm;
+		if (mm && mm->owner == leader)
+			lru_gen_migrate_mm(mm);
+		task_unlock(leader);
+	}
+}
+#else
+/* Nothing to migrate when the multi-gen LRU is not configured. */
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
 {
 	if (value == PAGE_COUNTER_MAX)
@@ -6428,6 +6451,7 @@  struct cgroup_subsys memory_cgrp_subsys = {
 	.css_reset = mem_cgroup_css_reset,
 	.css_rstat_flush = mem_cgroup_css_rstat_flush,
 	.can_attach = mem_cgroup_can_attach,
+	.attach = mem_cgroup_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.post_attach = mem_cgroup_move_task,
 	.dfl_cftypes = memory_files,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f4dd3c3c589b..9e2810a230a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,8 @@ 
 #include <linux/printk.h>
 #include <linux/dax.h>
 #include <linux/psi.h>
+#include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3016,7 +3018,7 @@  static int folio_lru_tier(struct folio *folio)
 	return lru_tier_from_refs(refs);
 }
 
-static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
+static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
 
@@ -3061,6 +3063,374 @@  static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
 	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
 }
 
+/******************************************************************************
+ *                          mm_struct list
+ ******************************************************************************/
+
+/*
+ * Return the mm_struct list for @memcg: the list embedded in @memcg when it is
+ * non-NULL (CONFIG_MEMCG only), otherwise the single function-local list.
+ */
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+	static struct lru_gen_mm_list mm_list = {
+		.fifo = LIST_HEAD_INIT(mm_list.fifo),
+		.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
+	};
+	struct lru_gen_mm_list *ret = &mm_list;
+
+#ifdef CONFIG_MEMCG
+	if (memcg)
+		ret = &memcg->mm_list;
+#endif
+	/* falling back to the local list is only expected with memcg disabled */
+	VM_BUG_ON(ret == &mm_list && !mem_cgroup_disabled());
+
+	return ret;
+}
+
+/*
+ * Append @mm to the mm_struct list of the memcg it currently belongs to. The
+ * memcg reference obtained here is recorded in mm->lru_gen.memcg (CONFIG_MEMCG
+ * only) and released by lru_gen_del_mm().
+ */
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+	int nid;
+	struct list_head *entry = &mm->lru_gen.list;
+	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+	/* @mm must not be on a list yet */
+	VM_BUG_ON_MM(!list_empty(entry), mm);
+#ifdef CONFIG_MEMCG
+	VM_BUG_ON_MM(mm->lru_gen.memcg, mm);
+	mm->lru_gen.memcg = memcg;
+#endif
+	spin_lock(&mm_list->lock);
+
+	for_each_node_state(nid, N_MEMORY) {
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		/* a tail parked at the list head now points at the new entry */
+		if (lruvec && lruvec->mm_state.tail == &mm_list->fifo)
+			lruvec->mm_state.tail = entry;
+	}
+
+	list_add_tail(entry, &mm_list->fifo);
+
+	spin_unlock(&mm_list->lock);
+}
+
+/*
+ * Take @mm off the mm_struct list it is on, if any, and drop the memcg
+ * reference recorded in mm->lru_gen.memcg (CONFIG_MEMCG only).
+ *
+ * Before unlinking, every per-node iteration state whose head or tail points
+ * at @mm is advanced past it, so that neither is left pointing at a removed
+ * entry.
+ */
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+	int nid;
+	struct lru_gen_mm_list *mm_list;
+	struct mem_cgroup *memcg = NULL;
+
+	/* not on any list; nothing to undo */
+	if (list_empty(&mm->lru_gen.list))
+		return;
+
+#ifdef CONFIG_MEMCG
+	memcg = mm->lru_gen.memcg;
+#endif
+	mm_list = get_mm_list(memcg);
+
+	spin_lock(&mm_list->lock);
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		if (!lruvec)
+			continue;
+
+		/* move the tail marker off the departing entry */
+		if (lruvec->mm_state.tail == &mm->lru_gen.list)
+			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+
+		if (lruvec->mm_state.head != &mm->lru_gen.list)
+			continue;
+
+		/* the iteration was about to visit this entry; step over it */
+		lruvec->mm_state.head = lruvec->mm_state.head->next;
+		/* stepping over it reached the end of the list: iteration done */
+		if (lruvec->mm_state.head == &mm_list->fifo)
+			WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
+	}
+
+	list_del_init(&mm->lru_gen.list);
+
+	spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+	/* pairs with get_mem_cgroup_from_mm() in lru_gen_add_mm() */
+	mem_cgroup_put(mm->lru_gen.memcg);
+	mm->lru_gen.memcg = NULL;
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+/*
+ * Move @mm to the mm_struct list of the memcg its owner task now belongs to.
+ *
+ * The caller must hold the owner's alloc_lock (task_lock()). Nothing is done
+ * if memcg is disabled, if @mm has not been added to a list yet, or if the
+ * owner's memcg is the one @mm is already listed under.
+ */
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg;
+
+	lockdep_assert_held(&mm->owner->alloc_lock);
+
+	/* for mm_update_next_owner() */
+	if (mem_cgroup_disabled())
+		return;
+
+	/*
+	 * Migration can happen before addition: kernel_clone() calls
+	 * lru_gen_add_mm() only after the new task has been set up. Leave such
+	 * an mm alone; lru_gen_add_mm() looks up the memcg itself when it runs.
+	 * Adding it here as well would put it on a list twice.
+	 */
+	if (!mm->lru_gen.memcg)
+		return;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(mm->owner);
+	rcu_read_unlock();
+	/* only the pointer value is compared, so no reference is needed */
+	if (memcg == mm->lru_gen.memcg)
+		return;
+
+	VM_BUG_ON_MM(list_empty(&mm->lru_gen.list), mm);
+
+	lru_gen_del_mm(mm);
+	lru_gen_add_mm(mm);
+}
+#endif
+
+/*
+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
+ * bits in a bitmap, k is the number of hash functions and n is the number of
+ * inserted items.
+ *
+ * Page table walkers use one of the two filters to reduce their search space.
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the
+ * aging uses the double-buffering technique to flip to the other filter each
+ * time it produces a new generation. For non-leaf entries that have enough
+ * leaf entries, the aging carries them over to the next generation in
+ * walk_pmd_range(); the eviction also reports them when walking the rmap
+ * in lru_gen_look_around().
+ *
+ * For future optimizations:
+ * 1. It's not necessary to keep both filters all the time. The spare one can be
+ *    freed after the RCU grace period and reallocated if needed again.
+ * 2. And when reallocating, it's worth scaling its size according to the number
+ *    of inserted entries in the other filter, to reduce the memory overhead on
+ *    small systems and false positives on large systems.
+ * 3. Jenkins' hash function is an alternative to Knuth's.
+ */
+#define BLOOM_FILTER_SHIFT	15
+
+/* Map a sequence number to the index of the Bloom filter it uses. */
+static inline int filter_gen_from_seq(unsigned long seq)
+{
+	unsigned long gen = seq % NR_BLOOM_FILTERS;
+
+	return gen;
+}
+
+/*
+ * Compute the two bit positions for @item: hash the pointer down to
+ * 2 * BLOOM_FILTER_SHIFT bits, then store the low half in key[0] and the high
+ * half in key[1].
+ */
+static void get_item_key(void *item, int *key)
+{
+	const u32 low_mask = BIT(BLOOM_FILTER_SHIFT) - 1;
+	u32 hash;
+
+	/* both halves have to fit in the u32 hash */
+	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
+
+	hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
+
+	key[0] = hash & low_mask;
+	key[1] = hash >> BLOOM_FILTER_SHIFT;
+}
+
+/*
+ * Prepare the Bloom filter used by sequence number @seq: clear it if it
+ * already exists, otherwise try to allocate a zeroed one. The caller holds the
+ * mm_list lock.
+ *
+ * An allocation failure is tolerated: the slot stays NULL, in which case
+ * update_bloom_filter() does nothing and test_bloom_filter() returns true.
+ */
+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+{
+	unsigned long *filter;
+	int gen = filter_gen_from_seq(seq);
+
+	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+	filter = lruvec->mm_state.filters[gen];
+	if (filter) {
+		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
+		return;
+	}
+
+	/*
+	 * A spinlock is held, so the allocation must not sleep. Since failure
+	 * is harmless, do not warn about it and do not use memory reserves;
+	 * these are the same flags alloc_mm_walk() uses.
+	 */
+	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
+			       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+	/* pairs with READ_ONCE() in the lockless readers */
+	WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+}
+
+/*
+ * Record @item in the Bloom filter used by sequence number @seq. Does nothing
+ * if that filter has not been allocated.
+ */
+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+	int i;
+	int key[2];
+	unsigned long *filter;
+
+	filter = READ_ONCE(lruvec->mm_state.filters[filter_gen_from_seq(seq)]);
+	if (!filter)
+		return;
+
+	get_item_key(item, key);
+
+	/* only write the bits that are not set yet */
+	for (i = 0; i < 2; i++) {
+		if (!test_bit(key[i], filter))
+			set_bit(key[i], filter);
+	}
+}
+
+/*
+ * Query the Bloom filter used by sequence number @seq for @item. Return true
+ * if both of its bits are set, or if that filter has not been allocated.
+ */
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+	int key[2];
+	unsigned long *filter;
+
+	filter = READ_ONCE(lruvec->mm_state.filters[filter_gen_from_seq(seq)]);
+	/* without a filter nothing can be ruled out */
+	if (!filter)
+		return true;
+
+	get_item_key(item, key);
+
+	if (!test_bit(key[0], filter))
+		return false;
+
+	return test_bit(key[1], filter);
+}
+
+/*
+ * Fold the per-walker counters of @walk, if any, into the lruvec-wide mm stats
+ * and zero them in @walk. If @last is set and more than one history generation
+ * is kept, also zero the stats slot of the next sequence number. The caller
+ * holds the mm_list lock.
+ */
+static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
+{
+	int stat;
+	int hist;
+
+	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+	if (walk) {
+		hist = lru_hist_from_seq(walk->max_seq);
+
+		for (stat = 0; stat < NR_MM_STATS; stat++) {
+			WRITE_ONCE(lruvec->mm_state.stats[hist][stat],
+				   lruvec->mm_state.stats[hist][stat] + walk->mm_stats[stat]);
+			walk->mm_stats[stat] = 0;
+		}
+	}
+
+	if (NR_HIST_GENS <= 1 || !last)
+		return;
+
+	hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
+
+	for (stat = 0; stat < NR_MM_STATS; stat++)
+		WRITE_ONCE(lruvec->mm_state.stats[hist][stat], 0);
+}
+
+/*
+ * Decide whether the page tables of @mm are worth walking for @walk.
+ *
+ * Return true to skip @mm. Return false only after a reference on @mm has been
+ * taken with mmget_not_zero(); iterate_mm_list() drops it with mmput_async()
+ * when the walker comes back for the next mm_struct.
+ */
+static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+	int type;
+	unsigned long size = 0;
+	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+
+	/*
+	 * Unless this is a full scan, skip an mm whose cpumask is empty and
+	 * whose bit for this node is clear in mm->lru_gen.nodes.
+	 * NOTE(review): that bit is presumably set by lru_gen_use_mm() at
+	 * context switch, which is not visible here -- confirm.
+	 */
+	if (!walk->full_scan && cpumask_empty(mm_cpumask(mm)) &&
+	    !node_isset(pgdat->node_id, mm->lru_gen.nodes))
+		return true;
+
+	/* consume this node's bit */
+	node_clear(pgdat->node_id, mm->lru_gen.nodes);
+
+	/*
+	 * type 0 counts anon plus shmem pages and type 1 counts file pages;
+	 * type 0 is left out when swapping is not possible.
+	 */
+	for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
+		size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+			       get_mm_counter(mm, MM_ANONPAGES) +
+			       get_mm_counter(mm, MM_SHMEMPAGES);
+	}
+
+	/* too few pages to be worth a walk */
+	if (size < MIN_LRU_BATCH)
+		return true;
+
+	if (mm_is_oom_victim(mm))
+		return true;
+
+	/* skip if the last user reference is already gone */
+	return !mmget_not_zero(mm);
+}
+
+/*
+ * Hand the calling page table walker its next mm_struct from the mm_struct
+ * list of @lruvec's memcg.
+ *
+ * @iter: on entry, the mm_struct returned by the previous call or NULL on the
+ *        first call; its reference is dropped here. On return, the next
+ *        mm_struct to walk, with a reference held by should_skip_mm(), or NULL
+ *        when there is nothing left for this walker.
+ *
+ * Return true if no walker is left in the current iteration after this call,
+ * in which case try_to_inc_max_seq() goes on to increment max_seq.
+ */
+static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
+			    struct mm_struct **iter)
+{
+	bool first = false;
+	bool last = true;
+	struct mm_struct *mm = NULL;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+
+	/*
+	 * There are four interesting cases for this page table walker:
+	 * 1. It tries to start a new iteration of mm_list with a stale max_seq;
+	 *    there is nothing to be done.
+	 * 2. It's the first of the current generation, and it needs to reset
+	 *    the Bloom filter for the next generation.
+	 * 3. It reaches the end of mm_list, and it needs to increment
+	 *    mm_state->seq; the iteration is done.
+	 * 4. It's the last of the current generation, and it needs to reset the
+	 *    mm stats counters for the next generation.
+	 */
+	if (*iter)
+		mmput_async(*iter);
+	else if (walk->max_seq <= READ_ONCE(mm_state->seq))
+		return false;
+
+	spin_lock(&mm_list->lock);
+
+	VM_BUG_ON(mm_state->seq + 1 < walk->max_seq);
+	VM_BUG_ON(*iter && mm_state->seq > walk->max_seq);
+	VM_BUG_ON(*iter && !mm_state->nr_walkers);
+
+	/* recheck under the lock: the iteration for max_seq is already over */
+	if (walk->max_seq <= mm_state->seq) {
+		/* a walker that never got an mm_struct cannot be the last one */
+		if (!*iter)
+			last = false;
+		goto done;
+	}
+
+	/* no walker is active: this one starts the iteration from the top */
+	if (!mm_state->nr_walkers) {
+		VM_BUG_ON(mm_state->head && mm_state->head != &mm_list->fifo);
+
+		mm_state->head = mm_list->fifo.next;
+		first = true;
+	}
+
+	/* claim entries one at a time until one of them is worth walking */
+	while (!mm && mm_state->head != &mm_list->fifo) {
+		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+
+		mm_state->head = mm_state->head->next;
+
+		/* full scan for those added after the last iteration */
+		if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
+			mm_state->tail = mm_state->head;
+			walk->full_scan = true;
+		}
+
+		if (should_skip_mm(mm, walk))
+			mm = NULL;
+	}
+
+	/* the end of the list was reached: the iteration is done */
+	if (mm_state->head == &mm_list->fifo)
+		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+done:
+	/* this walker had an mm_struct and gets no more: it leaves */
+	if (*iter && !mm)
+		mm_state->nr_walkers--;
+	/* this walker had none and gets its first one: it joins */
+	if (!*iter && mm)
+		mm_state->nr_walkers++;
+
+	/* other walkers are still busy, so this one is not the last */
+	if (mm_state->nr_walkers)
+		last = false;
+
+	if (mm && first)
+		reset_bloom_filter(lruvec, walk->max_seq + 1);
+
+	/* flush this walker's counters; the last one also resets the next slot */
+	if (*iter || last)
+		reset_mm_stats(lruvec, walk, last);
+
+	spin_unlock(&mm_list->lock);
+
+	*iter = mm;
+
+	return last;
+}
+
+/*
+ * Complete an iteration for @max_seq without walking any page tables: if the
+ * iteration has not been completed yet and no page table walker is active,
+ * increment mm_state->seq and reset the mm stats for the next generation.
+ *
+ * Return true if this call advanced mm_state->seq.
+ */
+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
+{
+	bool advanced = false;
+	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+	struct lru_gen_mm_list *mm_list = get_mm_list(lruvec_memcg(lruvec));
+
+	/* lockless fast path for a stale max_seq */
+	if (READ_ONCE(mm_state->seq) >= max_seq)
+		return false;
+
+	spin_lock(&mm_list->lock);
+
+	VM_BUG_ON(mm_state->seq + 1 < max_seq);
+
+	/* recheck under the lock and stay out of the way of active walkers */
+	if (mm_state->seq < max_seq && !mm_state->nr_walkers) {
+		VM_BUG_ON(mm_state->head && mm_state->head != &mm_list->fifo);
+
+		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+		reset_mm_stats(lruvec, NULL, true);
+		advanced = true;
+	}
+
+	spin_unlock(&mm_list->lock);
+
+	return advanced;
+}
+
 /******************************************************************************
  *                          refault feedback loop
  ******************************************************************************/
@@ -3214,6 +3584,476 @@  static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
 	return new_gen;
 }
 
+/*
+ * Account, in the per-walker batch only, for @folio moving from @old_gen to
+ * @new_gen. reset_batch_size() applies the batch to the lruvec later.
+ */
+static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
+			      int old_gen, int new_gen)
+{
+	int nr = folio_nr_pages(folio);
+	int zone = folio_zonenum(folio);
+	int type = folio_is_file_lru(folio);
+
+	VM_BUG_ON(old_gen >= MAX_NR_GENS);
+	VM_BUG_ON(new_gen >= MAX_NR_GENS);
+
+	walk->nr_pages[new_gen][type][zone] += nr;
+	walk->nr_pages[old_gen][type][zone] -= nr;
+
+	/* one more folio waiting to be flushed */
+	walk->batched++;
+}
+
+/*
+ * Apply the page count deltas batched in @walk to @lruvec and clear the batch.
+ * The callers visible here either hold lruvec->lru_lock or call this from
+ * evict_folios().
+ */
+static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+{
+	int gen, type, zone;
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+	walk->batched = 0;
+
+	for_each_gen_type_zone(gen, type, zone) {
+		enum lru_list lru;
+		int nr = walk->nr_pages[gen][type][zone];
+
+		if (nr) {
+			walk->nr_pages[gen][type][zone] = 0;
+			WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
+				   lrugen->nr_pages[gen][type][zone] + nr);
+
+			/* pick the anon or file list, inactive or active */
+			lru = type * LRU_INACTIVE_FILE;
+			if (lru_gen_is_active(lruvec, gen))
+				lru += LRU_ACTIVE;
+			__update_lru_size(lruvec, lru, zone, nr);
+		}
+	}
+}
+
+/*
+ * ->test_walk() callback, also called directly by get_next_vma(): return
+ * nonzero if walk->vma should not be scanned. @start and @end are unused.
+ */
+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct address_space *mapping;
+	struct vm_area_struct *vma = walk->vma;
+	struct lru_gen_mm_walk *priv = walk->private;
+
+	if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma))
+		return true;
+
+	if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
+		return true;
+
+	if (vma == get_gate_vma(vma->vm_mm))
+		return true;
+
+	/* anon memory is only of interest if it can be swapped */
+	if (vma_is_anonymous(vma))
+		return !priv->can_swap;
+
+	if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
+		return true;
+
+	mapping = vma->vm_file->f_mapping;
+	if (mapping_unevictable(mapping))
+		return true;
+
+	/* shmem is treated like anon */
+	if (shmem_mapping(mapping))
+		return !priv->can_swap;
+
+	/* check readpage to exclude special mappings like dax, etc. */
+	return !mapping->a_ops->readpage;
+}
+
+/*
+ * Some userspace memory allocators map many single-page VMAs. Instead of
+ * returning to the PGD table for each of such VMAs, finish an entire PMD
+ * table to reduce zigzags and improve cache performance.
+ *
+ * @mask: selects the address bits that identify the current page table
+ * @size: granularity that *@end is rounded up to before searching
+ * @start, @end: on entry, the range just walked; on success, the next range
+ *
+ * Advance walk->vma to the next VMA that is not rejected by should_skip_vma()
+ * and that begins under the same table as *@start. Return true and update
+ * *@start and *@end if there is one; return false otherwise.
+ */
+static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size,
+			 unsigned long *start, unsigned long *end)
+{
+	unsigned long next = round_up(*end, size);
+
+	VM_BUG_ON(mask & size);
+	VM_BUG_ON(*start >= *end);
+	VM_BUG_ON((next & mask) != (*start & mask));
+
+	while (walk->vma) {
+		/* this VMA ends at or before the point reached so far */
+		if (next >= walk->vma->vm_end) {
+			walk->vma = walk->vma->vm_next;
+			continue;
+		}
+
+		/* the next candidate starts under a different table */
+		if ((next & mask) != (walk->vma->vm_start & mask))
+			return false;
+
+		if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) {
+			walk->vma = walk->vma->vm_next;
+			continue;
+		}
+
+		*start = max(next, walk->vma->vm_start);
+		/* one past the last address covered by the current table */
+		next = (next | ~mask) + 1;
+		/* rounded-up boundaries can wrap to 0 */
+		*end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end;
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Return true if, out of @total scanned PTEs, @young were young often enough
+ * to average at least one per cacheline. The number of PTEs per cacheline is
+ * clamped to the range [2, 8].
+ */
+static bool suitable_to_scan(int total, int young)
+{
+	int ptes_per_line = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
+
+	return total <= young * ptes_per_line;
+}
+
+/*
+ * Scan the PTEs mapping [@start, @end) under @pmd, and under any following
+ * VMAs that get_next_vma() finds within the same PTE table. For each young PTE
+ * that maps a folio of the node and memcg under reclaim, clear the accessed
+ * bit and move the folio to the generation of priv->max_seq.
+ *
+ * Return false if the page table lock is contended; otherwise return whether
+ * the share of young PTEs makes this table suitable_to_scan() again.
+ */
+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
+			   struct mm_walk *walk)
+{
+	int i;
+	pte_t *pte;
+	spinlock_t *ptl;
+	unsigned long addr;
+	int total = 0;
+	int young = 0;
+	struct lru_gen_mm_walk *priv = walk->private;
+	struct mem_cgroup *memcg = lruvec_memcg(priv->lruvec);
+	struct pglist_data *pgdat = lruvec_pgdat(priv->lruvec);
+	int old_gen, new_gen = lru_gen_from_seq(priv->max_seq);
+
+	VM_BUG_ON(pmd_leaf(*pmd));
+
+	/* do not wait for a contended lock; give up on this table instead */
+	ptl = pte_lockptr(walk->mm, pmd);
+	if (!spin_trylock(ptl))
+		return false;
+
+	arch_enter_lazy_mmu_mode();
+
+	/* map from the first entry so that pte[i] can be indexed by pte_index() */
+	pte = pte_offset_map(pmd, start & PMD_MASK);
+restart:
+	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+		struct folio *folio;
+		unsigned long pfn = pte_pfn(pte[i]);
+
+		VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end);
+
+		total++;
+		priv->mm_stats[MM_PTE_TOTAL]++;
+
+		if (!pte_present(pte[i]) || is_zero_pfn(pfn))
+			continue;
+
+		if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
+			continue;
+
+		if (!pte_young(pte[i])) {
+			priv->mm_stats[MM_PTE_OLD]++;
+			continue;
+		}
+
+		VM_BUG_ON(!pfn_valid(pfn));
+		/* ignore pages outside the node under reclaim */
+		if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+			continue;
+
+		folio = pfn_folio(pfn);
+		if (folio_nid(folio) != pgdat->node_id)
+			continue;
+
+		/* ignore pages charged to a memcg other than the one under reclaim */
+		if (folio_memcg_rcu(folio) != memcg)
+			continue;
+
+		if (!ptep_test_and_clear_young(walk->vma, addr, pte + i))
+			continue;
+
+		young++;
+		priv->mm_stats[MM_PTE_YOUNG]++;
+
+		/* carry the PTE dirty bit over to the folio */
+		if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+		      !folio_test_swapcache(folio)))
+			folio_mark_dirty(folio);
+
+		old_gen = folio_update_gen(folio, new_gen);
+		if (old_gen >= 0 && old_gen != new_gen)
+			update_batch_size(priv, folio, old_gen, new_gen);
+	}
+
+	/* the table is not finished: continue with the next VMA under it */
+	if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end))
+		goto restart;
+
+	pte_unmap(pte);
+
+	arch_leave_lazy_mmu_mode();
+	spin_unlock(ptl);
+
+	return suitable_to_scan(total, young);
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+/*
+ * Batch PMD entries whose accessed bit needs clearing and process the batch
+ * under the PMD lock.
+ *
+ * @next:  address of the entry to add, or -1 to flush what has been batched
+ * @start: address of the first batched entry, or -1 if the batch is empty
+ *
+ * The first call of a batch only records @next in *@start. A later entry that
+ * is 1 to MIN_LRU_BATCH entries past *@start is recorded as a bit in
+ * priv->bitmap. Any other call processes the entry at @next (if not -1), the
+ * entry at *@start and every entry recorded in the bitmap, then empties the
+ * batch. If the PMD lock is contended, the batch is dropped unprocessed.
+ */
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
+				  struct mm_walk *walk, unsigned long *start)
+{
+	int i;
+	pmd_t *pmd;
+	spinlock_t *ptl;
+	struct lru_gen_mm_walk *priv = walk->private;
+	struct mem_cgroup *memcg = lruvec_memcg(priv->lruvec);
+	struct pglist_data *pgdat = lruvec_pgdat(priv->lruvec);
+	int old_gen, new_gen = lru_gen_from_seq(priv->max_seq);
+
+	VM_BUG_ON(pud_leaf(*pud));
+
+	/* try to batch at most 1+MIN_LRU_BATCH+1 entries */
+	if (*start == -1) {
+		*start = next;
+		return;
+	}
+
+	/* offset of @next from the first batched entry; 0 means flush */
+	i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
+	if (i && i <= MIN_LRU_BATCH) {
+		__set_bit(i - 1, priv->bitmap);
+		return;
+	}
+
+	/* pmd[0] is the first batched entry from here on */
+	pmd = pmd_offset(pud, *start);
+
+	ptl = pmd_lockptr(walk->mm, pmd);
+	if (!spin_trylock(ptl))
+		goto done;
+
+	arch_enter_lazy_mmu_mode();
+
+	do {
+		struct folio *folio;
+		unsigned long pfn = pmd_pfn(pmd[i]);
+		unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
+
+		VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end);
+
+		if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i]))
+			goto next;
+
+		if (WARN_ON_ONCE(pmd_devmap(pmd[i])))
+			goto next;
+
+		/* a non-leaf entry: only its accessed bit, if any, is cleared */
+		if (!pmd_trans_huge(pmd[i])) {
+			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+				pmdp_test_and_clear_young(vma, addr, pmd + i);
+			goto next;
+		}
+
+		VM_BUG_ON(!pfn_valid(pfn));
+		/* ignore pages outside the node under reclaim */
+		if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+			goto next;
+
+		folio = pfn_folio(pfn);
+		if (folio_nid(folio) != pgdat->node_id)
+			goto next;
+
+		/* ignore pages charged to a memcg other than the one under reclaim */
+		if (folio_memcg_rcu(folio) != memcg)
+			goto next;
+
+		if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+			goto next;
+
+		priv->mm_stats[MM_PTE_YOUNG]++;
+
+		/* carry the PMD dirty bit over to the folio */
+		if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
+		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+		      !folio_test_swapcache(folio)))
+			folio_mark_dirty(folio);
+
+		old_gen = folio_update_gen(folio, new_gen);
+		if (old_gen >= 0 && old_gen != new_gen)
+			update_batch_size(priv, folio, old_gen, new_gen);
+next:
+		/*
+		 * After the entry beyond the bitmap, go back to entry 0; after
+		 * that, follow the bitmap, where bit n stands for entry n + 1.
+		 */
+		i = i > MIN_LRU_BATCH ? 0 :
+		    find_next_bit(priv->bitmap, MIN_LRU_BATCH, i) + 1;
+	} while (i <= MIN_LRU_BATCH);
+
+	arch_leave_lazy_mmu_mode();
+	spin_unlock(ptl);
+done:
+	/* empty the batch */
+	*start = -1;
+	bitmap_zero(priv->bitmap, MIN_LRU_BATCH);
+}
+#else
+/* No PMD entry has an accessed bit to clear in this configuration. */
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
+				  struct mm_walk *walk, unsigned long *start)
+{
+}
+#endif
+
+/*
+ * Walk the PMD entries mapping [@start, @end) under @pud, and under any
+ * following VMAs that get_next_vma() finds within the same PMD table.
+ *
+ * Huge entries and, where supported, young non-leaf entries are queued for
+ * walk_pmd_range_locked(). PTE tables are scanned by walk_pte_range() if this
+ * is a full scan or the Bloom filter of priv->max_seq contains them; those
+ * that prove worth scanning are added to the Bloom filter of the next
+ * generation.
+ */
+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
+			   struct mm_walk *walk)
+{
+	int i;
+	pmd_t *pmd;
+	unsigned long next;
+	unsigned long addr;
+	struct vm_area_struct *vma;
+	/* first address batched for walk_pmd_range_locked(); -1 means none */
+	unsigned long pos = -1;
+	struct lru_gen_mm_walk *priv = walk->private;
+
+	VM_BUG_ON(pud_leaf(*pud));
+
+	/*
+	 * Finish an entire PMD in two passes: the first only reaches to PTE
+	 * tables to avoid taking the PMD lock; the second, if necessary, takes
+	 * the PMD lock to clear the accessed bit in PMD entries.
+	 */
+	pmd = pmd_offset(pud, start & PUD_MASK);
+restart:
+	/* walk_pte_range() may call get_next_vma() */
+	vma = walk->vma;
+	for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
+		/* work on a snapshot; the PMD lock is not held in this pass */
+		pmd_t val = pmd_read_atomic(pmd + i);
+
+		/* for pmd_read_atomic() */
+		barrier();
+
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(val)) {
+			priv->mm_stats[MM_PTE_TOTAL]++;
+			continue;
+		}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		if (pmd_trans_huge(val)) {
+			unsigned long pfn = pmd_pfn(val);
+			struct pglist_data *pgdat = lruvec_pgdat(priv->lruvec);
+
+			/* a huge entry is counted in the PTE stats */
+			priv->mm_stats[MM_PTE_TOTAL]++;
+
+			if (is_huge_zero_pmd(val))
+				continue;
+
+			if (!pmd_young(val)) {
+				priv->mm_stats[MM_PTE_OLD]++;
+				continue;
+			}
+
+			/* ignore pages outside the node under reclaim */
+			if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+				continue;
+
+			walk_pmd_range_locked(pud, addr, vma, walk, &pos);
+			continue;
+		}
+#endif
+		priv->mm_stats[MM_PMD_TOTAL]++;
+
+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
+		/* nothing under an old non-leaf entry has been accessed */
+		if (!pmd_young(val))
+			continue;
+
+		walk_pmd_range_locked(pud, addr, vma, walk, &pos);
+#endif
+		/* outside a full scan, only visit tables the Bloom filter knows */
+		if (!priv->full_scan && !test_bloom_filter(priv->lruvec, priv->max_seq, pmd + i))
+			continue;
+
+		priv->mm_stats[MM_PMD_FOUND]++;
+
+		if (!walk_pte_range(&val, addr, next, walk))
+			continue;
+
+		priv->mm_stats[MM_PMD_ADDED]++;
+
+		/* carry over to the next generation */
+		update_bloom_filter(priv->lruvec, priv->max_seq + 1, pmd + i);
+	}
+
+	/* flush whatever is still batched */
+	walk_pmd_range_locked(pud, -1, vma, walk, &pos);
+
+	/* the table is not finished: continue with the next VMA under it */
+	if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end))
+		goto restart;
+}
+
+/*
+ * ->p4d_entry() callback: walk the PUD entries mapping [@start, @end) under
+ * @p4d, and under any following VMAs that get_next_vma() finds within the same
+ * PUD table, stopping early once MAX_LRU_BATCH folios have been batched.
+ *
+ * Always return -EAGAIN after storing in priv->next_addr the address at which
+ * walk_mm() should resume, or 0 if there is nothing left to walk.
+ */
+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
+			  struct mm_walk *walk)
+{
+	int i;
+	pud_t *pud;
+	unsigned long addr;
+	unsigned long next;
+	struct lru_gen_mm_walk *priv = walk->private;
+
+	VM_BUG_ON(p4d_leaf(*p4d));
+
+	pud = pud_offset(p4d, start & P4D_MASK);
+restart:
+	for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
+		pud_t val = READ_ONCE(pud[i]);
+
+		next = pud_addr_end(addr, end);
+
+		if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
+			continue;
+
+		walk_pmd_range(&val, addr, next, walk);
+
+		/* enough is batched: resume after the PUD entry just walked */
+		if (priv->batched >= MAX_LRU_BATCH) {
+			end = (addr | ~PUD_MASK) + 1;
+			goto done;
+		}
+	}
+
+	/* the table is not finished: continue with the next VMA under it */
+	if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end))
+		goto restart;
+
+	end = round_up(end, P4D_SIZE);
+done:
+	/* rounded-up boundaries can wrap to 0 */
+	priv->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0;
+
+	return -EAGAIN;
+}
+
+/*
+ * Walk the page tables of @mm on behalf of @lruvec, one batch at a time.
+ *
+ * Each round resumes at walk->next_addr, which walk_pud_range() updates, and
+ * flushes the batched page count deltas to @lruvec. The loop ends when a round
+ * does not return -EAGAIN, when there is no address left to resume from, when
+ * @mm becomes an OOM victim, or when the memcg lock cannot be taken. A round
+ * in which mmap_lock cannot be taken for read also ends it.
+ */
+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+	static const struct mm_walk_ops mm_walk_ops = {
+		.test_walk = should_skip_vma,
+		.p4d_entry = walk_pud_range,
+	};
+
+	int err;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+	walk->next_addr = FIRST_USER_ADDRESS;
+
+	do {
+		/* stays -EBUSY, which ends the loop, if mmap_lock is contended */
+		err = -EBUSY;
+
+		/* folio_update_gen() requires stable folio_memcg() */
+		if (!mem_cgroup_trylock_pages(memcg))
+			break;
+
+		/* the caller might be holding the lock for write */
+		if (mmap_read_trylock(mm)) {
+			unsigned long start = walk->next_addr;
+			unsigned long end = mm->highest_vm_end;
+
+			err = walk_page_range(mm, start, end, &mm_walk_ops, walk);
+
+			mmap_read_unlock(mm);
+
+			/* apply the batched deltas under the LRU lock */
+			if (walk->batched) {
+				spin_lock_irq(&lruvec->lru_lock);
+				reset_batch_size(lruvec, walk);
+				spin_unlock_irq(&lruvec->lru_lock);
+			}
+		}
+
+		mem_cgroup_unlock_pages();
+
+		cond_resched();
+	} while (err == -EAGAIN && walk->next_addr && !mm_is_oom_victim(mm));
+}
+
+/*
+ * Return the mm walk data installed in current->reclaim_state if there is any;
+ * otherwise try to allocate a zeroed one. May return NULL.
+ */
+static struct lru_gen_mm_walk *alloc_mm_walk(void)
+{
+	struct reclaim_state *rs = current->reclaim_state;
+
+	if (rs && rs->mm_walk)
+		return rs->mm_walk;
+
+	return kzalloc(sizeof(struct lru_gen_mm_walk),
+		       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+}
+
+/*
+ * Counterpart of alloc_mm_walk(): free @walk unless mm walk data is installed
+ * in current->reclaim_state, in which case @walk was not allocated.
+ */
+static void free_mm_walk(struct lru_gen_mm_walk *walk)
+{
+	struct reclaim_state *rs = current->reclaim_state;
+
+	if (rs && rs->mm_walk)
+		return;
+
+	kfree(walk);
+}
+
 static void inc_min_seq(struct lruvec *lruvec)
 {
 	int type;
@@ -3272,7 +4112,7 @@  static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
 	return success;
 }
 
-static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
+static void inc_max_seq(struct lruvec *lruvec)
 {
 	int prev, next;
 	int type, zone;
@@ -3282,9 +4122,6 @@  static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
 
 	VM_BUG_ON(!seq_is_valid(lruvec));
 
-	if (max_seq != lrugen->max_seq)
-		goto unlock;
-
 	inc_min_seq(lruvec);
 
 	/*
@@ -3316,10 +4153,72 @@  static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
 
 	/* make sure preceding modifications appear */
 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
-unlock:
+
 	spin_unlock_irq(&lruvec->lru_lock);
 }
 
+/*
+ * Try to produce a new generation for @lruvec, i.e., to increment max_seq,
+ * walking page tables first where that is possible.
+ *
+ * @max_seq:   the max_seq the caller observed; must not be ahead of the
+ *             current one
+ * @sc:        reclaim context; sc->priority decides whether to wait below
+ * @can_swap:  whether anon (and shmem) memory is worth scanning
+ * @full_scan: scan every mm_struct and ignore the Bloom filters
+ *
+ * Return true if max_seq has moved past @max_seq, whether by this call or by
+ * another walker; false otherwise.
+ */
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+			       struct scan_control *sc, bool can_swap, bool full_scan)
+{
+	bool success;
+	struct lru_gen_mm_walk *walk;
+	struct mm_struct *mm = NULL;
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+	VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq));
+
+	/*
+	 * If the hardware doesn't automatically set the accessed bit, fallback
+	 * to lru_gen_look_around(), which only clears the accessed bit in a
+	 * handful of PTEs. Spreading the work out over a period of time usually
+	 * is less efficient, but it avoids bursty page faults.
+	 */
+	if (!full_scan && !arch_has_hw_pte_young()) {
+		success = iterate_mm_list_nowalk(lruvec, max_seq);
+		goto done;
+	}
+
+	/* without walk data, fall back to not walking page tables */
+	walk = alloc_mm_walk();
+	if (!walk) {
+		success = iterate_mm_list_nowalk(lruvec, max_seq);
+		goto done;
+	}
+
+	walk->lruvec = lruvec;
+	walk->max_seq = max_seq;
+	walk->can_swap = can_swap;
+	walk->full_scan = full_scan;
+
+	/* walk one mm_struct at a time until the list hands out no more */
+	do {
+		success = iterate_mm_list(lruvec, walk, &mm);
+		if (mm)
+			walk_mm(lruvec, mm, walk);
+
+		cond_resched();
+	} while (mm);
+
+	free_mm_walk(walk);
+done:
+	if (!success) {
+		/*
+		 * Someone else is finishing the iteration. Outside kswapd and
+		 * at priority 0, wait for max_seq to move.
+		 */
+		if (!current_is_kswapd() && !sc->priority)
+			wait_event_killable(lruvec->mm_state.wait,
+					    max_seq < READ_ONCE(lrugen->max_seq));
+
+		return max_seq < READ_ONCE(lrugen->max_seq);
+	}
+
+	VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq));
+
+	inc_max_seq(lruvec);
+	/* either this sees any waiters or they will see updated max_seq */
+	if (wq_has_sleeper(&lruvec->mm_state.wait))
+		wake_up_all(&lruvec->mm_state.wait);
+
+	wakeup_flusher_threads(WB_REASON_VMSCAN);
+
+	return true;
+}
+
 static long get_nr_evictable(struct lruvec *lruvec, unsigned long max_seq,
 			     unsigned long *min_seq, bool can_swap, bool *need_aging)
 {
@@ -3401,7 +4300,7 @@  static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 		nr_to_scan++;
 
 	if (nr_to_scan && need_aging && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
-		inc_max_seq(lruvec, max_seq);
+		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
 }
 
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
@@ -3410,6 +4309,8 @@  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 
 	VM_BUG_ON(!current_is_kswapd());
 
+	current->reclaim_state->mm_walk = &pgdat->mm_walk;
+
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
@@ -3418,11 +4319,16 @@  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 
 		cond_resched();
 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+	current->reclaim_state->mm_walk = NULL;
 }
 
 /*
  * This function exploits spatial locality when shrink_page_list() walks the
  * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ * If the scan was cacheline-efficient, it adds the PMD entry pointing
+ * to the PTE table to the Bloom filter. This process is a feedback loop from
+ * the eviction to the aging.
  */
 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
@@ -3431,6 +4337,8 @@  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	unsigned long start;
 	unsigned long end;
 	unsigned long addr;
+	struct lru_gen_mm_walk *walk;
+	int young = 0;
 	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
 	struct folio *folio = pfn_folio(pvmw->pfn);
 	struct mem_cgroup *memcg = folio_memcg(folio);
@@ -3492,6 +4400,8 @@  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 		if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
 			continue;
 
+		young++;
+
 		if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
 		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
 		      !folio_test_swapcache(folio)))
@@ -3507,7 +4417,13 @@  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	arch_leave_lazy_mmu_mode();
 	rcu_read_unlock();
 
-	if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+	/* feedback from rmap walkers to page table walkers */
+	if (suitable_to_scan(i, young))
+		update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+
+	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
+
+	if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
 		for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
 			folio = pfn_folio(pte_pfn(pte[i]));
 			folio_activate(folio);
@@ -3519,8 +4435,10 @@  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	if (!mem_cgroup_trylock_pages(memcg))
 		return;
 
-	spin_lock_irq(&lruvec->lru_lock);
-	new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+	if (!walk) {
+		spin_lock_irq(&lruvec->lru_lock);
+		new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+	}
 
 	for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
 		folio = pfn_folio(pte_pfn(pte[i]));
@@ -3531,10 +4449,14 @@  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 		if (old_gen < 0 || old_gen == new_gen)
 			continue;
 
-		lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+		if (walk)
+			update_batch_size(walk, folio, old_gen, new_gen);
+		else
+			lru_gen_update_size(lruvec, folio, old_gen, new_gen);
 	}
 
-	spin_unlock_irq(&lruvec->lru_lock);
+	if (!walk)
+		spin_unlock_irq(&lruvec->lru_lock);
 
 	mem_cgroup_unlock_pages();
 }
@@ -3801,6 +4723,7 @@  static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	struct folio *folio;
 	enum vm_event_item item;
 	struct reclaim_stat stat;
+	struct lru_gen_mm_walk *walk;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
@@ -3840,6 +4763,10 @@  static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 
 	move_pages_to_lru(lruvec, &list);
 
+	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
+	if (walk && walk->batched)
+		reset_batch_size(lruvec, walk);
+
 	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, reclaimed);
@@ -3894,20 +4821,25 @@  static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
 		return 0;
 	}
 
-	inc_max_seq(lruvec, max_seq);
+	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
+		return nr_to_scan;
 
-	return nr_to_scan;
+	return min_seq[LRU_GEN_FILE] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
 }
 
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	struct blk_plug plug;
 	long scanned = 0;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
 	lru_add_drain();
 
 	blk_start_plug(&plug);
 
+	if (current_is_kswapd())
+		current->reclaim_state->mm_walk = &pgdat->mm_walk;
+
 	while (true) {
 		int delta;
 		int swappiness;
@@ -3935,6 +4867,9 @@  static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 		cond_resched();
 	}
 
+	if (current_is_kswapd())
+		current->reclaim_state->mm_walk = NULL;
+
 	blk_finish_plug(&plug);
 }
 
@@ -3951,15 +4886,21 @@  void lru_gen_init_lruvec(struct lruvec *lruvec)
 
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+
+	lruvec->mm_state.seq = MIN_NR_GENS;
+	init_waitqueue_head(&lruvec->mm_state.wait);
 }
 
 #ifdef CONFIG_MEMCG
 void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
+	INIT_LIST_HEAD(&memcg->mm_list.fifo);
+	spin_lock_init(&memcg->mm_list.lock);
 }
 
 void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 {
+	int i;
 	int nid;
 
 	for_each_node(nid) {
@@ -3967,6 +4908,11 @@  void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 
 		VM_BUG_ON(memchr_inv(lruvec->lrugen.nr_pages, 0,
 				     sizeof(lruvec->lrugen.nr_pages)));
+
+		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+			bitmap_free(lruvec->mm_state.filters[i]);
+			lruvec->mm_state.filters[i] = NULL;
+		}
 	}
 }
 #endif
@@ -3975,6 +4921,7 @@  static int __init init_lru_gen(void)
 {
 	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+	BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
 
 	return 0;
 };