diff mbox series

[v2,1/5] mm: add a new parameter `node` to `get/add/inc/dec_mm_counter`

Message ID 20220708082129.80115-2-ligang.bdlg@bytedance.com (mailing list archive)
State New
Headers show
Series [v2,1/5] mm: add a new parameter `node` to `get/add/inc/dec_mm_counter` | expand

Commit Message

Gang Li July 8, 2022, 8:21 a.m. UTC
Add a new parameter `node` to mm_counter for counting per node rss. Use
page_to_nid(page) to get node id from page.
before:
    dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
after:
    dec_mm_counter(vma->vm_mm, MM_ANONPAGES, page_to_nid(page));

If a page is swapped out, it no longer exists on any numa node.
(Or it is swapped from disk into a specific numa node.)
Thus when we call *_mm_counter(MM_SWAPENTS), the `node` field should be
`NUMA_NO_NODE`. For example:
```
swap_out(){
    dec_mm_counter(vma->vm_mm, MM_ANONPAGES, page_to_nid(page));
    inc_mm_counter(vma->vm_mm, MM_SWAPENTS, NUMA_NO_NODE);
}
```

Pages can be migrated between nodes. `remove_migration_pte`
must call `add_mm_counter` now.

Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
---
 arch/s390/mm/pgtable.c        |  4 +-
 fs/exec.c                     |  2 +-
 fs/proc/task_mmu.c            | 14 +++---
 include/linux/mm.h            | 14 +++---
 include/linux/mm_types_task.h | 10 ++++
 kernel/events/uprobes.c       |  6 +--
 mm/huge_memory.c              | 13 ++---
 mm/khugepaged.c               |  4 +-
 mm/ksm.c                      |  2 +-
 mm/madvise.c                  |  2 +-
 mm/memory.c                   | 94 +++++++++++++++++++++++------------
 mm/migrate.c                  |  4 ++
 mm/migrate_device.c           |  2 +-
 mm/oom_kill.c                 | 16 +++---
 mm/rmap.c                     | 19 ++++---
 mm/swapfile.c                 |  6 +--
 mm/userfaultfd.c              |  2 +-
 17 files changed, 132 insertions(+), 82 deletions(-)

Comments

kernel test robot July 12, 2022, 6:33 a.m. UTC | #1
Greeting,

FYI, we noticed the following commit (built with gcc-11):

commit: c20f7bacef67af52980742f564d2ddb9519e6b18 ("[PATCH v2 1/5] mm: add a new parameter `node` to `get/add/inc/dec_mm_counter`")
url: https://github.com/intel-lab-lkp/linux/commits/Gang-Li/mm-oom-Introduce-per-numa-node-oom-for-CONSTRAINT_-MEMORY_POLICY-CPUSET/20220708-162505
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/lkml/20220708082129.80115-2-ligang.bdlg@bytedance.com

in testcase: trinity
version: trinity-i386-4d2343bd-1_20200320
with following parameters:

	runtime: 300s
	group: group-00

test-description: Trinity is a linux system call fuzz tester.
test-url: http://codemonkey.org.uk/projects/trinity/


on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):



If you fix the issue, kindly add following tag
Reported-by: kernel test robot <oliver.sang@intel.com>


[  153.358510][ T3796] WARNING: possible circular locking dependency detected
[  153.362349][ T3796] 5.19.0-rc4-00459-gc20f7bacef67 #1 Tainted: G                 N
[  153.366427][ T3796] ------------------------------------------------------
[  153.370459][ T3796] trinity-c0/3796 is trying to acquire lock:
[ 153.374381][ T3796] ffffffff90b85a80 (fs_reclaim){+.+.}-{0:0}, at: __kmalloc (include/linux/sched/mm.h:272 mm/slab.h:723 mm/slub.c:3157 mm/slub.c:3251 mm/slub.c:4442) 
[  153.378606][ T3796]
[  153.378606][ T3796] but task is already holding lock:
[ 153.385910][ T3796] ffff88817b7adbe8 (&mapping->i_mmap_rwsem){++++}-{3:3}, at: unmap_mapping_range (mm/memory.c:3616 mm/memory.c:3654) 
[  153.390178][ T3796]
[  153.390178][ T3796] which lock already depends on the new lock.
[  153.390178][ T3796]
[  153.400120][ T3796]
[  153.400120][ T3796] the existing dependency chain (in reverse order) is:
[  153.406931][ T3796]
[  153.406931][ T3796] -> #1 (&mapping->i_mmap_rwsem){++++}-{3:3}:
[ 153.413287][ T3796] __lock_acquire (kernel/locking/lockdep.c:5053) 
[ 153.416541][ T3796] lock_acquire (kernel/locking/lockdep.c:466 kernel/locking/lockdep.c:5667 kernel/locking/lockdep.c:5630) 
[ 153.419765][ T3796] down_write (include/linux/instrumented.h:101 include/linux/atomic/atomic-instrumented.h:1779 kernel/locking/rwsem.c:255 kernel/locking/rwsem.c:1286 kernel/locking/rwsem.c:1296 kernel/locking/rwsem.c:1543) 
[ 153.422840][ T3796] dma_resv_lockdep (include/linux/fs.h:462 drivers/dma-buf/dma-resv.c:755) 
[ 153.430008][ T3796] do_one_initcall (init/main.c:1300) 
[ 153.433077][ T3796] do_initcalls (init/main.c:1374 init/main.c:1391) 
[ 153.436058][ T3796] kernel_init_freeable (init/main.c:1621) 
[ 153.438917][ T3796] kernel_init (init/main.c:1508) 
[ 153.441729][ T3796] ret_from_fork (arch/x86/entry/entry_64.S:308) 
[  153.444438][ T3796]
[  153.444438][ T3796] -> #0 (fs_reclaim){+.+.}-{0:0}:
[ 153.449538][ T3796] check_prev_add (kernel/locking/lockdep.c:3096) 
[ 153.452347][ T3796] validate_chain (kernel/locking/lockdep.c:3215 kernel/locking/lockdep.c:3829) 
[ 153.455054][ T3796] __lock_acquire (kernel/locking/lockdep.c:5053) 
[ 153.460465][ T3796] lock_acquire (kernel/locking/lockdep.c:466 kernel/locking/lockdep.c:5667 kernel/locking/lockdep.c:5630) 
[ 153.463055][ T3796] fs_reclaim_acquire (mm/page_alloc.c:4674 mm/page_alloc.c:4687) 
[ 153.465635][ T3796] __kmalloc (include/linux/sched/mm.h:272 mm/slab.h:723 mm/slub.c:3157 mm/slub.c:3251 mm/slub.c:4442) 
[ 153.468003][ T3796] zap_pte_range (include/linux/slab.h:640 include/linux/slab.h:671 mm/memory.c:1443) 
[ 153.470465][ T3796] zap_pmd_range+0x218/0x600 
[ 153.472982][ T3796] unmap_page_range (mm/memory.c:1642 mm/memory.c:1663 mm/memory.c:1684) 
[ 153.475441][ T3796] zap_page_range_single (include/linux/mmu_notifier.h:481 mm/memory.c:1828) 
[ 153.477934][ T3796] unmap_mapping_range (mm/memory.c:3545 mm/memory.c:3617 mm/memory.c:3654) 
[ 153.480478][ T3796] shmem_fallocate (mm/shmem.c:2696) 
[ 153.482951][ T3796] vfs_fallocate (fs/open.c:323) 
[ 153.485354][ T3796] madvise_vma_behavior (mm/madvise.c:979 mm/madvise.c:1000) 
[ 153.487881][ T3796] do_madvise (mm/page_io.c:401 (discriminator 3)) 
[ 153.490238][ T3796] __ia32_sys_madvise (mm/madvise.c:1421) 
[ 153.492660][ T3796] __do_fast_syscall_32 (arch/x86/entry/common.c:112 arch/x86/entry/common.c:178) 
[ 153.495107][ T3796] do_fast_syscall_32 (arch/x86/entry/common.c:203) 
[ 153.499080][ T3796] entry_SYSENTER_compat_after_hwframe (arch/x86/entry/entry_64_compat.S:117) 
[  153.501696][ T3796]
[  153.501696][ T3796] other info that might help us debug this:
[  153.501696][ T3796]
[  153.508293][ T3796]  Possible unsafe locking scenario:
[  153.508293][ T3796]
[  153.512679][ T3796]        CPU0                    CPU1
[  153.515107][ T3796]        ----                    ----
[  153.517452][ T3796]   lock(&mapping->i_mmap_rwsem);
[  153.519875][ T3796]                                lock(fs_reclaim);
[  153.522376][ T3796]                                lock(&mapping->i_mmap_rwsem);
[  153.524846][ T3796]   lock(fs_reclaim);
[  153.527244][ T3796]
[  153.527244][ T3796]  *** DEADLOCK ***
[  153.527244][ T3796]
[  153.539913][ T3796] 3 locks held by trinity-c0/3796:
[ 153.542060][ T3796] #0: ffff888100198448 (sb_writers#6){.+.+}-{0:0}, at: madvise_vma_behavior (mm/madvise.c:979 mm/madvise.c:1000) 
[ 153.544779][ T3796] #1: ffff88817b7ad998 (&sb->s_type->i_mutex_key#9){+.+.}-{3:3}, at: shmem_fallocate (mm/shmem.c:2679) 
[ 153.550190][ T3796] #2: ffff88817b7adbe8 (&mapping->i_mmap_rwsem){++++}-{3:3}, at: unmap_mapping_range (mm/memory.c:3616 mm/memory.c:3654) 
[  153.553014][ T3796]
[  153.553014][ T3796] stack backtrace:
[  153.556815][ T3796] CPU: 0 PID: 3796 Comm: trinity-c0 Tainted: G                 N 5.19.0-rc4-00459-gc20f7bacef67 #1
[  153.559828][ T3796] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014
[  153.562943][ T3796] Call Trace:
[  153.565082][ T3796]  <TASK>
[ 153.567138][ T3796] dump_stack_lvl (lib/dump_stack.c:107 (discriminator 4)) 
[ 153.571950][ T3796] check_noncircular (kernel/locking/lockdep.c:2175) 
[ 153.574391][ T3796] ? print_circular_bug (kernel/locking/lockdep.c:2154) 
[ 153.576822][ T3796] ? perf_output_begin (kernel/events/ring_buffer.c:261 kernel/events/ring_buffer.c:283) 
[ 153.579355][ T3796] ? perf_event_update_userpage (include/linux/rcupdate.h:274 include/linux/rcupdate.h:728 kernel/events/core.c:5860) 
[ 153.582038][ T3796] check_prev_add (kernel/locking/lockdep.c:3096) 
[ 153.584430][ T3796] ? unwind_next_frame (arch/x86/kernel/unwind_orc.c:355 arch/x86/kernel/unwind_orc.c:600) 
[ 153.586886][ T3796] validate_chain (kernel/locking/lockdep.c:3215 kernel/locking/lockdep.c:3829) 
[ 153.589168][ T3796] ? check_prev_add (kernel/locking/lockdep.c:3785) 
[ 153.591576][ T3796] ? unwind_get_return_address (arch/x86/kernel/unwind_orc.c:318 arch/x86/kernel/unwind_orc.c:313) 
[ 153.594073][ T3796] ? create_prof_cpu_mask (kernel/stacktrace.c:83) 
[ 153.596534][ T3796] ? arch_stack_walk (arch/x86/kernel/stacktrace.c:26) 
[ 153.599822][ T3796] __lock_acquire (kernel/locking/lockdep.c:5053) 
[ 153.602354][ T3796] lock_acquire (kernel/locking/lockdep.c:466 kernel/locking/lockdep.c:5667 kernel/locking/lockdep.c:5630) 
[ 153.604750][ T3796] ? __kmalloc (include/linux/sched/mm.h:272 mm/slab.h:723 mm/slub.c:3157 mm/slub.c:3251 mm/slub.c:4442) 
[ 153.607125][ T3796] ? rcu_read_unlock (include/linux/rcupdate.h:724 (discriminator 5)) 
[ 153.613644][ T3796] ? check_prev_add (kernel/locking/lockdep.c:3175) 
[ 153.616210][ T3796] fs_reclaim_acquire (mm/page_alloc.c:4674 mm/page_alloc.c:4687) 
[ 153.618603][ T3796] ? __kmalloc (include/linux/sched/mm.h:272 mm/slab.h:723 mm/slub.c:3157 mm/slub.c:3251 mm/slub.c:4442) 
[ 153.620990][ T3796] ? zap_pte_range (include/linux/slab.h:640 include/linux/slab.h:671 mm/memory.c:1443) 
[ 153.623423][ T3796] __kmalloc (include/linux/sched/mm.h:272 mm/slab.h:723 mm/slub.c:3157 mm/slub.c:3251 mm/slub.c:4442) 
[ 153.625784][ T3796] zap_pte_range (include/linux/slab.h:640 include/linux/slab.h:671 mm/memory.c:1443) 
[ 153.628074][ T3796] ? trace_hardirqs_on (kernel/trace/trace_preemptirq.c:50 (discriminator 22)) 
[ 153.630495][ T3796] ? copy_pte_range (mm/memory.c:1434) 
[ 153.632873][ T3796] zap_pmd_range+0x218/0x600 
[ 153.635329][ T3796] ? __lock_release (kernel/locking/lockdep.c:5341) 
[ 153.637622][ T3796] unmap_page_range (mm/memory.c:1642 mm/memory.c:1663 mm/memory.c:1684) 
[ 153.639995][ T3796] zap_page_range_single (include/linux/mmu_notifier.h:481 mm/memory.c:1828) 
[ 153.642445][ T3796] ? unmap_single_vma (mm/memory.c:1817) 
[ 153.644839][ T3796] ? lock_is_held_type (kernel/locking/lockdep.c:5406 kernel/locking/lockdep.c:5708) 
[ 153.647197][ T3796] ? down_read (arch/x86/include/asm/atomic64_64.h:34 include/linux/atomic/atomic-long.h:41 include/linux/atomic/atomic-instrumented.h:1280 kernel/locking/rwsem.c:171 kernel/locking/rwsem.c:176 kernel/locking/rwsem.c:244 kernel/locking/rwsem.c:1241 kernel/locking/rwsem.c:1251 kernel/locking/rwsem.c:1491) 
[ 153.649549][ T3796] ? rwsem_down_read_slowpath (kernel/locking/rwsem.c:1487) 
[ 153.652052][ T3796] ? shmem_fallocate (mm/shmem.c:2679) 
[ 153.654434][ T3796] ? __lock_release (kernel/locking/lockdep.c:5341) 
[ 153.656767][ T3796] unmap_mapping_range (mm/memory.c:3545 mm/memory.c:3617 mm/memory.c:3654) 
[ 153.659179][ T3796] ? do_raw_spin_lock (arch/x86/include/asm/atomic.h:202 include/linux/atomic/atomic-instrumented.h:543 include/asm-generic/qspinlock.h:111 kernel/locking/spinlock_debug.c:115) 
[ 153.661589][ T3796] ? __do_fault (mm/memory.c:3642) 
[ 153.663943][ T3796] ? shmem_fallocate (mm/shmem.c:2679) 
[ 153.666368][ T3796] shmem_fallocate (mm/shmem.c:2696) 
[ 153.668748][ T3796] ? check_prev_add (kernel/locking/lockdep.c:3785) 
[ 153.671195][ T3796] ? shmem_get_link (mm/shmem.c:2663) 
[ 153.673509][ T3796] ? __lock_acquire (kernel/locking/lockdep.c:5053) 
[ 153.675897][ T3796] ? lock_is_held_type (kernel/locking/lockdep.c:5406 kernel/locking/lockdep.c:5708) 
[ 153.678193][ T3796] vfs_fallocate (fs/open.c:323) 
[ 153.680381][ T3796] madvise_vma_behavior (mm/madvise.c:979 mm/madvise.c:1000) 
[ 153.682707][ T3796] ? force_shm_swapin_readahead (mm/madvise.c:993) 
[ 153.685060][ T3796] ? vm_unmapped_area (mm/mmap.c:1873) 
[ 153.687235][ T3796] ? find_held_lock (kernel/locking/lockdep.c:5156) 
[ 153.689312][ T3796] ? __task_pid_nr_ns (include/linux/rcupdate.h:274 include/linux/rcupdate.h:728 kernel/pid.c:501) 
[ 153.691439][ T3796] do_madvise (mm/page_io.c:401 (discriminator 3)) 
[ 153.693571][ T3796] ? madvise_vma_behavior (mm/madvise.c:1368) 
[ 153.695803][ T3796] ? lock_is_held_type (kernel/locking/lockdep.c:5406 kernel/locking/lockdep.c:5708) 
[ 153.697942][ T3796] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4526) 
[ 153.700274][ T3796] __ia32_sys_madvise (mm/madvise.c:1421) 
[ 153.702357][ T3796] ? trace_hardirqs_on (kernel/trace/trace_preemptirq.c:50 (discriminator 22)) 
[ 153.704454][ T3796] __do_fast_syscall_32 (arch/x86/entry/common.c:112 arch/x86/entry/common.c:178) 
[ 153.706651][ T3796] ? __do_fast_syscall_32 (arch/x86/entry/common.c:183) 
[ 153.708853][ T3796] ? __do_fast_syscall_32 (arch/x86/entry/common.c:183) 
[ 153.715186][ T3796] ? __do_fast_syscall_32 (arch/x86/entry/common.c:183) 
[ 153.717275][ T3796] ? __do_fast_syscall_32 (arch/x86/entry/common.c:183) 
[ 153.719353][ T3796] ? __do_fast_syscall_32 (arch/x86/entry/common.c:183) 
[ 153.721475][ T3796] ? __do_fast_syscall_32 (arch/x86/entry/common.c:183) 
[ 153.723630][ T3796] do_fast_syscall_32 (arch/x86/entry/common.c:203) 
[ 153.725671][ T3796] entry_SYSENTER_compat_after_hwframe (arch/x86/entry/entry_64_compat.S:117) 
[  153.727911][ T3796] RIP: 0023:0xf7f40549
[ 153.729935][ T3796] Code: 03 74 c0 01 10 05 03 74 b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d b4 26 00 00 00 00 8d b4 26 00 00 00 00
All code
========
   0:	03 74 c0 01          	add    0x1(%rax,%rax,8),%esi
   4:	10 05 03 74 b8 01    	adc    %al,0x1b87403(%rip)        # 0x1b8740d
   a:	10 06                	adc    %al,(%rsi)
   c:	03 74 b4 01          	add    0x1(%rsp,%rsi,4),%esi
  10:	10 07                	adc    %al,(%rdi)
  12:	03 74 b0 01          	add    0x1(%rax,%rsi,4),%esi
  16:	10 08                	adc    %cl,(%rax)
  18:	03 74 d8 01          	add    0x1(%rax,%rbx,8),%esi
  1c:	00 00                	add    %al,(%rax)
  1e:	00 00                	add    %al,(%rax)
  20:	00 51 52             	add    %dl,0x52(%rcx)
  23:	55                   	push   %rbp
  24:	89 e5                	mov    %esp,%ebp
  26:	0f 34                	sysenter 


To reproduce:

        # build kernel
	cd linux
	cp config-5.19.0-rc4-00459-gc20f7bacef67 .config
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
	cd <mod-install-dir>
	find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz


        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email

        # if come across any failure that blocks the test,
        # please remove ~/.lkp and /lkp dir to run from a clean state.
diff mbox series

Patch

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 4909dcd762e8..b8306765cd63 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -703,11 +703,11 @@  void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
 static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 {
 	if (!non_swap_entry(entry))
-		dec_mm_counter(mm, MM_SWAPENTS);
+		dec_mm_counter(mm, MM_SWAPENTS, NUMA_NO_NODE);
 	else if (is_migration_entry(entry)) {
 		struct page *page = pfn_swap_entry_to_page(entry);
 
-		dec_mm_counter(mm, mm_counter(page));
+		dec_mm_counter(mm, mm_counter(page), page_to_nid(page));
 	}
 	free_swap_and_cache(entry);
 }
diff --git a/fs/exec.c b/fs/exec.c
index 5f0656e10b5d..99825c06d0c2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -192,7 +192,7 @@  static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 		return;
 
 	bprm->vma_pages = pages;
-	add_mm_counter(mm, MM_ANONPAGES, diff);
+	add_mm_counter(mm, MM_ANONPAGES, diff, NUMA_NO_NODE);
 }
 
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 34d292cec79a..24d33d1011d9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -32,9 +32,9 @@  void task_mem(struct seq_file *m, struct mm_struct *mm)
 	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
-	anon = get_mm_counter(mm, MM_ANONPAGES);
-	file = get_mm_counter(mm, MM_FILEPAGES);
-	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+	anon = get_mm_counter(mm, MM_ANONPAGES, NUMA_NO_NODE);
+	file = get_mm_counter(mm, MM_FILEPAGES, NUMA_NO_NODE);
+	shmem = get_mm_counter(mm, MM_SHMEMPAGES, NUMA_NO_NODE);
 
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -55,7 +55,7 @@  void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = min(text, mm->exec_vm << PAGE_SHIFT);
 	lib = (mm->exec_vm << PAGE_SHIFT) - text;
 
-	swap = get_mm_counter(mm, MM_SWAPENTS);
+	swap = get_mm_counter(mm, MM_SWAPENTS, NUMA_NO_NODE);
 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
 	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -88,12 +88,12 @@  unsigned long task_statm(struct mm_struct *mm,
 			 unsigned long *shared, unsigned long *text,
 			 unsigned long *data, unsigned long *resident)
 {
-	*shared = get_mm_counter(mm, MM_FILEPAGES) +
-			get_mm_counter(mm, MM_SHMEMPAGES);
+	*shared = get_mm_counter(mm, MM_FILEPAGES, NUMA_NO_NODE) +
+			get_mm_counter(mm, MM_SHMEMPAGES, NUMA_NO_NODE);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->data_vm + mm->stack_vm;
-	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES, NUMA_NO_NODE);
 	return mm->total_vm;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 794ad19b57f8..84ce6e1b1252 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2026,7 +2026,7 @@  static inline bool get_user_page_fast_only(unsigned long addr,
 /*
  * per-process(per-mm_struct) statistics.
  */
-static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
+static inline unsigned long get_mm_counter(struct mm_struct *mm, int member, int node)
 {
 	long val = atomic_long_read(&mm->rss_stat.count[member]);
 
@@ -2043,21 +2043,21 @@  static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 
 void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
 
-static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
+static inline void add_mm_counter(struct mm_struct *mm, int member, long value, int node)
 {
 	long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
 
 	mm_trace_rss_stat(mm, member, count);
 }
 
-static inline void inc_mm_counter(struct mm_struct *mm, int member)
+static inline void inc_mm_counter(struct mm_struct *mm, int member, int node)
 {
 	long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
 
 	mm_trace_rss_stat(mm, member, count);
 }
 
-static inline void dec_mm_counter(struct mm_struct *mm, int member)
+static inline void dec_mm_counter(struct mm_struct *mm, int member, int node)
 {
 	long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
 
@@ -2081,9 +2081,9 @@  static inline int mm_counter(struct page *page)
 
 static inline unsigned long get_mm_rss(struct mm_struct *mm)
 {
-	return get_mm_counter(mm, MM_FILEPAGES) +
-		get_mm_counter(mm, MM_ANONPAGES) +
-		get_mm_counter(mm, MM_SHMEMPAGES);
+	return get_mm_counter(mm, MM_FILEPAGES, NUMA_NO_NODE) +
+		get_mm_counter(mm, MM_ANONPAGES, NUMA_NO_NODE) +
+		get_mm_counter(mm, MM_SHMEMPAGES, NUMA_NO_NODE);
 }
 
 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 0bb4b6da9993..32512af31721 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -36,6 +36,16 @@  enum {
 	NR_MM_COUNTERS
 };
 
+/*
+ * This macro should only be used when committing local values, as in
+ * sync_mm_rss and add_mm_rss_vec. It means: do not account per mm-counter
+ * type, only account per node in mm_stat.
+ *
+ * `MM_NO_TYPE` must be equal to `NR_MM_COUNTERS`, since we will use it in
+ * `TRACE_MM_PAGES`.
+ */
+#define MM_NO_TYPE NR_MM_COUNTERS
+
 #if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
 #define SPLIT_RSS_COUNTING
 /* per-thread cached information, */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 401bc2d24ce0..f5b0db3494a3 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,11 +184,11 @@  static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 		lru_cache_add_inactive_or_unevictable(new_page, vma);
 	} else
 		/* no new page, just dec_mm_counter for old_page */
-		dec_mm_counter(mm, MM_ANONPAGES);
+		dec_mm_counter(mm, MM_ANONPAGES, page_to_nid(old_page));
 
 	if (!PageAnon(old_page)) {
-		dec_mm_counter(mm, mm_counter_file(old_page));
-		inc_mm_counter(mm, MM_ANONPAGES);
+		dec_mm_counter(mm, mm_counter_file(old_page), page_to_nid(old_page));
+		inc_mm_counter(mm, MM_ANONPAGES, page_to_nid(new_page));
 	}
 
 	flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fd9d502aadc4..b7fd7df70e7c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -809,7 +809,7 @@  static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR, page_to_nid(page));
 		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
@@ -1220,7 +1220,7 @@  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 				pmd = pmd_swp_mkuffd_wp(pmd);
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
-		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR, page_to_nid(pmd_page(*dst_pmd)));
 		mm_inc_nr_ptes(dst_mm);
 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 		if (!userfaultfd_wp(dst_vma))
@@ -1263,7 +1263,7 @@  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
 		return -EAGAIN;
 	}
-	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR, page_to_nid(src_page));
 out_zero_page:
 	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
@@ -1753,11 +1753,12 @@  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 		if (PageAnon(page)) {
 			zap_deposited_table(tlb->mm, pmd);
-			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR, page_to_nid(page));
 		} else {
 			if (arch_needs_pgtable_deposit())
 				zap_deposited_table(tlb->mm, pmd);
-			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
+			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR,
+				       page_to_nid(page));
 		}
 
 		spin_unlock(ptl);
@@ -2143,7 +2144,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			page_remove_rmap(page, vma, true);
 			put_page(page);
 		}
-		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
+		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR, page_to_nid(page));
 		return;
 	}
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cfe231c5958f..74d4c578a91c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -687,7 +687,7 @@  static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
 			clear_user_highpage(page, address);
-			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1, page_to_nid(page));
 			if (is_zero_pfn(pte_pfn(pteval))) {
 				/*
 				 * ptl mostly unnecessary.
@@ -1469,7 +1469,7 @@  void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	/* step 3: set proper refcount and mm_counters. */
 	if (count) {
 		page_ref_sub(hpage, count);
-		add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+		add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count, page_to_nid(hpage));
 	}
 
 	/* step 4: collapse pmd */
diff --git a/mm/ksm.c b/mm/ksm.c
index 63b4b9d71597..4dc4b78d6f9b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1180,7 +1180,7 @@  static int replace_page(struct vm_area_struct *vma, struct page *page,
 		 * will get wrong values in /proc, and a BUG message in dmesg
 		 * when tearing down the mm.
 		 */
-		dec_mm_counter(mm, MM_ANONPAGES);
+		dec_mm_counter(mm, MM_ANONPAGES, page_to_nid(page));
 	}
 
 	flush_cache_page(vma, addr, pte_pfn(*ptep));
diff --git a/mm/madvise.c b/mm/madvise.c
index 851fa4e134bc..46229b70cbbe 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -715,7 +715,7 @@  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		if (current->mm == mm)
 			sync_mm_rss(mm);
 
-		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+		add_mm_counter(mm, MM_SWAPENTS, nr_swap, NUMA_NO_NODE);
 	}
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(orig_pte, ptl);
diff --git a/mm/memory.c b/mm/memory.c
index 8917bea2f0bc..bb24da767f79 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -161,6 +161,8 @@  EXPORT_SYMBOL(zero_pfn);
 
 unsigned long highest_memmap_pfn __read_mostly;
 
+static DEFINE_PER_CPU(int, percpu_numa_rss[MAX_NUMNODES]);
+
 /*
  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
  */
@@ -184,24 +186,24 @@  void sync_mm_rss(struct mm_struct *mm)
 
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
 		if (current->rss_stat.count[i]) {
-			add_mm_counter(mm, i, current->rss_stat.count[i]);
+			add_mm_counter(mm, i, current->rss_stat.count[i], NUMA_NO_NODE);
 			current->rss_stat.count[i] = 0;
 		}
 	}
 	current->rss_stat.events = 0;
 }
 
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val, int node)
 {
 	struct task_struct *task = current;
 
 	if (likely(task->mm == mm))
 		task->rss_stat.count[member] += val;
 	else
-		add_mm_counter(mm, member, val);
+		add_mm_counter(mm, member, val, node);
 }
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+#define inc_mm_counter_fast(mm, member, node) add_mm_counter_fast(mm, member, 1, node)
+#define dec_mm_counter_fast(mm, member, node) add_mm_counter_fast(mm, member, -1, node)
 
 /* sync counter once per 64 page faults */
 #define TASK_RSS_EVENTS_THRESH	(64)
@@ -214,8 +216,8 @@  static void check_sync_rss_stat(struct task_struct *task)
 }
 #else /* SPLIT_RSS_COUNTING */
 
-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+#define inc_mm_counter_fast(mm, member, node) inc_mm_counter(mm, member, node)
+#define dec_mm_counter_fast(mm, member, node) dec_mm_counter(mm, member, node)
 
 static void check_sync_rss_stat(struct task_struct *task)
 {
@@ -502,12 +504,13 @@  int __pte_alloc_kernel(pmd_t *pmd)
 	return 0;
 }
 
-static inline void init_rss_vec(int *rss)
+static inline void init_rss_vec(int *rss, int *numa_rss)
 {
 	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
+	memset(numa_rss, 0, sizeof(int) * num_possible_nodes());
 }
 
-static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
+static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss, int *numa_rss)
 {
 	int i;
 
@@ -515,7 +518,7 @@  static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 		sync_mm_rss(mm);
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (rss[i])
-			add_mm_counter(mm, i, rss[i]);
+			add_mm_counter(mm, i, rss[i], NUMA_NO_NODE);
 }
 
 /*
@@ -792,7 +795,8 @@  try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
 static unsigned long
 copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
-		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
+		struct vm_area_struct *src_vma, unsigned long addr, int *rss,
+		int *numa_rss)
 {
 	unsigned long vm_flags = dst_vma->vm_flags;
 	pte_t pte = *src_pte;
@@ -817,10 +821,12 @@  copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
 		rss[MM_SWAPENTS]++;
+		/* swap entries are not resident on any node (NUMA_NO_NODE): no numa_rss update */
 	} else if (is_migration_entry(entry)) {
 		page = pfn_swap_entry_to_page(entry);
 
 		rss[mm_counter(page)]++;
+		numa_rss[page_to_nid(page)]++;
 
 		if (!is_readable_migration_entry(entry) &&
 				is_cow_mapping(vm_flags)) {
@@ -852,6 +858,8 @@  copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 */
 		get_page(page);
 		rss[mm_counter(page)]++;
+		numa_rss[page_to_nid(page)]++;
+
 		/* Cannot fail as these pages cannot get pinned. */
 		BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));
 
@@ -912,7 +920,7 @@  copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 static inline int
 copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
-		  struct page **prealloc, struct page *page)
+		  struct page **prealloc, struct page *page, int *numa_rss)
 {
 	struct page *new_page;
 	pte_t pte;
@@ -931,6 +939,7 @@  copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 	page_add_new_anon_rmap(new_page, dst_vma, addr);
 	lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
 	rss[mm_counter(new_page)]++;
+	numa_rss[page_to_nid(new_page)]++;
 
 	/* All done, just insert the new page copy in the child */
 	pte = mk_pte(new_page, dst_vma->vm_page_prot);
@@ -949,7 +958,7 @@  copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 static inline int
 copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
-		 struct page **prealloc)
+		 struct page **prealloc, int *numa_rss)
 {
 	struct mm_struct *src_mm = src_vma->vm_mm;
 	unsigned long vm_flags = src_vma->vm_flags;
@@ -969,13 +978,15 @@  copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 			/* Page maybe pinned, we have to copy. */
 			put_page(page);
 			return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
-						 addr, rss, prealloc, page);
+						 addr, rss, prealloc, page, numa_rss);
 		}
 		rss[mm_counter(page)]++;
+		numa_rss[page_to_nid(page)]++;
 	} else if (page) {
 		get_page(page);
 		page_dup_file_rmap(page, false);
 		rss[mm_counter(page)]++;
+		numa_rss[page_to_nid(page)]++;
 	}
 
 	/*
@@ -1034,12 +1045,16 @@  copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress, ret = 0;
 	int rss[NR_MM_COUNTERS];
+	int *numa_rss;
 	swp_entry_t entry = (swp_entry_t){0};
 	struct page *prealloc = NULL;
+	numa_rss = kcalloc(num_possible_nodes(), sizeof(int), GFP_KERNEL);
+	if (unlikely(!numa_rss))
+		numa_rss = (int *)get_cpu_ptr(&percpu_numa_rss);
 
 again:
 	progress = 0;
-	init_rss_vec(rss);
+	init_rss_vec(rss, numa_rss);
 
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte) {
@@ -1072,7 +1087,7 @@  copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 			ret = copy_nonpresent_pte(dst_mm, src_mm,
 						  dst_pte, src_pte,
 						  dst_vma, src_vma,
-						  addr, rss);
+						  addr, rss, numa_rss);
 			if (ret == -EIO) {
 				entry = pte_to_swp_entry(*src_pte);
 				break;
@@ -1091,7 +1106,7 @@  copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		}
 		/* copy_present_pte() will clear `*prealloc' if consumed */
 		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
-				       addr, rss, &prealloc);
+				       addr, rss, &prealloc, numa_rss);
 		/*
 		 * If we need a pre-allocated page for this pte, drop the
 		 * locks, allocate, and try again.
@@ -1114,7 +1129,7 @@  copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap(orig_src_pte);
-	add_mm_rss_vec(dst_mm, rss);
+	add_mm_rss_vec(dst_mm, rss, numa_rss);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 
@@ -1143,6 +1158,10 @@  copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 out:
 	if (unlikely(prealloc))
 		put_page(prealloc);
+	if (unlikely(numa_rss == (int *)raw_cpu_ptr(&percpu_numa_rss)))
+		put_cpu_ptr(numa_rss);
+	else
+		kfree(numa_rss);
 	return ret;
 }
 
@@ -1415,14 +1434,18 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	struct mm_struct *mm = tlb->mm;
 	int force_flush = 0;
 	int rss[NR_MM_COUNTERS];
+	int *numa_rss;
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
 	swp_entry_t entry;
+	numa_rss = kcalloc(num_possible_nodes(), sizeof(int), GFP_KERNEL);
+	if (unlikely(!numa_rss))
+		numa_rss = (int *)get_cpu_ptr(&percpu_numa_rss);
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 again:
-	init_rss_vec(rss);
+	init_rss_vec(rss, numa_rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	pte = start_pte;
 	flush_tlb_batched_pending(mm);
@@ -1459,6 +1482,7 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 					mark_page_accessed(page);
 			}
 			rss[mm_counter(page)]--;
+			numa_rss[page_to_nid(page)]--;
 			page_remove_rmap(page, vma, false);
 			if (unlikely(page_mapcount(page) < 0))
 				print_bad_pte(vma, addr, ptent, page);
@@ -1484,6 +1508,7 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			 */
 			WARN_ON_ONCE(!vma_is_anonymous(vma));
 			rss[mm_counter(page)]--;
+			numa_rss[page_to_nid(page)]--;
 			if (is_device_private_entry(entry))
 				page_remove_rmap(page, vma, false);
 			put_page(page);
@@ -1499,13 +1524,14 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			if (!should_zap_page(details, page))
 				continue;
 			rss[mm_counter(page)]--;
+			numa_rss[page_to_nid(page)]--;
 		} else if (pte_marker_entry_uffd_wp(entry)) {
 			/* Only drop the uffd-wp marker if explicitly requested */
 			if (!zap_drop_file_uffd_wp(details))
 				continue;
 		} else if (is_hwpoison_entry(entry) ||
			   is_swapin_error_entry(entry)) {
 			if (!should_zap_cows(details))
 				continue;
 		} else {
 			/* We should have covered all the swap entry types */
@@ -1515,7 +1540,7 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	add_mm_rss_vec(mm, rss);
+	add_mm_rss_vec(mm, rss, numa_rss);
 	arch_leave_lazy_mmu_mode();
 
 	/* Do the actual TLB flush before dropping ptl */
@@ -1539,6 +1564,10 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		goto again;
 	}
 
+	if (unlikely(numa_rss == (int *)raw_cpu_ptr(&percpu_numa_rss)))
+		put_cpu_ptr(numa_rss);
+	else
+		kfree(numa_rss);
 	return addr;
 }
 
@@ -1868,7 +1897,7 @@  static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
 		return -EBUSY;
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+	inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page), page_to_nid(page));
 	page_add_file_rmap(page, vma, false);
 	set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
 	return 0;
@@ -3164,11 +3193,14 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		if (old_page) {
 			if (!PageAnon(old_page)) {
 				dec_mm_counter_fast(mm,
-						mm_counter_file(old_page));
-				inc_mm_counter_fast(mm, MM_ANONPAGES);
+						mm_counter_file(old_page), page_to_nid(old_page));
+				inc_mm_counter_fast(mm, MM_ANONPAGES, page_to_nid(new_page));
+			} else {
+				dec_mm_counter_fast(mm, MM_ANONPAGES, page_to_nid(old_page));
+				inc_mm_counter_fast(mm, MM_ANONPAGES, page_to_nid(new_page));
 			}
 		} else {
-			inc_mm_counter_fast(mm, MM_ANONPAGES);
+			inc_mm_counter_fast(mm, MM_ANONPAGES, page_to_nid(new_page));
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
@@ -3955,8 +3987,8 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (should_try_to_free_swap(page, vma, vmf->flags))
 		try_to_free_swap(page);
 
-	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
+	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES, page_to_nid(page));
+	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS, NUMA_NO_NODE);
 	pte = mk_pte(page, vma->vm_page_prot);
 
 	/*
@@ -4134,7 +4166,7 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		return handle_userfault(vmf, VM_UFFD_MISSING);
 	}
 
-	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES, page_to_nid(page));
 	page_add_new_anon_rmap(page, vma, vmf->address);
 	lru_cache_add_inactive_or_unevictable(page, vma);
 setpte:
@@ -4275,7 +4307,7 @@  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 	if (write)
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
-	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR, page_to_nid(page));
 	page_add_file_rmap(page, vma, true);
 
 	/*
@@ -4324,11 +4356,11 @@  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 		entry = pte_mkuffd_wp(pte_wrprotect(entry));
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES, page_to_nid(page));
 		page_add_new_anon_rmap(page, vma, addr);
 		lru_cache_add_inactive_or_unevictable(page, vma);
 	} else {
-		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page), page_to_nid(page));
 		page_add_file_rmap(page, vma, false);
 	}
 	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
diff --git a/mm/migrate.c b/mm/migrate.c
index e01624fcab5b..1d7aac928e7e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -258,6 +258,10 @@  static bool remove_migration_pte(struct folio *folio,
 
 		/* No need to invalidate - it was non-present before */
 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
+		add_mm_counter(vma->vm_mm, mm_counter(&folio->page),
+				-compound_nr(old), page_to_nid(old));
+		add_mm_counter(vma->vm_mm, mm_counter(&folio->page),
+				compound_nr(&folio->page), page_to_nid(&folio->page));
 	}
 
 	return true;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index ad593d5754cf..e17c5fbc3d2a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -631,7 +631,7 @@  static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	if (userfaultfd_missing(vma))
 		goto unlock_abort;
 
-	inc_mm_counter(mm, MM_ANONPAGES);
+	inc_mm_counter(mm, MM_ANONPAGES, page_to_nid(page));
 	page_add_new_anon_rmap(page, vma, addr);
 	if (!is_zone_device_page(page))
 		lru_cache_add_inactive_or_unevictable(page, vma);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 35ec75cdfee2..e25c37e2e90d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -227,7 +227,7 @@  long oom_badness(struct task_struct *p, unsigned long totalpages)
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
 		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
 	task_unlock(p);
 
@@ -403,7 +403,7 @@  static int dump_task(struct task_struct *p, void *arg)
 		task->pid, from_kuid(&init_user_ns, task_uid(task)),
 		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 		mm_pgtables_bytes(task->mm),
-		get_mm_counter(task->mm, MM_SWAPENTS),
+		get_mm_counter(task->mm, MM_SWAPENTS, NUMA_NO_NODE),
 		task->signal->oom_score_adj, task->comm);
 	task_unlock(task);
 
@@ -594,9 +594,9 @@  static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
 			task_pid_nr(tsk), tsk->comm,
-			K(get_mm_counter(mm, MM_ANONPAGES)),
-			K(get_mm_counter(mm, MM_FILEPAGES)),
-			K(get_mm_counter(mm, MM_SHMEMPAGES)));
+			K(get_mm_counter(mm, MM_ANONPAGES, NUMA_NO_NODE)),
+			K(get_mm_counter(mm, MM_FILEPAGES, NUMA_NO_NODE)),
+			K(get_mm_counter(mm, MM_SHMEMPAGES, NUMA_NO_NODE)));
 out_finish:
 	trace_finish_task_reaping(tsk->pid);
 out_unlock:
@@ -948,9 +948,9 @@  static void __oom_kill_process(struct task_struct *victim, const char *message)
 	mark_oom_victim(victim);
 	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
 		message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
-		K(get_mm_counter(mm, MM_ANONPAGES)),
-		K(get_mm_counter(mm, MM_FILEPAGES)),
-		K(get_mm_counter(mm, MM_SHMEMPAGES)),
+		K(get_mm_counter(mm, MM_ANONPAGES, NUMA_NO_NODE)),
+		K(get_mm_counter(mm, MM_FILEPAGES, NUMA_NO_NODE)),
+		K(get_mm_counter(mm, MM_SHMEMPAGES, NUMA_NO_NODE)),
 		from_kuid(&init_user_ns, task_uid(victim)),
 		mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
 	task_unlock(victim);
diff --git a/mm/rmap.c b/mm/rmap.c
index edc06c52bc82..a6e8bb3d40cc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1620,7 +1620,8 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				hugetlb_count_sub(folio_nr_pages(folio), mm);
 				set_huge_pte_at(mm, address, pvmw.pte, pteval);
 			} else {
-				dec_mm_counter(mm, mm_counter(&folio->page));
+				dec_mm_counter(mm, mm_counter(&folio->page),
+						page_to_nid(&folio->page));
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
@@ -1635,7 +1636,7 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * migration) will not expect userfaults on already
 			 * copied pages.
 			 */
-			dec_mm_counter(mm, mm_counter(&folio->page));
+			dec_mm_counter(mm, mm_counter(&folio->page), page_to_nid(&folio->page));
 			/* We have to invalidate as we cleared the pte */
 			mmu_notifier_invalidate_range(mm, address,
 						      address + PAGE_SIZE);
@@ -1686,7 +1687,7 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					/* Invalidate as we cleared the pte */
 					mmu_notifier_invalidate_range(mm,
 						address, address + PAGE_SIZE);
-					dec_mm_counter(mm, MM_ANONPAGES);
+					dec_mm_counter(mm, MM_ANONPAGES, page_to_nid(&folio->page));
 					goto discard;
 				}
 
@@ -1739,8 +1740,8 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					list_add(&mm->mmlist, &init_mm.mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			dec_mm_counter(mm, MM_ANONPAGES);
-			inc_mm_counter(mm, MM_SWAPENTS);
+			dec_mm_counter(mm, MM_ANONPAGES, page_to_nid(&folio->page));
+			inc_mm_counter(mm, MM_SWAPENTS, NUMA_NO_NODE);
 			swp_pte = swp_entry_to_pte(entry);
 			if (anon_exclusive)
 				swp_pte = pte_swp_mkexclusive(swp_pte);
@@ -1764,7 +1765,8 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 *
 			 * See Documentation/mm/mmu_notifier.rst
 			 */
-			dec_mm_counter(mm, mm_counter_file(&folio->page));
+			dec_mm_counter(mm, mm_counter_file(&folio->page),
+					page_to_nid(&folio->page));
 		}
 discard:
 		/*
@@ -2011,7 +2013,8 @@  static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 				hugetlb_count_sub(folio_nr_pages(folio), mm);
 				set_huge_pte_at(mm, address, pvmw.pte, pteval);
 			} else {
-				dec_mm_counter(mm, mm_counter(&folio->page));
+				dec_mm_counter(mm, mm_counter(&folio->page),
+						page_to_nid(&folio->page));
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
@@ -2026,7 +2029,7 @@  static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			 * migration) will not expect userfaults on already
 			 * copied pages.
 			 */
-			dec_mm_counter(mm, mm_counter(&folio->page));
+			dec_mm_counter(mm, mm_counter(&folio->page), page_to_nid(&folio->page));
 			/* We have to invalidate as we cleared the pte */
 			mmu_notifier_invalidate_range(mm, address,
 						      address + PAGE_SIZE);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5c8681a3f1d9..c0485bb54954 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1791,7 +1791,7 @@  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	if (unlikely(!PageUptodate(page))) {
 		pte_t pteval;
 
-		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+		dec_mm_counter(vma->vm_mm, MM_SWAPENTS, NUMA_NO_NODE);
 		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
 		set_pte_at(vma->vm_mm, addr, pte, pteval);
 		swap_free(entry);
@@ -1803,8 +1803,8 @@  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
 	BUG_ON(PageAnon(page) && PageAnonExclusive(page));
 
-	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
-	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+	dec_mm_counter(vma->vm_mm, MM_SWAPENTS, NUMA_NO_NODE);
+	inc_mm_counter(vma->vm_mm, MM_ANONPAGES, page_to_nid(page));
 	get_page(page);
 	if (page == swapcache) {
 		rmap_t rmap_flags = RMAP_NONE;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 07d3befc80e4..b6581867aad6 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -127,7 +127,7 @@  int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 	 * Must happen after rmap, as mm_counter() checks mapping (via
 	 * PageAnon()), which is set by __page_set_anon_rmap().
 	 */
-	inc_mm_counter(dst_mm, mm_counter(page));
+	inc_mm_counter(dst_mm, mm_counter(page), page_to_nid(page));
 
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);