diff mbox series

[1/1] sched/numa: Fix memory leak due to the overwritten vma->numab_state

Message ID 20241108133139.25326-1-ahuang12@lenovo.com (mailing list archive)
State New
Headers show
Series [1/1] sched/numa: Fix memory leak due to the overwritten vma->numab_state | expand

Commit Message

Adrian Huang Nov. 8, 2024, 1:31 p.m. UTC
From: Adrian Huang <ahuang12@lenovo.com>

[Problem Description]
When running the hackbench program of LTP, the following memory leak is
reported by kmemleak.

  # /opt/ltp/testcases/bin/hackbench 20 thread 1000
  Running with 20*40 (== 800) tasks.

  # dmesg | grep kmemleak
  ...
  kmemleak: 480 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
  kmemleak: 665 new suspected memory leaks (see /sys/kernel/debug/kmemleak)

  # cat /sys/kernel/debug/kmemleak
  unreferenced object 0xffff888cd8ca2c40 (size 64):
    comm "hackbench", pid 17142, jiffies 4299780315
    hex dump (first 32 bytes):
      ac 74 49 00 01 00 00 00 4c 84 49 00 01 00 00 00  .tI.....L.I.....
      00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    backtrace (crc bff18fd4):
      [<ffffffff81419a89>] __kmalloc_cache_noprof+0x2f9/0x3f0
      [<ffffffff8113f715>] task_numa_work+0x725/0xa00
      [<ffffffff8110f878>] task_work_run+0x58/0x90
      [<ffffffff81ddd9f8>] syscall_exit_to_user_mode+0x1c8/0x1e0
      [<ffffffff81dd78d5>] do_syscall_64+0x85/0x150
      [<ffffffff81e0012b>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
  ...

  This issue can be consistently reproduced on three different servers:
    * a 448-core server
    * a 256-core server
    * a 192-core server

[Root Cause]
Since multiple threads are created by the hackbench program (along with
the command argument 'thread'), a shared vma might be accessed by two or
more cores simultaneously. When two or more cores observe that
vma->numab_state is NULL at the same time, vma->numab_state will be
overwritten.

Note that the command `/opt/ltp/testcases/bin/hackbench 50 process 1000`
cannot reproduce the issue because of fork() and COW. This was
verified with 200+ test runs.

[Solution]
Introduce a lock to ensure that only one thread initializes
vma->numab_state, making the check-and-allocate sequence atomic.

Fixes: ef6a22b70f6d ("sched/numa: apply the scan delay to every new vma")
Reported-by: Jiwei Sun <sunjw10@lenovo.com>
Signed-off-by: Adrian Huang <ahuang12@lenovo.com>
---
 include/linux/mm.h       |  1 +
 include/linux/mm_types.h |  1 +
 kernel/sched/fair.c      | 17 ++++++++++++++++-
 3 files changed, 18 insertions(+), 1 deletion(-)

Comments

Vlastimil Babka Nov. 8, 2024, 4 p.m. UTC | #1
On 11/8/24 14:31, Adrian Huang wrote:
> From: Adrian Huang <ahuang12@lenovo.com>
> 
> [Problem Description]
> When running the hackbench program of LTP, the following memory leak is
> reported by kmemleak.
> 
>   # /opt/ltp/testcases/bin/hackbench 20 thread 1000
>   Running with 20*40 (== 800) tasks.
> 
>   # dmesg | grep kmemleak
>   ...
>   kmemleak: 480 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>   kmemleak: 665 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> 
>   # cat /sys/kernel/debug/kmemleak
>   unreferenced object 0xffff888cd8ca2c40 (size 64):
>     comm "hackbench", pid 17142, jiffies 4299780315
>     hex dump (first 32 bytes):
>       ac 74 49 00 01 00 00 00 4c 84 49 00 01 00 00 00  .tI.....L.I.....
>       00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
>     backtrace (crc bff18fd4):
>       [<ffffffff81419a89>] __kmalloc_cache_noprof+0x2f9/0x3f0
>       [<ffffffff8113f715>] task_numa_work+0x725/0xa00
>       [<ffffffff8110f878>] task_work_run+0x58/0x90
>       [<ffffffff81ddd9f8>] syscall_exit_to_user_mode+0x1c8/0x1e0
>       [<ffffffff81dd78d5>] do_syscall_64+0x85/0x150
>       [<ffffffff81e0012b>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
>   ...
> 
>   This issue can be consistently reproduced on three different servers:
>     * a 448-core server
>     * a 256-core server
>     * a 192-core server
> 
> [Root Cause]
> Since multiple threads are created by the hackbench program (along with
> the command argument 'thread'), a shared vma might be accessed by two or
> more cores simultaneously. When two or more cores observe that
> vma->numab_state is NULL at the same time, vma->numab_state will be
> overwritten.
> 
> Note that the command `/opt/ltp/testcases/bin/hackbench 50 process 1000`
> cannot the reproduce the issue because of the fork() and COW. It is
> verified with 200+ test runs.
> 
> [Solution]
> Introduce a lock to make sure the atomic operation of the vma->numab_state
> access.
> 
> Fixes: ef6a22b70f6d ("sched/numa: apply the scan delay to every new vma")
> Reported-by: Jiwei Sun <sunjw10@lenovo.com>
> Signed-off-by: Adrian Huang <ahuang12@lenovo.com>

Could this be achieved without the new lock, by a cmpxchg attempt to install
 vma->numab_state that will free the allocated vma_numab_state if it fails?

Thanks,
Vlastimil

> ---
>  include/linux/mm.h       |  1 +
>  include/linux/mm_types.h |  1 +
>  kernel/sched/fair.c      | 17 ++++++++++++++++-
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 61fff5d34ed5..a08e31ac53de 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -673,6 +673,7 @@ struct vm_operations_struct {
>  static inline void vma_numab_state_init(struct vm_area_struct *vma)
>  {
>  	vma->numab_state = NULL;
> +	mutex_init(&vma->numab_state_lock);
>  }
>  static inline void vma_numab_state_free(struct vm_area_struct *vma)
>  {
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 6e3bdf8e38bc..77eee89a89f5 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -768,6 +768,7 @@ struct vm_area_struct {
>  #endif
>  #ifdef CONFIG_NUMA_BALANCING
>  	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
> +	struct mutex numab_state_lock;		/* NUMA Balancing state lock */
>  #endif
>  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
>  } __randomize_layout;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c157d4860a3b..53e6383cd94e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3397,12 +3397,24 @@ static void task_numa_work(struct callback_head *work)
>  			continue;
>  		}
>  
> +		/*
> +		 * In case of the shared vma, the vma->numab_state will be
> +		 * overwritten if two or more cores observe vma->numab_state
> +		 * is NULL at the same time. Make sure that only one core
> +		 * allocates memory for vma->numab_state. This can prevent
> +		 * the memory leak.
> +		 */
> +		if (!mutex_trylock(&vma->numab_state_lock))
> +			continue;
> +
>  		/* Initialise new per-VMA NUMAB state. */
>  		if (!vma->numab_state) {
>  			vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
>  				GFP_KERNEL);
> -			if (!vma->numab_state)
> +			if (!vma->numab_state) {
> +				mutex_unlock(&vma->numab_state_lock);
>  				continue;
> +			}
>  
>  			vma->numab_state->start_scan_seq = mm->numa_scan_seq;
>  
> @@ -3428,6 +3440,7 @@ static void task_numa_work(struct callback_head *work)
>  		if (mm->numa_scan_seq && time_before(jiffies,
>  						vma->numab_state->next_scan)) {
>  			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
> +			mutex_unlock(&vma->numab_state_lock);
>  			continue;
>  		}
>  
> @@ -3440,6 +3453,8 @@ static void task_numa_work(struct callback_head *work)
>  			vma->numab_state->pids_active[1] = 0;
>  		}
>  
> +		mutex_unlock(&vma->numab_state_lock);
> +
>  		/* Do not rescan VMAs twice within the same sequence. */
>  		if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
>  			mm->numa_scan_offset = vma->vm_end;
kernel test robot Nov. 8, 2024, 8:50 p.m. UTC | #2
Hi Adrian,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on tip/master tip/sched/core peterz-queue/sched/core linus/master v6.12-rc6 next-20241108]
[cannot apply to tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Adrian-Huang/sched-numa-Fix-memory-leak-due-to-the-overwritten-vma-numab_state/20241108-213420
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20241108133139.25326-1-ahuang12%40lenovo.com
patch subject: [PATCH 1/1] sched/numa: Fix memory leak due to the overwritten vma->numab_state
config: s390-allyesconfig (https://download.01.org/0day-ci/archive/20241109/202411090453.EFAEFpnv-lkp@intel.com/config)
compiler: s390-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241109/202411090453.EFAEFpnv-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411090453.EFAEFpnv-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from mm/damon/vaddr.c:736:
   mm/damon/tests/vaddr-kunit.h: In function 'damon_test_three_regions_in_vmas':
>> mm/damon/tests/vaddr-kunit.h:92:1: warning: the frame size of 2168 bytes is larger than 2048 bytes [-Wframe-larger-than=]
      92 | }
         | ^


vim +92 mm/damon/tests/vaddr-kunit.h

17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  38  
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  39  /*
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  40   * Test __damon_va_three_regions() function
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  41   *
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  42   * In case of virtual memory address spaces monitoring, DAMON converts the
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  43   * complex and dynamic memory mappings of each target task to three
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  44   * discontiguous regions which cover every mapped areas.  However, the three
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  45   * regions should not include the two biggest unmapped areas in the original
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  46   * mapping, because the two biggest areas are normally the areas between 1)
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  47   * heap and the mmap()-ed regions, and 2) the mmap()-ed regions and stack.
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  48   * Because these two unmapped areas are very huge but obviously never accessed,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  49   * covering the region is just a waste.
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  50   *
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  51   * '__damon_va_three_regions() receives an address space of a process.  It
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  52   * first identifies the start of mappings, end of mappings, and the two biggest
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  53   * unmapped areas.  After that, based on the information, it constructs the
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  54   * three regions and returns.  For more detail, refer to the comment of
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  55   * 'damon_init_regions_of()' function definition in 'mm/damon.c' file.
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  56   *
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  57   * For example, suppose virtual address ranges of 10-20, 20-25, 200-210,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  58   * 210-220, 300-305, and 307-330 (Other comments represent this mappings in
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  59   * more short form: 10-20-25, 200-210-220, 300-305, 307-330) of a process are
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  60   * mapped.  To cover every mappings, the three regions should start with 10,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  61   * and end with 305.  The process also has three unmapped areas, 25-200,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  62   * 220-300, and 305-307.  Among those, 25-200 and 220-300 are the biggest two
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  63   * unmapped areas, and thus it should be converted to three regions of 10-25,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  64   * 200-220, and 300-330.
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  65   */
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  66  static void damon_test_three_regions_in_vmas(struct kunit *test)
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  67  {
d0cf3dd47f0d5d mm/damon/vaddr-test.h        Liam R. Howlett 2022-09-06  68  	static struct mm_struct mm;
ba7196e566516f mm/damon/tests/vaddr-kunit.h Leo Stone       2024-09-22  69  	struct damon_addr_range regions[3] = {0};
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  70  	/* 10-20-25, 200-210-220, 300-305, 307-330 */
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  71  	struct vm_area_struct vmas[] = {
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  72  		(struct vm_area_struct) {.vm_start = 10, .vm_end = 20},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  73  		(struct vm_area_struct) {.vm_start = 20, .vm_end = 25},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  74  		(struct vm_area_struct) {.vm_start = 200, .vm_end = 210},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  75  		(struct vm_area_struct) {.vm_start = 210, .vm_end = 220},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  76  		(struct vm_area_struct) {.vm_start = 300, .vm_end = 305},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  77  		(struct vm_area_struct) {.vm_start = 307, .vm_end = 330},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  78  	};
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  79  
f0679f9e6d88ae mm/damon/tests/vaddr-kunit.h SeongJae Park   2024-09-04  80  	mt_init_flags(&mm.mm_mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU);
34403fa579514a mm/damon/vaddr-test.h        Liam R. Howlett 2023-01-20  81  	if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)))
34403fa579514a mm/damon/vaddr-test.h        Liam R. Howlett 2023-01-20  82  		kunit_skip(test, "Failed to create VMA tree");
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  83  
d0cf3dd47f0d5d mm/damon/vaddr-test.h        Liam R. Howlett 2022-09-06  84  	__damon_va_three_regions(&mm, regions);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  85  
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  86  	KUNIT_EXPECT_EQ(test, 10ul, regions[0].start);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  87  	KUNIT_EXPECT_EQ(test, 25ul, regions[0].end);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  88  	KUNIT_EXPECT_EQ(test, 200ul, regions[1].start);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  89  	KUNIT_EXPECT_EQ(test, 220ul, regions[1].end);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  90  	KUNIT_EXPECT_EQ(test, 300ul, regions[2].start);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  91  	KUNIT_EXPECT_EQ(test, 330ul, regions[2].end);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07 @92  }
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  93
kernel test robot Nov. 8, 2024, 10:12 p.m. UTC | #3
Hi Adrian,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on tip/master tip/sched/core peterz-queue/sched/core linus/master v6.12-rc6 next-20241108]
[cannot apply to tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Adrian-Huang/sched-numa-Fix-memory-leak-due-to-the-overwritten-vma-numab_state/20241108-213420
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20241108133139.25326-1-ahuang12%40lenovo.com
patch subject: [PATCH 1/1] sched/numa: Fix memory leak due to the overwritten vma->numab_state
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20241109/202411090614.hOVFwh3l-lkp@intel.com/config)
compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241109/202411090614.hOVFwh3l-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411090614.hOVFwh3l-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from mm/damon/vaddr.c:10:
   In file included from include/linux/highmem.h:8:
   In file included from include/linux/cacheflush.h:5:
   In file included from arch/x86/include/asm/cacheflush.h:5:
   In file included from include/linux/mm.h:2212:
   include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     504 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     505 |                            item];
         |                            ~~~~
   include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     511 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     512 |                            NR_VM_NUMA_EVENT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~~
   include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     518 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   include/linux/vmstat.h:524:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     524 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     525 |                            NR_VM_NUMA_EVENT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~~
   In file included from mm/damon/vaddr.c:736:
>> mm/damon/tests/vaddr-kunit.h:66:13: warning: stack frame size (2248) exceeds limit (2048) in 'damon_test_three_regions_in_vmas' [-Wframe-larger-than]
      66 | static void damon_test_three_regions_in_vmas(struct kunit *test)
         |             ^
   5 warnings generated.


vim +/damon_test_three_regions_in_vmas +66 mm/damon/tests/vaddr-kunit.h

17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  38  
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  39  /*
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  40   * Test __damon_va_three_regions() function
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  41   *
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  42   * In case of virtual memory address spaces monitoring, DAMON converts the
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  43   * complex and dynamic memory mappings of each target task to three
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  44   * discontiguous regions which cover every mapped areas.  However, the three
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  45   * regions should not include the two biggest unmapped areas in the original
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  46   * mapping, because the two biggest areas are normally the areas between 1)
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  47   * heap and the mmap()-ed regions, and 2) the mmap()-ed regions and stack.
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  48   * Because these two unmapped areas are very huge but obviously never accessed,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  49   * covering the region is just a waste.
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  50   *
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  51   * '__damon_va_three_regions() receives an address space of a process.  It
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  52   * first identifies the start of mappings, end of mappings, and the two biggest
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  53   * unmapped areas.  After that, based on the information, it constructs the
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  54   * three regions and returns.  For more detail, refer to the comment of
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  55   * 'damon_init_regions_of()' function definition in 'mm/damon.c' file.
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  56   *
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  57   * For example, suppose virtual address ranges of 10-20, 20-25, 200-210,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  58   * 210-220, 300-305, and 307-330 (Other comments represent this mappings in
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  59   * more short form: 10-20-25, 200-210-220, 300-305, 307-330) of a process are
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  60   * mapped.  To cover every mappings, the three regions should start with 10,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  61   * and end with 305.  The process also has three unmapped areas, 25-200,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  62   * 220-300, and 305-307.  Among those, 25-200 and 220-300 are the biggest two
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  63   * unmapped areas, and thus it should be converted to three regions of 10-25,
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  64   * 200-220, and 300-330.
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  65   */
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07 @66  static void damon_test_three_regions_in_vmas(struct kunit *test)
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  67  {
d0cf3dd47f0d5d mm/damon/vaddr-test.h        Liam R. Howlett 2022-09-06  68  	static struct mm_struct mm;
ba7196e566516f mm/damon/tests/vaddr-kunit.h Leo Stone       2024-09-22  69  	struct damon_addr_range regions[3] = {0};
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  70  	/* 10-20-25, 200-210-220, 300-305, 307-330 */
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  71  	struct vm_area_struct vmas[] = {
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  72  		(struct vm_area_struct) {.vm_start = 10, .vm_end = 20},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  73  		(struct vm_area_struct) {.vm_start = 20, .vm_end = 25},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  74  		(struct vm_area_struct) {.vm_start = 200, .vm_end = 210},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  75  		(struct vm_area_struct) {.vm_start = 210, .vm_end = 220},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  76  		(struct vm_area_struct) {.vm_start = 300, .vm_end = 305},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  77  		(struct vm_area_struct) {.vm_start = 307, .vm_end = 330},
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  78  	};
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  79  
f0679f9e6d88ae mm/damon/tests/vaddr-kunit.h SeongJae Park   2024-09-04  80  	mt_init_flags(&mm.mm_mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU);
34403fa579514a mm/damon/vaddr-test.h        Liam R. Howlett 2023-01-20  81  	if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)))
34403fa579514a mm/damon/vaddr-test.h        Liam R. Howlett 2023-01-20  82  		kunit_skip(test, "Failed to create VMA tree");
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  83  
d0cf3dd47f0d5d mm/damon/vaddr-test.h        Liam R. Howlett 2022-09-06  84  	__damon_va_three_regions(&mm, regions);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  85  
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  86  	KUNIT_EXPECT_EQ(test, 10ul, regions[0].start);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  87  	KUNIT_EXPECT_EQ(test, 25ul, regions[0].end);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  88  	KUNIT_EXPECT_EQ(test, 200ul, regions[1].start);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  89  	KUNIT_EXPECT_EQ(test, 220ul, regions[1].end);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  90  	KUNIT_EXPECT_EQ(test, 300ul, regions[2].start);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  91  	KUNIT_EXPECT_EQ(test, 330ul, regions[2].end);
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  92  }
17ccae8bb5c928 mm/damon/vaddr-test.h        SeongJae Park   2021-09-07  93
Raghavendra K T Nov. 9, 2024, 4:03 a.m. UTC | #4
On 11/8/2024 7:01 PM, Adrian Huang wrote:
> From: Adrian Huang <ahuang12@lenovo.com>
> 
> [Problem Description]
> When running the hackbench program of LTP, the following memory leak is
> reported by kmemleak.
> 
>    # /opt/ltp/testcases/bin/hackbench 20 thread 1000
>    Running with 20*40 (== 800) tasks.
> 
>    # dmesg | grep kmemleak
>    ...
>    kmemleak: 480 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>    kmemleak: 665 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> 
>    # cat /sys/kernel/debug/kmemleak
>    unreferenced object 0xffff888cd8ca2c40 (size 64):
>      comm "hackbench", pid 17142, jiffies 4299780315
>      hex dump (first 32 bytes):
>        ac 74 49 00 01 00 00 00 4c 84 49 00 01 00 00 00  .tI.....L.I.....
>        00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
>      backtrace (crc bff18fd4):
>        [<ffffffff81419a89>] __kmalloc_cache_noprof+0x2f9/0x3f0
>        [<ffffffff8113f715>] task_numa_work+0x725/0xa00
>        [<ffffffff8110f878>] task_work_run+0x58/0x90
>        [<ffffffff81ddd9f8>] syscall_exit_to_user_mode+0x1c8/0x1e0
>        [<ffffffff81dd78d5>] do_syscall_64+0x85/0x150
>        [<ffffffff81e0012b>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
>    ...
> 
>    This issue can be consistently reproduced on three different servers:
>      * a 448-core server
>      * a 256-core server
>      * a 192-core server
> 
> [Root Cause]
> Since multiple threads are created by the hackbench program (along with
> the command argument 'thread'), a shared vma might be accessed by two or
> more cores simultaneously. When two or more cores observe that
> vma->numab_state is NULL at the same time, vma->numab_state will be
> overwritten.
> 

Thanks for reporting.

IIRC, This is not the entire scenario. Chunk above the vma->numab code
ideally ensures, only one thread descend down to scan the VMA's in a
single 'numa_scan_period'

     migrate = mm->numa_next_scan;
         if (time_before(now, migrate))
                 return;
         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
         if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
                 return;

However since there are 800 threads, I see there might be an opportunity
for another thread to enter in the next 'numa_scan_period' while
we have not gotten till numab_state allocation.

There should be simpler ways to overcome like Vlastimil already pointed
in the other thread, and having lock is an overkill.

for e.g.,
numab_state = kzalloc(..)

if we see that some other thread able to successfully assign
vma->numab_state with their allocation (with cmpxchg), simply
free your allocation.

Can you please check if my understanding is correct?

Thanks
- Raghu

[...]
Adrian Huang Nov. 11, 2024, 10:08 a.m. UTC | #5
<snip>
>However since there are 800 threads, I see there might be an opportunity
>for another thread to enter in the next 'numa_scan_period' while
>we have not gotten till numab_state allocation.
>
>There should be simpler ways to overcome like Vlastimil already pointed
>in the other thread, and having lock is an overkill.
>
>for e.g.,
>numab_state = kzalloc(..)
>
>if we see that some other thread able to successfully assign
>vma->numab_state with their allocation (with cmpxchg), simply
>free your allocation.
>
>Can you please check if my understanding is correct?

Thanks for Vlastimil's and Raghu's reviews and comments.

Yes, your understanding is correct. Before submitting this patch,
I had two internal proposals: lock and cmpxchg. Here is the my cmpxchg
version (Test passed). If you're ok with this cmpxchg version, may I have
your reviewed-by? I'll send a v2 then.

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3356315d7e64..7f99df294583 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3399,10 +3399,16 @@ static void task_numa_work(struct callback_head *work)
 
 		/* Initialise new per-VMA NUMAB state. */
 		if (!vma->numab_state) {
-			vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
-				GFP_KERNEL);
-			if (!vma->numab_state)
+			struct vma_numab_state *ptr;
+
+			ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+			if (!ptr)
+				continue;
+
+			if (cmpxchg(&vma->numab_state, NULL, ptr)) {
+				kfree(ptr);
 				continue;
+			}
 
 			vma->numab_state->start_scan_seq = mm->numa_scan_seq;
Raghavendra K T Nov. 11, 2024, 10:31 a.m. UTC | #6
On 11/11/2024 3:38 PM, Adrian Huang wrote:
> <snip>
>> However since there are 800 threads, I see there might be an opportunity
>> for another thread to enter in the next 'numa_scan_period' while
>> we have not gotten till numab_state allocation.
>>
>> There should be simpler ways to overcome like Vlastimil already pointed
>> in the other thread, and having lock is an overkill.
>>
>> for e.g.,
>> numab_state = kzalloc(..)
>>
>> if we see that some other thread able to successfully assign
>> vma->numab_state with their allocation (with cmpxchg), simply
>> free your allocation.
>>
>> Can you please check if my understanding is correct?
> 
> Thanks for Vlastimil's and Raghu's reviews and comments.
> 
> Yes, your understanding is correct. Before submitting this patch,
> I had two internal proposals: lock and cmpxchg. Here is the my cmpxchg
> version (Test passed). If you're ok with this cmpxchg version, may I have
> your reviewed-by? I'll send a v2 then.
> 
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3356315d7e64..7f99df294583 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3399,10 +3399,16 @@ static void task_numa_work(struct callback_head *work)
>   
>   		/* Initialise new per-VMA NUMAB state. */
>   		if (!vma->numab_state) {
> -			vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
> -				GFP_KERNEL);
> -			if (!vma->numab_state)
> +			struct vma_numab_state *ptr;
> +
> +			ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
> +			if (!ptr)
> +				continue;
> +
> +			if (cmpxchg(&vma->numab_state, NULL, ptr)) {
> +				kfree(ptr);
>   				continue;
> +			}
>   
>   			vma->numab_state->start_scan_seq = mm->numa_scan_seq;
> 

LGTM. Sure feel free to add

Reviewed-by: Raghavendra K T <raghavendra.kt@amd.com>

Since the allocation for numab_state happens only once, I hope there is
not much impact on performance either.

Thanks and Regards
- Raghu
Vlastimil Babka Nov. 13, 2024, 6:41 p.m. UTC | #7
On 11/11/24 11:08, Adrian Huang wrote:
> <snip>
>>However since there are 800 threads, I see there might be an opportunity
>>for another thread to enter in the next 'numa_scan_period' while
>>we have not gotten till numab_state allocation.
>>
>>There should be simpler ways to overcome like Vlastimil already pointed
>>in the other thread, and having lock is an overkill.
>>
>>for e.g.,
>>numab_state = kzalloc(..)
>>
>>if we see that some other thread able to successfully assign
>>vma->numab_state with their allocation (with cmpxchg), simply
>>free your allocation.
>>
>>Can you please check if my understanding is correct?
> 
> Thanks for Vlastimil's and Raghu's reviews and comments.
> 
> Yes, your understanding is correct. Before submitting this patch,
> I had two internal proposals: lock and cmpxchg. Here is the my cmpxchg
> version (Test passed). If you're ok with this cmpxchg version, may I have
> your reviewed-by? I'll send a v2 then.

Yeah much better, thanks!

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3356315d7e64..7f99df294583 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3399,10 +3399,16 @@ static void task_numa_work(struct callback_head *work)
>  
>  		/* Initialise new per-VMA NUMAB state. */
>  		if (!vma->numab_state) {
> -			vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
> -				GFP_KERNEL);
> -			if (!vma->numab_state)
> +			struct vma_numab_state *ptr;
> +
> +			ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
> +			if (!ptr)
> +				continue;
> +
> +			if (cmpxchg(&vma->numab_state, NULL, ptr)) {
> +				kfree(ptr);
>  				continue;
> +			}
>  
>  			vma->numab_state->start_scan_seq = mm->numa_scan_seq;
> 
>
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 61fff5d34ed5..a08e31ac53de 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -673,6 +673,7 @@  struct vm_operations_struct {
 static inline void vma_numab_state_init(struct vm_area_struct *vma)
 {
 	vma->numab_state = NULL;
+	mutex_init(&vma->numab_state_lock);
 }
 static inline void vma_numab_state_free(struct vm_area_struct *vma)
 {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6e3bdf8e38bc..77eee89a89f5 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -768,6 +768,7 @@  struct vm_area_struct {
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
+	struct mutex numab_state_lock;		/* NUMA Balancing state lock */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 } __randomize_layout;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c157d4860a3b..53e6383cd94e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3397,12 +3397,24 @@  static void task_numa_work(struct callback_head *work)
 			continue;
 		}
 
+		/*
+		 * In case of the shared vma, the vma->numab_state will be
+		 * overwritten if two or more cores observe vma->numab_state
+		 * is NULL at the same time. Make sure that only one core
+		 * allocates memory for vma->numab_state. This can prevent
+		 * the memory leak.
+		 */
+		if (!mutex_trylock(&vma->numab_state_lock))
+			continue;
+
 		/* Initialise new per-VMA NUMAB state. */
 		if (!vma->numab_state) {
 			vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
 				GFP_KERNEL);
-			if (!vma->numab_state)
+			if (!vma->numab_state) {
+				mutex_unlock(&vma->numab_state_lock);
 				continue;
+			}
 
 			vma->numab_state->start_scan_seq = mm->numa_scan_seq;
 
@@ -3428,6 +3440,7 @@  static void task_numa_work(struct callback_head *work)
 		if (mm->numa_scan_seq && time_before(jiffies,
 						vma->numab_state->next_scan)) {
 			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
+			mutex_unlock(&vma->numab_state_lock);
 			continue;
 		}
 
@@ -3440,6 +3453,8 @@  static void task_numa_work(struct callback_head *work)
 			vma->numab_state->pids_active[1] = 0;
 		}
 
+		mutex_unlock(&vma->numab_state_lock);
+
 		/* Do not rescan VMAs twice within the same sequence. */
 		if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
 			mm->numa_scan_offset = vma->vm_end;