diff mbox series

[v4,04/13] mm/khugepaged: make hugepage allocation context-specific

Message ID 20220502181714.3483177-5-zokeefe@google.com (mailing list archive)
State New
Headers show
Series mm: userspace hugepage collapse | expand

Commit Message

Zach O'Keefe May 2, 2022, 6:17 p.m. UTC
Add a hook to struct collapse_control that allows contexts to define
their own allocation semantics and charging logic.  For example,
khugepaged has specific NUMA and UMA implementations as well as gfp
flags tied to /sys/kernel/mm/transparent_hugepage/khugepaged/defrag.

Additionally, move the [pre]allocated hugepage pointer into
struct collapse_control.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 85 ++++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 43 deletions(-)

Comments

kernel test robot May 3, 2022, 3:38 a.m. UTC | #1
Hi Zach,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on next-20220502]
[cannot apply to hnaz-mm/master rostedt-trace/for-next deller-parisc/for-next arnd-asm-generic/master linus/master v5.18-rc5 v5.18-rc4 v5.18-rc3 v5.18-rc5]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Zach-O-Keefe/mm-khugepaged-record-SCAN_PMD_MAPPED-when-scan_pmd-finds-THP/20220503-031727
base:    9f9b9a2972eb8dcaad09d826c5c6d7488eaca3e6
config: x86_64-randconfig-a011-20220502 (https://download.01.org/0day-ci/archive/20220503/202205031117.A41ue1TS-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 09325d36061e42b495d1f4c7e933e260eac260ed)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/0d006aeaf99be94a0dcb727cb6540195f13fd9c3
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Zach-O-Keefe/mm-khugepaged-record-SCAN_PMD_MAPPED-when-scan_pmd-finds-THP/20220503-031727
        git checkout 0d006aeaf99be94a0dcb727cb6540195f13fd9c3
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> mm/khugepaged.c:1091:29: warning: incompatible integer to pointer conversion passing 'gfp_t' (aka 'unsigned int') to parameter of type 'struct page **' [-Wint-conversion]
           if (!khugepaged_alloc_page(gfp, node, cc))
                                      ^~~
   mm/khugepaged.c:949:49: note: passing argument to parameter 'hpage' here
   static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
                                                   ^
>> mm/khugepaged.c:1091:40: warning: incompatible pointer to integer conversion passing 'struct collapse_control *' to parameter of type 'int' [-Wint-conversion]
           if (!khugepaged_alloc_page(gfp, node, cc))
                                                 ^~
   mm/khugepaged.c:949:71: note: passing argument to parameter 'node' here
   static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
                                                                         ^
   2 warnings generated.


vim +1091 mm/khugepaged.c

  1074	
  1075	static int alloc_charge_hpage(struct mm_struct *mm, struct collapse_control *cc)
  1076	{
  1077	#ifdef CONFIG_NUMA
  1078		const struct cpumask *cpumask;
  1079	#endif
  1080		gfp_t gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
  1081		int node = khugepaged_find_target_node(cc);
  1082	
  1083	#ifdef CONFIG_NUMA
  1084		/* sched to specified node before huge page memory copy */
  1085		if (task_node(current) != node) {
  1086			cpumask = cpumask_of_node(node);
  1087			if (!cpumask_empty(cpumask))
  1088				set_cpus_allowed_ptr(current, cpumask);
  1089		}
  1090	#endif
> 1091		if (!khugepaged_alloc_page(gfp, node, cc))
  1092			return SCAN_ALLOC_HUGE_PAGE_FAIL;
  1093		if (unlikely(mem_cgroup_charge(page_folio(cc->hpage), mm, gfp)))
  1094			return SCAN_CGROUP_CHARGE_FAIL;
  1095		count_memcg_page_event(cc->hpage, THP_COLLAPSE_ALLOC);
  1096		return SCAN_SUCCEED;
  1097	}
  1098
kernel test robot May 3, 2022, 6:30 a.m. UTC | #2
Hi Zach,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on next-20220502]
[cannot apply to hnaz-mm/master rostedt-trace/for-next deller-parisc/for-next arnd-asm-generic/master linus/master v5.18-rc5 v5.18-rc4 v5.18-rc3 v5.18-rc5]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Zach-O-Keefe/mm-khugepaged-record-SCAN_PMD_MAPPED-when-scan_pmd-finds-THP/20220503-031727
base:    9f9b9a2972eb8dcaad09d826c5c6d7488eaca3e6
config: x86_64-randconfig-a013 (https://download.01.org/0day-ci/archive/20220503/202205031435.JEToTgim-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/0d006aeaf99be94a0dcb727cb6540195f13fd9c3
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Zach-O-Keefe/mm-khugepaged-record-SCAN_PMD_MAPPED-when-scan_pmd-finds-THP/20220503-031727
        git checkout 0d006aeaf99be94a0dcb727cb6540195f13fd9c3
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   mm/khugepaged.c: In function 'alloc_charge_hpage':
>> mm/khugepaged.c:1091:36: warning: passing argument 1 of 'khugepaged_alloc_page' makes pointer from integer without a cast [-Wint-conversion]
    1091 |         if (!khugepaged_alloc_page(gfp, node, cc))
         |                                    ^~~
         |                                    |
         |                                    gfp_t {aka unsigned int}
   mm/khugepaged.c:949:49: note: expected 'struct page **' but argument is of type 'gfp_t' {aka 'unsigned int'}
     949 | static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
         |                                   ~~~~~~~~~~~~~~^~~~~
>> mm/khugepaged.c:1091:47: warning: passing argument 3 of 'khugepaged_alloc_page' makes integer from pointer without a cast [-Wint-conversion]
    1091 |         if (!khugepaged_alloc_page(gfp, node, cc))
         |                                               ^~
         |                                               |
         |                                               struct collapse_control *
   mm/khugepaged.c:949:71: note: expected 'int' but argument is of type 'struct collapse_control *'
     949 | static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
         |                                                                   ~~~~^~~~


vim +/khugepaged_alloc_page +1091 mm/khugepaged.c

  1074	
  1075	static int alloc_charge_hpage(struct mm_struct *mm, struct collapse_control *cc)
  1076	{
  1077	#ifdef CONFIG_NUMA
  1078		const struct cpumask *cpumask;
  1079	#endif
  1080		gfp_t gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
  1081		int node = khugepaged_find_target_node(cc);
  1082	
  1083	#ifdef CONFIG_NUMA
  1084		/* sched to specified node before huge page memory copy */
  1085		if (task_node(current) != node) {
  1086			cpumask = cpumask_of_node(node);
  1087			if (!cpumask_empty(cpumask))
  1088				set_cpus_allowed_ptr(current, cpumask);
  1089		}
  1090	#endif
> 1091		if (!khugepaged_alloc_page(gfp, node, cc))
  1092			return SCAN_ALLOC_HUGE_PAGE_FAIL;
  1093		if (unlikely(mem_cgroup_charge(page_folio(cc->hpage), mm, gfp)))
  1094			return SCAN_CGROUP_CHARGE_FAIL;
  1095		count_memcg_page_event(cc->hpage, THP_COLLAPSE_ALLOC);
  1096		return SCAN_SUCCEED;
  1097	}
  1098
kernel test robot May 4, 2022, 2:25 a.m. UTC | #3
Greeting,

FYI, we noticed the following commit (built with gcc-11):

commit: 0d006aeaf99be94a0dcb727cb6540195f13fd9c3 ("[PATCH v4 04/13] mm/khugepaged: make hugepage allocation context-specific")
url: https://github.com/intel-lab-lkp/linux/commits/Zach-O-Keefe/mm-khugepaged-record-SCAN_PMD_MAPPED-when-scan_pmd-finds-THP/20220503-031727
patch link: https://lore.kernel.org/linux-mm/20220502181714.3483177-5-zokeefe@google.com

in testcase: boot

on test machine: qemu-system-i386 -enable-kvm -cpu SandyBridge -smp 2 -m 4G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):



If you fix the issue, kindly add following tag
Reported-by: kernel test robot <oliver.sang@intel.com>


[   18.854835][   T17] BUG: unable to handle page fault for address: 003c24ca
[   18.855169][   T17] #PF: supervisor read access in kernel mode
[   18.855395][   T17] #PF: error_code(0x0000) - not-present page
[   18.855620][   T17] *pde = 00000000
[   18.855763][   T17] Oops: 0000 [#1]
[   18.855903][   T17] CPU: 0 PID: 17 Comm: khugepaged Not tainted 5.18.0-rc5-next-20220502-00004-g0d006aeaf99b #1
[ 18.856283][ T17] EIP: alloc_charge_hpage (mm/khugepaged.c:951 mm/khugepaged.c:1091) 
[ 18.856498][ T17] Code: 00 00 00 55 89 e5 6a 07 e8 86 c0 ff ff c9 31 d2 89 d1 c3 55 89 e5 a1 28 e6 fa 59 25 80 00 00 00 83 f8 01 19 c0 25 00 fc ff ff <8b> 80 ca 24 3c 00 85 c0 74 0f 8b 42 08 5d 8b 40 04 b8 01 00 00 00
All code
========
   0:	00 00                	add    %al,(%rax)
   2:	00 55 89             	add    %dl,-0x77(%rbp)
   5:	e5 6a                	in     $0x6a,%eax
   7:	07                   	(bad)  
   8:	e8 86 c0 ff ff       	callq  0xffffffffffffc093
   d:	c9                   	leaveq 
   e:	31 d2                	xor    %edx,%edx
  10:	89 d1                	mov    %edx,%ecx
  12:	c3                   	retq   
  13:	55                   	push   %rbp
  14:	89 e5                	mov    %esp,%ebp
  16:	a1 28 e6 fa 59 25 80 	movabs 0x802559fae628,%eax
  1d:	00 00 
  1f:	00 83 f8 01 19 c0    	add    %al,-0x3fe6fe08(%rbx)
  25:	25 00 fc ff ff       	and    $0xfffffc00,%eax
  2a:*	8b 80 ca 24 3c 00    	mov    0x3c24ca(%rax),%eax		<-- trapping instruction
  30:	85 c0                	test   %eax,%eax
  32:	74 0f                	je     0x43
  34:	8b 42 08             	mov    0x8(%rdx),%eax
  37:	5d                   	pop    %rbp
  38:	8b 40 04             	mov    0x4(%rax),%eax
  3b:	b8 01 00 00 00       	mov    $0x1,%eax

Code starting with the faulting instruction
===========================================
   0:	8b 80 ca 24 3c 00    	mov    0x3c24ca(%rax),%eax
   6:	85 c0                	test   %eax,%eax
   8:	74 0f                	je     0x19
   a:	8b 42 08             	mov    0x8(%rdx),%eax
   d:	5d                   	pop    %rbp
   e:	8b 40 04             	mov    0x4(%rax),%eax
  11:	b8 01 00 00 00       	mov    $0x1,%eax
[   18.857217][   T17] EAX: 00000000 EBX: 41172400 ECX: 00000000 EDX: 411d1f7c
[   18.857487][   T17] ESI: 411d1f7c EDI: 4117245c EBP: 411d1e64 ESP: 411d1e64
[   18.857750][   T17] DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 EFLAGS: 00010246
[   18.858036][   T17] CR0: 80050033 CR2: 003c24ca CR3: 112f3000 CR4: 000406d0
[   18.858302][   T17] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
[   18.858567][   T17] DR6: fffe0ff0 DR7: 00000400
[   18.858743][   T17] Call Trace:
[ 18.858871][ T17] ? collapse_huge_page (mm/khugepaged.c:1109) 
[ 18.859066][ T17] ? find_held_lock (kernel/locking/lockdep.c:5156) 
[ 18.859245][ T17] ? khugepaged_scan_pmd (mm/khugepaged.c:1400) 
[ 18.859446][ T17] ? khugepaged_scan_mm_slot (mm/khugepaged.c:2216) 
[ 18.859657][ T17] ? khugepaged_do_scan (mm/khugepaged.c:2290) 
[ 18.859854][ T17] ? khugepaged (mm/khugepaged.c:2340) 
[ 18.860016][ T17] ? khugepaged_defrag_show (mm/khugepaged.c:1076) 
[ 18.860219][ T17] ? kthread (kernel/kthread.c:376) 
[ 18.860374][ T17] ? khugepaged_do_scan (mm/khugepaged.c:2328) 
[ 18.860570][ T17] ? kthread_complete_and_exit (kernel/kthread.c:331) 
[ 18.860781][ T17] ? ret_from_fork (arch/x86/entry/entry_32.S:772) 
[   18.860956][   T17] Modules linked in:
[   18.861105][   T17] CR2: 00000000003c24ca
[   18.861262][   T17] ---[ end trace 0000000000000000 ]---
[ 18.861263][ T17] EIP: alloc_charge_hpage (mm/khugepaged.c:951 mm/khugepaged.c:1091) 
[ 18.861266][ T17] Code: 00 00 00 55 89 e5 6a 07 e8 86 c0 ff ff c9 31 d2 89 d1 c3 55 89 e5 a1 28 e6 fa 59 25 80 00 00 00 83 f8 01 19 c0 25 00 fc ff ff <8b> 80 ca 24 3c 00 85 c0 74 0f 8b 42 08 5d 8b 40 04 b8 01 00 00 00
All code
========
   0:	00 00                	add    %al,(%rax)
   2:	00 55 89             	add    %dl,-0x77(%rbp)
   5:	e5 6a                	in     $0x6a,%eax
   7:	07                   	(bad)  
   8:	e8 86 c0 ff ff       	callq  0xffffffffffffc093
   d:	c9                   	leaveq 
   e:	31 d2                	xor    %edx,%edx
  10:	89 d1                	mov    %edx,%ecx
  12:	c3                   	retq   
  13:	55                   	push   %rbp
  14:	89 e5                	mov    %esp,%ebp
  16:	a1 28 e6 fa 59 25 80 	movabs 0x802559fae628,%eax
  1d:	00 00 
  1f:	00 83 f8 01 19 c0    	add    %al,-0x3fe6fe08(%rbx)
  25:	25 00 fc ff ff       	and    $0xfffffc00,%eax
  2a:*	8b 80 ca 24 3c 00    	mov    0x3c24ca(%rax),%eax		<-- trapping instruction
  30:	85 c0                	test   %eax,%eax
  32:	74 0f                	je     0x43
  34:	8b 42 08             	mov    0x8(%rdx),%eax
  37:	5d                   	pop    %rbp
  38:	8b 40 04             	mov    0x4(%rax),%eax
  3b:	b8 01 00 00 00       	mov    $0x1,%eax

Code starting with the faulting instruction
===========================================
   0:	8b 80 ca 24 3c 00    	mov    0x3c24ca(%rax),%eax
   6:	85 c0                	test   %eax,%eax
   8:	74 0f                	je     0x19
   a:	8b 42 08             	mov    0x8(%rdx),%eax
   d:	5d                   	pop    %rbp
   e:	8b 40 04             	mov    0x4(%rax),%eax
  11:	b8 01 00 00 00       	mov    $0x1,%eax


To reproduce:

        # build kernel
	cd linux
	cp config-5.18.0-rc5-next-20220502-00004-g0d006aeaf99b .config
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=i386 olddefconfig prepare modules_prepare bzImage modules
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=i386 INSTALL_MOD_PATH=<mod-install-dir> modules_install
	cd <mod-install-dir>
	find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz


        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email

        # if come across any failure that blocks the test,
        # please remove ~/.lkp and /lkp dir to run from a clean state.
Zach O'Keefe May 4, 2022, 9:45 p.m. UTC | #4
Sorry all about this - I fixed this up in v5

On Mon, May 2, 2022 at 11:31 PM kernel test robot <lkp@intel.com> wrote:
>
> Hi Zach,
>
> Thank you for the patch! Perhaps something to improve:
>
> [auto build test WARNING on next-20220502]
> [cannot apply to hnaz-mm/master rostedt-trace/for-next deller-parisc/for-next arnd-asm-generic/master linus/master v5.18-rc5 v5.18-rc4 v5.18-rc3 v5.18-rc5]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Zach-O-Keefe/mm-khugepaged-record-SCAN_PMD_MAPPED-when-scan_pmd-finds-THP/20220503-031727
> base:    9f9b9a2972eb8dcaad09d826c5c6d7488eaca3e6
> config: x86_64-randconfig-a013 (https://download.01.org/0day-ci/archive/20220503/202205031435.JEToTgim-lkp@intel.com/config)
> compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
> reproduce (this is a W=1 build):
>         # https://github.com/intel-lab-lkp/linux/commit/0d006aeaf99be94a0dcb727cb6540195f13fd9c3
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Zach-O-Keefe/mm-khugepaged-record-SCAN_PMD_MAPPED-when-scan_pmd-finds-THP/20220503-031727
>         git checkout 0d006aeaf99be94a0dcb727cb6540195f13fd9c3
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
>
> All warnings (new ones prefixed by >>):
>
>    mm/khugepaged.c: In function 'alloc_charge_hpage':
> >> mm/khugepaged.c:1091:36: warning: passing argument 1 of 'khugepaged_alloc_page' makes pointer from integer without a cast [-Wint-conversion]
>     1091 |         if (!khugepaged_alloc_page(gfp, node, cc))
>          |                                    ^~~
>          |                                    |
>          |                                    gfp_t {aka unsigned int}
>    mm/khugepaged.c:949:49: note: expected 'struct page **' but argument is of type 'gfp_t' {aka 'unsigned int'}
>      949 | static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>          |                                   ~~~~~~~~~~~~~~^~~~~
> >> mm/khugepaged.c:1091:47: warning: passing argument 3 of 'khugepaged_alloc_page' makes integer from pointer without a cast [-Wint-conversion]
>     1091 |         if (!khugepaged_alloc_page(gfp, node, cc))
>          |                                               ^~
>          |                                               |
>          |                                               struct collapse_control *
>    mm/khugepaged.c:949:71: note: expected 'int' but argument is of type 'struct collapse_control *'
>      949 | static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>          |                                                                   ~~~~^~~~
>
>
> vim +/khugepaged_alloc_page +1091 mm/khugepaged.c
>
>   1074
>   1075  static int alloc_charge_hpage(struct mm_struct *mm, struct collapse_control *cc)
>   1076  {
>   1077  #ifdef CONFIG_NUMA
>   1078          const struct cpumask *cpumask;
>   1079  #endif
>   1080          gfp_t gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
>   1081          int node = khugepaged_find_target_node(cc);
>   1082
>   1083  #ifdef CONFIG_NUMA
>   1084          /* sched to specified node before huge page memory copy */
>   1085          if (task_node(current) != node) {
>   1086                  cpumask = cpumask_of_node(node);
>   1087                  if (!cpumask_empty(cpumask))
>   1088                          set_cpus_allowed_ptr(current, cpumask);
>   1089          }
>   1090  #endif
> > 1091          if (!khugepaged_alloc_page(gfp, node, cc))
>   1092                  return SCAN_ALLOC_HUGE_PAGE_FAIL;
>   1093          if (unlikely(mem_cgroup_charge(page_folio(cc->hpage), mm, gfp)))
>   1094                  return SCAN_CGROUP_CHARGE_FAIL;
>   1095          count_memcg_page_event(cc->hpage, THP_COLLAPSE_ALLOC);
>   1096          return SCAN_SUCCEED;
>   1097  }
>   1098
>
> --
> 0-DAY CI Kernel Test Service
> https://01.org/lkp
>
Zach O'Keefe May 4, 2022, 9:46 p.m. UTC | #5
Thanks for reporting. Fixed in v5.

On Tue, May 3, 2022 at 7:25 PM kernel test robot <oliver.sang@intel.com> wrote:
>
>
>
> Greeting,
>
> FYI, we noticed the following commit (built with gcc-11):
>
> commit: 0d006aeaf99be94a0dcb727cb6540195f13fd9c3 ("[PATCH v4 04/13] mm/khugepaged: make hugepage allocation context-specific")
> url: https://github.com/intel-lab-lkp/linux/commits/Zach-O-Keefe/mm-khugepaged-record-SCAN_PMD_MAPPED-when-scan_pmd-finds-THP/20220503-031727
> patch link: https://lore.kernel.org/linux-mm/20220502181714.3483177-5-zokeefe@google.com
>
> in testcase: boot
>
> on test machine: qemu-system-i386 -enable-kvm -cpu SandyBridge -smp 2 -m 4G
>
> caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
>
>
>
> If you fix the issue, kindly add following tag
> Reported-by: kernel test robot <oliver.sang@intel.com>
>
>
> [   18.854835][   T17] BUG: unable to handle page fault for address: 003c24ca
> [   18.855169][   T17] #PF: supervisor read access in kernel mode
> [   18.855395][   T17] #PF: error_code(0x0000) - not-present page
> [   18.855620][   T17] *pde = 00000000
> [   18.855763][   T17] Oops: 0000 [#1]
> [   18.855903][   T17] CPU: 0 PID: 17 Comm: khugepaged Not tainted 5.18.0-rc5-next-20220502-00004-g0d006aeaf99b #1
> [ 18.856283][ T17] EIP: alloc_charge_hpage (mm/khugepaged.c:951 mm/khugepaged.c:1091)
> [ 18.856498][ T17] Code: 00 00 00 55 89 e5 6a 07 e8 86 c0 ff ff c9 31 d2 89 d1 c3 55 89 e5 a1 28 e6 fa 59 25 80 00 00 00 83 f8 01 19 c0 25 00 fc ff ff <8b> 80 ca 24 3c 00 85 c0 74 0f 8b 42 08 5d 8b 40 04 b8 01 00 00 00
> All code
> ========
>    0:   00 00                   add    %al,(%rax)
>    2:   00 55 89                add    %dl,-0x77(%rbp)
>    5:   e5 6a                   in     $0x6a,%eax
>    7:   07                      (bad)
>    8:   e8 86 c0 ff ff          callq  0xffffffffffffc093
>    d:   c9                      leaveq
>    e:   31 d2                   xor    %edx,%edx
>   10:   89 d1                   mov    %edx,%ecx
>   12:   c3                      retq
>   13:   55                      push   %rbp
>   14:   89 e5                   mov    %esp,%ebp
>   16:   a1 28 e6 fa 59 25 80    movabs 0x802559fae628,%eax
>   1d:   00 00
>   1f:   00 83 f8 01 19 c0       add    %al,-0x3fe6fe08(%rbx)
>   25:   25 00 fc ff ff          and    $0xfffffc00,%eax
>   2a:*  8b 80 ca 24 3c 00       mov    0x3c24ca(%rax),%eax              <-- trapping instruction
>   30:   85 c0                   test   %eax,%eax
>   32:   74 0f                   je     0x43
>   34:   8b 42 08                mov    0x8(%rdx),%eax
>   37:   5d                      pop    %rbp
>   38:   8b 40 04                mov    0x4(%rax),%eax
>   3b:   b8 01 00 00 00          mov    $0x1,%eax
>
> Code starting with the faulting instruction
> ===========================================
>    0:   8b 80 ca 24 3c 00       mov    0x3c24ca(%rax),%eax
>    6:   85 c0                   test   %eax,%eax
>    8:   74 0f                   je     0x19
>    a:   8b 42 08                mov    0x8(%rdx),%eax
>    d:   5d                      pop    %rbp
>    e:   8b 40 04                mov    0x4(%rax),%eax
>   11:   b8 01 00 00 00          mov    $0x1,%eax
> [   18.857217][   T17] EAX: 00000000 EBX: 41172400 ECX: 00000000 EDX: 411d1f7c
> [   18.857487][   T17] ESI: 411d1f7c EDI: 4117245c EBP: 411d1e64 ESP: 411d1e64
> [   18.857750][   T17] DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 EFLAGS: 00010246
> [   18.858036][   T17] CR0: 80050033 CR2: 003c24ca CR3: 112f3000 CR4: 000406d0
> [   18.858302][   T17] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
> [   18.858567][   T17] DR6: fffe0ff0 DR7: 00000400
> [   18.858743][   T17] Call Trace:
> [ 18.858871][ T17] ? collapse_huge_page (mm/khugepaged.c:1109)
> [ 18.859066][ T17] ? find_held_lock (kernel/locking/lockdep.c:5156)
> [ 18.859245][ T17] ? khugepaged_scan_pmd (mm/khugepaged.c:1400)
> [ 18.859446][ T17] ? khugepaged_scan_mm_slot (mm/khugepaged.c:2216)
> [ 18.859657][ T17] ? khugepaged_do_scan (mm/khugepaged.c:2290)
> [ 18.859854][ T17] ? khugepaged (mm/khugepaged.c:2340)
> [ 18.860016][ T17] ? khugepaged_defrag_show (mm/khugepaged.c:1076)
> [ 18.860219][ T17] ? kthread (kernel/kthread.c:376)
> [ 18.860374][ T17] ? khugepaged_do_scan (mm/khugepaged.c:2328)
> [ 18.860570][ T17] ? kthread_complete_and_exit (kernel/kthread.c:331)
> [ 18.860781][ T17] ? ret_from_fork (arch/x86/entry/entry_32.S:772)
> [   18.860956][   T17] Modules linked in:
> [   18.861105][   T17] CR2: 00000000003c24ca
> [   18.861262][   T17] ---[ end trace 0000000000000000 ]---
> [ 18.861263][ T17] EIP: alloc_charge_hpage (mm/khugepaged.c:951 mm/khugepaged.c:1091)
> [ 18.861266][ T17] Code: 00 00 00 55 89 e5 6a 07 e8 86 c0 ff ff c9 31 d2 89 d1 c3 55 89 e5 a1 28 e6 fa 59 25 80 00 00 00 83 f8 01 19 c0 25 00 fc ff ff <8b> 80 ca 24 3c 00 85 c0 74 0f 8b 42 08 5d 8b 40 04 b8 01 00 00 00
> All code
> ========
>    0:   00 00                   add    %al,(%rax)
>    2:   00 55 89                add    %dl,-0x77(%rbp)
>    5:   e5 6a                   in     $0x6a,%eax
>    7:   07                      (bad)
>    8:   e8 86 c0 ff ff          callq  0xffffffffffffc093
>    d:   c9                      leaveq
>    e:   31 d2                   xor    %edx,%edx
>   10:   89 d1                   mov    %edx,%ecx
>   12:   c3                      retq
>   13:   55                      push   %rbp
>   14:   89 e5                   mov    %esp,%ebp
>   16:   a1 28 e6 fa 59 25 80    movabs 0x802559fae628,%eax
>   1d:   00 00
>   1f:   00 83 f8 01 19 c0       add    %al,-0x3fe6fe08(%rbx)
>   25:   25 00 fc ff ff          and    $0xfffffc00,%eax
>   2a:*  8b 80 ca 24 3c 00       mov    0x3c24ca(%rax),%eax              <-- trapping instruction
>   30:   85 c0                   test   %eax,%eax
>   32:   74 0f                   je     0x43
>   34:   8b 42 08                mov    0x8(%rdx),%eax
>   37:   5d                      pop    %rbp
>   38:   8b 40 04                mov    0x4(%rax),%eax
>   3b:   b8 01 00 00 00          mov    $0x1,%eax
>
> Code starting with the faulting instruction
> ===========================================
>    0:   8b 80 ca 24 3c 00       mov    0x3c24ca(%rax),%eax
>    6:   85 c0                   test   %eax,%eax
>    8:   74 0f                   je     0x19
>    a:   8b 42 08                mov    0x8(%rdx),%eax
>    d:   5d                      pop    %rbp
>    e:   8b 40 04                mov    0x4(%rax),%eax
>   11:   b8 01 00 00 00          mov    $0x1,%eax
>
>
> To reproduce:
>
>         # build kernel
>         cd linux
>         cp config-5.18.0-rc5-next-20220502-00004-g0d006aeaf99b .config
>         make HOSTCC=gcc-11 CC=gcc-11 ARCH=i386 olddefconfig prepare modules_prepare bzImage modules
>         make HOSTCC=gcc-11 CC=gcc-11 ARCH=i386 INSTALL_MOD_PATH=<mod-install-dir> modules_install
>         cd <mod-install-dir>
>         find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz
>
>
>         git clone https://github.com/intel/lkp-tests.git
>         cd lkp-tests
>         bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email
>
>         # if come across any failure that blocks the test,
>         # please remove ~/.lkp and /lkp dir to run from a clean state.
>
>
>
> --
> 0-DAY CI Kernel Test Service
> https://01.org/lkp
>
>
diff mbox series

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b05fb9a85eab..755c40fe87d2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -92,6 +92,10 @@  struct collapse_control {
 
 	/* Last target selected in khugepaged_find_target_node() */
 	int last_target_node;
+
+	struct page *hpage;
+	int (*alloc_charge_hpage)(struct mm_struct *mm,
+				  struct collapse_control *cc);
 };
 
 /**
@@ -866,18 +870,19 @@  static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 	return true;
 }
 
-static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static bool khugepaged_alloc_page(gfp_t gfp, int node,
+				  struct collapse_control *cc)
 {
-	VM_BUG_ON_PAGE(*hpage, *hpage);
+	VM_BUG_ON_PAGE(cc->hpage, cc->hpage);
 
-	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
-	if (unlikely(!*hpage)) {
+	cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
+	if (unlikely(!cc->hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-		*hpage = ERR_PTR(-ENOMEM);
+		cc->hpage = ERR_PTR(-ENOMEM);
 		return false;
 	}
 
-	prep_transhuge_page(*hpage);
+	prep_transhuge_page(cc->hpage);
 	count_vm_event(THP_COLLAPSE_ALLOC);
 	return true;
 }
@@ -1067,8 +1072,7 @@  static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 	return true;
 }
 
-static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
-			      struct collapse_control *cc)
+static int alloc_charge_hpage(struct mm_struct *mm, struct collapse_control *cc)
 {
 #ifdef CONFIG_NUMA
 	const struct cpumask *cpumask;
@@ -1084,17 +1088,17 @@  static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
 			set_cpus_allowed_ptr(current, cpumask);
 	}
 #endif
-	if (!khugepaged_alloc_page(hpage, gfp, node))
+	if (!khugepaged_alloc_page(gfp, node, cc))
 		return SCAN_ALLOC_HUGE_PAGE_FAIL;
-	if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
+	if (unlikely(mem_cgroup_charge(page_folio(cc->hpage), mm, gfp)))
 		return SCAN_CGROUP_CHARGE_FAIL;
-	count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+	count_memcg_page_event(cc->hpage, THP_COLLAPSE_ALLOC);
 	return SCAN_SUCCEED;
 }
 
 static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
-			       struct page **hpage, int referenced,
-			       int unmapped, struct collapse_control *cc)
+			       int referenced, int unmapped,
+			       struct collapse_control *cc)
 {
 	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
@@ -1116,11 +1120,11 @@  static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 */
 	mmap_read_unlock(mm);
 
-	result = alloc_charge_hpage(hpage, mm, cc);
+	result = cc->alloc_charge_hpage(mm, cc);
 	if (result != SCAN_SUCCEED)
 		goto out_nolock;
 
-	new_page = *hpage;
+	new_page = cc->hpage;
 
 	mmap_read_lock(mm);
 	result = hugepage_vma_revalidate(mm, address, &vma);
@@ -1232,15 +1236,15 @@  static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	update_mmu_cache_pmd(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 
-	*hpage = NULL;
+	cc->hpage = NULL;
 
 	khugepaged_pages_collapsed++;
 	result = SCAN_SUCCEED;
 out_up_write:
 	mmap_write_unlock(mm);
 out_nolock:
-	if (!IS_ERR_OR_NULL(*hpage))
-		mem_cgroup_uncharge(page_folio(*hpage));
+	if (!IS_ERR_OR_NULL(cc->hpage))
+		mem_cgroup_uncharge(page_folio(cc->hpage));
 	trace_mm_collapse_huge_page(mm, isolated, result);
 	return;
 }
@@ -1248,7 +1252,6 @@  static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 static int khugepaged_scan_pmd(struct mm_struct *mm,
 			       struct vm_area_struct *vma,
 			       unsigned long address,
-			       struct page **hpage,
 			       struct collapse_control *cc)
 {
 	pmd_t *pmd;
@@ -1394,8 +1397,7 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 	pte_unmap_unlock(pte, ptl);
 	if (ret) {
 		/* collapse_huge_page will return with the mmap_lock released */
-		collapse_huge_page(mm, address, hpage, referenced, unmapped,
-				   cc);
+		collapse_huge_page(mm, address, referenced, unmapped, cc);
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
@@ -1660,7 +1662,6 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  * @mm: process address space where collapse happens
  * @file: file that collapse on
  * @start: collapse start address
- * @hpage: new allocated huge page for collapse
  * @cc: collapse context and scratchpad
  *
  * Basic scheme is simple, details are more complex:
@@ -1679,8 +1680,7 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  *    + unlock and free huge page;
  */
 static void collapse_file(struct mm_struct *mm, struct file *file,
-			  pgoff_t start, struct page **hpage,
-			  struct collapse_control *cc)
+			  pgoff_t start, struct collapse_control *cc)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *new_page;
@@ -1694,11 +1694,11 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
-	result = alloc_charge_hpage(hpage, mm, cc);
+	result = cc->alloc_charge_hpage(mm, cc);
 	if (result != SCAN_SUCCEED)
 		goto out;
 
-	new_page = *hpage;
+	new_page = cc->hpage;
 
 	/*
 	 * Ensure we have slots for all the pages in the range.  This is
@@ -1981,7 +1981,7 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 		 * Remove pte page tables, so we can re-fault the page as huge.
 		 */
 		retract_page_tables(mapping, start);
-		*hpage = NULL;
+		cc->hpage = NULL;
 
 		khugepaged_pages_collapsed++;
 	} else {
@@ -2028,14 +2028,14 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 	unlock_page(new_page);
 out:
 	VM_BUG_ON(!list_empty(&pagelist));
-	if (!IS_ERR_OR_NULL(*hpage))
-		mem_cgroup_uncharge(page_folio(*hpage));
+	if (!IS_ERR_OR_NULL(cc->hpage))
+		mem_cgroup_uncharge(page_folio(cc->hpage));
 	/* TODO: tracepoints */
 }
 
 static void khugepaged_scan_file(struct mm_struct *mm,
-		struct file *file, pgoff_t start, struct page **hpage,
-		struct collapse_control *cc)
+				 struct file *file, pgoff_t start,
+				 struct collapse_control *cc)
 {
 	struct page *page = NULL;
 	struct address_space *mapping = file->f_mapping;
@@ -2108,7 +2108,7 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
-			collapse_file(mm, file, start, hpage, cc);
+			collapse_file(mm, file, start, cc);
 		}
 	}
 
@@ -2116,8 +2116,8 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 }
 #else
 static void khugepaged_scan_file(struct mm_struct *mm,
-		struct file *file, pgoff_t start, struct page **hpage,
-		struct collapse_control *cc)
+				 struct file *file, pgoff_t start,
+				 struct collapse_control *cc)
 {
 	BUILD_BUG();
 }
@@ -2128,7 +2128,6 @@  static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 #endif
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
-					    struct page **hpage,
 					    struct collapse_control *cc)
 	__releases(&khugepaged_mm_lock)
 	__acquires(&khugepaged_mm_lock)
@@ -2205,12 +2204,11 @@  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 
 				mmap_read_unlock(mm);
 				ret = 1;
-				khugepaged_scan_file(mm, file, pgoff, hpage, cc);
+				khugepaged_scan_file(mm, file, pgoff, cc);
 				fput(file);
 			} else {
 				ret = khugepaged_scan_pmd(mm, vma,
-						khugepaged_scan.address,
-						hpage, cc);
+						khugepaged_scan.address, cc);
 			}
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
@@ -2268,15 +2266,15 @@  static int khugepaged_wait_event(void)
 
 static void khugepaged_do_scan(struct collapse_control *cc)
 {
-	struct page *hpage = NULL;
 	unsigned int progress = 0, pass_through_head = 0;
 	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
 	bool wait = true;
 
+	cc->hpage = NULL;
 	lru_add_drain_all();
 
 	while (progress < pages) {
-		if (!khugepaged_prealloc_page(&hpage, &wait))
+		if (!khugepaged_prealloc_page(&cc->hpage, &wait))
 			break;
 
 		cond_resched();
@@ -2290,14 +2288,14 @@  static void khugepaged_do_scan(struct collapse_control *cc)
 		if (khugepaged_has_work() &&
 		    pass_through_head < 2)
 			progress += khugepaged_scan_mm_slot(pages - progress,
-							    &hpage, cc);
+							    cc);
 		else
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);
 	}
 
-	if (!IS_ERR_OR_NULL(hpage))
-		put_page(hpage);
+	if (!IS_ERR_OR_NULL(cc->hpage))
+		put_page(cc->hpage);
 }
 
 static bool khugepaged_should_wakeup(void)
@@ -2331,6 +2329,7 @@  static int khugepaged(void *none)
 	struct mm_slot *mm_slot;
 	struct collapse_control cc = {
 		.last_target_node = NUMA_NO_NODE,
+		.alloc_charge_hpage = &alloc_charge_hpage,
 	};
 
 	set_freezable();