mm/vmemmap/devdax: Fix kernel crash when probing devdax devices

Message ID 20230407122353.12018-1-aneesh.kumar@linux.ibm.com (mailing list archive)

Commit Message

Aneesh Kumar K.V April 7, 2023, 12:23 p.m. UTC
commit c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages")
added support for using optimized vmemmap for devdax devices. But how vmemmap
mappings are created is architecture specific. For example, powerpc with hash
translation doesn't have vmemmap mappings in the init_mm page table; instead
they are bolted entries in the hardware page table.
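
(Concretely: the generic compound-page path creates vmemmap entries through the
init_mm page tables, so on hash the resulting struct pages end up with no real
backing mapping, and the first write to them faults.)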

vmemmap_populate_compound_pages() used by the vmemmap optimization code is not
aware of these architecture-specific mappings. Hence allow architectures to opt
in to this feature. I selected the architectures supporting the
HUGETLB_PAGE_OPTIMIZE_VMEMMAP option as also supporting this feature. I added
vmemmap_can_optimize() even though the pgmap_vmemmap_nr(pgmap) > 1 check should
filter architectures not supporting this. IMHO that brings clarity to the code
where we are populating vmemmap.
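
[ For reference, pgmap_vmemmap_nr() encodes the compound geometry recorded in
the pgmap; roughly, paraphrasing the helper in include/linux/memremap.h:

	/* number of base pages per compound devmap page */
	static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
	{
		return 1 << pgmap->vmemmap_shift;
	}

so pgmap_vmemmap_nr(pgmap) > 1 only holds when a non-zero vmemmap_shift was
set, i.e. when the device actually uses compound pages. ]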

This patch fixes the below crash on ppc64.

BUG: Unable to handle kernel data access on write at 0xc00c000100400038
Faulting instruction address: 0xc000000001269d90
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in:
CPU: 7 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc5-150500.34-default+ #2 5c90a668b6bbd142599890245c2fb5de19d7d28a
Hardware name: IBM,9009-42G POWER9 (raw) 0x4e0202 0xf000005 of:IBM,FW950.40 (VL950_099) hv:phyp pSeries
NIP:  c000000001269d90 LR: c0000000004c57d4 CTR: 0000000000000000
REGS: c000000003632c30 TRAP: 0300   Not tainted  (6.3.0-rc5-150500.34-default+)
MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 24842228  XER: 00000000
CFAR: c0000000004c57d0 DAR: c00c000100400038 DSISR: 42000000 IRQMASK: 0
....
NIP [c000000001269d90] __init_single_page.isra.74+0x14/0x4c
LR [c0000000004c57d4] __init_zone_device_page+0x44/0xd0
Call Trace:
[c000000003632ed0] [c000000003632f60] 0xc000000003632f60 (unreliable)
[c000000003632f10] [c0000000004c5ca0] memmap_init_zone_device+0x170/0x250
[c000000003632fe0] [c0000000005575f8] memremap_pages+0x2c8/0x7f0
[c0000000036330c0] [c000000000557b5c] devm_memremap_pages+0x3c/0xa0
[c000000003633100] [c000000000d458a8] dev_dax_probe+0x108/0x3e0
[c0000000036331a0] [c000000000d41430] dax_bus_probe+0xb0/0x140
[c0000000036331d0] [c000000000cef27c] really_probe+0x19c/0x520
[c000000003633260] [c000000000cef6b4] __driver_probe_device+0xb4/0x230
[c0000000036332e0] [c000000000cef888] driver_probe_device+0x58/0x120
[c000000003633320] [c000000000cefa6c] __device_attach_driver+0x11c/0x1e0
[c0000000036333a0] [c000000000cebc58] bus_for_each_drv+0xa8/0x130
[c000000003633400] [c000000000ceefcc] __device_attach+0x15c/0x250
[c0000000036334a0] [c000000000ced458] bus_probe_device+0x108/0x110
[c0000000036334f0] [c000000000ce92dc] device_add+0x7fc/0xa10
[c0000000036335b0] [c000000000d447c8] devm_create_dev_dax+0x1d8/0x530
[c000000003633640] [c000000000d46b60] __dax_pmem_probe+0x200/0x270
[c0000000036337b0] [c000000000d46bf0] dax_pmem_probe+0x20/0x70
[c0000000036337d0] [c000000000d2279c] nvdimm_bus_probe+0xac/0x2b0
[c000000003633860] [c000000000cef27c] really_probe+0x19c/0x520
[c0000000036338f0] [c000000000cef6b4] __driver_probe_device+0xb4/0x230
[c000000003633970] [c000000000cef888] driver_probe_device+0x58/0x120
[c0000000036339b0] [c000000000cefd08] __driver_attach+0x1d8/0x240
[c000000003633a30] [c000000000cebb04] bus_for_each_dev+0xb4/0x130
[c000000003633a90] [c000000000cee564] driver_attach+0x34/0x50
[c000000003633ab0] [c000000000ced878] bus_add_driver+0x218/0x300
[c000000003633b40] [c000000000cf1144] driver_register+0xa4/0x1b0
[c000000003633bb0] [c000000000d21a0c] __nd_driver_register+0x5c/0x100
[c000000003633c10] [c00000000206a2e8] dax_pmem_init+0x34/0x48
[c000000003633c30] [c0000000000132d0] do_one_initcall+0x60/0x320
[c000000003633d00] [c0000000020051b0] kernel_init_freeable+0x360/0x400
[c000000003633de0] [c000000000013764] kernel_init+0x34/0x1d0
[c000000003633e50] [c00000000000de14] ret_from_kernel_thread+0x5c/0x64
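
[ The DAR 0xc00c000100400038 falls in the ppc64 book3s64 vmemmap virtual
region (which starts at 0xc00c000000000000), consistent with the explanation
above: the struct page being initialized has no vmemmap backing. ]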

Fixes: c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages")
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Reported-by: Tarun Sahu <tsahu@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/arm64/Kconfig     |  1 +
 arch/loongarch/Kconfig |  1 +
 arch/s390/Kconfig      |  1 +
 arch/x86/Kconfig       |  1 +
 drivers/dax/device.c   |  3 ++-
 include/linux/mm.h     | 16 ++++++++++++++++
 mm/Kconfig             |  3 +++
 mm/sparse-vmemmap.c    |  3 +--
 8 files changed, 26 insertions(+), 3 deletions(-)

Comments

Jane Chu April 7, 2023, 6:03 p.m. UTC | #1
On 4/7/2023 5:23 AM, Aneesh Kumar K.V wrote:
> diff --git a/mm/Kconfig b/mm/Kconfig
> index ff7b209dec05..99f87c1be1e8 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -461,6 +461,9 @@ config SPARSEMEM_VMEMMAP
>   	  pfn_to_page and page_to_pfn operations.  This is the most
>   	  efficient option when sufficient kernel resources are available.
>   
> +config ARCH_WANT_OPTIMIZE_VMEMMAP
> +	bool
> +

Could this devdax-specific config switch be added to drivers/dax/Kconfig?
Also, how about adding 'DAX' to the config switch name?

BTW, I noticed something minor and unrelated in the original commit 
(c4386bd8ee3a):

-static unsigned long pfn_next(unsigned long pfn)
+static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
  {
-       if (pfn % 1024 == 0)
+       if (pfn % (1024 << pgmap->vmemmap_shift))     <---- this line
                 cond_resched();
-       return pfn + 1;
+       return pfn + pgmap_vmemmap_nr(pgmap);
+}

should be
+       if (pfn % (1024 * pgmap_vmemmap_nr(pgmap)))
to be consistent.
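
[ Both spellings compute the same value, since pgmap_vmemmap_nr(pgmap) is
1 << pgmap->vmemmap_shift; the suggestion is about stylistic consistency,
not behavior. ]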

thanks,
-jane
Aneesh Kumar K.V April 8, 2023, 10:19 a.m. UTC | #2
Jane Chu <jane.chu@oracle.com> writes:

> On 4/7/2023 5:23 AM, Aneesh Kumar K.V wrote:
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index ff7b209dec05..99f87c1be1e8 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -461,6 +461,9 @@ config SPARSEMEM_VMEMMAP
>>   	  pfn_to_page and page_to_pfn operations.  This is the most
>>   	  efficient option when sufficient kernel resources are available.
>>   
>> +config ARCH_WANT_OPTIMIZE_VMEMMAP
>> +	bool
>> +
>
> Could this devdax specific config switch be added to drivers/dax/Kconfig 
> ? also, how about adding 'DAX' to the config switch name?

I would say we want to make it more generic, i.e., both hugetlb and devdax
can now derive the feature support via ARCH_WANT_OPTIMIZE_VMEMMAP.

commit aafb4790ea0250c8d2450e9d23a4be80c663d2ec
Author: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Date:   Sat Apr 8 15:41:48 2023 +0530

    mm/hugetlb: Remove ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
    
    We can depend on the more generic ARCH_WANT_OPTIMIZE_VMEMMAP,
    which is now used to enable both hugetlb and devdax vmemmap optimization.
    
    Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index d3f5945f0aff..77d9713dcd9c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -100,7 +100,6 @@ config ARM64
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
-	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANT_OPTIMIZE_VMEMMAP
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index ce5802066d0e..9cb00f962de1 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -53,7 +53,6 @@ config LOONGARCH
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
-	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANT_OPTIMIZE_VMEMMAP
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index abffccd937b2..df2cd510480a 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -126,7 +126,6 @@ config S390
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_IPC_PARSE_VERSION
-	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_OPTIMIZE_VMEMMAP
 	select BUILDTIME_TABLE_SORT
 	select CLONE_BACKWARDS2
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e8d66d834b4f..5269131cc248 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -125,7 +125,6 @@ config X86
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANT_GENERAL_HUGETLB
 	select ARCH_WANT_HUGE_PMD_SHARE
-	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP	if X86_64
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANT_OPTIMIZE_VMEMMAP	if X86_64
 	select ARCH_WANTS_THP_SWAP		if X86_64
diff --git a/fs/Kconfig b/fs/Kconfig
index e99830c65033..cc07a0cd3172 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -250,16 +250,9 @@ config HUGETLBFS
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
-#
-# Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
-#
-config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-	bool
-
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	def_bool HUGETLB_PAGE
-	depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+	depends on ARCH_WANT_OPTIMIZE_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
 
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
diff --git a/mm/Kconfig b/mm/Kconfig
index 99f87c1be1e8..09ac60894763 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -460,7 +460,10 @@ config SPARSEMEM_VMEMMAP
 	  SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
 	  pfn_to_page and page_to_pfn operations.  This is the most
 	  efficient option when sufficient kernel resources are available.
-
+#
+# Select this config option from the architecture Kconfig, if it is preferred
+# to enable the feature of HugeTLB/dev_dax vmemmap optimization.
+#
 config ARCH_WANT_OPTIMIZE_VMEMMAP
 	bool
Dan Williams April 9, 2023, 2 a.m. UTC | #3
Aneesh Kumar K.V wrote:
> [ commit message and crash log snipped ]
> 
> Fixes: c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages")
> Cc: Joao Martins <joao.m.martins@oracle.com>
> Cc: Muchun Song <songmuchun@bytedance.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Reported-by: Tarun Sahu <tsahu@linux.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

Looks good to me,

Reviewed-by: Dan Williams <dan.j.williams@intel.com>

...I guess device-dax is not often used on ppc if this has been
lingering since v5.17.
Joao Martins April 10, 2023, 10:33 a.m. UTC | #4
On 07/04/2023 13:23, Aneesh Kumar K.V wrote:
> commit c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages")
> added support for using optimized vmemmap for devdax devices.

Not really. It added *compound pages* for devdax ZONE_DEVICE, which is what the
commit describes. It is meant to represent the non-base-page metadata. Should
fsdax support the equivalent, we can have GUP performance on the dax path and
remove today's special zone-device case of the ref-per-base-page GUP-fast path.

The two were tied together when this was brought in, but in theory[*] we didn't
require anything from arches to do this (contrary to vmemmap_populate()). The
commit you should refer to is this instead:

4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps")

[*] or so I thought before next paragraph

> But how vmemmap
> mappings are created is architecture specific. For example, powerpc with hash
> translation doesn't have vmemmap mappings in the init_mm page table; instead
> they are bolted entries in the hardware page table.
> 

Very interesting, I didn't know this (no init_mm mappings) was an option. And my
apologies for regressing s390 :/ So, s390 does not support
vmemmap_populate_basepages() then, and always needs to go through the arch
vmemmap_populate() to create the vmemmap entries.
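
[ For context, the generic path being discussed, which assumes the vmemmap is
mapped via init_mm page tables; lightly paraphrased from mm/sparse-vmemmap.c
with comments added:

	int __meminit vmemmap_populate_basepages(unsigned long start,
						 unsigned long end, int node,
						 struct vmem_altmap *altmap)
	{
		unsigned long addr = start;
		pte_t *pte;

		/* walk/create init_mm page tables (pgd..pte) for every
		 * base page of the vmemmap virtual range */
		for (; addr < end; addr += PAGE_SIZE) {
			pte = vmemmap_populate_address(addr, node, altmap, NULL);
			if (!pte)
				return -ENOMEM;
		}

		return 0;
	}
]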

> vmemmap_populate_compound_pages() used by the vmemmap optimization code is not
> aware of these architecture-specific mappings. Hence allow architectures to opt
> in to this feature. I selected the architectures supporting the
> HUGETLB_PAGE_OPTIMIZE_VMEMMAP option as also supporting this feature. I added
> vmemmap_can_optimize() even though the pgmap_vmemmap_nr(pgmap) > 1 check should
> filter architectures not supporting this. IMHO that brings clarity to the code
> where we are populating vmemmap.
> 
I am not sure the last two sentences are right. I would remove them; see the
comments above and at the end below on why.

> This patch fixes the below crash on ppc64.
> 
> [ crash log snipped ]
> 
> Fixes: c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages")

It should be:

Fixes: 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps")

But if what you included in your patch was what led to the crash, then your
problem is not the vmemmap optimization, as the latter came after. If it's
the hash I included above, it would reduce the affected trees to v5.19+.

> Cc: Joao Martins <joao.m.martins@oracle.com>
> Cc: Muchun Song <songmuchun@bytedance.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Reported-by: Tarun Sahu <tsahu@linux.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/arm64/Kconfig     |  1 +
>  arch/loongarch/Kconfig |  1 +
>  arch/s390/Kconfig      |  1 +
>  arch/x86/Kconfig       |  1 +
>  drivers/dax/device.c   |  3 ++-
>  include/linux/mm.h     | 16 ++++++++++++++++
>  mm/Kconfig             |  3 +++
>  mm/sparse-vmemmap.c    |  3 +--
>  8 files changed, 26 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 27b2592698b0..d3f5945f0aff 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -103,6 +103,7 @@ config ARM64
>  	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>  	select ARCH_WANT_LD_ORPHAN_WARN
>  	select ARCH_WANTS_NO_INSTR
> +	select ARCH_WANT_OPTIMIZE_VMEMMAP
>  	select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES
>  	select ARCH_HAS_UBSAN_SANITIZE_ALL
>  	select ARM_AMBA
> diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
> index 9cc8b84f7eb0..ce5802066d0e 100644
> --- a/arch/loongarch/Kconfig
> +++ b/arch/loongarch/Kconfig
> @@ -56,6 +56,7 @@ config LOONGARCH
>  	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>  	select ARCH_WANT_LD_ORPHAN_WARN
>  	select ARCH_WANTS_NO_INSTR
> +	select ARCH_WANT_OPTIMIZE_VMEMMAP
>  	select BUILDTIME_TABLE_SORT
>  	select COMMON_CLK
>  	select CPU_PM
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index 933771b0b07a..abffccd937b2 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -127,6 +127,7 @@ config S390
>  	select ARCH_WANT_DEFAULT_BPF_JIT
>  	select ARCH_WANT_IPC_PARSE_VERSION
>  	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> +	select ARCH_WANT_OPTIMIZE_VMEMMAP
>  	select BUILDTIME_TABLE_SORT
>  	select CLONE_BACKWARDS2
>  	select DMA_OPS if PCI
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index a825bf031f49..e8d66d834b4f 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -127,6 +127,7 @@ config X86
>  	select ARCH_WANT_HUGE_PMD_SHARE
>  	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP	if X86_64
>  	select ARCH_WANT_LD_ORPHAN_WARN
> +	select ARCH_WANT_OPTIMIZE_VMEMMAP	if X86_64
>  	select ARCH_WANTS_THP_SWAP		if X86_64
>  	select ARCH_HAS_PARANOID_L1D_FLUSH
>  	select BUILDTIME_TABLE_SORT

I would remove these and instead use
ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, and have your other snippet be a
second patch that drops the 'HUGETLB_' part.

> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index 5494d745ced5..05be8e79d64b 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -448,7 +448,8 @@ int dev_dax_probe(struct dev_dax *dev_dax)
>  	}
>  
>  	pgmap->type = MEMORY_DEVICE_GENERIC;
> -	if (dev_dax->align > PAGE_SIZE)
> +	if (dev_dax->align > PAGE_SIZE &&
> +	    IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP))
>  		pgmap->vmemmap_shift =
>  			order_base_2(dev_dax->align >> PAGE_SHIFT);

vmemmap_shift relates to the compound page size we will use in
memmap_init_zone_device(). Otherwise we are back to base struct pages for the
PMD/PUD case. Instead move the:

	IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP)

further below to __populate_section_memmap() ...

>  	addr = devm_memremap_pages(dev, pgmap);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 716d30d93616..fb71e21df23d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3442,6 +3442,22 @@ void vmemmap_populate_print_last(void);
>  void vmemmap_free(unsigned long start, unsigned long end,
>  		struct vmem_altmap *altmap);
>  #endif
> +
> +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
> +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
> +					   struct dev_pagemap *pgmap)
> +{
> +	return is_power_of_2(sizeof(struct page)) &&
> +		pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
> +}
> +#else
> +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
> +					   struct dev_pagemap *pgmap)
> +{
> +	return false;
> +}
> +#endif
> +
>  void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
>  				  unsigned long nr_pages);
>  
> diff --git a/mm/Kconfig b/mm/Kconfig
> index ff7b209dec05..99f87c1be1e8 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -461,6 +461,9 @@ config SPARSEMEM_VMEMMAP
>  	  pfn_to_page and page_to_pfn operations.  This is the most
>  	  efficient option when sufficient kernel resources are available.
>  
> +config ARCH_WANT_OPTIMIZE_VMEMMAP
> +	bool
> +
>  config HAVE_MEMBLOCK_PHYS_MAP
>  	bool
>  
As you mentioned in your other followup email, it probably makes sense that you
just use ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if we switch this to per-arch,
and then have that kconfig name be renamed into the above. It simplifies the
fix in case it gets picked up by stable trees (the dax vmemmap deduplication
came after hugetlb's). The head page reuse case for hugetlb is anyway captured
by a hugetlb static key.


> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index c5398a5960d0..10d73a0dfcec 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -458,8 +458,7 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
>  		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
>  		return NULL;
>  
> -	if (is_power_of_2(sizeof(struct page)) &&
> -	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
> +	if (vmemmap_can_optimize(altmap, pgmap))
>  		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
>  	else
>  		r = vmemmap_populate(start, end, nid, altmap);

Same comment(s) as further above. I think it should be:

if (IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP))
	r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
else
	r = vmemmap_populate(start, end, nid, altmap);

In principle all the arches below can support
vmemmap_populate_compound_pages(), but now it would require them to enable it
explicitly, which I guess makes sense considering the bug you are fixing. But
the arch requirement is not really whether we support optimized vmemmap, but
whether we support the init_mm mappings at all. OTOH, tying hugetlb and dax
together additionally eliminates confusion around an optimization shared by
both, even if dax could encompass more. I am not sure what the arch requirement
for hugetlb vmemmap optimization is these days.

arch/arm64/mm/mmu.c:            return vmemmap_populate_basepages(start, end, node, altmap);
arch/ia64/mm/discontig.c:       return vmemmap_populate_basepages(start, end, node, NULL);
arch/loongarch/mm/init.c:       return vmemmap_populate_basepages(start, end, node, NULL);
arch/riscv/mm/init.c:   return vmemmap_populate_basepages(start, end, node, NULL);
arch/x86/mm/init_64.c:          err = vmemmap_populate_basepages(start, end, node, NULL);
arch/x86/mm/init_64.c:          err = vmemmap_populate_basepages(start, end, node, NULL);
Jane Chu April 10, 2023, 5:27 p.m. UTC | #5
On 4/8/2023 3:19 AM, Aneesh Kumar K.V wrote:
> Jane Chu <jane.chu@oracle.com> writes:
> 
>> On 4/7/2023 5:23 AM, Aneesh Kumar K.V wrote:
>>> diff --git a/mm/Kconfig b/mm/Kconfig
>>> index ff7b209dec05..99f87c1be1e8 100644
>>> --- a/mm/Kconfig
>>> +++ b/mm/Kconfig
>>> @@ -461,6 +461,9 @@ config SPARSEMEM_VMEMMAP
>>>    	  pfn_to_page and page_to_pfn operations.  This is the most
>>>    	  efficient option when sufficient kernel resources are available.
>>>    
>>> +config ARCH_WANT_OPTIMIZE_VMEMMAP
>>> +	bool
>>> +
>>
>> Could this devdax specific config switch be added to drivers/dax/Kconfig
>> ? also, how about adding 'DAX' to the config switch name?
> 
> I would say we want to make it more generic, i.e., both hugetlb and devdax
> can now derive the feature support via ARCH_WANT_OPTIMIZE_VMEMMAP.

The two config switches have different purposes; it's better to keep
them separate.  For example, the recent hugetlb high granularity mapping
(HGM) project requires users to make a choice between HGM and hugetlb
vmemmap optimization (at least for now), while one can keep devdax
compound page support enabled.

> [ follow-up patch removing ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP snipped ]

thanks,
-jane
Joao Martins April 10, 2023, 5:47 p.m. UTC | #6
On 10/04/2023 18:27, Jane Chu wrote:
> 
> On 4/8/2023 3:19 AM, Aneesh Kumar K.V wrote:
>> Jane Chu <jane.chu@oracle.com> writes:
>>
>>> On 4/7/2023 5:23 AM, Aneesh Kumar K.V wrote:
>>>> diff --git a/mm/Kconfig b/mm/Kconfig
>>>> index ff7b209dec05..99f87c1be1e8 100644
>>>> --- a/mm/Kconfig
>>>> +++ b/mm/Kconfig
>>>> @@ -461,6 +461,9 @@ config SPARSEMEM_VMEMMAP
>>>>          pfn_to_page and page_to_pfn operations.  This is the most
>>>>          efficient option when sufficient kernel resources are available.
>>>>    +config ARCH_WANT_OPTIMIZE_VMEMMAP
>>>> +    bool
>>>> +
>>>
>>> Could this devdax specific config switch be added to drivers/dax/Kconfig
>>> ? also, how about adding 'DAX' to the config switch name?
>>
>> I would say we want to make it more generic, i.e., both hugetlb and devdax
>> can now derive the feature support via ARCH_WANT_OPTIMIZE_VMEMMAP.
> 
> The two config switches have different purposes; it's better to keep them
> separate.  For example, the recent hugetlb high granularity mapping (HGM)
> project requires users to make a choice between HGM and hugetlb vmemmap
> optimization (at least for now), while one can keep devdax compound page
> support enabled.
> 

Is it done by kconfig? If it helps:

* there's a static key hugetlb can use to tell if this is enabled or not, like
how page_fixed_fake_head() uses it:

if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
	...

* there's a hugetlb page flag for vmemmap-optimized pages,
HPageVmemmapOptimized(page).

* there is a separate hugetlb CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP which is not
the same as the ARCH kconfig.

But perhaps there's some relevance in the ARCH_HUGETLB one specific to HGMv2
that I am unaware of.

>> [ follow-up patch snipped ]
Jane Chu April 10, 2023, 9:39 p.m. UTC | #7
On 4/10/2023 10:47 AM, Joao Martins wrote:
> [ earlier context snipped ]
> 
> Is it done by kconfig? If it helps:
> 
> * there's a static key hugetlb can use to tell if this is enabled or not like
> how page_fixed_fake_head() uses it:
> 
> if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
> 	...
> 
> * there's a hugetlb page flag for vmemmap-optimized pages,
> HPageVmemmapOptimized(page).
> 
> * there is a separate hugetlb CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP which is not
> the same as the ARCH kconfig.
> 
> But perhaps there's some relevance in the ARCH_HUGETLB one specific to HGMv2
> that I am unaware of.
> 

Looks like there is no direct consumer of
CONFIG_ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP.
There are three ways to enable HVO: via
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON alone, or a combination
of CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP and the boot param
'vmemmap_optimize_enabled', or the sysctl variable
/proc/sys/vm/hugetlb_optimize_vmemmap.

It seems that the devdax compound page support does not cross paths with
HVO, which has a lot more moving arms, and so it's better to keep the
devdax config switch local to devdax and leave HVO alone.

thanks!
-jane
Joao Martins April 10, 2023, 10:55 p.m. UTC | #8
On 10/04/2023 22:39, Jane Chu wrote:
> [ earlier context snipped ]
> 
> Looks like there is no consumer of CONFIG_ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP.
> There are three ways to enable HVO via
> CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON alone, or a combination of
> CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP and boot param 'vmemmap_optimize_enabled'
> or sysctl variable /proc/sys/vm/hugetlb_optimize_vmemmap.
> 
> It seems that the devdax compound page support 

Let's call it devdax vmemmap deduplication, as compound pages can work
regardless of that trick.

> does not cross paths with HVO,
> which has a lot more moving arms, and so it's better to keep the devdax config
> switch local to devdax and leave HVO alone.

I agree; I would rather not change
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON nor
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP and its knobs. But to be clear, the one I
was talking about is ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP (which is selected
by the architecture), not the other ones. Were it a generic one, it would
translate to 'can the architecture support deduplicated vmemmap', regardless of
whether it is active or used by DAX/HVO. HVO would use its hugetlb kconfig knobs
and the function helpers pointed out above to differentiate the HVO-specific
case, as it does today. Perhaps we can also hear from Muchun on what he thinks
is right for HVO.

	Joao
Aneesh Kumar K.V April 11, 2023, 8:07 a.m. UTC | #9
Joao Martins <joao.m.martins@oracle.com> writes:

> On 07/04/2023 13:23, Aneesh Kumar K.V wrote:
>> commit c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages")
>> added support for using optimized vmemmap for devdax devices.
>
> Not really. It added *compound pages* for devdax ZONE_DEVICE, which is what the
> commit describes. It is meant to represent the non-base-page metadata. Should
> fsdax support the equivalent, we can have GUP performance on the dax path and
> remove today's special zone-device case of the ref-per-base-page GUP-fast path.
>
> The two were tied together when this was brought in, but in theory[*] we didn't
> require anything from arches to do this (contrary to vmemmap_populate()). The
> commit you should refer to is this instead:

OK, the compound page dependency came via commit 6fd3620b3428
("mm/page_alloc: reuse tail struct pages for compound devmaps").

I have now reworked it such that we do

static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
					      unsigned long nr_pages)
{
#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
	/*
	 * this depends on how vmemmap is populated by
	 * vmemmap_populate_compound_pages()
	 */
	return is_power_of_2(sizeof(struct page)) &&
		!altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
#else
	return nr_pages;
#endif
}
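
[ For scale: with the usual 64-byte struct page and 4K PAGE_SIZE, the
optimized case returns 2 * (4096 / 64) = 128, i.e. only the struct pages
backed by the first two vmemmap pages of each compound devmap page are
initialized; the remaining tail struct pages live in vmemmap pages that
alias the second one (see 4917f55b4ef9). ]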

>
> 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps")
>
> [*] or so I thought before next paragraph
>
>> But how vmemmap
>> mappings are created is architecture specific. For example, powerpc with hash
>> translation doesn't have vmemmap mappings in the init_mm page table; instead
>> they are bolted entries in the hardware page table.
>> 
>
> Very interesting, I didn't know this (no init_mm) was an option. And my
> apologies for regressing s390 :/ So, s390 does not support
> vmemmap_populate_basepages() then and always need to go to arch
> vmemmap_populate() to create the vmemmap entries.
>
>> vmemmap_populate_compound_pages() used by the vmemmap optimization code is not
>> aware of these architecture-specific mappings. Hence allow architectures to opt
>> in to this feature. I selected the architectures supporting the
>> HUGETLB_PAGE_OPTIMIZE_VMEMMAP option as also supporting this feature. I added
>> vmemmap_can_optimize() even though the pgmap_vmemmap_nr(pgmap) > 1 check should
>> filter architectures not supporting this. IMHO that brings clarity to the code
>> where we are populating vmemmap.
>> 
> I am not sure the last two sentences are right. I would remove, see above and
> below comments at the end on why.

That is true because I had vmemmap_shift = 0 if the arch didn't support
vmemmap optimization. Based on your reply above, I have now moved that
out and kept vmemmap_can_optimize().

modified   mm/page_alloc.c
@@ -6846,8 +6846,16 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
 static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
 					      unsigned long nr_pages)
 {
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
+	/*
+	 * this depends on how vmemmap is populated by
+	 * vmemmap_populate_compound_pages()
+	 */
 	return is_power_of_2(sizeof(struct page)) &&
 		!altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
+#else
+	return nr_pages;
+#endif
 }
 
 static void __ref memmap_init_compound(struct page *head,
modified   mm/sparse-vmemmap.c
@@ -458,8 +458,7 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
 		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
 		return NULL;
 
-	if (is_power_of_2(sizeof(struct page)) &&
-	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
+	if (vmemmap_can_optimize(altmap, pgmap))
 		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
 	else
 		r = vmemmap_populate(start, end, nid, altmap);

....

>
>> Fixes: c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages")
>
> It should be:
>
> Fixes: 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps")

updated.

>
> But if what you included in your patch was what led to the crash, then your
> problem is not the vmemmap optimization, as the latter came after. If it's
> the hash I included above, it would reduce the affected trees to v5.19+.
>
>> [ tags, diffstat and arch Kconfig hunks snipped ]
>
> I would remove these and instead use
> ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, and have your other snippet be a
> second patch that drops the 'HUGETLB_' part.
>
>> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
>> index 5494d745ced5..05be8e79d64b 100644
>> --- a/drivers/dax/device.c
>> +++ b/drivers/dax/device.c
>> @@ -448,7 +448,8 @@ int dev_dax_probe(struct dev_dax *dev_dax)
>>  	}
>>  
>>  	pgmap->type = MEMORY_DEVICE_GENERIC;
>> -	if (dev_dax->align > PAGE_SIZE)
>> +	if (dev_dax->align > PAGE_SIZE &&
>> +	    IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP))
>>  		pgmap->vmemmap_shift =
>>  			order_base_2(dev_dax->align >> PAGE_SHIFT);
>
> vmemmap_shift relates to the compound page size we will use in
> memmap_init_zone_device(). Otherwise we are back to base struct pages for the
> PMD/PUD case. Instead move the:
>
> 	IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP)
>
> further below to __populate_section_memmap() ...
>
>>  	addr = devm_memremap_pages(dev, pgmap);
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 716d30d93616..fb71e21df23d 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3442,6 +3442,22 @@ void vmemmap_populate_print_last(void);
>>  void vmemmap_free(unsigned long start, unsigned long end,
>>  		struct vmem_altmap *altmap);
>>  #endif
>> +
>> +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
>> +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
>> +					   struct dev_pagemap *pgmap)
>> +{
>> +	return is_power_of_2(sizeof(struct page)) &&
>> +		pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
>> +}
>> +#else
>> +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
>> +					   struct dev_pagemap *pgmap)
>> +{
>> +	return false;
>> +}
>> +#endif
>> +
>>  void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
>>  				  unsigned long nr_pages);
>>  
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index ff7b209dec05..99f87c1be1e8 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -461,6 +461,9 @@ config SPARSEMEM_VMEMMAP
>>  	  pfn_to_page and page_to_pfn operations.  This is the most
>>  	  efficient option when sufficient kernel resources are available.
>>  
>> +config ARCH_WANT_OPTIMIZE_VMEMMAP
>> +	bool
>> +
>>  config HAVE_MEMBLOCK_PHYS_MAP
>>  	bool
>>  
> As you mentioned in your other followup email, it probably makes sense that you
> just use ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if we switch this to per-arch,
> and then have that kconfig name be renamed into the above. It simplifies the
> fix in case it gets picked up by stable trees (the dax vmemmap deduplication
> came after hugetlb's). The head page reuse case for hugetlb is anyway captured
> by a hugetlb static key.

will do

>
>
>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
>> index c5398a5960d0..10d73a0dfcec 100644
>> --- a/mm/sparse-vmemmap.c
>> +++ b/mm/sparse-vmemmap.c
>> @@ -458,8 +458,7 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
>>  		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
>>  		return NULL;
>>  
>> -	if (is_power_of_2(sizeof(struct page)) &&
>> -	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
>> +	if (vmemmap_can_optimize(altmap, pgmap))
>>  		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
>>  	else
>>  		r = vmemmap_populate(start, end, nid, altmap);
>
> Same comment(s) as further above. I think it should be:
>
> if (IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP))
> 	r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
> else
> 	r = vmemmap_populate(start, end, nid, altmap);

Even if the arch wants vmemmap optimization, the ability to use the compound
vmemmap_populate is further restricted by:

	if (is_power_of_2(sizeof(struct page)) &&
	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)

Hence we still want if (vmemmap_can_optimize(..)), right?

> [ remaining quote snipped ]
On 11/04/2023 09:07, Aneesh Kumar K.V wrote:
> Joao Martins <joao.m.martins@oracle.com> writes:
> 
>> On 07/04/2023 13:23, Aneesh Kumar K.V wrote:
>>> commit c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages")
>>> added support for using optimized vmemmap for devdax devices.
>>
>> Not really. It added *compound pages* for devdax ZONE_DEVICE, which is what the
>> commit describes. It is meant to represent the non-base-page metadata. Should
>> fsdax support the equivalent, we can have GUP performance on the dax path and
>> remove today's special zone-device case of the ref-per-base-page GUP-fast path.
>>
>> The two were tied together when this was brought in, but in theory[*] we
>> didn't require anything from arches to do this (contrary to
>> vmemmap_populate()). The commit you should refer to is this instead:
> 
> ok the compound page dependency came via
> commit 6fd3620b3428 ("mm/page_alloc: reuse tail struct pages for compound devmaps")
> 
It's an optimization to remove unnecessary work, not exactly a real dependency.
But I understand what you mean.

> I have now reworked it such that we do
> 
> static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
> 					      unsigned long nr_pages)
> {
> #ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
> 	/*
> 	 * this depends on how vmemmap is populated by
> 	 * vmemmap_populate_compound_pages()
> 	 */
> 	return is_power_of_2(sizeof(struct page)) &&
> 		!altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
> #else
> 	return nr_pages;
> #endif
> }
> 

I would instead just simplify it to:

static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
					      unsigned long nr_pages)
{
	return IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP) &&
		is_power_of_2(sizeof(struct page)) &&
		!altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
}

As the ternary false condition already handles the non-optimized-vmemmap case.

With your vmemmap_can_optimize() helper perhaps:

static inline unsigned long compound_nr_pages(struct dev_pagemap *pgmap,
					      struct vmem_altmap *altmap,
					      unsigned long nr_pages)
{
	return vmemmap_can_optimize(altmap, pgmap) ?
			2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
}
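
As a rough sanity check on that constant (assuming a 4K PAGE_SIZE and a
64-byte struct page): 2 * (PAGE_SIZE / sizeof(struct page)) = 2 * 64 = 128,
i.e. only the head vmemmap page plus the first tail page carry unique struct
pages, so just those 128 entries need initializing instead of nr_pages.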


>>
>> 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps")
>>
>> [*] or so I thought before the next paragraph
>>
>>> But how vmemmap
>>> mappings are created is architecture-specific. For example, powerpc with hash
>>> translation doesn't have vmemmap mappings in the init_mm page table; instead
>>> they are bolted entries in the hardware page table.
>>>
>>
>> Very interesting, I didn't know this (no init_mm vmemmap) was an option. And my
>> apologies for regressing s390 :/ So, s390 does not support
>> vmemmap_populate_basepages() then, and always needs to go through the arch
>> vmemmap_populate() to create the vmemmap entries.
>>
>>> vmemmap_populate_compound_pages(), used by the vmemmap optimization code, is
>>> not aware of these architecture-specific mappings. Hence allow architectures to
>>> opt in to this feature. I selected architectures supporting the
>>> HUGETLB_PAGE_OPTIMIZE_VMEMMAP option as also supporting this feature. I added
>>> vmemmap_can_optimize() even though the pgmap_vmemmap_nr(pgmap) > 1 check should
>>> filter architectures not supporting this. IMHO that brings clarity to the code
>>> where we are populating vmemmap.
>>>
>> I am not sure the last two sentences are right. I would remove them; see the
>> comments above and at the end below for why.
> 
> That is true because I had vmemmap_shift = 0 if the arch didn't support
> vmemmap optimization. Based on your reply above, I have now moved that
> out and kept vmemmap_can_optimize().
> 
/me nods

> modified   mm/page_alloc.c
> @@ -6846,8 +6846,16 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
>  static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
>  					      unsigned long nr_pages)
>  {
> +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
> +	/*
> +	 * this depends on how vmemmap is populated by
> +	 * vmemmap_populate_compound_pages()
> +	 */
>  	return is_power_of_2(sizeof(struct page)) &&
>  		!altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
> +#else
> +	return nr_pages;
> +#endif
>  }
>  
>  static void __ref memmap_init_compound(struct page *head,
> modified   mm/sparse-vmemmap.c
> @@ -458,8 +458,7 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
>  		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
>  		return NULL;
>  
> -	if (is_power_of_2(sizeof(struct page)) &&
> -	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
> +	if (vmemmap_can_optimize(altmap, pgmap))
>  		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
>  	else
>  		r = vmemmap_populate(start, end, nid, altmap);
> 
> ....
> 
>>
>>> Fixes: c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages")
>>
>> It should be:
>>
>> Fixes: 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound
>> devmaps")
> 
> updated.
> 
>>
>> But if what you included in your patch was what led to the crash, then your
>> problem is not the vmemmap optimization ... as the latter came after. If it's
>> the hash I included above, it would reduce the affected trees to v5.19+
>>
>>> Cc: Joao Martins <joao.m.martins@oracle.com>
>>> Cc: Muchun Song <songmuchun@bytedance.com>
>>> Cc: Dan Williams <dan.j.williams@intel.com>
>>> Reported-by: Tarun Sahu <tsahu@linux.ibm.com>
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> ---
>>>  arch/arm64/Kconfig     |  1 +
>>>  arch/loongarch/Kconfig |  1 +
>>>  arch/s390/Kconfig      |  1 +
>>>  arch/x86/Kconfig       |  1 +
>>>  drivers/dax/device.c   |  3 ++-
>>>  include/linux/mm.h     | 16 ++++++++++++++++
>>>  mm/Kconfig             |  3 +++
>>>  mm/sparse-vmemmap.c    |  3 +--
>>>  8 files changed, 26 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index 27b2592698b0..d3f5945f0aff 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -103,6 +103,7 @@ config ARM64
>>>  	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>>>  	select ARCH_WANT_LD_ORPHAN_WARN
>>>  	select ARCH_WANTS_NO_INSTR
>>> +	select ARCH_WANT_OPTIMIZE_VMEMMAP
>>>  	select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES
>>>  	select ARCH_HAS_UBSAN_SANITIZE_ALL
>>>  	select ARM_AMBA
>>> diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
>>> index 9cc8b84f7eb0..ce5802066d0e 100644
>>> --- a/arch/loongarch/Kconfig
>>> +++ b/arch/loongarch/Kconfig
>>> @@ -56,6 +56,7 @@ config LOONGARCH
>>>  	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>>>  	select ARCH_WANT_LD_ORPHAN_WARN
>>>  	select ARCH_WANTS_NO_INSTR
>>> +	select ARCH_WANT_OPTIMIZE_VMEMMAP
>>>  	select BUILDTIME_TABLE_SORT
>>>  	select COMMON_CLK
>>>  	select CPU_PM
>>> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
>>> index 933771b0b07a..abffccd937b2 100644
>>> --- a/arch/s390/Kconfig
>>> +++ b/arch/s390/Kconfig
>>> @@ -127,6 +127,7 @@ config S390
>>>  	select ARCH_WANT_DEFAULT_BPF_JIT
>>>  	select ARCH_WANT_IPC_PARSE_VERSION
>>>  	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>>> +	select ARCH_WANT_OPTIMIZE_VMEMMAP
>>>  	select BUILDTIME_TABLE_SORT
>>>  	select CLONE_BACKWARDS2
>>>  	select DMA_OPS if PCI
>>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>>> index a825bf031f49..e8d66d834b4f 100644
>>> --- a/arch/x86/Kconfig
>>> +++ b/arch/x86/Kconfig
>>> @@ -127,6 +127,7 @@ config X86
>>>  	select ARCH_WANT_HUGE_PMD_SHARE
>>>  	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP	if X86_64
>>>  	select ARCH_WANT_LD_ORPHAN_WARN
>>> +	select ARCH_WANT_OPTIMIZE_VMEMMAP	if X86_64
>>>  	select ARCH_WANTS_THP_SWAP		if X86_64
>>>  	select ARCH_HAS_PARANOID_L1D_FLUSH
>>>  	select BUILDTIME_TABLE_SORT
>>
>> I would remove these and instead use the
>> ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, and have your other snip be a second
>> patch to drop the 'HUGETLB_' part
>>
>>> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
>>> index 5494d745ced5..05be8e79d64b 100644
>>> --- a/drivers/dax/device.c
>>> +++ b/drivers/dax/device.c
>>> @@ -448,7 +448,8 @@ int dev_dax_probe(struct dev_dax *dev_dax)
>>>  	}
>>>  
>>>  	pgmap->type = MEMORY_DEVICE_GENERIC;
>>> -	if (dev_dax->align > PAGE_SIZE)
>>> +	if (dev_dax->align > PAGE_SIZE &&
>>> +	    IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP))
>>>  		pgmap->vmemmap_shift =
>>>  			order_base_2(dev_dax->align >> PAGE_SHIFT);
>>
>> vmemmap_shift relates to the compound page size we will use in
>> memmap_init_zone_device(). Otherwise we are back to base struct pages in the
>> PMD/PUD case. Instead move the:
>>
>> 	IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP)
>>
>> further below to __populate_section_memmap() ...
>>
>>>  	addr = devm_memremap_pages(dev, pgmap);
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index 716d30d93616..fb71e21df23d 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -3442,6 +3442,22 @@ void vmemmap_populate_print_last(void);
>>>  void vmemmap_free(unsigned long start, unsigned long end,
>>>  		struct vmem_altmap *altmap);
>>>  #endif
>>> +
>>> +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
>>> +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
>>> +					   struct dev_pagemap *pgmap)
>>> +{
>>> +	return is_power_of_2(sizeof(struct page)) &&
>>> +		pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
>>> +}
>>> +#else
>>> +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
>>> +					   struct dev_pagemap *pgmap)
>>> +{
>>> +	return false;
>>> +}
>>> +#endif
>>> +
>>>  void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
>>>  				  unsigned long nr_pages);
>>>  
>>> diff --git a/mm/Kconfig b/mm/Kconfig
>>> index ff7b209dec05..99f87c1be1e8 100644
>>> --- a/mm/Kconfig
>>> +++ b/mm/Kconfig
>>> @@ -461,6 +461,9 @@ config SPARSEMEM_VMEMMAP
>>>  	  pfn_to_page and page_to_pfn operations.  This is the most
>>>  	  efficient option when sufficient kernel resources are available.
>>>  
>>> +config ARCH_WANT_OPTIMIZE_VMEMMAP
>>> +	bool
>>> +
>>>  config HAVE_MEMBLOCK_PHYS_MAP
>>>  	bool
>>>  
>> As you mentioned in your other followup email, it probably makes sense that you
>> just use ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if we switch this to per-arch,
>> and then have that Kconfig name be renamed into the above. It simplifies the fix
>> in case it gets picked up by stable trees (the dax vmemmap deduplication came
>> after hugetlb's). The head-page reuse case for hugetlb is anyway captured by a
>> hugetlb static key.
> 
> will do
> 
>>
>>
>>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
>>> index c5398a5960d0..10d73a0dfcec 100644
>>> --- a/mm/sparse-vmemmap.c
>>> +++ b/mm/sparse-vmemmap.c
>>> @@ -458,8 +458,7 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
>>>  		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
>>>  		return NULL;
>>>  
>>> -	if (is_power_of_2(sizeof(struct page)) &&
>>> -	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
>>> +	if (vmemmap_can_optimize(altmap, pgmap))
>>>  		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
>>>  	else
>>>  		r = vmemmap_populate(start, end, nid, altmap);
>>
>> Same comment(s) as further above. I think it should be:
>>
>> if (IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP))
>> 	r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
>> else
>> 	r = vmemmap_populate(start, end, nid, altmap);
> 
> Even if the arch wants vmemmap optimization, the ability to use
> vmemmap_populate_compound_pages() is further restricted by
> 
> 	if (is_power_of_2(sizeof(struct page)) &&
> 	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
> 
> Hence we still want if (vmemmap_can_optimize(..)), right?
> 
Correct. It would become:

if (IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP) &&
    is_power_of_2(sizeof(struct page)) &&
    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
	r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);

I suppose a helper could have this in vmemmap_can_optimize() as you suggest,
as the condition is getting more unreadable. Probably worth using it too
in compound_nr_pages(), but you would need to add a pgmap argument to that
function and change its caller.

vmemmap_can_optimize() is not really the greatest name, but I can't think of
anything better that aligns with the rest of the naming of this feature.
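
A minimal sketch of what that combined helper might look like (assuming the
ARCH_WANT_OPTIMIZE_VMEMMAP symbol and the (altmap, pgmap) argument order from
this patch):

static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
					struct dev_pagemap *pgmap)
{
	/* IS_ENABLED() folds the arch gate in, so no #ifdef stub is needed */
	return IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP) &&
		is_power_of_2(sizeof(struct page)) &&
		pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap;
}

That would let both __populate_section_memmap() and compound_nr_pages() share
the exact same gate.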

>>
>> In principle all these arches below can support
>> vmemmap_populate_compound_pages(), but now it would require them to enable it
>> explicitly, which I guess makes sense considering this bug you are fixing. But
>> the arch requirement is not really whether we support optimized vmemmap, but
>> whether we support the init-mm at all. OTOH tying both hugetlb and dax together
>> additionally eliminates confusion around an optimization shared by both, even
>> if the dax one could encompass more. I am not sure what the arch requirement
>> for hugetlb vmemmap opt is these days.
>>
>> arch/arm64/mm/mmu.c:            return vmemmap_populate_basepages(start, end, node, altmap);
>> arch/ia64/mm/discontig.c:       return vmemmap_populate_basepages(start, end, node, NULL);
>> arch/loongarch/mm/init.c:       return vmemmap_populate_basepages(start, end, node, NULL);
>> arch/riscv/mm/init.c:   return vmemmap_populate_basepages(start, end, node, NULL);
>> arch/x86/mm/init_64.c:          err = vmemmap_populate_basepages(start, end, node, NULL);
>> arch/x86/mm/init_64.c:          err = vmemmap_populate_basepages(start, end, node, NULL);

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 27b2592698b0..d3f5945f0aff 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -103,6 +103,7 @@ config ARM64
 	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_NO_INSTR
+	select ARCH_WANT_OPTIMIZE_VMEMMAP
 	select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARM_AMBA
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 9cc8b84f7eb0..ce5802066d0e 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -56,6 +56,7 @@ config LOONGARCH
 	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_NO_INSTR
+	select ARCH_WANT_OPTIMIZE_VMEMMAP
 	select BUILDTIME_TABLE_SORT
 	select COMMON_CLK
 	select CPU_PM
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 933771b0b07a..abffccd937b2 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -127,6 +127,7 @@ config S390
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+	select ARCH_WANT_OPTIMIZE_VMEMMAP
 	select BUILDTIME_TABLE_SORT
 	select CLONE_BACKWARDS2
 	select DMA_OPS if PCI
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a825bf031f49..e8d66d834b4f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -127,6 +127,7 @@ config X86
 	select ARCH_WANT_HUGE_PMD_SHARE
 	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP	if X86_64
 	select ARCH_WANT_LD_ORPHAN_WARN
+	select ARCH_WANT_OPTIMIZE_VMEMMAP	if X86_64
 	select ARCH_WANTS_THP_SWAP		if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select BUILDTIME_TABLE_SORT
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 5494d745ced5..05be8e79d64b 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -448,7 +448,8 @@ int dev_dax_probe(struct dev_dax *dev_dax)
 	}
 
 	pgmap->type = MEMORY_DEVICE_GENERIC;
-	if (dev_dax->align > PAGE_SIZE)
+	if (dev_dax->align > PAGE_SIZE &&
+	    IS_ENABLED(CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP))
 		pgmap->vmemmap_shift =
 			order_base_2(dev_dax->align >> PAGE_SHIFT);
 	addr = devm_memremap_pages(dev, pgmap);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 716d30d93616..fb71e21df23d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3442,6 +3442,22 @@ void vmemmap_populate_print_last(void);
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap);
 #endif
+
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+					   struct dev_pagemap *pgmap)
+{
+	return is_power_of_2(sizeof(struct page)) &&
+		pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+}
+#else
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+					   struct dev_pagemap *pgmap)
+{
+	return false;
+}
+#endif
+
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
 				  unsigned long nr_pages);
 
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209dec05..99f87c1be1e8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -461,6 +461,9 @@ config SPARSEMEM_VMEMMAP
 	  pfn_to_page and page_to_pfn operations.  This is the most
 	  efficient option when sufficient kernel resources are available.
 
+config ARCH_WANT_OPTIMIZE_VMEMMAP
+	bool
+
 config HAVE_MEMBLOCK_PHYS_MAP
 	bool
 
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c5398a5960d0..10d73a0dfcec 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -458,8 +458,7 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
 		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
 		return NULL;
 
-	if (is_power_of_2(sizeof(struct page)) &&
-	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
+	if (vmemmap_can_optimize(altmap, pgmap))
 		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
 	else
 		r = vmemmap_populate(start, end, nid, altmap);