Message ID | 20190109203911.7887-3-logang@deltatee.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | sparsemem support for RISC-V | expand |
Looks fine,
Reviewed-by: Christoph Hellwig <hch@lst.de>
Hi Logan, Logan Gunthorpe <logang@deltatee.com> 於 2019年1月10日 週四 上午5:07寫道: > > This patch implements sparsemem support for risc-v which helps pave the > way for memory hotplug and eventually P2P support. > > We introduce Kconfig options for virtual and physical address bits which > are used to calculate the size of the vmemmap and set the > MAX_PHYSMEM_BITS. > > The vmemmap is located directly before the VMALLOC region and sized > such that we can allocate enough pages to populate all the virtual > address space in the system (similar to the way it's done in arm64). > > During initialization, call memblocks_present() and sparse_init(), > and provide a stub for vmemmap_populate() (all of which is similar to > arm64). > > Signed-off-by: Logan Gunthorpe <logang@deltatee.com> > Reviewed-by: Palmer Dabbelt <palmer@sifive.com> > Cc: Albert Ou <aou@eecs.berkeley.edu> > Cc: Andrew Waterman <andrew@sifive.com> > Cc: Olof Johansson <olof@lixom.net> > Cc: Michael Clark <michaeljclark@mac.com> > Cc: Rob Herring <robh@kernel.org> > Cc: Zong Li <zong@andestech.com> > --- > arch/riscv/Kconfig | 23 +++++++++++++++++++++++ > arch/riscv/include/asm/pgtable.h | 21 +++++++++++++++++---- > arch/riscv/include/asm/sparsemem.h | 11 +++++++++++ > arch/riscv/kernel/setup.c | 4 +++- > arch/riscv/mm/init.c | 8 ++++++++ > 5 files changed, 62 insertions(+), 5 deletions(-) > create mode 100644 arch/riscv/include/asm/sparsemem.h > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index e0d7d61779a6..bd659327bc6b 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -54,12 +54,32 @@ config ZONE_DMA32 > bool > default y if 64BIT > > +config VA_BITS > + int > + default 32 if 32BIT > + default 39 if 64BIT > + > +config PA_BITS > + int > + default 34 if 32BIT > + default 56 if 64BIT > + > config PAGE_OFFSET > hex > default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB > default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB > default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB > > +config ARCH_FLATMEM_ENABLE > + def_bool y > + > +config ARCH_SPARSEMEM_ENABLE > + def_bool y > + select SPARSEMEM_VMEMMAP_ENABLE > + > +config ARCH_SELECT_MEMORY_MODEL > + def_bool ARCH_SPARSEMEM_ENABLE > + > config STACKTRACE_SUPPORT > def_bool y > > @@ -94,6 +114,9 @@ config PGTABLE_LEVELS > config HAVE_KPROBES > def_bool n > > +config HAVE_ARCH_PFN_VALID > + def_bool y > + > menu "Platform type" > > choice > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > index 16301966d65b..e1162336f5ea 100644 > --- a/arch/riscv/include/asm/pgtable.h > +++ b/arch/riscv/include/asm/pgtable.h > @@ -89,6 +89,23 @@ extern pgd_t swapper_pg_dir[]; > #define __S110 PAGE_SHARED_EXEC > #define __S111 PAGE_SHARED_EXEC > > +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) > +#define VMALLOC_END (PAGE_OFFSET - 1) > +#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) > + > +/* > + * Roughly size the vmemmap space to be large enough to fit enough > + * struct pages to map half the virtual address space. Then > + * position vmemmap directly below the VMALLOC region. > + */ > +#define VMEMMAP_SHIFT \ > + (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) > +#define VMEMMAP_SIZE (1UL << VMEMMAP_SHIFT) > +#define VMEMMAP_END (VMALLOC_START - 1) > +#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) > + > +#define vmemmap ((struct page *)VMEMMAP_START) > + > /* > * ZERO_PAGE is a global shared page that is always zero, > * used for zero-mapped memory areas, etc. > @@ -411,10 +428,6 @@ static inline void pgtable_cache_init(void) > /* No page table caches to initialize */ > } > > -#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) > -#define VMALLOC_END (PAGE_OFFSET - 1) > -#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) > - > /* > * Task size is 0x40000000000 for RV64 or 0xb800000 for RV32. > * Note that PGDIR_SIZE must evenly divide TASK_SIZE. > diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h > new file mode 100644 > index 000000000000..b58ba2d9ed6e > --- /dev/null > +++ b/arch/riscv/include/asm/sparsemem.h > @@ -0,0 +1,11 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > + > +#ifndef __ASM_SPARSEMEM_H > +#define __ASM_SPARSEMEM_H > + > +#ifdef CONFIG_SPARSEMEM > +#define MAX_PHYSMEM_BITS CONFIG_PA_BITS > +#define SECTION_SIZE_BITS 27 > +#endif /* CONFIG_SPARSEMEM */ > + > +#endif /* __ASM_SPARSEMEM_H */ > diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c > index fc8006a042eb..98f39adefb1a 100644 > --- a/arch/riscv/kernel/setup.c > +++ b/arch/riscv/kernel/setup.c > @@ -193,6 +193,9 @@ static void __init setup_bootmem(void) > PFN_PHYS(end_pfn - start_pfn), > &memblock.memory, 0); > } > + > + memblocks_present(); > + sparse_init(); > } I just applied this patch to Linux kernel 5.2. I used a dts with 2 memory nodes with hole int it. memory@80000000 { device_type = "memory"; reg = <0x0 0x80000000 0x0 0x40000000>; }; memory@180000000 { device_type = "memory"; reg = <0x1 0x80000000 0x0 0x40000000>; }; I found it will boot failure. Did I miss anything? [ 0.000000] Sorting __ex_table... [ 0.000000] BUG: Bad page state in process swapper pfn:180001 [ 0.000000] page:ffffffcf05400038 refcount:0 mapcount:94371937 mapping:00000000ffffffff index:0x4000000000000000 [ 0.000000] anon [ 0.000000] flags: 0x0() [ 0.000000] raw: 0000000000000000 0000000000000000 0000000000000000 00000000ffffffff [ 0.000000] raw: 4000000000000000 ffffffcf05a00060 0000000005a00060 [ 0.000000] page dumped because: non-NULL mapping [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.2.0-00001-g737d8214d9a9 #3 [ 0.000000] Call Trace: [ 0.000000] [<ffffffe00017759c>] walk_stackframe+0x0/0xa0 [ 0.000000] [<ffffffe00017769c>] show_stack+0x2a/0x34 [ 0.000000] [<ffffffe00070c53e>] dump_stack+0x62/0x7c [ 0.000000] [<ffffffe0002330ae>] bad_page+0xca/0x120 [ 0.000000] [<ffffffe00023313c>] free_pages_check_bad+0x38/0x7a [ 0.000000] [<ffffffe00023368a>] __free_pages_ok+0x496/0x4ba [ 0.000000] [<ffffffe000234a82>] __free_pages.part.4+0xe/0x22 [ 0.000000] [<ffffffe000234c9e>] __free_pages_core+0x9a/0xa6 [ 0.000000] [<ffffffe000009b0a>] memblock_free_pages+0x12/0x1a [ 0.000000] [<ffffffe00000b496>] memblock_free_all+0x144/0x1a8 [ 0.000000] [<ffffffe00000274a>] mem_init+0x28/0x36 [ 0.000000] [<ffffffe0000008a0>] start_kernel+0x1bc/0x360 [ 0.000000] [<ffffffe000000074>] clear_bss_done+0x34/0x38 [ 0.000000] Disabling lock debugging due to kernel taint [ 0.000000] BUG: Bad page state in process swapper pfn:180002 [ 0.000000] page:ffffffcf05400070 refcount:0 mapcount:94371993 mapping:00000000ffffffff index:0x4000000000000000 [ 0.000000] anon [ 0.000000] flags: 0x0() [ 0.000000] raw: 0000000000000000 0000000000000000 0000000000000000 00000000ffffffff [ 0.000000] raw: 4000000000000000 ffffffcf05a00098 0000000005a00098 [ 0.000000] page dumped because: non-NULL mapping [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G B 5.2.0-00001-g737d8214d9a9 #3 [ 0.000000] Call Trace: [ 0.000000] [<ffffffe00017759c>] walk_stackframe+0x0/0xa0 [ 0.000000] [<ffffffe00017769c>] show_stack+0x2a/0x34 [ 0.000000] [<ffffffe00070c53e>] dump_stack+0x62/0x7c [ 0.000000] [<ffffffe0002330ae>] bad_page+0xca/0x120 [ 0.000000] [<ffffffe00023313c>] free_pages_check_bad+0x38/0x7a [ 0.000000] [<ffffffe00023368a>] __free_pages_ok+0x496/0x4ba [ 0.000000] [<ffffffe000234a82>] __free_pages.part.4+0xe/0x22 [ 0.000000] [<ffffffe000234c9e>] __free_pages_core+0x9a/0xa6 [ 0.000000] [<ffffffe000009b0a>] memblock_free_pages+0x12/0x1a [ 0.000000] [<ffffffe00000b496>] memblock_free_all+0x144/0x1a8 [ 0.000000] [<ffffffe00000274a>] mem_init+0x28/0x36 [ 0.000000] [<ffffffe0000008a0>] start_kernel+0x1bc/0x360 [ 0.000000] [<ffffffe000000074>] clear_bss_done+0x34/0x38 [ 0.000000] BUG: Bad page state in process swapper pfn:180003 [ 0.000000] page:ffffffcf054000a8 refcount:0 mapcount:94372049 mapping:00000000ffffffff index:0x4000000000000000 [ 0.000000] anon [ 0.000000] flags: 0x0() [ 0.000000] raw: 0000000000000000 0000000000000000 0000000000000000 00000000ffffffff [ 0.000000] raw: 4000000000000000 ffffffcf05a000d0 0000000005a000d0 [ 0.000000] page dumped because: non-NULL mapping I look this issue more closely. I found it always sets each memblock region to node 0. Does this make sense? I am not sure if I understand this correctly. Do you have any idea for this? Thank you. :) for_each_memblock(memory, reg) { unsigned long start_pfn = memblock_region_memory_base_pfn(reg); unsigned long end_pfn = memblock_region_memory_end_pfn(reg); memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), &memblock.memory, 0); ^^^ } [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000080200000-0x00000000bfffffff] [ 0.000000] node 0: [mem 0x0000000180000000-0x00000001bfffffff] [ 0.000000] Initmem setup node 0 [mem 0x0000000080200000-0x00000001bfffffff]
On 2019-07-31 12:30 a.m., Greentime Hu wrote: > I look this issue more closely. > I found it always sets each memblock region to node 0. Does this make sense? > I am not sure if I understand this correctly. Do you have any idea for > this? Thank you. :) Yes, I think this is normal. When we talk about memory nodes we're talking about NUMA nodes which is unrelated to device tree nodes. I'm not really sure what's causing the crash. Have you verified it's this patch that causes it? Is it related to there being a hole in your memory, does it work if you only have one memory node? Thanks, Logan
Hi Logan, Logan Gunthorpe <logang@deltatee.com> 於 2019年8月1日 週四 上午1:08寫道: > > > > On 2019-07-31 12:30 a.m., Greentime Hu wrote: > > I look this issue more closely. > > I found it always sets each memblock region to node 0. Does this make sense? > > I am not sure if I understand this correctly. Do you have any idea for > > this? Thank you. :) > > Yes, I think this is normal. When we talk about memory nodes we're > talking about NUMA nodes which is unrelated to device tree nodes. Ok, but it seems the second memblock_region may overwrite the first memblock_region in for_each_memblock(memory, reg) loop. It always uses this API to set to node 0. memblock_set_node(PFN_PHYS(start_pfn),PFN_PHYS(end_pfn - start_pfn), &memblock.memory,0) > I'm not really sure what's causing the crash. Have you verified it's > this patch that causes it? Is it related to there being a hole in your > memory, does it work if you only have one memory node? > It works fine if there is only one memory node described in dts. I think it is related to there being a hole in the device tree script. I don't actually have a platform with a hole in the memory region, so I use device tree script to describe it. The physical address layout will be like this. 2GB-3GB-hole-6GB-7GB memory@80000000 { device_type = "memory"; reg = <0x0 0x80000000 0x0 0x40000000>; }; memory@180000000 { device_type = "memory"; reg = <0x1 0x80000000 0x0 0x40000000>; }; Thank you for the quick reply. :)
On 2019-08-08 10:23 p.m., Greentime Hu wrote: > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index 3f12b069af1d..208b3e14ccd8 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -116,7 +116,8 @@ config PGTABLE_LEVELS > default 2 > > config HAVE_ARCH_PFN_VALID > - def_bool y > + bool > + default !SPARSEMEM_VMEMMAP > > menu "Platform type" > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index 8ddb6c7fedac..6991f7a5a4a7 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn; > #define virt_to_pfn(vaddr) (phys_to_pfn(__pa(vaddr))) > #define pfn_to_virt(pfn) (__va(pfn_to_phys(pfn))) > > +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) > +#define pfn_valid(pfn) \ > + (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) > #define virt_to_page(vaddr) (pfn_to_page(virt_to_pfn(vaddr))) > #define page_to_virt(page) (pfn_to_virt(page_to_pfn(page))) > +#else > +#define virt_to_page(vaddr) ((struct page *)((((u64)vaddr - > va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START)) > +#define page_to_virt(pg) ((void *)(((((u64)pg - VMEMMAP_START) / > sizeof(struct page)) * PAGE_SIZE) + va_pa_offset)) > +#endif This doesn't make sense to me at all. It should always use pfn_to_page() for virt_to_page() and the generic pfn_to_page()/page_to_pfn() implementations essentially already do what you are doing in a cleaner way. So I'd be really surprised if this does anything at all. Logan
Hi Logan, Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道: > > > > On 2019-08-08 10:23 p.m., Greentime Hu wrote: > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > > index 3f12b069af1d..208b3e14ccd8 100644 > > --- a/arch/riscv/Kconfig > > +++ b/arch/riscv/Kconfig > > @@ -116,7 +116,8 @@ config PGTABLE_LEVELS > > default 2 > > > > config HAVE_ARCH_PFN_VALID > > - def_bool y > > + bool > > + default !SPARSEMEM_VMEMMAP > > > > menu "Platform type" > > > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > > index 8ddb6c7fedac..6991f7a5a4a7 100644 > > --- a/arch/riscv/include/asm/page.h > > +++ b/arch/riscv/include/asm/page.h > > @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn; > > #define virt_to_pfn(vaddr) (phys_to_pfn(__pa(vaddr))) > > #define pfn_to_virt(pfn) (__va(pfn_to_phys(pfn))) > > > > +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) > > +#define pfn_valid(pfn) \ > > + (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) > > #define virt_to_page(vaddr) (pfn_to_page(virt_to_pfn(vaddr))) > > #define page_to_virt(page) (pfn_to_virt(page_to_pfn(page))) > > +#else > > +#define virt_to_page(vaddr) ((struct page *)((((u64)vaddr - > > va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START)) > > +#define page_to_virt(pg) ((void *)(((((u64)pg - VMEMMAP_START) / > > sizeof(struct page)) * PAGE_SIZE) + va_pa_offset)) > > +#endif > > This doesn't make sense to me at all. It should always use pfn_to_page() > for virt_to_page() and the generic pfn_to_page()/page_to_pfn() > implementations essentially already do what you are doing in a cleaner > way. So I'd be really surprised if this does anything at all. > Thank you for point me out that. I just checked the generic implementation and I should use that one. Sorry I didn't check the generic one and just implement it again. I think the only patch we need is the first part to use generic pfn_valid(). I just tested it and yes it can boot successfully in dts with hole. It will fail in this check ((pfn)-pfn_base) < max_mapnr. In my test case it will be ((6GB>>PAGE_SHIFT)-(2GB>>PAGE_SHIFT)=(4GB>>PAGE_SHIFT) < (3GB>>PAGE_SHIFT) to return false. memory@80000000 { device_type = "memory"; reg = <0x0 0x80000000 0x0 0x80000000>; }; memory@180000000 { device_type = "memory"; reg = <0x1 0x80000000 0x0 0x40000000>; }; To cause the check here failed to initialize page struct. for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { if (!early_pfn_valid(pfn)) continue; if (!early_pfn_in_nid(pfn, nid)) continue; if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) break; } page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); ------------------------------------------------------------------------------ diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 3f12b069af1d..208b3e14ccd8 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -116,7 +116,8 @@ config PGTABLE_LEVELS default 2 config HAVE_ARCH_PFN_VALID - def_bool y + bool + default !SPARSEMEM_VMEMMAP menu "Platform type" diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 8ddb6c7fedac..80d28fa1e2eb 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn; #define page_to_bus(page) (page_to_phys(page)) #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) #define pfn_valid(pfn) \ (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) +#endif #define ARCH_PFN_OFFSET (pfn_base)
On 2019-08-09 11:01 a.m., Greentime Hu wrote: > Hi Logan, > > Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道: >> >> >> >> On 2019-08-08 10:23 p.m., Greentime Hu wrote: >>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >>> index 3f12b069af1d..208b3e14ccd8 100644 >>> --- a/arch/riscv/Kconfig >>> +++ b/arch/riscv/Kconfig >>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS >>> default 2 >>> >>> config HAVE_ARCH_PFN_VALID >>> - def_bool y >>> + bool >>> + default !SPARSEMEM_VMEMMAP >>> >>> menu "Platform type" >>> >>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h >>> index 8ddb6c7fedac..6991f7a5a4a7 100644 >>> --- a/arch/riscv/include/asm/page.h >>> +++ b/arch/riscv/include/asm/page.h >>> @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn; >>> #define virt_to_pfn(vaddr) (phys_to_pfn(__pa(vaddr))) >>> #define pfn_to_virt(pfn) (__va(pfn_to_phys(pfn))) >>> >>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) >>> +#define pfn_valid(pfn) \ >>> + (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) >>> #define virt_to_page(vaddr) (pfn_to_page(virt_to_pfn(vaddr))) >>> #define page_to_virt(page) (pfn_to_virt(page_to_pfn(page))) >>> +#else >>> +#define virt_to_page(vaddr) ((struct page *)((((u64)vaddr - >>> va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START)) >>> +#define page_to_virt(pg) ((void *)(((((u64)pg - VMEMMAP_START) / >>> sizeof(struct page)) * PAGE_SIZE) + va_pa_offset)) >>> +#endif >> >> This doesn't make sense to me at all. It should always use pfn_to_page() >> for virt_to_page() and the generic pfn_to_page()/page_to_pfn() >> implementations essentially already do what you are doing in a cleaner >> way. So I'd be really surprised if this does anything at all. >> > > Thank you for point me out that. I just checked the generic > implementation and I should use that one. > Sorry I didn't check the generic one and just implement it again. > I think the only patch we need is the first part to use generic > pfn_valid(). I just tested it and yes it can boot successfully in dts > with hole. > > It will fail in this check ((pfn)-pfn_base) < max_mapnr. Sounds to me like max_mapnr is not set correctly. See the code in setup_bootmem(). Seems like 'mem_size' should be set to the largest memory block, not just the one that contains the kernel... > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index 3f12b069af1d..208b3e14ccd8 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -116,7 +116,8 @@ config PGTABLE_LEVELS > default 2 > > config HAVE_ARCH_PFN_VALID > - def_bool y > + bool > + default !SPARSEMEM_VMEMMAP > > menu "Platform type" > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index 8ddb6c7fedac..80d28fa1e2eb 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn; > #define page_to_bus(page) (page_to_phys(page)) > #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) > > +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) > #define pfn_valid(pfn) \ > (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) > +#endif > > #define ARCH_PFN_OFFSET (pfn_base) This patch still makes no sense. I'm not sure why we have an arch specific pfn_valid() because it's very similar to the generic one. But my guess is there's a reason for it and it's not doing what it is supposed when you remove it for the sparsemem case. Logan
Hi Logan, Logan Gunthorpe <logang@deltatee.com> 於 2019年8月10日 週六 上午3:03寫道: > > > > On 2019-08-09 11:01 a.m., Greentime Hu wrote: > > Hi Logan, > > > > Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道: > >> > >> > >> > >> On 2019-08-08 10:23 p.m., Greentime Hu wrote: > >>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > >>> index 3f12b069af1d..208b3e14ccd8 100644 > >>> --- a/arch/riscv/Kconfig > >>> +++ b/arch/riscv/Kconfig > >>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS > >>> default 2 > >>> > >>> config HAVE_ARCH_PFN_VALID > >>> - def_bool y > >>> + bool > >>> + default !SPARSEMEM_VMEMMAP > >>> > >>> menu "Platform type" > >>> > >>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > >>> index 8ddb6c7fedac..6991f7a5a4a7 100644 > >>> --- a/arch/riscv/include/asm/page.h > >>> +++ b/arch/riscv/include/asm/page.h > >>> @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn; > >>> #define virt_to_pfn(vaddr) (phys_to_pfn(__pa(vaddr))) > >>> #define pfn_to_virt(pfn) (__va(pfn_to_phys(pfn))) > >>> > >>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) > >>> +#define pfn_valid(pfn) \ > >>> + (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) > >>> #define virt_to_page(vaddr) (pfn_to_page(virt_to_pfn(vaddr))) > >>> #define page_to_virt(page) (pfn_to_virt(page_to_pfn(page))) > >>> +#else > >>> +#define virt_to_page(vaddr) ((struct page *)((((u64)vaddr - > >>> va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START)) > >>> +#define page_to_virt(pg) ((void *)(((((u64)pg - VMEMMAP_START) / > >>> sizeof(struct page)) * PAGE_SIZE) + va_pa_offset)) > >>> +#endif > >> > >> This doesn't make sense to me at all. It should always use pfn_to_page() > >> for virt_to_page() and the generic pfn_to_page()/page_to_pfn() > >> implementations essentially already do what you are doing in a cleaner > >> way. So I'd be really surprised if this does anything at all. > >> > > > > Thank you for point me out that. I just checked the generic > > implementation and I should use that one. > > Sorry I didn't check the generic one and just implement it again. > > I think the only patch we need is the first part to use generic > > pfn_valid(). I just tested it and yes it can boot successfully in dts > > with hole. > > > > It will fail in this check ((pfn)-pfn_base) < max_mapnr. > > Sounds to me like max_mapnr is not set correctly. See the code in > setup_bootmem(). Seems like 'mem_size' should be set to the largest > memory block, not just the one that contains the kernel... > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > > index 3f12b069af1d..208b3e14ccd8 100644 > > --- a/arch/riscv/Kconfig > > +++ b/arch/riscv/Kconfig > > @@ -116,7 +116,8 @@ config PGTABLE_LEVELS > > default 2 > > > > config HAVE_ARCH_PFN_VALID > > - def_bool y > > + bool > > + default !SPARSEMEM_VMEMMAP > > > > menu "Platform type" > > > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > > index 8ddb6c7fedac..80d28fa1e2eb 100644 > > --- a/arch/riscv/include/asm/page.h > > +++ b/arch/riscv/include/asm/page.h > > @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn; > > #define page_to_bus(page) (page_to_phys(page)) > > #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) > > > > +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) > > #define pfn_valid(pfn) \ > > (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) > > +#endif > > > > #define ARCH_PFN_OFFSET (pfn_base) > > > This patch still makes no sense. I'm not sure why we have an arch > specific pfn_valid() because it's very similar to the generic one. But > my guess is there's a reason for it and it's not doing what it is > supposed when you remove it for the sparsemem case. It will use another pfn_valid() implementation in include/linux/mmzone.h if CONFIG_SPARSEMEM and !CONFIG_HAVE_ARCH_PFN_VALID It will be this one. static inline int pfn_valid(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); } This generic pfn_valid() API can check the pfn is valid or not even if there a hole in the memory. For example: A hole is between 0x100000000 to 0x180000000 (4GB-6GB) in my dts test case. [ 0.000000] In setup_bootmem, pfn_valid(0x180000)=1 [ 0.000000] In setup_bootmem, pfn_valid(0x80000)=1 [ 0.000000] In setup_bootmem, pfn_valid(0x80200)=1 [ 0.000000] In setup_bootmem, pfn_valid(0x80300)=1 [ 0.000000] In setup_bootmem, pfn_valid(0x160000)=0 [ 0.000000] In setup_bootmem, pfn_valid(0x17ffff)=0 [ 0.000000] In setup_bootmem, pfn_valid(0x120000)=0 [ 0.000000] In setup_bootmem, pfn_valid(0x100000)=0 [ 0.000000] In setup_bootmem, pfn_valid(0xfffff)=1 This generic pfn_valid() could tell the pfn is valid or not. I think this one is only available for flatmem. #define pfn_valid(pfn) (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
On 2019-08-11 10:01 p.m., Greentime Hu wrote: > Hi Logan, > > Logan Gunthorpe <logang@deltatee.com> 於 2019年8月10日 週六 上午3:03寫道: >> >> >> >> On 2019-08-09 11:01 a.m., Greentime Hu wrote: >>> Hi Logan, >>> >>> Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道: >>>> >>>> >>>> >>>> On 2019-08-08 10:23 p.m., Greentime Hu wrote: >>>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >>>>> index 3f12b069af1d..208b3e14ccd8 100644 >>>>> --- a/arch/riscv/Kconfig >>>>> +++ b/arch/riscv/Kconfig >>>>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS >>>>> default 2 >>>>> >>>>> config HAVE_ARCH_PFN_VALID >>>>> - def_bool y >>>>> + bool >>>>> + default !SPARSEMEM_VMEMMAP >>>>> >>>>> menu "Platform type" >>>>> >>>>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h >>>>> index 8ddb6c7fedac..6991f7a5a4a7 100644 >>>>> --- a/arch/riscv/include/asm/page.h >>>>> +++ b/arch/riscv/include/asm/page.h >>>>> @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn; >>>>> #define virt_to_pfn(vaddr) (phys_to_pfn(__pa(vaddr))) >>>>> #define pfn_to_virt(pfn) (__va(pfn_to_phys(pfn))) >>>>> >>>>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) >>>>> +#define pfn_valid(pfn) \ >>>>> + (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) >>>>> #define virt_to_page(vaddr) (pfn_to_page(virt_to_pfn(vaddr))) >>>>> #define page_to_virt(page) (pfn_to_virt(page_to_pfn(page))) >>>>> +#else >>>>> +#define virt_to_page(vaddr) ((struct page *)((((u64)vaddr - >>>>> va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START)) >>>>> +#define page_to_virt(pg) ((void *)(((((u64)pg - VMEMMAP_START) / >>>>> sizeof(struct page)) * PAGE_SIZE) + va_pa_offset)) >>>>> +#endif >>>> >>>> This doesn't make sense to me at all. It should always use pfn_to_page() >>>> for virt_to_page() and the generic pfn_to_page()/page_to_pfn() >>>> implementations essentially already do what you are doing in a cleaner >>>> way. So I'd be really surprised if this does anything at all. >>>> >>> >>> Thank you for point me out that. I just checked the generic >>> implementation and I should use that one. >>> Sorry I didn't check the generic one and just implement it again. >>> I think the only patch we need is the first part to use generic >>> pfn_valid(). I just tested it and yes it can boot successfully in dts >>> with hole. >>> >>> It will fail in this check ((pfn)-pfn_base) < max_mapnr. >> >> Sounds to me like max_mapnr is not set correctly. See the code in >> setup_bootmem(). Seems like 'mem_size' should be set to the largest >> memory block, not just the one that contains the kernel... >> >> >>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >>> index 3f12b069af1d..208b3e14ccd8 100644 >>> --- a/arch/riscv/Kconfig >>> +++ b/arch/riscv/Kconfig >>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS >>> default 2 >>> >>> config HAVE_ARCH_PFN_VALID >>> - def_bool y >>> + bool >>> + default !SPARSEMEM_VMEMMAP >>> >>> menu "Platform type" >>> >>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h >>> index 8ddb6c7fedac..80d28fa1e2eb 100644 >>> --- a/arch/riscv/include/asm/page.h >>> +++ b/arch/riscv/include/asm/page.h >>> @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn; >>> #define page_to_bus(page) (page_to_phys(page)) >>> #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) >>> >>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) >>> #define pfn_valid(pfn) \ >>> (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) >>> +#endif >>> >>> #define ARCH_PFN_OFFSET (pfn_base) >> >> >> This patch still makes no sense. I'm not sure why we have an arch >> specific pfn_valid() because it's very similar to the generic one. But >> my guess is there's a reason for it and it's not doing what it is >> supposed when you remove it for the sparsemem case. > > It will use another pfn_valid() implementation in > include/linux/mmzone.h if CONFIG_SPARSEMEM and > !CONFIG_HAVE_ARCH_PFN_VALID > It will be this one. > > static inline int pfn_valid(unsigned long pfn) > { > if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) > return 0; > return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); > } Ah, ok I see. "page.h" is only included in no-mmu arches. Which explains why riscv re-implements that macro. Couple follow up questions then: * Did you test the memory-with-hole scenario without the sparsemem patches? It seems pfn_valid() will be wrong regardless of sparse/flat mem. * Any chance we can just use the generic pfn_valid() function in all cases not just sparsemem? Can you test that? Thanks, Logan
Hi Logan, Logan Gunthorpe <logang@deltatee.com> 於 2019年8月12日 週一 下午11:52寫道: > > > > On 2019-08-11 10:01 p.m., Greentime Hu wrote: > > Hi Logan, > > > > Logan Gunthorpe <logang@deltatee.com> 於 2019年8月10日 週六 上午3:03寫道: > >> > >> > >> > >> On 2019-08-09 11:01 a.m., Greentime Hu wrote: > >>> Hi Logan, > >>> > >>> Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道: > >>>> > >>>> > >>>> > >>>> On 2019-08-08 10:23 p.m., Greentime Hu wrote: > >>>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > >>>>> index 3f12b069af1d..208b3e14ccd8 100644 > >>>>> --- a/arch/riscv/Kconfig > >>>>> +++ b/arch/riscv/Kconfig > >>>>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS > >>>>> default 2 > >>>>> > >>>>> config HAVE_ARCH_PFN_VALID > >>>>> - def_bool y > >>>>> + bool > >>>>> + default !SPARSEMEM_VMEMMAP > >>>>> > >>>>> menu "Platform type" > >>>>> > >>>>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > >>>>> index 8ddb6c7fedac..6991f7a5a4a7 100644 > >>>>> --- a/arch/riscv/include/asm/page.h > >>>>> +++ b/arch/riscv/include/asm/page.h > >>>>> @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn; > >>>>> #define virt_to_pfn(vaddr) (phys_to_pfn(__pa(vaddr))) > >>>>> #define pfn_to_virt(pfn) (__va(pfn_to_phys(pfn))) > >>>>> > >>>>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) > >>>>> +#define pfn_valid(pfn) \ > >>>>> + (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) > >>>>> #define virt_to_page(vaddr) (pfn_to_page(virt_to_pfn(vaddr))) > >>>>> #define page_to_virt(page) (pfn_to_virt(page_to_pfn(page))) > >>>>> +#else > >>>>> +#define virt_to_page(vaddr) ((struct page *)((((u64)vaddr - > >>>>> va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START)) > >>>>> +#define page_to_virt(pg) ((void *)(((((u64)pg - VMEMMAP_START) / > >>>>> sizeof(struct page)) * PAGE_SIZE) + va_pa_offset)) > >>>>> +#endif > >>>> > >>>> This doesn't make sense to me at all. It should always use pfn_to_page() > >>>> for virt_to_page() and the generic pfn_to_page()/page_to_pfn() > >>>> implementations essentially already do what you are doing in a cleaner > >>>> way. So I'd be really surprised if this does anything at all. > >>>> > >>> > >>> Thank you for point me out that. I just checked the generic > >>> implementation and I should use that one. > >>> Sorry I didn't check the generic one and just implement it again. > >>> I think the only patch we need is the first part to use generic > >>> pfn_valid(). I just tested it and yes it can boot successfully in dts > >>> with hole. > >>> > >>> It will fail in this check ((pfn)-pfn_base) < max_mapnr. > >> > >> Sounds to me like max_mapnr is not set correctly. See the code in > >> setup_bootmem(). Seems like 'mem_size' should be set to the largest > >> memory block, not just the one that contains the kernel... > >> > >> > >>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > >>> index 3f12b069af1d..208b3e14ccd8 100644 > >>> --- a/arch/riscv/Kconfig > >>> +++ b/arch/riscv/Kconfig > >>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS > >>> default 2 > >>> > >>> config HAVE_ARCH_PFN_VALID > >>> - def_bool y > >>> + bool > >>> + default !SPARSEMEM_VMEMMAP > >>> > >>> menu "Platform type" > >>> > >>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > >>> index 8ddb6c7fedac..80d28fa1e2eb 100644 > >>> --- a/arch/riscv/include/asm/page.h > >>> +++ b/arch/riscv/include/asm/page.h > >>> @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn; > >>> #define page_to_bus(page) (page_to_phys(page)) > >>> #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) > >>> > >>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) > >>> #define pfn_valid(pfn) \ > >>> (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) > >>> +#endif > >>> > >>> #define ARCH_PFN_OFFSET (pfn_base) > >> > >> > >> This patch still makes no sense. I'm not sure why we have an arch > >> specific pfn_valid() because it's very similar to the generic one. But > >> my guess is there's a reason for it and it's not doing what it is > >> supposed when you remove it for the sparsemem case. > > > > It will use another pfn_valid() implementation in > > include/linux/mmzone.h if CONFIG_SPARSEMEM and > > !CONFIG_HAVE_ARCH_PFN_VALID > > It will be this one. > > > > static inline int pfn_valid(unsigned long pfn) > > { > > if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) > > return 0; > > return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); > > } > > Ah, ok I see. "page.h" is only included in no-mmu arches. Which explains > why riscv re-implements that macro. Couple follow up questions then: > > * Did you test the memory-with-hole scenario without the sparsemem > patches? It seems pfn_valid() will be wrong regardless of sparse/flat mem. > > * Any chance we can just use the generic pfn_valid() function in all > cases not just sparsemem? Can you test that? > I think flat mem doesn't support memory-with-hole scenario. In mm/Kconfig, it says " For systems that have holes in their physical address spaces and for features like NUMA and memory hotplug, choose "Sparse Memory" " IMHO, the memory-with-hole scenario should only be tested for sparse mem but flat mem. The generic pfn_valid() is just for non-mmu arches. Every architecture with mmu defines their own pfn_valid(). This is supposed to be another separate patch that do we need to implement a generic pfn_valid().
On 2019-08-13 12:04 a.m., Greentime Hu wrote: > I think flat mem doesn't support memory-with-hole scenario. > In mm/Kconfig, it says > " > For systems that have holes in their physical address > spaces and for features like NUMA and memory hotplug, > choose "Sparse Memory" > " > IMHO, the memory-with-hole scenario should only be tested for sparse > mem but flat mem. Fair enough. > The generic pfn_valid() is just for non-mmu arches. The generic pfn_valid() in asm-generic is only for non-mmu arches. > Every architecture > with mmu defines their own pfn_valid(). Not true. Arm64, for example just uses the generic implementation in mmzone.h. My main question is whether we can just do that. If we can't we should probably structure it like powerpc where they only use the arch-specific helper for CONFIG_FLATMEM instead of when CONFIG_SPARSEMEM isn't set. Logan
On Tue, 13 Aug 2019, Logan Gunthorpe wrote: > On 2019-08-13 12:04 a.m., Greentime Hu wrote: > > > Every architecture with mmu defines their own pfn_valid(). > > Not true. Arm64, for example just uses the generic implementation in > mmzone.h. arm64 seems to define their own: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899 https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235 While there are many architectures which have their own pfn_valid(); oddly, almost none of them set HAVE_ARCH_PFN_VALID ? - Paul
On Tue, 13 Aug 2019, Paul Walmsley wrote: > On Tue, 13 Aug 2019, Logan Gunthorpe wrote: > > > On 2019-08-13 12:04 a.m., Greentime Hu wrote: > > > > > Every architecture with mmu defines their own pfn_valid(). > > > > Not true. Arm64, for example just uses the generic implementation in > > mmzone.h. > > arm64 seems to define their own: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899 > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235 > > While there are many architectures which have their own pfn_valid(); > oddly, almost none of them set HAVE_ARCH_PFN_VALID ? (fixed the linux-mm@ address) - Paul
On 2019-08-13 10:39 a.m., Paul Walmsley wrote: > On Tue, 13 Aug 2019, Logan Gunthorpe wrote: > >> On 2019-08-13 12:04 a.m., Greentime Hu wrote: >> >>> Every architecture with mmu defines their own pfn_valid(). >> >> Not true. Arm64, for example just uses the generic implementation in >> mmzone.h. > > arm64 seems to define their own: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899 Oh, yup. My mistake. > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235 > > While there are many architectures which have their own pfn_valid(); > oddly, almost none of them set HAVE_ARCH_PFN_VALID ? Yes, much of this is super confusing. Seems HAVE_ARCH_PFN_VALID only matters if SPARSEMEM is set. So risc-v probably doesn't need to set it and we just need a #ifdef !CONFIG_FLATMEM around the pfn_valid definition like other arches. Logan
Logan Gunthorpe <logang@deltatee.com> 於 2019年8月14日 週三 上午12:50寫道: > > On 2019-08-13 10:39 a.m., Paul Walmsley wrote: > > On Tue, 13 Aug 2019, Logan Gunthorpe wrote: > > > >> On 2019-08-13 12:04 a.m., Greentime Hu wrote: > >> > >>> Every architecture with mmu defines their own pfn_valid(). > >> > >> Not true. Arm64, for example just uses the generic implementation in > >> mmzone.h. > > > > arm64 seems to define their own: > > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899 > > Oh, yup. My mistake. > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235 > > > > While there are many architectures which have their own pfn_valid(); > > oddly, almost none of them set HAVE_ARCH_PFN_VALID ? > > Yes, much of this is super confusing. Seems HAVE_ARCH_PFN_VALID only > matters if SPARSEMEM is set. So risc-v probably doesn't need to set it > and we just need a #ifdef !CONFIG_FLATMEM around the pfn_valid > definition like other arches. > Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM. https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85 BTW, I found another issue here. #define FIXADDR_TOP (VMALLOC_START) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) #define VMEMMAP_END (VMALLOC_START - 1) #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) These 2 regions are overlapped. How about this fix? Not sure if it is good for everyone. diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 3f12b069af1d..3c4d394679d0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -115,9 +115,6 @@ config PGTABLE_LEVELS default 3 if 64BIT default 2 -config HAVE_ARCH_PFN_VALID - def_bool y - menu "Platform type" choice diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index c207f6634b91..72e106b60bc5 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -26,7 +26,7 @@ enum fixed_addresses { }; #define FIXADDR_SIZE (__end_of_fixed_addresses * PAGE_SIZE) -#define FIXADDR_TOP (VMALLOC_START) +#define FIXADDR_TOP (VMEMMAP_START) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) #define FIXMAP_PAGE_IO PAGE_KERNEL diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 8ddb6c7fedac..83830997dce6 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn; #define page_to_bus(page) (page_to_phys(page)) #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) +#if defined(CONFIG_FLATMEM) #define pfn_valid(pfn) \ (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) +#endif #define ARCH_PFN_OFFSET (pfn_base)
On 2019-08-14 7:35 a.m., Greentime Hu wrote: > Logan Gunthorpe <logang@deltatee.com> 於 2019年8月14日 週三 上午12:50寫道: >> >> On 2019-08-13 10:39 a.m., Paul Walmsley wrote: >>> On Tue, 13 Aug 2019, Logan Gunthorpe wrote: >>> >>>> On 2019-08-13 12:04 a.m., Greentime Hu wrote: >>>> >>>>> Every architecture with mmu defines their own pfn_valid(). >>>> >>>> Not true. Arm64, for example just uses the generic implementation in >>>> mmzone.h. >>> >>> arm64 seems to define their own: >>> >>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899 >> >> Oh, yup. My mistake. >> >>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235 >>> >>> While there are many architectures which have their own pfn_valid(); >>> oddly, almost none of them set HAVE_ARCH_PFN_VALID ? >> >> Yes, much of this is super confusing. Seems HAVE_ARCH_PFN_VALID only >> matters if SPARSEMEM is set. So risc-v probably doesn't need to set it >> and we just need a #ifdef !CONFIG_FLATMEM around the pfn_valid >> definition like other arches. >> > > Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM. > https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85 > > BTW, I found another issue here. > #define FIXADDR_TOP (VMALLOC_START) > #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) > #define VMEMMAP_END (VMALLOC_START - 1) > #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) > These 2 regions are overlapped. > > How about this fix? Not sure if it is good for everyone. Yes, this looks good to me. I can fold these changes into my patch and send a v5 to the list. Thanks! Logan > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index 3f12b069af1d..3c4d394679d0 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -115,9 +115,6 @@ config PGTABLE_LEVELS > default 3 if 64BIT > default 2 > > -config HAVE_ARCH_PFN_VALID > - def_bool y > - > menu "Platform type" > > choice > diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h > index c207f6634b91..72e106b60bc5 100644 > --- a/arch/riscv/include/asm/fixmap.h > +++ b/arch/riscv/include/asm/fixmap.h > @@ -26,7 +26,7 @@ enum fixed_addresses { > }; > > #define FIXADDR_SIZE (__end_of_fixed_addresses * PAGE_SIZE) > -#define FIXADDR_TOP (VMALLOC_START) > +#define FIXADDR_TOP (VMEMMAP_START) > #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) > > #define FIXMAP_PAGE_IO PAGE_KERNEL > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index 8ddb6c7fedac..83830997dce6 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn; > #define page_to_bus(page) (page_to_phys(page)) > #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) > > +#if defined(CONFIG_FLATMEM) > #define pfn_valid(pfn) \ > (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) > +#endif > > #define ARCH_PFN_OFFSET (pfn_base)
On Wed, 14 Aug 2019, Logan Gunthorpe wrote: > On 2019-08-14 7:35 a.m., Greentime Hu wrote: > > > Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM. > > https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85 > > > > BTW, I found another issue here. > > #define FIXADDR_TOP (VMALLOC_START) > > #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) > > #define VMEMMAP_END (VMALLOC_START - 1) > > #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) > > These 2 regions are overlapped. > > > > How about this fix? Not sure if it is good for everyone. > > Yes, this looks good to me. I can fold these changes into my patch and > send a v5 to the list. The change to FIXADDR_TOP should be separated out into its own patch - it probably needs to go up as a fix. - Paul
On 2019-08-14 11:40 a.m., Paul Walmsley wrote: > On Wed, 14 Aug 2019, Logan Gunthorpe wrote: > >> On 2019-08-14 7:35 a.m., Greentime Hu wrote: >> >>> Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM. >>> https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85 >>> >>> BTW, I found another issue here. >>> #define FIXADDR_TOP (VMALLOC_START) >>> #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) >>> #define VMEMMAP_END (VMALLOC_START - 1) >>> #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) >>> These 2 regions are overlapped. >>> >>> How about this fix? Not sure if it is good for everyone. >> >> Yes, this looks good to me. I can fold these changes into my patch and >> send a v5 to the list. > > The change to FIXADDR_TOP should be separated out into its own patch - it > probably needs to go up as a fix. I don't think so... VMEMMAP_START doesn't exist until the sparsemem patch so it can't be changed until after the sparsemem patch and no sense adding a bug in the sparsemem patch... Logan
On Wed, 14 Aug 2019, Logan Gunthorpe wrote: > On 2019-08-14 11:40 a.m., Paul Walmsley wrote: > > On Wed, 14 Aug 2019, Logan Gunthorpe wrote: > > > >> On 2019-08-14 7:35 a.m., Greentime Hu wrote: > >> > >>> Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM. > >>> https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85 > >>> > >>> BTW, I found another issue here. > >>> #define FIXADDR_TOP (VMALLOC_START) > >>> #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) > >>> #define VMEMMAP_END (VMALLOC_START - 1) > >>> #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) > >>> These 2 regions are overlapped. > >>> > >>> How about this fix? Not sure if it is good for everyone. > >> > >> Yes, this looks good to me. I can fold these changes into my patch and > >> send a v5 to the list. > > > > The change to FIXADDR_TOP should be separated out into its own patch - it > > probably needs to go up as a fix. > > I don't think so... VMEMMAP_START doesn't exist until the sparsemem > patch so it can't be changed until after the sparsemem patch and no > sense adding a bug in the sparsemem patch... OK, that's fine then. - Paul
Hey,
On 2019-08-14 7:35 a.m., Greentime Hu wrote:
> How about this fix? Not sure if it is good for everyone.
I applied your fix to the patch and it seems ok. But it doesn't seem to
work on a recent version of the kernel. Have you got it working on v5.3?
It seems the following patch breaks things:
671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages")
I don't really have time right now to debug this any further.
Logan
Hi Logan, On Thu, Aug 15, 2019 at 6:21 AM Logan Gunthorpe <logang@deltatee.com> wrote: > > Hey, > > On 2019-08-14 7:35 a.m., Greentime Hu wrote: > > How about this fix? Not sure if it is good for everyone. > > I applied your fix to the patch and it seems ok. But it doesn't seem to > work on a recent version of the kernel. Have you got it working on v5.3? > It seems the following patch breaks things: > > 671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages") > > I don't really have time right now to debug this any further. > I just tried v5.3-rc4 and it failed. I try to debug this case. I found it failed might be because of an unmapped virtual address is used in memblocks_present() -> memblock_alloc (). In this commit 671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages"), it finishes all the VA/PA mapping after setup_vm_final() is called. So we have to call memblocks_present() and sparse_init() right after setup_vm_final(). Here is my patch and I tested with memory-with-hole. It can boot normally in Unleashed board after applying this patch. diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 27d1d847fb2d..35aacb0c93e5 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -138,8 +138,6 @@ void __init setup_bootmem(void) PFN_PHYS(end_pfn - start_pfn), &memblock.memory, 0); } - memblocks_present(); - sparse_init(); } unsigned long va_pa_offset; @@ -452,6 +450,8 @@ static void __init setup_vm_final(void) void __init paging_init(void) { setup_vm_final(); + memblocks_present(); + sparse_init(); setup_zero_page(); zone_sizes_init(); } Test case: memory@80000000 { device_type = "memory"; reg = <0x0 0x80000000 0x0 0x80000000>; }; memory@180000000 { device_type = "memory"; reg = <0x1 0x80000000 0x0 0x40000000>; }; # cat /proc/meminfo MemTotal: 3003496 kB MemFree: 2986584 kB MemAvailable: 2970176 kB Buffers: 0 kB Cached: 3540 kB SwapCached: 0 kB Active: 3920 kB Inactive: 68 kB Active(anon): 3920 kB Inactive(anon): 68 kB Active(file): 0 kB Inactive(file): 0 kB Unevictable: 0 kB Mlocked: 0 kB SwapTotal: 0 kB SwapFree: 0 kB Dirty: 0 kB Writeback: 0 kB AnonPages: 528 kB Mapped: 1984 kB Shmem: 3540 kB KReclaimable: 688 kB Slab: 5700 kB SReclaimable: 688 kB SUnreclaim: 5012 kB KernelStack: 424 kB PageTables: 80 kB NFS_Unstable: 0 kB Bounce: 0 kB WritebackTmp: 0 kB CommitLimit: 1501748 kB Committed_AS: 3200 kB VmallocTotal: 67108863 kB VmallocUsed: 12 kB VmallocChunk: 0 kB Percpu: 272 kB # uname -a Linux buildroot 5.3.0-rc4-00001-g44404421c481-dirty #10 SMP Thu Aug 15 16:28:24 DST 2019 riscv64 GNU/Lin[ 2352.443621] random: fast init done ux # zcat /proc/config.gz |grep SPARSE CONFIG_SPARSE_IRQ=y CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_SPARSEMEM_MANUAL=y CONFIG_SPARSEMEM=y CONFIG_SPARSEMEM_EXTREME=y CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y CONFIG_SPARSEMEM_VMEMMAP=y # CONFIG_INPUT_SPARSEKMAP is not set
On 2019-08-15 3:31 a.m., Greentime Hu wrote: > Hi Logan, > > On Thu, Aug 15, 2019 at 6:21 AM Logan Gunthorpe <logang@deltatee.com> wrote: >> >> Hey, >> >> On 2019-08-14 7:35 a.m., Greentime Hu wrote: >>> How about this fix? Not sure if it is good for everyone. >> >> I applied your fix to the patch and it seems ok. But it doesn't seem to >> work on a recent version of the kernel. Have you got it working on v5.3? >> It seems the following patch breaks things: >> >> 671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages") >> >> I don't really have time right now to debug this any further. >> > > I just tried v5.3-rc4 and it failed. I try to debug this case. > I found it failed might be because of an unmapped virtual address is used > in memblocks_present() -> memblock_alloc (). > > In this commit 671f9a3e2e24 ("RISC-V: Setup initial page tables in two > stages"), it finishes all the VA/PA mapping after setup_vm_final() is > called. > So we have to call memblocks_present() and sparse_init() right after > setup_vm_final(). > > Here is my patch and I tested with memory-with-hole. > It can boot normally in Unleashed board after applying this patch. Great, thanks! I'll roll this into my patch and send v5 out when I have a moment. Can I add your Signed-off-by for the bits you've contributed to give you credit for your work? Logan
On Fri, Aug 16, 2019 at 12:20 AM Logan Gunthorpe <logang@deltatee.com> wrote: > > > > On 2019-08-15 3:31 a.m., Greentime Hu wrote: > > Hi Logan, > > > > On Thu, Aug 15, 2019 at 6:21 AM Logan Gunthorpe <logang@deltatee.com> wrote: > >> > >> Hey, > >> > >> On 2019-08-14 7:35 a.m., Greentime Hu wrote: > >>> How about this fix? Not sure if it is good for everyone. > >> > >> I applied your fix to the patch and it seems ok. But it doesn't seem to > >> work on a recent version of the kernel. Have you got it working on v5.3? > >> It seems the following patch breaks things: > >> > >> 671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages") > >> > >> I don't really have time right now to debug this any further. > >> > > > > I just tried v5.3-rc4 and it failed. I try to debug this case. > > I found it failed might be because of an unmapped virtual address is used > > in memblocks_present() -> memblock_alloc (). > > > > In this commit 671f9a3e2e24 ("RISC-V: Setup initial page tables in two > > stages"), it finishes all the VA/PA mapping after setup_vm_final() is > > called. > > So we have to call memblocks_present() and sparse_init() right after > > setup_vm_final(). > > > > Here is my patch and I tested with memory-with-hole. > > It can boot normally in Unleashed board after applying this patch. > > Great, thanks! I'll roll this into my patch and send v5 out when I have > a moment. Can I add your Signed-off-by for the bits you've contributed > to give you credit for your work? Sure. Please use my Signed-off-by: Greentime Hu <greentime.hu@sifive.com>
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e0d7d61779a6..bd659327bc6b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -54,12 +54,32 @@ config ZONE_DMA32 bool default y if 64BIT +config VA_BITS + int + default 32 if 32BIT + default 39 if 64BIT + +config PA_BITS + int + default 34 if 32BIT + default 56 if 64BIT + config PAGE_OFFSET hex default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB +config ARCH_FLATMEM_ENABLE + def_bool y + +config ARCH_SPARSEMEM_ENABLE + def_bool y + select SPARSEMEM_VMEMMAP_ENABLE + +config ARCH_SELECT_MEMORY_MODEL + def_bool ARCH_SPARSEMEM_ENABLE + config STACKTRACE_SUPPORT def_bool y @@ -94,6 +114,9 @@ config PGTABLE_LEVELS config HAVE_KPROBES def_bool n +config HAVE_ARCH_PFN_VALID + def_bool y + menu "Platform type" choice diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 16301966d65b..e1162336f5ea 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -89,6 +89,23 @@ extern pgd_t swapper_pg_dir[]; #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) +#define VMALLOC_END (PAGE_OFFSET - 1) +#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) + +/* + * Roughly size the vmemmap space to be large enough to fit enough + * struct pages to map half the virtual address space. Then + * position vmemmap directly below the VMALLOC region. + */ +#define VMEMMAP_SHIFT \ + (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) +#define VMEMMAP_SIZE (1UL << VMEMMAP_SHIFT) +#define VMEMMAP_END (VMALLOC_START - 1) +#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) + +#define vmemmap ((struct page *)VMEMMAP_START) + /* * ZERO_PAGE is a global shared page that is always zero, * used for zero-mapped memory areas, etc. @@ -411,10 +428,6 @@ static inline void pgtable_cache_init(void) /* No page table caches to initialize */ } -#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) -#define VMALLOC_END (PAGE_OFFSET - 1) -#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) - /* * Task size is 0x40000000000 for RV64 or 0xb800000 for RV32. * Note that PGDIR_SIZE must evenly divide TASK_SIZE. diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h new file mode 100644 index 000000000000..b58ba2d9ed6e --- /dev/null +++ b/arch/riscv/include/asm/sparsemem.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_SPARSEMEM_H +#define __ASM_SPARSEMEM_H + +#ifdef CONFIG_SPARSEMEM +#define MAX_PHYSMEM_BITS CONFIG_PA_BITS +#define SECTION_SIZE_BITS 27 +#endif /* CONFIG_SPARSEMEM */ + +#endif /* __ASM_SPARSEMEM_H */ diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index fc8006a042eb..98f39adefb1a 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -193,6 +193,9 @@ static void __init setup_bootmem(void) PFN_PHYS(end_pfn - start_pfn), &memblock.memory, 0); } + + memblocks_present(); + sparse_init(); } void __init setup_arch(char **cmdline_p) @@ -224,4 +227,3 @@ void __init setup_arch(char **cmdline_p) riscv_fill_hwcap(); } - diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 1d9bfaff60bc..09568d5bc5b8 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -69,3 +69,11 @@ void free_initrd_mem(unsigned long start, unsigned long end) { } #endif /* CONFIG_BLK_DEV_INITRD */ + +#ifdef CONFIG_SPARSEMEM +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + return vmemmap_populate_basepages(start, end, node); +} +#endif