Message ID | 20240514140446.538622-5-bjorn@kernel.org (mailing list archive)
---|---
State | New
Series | riscv: Memory Hot(Un)Plug support
On 14.05.24 16:04, Björn Töpel wrote:
> From: Björn Töpel <bjorn@rivosinc.com>
>
> For an architecture to support memory hotplugging, a couple of
> callbacks needs to be implemented:
>
> arch_add_memory()
> This callback is responsible for adding the physical memory into the
> direct map, and call into the memory hotplugging generic code via
> __add_pages() that adds the corresponding struct page entries, and
> updates the vmemmap mapping.
>
> arch_remove_memory()
> This is the inverse of the callback above.
>
> vmemmap_free()
> This function tears down the vmemmap mappings (if
> CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
> backing vmemmap pages. Note that for persistent memory, an
> alternative allocator for the backing pages can be used; The
> vmem_altmap. This means that when the backing pages are cleared,
> extra care is needed so that the correct deallocation method is
> used.
>
> arch_get_mappable_range()
> This functions returns the PA range that the direct map can map.
> Used by the MHP internals for sanity checks.
>
> The page table unmap/teardown functions are heavily based on code from
> the x86 tree. The same remove_pgd_mapping() function is used in both
> vmemmap_free() and arch_remove_memory(), but in the latter function
> the backing pages are not removed.
>
> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
> ---
>  arch/riscv/mm/init.c | 242 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 242 insertions(+)
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 6f72b0b2b854..7f0b921a3d3a 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
[...]
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> +                                           struct vmem_altmap *altmap)
> +{
> +        if (altmap)
> +                vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> +        else
> +                free_pages((unsigned long)page_address(page), get_order(size));

If you unplug a DIMM that was added during boot (can happen on x86-64,
can it happen on riscv?), free_pages() would not be sufficient. You'd be
freeing a PG_reserved page that has to be freed differently.
David Hildenbrand <david@redhat.com> writes:

> On 14.05.24 16:04, Björn Töpel wrote:
>> From: Björn Töpel <bjorn@rivosinc.com>
[...]
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> +                                           struct vmem_altmap *altmap)
>> +{
>> +        if (altmap)
>> +                vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> +        else
>> +                free_pages((unsigned long)page_address(page), get_order(size));
>
> If you unplug a DIMM that was added during boot (can happen on x86-64,
> can it happen on riscv?), free_pages() would not be sufficient. You'd be
> freeing a PG_reserved page that has to be freed differently.

I'd say if it can happen on x86-64, it probably can on RISC-V. I'll look
into this for the next spin!

Thanks for spending time on the series!


Cheers,
Björn
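[Editorial note: for context, a minimal sketch of what a bootmem-aware freeing
path could look like, loosely modelled on arch/x86/mm/init_64.c:free_pagetable().
It is not part of the posted series: the helper name free_bootmem_aware_pages()
is hypothetical, and reading the bootmem type from page->index follows the x86
convention, which may differ between kernel versions.]

/*
 * Sketch only (assumes <linux/mm.h> and <linux/bootmem_info.h>): free pages
 * that back the vmemmap or page tables, handling the case where they were
 * set aside at boot (PG_reserved) and therefore cannot go through
 * free_pages().
 */
static void __meminit free_bootmem_aware_pages(struct page *page, int order)
{
        unsigned int nr_pages = 1 << order;
        unsigned long magic;

        if (PageReserved(page)) {
                /* Bootmem page: freed per-page, not via the buddy path. */
                __ClearPageReserved(page);

                magic = page->index;
                if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
                        while (nr_pages--)
                                put_page_bootmem(page++);
                } else {
                        while (nr_pages--)
                                free_reserved_page(page++);
                }
        } else {
                /* Regular hotplugged memory: the buddy allocator owns it. */
                free_pages((unsigned long)page_address(page), order);
        }
}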
On Tue, May 14, 2024 at 4:05 PM Björn Töpel <bjorn@kernel.org> wrote:
>
> From: Björn Töpel <bjorn@rivosinc.com>
>
> For an architecture to support memory hotplugging, a couple of
> callbacks needs to be implemented:
[...]
> +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
> +{
> +        int ret;
> +
> +        create_linear_mapping_range(start, start + size, 0, &params->pgprot);
> +        flush_tlb_all();
> +        ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
> +        if (ret) {
> +                remove_linear_mapping(start, size);
> +                return ret;
> +        }
> +

You need to flush the TLB here too since __add_pages() populates the
page table with the new vmemmap mapping (only because riscv allows to
cache invalid entries, I'll adapt this in my next version of Svvptc
support).

> +        max_pfn = PFN_UP(start + size);
> +        max_low_pfn = max_pfn;
> +        return 0;
> +}
> +
> +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
> +{
> +        __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
> +        remove_linear_mapping(start, size);

You need to flush the TLB here too.

> +}
> +
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
> +{
> +        remove_pgd_mapping(start, end, true, altmap);
> +}
> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
> +#endif /* CONFIG_MEMORY_HOTPLUG */
> --
> 2.40.1
>
Alexandre Ghiti <alexghiti@rivosinc.com> writes:

> On Tue, May 14, 2024 at 4:05 PM Björn Töpel <bjorn@kernel.org> wrote:
>
>> +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
>> +{
>> +        int ret;
>> +
>> +        create_linear_mapping_range(start, start + size, 0, &params->pgprot);
>> +        flush_tlb_all();
>> +        ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
>> +        if (ret) {
>> +                remove_linear_mapping(start, size);
>> +                return ret;
>> +        }
>> +
>
> You need to flush the TLB here too since __add_pages() populates the
> page table with the new vmemmap mapping (only because riscv allows to
> cache invalid entries, I'll adapt this in my next version of Svvptc
> support).
>
>> +        max_pfn = PFN_UP(start + size);
>> +        max_low_pfn = max_pfn;
>> +        return 0;
>> +}
>> +
>> +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
>> +{
>> +        __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
>> +        remove_linear_mapping(start, size);
>
> You need to flush the TLB here too.

I'll address all of the above in the next version. Thanks for reviewing
the series!


Björn
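[Editorial note: a minimal sketch of how the flush requested for
arch_add_memory() could be folded in. This is not the author's follow-up
version; the goto-based error handling is an assumption for illustration,
and the analogous change to arch_remove_memory() is left out.]

int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
{
        int ret;

        create_linear_mapping_range(start, start + size, 0, &params->pgprot);
        ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
        if (ret) {
                remove_linear_mapping(start, size);
                goto out;
        }

        max_pfn = PFN_UP(start + size);
        max_low_pfn = max_pfn;
out:
        /*
         * Flush once, after __add_pages(): it populates the vmemmap page
         * tables, and RISC-V implementations are allowed to cache invalid
         * entries, so the new linear map and vmemmap entries must both be
         * covered here.
         */
        flush_tlb_all();
        return ret;
}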
On Tue, May 14, 2024 at 04:04:42PM +0200, Björn Töpel wrote:
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> +                                           struct vmem_altmap *altmap)
> +{
> +        if (altmap)
> +                vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> +        else
> +                free_pages((unsigned long)page_address(page), get_order(size));

David already pointed this out, but can check
arch/x86/mm/init_64.c:free_pagetable().

You will see that we have to do some magic for bootmem memory (DIMMs
which were not hotplugged but already present)

> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
> +{
> +        remove_pgd_mapping(start, end, true, altmap);
> +}
> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
> +#endif /* CONFIG_MEMORY_HOTPLUG */

I will comment on the patch where you add support for hotplug and the
dependency, but on a track in LSFMM today, we decided that most likely
we will drop memory-hotplug support for !CONFIG_SPARSEMEM_VMEMMAP
environments.

So, since you are adding this plain fresh, please consider to tight the
hotplug dependency to CONFIG_SPARSEMEM_VMEMMAP.

As a bonus, you will only have to maintain one flavour of functions.
Oscar Salvador <osalvador@suse.de> writes:

> On Tue, May 14, 2024 at 04:04:42PM +0200, Björn Töpel wrote:
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> +                                           struct vmem_altmap *altmap)
>> +{
>> +        if (altmap)
>> +                vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> +        else
>> +                free_pages((unsigned long)page_address(page), get_order(size));
>
> David already pointed this out, but can check
> arch/x86/mm/init_64.c:free_pagetable().
>
> You will see that we have to do some magic for bootmem memory (DIMMs
> which were not hotplugged but already present)

Thank you!

>> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
>> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
>> +{
>> +        remove_pgd_mapping(start, end, true, altmap);
>> +}
>> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
>> +#endif /* CONFIG_MEMORY_HOTPLUG */
>
> I will comment on the patch where you add support for hotplug and the
> dependency, but on a track in LSFMM today, we decided that most likely
> we will drop memory-hotplug support for !CONFIG_SPARSEMEM_VMEMMAP
> environments.
> So, since you are adding this plain fresh, please consider to tight the
> hotplug dependency to CONFIG_SPARSEMEM_VMEMMAP.
> As a bonus, you will only have to maintain one flavour of functions.

Ah, yeah, I saw it mentioned on the LSF/MM/BPF topics. Less is
definitely more -- I'll make the next version depend on
SPARSEMEM_VMEMMAP.


Björn
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 6f72b0b2b854..7f0b921a3d3a 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
 	}
 }
 #endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (!pte_none(*pte))
+			return;
+	}
+
+	free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
+	pmd_clear(pmd);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (!pmd_none(*pmd))
+			return;
+	}
+
+	free_pages((unsigned long)page_address(pud_page(*pud)), 0);
+	pud_clear(pud);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (!pud_none(*pud))
+			return;
+	}
+
+	free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
+	p4d_clear(p4d);
+}
+
+static void __meminit free_vmemmap_storage(struct page *page, size_t size,
+					   struct vmem_altmap *altmap)
+{
+	if (altmap)
+		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+	else
+		free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pte_t *ptep, pte;
+
+	for (; addr < end; addr = next) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		ptep = pte_base + pte_index(addr);
+		pte = READ_ONCE(*ptep);
+
+		if (!pte_present(*ptep))
+			continue;
+
+		pte_clear(&init_mm, addr, ptep);
+		if (is_vmemmap)
+			free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
+	}
+}
+
+static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pte_t *pte_base;
+	pmd_t *pmdp, pmd;
+
+	for (; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_base + pmd_index(addr);
+		pmd = READ_ONCE(*pmdp);
+
+		if (!pmd_present(pmd))
+			continue;
+
+		if (pmd_leaf(pmd)) {
+			pmd_clear(pmdp);
+			if (is_vmemmap)
+				free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
+			continue;
+		}
+
+		pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
+		remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
+		free_pte_table(pte_base, pmdp);
+	}
+}
+
+static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pud_t *pudp, pud;
+	pmd_t *pmd_base;
+
+	for (; addr < end; addr = next) {
+		next = pud_addr_end(addr, end);
+		pudp = pud_base + pud_index(addr);
+		pud = READ_ONCE(*pudp);
+
+		if (!pud_present(pud))
+			continue;
+
+		if (pud_leaf(pud)) {
+			if (pgtable_l4_enabled) {
+				pud_clear(pudp);
+				if (is_vmemmap)
+					free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
+			}
+			continue;
+		}
+
+		pmd_base = pmd_offset(pudp, 0);
+		remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
+
+		if (pgtable_l4_enabled)
+			free_pmd_table(pmd_base, pudp);
+	}
+}
+
+static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	p4d_t *p4dp, p4d;
+	pud_t *pud_base;
+
+	for (; addr < end; addr = next) {
+		next = p4d_addr_end(addr, end);
+		p4dp = p4d_base + p4d_index(addr);
+		p4d = READ_ONCE(*p4dp);
+
+		if (!p4d_present(p4d))
+			continue;
+
+		if (p4d_leaf(p4d)) {
+			if (pgtable_l5_enabled) {
+				p4d_clear(p4dp);
+				if (is_vmemmap)
+					free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap);
+			}
+			continue;
+		}
+
+		pud_base = pud_offset(p4dp, 0);
+		remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap);
+
+		if (pgtable_l5_enabled)
+			free_pud_table(pud_base, p4dp);
+	}
+}
+
+static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap,
+					 struct vmem_altmap *altmap)
+{
+	unsigned long addr, next;
+	p4d_t *p4d_base;
+	pgd_t *pgd;
+
+	for (addr = va; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+		pgd = pgd_offset_k(addr);
+
+		if (!pgd_present(*pgd))
+			continue;
+
+		if (pgd_leaf(*pgd))
+			continue;
+
+		p4d_base = p4d_offset(pgd, 0);
+		remove_p4d_mapping(p4d_base, addr, next, is_vmemmap, altmap);
+	}
+
+	flush_tlb_all();
+}
+
+static void __meminit remove_linear_mapping(phys_addr_t start, u64 size)
+{
+	unsigned long va = (unsigned long)__va(start);
+	unsigned long end = (unsigned long)__va(start + size);
+
+	remove_pgd_mapping(va, end, false, NULL);
+}
+
+struct range arch_get_mappable_range(void)
+{
+	struct range mhp_range;
+
+	mhp_range.start = __pa(PAGE_OFFSET);
+	mhp_range.end = __pa(PAGE_END - 1);
+	return mhp_range;
+}
+
+int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
+{
+	int ret;
+
+	create_linear_mapping_range(start, start + size, 0, &params->pgprot);
+	flush_tlb_all();
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
+	if (ret) {
+		remove_linear_mapping(start, size);
+		return ret;
+	}
+
+	max_pfn = PFN_UP(start + size);
+	max_low_pfn = max_pfn;
+	return 0;
+}
+
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+{
+	__remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
+	remove_linear_mapping(start, size);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
+{
+	remove_pgd_mapping(start, end, true, altmap);
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+#endif /* CONFIG_MEMORY_HOTPLUG */