diff mbox series

[v6,3/4] x86/vmemmap: Handle unpopulated sub-pmd ranges

Message ID 20210309214050.4674-4-osalvador@suse.de (mailing list archive)
State New, archived
Headers show
Series Cleanup and fixups for vmemmap handling | expand

Commit Message

Oscar Salvador March 9, 2021, 9:40 p.m. UTC
When sizeof(struct page) is not a power of 2, sections do not span
a PMD anymore and so when populating them some parts of the PMD will
remain unused.
Because of this, PMDs will be left behind when depopulating sections
since remove_pmd_table() thinks that those unused parts are still in
use.

Fix this by marking the unused parts with PAGE_UNUSED, so memchr_inv()
will do the right thing and will let us free the PMD when the last user
of it is gone.

This patch is based on a similar patch by David Hildenbrand:

https://lore.kernel.org/linux-mm/20200722094558.9828-9-david@redhat.com/

Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
---
 arch/x86/mm/init_64.c | 65 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 12 deletions(-)

Comments

Naresh Kamboju March 10, 2021, 6:07 a.m. UTC | #1
Hi Oscar,

On Wed, 10 Mar 2021 at 03:11, Oscar Salvador <osalvador@suse.de> wrote:
>
> When sizeof(struct page) is not a power of 2, sections do not span
> a PMD anymore and so when populating them some parts of the PMD will
> remain unused.
> Because of this, PMDs will be left behind when depopulating sections
> since remove_pmd_table() thinks that those unused parts are still in
> use.
>
> Fix this by marking the unused parts with PAGE_UNUSED, so memchr_inv()
> will do the right thing and will let us free the PMD when the last user
> of it is gone.
>
> This patch is based on a similar patch by David Hildenbrand:
>
> https://lore.kernel.org/linux-mm/20200722094558.9828-9-david@redhat.com/
>
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> Reviewed-by: David Hildenbrand <david@redhat.com>
> Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
> ---
>  arch/x86/mm/init_64.c | 65 +++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 53 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 9ecb3c488ac8..d93b36856ed3 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -826,6 +826,51 @@ void __init paging_init(void)
>         zone_sizes_init();
>  }
>
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +#define PAGE_UNUSED 0xFD
> +
> +/* Returns true if the PMD is completely unused and thus it can be freed */
> +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
> +{
> +       unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
> +
> +       memset((void *)addr, PAGE_UNUSED, end - addr);
> +
> +       return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
> +}
> +
> +static void __meminit vmemmap_use_sub_pmd(unsigned long start)
> +{
> +       /*
> +        * As we expect to add in the same granularity as we remove, it's
> +        * sufficient to mark only some piece used to block the memmap page from
> +        * getting removed when removing some other adjacent memmap (just in
> +        * case the first memmap never gets initialized e.g., because the memory
> +        * block never gets onlined).
> +        */
> +       memset((void *)start, 0, sizeof(struct page));
> +}
> +
> +static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
> +{
> +       /*
> +        * Could be our memmap page is filled with PAGE_UNUSED already from a
> +        * previous remove. Make sure to reset it.
> +        */
> +       vmemmap_use_sub_pmd(start);
> +
> +       /*
> +        * Mark with PAGE_UNUSED the unused parts of the new memmap range
> +        */
> +       if (!IS_ALIGNED(start, PMD_SIZE))
> +               memset((void *)start, PAGE_UNUSED,
> +                       start - ALIGN_DOWN(start, PMD_SIZE));
> +       if (!IS_ALIGNED(end, PMD_SIZE))
> +               memset((void *)end, PAGE_UNUSED,
> +                       ALIGN(end, PMD_SIZE) - end);
> +}
> +#endif
> +
>  /*
>   * Memory hotplug specific functions
>   */
> @@ -871,8 +916,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
>         return add_pages(nid, start_pfn, nr_pages, params);
>  }
>
> -#define PAGE_INUSE 0xFD
> -
>  static void __meminit free_pagetable(struct page *page, int order)
>  {
>         unsigned long magic;
> @@ -1006,7 +1049,6 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>         unsigned long next, pages = 0;
>         pte_t *pte_base;
>         pmd_t *pmd;
> -       void *page_addr;
>
>         pmd = pmd_start + pmd_index(addr);
>         for (; addr < end; addr = next, pmd++) {
> @@ -1026,20 +1068,13 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>                                 pmd_clear(pmd);
>                                 spin_unlock(&init_mm.page_table_lock);
>                                 pages++;
> -                       } else {
> -                               /* If here, we are freeing vmemmap pages. */
> -                               memset((void *)addr, PAGE_INUSE, next - addr);
> -
> -                               page_addr = page_address(pmd_page(*pmd));
> -                               if (!memchr_inv(page_addr, PAGE_INUSE,
> -                                               PMD_SIZE)) {
> +                       } else if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) &&
> +                                  vmemmap_pmd_is_unused(addr, next)) {
>                                         free_hugepage_table(pmd_page(*pmd),
>                                                             altmap);
> -
>                                         spin_lock(&init_mm.page_table_lock);
>                                         pmd_clear(pmd);
>                                         spin_unlock(&init_mm.page_table_lock);
> -                               }
>                         }
>
>                         continue;
> @@ -1492,11 +1527,17 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
>
>                                 addr_end = addr + PMD_SIZE;
>                                 p_end = p + PMD_SIZE;
> +
> +                               if (!IS_ALIGNED(addr, PMD_SIZE) ||
> +                                   !IS_ALIGNED(next, PMD_SIZE))
> +                                       vmemmap_use_new_sub_pmd(addr, next);
> +
>                                 continue;
>                         } else if (altmap)
>                                 return -ENOMEM; /* no fallback */
>                 } else if (pmd_large(*pmd)) {
>                         vmemmap_verify((pte_t *)pmd, node, addr, next);
> +                       vmemmap_use_sub_pmd(addr);

While building the linux next 20210310 tag for x86_64 architecture with clang-12
and gcc-9 the following warnings / errors were noticed.

arch/x86/mm/init_64.c:1585:6: error: implicit declaration of function
'vmemmap_use_new_sub_pmd' [-Werror,-Wimplicit-function-declaration]
                                        vmemmap_use_new_sub_pmd(addr, next);
                                        ^
arch/x86/mm/init_64.c:1591:4: error: implicit declaration of function
'vmemmap_use_sub_pmd' [-Werror,-Wimplicit-function-declaration]
                        vmemmap_use_sub_pmd(addr, next);
                        ^
2 errors generated.
make[3]: *** [scripts/Makefile.build:271: arch/x86/mm/init_64.o] Error 1

Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>

Steps to reproduce:
-------------------
# TuxMake is a command line tool and Python library that provides
# portable and repeatable Linux kernel builds across a variety of
# architectures, toolchains, kernel configurations, and make targets.
#
# TuxMake supports the concept of runtimes.
# See https://docs.tuxmake.org/runtimes/, for that to work it requires
# that you install podman or docker on your system.
#
# To install tuxmake on your system globally:
# sudo pip3 install -U tuxmake
#
# See https://docs.tuxmake.org/ for complete documentation.


tuxmake --runtime podman --target-arch x86_64 --toolchain clang-12
--kconfig defconfig --kconfig-add
https://builds.tuxbuild.com/1pYCPt4WlgSfSdv1BULm6ABINeJ/config


Build pipeline error link,
https://gitlab.com/Linaro/lkft/mirrors/next/linux-next/-/jobs/1085496613#L428
Oscar Salvador March 10, 2021, 7:24 a.m. UTC | #2
On Wed, Mar 10, 2021 at 11:37:53AM +0530, Naresh Kamboju wrote:
> Hi Oscar,

Hi Naresh,

> While building the linux next 20210310 tag for x86_64 architecture with clang-12
> and gcc-9 the following warnings / errors were noticed.
> 
> arch/x86/mm/init_64.c:1585:6: error: implicit declaration of function
> 'vmemmap_use_new_sub_pmd' [-Werror,-Wimplicit-function-declaration]
>                                         vmemmap_use_new_sub_pmd(addr, next);
>                                         ^
> arch/x86/mm/init_64.c:1591:4: error: implicit declaration of function
> 'vmemmap_use_sub_pmd' [-Werror,-Wimplicit-function-declaration]
>                         vmemmap_use_sub_pmd(addr, next);
>                         ^
> 2 errors generated.
> make[3]: *** [scripts/Makefile.build:271: arch/x86/mm/init_64.o] Error 1
> 
> Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>

Yes, this was also reported by Zi Yan here [1].
Looking into your .config, seems to be the same issue as you have
CONFIG_SPARSE_VMEMMAP but !CONFIG_MEMORY_HOTPLUG.

This version fixes those compilation errors.

Thanks for reporting it anyway!

[1] https://lore.kernel.org/linux-mm/YEfoH8US4YVxak7r@localhost.localdomain/T/#ma566ff437ff4bf8fcc5f80f62cd0cc8761edd12d

> Steps to reproduce:
> -------------------
> # TuxMake is a command line tool and Python library that provides
> # portable and repeatable Linux kernel builds across a variety of
> # architectures, toolchains, kernel configurations, and make targets.
> #
> # TuxMake supports the concept of runtimes.
> # See https://docs.tuxmake.org/runtimes/, for that to work it requires
> # that you install podman or docker on your system.
> #
> # To install tuxmake on your system globally:
> # sudo pip3 install -U tuxmake
> #
> # See https://docs.tuxmake.org/ for complete documentation.
> 
> 
> tuxmake --runtime podman --target-arch x86_64 --toolchain clang-12
> --kconfig defconfig --kconfig-add
> https://builds.tuxbuild.com/1pYCPt4WlgSfSdv1BULm6ABINeJ/config
> 
> 
> Build pipeline error link,
> https://gitlab.com/Linaro/lkft/mirrors/next/linux-next/-/jobs/1085496613#L428
> 
> -- 
> Linaro LKFT
> https://lkft.linaro.org
Dave Hansen March 11, 2021, 9:29 p.m. UTC | #3
On 3/9/21 1:40 PM, Oscar Salvador wrote:
> +static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
> +{
> +	/*
> +	 * Could be our memmap page is filled with PAGE_UNUSED already from a
> +	 * previous remove. Make sure to reset it.
> +	 */
> +	vmemmap_use_sub_pmd(start);
> +
> +	/*
> +	 * Mark with PAGE_UNUSED the unused parts of the new memmap range
> +	 */
> +	if (!IS_ALIGNED(start, PMD_SIZE))
> +		memset((void *)start, PAGE_UNUSED,
> +		        start - ALIGN_DOWN(start, PMD_SIZE));
> +	if (!IS_ALIGNED(end, PMD_SIZE))
> +		memset((void *)end, PAGE_UNUSED,
> +		        ALIGN(end, PMD_SIZE) - end);
> +}
> +#endif

This is apparently under both CONFIG_SPARSEMEM_VMEMMAP and
CONFIG_MEMORY_HOTPLUG #ifdefs.  It errors out at compile-time with this
config: https://sr71.net/~dave/intel/config-mmotm-20210311

> linux.git/arch/x86/mm/init_64.c: In function 'vmemmap_populate_hugepages':
> linux.git/arch/x86/mm/init_64.c:1585:6: error: implicit declaration of function 'vmemmap_use_new_sub_pmd' [-Werror=implicit-function-declaration]
>       vmemmap_use_new_sub_pmd(addr, next);
>       ^~~~~~~~~~~~~~~~~~~~~~~
> /home/davehans/linux.git/arch/x86/mm/init_64.c:1591:4: error: implicit declaration of function 'vmemmap_use_sub_pmd' [-Werror=implicit-function-declaration]
>     vmemmap_use_sub_pmd(addr, next);
>     ^~~~~~~~~~~~~~~~~~~

I didn't see a quick fix other than #ifdef'ing the call sites, which is
pretty ugly.
Oscar Salvador March 11, 2021, 9:48 p.m. UTC | #4
On Thu, Mar 11, 2021 at 01:29:39PM -0800, Dave Hansen wrote:
> On 3/9/21 1:40 PM, Oscar Salvador wrote:
> > +static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
> > +{
> > +	/*
> > +	 * Could be our memmap page is filled with PAGE_UNUSED already from a
> > +	 * previous remove. Make sure to reset it.
> > +	 */
> > +	vmemmap_use_sub_pmd(start);
> > +
> > +	/*
> > +	 * Mark with PAGE_UNUSED the unused parts of the new memmap range
> > +	 */
> > +	if (!IS_ALIGNED(start, PMD_SIZE))
> > +		memset((void *)start, PAGE_UNUSED,
> > +		        start - ALIGN_DOWN(start, PMD_SIZE));
> > +	if (!IS_ALIGNED(end, PMD_SIZE))
> > +		memset((void *)end, PAGE_UNUSED,
> > +		        ALIGN(end, PMD_SIZE) - end);
> > +}
> > +#endif
> 
> This is apparently under both CONFIG_SPARSEMEM_VMEMMAP and
> CONFIG_MEMORY_HOTPLUG #ifdefs.  It errors out at compile-time with this
> config: https://sr71.net/~dave/intel/config-mmotm-20210311

It seems that mmotm still has v5.
v6 (this one) fixed that up. I basically moved the code out of
MEMORY_HOTPLUG #ifdefs.

I could not reproduce your error on v6.
Matthew Wilcox (Oracle) March 30, 2021, 2:29 a.m. UTC | #5
On Thu, Mar 11, 2021 at 10:48:34PM +0100, Oscar Salvador wrote:
> On Thu, Mar 11, 2021 at 01:29:39PM -0800, Dave Hansen wrote:
> > On 3/9/21 1:40 PM, Oscar Salvador wrote:
> > > +static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
> > > +{
> > > +	/*
> > > +	 * Could be our memmap page is filled with PAGE_UNUSED already from a
> > > +	 * previous remove. Make sure to reset it.
> > > +	 */
> > > +	vmemmap_use_sub_pmd(start);
> > > +
> > > +	/*
> > > +	 * Mark with PAGE_UNUSED the unused parts of the new memmap range
> > > +	 */
> > > +	if (!IS_ALIGNED(start, PMD_SIZE))
> > > +		memset((void *)start, PAGE_UNUSED,
> > > +		        start - ALIGN_DOWN(start, PMD_SIZE));
> > > +	if (!IS_ALIGNED(end, PMD_SIZE))
> > > +		memset((void *)end, PAGE_UNUSED,
> > > +		        ALIGN(end, PMD_SIZE) - end);
> > > +}
> > > +#endif
> > 
> > This is apparently under both CONFIG_SPARSEMEM_VMEMMAP and
> > CONFIG_MEMORY_HOTPLUG #ifdefs.  It errors out at compile-time with this
> > config: https://sr71.net/~dave/intel/config-mmotm-20210311
> 
> It seems that mmotm still has v5.
> v6 (this one) fixed that up. I basically moved the code out of
> MEMORY_HOTPLUG #ifdefs.
> 
> I could not reproduce your error on v6.

I can reproduce this with next-20210329.

.config attached.
Oscar Salvador April 2, 2021, 5:58 p.m. UTC | #6
On Tue, Mar 30, 2021 at 03:29:50AM +0100, Matthew Wilcox wrote:
> I can reproduce this with next-20210329.
> 
> .config attached.

Hi Matthew,

Thanks for the report. I tried to reproduce this with the attached
config on next-20210401 and I had no luck.


You still see it on that one?

Thanks
diff mbox series

Patch

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9ecb3c488ac8..d93b36856ed3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -826,6 +826,51 @@  void __init paging_init(void)
 	zone_sizes_init();
 }
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#define PAGE_UNUSED 0xFD
+
+/* Returns true if the PMD is completely unused and thus it can be freed */
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
+{
+	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+	memset((void *)addr, PAGE_UNUSED, end - addr);
+
+	return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
+}
+
+static void __meminit vmemmap_use_sub_pmd(unsigned long start)
+{
+	/*
+	 * As we expect to add in the same granularity as we remove, it's
+	 * sufficient to mark only some piece used to block the memmap page from
+	 * getting removed when removing some other adjacent memmap (just in
+	 * case the first memmap never gets initialized e.g., because the memory
+	 * block never gets onlined).
+	 */
+	memset((void *)start, 0, sizeof(struct page));
+}
+
+static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+	/*
+	 * Could be our memmap page is filled with PAGE_UNUSED already from a
+	 * previous remove. Make sure to reset it.
+	 */
+	vmemmap_use_sub_pmd(start);
+
+	/*
+	 * Mark with PAGE_UNUSED the unused parts of the new memmap range
+	 */
+	if (!IS_ALIGNED(start, PMD_SIZE))
+		memset((void *)start, PAGE_UNUSED,
+		        start - ALIGN_DOWN(start, PMD_SIZE));
+	if (!IS_ALIGNED(end, PMD_SIZE))
+		memset((void *)end, PAGE_UNUSED,
+		        ALIGN(end, PMD_SIZE) - end);
+}
+#endif
+
 /*
  * Memory hotplug specific functions
  */
@@ -871,8 +916,6 @@  int arch_add_memory(int nid, u64 start, u64 size,
 	return add_pages(nid, start_pfn, nr_pages, params);
 }
 
-#define PAGE_INUSE 0xFD
-
 static void __meminit free_pagetable(struct page *page, int order)
 {
 	unsigned long magic;
@@ -1006,7 +1049,6 @@  remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
 	unsigned long next, pages = 0;
 	pte_t *pte_base;
 	pmd_t *pmd;
-	void *page_addr;
 
 	pmd = pmd_start + pmd_index(addr);
 	for (; addr < end; addr = next, pmd++) {
@@ -1026,20 +1068,13 @@  remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
 				pmd_clear(pmd);
 				spin_unlock(&init_mm.page_table_lock);
 				pages++;
-			} else {
-				/* If here, we are freeing vmemmap pages. */
-				memset((void *)addr, PAGE_INUSE, next - addr);
-
-				page_addr = page_address(pmd_page(*pmd));
-				if (!memchr_inv(page_addr, PAGE_INUSE,
-						PMD_SIZE)) {
+			} else if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) &&
+				   vmemmap_pmd_is_unused(addr, next)) {
 					free_hugepage_table(pmd_page(*pmd),
 							    altmap);
-
 					spin_lock(&init_mm.page_table_lock);
 					pmd_clear(pmd);
 					spin_unlock(&init_mm.page_table_lock);
-				}
 			}
 
 			continue;
@@ -1492,11 +1527,17 @@  static int __meminit vmemmap_populate_hugepages(unsigned long start,
 
 				addr_end = addr + PMD_SIZE;
 				p_end = p + PMD_SIZE;
+
+				if (!IS_ALIGNED(addr, PMD_SIZE) ||
+				    !IS_ALIGNED(next, PMD_SIZE))
+					vmemmap_use_new_sub_pmd(addr, next);
+
 				continue;
 			} else if (altmap)
 				return -ENOMEM; /* no fallback */
 		} else if (pmd_large(*pmd)) {
 			vmemmap_verify((pte_t *)pmd, node, addr, next);
+			vmemmap_use_sub_pmd(addr);
 			continue;
 		}
 		if (vmemmap_populate_basepages(addr, next, node, NULL))