
[4/8] mm: Add __dump_folio()

Message ID: 20240227192337.757313-5-willy@infradead.org
State: New
Series: PageFlags cleanups

Commit Message

Matthew Wilcox (Oracle) Feb. 27, 2024, 7:23 p.m. UTC
Turn __dump_page() into a wrapper around __dump_folio().  Snapshot the
page & folio into a stack variable so we don't hit BUG_ON() if an
allocation is freed under us and what was a folio pointer becomes a
pointer to a tail page.
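
In outline, the snapshot-and-retry loop looks like this (condensed from
the full function below; the decoding steps are elided):

	struct page precise;
	int loops = 5;

again:
	memcpy(&precise, page, sizeof(*page));	/* stable stack copy */
	/* ... decode the folio from &precise, never from *page ... */
	if (idx > nr_pages) {			/* snapshot was torn */
		if (loops-- > 0)
			goto again;
		/* give up: dump it as a plain, non-compound page */
	}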

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/debug.c | 120 +++++++++++++++++++++++++++++------------------------
 1 file changed, 66 insertions(+), 54 deletions(-)

Comments

SeongJae Park Feb. 28, 2024, 9:34 p.m. UTC | #1
Hi,

On Tue, 27 Feb 2024 19:23:31 +0000 "Matthew Wilcox (Oracle)" <willy@infradead.org> wrote:

> Turn __dump_page() into a wrapper around __dump_folio().  Snapshot the
> page & folio into a stack variable so we don't hit BUG_ON() if an
> allocation is freed under us and what was a folio pointer becomes a
> pointer to a tail page.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  mm/debug.c | 120 +++++++++++++++++++++++++++++------------------------
>  1 file changed, 66 insertions(+), 54 deletions(-)
> 
> diff --git a/mm/debug.c b/mm/debug.c
> index ee533a5ceb79..96555fc78f1a 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
[...]
> +static void __dump_page(const struct page *page)
> +{
> +	struct folio *foliop, folio;
> +	struct page precise;
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long idx, nr_pages = 1;
> +	int loops = 5;
> +
> +again:
> +	memcpy(&precise, page, sizeof(*page));
> +	foliop = page_folio(&precise);
> +	idx = folio_page_idx(foliop, page);
> +	if (idx != 0) {
> +		if (idx < (1UL << PUD_ORDER)) {
> +			memcpy(&folio, foliop, 2 * sizeof(struct page));
> +			nr_pages = folio_nr_pages(&folio);
> +		}
> +
> +		if (idx > nr_pages) {
> +			if (loops-- > 0)
> +				goto again;
> +			printk("page does not match folio\n");
> +			precise.compound_head &= ~1UL;
> +			foliop = (struct folio *)&precise;
> +			idx = 0;
> +		}
> +	}
> +
> +	__dump_folio(foliop, &precise, pfn, idx);
>  }

I just found that one of my build tests, which runs with a CONFIG_MMU-disabled
config, fails with the build error below on mm-unstable starting from this
patch.  After enabling CONFIG_MMU, the build no longer fails.  I haven't had
time to look into the code yet, but I'm reporting it first.


      CC      mm/debug.o
    In file included from ...linux/include/asm-generic/pgtable-nopud.h:7,
                     from ...linux/arch/m68k/include/asm/pgtable_no.h:5,
                     from ...linux/arch/m68k/include/asm/pgtable.h:6,
                     from ...linux/include/linux/pgtable.h:6,
                     from ...linux/include/linux/mm.h:29,
                     from ...linux/mm/debug.c:10:
    ...linux/mm/debug.c: In function '__dump_page':
    ...linux/include/asm-generic/pgtable-nop4d.h:11:33: error: 'PGDIR_SHIFT' undeclared (first use in this function); did you mean 'PUD_SHIFT'?
       11 | #define P4D_SHIFT               PGDIR_SHIFT
          |                                 ^~~~~~~~~~~
    ...linux/include/asm-generic/pgtable-nopud.h:18:25: note: in expansion of macro 'P4D_SHIFT'
       18 | #define PUD_SHIFT       P4D_SHIFT
          |                         ^~~~~~~~~
    ...linux/include/linux/pgtable.h:9:26: note: in expansion of macro 'PUD_SHIFT'
        9 | #define PUD_ORDER       (PUD_SHIFT - PAGE_SHIFT)
          |                          ^~~~~~~~~
    ...linux/mm/debug.c:128:35: note: in expansion of macro 'PUD_ORDER'
      128 |                 if (idx < (1UL << PUD_ORDER)) {
          |                                   ^~~~~~~~~
    ...linux/include/asm-generic/pgtable-nop4d.h:11:33: note: each undeclared identifier is reported only once for each function it appears in
       11 | #define P4D_SHIFT               PGDIR_SHIFT
          |                                 ^~~~~~~~~~~
    ...linux/include/asm-generic/pgtable-nopud.h:18:25: note: in expansion of macro 'P4D_SHIFT'
       18 | #define PUD_SHIFT       P4D_SHIFT
          |                         ^~~~~~~~~
    ...linux/include/linux/pgtable.h:9:26: note: in expansion of macro 'PUD_SHIFT'
        9 | #define PUD_ORDER       (PUD_SHIFT - PAGE_SHIFT)
          |                          ^~~~~~~~~
    ...linux/mm/debug.c:128:35: note: in expansion of macro 'PUD_ORDER'
      128 |                 if (idx < (1UL << PUD_ORDER)) {
          |                                   ^~~~~~~~~

Thanks,
SJ

>  
>  void dump_page(struct page *page, const char *reason)
> -- 
> 2.43.0
Matthew Wilcox (Oracle) Feb. 29, 2024, 4:37 a.m. UTC | #2
On Wed, Feb 28, 2024 at 01:34:58PM -0800, SeongJae Park wrote:
>     ...linux/mm/debug.c: In function '__dump_page':
>     ...linux/include/asm-generic/pgtable-nop4d.h:11:33: error: 'PGDIR_SHIFT' undeclared (first use in this function); did you mean 'PUD_SHIFT'?
>        11 | #define P4D_SHIFT               PGDIR_SHIFT
>           |                                 ^~~~~~~~~~~
>     ...linux/include/asm-generic/pgtable-nopud.h:18:25: note: in expansion of macro 'P4D_SHIFT'
>        18 | #define PUD_SHIFT       P4D_SHIFT
>           |                         ^~~~~~~~~
>     ...linux/include/linux/pgtable.h:9:26: note: in expansion of macro 'PUD_SHIFT'
>         9 | #define PUD_ORDER       (PUD_SHIFT - PAGE_SHIFT)
>           |                          ^~~~~~~~~
>     ...linux/mm/debug.c:128:35: note: in expansion of macro 'PUD_ORDER'
>       128 |                 if (idx < (1UL << PUD_ORDER)) {

Thanks.  Can you try this?

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 49d87a4d29b9..e25e86b755be 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2078,6 +2078,13 @@ static inline long folio_nr_pages(struct folio *folio)
 #endif
 }
 
+/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+#define MAX_FOLIO_NR_PAGES	(1UL << PUD_ORDER)
+#else
+#define MAX_FOLIO_NR_PAGES	MAX_ORDER_NR_PAGES
+#endif
+
 /*
  * compound_nr() returns the number of pages in this potentially compound
  * page.  compound_nr() can be called on a tail page, and is defined to
diff --git a/mm/debug.c b/mm/debug.c
index 6149944016a7..32ac7d79fd04 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -125,7 +125,7 @@ static void __dump_page(const struct page *page)
 	foliop = page_folio(&precise);
 	idx = folio_page_idx(foliop, page);
 	if (idx != 0) {
-		if (idx < (1UL << PUD_ORDER)) {
+		if (idx < MAX_FOLIO_NR_PAGES) {
 			memcpy(&folio, foliop, 2 * sizeof(struct page));
 			nr_pages = folio_nr_pages(&folio);
 		}
SeongJae Park Feb. 29, 2024, 5:05 a.m. UTC | #3
On Thu, 29 Feb 2024 04:37:31 +0000 Matthew Wilcox <willy@infradead.org> wrote:

> On Wed, Feb 28, 2024 at 01:34:58PM -0800, SeongJae Park wrote:
> >     ...linux/mm/debug.c: In function '__dump_page':
> >     ...linux/include/asm-generic/pgtable-nop4d.h:11:33: error: 'PGDIR_SHIFT' undeclared (first use in this function); did you mean 'PUD_SHIFT'?
> >        11 | #define P4D_SHIFT               PGDIR_SHIFT
> >           |                                 ^~~~~~~~~~~
> >     ...linux/include/asm-generic/pgtable-nopud.h:18:25: note: in expansion of macro 'P4D_SHIFT'
> >        18 | #define PUD_SHIFT       P4D_SHIFT
> >           |                         ^~~~~~~~~
> >     ...linux/include/linux/pgtable.h:9:26: note: in expansion of macro 'PUD_SHIFT'
> >         9 | #define PUD_ORDER       (PUD_SHIFT - PAGE_SHIFT)
> >           |                          ^~~~~~~~~
> >     ...linux/mm/debug.c:128:35: note: in expansion of macro 'PUD_ORDER'
> >       128 |                 if (idx < (1UL << PUD_ORDER)) {
> 
> Thanks.  Can you try this?
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 49d87a4d29b9..e25e86b755be 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2078,6 +2078,13 @@ static inline long folio_nr_pages(struct folio *folio)
>  #endif
>  }
>  
> +/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
> +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
> +#define MAX_FOLIO_NR_PAGES	(1UL << PUD_ORDER)
> +#else
> +#define MAX_FOLIO_NR_PAGES	MAX_ORDER_NR_PAGES
> +#endif
> +
>  /*
>   * compound_nr() returns the number of pages in this potentially compound
>   * page.  compound_nr() can be called on a tail page, and is defined to
> diff --git a/mm/debug.c b/mm/debug.c
> index 6149944016a7..32ac7d79fd04 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -125,7 +125,7 @@ static void __dump_page(const struct page *page)
>  	foliop = page_folio(&precise);
>  	idx = folio_page_idx(foliop, page);
>  	if (idx != 0) {
> -		if (idx < (1UL << PUD_ORDER)) {
> +		if (idx < MAX_FOLIO_NR_PAGES) {
>  			memcpy(&folio, foliop, 2 * sizeof(struct page));
>  			nr_pages = folio_nr_pages(&folio);
>  		}

Thank you for this fast and kind reply.  I confirmed this fixes my issue :)

Tested-by: SeongJae Park <sj@kernel.org>


Thanks,
SJ
Ryan Roberts March 1, 2024, 10:21 a.m. UTC | #4
Hi Matthew,

On 27/02/2024 19:23, Matthew Wilcox (Oracle) wrote:
> Turn __dump_page() into a wrapper around __dump_folio().  Snapshot the
> page & folio into a stack variable so we don't hit BUG_ON() if an
> allocation is freed under us and what was a folio pointer becomes a
> pointer to a tail page.

I'm seeing a couple of panics caused by this patch. I already raised the first one at [1]; it looks like a bug in Ard's patch (for which he now has a proposed fix) provokes the bug in this one.

[1] https://lore.kernel.org/linux-arm-kernel/fc691e8d-1a50-4be6-a3b2-d60d6f2e2487@arm.com/

The other way to trigger it is to run the mm kselftest:

  gup_test -ct -F 0x1 0 19 0x1000

This calls into the kernel and deliberately calls dump_page() via dump_pages_test() in gup_test.c.

Panic is as follows (root cause identified below):

[   22.994800] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008
[   22.995428] Mem abort info:
[   22.995617]   ESR = 0x0000000096000005
[   22.995867]   EC = 0x25: DABT (current EL), IL = 32 bits
[   22.996215]   SET = 0, FnV = 0
[   22.996419]   EA = 0, S1PTW = 0
[   22.996628]   FSC = 0x05: level 1 translation fault
[   22.996951] Data abort info:
[   22.997145]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
[   22.997541]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[   22.998025]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[   22.998438] user pgtable: 4k pages, 39-bit VAs, pgdp=000000019f2d6000
[   22.998937] [0000000000000008] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
[   22.999608] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP
[   23.000083] Modules linked in:
[   23.000319] CPU: 6 PID: 1222 Comm: gup_test Not tainted 6.8.0-rc6-00915-g7f43e0f76e47 #2
[   23.000883] Hardware name: linux,dummy-virt (DT)
[   23.001209] pstate: 20400005 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   23.001621] pc : get_pfnblock_flags_mask+0x3c/0x68
[   23.001929] lr : __dump_page+0x188/0x400
[   23.002168] sp : ffffffc0885ebb40
[   23.002370] x29: ffffffc0885ebb40 x28: 0000000000ffffc0 x27: 0000000000000000
[   23.002839] x26: 0000000000000000 x25: ffffffc0885ebba0 x24: 00000000ffffffff
[   23.003335] x23: ffffffeabbbc1000 x22: 0000000000000030 x21: ffffffeabb9f5e98
[   23.003869] x20: ffffffc0885ebba0 x19: ffffffc0885ebba0 x18: ffffffffffffffff
[   23.004489] x17: 34383833383a6465 x16: 7070616d5f736567 x15: ffffffeabd058782
[   23.004989] x14: 0000000000000000 x13: 3030303730303039 x12: 3138666666666666
[   23.005501] x11: fffffffffffe0000 x10: ffffffeabca9c018 x9 : ffffffeab9f0c798
[   23.005980] x8 : 00000000ffffefff x7 : ffffffeabca9c018 x6 : 0000000000000000
[   23.006448] x5 : 000003fffffffc28 x4 : 0001fffffffe144a x3 : 0000000000000000
[   23.006931] x2 : 0000000000000007 x1 : ffffffff0a257aee x0 : 00000000001fffff
[   23.007436] Call trace:
[   23.007623]  get_pfnblock_flags_mask+0x3c/0x68
[   23.007928]  dump_page+0x2c/0x70
[   23.008156]  gup_test_ioctl+0xb34/0xc40
[   23.008416]  __arm64_sys_ioctl+0xb0/0x100
[   23.008694]  invoke_syscall+0x50/0x128
[   23.008944]  el0_svc_common.constprop.0+0x48/0xf8
[   23.009259]  do_el0_svc+0x28/0x40
[   23.009499]  el0_svc+0x34/0xb8
[   23.009720]  el0t_64_sync_handler+0x13c/0x158
[   23.010029]  el0t_64_sync+0x190/0x198
[   23.010293] Code: d37b1884 f100007f 8b040064 9a831083 (f9400460) 
[   23.010714] ---[ end trace 0000000000000000 ]---


> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  mm/debug.c | 120 +++++++++++++++++++++++++++++------------------------
>  1 file changed, 66 insertions(+), 54 deletions(-)
> 
> diff --git a/mm/debug.c b/mm/debug.c
> index ee533a5ceb79..96555fc78f1a 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -51,84 +51,96 @@ const struct trace_print_flags vmaflag_names[] = {
>  	{0, NULL}
>  };
>  
> -static void __dump_page(struct page *page)
> +static void __dump_folio(struct folio *folio, struct page *page,
> +		unsigned long pfn, unsigned long idx)
>  {
> -	struct folio *folio = page_folio(page);
> -	struct page *head = &folio->page;
> -	struct address_space *mapping;
> -	bool compound = PageCompound(page);
> -	/*
> -	 * Accessing the pageblock without the zone lock. It could change to
> -	 * "isolate" again in the meantime, but since we are just dumping the
> -	 * state for debugging, it should be fine to accept a bit of
> -	 * inaccuracy here due to racing.
> -	 */
> -	bool page_cma = is_migrate_cma_page(page);
> -	int mapcount;
> +	struct address_space *mapping = folio_mapping(folio);
> +	bool page_cma;
> +	int mapcount = 0;
>  	char *type = "";
>  
> -	if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
> -		/*
> -		 * Corrupt page, so we cannot call page_mapping. Instead, do a
> -		 * safe subset of the steps that page_mapping() does. Caution:
> -		 * this will be misleading for tail pages, PageSwapCache pages,
> -		 * and potentially other situations. (See the page_mapping()
> -		 * implementation for what's missing here.)
> -		 */
> -		unsigned long tmp = (unsigned long)page->mapping;
> -
> -		if (tmp & PAGE_MAPPING_ANON)
> -			mapping = NULL;
> -		else
> -			mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
> -		head = page;
> -		folio = (struct folio *)page;
> -		compound = false;
> -	} else {
> -		mapping = page_mapping(page);
> -	}
> -
>  	/*
> -	 * Avoid VM_BUG_ON() in page_mapcount().
> -	 * page->_mapcount space in struct page is used by sl[aou]b pages to
> -	 * encode own info.
> +	 * page->_mapcount space in struct page is used by slab pages to
> +	 * encode own info, and we must avoid calling page_folio() again.
>  	 */
> -	mapcount = PageSlab(head) ? 0 : page_mapcount(page);
> -
> -	pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
> -			page, page_ref_count(head), mapcount, mapping,
> -			page_to_pgoff(page), page_to_pfn(page));
> -	if (compound) {
> -		pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
> -				head, compound_order(head),
> +	if (!folio_test_slab(folio)) {
> +		mapcount = atomic_read(&page->_mapcount) + 1;
> +		if (folio_test_large(folio))
> +			mapcount += folio_entire_mapcount(folio);
> +	}
> +
> +	pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
> +			folio_ref_count(folio), mapcount, mapping,
> +			folio->index + idx, pfn);
> +	if (folio_test_large(folio)) {
> +		pr_warn("head: order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
> +				folio_order(folio),
>  				folio_entire_mapcount(folio),
>  				folio_nr_pages_mapped(folio),
>  				atomic_read(&folio->_pincount));
>  	}
>  
>  #ifdef CONFIG_MEMCG
> -	if (head->memcg_data)
> -		pr_warn("memcg:%lx\n", head->memcg_data);
> +	if (folio->memcg_data)
> +		pr_warn("memcg:%lx\n", folio->memcg_data);
>  #endif
> -	if (PageKsm(page))
> +	if (folio_test_ksm(folio))
>  		type = "ksm ";
> -	else if (PageAnon(page))
> +	else if (folio_test_anon(folio))
>  		type = "anon ";
>  	else if (mapping)
>  		dump_mapping(mapping);
>  	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
>  
> -	pr_warn("%sflags: %pGp%s\n", type, &head->flags,
> +	/*
> +	 * Accessing the pageblock without the zone lock. It could change to
> +	 * "isolate" again in the meantime, but since we are just dumping the
> +	 * state for debugging, it should be fine to accept a bit of
> +	 * inaccuracy here due to racing.
> +	 */
> +	page_cma = is_migrate_cma_page(page);

Problem is here: is_migrate_cma_page() is a macro that resolves to this:

page_cma = get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) == MIGRATE_CMA;

And since page is on the stack, page_to_pfn() gives a very wrong answer.
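
For context, with a vmemmap-based memory model, page_to_pfn() is pure
pointer arithmetic against the virtually contiguous memmap; simplified
from include/asm-generic/memory_model.h (CONFIG_SPARSEMEM_VMEMMAP case):

	/* the pfn is derived from the struct page's address alone */
	#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)

"precise" is on the stack, far outside the memmap, so the subtraction
yields a wild pfn, and get_pfnblock_flags_mask() then indexes a bogus
pageblock bitmap, hence the fault above.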

I confirmed that the problem goes away for both cases above, when changing the line to:

page_cma = get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK) == MIGRATE_CMA;

Thanks,
Ryan

> +	pr_warn("%sflags: %pGp%s\n", type, &folio->flags,
>  		page_cma ? " CMA" : "");
> -	pr_warn("page_type: %pGt\n", &head->page_type);
> +	pr_warn("page_type: %pGt\n", &folio->page.page_type);
>  
>  	print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
>  			sizeof(unsigned long), page,
>  			sizeof(struct page), false);
> -	if (head != page)
> +	if (folio_test_large(folio))
>  		print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
> -			sizeof(unsigned long), head,
> -			sizeof(struct page), false);
> +			sizeof(unsigned long), folio,
> +			2 * sizeof(struct page), false);
> +}
> +
> +static void __dump_page(const struct page *page)
> +{
> +	struct folio *foliop, folio;
> +	struct page precise;
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long idx, nr_pages = 1;
> +	int loops = 5;
> +
> +again:
> +	memcpy(&precise, page, sizeof(*page));
> +	foliop = page_folio(&precise);
> +	idx = folio_page_idx(foliop, page);
> +	if (idx != 0) {
> +		if (idx < (1UL << PUD_ORDER)) {
> +			memcpy(&folio, foliop, 2 * sizeof(struct page));
> +			nr_pages = folio_nr_pages(&folio);
> +		}
> +
> +		if (idx > nr_pages) {
> +			if (loops-- > 0)
> +				goto again;
> +			printk("page does not match folio\n");
> +			precise.compound_head &= ~1UL;
> +			foliop = (struct folio *)&precise;
> +			idx = 0;
> +		}
> +	}
> +
> +	__dump_folio(foliop, &precise, pfn, idx);
>  }
>  
>  void dump_page(struct page *page, const char *reason)
Matthew Wilcox (Oracle) March 1, 2024, 9:32 p.m. UTC | #5
On Fri, Mar 01, 2024 at 10:21:10AM +0000, Ryan Roberts wrote:
> > +	page_cma = is_migrate_cma_page(page);
> 
> Problem is here: is_migrate_cma_page() is a macro that resolves to this:

Ah, yeah, maybe somebody should be testing with CONFIG_CMA enabled.
Ahem.

> page_cma = get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) == MIGRATE_CMA;
> 
> And since page is on the stack, page_to_pfn() gives a very wrong answer.
> 
> I confirmed that the problem goes away for both cases above, when changing the line to:
> 
> page_cma = get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK) == MIGRATE_CMA;

Thanks!  I think what we end up wanting is ...

From f005189ad418d05d168c0ff00daecc2a444733ef Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 1 Mar 2024 16:11:20 -0500
Subject: [PATCH] mm: Fix __dump_folio

Ryan Roberts reports that (if you have CONFIG_CMA enabled), calling
__dump_folio() will panic as we call is_migrate_cma_page() with a
stack copy of struct page, which gets passed to page_to_pfn().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/mmzone.h | 3 +++
 mm/debug.c             | 4 +---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 633812a1d220..c11b7cde81ef 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -76,9 +76,12 @@ extern const char * const migratetype_names[MIGRATE_TYPES];
 #ifdef CONFIG_CMA
 #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
 #  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
+#  define is_migrate_cma_folio(folio, pfn)	(MIGRATE_CMA ==		\
+	get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK))
 #else
 #  define is_migrate_cma(migratetype) false
 #  define is_migrate_cma_page(_page) false
+#  define is_migrate_cma_folio(folio, pfn) false
 #endif
 
 static inline bool is_migrate_movable(int mt)
diff --git a/mm/debug.c b/mm/debug.c
index 32ac7d79fd04..e7aa8a9d5d86 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -55,7 +55,6 @@ static void __dump_folio(struct folio *folio, struct page *page,
 		unsigned long pfn, unsigned long idx)
 {
 	struct address_space *mapping = folio_mapping(folio);
-	bool page_cma;
 	int mapcount = 0;
 	char *type = "";
 
@@ -98,9 +97,8 @@ static void __dump_folio(struct folio *folio, struct page *page,
 	 * state for debugging, it should be fine to accept a bit of
 	 * inaccuracy here due to racing.
 	 */
-	page_cma = is_migrate_cma_page(page);
 	pr_warn("%sflags: %pGp%s\n", type, &folio->flags,
-		page_cma ? " CMA" : "");
+		is_migrate_cma_folio(folio, pfn) ? " CMA" : "");
 	pr_warn("page_type: %pGt\n", &folio->page.page_type);
 
 	print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
Matthew Wilcox (Oracle) March 4, 2024, 7:02 p.m. UTC | #6
Further testing revealed some more problems.  We were getting confused
between the various pointers, leading to spurious "page does not match
folio" messages and to passing the wrong pointer to __dump_folio().
Here's a fix-3 patch which I tested like so:

+static int __init page_dump(void)
+{
+       struct page *page;
+
+       printk("testing page dump\n");
+
+       page = alloc_page(GFP_KERNEL);
+       dump_page(page, "single");
+       put_page(page);
+       page = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);
+       dump_page(page, "head");
+       dump_page(page + 1, "tail 1");
+       dump_page(page + 2, "tail 2");
+       dump_page(page + 3, "tail 3");
+       put_page(page);
+
+       return 0;
+}
+
+module_init(page_dump);

(this needed some extra debug output to check the values of the pointers
being passed to __dump_folio(), which we wouldn't want to include)

---
 mm/debug.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/mm/debug.c b/mm/debug.c
index e7aa8a9d5d86..4dae73bc4530 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -121,23 +121,31 @@ static void __dump_page(const struct page *page)
 again:
 	memcpy(&precise, page, sizeof(*page));
 	foliop = page_folio(&precise);
-	idx = folio_page_idx(foliop, page);
-	if (idx != 0) {
-		if (idx < MAX_FOLIO_NR_PAGES) {
-			memcpy(&folio, foliop, 2 * sizeof(struct page));
-			nr_pages = folio_nr_pages(&folio);
-		}
+	if (foliop == (struct folio *)&precise) {
+		idx = 0;
+		if (!folio_test_large(foliop))
+			goto dump;
+		foliop = (struct folio *)page;
+	} else {
+		idx = folio_page_idx(foliop, page);
+	}
 
-		if (idx > nr_pages) {
-			if (loops-- > 0)
-				goto again;
-			printk("page does not match folio\n");
-			precise.compound_head &= ~1UL;
-			foliop = (struct folio *)&precise;
-			idx = 0;
-		}
+	if (idx < MAX_FOLIO_NR_PAGES) {
+		memcpy(&folio, foliop, 2 * sizeof(struct page));
+		nr_pages = folio_nr_pages(&folio);
+		foliop = &folio;
+	}
+
+	if (idx > nr_pages) {
+		if (loops-- > 0)
+			goto again;
+		printk("page does not match folio\n");
+		precise.compound_head &= ~1UL;
+		foliop = (struct folio *)&precise;
+		idx = 0;
 	}
 
+dump:
 	__dump_folio(foliop, &precise, pfn, idx);
 }
Kees Cook May 14, 2024, 4:33 a.m. UTC | #7
Hi!

While working on testing an improved -Warray-bounds in GCC, I encountered
this, which seems to be reasonable:

In file included from ./arch/x86/include/generated/asm/rwonce.h:1,
                 from ../include/linux/compiler.h:299,
                 from ../include/linux/array_size.h:5,
                 from ../include/linux/kernel.h:16,
                 from ../mm/debug.c:9:
In function 'page_fixed_fake_head',
    inlined from '_compound_head' at ../include/linux/page-flags.h:251:24,
    inlined from '__dump_page' at ../mm/debug.c:123:11:
../include/asm-generic/rwonce.h:44:26: warning: array subscript 9 is outside array bounds of 'struct page[1]' [-Warray-bounds=]
   44 | #define __READ_ONCE(x)  (*(const volatile __unqual_scalar_typeof(x) *)&(x))
      |                         ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../include/asm-generic/rwonce.h:50:9: note: in expansion of macro '__READ_ONCE'
   50 |         __READ_ONCE(x);                                                 \
      |         ^~~~~~~~~~~
../include/linux/page-flags.h:226:38: note: in expansion of macro 'READ_ONCE'
  226 |                 unsigned long head = READ_ONCE(page[1].compound_head);
      |                                      ^~~~~~~~~
../mm/debug.c: In function '__dump_page':
../mm/debug.c:116:21: note: at offset 72 into object 'precise' of size 64
  116 |         struct page precise;
      |                     ^~~~~~~

(Not noted in this warning is that the code passes through page_folio()
_Generic macro.)

It doesn't like that it can see that "precise" is exactly one page, so
looking at page[1] later is going to freak out. I suspect this may be
"impossible" at run-time, but I'm not 100% sure. Regardless, the compiler
can't tell.
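
For reference, the page[1] read being flagged is the fake-head probe in
page_fixed_fake_head(), roughly (abridged; the exact shape depends on the
kernel version and CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP):

	static __always_inline const struct page *
	page_fixed_fake_head(const struct page *page)
	{
		if (!hugetlb_optimize_vmemmap_enabled())
			return page;
		if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) &&
		    test_bit(PG_head, &page->flags)) {
			/* the out-of-bounds read GCC warns about */
			unsigned long head = READ_ONCE(page[1].compound_head);

			if (likely(head & 1))
				return (const struct page *)(head - 1);
		}
		return page;
	}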

I suspect just making "precise" a two-page array would make this happy,
but it wasn't clear to me how the second page should be initialized.
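
Something like this, perhaps (untested sketch):

	struct page precise[2];

	memcpy(&precise[0], page, sizeof(*page));
	/*
	 * page_fixed_fake_head()'s read of page[1].compound_head is now
	 * in bounds as far as GCC is concerned, but precise[1] holds
	 * stack garbage unless we also snapshot the real tail page.
	 */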

-Kees

--
Kees Cook
Matthew Wilcox (Oracle) May 14, 2024, 4:53 a.m. UTC | #8
On Mon, May 13, 2024 at 09:33:57PM -0700, Kees Cook wrote:
> Hi!
> 
> While working on testing an improved -Warray-bounds in GCC, I encountered
> this, which seems to be reasonable:

Eek.  I think you're right.  This is a bad interaction between the page
dumping code and the fixed fake head code.  I will need to think about
this (and LSFMM is happening right now, so I don't necessarily have a
lot of time to think).  I'll get back to you as soon as I can.

> In file included from ./arch/x86/include/generated/asm/rwonce.h:1,
>                  from ../include/linux/compiler.h:299,
>                  from ../include/linux/array_size.h:5,
>                  from ../include/linux/kernel.h:16,
>                  from ../mm/debug.c:9:
> In function 'page_fixed_fake_head',
>     inlined from '_compound_head' at ../include/linux/page-flags.h:251:24,
>     inlined from '__dump_page' at ../mm/debug.c:123:11:
> ../include/asm-generic/rwonce.h:44:26: warning: array subscript 9 is outside array bounds of 'struct page[1]' [-Warray-bounds=]
>    44 | #define __READ_ONCE(x)  (*(const volatile __unqual_scalar_typeof(x) *)&(x))
>       |                         ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> ../include/asm-generic/rwonce.h:50:9: note: in expansion of macro '__READ_ONCE'
>    50 |         __READ_ONCE(x);                                                 \
>       |         ^~~~~~~~~~~
> ../include/linux/page-flags.h:226:38: note: in expansion of macro 'READ_ONCE'
>   226 |                 unsigned long head = READ_ONCE(page[1].compound_head);
>       |                                      ^~~~~~~~~
> ../mm/debug.c: In function '__dump_page':
> ../mm/debug.c:116:21: note: at offset 72 into object 'precise' of size 64
>   116 |         struct page precise;
>       |                     ^~~~~~~
> 
> (Not noted in this warning is that the code passes through page_folio()
> _Generic macro.)
> 
> It doesn't like that it can see that "precise" is exactly one page, so
> looking at page[1] later is going to freak out. I suspect this may be
> "impossible" at run-time, but I'm not 100% sure. Regardless, the compiler
> can't tell.
> 
> I suspect just making precise be a 2 page array would make this happy,
> but it wasn't clear to me how such a page should be initialized.
> 
> -Kees
> 
> --
> Kees Cook
Matthew Wilcox (Oracle) May 14, 2024, 2:25 p.m. UTC | #9
On Mon, May 13, 2024 at 09:33:57PM -0700, Kees Cook wrote:
> In function 'page_fixed_fake_head',
>     inlined from '_compound_head' at ../include/linux/page-flags.h:251:24,
>     inlined from '__dump_page' at ../mm/debug.c:123:11:
> ../include/asm-generic/rwonce.h:44:26: warning: array subscript 9 is outside array bounds of 'struct page[1]' [-Warray-bounds=]
> 
> (Not noted in this warning is that the code passes through page_folio()
> _Generic macro.)
> 
> It doesn't like that it can see that "precise" is exactly one page, so
> looking at page[1] later is going to freak out. I suspect this may be
> "impossible" at run-time, but I'm not 100% sure. Regardless, the compiler
> can't tell.

Actually, I'm not sure that I can tell that it's impossible.

I think we just need to open-code page_folio() here so that we don't
get into the fixed fake head palaver.  Something like this, although
it's only compile-tested.


diff --git a/mm/debug.c b/mm/debug.c
index e3ff3ac19fa1..47ba8b0a4872 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -110,19 +110,22 @@ static void __dump_page(const struct page *page)
 {
 	struct folio *foliop, folio;
 	struct page precise;
+	unsigned long head;
 	unsigned long pfn = page_to_pfn(page);
 	unsigned long idx, nr_pages = 1;
 	int loops = 5;
 
 again:
 	memcpy(&precise, page, sizeof(*page));
-	foliop = page_folio(&precise);
-	if (foliop == (struct folio *)&precise) {
+	head = precise.compound_head;
+	if ((head & 1) == 0) {
+		foliop = (struct folio *)&precise;
 		idx = 0;
 		if (!folio_test_large(foliop))
 			goto dump;
 		foliop = (struct folio *)page;
 	} else {
+		foliop = (struct folio *)(head - 1);
 		idx = folio_page_idx(foliop, page);
 	}

Patch

diff --git a/mm/debug.c b/mm/debug.c
index ee533a5ceb79..96555fc78f1a 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -51,84 +51,96 @@  const struct trace_print_flags vmaflag_names[] = {
 	{0, NULL}
 };
 
-static void __dump_page(struct page *page)
+static void __dump_folio(struct folio *folio, struct page *page,
+		unsigned long pfn, unsigned long idx)
 {
-	struct folio *folio = page_folio(page);
-	struct page *head = &folio->page;
-	struct address_space *mapping;
-	bool compound = PageCompound(page);
-	/*
-	 * Accessing the pageblock without the zone lock. It could change to
-	 * "isolate" again in the meantime, but since we are just dumping the
-	 * state for debugging, it should be fine to accept a bit of
-	 * inaccuracy here due to racing.
-	 */
-	bool page_cma = is_migrate_cma_page(page);
-	int mapcount;
+	struct address_space *mapping = folio_mapping(folio);
+	bool page_cma;
+	int mapcount = 0;
 	char *type = "";
 
-	if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
-		/*
-		 * Corrupt page, so we cannot call page_mapping. Instead, do a
-		 * safe subset of the steps that page_mapping() does. Caution:
-		 * this will be misleading for tail pages, PageSwapCache pages,
-		 * and potentially other situations. (See the page_mapping()
-		 * implementation for what's missing here.)
-		 */
-		unsigned long tmp = (unsigned long)page->mapping;
-
-		if (tmp & PAGE_MAPPING_ANON)
-			mapping = NULL;
-		else
-			mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
-		head = page;
-		folio = (struct folio *)page;
-		compound = false;
-	} else {
-		mapping = page_mapping(page);
-	}
-
 	/*
-	 * Avoid VM_BUG_ON() in page_mapcount().
-	 * page->_mapcount space in struct page is used by sl[aou]b pages to
-	 * encode own info.
+	 * page->_mapcount space in struct page is used by slab pages to
+	 * encode own info, and we must avoid calling page_folio() again.
 	 */
-	mapcount = PageSlab(head) ? 0 : page_mapcount(page);
-
-	pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
-			page, page_ref_count(head), mapcount, mapping,
-			page_to_pgoff(page), page_to_pfn(page));
-	if (compound) {
-		pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
-				head, compound_order(head),
+	if (!folio_test_slab(folio)) {
+		mapcount = atomic_read(&page->_mapcount) + 1;
+		if (folio_test_large(folio))
+			mapcount += folio_entire_mapcount(folio);
+	}
+
+	pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
+			folio_ref_count(folio), mapcount, mapping,
+			folio->index + idx, pfn);
+	if (folio_test_large(folio)) {
+		pr_warn("head: order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
+				folio_order(folio),
 				folio_entire_mapcount(folio),
 				folio_nr_pages_mapped(folio),
 				atomic_read(&folio->_pincount));
 	}
 
 #ifdef CONFIG_MEMCG
-	if (head->memcg_data)
-		pr_warn("memcg:%lx\n", head->memcg_data);
+	if (folio->memcg_data)
+		pr_warn("memcg:%lx\n", folio->memcg_data);
 #endif
-	if (PageKsm(page))
+	if (folio_test_ksm(folio))
 		type = "ksm ";
-	else if (PageAnon(page))
+	else if (folio_test_anon(folio))
 		type = "anon ";
 	else if (mapping)
 		dump_mapping(mapping);
 	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
 
-	pr_warn("%sflags: %pGp%s\n", type, &head->flags,
+	/*
+	 * Accessing the pageblock without the zone lock. It could change to
+	 * "isolate" again in the meantime, but since we are just dumping the
+	 * state for debugging, it should be fine to accept a bit of
+	 * inaccuracy here due to racing.
+	 */
+	page_cma = is_migrate_cma_page(page);
+	pr_warn("%sflags: %pGp%s\n", type, &folio->flags,
 		page_cma ? " CMA" : "");
-	pr_warn("page_type: %pGt\n", &head->page_type);
+	pr_warn("page_type: %pGt\n", &folio->page.page_type);
 
 	print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
 			sizeof(unsigned long), page,
 			sizeof(struct page), false);
-	if (head != page)
+	if (folio_test_large(folio))
 		print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
-			sizeof(unsigned long), head,
-			sizeof(struct page), false);
+			sizeof(unsigned long), folio,
+			2 * sizeof(struct page), false);
+}
+
+static void __dump_page(const struct page *page)
+{
+	struct folio *foliop, folio;
+	struct page precise;
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long idx, nr_pages = 1;
+	int loops = 5;
+
+again:
+	memcpy(&precise, page, sizeof(*page));
+	foliop = page_folio(&precise);
+	idx = folio_page_idx(foliop, page);
+	if (idx != 0) {
+		if (idx < (1UL << PUD_ORDER)) {
+			memcpy(&folio, foliop, 2 * sizeof(struct page));
+			nr_pages = folio_nr_pages(&folio);
+		}
+
+		if (idx > nr_pages) {
+			if (loops-- > 0)
+				goto again;
+			printk("page does not match folio\n");
+			precise.compound_head &= ~1UL;
+			foliop = (struct folio *)&precise;
+			idx = 0;
+		}
+	}
+
+	__dump_folio(foliop, &precise, pfn, idx);
 }
 
 void dump_page(struct page *page, const char *reason)