diff mbox series

[Resend,RFC] mm: introduce __GFP_TRACKLEAK to track in-kernel allocation

Message ID 1662116347-17649-1-git-send-email-zhaoyang.huang@unisoc.com (mailing list archive)
State New
Headers show
Series [Resend,RFC] mm: introduce __GFP_TRACKLEAK to track in-kernel allocation | expand

Commit Message

zhaoyang.huang Sept. 2, 2022, 10:59 a.m. UTC
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

Kthreads and drivers can fetch memory via alloc_pages directly, which makes them
hard to debug when leaking. Solve this by introducing __GFP_TRACKLEAK, reusing the
kmemleak mechanism, which already unifies tracking of most kernel-consumed pages.

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 include/linux/gfp.h        | 4 +++-
 include/linux/page-flags.h | 3 +++
 mm/kmemleak.c              | 2 +-
 mm/page_alloc.c            | 6 ++++++
 4 files changed, 13 insertions(+), 2 deletions(-)

Comments

Andrew Morton Sept. 2, 2022, 6:58 p.m. UTC | #1
Cc willy for page-flags changes.

On Fri, 2 Sep 2022 18:59:07 +0800 "zhaoyang.huang" <zhaoyang.huang@unisoc.com> wrote:

> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> 
> Kthread and drivers could fetch memory via alloc_pages directly which make them
> hard to debug when leaking. Solve this by introducing __GFP_TRACELEAK and reuse
> kmemleak mechanism which unified most of kernel cosuming pages into kmemleak.
> 
> ...
>

> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 2d2ccae..081ab54 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -68,6 +68,7 @@
>  #else
>  #define ___GFP_NOLOCKDEP	0
>  #endif
> +#define ___GFP_TRACKLEAK	0x10000000u
>  /* If the above are modified, __GFP_BITS_SHIFT may need updating */
>  
>  /*
> @@ -259,12 +260,13 @@
>  #define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
>  #define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
>  #define __GFP_SKIP_KASAN_POISON   ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
> +#define __GFP_TRACKLEAK   ((__force gfp_t)___GFP_TRACKLEAK)
>  
>  /* Disable lockdep for GFP context tracking */
>  #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
>  
>  /* Room for N __GFP_FOO bits */
> -#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
> +#define __GFP_BITS_SHIFT (28 + IS_ENABLED(CONFIG_LOCKDEP))
>  #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
>  
>  /**
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index e66f7aa..ef0f814 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -942,6 +942,7 @@ static inline bool is_page_hwpoison(struct page *page)
>  #define PG_offline	0x00000100
>  #define PG_table	0x00000200
>  #define PG_guard	0x00000400
> +#define PG_trackleak	0x00000800
>  
>  #define PageType(page, flag)						\
>  	((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
> @@ -1012,6 +1013,8 @@ static inline int page_has_type(struct page *page)
>   */
>  PAGE_TYPE_OPS(Guard, guard)
>  
> +PAGE_TYPE_OPS(Trackleak, trackleak)

We'd want this to evaluate to zero at compile time if
CONFIG_HAVE_DEBUG_KMEMLEAK=n

>  extern bool is_free_buddy_page(struct page *page);
>  
>  PAGEFLAG(Isolated, isolated, PF_ANY);
> diff --git a/mm/kmemleak.c b/mm/kmemleak.c
> index 422f28f..a182f5d 100644
> --- a/mm/kmemleak.c
> +++ b/mm/kmemleak.c
> @@ -1471,7 +1471,7 @@ static void kmemleak_scan(void)
>  			if (page_zone(page) != zone)
>  				continue;
>  			/* only scan if page is in use */
> -			if (page_count(page) == 0 || PageReserved(page))
> +			if (page_count(page) == 0)

Please changelog this alteration.

>  				continue;
>  			scan_block(page, page + 1, NULL);
>  			if (!(pfn & 63))
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e008a3d..d8995c6 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1361,6 +1361,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
>  		page->mapping = NULL;
>  	if (memcg_kmem_enabled() && PageMemcgKmem(page))
>  		__memcg_kmem_uncharge_page(page, order);
> +	if (PageTrackleak(page))
> +		kmemleak_free(page);
>  	if (check_free)
>  		bad += check_free_page(page);
>  	if (bad)
> @@ -5444,6 +5446,10 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
>  		__free_pages(page, order);
>  		page = NULL;
>  	}
> +	if (gfp & __GFP_TRACKLEAK) {

And we'd want __GFP_TRACKLEAK to evaluate to zero at compile time if
CONFIG_HAVE_DEBUG_KMEMLEAK=n.

> +		kmemleak_alloc(page_address(page), PAGE_SIZE << order, 1, gfp & ~__GFP_TRACKLEAK);
> +		__SetPageTrackleak(page);
> +	}
>  
>  	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
>  
> -- 
> 1.9.1
Matthew Wilcox (Oracle) Sept. 2, 2022, 7:08 p.m. UTC | #2
On Fri, Sep 02, 2022 at 11:58:39AM -0700, Andrew Morton wrote:
> Cc willy for page-flags changes.

Thanks.  This is probably OK.  The biggest problem is that it won't
work for drivers which allocate memory and then map it to userspace.
If they try, they'll get a nice splat, but it may limit the usefulness
of this option.  We should probably document that limitation in this
patch.

> On Fri, 2 Sep 2022 18:59:07 +0800 "zhaoyang.huang" <zhaoyang.huang@unisoc.com> wrote:
> > +++ b/mm/page_alloc.c
> > @@ -1361,6 +1361,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
> >  		page->mapping = NULL;
> >  	if (memcg_kmem_enabled() && PageMemcgKmem(page))
> >  		__memcg_kmem_uncharge_page(page, order);
> > +	if (PageTrackleak(page))
> > +		kmemleak_free(page);

Don't we also need to __ClearPageTrackleak()?

> > +	if (gfp & __GFP_TRACKLEAK) {
> 
> And we'd want __GFP_TRACKLEAK to evaluate to zero at compile time if
> CONFIG_HAVE_DEBUG_KMEMLEAK=n.
> 
> > +		kmemleak_alloc(page_address(page), PAGE_SIZE << order, 1, gfp & ~__GFP_TRACKLEAK);
> > +		__SetPageTrackleak(page);
> > +	}

We only set this on the first page we allocate.  I think there's a
problem for multi-page, non-compound allocations, no?  Particularly
when you consider the problem fixed in e320d3012d25.

I'm not opposed to this tracking, it just needs a bit more thought and
awareness of some of the corner cases of the VM.  A few test cases would
be nice; they could demonstrate that this works for both compound and
non-compound high-order allocations.
Zhaoyang Huang Sept. 6, 2022, 7:29 a.m. UTC | #3
On Sat, Sep 3, 2022 at 3:08 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, Sep 02, 2022 at 11:58:39AM -0700, Andrew Morton wrote:
> > Cc willy for page-flags changes.
>
> Thanks.  This is probably OK.  The biggest problem is that it won't
> work for drivers which allocate memory and then map it to userspace.
> If they try, they'll get a nice splat, but it may limit the usefulness
> of this option.  We should probably document that limitation in this
> patch.
>
> > On Fri, 2 Sep 2022 18:59:07 +0800 "zhaoyang.huang" <zhaoyang.huang@unisoc.com> wrote:
> > > +++ b/mm/page_alloc.c
> > > @@ -1361,6 +1361,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
> > >             page->mapping = NULL;
> > >     if (memcg_kmem_enabled() && PageMemcgKmem(page))
> > >             __memcg_kmem_uncharge_page(page, order);
> > > +   if (PageTrackleak(page))
> > > +           kmemleak_free(page);
>
> Don't we also need to __ClearPageTrackleak()?
ok
>
> > > +   if (gfp & __GFP_TRACKLEAK) {
> >
> > And we'd want __GFP_TRACKLEAK to evaluate to zero at compile time if
> > CONFIG_HAVE_DEBUG_KMEMLEAK=n.
> >
> > > +           kmemleak_alloc(page_address(page), PAGE_SIZE << order, 1, gfp & ~__GFP_TRACKLEAK);
> > > +           __SetPageTrackleak(page);
> > > +   }
>
> We only set this on the first page we allocate.  I think there's a
> problem for multi-page, non-compound allocations, no?  Particularly
> when you consider the problem fixed in e320d3012d25.
Please correct me if I am wrong. AFAICT, the leak_object is created
for tracking from page_address(page) to page_address(page) + PAGE_SIZE
<< order via checksum of the whole area while not referring to
page->cnt. Non-compound high order tail pages will not be checked
anymore since the leak_object has been freed together with head page
by put_page, whereas, it would not be a problem as the tail pages
should NOT be accessed also as they should be considered as buddy
pages, no?
>
> I'm not opposed to this tracking, it just needs a bit more thought and
> awareness of some of the corner cases of the VM.  A few test cases would
> be nice; they could demonstrate that this works for both compound and
> non-compound high-order allocations.
Catalin Marinas Sept. 7, 2022, 10 a.m. UTC | #4
On Fri, Sep 02, 2022 at 06:59:07PM +0800, zhaoyang.huang wrote:
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> 
> Kthread and drivers could fetch memory via alloc_pages directly which make them
> hard to debug when leaking. Solve this by introducing __GFP_TRACELEAK and reuse
> kmemleak mechanism which unified most of kernel cosuming pages into kmemleak.

This may be helpful for debugging individual drivers but they could as
well call kmemleak_alloc/free() directly and not bother with new GFP and
page flags.

I wonder whether we could go the other way around. Add a
__GFP_NOLEAKTRACE (we have SLAB_NOLEAKTRACE for example) and pass it in
the places where we don't want pages to be scanned/tracked: page cache
pages (too many and they don't store pointers to other kernel objects),
sl*b, CMA etc. allocations (basically in all places where you have
kmemleak_alloc() calls, otherwise the pointers overlap and confuse
kmemleak).
Zhaoyang Huang Sept. 12, 2022, 2:23 a.m. UTC | #5
On Wed, Sep 7, 2022 at 6:00 PM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Fri, Sep 02, 2022 at 06:59:07PM +0800, zhaoyang.huang wrote:
> > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> >
> > Kthread and drivers could fetch memory via alloc_pages directly which make them
> > hard to debug when leaking. Solve this by introducing __GFP_TRACELEAK and reuse
> > kmemleak mechanism which unified most of kernel cosuming pages into kmemleak.
>
> This may be helpful for debugging individual drivers but they could as
> well call kmemleak_alloc/free() directly and not bother with new GFP and
> page flags.
Sure, it could be done as you suggested. However, I would like to have
all memory related things wrapped together and leaving the user a
simple entrance. Besides, some drivers are designed in
Producer/Consumer mode where pages are got and freed by different
peers, which may lead to unpair kmemleak operation.
>
> I wonder whether we could go the other way around. Add a
> __GFP_NOLEAKTRACE (we have SLAB_NOLEAKTRACE for example) and pass it in
> the places where we don't want pages to be scanned/tracked: page cache
> pages (too many and they don't store pointers to other kernel objects),
> sl*b, CMA etc. allocations (basically in all places where you have
> kmemleak_alloc() calls, otherwise the pointers overlap and confuse
> kmemleak).
>
> --
> Catalin
diff mbox series

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 2d2ccae..081ab54 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -68,6 +68,7 @@ 
 #else
 #define ___GFP_NOLOCKDEP	0
 #endif
+#define ___GFP_TRACKLEAK	0x10000000u
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */
 
 /*
@@ -259,12 +260,13 @@ 
 #define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
 #define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
 #define __GFP_SKIP_KASAN_POISON   ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
+#define __GFP_TRACKLEAK   ((__force gfp_t)___GFP_TRACKLEAK)
 
 /* Disable lockdep for GFP context tracking */
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (28 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /**
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e66f7aa..ef0f814 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -942,6 +942,7 @@  static inline bool is_page_hwpoison(struct page *page)
 #define PG_offline	0x00000100
 #define PG_table	0x00000200
 #define PG_guard	0x00000400
+#define PG_trackleak	0x00000800
 
 #define PageType(page, flag)						\
 	((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@ -1012,6 +1013,8 @@  static inline int page_has_type(struct page *page)
  */
 PAGE_TYPE_OPS(Guard, guard)
 
+PAGE_TYPE_OPS(Trackleak, trackleak)
+
 extern bool is_free_buddy_page(struct page *page);
 
 PAGEFLAG(Isolated, isolated, PF_ANY);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 422f28f..a182f5d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1471,7 +1471,7 @@  static void kmemleak_scan(void)
 			if (page_zone(page) != zone)
 				continue;
 			/* only scan if page is in use */
-			if (page_count(page) == 0 || PageReserved(page))
+			if (page_count(page) == 0)
 				continue;
 			scan_block(page, page + 1, NULL);
 			if (!(pfn & 63))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e008a3d..d8995c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1361,6 +1361,8 @@  static __always_inline bool free_pages_prepare(struct page *page,
 		page->mapping = NULL;
 	if (memcg_kmem_enabled() && PageMemcgKmem(page))
 		__memcg_kmem_uncharge_page(page, order);
+	if (PageTrackleak(page))
+		kmemleak_free(page);
 	if (check_free)
 		bad += check_free_page(page);
 	if (bad)
@@ -5444,6 +5446,10 @@  struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 		__free_pages(page, order);
 		page = NULL;
 	}
+	if (gfp & __GFP_TRACKLEAK) {
+		kmemleak_alloc(page_address(page), PAGE_SIZE << order, 1, gfp & ~__GFP_TRACKLEAK);
+		__SetPageTrackleak(page);
+	}
 
 	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);