diff mbox series

[v4,1/4] mm/slub: enable debugging memory wasting of kmalloc

Message ID 20220829075618.69069-2-feng.tang@intel.com (mailing list archive)
State New
Headers show
Series mm/slub: some debug enhancements for kmalloc objects | expand

Commit Message

Feng Tang Aug. 29, 2022, 7:56 a.m. UTC
kmalloc's API family is critical for mm, with one nature that it will
round up the request size to a fixed one (mostly power of 2). Say
when user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes
could be allocated, so in worst case, there is around 50% memory
space waste.

The wastage is not a big issue for requests that get allocated/freed
quickly, but may cause problems with objects that have longer life
time.

We've met a kernel boot OOM panic (v5.10), and from the dumped slab
info:

    [   26.062145] kmalloc-2k            814056KB     814056KB

From debugging we found there are a huge number of 'struct iova_magazine',
whose size is 1032 bytes (1024 + 8), so each allocation will waste
1016 bytes. Though the issue was solved by giving the right (bigger)
size of RAM, it is still nice to optimize the size (either use a
kmalloc friendly size or create a dedicated slab for it).

And from the lkml archive, there was another crash kernel OOM case [1]
back in 2019, which seems to be related to a similar slab waste
situation, as the log is similar:

    [    4.332648] iommu: Adding device 0000:20:02.0 to group 16
    [    4.338946] swapper/0 invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=0, oom_score_adj=0
    ...
    [    4.857565] kmalloc-2048           59164KB      59164KB

The crash kernel only has 256M memory, and 59M is pretty big here.
(Note: the related code has been changed and optimised in recent
kernel [2], these logs are just picked to demo the problem, also
a patch changing its size to 1024 bytes has been merged)

So add a way to track each kmalloc's memory waste info, and
leverage the existing SLUB debug framework (specifically
SLUB_STORE_USER) to show its call stack of original allocation,
so that user can evaluate the waste situation, identify some hot
spots and optimize accordingly, for a better utilization of memory.

The waste info is integrated into existing interface:
'/sys/kernel/debug/slab/kmalloc-xx/alloc_traces', one example of
'kmalloc-4k' after boot is:

126 ixgbe_alloc_q_vector+0xa5/0x4a0 [ixgbe] waste=233856/1856 age=1493302/1493830/1494358 pid=1284 cpus=32 nodes=1
        __slab_alloc.isra.86+0x52/0x80
        __kmalloc_node+0x143/0x350
        ixgbe_alloc_q_vector+0xa5/0x4a0 [ixgbe]
        ixgbe_init_interrupt_scheme+0x1a6/0x730 [ixgbe]
        ixgbe_probe+0xc8e/0x10d0 [ixgbe]
        local_pci_probe+0x42/0x80
        work_for_cpu_fn+0x13/0x20
        process_one_work+0x1c5/0x390

which means in 'kmalloc-4k' slab, there are 126 requests of
2240 bytes which got a 4KB space (wasting 1856 bytes each
and 233856 bytes in total). And when the system starts some real
workload like multiple docker instances, there is more
severe waste.

[1]. https://lkml.org/lkml/2019/8/12/266
[2]. https://lore.kernel.org/lkml/2920df89-9975-5785-f79b-257d3052dfaf@huawei.com/

[Thanks Hyeonggon for pointing out several bugs about sorting/format]
[Thanks Vlastimil for suggesting way to reduce memory usage of
 orig_size and keep it only for kmalloc objects]

Signed-off-by: Feng Tang <feng.tang@intel.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 include/linux/slab.h |  2 +
 mm/slub.c            | 94 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 81 insertions(+), 15 deletions(-)

Comments

Hyeonggon Yoo Aug. 31, 2022, 2:52 p.m. UTC | #1
On Mon, Aug 29, 2022 at 03:56:15PM +0800, Feng Tang wrote:
> kmalloc's API family is critical for mm, with one nature that it will
> round up the request size to a fixed one (mostly power of 2). Say
> when user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes
> could be allocated, so in worst case, there is around 50% memory
> space waste.
> 

[...]

>  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> -			  unsigned long addr, struct kmem_cache_cpu *c)
> +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
>  {
>  	void *freelist;
>  	struct slab *slab;
> @@ -3115,6 +3158,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  
>  		if (s->flags & SLAB_STORE_USER)
>  			set_track(s, freelist, TRACK_ALLOC, addr);
> +		set_orig_size(s, freelist, orig_size);
>  
>  		return freelist;
>  	}
> @@ -3140,6 +3184,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		 */
>  		if (s->flags & SLAB_STORE_USER)
>  			set_track(s, freelist, TRACK_ALLOC, addr);
> +		set_orig_size(s, freelist, orig_size);
> +
>  		return freelist;
>  	}

Maybe we can move set_track() and set_orig_size() to after slab_post_alloc_hook().
Something like alloc/free hooks for debugging caches? (and drop the orig_size parameter.)

Thanks!

>  
> @@ -3182,7 +3228,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>   * pointer.
>   */
>  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> -			  unsigned long addr, struct kmem_cache_cpu *c)
> +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
>  {
>  	void *p;
Feng Tang Sept. 1, 2022, 5:04 a.m. UTC | #2
On Wed, Aug 31, 2022 at 10:52:15PM +0800, Hyeonggon Yoo wrote:
> On Mon, Aug 29, 2022 at 03:56:15PM +0800, Feng Tang wrote:
> > kmalloc's API family is critical for mm, with one nature that it will
> > round up the request size to a fixed one (mostly power of 2). Say
> > when user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes
> > could be allocated, so in worst case, there is around 50% memory
> > space waste.
> > 
> 
> [...]
> 
> >  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > -			  unsigned long addr, struct kmem_cache_cpu *c)
> > +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
> >  {
> >  	void *freelist;
> >  	struct slab *slab;
> > @@ -3115,6 +3158,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >  
> >  		if (s->flags & SLAB_STORE_USER)
> >  			set_track(s, freelist, TRACK_ALLOC, addr);
> > +		set_orig_size(s, freelist, orig_size);
> >  
> >  		return freelist;
> >  	}
> > @@ -3140,6 +3184,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >  		 */
> >  		if (s->flags & SLAB_STORE_USER)
> >  			set_track(s, freelist, TRACK_ALLOC, addr);
> > +		set_orig_size(s, freelist, orig_size);
> > +
> >  		return freelist;
> >  	}
> 
> Maybe we can move set_track() and set_orig_size() to after slab_post_alloc_hook().
> something like alloc/free hooks for debugging caches? (and drop orig_size parameter.) 

Yep, we discussed this during v3 review
https://lore.kernel.org/lkml/442d2b9c-9f07-8954-b90e-b4a9f8b64303@intel.com/

Will revisit this considering recent refactoring and the following
kmalloc data redzone patches.

Thanks,
Feng

> Thanks!
Hyeonggon Yoo Sept. 1, 2022, 11:14 a.m. UTC | #3
On Thu, Sep 01, 2022 at 01:04:58PM +0800, Feng Tang wrote:
> On Wed, Aug 31, 2022 at 10:52:15PM +0800, Hyeonggon Yoo wrote:
> > On Mon, Aug 29, 2022 at 03:56:15PM +0800, Feng Tang wrote:
> > > kmalloc's API family is critical for mm, with one nature that it will
> > > round up the request size to a fixed one (mostly power of 2). Say
> > > when user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes
> > > could be allocated, so in worst case, there is around 50% memory
> > > space waste.
> > > 
> > 
> > [...]
> > 
> > >  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > > -			  unsigned long addr, struct kmem_cache_cpu *c)
> > > +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
> > >  {
> > >  	void *freelist;
> > >  	struct slab *slab;
> > > @@ -3115,6 +3158,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > >  
> > >  		if (s->flags & SLAB_STORE_USER)
> > >  			set_track(s, freelist, TRACK_ALLOC, addr);
> > > +		set_orig_size(s, freelist, orig_size);
> > >  
> > >  		return freelist;
> > >  	}
> > > @@ -3140,6 +3184,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > >  		 */
> > >  		if (s->flags & SLAB_STORE_USER)
> > >  			set_track(s, freelist, TRACK_ALLOC, addr);
> > > +		set_orig_size(s, freelist, orig_size);
> > > +
> > >  		return freelist;
> > >  	}
> > 
> > Maybe we can move set_track() and set_orig_size() to after slab_post_alloc_hook().
> > something like alloc/free hooks for debugging caches? (and drop orig_size parameter.) 
> 
> Yep, we discussed this during v3 review
> https://lore.kernel.org/lkml/442d2b9c-9f07-8954-b90e-b4a9f8b64303@intel.com/

Ah, I missed that :) Thanks!

Considering the added cost (should be low) and races with validation,
I think this approach will cost more than it gets. Sorry for the noise.

p.s. I think I can review this series in a few days.
Thanks for your efforts!

> Will revisit this considering recent refactoring and the following
> kmalloc data redzone patches.

> Thanks,
> Feng
> 
> > Thanks!
Hyeonggon Yoo Sept. 1, 2022, 2:01 p.m. UTC | #4
On Mon, Aug 29, 2022 at 03:56:15PM +0800, Feng Tang wrote:
> kmalloc's API family is critical for mm, with one nature that it will
> round up the request size to a fixed one (mostly power of 2). Say
> when user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes
> could be allocated, so in worst case, there is around 50% memory
> space waste.
> 
> The wastage is not a big issue for requests that get allocated/freed
> quickly, but may cause problems with objects that have longer life
> time.
> 
> We've met a kernel boot OOM panic (v5.10), and from the dumped slab
> info:
> 
>     [   26.062145] kmalloc-2k            814056KB     814056KB
> 
> From debug we found there are huge number of 'struct iova_magazine',
> whose size is 1032 bytes (1024 + 8), so each allocation will waste
> 1016 bytes. Though the issue was solved by giving the right (bigger)
> size of RAM, it is still nice to optimize the size (either use a
> kmalloc friendly size or create a dedicated slab for it).
> 
> And from lkml archive, there was another crash kernel OOM case [1]
> back in 2019, which seems to be related with the similar slab waste
> situation, as the log is similar:
> 
>     [    4.332648] iommu: Adding device 0000:20:02.0 to group 16
>     [    4.338946] swapper/0 invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=0, oom_score_adj=0
>     ...
>     [    4.857565] kmalloc-2048           59164KB      59164KB
> 
> The crash kernel only has 256M memory, and 59M is pretty big here.
> (Note: the related code has been changed and optimised in recent
> kernel [2], these logs are just picked to demo the problem, also
> a patch changing its size to 1024 bytes has been merged)
> 
> So add an way to track each kmalloc's memory waste info, and
> leverage the existing SLUB debug framework (specifically
> SLUB_STORE_USER) to show its call stack of original allocation,
> so that user can evaluate the waste situation, identify some hot
> spots and optimize accordingly, for a better utilization of memory.
> 
> The waste info is integrated into existing interface:
> '/sys/kernel/debug/slab/kmalloc-xx/alloc_traces', one example of
> 'kmalloc-4k' after boot is:
> 
> 126 ixgbe_alloc_q_vector+0xa5/0x4a0 [ixgbe] waste=233856/1856 age=1493302/1493830/1494358 pid=1284 cpus=32 nodes=1
>         __slab_alloc.isra.86+0x52/0x80
>         __kmalloc_node+0x143/0x350
>         ixgbe_alloc_q_vector+0xa5/0x4a0 [ixgbe]
>         ixgbe_init_interrupt_scheme+0x1a6/0x730 [ixgbe]
>         ixgbe_probe+0xc8e/0x10d0 [ixgbe]
>         local_pci_probe+0x42/0x80
>         work_for_cpu_fn+0x13/0x20
>         process_one_work+0x1c5/0x390
> 
> which means in 'kmalloc-4k' slab, there are 126 requests of
> 2240 bytes which got a 4KB space (wasting 1856 bytes each
> and 233856 bytes in total). And when system starts some real
> workload like multiple docker instances, there are more
> severe waste.
> 
> [1]. https://lkml.org/lkml/2019/8/12/266
> [2]. https://lore.kernel.org/lkml/2920df89-9975-5785-f79b-257d3052dfaf@huawei.com/
> 
> [Thanks Hyeonggon for pointing out several bugs about sorting/format]
> [Thanks Vlastimil for suggesting way to reduce memory usage of
>  orig_size and keep it only for kmalloc objects]
> 
> Signed-off-by: Feng Tang <feng.tang@intel.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: John Garry <john.garry@huawei.com>
> Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
> ---
>  include/linux/slab.h |  2 +
>  mm/slub.c            | 94 +++++++++++++++++++++++++++++++++++++-------
>  2 files changed, 81 insertions(+), 15 deletions(-)


Would you update Documentation/mm/slub.rst as well?
(alloc_traces part)

[...]

>   */
>  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> -			  unsigned long addr, struct kmem_cache_cpu *c)
> +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
>  {
>  	void *freelist;
>  	struct slab *slab;
> @@ -3115,6 +3158,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  
>  		if (s->flags & SLAB_STORE_USER)
>  			set_track(s, freelist, TRACK_ALLOC, addr);
> +		set_orig_size(s, freelist, orig_size);
>  
>  		return freelist;
>  	}
> @@ -3140,6 +3184,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		 */
>  		if (s->flags & SLAB_STORE_USER)
>  			set_track(s, freelist, TRACK_ALLOC, addr);
> +		set_orig_size(s, freelist, orig_size);
> +
>  		return freelist;
>  	}


This patch is okay but with patch 4, init_object() initializes redzone/poison area
using s->object_size, and init_kmalloc_object() fixes redzone/poison area using orig_size.
Why not do it in init_object() in the first place?

Also, updating redzone/poison area after alloc_single_from_new_slab()
(outside list_lock, after adding slab to list) will introduce races with validation.

So I think doing set_orig_size()/init_kmalloc_object() in alloc_debug_processing() would make more sense.

I may be missing something; please kindly let me know if I am ;)

Anything else looks good to me.
Thanks!
Feng Tang Sept. 2, 2022, 6:15 a.m. UTC | #5
On Thu, Sep 01, 2022 at 10:01:13PM +0800, Hyeonggon Yoo wrote:
> On Mon, Aug 29, 2022 at 03:56:15PM +0800, Feng Tang wrote:
> > kmalloc's API family is critical for mm, with one nature that it will
> > round up the request size to a fixed one (mostly power of 2). Say
> > when user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes
> > could be allocated, so in worst case, there is around 50% memory
> > space waste.
> > 
> > The wastage is not a big issue for requests that get allocated/freed
> > quickly, but may cause problems with objects that have longer life
> > time.
> > 
> > We've met a kernel boot OOM panic (v5.10), and from the dumped slab
> > info:
> > 
> >     [   26.062145] kmalloc-2k            814056KB     814056KB
> > 
> > From debug we found there are huge number of 'struct iova_magazine',
> > whose size is 1032 bytes (1024 + 8), so each allocation will waste
> > 1016 bytes. Though the issue was solved by giving the right (bigger)
> > size of RAM, it is still nice to optimize the size (either use a
> > kmalloc friendly size or create a dedicated slab for it).
> > 
> > And from lkml archive, there was another crash kernel OOM case [1]
> > back in 2019, which seems to be related with the similar slab waste
> > situation, as the log is similar:
> > 
> >     [    4.332648] iommu: Adding device 0000:20:02.0 to group 16
> >     [    4.338946] swapper/0 invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=0, oom_score_adj=0
> >     ...
> >     [    4.857565] kmalloc-2048           59164KB      59164KB
> > 
> > The crash kernel only has 256M memory, and 59M is pretty big here.
> > (Note: the related code has been changed and optimised in recent
> > kernel [2], these logs are just picked to demo the problem, also
> > a patch changing its size to 1024 bytes has been merged)
> > 
> > So add an way to track each kmalloc's memory waste info, and
> > leverage the existing SLUB debug framework (specifically
> > SLUB_STORE_USER) to show its call stack of original allocation,
> > so that user can evaluate the waste situation, identify some hot
> > spots and optimize accordingly, for a better utilization of memory.
> > 
> > The waste info is integrated into existing interface:
> > '/sys/kernel/debug/slab/kmalloc-xx/alloc_traces', one example of
> > 'kmalloc-4k' after boot is:
> > 
> > 126 ixgbe_alloc_q_vector+0xa5/0x4a0 [ixgbe] waste=233856/1856 age=1493302/1493830/1494358 pid=1284 cpus=32 nodes=1
> >         __slab_alloc.isra.86+0x52/0x80
> >         __kmalloc_node+0x143/0x350
> >         ixgbe_alloc_q_vector+0xa5/0x4a0 [ixgbe]
> >         ixgbe_init_interrupt_scheme+0x1a6/0x730 [ixgbe]
> >         ixgbe_probe+0xc8e/0x10d0 [ixgbe]
> >         local_pci_probe+0x42/0x80
> >         work_for_cpu_fn+0x13/0x20
> >         process_one_work+0x1c5/0x390
> > 
> > which means in 'kmalloc-4k' slab, there are 126 requests of
> > 2240 bytes which got a 4KB space (wasting 1856 bytes each
> > and 233856 bytes in total). And when system starts some real
> > workload like multiple docker instances, there are more
> > severe waste.
> > 
> > [1]. https://lkml.org/lkml/2019/8/12/266
> > [2]. https://lore.kernel.org/lkml/2920df89-9975-5785-f79b-257d3052dfaf@huawei.com/
> > 
> > [Thanks Hyeonggon for pointing out several bugs about sorting/format]
> > [Thanks Vlastimil for suggesting way to reduce memory usage of
> >  orig_size and keep it only for kmalloc objects]
> > 
> > Signed-off-by: Feng Tang <feng.tang@intel.com>
> > Cc: Robin Murphy <robin.murphy@arm.com>
> > Cc: John Garry <john.garry@huawei.com>
> > Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
> > ---
> >  include/linux/slab.h |  2 +
> >  mm/slub.c            | 94 +++++++++++++++++++++++++++++++++++++-------
> >  2 files changed, 81 insertions(+), 15 deletions(-)
> 
> 
> Would you update Documentation/mm/slub.rst as well?
> (alloc_traces part)
 
Sure, will do.

> [...]
> 
> >   */
> >  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > -			  unsigned long addr, struct kmem_cache_cpu *c)
> > +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
> >  {
> >  	void *freelist;
> >  	struct slab *slab;
> > @@ -3115,6 +3158,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >  
> >  		if (s->flags & SLAB_STORE_USER)
> >  			set_track(s, freelist, TRACK_ALLOC, addr);
> > +		set_orig_size(s, freelist, orig_size);
> >  
> >  		return freelist;
> >  	}
> > @@ -3140,6 +3184,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >  		 */
> >  		if (s->flags & SLAB_STORE_USER)
> >  			set_track(s, freelist, TRACK_ALLOC, addr);
> > +		set_orig_size(s, freelist, orig_size);
> > +
> >  		return freelist;
> >  	}
> 
> 
> This patch is okay but with patch 4, init_object() initializes redzone/poison area
> using s->object_size, and init_kmalloc_object() fixes redzone/poison area using orig_size.
> Why not do it in init_object() in the first time?
> 
> Also, updating redzone/poison area after alloc_single_from_new_slab()
> (outside list_lock, after adding slab to list) will introduce races with validation.
> 
> So I think doing set_orig_size()/init_kmalloc_object() in alloc_debug_processing() would make more sense.

Yes, this makes sense, and in v3, kmalloc redzone/poison setup was
done in alloc_debug_processing() (through init_object()). When
rebasing to v4, I met the classical problem: how to pass 'orig_size'
parameter :)

In latest 'for-next' branch, one call path for alloc_debug_processing()
is
  ___slab_alloc
    get_partial
      get_any_partial
        get_partial_node
          alloc_debug_processing

Adding an 'orig_size' parameter to all these functions looks horrible, and
I couldn't figure out a good way, so I chose to put those ops after
'set_track()'

Thanks,
Feng

> I can miss something, please kindly let me know if I did ;)
> 
> Anything else looks good to me.
> Thanks!
> 
> -- 
> Thanks,
> Hyeonggon
Hyeonggon Yoo Sept. 4, 2022, 9:03 a.m. UTC | #6
On Fri, Sep 02, 2022 at 02:15:45PM +0800, Feng Tang wrote:
> On Thu, Sep 01, 2022 at 10:01:13PM +0800, Hyeonggon Yoo wrote:
> > On Mon, Aug 29, 2022 at 03:56:15PM +0800, Feng Tang wrote:
> > > kmalloc's API family is critical for mm, with one nature that it will
> > > round up the request size to a fixed one (mostly power of 2). Say
> > > when user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes
> > > could be allocated, so in worst case, there is around 50% memory
> > > space waste.
> > > 

[...]

> > > 
> > > Signed-off-by: Feng Tang <feng.tang@intel.com>
> > > Cc: Robin Murphy <robin.murphy@arm.com>
> > > Cc: John Garry <john.garry@huawei.com>
> > > Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
> > > ---
> > >  include/linux/slab.h |  2 +
> > >  mm/slub.c            | 94 +++++++++++++++++++++++++++++++++++++-------
> > >  2 files changed, 81 insertions(+), 15 deletions(-)
> > 
> > 
> > Would you update Documentation/mm/slub.rst as well?
> > (alloc_traces part)
>  
> Sure, will do.
> 
> > [...]
> > 
> > >   */
> > >  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > > -			  unsigned long addr, struct kmem_cache_cpu *c)
> > > +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
> > >  {
> > >  	void *freelist;
> > >  	struct slab *slab;
> > > @@ -3115,6 +3158,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > >  
> > >  		if (s->flags & SLAB_STORE_USER)
> > >  			set_track(s, freelist, TRACK_ALLOC, addr);
> > > +		set_orig_size(s, freelist, orig_size);
> > >  
> > >  		return freelist;
> > >  	}
> > > @@ -3140,6 +3184,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > >  		 */
> > >  		if (s->flags & SLAB_STORE_USER)
> > >  			set_track(s, freelist, TRACK_ALLOC, addr);
> > > +		set_orig_size(s, freelist, orig_size);
> > > +
> > >  		return freelist;
> > >  	}
> > 
> > 
> > This patch is okay but with patch 4, init_object() initializes redzone/poison area
> > using s->object_size, and init_kmalloc_object() fixes redzone/poison area using orig_size.
> > Why not do it in init_object() in the first time?
> > 
> > Also, updating redzone/poison area after alloc_single_from_new_slab()
> > (outside list_lock, after adding slab to list) will introduce races with validation.
> > 
> > So I think doing set_orig_size()/init_kmalloc_object() in alloc_debug_processing() would make more sense.
> 
> Yes, this makes sense, and in v3, kmalloc redzone/poison setup was
> done in alloc_debug_processing() (through init_object()). When
> rebasing to v4, I met the classical problem: how to pass 'orig_size'
> parameter :)
> 
> In latest 'for-next' branch, one call path for alloc_debug_processing()
> is
>   ___slab_alloc
>     get_partial
>       get_any_partial
>         get_partial_node
>           alloc_debug_processing
> 
> Adding 'orig_size' paramter to all these function looks horrible, and
> I couldn't figure out a good way and chosed to put those ops after
> 'set_track()'

IMO adding a parameter to them isn't too horrible...
I don't see a better solution than adding a parameter with the current implementation.
(Yeah, the code is quite complicated...)

It won't affect performance to a meaningful degree as most
allocations will be served from the cpu slab or percpu partial list.
Feng Tang Sept. 4, 2022, 9:42 a.m. UTC | #7
On Sun, Sep 04, 2022 at 05:03:34PM +0800, Hyeonggon Yoo wrote:
[...]
> > > 
> > > This patch is okay but with patch 4, init_object() initializes redzone/poison area
> > > using s->object_size, and init_kmalloc_object() fixes redzone/poison area using orig_size.
> > > Why not do it in init_object() in the first time?
> > > 
> > > Also, updating redzone/poison area after alloc_single_from_new_slab()
> > > (outside list_lock, after adding slab to list) will introduce races with validation.
> > > 
> > > So I think doing set_orig_size()/init_kmalloc_object() in alloc_debug_processing() would make more sense.
> > 
> > Yes, this makes sense, and in v3, kmalloc redzone/poison setup was
> > done in alloc_debug_processing() (through init_object()). When
> > rebasing to v4, I met the classical problem: how to pass 'orig_size'
> > parameter :)
> > 
> > In latest 'for-next' branch, one call path for alloc_debug_processing()
> > is
> >   ___slab_alloc
> >     get_partial
> >       get_any_partial
> >         get_partial_node
> >           alloc_debug_processing
> > 
> > Adding 'orig_size' paramter to all these function looks horrible, and
> > I couldn't figure out a good way and chosed to put those ops after
> > 'set_track()'
> 
> IMO adding a parameter to them isn't too horrible...
> I don't see better solution than adding a parameter with current implementation.
> (Yeah, the code is quite complicated...)
> 
> It won't affect performance to meaningful degree as most of
> allocations will be served from cpu slab or percpu partial list. 

Thanks for the suggestion! I'm fine with it and just afraid other
developers may dislike the extra parameter. 

The race condition you mentioned is a valid concern, and I have thought
about it. One way is to move set_orig_size() after the redzone/poison
setup, and in 'check_object()' we can detect whether 'orig_size' is
set, and skip that check if it's not set yet. As the manual validate_slab
triggered from the sysfs interface is a rare debug activity, I think skipping
one object shouldn't hurt much.

Thanks,
Feng

> -- 
> Thanks,
> Hyeonggon
>
Hyeonggon Yoo Sept. 4, 2022, 10:58 a.m. UTC | #8
On Sun, Sep 04, 2022 at 05:42:33PM +0800, Feng Tang wrote:
> On Sun, Sep 04, 2022 at 05:03:34PM +0800, Hyeonggon Yoo wrote:
> [...]
> > > > 
> > > > This patch is okay but with patch 4, init_object() initializes redzone/poison area
> > > > using s->object_size, and init_kmalloc_object() fixes redzone/poison area using orig_size.
> > > > Why not do it in init_object() in the first time?
> > > > 
> > > > Also, updating redzone/poison area after alloc_single_from_new_slab()
> > > > (outside list_lock, after adding slab to list) will introduce races with validation.
> > > > 
> > > > So I think doing set_orig_size()/init_kmalloc_object() in alloc_debug_processing() would make more sense.
> > > 
> > > Yes, this makes sense, and in v3, kmalloc redzone/poison setup was
> > > done in alloc_debug_processing() (through init_object()). When
> > > rebasing to v4, I met the classical problem: how to pass 'orig_size'
> > > parameter :)
> > > 
> > > In latest 'for-next' branch, one call path for alloc_debug_processing()
> > > is
> > >   ___slab_alloc
> > >     get_partial
> > >       get_any_partial
> > >         get_partial_node
> > >           alloc_debug_processing
> > > 
> > > Adding 'orig_size' paramter to all these function looks horrible, and
> > > I couldn't figure out a good way and chosed to put those ops after
> > > 'set_track()'
> > 
> > IMO adding a parameter to them isn't too horrible...
> > I don't see better solution than adding a parameter with current implementation.
> > (Yeah, the code is quite complicated...)
> > 
> > It won't affect performance to meaningful degree as most of
> > allocations will be served from cpu slab or percpu partial list. 
> 
> Thanks for the suggestion! I'm fine with it and just afraid other
> developers may dislike the extra parameter. 
> 
> The race condition you mentioned is a valid concern, and I have thought
> about it, one way is moving the set_orig_size() after the redzone/poision
> setup, and in 'check_object()' we can detect whether the 'orig_size' is
> set, and skip that check if it's not set yet. As the manual validate_slab
> triggered from sysfs interface is a rare debug activity, I think skipping
> one object shouldn't hurt much.

That will require an smp_wmb()/smp_rmb() pair to make sure that the
effects of set_orig_size() are visible after the redzone/poison setup.

Isn't it simpler to add a parameter?

> 
> Thanks,
> Feng
> 
> > -- 
> > Thanks,
> > Hyeonggon
> >
Feng Tang Sept. 5, 2022, 2:55 a.m. UTC | #9
On Sun, Sep 04, 2022 at 06:58:49PM +0800, Hyeonggon Yoo wrote:
> On Sun, Sep 04, 2022 at 05:42:33PM +0800, Feng Tang wrote:
> > On Sun, Sep 04, 2022 at 05:03:34PM +0800, Hyeonggon Yoo wrote:
> > [...]
> > > > > 
> > > > > This patch is okay but with patch 4, init_object() initializes redzone/poison area
> > > > > using s->object_size, and init_kmalloc_object() fixes redzone/poison area using orig_size.
> > > > > Why not do it in init_object() in the first time?
> > > > > 
> > > > > Also, updating redzone/poison area after alloc_single_from_new_slab()
> > > > > (outside list_lock, after adding slab to list) will introduce races with validation.
> > > > > 
> > > > > So I think doing set_orig_size()/init_kmalloc_object() in alloc_debug_processing() would make more sense.
> > > > 
> > > > Yes, this makes sense, and in v3, kmalloc redzone/poison setup was
> > > > done in alloc_debug_processing() (through init_object()). When
> > > > rebasing to v4, I met the classical problem: how to pass 'orig_size'
> > > > parameter :)
> > > > 
> > > > In latest 'for-next' branch, one call path for alloc_debug_processing()
> > > > is
> > > >   ___slab_alloc
> > > >     get_partial
> > > >       get_any_partial
> > > >         get_partial_node
> > > >           alloc_debug_processing
> > > > 
> > > > Adding 'orig_size' paramter to all these function looks horrible, and
> > > > I couldn't figure out a good way and chosed to put those ops after
> > > > 'set_track()'
> > > 
> > > IMO adding a parameter to them isn't too horrible...
> > > I don't see better solution than adding a parameter with current implementation.
> > > (Yeah, the code is quite complicated...)
> > > 
> > > It won't affect performance to meaningful degree as most of
> > > allocations will be served from cpu slab or percpu partial list. 
> > 
> > Thanks for the suggestion! I'm fine with it and just afraid other
> > developers may dislike the extra parameter. 
> > 
> > The race condition you mentioned is a valid concern, and I have thought
> > about it, one way is moving the set_orig_size() after the redzone/poision
> > setup, and in 'check_object()' we can detect whether the 'orig_size' is
> > set, and skip that check if it's not set yet. As the manual validate_slab
> > triggered from sysfs interface is a rare debug activity, I think skipping
> > one object shouldn't hurt much.
> 
> That will require smp_wmb()/smp_rmb() pair to make sure that
> effects of set_orig_size() to be visible after redzone/poison setup.

Yes, synchronization is needed here.

> Isn't it simpler to add a parameter?

OK, I can go this way in v5 if other developers are fine. thanks

- Feng
Vlastimil Babka Sept. 5, 2022, 6:29 a.m. UTC | #10
On 9/5/22 04:55, Feng Tang wrote:
> On Sun, Sep 04, 2022 at 06:58:49PM +0800, Hyeonggon Yoo wrote:
>> On Sun, Sep 04, 2022 at 05:42:33PM +0800, Feng Tang wrote:
>> > On Sun, Sep 04, 2022 at 05:03:34PM +0800, Hyeonggon Yoo wrote:
>> > [...]
>> > > > > 
>> > > > > This patch is okay but with patch 4, init_object() initializes redzone/poison area
>> > > > > using s->object_size, and init_kmalloc_object() fixes redzone/poison area using orig_size.
>> > > > > Why not do it in init_object() in the first time?
>> > > > > 
>> > > > > Also, updating redzone/poison area after alloc_single_from_new_slab()
>> > > > > (outside list_lock, after adding slab to list) will introduce races with validation.
>> > > > > 
>> > > > > So I think doing set_orig_size()/init_kmalloc_object() in alloc_debug_processing() would make more sense.
>> > > > 
>> > > > Yes, this makes sense, and in v3, kmalloc redzone/poison setup was
>> > > > done in alloc_debug_processing() (through init_object()). When
>> > > > rebasing to v4, I met the classical problem: how to pass 'orig_size'
>> > > > parameter :)
>> > > > 
>> > > > In latest 'for-next' branch, one call path for alloc_debug_processing()
>> > > > is
>> > > >   ___slab_alloc
>> > > >     get_partial
>> > > >       get_any_partial
>> > > >         get_partial_node
>> > > >           alloc_debug_processing
>> > > > 
>> > > > Adding an 'orig_size' parameter to all these functions looks horrible, and
>> > > > I couldn't figure out a good way, so I chose to put those ops after
>> > > > 'set_track()'
>> > > 
>> > > IMO adding a parameter to them isn't too horrible...
>> > > I don't see better solution than adding a parameter with current implementation.
>> > > (Yeah, the code is quite complicated...)
>> > > 
>> > > It won't affect performance to meaningful degree as most of
>> > > allocations will be served from cpu slab or percpu partial list. 
>> > 
>> > Thanks for the suggestion! I'm fine with it and just afraid other
>> > developers may dislike the extra parameter. 
>> > 
>> > The race condition you mentioned is a valid concern, and I have thought
>> > about it, one way is moving the set_orig_size() after the redzone/poison
>> > setup, and in 'check_object()' we can detect whether the 'orig_size' is
>> > set, and skip that check if it's not set yet. As the manual validate_slab
>> > triggered from sysfs interface is a rare debug activity, I think skipping
>> > one object shouldn't hurt much.
>> 
>> That will require smp_wmb()/smp_rmb() pair to make sure that
>> effects of set_orig_size() to be visible after redzone/poison setup.
> 
> Yes, synchronization is needed here.
> 
>> Isn't it simpler to add a parameter?
> 
> OK, I can go this way in v5 if other developers are fine. thanks

How about get_partial() instantiates an on-stack structure that contains
gfpflags, ret_slab, orig_size and passes pointer to that to all the nested
functions.

Would be similar to "struct alloc_context" in page allocation.
Something like "struct partial_context pc"?

> - Feng
Feng Tang Sept. 5, 2022, 7:06 a.m. UTC | #11
On Mon, Sep 05, 2022 at 02:29:51PM +0800, Vlastimil Babka wrote:
> On 9/5/22 04:55, Feng Tang wrote:
> > On Sun, Sep 04, 2022 at 06:58:49PM +0800, Hyeonggon Yoo wrote:
> >> On Sun, Sep 04, 2022 at 05:42:33PM +0800, Feng Tang wrote:
> >> > On Sun, Sep 04, 2022 at 05:03:34PM +0800, Hyeonggon Yoo wrote:
> >> > [...]
> >> > > > > 
> >> > > > > This patch is okay but with patch 4, init_object() initializes redzone/poison area
> >> > > > > using s->object_size, and init_kmalloc_object() fixes redzone/poison area using orig_size.
> >> > > > > Why not do it in init_object() in the first time?
> >> > > > > 
> >> > > > > Also, updating redzone/poison area after alloc_single_from_new_slab()
> >> > > > > (outside list_lock, after adding slab to list) will introduce races with validation.
> >> > > > > 
> >> > > > > So I think doing set_orig_size()/init_kmalloc_object() in alloc_debug_processing() would make more sense.
> >> > > > 
> >> > > > Yes, this makes sense, and in v3, kmalloc redzone/poison setup was
> >> > > > done in alloc_debug_processing() (through init_object()). When
> >> > > > rebasing to v4, I met the classical problem: how to pass 'orig_size'
> >> > > > parameter :)
> >> > > > 
> >> > > > In latest 'for-next' branch, one call path for alloc_debug_processing()
> >> > > > is
> >> > > >   ___slab_alloc
> >> > > >     get_partial
> >> > > >       get_any_partial
> >> > > >         get_partial_node
> >> > > >           alloc_debug_processing
> >> > > > 
> >> > > > Adding an 'orig_size' parameter to all these functions looks horrible, and
> >> > > > I couldn't figure out a good way, so I chose to put those ops after
> >> > > > 'set_track()'
> >> > > 
> >> > > IMO adding a parameter to them isn't too horrible...
> >> > > I don't see better solution than adding a parameter with current implementation.
> >> > > (Yeah, the code is quite complicated...)
> >> > > 
> >> > > It won't affect performance to meaningful degree as most of
> >> > > allocations will be served from cpu slab or percpu partial list. 
> >> > 
> >> > Thanks for the suggestion! I'm fine with it and just afraid other
> >> > developers may dislike the extra parameter. 
> >> > 
> >> > The race condition you mentioned is a valid concern, and I have thought
> >> > about it, one way is moving the set_orig_size() after the redzone/poison
> >> > setup, and in 'check_object()' we can detect whether the 'orig_size' is
> >> > set, and skip that check if it's not set yet. As the manual validate_slab
> >> > triggered from sysfs interface is a rare debug activity, I think skipping
> >> > one object shouldn't hurt much.
> >> 
> >> That will require smp_wmb()/smp_rmb() pair to make sure that
> >> effects of set_orig_size() to be visible after redzone/poison setup.
> > 
> > Yes, synchronization is needed here.
> > 
> >> Isn't it simpler to add a parameter?
> > 
> > OK, I can go this way in v5 if other developers are fine. thanks
> 
> How about get_partial() instantiates an on-stack structure that contains
> gfpflags, ret_slab, orig_size and passes pointer to that to all the nested
> functions.
> 
> Would be similar to "struct alloc_context" in page allocation.
> Something like "struct partial_context pc"?

Yep! This would make the parameter passing much tidier. Will try
it this way.

A more aggressive option is to also embed the 'kmem_cache' parameter into
it, but this may make the code look ambiguous.

Thanks,
Feng
Vlastimil Babka Sept. 5, 2022, 7:33 a.m. UTC | #12
On 9/5/22 09:06, Feng Tang wrote:
> On Mon, Sep 05, 2022 at 02:29:51PM +0800, Vlastimil Babka wrote:
>> 
>> How about get_partial() instantiates an on-stack structure that contains
>> gfpflags, ret_slab, orig_size and passes pointer to that to all the nested
>> functions.
>> 
>> Would be similar to "struct alloc_context" in page allocation.
>> Something like "struct partial_context pc"?
> 
> Yep! This would make the parameters passing much tidier. Will try
> this way. 
> 
> More aggressively is to also embed the 'kmem_cache' parameter into
> it, but this may make the code look ambiguous.

That one is used a lot everywhere, so it would be tedious to dereference it
from a struct, and also might be a bit better code if it's in a register.

> Thanks,
> Feng
> 
>
Feng Tang Sept. 5, 2022, 8:37 a.m. UTC | #13
On Mon, Sep 05, 2022 at 03:33:14PM +0800, Vlastimil Babka wrote:
> On 9/5/22 09:06, Feng Tang wrote:
> > On Mon, Sep 05, 2022 at 02:29:51PM +0800, Vlastimil Babka wrote:
> >> 
> >> How about get_partial() instantiates an on-stack structure that contains
> >> gfpflags, ret_slab, orig_size and passes pointer to that to all the nested
> >> functions.
> >> 
> >> Would be similar to "struct alloc_context" in page allocation.
> >> Something like "struct partial_context pc"?
> > 
> > Yep! This would make the parameters passing much tidier. Will try
> > this way. 
> > 
> > More aggressively is to also embed the 'kmem_cache' parameter into
> > it, but this may make the code look ambiguous.
> 
> That one is used a lot everywhere, so it would be tedious to dereference it
> from a struct, and also might be a bit better code if it's in a register.

Got it. Following is the incremental patch for 1/4, which uses the
'partial_context' to pass parameters. And actually the 4/4 patch will
benefit more from this refactoring, as the object initialization no longer
needs to be split up and is no longer exposed to the race issue.

Thanks,
Feng

---
diff --git a/mm/slub.c b/mm/slub.c
index 82e7bd3a3966..7497fb6ca8e2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -194,6 +194,12 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
 #endif
 #endif		/* CONFIG_SLUB_DEBUG */
 
+struct partial_context {
+	struct slab **slab;
+	gfp_t flags;
+	int orig_size;
+};
+
 static inline bool kmem_cache_debug(struct kmem_cache *s)
 {
 	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
@@ -1333,7 +1339,7 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
 }
 
 static noinline int alloc_debug_processing(struct kmem_cache *s,
-					struct slab *slab, void *object)
+			struct slab *slab, void *object, int orig_size)
 {
 	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 		if (!alloc_consistency_checks(s, slab, object))
@@ -1342,6 +1348,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,
 
 	/* Success. Perform special debug activities for allocs */
 	trace(s, slab, object, 1);
+	set_orig_size(s, object, orig_size);
 	init_object(s, object, SLUB_RED_ACTIVE);
 	return 1;
 
@@ -1610,7 +1617,7 @@ static inline
 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
 
 static inline int alloc_debug_processing(struct kmem_cache *s,
-	struct slab *slab, void *object) { return 0; }
+	struct slab *slab, void *object, int orig_size) { return 0; }
 
 static inline void set_orig_size(struct kmem_cache *s,
 	void *object, unsigned int orig_size) {}
@@ -2042,7 +2049,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
  * it to full list if it was the last free object.
  */
 static void *alloc_single_from_partial(struct kmem_cache *s,
-		struct kmem_cache_node *n, struct slab *slab)
+		struct kmem_cache_node *n, struct slab *slab, int orig_size)
 {
 	void *object;
 
@@ -2052,7 +2059,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
 	slab->freelist = get_freepointer(s, object);
 	slab->inuse++;
 
-	if (!alloc_debug_processing(s, slab, object)) {
+	if (!alloc_debug_processing(s, slab, object, orig_size)) {
 		remove_partial(n, slab);
 		return NULL;
 	}
@@ -2071,7 +2078,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
  * and put the slab to the partial (or full) list.
  */
 static void *alloc_single_from_new_slab(struct kmem_cache *s,
-					struct slab *slab)
+					struct slab *slab, int orig_size)
 {
 	int nid = slab_nid(slab);
 	struct kmem_cache_node *n = get_node(s, nid);
@@ -2083,7 +2090,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s,
 	slab->freelist = get_freepointer(s, object);
 	slab->inuse = 1;
 
-	if (!alloc_debug_processing(s, slab, object))
+	if (!alloc_debug_processing(s, slab, object, orig_size))
 		/*
 		 * It's not really expected that this would fail on a
 		 * freshly allocated slab, but a concurrent memory
@@ -2161,7 +2168,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
  * Try to allocate a partial slab from a specific node.
  */
 static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
-			      struct slab **ret_slab, gfp_t gfpflags)
+			      struct partial_context *pc)
 {
 	struct slab *slab, *slab2;
 	void *object = NULL;
@@ -2181,11 +2188,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
 		void *t;
 
-		if (!pfmemalloc_match(slab, gfpflags))
+		if (!pfmemalloc_match(slab, pc->flags))
 			continue;
 
 		if (kmem_cache_debug(s)) {
-			object = alloc_single_from_partial(s, n, slab);
+			object = alloc_single_from_partial(s, n, slab, pc->orig_size);
 			if (object)
 				break;
 			continue;
@@ -2196,7 +2203,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 			break;
 
 		if (!object) {
-			*ret_slab = slab;
+			*pc->slab = slab;
 			stat(s, ALLOC_FROM_PARTIAL);
 			object = t;
 		} else {
@@ -2220,14 +2227,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 /*
  * Get a slab from somewhere. Search in increasing NUMA distances.
  */
-static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
-			     struct slab **ret_slab)
+static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
 {
 #ifdef CONFIG_NUMA
 	struct zonelist *zonelist;
 	struct zoneref *z;
 	struct zone *zone;
-	enum zone_type highest_zoneidx = gfp_zone(flags);
+	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
 	void *object;
 	unsigned int cpuset_mems_cookie;
 
@@ -2255,15 +2261,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
 
 	do {
 		cpuset_mems_cookie = read_mems_allowed_begin();
-		zonelist = node_zonelist(mempolicy_slab_node(), flags);
+		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
 		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
 			struct kmem_cache_node *n;
 
 			n = get_node(s, zone_to_nid(zone));
 
-			if (n && cpuset_zone_allowed(zone, flags) &&
+			if (n && cpuset_zone_allowed(zone, pc->flags) &&
 					n->nr_partial > s->min_partial) {
-				object = get_partial_node(s, n, ret_slab, flags);
+				object = get_partial_node(s, n, pc);
 				if (object) {
 					/*
 					 * Don't check read_mems_allowed_retry()
@@ -2284,8 +2290,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
 /*
  * Get a partial slab, lock it and return it.
  */
-static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
-			 struct slab **ret_slab)
+static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
 {
 	void *object;
 	int searchnode = node;
@@ -2293,11 +2298,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
 	if (node == NUMA_NO_NODE)
 		searchnode = numa_mem_id();
 
-	object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
+	object = get_partial_node(s, get_node(s, searchnode), pc);
 	if (object || node != NUMA_NO_NODE)
 		return object;
 
-	return get_any_partial(s, flags, ret_slab);
+	return get_any_partial(s, pc);
 }
 
 #ifdef CONFIG_PREEMPTION
@@ -3022,6 +3027,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	void *freelist;
 	struct slab *slab;
 	unsigned long flags;
+	struct partial_context pc;
 
 	stat(s, ALLOC_SLOWPATH);
 
@@ -3135,7 +3141,10 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 new_objects:
 
-	freelist = get_partial(s, gfpflags, node, &slab);
+	pc.flags = gfpflags;
+	pc.slab = &slab;
+	pc.orig_size = orig_size;
+	freelist = get_partial(s, node, &pc);
 	if (freelist)
 		goto check_new_slab;
 
@@ -3151,14 +3160,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	stat(s, ALLOC_SLAB);
 
 	if (kmem_cache_debug(s)) {
-		freelist = alloc_single_from_new_slab(s, slab);
+		freelist = alloc_single_from_new_slab(s, slab, orig_size);
 
 		if (unlikely(!freelist))
 			goto new_objects;
 
 		if (s->flags & SLAB_STORE_USER)
 			set_track(s, freelist, TRACK_ALLOC, addr);
-		set_orig_size(s, freelist, orig_size);
 
 		return freelist;
 	}
@@ -3184,7 +3192,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		 */
 		if (s->flags & SLAB_STORE_USER)
 			set_track(s, freelist, TRACK_ALLOC, addr);
-		set_orig_size(s, freelist, orig_size);
 
 		return freelist;
 	}
Hyeonggon Yoo Sept. 6, 2022, 1:39 p.m. UTC | #14
On Mon, Sep 05, 2022 at 04:37:05PM +0800, Feng Tang wrote:
> On Mon, Sep 05, 2022 at 03:33:14PM +0800, Vlastimil Babka wrote:
> > On 9/5/22 09:06, Feng Tang wrote:
> > > On Mon, Sep 05, 2022 at 02:29:51PM +0800, Vlastimil Babka wrote:
> > >> 
> > >> How about get_partial() instantiates an on-stack structure that contains
> > >> gfpflags, ret_slab, orig_size and passes pointer to that to all the nested
> > >> functions.
> > >> 
> > >> Would be similar to "struct alloc_context" in page allocation.
> > >> Something like "struct partial_context pc"?
> > > 
> > > Yep! This would make the parameters passing much tidier. Will try
> > > this way. 
> > > 
> > > More aggressively is to also embed the 'kmem_cache' parameter into
> > > it, but this may make the code look ambiguous.
> > 
> > That one is used a lot everywhere, so it would be tedious to dereference it
> > from a struct, and also might be a bit better code if it's in a register.
> 
> Got it. Following is the incremental patch for 1/4, which uses the
> 'partial_context' to pass parameters. And actually the 4/4 patch will
> benefit more from this refactoring, as the object initialization doesn't
> need to be separated and has race issue.
> 
> Thanks,
> Feng

Looks fine to me.
I will review it when the next version arrives :)

> ---
> diff --git a/mm/slub.c b/mm/slub.c
> index 82e7bd3a3966..7497fb6ca8e2 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -194,6 +194,12 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
>  #endif
>  #endif		/* CONFIG_SLUB_DEBUG */
>  
> +struct partial_context {
> +	struct slab **slab;
> +	gfp_t flags;
> +	int orig_size;
> +};
> +
>  static inline bool kmem_cache_debug(struct kmem_cache *s)
>  {
>  	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
> @@ -1333,7 +1339,7 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
>  }
>  
>  static noinline int alloc_debug_processing(struct kmem_cache *s,
> -					struct slab *slab, void *object)
> +			struct slab *slab, void *object, int orig_size)
>  {
>  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
>  		if (!alloc_consistency_checks(s, slab, object))
> @@ -1342,6 +1348,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,
>  
>  	/* Success. Perform special debug activities for allocs */
>  	trace(s, slab, object, 1);
> +	set_orig_size(s, object, orig_size);
>  	init_object(s, object, SLUB_RED_ACTIVE);
>  	return 1;
>  
> @@ -1610,7 +1617,7 @@ static inline
>  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
>  
>  static inline int alloc_debug_processing(struct kmem_cache *s,
> -	struct slab *slab, void *object) { return 0; }
> +	struct slab *slab, void *object, int orig_size) { return 0; }
>  
>  static inline void set_orig_size(struct kmem_cache *s,
>  	void *object, unsigned int orig_size) {}
> @@ -2042,7 +2049,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
>   * it to full list if it was the last free object.
>   */
>  static void *alloc_single_from_partial(struct kmem_cache *s,
> -		struct kmem_cache_node *n, struct slab *slab)
> +		struct kmem_cache_node *n, struct slab *slab, int orig_size)
>  {
>  	void *object;
>  
> @@ -2052,7 +2059,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
>  	slab->freelist = get_freepointer(s, object);
>  	slab->inuse++;
>  
> -	if (!alloc_debug_processing(s, slab, object)) {
> +	if (!alloc_debug_processing(s, slab, object, orig_size)) {
>  		remove_partial(n, slab);
>  		return NULL;
>  	}
> @@ -2071,7 +2078,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
>   * and put the slab to the partial (or full) list.
>   */
>  static void *alloc_single_from_new_slab(struct kmem_cache *s,
> -					struct slab *slab)
> +					struct slab *slab, int orig_size)
>  {
>  	int nid = slab_nid(slab);
>  	struct kmem_cache_node *n = get_node(s, nid);
> @@ -2083,7 +2090,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s,
>  	slab->freelist = get_freepointer(s, object);
>  	slab->inuse = 1;
>  
> -	if (!alloc_debug_processing(s, slab, object))
> +	if (!alloc_debug_processing(s, slab, object, orig_size))
>  		/*
>  		 * It's not really expected that this would fail on a
>  		 * freshly allocated slab, but a concurrent memory
> @@ -2161,7 +2168,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
>   * Try to allocate a partial slab from a specific node.
>   */
>  static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
> -			      struct slab **ret_slab, gfp_t gfpflags)
> +			      struct partial_context *pc)
>  {
>  	struct slab *slab, *slab2;
>  	void *object = NULL;
> @@ -2181,11 +2188,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
>  	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
>  		void *t;
>  
> -		if (!pfmemalloc_match(slab, gfpflags))
> +		if (!pfmemalloc_match(slab, pc->flags))
>  			continue;
>  
>  		if (kmem_cache_debug(s)) {
> -			object = alloc_single_from_partial(s, n, slab);
> +			object = alloc_single_from_partial(s, n, slab, pc->orig_size);
>  			if (object)
>  				break;
>  			continue;
> @@ -2196,7 +2203,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
>  			break;
>  
>  		if (!object) {
> -			*ret_slab = slab;
> +			*pc->slab = slab;
>  			stat(s, ALLOC_FROM_PARTIAL);
>  			object = t;
>  		} else {
> @@ -2220,14 +2227,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
>  /*
>   * Get a slab from somewhere. Search in increasing NUMA distances.
>   */
> -static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
> -			     struct slab **ret_slab)
> +static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
>  {
>  #ifdef CONFIG_NUMA
>  	struct zonelist *zonelist;
>  	struct zoneref *z;
>  	struct zone *zone;
> -	enum zone_type highest_zoneidx = gfp_zone(flags);
> +	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
>  	void *object;
>  	unsigned int cpuset_mems_cookie;
>  
> @@ -2255,15 +2261,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
>  
>  	do {
>  		cpuset_mems_cookie = read_mems_allowed_begin();
> -		zonelist = node_zonelist(mempolicy_slab_node(), flags);
> +		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
>  		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
>  			struct kmem_cache_node *n;
>  
>  			n = get_node(s, zone_to_nid(zone));
>  
> -			if (n && cpuset_zone_allowed(zone, flags) &&
> +			if (n && cpuset_zone_allowed(zone, pc->flags) &&
>  					n->nr_partial > s->min_partial) {
> -				object = get_partial_node(s, n, ret_slab, flags);
> +				object = get_partial_node(s, n, pc);
>  				if (object) {
>  					/*
>  					 * Don't check read_mems_allowed_retry()
> @@ -2284,8 +2290,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
>  /*
>   * Get a partial slab, lock it and return it.
>   */
> -static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
> -			 struct slab **ret_slab)
> +static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
>  {
>  	void *object;
>  	int searchnode = node;
> @@ -2293,11 +2298,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
>  	if (node == NUMA_NO_NODE)
>  		searchnode = numa_mem_id();
>  
> -	object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
> +	object = get_partial_node(s, get_node(s, searchnode), pc);
>  	if (object || node != NUMA_NO_NODE)
>  		return object;
>  
> -	return get_any_partial(s, flags, ret_slab);
> +	return get_any_partial(s, pc);
>  }
>  
>  #ifdef CONFIG_PREEMPTION
> @@ -3022,6 +3027,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  	void *freelist;
>  	struct slab *slab;
>  	unsigned long flags;
> +	struct partial_context pc;
>  
>  	stat(s, ALLOC_SLOWPATH);
>  
> @@ -3135,7 +3141,10 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  
>  new_objects:
>  
> -	freelist = get_partial(s, gfpflags, node, &slab);
> +	pc.flags = gfpflags;
> +	pc.slab = &slab;
> +	pc.orig_size = orig_size;
> +	freelist = get_partial(s, node, &pc);
>  	if (freelist)
>  		goto check_new_slab;
>  
> @@ -3151,14 +3160,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  	stat(s, ALLOC_SLAB);
>  
>  	if (kmem_cache_debug(s)) {
> -		freelist = alloc_single_from_new_slab(s, slab);
> +		freelist = alloc_single_from_new_slab(s, slab, orig_size);
>  
>  		if (unlikely(!freelist))
>  			goto new_objects;
>  
>  		if (s->flags & SLAB_STORE_USER)
>  			set_track(s, freelist, TRACK_ALLOC, addr);
> -		set_orig_size(s, freelist, orig_size);
>  
>  		return freelist;
>  	}
> @@ -3184,7 +3192,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		 */
>  		if (s->flags & SLAB_STORE_USER)
>  			set_track(s, freelist, TRACK_ALLOC, addr);
> -		set_orig_size(s, freelist, orig_size);
>  
>  		return freelist;
>  	}
diff mbox series

Patch

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9b592e611cb1..6dc495f76644 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -29,6 +29,8 @@ 
 #define SLAB_RED_ZONE		((slab_flags_t __force)0x00000400U)
 /* DEBUG: Poison objects */
 #define SLAB_POISON		((slab_flags_t __force)0x00000800U)
+/* Indicate a kmalloc slab */
+#define SLAB_KMALLOC		((slab_flags_t __force)0x00001000U)
 /* Align objs on cache lines */
 #define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000U)
 /* Use GFP_DMA memory */
diff --git a/mm/slub.c b/mm/slub.c
index 5df44e00b1aa..d8bab650ed99 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -199,6 +199,12 @@  static inline bool kmem_cache_debug(struct kmem_cache *s)
 	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
 }
 
+static inline bool slub_debug_orig_size(struct kmem_cache *s)
+{
+	return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
+			(s->flags & SLAB_KMALLOC));
+}
+
 void *fixup_red_left(struct kmem_cache *s, void *p)
 {
 	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
@@ -785,6 +791,33 @@  static void print_slab_info(const struct slab *slab)
 	       folio_flags(folio, 0));
 }
 
+static inline void set_orig_size(struct kmem_cache *s,
+					void *object, unsigned int orig_size)
+{
+	void *p = kasan_reset_tag(object);
+
+	if (!slub_debug_orig_size(s))
+		return;
+
+	p += get_info_end(s);
+	p += sizeof(struct track) * 2;
+
+	*(unsigned int *)p = orig_size;
+}
+
+static unsigned int get_orig_size(struct kmem_cache *s, void *object)
+{
+	void *p = kasan_reset_tag(object);
+
+	if (!slub_debug_orig_size(s))
+		return s->object_size;
+
+	p += get_info_end(s);
+	p += sizeof(struct track) * 2;
+
+	return *(unsigned int *)p;
+}
+
 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 {
 	struct va_format vaf;
@@ -844,6 +877,9 @@  static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
 	if (s->flags & SLAB_STORE_USER)
 		off += 2 * sizeof(struct track);
 
+	if (slub_debug_orig_size(s))
+		off += sizeof(unsigned int);
+
 	off += kasan_metadata_size(s);
 
 	if (off != size_from_object(s))
@@ -995,10 +1031,14 @@  static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
 {
 	unsigned long off = get_info_end(s);	/* The end of info */
 
-	if (s->flags & SLAB_STORE_USER)
+	if (s->flags & SLAB_STORE_USER) {
 		/* We also have user information there */
 		off += 2 * sizeof(struct track);
 
+		if (s->flags & SLAB_KMALLOC)
+			off += sizeof(unsigned int);
+	}
+
 	off += kasan_metadata_size(s);
 
 	if (size_from_object(s) == off)
@@ -1572,6 +1612,9 @@  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
 static inline int alloc_debug_processing(struct kmem_cache *s,
 	struct slab *slab, void *object) { return 0; }
 
+static inline void set_orig_size(struct kmem_cache *s,
+	void *object, unsigned int orig_size) {}
+
 static inline void free_debug_processing(
 	struct kmem_cache *s, struct slab *slab,
 	void *head, void *tail, int bulk_cnt,
@@ -2974,7 +3017,7 @@  static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
  * already disabled (which is the case for bulk allocation).
  */
 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
-			  unsigned long addr, struct kmem_cache_cpu *c)
+			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
 {
 	void *freelist;
 	struct slab *slab;
@@ -3115,6 +3158,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 		if (s->flags & SLAB_STORE_USER)
 			set_track(s, freelist, TRACK_ALLOC, addr);
+		set_orig_size(s, freelist, orig_size);
 
 		return freelist;
 	}
@@ -3140,6 +3184,8 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		 */
 		if (s->flags & SLAB_STORE_USER)
 			set_track(s, freelist, TRACK_ALLOC, addr);
+		set_orig_size(s, freelist, orig_size);
+
 		return freelist;
 	}
 
@@ -3182,7 +3228,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  * pointer.
  */
 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
-			  unsigned long addr, struct kmem_cache_cpu *c)
+			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
 {
 	void *p;
 
@@ -3195,7 +3241,7 @@  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	c = slub_get_cpu_ptr(s->cpu_slab);
 #endif
 
-	p = ___slab_alloc(s, gfpflags, node, addr, c);
+	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
 #ifdef CONFIG_PREEMPT_COUNT
 	slub_put_cpu_ptr(s->cpu_slab);
 #endif
@@ -3280,7 +3326,7 @@  static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l
 
 	if (!USE_LOCKLESS_FAST_PATH() ||
 	    unlikely(!object || !slab || !node_match(slab, node))) {
-		object = __slab_alloc(s, gfpflags, node, addr, c);
+		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
 	} else {
 		void *next_object = get_freepointer_safe(s, object);
 
@@ -3747,7 +3793,7 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			 * of re-populating per CPU c->freelist
 			 */
 			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
-					    _RET_IP_, c);
+					    _RET_IP_, c, s->object_size);
 			if (unlikely(!p[i]))
 				goto error;
 
@@ -4150,12 +4196,17 @@  static int calculate_sizes(struct kmem_cache *s)
 	}
 
 #ifdef CONFIG_SLUB_DEBUG
-	if (flags & SLAB_STORE_USER)
+	if (flags & SLAB_STORE_USER) {
 		/*
 		 * Need to store information about allocs and frees after
 		 * the object.
 		 */
 		size += 2 * sizeof(struct track);
+
+		/* Save the original kmalloc request size */
+		if (flags & SLAB_KMALLOC)
+			size += sizeof(unsigned int);
+	}
 #endif
 
 	kasan_cache_create(s, &size, &s->flags);
@@ -4770,7 +4821,7 @@  void __init kmem_cache_init(void)
 
 	/* Now we can use the kmem_cache to allocate kmalloc slabs */
 	setup_kmalloc_cache_index_table();
-	create_kmalloc_caches(0);
+	create_kmalloc_caches(SLAB_KMALLOC);
 
 	/* Setup random freelists for each cache */
 	init_freelist_randomization();
@@ -4937,6 +4988,7 @@  struct location {
 	depot_stack_handle_t handle;
 	unsigned long count;
 	unsigned long addr;
+	unsigned long waste;
 	long long sum_time;
 	long min_time;
 	long max_time;
@@ -4983,13 +5035,15 @@  static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
 }
 
 static int add_location(struct loc_track *t, struct kmem_cache *s,
-				const struct track *track)
+				const struct track *track,
+				unsigned int orig_size)
 {
 	long start, end, pos;
 	struct location *l;
-	unsigned long caddr, chandle;
+	unsigned long caddr, chandle, cwaste;
 	unsigned long age = jiffies - track->when;
 	depot_stack_handle_t handle = 0;
+	unsigned int waste = s->object_size - orig_size;
 
 #ifdef CONFIG_STACKDEPOT
 	handle = READ_ONCE(track->handle);
@@ -5007,11 +5061,13 @@  static int add_location(struct loc_track *t, struct kmem_cache *s,
 		if (pos == end)
 			break;
 
-		caddr = t->loc[pos].addr;
-		chandle = t->loc[pos].handle;
-		if ((track->addr == caddr) && (handle == chandle)) {
+		l = &t->loc[pos];
+		caddr = l->addr;
+		chandle = l->handle;
+		cwaste = l->waste;
+		if ((track->addr == caddr) && (handle == chandle) &&
+			(waste == cwaste)) {
 
-			l = &t->loc[pos];
 			l->count++;
 			if (track->when) {
 				l->sum_time += age;
@@ -5036,6 +5092,9 @@  static int add_location(struct loc_track *t, struct kmem_cache *s,
 			end = pos;
 		else if (track->addr == caddr && handle < chandle)
 			end = pos;
+		else if (track->addr == caddr && handle == chandle &&
+				waste < cwaste)
+			end = pos;
 		else
 			start = pos;
 	}
@@ -5059,6 +5118,7 @@  static int add_location(struct loc_track *t, struct kmem_cache *s,
 	l->min_pid = track->pid;
 	l->max_pid = track->pid;
 	l->handle = handle;
+	l->waste = waste;
 	cpumask_clear(to_cpumask(l->cpus));
 	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
 	nodes_clear(l->nodes);
@@ -5077,7 +5137,7 @@  static void process_slab(struct loc_track *t, struct kmem_cache *s,
 
 	for_each_object(p, s, addr, slab->objects)
 		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
-			add_location(t, s, get_track(s, p, alloc));
+			add_location(t, s, get_track(s, p, alloc), get_orig_size(s, p));
 }
 #endif  /* CONFIG_DEBUG_FS   */
 #endif	/* CONFIG_SLUB_DEBUG */
@@ -5942,6 +6002,10 @@  static int slab_debugfs_show(struct seq_file *seq, void *v)
 		else
 			seq_puts(seq, "<not-available>");
 
+		if (l->waste)
+			seq_printf(seq, " waste=%lu/%lu",
+				l->count * l->waste, l->waste);
+
 		if (l->sum_time != l->min_time) {
 			seq_printf(seq, " age=%ld/%llu/%ld",
 				l->min_time, div_u64(l->sum_time, l->count),