
[1/3] mm/slub: fix the race between validate_slab and slab_free

Message ID 20220529081535.69275-1-rongwei.wang@linux.alibaba.com (mailing list archive)
State New
Series [1/3] mm/slub: fix the race between validate_slab and slab_free

Commit Message

Rongwei Wang May 29, 2022, 8:15 a.m. UTC
In use cases that allocate and free slab objects frequently, error
messages such as "Left Redzone overwritten" or "First byte 0xbb
instead of 0xcc" can be printed when validating slabs. That is
because an object has already been filled with SLAB_RED_INACTIVE
but has not yet been added to the slab's freelist, and slab
validation can run between these two steps.
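
For example, the problematic interleaving looks roughly like this (a
sketch; the exact call sites may vary):

  CPU A ("kfree" slow path)          CPU B ("slabinfo -v")
  -------------------------          ---------------------
  free_debug_processing()
    fills the object with
    SLAB_RED_INACTIVE, then
    releases its locks
                                     validate_slab() finds an object
                                     that is not on the freelist but
                                     is already poisoned as inactive
                                     -> "First byte 0xbb instead of 0xcc"
  cmpxchg in __slab_free() finally
  adds the object to the freelist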

This does not mean the slab cannot work stably, but these confusing
messages make slab debugging harder.

Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
---
 mm/slub.c | 40 +++++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

Comments

Hyeonggon Yoo May 29, 2022, 11:37 a.m. UTC | #1
On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote:
> In use cases where allocating and freeing slab frequently, some
> error messages, such as "Left Redzone overwritten", "First byte
> 0xbb instead of 0xcc" would be printed when validating slabs.
> That's because an object has been filled with SLAB_RED_INACTIVE,
> but has not been added to slab's freelist. And between these
> two states, the behaviour of validating slab is likely to occur.
> 
> Actually, it doesn't mean the slab can not work stably. But, these
> confusing messages will disturb slab debugging more or less.
> 
> Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>

Have you observed it, or is it from code inspection?

> ---
>  mm/slub.c | 40 +++++++++++++++++-----------------------
>  1 file changed, 17 insertions(+), 23 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index ed5c2c03a47a..310e56d99116 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
>  	void *head, void *tail, int bulk_cnt,
>  	unsigned long addr)
>  {
> -	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
>  	void *object = head;
>  	int cnt = 0;
> -	unsigned long flags, flags2;
> +	unsigned long flags;
>  	int ret = 0;
>  
> -	spin_lock_irqsave(&n->list_lock, flags);
> -	slab_lock(slab, &flags2);
> -
> +	slab_lock(slab, &flags);
>  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
>  		if (!check_slab(s, slab))
>  			goto out;
> @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
>  		slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
>  			 bulk_cnt, cnt);
>  
> -	slab_unlock(slab, &flags2);
> -	spin_unlock_irqrestore(&n->list_lock, flags);
> +	slab_unlock(slab, &flags);
>  	if (!ret)
>  		slab_fix(s, "Object at 0x%p not freed", object);
>  	return ret;
> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  
>  {
>  	void *prior;
> -	int was_frozen;
> +	int was_frozen, to_take_off = 0;
>  	struct slab new;
>  	unsigned long counters;
>  	struct kmem_cache_node *n = NULL;
> @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  	if (kfence_free(head))
>  		return;
>  
> +	n = get_node(s, slab_nid(slab));
> +	spin_lock_irqsave(&n->list_lock, flags);
> +

Oh please don't do this.

The SLUB free slowpath can be hit a lot depending on the workload.

__slab_free() tries its best not to take n->list_lock; currently it takes
n->list_lock only when the slab needs to be taken off a list.

Unconditionally taking n->list_lock will degrade performance.

>  	if (kmem_cache_debug(s) &&
> -	    !free_debug_processing(s, slab, head, tail, cnt, addr))
> +	    !free_debug_processing(s, slab, head, tail, cnt, addr)) {
> +
> +		spin_unlock_irqrestore(&n->list_lock, flags);
>  		return;
> +	}
>  
>  	do {
> -		if (unlikely(n)) {
> -			spin_unlock_irqrestore(&n->list_lock, flags);
> -			n = NULL;
> -		}
> +		if (unlikely(to_take_off))
> +			to_take_off = 0;
>  		prior = slab->freelist;
>  		counters = slab->counters;
>  		set_freepointer(s, tail, prior);
> @@ -3343,18 +3343,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  				new.frozen = 1;
>  
>  			} else { /* Needs to be taken off a list */
> -
> -				n = get_node(s, slab_nid(slab));
>  				/*
> -				 * Speculatively acquire the list_lock.
>  				 * If the cmpxchg does not succeed then we may
> -				 * drop the list_lock without any processing.
> -				 *
> -				 * Otherwise the list_lock will synchronize with
> -				 * other processors updating the list of slabs.
> +				 * drop this behavior without any processing.
>  				 */
> -				spin_lock_irqsave(&n->list_lock, flags);
> -
> +				to_take_off = 1;
>  			}
>  		}
>  
> @@ -3363,8 +3356,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  		head, new.counters,
>  		"__slab_free"));
>  
> -	if (likely(!n)) {
> +	if (likely(!to_take_off)) {
>  
> +		spin_unlock_irqrestore(&n->list_lock, flags);
>  		if (likely(was_frozen)) {
>  			/*
>  			 * The list lock was not taken therefore no list
> 
> -- 
> 2.27.0
>
David Rientjes May 30, 2022, 9:14 p.m. UTC | #2
On Sun, 29 May 2022, Hyeonggon Yoo wrote:

> > diff --git a/mm/slub.c b/mm/slub.c
> > index ed5c2c03a47a..310e56d99116 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
> >  	void *head, void *tail, int bulk_cnt,
> >  	unsigned long addr)
> >  {
> > -	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> >  	void *object = head;
> >  	int cnt = 0;
> > -	unsigned long flags, flags2;
> > +	unsigned long flags;
> >  	int ret = 0;
> >  
> > -	spin_lock_irqsave(&n->list_lock, flags);
> > -	slab_lock(slab, &flags2);
> > -
> > +	slab_lock(slab, &flags);
> >  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> >  		if (!check_slab(s, slab))
> >  			goto out;
> > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
> >  		slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
> >  			 bulk_cnt, cnt);
> >  
> > -	slab_unlock(slab, &flags2);
> > -	spin_unlock_irqrestore(&n->list_lock, flags);
> > +	slab_unlock(slab, &flags);
> >  	if (!ret)
> >  		slab_fix(s, "Object at 0x%p not freed", object);
> >  	return ret;
> > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> >  
> >  {
> >  	void *prior;
> > -	int was_frozen;
> > +	int was_frozen, to_take_off = 0;
> >  	struct slab new;
> >  	unsigned long counters;
> >  	struct kmem_cache_node *n = NULL;
> > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> >  	if (kfence_free(head))
> >  		return;
> >  
> > +	n = get_node(s, slab_nid(slab));
> > +	spin_lock_irqsave(&n->list_lock, flags);
> > +
> 
> Oh please don't do this.
> 
> SLUB free slowpath can be hit a lot depending on workload.
> 
> __slab_free() try its best not to take n->list_lock. currently takes n->list_lock
> only when the slab need to be taken from list.
> 
> Unconditionally taking n->list_lock will degrade performance.
> 

This is a good point; it would be useful to gather some benchmarks for 
workloads that are known to thrash some caches and would hit this path, 
such as netperf TCP_RR.
Muchun Song May 31, 2022, 3:47 a.m. UTC | #3
On Sun, May 29, 2022 at 11:37:06AM +0000, Hyeonggon Yoo wrote:
> On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote:
> > In use cases where allocating and freeing slab frequently, some
> > error messages, such as "Left Redzone overwritten", "First byte
> > 0xbb instead of 0xcc" would be printed when validating slabs.
> > That's because an object has been filled with SLAB_RED_INACTIVE,
> > but has not been added to slab's freelist. And between these
> > two states, the behaviour of validating slab is likely to occur.
> > 
> > Actually, it doesn't mean the slab can not work stably. But, these
> > confusing messages will disturb slab debugging more or less.
> > 
> > Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
> 
> Have you observed it or it's from code inspection?
> 
> > ---
> >  mm/slub.c | 40 +++++++++++++++++-----------------------
> >  1 file changed, 17 insertions(+), 23 deletions(-)
> > 
> > diff --git a/mm/slub.c b/mm/slub.c
> > index ed5c2c03a47a..310e56d99116 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
> >  	void *head, void *tail, int bulk_cnt,
> >  	unsigned long addr)
> >  {
> > -	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> >  	void *object = head;
> >  	int cnt = 0;
> > -	unsigned long flags, flags2;
> > +	unsigned long flags;
> >  	int ret = 0;
> >  
> > -	spin_lock_irqsave(&n->list_lock, flags);
> > -	slab_lock(slab, &flags2);
> > -
> > +	slab_lock(slab, &flags);
> >  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> >  		if (!check_slab(s, slab))
> >  			goto out;
> > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
> >  		slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
> >  			 bulk_cnt, cnt);
> >  
> > -	slab_unlock(slab, &flags2);
> > -	spin_unlock_irqrestore(&n->list_lock, flags);
> > +	slab_unlock(slab, &flags);
> >  	if (!ret)
> >  		slab_fix(s, "Object at 0x%p not freed", object);
> >  	return ret;
> > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> >  
> >  {
> >  	void *prior;
> > -	int was_frozen;
> > +	int was_frozen, to_take_off = 0;
> >  	struct slab new;
> >  	unsigned long counters;
> >  	struct kmem_cache_node *n = NULL;
> > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> >  	if (kfence_free(head))
> >  		return;
> >  
> > +	n = get_node(s, slab_nid(slab));
> > +	spin_lock_irqsave(&n->list_lock, flags);
> > +
> 
> Oh please don't do this.
> 
> SLUB free slowpath can be hit a lot depending on workload.
> 
> __slab_free() try its best not to take n->list_lock. currently takes n->list_lock
> only when the slab need to be taken from list.
> 
> Unconditionally taking n->list_lock will degrade performance.
>

I can confirm you are right. We have encountered this issue in practice.
We have deployed some HDFS instances on a 256-CPU machine.  When there
are lots of IO requests from users, we can see lots of threads contending
on n->list_lock.  Lots of call traces are like the following:

  ffffffff810dfbb4 native_queued_spin_lock_slowpath+0x1a4
  ffffffff81780ffb _raw_spin_lock+0x1b
  ffffffff8127327e get_partial_node.isra.81+0x5e
  ffffffff812752d3 ___slab_alloc+0x2f3
  ffffffff8127559c __slab_alloc+0x1c
  ffffffff81275828 kmem_cache_alloc+0x278
  ffffffff812e9e3d alloc_buffer_head+0x1d
  ffffffff812e9f74 alloc_page_buffers+0xa4
  ffffffff812eb0e9 create_empty_buffers+0x19
  ffffffff812eb37d create_page_buffers+0x7d
  ffffffff812ed78d __block_write_begin_int+0x9d

I thought it was because there are lots of threads which consume the local
CPU slab cache quickly and then all of them try to get a new slab
from the node partial list.  Since there are 256 CPUs, the contention
is heavier and easier to observe.

Any thoughts on this issue (e.g. how to ease the contention)? Comments
are welcome.

Thanks.
Rongwei Wang May 31, 2022, 8:50 a.m. UTC | #4
On 5/29/22 7:37 PM, Hyeonggon Yoo wrote:
> On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote:
>> In use cases where allocating and freeing slab frequently, some
>> error messages, such as "Left Redzone overwritten", "First byte
>> 0xbb instead of 0xcc" would be printed when validating slabs.
>> That's because an object has been filled with SLAB_RED_INACTIVE,
>> but has not been added to slab's freelist. And between these
>> two states, the behaviour of validating slab is likely to occur.
>>
>> Actually, it doesn't mean the slab can not work stably. But, these
>> confusing messages will disturb slab debugging more or less.
>>
>> Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
> 
> Have you observed it or it's from code inspection?
Hi, Hyeonggon

I built a module to trigger the race:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>

#define SLUB_KTHREAD_MAX 1

static struct task_struct *slub_thread[SLUB_KTHREAD_MAX];

static int do_slub_alloc(void *data)
{
        char *mm = NULL;

        allow_signal(SIGTERM);

        while (1) {
                /* allocate from kmalloc-2048, dirty it, then free it */
                mm = kmalloc(2048, GFP_KERNEL);
                if (mm)
                        mm[0x100] = 0x21;

                kfree(mm);
                mm = NULL;

                if (kthread_should_stop())
                        break;
        }

        return 0;
}

static int __init mini_init(void)
{
        int i;
        char kth_name[11] = "do_slub_00";

        for (i = 0; i < SLUB_KTHREAD_MAX; i++) {
                kth_name[9] = '0' + i % 10;
                kth_name[8] = '0' + i / 10;
                slub_thread[i] = kthread_run(do_slub_alloc, NULL, kth_name);
        }

        return 0;
}
module_init(mini_init);

MODULE_LICENSE("GPL");

And on my system, I add 'slub_debug=UFPZ' to the boot options. Then the 
error messages are printed when I run "slabinfo -v" or "echo 1 > 
/sys/kernel/slab/kmalloc-2048/validate".

> 
>> ---
>>   mm/slub.c | 40 +++++++++++++++++-----------------------
>>   1 file changed, 17 insertions(+), 23 deletions(-)
>>
>> diff --git a/mm/slub.c b/mm/slub.c
>> index ed5c2c03a47a..310e56d99116 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
>>   	void *head, void *tail, int bulk_cnt,
>>   	unsigned long addr)
>>   {
>> -	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
>>   	void *object = head;
>>   	int cnt = 0;
>> -	unsigned long flags, flags2;
>> +	unsigned long flags;
>>   	int ret = 0;
>>   
>> -	spin_lock_irqsave(&n->list_lock, flags);
>> -	slab_lock(slab, &flags2);
>> -
>> +	slab_lock(slab, &flags);
>>   	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
>>   		if (!check_slab(s, slab))
>>   			goto out;
>> @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
>>   		slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
>>   			 bulk_cnt, cnt);
>>   
>> -	slab_unlock(slab, &flags2);
>> -	spin_unlock_irqrestore(&n->list_lock, flags);
>> +	slab_unlock(slab, &flags);
>>   	if (!ret)
>>   		slab_fix(s, "Object at 0x%p not freed", object);
>>   	return ret;
>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>>   
>>   {
>>   	void *prior;
>> -	int was_frozen;
>> +	int was_frozen, to_take_off = 0;
>>   	struct slab new;
>>   	unsigned long counters;
>>   	struct kmem_cache_node *n = NULL;
>> @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>>   	if (kfence_free(head))
>>   		return;
>>   
>> +	n = get_node(s, slab_nid(slab));
>> +	spin_lock_irqsave(&n->list_lock, flags);
>> +
> 
> Oh please don't do this.
> 
> SLUB free slowpath can be hit a lot depending on workload.
Thanks, your words remind me. Actually, I moved the lock that was 
originally inside free_debug_processing() to the outside of it, so the 
change looks small. Indeed, it will degrade performance more or less.

Do you have any other ideas? :)

-wrw
> 
> __slab_free() try its best not to take n->list_lock. currently takes n->list_lock
> only when the slab need to be taken from list.
> 
> Unconditionally taking n->list_lock will degrade performance.
> 
>>   	if (kmem_cache_debug(s) &&
>> -	    !free_debug_processing(s, slab, head, tail, cnt, addr))
>> +	    !free_debug_processing(s, slab, head, tail, cnt, addr)) {
>> +
>> +		spin_unlock_irqrestore(&n->list_lock, flags);
>>   		return;
>> +	}
>>   
>>   	do {
>> -		if (unlikely(n)) {
>> -			spin_unlock_irqrestore(&n->list_lock, flags);
>> -			n = NULL;
>> -		}
>> +		if (unlikely(to_take_off))
>> +			to_take_off = 0;
>>   		prior = slab->freelist;
>>   		counters = slab->counters;
>>   		set_freepointer(s, tail, prior);
>> @@ -3343,18 +3343,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>>   				new.frozen = 1;
>>   
>>   			} else { /* Needs to be taken off a list */
>> -
>> -				n = get_node(s, slab_nid(slab));
>>   				/*
>> -				 * Speculatively acquire the list_lock.
>>   				 * If the cmpxchg does not succeed then we may
>> -				 * drop the list_lock without any processing.
>> -				 *
>> -				 * Otherwise the list_lock will synchronize with
>> -				 * other processors updating the list of slabs.
>> +				 * drop this behavior without any processing.
>>   				 */
>> -				spin_lock_irqsave(&n->list_lock, flags);
>> -
>> +				to_take_off = 1;
>>   			}
>>   		}
>>   
>> @@ -3363,8 +3356,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>>   		head, new.counters,
>>   		"__slab_free"));
>>   
>> -	if (likely(!n)) {
>> +	if (likely(!to_take_off)) {
>>   
>> +		spin_unlock_irqrestore(&n->list_lock, flags);
>>   		if (likely(was_frozen)) {
>>   			/*
>>   			 * The list lock was not taken therefore no list
>>
>> -- 
>> 2.27.0
>>
Christoph Lameter June 2, 2022, 3:14 p.m. UTC | #5
On Mon, 30 May 2022, David Rientjes wrote:

> > Unconditionally taking n->list_lock will degrade performance.
>
> This is a good point, it would be useful to gather some benchmarks for
> workloads that are known to thrash some caches and would hit this path
> such as netperf TCP_RR.

It's obvious that adding new spinlocks to some of the hottest functions in
the kernel will degrade performance. This goes against the basic design of
these functions, which is to be as efficient as possible.
Rongwei Wang June 3, 2022, 3:35 a.m. UTC | #6
Hi, Christoph, David, Muchun and Hyeonggon

Thanks for your time.

Recently, I have also been looking for other ways to solve this. The case 
provided by Muchun is useful (thanks, Muchun!). Indeed, it seems unwise to 
take n->list_lock here. Actually, I'm not sure whether you recognize the 
existence of such a race? If everyone agrees the race exists, then the next 
question may be: do we want to solve this problem, or, as David said, would 
it be better to deprecate the validate attribute directly? I have no strong 
opinion here and hope to rely on your experience.

In fact, I mainly want to collect your views on whether and how to fix 
this bug. Thanks!

Thanks again for your time:).
-wrw

On 6/2/22 11:14 PM, Christoph Lameter wrote:
> On Mon, 30 May 2022, David Rientjes wrote:
> 
>>> Unconditionally taking n->list_lock will degrade performance.
>>
>> This is a good point, it would be useful to gather some benchmarks for
>> workloads that are known to thrash some caches and would hit this path
>> such as netperf TCP_RR.
> 
> Its obvious that adding new spinlocks to some of the hottest functions in
> the kernel will degrade performance. This goes against the basic design of
> these functions to be as efficient as possible.
Hyeonggon Yoo June 4, 2022, 11:05 a.m. UTC | #7
On Tue, May 31, 2022 at 11:47:22AM +0800, Muchun Song wrote:
> On Sun, May 29, 2022 at 11:37:06AM +0000, Hyeonggon Yoo wrote:
> > On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote:
> > > In use cases where allocating and freeing slab frequently, some
> > > error messages, such as "Left Redzone overwritten", "First byte
> > > 0xbb instead of 0xcc" would be printed when validating slabs.
> > > That's because an object has been filled with SLAB_RED_INACTIVE,
> > > but has not been added to slab's freelist. And between these
> > > two states, the behaviour of validating slab is likely to occur.
> > > 
> > > Actually, it doesn't mean the slab can not work stably. But, these
> > > confusing messages will disturb slab debugging more or less.
> > > 
> > > Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
> > 
> > Have you observed it or it's from code inspection?
> > 
> > > ---
> > >  mm/slub.c | 40 +++++++++++++++++-----------------------
> > >  1 file changed, 17 insertions(+), 23 deletions(-)
> > > 
> > > diff --git a/mm/slub.c b/mm/slub.c
> > > index ed5c2c03a47a..310e56d99116 100644
> > > --- a/mm/slub.c
> > > +++ b/mm/slub.c
> > > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
> > >  	void *head, void *tail, int bulk_cnt,
> > >  	unsigned long addr)
> > >  {
> > > -	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> > >  	void *object = head;
> > >  	int cnt = 0;
> > > -	unsigned long flags, flags2;
> > > +	unsigned long flags;
> > >  	int ret = 0;
> > >  
> > > -	spin_lock_irqsave(&n->list_lock, flags);
> > > -	slab_lock(slab, &flags2);
> > > -
> > > +	slab_lock(slab, &flags);
> > >  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> > >  		if (!check_slab(s, slab))
> > >  			goto out;
> > > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
> > >  		slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
> > >  			 bulk_cnt, cnt);
> > >  
> > > -	slab_unlock(slab, &flags2);
> > > -	spin_unlock_irqrestore(&n->list_lock, flags);
> > > +	slab_unlock(slab, &flags);
> > >  	if (!ret)
> > >  		slab_fix(s, "Object at 0x%p not freed", object);
> > >  	return ret;
> > > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> > >  
> > >  {
> > >  	void *prior;
> > > -	int was_frozen;
> > > +	int was_frozen, to_take_off = 0;
> > >  	struct slab new;
> > >  	unsigned long counters;
> > >  	struct kmem_cache_node *n = NULL;
> > > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> > >  	if (kfence_free(head))
> > >  		return;
> > >  
> > > +	n = get_node(s, slab_nid(slab));
> > > +	spin_lock_irqsave(&n->list_lock, flags);
> > > +
> > 
> > Oh please don't do this.
> > 
> > SLUB free slowpath can be hit a lot depending on workload.
> > 
> > __slab_free() try its best not to take n->list_lock. currently takes n->list_lock
> > only when the slab need to be taken from list.
> > 
> > Unconditionally taking n->list_lock will degrade performance.
> >
> 
> I can confirm you are right. We have encountered this issue in practise.
> We have deployed somen HDFS instance on a 256-CPU machine.  When there
> are lots of IO requests from users, we can see lots of threads are contended
> on n->list_lock.  Lots of call traces are like following:
> 
>   ffffffff810dfbb4 native_queued_spin_lock_slowpath+0x1a4
>   ffffffff81780ffb _raw_spin_lock+0x1b
>   ffffffff8127327e get_partial_node.isra.81+0x5e
>   ffffffff812752d3 ___slab_alloc+0x2f3
>   ffffffff8127559c __slab_alloc+0x1c
>   ffffffff81275828 kmem_cache_alloc+0x278
>   ffffffff812e9e3d alloc_buffer_head+0x1d
>   ffffffff812e9f74 alloc_page_buffers+0xa4
>   ffffffff812eb0e9 create_empty_buffers+0x19
>   ffffffff812eb37d create_page_buffers+0x7d
>   ffffffff812ed78d __block_write_begin_int+0x9d
> 
> I thought it was because there are lots of threads which consume local
> CPU slab cache quickly and then both of them try to get a new slab
> from node partial list.  Since there are 256 CPUs, the contention
> is more competitive and easy to be visible.
> 
> Any thoughts on this issue (e.e. how to ease contention)? Comments
> are welcome.

How does increasing number of partial slabs affect your situation?
(increasing /sys/slab/<cache name>/cpu_partial)

> Thanks.
> 
>
Christoph Lameter June 7, 2022, 12:14 p.m. UTC | #8
On Fri, 3 Jun 2022, Rongwei Wang wrote:

> Recently, I am also find other ways to solve this. That case was provided by
> Muchun is useful (Thanks Muchun!). Indeed, it seems that use n->list_lock here
> is unwise. Actually, I'm not sure if you recognize the existence of such race?
> If all agrees this race, then the next question may be: do we want to solve
> this problem? or as David said, it would be better to deprecate validate
> attribute directly. I have no idea about it, hope to rely on your experience.
>
> In fact, I mainly want to collect your views on whether or how to fix this bug
> here. Thanks!


Well validate_slab() is rarely used and should not cause the hot paths to
incur performance penalties. Fix it in the validation logic somehow? Or
document the issue and warn that validation may not be correct if there
are current operations on the slab being validated.
Rongwei Wang June 8, 2022, 3:04 a.m. UTC | #9
On 6/7/22 8:14 PM, Christoph Lameter wrote:
> On Fri, 3 Jun 2022, Rongwei Wang wrote:
> 
>> Recently, I am also find other ways to solve this. That case was provided by
>> Muchun is useful (Thanks Muchun!). Indeed, it seems that use n->list_lock here
>> is unwise. Actually, I'm not sure if you recognize the existence of such race?
>> If all agrees this race, then the next question may be: do we want to solve
>> this problem? or as David said, it would be better to deprecate validate
>> attribute directly. I have no idea about it, hope to rely on your experience.
>>
>> In fact, I mainly want to collect your views on whether or how to fix this bug
>> here. Thanks!
> 
> 
> Well validate_slab() is rarely used and should not cause the hot paths to
> incur performance penalties. Fix it in the validation logic somehow? Or
> document the issue and warn that validation may not be correct if there
If that works, I think documenting the issue and warning about this 
incorrect behavior is OK. But it would still print a large amount of 
confusing messages and disturb us?
> are current operations on the slab being validated.
And I am trying to fix it in the following way. In short, these changes 
only take effect under slub debug mode and do not affect the normal mode 
(I'm not sure). It does not look elegant enough. If everyone approves of 
this way, I can submit the next version.

Anyway, thanks for your time:).
-wrw

@@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,

 {
        void *prior;
-       int was_frozen;
+       int was_frozen, to_take_off = 0;
        struct slab new;
        unsigned long counters;
        struct kmem_cache_node *n = NULL;
@@ -3315,14 +3311,23 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
        if (kfence_free(head))
                return;

-       if (kmem_cache_debug(s) &&
-           !free_debug_processing(s, slab, head, tail, cnt, addr))
-               return;
+       n = get_node(s, slab_nid(slab));
+       if (kmem_cache_debug(s)) {
+               int ret;

-       do {
-               if (unlikely(n)) {
+               spin_lock_irqsave(&n->list_lock, flags);
+               ret = free_debug_processing(s, slab, head, tail, cnt, addr);
+               if (!ret) {
                        spin_unlock_irqrestore(&n->list_lock, flags);
-                       n = NULL;
+                       return;
+               }
+       }
+
+       do {
+               if (unlikely(to_take_off)) {
+                       if (!kmem_cache_debug(s))
+                               spin_unlock_irqrestore(&n->list_lock, flags);
+                       to_take_off = 0;
                }
                prior = slab->freelist;
                counters = slab->counters;
@@ -3343,8 +3348,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
                                new.frozen = 1;

                        } else { /* Needs to be taken off a list */
-
-                               n = get_node(s, slab_nid(slab));
                                /*
                                 * Speculatively acquire the list_lock.
                                 * If the cmpxchg does not succeed then we may
@@ -3353,8 +3356,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
                                 * Otherwise the list_lock will synchronize with
                                 * other processors updating the list of slabs.
                                 */
-                               spin_lock_irqsave(&n->list_lock, flags);
+                               if (!kmem_cache_debug(s))
+                                       spin_lock_irqsave(&n->list_lock, flags);

+                               to_take_off = 1;
                        }
                }

@@ -3363,8 +3368,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
                head, new.counters,
                "__slab_free"));

-       if (likely(!n)) {
-
+       if (likely(!to_take_off)) {
+               if (kmem_cache_debug(s))
+                       spin_unlock_irqrestore(&n->list_lock, flags);
                if (likely(was_frozen)) {
                        /*
                         * The list lock was not taken therefore no list
>
Christoph Lameter June 8, 2022, 12:23 p.m. UTC | #10
On Wed, 8 Jun 2022, Rongwei Wang wrote:

> If available, I think document the issue and warn this incorrect behavior is
> OK. But it still prints a large amount of confusing messages, and disturbs us?

Correct it would be great if you could fix this in a way that does not
impact performance.

> > are current operations on the slab being validated.
> And I am trying to fix it in following way. In a short, these changes only
> works under the slub debug mode, and not affects the normal mode (I'm not
> sure). It looks not elegant enough. And if all approve of this way, I can
> submit the next version.


>
> Anyway, thanks for your time:).
> -wrw
>
> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
struct
> slab *slab,
>
>  {
>         void *prior;
> -       int was_frozen;
> +       int was_frozen, to_take_off = 0;
>         struct slab new;

to_take_off has the role of !n ? Why is that needed?

> -       do {
> -               if (unlikely(n)) {
> +               spin_lock_irqsave(&n->list_lock, flags);
> +               ret = free_debug_processing(s, slab, head, tail, cnt, addr);

Ok so the idea is to take the lock only if kmem_cache_debug. That looks
ok. But it still adds a number of new branches etc to the free loop.

Some performance tests would be useful.
Rongwei Wang June 11, 2022, 4:04 a.m. UTC | #11
On 6/8/22 8:23 PM, Christoph Lameter wrote:
> On Wed, 8 Jun 2022, Rongwei Wang wrote:
> 
>> If available, I think document the issue and warn this incorrect behavior is
>> OK. But it still prints a large amount of confusing messages, and disturbs us?
> 
> Correct it would be great if you could fix this in a way that does not
> impact performance.
> 
>>> are current operations on the slab being validated.
>> And I am trying to fix it in following way. In a short, these changes only
>> works under the slub debug mode, and not affects the normal mode (I'm not
>> sure). It looks not elegant enough. And if all approve of this way, I can
>> submit the next version.
> 
> 
>>
>> Anyway, thanks for your time:).
>> -wrw
>>
>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
> struct
>> slab *slab,
>>
>>   {
>>          void *prior;
>> -       int was_frozen;
>> +       int was_frozen, to_take_off = 0;
>>          struct slab new;
> 
> to_take_off has the role of !n ? Why is that needed?
> 
>> -       do {
>> -               if (unlikely(n)) {
>> +               spin_lock_irqsave(&n->list_lock, flags);
>> +               ret = free_debug_processing(s, slab, head, tail, cnt, addr);
> 
> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
> ok. But it still adds a number of new branches etc to the free loop.
> 
> Some performance tests would be useful.
Hi Christoph

Thanks for your time!
Do you have any advice on which benchmarks I should run? I see that 
hackbench and lkp were used frequently for mm/slub.c commits [1,2], but I 
have no idea how to use these two benchmarks to cover the above changes. 
Can you give some examples? Thanks very much!

Sorry for the late reply!

[1]https://lore.kernel.org/lkml/20210301080404.GF12822@xsang-OptiPlex-9020/
[2]https://lore.kernel.org/linux-mm/20210128134512.GF3592@techsingularity.net/
Christoph Lameter June 13, 2022, 1:50 p.m. UTC | #12
On Sat, 11 Jun 2022, Rongwei Wang wrote:

> > Ok so the idea is to take the lock only if kmem_cache_debug. That looks
> > ok. But it still adds a number of new branches etc to the free loop.
> >
> > Some performance tests would be useful.
> Hi Christoph
>
> Thanks for your time!
> Do you have some advice in benchmarks that need me to test? And I find that
> hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no
> idea how to use these two benchmarks test to cover the above changes. Can you
> give some examples? Thanks very much!


Hi Rongwei,

Well, run hackbench with and without the change.

There are also synthetic benchmarks available  at
https://gentwo.org/christoph/slub/tests/

These measure the cycles that slab operations take. However, they are a
bit old and I think Pekka may have a newer version of these
patches.

Greetings,
	Christoph
Rongwei Wang June 14, 2022, 2:38 a.m. UTC | #13
On 6/13/22 9:50 PM, Christoph Lameter wrote:
> On Sat, 11 Jun 2022, Rongwei Wang wrote:
> 
>>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>>> ok. But it still adds a number of new branches etc to the free loop.
>>>
>>> Some performance tests would be useful.
>> Hi Christoph
>>
>> Thanks for your time!
>> Do you have some advice in benchmarks that need me to test? And I find that
>> hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no
>> idea how to use these two benchmarks test to cover the above changes. Can you
>> give some examples? Thanks very much!
> 
> 
> Hi Rongwei,
> 
> Well run hackbench with an without the change.
OK
> 
> There are also synthetic benchmarks available  at
> https://gentwo.org/christoph/slub/tests/
That's great, thanks very much!
> 
> These measure the cycles that slab operations take. However, they are a
> bit old and I think Pekka may have a newer version of these
> patches.
> 
> Greetings,
> 	Christoph
Rongwei Wang June 17, 2022, 7:55 a.m. UTC | #14
On 6/13/22 9:50 PM, Christoph Lameter wrote:
> On Sat, 11 Jun 2022, Rongwei Wang wrote:
> 
>>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>>> ok. But it still adds a number of new branches etc to the free loop.
>>>
>>> Some performance tests would be useful.
>> Hi Christoph
>>
>> Thanks for your time!
>> Do you have some advice in benchmarks that need me to test? And I find that
>> hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no
>> idea how to use these two benchmarks test to cover the above changes. Can you
>> give some examples? Thanks very much!
> 
> 
> Hi Rongwei,
> 
> Well run hackbench with an without the change.
> 
> There are also synthetic benchmarks available  at
> https://gentwo.org/christoph/slub/tests/
Christoph, I referred to [1] and tested some data below. The slub_test case 
is the same as the one you provided. Here are the results of the test (the 
baseline is the data from the upstream kernel, and fix is the result from 
the patched kernel).

My test environment: arm64 VM (32 cores and 128G memory)

And I removed 'slub_debug=UFPZ' from the cmdline before testing the 
following two groups of data.

[1]https://lore.kernel.org/linux-mm/20200527103545.4348ac10@carbon/

Single thread testing

1. Kmalloc: Repeatedly allocate then free test

                    before (baseline)        fix
                    kmalloc      kfree       kmalloc      kfree
10000 times 8      7 cycles     8 cycles    5 cycles     7 cycles
10000 times 16     4 cycles     8 cycles    3 cycles     6 cycles
10000 times 32     4 cycles     8 cycles    3 cycles     6 cycles
10000 times 64     3 cycles     8 cycles    3 cycles     6 cycles
10000 times 128    3 cycles     8 cycles    3 cycles     6 cycles
10000 times 256    12 cycles    8 cycles    11 cycles    7 cycles
10000 times 512    27 cycles    10 cycles   23 cycles    11 cycles
10000 times 1024   18 cycles    9 cycles    20 cycles    10 cycles
10000 times 2048   54 cycles    12 cycles   54 cycles    12 cycles
10000 times 4096   105 cycles   20 cycles   105 cycles   25 cycles
10000 times 8192   210 cycles   35 cycles   212 cycles   39 cycles
10000 times 16384  133 cycles   45 cycles   119 cycles   46 cycles


2. Kmalloc: alloc/free test

                                   before (base)   fix
10000 times kmalloc(8)/kfree      3 cycles        3 cycles
10000 times kmalloc(16)/kfree     3 cycles        3 cycles
10000 times kmalloc(32)/kfree     3 cycles        3 cycles
10000 times kmalloc(64)/kfree     3 cycles        3 cycles
10000 times kmalloc(128)/kfree    3 cycles        3 cycles
10000 times kmalloc(256)/kfree    3 cycles        3 cycles
10000 times kmalloc(512)/kfree    3 cycles        3 cycles
10000 times kmalloc(1024)/kfree   3 cycles        3 cycles
10000 times kmalloc(2048)/kfree   3 cycles        3 cycles
10000 times kmalloc(4096)/kfree   3 cycles        3 cycles
10000 times kmalloc(8192)/kfree   3 cycles        3 cycles
10000 times kmalloc(16384)/kfree  33 cycles       33 cycles


Concurrent allocs

                                 before (baseline)   fix
Kmalloc N*alloc N*free(8)       Average=17/18       Average=11/11
Kmalloc N*alloc N*free(16)      Average=15/49       Average=9/11
Kmalloc N*alloc N*free(32)      Average=15/40       Average=9/11
Kmalloc N*alloc N*free(64)      Average=15/44       Average=9/10
Kmalloc N*alloc N*free(128)     Average=15/42       Average=10/10
Kmalloc N*alloc N*free(256)     Average=128/28      Average=71/22
Kmalloc N*alloc N*free(512)     Average=206/34      Average=178/26
Kmalloc N*alloc N*free(1024)	Average=762/37      Average=369/27
Kmalloc N*alloc N*free(2048)	Average=327/58      Average=339/33
Kmalloc N*alloc N*free(4096)    Average=2255/128    Average=1813/64

                                 before (baseline)   fix
Kmalloc N*(alloc free)(8)       Average=3           Average=3
Kmalloc N*(alloc free)(16)      Average=3           Average=3
Kmalloc N*(alloc free)(32)      Average=3           Average=3
Kmalloc N*(alloc free)(64)      Average=3           Average=3
Kmalloc N*(alloc free)(128)     Average=3           Average=3
Kmalloc N*(alloc free)(256)     Average=3           Average=3
Kmalloc N*(alloc free)(512)     Average=3           Average=3
Kmalloc N*(alloc free)(1024)    Average=3           Average=3
Kmalloc N*(alloc free)(2048)    Average=3           Average=3
Kmalloc N*(alloc free)(4096)	Average=3           Average=3

According to the above data, it seems there is no significant performance 
degradation in the patched kernel. Plus, in the concurrent allocs test, 
like Kmalloc N*alloc N*free(1024), the data in the 'fix' column is better 
than the baseline (it looks like less is better; if I am wrong, please let 
me know). And if you have other suggestions, I can try to test more data.

Thanks for your time!
-wrw
> 
> These measure the cycles that slab operations take. However, they are a
> bit old and I think Pekka may have a newer version of these
> patches.
> 
> Greetings,
> 	Christoph
Vlastimil Babka June 17, 2022, 9:40 a.m. UTC | #15
On 6/8/22 14:23, Christoph Lameter wrote:
> On Wed, 8 Jun 2022, Rongwei Wang wrote:
> 
>> If available, I think document the issue and warn this incorrect behavior is
>> OK. But it still prints a large amount of confusing messages, and disturbs us?
> 
> Correct it would be great if you could fix this in a way that does not
> impact performance.
> 
>> > are current operations on the slab being validated.
>> And I am trying to fix it in following way. In a short, these changes only
>> works under the slub debug mode, and not affects the normal mode (I'm not
>> sure). It looks not elegant enough. And if all approve of this way, I can
>> submit the next version.
> 
> 
>>
>> Anyway, thanks for your time:).
>> -wrw
>>
>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
> struct
>> slab *slab,
>>
>>  {
>>         void *prior;
>> -       int was_frozen;
>> +       int was_frozen, to_take_off = 0;
>>         struct slab new;
> 
> to_take_off has the role of !n ? Why is that needed?
> 
>> -       do {
>> -               if (unlikely(n)) {
>> +               spin_lock_irqsave(&n->list_lock, flags);
>> +               ret = free_debug_processing(s, slab, head, tail, cnt, addr);
> 
> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
> ok. But it still adds a number of new branches etc to the free loop.

It also further complicates the already tricky code. I wonder if we should
make more benefit from the fact that for kmem_cache_debug() caches we don't
leave any slabs on percpu or percpu partial lists, and also in
free_debug_processing() we aready take both list_lock and slab_lock. If we
just did the freeing immediately there under those locks, we would be
protected against other freeing cpus by that list_lock and don't need the
double cmpxchg tricks.
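
Roughly, an untested sketch of the freeing part (ignoring the full/partial
list bookkeeping), done inside free_debug_processing() while both locks are
held:

        /* both n->list_lock and slab_lock(slab) are already held here */
        set_freepointer(s, tail, slab->freelist);
        slab->freelist = head;
        slab->inuse -= cnt;
        /*
         * then move the slab to the proper list (partial/full) or
         * discard it if it became empty, still under n->list_lock
         */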

What about against allocating cpus? More tricky as those will currently end
up privatizing the freelist via get_partial(), only to deactivate it again,
so our list_lock+slab_lock in freeing path would not protect in the
meanwhile. But the allocation is currently very inefficient for debug
caches, as in get_partial() it will take the list_lock to take the slab from
partial list and then in most cases again in deactivate_slab() to return it.

If instead the allocation path for kmem_cache_debug() cache would take a
single object from the partial list (not whole freelist) under list_lock, it
would be ultimately more efficient, and protect against freeing using
list_lock. Sounds like an idea worth trying to me?
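
Something like the following, perhaps (hypothetical helper, untested,
full-list handling omitted):

        static void *alloc_single_from_partial(struct kmem_cache *s,
                                               struct kmem_cache_node *n)
        {
                struct slab *slab;
                void *object = NULL;
                unsigned long flags;

                spin_lock_irqsave(&n->list_lock, flags);
                slab = list_first_entry_or_null(&n->partial, struct slab,
                                                slab_list);
                if (slab) {
                        /* take one object, not the whole freelist */
                        object = slab->freelist;
                        slab->freelist = get_freepointer(s, object);
                        slab->inuse++;
                        if (!slab->freelist)
                                /* slab became full */
                                remove_partial(n, slab);
                }
                spin_unlock_irqrestore(&n->list_lock, flags);
                return object;
        }
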
And of course we would stop creating the 'validate' sysfs files for
non-debug caches.
Christoph Lameter June 17, 2022, 2:19 p.m. UTC | #16
On Fri, 17 Jun 2022, Rongwei Wang wrote:

> Christoph, I refer [1] to test some data below. The slub_test case is same to
> your provided. And here you the result of its test (the baseline is the data
> of upstream kernel, and fix is results of patched kernel).

Ah good.
> Single thread testing
>
> 1. Kmalloc: Repeatedly allocate then free test
>
>                    before (baseline)        fix
>                    kmalloc      kfree       kmalloc      kfree
> 10000 times 8      7 cycles     8 cycles    5 cycles     7 cycles
> 10000 times 16     4 cycles     8 cycles    3 cycles     6 cycles
> 10000 times 32     4 cycles     8 cycles    3 cycles     6 cycles

Well the cycle reduction is strange. Tests are not done in the same
environment? Maybe good to not use NUMA or bind to the same cpu

> 10000 times 64     3 cycles     8 cycles    3 cycles     6 cycles
> 10000 times 128    3 cycles     8 cycles    3 cycles     6 cycles
> 10000 times 256    12 cycles    8 cycles    11 cycles    7 cycles
> 10000 times 512    27 cycles    10 cycles   23 cycles    11 cycles
> 10000 times 1024   18 cycles    9 cycles    20 cycles    10 cycles
> 10000 times 2048   54 cycles    12 cycles   54 cycles    12 cycles
> 10000 times 4096   105 cycles   20 cycles   105 cycles   25 cycles
> 10000 times 8192   210 cycles   35 cycles   212 cycles   39 cycles
> 10000 times 16384  133 cycles   45 cycles   119 cycles   46 cycles


Seems to be different environments.

> According to the above data, It seems that no significant performance
> degradation in patched kernel. Plus, in concurrent allocs test, likes Kmalloc
> N*alloc N*free(1024), the data of 'fix' column is better than baseline (it
> looks less is better, if I am wrong, please let me know). And if you have
> other suggestions, I can try to test more data.

Well can you explain the cycle reduction?
Rongwei Wang June 18, 2022, 2:33 a.m. UTC | #17
On 6/17/22 10:19 PM, Christoph Lameter wrote:
> On Fri, 17 Jun 2022, Rongwei Wang wrote:
> 
>> Christoph, I refer [1] to test some data below. The slub_test case is same to
>> your provided. And here you the result of its test (the baseline is the data
>> of upstream kernel, and fix is results of patched kernel).
> 
> Ah good.
>> Single thread testing
>>
>> 1. Kmalloc: Repeatedly allocate then free test
>>
>>                     before (baseline)        fix
>>                     kmalloc      kfree       kmalloc      kfree
>> 10000 times 8      7 cycles     8 cycles    5 cycles     7 cycles
>> 10000 times 16     4 cycles     8 cycles    3 cycles     6 cycles
>> 10000 times 32     4 cycles     8 cycles    3 cycles     6 cycles
> 
> Well the cycle reduction is strange. Tests are not done in the same
> environment? Maybe good to not use NUMA or bind to the same cpu
It's the same environment, I can confirm. And there are four nodes (32G 
per node and 8 cores per node) in my test environment. Should I test on a 
single node? If so, I can try.
> 
>> 10000 times 64     3 cycles     8 cycles    3 cycles     6 cycles
>> 10000 times 128    3 cycles     8 cycles    3 cycles     6 cycles
>> 10000 times 256    12 cycles    8 cycles    11 cycles    7 cycles
>> 10000 times 512    27 cycles    10 cycles   23 cycles    11 cycles
>> 10000 times 1024   18 cycles    9 cycles    20 cycles    10 cycles
>> 10000 times 2048   54 cycles    12 cycles   54 cycles    12 cycles
>> 10000 times 4096   105 cycles   20 cycles   105 cycles   25 cycles
>> 10000 times 8192   210 cycles   35 cycles   212 cycles   39 cycles
>> 10000 times 16384  133 cycles   45 cycles   119 cycles   46 cycles
> 
> 
> Seems to be different environments.
> 
>> According to the above data, It seems that no significant performance
>> degradation in patched kernel. Plus, in concurrent allocs test, likes Kmalloc
>> N*alloc N*free(1024), the data of 'fix' column is better than baseline (it
>> looks less is better, if I am wrong, please let me know). And if you have
>> other suggestions, I can try to test more data.
> 
> Well can you explain the cycle reduction?
Maybe because there are four nodes in my system, or only 8 cores (very few) 
in each node? Thanks, you remind me that I need to increase the core count 
of each node or change the node count to compare the results.

Thanks!
Christoph Lameter June 20, 2022, 11:57 a.m. UTC | #18
On Sat, 18 Jun 2022, Rongwei Wang wrote:

> > Well the cycle reduction is strange. Tests are not done in the same
> > environment? Maybe good to not use NUMA or bind to the same cpu
> It's the same environment. I can sure. And there are four nodes (32G per-node
> and 8 cores per-node) in my test environment. whether I need to test in one
> node? If right, I can try.

Ok in a NUMA environment the memory allocation is randomized on bootup.
You may get different numbers after you reboot the system. Try to switch
NUMA off. Use a single node to get consistent numbers.

It maybe useful to figure out what memory structure causes the increase in
latency in a NUMA environment. If you can figure that out and properly
allocate the memory structure that causes the increases in latency then
you may be able to increase the performance of the allocator.
Rongwei Wang June 26, 2022, 4:48 p.m. UTC | #19
On 6/20/22 7:57 PM, Christoph Lameter wrote:
> On Sat, 18 Jun 2022, Rongwei Wang wrote:
> 
>>> Well the cycle reduction is strange. Tests are not done in the same
>>> environment? Maybe good to not use NUMA or bind to the same cpu
>> It's the same environment. I can sure. And there are four nodes (32G per-node
>> and 8 cores per-node) in my test environment. whether I need to test in one
>> node? If right, I can try.
> 
> Ok in a NUMA environment the memory allocation is randomized on bootup.
> You may get different numbers after you reboot the system. Try to switch
> NUMA off. Use s a single node to get consistent numbers.
Sorry for the late reply.

First, let me share my test environment: an arm64 VM (32 cores and 128G 
memory), and I configured only one node for this VM.
Plus, I used 'numactl -N 0 -m 0 qemu-kvm ...' to start this VM, mainly to 
avoid data jitter.

And I can confirm that the physical machine hosting this VM had the same 
configuration when I tested the baseline kernel and the patched kernel. If 
the above test environment has any problems, please let me know.

The following is the latest data:

Single thread testing
1. Kmalloc: Repeatedly allocate then free test
                   before                fix
                   kmalloc    kfree      kmalloc     kfree
10000 times 8     4 cycles   5 cycles	4 cycles    5 cycles
10000 times 16    3 cycles   5 cycles	3 cycles    5 cycles
10000 times 32    3 cycles   5 cycles	3 cycles    5 cycles
10000 times 64    3 cycles   5 cycles	3 cycles    5 cycles
10000 times 128   3 cycles   5 cycles	3 cycles    5 cycles
10000 times 256   14 cycles  9 cycles	6 cycles    8 cycles
10000 times 512   9 cycles   8 cycles	9 cycles    10 cycles
10000 times 1024  48 cycles  10 cycles	6 cycles    10 cycles
10000 times 2048  31 cycles  12 cycles	35 cycles   13 cycles
10000 times 4096  96 cycles  17 cycles	96 cycles   18 cycles
10000 times 8192  188 cycles 27 cycles	190 cycles  27 cycles
10000 times 16384 117 cycles 38 cycles  115 cycles  38 cycles
				
2. Kmalloc: alloc/free test
                                   before        fix
10000 times kmalloc(8)/kfree      3 cycles      3 cycles
10000 times kmalloc(16)/kfree     3 cycles      3 cycles
10000 times kmalloc(32)/kfree     3 cycles      3 cycles
10000 times kmalloc(64)/kfree     3 cycles      3 cycles
10000 times kmalloc(128)/kfree    3 cycles      3 cycles
10000 times kmalloc(256)/kfree    3 cycles      3 cycles
10000 times kmalloc(512)/kfree    3 cycles      3 cycles
10000 times kmalloc(1024)/kfree   3 cycles      3 cycles
10000 times kmalloc(2048)/kfree   3 cycles      3 cycles
10000 times kmalloc(4096)/kfree   3 cycles      3 cycles
10000 times kmalloc(8192)/kfree   3 cycles      3 cycles
10000 times kmalloc(16384)/kfree  33 cycles     33 cycles

Concurrent allocs
                                 before            fix
Kmalloc N*alloc N*free(8)       Average=13/14     Average=14/15
Kmalloc N*alloc N*free(16)      Average=13/15     Average=13/15
Kmalloc N*alloc N*free(32)      Average=13/15     Average=13/15
Kmalloc N*alloc N*free(64)      Average=13/15     Average=13/15
Kmalloc N*alloc N*free(128)     Average=13/15     Average=13/15
Kmalloc N*alloc N*free(256)     Average=137/29    Average=134/39
Kmalloc N*alloc N*free(512)     Average=61/29     Average=64/28
Kmalloc N*alloc N*free(1024)    Average=465/50    Average=656/55
Kmalloc N*alloc N*free(2048)    Average=503/97    Average=422/97
Kmalloc N*alloc N*free(4096)    Average=1592/206  Average=1624/207
		
Kmalloc N*(alloc free)(8)       Average=3         Average=3
Kmalloc N*(alloc free)(16)      Average=3         Average=3
Kmalloc N*(alloc free)(32)      Average=3         Average=3
Kmalloc N*(alloc free)(64)      Average=3         Average=3
Kmalloc N*(alloc free)(128)     Average=3         Average=3
Kmalloc N*(alloc free)(256)     Average=3         Average=3
Kmalloc N*(alloc free)(512)     Average=3         Average=3
Kmalloc N*(alloc free)(1024)    Average=3         Average=3
Kmalloc N*(alloc free)(2048)    Average=3         Average=3
Kmalloc N*(alloc free)(4096)    Average=3         Average=3

Can the above data indicate that this modification (which only takes effect 
when kmem_cache_debug(s) is true) does not introduce a significant 
performance impact?

Thanks for your time.
> 
> It maybe useful to figure out what memory structure causes the increase in
> latency in a NUMA environment. If you can figure that out and properly
> allocate the memory structure that causes the increases in latency then
> you may be able to increase the performance of the allocator.
Rongwei Wang July 15, 2022, 8:05 a.m. UTC | #20
On 6/17/22 5:40 PM, Vlastimil Babka wrote:
> On 6/8/22 14:23, Christoph Lameter wrote:
>> On Wed, 8 Jun 2022, Rongwei Wang wrote:
>>
>>> If available, I think document the issue and warn this incorrect behavior is
>>> OK. But it still prints a large amount of confusing messages, and disturbs us?
>>
>> Correct it would be great if you could fix this in a way that does not
>> impact performance.
>>
>>>> are current operations on the slab being validated.
>>> And I am trying to fix it in following way. In a short, these changes only
>>> works under the slub debug mode, and not affects the normal mode (I'm not
>>> sure). It looks not elegant enough. And if all approve of this way, I can
>>> submit the next version.
>>
>>
>>>
>>> Anyway, thanks for your time:).
>>> -wrw
>>>
>>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
>> struct
>>> slab *slab,
>>>
>>>   {
>>>          void *prior;
>>> -       int was_frozen;
>>> +       int was_frozen, to_take_off = 0;
>>>          struct slab new;
>>
>> to_take_off has the role of !n ? Why is that needed?
>>
>>> -       do {
>>> -               if (unlikely(n)) {
>>> +               spin_lock_irqsave(&n->list_lock, flags);
>>> +               ret = free_debug_processing(s, slab, head, tail, cnt, addr);
>>
>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>> ok. But it still adds a number of new branches etc to the free loop.
> 
Hi Vlastimil, sorry for missing your message for so long.
> It also further complicates the already tricky code. I wonder if we should
> make more benefit from the fact that for kmem_cache_debug() caches we don't
> leave any slabs on percpu or percpu partial lists, and also in
> free_debug_processing() we aready take both list_lock and slab_lock. If we
> just did the freeing immediately there under those locks, we would be
> protected against other freeing cpus by that list_lock and don't need the
> double cmpxchg tricks.
Hmm, I'm not sure I completely get what "don't need the double cmpxchg 
tricks" means. Do you mean replacing cmpxchg_double_slab() here with the 
following code when kmem_cache_debug(s)?

__slab_lock(slab);
if (slab->freelist == freelist_old && slab->counters == counters_old){
     slab->freelist = freelist_new;
     slab->counters = counters_new;
     __slab_unlock(slab);
     local_irq_restore(flags);
     return true;
}
__slab_unlock(slab);

If I am misreading your words, please let me know.
Thanks!
> 
> What about against allocating cpus? More tricky as those will currently end
> up privatizing the freelist via get_partial(), only to deactivate it again,
> so our list_lock+slab_lock in freeing path would not protect in the
> meanwhile. But the allocation is currently very inefficient for debug
> caches, as in get_partial() it will take the list_lock to take the slab from
> partial list and then in most cases again in deactivate_slab() to return it.
It seems that I need to spend some time digesting this. Anyway, thanks.
> 
> If instead the allocation path for kmem_cache_debug() cache would take a
> single object from the partial list (not whole freelist) under list_lock, it
> would be ultimately more efficient, and protect against freeing using
> list_lock. Sounds like an idea worth trying to me?

Hyeonggon had similar advice: split the freeing and allocation of slabs 
away from the debugging paths, like below:


__slab_alloc() {
     if (kmem_cache_debug(s))
         slab_alloc_debug()
     else
         ___slab_alloc()
}

I guess the above code aims to address the problem (idea) you mentioned?

slab_free() {
     if (kmem_cache_debug(s))
         slab_free_debug()
     else
         __do_slab_free()
}

Currently, I only modified the slab freeing code to fix the confusing 
messages from "slabinfo -v". If you agree, I can try to implement the 
slab_alloc_debug() code mentioned above. It may also be a challenge for me.

Thanks for your time.

> And of course we would stop creating the 'validate' sysfs files for
> non-debug caches.
Vlastimil Babka July 15, 2022, 10:33 a.m. UTC | #21
On 7/15/22 10:05, Rongwei Wang wrote:
> 
> 
> On 6/17/22 5:40 PM, Vlastimil Babka wrote:
>> On 6/8/22 14:23, Christoph Lameter wrote:
>>> On Wed, 8 Jun 2022, Rongwei Wang wrote:
>>>
>>>> If available, I think document the issue and warn this incorrect
>>>> behavior is
>>>> OK. But it still prints a large amount of confusing messages, and
>>>> disturbs us?
>>>
>>> Correct it would be great if you could fix this in a way that does not
>>> impact performance.
>>>
>>>>> are current operations on the slab being validated.
>>>> And I am trying to fix it in following way. In a short, these changes only
>>>> works under the slub debug mode, and not affects the normal mode (I'm not
>>>> sure). It looks not elegant enough. And if all approve of this way, I can
>>>> submit the next version.
>>>
>>>
>>>>
>>>> Anyway, thanks for your time:).
>>>> -wrw
>>>>
>>>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
>>> struct
>>>> slab *slab,
>>>>
>>>>   {
>>>>          void *prior;
>>>> -       int was_frozen;
>>>> +       int was_frozen, to_take_off = 0;
>>>>          struct slab new;
>>>
>>> to_take_off has the role of !n ? Why is that needed?
>>>
>>>> -       do {
>>>> -               if (unlikely(n)) {
>>>> +               spin_lock_irqsave(&n->list_lock, flags);
>>>> +               ret = free_debug_processing(s, slab, head, tail, cnt,
>>>> addr);
>>>
>>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>>> ok. But it still adds a number of new branches etc to the free loop.
>>
> Hi, Vlastimil, sorry for missing your message long time.

Hi, no problem.

>> It also further complicates the already tricky code. I wonder if we should
>> make more benefit from the fact that for kmem_cache_debug() caches we don't
>> leave any slabs on percpu or percpu partial lists, and also in
>> free_debug_processing() we aready take both list_lock and slab_lock. If we
>> just did the freeing immediately there under those locks, we would be
>> protected against other freeing cpus by that list_lock and don't need the
>> double cmpxchg tricks.
> enen, I'm not sure get your "don't need the double cmpxchg tricks" means
> completely. What you want to say is that replace cmpxchg_double_slab() here
> with following code when kmem_cache_debug(s)?
> 
> __slab_lock(slab);
> if (slab->freelist == freelist_old && slab->counters == counters_old){
>     slab->freelist = freelist_new;
>     slab->counters = counters_new;
>     __slab_unlock(slab);
>     local_irq_restore(flags);
>     return true;
> }
> __slab_unlock(slab);

Pretty much, but it's more complicated.
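
The debug checks in free_debug_processing() already run with n->list_lock
and slab_lock() held, so the freelist update itself can indeed be done
non-atomically there. But then the list bookkeeping (and possibly
discarding a now-empty slab) also has to happen before dropping the
locks. Very roughly, and untested, the tail of free_debug_processing()
would gain something like this (with a new slab_to_discard local):

	void *prior = slab->freelist;

	/* the actual freeing, still under list_lock + slab_lock */
	slab->inuse -= cnt;
	set_freepointer(s, tail, prior);
	slab->freelist = head;

	/* was the slab full before, or did it become empty now? */
	if (!prior)
		remove_full(s, n, slab);
	else if (slab->inuse == 0)
		remove_partial(n, slab);

	if (slab->inuse == 0)
		/* remember it, discard_slab() only after unlocking */
		slab_to_discard = slab;
	else if (!prior)
		add_partial(n, slab, DEACTIVATE_TO_TAIL);

None of this affects the fast paths, as it's all in the debug-only slow
path.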

> If I make mistakes for your words, please let me know.
> Thanks!
>>
>> What about against allocating cpus? More tricky as those will currently end
>> up privatizing the freelist via get_partial(), only to deactivate it again,
>> so our list_lock+slab_lock in freeing path would not protect in the
>> meanwhile. But the allocation is currently very inefficient for debug
>> caches, as in get_partial() it will take the list_lock to take the slab from
>> partial list and then in most cases again in deactivate_slab() to return it.
> It seems that I need speed some time to eat these words. Anyway, thanks.
>>
>> If instead the allocation path for kmem_cache_debug() cache would take a
>> single object from the partial list (not whole freelist) under list_lock, it
>> would be ultimately more efficient, and protect against freeing using
>> list_lock. Sounds like an idea worth trying to me?
> 
> Hyeonggon had a similar advice that split freeing and allocating slab from
> debugging, likes below:
> 
> 
> __slab_alloc() {
>     if (kmem_cache_debug(s))
>         slab_alloc_debug()
>     else
>         ___slab_alloc()
> }
> 
> I guess that above code aims to solve your mentioned problem (idea)?
> 
> slab_free() {
>     if (kmem_cache_debug(s))
>         slab_free_debug()
>     else
>         __do_slab_free()
> }
> 
> Currently, I only modify the code of freeing slab to fix the confusing
> messages of "slabinfo -v". If you agree, I can try to realize above
> mentioned slab_alloc_debug() code. Maybe it's also a challenge to me.

I already started working on this approach and hope to post a RFC soon.

> Thanks for your time.
> 
>> And of course we would stop creating the 'validate' sysfs files for
>> non-debug caches.
Rongwei Wang July 15, 2022, 10:51 a.m. UTC | #22
On 7/15/22 6:33 PM, Vlastimil Babka wrote:
> On 7/15/22 10:05, Rongwei Wang wrote:
>>
>>
>> On 6/17/22 5:40 PM, Vlastimil Babka wrote:
>>> On 6/8/22 14:23, Christoph Lameter wrote:
>>>> On Wed, 8 Jun 2022, Rongwei Wang wrote:
>>>>
>>>>> If available, I think document the issue and warn this incorrect
>>>>> behavior is
>>>>> OK. But it still prints a large amount of confusing messages, and
>>>>> disturbs us?
>>>>
>>>> Correct it would be great if you could fix this in a way that does not
>>>> impact performance.
>>>>
>>>>>> are current operations on the slab being validated.
>>>>> And I am trying to fix it in following way. In a short, these changes only
>>>>> works under the slub debug mode, and not affects the normal mode (I'm not
>>>>> sure). It looks not elegant enough. And if all approve of this way, I can
>>>>> submit the next version.
>>>>
>>>>
>>>>>
>>>>> Anyway, thanks for your time:).
>>>>> -wrw
>>>>>
>>>>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
>>>> struct
>>>>> slab *slab,
>>>>>
>>>>>    {
>>>>>           void *prior;
>>>>> -       int was_frozen;
>>>>> +       int was_frozen, to_take_off = 0;
>>>>>           struct slab new;
>>>>
>>>> to_take_off has the role of !n ? Why is that needed?
>>>>
>>>>> -       do {
>>>>> -               if (unlikely(n)) {
>>>>> +               spin_lock_irqsave(&n->list_lock, flags);
>>>>> +               ret = free_debug_processing(s, slab, head, tail, cnt,
>>>>> addr);
>>>>
>>>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>>>> ok. But it still adds a number of new branches etc to the free loop.
>>>
>> Hi, Vlastimil, sorry for missing your message long time.
> 
> Hi, no problem.
> 
>>> It also further complicates the already tricky code. I wonder if we should
>>> make more benefit from the fact that for kmem_cache_debug() caches we don't
>>> leave any slabs on percpu or percpu partial lists, and also in
>>> free_debug_processing() we aready take both list_lock and slab_lock. If we
>>> just did the freeing immediately there under those locks, we would be
>>> protected against other freeing cpus by that list_lock and don't need the
>>> double cmpxchg tricks.
>> enen, I'm not sure get your "don't need the double cmpxchg tricks" means
>> completely. What you want to say is that replace cmpxchg_double_slab() here
>> with following code when kmem_cache_debug(s)?
>>
>> __slab_lock(slab);
>> if (slab->freelist == freelist_old && slab->counters == counters_old){
>>      slab->freelist = freelist_new;
>>      slab->counters = counters_new;
>>      __slab_unlock(slab);
>>      local_irq_restore(flags);
>>      return true;
>> }
>> __slab_unlock(slab);
> 
> Pretty much, but it's more complicated.
Yes, actually, I think reusing cmpxchg_double_slab() here is more concise
in code. I have already finished this part of the code, but I am hesitating
over whether to replace cmpxchg_double_slab().
> 
>> If I make mistakes for your words, please let me know.
>> Thanks!
>>>
>>> What about against allocating cpus? More tricky as those will currently end
>>> up privatizing the freelist via get_partial(), only to deactivate it again,
>>> so our list_lock+slab_lock in freeing path would not protect in the
>>> meanwhile. But the allocation is currently very inefficient for debug
>>> caches, as in get_partial() it will take the list_lock to take the slab from
>>> partial list and then in most cases again in deactivate_slab() to return it.
>> It seems that I need speed some time to eat these words. Anyway, thanks.
>>>
>>> If instead the allocation path for kmem_cache_debug() cache would take a
>>> single object from the partial list (not whole freelist) under list_lock, it
>>> would be ultimately more efficient, and protect against freeing using
>>> list_lock. Sounds like an idea worth trying to me?
>>
>> Hyeonggon had a similar advice that split freeing and allocating slab from
>> debugging, likes below:
>>
>>
>> __slab_alloc() {
>>      if (kmem_cache_debug(s))
>>          slab_alloc_debug()
>>      else
>>          ___slab_alloc()
>> }
>>
>> I guess that above code aims to solve your mentioned problem (idea)?
>>
>> slab_free() {
>>      if (kmem_cache_debug(s))
>>          slab_free_debug()
>>      else
>>          __do_slab_free()
>> }
>>
>> Currently, I only modify the code of freeing slab to fix the confusing
>> messages of "slabinfo -v". If you agree, I can try to realize above
>> mentioned slab_alloc_debug() code. Maybe it's also a challenge to me.
> 
> I already started working on this approach and hope to post a RFC soon.
OK, that's great.
> 
>> Thanks for your time.
>>
>>> And of course we would stop creating the 'validate' sysfs files for
>>> non-debug caches.
Vlastimil Babka July 18, 2022, 11:09 a.m. UTC | #23
On 5/29/22 10:15, Rongwei Wang wrote:
> In use cases where allocating and freeing slab frequently, some
> error messages, such as "Left Redzone overwritten", "First byte
> 0xbb instead of 0xcc" would be printed when validating slabs.
> That's because an object has been filled with SLAB_RED_INACTIVE,
> but has not been added to slab's freelist. And between these
> two states, the behaviour of validating slab is likely to occur.
> 
> Actually, it doesn't mean the slab can not work stably. But, these
> confusing messages will disturb slab debugging more or less.
> 
> Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>

As I've said in the sub-thread I had the following kind of fix in mind. I
think it should cover the cases from your patches 1/3 and 3/3.

----8<----
From c35fe2a781a7bc4ef37ef3ded289f4ced82562bd Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 4 Jul 2022 14:09:09 +0200
Subject: [RFC] mm, slub: restrict sysfs validation to debug caches and make
 it safe

Rongwei Wang reports [1] that cache validation triggered by writing to
/sys/kernel/slab/<cache>/validate is racy against normal cache
operations (e.g. freeing) in a way that can cause false positive
inconsistency reports for caches with debugging enabled. The problem is
that debugging actions that mark object free or active and actual
freelist operations are not atomic, and the validation can see an
inconsistent state.

For caches that don't have debugging enabled, other races are possible
that result in false reports of wrong slab counts.

This patch attempts to solve these issues while not adding overhead to
normal (especially fastpath) operations for caches that do not have
debugging enabled, just to make possible userspace-triggered validation
safe. Instead, disable the validation for caches that don't have
debugging enabled and make the sysfs handler return -EINVAL.

For caches that do have debugging enabled, we can instead extend the
existing approach of not using percpu freelists to force all operations
to the slow paths where debugging is checked for and processed.

The processing on free in free_debug_processing() already happens under
n->list_lock and slab_lock() so we can extend it to actually do the
freeing as well and thus make it atomic against concurrent validation.

The processing on alloc in alloc_debug_processing() currently doesn't
take any locks, but we have to first allocate the object from a slab on
the partial list (as percpu slabs are always non-existent) and thus take
n->list_lock. Add a function alloc_single_from_partial() that
additionally takes slab_lock() for the debug processing and then grabs
just the allocated object instead of the whole freelist. This again
makes it atomic against validation and it is also ultimately more
efficient than the current grabbing of freelist immediately followed by
slab deactivation.

Neither of these changes affect the fast paths.

The function free_debug_processing() was moved so that it is placed
later than the definitions of add_partial(), remove_partial() and
discard_slab().

[1] https://lore.kernel.org/all/20220529081535.69275-1-rongwei.wang@linux.alibaba.com/

Reported-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 250 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 174 insertions(+), 76 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index b1281b8654bd..954fe7ad5ee1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1324,17 +1324,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
 }
 
 static noinline int alloc_debug_processing(struct kmem_cache *s,
-					struct slab *slab,
-					void *object, unsigned long addr)
+					struct slab *slab, void *object)
 {
 	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 		if (!alloc_consistency_checks(s, slab, object))
 			goto bad;
 	}
 
-	/* Success perform special debug activities for allocs */
-	if (s->flags & SLAB_STORE_USER)
-		set_track(s, object, TRACK_ALLOC, addr);
+	/* Success. Perform special debug activities for allocs */
 	trace(s, slab, object, 1);
 	init_object(s, object, SLUB_RED_ACTIVE);
 	return 1;
@@ -1385,63 +1382,6 @@ static inline int free_consistency_checks(struct kmem_cache *s,
 	return 1;
 }
 
-/* Supports checking bulk free of a constructed freelist */
-static noinline int free_debug_processing(
-	struct kmem_cache *s, struct slab *slab,
-	void *head, void *tail, int bulk_cnt,
-	unsigned long addr)
-{
-	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
-	void *object = head;
-	int cnt = 0;
-	unsigned long flags, flags2;
-	int ret = 0;
-	depot_stack_handle_t handle = 0;
-
-	if (s->flags & SLAB_STORE_USER)
-		handle = set_track_prepare();
-
-	spin_lock_irqsave(&n->list_lock, flags);
-	slab_lock(slab, &flags2);
-
-	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
-		if (!check_slab(s, slab))
-			goto out;
-	}
-
-next_object:
-	cnt++;
-
-	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
-		if (!free_consistency_checks(s, slab, object, addr))
-			goto out;
-	}
-
-	if (s->flags & SLAB_STORE_USER)
-		set_track_update(s, object, TRACK_FREE, addr, handle);
-	trace(s, slab, object, 0);
-	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
-	init_object(s, object, SLUB_RED_INACTIVE);
-
-	/* Reached end of constructed freelist yet? */
-	if (object != tail) {
-		object = get_freepointer(s, object);
-		goto next_object;
-	}
-	ret = 1;
-
-out:
-	if (cnt != bulk_cnt)
-		slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
-			 bulk_cnt, cnt);
-
-	slab_unlock(slab, &flags2);
-	spin_unlock_irqrestore(&n->list_lock, flags);
-	if (!ret)
-		slab_fix(s, "Object at 0x%p not freed", object);
-	return ret;
-}
-
 /*
  * Parse a block of slub_debug options. Blocks are delimited by ';'
  *
@@ -1661,7 +1601,7 @@ static inline
 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
 
 static inline int alloc_debug_processing(struct kmem_cache *s,
-	struct slab *slab, void *object, unsigned long addr) { return 0; }
+	struct slab *slab, void *object) { return 0; }
 
 static inline int free_debug_processing(
 	struct kmem_cache *s, struct slab *slab,
@@ -2102,6 +2042,42 @@ static inline void remove_partial(struct kmem_cache_node *n,
 	n->nr_partial--;
 }
 
+/*
+ * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
+ * slab from the n->partial list. Removes only a single object from the slab
+ * under slab_lock(), does the alloc_debug_processing() checks and leaves the
+ * slab on the list, or moves it to full list if it was the last object.
+ */
+static void *alloc_single_from_partial(struct kmem_cache *s,
+		struct kmem_cache_node *n, struct slab *slab)
+{
+	void *object;
+	unsigned long flags;
+
+	lockdep_assert_held(&n->list_lock);
+
+	slab_lock(slab, &flags);
+
+	object = slab->freelist;
+	slab->freelist = get_freepointer(s, object);
+	slab->inuse++;
+
+	if (!alloc_debug_processing(s, slab, object)) {
+		remove_partial(n, slab);
+		slab_unlock(slab, &flags);
+		return NULL;
+	}
+
+	if (slab->inuse == slab->objects) {
+		remove_partial(n, slab);
+		add_full(s, n, slab);
+	}
+
+	slab_unlock(slab, &flags);
+
+	return object;
+}
+
 /*
  * Remove slab from the partial list, freeze it and
  * return the pointer to the freelist.
@@ -2182,6 +2158,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 		if (!pfmemalloc_match(slab, gfpflags))
 			continue;
 
+		if (kmem_cache_debug(s)) {
+			object = alloc_single_from_partial(s, n, slab);
+			if (object)
+				break;
+			continue;
+		}
+
 		t = acquire_slab(s, n, slab, object == NULL);
 		if (!t)
 			break;
@@ -2788,6 +2771,104 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
 {
 	return atomic_long_read(&n->total_objects);
 }
+
+/* Supports checking bulk free of a constructed freelist */
+static noinline int free_debug_processing(
+	struct kmem_cache *s, struct slab *slab,
+	void *head, void *tail, int bulk_cnt,
+	unsigned long addr)
+{
+	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+	struct slab *slab_to_discard = NULL;
+	void *object = head;
+	int cnt = 0;
+	unsigned long flags, flags2;
+	int ret = 0;
+	depot_stack_handle_t handle = 0;
+
+	if (s->flags & SLAB_STORE_USER)
+		handle = set_track_prepare();
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	slab_lock(slab, &flags2);
+
+	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+		if (!check_slab(s, slab))
+			goto out;
+	}
+
+	if (slab->inuse < bulk_cnt) {
+		slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
+			 slab->inuse, bulk_cnt);
+		goto out;
+	}
+
+next_object:
+
+	if (++cnt > bulk_cnt)
+		goto out_cnt;
+
+	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+		if (!free_consistency_checks(s, slab, object, addr))
+			goto out;
+	}
+
+	if (s->flags & SLAB_STORE_USER)
+		set_track_update(s, object, TRACK_FREE, addr, handle);
+	trace(s, slab, object, 0);
+	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
+	init_object(s, object, SLUB_RED_INACTIVE);
+
+	/* Reached end of constructed freelist yet? */
+	if (object != tail) {
+		object = get_freepointer(s, object);
+		goto next_object;
+	}
+	ret = 1;
+
+out_cnt:
+	if (cnt != bulk_cnt)
+		slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
+			 bulk_cnt, cnt);
+
+out:
+	if (ret) {
+		void *prior = slab->freelist;
+
+		/* Perform the actual freeing while we still hold the locks */
+		slab->inuse -= cnt;
+		set_freepointer(s, tail, prior);
+		slab->freelist = head;
+
+		/* Do we need to remove the slab from full or partial list? */
+		if (!prior) {
+			remove_full(s, n, slab);
+		} else if (slab->inuse == 0) {
+			remove_partial(n, slab);
+			stat(s, FREE_REMOVE_PARTIAL);
+		}
+
+		/* Do we need to discard the slab or add to partial list? */
+		if (slab->inuse == 0) {
+			slab_to_discard = slab;
+		} else if (!prior) {
+			add_partial(n, slab, DEACTIVATE_TO_TAIL);
+			stat(s, FREE_ADD_PARTIAL);
+		}
+
+	}
+
+	slab_unlock(slab, &flags2);
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	if (!ret)
+		slab_fix(s, "Object at 0x%p not freed", object);
+	if (slab_to_discard) {
+		stat(s, FREE_SLAB);
+		discard_slab(s, slab);
+	}
+
+	return ret;
+}
 #endif /* CONFIG_SLUB_DEBUG */
 
 #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
@@ -3045,19 +3126,35 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 	stat(s, ALLOC_SLAB);
 
-check_new_slab:
-
 	if (kmem_cache_debug(s)) {
-		if (!alloc_debug_processing(s, slab, freelist, addr)) {
-			/* Slab failed checks. Next slab needed */
-			goto new_slab;
-		} else {
+		if (!alloc_debug_processing(s, slab, freelist))
 			/*
-			 * For debug case, we don't load freelist so that all
-			 * allocations go through alloc_debug_processing()
+			 * It's not really expected that this would fail on a
+			 * freshly allocated slab, but a concurrent memory
+			 * corruption in theory could cause that.
 			 */
-			goto return_single;
-		}
+			goto new_slab;
+
+		/*
+		 * For debug caches we don't load the freelist so that all
+		 * allocations have to go through alloc_debug_processing()
+		 */
+		if (s->flags & SLAB_STORE_USER)
+			set_track(s, freelist, TRACK_ALLOC, addr);
+		goto return_single;
+	}
+
+check_new_slab:
+
+	if (kmem_cache_debug(s)) {
+		/*
+		 * For debug caches here we had to go through
+		 * alloc_single_from_partial() so just store the tracking info
+		 * and return the object
+		 */
+		if (s->flags & SLAB_STORE_USER)
+			set_track(s, freelist, TRACK_ALLOC, addr);
+		return freelist;
 	}
 
 	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
@@ -3341,9 +3438,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
 	if (kfence_free(head))
 		return;
 
-	if (kmem_cache_debug(s) &&
-	    !free_debug_processing(s, slab, head, tail, cnt, addr))
+	if (kmem_cache_debug(s)) {
+		free_debug_processing(s, slab, head, tail, cnt, addr);
 		return;
+	}
 
 	do {
 		if (unlikely(n)) {
@@ -5625,7 +5723,7 @@ static ssize_t validate_store(struct kmem_cache *s,
 {
 	int ret = -EINVAL;
 
-	if (buf[0] == '1') {
+	if (buf[0] == '1' && kmem_cache_debug(s)) {
 		ret = validate_slab_cache(s);
 		if (ret >= 0)
 			ret = length;
Rongwei Wang July 19, 2022, 2:15 p.m. UTC | #24
On 7/18/22 7:09 PM, Vlastimil Babka wrote:
> 
> On 5/29/22 10:15, Rongwei Wang wrote:
>> In use cases where allocating and freeing slab frequently, some
>> error messages, such as "Left Redzone overwritten", "First byte
>> 0xbb instead of 0xcc" would be printed when validating slabs.
>> That's because an object has been filled with SLAB_RED_INACTIVE,
>> but has not been added to slab's freelist. And between these
>> two states, the behaviour of validating slab is likely to occur.
>>
>> Actually, it doesn't mean the slab can not work stably. But, these
>> confusing messages will disturb slab debugging more or less.
>>
>> Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
> 
> As I've said in the sub-thread I had the following kind of fix in mind. I
> think it should cover the cases from your patches 1/3 and 3/3.
> 
> ----8<----
>  From c35fe2a781a7bc4ef37ef3ded289f4ced82562bd Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Mon, 4 Jul 2022 14:09:09 +0200
> Subject: [RFC] mm, slub: restrict sysfs validation to debug caches and make
>   it safe
> 
> Rongwei Wang reports [1] that cache validation triggered by writing to
> /sys/kernel/slab/<cache>/validate is racy against normal cache
> operations (e.g. freeing) in a way that can cause false positive
> inconsistency reports for caches with debugging enabled. The problem is
> that debugging actions that mark object free or active and actual
> freelist operations are not atomic, and the validation can see an
> inconsistent state.
> 
> For caches that don't have debugging enabled, other races are possible
> that result in false reports of wrong slab counts.
> 
> This patch attempts to solve these issues while not adding overhead to
> normal (especially fastpath) operations for caches that do not have
> debugging enabled, just to make possible userspace-triggered validation
> safe. Instead, disable the validation for caches that don't have
> debugging enabled and make the sysfs handler return -EINVAL.
> 
> For caches that do have debugging enabled, we can instead extend the
> existing approach of not using percpu freelists to force all operations
> to the slow paths where debugging is checked for and processed.
> 
> The processing on free in free_debug_processing() already happens under
> n->list_lock and slab_lock() so we can extend it to actually do the
> freeing as well and thus make it atomic against concurrent validation.
> 
> The processing on alloc in alloc_debug_processing() currently doesn't
> take any locks, but we have to first allocate the object from a slab on
> the partial list (as percpu slabs are always non-existent) and thus take
> n->list_lock. Add a function alloc_single_from_partial() that
> additionally takes slab_lock() for the debug processing and then grabs
> just the allocated object instead of the whole freelist. This again
> makes it atomic against validation and it is also ultimately more
> efficient than the current grabbing of freelist immediately followed by
> slab deactivation.
> 
> Neither of these changes affect the fast paths.
> 
> The function free_debug_processing() was moved so that it is placed
> later than the definitions of add_partial(), remove_partial() and
> discard_slab().
> 
> [1] https://lore.kernel.org/all/20220529081535.69275-1-rongwei.wang@linux.alibaba.com/
> 
> Reported-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>   mm/slub.c | 250 +++++++++++++++++++++++++++++++++++++-----------------
>   1 file changed, 174 insertions(+), 76 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index b1281b8654bd..954fe7ad5ee1 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1324,17 +1324,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
>   }
>   
>   static noinline int alloc_debug_processing(struct kmem_cache *s,
> -					struct slab *slab,
> -					void *object, unsigned long addr)
> +					struct slab *slab, void *object)
>   {
>   	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
>   		if (!alloc_consistency_checks(s, slab, object))
>   			goto bad;
>   	}
>   
> -	/* Success perform special debug activities for allocs */
> -	if (s->flags & SLAB_STORE_USER)
> -		set_track(s, object, TRACK_ALLOC, addr);
> +	/* Success. Perform special debug activities for allocs */
>   	trace(s, slab, object, 1);
>   	init_object(s, object, SLUB_RED_ACTIVE);
>   	return 1;
> @@ -1385,63 +1382,6 @@ static inline int free_consistency_checks(struct kmem_cache *s,
>   	return 1;
>   }
>   
> -/* Supports checking bulk free of a constructed freelist */
> -static noinline int free_debug_processing(
> -	struct kmem_cache *s, struct slab *slab,
> -	void *head, void *tail, int bulk_cnt,
> -	unsigned long addr)
> -{
> -	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> -	void *object = head;
> -	int cnt = 0;
> -	unsigned long flags, flags2;
> -	int ret = 0;
> -	depot_stack_handle_t handle = 0;
> -
> -	if (s->flags & SLAB_STORE_USER)
> -		handle = set_track_prepare();
> -
> -	spin_lock_irqsave(&n->list_lock, flags);
> -	slab_lock(slab, &flags2);
> -
> -	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> -		if (!check_slab(s, slab))
> -			goto out;
> -	}
> -
> -next_object:
> -	cnt++;
> -
> -	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> -		if (!free_consistency_checks(s, slab, object, addr))
> -			goto out;
> -	}
> -
> -	if (s->flags & SLAB_STORE_USER)
> -		set_track_update(s, object, TRACK_FREE, addr, handle);
> -	trace(s, slab, object, 0);
> -	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
> -	init_object(s, object, SLUB_RED_INACTIVE);
> -
> -	/* Reached end of constructed freelist yet? */
> -	if (object != tail) {
> -		object = get_freepointer(s, object);
> -		goto next_object;
> -	}
> -	ret = 1;
> -
> -out:
> -	if (cnt != bulk_cnt)
> -		slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
> -			 bulk_cnt, cnt);
> -
> -	slab_unlock(slab, &flags2);
> -	spin_unlock_irqrestore(&n->list_lock, flags);
> -	if (!ret)
> -		slab_fix(s, "Object at 0x%p not freed", object);
> -	return ret;
> -}
> -
>   /*
>    * Parse a block of slub_debug options. Blocks are delimited by ';'
>    *
> @@ -1661,7 +1601,7 @@ static inline
>   void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
>   
>   static inline int alloc_debug_processing(struct kmem_cache *s,
> -	struct slab *slab, void *object, unsigned long addr) { return 0; }
> +	struct slab *slab, void *object) { return 0; }
>   
>   static inline int free_debug_processing(
>   	struct kmem_cache *s, struct slab *slab,
> @@ -2102,6 +2042,42 @@ static inline void remove_partial(struct kmem_cache_node *n,
>   	n->nr_partial--;
>   }
>   
> +/*
> + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
> + * slab from the n->partial list. Removes only a single object from the slab
> + * under slab_lock(), does the alloc_debug_processing() checks and leaves the
> + * slab on the list, or moves it to full list if it was the last object.
> + */
> +static void *alloc_single_from_partial(struct kmem_cache *s,
> +		struct kmem_cache_node *n, struct slab *slab)
> +{
> +	void *object;
> +	unsigned long flags;
> +
> +	lockdep_assert_held(&n->list_lock);
> +
> +	slab_lock(slab, &flags);
> +
> +	object = slab->freelist;
> +	slab->freelist = get_freepointer(s, object);
> +	slab->inuse++;
> +
> +	if (!alloc_debug_processing(s, slab, object)) {
> +		remove_partial(n, slab);
> +		slab_unlock(slab, &flags);
> +		return NULL;
> +	}
> +
> +	if (slab->inuse == slab->objects) {
> +		remove_partial(n, slab);
> +		add_full(s, n, slab);
> +	}
> +
> +	slab_unlock(slab, &flags);
> +
> +	return object;
> +}
> +
>   /*
>    * Remove slab from the partial list, freeze it and
>    * return the pointer to the freelist.
> @@ -2182,6 +2158,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
>   		if (!pfmemalloc_match(slab, gfpflags))
>   			continue;
>   
> +		if (kmem_cache_debug(s)) {
> +			object = alloc_single_from_partial(s, n, slab);
> +			if (object)
> +				break;
> +			continue;
> +		}
> +
>   		t = acquire_slab(s, n, slab, object == NULL);
>   		if (!t)
>   			break;
> @@ -2788,6 +2771,104 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
>   {
>   	return atomic_long_read(&n->total_objects);
>   }
> +
> +/* Supports checking bulk free of a constructed freelist */
> +static noinline int free_debug_processing(
> +	struct kmem_cache *s, struct slab *slab,
> +	void *head, void *tail, int bulk_cnt,
> +	unsigned long addr)
> +{
> +	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> +	struct slab *slab_to_discard = NULL;
> +	void *object = head;
> +	int cnt = 0;
> +	unsigned long flags, flags2;
> +	int ret = 0;
> +	depot_stack_handle_t handle = 0;
> +
> +	if (s->flags & SLAB_STORE_USER)
> +		handle = set_track_prepare();
> +
> +	spin_lock_irqsave(&n->list_lock, flags);
> +	slab_lock(slab, &flags2);
> +
> +	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> +		if (!check_slab(s, slab))
> +			goto out;
> +	}
> +
> +	if (slab->inuse < bulk_cnt) {
> +		slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
> +			 slab->inuse, bulk_cnt);
> +		goto out;
> +	}
> +
> +next_object:
> +
> +	if (++cnt > bulk_cnt)
> +		goto out_cnt;
> +
> +	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> +		if (!free_consistency_checks(s, slab, object, addr))
> +			goto out;
> +	}
> +
> +	if (s->flags & SLAB_STORE_USER)
> +		set_track_update(s, object, TRACK_FREE, addr, handle);
> +	trace(s, slab, object, 0);
> +	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
> +	init_object(s, object, SLUB_RED_INACTIVE);
> +
> +	/* Reached end of constructed freelist yet? */
> +	if (object != tail) {
> +		object = get_freepointer(s, object);
> +		goto next_object;
> +	}
> +	ret = 1;
> +
> +out_cnt:
> +	if (cnt != bulk_cnt)
> +		slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
> +			 bulk_cnt, cnt);
> +
> +out:
> +	if (ret) {
> +		void *prior = slab->freelist;
> +
> +		/* Perform the actual freeing while we still hold the locks */
> +		slab->inuse -= cnt;
> +		set_freepointer(s, tail, prior);
> +		slab->freelist = head;
> +
> +		/* Do we need to remove the slab from full or partial list? */
> +		if (!prior) {
> +			remove_full(s, n, slab);
> +		} else if (slab->inuse == 0) {
> +			remove_partial(n, slab);
> +			stat(s, FREE_REMOVE_PARTIAL);
> +		}
> +
> +		/* Do we need to discard the slab or add to partial list? */
> +		if (slab->inuse == 0) {
> +			slab_to_discard = slab;
> +		} else if (!prior) {
> +			add_partial(n, slab, DEACTIVATE_TO_TAIL);
> +			stat(s, FREE_ADD_PARTIAL);
> +		}
> +
> +	}
> +
> +	slab_unlock(slab, &flags2);
> +	spin_unlock_irqrestore(&n->list_lock, flags);
> +	if (!ret)
> +		slab_fix(s, "Object at 0x%p not freed", object);
> +	if (slab_to_discard) {
> +		stat(s, FREE_SLAB);
> +		discard_slab(s, slab);
> +	}
> +
> +	return ret;
> +}
I have tested this patch, and it indeed deals with the bug that I described.
Though I have also prepared this part of the code myself, your code is OK
with me.
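
For reference, the race can be exercised by repeatedly triggering
validation while something keeps allocating and freeing from a
slub_debug-enabled cache; a trivial, untested helper like the one below
(the cache name is only an example) is enough to hit the confusing
messages without the fix:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int i, fd;

	/* pick a cache that actually has slub_debug enabled */
	fd = open("/sys/kernel/slab/kmalloc-64/validate", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* keep writing '1' to trigger validate_slab_cache() repeatedly */
	for (i = 0; i < 100000; i++) {
		if (write(fd, "1", 1) < 0)
			perror("write");
	}

	close(fd);
	return 0;
}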

-wrw

>   #endif /* CONFIG_SLUB_DEBUG */
>   
>   #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
> @@ -3045,19 +3126,35 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>   
>   	stat(s, ALLOC_SLAB);
>   
> -check_new_slab:
> -
>   	if (kmem_cache_debug(s)) {
> -		if (!alloc_debug_processing(s, slab, freelist, addr)) {
> -			/* Slab failed checks. Next slab needed */
> -			goto new_slab;
> -		} else {
> +		if (!alloc_debug_processing(s, slab, freelist))
>   			/*
> -			 * For debug case, we don't load freelist so that all
> -			 * allocations go through alloc_debug_processing()
> +			 * It's not really expected that this would fail on a
> +			 * freshly allocated slab, but a concurrent memory
> +			 * corruption in theory could cause that.
>   			 */
> -			goto return_single;
> -		}
> +			goto new_slab;
> +
> +		/*
> +		 * For debug caches we don't load the freelist so that all
> +		 * allocations have to go through alloc_debug_processing()
> +		 */
> +		if (s->flags & SLAB_STORE_USER)
> +			set_track(s, freelist, TRACK_ALLOC, addr);
> +		goto return_single;
> +	}
> +
> +check_new_slab:
> +
> +	if (kmem_cache_debug(s)) {
> +		/*
> +		 * For debug caches here we had to go through
> +		 * alloc_single_from_partial() so just store the tracking info
> +		 * and return the object
> +		 */
> +		if (s->flags & SLAB_STORE_USER)
> +			set_track(s, freelist, TRACK_ALLOC, addr);
> +		return freelist;
>   	}
>   
>   	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
> @@ -3341,9 +3438,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>   	if (kfence_free(head))
>   		return;
>   
> -	if (kmem_cache_debug(s) &&
> -	    !free_debug_processing(s, slab, head, tail, cnt, addr))
> +	if (kmem_cache_debug(s)) {
> +		free_debug_processing(s, slab, head, tail, cnt, addr);
>   		return;
> +	}
>   
>   	do {
>   		if (unlikely(n)) {
> @@ -5625,7 +5723,7 @@ static ssize_t validate_store(struct kmem_cache *s,
>   {
>   	int ret = -EINVAL;
>   
> -	if (buf[0] == '1') {
> +	if (buf[0] == '1' && kmem_cache_debug(s)) {
>   		ret = validate_slab_cache(s);
>   		if (ret >= 0)
>   			ret = length;
Vlastimil Babka July 19, 2022, 2:21 p.m. UTC | #25
On 7/19/22 16:15, Rongwei Wang wrote:
> 
...
>> +
>> +    slab_unlock(slab, &flags2);
>> +    spin_unlock_irqrestore(&n->list_lock, flags);
>> +    if (!ret)
>> +        slab_fix(s, "Object at 0x%p not freed", object);
>> +    if (slab_to_discard) {
>> +        stat(s, FREE_SLAB);
>> +        discard_slab(s, slab);
>> +    }
>> +
>> +    return ret;
>> +}
> I had test this patch, and it indeed deal with the bug that I described.

Thanks.

> Though I am also has prepared this part of code, your code is ok to me.

Aha, feel free to post your version, maybe it's simpler? We can compare.
Rongwei Wang July 19, 2022, 2:43 p.m. UTC | #26
On 7/19/22 10:21 PM, Vlastimil Babka wrote:
> On 7/19/22 16:15, Rongwei Wang wrote:
>>
> ...
>>> +
>>> +    slab_unlock(slab, &flags2);
>>> +    spin_unlock_irqrestore(&n->list_lock, flags);
>>> +    if (!ret)
>>> +        slab_fix(s, "Object at 0x%p not freed", object);
>>> +    if (slab_to_discard) {
>>> +        stat(s, FREE_SLAB);
>>> +        discard_slab(s, slab);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>> I had test this patch, and it indeed deal with the bug that I described.
> 
> Thanks.
> 
>> Though I am also has prepared this part of code, your code is ok to me.
> 
> Aha, feel free to post your version, maybe it's simpler? We can compare.
My code only covers the part corresponding to your free_debug_processing();
its structure looks like:

slab_free() {
     if (kmem_cache_debug(s))
         slab_free_debug();
     else
         __do_slab_free();
}

The __slab_free_debug() here is similar to your free_debug_processing().

+/*
+ * Slow path handling for debugging.
+ */
+static void __slab_free_debug(struct kmem_cache *s, struct slab *slab,
+                       void *head, void *tail, int cnt,
+                       unsigned long addr)
+
+{
+       void *prior;
+       int was_frozen;
+       struct slab new;
+       unsigned long counters;
+       struct kmem_cache_node *n = NULL;
+       unsigned long flags;
+       int ret;
+
+       stat(s, FREE_SLOWPATH);
+
+       if (kfence_free(head))
+               return;
+
+       n = get_node(s, slab_nid(slab));
+
+       spin_lock_irqsave(&n->list_lock, flags);
+       ret = free_debug_processing(s, slab, head, tail, cnt, addr);
+       if (!ret) {
+               spin_unlock_irqrestore(&n->list_lock, flags);
+               return;
+       }
+
+       do {
+               prior = slab->freelist;
+               counters = slab->counters;
+               set_freepointer(s, tail, prior);
+               new.counters = counters;
+               was_frozen = new.frozen;
+               new.inuse -= cnt;
+       } while (!cmpxchg_double_slab(s, slab,
+               prior, counters,
+               head, new.counters,
+               "__slab_free"));
+
+       if ((new.inuse && prior) || was_frozen) {
+               spin_unlock_irqrestore(&n->list_lock, flags);
+               if (likely(was_frozen)) {
+                       stat(s, FREE_FROZEN);
+               }
+
+               return;
+       }
+
+       if (!new.inuse && n->nr_partial >= s->min_partial) {
+               /* Indicate no user in this slab, discarding it naturally. */
+               if (prior) {
+                       /* Slab on the partial list. */
+                       remove_partial(n, slab);
+                       stat(s, FREE_REMOVE_PARTIAL);
+               } else {
+                       /* Slab must be on the full list */
+                       remove_full(s, n, slab);
+               }
+
+               spin_unlock_irqrestore(&n->list_lock, flags);
+               stat(s, FREE_SLAB);
+               discard_slab(s, slab);
+               return;
+       }
+
+       /*
+        * Objects left in the slab. If it was not on the partial list before
+        * then add it.
+        */
+       if (!prior) {
+               remove_full(s, n, slab);
+               add_partial(n, slab, DEACTIVATE_TO_TAIL);
+               stat(s, FREE_ADD_PARTIAL);
+       }
+       spin_unlock_irqrestore(&n->list_lock, flags);
+
+       return;
+}
diff mbox series

Patch

diff --git a/mm/slub.c b/mm/slub.c
index ed5c2c03a47a..310e56d99116 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1374,15 +1374,12 @@  static noinline int free_debug_processing(
 	void *head, void *tail, int bulk_cnt,
 	unsigned long addr)
 {
-	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
 	void *object = head;
 	int cnt = 0;
-	unsigned long flags, flags2;
+	unsigned long flags;
 	int ret = 0;
 
-	spin_lock_irqsave(&n->list_lock, flags);
-	slab_lock(slab, &flags2);
-
+	slab_lock(slab, &flags);
 	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 		if (!check_slab(s, slab))
 			goto out;
@@ -1414,8 +1411,7 @@  static noinline int free_debug_processing(
 		slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
 			 bulk_cnt, cnt);
 
-	slab_unlock(slab, &flags2);
-	spin_unlock_irqrestore(&n->list_lock, flags);
+	slab_unlock(slab, &flags);
 	if (!ret)
 		slab_fix(s, "Object at 0x%p not freed", object);
 	return ret;
@@ -3304,7 +3300,7 @@  static void __slab_free(struct kmem_cache *s, struct slab *slab,
 
 {
 	void *prior;
-	int was_frozen;
+	int was_frozen, to_take_off = 0;
 	struct slab new;
 	unsigned long counters;
 	struct kmem_cache_node *n = NULL;
@@ -3315,15 +3311,19 @@  static void __slab_free(struct kmem_cache *s, struct slab *slab,
 	if (kfence_free(head))
 		return;
 
+	n = get_node(s, slab_nid(slab));
+	spin_lock_irqsave(&n->list_lock, flags);
+
 	if (kmem_cache_debug(s) &&
-	    !free_debug_processing(s, slab, head, tail, cnt, addr))
+	    !free_debug_processing(s, slab, head, tail, cnt, addr)) {
+
+		spin_unlock_irqrestore(&n->list_lock, flags);
 		return;
+	}
 
 	do {
-		if (unlikely(n)) {
-			spin_unlock_irqrestore(&n->list_lock, flags);
-			n = NULL;
-		}
+		if (unlikely(to_take_off))
+			to_take_off = 0;
 		prior = slab->freelist;
 		counters = slab->counters;
 		set_freepointer(s, tail, prior);
@@ -3343,18 +3343,11 @@  static void __slab_free(struct kmem_cache *s, struct slab *slab,
 				new.frozen = 1;
 
 			} else { /* Needs to be taken off a list */
-
-				n = get_node(s, slab_nid(slab));
 				/*
-				 * Speculatively acquire the list_lock.
 				 * If the cmpxchg does not succeed then we may
-				 * drop the list_lock without any processing.
-				 *
-				 * Otherwise the list_lock will synchronize with
-				 * other processors updating the list of slabs.
+				 * drop this behavior without any processing.
 				 */
-				spin_lock_irqsave(&n->list_lock, flags);
-
+				to_take_off = 1;
 			}
 		}
 
@@ -3363,8 +3356,9 @@  static void __slab_free(struct kmem_cache *s, struct slab *slab,
 		head, new.counters,
 		"__slab_free"));
 
-	if (likely(!n)) {
+	if (likely(!to_take_off)) {
 
+		spin_unlock_irqrestore(&n->list_lock, flags);
 		if (likely(was_frozen)) {
 			/*
 			 * The list lock was not taken therefore no list