Message ID | 20220529081535.69275-1-rongwei.wang@linux.alibaba.com (mailing list archive)
---|---
State | New
Series | [1/3] mm/slub: fix the race between validate_slab and slab_free
On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote: > In use cases where allocating and freeing slab frequently, some > error messages, such as "Left Redzone overwritten", "First byte > 0xbb instead of 0xcc" would be printed when validating slabs. > That's because an object has been filled with SLAB_RED_INACTIVE, > but has not been added to slab's freelist. And between these > two states, the behaviour of validating slab is likely to occur. > > Actually, it doesn't mean the slab can not work stably. But, these > confusing messages will disturb slab debugging more or less. > > Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com> Have you observed it or it's from code inspection? > --- > mm/slub.c | 40 +++++++++++++++++----------------------- > 1 file changed, 17 insertions(+), 23 deletions(-) > > diff --git a/mm/slub.c b/mm/slub.c > index ed5c2c03a47a..310e56d99116 100644 > --- a/mm/slub.c > +++ b/mm/slub.c > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing( > void *head, void *tail, int bulk_cnt, > unsigned long addr) > { > - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); > void *object = head; > int cnt = 0; > - unsigned long flags, flags2; > + unsigned long flags; > int ret = 0; > > - spin_lock_irqsave(&n->list_lock, flags); > - slab_lock(slab, &flags2); > - > + slab_lock(slab, &flags); > if (s->flags & SLAB_CONSISTENCY_CHECKS) { > if (!check_slab(s, slab)) > goto out; > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing( > slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", > bulk_cnt, cnt); > > - slab_unlock(slab, &flags2); > - spin_unlock_irqrestore(&n->list_lock, flags); > + slab_unlock(slab, &flags); > if (!ret) > slab_fix(s, "Object at 0x%p not freed", object); > return ret; > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > > { > void *prior; > - int was_frozen; > + int was_frozen, to_take_off = 0; > struct slab new; > unsigned long counters; > struct kmem_cache_node *n = NULL; > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > if (kfence_free(head)) > return; > > + n = get_node(s, slab_nid(slab)); > + spin_lock_irqsave(&n->list_lock, flags); > + Oh please don't do this. SLUB free slowpath can be hit a lot depending on workload. __slab_free() try its best not to take n->list_lock. currently takes n->list_lock only when the slab need to be taken from list. Unconditionally taking n->list_lock will degrade performance. > if (kmem_cache_debug(s) && > - !free_debug_processing(s, slab, head, tail, cnt, addr)) > + !free_debug_processing(s, slab, head, tail, cnt, addr)) { > + > + spin_unlock_irqrestore(&n->list_lock, flags); > return; > + } > > do { > - if (unlikely(n)) { > - spin_unlock_irqrestore(&n->list_lock, flags); > - n = NULL; > - } > + if (unlikely(to_take_off)) > + to_take_off = 0; > prior = slab->freelist; > counters = slab->counters; > set_freepointer(s, tail, prior); > @@ -3343,18 +3343,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > new.frozen = 1; > > } else { /* Needs to be taken off a list */ > - > - n = get_node(s, slab_nid(slab)); > /* > - * Speculatively acquire the list_lock. > * If the cmpxchg does not succeed then we may > - * drop the list_lock without any processing. > - * > - * Otherwise the list_lock will synchronize with > - * other processors updating the list of slabs. > + * drop this behavior without any processing. 
> */ > - spin_lock_irqsave(&n->list_lock, flags); > - > + to_take_off = 1; > } > } > > @@ -3363,8 +3356,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > head, new.counters, > "__slab_free")); > > - if (likely(!n)) { > + if (likely(!to_take_off)) { > > + spin_unlock_irqrestore(&n->list_lock, flags); > if (likely(was_frozen)) { > /* > * The list lock was not taken therefore no list > > -- > 2.27.0 >
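A note on the window being described here: free_debug_processing() poisons the object (the SLUB_RED_INACTIVE pattern) before the object is actually linked back into the slab's freelist, and a concurrent validate_slab() can run in between. The user-space sketch below models that two-step update with two flags and two threads. It is only an analogy (the names and structure are invented for illustration, and none of it is kernel code), but the nonzero count it usually prints on a multicore machine is the analogue of the spurious "Left Redzone overwritten" reports: the validator observes an intermediate state, not real corruption. Build with gcc -pthread.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * "marked_inactive" stands for the poison/redzone state written by the
 * debug code; "on_freelist" stands for the object actually being linked
 * into the slab freelist.  In SLUB these two updates are not atomic with
 * respect to a concurrent validate_slab().
 */
static atomic_bool marked_inactive;
static atomic_bool on_freelist;

static void *free_and_alloc_loop(void *arg)
{
	(void)arg;
	for (;;) {
		atomic_store(&marked_inactive, true);	/* debug poisoning ... */
		atomic_store(&on_freelist, true);	/* ... freelist update later */

		atomic_store(&on_freelist, false);	/* "allocate" it again */
		atomic_store(&marked_inactive, false);
	}
	return NULL;
}

static void *validate_loop(void *arg)
{
	unsigned long false_positives = 0;

	(void)arg;
	for (long i = 0; i < 50000000L; i++) {
		/*
		 * A validator that insists "inactive implies on the freelist"
		 * trips over the window between the two stores above.
		 */
		if (atomic_load(&marked_inactive) && !atomic_load(&on_freelist))
			false_positives++;
	}
	printf("spurious inconsistencies observed: %lu\n", false_positives);
	return NULL;
}

int main(void)
{
	pthread_t freer, validator;

	pthread_create(&freer, NULL, free_and_alloc_loop, NULL);
	pthread_create(&validator, NULL, validate_loop, NULL);
	pthread_join(validator, NULL);	/* freer is torn down on process exit */
	return 0;
}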
On Sun, 29 May 2022, Hyeonggon Yoo wrote: > > diff --git a/mm/slub.c b/mm/slub.c > > index ed5c2c03a47a..310e56d99116 100644 > > --- a/mm/slub.c > > +++ b/mm/slub.c > > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing( > > void *head, void *tail, int bulk_cnt, > > unsigned long addr) > > { > > - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); > > void *object = head; > > int cnt = 0; > > - unsigned long flags, flags2; > > + unsigned long flags; > > int ret = 0; > > > > - spin_lock_irqsave(&n->list_lock, flags); > > - slab_lock(slab, &flags2); > > - > > + slab_lock(slab, &flags); > > if (s->flags & SLAB_CONSISTENCY_CHECKS) { > > if (!check_slab(s, slab)) > > goto out; > > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing( > > slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", > > bulk_cnt, cnt); > > > > - slab_unlock(slab, &flags2); > > - spin_unlock_irqrestore(&n->list_lock, flags); > > + slab_unlock(slab, &flags); > > if (!ret) > > slab_fix(s, "Object at 0x%p not freed", object); > > return ret; > > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > > > > { > > void *prior; > > - int was_frozen; > > + int was_frozen, to_take_off = 0; > > struct slab new; > > unsigned long counters; > > struct kmem_cache_node *n = NULL; > > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > > if (kfence_free(head)) > > return; > > > > + n = get_node(s, slab_nid(slab)); > > + spin_lock_irqsave(&n->list_lock, flags); > > + > > Oh please don't do this. > > SLUB free slowpath can be hit a lot depending on workload. > > __slab_free() try its best not to take n->list_lock. currently takes n->list_lock > only when the slab need to be taken from list. > > Unconditionally taking n->list_lock will degrade performance. > This is a good point, it would be useful to gather some benchmarks for workloads that are known to thrash some caches and would hit this path such as netperf TCP_RR.
On Sun, May 29, 2022 at 11:37:06AM +0000, Hyeonggon Yoo wrote: > On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote: > > In use cases where allocating and freeing slab frequently, some > > error messages, such as "Left Redzone overwritten", "First byte > > 0xbb instead of 0xcc" would be printed when validating slabs. > > That's because an object has been filled with SLAB_RED_INACTIVE, > > but has not been added to slab's freelist. And between these > > two states, the behaviour of validating slab is likely to occur. > > > > Actually, it doesn't mean the slab can not work stably. But, these > > confusing messages will disturb slab debugging more or less. > > > > Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com> > > Have you observed it or it's from code inspection? > > > --- > > mm/slub.c | 40 +++++++++++++++++----------------------- > > 1 file changed, 17 insertions(+), 23 deletions(-) > > > > diff --git a/mm/slub.c b/mm/slub.c > > index ed5c2c03a47a..310e56d99116 100644 > > --- a/mm/slub.c > > +++ b/mm/slub.c > > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing( > > void *head, void *tail, int bulk_cnt, > > unsigned long addr) > > { > > - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); > > void *object = head; > > int cnt = 0; > > - unsigned long flags, flags2; > > + unsigned long flags; > > int ret = 0; > > > > - spin_lock_irqsave(&n->list_lock, flags); > > - slab_lock(slab, &flags2); > > - > > + slab_lock(slab, &flags); > > if (s->flags & SLAB_CONSISTENCY_CHECKS) { > > if (!check_slab(s, slab)) > > goto out; > > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing( > > slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", > > bulk_cnt, cnt); > > > > - slab_unlock(slab, &flags2); > > - spin_unlock_irqrestore(&n->list_lock, flags); > > + slab_unlock(slab, &flags); > > if (!ret) > > slab_fix(s, "Object at 0x%p not freed", object); > > return ret; > > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > > > > { > > void *prior; > > - int was_frozen; > > + int was_frozen, to_take_off = 0; > > struct slab new; > > unsigned long counters; > > struct kmem_cache_node *n = NULL; > > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > > if (kfence_free(head)) > > return; > > > > + n = get_node(s, slab_nid(slab)); > > + spin_lock_irqsave(&n->list_lock, flags); > > + > > Oh please don't do this. > > SLUB free slowpath can be hit a lot depending on workload. > > __slab_free() try its best not to take n->list_lock. currently takes n->list_lock > only when the slab need to be taken from list. > > Unconditionally taking n->list_lock will degrade performance. > I can confirm you are right. We have encountered this issue in practise. We have deployed somen HDFS instance on a 256-CPU machine. When there are lots of IO requests from users, we can see lots of threads are contended on n->list_lock. 
Lots of call traces look like the following:

ffffffff810dfbb4 native_queued_spin_lock_slowpath+0x1a4
ffffffff81780ffb _raw_spin_lock+0x1b
ffffffff8127327e get_partial_node.isra.81+0x5e
ffffffff812752d3 ___slab_alloc+0x2f3
ffffffff8127559c __slab_alloc+0x1c
ffffffff81275828 kmem_cache_alloc+0x278
ffffffff812e9e3d alloc_buffer_head+0x1d
ffffffff812e9f74 alloc_page_buffers+0xa4
ffffffff812eb0e9 create_empty_buffers+0x19
ffffffff812eb37d create_page_buffers+0x7d
ffffffff812ed78d __block_write_begin_int+0x9d

I thought it was because many threads consume their local CPU slab caches quickly and then all of them try to get a new slab from the node partial list. Since there are 256 CPUs, the contention is much heavier and easy to observe. Any thoughts on this issue (e.g. how to ease the contention)? Comments are welcome. Thanks.
On 5/29/22 7:37 PM, Hyeonggon Yoo wrote: > On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote: >> In use cases where allocating and freeing slab frequently, some >> error messages, such as "Left Redzone overwritten", "First byte >> 0xbb instead of 0xcc" would be printed when validating slabs. >> That's because an object has been filled with SLAB_RED_INACTIVE, >> but has not been added to slab's freelist. And between these >> two states, the behaviour of validating slab is likely to occur. >> >> Actually, it doesn't mean the slab can not work stably. But, these >> confusing messages will disturb slab debugging more or less. >> >> Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com> > > Have you observed it or it's from code inspection? Hi, Hyeonggon I try to build a module to trigger the race: #define SLUB_KTHREAD_MAX 1 static int do_slub_alloc(void *data) { char *mm = NULL; char *mm1 = NULL; char *mm2 = NULL; char *mm3 = NULL; allow_signal(SIGTERM); while (1) { mm = kmalloc(2048, GFP_KERNEL); if (mm) mm[0x100] = 0x21; if (mm) kfree(mm); mm = NULL; if (kthread_should_stop()) break; } return 0; } static int __init mini_init(void) { char *mm; int i = 0; unsigned int index; char kth_name[11] = "do_slub_00"; for (i = 0; i < SLUB_KTHREAD_MAX; i++) { kth_name[9] = '0' + i%10; kth_name[8] = '0' + i/10; slub_thread[i] = kthread_run(do_slub_alloc, NULL, kth_name); } return 0; } module_init(mini_init); And in my system, I add 'slub_debug=UFPZ' to the boot options. Next, the error messages will be printed when I test "slabinfo -v" or "echo 1 > /sys/kernel/slab/kmalloc-2048/validate". > >> --- >> mm/slub.c | 40 +++++++++++++++++----------------------- >> 1 file changed, 17 insertions(+), 23 deletions(-) >> >> diff --git a/mm/slub.c b/mm/slub.c >> index ed5c2c03a47a..310e56d99116 100644 >> --- a/mm/slub.c >> +++ b/mm/slub.c >> @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing( >> void *head, void *tail, int bulk_cnt, >> unsigned long addr) >> { >> - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); >> void *object = head; >> int cnt = 0; >> - unsigned long flags, flags2; >> + unsigned long flags; >> int ret = 0; >> >> - spin_lock_irqsave(&n->list_lock, flags); >> - slab_lock(slab, &flags2); >> - >> + slab_lock(slab, &flags); >> if (s->flags & SLAB_CONSISTENCY_CHECKS) { >> if (!check_slab(s, slab)) >> goto out; >> @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing( >> slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", >> bulk_cnt, cnt); >> >> - slab_unlock(slab, &flags2); >> - spin_unlock_irqrestore(&n->list_lock, flags); >> + slab_unlock(slab, &flags); >> if (!ret) >> slab_fix(s, "Object at 0x%p not freed", object); >> return ret; >> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, >> >> { >> void *prior; >> - int was_frozen; >> + int was_frozen, to_take_off = 0; >> struct slab new; >> unsigned long counters; >> struct kmem_cache_node *n = NULL; >> @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, >> if (kfence_free(head)) >> return; >> >> + n = get_node(s, slab_nid(slab)); >> + spin_lock_irqsave(&n->list_lock, flags); >> + > > Oh please don't do this. > > SLUB free slowpath can be hit a lot depending on workload. Thanks, your words remind me. Actually, I put the original in free_debug_processing() lock on the outside of it. Looks this change is small. Indeed, it will degrade performance more or less. 
And do you have other ideas?:) -wrw > > __slab_free() try its best not to take n->list_lock. currently takes n->list_lock > only when the slab need to be taken from list. > > Unconditionally taking n->list_lock will degrade performance. > >> if (kmem_cache_debug(s) && >> - !free_debug_processing(s, slab, head, tail, cnt, addr)) >> + !free_debug_processing(s, slab, head, tail, cnt, addr)) { >> + >> + spin_unlock_irqrestore(&n->list_lock, flags); >> return; >> + } >> >> do { >> - if (unlikely(n)) { >> - spin_unlock_irqrestore(&n->list_lock, flags); >> - n = NULL; >> - } >> + if (unlikely(to_take_off)) >> + to_take_off = 0; >> prior = slab->freelist; >> counters = slab->counters; >> set_freepointer(s, tail, prior); >> @@ -3343,18 +3343,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, >> new.frozen = 1; >> >> } else { /* Needs to be taken off a list */ >> - >> - n = get_node(s, slab_nid(slab)); >> /* >> - * Speculatively acquire the list_lock. >> * If the cmpxchg does not succeed then we may >> - * drop the list_lock without any processing. >> - * >> - * Otherwise the list_lock will synchronize with >> - * other processors updating the list of slabs. >> + * drop this behavior without any processing. >> */ >> - spin_lock_irqsave(&n->list_lock, flags); >> - >> + to_take_off = 1; >> } >> } >> >> @@ -3363,8 +3356,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, >> head, new.counters, >> "__slab_free")); >> >> - if (likely(!n)) { >> + if (likely(!to_take_off)) { >> >> + spin_unlock_irqrestore(&n->list_lock, flags); >> if (likely(was_frozen)) { >> /* >> * The list lock was not taken therefore no list >> >> -- >> 2.27.0 >>
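For anyone who wants to try the reproducer above, here is a tidied, self-contained version of the module. The allocation loop follows the snippet quoted above; the headers, the slub_thread[] array, the module_exit handler and the license tag were not shown there and are filled in here as assumptions, and the thread naming is simplified to a format string. Build it as an out-of-tree module, boot with slub_debug=UFPZ, load it, then run "slabinfo -v" or write 1 to /sys/kernel/slab/kmalloc-2048/validate while it is running.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>

#define SLUB_KTHREAD_MAX	1

static struct task_struct *slub_thread[SLUB_KTHREAD_MAX];

/*
 * Tight kmalloc/kfree loop; with slub_debug=UFPZ every free goes through
 * the debug path that races with validate_slab().
 */
static int do_slub_alloc(void *data)
{
	char *mm;

	allow_signal(SIGTERM);
	while (!kthread_should_stop()) {
		mm = kmalloc(2048, GFP_KERNEL);
		if (mm) {
			mm[0x100] = 0x21;	/* touch the object */
			kfree(mm);
		}
	}
	return 0;
}

static int __init mini_init(void)
{
	int i;

	for (i = 0; i < SLUB_KTHREAD_MAX; i++)
		slub_thread[i] = kthread_run(do_slub_alloc, NULL,
					     "do_slub_%02d", i);
	return 0;
}

static void __exit mini_exit(void)
{
	int i;

	for (i = 0; i < SLUB_KTHREAD_MAX; i++)
		if (!IS_ERR_OR_NULL(slub_thread[i]))
			kthread_stop(slub_thread[i]);
}

module_init(mini_init);
module_exit(mini_exit);
MODULE_LICENSE("GPL");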
On Mon, 30 May 2022, David Rientjes wrote: > > Unconditionally taking n->list_lock will degrade performance. > > This is a good point, it would be useful to gather some benchmarks for > workloads that are known to thrash some caches and would hit this path > such as netperf TCP_RR. It's obvious that adding new spinlocks to some of the hottest functions in the kernel will degrade performance. This goes against the basic design of these functions to be as efficient as possible.
Hi Christoph, David, Muchun and Hyeonggon, Thanks for your time. Recently, I have also been looking for other ways to solve this. The case Muchun provided is useful (thanks, Muchun!). Indeed, it seems that using n->list_lock here is unwise. Actually, I'm not sure whether you recognize the existence of such a race? If everyone agrees the race exists, then the next question may be: do we want to solve this problem, or, as David said, would it be better to deprecate the validate attribute directly? I have no strong opinion on that and hope to rely on your experience. In fact, I mainly want to collect your views on whether and how to fix this bug here. Thanks! Thanks again for your time:). -wrw On 6/2/22 11:14 PM, Christoph Lameter wrote: > On Mon, 30 May 2022, David Rientjes wrote: > >>> Unconditionally taking n->list_lock will degrade performance. >> >> This is a good point, it would be useful to gather some benchmarks for >> workloads that are known to thrash some caches and would hit this path >> such as netperf TCP_RR. > > It's obvious that adding new spinlocks to some of the hottest functions in > the kernel will degrade performance. This goes against the basic design of > these functions to be as efficient as possible.
On Tue, May 31, 2022 at 11:47:22AM +0800, Muchun Song wrote: > On Sun, May 29, 2022 at 11:37:06AM +0000, Hyeonggon Yoo wrote: > > On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote: > > > In use cases where allocating and freeing slab frequently, some > > > error messages, such as "Left Redzone overwritten", "First byte > > > 0xbb instead of 0xcc" would be printed when validating slabs. > > > That's because an object has been filled with SLAB_RED_INACTIVE, > > > but has not been added to slab's freelist. And between these > > > two states, the behaviour of validating slab is likely to occur. > > > > > > Actually, it doesn't mean the slab can not work stably. But, these > > > confusing messages will disturb slab debugging more or less. > > > > > > Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com> > > > > Have you observed it or it's from code inspection? > > > > > --- > > > mm/slub.c | 40 +++++++++++++++++----------------------- > > > 1 file changed, 17 insertions(+), 23 deletions(-) > > > > > > diff --git a/mm/slub.c b/mm/slub.c > > > index ed5c2c03a47a..310e56d99116 100644 > > > --- a/mm/slub.c > > > +++ b/mm/slub.c > > > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing( > > > void *head, void *tail, int bulk_cnt, > > > unsigned long addr) > > > { > > > - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); > > > void *object = head; > > > int cnt = 0; > > > - unsigned long flags, flags2; > > > + unsigned long flags; > > > int ret = 0; > > > > > > - spin_lock_irqsave(&n->list_lock, flags); > > > - slab_lock(slab, &flags2); > > > - > > > + slab_lock(slab, &flags); > > > if (s->flags & SLAB_CONSISTENCY_CHECKS) { > > > if (!check_slab(s, slab)) > > > goto out; > > > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing( > > > slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", > > > bulk_cnt, cnt); > > > > > > - slab_unlock(slab, &flags2); > > > - spin_unlock_irqrestore(&n->list_lock, flags); > > > + slab_unlock(slab, &flags); > > > if (!ret) > > > slab_fix(s, "Object at 0x%p not freed", object); > > > return ret; > > > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > > > > > > { > > > void *prior; > > > - int was_frozen; > > > + int was_frozen, to_take_off = 0; > > > struct slab new; > > > unsigned long counters; > > > struct kmem_cache_node *n = NULL; > > > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > > > if (kfence_free(head)) > > > return; > > > > > > + n = get_node(s, slab_nid(slab)); > > > + spin_lock_irqsave(&n->list_lock, flags); > > > + > > > > Oh please don't do this. > > > > SLUB free slowpath can be hit a lot depending on workload. > > > > __slab_free() try its best not to take n->list_lock. currently takes n->list_lock > > only when the slab need to be taken from list. > > > > Unconditionally taking n->list_lock will degrade performance. > > > > I can confirm you are right. We have encountered this issue in practise. > We have deployed somen HDFS instance on a 256-CPU machine. When there > are lots of IO requests from users, we can see lots of threads are contended > on n->list_lock. 
Lots of call traces are like following: > > ffffffff810dfbb4 native_queued_spin_lock_slowpath+0x1a4 > ffffffff81780ffb _raw_spin_lock+0x1b > ffffffff8127327e get_partial_node.isra.81+0x5e > ffffffff812752d3 ___slab_alloc+0x2f3 > ffffffff8127559c __slab_alloc+0x1c > ffffffff81275828 kmem_cache_alloc+0x278 > ffffffff812e9e3d alloc_buffer_head+0x1d > ffffffff812e9f74 alloc_page_buffers+0xa4 > ffffffff812eb0e9 create_empty_buffers+0x19 > ffffffff812eb37d create_page_buffers+0x7d > ffffffff812ed78d __block_write_begin_int+0x9d > > I thought it was because there are lots of threads which consume local > CPU slab cache quickly and then both of them try to get a new slab > from node partial list. Since there are 256 CPUs, the contention > is more competitive and easy to be visible. > > Any thoughts on this issue (e.e. how to ease contention)? Comments > are welcome. How does increasing number of partial slabs affect your situation? (increasing /sys/slab/<cache name>/cpu_partial) > Thanks. > >
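For reference, the knob mentioned above lives at /sys/kernel/slab/<cache>/cpu_partial and is normally adjusted with a simple echo from a root shell. The snippet below only illustrates reading and raising it programmatically; the kmalloc-256 cache and the doubling factor are arbitrary examples, it needs root, and raising cpu_partial trades more memory held in per-CPU partial slabs for less pressure on the node partial list and its list_lock.

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/slab/kmalloc-256/cpu_partial";
	unsigned int cur;
	FILE *f;

	/* read the current value */
	f = fopen(path, "r");
	if (!f || fscanf(f, "%u", &cur) != 1) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("current cpu_partial: %u\n", cur);

	/* raise it; more partial slabs are then cached per CPU */
	f = fopen(path, "w");
	if (!f || fprintf(f, "%u\n", cur * 2) < 0) {
		perror(path);
		return 1;
	}
	fclose(f);
	return 0;
}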
On Fri, 3 Jun 2022, Rongwei Wang wrote: > Recently, I am also find other ways to solve this. That case was provided by > Muchun is useful (Thanks Muchun!). Indeed, it seems that use n->list_lock here > is unwise. Actually, I'm not sure if you recognize the existence of such race? > If all agrees this race, then the next question may be: do we want to solve > this problem? or as David said, it would be better to deprecate validate > attribute directly. I have no idea about it, hope to rely on your experience. > > In fact, I mainly want to collect your views on whether or how to fix this bug > here. Thanks! Well validate_slab() is rarely used and should not cause the hot paths to incur performance penalties. Fix it in the validation logic somehow? Or document the issue and warn that validation may not be correct if there are current operations on the slab being validated.
On 6/7/22 8:14 PM, Christoph Lameter wrote: > On Fri, 3 Jun 2022, Rongwei Wang wrote: > >> Recently, I am also find other ways to solve this. That case was provided by >> Muchun is useful (Thanks Muchun!). Indeed, it seems that use n->list_lock here >> is unwise. Actually, I'm not sure if you recognize the existence of such race? >> If all agrees this race, then the next question may be: do we want to solve >> this problem? or as David said, it would be better to deprecate validate >> attribute directly. I have no idea about it, hope to rely on your experience. >> >> In fact, I mainly want to collect your views on whether or how to fix this bug >> here. Thanks! > > > Well validate_slab() is rarely used and should not cause the hot paths to > incur performance penalties. Fix it in the validation logic somehow? Or > document the issue and warn that validation may not be correct if there If available, I think document the issue and warn this incorrect behavior is OK. But it still prints a large amount of confusing messages, and disturbs us? > are current operations on the slab being validated. And I am trying to fix it in following way. In a short, these changes only works under the slub debug mode, and not affects the normal mode (I'm not sure). It looks not elegant enough. And if all approve of this way, I can submit the next version. Anyway, thanks for your time:). -wrw @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, { void *prior; - int was_frozen; + int was_frozen, to_take_off = 0; struct slab new; unsigned long counters; struct kmem_cache_node *n = NULL; @@ -3315,14 +3311,23 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; - if (kmem_cache_debug(s) && - !free_debug_processing(s, slab, head, tail, cnt, addr)) - return; + n = get_node(s, slab_nid(slab)); + if (kmem_cache_debug(s)) { + int ret; - do { - if (unlikely(n)) { + spin_lock_irqsave(&n->list_lock, flags); + ret = free_debug_processing(s, slab, head, tail, cnt, addr); + if (!ret) { spin_unlock_irqrestore(&n->list_lock, flags); - n = NULL; + return; + } + } + + do { + if (unlikely(to_take_off)) { + if (!kmem_cache_debug(s)) + spin_unlock_irqrestore(&n->list_lock, flags); + to_take_off = 0; } prior = slab->freelist; counters = slab->counters; @@ -3343,8 +3348,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, new.frozen = 1; } else { /* Needs to be taken off a list */ - - n = get_node(s, slab_nid(slab)); /* * Speculatively acquire the list_lock. * If the cmpxchg does not succeed then we may @@ -3353,8 +3356,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. */ - spin_lock_irqsave(&n->list_lock, flags); + if (!kmem_cache_debug(s)) + spin_lock_irqsave(&n->list_lock, flags); + to_take_off = 1; } } @@ -3363,8 +3368,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, head, new.counters, "__slab_free")); - if (likely(!n)) { - + if (likely(!to_take_off)) { + if (kmem_cache_debug(s)) + spin_unlock_irqrestore(&n->list_lock, flags); if (likely(was_frozen)) { /* * The list lock was not taken therefore no list >
On Wed, 8 Jun 2022, Rongwei Wang wrote: > If available, I think document the issue and warn this incorrect behavior is > OK. But it still prints a large amount of confusing messages, and disturbs us? Correct it would be great if you could fix this in a way that does not impact performance. > > are current operations on the slab being validated. > And I am trying to fix it in following way. In a short, these changes only > works under the slub debug mode, and not affects the normal mode (I'm not > sure). It looks not elegant enough. And if all approve of this way, I can > submit the next version. > > Anyway, thanks for your time:). > -wrw > > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct > slab *slab, > > { > void *prior; > - int was_frozen; > + int was_frozen, to_take_off = 0; > struct slab new; to_take_off has the role of !n ? Why is that needed? > - do { > - if (unlikely(n)) { > + spin_lock_irqsave(&n->list_lock, flags); > + ret = free_debug_processing(s, slab, head, tail, cnt, addr); Ok so the idea is to take the lock only if kmem_cache_debug. That looks ok. But it still adds a number of new branches etc to the free loop. Some performance tests would be useful.
On 6/8/22 8:23 PM, Christoph Lameter wrote: > On Wed, 8 Jun 2022, Rongwei Wang wrote: > >> If available, I think document the issue and warn this incorrect behavior is >> OK. But it still prints a large amount of confusing messages, and disturbs us? > > Correct it would be great if you could fix this in a way that does not > impact performance. > >>> are current operations on the slab being validated. >> And I am trying to fix it in following way. In a short, these changes only >> works under the slub debug mode, and not affects the normal mode (I'm not >> sure). It looks not elegant enough. And if all approve of this way, I can >> submit the next version. > > >> >> Anyway, thanks for your time:). >> -wrw >> >> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, > struct >> slab *slab, >> >> { >> void *prior; >> - int was_frozen; >> + int was_frozen, to_take_off = 0; >> struct slab new; > > to_take_off has the role of !n ? Why is that needed? > >> - do { >> - if (unlikely(n)) { >> + spin_lock_irqsave(&n->list_lock, flags); >> + ret = free_debug_processing(s, slab, head, tail, cnt, addr); > > Ok so the idea is to take the lock only if kmem_cache_debug. That looks > ok. But it still adds a number of new branches etc to the free loop. > > Some performance tests would be useful. Hi Christoph Thanks for your time! Do you have some advice in benchmarks that need me to test? And I find that hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no idea how to use these two benchmarks test to cover the above changes. Can you give some examples? Thanks very much! Sorry for late reply! [1]https://lore.kernel.org/lkml/20210301080404.GF12822@xsang-OptiPlex-9020/ [2]https://lore.kernel.org/linux-mm/20210128134512.GF3592@techsingularity.net/
On Sat, 11 Jun 2022, Rongwei Wang wrote: > > Ok so the idea is to take the lock only if kmem_cache_debug. That looks > > ok. But it still adds a number of new branches etc to the free loop. > > > > Some performance tests would be useful. > Hi Christoph > > Thanks for your time! > Do you have some advice in benchmarks that need me to test? And I find that > hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no > idea how to use these two benchmarks test to cover the above changes. Can you > give some examples? Thanks very much! Hi Rongwei, Well run hackbench with an without the change. There are also synthetic benchmarks available at https://gentwo.org/christoph/slub/tests/ These measure the cycles that slab operations take. However, they are a bit old and I think Pekka may have a newer version of these patches. Greetings, Christoph
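The tests behind that URL are kernel modules that report cycles per slab operation, which is where the tables later in this thread come from. Purely to show the shape of such a measurement (not a substitute for the in-kernel tests), a user-space loop with malloc standing in for kmalloc might look like this:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define ITERS 100000

static double now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1e9 + ts.tv_nsec;
}

int main(void)
{
	static const size_t sizes[] = { 8, 64, 512, 2048, 16384 };

	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		double start = now_ns();

		for (int j = 0; j < ITERS; j++) {
			void *p = malloc(sizes[i]);

			if (p) {
				/* touch the object so the allocation is real */
				*(volatile char *)p = 0;
				free(p);
			}
		}
		printf("%6zu bytes: %.1f ns per alloc+free\n",
		       sizes[i], (now_ns() - start) / ITERS);
	}
	return 0;
}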
On 6/13/22 9:50 PM, Christoph Lameter wrote: > On Sat, 11 Jun 2022, Rongwei Wang wrote: > >>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks >>> ok. But it still adds a number of new branches etc to the free loop. >>> >>> Some performance tests would be useful. >> Hi Christoph >> >> Thanks for your time! >> Do you have some advice in benchmarks that need me to test? And I find that >> hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no >> idea how to use these two benchmarks test to cover the above changes. Can you >> give some examples? Thanks very much! > > > Hi Rongwei, > > Well run hackbench with an without the change. OK > > There are also synthetic benchmarks available at > https://gentwo.org/christoph/slub/tests/ That's great, thanks very much! > > These measure the cycles that slab operations take. However, they are a > bit old and I think Pekka may have a newer version of these > patches. > > Greetings, > Christoph
On 6/13/22 9:50 PM, Christoph Lameter wrote: > On Sat, 11 Jun 2022, Rongwei Wang wrote: > >>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks >>> ok. But it still adds a number of new branches etc to the free loop. >>> >>> Some performance tests would be useful. >> Hi Christoph >> >> Thanks for your time! >> Do you have some advice in benchmarks that need me to test? And I find that >> hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no >> idea how to use these two benchmarks test to cover the above changes. Can you >> give some examples? Thanks very much! > > > Hi Rongwei, > > Well run hackbench with an without the change. > > There are also synthetic benchmarks available at > https://gentwo.org/christoph/slub/tests/

Christoph, I refer [1] to test some data below. The slub_test case is same to your provided. And here you the result of its test (the baseline is the data of upstream kernel, and fix is results of patched kernel).

my test environment: arm64 vm (32 cores and 128G memory)

And I have removed 'slub_debug=UFPZ' in cmdline before testing the following two groups of data.

[1] https://lore.kernel.org/linux-mm/20200527103545.4348ac10@carbon/

Single thread testing

1. Kmalloc: Repeatedly allocate then free test

                       before (baseline)        fix
                       kmalloc     kfree        kmalloc     kfree
10000 times 8          7 cycles    8 cycles     5 cycles    7 cycles
10000 times 16         4 cycles    8 cycles     3 cycles    6 cycles
10000 times 32         4 cycles    8 cycles     3 cycles    6 cycles
10000 times 64         3 cycles    8 cycles     3 cycles    6 cycles
10000 times 128        3 cycles    8 cycles     3 cycles    6 cycles
10000 times 256        12 cycles   8 cycles     11 cycles   7 cycles
10000 times 512        27 cycles   10 cycles    23 cycles   11 cycles
10000 times 1024       18 cycles   9 cycles     20 cycles   10 cycles
10000 times 2048       54 cycles   12 cycles    54 cycles   12 cycles
10000 times 4096       105 cycles  20 cycles    105 cycles  25 cycles
10000 times 8192       210 cycles  35 cycles    212 cycles  39 cycles
10000 times 16384      133 cycles  45 cycles    119 cycles  46 cycles

2. Kmalloc: alloc/free test

                                     before (base)   fix
10000 times kmalloc(8)/kfree         3 cycles        3 cycles
10000 times kmalloc(16)/kfree        3 cycles        3 cycles
10000 times kmalloc(32)/kfree        3 cycles        3 cycles
10000 times kmalloc(64)/kfree        3 cycles        3 cycles
10000 times kmalloc(128)/kfree       3 cycles        3 cycles
10000 times kmalloc(256)/kfree       3 cycles        3 cycles
10000 times kmalloc(512)/kfree       3 cycles        3 cycles
10000 times kmalloc(1024)/kfree      3 cycles        3 cycles
10000 times kmalloc(2048)/kfree      3 cycles        3 cycles
10000 times kmalloc(4096)/kfree      3 cycles        3 cycles
10000 times kmalloc(8192)/kfree      3 cycles        3 cycles
10000 times kmalloc(16384)/kfree     33 cycles       33 cycles

Concurrent allocs

                                     before (baseline)   fix
Kmalloc N*alloc N*free(8)            Average=17/18       Average=11/11
Kmalloc N*alloc N*free(16)           Average=15/49       Average=9/11
Kmalloc N*alloc N*free(32)           Average=15/40       Average=9/11
Kmalloc N*alloc N*free(64)           Average=15/44       Average=9/10
Kmalloc N*alloc N*free(128)          Average=15/42       Average=10/10
Kmalloc N*alloc N*free(256)          Average=128/28      Average=71/22
Kmalloc N*alloc N*free(512)          Average=206/34      Average=178/26
Kmalloc N*alloc N*free(1024)         Average=762/37      Average=369/27
Kmalloc N*alloc N*free(2048)         Average=327/58      Average=339/33
Kmalloc N*alloc N*free(4096)         Average=2255/128    Average=1813/64

                                     before (baseline)   fix
Kmalloc N*(alloc free)(8)            Average=3           Average=3
Kmalloc N*(alloc free)(16)           Average=3           Average=3
Kmalloc N*(alloc free)(32)           Average=3           Average=3
Kmalloc N*(alloc free)(64)           Average=3           Average=3
Kmalloc N*(alloc free)(128)          Average=3           Average=3
Kmalloc N*(alloc free)(256)          Average=3           Average=3
Kmalloc N*(alloc free)(512)          Average=3           Average=3
Kmalloc N*(alloc free)(1024)         Average=3           Average=3
Kmalloc N*(alloc free)(2048)         Average=3           Average=3
Kmalloc N*(alloc free)(4096)         Average=3           Average=3

According to the above data, It seems that no significant performance degradation in patched kernel. Plus, in concurrent allocs test, likes Kmalloc N*alloc N*free(1024), the data of 'fix' column is better than baseline (it looks less is better, if I am wrong, please let me know). And if you have other suggestions, I can try to test more data. Thanks for your time! -wrw

> > These measure the cycles that slab operations take. However, they are a > bit old and I think Pekka may have a newer version of these > patches. > > Greetings, > Christoph
On 6/8/22 14:23, Christoph Lameter wrote: > On Wed, 8 Jun 2022, Rongwei Wang wrote: > >> If available, I think document the issue and warn this incorrect behavior is >> OK. But it still prints a large amount of confusing messages, and disturbs us? > > Correct it would be great if you could fix this in a way that does not > impact performance. > >> > are current operations on the slab being validated. >> And I am trying to fix it in following way. In a short, these changes only >> works under the slub debug mode, and not affects the normal mode (I'm not >> sure). It looks not elegant enough. And if all approve of this way, I can >> submit the next version. > > >> >> Anyway, thanks for your time:). >> -wrw >> >> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, > struct >> slab *slab, >> >> { >> void *prior; >> - int was_frozen; >> + int was_frozen, to_take_off = 0; >> struct slab new; > > to_take_off has the role of !n ? Why is that needed? > >> - do { >> - if (unlikely(n)) { >> + spin_lock_irqsave(&n->list_lock, flags); >> + ret = free_debug_processing(s, slab, head, tail, cnt, addr); > > Ok so the idea is to take the lock only if kmem_cache_debug. That looks > ok. But it still adds a number of new branches etc to the free loop. It also further complicates the already tricky code. I wonder if we should make more benefit from the fact that for kmem_cache_debug() caches we don't leave any slabs on percpu or percpu partial lists, and also in free_debug_processing() we aready take both list_lock and slab_lock. If we just did the freeing immediately there under those locks, we would be protected against other freeing cpus by that list_lock and don't need the double cmpxchg tricks. What about against allocating cpus? More tricky as those will currently end up privatizing the freelist via get_partial(), only to deactivate it again, so our list_lock+slab_lock in freeing path would not protect in the meanwhile. But the allocation is currently very inefficient for debug caches, as in get_partial() it will take the list_lock to take the slab from partial list and then in most cases again in deactivate_slab() to return it. If instead the allocation path for kmem_cache_debug() cache would take a single object from the partial list (not whole freelist) under list_lock, it would be ultimately more efficient, and protect against freeing using list_lock. Sounds like an idea worth trying to me? And of course we would stop creating the 'validate' sysfs files for non-debug caches.
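To make the allocation-side idea above a little more concrete, the sketch below takes a single object from the first partial slab under n->list_lock. It is only an illustration of the approach being discussed, not the patch that was later posted: the helper and field names follow mm/slub.c of that time, but the function itself is invented here, it is not compilable on its own, and it omits the alloc_debug_processing() checks and the move to the full list that a real version needs.

static void *alloc_single_object(struct kmem_cache *s,
				 struct kmem_cache_node *n)
{
	struct slab *slab;
	void *object = NULL;
	unsigned long flags;

	spin_lock_irqsave(&n->list_lock, flags);
	slab = list_first_entry_or_null(&n->partial, struct slab, slab_list);
	if (slab) {
		/* take one object, not the whole freelist */
		object = slab->freelist;
		slab->freelist = get_freepointer(s, object);
		slab->inuse++;
		if (!slab->freelist)
			/* last free object: the slab leaves the partial list */
			remove_partial(n, slab);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return object;
}

Because the object is handed out under the same list_lock that the freeing side would hold, a concurrent validate_slab() (which also takes list_lock) can no longer observe the half-updated state, and the expensive privatize-then-deactivate round trip through get_partial()/deactivate_slab() disappears for debug caches.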
On Fri, 17 Jun 2022, Rongwei Wang wrote: > Christoph, I refer [1] to test some data below. The slub_test case is same to > your provided. And here you the result of its test (the baseline is the data > of upstream kernel, and fix is results of patched kernel). Ah good. > Single thread testing > > 1. Kmalloc: Repeatedly allocate then free test > > before (baseline) fix > kmalloc kfree kmalloc kfree > 10000 times 8 7 cycles 8 cycles 5 cycles 7 cycles > 10000 times 16 4 cycles 8 cycles 3 cycles 6 cycles > 10000 times 32 4 cycles 8 cycles 3 cycles 6 cycles Well the cycle reduction is strange. Tests are not done in the same environment? Maybe good to not use NUMA or bind to the same cpu > 10000 times 64 3 cycles 8 cycles 3 cycles 6 cycles > 10000 times 128 3 cycles 8 cycles 3 cycles 6 cycles > 10000 times 256 12 cycles 8 cycles 11 cycles 7 cycles > 10000 times 512 27 cycles 10 cycles 23 cycles 11 cycles > 10000 times 1024 18 cycles 9 cycles 20 cycles 10 cycles > 10000 times 2048 54 cycles 12 cycles 54 cycles 12 cycles > 10000 times 4096 105 cycles 20 cycles 105 cycles 25 cycles > 10000 times 8192 210 cycles 35 cycles 212 cycles 39 cycles > 10000 times 16384 133 cycles 45 cycles 119 cycles 46 cycles Seems to be different environments. > According to the above data, It seems that no significant performance > degradation in patched kernel. Plus, in concurrent allocs test, likes Kmalloc > N*alloc N*free(1024), the data of 'fix' column is better than baseline (it > looks less is better, if I am wrong, please let me know). And if you have > other suggestions, I can try to test more data. Well can you explain the cycle reduction?
On 6/17/22 10:19 PM, Christoph Lameter wrote: > On Fri, 17 Jun 2022, Rongwei Wang wrote: > >> Christoph, I refer [1] to test some data below. The slub_test case is same to >> your provided. And here you the result of its test (the baseline is the data >> of upstream kernel, and fix is results of patched kernel). > > Ah good. >> Single thread testing >> >> 1. Kmalloc: Repeatedly allocate then free test >> >> before (baseline) fix >> kmalloc kfree kmalloc kfree >> 10000 times 8 7 cycles 8 cycles 5 cycles 7 cycles >> 10000 times 16 4 cycles 8 cycles 3 cycles 6 cycles >> 10000 times 32 4 cycles 8 cycles 3 cycles 6 cycles > > Well the cycle reduction is strange. Tests are not done in the same > environment? Maybe good to not use NUMA or bind to the same cpu It's the same environment. I can sure. And there are four nodes (32G per-node and 8 cores per-node) in my test environment. whether I need to test in one node? If right, I can try. > >> 10000 times 64 3 cycles 8 cycles 3 cycles 6 cycles >> 10000 times 128 3 cycles 8 cycles 3 cycles 6 cycles >> 10000 times 256 12 cycles 8 cycles 11 cycles 7 cycles >> 10000 times 512 27 cycles 10 cycles 23 cycles 11 cycles >> 10000 times 1024 18 cycles 9 cycles 20 cycles 10 cycles >> 10000 times 2048 54 cycles 12 cycles 54 cycles 12 cycles >> 10000 times 4096 105 cycles 20 cycles 105 cycles 25 cycles >> 10000 times 8192 210 cycles 35 cycles 212 cycles 39 cycles >> 10000 times 16384 133 cycles 45 cycles 119 cycles 46 cycles > > > Seems to be different environments. > >> According to the above data, It seems that no significant performance >> degradation in patched kernel. Plus, in concurrent allocs test, likes Kmalloc >> N*alloc N*free(1024), the data of 'fix' column is better than baseline (it >> looks less is better, if I am wrong, please let me know). And if you have >> other suggestions, I can try to test more data. > > Well can you explain the cycle reduction? Maybe because of four nodes in my system or only 8 cores (very small) in each node? Thanks, you remind me that I need to increase core number of each node or change node number to compere the results. Thanks!
On Sat, 18 Jun 2022, Rongwei Wang wrote: > > Well the cycle reduction is strange. Tests are not done in the same > > environment? Maybe good to not use NUMA or bind to the same cpu > It's the same environment. I can sure. And there are four nodes (32G per-node > and 8 cores per-node) in my test environment. whether I need to test in one > node? If right, I can try. Ok in a NUMA environment the memory allocation is randomized on bootup. You may get different numbers after you reboot the system. Try to switch NUMA off. Use a single node to get consistent numbers. It may be useful to figure out what memory structure causes the increase in latency in a NUMA environment. If you can figure that out and properly allocate the memory structure that causes the increase in latency then you may be able to increase the performance of the allocator.
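One way to follow that advice without reconfiguring the machine is to pin both the benchmark's CPUs and its memory to a single node; Rongwei's reply below does this at the VM level with numactl -N 0 -m 0. The same effect for a user-space test process can be had with libnuma, as in this small sketch (node 0 is an arbitrary choice; link with -lnuma):

#include <numa.h>
#include <stdio.h>

int main(void)
{
	struct bitmask *nodes;

	if (numa_available() < 0) {
		fprintf(stderr, "NUMA not available\n");
		return 1;
	}

	/* run only on node 0's CPUs (like numactl -N 0) */
	if (numa_run_on_node(0) < 0) {
		perror("numa_run_on_node");
		return 1;
	}

	/* allocate only from node 0's memory (like numactl -m 0) */
	nodes = numa_allocate_nodemask();
	numa_bitmask_setbit(nodes, 0);
	numa_set_membind(nodes);
	numa_bitmask_free(nodes);

	/* the allocation benchmark would run from this point on */
	printf("pinned to NUMA node 0\n");
	return 0;
}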
On 6/20/22 7:57 PM, Christoph Lameter wrote: > On Sat, 18 Jun 2022, Rongwei Wang wrote: > >>> Well the cycle reduction is strange. Tests are not done in the same >>> environment? Maybe good to not use NUMA or bind to the same cpu >> It's the same environment. I can sure. And there are four nodes (32G per-node >> and 8 cores per-node) in my test environment. whether I need to test in one >> node? If right, I can try. > > Ok in a NUMA environment the memory allocation is randomized on bootup. > You may get different numbers after you reboot the system. Try to switch > NUMA off. Use a single node to get consistent numbers.

Sorry for the late reply. First, let me share my test environment: an arm64 VM with 32 cores and 128G memory, configured with only one node. Plus, I used 'numactl -N 0 -m 0 qemu-kvm ...' to start this VM, mainly to avoid data jitter. And I can confirm that the physical machine hosting this VM had the same configuration when I tested the baseline kernel and the patched kernel. If the above test environment has any problems, please let me know.

The following is the latest data:

Single thread testing

1. Kmalloc: Repeatedly allocate then free test

                       before                   fix
                       kmalloc     kfree        kmalloc     kfree
10000 times 8          4 cycles    5 cycles     4 cycles    5 cycles
10000 times 16         3 cycles    5 cycles     3 cycles    5 cycles
10000 times 32         3 cycles    5 cycles     3 cycles    5 cycles
10000 times 64         3 cycles    5 cycles     3 cycles    5 cycles
10000 times 128        3 cycles    5 cycles     3 cycles    5 cycles
10000 times 256        14 cycles   9 cycles     6 cycles    8 cycles
10000 times 512        9 cycles    8 cycles     9 cycles    10 cycles
10000 times 1024       48 cycles   10 cycles    6 cycles    10 cycles
10000 times 2048       31 cycles   12 cycles    35 cycles   13 cycles
10000 times 4096       96 cycles   17 cycles    96 cycles   18 cycles
10000 times 8192       188 cycles  27 cycles    190 cycles  27 cycles
10000 times 16384      117 cycles  38 cycles    115 cycles  38 cycles

2. Kmalloc: alloc/free test

                                     before          fix
10000 times kmalloc(8)/kfree         3 cycles        3 cycles
10000 times kmalloc(16)/kfree        3 cycles        3 cycles
10000 times kmalloc(32)/kfree        3 cycles        3 cycles
10000 times kmalloc(64)/kfree        3 cycles        3 cycles
10000 times kmalloc(128)/kfree       3 cycles        3 cycles
10000 times kmalloc(256)/kfree       3 cycles        3 cycles
10000 times kmalloc(512)/kfree       3 cycles        3 cycles
10000 times kmalloc(1024)/kfree      3 cycles        3 cycles
10000 times kmalloc(2048)/kfree      3 cycles        3 cycles
10000 times kmalloc(4096)/kfree      3 cycles        3 cycles
10000 times kmalloc(8192)/kfree      3 cycles        3 cycles
10000 times kmalloc(16384)/kfree     33 cycles       33 cycles

Concurrent allocs

                                     before              fix
Kmalloc N*alloc N*free(8)            Average=13/14       Average=14/15
Kmalloc N*alloc N*free(16)           Average=13/15       Average=13/15
Kmalloc N*alloc N*free(32)           Average=13/15       Average=13/15
Kmalloc N*alloc N*free(64)           Average=13/15       Average=13/15
Kmalloc N*alloc N*free(128)          Average=13/15       Average=13/15
Kmalloc N*alloc N*free(256)          Average=137/29      Average=134/39
Kmalloc N*alloc N*free(512)          Average=61/29       Average=64/28
Kmalloc N*alloc N*free(1024)         Average=465/50      Average=656/55
Kmalloc N*alloc N*free(2048)         Average=503/97      Average=422/97
Kmalloc N*alloc N*free(4096)         Average=1592/206    Average=1624/207

Kmalloc N*(alloc free)(8)            Average=3           Average=3
Kmalloc N*(alloc free)(16)           Average=3           Average=3
Kmalloc N*(alloc free)(32)           Average=3           Average=3
Kmalloc N*(alloc free)(64)           Average=3           Average=3
Kmalloc N*(alloc free)(128)          Average=3           Average=3
Kmalloc N*(alloc free)(256)          Average=3           Average=3
Kmalloc N*(alloc free)(512)          Average=3           Average=3
Kmalloc N*(alloc free)(1024)         Average=3           Average=3
Kmalloc N*(alloc free)(2048)         Average=3           Average=3
Kmalloc N*(alloc free)(4096)         Average=3           Average=3

Can the above data indicate that this modification (which only takes effect when kmem_cache_debug(s) is true) does not introduce significant performance impact? Thanks for your time.

> > It may be useful to figure out what memory structure causes the increase in > latency in a NUMA environment. If you can figure that out and properly > allocate the memory structure that causes the increase in latency then > you may be able to increase the performance of the allocator.
On 6/17/22 5:40 PM, Vlastimil Babka wrote: > On 6/8/22 14:23, Christoph Lameter wrote: >> On Wed, 8 Jun 2022, Rongwei Wang wrote: >> >>> If available, I think document the issue and warn this incorrect behavior is >>> OK. But it still prints a large amount of confusing messages, and disturbs us? >> >> Correct it would be great if you could fix this in a way that does not >> impact performance. >> >>>> are current operations on the slab being validated. >>> And I am trying to fix it in following way. In a short, these changes only >>> works under the slub debug mode, and not affects the normal mode (I'm not >>> sure). It looks not elegant enough. And if all approve of this way, I can >>> submit the next version. >> >> >>> >>> Anyway, thanks for your time:). >>> -wrw >>> >>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, >> struct >>> slab *slab, >>> >>> { >>> void *prior; >>> - int was_frozen; >>> + int was_frozen, to_take_off = 0; >>> struct slab new; >> >> to_take_off has the role of !n ? Why is that needed? >> >>> - do { >>> - if (unlikely(n)) { >>> + spin_lock_irqsave(&n->list_lock, flags); >>> + ret = free_debug_processing(s, slab, head, tail, cnt, addr); >> >> Ok so the idea is to take the lock only if kmem_cache_debug. That looks >> ok. But it still adds a number of new branches etc to the free loop. > Hi, Vlastimil, sorry for missing your message long time. > It also further complicates the already tricky code. I wonder if we should > make more benefit from the fact that for kmem_cache_debug() caches we don't > leave any slabs on percpu or percpu partial lists, and also in > free_debug_processing() we aready take both list_lock and slab_lock. If we > just did the freeing immediately there under those locks, we would be > protected against other freeing cpus by that list_lock and don't need the > double cmpxchg tricks. enen, I'm not sure get your "don't need the double cmpxchg tricks" means completely. What you want to say is that replace cmpxchg_double_slab() here with following code when kmem_cache_debug(s)? __slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old){ slab->freelist = freelist_new; slab->counters = counters_new; __slab_unlock(slab); local_irq_restore(flags); return true; } __slab_unlock(slab); If I make mistakes for your words, please let me know. Thanks! > > What about against allocating cpus? More tricky as those will currently end > up privatizing the freelist via get_partial(), only to deactivate it again, > so our list_lock+slab_lock in freeing path would not protect in the > meanwhile. But the allocation is currently very inefficient for debug > caches, as in get_partial() it will take the list_lock to take the slab from > partial list and then in most cases again in deactivate_slab() to return it. It seems that I need speed some time to eat these words. Anyway, thanks. > > If instead the allocation path for kmem_cache_debug() cache would take a > single object from the partial list (not whole freelist) under list_lock, it > would be ultimately more efficient, and protect against freeing using > list_lock. Sounds like an idea worth trying to me? Hyeonggon had a similar advice that split freeing and allocating slab from debugging, likes below: __slab_alloc() { if (kmem_cache_debug(s)) slab_alloc_debug() else ___slab_alloc() } I guess that above code aims to solve your mentioned problem (idea)? 
slab_free() { if (kmem_cache_debug(s)) slab_free_debug() else __do_slab_free() } Currently, I only modify the code of freeing slab to fix the confusing messages of "slabinfo -v". If you agree, I can try to realize above mentioned slab_alloc_debug() code. Maybe it's also a challenge to me. Thanks for your time. > And of course we would stop creating the 'validate' sysfs files for > non-debug caches.
On 7/15/22 10:05, Rongwei Wang wrote: > > > On 6/17/22 5:40 PM, Vlastimil Babka wrote: >> On 6/8/22 14:23, Christoph Lameter wrote: >>> On Wed, 8 Jun 2022, Rongwei Wang wrote: >>> >>>> If available, I think document the issue and warn this incorrect >>>> behavior is >>>> OK. But it still prints a large amount of confusing messages, and >>>> disturbs us? >>> >>> Correct it would be great if you could fix this in a way that does not >>> impact performance. >>> >>>>> are current operations on the slab being validated. >>>> And I am trying to fix it in following way. In a short, these changes only >>>> works under the slub debug mode, and not affects the normal mode (I'm not >>>> sure). It looks not elegant enough. And if all approve of this way, I can >>>> submit the next version. >>> >>> >>>> >>>> Anyway, thanks for your time:). >>>> -wrw >>>> >>>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, >>> struct >>>> slab *slab, >>>> >>>> { >>>> void *prior; >>>> - int was_frozen; >>>> + int was_frozen, to_take_off = 0; >>>> struct slab new; >>> >>> to_take_off has the role of !n ? Why is that needed? >>> >>>> - do { >>>> - if (unlikely(n)) { >>>> + spin_lock_irqsave(&n->list_lock, flags); >>>> + ret = free_debug_processing(s, slab, head, tail, cnt, >>>> addr); >>> >>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks >>> ok. But it still adds a number of new branches etc to the free loop. >> > Hi, Vlastimil, sorry for missing your message long time. Hi, no problem. >> It also further complicates the already tricky code. I wonder if we should >> make more benefit from the fact that for kmem_cache_debug() caches we don't >> leave any slabs on percpu or percpu partial lists, and also in >> free_debug_processing() we aready take both list_lock and slab_lock. If we >> just did the freeing immediately there under those locks, we would be >> protected against other freeing cpus by that list_lock and don't need the >> double cmpxchg tricks. > enen, I'm not sure get your "don't need the double cmpxchg tricks" means > completely. What you want to say is that replace cmpxchg_double_slab() here > with following code when kmem_cache_debug(s)? > > __slab_lock(slab); > if (slab->freelist == freelist_old && slab->counters == counters_old){ > slab->freelist = freelist_new; > slab->counters = counters_new; > __slab_unlock(slab); > local_irq_restore(flags); > return true; > } > __slab_unlock(slab); Pretty much, but it's more complicated. > If I make mistakes for your words, please let me know. > Thanks! >> >> What about against allocating cpus? More tricky as those will currently end >> up privatizing the freelist via get_partial(), only to deactivate it again, >> so our list_lock+slab_lock in freeing path would not protect in the >> meanwhile. But the allocation is currently very inefficient for debug >> caches, as in get_partial() it will take the list_lock to take the slab from >> partial list and then in most cases again in deactivate_slab() to return it. > It seems that I need speed some time to eat these words. Anyway, thanks. >> >> If instead the allocation path for kmem_cache_debug() cache would take a >> single object from the partial list (not whole freelist) under list_lock, it >> would be ultimately more efficient, and protect against freeing using >> list_lock. Sounds like an idea worth trying to me? 
> > Hyeonggon had a similar advice that split freeing and allocating slab from > debugging, likes below: > > > __slab_alloc() { > if (kmem_cache_debug(s)) > slab_alloc_debug() > else > ___slab_alloc() > } > > I guess that above code aims to solve your mentioned problem (idea)? > > slab_free() { > if (kmem_cache_debug(s)) > slab_free_debug() > else > __do_slab_free() > } > > Currently, I only modify the code of freeing slab to fix the confusing > messages of "slabinfo -v". If you agree, I can try to realize above > mentioned slab_alloc_debug() code. Maybe it's also a challenge to me. I already started working on this approach and hope to post a RFC soon. > Thanks for your time. > >> And of course we would stop creating the 'validate' sysfs files for >> non-debug caches.
On 7/15/22 6:33 PM, Vlastimil Babka wrote: > On 7/15/22 10:05, Rongwei Wang wrote: >> >> >> On 6/17/22 5:40 PM, Vlastimil Babka wrote: >>> On 6/8/22 14:23, Christoph Lameter wrote: >>>> On Wed, 8 Jun 2022, Rongwei Wang wrote: >>>> >>>>> If available, I think document the issue and warn this incorrect >>>>> behavior is >>>>> OK. But it still prints a large amount of confusing messages, and >>>>> disturbs us? >>>> >>>> Correct it would be great if you could fix this in a way that does not >>>> impact performance. >>>> >>>>>> are current operations on the slab being validated. >>>>> And I am trying to fix it in following way. In a short, these changes only >>>>> works under the slub debug mode, and not affects the normal mode (I'm not >>>>> sure). It looks not elegant enough. And if all approve of this way, I can >>>>> submit the next version. >>>> >>>> >>>>> >>>>> Anyway, thanks for your time:). >>>>> -wrw >>>>> >>>>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, >>>> struct >>>>> slab *slab, >>>>> >>>>> { >>>>> void *prior; >>>>> - int was_frozen; >>>>> + int was_frozen, to_take_off = 0; >>>>> struct slab new; >>>> >>>> to_take_off has the role of !n ? Why is that needed? >>>> >>>>> - do { >>>>> - if (unlikely(n)) { >>>>> + spin_lock_irqsave(&n->list_lock, flags); >>>>> + ret = free_debug_processing(s, slab, head, tail, cnt, >>>>> addr); >>>> >>>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks >>>> ok. But it still adds a number of new branches etc to the free loop. >>> >> Hi, Vlastimil, sorry for missing your message long time. > > Hi, no problem. > >>> It also further complicates the already tricky code. I wonder if we should >>> make more benefit from the fact that for kmem_cache_debug() caches we don't >>> leave any slabs on percpu or percpu partial lists, and also in >>> free_debug_processing() we aready take both list_lock and slab_lock. If we >>> just did the freeing immediately there under those locks, we would be >>> protected against other freeing cpus by that list_lock and don't need the >>> double cmpxchg tricks. >> enen, I'm not sure get your "don't need the double cmpxchg tricks" means >> completely. What you want to say is that replace cmpxchg_double_slab() here >> with following code when kmem_cache_debug(s)? >> >> __slab_lock(slab); >> if (slab->freelist == freelist_old && slab->counters == counters_old){ >> slab->freelist = freelist_new; >> slab->counters = counters_new; >> __slab_unlock(slab); >> local_irq_restore(flags); >> return true; >> } >> __slab_unlock(slab); > > Pretty much, but it's more complicated. Yes, actually, I think reuse cmpxchg_double_slab() here is more concise in code. I'm already finish this part of code, but hesitating whether to replace cmpxchg_double_slab(). > >> If I make mistakes for your words, please let me know. >> Thanks! >>> >>> What about against allocating cpus? More tricky as those will currently end >>> up privatizing the freelist via get_partial(), only to deactivate it again, >>> so our list_lock+slab_lock in freeing path would not protect in the >>> meanwhile. But the allocation is currently very inefficient for debug >>> caches, as in get_partial() it will take the list_lock to take the slab from >>> partial list and then in most cases again in deactivate_slab() to return it. >> It seems that I need speed some time to eat these words. Anyway, thanks. 
>>> >>> If instead the allocation path for kmem_cache_debug() cache would take a >>> single object from the partial list (not whole freelist) under list_lock, it >>> would be ultimately more efficient, and protect against freeing using >>> list_lock. Sounds like an idea worth trying to me? >> >> Hyeonggon had a similar advice that split freeing and allocating slab from >> debugging, likes below: >> >> >> __slab_alloc() { >> if (kmem_cache_debug(s)) >> slab_alloc_debug() >> else >> ___slab_alloc() >> } >> >> I guess that above code aims to solve your mentioned problem (idea)? >> >> slab_free() { >> if (kmem_cache_debug(s)) >> slab_free_debug() >> else >> __do_slab_free() >> } >> >> Currently, I only modify the code of freeing slab to fix the confusing >> messages of "slabinfo -v". If you agree, I can try to realize above >> mentioned slab_alloc_debug() code. Maybe it's also a challenge to me. > > I already started working on this approach and hope to post a RFC soon. OK, that's great. > >> Thanks for your time. >> >>> And of course we would stop creating the 'validate' sysfs files for >>> non-debug caches.
On 5/29/22 10:15, Rongwei Wang wrote: > In use cases where allocating and freeing slab frequently, some > error messages, such as "Left Redzone overwritten", "First byte > 0xbb instead of 0xcc" would be printed when validating slabs. > That's because an object has been filled with SLAB_RED_INACTIVE, > but has not been added to slab's freelist. And between these > two states, the behaviour of validating slab is likely to occur. > > Actually, it doesn't mean the slab can not work stably. But, these > confusing messages will disturb slab debugging more or less. > > Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com> As I've said in the sub-thread I had the following kind of fix in mind. I think it should cover the cases from your patches 1/3 and 3/3. ----8<---- From c35fe2a781a7bc4ef37ef3ded289f4ced82562bd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka <vbabka@suse.cz> Date: Mon, 4 Jul 2022 14:09:09 +0200 Subject: [RFC] mm, slub: restrict sysfs validation to debug caches and make it safe Rongwei Wang reports [1] that cache validation triggered by writing to /sys/kernel/slab/<cache>/validate is racy against normal cache operations (e.g. freeing) in a way that can cause false positive inconsistency reports for caches with debugging enabled. The problem is that debugging actions that mark object free or active and actual freelist operations are not atomic, and the validation can see an inconsistent state. For caches that don't have debugging enabled, other races are possible that result in false reports of wrong slab counts. This patch attempts to solve these issues while not adding overhead to normal (especially fastpath) operations for caches that do not have debugging enabled, just to make possible userspace-triggered validation safe. Instead, disable the validation for caches that don't have debugging enabled and make the sysfs handler return -EINVAL. For caches that do have debugging enabled, we can instead extend the existing approach of not using percpu freelists to force all operations to the slow paths where debugging is checked for and processed. The processing on free in free_debug_processing() already happens under n->list_lock and slab_lock() so we can extend it to actually do the freeing as well and thus make it atomic against concurrent validation. The processing on alloc in alloc_debug_processing() currently doesn't take any locks, but we have to first allocate the object from a slab on the partial list (as percpu slabs are always non-existent) and thus take n->list_lock. Add a function alloc_single_from_partial() that additionally takes slab_lock() for the debug processing and then grabs just the allocated object instead of the whole freelist. This again makes it atomic against validation and it is also ultimately more efficient than the current grabbing of freelist immediately followed by slab deactivation. Neither of these changes affect the fast paths. The function free_debug_processing() was moved so that it is placed later than the definitions of add_partial(), remove_partial() and discard_slab(). 
[1] https://lore.kernel.org/all/20220529081535.69275-1-rongwei.wang@linux.alibaba.com/ Reported-by: Rongwei Wang <rongwei.wang@linux.alibaba.com> Signed-off-by: Vlastimil Babka <vbabka@suse.cz> --- mm/slub.c | 250 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 174 insertions(+), 76 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b1281b8654bd..954fe7ad5ee1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1324,17 +1324,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, } static noinline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, - void *object, unsigned long addr) + struct slab *slab, void *object) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!alloc_consistency_checks(s, slab, object)) goto bad; } - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); + /* Success. Perform special debug activities for allocs */ trace(s, slab, object, 1); init_object(s, object, SLUB_RED_ACTIVE); return 1; @@ -1385,63 +1382,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 1; } -/* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - void *object = head; - int cnt = 0; - unsigned long flags, flags2; - int ret = 0; - depot_stack_handle_t handle = 0; - - if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); - - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(slab, &flags2); - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!check_slab(s, slab)) - goto out; - } - -next_object: - cnt++; - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!free_consistency_checks(s, slab, object, addr)) - goto out; - } - - if (s->flags & SLAB_STORE_USER) - set_track_update(s, object, TRACK_FREE, addr, handle); - trace(s, slab, object, 0); - /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ - init_object(s, object, SLUB_RED_INACTIVE); - - /* Reached end of constructed freelist yet? */ - if (object != tail) { - object = get_freepointer(s, object); - goto next_object; - } - ret = 1; - -out: - if (cnt != bulk_cnt) - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", - bulk_cnt, cnt); - - slab_unlock(slab, &flags2); - spin_unlock_irqrestore(&n->list_lock, flags); - if (!ret) - slab_fix(s, "Object at 0x%p not freed", object); - return ret; -} - /* * Parse a block of slub_debug options. Blocks are delimited by ';' * @@ -1661,7 +1601,7 @@ static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, unsigned long addr) { return 0; } + struct slab *slab, void *object) { return 0; } static inline int free_debug_processing( struct kmem_cache *s, struct slab *slab, @@ -2102,6 +2042,42 @@ static inline void remove_partial(struct kmem_cache_node *n, n->nr_partial--; } +/* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Removes only a single object from the slab + * under slab_lock(), does the alloc_debug_processing() checks and leaves the + * slab on the list, or moves it to full list if it was the last object. 
+ */ +static void *alloc_single_from_partial(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab) +{ + void *object; + unsigned long flags; + + lockdep_assert_held(&n->list_lock); + + slab_lock(slab, &flags); + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse++; + + if (!alloc_debug_processing(s, slab, object)) { + remove_partial(n, slab); + slab_unlock(slab, &flags); + return NULL; + } + + if (slab->inuse == slab->objects) { + remove_partial(n, slab); + add_full(s, n, slab); + } + + slab_unlock(slab, &flags); + + return object; +} + /* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. @@ -2182,6 +2158,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!pfmemalloc_match(slab, gfpflags)) continue; + if (kmem_cache_debug(s)) { + object = alloc_single_from_partial(s, n, slab); + if (object) + break; + continue; + } + t = acquire_slab(s, n, slab, object == NULL); if (!t) break; @@ -2788,6 +2771,104 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) { return atomic_long_read(&n->total_objects); } + +/* Supports checking bulk free of a constructed freelist */ +static noinline int free_debug_processing( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_to_discard = NULL; + void *object = head; + int cnt = 0; + unsigned long flags, flags2; + int ret = 0; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); + + spin_lock_irqsave(&n->list_lock, flags); + slab_lock(slab, &flags2); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, slab)) + goto out; + } + + if (slab->inuse < bulk_cnt) { + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", + slab->inuse, bulk_cnt); + goto out; + } + +next_object: + + if (++cnt > bulk_cnt) + goto out_cnt; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, slab, object, addr)) + goto out; + } + + if (s->flags & SLAB_STORE_USER) + set_track_update(s, object, TRACK_FREE, addr, handle); + trace(s, slab, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ + init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + ret = 1; + +out_cnt: + if (cnt != bulk_cnt) + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", + bulk_cnt, cnt); + +out: + if (ret) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* Do we need to remove the slab from full or partial list? */ + if (!prior) { + remove_full(s, n, slab); + } else if (slab->inuse == 0) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + + /* Do we need to discard the slab or add to partial list? 
*/ + if (slab->inuse == 0) { + slab_to_discard = slab; + } else if (!prior) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + + } + + slab_unlock(slab, &flags2); + spin_unlock_irqrestore(&n->list_lock, flags); + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); + if (slab_to_discard) { + stat(s, FREE_SLAB); + discard_slab(s, slab); + } + + return ret; +} #endif /* CONFIG_SLUB_DEBUG */ #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) @@ -3045,19 +3126,35 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_SLAB); -check_new_slab: - if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, slab, freelist, addr)) { - /* Slab failed checks. Next slab needed */ - goto new_slab; - } else { + if (!alloc_debug_processing(s, slab, freelist)) /* - * For debug case, we don't load freelist so that all - * allocations go through alloc_debug_processing() + * It's not really expected that this would fail on a + * freshly allocated slab, but a concurrent memory + * corruption in theory could cause that. */ - goto return_single; - } + goto new_slab; + + /* + * For debug caches we don't load the freelist so that all + * allocations have to go through alloc_debug_processing() + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + goto return_single; + } + +check_new_slab: + + if (kmem_cache_debug(s)) { + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the tracking info + * and return the object + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + return freelist; } if (unlikely(!pfmemalloc_match(slab, gfpflags))) @@ -3341,9 +3438,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; - if (kmem_cache_debug(s) && - !free_debug_processing(s, slab, head, tail, cnt, addr)) + if (kmem_cache_debug(s)) { + free_debug_processing(s, slab, head, tail, cnt, addr); return; + } do { if (unlikely(n)) { @@ -5625,7 +5723,7 @@ static ssize_t validate_store(struct kmem_cache *s, { int ret = -EINVAL; - if (buf[0] == '1') { + if (buf[0] == '1' && kmem_cache_debug(s)) { ret = validate_slab_cache(s); if (ret >= 0) ret = length;
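To make the race the RFC addresses more concrete, here is a hypothetical, minimal module sketch (not part of any patch in this thread) of the kind of workload that produces the false positives: one kthread constantly allocates and frees from a cache with redzoning and poisoning enabled, while the cache's validate file is written repeatedly from userspace (for instance with the snippet earlier in the thread pointed at /sys/kernel/slab/race_demo/validate). The cache and thread names are made up; the debug flags also keep the cache from being merged, so it gets its own /sys/kernel/slab entry.

/*
 * Hypothetical reproducer sketch: churn a debug-enabled cache so that
 * concurrent validation can observe objects in the middle of a free.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct kmem_cache *demo_cache;
static struct task_struct *churn;

static int churn_fn(void *data)
{
	while (!kthread_should_stop()) {
		void *obj = kmem_cache_alloc(demo_cache, GFP_KERNEL);

		if (obj)
			kmem_cache_free(demo_cache, obj);
		cond_resched();
	}
	return 0;
}

static int __init race_demo_init(void)
{
	/* Debug flags so that validation actually checks redzones/poison. */
	demo_cache = kmem_cache_create("race_demo", 64, 0,
				       SLAB_RED_ZONE | SLAB_POISON |
				       SLAB_STORE_USER, NULL);
	if (!demo_cache)
		return -ENOMEM;

	churn = kthread_run(churn_fn, NULL, "race_demo_churn");
	if (IS_ERR(churn)) {
		kmem_cache_destroy(demo_cache);
		return PTR_ERR(churn);
	}
	return 0;
}

static void __exit race_demo_exit(void)
{
	kthread_stop(churn);
	kmem_cache_destroy(demo_cache);
}

module_init(race_demo_init);
module_exit(race_demo_exit);
MODULE_LICENSE("GPL");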
On 7/18/22 7:09 PM, Vlastimil Babka wrote: > > On 5/29/22 10:15, Rongwei Wang wrote: >> In use cases where allocating and freeing slab frequently, some >> error messages, such as "Left Redzone overwritten", "First byte >> 0xbb instead of 0xcc" would be printed when validating slabs. >> That's because an object has been filled with SLAB_RED_INACTIVE, >> but has not been added to slab's freelist. And between these >> two states, the behaviour of validating slab is likely to occur. >> >> Actually, it doesn't mean the slab can not work stably. But, these >> confusing messages will disturb slab debugging more or less. >> >> Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com> > > As I've said in the sub-thread I had the following kind of fix in mind. I > think it should cover the cases from your patches 1/3 and 3/3. > > ----8<---- > From c35fe2a781a7bc4ef37ef3ded289f4ced82562bd Mon Sep 17 00:00:00 2001 > From: Vlastimil Babka <vbabka@suse.cz> > Date: Mon, 4 Jul 2022 14:09:09 +0200 > Subject: [RFC] mm, slub: restrict sysfs validation to debug caches and make > it safe > > Rongwei Wang reports [1] that cache validation triggered by writing to > /sys/kernel/slab/<cache>/validate is racy against normal cache > operations (e.g. freeing) in a way that can cause false positive > inconsistency reports for caches with debugging enabled. The problem is > that debugging actions that mark object free or active and actual > freelist operations are not atomic, and the validation can see an > inconsistent state. > > For caches that don't have debugging enabled, other races are possible > that result in false reports of wrong slab counts. > > This patch attempts to solve these issues while not adding overhead to > normal (especially fastpath) operations for caches that do not have > debugging enabled, just to make possible userspace-triggered validation > safe. Instead, disable the validation for caches that don't have > debugging enabled and make the sysfs handler return -EINVAL. > > For caches that do have debugging enabled, we can instead extend the > existing approach of not using percpu freelists to force all operations > to the slow paths where debugging is checked for and processed. > > The processing on free in free_debug_processing() already happens under > n->list_lock and slab_lock() so we can extend it to actually do the > freeing as well and thus make it atomic against concurrent validation. > > The processing on alloc in alloc_debug_processing() currently doesn't > take any locks, but we have to first allocate the object from a slab on > the partial list (as percpu slabs are always non-existent) and thus take > n->list_lock. Add a function alloc_single_from_partial() that > additionally takes slab_lock() for the debug processing and then grabs > just the allocated object instead of the whole freelist. This again > makes it atomic against validation and it is also ultimately more > efficient than the current grabbing of freelist immediately followed by > slab deactivation. > > Neither of these changes affect the fast paths. > > The function free_debug_processing() was moved so that it is placed > later than the definitions of add_partial(), remove_partial() and > discard_slab(). 
> > [1] https://lore.kernel.org/all/20220529081535.69275-1-rongwei.wang@linux.alibaba.com/ > > Reported-by: Rongwei Wang <rongwei.wang@linux.alibaba.com> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz> > --- > mm/slub.c | 250 +++++++++++++++++++++++++++++++++++++----------------- > 1 file changed, 174 insertions(+), 76 deletions(-) > > diff --git a/mm/slub.c b/mm/slub.c > index b1281b8654bd..954fe7ad5ee1 100644 > --- a/mm/slub.c > +++ b/mm/slub.c > @@ -1324,17 +1324,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, > } > > static noinline int alloc_debug_processing(struct kmem_cache *s, > - struct slab *slab, > - void *object, unsigned long addr) > + struct slab *slab, void *object) > { > if (s->flags & SLAB_CONSISTENCY_CHECKS) { > if (!alloc_consistency_checks(s, slab, object)) > goto bad; > } > > - /* Success perform special debug activities for allocs */ > - if (s->flags & SLAB_STORE_USER) > - set_track(s, object, TRACK_ALLOC, addr); > + /* Success. Perform special debug activities for allocs */ > trace(s, slab, object, 1); > init_object(s, object, SLUB_RED_ACTIVE); > return 1; > @@ -1385,63 +1382,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, > return 1; > } > > -/* Supports checking bulk free of a constructed freelist */ > -static noinline int free_debug_processing( > - struct kmem_cache *s, struct slab *slab, > - void *head, void *tail, int bulk_cnt, > - unsigned long addr) > -{ > - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); > - void *object = head; > - int cnt = 0; > - unsigned long flags, flags2; > - int ret = 0; > - depot_stack_handle_t handle = 0; > - > - if (s->flags & SLAB_STORE_USER) > - handle = set_track_prepare(); > - > - spin_lock_irqsave(&n->list_lock, flags); > - slab_lock(slab, &flags2); > - > - if (s->flags & SLAB_CONSISTENCY_CHECKS) { > - if (!check_slab(s, slab)) > - goto out; > - } > - > -next_object: > - cnt++; > - > - if (s->flags & SLAB_CONSISTENCY_CHECKS) { > - if (!free_consistency_checks(s, slab, object, addr)) > - goto out; > - } > - > - if (s->flags & SLAB_STORE_USER) > - set_track_update(s, object, TRACK_FREE, addr, handle); > - trace(s, slab, object, 0); > - /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ > - init_object(s, object, SLUB_RED_INACTIVE); > - > - /* Reached end of constructed freelist yet? */ > - if (object != tail) { > - object = get_freepointer(s, object); > - goto next_object; > - } > - ret = 1; > - > -out: > - if (cnt != bulk_cnt) > - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", > - bulk_cnt, cnt); > - > - slab_unlock(slab, &flags2); > - spin_unlock_irqrestore(&n->list_lock, flags); > - if (!ret) > - slab_fix(s, "Object at 0x%p not freed", object); > - return ret; > -} > - > /* > * Parse a block of slub_debug options. Blocks are delimited by ';' > * > @@ -1661,7 +1601,7 @@ static inline > void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} > > static inline int alloc_debug_processing(struct kmem_cache *s, > - struct slab *slab, void *object, unsigned long addr) { return 0; } > + struct slab *slab, void *object) { return 0; } > > static inline int free_debug_processing( > struct kmem_cache *s, struct slab *slab, > @@ -2102,6 +2042,42 @@ static inline void remove_partial(struct kmem_cache_node *n, > n->nr_partial--; > } > > +/* > + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a > + * slab from the n->partial list. 
Removes only a single object from the slab > + * under slab_lock(), does the alloc_debug_processing() checks and leaves the > + * slab on the list, or moves it to full list if it was the last object. > + */ > +static void *alloc_single_from_partial(struct kmem_cache *s, > + struct kmem_cache_node *n, struct slab *slab) > +{ > + void *object; > + unsigned long flags; > + > + lockdep_assert_held(&n->list_lock); > + > + slab_lock(slab, &flags); > + > + object = slab->freelist; > + slab->freelist = get_freepointer(s, object); > + slab->inuse++; > + > + if (!alloc_debug_processing(s, slab, object)) { > + remove_partial(n, slab); > + slab_unlock(slab, &flags); > + return NULL; > + } > + > + if (slab->inuse == slab->objects) { > + remove_partial(n, slab); > + add_full(s, n, slab); > + } > + > + slab_unlock(slab, &flags); > + > + return object; > +} > + > /* > * Remove slab from the partial list, freeze it and > * return the pointer to the freelist. > @@ -2182,6 +2158,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, > if (!pfmemalloc_match(slab, gfpflags)) > continue; > > + if (kmem_cache_debug(s)) { > + object = alloc_single_from_partial(s, n, slab); > + if (object) > + break; > + continue; > + } > + > t = acquire_slab(s, n, slab, object == NULL); > if (!t) > break; > @@ -2788,6 +2771,104 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) > { > return atomic_long_read(&n->total_objects); > } > + > +/* Supports checking bulk free of a constructed freelist */ > +static noinline int free_debug_processing( > + struct kmem_cache *s, struct slab *slab, > + void *head, void *tail, int bulk_cnt, > + unsigned long addr) > +{ > + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); > + struct slab *slab_to_discard = NULL; > + void *object = head; > + int cnt = 0; > + unsigned long flags, flags2; > + int ret = 0; > + depot_stack_handle_t handle = 0; > + > + if (s->flags & SLAB_STORE_USER) > + handle = set_track_prepare(); > + > + spin_lock_irqsave(&n->list_lock, flags); > + slab_lock(slab, &flags2); > + > + if (s->flags & SLAB_CONSISTENCY_CHECKS) { > + if (!check_slab(s, slab)) > + goto out; > + } > + > + if (slab->inuse < bulk_cnt) { > + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", > + slab->inuse, bulk_cnt); > + goto out; > + } > + > +next_object: > + > + if (++cnt > bulk_cnt) > + goto out_cnt; > + > + if (s->flags & SLAB_CONSISTENCY_CHECKS) { > + if (!free_consistency_checks(s, slab, object, addr)) > + goto out; > + } > + > + if (s->flags & SLAB_STORE_USER) > + set_track_update(s, object, TRACK_FREE, addr, handle); > + trace(s, slab, object, 0); > + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ > + init_object(s, object, SLUB_RED_INACTIVE); > + > + /* Reached end of constructed freelist yet? */ > + if (object != tail) { > + object = get_freepointer(s, object); > + goto next_object; > + } > + ret = 1; > + > +out_cnt: > + if (cnt != bulk_cnt) > + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", > + bulk_cnt, cnt); > + > +out: > + if (ret) { > + void *prior = slab->freelist; > + > + /* Perform the actual freeing while we still hold the locks */ > + slab->inuse -= cnt; > + set_freepointer(s, tail, prior); > + slab->freelist = head; > + > + /* Do we need to remove the slab from full or partial list? 
*/ > + if (!prior) { > + remove_full(s, n, slab); > + } else if (slab->inuse == 0) { > + remove_partial(n, slab); > + stat(s, FREE_REMOVE_PARTIAL); > + } > + > + /* Do we need to discard the slab or add to partial list? */ > + if (slab->inuse == 0) { > + slab_to_discard = slab; > + } else if (!prior) { > + add_partial(n, slab, DEACTIVATE_TO_TAIL); > + stat(s, FREE_ADD_PARTIAL); > + } > + > + } > + > + slab_unlock(slab, &flags2); > + spin_unlock_irqrestore(&n->list_lock, flags); > + if (!ret) > + slab_fix(s, "Object at 0x%p not freed", object); > + if (slab_to_discard) { > + stat(s, FREE_SLAB); > + discard_slab(s, slab); > + } > + > + return ret; > +} I had test this patch, and it indeed deal with the bug that I described. Though I am also has prepared this part of code, your code is ok to me. -wrw > #endif /* CONFIG_SLUB_DEBUG */ > > #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) > @@ -3045,19 +3126,35 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, > > stat(s, ALLOC_SLAB); > > -check_new_slab: > - > if (kmem_cache_debug(s)) { > - if (!alloc_debug_processing(s, slab, freelist, addr)) { > - /* Slab failed checks. Next slab needed */ > - goto new_slab; > - } else { > + if (!alloc_debug_processing(s, slab, freelist)) > /* > - * For debug case, we don't load freelist so that all > - * allocations go through alloc_debug_processing() > + * It's not really expected that this would fail on a > + * freshly allocated slab, but a concurrent memory > + * corruption in theory could cause that. > */ > - goto return_single; > - } > + goto new_slab; > + > + /* > + * For debug caches we don't load the freelist so that all > + * allocations have to go through alloc_debug_processing() > + */ > + if (s->flags & SLAB_STORE_USER) > + set_track(s, freelist, TRACK_ALLOC, addr); > + goto return_single; > + } > + > +check_new_slab: > + > + if (kmem_cache_debug(s)) { > + /* > + * For debug caches here we had to go through > + * alloc_single_from_partial() so just store the tracking info > + * and return the object > + */ > + if (s->flags & SLAB_STORE_USER) > + set_track(s, freelist, TRACK_ALLOC, addr); > + return freelist; > } > > if (unlikely(!pfmemalloc_match(slab, gfpflags))) > @@ -3341,9 +3438,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > if (kfence_free(head)) > return; > > - if (kmem_cache_debug(s) && > - !free_debug_processing(s, slab, head, tail, cnt, addr)) > + if (kmem_cache_debug(s)) { > + free_debug_processing(s, slab, head, tail, cnt, addr); > return; > + } > > do { > if (unlikely(n)) { > @@ -5625,7 +5723,7 @@ static ssize_t validate_store(struct kmem_cache *s, > { > int ret = -EINVAL; > > - if (buf[0] == '1') { > + if (buf[0] == '1' && kmem_cache_debug(s)) { > ret = validate_slab_cache(s); > if (ret >= 0) > ret = length;
On 7/19/22 16:15, Rongwei Wang wrote: > ... >> + >> + slab_unlock(slab, &flags2); >> + spin_unlock_irqrestore(&n->list_lock, flags); >> + if (!ret) >> + slab_fix(s, "Object at 0x%p not freed", object); >> + if (slab_to_discard) { >> + stat(s, FREE_SLAB); >> + discard_slab(s, slab); >> + } >> + >> + return ret; >> +} > I had test this patch, and it indeed deal with the bug that I described. Thanks. > Though I am also has prepared this part of code, your code is ok to me. Aha, feel free to post your version, maybe it's simpler? We can compare.
On 7/19/22 10:21 PM, Vlastimil Babka wrote: > On 7/19/22 16:15, Rongwei Wang wrote: >> > ... >>> + >>> + slab_unlock(slab, &flags2); >>> + spin_unlock_irqrestore(&n->list_lock, flags); >>> + if (!ret) >>> + slab_fix(s, "Object at 0x%p not freed", object); >>> + if (slab_to_discard) { >>> + stat(s, FREE_SLAB); >>> + discard_slab(s, slab); >>> + } >>> + >>> + return ret; >>> +} >> I had test this patch, and it indeed deal with the bug that I described. > > Thanks. > >> Though I am also has prepared this part of code, your code is ok to me. > > Aha, feel free to post your version, maybe it's simpler? We can compare. My code only covers the part corresponding to your free_debug_processing(); its structure looks like: slab_free() { if (kmem_cache_debug(s)) slab_free_debug(); else __do_slab_free(); } The __slab_free_debug() here is similar to your free_debug_processing(). +/* + * Slow path handling for debugging. + */ +static void __slab_free_debug(struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int cnt, + unsigned long addr) + +{ + void *prior; + int was_frozen; + struct slab new; + unsigned long counters; + struct kmem_cache_node *n = NULL; + unsigned long flags; + int ret; + + stat(s, FREE_SLOWPATH); + + if (kfence_free(head)) + return; + + n = get_node(s, slab_nid(slab)); + + spin_lock_irqsave(&n->list_lock, flags); + ret = free_debug_processing(s, slab, head, tail, cnt, addr); + if (!ret) { + spin_unlock_irqrestore(&n->list_lock, flags); + return; + } + + do { + prior = slab->freelist; + counters = slab->counters; + set_freepointer(s, tail, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse -= cnt; + } while (!cmpxchg_double_slab(s, slab, + prior, counters, + head, new.counters, + "__slab_free")); + + if ((new.inuse && prior) || was_frozen) { + spin_unlock_irqrestore(&n->list_lock, flags); + if (likely(was_frozen)) { + stat(s, FREE_FROZEN); + } + + return; + } + + if (!new.inuse && n->nr_partial >= s->min_partial) { + /* Indicate no user in this slab, discarding it naturally. */ + if (prior) { + /* Slab on the partial list. */ + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } else { + /* Slab must be on the full list */ + remove_full(s, n, slab); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, FREE_SLAB); + discard_slab(s, slab); + return; + } + + /* + * Objects left in the slab. If it was not on the partial list before + * then add it. + */ + if (!prior) { + remove_full(s, n, slab); + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + spin_unlock_irqrestore(&n->list_lock, flags); + + return; +}
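Both versions assume the same top-level split: debug caches are unconditionally diverted to a fully locked slow path, while all other caches keep the existing cmpxchg-based fast and slow paths. Roughly, as a sketch only (apart from kmem_cache_debug() the function names follow the thread's own pseudocode and are illustrative, not the actual kernel code):

/* Sketch of the free-path dispatch both approaches rely on. */
static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
				      void *head, void *tail, int cnt,
				      unsigned long addr)
{
	if (kmem_cache_debug(s))
		__slab_free_debug(s, slab, head, tail, cnt, addr);
	else
		__do_slab_free(s, slab, head, tail, cnt, addr);
}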
diff --git a/mm/slub.c b/mm/slub.c index ed5c2c03a47a..310e56d99116 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing( void *head, void *tail, int bulk_cnt, unsigned long addr) { - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); void *object = head; int cnt = 0; - unsigned long flags, flags2; + unsigned long flags; int ret = 0; - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(slab, &flags2); - + slab_lock(slab, &flags); if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!check_slab(s, slab)) goto out; @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing( slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", bulk_cnt, cnt); - slab_unlock(slab, &flags2); - spin_unlock_irqrestore(&n->list_lock, flags); + slab_unlock(slab, &flags); if (!ret) slab_fix(s, "Object at 0x%p not freed", object); return ret; @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, { void *prior; - int was_frozen; + int was_frozen, to_take_off = 0; struct slab new; unsigned long counters; struct kmem_cache_node *n = NULL; @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; + n = get_node(s, slab_nid(slab)); + spin_lock_irqsave(&n->list_lock, flags); + if (kmem_cache_debug(s) && - !free_debug_processing(s, slab, head, tail, cnt, addr)) + !free_debug_processing(s, slab, head, tail, cnt, addr)) { + + spin_unlock_irqrestore(&n->list_lock, flags); return; + } do { - if (unlikely(n)) { - spin_unlock_irqrestore(&n->list_lock, flags); - n = NULL; - } + if (unlikely(to_take_off)) + to_take_off = 0; prior = slab->freelist; counters = slab->counters; set_freepointer(s, tail, prior); @@ -3343,18 +3343,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, new.frozen = 1; } else { /* Needs to be taken off a list */ - - n = get_node(s, slab_nid(slab)); /* - * Speculatively acquire the list_lock. * If the cmpxchg does not succeed then we may - * drop the list_lock without any processing. - * - * Otherwise the list_lock will synchronize with - * other processors updating the list of slabs. + * drop this behavior without any processing. */ - spin_lock_irqsave(&n->list_lock, flags); - + to_take_off = 1; } } @@ -3363,8 +3356,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, head, new.counters, "__slab_free")); - if (likely(!n)) { + if (likely(!to_take_off)) { + spin_unlock_irqrestore(&n->list_lock, flags); if (likely(was_frozen)) { /* * The list lock was not taken therefore no list