Message ID | 20210505154613.17214-3-longman@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | mm: memcg/slab: Fix objcg pointer array handling problem | expand |
On 5/5/21 5:46 PM, Waiman Long wrote: > There are currently two problems in the way the objcg pointer array > (memcg_data) in the page structure is being allocated and freed. > > On its allocation, it is possible that the allocated objcg pointer > array comes from the same slab that requires memory accounting. If this > happens, the slab will never become empty again as there is at least > one object left (the obj_cgroup array) in the slab. > > When it is freed, the objcg pointer array object may be the last one > in its slab and hence causes kfree() to be called again. With the > right workload, the slab cache may be set up in a way that allows the > recursive kfree() calling loop to nest deep enough to cause a kernel > stack overflow and panic the system. > > One way to solve this problem is to split the kmalloc-<n> caches > (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n> > (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of > kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All > the other caches can still allow a mix of accounted and non-accounted > objects. > > With this change, all the objcg pointer array objects will come from > KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So > both the recursive kfree() problem and non-freeable slab problem are > gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer > have mixed accounted and unaccounted objects, this will slightly reduce > the number of objcg pointer arrays that need to be allocated and save > a bit of memory. > > The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and > KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches() > will include the newly added caches without change. 
> > Suggested-by: Vlastimil Babka <vbabka@suse.cz> > Signed-off-by: Waiman Long <longman@redhat.com> > --- > include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++-------- > mm/slab_common.c | 23 +++++++++++++++-------- > 2 files changed, 49 insertions(+), 16 deletions(-) > > diff --git a/include/linux/slab.h b/include/linux/slab.h > index 0c97d788762c..f2d9ebc34f5c 100644 > --- a/include/linux/slab.h > +++ b/include/linux/slab.h > @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, > /* > * Whenever changing this, take care of that kmalloc_type() and > * create_kmalloc_caches() still work as intended. > + * > + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP > + * is for accounted objects only. All the other kmem caches can have both > + * accounted and non-accounted objects. > */ > enum kmalloc_cache_type { > KMALLOC_NORMAL = 0, > +#ifdef CONFIG_MEMCG_KMEM > + KMALLOC_CGROUP, > +#endif > KMALLOC_RECLAIM, > #ifdef CONFIG_ZONE_DMA > KMALLOC_DMA, > @@ -315,28 +322,47 @@ enum kmalloc_cache_type { > NR_KMALLOC_TYPES > }; > > +#ifndef CONFIG_MEMCG_KMEM > +#define KMALLOC_CGROUP KMALLOC_NORMAL > +#endif > +#ifndef CONFIG_ZONE_DMA > +#define KMALLOC_DMA KMALLOC_NORMAL > +#endif You could move this to the enum definition itself? E.g.: #ifdef CONFIG_MEMCG_KMEM KMALLOC_CGROUP, #else KMALLOC_CGROUP = KMALLOC_NORMAL, #endif > + > #ifndef CONFIG_SLOB > extern struct kmem_cache * > kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; > > +/* > + * Define gfp bits that should not be set for KMALLOC_NORMAL. > + */ > +#define KMALLOC_NOT_NORMAL_BITS \ > + (__GFP_RECLAIMABLE | \ > + (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ > + (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) > + > static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) > { > -#ifdef CONFIG_ZONE_DMA > /* > * The most common case is KMALLOC_NORMAL, so test for it > * with a single branch for both flags. 
Not "both flags" anymore. Something like "so test with a single branch that there are none of the flags that would select a different type" > */ > - if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0)) > + if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) > return KMALLOC_NORMAL; > > /* > - * At least one of the flags has to be set. If both are, __GFP_DMA > - * is more important. > + * At least one of the flags has to be set. Their priorities in > + * decreasing order are: > + * 1) __GFP_DMA > + * 2) __GFP_RECLAIMABLE > + * 3) __GFP_ACCOUNT > */ > - return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM; > -#else > - return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL; > -#endif > + if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) > + return KMALLOC_DMA; > + if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) > + return KMALLOC_RECLAIM; > + else > + return KMALLOC_CGROUP; > } Works for me this way, thanks. > > /* > diff --git a/mm/slab_common.c b/mm/slab_common.c > index f8833d3e5d47..d750e3ba7af5 100644 > --- a/mm/slab_common.c > +++ b/mm/slab_common.c > @@ -727,21 +727,25 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) > } > > #ifdef CONFIG_ZONE_DMA > -#define INIT_KMALLOC_INFO(__size, __short_size) \ > -{ \ > - .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ > - .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ > - .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \ > - .size = __size, \ > -} > +#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz, > +#else > +#define KMALLOC_DMA_NAME(sz) > +#endif > + > +#ifdef CONFIG_MEMCG_KMEM > +#define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz, > #else > +#define KMALLOC_CGROUP_NAME(sz) > +#endif > + > #define INIT_KMALLOC_INFO(__size, __short_size) \ > { \ > .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ > .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ > + KMALLOC_CGROUP_NAME(__short_size) \ > + 
KMALLOC_DMA_NAME(__short_size) \ > .size = __size, \ > } > -#endif > > /* > * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. > @@ -847,6 +851,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) > int i; > enum kmalloc_cache_type type; > > + /* > + * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined > + */ > for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { > for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { > if (!kmalloc_caches[type][i]) >
On Wed, May 5, 2021 at 8:47 AM Waiman Long <longman@redhat.com> wrote: > > There are currently two problems in the way the objcg pointer array > (memcg_data) in the page structure is being allocated and freed. > > On its allocation, it is possible that the allocated objcg pointer > array comes from the same slab that requires memory accounting. If this > happens, the slab will never become empty again as there is at least > one object left (the obj_cgroup array) in the slab. > > When it is freed, the objcg pointer array object may be the last one > in its slab and hence causes kfree() to be called again. With the > right workload, the slab cache may be set up in a way that allows the > recursive kfree() calling loop to nest deep enough to cause a kernel > stack overflow and panic the system. > > One way to solve this problem is to split the kmalloc-<n> caches > (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n> > (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of > kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All > the other caches can still allow a mix of accounted and non-accounted > objects. > > With this change, all the objcg pointer array objects will come from > KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So > both the recursive kfree() problem and non-freeable slab problem are > gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer > have mixed accounted and unaccounted objects, this will slightly reduce > the number of objcg pointer arrays that need to be allocated and save > a bit of memory. > > The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and > KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches() > will include the newly added caches without change. 
> > Suggested-by: Vlastimil Babka <vbabka@suse.cz> > Signed-off-by: Waiman Long <longman@redhat.com> One nit below and after incorporating Vlastimil's suggestions: Reviewed-by: Shakeel Butt <shakeelb@google.com> > --- > include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++-------- > mm/slab_common.c | 23 +++++++++++++++-------- > 2 files changed, 49 insertions(+), 16 deletions(-) > > diff --git a/include/linux/slab.h b/include/linux/slab.h > index 0c97d788762c..f2d9ebc34f5c 100644 > --- a/include/linux/slab.h > +++ b/include/linux/slab.h > @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, > /* > * Whenever changing this, take care of that kmalloc_type() and > * create_kmalloc_caches() still work as intended. > + * > + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP > + * is for accounted objects only. I think you can say "KMALLOC_CGROUP is for accounted and unreclaimable objects only".
On 5/5/21 12:06 PM, Vlastimil Babka wrote: > On 5/5/21 5:46 PM, Waiman Long wrote: >> There are currently two problems in the way the objcg pointer array >> (memcg_data) in the page structure is being allocated and freed. >> >> On its allocation, it is possible that the allocated objcg pointer >> array comes from the same slab that requires memory accounting. If this >> happens, the slab will never become empty again as there is at least >> one object left (the obj_cgroup array) in the slab. >> >> When it is freed, the objcg pointer array object may be the last one >> in its slab and hence causes kfree() to be called again. With the >> right workload, the slab cache may be set up in a way that allows the >> recursive kfree() calling loop to nest deep enough to cause a kernel >> stack overflow and panic the system. >> >> One way to solve this problem is to split the kmalloc-<n> caches >> (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n> >> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of >> kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All >> the other caches can still allow a mix of accounted and non-accounted >> objects. >> >> With this change, all the objcg pointer array objects will come from >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So >> both the recursive kfree() problem and non-freeable slab problem are >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer >> have mixed accounted and unaccounted objects, this will slightly reduce >> the number of objcg pointer arrays that need to be allocated and save >> a bit of memory. >> >> The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and >> KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches() >> will include the newly added caches without change. 
>> >> Suggested-by: Vlastimil Babka <vbabka@suse.cz> >> Signed-off-by: Waiman Long <longman@redhat.com> >> --- >> include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++-------- >> mm/slab_common.c | 23 +++++++++++++++-------- >> 2 files changed, 49 insertions(+), 16 deletions(-) >> >> diff --git a/include/linux/slab.h b/include/linux/slab.h >> index 0c97d788762c..f2d9ebc34f5c 100644 >> --- a/include/linux/slab.h >> +++ b/include/linux/slab.h >> @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, >> /* >> * Whenever changing this, take care of that kmalloc_type() and >> * create_kmalloc_caches() still work as intended. >> + * >> + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP >> + * is for accounted objects only. All the other kmem caches can have both >> + * accounted and non-accounted objects. >> */ >> enum kmalloc_cache_type { >> KMALLOC_NORMAL = 0, >> +#ifdef CONFIG_MEMCG_KMEM >> + KMALLOC_CGROUP, >> +#endif >> KMALLOC_RECLAIM, >> #ifdef CONFIG_ZONE_DMA >> KMALLOC_DMA, >> @@ -315,28 +322,47 @@ enum kmalloc_cache_type { >> NR_KMALLOC_TYPES >> }; >> >> +#ifndef CONFIG_MEMCG_KMEM >> +#define KMALLOC_CGROUP KMALLOC_NORMAL >> +#endif >> +#ifndef CONFIG_ZONE_DMA >> +#define KMALLOC_DMA KMALLOC_NORMAL >> +#endif > You could move this to the enum definition itself? E.g.: > > #ifdef CONFIG_MEMCG_KMEM > KMALLOC_CGROUP, > #else > KMALLOC_CGROUP = KMALLOC_NORMAL, > #endif > >> + >> #ifndef CONFIG_SLOB >> extern struct kmem_cache * >> kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; >> >> +/* >> + * Define gfp bits that should not be set for KMALLOC_NORMAL. >> + */ >> +#define KMALLOC_NOT_NORMAL_BITS \ >> + (__GFP_RECLAIMABLE | \ >> + (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ >> + (IS_ENABLED(CONFIG_MEMCG_KMEM) ? 
__GFP_ACCOUNT : 0)) >> + >> static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) >> { >> -#ifdef CONFIG_ZONE_DMA >> /* >> * The most common case is KMALLOC_NORMAL, so test for it >> * with a single branch for both flags. > Not "both flags" anymore. Something like "so test with a single branch that > there are none of the flags that would select a different type" Right. I just left the comment there without taking a deeper look. My bad. Cheers, Longman
On 5/5/21 12:17 PM, Shakeel Butt wrote: > On Wed, May 5, 2021 at 8:47 AM Waiman Long <longman@redhat.com> wrote: >> There are currently two problems in the way the objcg pointer array >> (memcg_data) in the page structure is being allocated and freed. >> >> On its allocation, it is possible that the allocated objcg pointer >> array comes from the same slab that requires memory accounting. If this >> happens, the slab will never become empty again as there is at least >> one object left (the obj_cgroup array) in the slab. >> >> When it is freed, the objcg pointer array object may be the last one >> in its slab and hence causes kfree() to be called again. With the >> right workload, the slab cache may be set up in a way that allows the >> recursive kfree() calling loop to nest deep enough to cause a kernel >> stack overflow and panic the system. >> >> One way to solve this problem is to split the kmalloc-<n> caches >> (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n> >> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of >> kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All >> the other caches can still allow a mix of accounted and non-accounted >> objects. >> >> With this change, all the objcg pointer array objects will come from >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So >> both the recursive kfree() problem and non-freeable slab problem are >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer >> have mixed accounted and unaccounted objects, this will slightly reduce >> the number of objcg pointer arrays that need to be allocated and save >> a bit of memory. >> >> The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and >> KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches() >> will include the newly added caches without change. 
>> >> Suggested-by: Vlastimil Babka <vbabka@suse.cz> >> Signed-off-by: Waiman Long <longman@redhat.com> > One nit below and after incorporating Vlastimil's suggestions: > > Reviewed-by: Shakeel Butt <shakeelb@google.com> > >> --- >> include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++-------- >> mm/slab_common.c | 23 +++++++++++++++-------- >> 2 files changed, 49 insertions(+), 16 deletions(-) >> >> diff --git a/include/linux/slab.h b/include/linux/slab.h >> index 0c97d788762c..f2d9ebc34f5c 100644 >> --- a/include/linux/slab.h >> +++ b/include/linux/slab.h >> @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, >> /* >> * Whenever changing this, take care of that kmalloc_type() and >> * create_kmalloc_caches() still work as intended. >> + * >> + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP >> + * is for accounted objects only. > I think you can say "KMALLOC_CGROUP is for accounted and unreclaimable > objects only". > Thanks for the suggestion. Will incorporate that. Cheers, Longman
On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: > There are currently two problems in the way the objcg pointer array > (memcg_data) in the page structure is being allocated and freed. > > On its allocation, it is possible that the allocated objcg pointer > array comes from the same slab that requires memory accounting. If this > happens, the slab will never become empty again as there is at least > one object left (the obj_cgroup array) in the slab. > > When it is freed, the objcg pointer array object may be the last one > in its slab and hence causes kfree() to be called again. With the > right workload, the slab cache may be set up in a way that allows the > recursive kfree() calling loop to nest deep enough to cause a kernel > stack overflow and panic the system. > > One way to solve this problem is to split the kmalloc-<n> caches > (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n> > (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of > kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All > the other caches can still allow a mix of accounted and non-accounted > objects. I agree that it's likely the best approach here. Thanks for discovering and fixing the problem! > > With this change, all the objcg pointer array objects will come from > KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So > both the recursive kfree() problem and non-freeable slab problem are > gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer > have mixed accounted and unaccounted objects, this will slightly reduce > the number of objcg pointer arrays that need to be allocated and save > a bit of memory. Unfortunately the positive effect of this change will be likely reversed by a lower utilization due to a larger number of caches. Btw, I wonder if we also need a change in the slab caches merging procedure? 
KMALLOC_NORMAL caches should not be merged with caches which can potentially include accounted objects. > > The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and > KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches() > will include the newly added caches without change. > > Suggested-by: Vlastimil Babka <vbabka@suse.cz> > Signed-off-by: Waiman Long <longman@redhat.com> > --- > include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++-------- > mm/slab_common.c | 23 +++++++++++++++-------- > 2 files changed, 49 insertions(+), 16 deletions(-) > > diff --git a/include/linux/slab.h b/include/linux/slab.h > index 0c97d788762c..f2d9ebc34f5c 100644 > --- a/include/linux/slab.h > +++ b/include/linux/slab.h > @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, > /* > * Whenever changing this, take care of that kmalloc_type() and > * create_kmalloc_caches() still work as intended. > + * > + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP > + * is for accounted objects only. All the other kmem caches can have both > + * accounted and non-accounted objects. > */ > enum kmalloc_cache_type { > KMALLOC_NORMAL = 0, > +#ifdef CONFIG_MEMCG_KMEM > + KMALLOC_CGROUP, > +#endif > KMALLOC_RECLAIM, > #ifdef CONFIG_ZONE_DMA > KMALLOC_DMA, > @@ -315,28 +322,47 @@ enum kmalloc_cache_type { > NR_KMALLOC_TYPES > }; > > +#ifndef CONFIG_MEMCG_KMEM > +#define KMALLOC_CGROUP KMALLOC_NORMAL > +#endif > +#ifndef CONFIG_ZONE_DMA > +#define KMALLOC_DMA KMALLOC_NORMAL > +#endif > + > #ifndef CONFIG_SLOB > extern struct kmem_cache * > kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; > > +/* > + * Define gfp bits that should not be set for KMALLOC_NORMAL. > + */ > +#define KMALLOC_NOT_NORMAL_BITS \ > + (__GFP_RECLAIMABLE | \ > + (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ > + (IS_ENABLED(CONFIG_MEMCG_KMEM) ? 
__GFP_ACCOUNT : 0)) > + > static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) > { > -#ifdef CONFIG_ZONE_DMA > /* > * The most common case is KMALLOC_NORMAL, so test for it > * with a single branch for both flags. > */ > - if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0)) > + if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) > return KMALLOC_NORMAL; Likely KMALLOC_CGROUP is also very popular, so maybe we want to change the optimization here a bit. > > /* > - * At least one of the flags has to be set. If both are, __GFP_DMA > - * is more important. > + * At least one of the flags has to be set. Their priorities in > + * decreasing order are: > + * 1) __GFP_DMA > + * 2) __GFP_RECLAIMABLE > + * 3) __GFP_ACCOUNT > */ > - return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM; > -#else > - return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL; > -#endif > + if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) > + return KMALLOC_DMA; > + if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) > + return KMALLOC_RECLAIM; > + else > + return KMALLOC_CGROUP; > } > > /* > diff --git a/mm/slab_common.c b/mm/slab_common.c > index f8833d3e5d47..d750e3ba7af5 100644 > --- a/mm/slab_common.c > +++ b/mm/slab_common.c > @@ -727,21 +727,25 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) > } > > #ifdef CONFIG_ZONE_DMA > -#define INIT_KMALLOC_INFO(__size, __short_size) \ > -{ \ > - .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ > - .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ > - .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \ > - .size = __size, \ > -} > +#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz, > +#else > +#define KMALLOC_DMA_NAME(sz) > +#endif > + > +#ifdef CONFIG_MEMCG_KMEM > +#define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz, > #else > +#define KMALLOC_CGROUP_NAME(sz) > +#endif > + > #define INIT_KMALLOC_INFO(__size, __short_size) \ 
> { \ > .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ > .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ > + KMALLOC_CGROUP_NAME(__short_size) \ > + KMALLOC_DMA_NAME(__short_size) \ > .size = __size, \ > } > -#endif > > /* > * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. > @@ -847,6 +851,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) > int i; > enum kmalloc_cache_type type; > > + /* > + * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined > + */ > for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { > for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { > if (!kmalloc_caches[type][i]) > -- > 2.18.1 >
On 5/5/21 7:30 PM, Roman Gushchin wrote: > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: >> >> With this change, all the objcg pointer array objects will come from >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So >> both the recursive kfree() problem and non-freeable slab problem are >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer >> have mixed accounted and unaccounted objects, this will slightly reduce >> the number of objcg pointer arrays that need to be allocated and save >> a bit of memory. > > Unfortunately the positive effect of this change will be likely > reversed by a lower utilization due to a larger number of caches. > > Btw, I wonder if we also need a change in the slab caches merging procedure? > KMALLOC_NORMAL caches should not be merged with caches which can potentially > include accounted objects. Good point. But it looks like kmalloc* caches are exempt from all merging in create_boot_cache() via s->refcount = -1; /* Exempt from merging for now */ It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag to prevent accidental merging in case the above is ever removed. It would also better reflect reality, and ensure that the array is allocated immediately with the page, AFAICS.
On 5/5/21 1:30 PM, Roman Gushchin wrote: > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: >> There are currently two problems in the way the objcg pointer array >> (memcg_data) in the page structure is being allocated and freed. >> >> On its allocation, it is possible that the allocated objcg pointer >> array comes from the same slab that requires memory accounting. If this >> happens, the slab will never become empty again as there is at least >> one object left (the obj_cgroup array) in the slab. >> >> When it is freed, the objcg pointer array object may be the last one >> in its slab and hence causes kfree() to be called again. With the >> right workload, the slab cache may be set up in a way that allows the >> recursive kfree() calling loop to nest deep enough to cause a kernel >> stack overflow and panic the system. >> >> One way to solve this problem is to split the kmalloc-<n> caches >> (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n> >> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of >> kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All >> the other caches can still allow a mix of accounted and non-accounted >> objects. > I agree that it's likely the best approach here. Thanks for discovering > and fixing the problem! > >> With this change, all the objcg pointer array objects will come from >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So >> both the recursive kfree() problem and non-freeable slab problem are >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer >> have mixed accounted and unaccounted objects, this will slightly reduce >> the number of objcg pointer arrays that need to be allocated and save >> a bit of memory. > Unfortunately the positive effect of this change will be likely > reversed by a lower utilization due to a larger number of caches. That is also true, will mention that. 
> > Btw, I wonder if we also need a change in the slab caches merging procedure? > KMALLOC_NORMAL caches should not be merged with caches which can potentially > include accounted objects. Thanks for catching this omission. I will take a look and modify the merging procedure in a new patch. Accounting is usually specified at kmem_cache_create() time. Though, I did find one instance of setting ACCOUNT flag in kmem_cache_alloc(), I will ignore this case and merge accounted, but unreclaimable caches to KMALLOC_CGROUP. > >> The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and >> KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches() >> will include the newly added caches without change. >> >> Suggested-by: Vlastimil Babka <vbabka@suse.cz> >> Signed-off-by: Waiman Long <longman@redhat.com> >> --- >> include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++-------- >> mm/slab_common.c | 23 +++++++++++++++-------- >> 2 files changed, 49 insertions(+), 16 deletions(-) >> >> diff --git a/include/linux/slab.h b/include/linux/slab.h >> index 0c97d788762c..f2d9ebc34f5c 100644 >> --- a/include/linux/slab.h >> +++ b/include/linux/slab.h >> @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, >> /* >> * Whenever changing this, take care of that kmalloc_type() and >> * create_kmalloc_caches() still work as intended. >> + * >> + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP >> + * is for accounted objects only. All the other kmem caches can have both >> + * accounted and non-accounted objects. 
>> */ >> enum kmalloc_cache_type { >> KMALLOC_NORMAL = 0, >> +#ifdef CONFIG_MEMCG_KMEM >> + KMALLOC_CGROUP, >> +#endif >> KMALLOC_RECLAIM, >> #ifdef CONFIG_ZONE_DMA >> KMALLOC_DMA, >> @@ -315,28 +322,47 @@ enum kmalloc_cache_type { >> NR_KMALLOC_TYPES >> }; >> >> +#ifndef CONFIG_MEMCG_KMEM >> +#define KMALLOC_CGROUP KMALLOC_NORMAL >> +#endif >> +#ifndef CONFIG_ZONE_DMA >> +#define KMALLOC_DMA KMALLOC_NORMAL >> +#endif >> + >> #ifndef CONFIG_SLOB >> extern struct kmem_cache * >> kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; >> >> +/* >> + * Define gfp bits that should not be set for KMALLOC_NORMAL. >> + */ >> +#define KMALLOC_NOT_NORMAL_BITS \ >> + (__GFP_RECLAIMABLE | \ >> + (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ >> + (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) >> + >> static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) >> { >> -#ifdef CONFIG_ZONE_DMA >> /* >> * The most common case is KMALLOC_NORMAL, so test for it >> * with a single branch for both flags. >> */ >> - if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0)) >> + if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) >> return KMALLOC_NORMAL; > Likely KMALLOC_CGROUP is also very popular, so maybe we want to change the > optimization here a bit. I doubt this optimization is really noticeable and whether KMALLOC_CGROUP is really popular will depend on the workloads. I am not planning to spend additional time to micro-optimize this part of the code. Cheers, Longman
On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote: > On 5/5/21 7:30 PM, Roman Gushchin wrote: > > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: > >> > >> With this change, all the objcg pointer array objects will come from > >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So > >> both the recursive kfree() problem and non-freeable slab problem are > >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer > >> have mixed accounted and unaccounted objects, this will slightly reduce > >> the number of objcg pointer arrays that need to be allocated and save > >> a bit of memory. > > > > Unfortunately the positive effect of this change will be likely > > reversed by a lower utilization due to a larger number of caches. > > > > Btw, I wonder if we also need a change in the slab caches merging procedure? > > KMALLOC_NORMAL caches should not be merged with caches which can potentially > > include accounted objects. > > Good point. But looks like kmalloc* caches are exempt from all merging in > create_boot_cache() via > > s->refcount = -1; /* Exempt from merging for now */ Oh, interesting... I wonder if there is (still) a good reason for that? Maybe we can remove this limitation and save some memory? > > It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag > to prevent accidental merging in case the above is ever removed. It would also > better reflect reality, and ensure that the array is allocated immediately with > the page, AFAICS. That wouldn't be enough, because a !SLAB_ACCOUNT cache can still have accounted allocations and be merged with kmalloc-* cache. What we might wanna do is to keep the no-merging rule for kmalloc-*, but relax it for kmalloc-cg-* caches. But we can do it later, as a separate change. Thanks!
On Wed, May 05, 2021 at 02:11:52PM -0400, Waiman Long wrote: > On 5/5/21 1:30 PM, Roman Gushchin wrote: > > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: > > > There are currently two problems in the way the objcg pointer array > > > (memcg_data) in the page structure is being allocated and freed. > > > > > > On its allocation, it is possible that the allocated objcg pointer > > > array comes from the same slab that requires memory accounting. If this > > > happens, the slab will never become empty again as there is at least > > > one object left (the obj_cgroup array) in the slab. > > > > > > When it is freed, the objcg pointer array object may be the last one > > > in its slab and hence causes kfree() to be called again. With the > > > right workload, the slab cache may be set up in a way that allows the > > > recursive kfree() calling loop to nest deep enough to cause a kernel > > > stack overflow and panic the system. > > > > > > One way to solve this problem is to split the kmalloc-<n> caches > > > (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n> > > > (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of > > > kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All > > > the other caches can still allow a mix of accounted and non-accounted > > > objects. > > I agree that it's likely the best approach here. Thanks for discovering > > and fixing the problem! > > > > > With this change, all the objcg pointer array objects will come from > > > KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So > > > both the recursive kfree() problem and non-freeable slab problem are > > > gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer > > > have mixed accounted and unaccounted objects, this will slightly reduce > > > the number of objcg pointer arrays that need to be allocated and save > > > a bit of memory. 
> > Unfortunately the positive effect of this change will be likely > > reversed by a lower utilization due to a larger number of caches. > > That is also true, will mention that. Thanks! > > > > > Btw, I wonder if we also need a change in the slab caches merging procedure? > > KMALLOC_NORMAL caches should not be merged with caches which can potentially > > include accounted objects. > > Thank for catching this omission. > > I will take a look and modify the merging procedure in a new patch. > Accounting is usually specified at kmem_cache_create() time. Though, I did > find one instance of setting ACCOUNT flag in kmem_cache_alloc(), I will > ignore this case and merge accounted, but unreclaimable caches to > KMALLOC_CGROUP. Vlastimil pointed out that it's not an actual problem, because kmalloc caches are exempt from the merging. Please, add a comment about it into the commit log/code. We might wanna relax this rule for kmalloc-cg-*, but we can do it later. > > > > > > The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and > > > KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches() > > > will include the newly added caches without change. > > > > > > Suggested-by: Vlastimil Babka <vbabka@suse.cz> > > > Signed-off-by: Waiman Long <longman@redhat.com> > > > --- > > > include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++-------- > > > mm/slab_common.c | 23 +++++++++++++++-------- > > > 2 files changed, 49 insertions(+), 16 deletions(-) > > > > > > diff --git a/include/linux/slab.h b/include/linux/slab.h > > > index 0c97d788762c..f2d9ebc34f5c 100644 > > > --- a/include/linux/slab.h > > > +++ b/include/linux/slab.h > > > @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, > > > /* > > > * Whenever changing this, take care of that kmalloc_type() and > > > * create_kmalloc_caches() still work as intended. 
> > > + * > > > + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP > > > + * is for accounted objects only. All the other kmem caches can have both > > > + * accounted and non-accounted objects. > > > */ > > > enum kmalloc_cache_type { > > > KMALLOC_NORMAL = 0, > > > +#ifdef CONFIG_MEMCG_KMEM > > > + KMALLOC_CGROUP, > > > +#endif > > > KMALLOC_RECLAIM, > > > #ifdef CONFIG_ZONE_DMA > > > KMALLOC_DMA, > > > @@ -315,28 +322,47 @@ enum kmalloc_cache_type { > > > NR_KMALLOC_TYPES > > > }; > > > +#ifndef CONFIG_MEMCG_KMEM > > > +#define KMALLOC_CGROUP KMALLOC_NORMAL > > > +#endif > > > +#ifndef CONFIG_ZONE_DMA > > > +#define KMALLOC_DMA KMALLOC_NORMAL > > > +#endif > > > + > > > #ifndef CONFIG_SLOB > > > extern struct kmem_cache * > > > kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; > > > +/* > > > + * Define gfp bits that should not be set for KMALLOC_NORMAL. > > > + */ > > > +#define KMALLOC_NOT_NORMAL_BITS \ > > > + (__GFP_RECLAIMABLE | \ > > > + (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ > > > + (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) > > > + > > > static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) > > > { > > > -#ifdef CONFIG_ZONE_DMA > > > /* > > > * The most common case is KMALLOC_NORMAL, so test for it > > > * with a single branch for both flags. > > > */ > > > - if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0)) > > > + if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) > > > return KMALLOC_NORMAL; > > Likely KMALLOC_CGROUP is also very popular, so maybe we want to change the > > optimization here a bit. > > I doubt this optimization is really noticeable and whether KMALLOC_CGROUP is > really popular will depend on the workloads. I am not planning to spend > additional time to micro-optimize this part of the code. Ok. Thanks!
On 5/5/21 2:02 PM, Vlastimil Babka wrote: > On 5/5/21 7:30 PM, Roman Gushchin wrote: >> On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: >>> With this change, all the objcg pointer array objects will come from >>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So >>> both the recursive kfree() problem and non-freeable slab problem are >>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer >>> have mixed accounted and unaccounted objects, this will slightly reduce >>> the number of objcg pointer arrays that need to be allocated and save >>> a bit of memory. >> Unfortunately the positive effect of this change will be likely >> reversed by a lower utilization due to a larger number of caches. >> >> Btw, I wonder if we also need a change in the slab caches merging procedure? >> KMALLOC_NORMAL caches should not be merged with caches which can potentially >> include accounted objects. > Good point. But looks like kmalloc* caches are extempt from all merging in > create_boot_cache() via > > s->refcount = -1; /* Exempt from merging for now */ > > It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag > to prevent accidental merging in case the above is ever removed. It would also > better reflect reality, and ensure that the array is allocated immediately with > the page, AFAICS. > I am not sure if this is really true. struct kmem_cache *__init create_kmalloc_cache(const char *name, unsigned int size, slab_flags_t flags, unsigned int useroffset, unsigned int usersize) { struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); if (!s) panic("Out of memory when creating slab %s\n", name); create_boot_cache(s, name, size, flags, useroffset, usersize); kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; return s; } Even though refcount is set to -1 initially, it is set back to 1 afterward. So merging can still happen AFAICS. Cheers, Longman
On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote: > On 5/5/21 7:30 PM, Roman Gushchin wrote: > > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: > >> > >> With this change, all the objcg pointer array objects will come from > >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So > >> both the recursive kfree() problem and non-freeable slab problem are > >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer > >> have mixed accounted and unaccounted objects, this will slightly reduce > >> the number of objcg pointer arrays that need to be allocated and save > >> a bit of memory. > > > > Unfortunately the positive effect of this change will be likely > > reversed by a lower utilization due to a larger number of caches. > > > > Btw, I wonder if we also need a change in the slab caches merging procedure? > > KMALLOC_NORMAL caches should not be merged with caches which can potentially > > include accounted objects. > > Good point. But looks like kmalloc* caches are extempt from all merging in > create_boot_cache() via > > s->refcount = -1; /* Exempt from merging for now */ Wait, s->refcount is adjusted to 1 in create_kmalloc_cache() after calling into create_boot_cache? It means they are not exempt actually.
On Wed, May 05, 2021 at 02:31:28PM -0400, Waiman Long wrote: > On 5/5/21 2:02 PM, Vlastimil Babka wrote: > > On 5/5/21 7:30 PM, Roman Gushchin wrote: > > > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: > > > > With this change, all the objcg pointer array objects will come from > > > > KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So > > > > both the recursive kfree() problem and non-freeable slab problem are > > > > gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer > > > > have mixed accounted and unaccounted objects, this will slightly reduce > > > > the number of objcg pointer arrays that need to be allocated and save > > > > a bit of memory. > > > Unfortunately the positive effect of this change will be likely > > > reversed by a lower utilization due to a larger number of caches. > > > > > > Btw, I wonder if we also need a change in the slab caches merging procedure? > > > KMALLOC_NORMAL caches should not be merged with caches which can potentially > > > include accounted objects. > > Good point. But looks like kmalloc* caches are extempt from all merging in > > create_boot_cache() via > > > > s->refcount = -1; /* Exempt from merging for now */ > > > > It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag > > to prevent accidental merging in case the above is ever removed. It would also > > better reflect reality, and ensure that the array is allocated immediately with > > the page, AFAICS. > > > I am not sure if this is really true. 
> > struct kmem_cache *__init create_kmalloc_cache(const char *name, > unsigned int size, slab_flags_t flags, > unsigned int useroffset, unsigned int usersize) > { > struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); > > if (!s) > panic("Out of memory when creating slab %s\n", name); > > create_boot_cache(s, name, size, flags, useroffset, usersize); > kasan_cache_create_kmalloc(s); > list_add(&s->list, &slab_caches); > s->refcount = 1; > return s; > } > > Even though refcount is set to -1 initially, it is set back to 1 afterward. > So merging can still happen AFAICS. Right, thanks, I already noticed it. Then yeah, we should make sure we're not merging KMALLOC_NORMAL caches with any others.
On 5/5/21 2:11 PM, Waiman Long wrote: > On 5/5/21 1:30 PM, Roman Gushchin wrote: > >> >> Btw, I wonder if we also need a change in the slab caches merging >> procedure? >> KMALLOC_NORMAL caches should not be merged with caches which can >> potentially >> include accounted objects. > > Thank for catching this omission. > > I will take a look and modify the merging procedure in a new patch. > Accounting is usually specified at kmem_cache_create() time. Though, I > did find one instance of setting ACCOUNT flag in kmem_cache_alloc(), I > will ignore this case and merge accounted, but unreclaimable caches to > KMALLOC_CGROUP. In mm/slab_common.c: #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_ACCOUNT) struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, : if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME)) continue; By making sure kmalloc-cg-* has SLAB_ACCOUNT bit set, a kmemcache created with SLAB_ACCOUNT may merge with kmalloc-cg-* whereas one without SLAB_ACCOUNT may merge with kmalloc-* for now. So the current code should work fine for most cases. Though, if the ACCOUNT flag is set at kmem_cache_alloc() and the cache happens to be merged into kmalloc-*, we will have the rare case that an objcg pointer array may have to be added to a kmalloc-* cache. However, this is not a common practice, and the three cases (not one, sorry) that I found so far are in arch/x86/kvm/x86.c: ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); fs/hostfs/hostfs_kern.c: hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT); virt/kvm/kvm_main.c: vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); We will have to advise against doing that. Cheers, Longman
On 5/5/21 2:38 PM, Roman Gushchin wrote: > On Wed, May 05, 2021 at 02:31:28PM -0400, Waiman Long wrote: >> On 5/5/21 2:02 PM, Vlastimil Babka wrote: >>> On 5/5/21 7:30 PM, Roman Gushchin wrote: >>>> On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: >>>>> With this change, all the objcg pointer array objects will come from >>>>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So >>>>> both the recursive kfree() problem and non-freeable slab problem are >>>>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer >>>>> have mixed accounted and unaccounted objects, this will slightly reduce >>>>> the number of objcg pointer arrays that need to be allocated and save >>>>> a bit of memory. >>>> Unfortunately the positive effect of this change will be likely >>>> reversed by a lower utilization due to a larger number of caches. >>>> >>>> Btw, I wonder if we also need a change in the slab caches merging procedure? >>>> KMALLOC_NORMAL caches should not be merged with caches which can potentially >>>> include accounted objects. >>> Good point. But looks like kmalloc* caches are extempt from all merging in >>> create_boot_cache() via >>> >>> s->refcount = -1; /* Exempt from merging for now */ >>> >>> It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag >>> to prevent accidental merging in case the above is ever removed. It would also >>> better reflect reality, and ensure that the array is allocated immediately with >>> the page, AFAICS. >>> >> I am not sure if this is really true. 
>> >> struct kmem_cache *__init create_kmalloc_cache(const char *name, >> unsigned int size, slab_flags_t flags, >> unsigned int useroffset, unsigned int usersize) >> { >> struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); >> >> if (!s) >> panic("Out of memory when creating slab %s\n", name); >> >> create_boot_cache(s, name, size, flags, useroffset, usersize); >> kasan_cache_create_kmalloc(s); >> list_add(&s->list, &slab_caches); >> s->refcount = 1; >> return s; >> } >> >> Even though refcount is set to -1 initially, it is set back to 1 afterward. >> So merging can still happen AFAICS. > Right, thanks, I already noticed it. Then yeah, we should make sure we're not > merging KMALLOC_NORMAL caches with any others. > That should be easy. We just set the refcount to -1 for the KMALLOC_NORMAL caches right after its creation then. Cheers, Longman
On 5/5/21 8:32 PM, Roman Gushchin wrote: > On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote: >> On 5/5/21 7:30 PM, Roman Gushchin wrote: >> > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: >> >> >> >> With this change, all the objcg pointer array objects will come from >> >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So >> >> both the recursive kfree() problem and non-freeable slab problem are >> >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer >> >> have mixed accounted and unaccounted objects, this will slightly reduce >> >> the number of objcg pointer arrays that need to be allocated and save >> >> a bit of memory. >> > >> > Unfortunately the positive effect of this change will be likely >> > reversed by a lower utilization due to a larger number of caches. >> > >> > Btw, I wonder if we also need a change in the slab caches merging procedure? >> > KMALLOC_NORMAL caches should not be merged with caches which can potentially >> > include accounted objects. >> >> Good point. But looks like kmalloc* caches are extempt from all merging in >> create_boot_cache() via >> >> s->refcount = -1; /* Exempt from merging for now */ > > Wait, s->refcount is adjusted to 1 in create_kmalloc_cache() after calling > into create_boot_cache? Hmm, I missed that. Now I wonder why all kmalloc caches on my system have 0 aliases :) cat /sys/kernel/slab/kmalloc-*/aliases > It means they are not exempt actually. >
On Wed, May 05, 2021 at 11:29:54PM +0200, Vlastimil Babka wrote: > On 5/5/21 8:32 PM, Roman Gushchin wrote: > > On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote: > >> On 5/5/21 7:30 PM, Roman Gushchin wrote: > >> > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: > >> >> > >> >> With this change, all the objcg pointer array objects will come from > >> >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So > >> >> both the recursive kfree() problem and non-freeable slab problem are > >> >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer > >> >> have mixed accounted and unaccounted objects, this will slightly reduce > >> >> the number of objcg pointer arrays that need to be allocated and save > >> >> a bit of memory. > >> > > >> > Unfortunately the positive effect of this change will be likely > >> > reversed by a lower utilization due to a larger number of caches. > >> > > >> > Btw, I wonder if we also need a change in the slab caches merging procedure? > >> > KMALLOC_NORMAL caches should not be merged with caches which can potentially > >> > include accounted objects. > >> > >> Good point. But looks like kmalloc* caches are extempt from all merging in > >> create_boot_cache() via > >> > >> s->refcount = -1; /* Exempt from merging for now */ > > > > Wait, s->refcount is adjusted to 1 in create_kmalloc_cache() after calling > > into create_boot_cache? > > Hmm I missed that > > Now I wonder why all kmalloc caches on my system have 0 aliases :) > cat /sys/kernel/slab/kmalloc-*/aliases Yeah, I noticed it too, it's a good question. And I remember a case from the past when it wasn't true (kmalloc-32 was shared with something else).
On 5/5/21 6:19 PM, Roman Gushchin wrote: > On Wed, May 05, 2021 at 11:29:54PM +0200, Vlastimil Babka wrote: >> On 5/5/21 8:32 PM, Roman Gushchin wrote: >>> On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote: >>>> On 5/5/21 7:30 PM, Roman Gushchin wrote: >>>>> On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote: >>>>>> With this change, all the objcg pointer array objects will come from >>>>>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So >>>>>> both the recursive kfree() problem and non-freeable slab problem are >>>>>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer >>>>>> have mixed accounted and unaccounted objects, this will slightly reduce >>>>>> the number of objcg pointer arrays that need to be allocated and save >>>>>> a bit of memory. >>>>> Unfortunately the positive effect of this change will be likely >>>>> reversed by a lower utilization due to a larger number of caches. >>>>> >>>>> Btw, I wonder if we also need a change in the slab caches merging procedure? >>>>> KMALLOC_NORMAL caches should not be merged with caches which can potentially >>>>> include accounted objects. >>>> Good point. But looks like kmalloc* caches are extempt from all merging in >>>> create_boot_cache() via >>>> >>>> s->refcount = -1; /* Exempt from merging for now */ >>> Wait, s->refcount is adjusted to 1 in create_kmalloc_cache() after calling >>> into create_boot_cache? >> Hmm I missed that >> >> Now I wonder why all kmalloc caches on my system have 0 aliases :) >> cat /sys/kernel/slab/kmalloc-*/aliases > Yeah, I noticed it too, it's a good question. And I remember a case from > the past when it wasn't true (kmalloc-32 was shared with something else). > The criteria for cache merging require a close-to-exact match in all attributes with a size difference of no more than sizeof(void *). So it is not easy to find a close match. Cheers, Longman
diff --git a/include/linux/slab.h b/include/linux/slab.h index 0c97d788762c..f2d9ebc34f5c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, /* * Whenever changing this, take care of that kmalloc_type() and * create_kmalloc_caches() still work as intended. + * + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP + * is for accounted objects only. All the other kmem caches can have both + * accounted and non-accounted objects. */ enum kmalloc_cache_type { KMALLOC_NORMAL = 0, +#ifdef CONFIG_MEMCG_KMEM + KMALLOC_CGROUP, +#endif KMALLOC_RECLAIM, #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, @@ -315,28 +322,47 @@ enum kmalloc_cache_type { NR_KMALLOC_TYPES }; +#ifndef CONFIG_MEMCG_KMEM +#define KMALLOC_CGROUP KMALLOC_NORMAL +#endif +#ifndef CONFIG_ZONE_DMA +#define KMALLOC_DMA KMALLOC_NORMAL +#endif + #ifndef CONFIG_SLOB extern struct kmem_cache * kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; +/* + * Define gfp bits that should not be set for KMALLOC_NORMAL. + */ +#define KMALLOC_NOT_NORMAL_BITS \ + (__GFP_RECLAIMABLE | \ + (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ + (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) + static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) { -#ifdef CONFIG_ZONE_DMA /* * The most common case is KMALLOC_NORMAL, so test for it * with a single branch for both flags. */ - if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0)) + if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) return KMALLOC_NORMAL; /* - * At least one of the flags has to be set. If both are, __GFP_DMA - * is more important. + * At least one of the flags has to be set. Their priorities in + * decreasing order are: + * 1) __GFP_DMA + * 2) __GFP_RECLAIMABLE + * 3) __GFP_ACCOUNT */ - return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM; -#else - return flags & __GFP_RECLAIMABLE ? 
KMALLOC_RECLAIM : KMALLOC_NORMAL; -#endif + if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) + return KMALLOC_DMA; + if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) + return KMALLOC_RECLAIM; + else + return KMALLOC_CGROUP; } /* diff --git a/mm/slab_common.c b/mm/slab_common.c index f8833d3e5d47..d750e3ba7af5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -727,21 +727,25 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) } #ifdef CONFIG_ZONE_DMA -#define INIT_KMALLOC_INFO(__size, __short_size) \ -{ \ - .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ - .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ - .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \ - .size = __size, \ -} +#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz, +#else +#define KMALLOC_DMA_NAME(sz) +#endif + +#ifdef CONFIG_MEMCG_KMEM +#define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz, #else +#define KMALLOC_CGROUP_NAME(sz) +#endif + #define INIT_KMALLOC_INFO(__size, __short_size) \ { \ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ + KMALLOC_CGROUP_NAME(__short_size) \ + KMALLOC_DMA_NAME(__short_size) \ .size = __size, \ } -#endif /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. @@ -847,6 +851,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) int i; enum kmalloc_cache_type type; + /* + * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined + */ for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[type][i])
There are currently two problems in the way the objcg pointer array (memcg_data) in the page structure is being allocated and freed. On its allocation, it is possible that the allocated objcg pointer array comes from the same slab that requires memory accounting. If this happens, the slab will never become empty again as there is at least one object left (the obj_cgroup array) in the slab. When it is freed, the objcg pointer array object may be the last one in its slab and hence causes kfree() to be called again. With the right workload, the slab cache may be set up in a way that allows the recursive kfree() calling loop to nest deep enough to cause a kernel stack overflow and panic the system. One way to solve this problem is to split the kmalloc-<n> caches (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All the other caches can still allow a mix of accounted and non-accounted objects. With this change, all the objcg pointer array objects will come from KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So both the recursive kfree() problem and non-freeable slab problem are gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer have mixed accounted and unaccounted objects, this will slightly reduce the number of objcg pointer arrays that need to be allocated and save a bit of memory. The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches() will include the newly added caches without change. Suggested-by: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Waiman Long <longman@redhat.com> --- include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++-------- mm/slab_common.c | 23 +++++++++++++++-------- 2 files changed, 49 insertions(+), 16 deletions(-)