Message ID | 20220308114142.1744229-10-42.hyeyoo@gmail.com |
---|---|
State | New |
Series | common kmalloc subsystem on SLAB/SLUB |
On 3/8/22 12:41, Hyeonggon Yoo wrote:
> There is not much benefit for serving large objects in kmalloc().
> Let's pass large requests to page allocator like SLUB for better
> maintenance of common code.
>
> [ vbabka@suse.cz: Enable and disable irq around free_large_kmalloc().
> Do not lose NUMA locality in __do_kmalloc_node().
> Use folio_slab(folio)->slab_cache instead of virt_to_cache().
> Remove large sizes in __kmalloc_index(). ]

Thanks for the mention but that's generally only done like this if I took
your patch and made those changes myself. But I just suggested them. Small
suggested changes like this are usually just mentioned in e.g. v1->v2
changelogs.

> Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> ---
>  include/linux/slab.h | 23 +++++-----------------
>  mm/slab.c | 45 ++++++++++++++++++++++++++++++--------------
>  mm/slab.h | 3 +++
>  mm/slab_common.c | 25 +++++++++++++++++-------
>  mm/slub.c | 19 -------------------
>  5 files changed, 57 insertions(+), 58 deletions(-)
>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index dfcc8301d969..9ced225a3ea3 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -226,27 +226,17 @@ void kmem_dump_obj(void *object);
>
>  #ifdef CONFIG_SLAB
>  /*
> - * The largest kmalloc size supported by the SLAB allocators is
> - * 32 megabyte (2^25) or the maximum allocatable page order if that is
> - * less than 32 MB.
> - *
> - * WARNING: Its not easy to increase this value since the allocators have
> - * to do various tricks to work around compiler limitations in order to
> - * ensure proper constant folding.
> + * SLAB and SLUB directly allocates requests fitting in to an order-1 page
> + * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
>   */
> -#define KMALLOC_SHIFT_HIGH	((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
> -				(MAX_ORDER + PAGE_SHIFT - 1) : 25)
> -#define KMALLOC_SHIFT_MAX	KMALLOC_SHIFT_HIGH
> +#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
> +#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
>  #ifndef KMALLOC_SHIFT_LOW
>  #define KMALLOC_SHIFT_LOW	5
>  #endif
>  #endif
>
>  #ifdef CONFIG_SLUB
> -/*
> - * SLUB directly allocates requests fitting in to an order-1 page
> - * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
> - */
>  #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
>  #define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
>  #ifndef KMALLOC_SHIFT_LOW
> @@ -398,10 +388,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
>  	if (size <= 512 * 1024) return 19;
>  	if (size <= 1024 * 1024) return 20;
>  	if (size <= 2 * 1024 * 1024) return 21;
> -	if (size <= 4 * 1024 * 1024) return 22;
> -	if (size <= 8 * 1024 * 1024) return 23;
> -	if (size <= 16 * 1024 * 1024) return 24;
> -	if (size <= 32 * 1024 * 1024) return 25;
>
>  	if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
>  		BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
> @@ -411,6 +397,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
>  	/* Will never be reached. Needed because the compiler may complain */
>  	return -1;
>  }
> +static_assert(PAGE_SHIFT <= 20);
>  #define kmalloc_index(s) __kmalloc_index(s, true)
>  #endif /* !CONFIG_SLOB */
>
> diff --git a/mm/slab.c b/mm/slab.c
> index 6ebf509bf2de..f0041f0125ba 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -3568,7 +3568,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
>  	void *ret;
>
>  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
> -		return NULL;
> +		return kmalloc_large_node(size, flags, node);

Similar issue with caller not traced.

>  	cachep = kmalloc_slab(size, flags);
>  	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
>  		return cachep;
> @@ -3642,15 +3642,25 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
>  {
>  	struct kmem_cache *s;
>  	size_t i;
> +	struct folio *folio;
>
>  	local_irq_disable();
>  	for (i = 0; i < size; i++) {
>  		void *objp = p[i];
>
> -		if (!orig_s) /* called via kfree_bulk */
> -			s = virt_to_cache(objp);
> -		else
> +		if (!orig_s) {
> +			folio = virt_to_folio(objp);
> +			/* called via kfree_bulk */
> +			if (!folio_test_slab(folio)) {
> +				local_irq_enable();
> +				free_large_kmalloc(folio, objp);
> +				local_irq_disable();
> +				continue;
> +			}
> +			s = folio_slab(folio)->slab_cache;
> +		} else
>  			s = cache_from_obj(orig_s, objp);
> +
>  		if (!s)
>  			continue;
>
> @@ -3679,20 +3689,25 @@ void kfree(const void *objp)
>  {
>  	struct kmem_cache *c;
>  	unsigned long flags;
> +	struct folio *folio;
> +	void *x = (void *) objp;

I think you don't need to add 'x', just do the cast while calling
free_large_kmalloc(), same as done for __cache_free().

>
>  	trace_kfree(_RET_IP_, objp);
>
>  	if (unlikely(ZERO_OR_NULL_PTR(objp)))
>  		return;
> -	local_irq_save(flags);
> -	kfree_debugcheck(objp);
> -	c = virt_to_cache(objp);
> -	if (!c) {
> -		local_irq_restore(flags);
> +
> +	folio = virt_to_folio(objp);
> +	if (!folio_test_slab(folio)) {
> +		free_large_kmalloc(folio, x);
>  		return;
>  	}
> -	debug_check_no_locks_freed(objp, c->object_size);
>
> +	c = folio_slab(folio)->slab_cache;
> +
> +	local_irq_save(flags);
> +	kfree_debugcheck(objp);
> +	debug_check_no_locks_freed(objp, c->object_size);
>  	debug_check_no_obj_freed(objp, c->object_size);
>  	__cache_free(c, (void *)objp, _RET_IP_);
>  	local_irq_restore(flags);
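For readers skimming the thread, a minimal sketch of the change being suggested in that last comment (my illustration, not code from the actual v2): drop the local 'x' and cast at the call site, as kfree() already does when calling __cache_free():

```c
	folio = virt_to_folio(objp);
	if (!folio_test_slab(folio)) {
		/* cast here instead of keeping a separate 'x' variable */
		free_large_kmalloc(folio, (void *)objp);
		return;
	}
```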
On Thu, Mar 24, 2022 at 07:08:27PM +0100, Vlastimil Babka wrote:
> On 3/8/22 12:41, Hyeonggon Yoo wrote:
> > There is not much benefit for serving large objects in kmalloc().
> > Let's pass large requests to page allocator like SLUB for better
> > maintenance of common code.
> >
> > [ vbabka@suse.cz: Enable and disable irq around free_large_kmalloc().
> > Do not lose NUMA locality in __do_kmalloc_node().
> > Use folio_slab(folio)->slab_cache instead of virt_to_cache().
> > Remove large sizes in __kmalloc_index(). ]

A bit late to reply but better late than never...

>
> Thanks for the mention but that's generally only done like this if I took
> your patch and made those changes myself. But I just suggested them. Small
> suggested changes like this are usually just mentioned in e.g. v1->v2
> changelogs.

Ah, okay. I didn't know about the convention. Thanks for letting me know!

> > Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> > ---
> >  include/linux/slab.h | 23 +++++-----------------
> >  mm/slab.c | 45 ++++++++++++++++++++++++++++++--------------
> >  mm/slab.h | 3 +++
> >  mm/slab_common.c | 25 +++++++++++++++++-------
> >  mm/slub.c | 19 -------------------
> >  5 files changed, 57 insertions(+), 58 deletions(-)
> >
> > diff --git a/include/linux/slab.h b/include/linux/slab.h
> > index dfcc8301d969..9ced225a3ea3 100644
> > --- a/include/linux/slab.h
> > +++ b/include/linux/slab.h
> > @@ -226,27 +226,17 @@ void kmem_dump_obj(void *object);
> >
> >  #ifdef CONFIG_SLAB
> >  /*
> > - * The largest kmalloc size supported by the SLAB allocators is
> > - * 32 megabyte (2^25) or the maximum allocatable page order if that is
> > - * less than 32 MB.
> > - *
> > - * WARNING: Its not easy to increase this value since the allocators have
> > - * to do various tricks to work around compiler limitations in order to
> > - * ensure proper constant folding.
> > + * SLAB and SLUB directly allocates requests fitting in to an order-1 page
> > + * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
> >   */
> > -#define KMALLOC_SHIFT_HIGH	((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
> > -				(MAX_ORDER + PAGE_SHIFT - 1) : 25)
> > -#define KMALLOC_SHIFT_MAX	KMALLOC_SHIFT_HIGH
> > +#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
> > +#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
> >  #ifndef KMALLOC_SHIFT_LOW
> >  #define KMALLOC_SHIFT_LOW	5
> >  #endif
> >  #endif
> >
> >  #ifdef CONFIG_SLUB
> > -/*
> > - * SLUB directly allocates requests fitting in to an order-1 page
> > - * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
> > - */
> >  #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
> >  #define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
> >  #ifndef KMALLOC_SHIFT_LOW
> > @@ -398,10 +388,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
> >  	if (size <= 512 * 1024) return 19;
> >  	if (size <= 1024 * 1024) return 20;
> >  	if (size <= 2 * 1024 * 1024) return 21;
> > -	if (size <= 4 * 1024 * 1024) return 22;
> > -	if (size <= 8 * 1024 * 1024) return 23;
> > -	if (size <= 16 * 1024 * 1024) return 24;
> > -	if (size <= 32 * 1024 * 1024) return 25;
> >
> >  	if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
> >  		BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
> > @@ -411,6 +397,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
> >  	/* Will never be reached. Needed because the compiler may complain */
> >  	return -1;
> >  }
> > +static_assert(PAGE_SHIFT <= 20);
> >  #define kmalloc_index(s) __kmalloc_index(s, true)
> >  #endif /* !CONFIG_SLOB */
> >
> > diff --git a/mm/slab.c b/mm/slab.c
> > index 6ebf509bf2de..f0041f0125ba 100644
> > --- a/mm/slab.c
> > +++ b/mm/slab.c
> > @@ -3568,7 +3568,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
> >  	void *ret;
> >
> >  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
> > -		return NULL;
> > +		return kmalloc_large_node(size, flags, node);
>
> Similar issue with caller not traced.
>

Actually I moved the tracepoint into kmalloc_large_node(), but the problem,
I think, was that I wrote patches that were hard to review. In v2 I split
some patches to be more reviewable. Thanks!!

> >  	cachep = kmalloc_slab(size, flags);
> >  	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
> >  		return cachep;
> > @@ -3642,15 +3642,25 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
> >  {
> >  	struct kmem_cache *s;
> >  	size_t i;
> > +	struct folio *folio;
> >
> >  	local_irq_disable();
> >  	for (i = 0; i < size; i++) {
> >  		void *objp = p[i];
> >
> > -		if (!orig_s) /* called via kfree_bulk */
> > -			s = virt_to_cache(objp);
> > -		else
> > +		if (!orig_s) {
> > +			folio = virt_to_folio(objp);
> > +			/* called via kfree_bulk */
> > +			if (!folio_test_slab(folio)) {
> > +				local_irq_enable();
> > +				free_large_kmalloc(folio, objp);
> > +				local_irq_disable();
> > +				continue;
> > +			}
> > +			s = folio_slab(folio)->slab_cache;
> > +		} else
> >  			s = cache_from_obj(orig_s, objp);
> > +
> >  		if (!s)
> >  			continue;
> >
> > @@ -3679,20 +3689,25 @@ void kfree(const void *objp)
> >  {
> >  	struct kmem_cache *c;
> >  	unsigned long flags;
> > +	struct folio *folio;
> > +	void *x = (void *) objp;
>
> I think you don't need to add 'x', just do the cast while calling
> free_large_kmalloc(), same as done for __cache_free().
>

In fact, SLUB's kfree() also defines x. But your suggestion sounds better.
Anyway, I did it in v2. Thanks!

> >
> >  	trace_kfree(_RET_IP_, objp);
> >
> >  	if (unlikely(ZERO_OR_NULL_PTR(objp)))
> >  		return;
> > -	local_irq_save(flags);
> > -	kfree_debugcheck(objp);
> > -	c = virt_to_cache(objp);
> > -	if (!c) {
> > -		local_irq_restore(flags);
> > +
> > +	folio = virt_to_folio(objp);
> > +	if (!folio_test_slab(folio)) {
> > +		free_large_kmalloc(folio, x);
> >  		return;
> >  	}
> > -	debug_check_no_locks_freed(objp, c->object_size);
> >
> > +	c = folio_slab(folio)->slab_cache;
> > +
> > +	local_irq_save(flags);
> > +	kfree_debugcheck(objp);
> > +	debug_check_no_locks_freed(objp, c->object_size);
> >  	debug_check_no_obj_freed(objp, c->object_size);
> >  	__cache_free(c, (void *)objp, _RET_IP_);
> >  	local_irq_restore(flags);
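To make the "caller not traced" point concrete: __do_kmalloc_node() receives the original call site in 'caller', and returning straight from kmalloc_large_node() drops it from the trace. Below is a hedged sketch of one way to keep it (illustrative only; the actual v2 moved the tracepoint into kmalloc_large_node() instead), using the trace_kmalloc_node() event as it existed at the time:

```c
	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
		ret = kmalloc_large_node(size, flags, node);
		/* report the original call site rather than _RET_IP_ */
		trace_kmalloc_node(caller, ret, size,
				   PAGE_SIZE << get_order(size), flags, node);
		return ret;
	}
```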
diff --git a/include/linux/slab.h b/include/linux/slab.h
index dfcc8301d969..9ced225a3ea3 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -226,27 +226,17 @@ void kmem_dump_obj(void *object);
 
 #ifdef CONFIG_SLAB
 /*
- * The largest kmalloc size supported by the SLAB allocators is
- * 32 megabyte (2^25) or the maximum allocatable page order if that is
- * less than 32 MB.
- *
- * WARNING: Its not easy to increase this value since the allocators have
- * to do various tricks to work around compiler limitations in order to
- * ensure proper constant folding.
+ * SLAB and SLUB directly allocates requests fitting in to an order-1 page
+ * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
  */
-#define KMALLOC_SHIFT_HIGH	((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
-				(MAX_ORDER + PAGE_SHIFT - 1) : 25)
-#define KMALLOC_SHIFT_MAX	KMALLOC_SHIFT_HIGH
+#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
+#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	5
 #endif
 #endif
 
 #ifdef CONFIG_SLUB
-/*
- * SLUB directly allocates requests fitting in to an order-1 page
- * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
- */
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
 #define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
 #ifndef KMALLOC_SHIFT_LOW
@@ -398,10 +388,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
 	if (size <= 512 * 1024) return 19;
 	if (size <= 1024 * 1024) return 20;
 	if (size <= 2 * 1024 * 1024) return 21;
-	if (size <= 4 * 1024 * 1024) return 22;
-	if (size <= 8 * 1024 * 1024) return 23;
-	if (size <= 16 * 1024 * 1024) return 24;
-	if (size <= 32 * 1024 * 1024) return 25;
 
 	if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
 		BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
@@ -411,6 +397,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
 	/* Will never be reached. Needed because the compiler may complain */
 	return -1;
 }
+static_assert(PAGE_SHIFT <= 20);
 #define kmalloc_index(s) __kmalloc_index(s, true)
 #endif /* !CONFIG_SLOB */
 
diff --git a/mm/slab.c b/mm/slab.c
index 6ebf509bf2de..f0041f0125ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3568,7 +3568,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
 	void *ret;
 
 	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
-		return NULL;
+		return kmalloc_large_node(size, flags, node);
 	cachep = kmalloc_slab(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
@@ -3642,15 +3642,25 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
 {
 	struct kmem_cache *s;
 	size_t i;
+	struct folio *folio;
 
 	local_irq_disable();
 	for (i = 0; i < size; i++) {
 		void *objp = p[i];
 
-		if (!orig_s) /* called via kfree_bulk */
-			s = virt_to_cache(objp);
-		else
+		if (!orig_s) {
+			folio = virt_to_folio(objp);
+			/* called via kfree_bulk */
+			if (!folio_test_slab(folio)) {
+				local_irq_enable();
+				free_large_kmalloc(folio, objp);
+				local_irq_disable();
+				continue;
+			}
+			s = folio_slab(folio)->slab_cache;
+		} else
 			s = cache_from_obj(orig_s, objp);
+
 		if (!s)
 			continue;
 
@@ -3679,20 +3689,25 @@ void kfree(const void *objp)
 {
 	struct kmem_cache *c;
 	unsigned long flags;
+	struct folio *folio;
+	void *x = (void *) objp;
 
 	trace_kfree(_RET_IP_, objp);
 
 	if (unlikely(ZERO_OR_NULL_PTR(objp)))
 		return;
-	local_irq_save(flags);
-	kfree_debugcheck(objp);
-	c = virt_to_cache(objp);
-	if (!c) {
-		local_irq_restore(flags);
+
+	folio = virt_to_folio(objp);
+	if (!folio_test_slab(folio)) {
+		free_large_kmalloc(folio, x);
 		return;
 	}
-	debug_check_no_locks_freed(objp, c->object_size);
 
+	c = folio_slab(folio)->slab_cache;
+
+	local_irq_save(flags);
+	kfree_debugcheck(objp);
+	debug_check_no_locks_freed(objp, c->object_size);
 	debug_check_no_obj_freed(objp, c->object_size);
 	__cache_free(c, (void *)objp, _RET_IP_);
 	local_irq_restore(flags);
@@ -4114,15 +4129,17 @@ void __check_heap_object(const void *ptr, unsigned long n,
 size_t __ksize(const void *objp)
 {
 	struct kmem_cache *c;
-	size_t size;
+	struct folio *folio;
 
 	BUG_ON(!objp);
 	if (unlikely(objp == ZERO_SIZE_PTR))
 		return 0;
 
-	c = virt_to_cache(objp);
-	size = c ? c->object_size : 0;
+	folio = virt_to_folio(objp);
+	if (!folio_test_slab(folio))
+		return folio_size(folio);
 
-	return size;
+	c = folio_slab(folio)->slab_cache;
+	return c->object_size;
 }
 EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab.h b/mm/slab.h
index c7f2abc2b154..eb6e26784d69 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -664,6 +664,9 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
 	print_tracking(cachep, x);
 	return cachep;
 }
+
+void free_large_kmalloc(struct folio *folio, void *object);
+
 #endif /* CONFIG_SLOB */
 
 static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1fe2f2a7326d..af67005a151f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -759,8 +759,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
 
 /*
  * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
- * kmalloc-32M.
+ * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
+ * kmalloc-2M.
  */
 const struct kmalloc_info_struct kmalloc_info[] __initconst = {
 	INIT_KMALLOC_INFO(0, 0),
@@ -784,11 +784,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
 	INIT_KMALLOC_INFO(262144, 256k),
 	INIT_KMALLOC_INFO(524288, 512k),
 	INIT_KMALLOC_INFO(1048576, 1M),
-	INIT_KMALLOC_INFO(2097152, 2M),
-	INIT_KMALLOC_INFO(4194304, 4M),
-	INIT_KMALLOC_INFO(8388608, 8M),
-	INIT_KMALLOC_INFO(16777216, 16M),
-	INIT_KMALLOC_INFO(33554432, 32M)
+	INIT_KMALLOC_INFO(2097152, 2M)
 };
 
 /*
@@ -913,6 +909,21 @@ void __init create_kmalloc_caches(slab_flags_t flags)
 	}
 #endif
 }
+
+void free_large_kmalloc(struct folio *folio, void *object)
+{
+	unsigned int order = folio_order(folio);
+
+	if (WARN_ON_ONCE(order == 0))
+		pr_warn_once("object pointer: 0x%p\n", object);
+
+	kmemleak_free(object);
+	kasan_kfree_large(object);
+
+	mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
+			      -(PAGE_SIZE << order));
+	__free_pages(folio_page(folio, 0), order);
+}
 #endif /* !CONFIG_SLOB */
 
 gfp_t kmalloc_fix_flags(gfp_t flags)
diff --git a/mm/slub.c b/mm/slub.c
index d8fb987ff7e0..283c4ac92ffe 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1678,12 +1678,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
  * Hooks for other subsystems that check memory allocations. In a typical
  * production configuration these hooks all should produce no code at all.
  */
-static __always_inline void kfree_hook(void *x)
-{
-	kmemleak_free(x);
-	kasan_kfree_large(x);
-}
-
 static __always_inline bool slab_free_hook(struct kmem_cache *s,
 						void *x, bool init)
 {
@@ -3501,19 +3495,6 @@ struct detached_freelist {
 	struct kmem_cache *s;
 };
 
-static inline void free_large_kmalloc(struct folio *folio, void *object)
-{
-	unsigned int order = folio_order(folio);
-
-	if (WARN_ON_ONCE(order == 0))
-		pr_warn_once("object pointer: 0x%p\n", object);
-
-	kfree_hook(object);
-	mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
-			      -(PAGE_SIZE << order));
-	__free_pages(folio_page(folio, 0), order);
-}
-
 /*
  * This function progressively scans the array with free objects (with
  * a limited look ahead) and extract objects belonging to the same
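A side note on the new static_assert(PAGE_SHIFT <= 20), which the patch adds without explanation (this is my reading, not text from the series): __kmalloc_index() now tops out at index 21 (2 MB), and KMALLOC_SHIFT_HIGH becomes PAGE_SHIFT + 1, so the assert guarantees the largest remaining kmalloc cache still has a table entry. For example:

```c
/*
 * Illustrative arithmetic only, assuming 4K pages (PAGE_SHIFT == 12):
 *
 *   KMALLOC_SHIFT_HIGH = PAGE_SHIFT + 1 = 13, i.e. kmalloc-8k is the largest
 *   kmalloc cache, well within the last __kmalloc_index() entry (21 == 2 MB).
 *
 * PAGE_SHIFT <= 20 is exactly the condition that keeps PAGE_SHIFT + 1 within
 * that last entry, which the following equivalent form makes explicit:
 */
static_assert(PAGE_SHIFT + 1 <= 21);
```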
There is not much benefit for serving large objects in kmalloc().
Let's pass large requests to page allocator like SLUB for better
maintenance of common code.

[ vbabka@suse.cz: Enable and disable irq around free_large_kmalloc().
Do not lose NUMA locality in __do_kmalloc_node().
Use folio_slab(folio)->slab_cache instead of virt_to_cache().
Remove large sizes in __kmalloc_index(). ]

Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
---
 include/linux/slab.h | 23 +++++-----------------
 mm/slab.c            | 45 ++++++++++++++++++++++++++++++--------------
 mm/slab.h            |  3 +++
 mm/slab_common.c     | 25 +++++++++++++++++-------
 mm/slub.c            | 19 -------------------
 5 files changed, 57 insertions(+), 58 deletions(-)
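As a quick way to see the behavior this patch switches SLAB over to, here is a small, hypothetical test module (my sketch, not part of the series; the module name and the 32 KB size are made up) that allocates on both sides of the new boundary on a 4K-page kernel:

```c
// SPDX-License-Identifier: GPL-2.0
/* Hypothetical demo module; not part of this series. */
#include <linux/module.h>
#include <linux/slab.h>

static int __init kmalloc_boundary_demo_init(void)
{
	/* 4 KB <= KMALLOC_MAX_CACHE_SIZE (8 KB with 4K pages): slab path */
	void *small = kmalloc(4096, GFP_KERNEL);
	/* 32 KB > KMALLOC_MAX_CACHE_SIZE: now served by the page allocator */
	void *large = kmalloc(32768, GFP_KERNEL);

	if (!small || !large) {
		kfree(small);
		kfree(large);
		return -ENOMEM;
	}

	/* ksize() reports the slab object size vs. the folio size */
	pr_info("small: ksize=%zu, large: ksize=%zu\n",
		ksize(small), ksize(large));

	kfree(small);	/* folio_test_slab() true  -> __cache_free()       */
	kfree(large);	/* folio_test_slab() false -> free_large_kmalloc() */
	return 0;
}

static void __exit kmalloc_boundary_demo_exit(void) { }

module_init(kmalloc_boundary_demo_init);
module_exit(kmalloc_boundary_demo_exit);
MODULE_LICENSE("GPL");
```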