Message ID | 20240221194052.927623-15-surenb@google.com (mailing list archive) |
---|---|
State | New, archived |
Series | Memory allocation profiling |
On Wed, Feb 21, 2024 at 11:40:27AM -0800, Suren Baghdasaryan wrote:
> [...]
> +struct alloc_tag {
> +	struct codetag ct;
> +	struct alloc_tag_counters __percpu *counters;
> +} __aligned(8);
> [...]
> +#define DEFINE_ALLOC_TAG(_alloc_tag)					\
> +	static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \
> +	static struct alloc_tag _alloc_tag __used __aligned(8)		\
> +	__section("alloc_tags") = {					\
> +		.ct = CODE_TAG_INIT,					\
> +		.counters = &_alloc_tag_cntr };
> [...]
> +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
> +{
> +	swap(current->alloc_tag, tag);
> +	return tag;
> +}

Future security hardening improvement idea based on this infrastructure:
it should be possible to implement per-allocation-site kmem caches. For
example, we could create:

	struct alloc_details {
		u32 flags;
		union {
			u32 size;	/* not valid after __init completes */
			struct kmem_cache *cache;
		};
	};

- add struct alloc_details to struct alloc_tag
- move the tags section into .ro_after_init
- extend alloc_hooks() to populate flags and size:

	.flags = __builtin_constant_p(size) ? KMALLOC_ALLOCATE_FIXED
					    : KMALLOC_ALLOCATE_BUCKETS;
	.size = __builtin_constant_p(size) ? size : SIZE_MAX;

- during kernel start or module init, walk the alloc_tag list and either
  create a fixed-size kmem_cache or allocate a full set of kmalloc-buckets,
  and update the "cache" member.
- adjust the kmalloc core routines to use current->alloc_tag->cache instead
  of the global buckets.

This would get us fully separated allocations, producing better than
type-based levels of granularity, exceeding what we currently have with
CONFIG_RANDOM_KMALLOC_CACHES.

Does this look possible, or am I misunderstanding something in the
infrastructure being created here?
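For illustration, the per-site data sketched above could be captured at
tag-definition time roughly as follows. This is only a sketch of the idea,
not code from the series: struct alloc_details, the KMALLOC_ALLOCATE_*
flags, the .details member and the DEFINE_ALLOC_TAG_SIZED name are all
hypothetical.

	/* Hypothetical sketch, not part of the posted series. */
	struct alloc_details {
		u32 flags;			/* KMALLOC_ALLOCATE_FIXED or _BUCKETS */
		union {
			u32 size;		/* compile-time size, valid only until __init completes */
			struct kmem_cache *cache; /* filled in at boot/module init */
		};
	};

	#define DEFINE_ALLOC_TAG_SIZED(_alloc_tag, _size)			\
		static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \
		static struct alloc_tag _alloc_tag __used __aligned(8)		\
		__section("alloc_tags") = {					\
			.ct = CODE_TAG_INIT,					\
			.counters = &_alloc_tag_cntr,				\
			.details.flags = __builtin_constant_p(_size) ?		\
					 KMALLOC_ALLOCATE_FIXED :		\
					 KMALLOC_ALLOCATE_BUCKETS,		\
			.details.size = __builtin_constant_p(_size) ?		\
					(_size) : UINT_MAX }

A boot-time (or module-init) pass could then replace the recorded size with
a kmem_cache pointer, as described in the list items above.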
On Wed, Feb 21, 2024 at 03:05:32PM -0800, Kees Cook wrote:
> On Wed, Feb 21, 2024 at 11:40:27AM -0800, Suren Baghdasaryan wrote:
> > [...]
> > +struct alloc_tag {
> > +	struct codetag ct;
> > +	struct alloc_tag_counters __percpu *counters;
> > +} __aligned(8);
> > [...]
> > +#define DEFINE_ALLOC_TAG(_alloc_tag)					\
> > +	static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \
> > +	static struct alloc_tag _alloc_tag __used __aligned(8)		\
> > +	__section("alloc_tags") = {					\
> > +		.ct = CODE_TAG_INIT,					\
> > +		.counters = &_alloc_tag_cntr };
> > [...]
> > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
> > +{
> > +	swap(current->alloc_tag, tag);
> > +	return tag;
> > +}
>
> Future security hardening improvement idea based on this infrastructure:
> it should be possible to implement per-allocation-site kmem caches. For
> example, we could create:
>
> 	struct alloc_details {
> 		u32 flags;
> 		union {
> 			u32 size;	/* not valid after __init completes */
> 			struct kmem_cache *cache;
> 		};
> 	};
>
> - add struct alloc_details to struct alloc_tag
> - move the tags section into .ro_after_init
> - extend alloc_hooks() to populate flags and size:
>
> 	.flags = __builtin_constant_p(size) ? KMALLOC_ALLOCATE_FIXED
> 					    : KMALLOC_ALLOCATE_BUCKETS;
> 	.size = __builtin_constant_p(size) ? size : SIZE_MAX;
>
> - during kernel start or module init, walk the alloc_tag list and either
>   create a fixed-size kmem_cache or allocate a full set of kmalloc-buckets,
>   and update the "cache" member.
> - adjust the kmalloc core routines to use current->alloc_tag->cache instead
>   of the global buckets.
>
> This would get us fully separated allocations, producing better than
> type-based levels of granularity, exceeding what we currently have with
> CONFIG_RANDOM_KMALLOC_CACHES.
>
> Does this look possible, or am I misunderstanding something in the
> infrastructure being created here?

Definitely possible, but... would we want this? That would produce a
_lot_ of kmem caches, and don't we already try to collapse those where
possible to reduce internal fragmentation?
On Wed, Feb 21, 2024 at 06:29:17PM -0500, Kent Overstreet wrote:
> On Wed, Feb 21, 2024 at 03:05:32PM -0800, Kees Cook wrote:
> > On Wed, Feb 21, 2024 at 11:40:27AM -0800, Suren Baghdasaryan wrote:
> > > [...]
> >
> > [...]
> >
> > Does this look possible, or am I misunderstanding something in the
> > infrastructure being created here?
>
> Definitely possible, but... would we want this?

Yes, very very much. One of the worst and mostly unaddressed weaknesses
with the kernel right now is use-after-free based type confusion[0], which
depends on merged caches (or cache reuse).

This doesn't solve cross-allocator (kmalloc/page_alloc) type confusion
(as terrifyingly demonstrated[1] by Jann Horn), but it does help with
what has been a very common case of "use msg_msg to impersonate your
target object"[2] exploitation.

> That would produce a _lot_ of kmem caches

Fewer than you'd expect, but yes, there is some overhead. However,
out-of-tree forks of Linux have successfully experimented with this
already and seen good results[3].

> and don't we already try to collapse those where possible to reduce
> internal fragmentation?

In the past, yes, but the desire for security has tended to have more
people building with SLAB_MERGE_DEFAULT=n and/or
CONFIG_RANDOM_KMALLOC_CACHES=y (or booting with "slab_nomerge").

Just doing the type safety isn't sufficient without the cross-allocator
safety, but we've also had solutions for that proposed[4].
-Kees

[0] https://github.com/KSPP/linux/issues/189
[1] https://googleprojectzero.blogspot.com/2021/10/how-simple-linux-kernel-memory.html
[2] https://www.willsroot.io/2021/08/corctf-2021-fire-of-salvation-writeup.html
    https://google.github.io/security-research/pocs/linux/cve-2021-22555/writeup.html#exploring-struct-msg_msg
[3] https://grsecurity.net/how_autoslab_changes_the_memory_unsafety_game
[4] https://lore.kernel.org/linux-hardening/20230915105933.495735-1-matteorizzo@google.com/
On Wed, Feb 21, 2024 at 04:25:02PM -0800, Kees Cook wrote:
> On Wed, Feb 21, 2024 at 06:29:17PM -0500, Kent Overstreet wrote:
> > On Wed, Feb 21, 2024 at 03:05:32PM -0800, Kees Cook wrote:
> > > [...]
> >
> > Definitely possible, but... would we want this?
>
> Yes, very very much. One of the worst and mostly unaddressed weaknesses
> with the kernel right now is use-after-free based type confusion[0], which
> depends on merged caches (or cache reuse).
>
> This doesn't solve cross-allocator (kmalloc/page_alloc) type confusion
> (as terrifyingly demonstrated[1] by Jann Horn), but it does help with
> what has been a very common case of "use msg_msg to impersonate your
> target object"[2] exploitation.

We have a ton of code that references PAGE_SIZE and uses the page
allocator completely unnecessarily - that's something worth harping
about at conferences; if we could motivate people to clean that stuff up
it'd have a lot of positive effects.

> > That would produce a _lot_ of kmem caches
>
> Fewer than you'd expect, but yes, there is some overhead. However,
> out-of-tree forks of Linux have successfully experimented with this
> already and seen good results[3].

So in that case - I don't think there's any need for a separate
alloc_details; we'd just add a kmem_cache * to alloc_tag and then hook
into the codetag init/unload path to create and destroy the kmem caches.

No need to adjust the slab code either; alloc_hooks() itself could
dispatch to kmem_cache_alloc() instead of kmalloc() if this is in use.
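A minimal sketch of the module-init half of what is described above,
assuming hypothetical "cache" and "req_size" members added to struct
alloc_tag (neither exists in this series); the iterator and locking helpers
are the ones the series already uses in lib/alloc_tag.c:

	/* Hypothetical sketch: create one kmem_cache per allocation site. */
	static void alloc_tag_create_caches(struct codetag_type *cttype)
	{
		struct codetag_iterator iter;
		struct codetag *ct;

		codetag_lock_module_list(cttype, true);
		iter = codetag_get_ct_iter(cttype);
		while ((ct = codetag_next_ct(&iter))) {
			struct alloc_tag *tag = ct_to_alloc_tag(ct);
			char name[64];

			/* Name the cache after the allocation site. */
			snprintf(name, sizeof(name), "site-%s-%u",
				 ct->function, ct->lineno);
			tag->cache = kmem_cache_create(name, tag->req_size,
						       0, 0, NULL);
		}
		codetag_lock_module_list(cttype, false);
	}

The matching teardown would walk the same list from the codetag unload path
and call kmem_cache_destroy() on each per-site cache.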
On Wed, Feb 21, 2024 at 07:34:44PM -0500, Kent Overstreet wrote:
> On Wed, Feb 21, 2024 at 04:25:02PM -0800, Kees Cook wrote:
> > [...]
>
> We have a ton of code that references PAGE_SIZE and uses the page
> allocator completely unnecessarily - that's something worth harping
> about at conferences; if we could motivate people to clean that stuff up
> it'd have a lot of positive effects.
>
> So in that case - I don't think there's any need for a separate
> alloc_details; we'd just add a kmem_cache * to alloc_tag and then hook
> into the codetag init/unload path to create and destroy the kmem caches.

Okay, sounds good. There needs to be a place to track the "is this a
fixed size or a run-time size" choice.

> No need to adjust the slab code either; alloc_hooks() itself could
> dispatch to kmem_cache_alloc() instead of kmalloc() if this is in use.

Right, it'd go either to kmem_cache_alloc() directly, or to a modified
kmalloc() that uses the passed-in cache as the base for an array of sized
buckets, rather than the global (or 16-way global) buckets.

Yay for the future!
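For illustration, a hedged sketch of the dispatch idea discussed above. The
series' real alloc_hooks() macro is added in a later patch and looks
different, and the "cache" member of struct alloc_tag is hypothetical; the
macro name kmalloc_site is made up for this sketch:

	#define kmalloc_site(_size, _gfp)					\
	({									\
		DEFINE_ALLOC_TAG(_alloc_tag);					\
		struct alloc_tag *_old = alloc_tag_save(&_alloc_tag);		\
		/* Fixed-size sites go straight to their own cache. */		\
		void *_res = _alloc_tag.cache ?					\
			kmem_cache_alloc(_alloc_tag.cache, _gfp) :		\
			kmalloc(_size, _gfp);					\
		alloc_tag_restore(&_alloc_tag, _old);				\
		_res;								\
	})

The point of the sketch is only that the per-site tag defined by
DEFINE_ALLOC_TAG() is a natural place to hang a per-site cache pointer, so
the dispatch can happen in the wrapper without touching the slab core.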
On 2/21/24 20:40, Suren Baghdasaryan wrote:
>
> +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> +{
> +	__alloc_tag_sub(ref, bytes);
> +}
> +
> +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
> +{
> +	__alloc_tag_sub(ref, bytes);
> +}
> +

Nit: I just noticed these are now the same, so maybe you could drop both
wrappers and rename __alloc_tag_sub() to alloc_tag_sub()?
Another thing I noticed, dunno how critical.

On 2/21/24 20:40, Suren Baghdasaryan wrote:
> +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> +{
> +	struct alloc_tag *tag;
> +
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +	WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> +#endif
> +	if (!ref || !ref->ct)
> +		return;

This is quite careful.

> +
> +	tag = ct_to_alloc_tag(ref->ct);
> +
> +	this_cpu_sub(tag->counters->bytes, bytes);
> +	this_cpu_dec(tag->counters->calls);
> +
> +	ref->ct = NULL;
> +}
> +
> +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> +{
> +	__alloc_tag_sub(ref, bytes);
> +}
> +
> +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
> +{
> +	__alloc_tag_sub(ref, bytes);
> +}
> +
> +static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
> +{
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +	WARN_ONCE(ref && ref->ct,
> +		  "alloc_tag was not cleared (got tag for %s:%u)\n",\
> +		  ref->ct->filename, ref->ct->lineno);
> +
> +	WARN_ONCE(!tag, "current->alloc_tag not set");
> +#endif
> +	if (!ref || !tag)
> +		return;

This too.

> +
> +	ref->ct = &tag->ct;
> +	/*
> +	 * We need to increment the call counter every time we have a new
> +	 * allocation or when we split a large allocation into smaller ones.
> +	 * Each new reference for every sub-allocation needs to increment call
> +	 * counter because when we free each part the counter will be decremented.
> +	 */
> +	this_cpu_inc(tag->counters->calls);
> +}
> +
> +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
> +{
> +	alloc_tag_ref_set(ref, tag);

We might have returned from alloc_tag_ref_set() due to !tag

> +	this_cpu_add(tag->counters->bytes, bytes);

But here we still assume it's valid.

> +}
> +
On Wed, Feb 28, 2024 at 8:29 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/21/24 20:40, Suren Baghdasaryan wrote:
> >
> > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> > +{
> > +	__alloc_tag_sub(ref, bytes);
> > +}
> > +
> > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
> > +{
> > +	__alloc_tag_sub(ref, bytes);
> > +}
> > +
>
> Nit: I just noticed these are now the same, so maybe you could drop both
> wrappers and rename __alloc_tag_sub() to alloc_tag_sub()?

Ack.
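For reference, the collapsed helper would simply be the current
__alloc_tag_sub() body under the shorter name. A sketch of the suggested
cleanup (not the code actually posted in a later version):

	static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
	{
		struct alloc_tag *tag;

	#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
		WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
	#endif
		if (!ref || !ref->ct)
			return;

		tag = ct_to_alloc_tag(ref->ct);

		/* Undo the accounting done when the reference was set. */
		this_cpu_sub(tag->counters->bytes, bytes);
		this_cpu_dec(tag->counters->calls);

		ref->ct = NULL;
	}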
On Wed, Feb 28, 2024 at 8:41 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> Another thing I noticed, dunno how critical.
>
> On 2/21/24 20:40, Suren Baghdasaryan wrote:
> > [...]
> > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
> > +{
> > +	alloc_tag_ref_set(ref, tag);
>
> We might have returned from alloc_tag_ref_set() due to !tag
>
> > +	this_cpu_add(tag->counters->bytes, bytes);
>
> But here we still assume it's valid.

Yes, this is a blunder on my side after splitting alloc_tag_ref_set()
into a separate function. I'll fix this in the next version. Thanks!

> > +}
> > +
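One possible shape for that fix (a sketch only, not the change that was
actually posted later): bail out before touching the counters when either
argument is missing, mirroring the check inside alloc_tag_ref_set():

	static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
					 size_t bytes)
	{
		if (!ref || !tag)
			return;

		alloc_tag_ref_set(ref, tag);
		/* Only reached with a valid tag, so the dereference is safe. */
		this_cpu_add(tag->counters->bytes, bytes);
	}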
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index c59889de122b..e86c968a7a0e 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
+- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y)
 - memory_failure_early_kill
 - memory_failure_recovery
 - min_free_kbytes
@@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
 The default value is 65530.
 
 
+mem_profiling
+==============
+
+Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
+
+1: Enable memory profiling.
+
+0: Disable memory profiling.
+
+Enabling memory profiling introduces a small performance overhead for all
+memory allocations.
+
+The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
+
+
 memory_failure_early_kill:
 ==========================
 
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 104c6d047d9b..8150dc3d689c 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -688,6 +688,7 @@ files are there, and which are missing.
 ============ ===============================================================
 File         Content
 ============ ===============================================================
+ allocinfo   Memory allocations profiling information
 apm          Advanced power management info
 bootconfig   Kernel command line obtained from boot config,
              and, if there were kernel parameters from the
@@ -953,6 +954,34 @@ also be allocatable although a lot of filesystem metadata may have to be
 reclaimed to achieve this.
 
 
+allocinfo
+~~~~~~~~~
+
+Provides information about memory allocations at all locations in the code
+base. Each allocation in the code is identified by its source file, line
+number, module (if originates from a loadable module) and the function calling
+the allocation. The number of bytes allocated and number of calls at each
+location are reported.
+
+Example output.
+
+::
+
+    > sort -rn /proc/allocinfo
+   127664128    31168 mm/page_ext.c:270 func:alloc_page_ext
+    56373248     4737 mm/slub.c:2259 func:alloc_slab_page
+    14880768     3633 mm/readahead.c:247 func:page_cache_ra_unbounded
+    14417920     3520 mm/mm_init.c:2530 func:alloc_large_system_hash
+    13377536      234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs
+    11718656     2861 mm/filemap.c:1919 func:__filemap_get_folio
+     9192960     2800 kernel/fork.c:307 func:alloc_thread_stack_node
+     4206592        4 net/netfilter/nf_conntrack_core.c:2567 func:nf_ct_alloc_hashtable
+     4136960     1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod] func:ctagmod_start
+     3940352      962 mm/memory.c:4214 func:alloc_anon_folio
+     2894464    22613 fs/kernfs/dir.c:615 func:__kernfs_new_node
+     ...
+
+
 meminfo
 ~~~~~~~
 
diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
new file mode 100644
index 000000000000..64f536b80380
--- /dev/null
+++ b/include/asm-generic/codetag.lds.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_GENERIC_CODETAG_LDS_H
+#define __ASM_GENERIC_CODETAG_LDS_H
+
+#define SECTION_WITH_BOUNDARIES(_name)	\
+	. = ALIGN(8);			\
+	__start_##_name = .;		\
+	KEEP(*(_name))			\
+	__stop_##_name = .;
+
+#define CODETAG_SECTIONS()		\
+	SECTION_WITH_BOUNDARIES(alloc_tags)
+
+#endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5dd3a61d673d..c9997dc50c50 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -50,6 +50,8 @@
  * [__nosave_begin, __nosave_end] for the nosave data
  */
 
+#include <asm-generic/codetag.lds.h>
+
 #ifndef LOAD_OFFSET
 #define LOAD_OFFSET 0
 #endif
@@ -366,6 +368,7 @@
 	. = ALIGN(8);						\
 	BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes)	\
 	BOUNDED_SECTION_BY(__dyndbg, ___dyndbg)			\
+	CODETAG_SECTIONS()					\
 	LIKELY_PROFILE()					\
 	BRANCH_PROFILE()					\
 	TRACE_PRINTKS()						\
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
new file mode 100644
index 000000000000..be3ba955846c
--- /dev/null
+++ b/include/linux/alloc_tag.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * allocation tagging
+ */
+#ifndef _LINUX_ALLOC_TAG_H
+#define _LINUX_ALLOC_TAG_H
+
+#include <linux/bug.h>
+#include <linux/codetag.h>
+#include <linux/container_of.h>
+#include <linux/preempt.h>
+#include <asm/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/static_key.h>
+
+struct alloc_tag_counters {
+	u64 bytes;
+	u64 calls;
+};
+
+/*
+ * An instance of this structure is created in a special ELF section at every
+ * allocation callsite. At runtime, the special section is treated as
+ * an array of these. Embedded codetag utilizes codetag framework.
+ */
+struct alloc_tag {
+	struct codetag ct;
+	struct alloc_tag_counters __percpu *counters;
+} __aligned(8);
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
+{
+	return container_of(ct, struct alloc_tag, ct);
+}
+
+#ifdef ARCH_NEEDS_WEAK_PER_CPU
+/*
+ * When percpu variables are required to be defined as weak, static percpu
+ * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
+ */
+#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU"
+#endif
+
+#define DEFINE_ALLOC_TAG(_alloc_tag)					\
+	static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \
+	static struct alloc_tag _alloc_tag __used __aligned(8)		\
+	__section("alloc_tags") = {					\
+		.ct = CODE_TAG_INIT,					\
+		.counters = &_alloc_tag_cntr };
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+			mem_alloc_profiling_key);
+
+static inline bool mem_alloc_profiling_enabled(void)
+{
+	return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+				   &mem_alloc_profiling_key);
+}
+
+static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
+{
+	struct alloc_tag_counters v = { 0, 0 };
+	struct alloc_tag_counters *counter;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		counter = per_cpu_ptr(tag->counters, cpu);
+		v.bytes += counter->bytes;
+		v.calls += counter->calls;
+	}
+
+	return v;
+}
+
+static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
+{
+	struct alloc_tag *tag;
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+	WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
+#endif
+	if (!ref || !ref->ct)
+		return;
+
+	tag = ct_to_alloc_tag(ref->ct);
+
+	this_cpu_sub(tag->counters->bytes, bytes);
+	this_cpu_dec(tag->counters->calls);
+
+	ref->ct = NULL;
+}
+
+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
+{
+	__alloc_tag_sub(ref, bytes);
+}
+
+static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
+{
+	__alloc_tag_sub(ref, bytes);
+}
+
+static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+{
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+	WARN_ONCE(ref && ref->ct,
+		  "alloc_tag was not cleared (got tag for %s:%u)\n",\
+		  ref->ct->filename, ref->ct->lineno);
+
+	WARN_ONCE(!tag, "current->alloc_tag not set");
+#endif
+	if (!ref || !tag)
+		return;
+
+	ref->ct = &tag->ct;
+	/*
+	 * We need to increment the call counter every time we have a new
+	 * allocation or when we split a large allocation into smaller ones.
+	 * Each new reference for every sub-allocation needs to increment call
+	 * counter because when we free each part the counter will be decremented.
+	 */
+	this_cpu_inc(tag->counters->calls);
+}
+
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
+{
+	alloc_tag_ref_set(ref, tag);
+	this_cpu_add(tag->counters->bytes, bytes);
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+#define DEFINE_ALLOC_TAG(_alloc_tag)
+static inline bool mem_alloc_profiling_enabled(void) { return false; }
+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
+static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {}
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
+				 size_t bytes) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
+#endif /* _LINUX_ALLOC_TAG_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..eede1f92bcc6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -770,6 +770,10 @@ struct task_struct {
 	unsigned int			flags;
 	unsigned int			ptrace;
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+	struct alloc_tag		*alloc_tag;
+#endif
+
 #ifdef CONFIG_SMP
 	int				on_cpu;
 	struct __call_single_node	wake_entry;
@@ -810,6 +814,7 @@ struct task_struct {
 	struct task_group		*sched_task_group;
 #endif
 
+
 #ifdef CONFIG_UCLAMP_TASK
 	/*
 	 * Clamp values requested for a scheduling entity.
@@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
+{
+	swap(current->alloc_tag, tag);
+	return tag;
+}
+
+static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
+{
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+	WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
+#endif
+	current->alloc_tag = old;
+}
+#else
+#define alloc_tag_save(_tag)		NULL
+#define alloc_tag_restore(_tag, _old)	do {} while (0)
+#endif
+
 #endif
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0be2d00c3696..78d258ca508f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -972,6 +972,31 @@ config CODE_TAGGING
 	bool
 	select KALLSYMS
 
+config MEM_ALLOC_PROFILING
+	bool "Enable memory allocation profiling"
+	default n
+	depends on PROC_FS
+	depends on !DEBUG_FORCE_WEAK_PER_CPU
+	select CODE_TAGGING
+	help
+	  Track allocation source code and record total allocation size
+	  initiated at that code location. The mechanism can be used to track
+	  memory leaks with a low performance and memory impact.
+
+config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+	bool "Enable memory allocation profiling by default"
+	default y
+	depends on MEM_ALLOC_PROFILING
+
+config MEM_ALLOC_PROFILING_DEBUG
+	bool "Memory allocation profiler debugging"
+	default n
+	depends on MEM_ALLOC_PROFILING
+	select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+	help
+	  Adds warnings with helpful error messages for memory allocation
+	  profiling.
+
 source "lib/Kconfig.kasan"
 source "lib/Kconfig.kfence"
 source "lib/Kconfig.kmsan"
diff --git a/lib/Makefile b/lib/Makefile
index 6b48b22fdfac..859112f09bf5 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 obj-$(CONFIG_CODE_TAGGING) += codetag.o
 
+obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o
+
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
new file mode 100644
index 000000000000..f09c8a422bc2
--- /dev/null
+++ b/lib/alloc_tag.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/alloc_tag.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+
+static struct codetag_type *alloc_tag_cttype;
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+			mem_alloc_profiling_key);
+
+static void *allocinfo_start(struct seq_file *m, loff_t *pos)
+{
+	struct codetag_iterator *iter;
+	struct codetag *ct;
+	loff_t node = *pos;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	m->private = iter;
+	if (!iter)
+		return NULL;
+
+	codetag_lock_module_list(alloc_tag_cttype, true);
+	*iter = codetag_get_ct_iter(alloc_tag_cttype);
+	while ((ct = codetag_next_ct(iter)) != NULL && node)
+		node--;
+
+	return ct ? iter : NULL;
+}
+
+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
+	struct codetag *ct = codetag_next_ct(iter);
+
+	(*pos)++;
+	if (!ct)
+		return NULL;
+
+	return iter;
+}
+
+static void allocinfo_stop(struct seq_file *m, void *arg)
+{
+	struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
+
+	if (iter) {
+		codetag_lock_module_list(alloc_tag_cttype, false);
+		kfree(iter);
+	}
+}
+
+static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
+{
+	struct alloc_tag *tag = ct_to_alloc_tag(ct);
+	struct alloc_tag_counters counter = alloc_tag_read(tag);
+	s64 bytes = counter.bytes;
+
+	seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
+	codetag_to_text(out, ct);
+	seq_buf_putc(out, ' ');
+	seq_buf_putc(out, '\n');
+}
+
+static int allocinfo_show(struct seq_file *m, void *arg)
+{
+	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
+	char *bufp;
+	size_t n = seq_get_buf(m, &bufp);
+	struct seq_buf buf;
+
+	seq_buf_init(&buf, bufp, n);
+	alloc_tag_to_text(&buf, iter->ct);
+	seq_commit(m, seq_buf_used(&buf));
+	return 0;
+}
+
+static const struct seq_operations allocinfo_seq_op = {
+	.start	= allocinfo_start,
+	.next	= allocinfo_next,
+	.stop	= allocinfo_stop,
+	.show	= allocinfo_show,
+};
+
+static void __init procfs_init(void)
+{
+	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
+}
+
+static bool alloc_tag_module_unload(struct codetag_type *cttype,
+				    struct codetag_module *cmod)
+{
+	struct codetag_iterator iter = codetag_get_ct_iter(cttype);
+	struct alloc_tag_counters counter;
+	bool module_unused = true;
+	struct alloc_tag *tag;
+	struct codetag *ct;
+
+	for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
+		if (iter.cmod != cmod)
+			continue;
+
+		tag = ct_to_alloc_tag(ct);
+		counter = alloc_tag_read(tag);
+
+		if (WARN(counter.bytes,
+			 "%s:%u module %s func:%s has %llu allocated at module unload",
+			 ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
+			module_unused = false;
+	}
+
+	return module_unused;
+}
+
+static struct ctl_table memory_allocation_profiling_sysctls[] = {
+	{
+		.procname	= "mem_profiling",
+		.data		= &mem_alloc_profiling_key,
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+		.mode		= 0444,
+#else
+		.mode		= 0644,
+#endif
+		.proc_handler	= proc_do_static_key,
+	},
+	{ }
+};
+
+static int __init alloc_tag_init(void)
+{
+	const struct codetag_type_desc desc = {
+		.section	= "alloc_tags",
+		.tag_size	= sizeof(struct alloc_tag),
+		.module_unload	= alloc_tag_module_unload,
+	};
+
+	alloc_tag_cttype = codetag_register_type(&desc);
+	if (IS_ERR_OR_NULL(alloc_tag_cttype))
+		return PTR_ERR(alloc_tag_cttype);
+
+	register_sysctl_init("vm", memory_allocation_profiling_sysctls);
+	procfs_init();
+
+	return 0;
+}
+module_init(alloc_tag_init);
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index bf5bcf2836d8..45c67a0994f3 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -9,6 +9,8 @@
 #define DISCARD_EH_FRAME	*(.eh_frame)
 #endif
 
+#include <asm-generic/codetag.lds.h>
+
 SECTIONS {
 	/DISCARD/ : {
 		*(.discard)
@@ -47,12 +49,17 @@ SECTIONS {
 	.data : {
 		*(.data .data.[0-9a-zA-Z_]*)
 		*(.data..L*)
+		CODETAG_SECTIONS()
 	}
 
 	.rodata : {
 		*(.rodata .rodata.[0-9a-zA-Z_]*)
 		*(.rodata..L*)
 	}
 
+#else
+	.data : {
+		CODETAG_SECTIONS()
+	}
 #endif
 }
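To illustrate how the alloc_tag_save()/alloc_tag_restore() helpers added to
include/linux/sched.h above are meant to be used: they swap a call site's
tag into current->alloc_tag for the duration of an allocation so that code
deeper in the allocator can attribute the memory to that site. A hedged
sketch of the kind of wrapper they enable (the name alloc_with_tag is made
up here; the series' real alloc_hooks() macro is added in a later patch and
may differ):

	/* Sketch: run one allocation expression under a given tag. */
	#define alloc_with_tag(_tag, _do_alloc)					\
	({									\
		struct alloc_tag *_old = alloc_tag_save(_tag);			\
		typeof(_do_alloc) _res = _do_alloc;				\
		alloc_tag_restore(_tag, _old);					\
		_res;								\
	})

With CONFIG_MEM_ALLOC_PROFILING=n, alloc_tag_save() becomes NULL and
alloc_tag_restore() becomes a no-op, so such a wrapper compiles down to the
bare allocation call.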