Message ID | 20240212213922.783301-14-surenb@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Memory allocation profiling | expand |
On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote: > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily > instrument memory allocators. It registers an "alloc_tags" codetag type > with /proc/allocinfo interface to output allocation tag information when Please don't add anything new to the top-level /proc directory. This should likely live in /sys. > the feature is enabled. > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory > allocation profiling instrumentation. > Memory allocation profiling can be enabled or disabled at runtime using > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n. > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation > profiling by default. > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev> > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> > --- > Documentation/admin-guide/sysctl/vm.rst | 16 +++ > Documentation/filesystems/proc.rst | 28 +++++ > include/asm-generic/codetag.lds.h | 14 +++ > include/asm-generic/vmlinux.lds.h | 3 + > include/linux/alloc_tag.h | 133 ++++++++++++++++++++ > include/linux/sched.h | 24 ++++ > lib/Kconfig.debug | 25 ++++ > lib/Makefile | 2 + > lib/alloc_tag.c | 158 ++++++++++++++++++++++++ > scripts/module.lds.S | 7 ++ > 10 files changed, 410 insertions(+) > create mode 100644 include/asm-generic/codetag.lds.h > create mode 100644 include/linux/alloc_tag.h > create mode 100644 lib/alloc_tag.c > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst > index c59889de122b..a214719492ea 100644 > --- a/Documentation/admin-guide/sysctl/vm.rst > +++ b/Documentation/admin-guide/sysctl/vm.rst > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: > - legacy_va_layout > - lowmem_reserve_ratio > - max_map_count > +- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) > - memory_failure_early_kill 
> - memory_failure_recovery > - min_free_kbytes > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. > The default value is 65530. > > > +mem_profiling > +============== > + > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) > + > +1: Enable memory profiling. > + > +0: Disable memory profiling. > + > +Enabling memory profiling introduces a small performance overhead for all > +memory allocations. > + > +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. > + > + > memory_failure_early_kill: > ========================== > > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst > index 104c6d047d9b..40d6d18308e4 100644 > --- a/Documentation/filesystems/proc.rst > +++ b/Documentation/filesystems/proc.rst > @@ -688,6 +688,7 @@ files are there, and which are missing. > ============ =============================================================== > File Content > ============ =============================================================== > + allocinfo Memory allocations profiling information > apm Advanced power management info > bootconfig Kernel command line obtained from boot config, > and, if there were kernel parameters from the > @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be > reclaimed to achieve this. > > > +allocinfo > +~~~~~~~~~ > + > +Provides information about memory allocations at all locations in the code > +base. Each allocation in the code is identified by its source file, line > +number, module and the function calling the allocation. The number of bytes > +allocated at each location is reported. > + > +Example output. 
> + > +:: > + > + > cat /proc/allocinfo > + > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > + 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc > + 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare > + 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc > + 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func > + 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf > + ... > + > + > meminfo > ~~~~~~~ > > diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h > new file mode 100644 > index 000000000000..64f536b80380 > --- /dev/null > +++ b/include/asm-generic/codetag.lds.h > @@ -0,0 +1,14 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +#ifndef __ASM_GENERIC_CODETAG_LDS_H > +#define __ASM_GENERIC_CODETAG_LDS_H > + > +#define SECTION_WITH_BOUNDARIES(_name) \ > + . = ALIGN(8); \ > + __start_##_name = .; \ > + KEEP(*(_name)) \ > + __stop_##_name = .; > + > +#define CODETAG_SECTIONS() \ > + SECTION_WITH_BOUNDARIES(alloc_tags) > + > +#endif /* __ASM_GENERIC_CODETAG_LDS_H */ > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h > index 5dd3a61d673d..c9997dc50c50 100644 > --- a/include/asm-generic/vmlinux.lds.h > +++ b/include/asm-generic/vmlinux.lds.h > @@ -50,6 +50,8 @@ > * [__nosave_begin, __nosave_end] for the nosave data > */ > > +#include <asm-generic/codetag.lds.h> > + > #ifndef LOAD_OFFSET > #define LOAD_OFFSET 0 > #endif > @@ -366,6 +368,7 @@ > . 
= ALIGN(8); \ > BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ > BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ > + CODETAG_SECTIONS() \ > LIKELY_PROFILE() \ > BRANCH_PROFILE() \ > TRACE_PRINTKS() \ > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h > new file mode 100644 > index 000000000000..cf55a149fa84 > --- /dev/null > +++ b/include/linux/alloc_tag.h > @@ -0,0 +1,133 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * allocation tagging > + */ > +#ifndef _LINUX_ALLOC_TAG_H > +#define _LINUX_ALLOC_TAG_H > + > +#include <linux/bug.h> > +#include <linux/codetag.h> > +#include <linux/container_of.h> > +#include <linux/preempt.h> > +#include <asm/percpu.h> > +#include <linux/cpumask.h> > +#include <linux/static_key.h> > + > +struct alloc_tag_counters { > + u64 bytes; > + u64 calls; > +}; > + > +/* > + * An instance of this structure is created in a special ELF section at every > + * allocation callsite. At runtime, the special section is treated as > + * an array of these. Embedded codetag utilizes codetag framework. > + */ > +struct alloc_tag { > + struct codetag ct; > + struct alloc_tag_counters __percpu *counters; > +} __aligned(8); > + > +#ifdef CONFIG_MEM_ALLOC_PROFILING > + > +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) > +{ > + return container_of(ct, struct alloc_tag, ct); > +} > + > +#ifdef ARCH_NEEDS_WEAK_PER_CPU > +/* > + * When percpu variables are required to be defined as weak, static percpu > + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION). > + */ > +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU" Is this enforced via Kconfig as well? (Looks like only alpha and s390?) 
> +#endif > + > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) \ > + static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ > + static struct alloc_tag _alloc_tag __used __aligned(8) \ > + __section("alloc_tags") = { \ > + .ct = CODE_TAG_INIT, \ > + .counters = &_alloc_tag_cntr }; \ > + struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag) > + > +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > + mem_alloc_profiling_key); > + > +static inline bool mem_alloc_profiling_enabled(void) > +{ > + return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > + &mem_alloc_profiling_key); > +} > + > +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) > +{ > + struct alloc_tag_counters v = { 0, 0 }; > + struct alloc_tag_counters *counter; > + int cpu; > + > + for_each_possible_cpu(cpu) { > + counter = per_cpu_ptr(tag->counters, cpu); > + v.bytes += counter->bytes; > + v.calls += counter->calls; > + } > + > + return v; > +} > + > +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes) > +{ > + struct alloc_tag *tag; > + > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > + WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); > +#endif > + if (!ref || !ref->ct) > + return; > + > + tag = ct_to_alloc_tag(ref->ct); > + > + this_cpu_sub(tag->counters->bytes, bytes); > + this_cpu_dec(tag->counters->calls); > + > + ref->ct = NULL; > +} > + > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) > +{ > + __alloc_tag_sub(ref, bytes); > +} > + > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) > +{ > + __alloc_tag_sub(ref, bytes); > +} > + > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) > +{ > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > + WARN_ONCE(ref && ref->ct, > + "alloc_tag was not cleared (got tag for %s:%u)\n",\ > + ref->ct->filename, ref->ct->lineno); > + > + WARN_ONCE(!tag, 
"current->alloc_tag not set"); > +#endif > + if (!ref || !tag) > + return; > + > + ref->ct = &tag->ct; > + this_cpu_add(tag->counters->bytes, bytes); > + this_cpu_inc(tag->counters->calls); > +} > + > +#else > + > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {} > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, > + size_t bytes) {} > + > +#endif > + > +#endif /* _LINUX_ALLOC_TAG_H */ > diff --git a/include/linux/sched.h b/include/linux/sched.h > index ffe8f618ab86..da68a10517c8 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -770,6 +770,10 @@ struct task_struct { > unsigned int flags; > unsigned int ptrace; > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > + struct alloc_tag *alloc_tag; > +#endif Normally scheduling is very sensitive to having anything early in task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef area. > + > #ifdef CONFIG_SMP > int on_cpu; > struct __call_single_node wake_entry; > @@ -810,6 +814,7 @@ struct task_struct { > struct task_group *sched_task_group; > #endif > > + > #ifdef CONFIG_UCLAMP_TASK > /* > * Clamp values requested for a scheduling entity. 
> @@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } > > extern void sched_set_stop_task(int cpu, struct task_struct *stop); > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) > +{ > + swap(current->alloc_tag, tag); > + return tag; > +} > + > +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) > +{ > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); > +#endif > + current->alloc_tag = old; > +} > +#else > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; } > +#define alloc_tag_restore(_tag, _old) > +#endif > + > #endif > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > index 0be2d00c3696..78d258ca508f 100644 > --- a/lib/Kconfig.debug > +++ b/lib/Kconfig.debug > @@ -972,6 +972,31 @@ config CODE_TAGGING > bool > select KALLSYMS > > +config MEM_ALLOC_PROFILING > + bool "Enable memory allocation profiling" > + default n > + depends on PROC_FS > + depends on !DEBUG_FORCE_WEAK_PER_CPU > + select CODE_TAGGING > + help > + Track allocation source code and record total allocation size > + initiated at that code location. The mechanism can be used to track > + memory leaks with a low performance and memory impact. > + > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > + bool "Enable memory allocation profiling by default" > + default y > + depends on MEM_ALLOC_PROFILING > + > +config MEM_ALLOC_PROFILING_DEBUG > + bool "Memory allocation profiler debugging" > + default n > + depends on MEM_ALLOC_PROFILING > + select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > + help > + Adds warnings with helpful error messages for memory allocation > + profiling. 
> + > source "lib/Kconfig.kasan" > source "lib/Kconfig.kfence" > source "lib/Kconfig.kmsan" > diff --git a/lib/Makefile b/lib/Makefile > index 6b48b22fdfac..859112f09bf5 100644 > --- a/lib/Makefile > +++ b/lib/Makefile > @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ > obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o > > obj-$(CONFIG_CODE_TAGGING) += codetag.o > +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o > + > lib-$(CONFIG_GENERIC_BUG) += bug.o > > obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c > new file mode 100644 > index 000000000000..4fc031f9cefd > --- /dev/null > +++ b/lib/alloc_tag.c > @@ -0,0 +1,158 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +#include <linux/alloc_tag.h> > +#include <linux/fs.h> > +#include <linux/gfp.h> > +#include <linux/module.h> > +#include <linux/proc_fs.h> > +#include <linux/seq_buf.h> > +#include <linux/seq_file.h> > + > +static struct codetag_type *alloc_tag_cttype; > + > +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > + mem_alloc_profiling_key); > + > +static void *allocinfo_start(struct seq_file *m, loff_t *pos) > +{ > + struct codetag_iterator *iter; > + struct codetag *ct; > + loff_t node = *pos; > + > + iter = kzalloc(sizeof(*iter), GFP_KERNEL); > + m->private = iter; > + if (!iter) > + return NULL; > + > + codetag_lock_module_list(alloc_tag_cttype, true); > + *iter = codetag_get_ct_iter(alloc_tag_cttype); > + while ((ct = codetag_next_ct(iter)) != NULL && node) > + node--; > + > + return ct ? 
iter : NULL; > +} > + > +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) > +{ > + struct codetag_iterator *iter = (struct codetag_iterator *)arg; > + struct codetag *ct = codetag_next_ct(iter); > + > + (*pos)++; > + if (!ct) > + return NULL; > + > + return iter; > +} > + > +static void allocinfo_stop(struct seq_file *m, void *arg) > +{ > + struct codetag_iterator *iter = (struct codetag_iterator *)m->private; > + > + if (iter) { > + codetag_lock_module_list(alloc_tag_cttype, false); > + kfree(iter); > + } > +} > + > +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) > +{ > + struct alloc_tag *tag = ct_to_alloc_tag(ct); > + struct alloc_tag_counters counter = alloc_tag_read(tag); > + s64 bytes = counter.bytes; > + char val[10], *p = val; > + > + if (bytes < 0) { > + *p++ = '-'; > + bytes = -bytes; > + } > + > + string_get_size(bytes, 1, > + STRING_SIZE_BASE2|STRING_SIZE_NOSPACE, > + p, val + ARRAY_SIZE(val) - p); > + > + seq_buf_printf(out, "%8s %8llu ", val, counter.calls); > + codetag_to_text(out, ct); > + seq_buf_putc(out, ' '); > + seq_buf_putc(out, '\n'); > +} /me does happy seq_buf dance! > + > +static int allocinfo_show(struct seq_file *m, void *arg) > +{ > + struct codetag_iterator *iter = (struct codetag_iterator *)arg; > + char *bufp; > + size_t n = seq_get_buf(m, &bufp); > + struct seq_buf buf; > + > + seq_buf_init(&buf, bufp, n); > + alloc_tag_to_text(&buf, iter->ct); > + seq_commit(m, seq_buf_used(&buf)); > + return 0; > +} > + > +static const struct seq_operations allocinfo_seq_op = { > + .start = allocinfo_start, > + .next = allocinfo_next, > + .stop = allocinfo_stop, > + .show = allocinfo_show, > +}; > + > +static void __init procfs_init(void) > +{ > + proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); > +} As mentioned, this really should be in /sys somewhere. 
> + > +static bool alloc_tag_module_unload(struct codetag_type *cttype, > + struct codetag_module *cmod) > +{ > + struct codetag_iterator iter = codetag_get_ct_iter(cttype); > + struct alloc_tag_counters counter; > + bool module_unused = true; > + struct alloc_tag *tag; > + struct codetag *ct; > + > + for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { > + if (iter.cmod != cmod) > + continue; > + > + tag = ct_to_alloc_tag(ct); > + counter = alloc_tag_read(tag); > + > + if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload", > + ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) > + module_unused = false; > + } > + > + return module_unused; > +} > + > +static struct ctl_table memory_allocation_profiling_sysctls[] = { > + { > + .procname = "mem_profiling", > + .data = &mem_alloc_profiling_key, > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > + .mode = 0444, > +#else > + .mode = 0644, > +#endif > + .proc_handler = proc_do_static_key, > + }, > + { } > +}; > + > +static int __init alloc_tag_init(void) > +{ > + const struct codetag_type_desc desc = { > + .section = "alloc_tags", > + .tag_size = sizeof(struct alloc_tag), > + .module_unload = alloc_tag_module_unload, > + }; > + > + alloc_tag_cttype = codetag_register_type(&desc); > + if (IS_ERR_OR_NULL(alloc_tag_cttype)) > + return PTR_ERR(alloc_tag_cttype); > + > + register_sysctl_init("vm", memory_allocation_profiling_sysctls); > + procfs_init(); > + > + return 0; > +} > +module_init(alloc_tag_init); > diff --git a/scripts/module.lds.S b/scripts/module.lds.S > index bf5bcf2836d8..45c67a0994f3 100644 > --- a/scripts/module.lds.S > +++ b/scripts/module.lds.S > @@ -9,6 +9,8 @@ > #define DISCARD_EH_FRAME *(.eh_frame) > #endif > > +#include <asm-generic/codetag.lds.h> > + > SECTIONS { > /DISCARD/ : { > *(.discard) > @@ -47,12 +49,17 @@ SECTIONS { > .data : { > *(.data .data.[0-9a-zA-Z_]*) > *(.data..L*) > + CODETAG_SECTIONS() > } > > .rodata : { > *(.rodata 
.rodata.[0-9a-zA-Z_]*) > *(.rodata..L*) > } > +#else > + .data : { > + CODETAG_SECTIONS() > + } > #endif > } Otherwise, looks good.
On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote: > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote: > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily > > instrument memory allocators. It registers an "alloc_tags" codetag type > > with /proc/allocinfo interface to output allocation tag information when > > Please don't add anything new to the top-level /proc directory. This > should likely live in /sys. Ack. I'll find a more appropriate place for it then. It just seemed like such generic information which would belong next to meminfo/zoneinfo and such... > > > the feature is enabled. > > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory > > allocation profiling instrumentation. > > Memory allocation profiling can be enabled or disabled at runtime using > > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n. > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation > > profiling by default. 
> > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev> > > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> > > --- > > Documentation/admin-guide/sysctl/vm.rst | 16 +++ > > Documentation/filesystems/proc.rst | 28 +++++ > > include/asm-generic/codetag.lds.h | 14 +++ > > include/asm-generic/vmlinux.lds.h | 3 + > > include/linux/alloc_tag.h | 133 ++++++++++++++++++++ > > include/linux/sched.h | 24 ++++ > > lib/Kconfig.debug | 25 ++++ > > lib/Makefile | 2 + > > lib/alloc_tag.c | 158 ++++++++++++++++++++++++ > > scripts/module.lds.S | 7 ++ > > 10 files changed, 410 insertions(+) > > create mode 100644 include/asm-generic/codetag.lds.h > > create mode 100644 include/linux/alloc_tag.h > > create mode 100644 lib/alloc_tag.c > > > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst > > index c59889de122b..a214719492ea 100644 > > --- a/Documentation/admin-guide/sysctl/vm.rst > > +++ b/Documentation/admin-guide/sysctl/vm.rst > > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: > > - legacy_va_layout > > - lowmem_reserve_ratio > > - max_map_count > > +- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) > > - memory_failure_early_kill > > - memory_failure_recovery > > - min_free_kbytes > > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. > > The default value is 65530. > > > > > > +mem_profiling > > +============== > > + > > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) > > + > > +1: Enable memory profiling. > > + > > +0: Disabld memory profiling. > > + > > +Enabling memory profiling introduces a small performance overhead for all > > +memory allocations. > > + > > +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. 
> > + > > + > > memory_failure_early_kill: > > ========================== > > > > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst > > index 104c6d047d9b..40d6d18308e4 100644 > > --- a/Documentation/filesystems/proc.rst > > +++ b/Documentation/filesystems/proc.rst > > @@ -688,6 +688,7 @@ files are there, and which are missing. > > ============ =============================================================== > > File Content > > ============ =============================================================== > > + allocinfo Memory allocations profiling information > > apm Advanced power management info > > bootconfig Kernel command line obtained from boot config, > > and, if there were kernel parameters from the > > @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be > > reclaimed to achieve this. > > > > > > +allocinfo > > +~~~~~~~ > > + > > +Provides information about memory allocations at all locations in the code > > +base. Each allocation in the code is identified by its source file, line > > +number, module and the function calling the allocation. The number of bytes > > +allocated at each location is reported. > > + > > +Example output. 
> > + > > +:: > > + > > + > cat /proc/allocinfo > > + > > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > > + 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc > > + 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare > > + 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc > > + 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func > > + 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf > > + ... > > + > > + > > meminfo > > ~~~~~~~ > > > > diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h > > new file mode 100644 > > index 000000000000..64f536b80380 > > --- /dev/null > > +++ b/include/asm-generic/codetag.lds.h > > @@ -0,0 +1,14 @@ > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > +#ifndef __ASM_GENERIC_CODETAG_LDS_H > > +#define __ASM_GENERIC_CODETAG_LDS_H > > + > > +#define SECTION_WITH_BOUNDARIES(_name) \ > > + . = ALIGN(8); \ > > + __start_##_name = .; \ > > + KEEP(*(_name)) \ > > + __stop_##_name = .; > > + > > +#define CODETAG_SECTIONS() \ > > + SECTION_WITH_BOUNDARIES(alloc_tags) > > + > > +#endif /* __ASM_GENERIC_CODETAG_LDS_H */ > > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h > > index 5dd3a61d673d..c9997dc50c50 100644 > > --- a/include/asm-generic/vmlinux.lds.h > > +++ b/include/asm-generic/vmlinux.lds.h > > @@ -50,6 +50,8 @@ > > * [__nosave_begin, __nosave_end] for the nosave data > > */ > > > > +#include <asm-generic/codetag.lds.h> > > + > > #ifndef LOAD_OFFSET > > #define LOAD_OFFSET 0 > > #endif > > @@ -366,6 +368,7 @@ > > . 
= ALIGN(8); \ > > BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ > > BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ > > + CODETAG_SECTIONS() \ > > LIKELY_PROFILE() \ > > BRANCH_PROFILE() \ > > TRACE_PRINTKS() \ > > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h > > new file mode 100644 > > index 000000000000..cf55a149fa84 > > --- /dev/null > > +++ b/include/linux/alloc_tag.h > > @@ -0,0 +1,133 @@ > > +/* SPDX-License-Identifier: GPL-2.0 */ > > +/* > > + * allocation tagging > > + */ > > +#ifndef _LINUX_ALLOC_TAG_H > > +#define _LINUX_ALLOC_TAG_H > > + > > +#include <linux/bug.h> > > +#include <linux/codetag.h> > > +#include <linux/container_of.h> > > +#include <linux/preempt.h> > > +#include <asm/percpu.h> > > +#include <linux/cpumask.h> > > +#include <linux/static_key.h> > > + > > +struct alloc_tag_counters { > > + u64 bytes; > > + u64 calls; > > +}; > > + > > +/* > > + * An instance of this structure is created in a special ELF section at every > > + * allocation callsite. At runtime, the special section is treated as > > + * an array of these. Embedded codetag utilizes codetag framework. > > + */ > > +struct alloc_tag { > > + struct codetag ct; > > + struct alloc_tag_counters __percpu *counters; > > +} __aligned(8); > > + > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > + > > +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) > > +{ > > + return container_of(ct, struct alloc_tag, ct); > > +} > > + > > +#ifdef ARCH_NEEDS_WEAK_PER_CPU > > +/* > > + * When percpu variables are required to be defined as weak, static percpu > > + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION). > > + */ > > +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU" > > Is this enforced via Kconfig as well? (Looks like only alpha and s390?) 
Unfortunately ARCH_NEEDS_WEAK_PER_CPU is not a Kconfig option but CONFIG_DEBUG_FORCE_WEAK_PER_CPU is, so that one is handled via Kconfig (see "depends on !DEBUG_FORCE_WEAK_PER_CPU" in this patch). We have to avoid both cases because of this: https://elixir.bootlin.com/linux/latest/source/include/linux/percpu-defs.h#L75, so I'm trying to provide an informative error here. > > > +#endif > > + > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) \ > > + static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ > > + static struct alloc_tag _alloc_tag __used __aligned(8) \ > > + __section("alloc_tags") = { \ > > + .ct = CODE_TAG_INIT, \ > > + .counters = &_alloc_tag_cntr }; \ > > + struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag) > > + > > +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > > + mem_alloc_profiling_key); > > + > > +static inline bool mem_alloc_profiling_enabled(void) > > +{ > > + return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > > + &mem_alloc_profiling_key); > > +} > > + > > +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) > > +{ > > + struct alloc_tag_counters v = { 0, 0 }; > > + struct alloc_tag_counters *counter; > > + int cpu; > > + > > + for_each_possible_cpu(cpu) { > > + counter = per_cpu_ptr(tag->counters, cpu); > > + v.bytes += counter->bytes; > > + v.calls += counter->calls; > > + } > > + > > + return v; > > +} > > + > > +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes) > > +{ > > + struct alloc_tag *tag; > > + > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > + WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); > > +#endif > > + if (!ref || !ref->ct) > > + return; > > + > > + tag = ct_to_alloc_tag(ref->ct); > > + > > + this_cpu_sub(tag->counters->bytes, bytes); > > + this_cpu_dec(tag->counters->calls); > > + > > + ref->ct = NULL; > > +} > > + > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t 
bytes) > > +{ > > + __alloc_tag_sub(ref, bytes); > > +} > > + > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) > > +{ > > + __alloc_tag_sub(ref, bytes); > > +} > > + > > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) > > +{ > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > + WARN_ONCE(ref && ref->ct, > > + "alloc_tag was not cleared (got tag for %s:%u)\n",\ > > + ref->ct->filename, ref->ct->lineno); > > + > > + WARN_ONCE(!tag, "current->alloc_tag not set"); > > +#endif > > + if (!ref || !tag) > > + return; > > + > > + ref->ct = &tag->ct; > > + this_cpu_add(tag->counters->bytes, bytes); > > + this_cpu_inc(tag->counters->calls); > > +} > > + > > +#else > > + > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {} > > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, > > + size_t bytes) {} > > + > > +#endif > > + > > +#endif /* _LINUX_ALLOC_TAG_H */ > > diff --git a/include/linux/sched.h b/include/linux/sched.h > > index ffe8f618ab86..da68a10517c8 100644 > > --- a/include/linux/sched.h > > +++ b/include/linux/sched.h > > @@ -770,6 +770,10 @@ struct task_struct { > > unsigned int flags; > > unsigned int ptrace; > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > + struct alloc_tag *alloc_tag; > > +#endif > > Normally scheduling is very sensitive to having anything early in > task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef > area. Thanks for the warning! We will look into that. > > > + > > #ifdef CONFIG_SMP > > int on_cpu; > > struct __call_single_node wake_entry; > > @@ -810,6 +814,7 @@ struct task_struct { > > struct task_group *sched_task_group; > > #endif > > > > + > > #ifdef CONFIG_UCLAMP_TASK > > /* > > * Clamp values requested for a scheduling entity. 
> > @@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } > > > > extern void sched_set_stop_task(int cpu, struct task_struct *stop); > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) > > +{ > > + swap(current->alloc_tag, tag); > > + return tag; > > +} > > + > > +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) > > +{ > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); > > +#endif > > + current->alloc_tag = old; > > +} > > +#else > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; } > > +#define alloc_tag_restore(_tag, _old) > > +#endif > > + > > #endif > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > > index 0be2d00c3696..78d258ca508f 100644 > > --- a/lib/Kconfig.debug > > +++ b/lib/Kconfig.debug > > @@ -972,6 +972,31 @@ config CODE_TAGGING > > bool > > select KALLSYMS > > > > +config MEM_ALLOC_PROFILING > > + bool "Enable memory allocation profiling" > > + default n > > + depends on PROC_FS > > + depends on !DEBUG_FORCE_WEAK_PER_CPU > > + select CODE_TAGGING > > + help > > + Track allocation source code and record total allocation size > > + initiated at that code location. The mechanism can be used to track > > + memory leaks with a low performance and memory impact. > > + > > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > > + bool "Enable memory allocation profiling by default" > > + default y > > + depends on MEM_ALLOC_PROFILING > > + > > +config MEM_ALLOC_PROFILING_DEBUG > > + bool "Memory allocation profiler debugging" > > + default n > > + depends on MEM_ALLOC_PROFILING > > + select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > > + help > > + Adds warnings with helpful error messages for memory allocation > > + profiling. 
> > + > > source "lib/Kconfig.kasan" > > source "lib/Kconfig.kfence" > > source "lib/Kconfig.kmsan" > > diff --git a/lib/Makefile b/lib/Makefile > > index 6b48b22fdfac..859112f09bf5 100644 > > --- a/lib/Makefile > > +++ b/lib/Makefile > > @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ > > obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o > > > > obj-$(CONFIG_CODE_TAGGING) += codetag.o > > +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o > > + > > lib-$(CONFIG_GENERIC_BUG) += bug.o > > > > obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o > > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c > > new file mode 100644 > > index 000000000000..4fc031f9cefd > > --- /dev/null > > +++ b/lib/alloc_tag.c > > @@ -0,0 +1,158 @@ > > +// SPDX-License-Identifier: GPL-2.0-only > > +#include <linux/alloc_tag.h> > > +#include <linux/fs.h> > > +#include <linux/gfp.h> > > +#include <linux/module.h> > > +#include <linux/proc_fs.h> > > +#include <linux/seq_buf.h> > > +#include <linux/seq_file.h> > > + > > +static struct codetag_type *alloc_tag_cttype; > > + > > +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > > + mem_alloc_profiling_key); > > + > > +static void *allocinfo_start(struct seq_file *m, loff_t *pos) > > +{ > > + struct codetag_iterator *iter; > > + struct codetag *ct; > > + loff_t node = *pos; > > + > > + iter = kzalloc(sizeof(*iter), GFP_KERNEL); > > + m->private = iter; > > + if (!iter) > > + return NULL; > > + > > + codetag_lock_module_list(alloc_tag_cttype, true); > > + *iter = codetag_get_ct_iter(alloc_tag_cttype); > > + while ((ct = codetag_next_ct(iter)) != NULL && node) > > + node--; > > + > > + return ct ? 
iter : NULL; > > +} > > + > > +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) > > +{ > > + struct codetag_iterator *iter = (struct codetag_iterator *)arg; > > + struct codetag *ct = codetag_next_ct(iter); > > + > > + (*pos)++; > > + if (!ct) > > + return NULL; > > + > > + return iter; > > +} > > + > > +static void allocinfo_stop(struct seq_file *m, void *arg) > > +{ > > + struct codetag_iterator *iter = (struct codetag_iterator *)m->private; > > + > > + if (iter) { > > + codetag_lock_module_list(alloc_tag_cttype, false); > > + kfree(iter); > > + } > > +} > > + > > +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) > > +{ > > + struct alloc_tag *tag = ct_to_alloc_tag(ct); > > + struct alloc_tag_counters counter = alloc_tag_read(tag); > > + s64 bytes = counter.bytes; > > + char val[10], *p = val; > > + > > + if (bytes < 0) { > > + *p++ = '-'; > > + bytes = -bytes; > > + } > > + > > + string_get_size(bytes, 1, > > + STRING_SIZE_BASE2|STRING_SIZE_NOSPACE, > > + p, val + ARRAY_SIZE(val) - p); > > + > > + seq_buf_printf(out, "%8s %8llu ", val, counter.calls); > > + codetag_to_text(out, ct); > > + seq_buf_putc(out, ' '); > > + seq_buf_putc(out, '\n'); > > +} > > /me does happy seq_buf dance! 
> > > + > > +static int allocinfo_show(struct seq_file *m, void *arg) > > +{ > > + struct codetag_iterator *iter = (struct codetag_iterator *)arg; > > + char *bufp; > > + size_t n = seq_get_buf(m, &bufp); > > + struct seq_buf buf; > > + > > + seq_buf_init(&buf, bufp, n); > > + alloc_tag_to_text(&buf, iter->ct); > > + seq_commit(m, seq_buf_used(&buf)); > > + return 0; > > +} > > + > > +static const struct seq_operations allocinfo_seq_op = { > > + .start = allocinfo_start, > > + .next = allocinfo_next, > > + .stop = allocinfo_stop, > > + .show = allocinfo_show, > > +}; > > + > > +static void __init procfs_init(void) > > +{ > > + proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); > > +} > > As mentioned, this really should be in /sys somewhere. Ack. > > > + > > +static bool alloc_tag_module_unload(struct codetag_type *cttype, > > + struct codetag_module *cmod) > > +{ > > + struct codetag_iterator iter = codetag_get_ct_iter(cttype); > > + struct alloc_tag_counters counter; > > + bool module_unused = true; > > + struct alloc_tag *tag; > > + struct codetag *ct; > > + > > + for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { > > + if (iter.cmod != cmod) > > + continue; > > + > > + tag = ct_to_alloc_tag(ct); > > + counter = alloc_tag_read(tag); > > + > > + if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload", > > + ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) > > + module_unused = false; > > + } > > + > > + return module_unused; > > +} > > + > > +static struct ctl_table memory_allocation_profiling_sysctls[] = { > > + { > > + .procname = "mem_profiling", > > + .data = &mem_alloc_profiling_key, > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > + .mode = 0444, > > +#else > > + .mode = 0644, > > +#endif > > + .proc_handler = proc_do_static_key, > > + }, > > + { } > > +}; > > + > > +static int __init alloc_tag_init(void) > > +{ > > + const struct codetag_type_desc desc = { > > + .section = 
"alloc_tags", > > + .tag_size = sizeof(struct alloc_tag), > > + .module_unload = alloc_tag_module_unload, > > + }; > > + > > + alloc_tag_cttype = codetag_register_type(&desc); > > + if (IS_ERR_OR_NULL(alloc_tag_cttype)) > > + return PTR_ERR(alloc_tag_cttype); > > + > > + register_sysctl_init("vm", memory_allocation_profiling_sysctls); > > + procfs_init(); > > + > > + return 0; > > +} > > +module_init(alloc_tag_init); > > diff --git a/scripts/module.lds.S b/scripts/module.lds.S > > index bf5bcf2836d8..45c67a0994f3 100644 > > --- a/scripts/module.lds.S > > +++ b/scripts/module.lds.S > > @@ -9,6 +9,8 @@ > > #define DISCARD_EH_FRAME *(.eh_frame) > > #endif > > > > +#include <asm-generic/codetag.lds.h> > > + > > SECTIONS { > > /DISCARD/ : { > > *(.discard) > > @@ -47,12 +49,17 @@ SECTIONS { > > .data : { > > *(.data .data.[0-9a-zA-Z_]*) > > *(.data..L*) > > + CODETAG_SECTIONS() > > } > > > > .rodata : { > > *(.rodata .rodata.[0-9a-zA-Z_]*) > > *(.rodata..L*) > > } > > +#else > > + .data : { > > + CODETAG_SECTIONS() > > + } > > #endif > > } > > Otherwise, looks good. Thanks! > > -- > Kees Cook
On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote: > On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote: > > > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote: > > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily > > > instrument memory allocators. It registers an "alloc_tags" codetag type > > > with /proc/allocinfo interface to output allocation tag information when > > > > Please don't add anything new to the top-level /proc directory. This > > should likely live in /sys. > > Ack. I'll find a more appropriate place for it then. > It just seemed like such generic information which would belong next > to meminfo/zoneinfo and such... Save yourself a cycle of "rework the whole fs interface only to have someone else tell you no" and put it in debugfs, not sysfs. Wrangling with debugfs is easier than all the macro-happy sysfs stuff; you don't have to integrate with the "device" model; and there is no 'one value per file' rule. --D > > > > > the feature is enabled. > > > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory > > > allocation profiling instrumentation. > > > Memory allocation profiling can be enabled or disabled at runtime using > > > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n. > > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation > > > profiling by default. 
> > > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev> > > > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> > > > --- > > > Documentation/admin-guide/sysctl/vm.rst | 16 +++ > > > Documentation/filesystems/proc.rst | 28 +++++ > > > include/asm-generic/codetag.lds.h | 14 +++ > > > include/asm-generic/vmlinux.lds.h | 3 + > > > include/linux/alloc_tag.h | 133 ++++++++++++++++++++ > > > include/linux/sched.h | 24 ++++ > > > lib/Kconfig.debug | 25 ++++ > > > lib/Makefile | 2 + > > > lib/alloc_tag.c | 158 ++++++++++++++++++++++++ > > > scripts/module.lds.S | 7 ++ > > > 10 files changed, 410 insertions(+) > > > create mode 100644 include/asm-generic/codetag.lds.h > > > create mode 100644 include/linux/alloc_tag.h > > > create mode 100644 lib/alloc_tag.c > > > > > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst > > > index c59889de122b..a214719492ea 100644 > > > --- a/Documentation/admin-guide/sysctl/vm.rst > > > +++ b/Documentation/admin-guide/sysctl/vm.rst > > > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: > > > - legacy_va_layout > > > - lowmem_reserve_ratio > > > - max_map_count > > > +- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) > > > - memory_failure_early_kill > > > - memory_failure_recovery > > > - min_free_kbytes > > > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. > > > The default value is 65530. > > > > > > > > > +mem_profiling > > > +============== > > > + > > > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) > > > + > > > +1: Enable memory profiling. > > > + > > > +0: Disabld memory profiling. > > > + > > > +Enabling memory profiling introduces a small performance overhead for all > > > +memory allocations. > > > + > > > +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. 
> > > + > > > + > > > memory_failure_early_kill: > > > ========================== > > > > > > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst > > > index 104c6d047d9b..40d6d18308e4 100644 > > > --- a/Documentation/filesystems/proc.rst > > > +++ b/Documentation/filesystems/proc.rst > > > @@ -688,6 +688,7 @@ files are there, and which are missing. > > > ============ =============================================================== > > > File Content > > > ============ =============================================================== > > > + allocinfo Memory allocations profiling information > > > apm Advanced power management info > > > bootconfig Kernel command line obtained from boot config, > > > and, if there were kernel parameters from the > > > @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be > > > reclaimed to achieve this. > > > > > > > > > +allocinfo > > > +~~~~~~~ > > > + > > > +Provides information about memory allocations at all locations in the code > > > +base. Each allocation in the code is identified by its source file, line > > > +number, module and the function calling the allocation. The number of bytes > > > +allocated at each location is reported. > > > + > > > +Example output. 
> > > + > > > +:: > > > + > > > + > cat /proc/allocinfo > > > + > > > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > > > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > > > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > > > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > > > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > > > + 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc > > > + 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare > > > + 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc > > > + 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func > > > + 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf > > > + ... > > > + > > > + > > > meminfo > > > ~~~~~~~ > > > > > > diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h > > > new file mode 100644 > > > index 000000000000..64f536b80380 > > > --- /dev/null > > > +++ b/include/asm-generic/codetag.lds.h > > > @@ -0,0 +1,14 @@ > > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > > +#ifndef __ASM_GENERIC_CODETAG_LDS_H > > > +#define __ASM_GENERIC_CODETAG_LDS_H > > > + > > > +#define SECTION_WITH_BOUNDARIES(_name) \ > > > + . 
= ALIGN(8); \ > > > + __start_##_name = .; \ > > > + KEEP(*(_name)) \ > > > + __stop_##_name = .; > > > + > > > +#define CODETAG_SECTIONS() \ > > > + SECTION_WITH_BOUNDARIES(alloc_tags) > > > + > > > +#endif /* __ASM_GENERIC_CODETAG_LDS_H */ > > > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h > > > index 5dd3a61d673d..c9997dc50c50 100644 > > > --- a/include/asm-generic/vmlinux.lds.h > > > +++ b/include/asm-generic/vmlinux.lds.h > > > @@ -50,6 +50,8 @@ > > > * [__nosave_begin, __nosave_end] for the nosave data > > > */ > > > > > > +#include <asm-generic/codetag.lds.h> > > > + > > > #ifndef LOAD_OFFSET > > > #define LOAD_OFFSET 0 > > > #endif > > > @@ -366,6 +368,7 @@ > > > . = ALIGN(8); \ > > > BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ > > > BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ > > > + CODETAG_SECTIONS() \ > > > LIKELY_PROFILE() \ > > > BRANCH_PROFILE() \ > > > TRACE_PRINTKS() \ > > > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h > > > new file mode 100644 > > > index 000000000000..cf55a149fa84 > > > --- /dev/null > > > +++ b/include/linux/alloc_tag.h > > > @@ -0,0 +1,133 @@ > > > +/* SPDX-License-Identifier: GPL-2.0 */ > > > +/* > > > + * allocation tagging > > > + */ > > > +#ifndef _LINUX_ALLOC_TAG_H > > > +#define _LINUX_ALLOC_TAG_H > > > + > > > +#include <linux/bug.h> > > > +#include <linux/codetag.h> > > > +#include <linux/container_of.h> > > > +#include <linux/preempt.h> > > > +#include <asm/percpu.h> > > > +#include <linux/cpumask.h> > > > +#include <linux/static_key.h> > > > + > > > +struct alloc_tag_counters { > > > + u64 bytes; > > > + u64 calls; > > > +}; > > > + > > > +/* > > > + * An instance of this structure is created in a special ELF section at every > > > + * allocation callsite. At runtime, the special section is treated as > > > + * an array of these. Embedded codetag utilizes codetag framework. 
> > > + */ > > > +struct alloc_tag { > > > + struct codetag ct; > > > + struct alloc_tag_counters __percpu *counters; > > > +} __aligned(8); > > > + > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > + > > > +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) > > > +{ > > > + return container_of(ct, struct alloc_tag, ct); > > > +} > > > + > > > +#ifdef ARCH_NEEDS_WEAK_PER_CPU > > > +/* > > > + * When percpu variables are required to be defined as weak, static percpu > > > + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION). > > > + */ > > > +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU" > > > > Is this enforced via Kconfig as well? (Looks like only alpha and s390?) > > Unfortunately ARCH_NEEDS_WEAK_PER_CPU is not a Kconfig option but > CONFIG_DEBUG_FORCE_WEAK_PER_CPU is, so that one is handled via Kconfig > (see "depends on !DEBUG_FORCE_WEAK_PER_CPU" in this patch). We have to > avoid both cases because of this: > https://elixir.bootlin.com/linux/latest/source/include/linux/percpu-defs.h#L75, > so I'm trying to provide an informative error here. 
> > > > > > +#endif > > > + > > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) \ > > > + static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ > > > + static struct alloc_tag _alloc_tag __used __aligned(8) \ > > > + __section("alloc_tags") = { \ > > > + .ct = CODE_TAG_INIT, \ > > > + .counters = &_alloc_tag_cntr }; \ > > > + struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag) > > > + > > > +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > > > + mem_alloc_profiling_key); > > > + > > > +static inline bool mem_alloc_profiling_enabled(void) > > > +{ > > > + return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > > > + &mem_alloc_profiling_key); > > > +} > > > + > > > +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) > > > +{ > > > + struct alloc_tag_counters v = { 0, 0 }; > > > + struct alloc_tag_counters *counter; > > > + int cpu; > > > + > > > + for_each_possible_cpu(cpu) { > > > + counter = per_cpu_ptr(tag->counters, cpu); > > > + v.bytes += counter->bytes; > > > + v.calls += counter->calls; > > > + } > > > + > > > + return v; > > > +} > > > + > > > +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes) > > > +{ > > > + struct alloc_tag *tag; > > > + > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > > + WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); > > > +#endif > > > + if (!ref || !ref->ct) > > > + return; > > > + > > > + tag = ct_to_alloc_tag(ref->ct); > > > + > > > + this_cpu_sub(tag->counters->bytes, bytes); > > > + this_cpu_dec(tag->counters->calls); > > > + > > > + ref->ct = NULL; > > > +} > > > + > > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) > > > +{ > > > + __alloc_tag_sub(ref, bytes); > > > +} > > > + > > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) > > > +{ > > > + __alloc_tag_sub(ref, bytes); > > > +} > > > + > > > +static inline void alloc_tag_add(union 
codetag_ref *ref, struct alloc_tag *tag, size_t bytes) > > > +{ > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > > + WARN_ONCE(ref && ref->ct, > > > + "alloc_tag was not cleared (got tag for %s:%u)\n",\ > > > + ref->ct->filename, ref->ct->lineno); > > > + > > > + WARN_ONCE(!tag, "current->alloc_tag not set"); > > > +#endif > > > + if (!ref || !tag) > > > + return; > > > + > > > + ref->ct = &tag->ct; > > > + this_cpu_add(tag->counters->bytes, bytes); > > > + this_cpu_inc(tag->counters->calls); > > > +} > > > + > > > +#else > > > + > > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) > > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} > > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {} > > > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, > > > + size_t bytes) {} > > > + > > > +#endif > > > + > > > +#endif /* _LINUX_ALLOC_TAG_H */ > > > diff --git a/include/linux/sched.h b/include/linux/sched.h > > > index ffe8f618ab86..da68a10517c8 100644 > > > --- a/include/linux/sched.h > > > +++ b/include/linux/sched.h > > > @@ -770,6 +770,10 @@ struct task_struct { > > > unsigned int flags; > > > unsigned int ptrace; > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > + struct alloc_tag *alloc_tag; > > > +#endif > > > > Normally scheduling is very sensitive to having anything early in > > task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef > > area. > > Thanks for the warning! We will look into that. > > > > > > + > > > #ifdef CONFIG_SMP > > > int on_cpu; > > > struct __call_single_node wake_entry; > > > @@ -810,6 +814,7 @@ struct task_struct { > > > struct task_group *sched_task_group; > > > #endif > > > > > > + > > > #ifdef CONFIG_UCLAMP_TASK > > > /* > > > * Clamp values requested for a scheduling entity. 
> > > @@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } > > > > > > extern void sched_set_stop_task(int cpu, struct task_struct *stop); > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) > > > +{ > > > + swap(current->alloc_tag, tag); > > > + return tag; > > > +} > > > + > > > +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) > > > +{ > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > > + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); > > > +#endif > > > + current->alloc_tag = old; > > > +} > > > +#else > > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; } > > > +#define alloc_tag_restore(_tag, _old) > > > +#endif > > > + > > > #endif > > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > > > index 0be2d00c3696..78d258ca508f 100644 > > > --- a/lib/Kconfig.debug > > > +++ b/lib/Kconfig.debug > > > @@ -972,6 +972,31 @@ config CODE_TAGGING > > > bool > > > select KALLSYMS > > > > > > +config MEM_ALLOC_PROFILING > > > + bool "Enable memory allocation profiling" > > > + default n > > > + depends on PROC_FS > > > + depends on !DEBUG_FORCE_WEAK_PER_CPU > > > + select CODE_TAGGING > > > + help > > > + Track allocation source code and record total allocation size > > > + initiated at that code location. The mechanism can be used to track > > > + memory leaks with a low performance and memory impact. 
> > > + > > > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > > > + bool "Enable memory allocation profiling by default" > > > + default y > > > + depends on MEM_ALLOC_PROFILING > > > + > > > +config MEM_ALLOC_PROFILING_DEBUG > > > + bool "Memory allocation profiler debugging" > > > + default n > > > + depends on MEM_ALLOC_PROFILING > > > + select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > > > + help > > > + Adds warnings with helpful error messages for memory allocation > > > + profiling. > > > + > > > source "lib/Kconfig.kasan" > > > source "lib/Kconfig.kfence" > > > source "lib/Kconfig.kmsan" > > > diff --git a/lib/Makefile b/lib/Makefile > > > index 6b48b22fdfac..859112f09bf5 100644 > > > --- a/lib/Makefile > > > +++ b/lib/Makefile > > > @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ > > > obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o > > > > > > obj-$(CONFIG_CODE_TAGGING) += codetag.o > > > +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o > > > + > > > lib-$(CONFIG_GENERIC_BUG) += bug.o > > > > > > obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o > > > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c > > > new file mode 100644 > > > index 000000000000..4fc031f9cefd > > > --- /dev/null > > > +++ b/lib/alloc_tag.c > > > @@ -0,0 +1,158 @@ > > > +// SPDX-License-Identifier: GPL-2.0-only > > > +#include <linux/alloc_tag.h> > > > +#include <linux/fs.h> > > > +#include <linux/gfp.h> > > > +#include <linux/module.h> > > > +#include <linux/proc_fs.h> > > > +#include <linux/seq_buf.h> > > > +#include <linux/seq_file.h> > > > + > > > +static struct codetag_type *alloc_tag_cttype; > > > + > > > +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > > > + mem_alloc_profiling_key); > > > + > > > +static void *allocinfo_start(struct seq_file *m, loff_t *pos) > > > +{ > > > + struct codetag_iterator *iter; > > > + struct codetag *ct; > > > + loff_t node = *pos; > > > + > > > + iter = kzalloc(sizeof(*iter), GFP_KERNEL); > > > 
+ m->private = iter; > > > + if (!iter) > > > + return NULL; > > > + > > > + codetag_lock_module_list(alloc_tag_cttype, true); > > > + *iter = codetag_get_ct_iter(alloc_tag_cttype); > > > + while ((ct = codetag_next_ct(iter)) != NULL && node) > > > + node--; > > > + > > > + return ct ? iter : NULL; > > > +} > > > + > > > +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) > > > +{ > > > + struct codetag_iterator *iter = (struct codetag_iterator *)arg; > > > + struct codetag *ct = codetag_next_ct(iter); > > > + > > > + (*pos)++; > > > + if (!ct) > > > + return NULL; > > > + > > > + return iter; > > > +} > > > + > > > +static void allocinfo_stop(struct seq_file *m, void *arg) > > > +{ > > > + struct codetag_iterator *iter = (struct codetag_iterator *)m->private; > > > + > > > + if (iter) { > > > + codetag_lock_module_list(alloc_tag_cttype, false); > > > + kfree(iter); > > > + } > > > +} > > > + > > > +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) > > > +{ > > > + struct alloc_tag *tag = ct_to_alloc_tag(ct); > > > + struct alloc_tag_counters counter = alloc_tag_read(tag); > > > + s64 bytes = counter.bytes; > > > + char val[10], *p = val; > > > + > > > + if (bytes < 0) { > > > + *p++ = '-'; > > > + bytes = -bytes; > > > + } > > > + > > > + string_get_size(bytes, 1, > > > + STRING_SIZE_BASE2|STRING_SIZE_NOSPACE, > > > + p, val + ARRAY_SIZE(val) - p); > > > + > > > + seq_buf_printf(out, "%8s %8llu ", val, counter.calls); > > > + codetag_to_text(out, ct); > > > + seq_buf_putc(out, ' '); > > > + seq_buf_putc(out, '\n'); > > > +} > > > > /me does happy seq_buf dance! 
> > > > > + > > > +static int allocinfo_show(struct seq_file *m, void *arg) > > > +{ > > > + struct codetag_iterator *iter = (struct codetag_iterator *)arg; > > > + char *bufp; > > > + size_t n = seq_get_buf(m, &bufp); > > > + struct seq_buf buf; > > > + > > > + seq_buf_init(&buf, bufp, n); > > > + alloc_tag_to_text(&buf, iter->ct); > > > + seq_commit(m, seq_buf_used(&buf)); > > > + return 0; > > > +} > > > + > > > +static const struct seq_operations allocinfo_seq_op = { > > > + .start = allocinfo_start, > > > + .next = allocinfo_next, > > > + .stop = allocinfo_stop, > > > + .show = allocinfo_show, > > > +}; > > > + > > > +static void __init procfs_init(void) > > > +{ > > > + proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); > > > +} > > > > As mentioned, this really should be in /sys somewhere. > > Ack. > > > > > > + > > > +static bool alloc_tag_module_unload(struct codetag_type *cttype, > > > + struct codetag_module *cmod) > > > +{ > > > + struct codetag_iterator iter = codetag_get_ct_iter(cttype); > > > + struct alloc_tag_counters counter; > > > + bool module_unused = true; > > > + struct alloc_tag *tag; > > > + struct codetag *ct; > > > + > > > + for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { > > > + if (iter.cmod != cmod) > > > + continue; > > > + > > > + tag = ct_to_alloc_tag(ct); > > > + counter = alloc_tag_read(tag); > > > + > > > + if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload", > > > + ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) > > > + module_unused = false; > > > + } > > > + > > > + return module_unused; > > > +} > > > + > > > +static struct ctl_table memory_allocation_profiling_sysctls[] = { > > > + { > > > + .procname = "mem_profiling", > > > + .data = &mem_alloc_profiling_key, > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > > + .mode = 0444, > > > +#else > > > + .mode = 0644, > > > +#endif > > > + .proc_handler = proc_do_static_key, > > > + }, > > > + 
{ } > > > +}; > > > + > > > +static int __init alloc_tag_init(void) > > > +{ > > > + const struct codetag_type_desc desc = { > > > + .section = "alloc_tags", > > > + .tag_size = sizeof(struct alloc_tag), > > > + .module_unload = alloc_tag_module_unload, > > > + }; > > > + > > > + alloc_tag_cttype = codetag_register_type(&desc); > > > + if (IS_ERR_OR_NULL(alloc_tag_cttype)) > > > + return PTR_ERR(alloc_tag_cttype); > > > + > > > + register_sysctl_init("vm", memory_allocation_profiling_sysctls); > > > + procfs_init(); > > > + > > > + return 0; > > > +} > > > +module_init(alloc_tag_init); > > > diff --git a/scripts/module.lds.S b/scripts/module.lds.S > > > index bf5bcf2836d8..45c67a0994f3 100644 > > > --- a/scripts/module.lds.S > > > +++ b/scripts/module.lds.S > > > @@ -9,6 +9,8 @@ > > > #define DISCARD_EH_FRAME *(.eh_frame) > > > #endif > > > > > > +#include <asm-generic/codetag.lds.h> > > > + > > > SECTIONS { > > > /DISCARD/ : { > > > *(.discard) > > > @@ -47,12 +49,17 @@ SECTIONS { > > > .data : { > > > *(.data .data.[0-9a-zA-Z_]*) > > > *(.data..L*) > > > + CODETAG_SECTIONS() > > > } > > > > > > .rodata : { > > > *(.rodata .rodata.[0-9a-zA-Z_]*) > > > *(.rodata..L*) > > > } > > > +#else > > > + .data : { > > > + CODETAG_SECTIONS() > > > + } > > > #endif > > > } > > > > Otherwise, looks good. > > Thanks! > > > > > -- > > Kees Cook >
On Tue, Feb 13, 2024 at 2:29 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote: > > On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote: > > > > > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote: > > > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily > > > > instrument memory allocators. It registers an "alloc_tags" codetag type > > > > with /proc/allocinfo interface to output allocation tag information when > > > > > > Please don't add anything new to the top-level /proc directory. This > > > should likely live in /sys. > > > > Ack. I'll find a more appropriate place for it then. > > It just seemed like such generic information which would belong next > > to meminfo/zoneinfo and such... > > Save yourself a cycle of "rework the whole fs interface only to have > someone else tell you no" and put it in debugfs, not sysfs. Wrangling > with debugfs is easier than all the macro-happy sysfs stuff; you don't > have to integrate with the "device" model; and there is no 'one value > per file' rule. Thanks for the input. This file used to be in debugfs but reviewers felt it belonged in /proc if it's to be used in production environments. Some distros (like Android) disable debugfs in production. > > --D > > > > > > > > the feature is enabled. > > > > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory > > > > allocation profiling instrumentation. > > > > Memory allocation profiling can be enabled or disabled at runtime using > > > > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n. > > > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation > > > > profiling by default. 
> > > > > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > > > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev> > > > > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> > > > > --- > > > > Documentation/admin-guide/sysctl/vm.rst | 16 +++ > > > > Documentation/filesystems/proc.rst | 28 +++++ > > > > include/asm-generic/codetag.lds.h | 14 +++ > > > > include/asm-generic/vmlinux.lds.h | 3 + > > > > include/linux/alloc_tag.h | 133 ++++++++++++++++++++ > > > > include/linux/sched.h | 24 ++++ > > > > lib/Kconfig.debug | 25 ++++ > > > > lib/Makefile | 2 + > > > > lib/alloc_tag.c | 158 ++++++++++++++++++++++++ > > > > scripts/module.lds.S | 7 ++ > > > > 10 files changed, 410 insertions(+) > > > > create mode 100644 include/asm-generic/codetag.lds.h > > > > create mode 100644 include/linux/alloc_tag.h > > > > create mode 100644 lib/alloc_tag.c > > > > > > > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst > > > > index c59889de122b..a214719492ea 100644 > > > > --- a/Documentation/admin-guide/sysctl/vm.rst > > > > +++ b/Documentation/admin-guide/sysctl/vm.rst > > > > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: > > > > - legacy_va_layout > > > > - lowmem_reserve_ratio > > > > - max_map_count > > > > +- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) > > > > - memory_failure_early_kill > > > > - memory_failure_recovery > > > > - min_free_kbytes > > > > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. > > > > The default value is 65530. > > > > > > > > > > > > +mem_profiling > > > > +============== > > > > + > > > > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) > > > > + > > > > +1: Enable memory profiling. > > > > + > > > > +0: Disabld memory profiling. > > > > + > > > > +Enabling memory profiling introduces a small performance overhead for all > > > > +memory allocations. 
> > > > + > > > > +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. > > > > + > > > > + > > > > memory_failure_early_kill: > > > > ========================== > > > > > > > > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst > > > > index 104c6d047d9b..40d6d18308e4 100644 > > > > --- a/Documentation/filesystems/proc.rst > > > > +++ b/Documentation/filesystems/proc.rst > > > > @@ -688,6 +688,7 @@ files are there, and which are missing. > > > > ============ =============================================================== > > > > File Content > > > > ============ =============================================================== > > > > + allocinfo Memory allocations profiling information > > > > apm Advanced power management info > > > > bootconfig Kernel command line obtained from boot config, > > > > and, if there were kernel parameters from the > > > > @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be > > > > reclaimed to achieve this. > > > > > > > > > > > > +allocinfo > > > > +~~~~~~~ > > > > + > > > > +Provides information about memory allocations at all locations in the code > > > > +base. Each allocation in the code is identified by its source file, line > > > > +number, module and the function calling the allocation. The number of bytes > > > > +allocated at each location is reported. > > > > + > > > > +Example output. 
> > > > + > > > > +:: > > > > + > > > > + > cat /proc/allocinfo > > > > + > > > > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > > > > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > > > > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > > > > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > > > > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > > > > + 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc > > > > + 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare > > > > + 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc > > > > + 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func > > > > + 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf > > > > + ... > > > > + > > > > + > > > > meminfo > > > > ~~~~~~~ > > > > > > > > diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h > > > > new file mode 100644 > > > > index 000000000000..64f536b80380 > > > > --- /dev/null > > > > +++ b/include/asm-generic/codetag.lds.h > > > > @@ -0,0 +1,14 @@ > > > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > > > +#ifndef __ASM_GENERIC_CODETAG_LDS_H > > > > +#define __ASM_GENERIC_CODETAG_LDS_H > > > > + > > > > +#define SECTION_WITH_BOUNDARIES(_name) \ > > > > + . 
= ALIGN(8); \ > > > > + __start_##_name = .; \ > > > > + KEEP(*(_name)) \ > > > > + __stop_##_name = .; > > > > + > > > > +#define CODETAG_SECTIONS() \ > > > > + SECTION_WITH_BOUNDARIES(alloc_tags) > > > > + > > > > +#endif /* __ASM_GENERIC_CODETAG_LDS_H */ > > > > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h > > > > index 5dd3a61d673d..c9997dc50c50 100644 > > > > --- a/include/asm-generic/vmlinux.lds.h > > > > +++ b/include/asm-generic/vmlinux.lds.h > > > > @@ -50,6 +50,8 @@ > > > > * [__nosave_begin, __nosave_end] for the nosave data > > > > */ > > > > > > > > +#include <asm-generic/codetag.lds.h> > > > > + > > > > #ifndef LOAD_OFFSET > > > > #define LOAD_OFFSET 0 > > > > #endif > > > > @@ -366,6 +368,7 @@ > > > > . = ALIGN(8); \ > > > > BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ > > > > BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ > > > > + CODETAG_SECTIONS() \ > > > > LIKELY_PROFILE() \ > > > > BRANCH_PROFILE() \ > > > > TRACE_PRINTKS() \ > > > > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h > > > > new file mode 100644 > > > > index 000000000000..cf55a149fa84 > > > > --- /dev/null > > > > +++ b/include/linux/alloc_tag.h > > > > @@ -0,0 +1,133 @@ > > > > +/* SPDX-License-Identifier: GPL-2.0 */ > > > > +/* > > > > + * allocation tagging > > > > + */ > > > > +#ifndef _LINUX_ALLOC_TAG_H > > > > +#define _LINUX_ALLOC_TAG_H > > > > + > > > > +#include <linux/bug.h> > > > > +#include <linux/codetag.h> > > > > +#include <linux/container_of.h> > > > > +#include <linux/preempt.h> > > > > +#include <asm/percpu.h> > > > > +#include <linux/cpumask.h> > > > > +#include <linux/static_key.h> > > > > + > > > > +struct alloc_tag_counters { > > > > + u64 bytes; > > > > + u64 calls; > > > > +}; > > > > + > > > > +/* > > > > + * An instance of this structure is created in a special ELF section at every > > > > + * allocation callsite. At runtime, the special section is treated as > > > > + * an array of these. 
Embedded codetag utilizes codetag framework. > > > > + */ > > > > +struct alloc_tag { > > > > + struct codetag ct; > > > > + struct alloc_tag_counters __percpu *counters; > > > > +} __aligned(8); > > > > + > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > + > > > > +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) > > > > +{ > > > > + return container_of(ct, struct alloc_tag, ct); > > > > +} > > > > + > > > > +#ifdef ARCH_NEEDS_WEAK_PER_CPU > > > > +/* > > > > + * When percpu variables are required to be defined as weak, static percpu > > > > + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION). > > > > + */ > > > > +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU" > > > > > > Is this enforced via Kconfig as well? (Looks like only alpha and s390?) > > > > Unfortunately ARCH_NEEDS_WEAK_PER_CPU is not a Kconfig option but > > CONFIG_DEBUG_FORCE_WEAK_PER_CPU is, so that one is handled via Kconfig > > (see "depends on !DEBUG_FORCE_WEAK_PER_CPU" in this patch). We have to > > avoid both cases because of this: > > https://elixir.bootlin.com/linux/latest/source/include/linux/percpu-defs.h#L75, > > so I'm trying to provide an informative error here. 
> > > > > > > > > +#endif > > > > + > > > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) \ > > > > + static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ > > > > + static struct alloc_tag _alloc_tag __used __aligned(8) \ > > > > + __section("alloc_tags") = { \ > > > > + .ct = CODE_TAG_INIT, \ > > > > + .counters = &_alloc_tag_cntr }; \ > > > > + struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag) > > > > + > > > > +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > > > > + mem_alloc_profiling_key); > > > > + > > > > +static inline bool mem_alloc_profiling_enabled(void) > > > > +{ > > > > + return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > > > > + &mem_alloc_profiling_key); > > > > +} > > > > + > > > > +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) > > > > +{ > > > > + struct alloc_tag_counters v = { 0, 0 }; > > > > + struct alloc_tag_counters *counter; > > > > + int cpu; > > > > + > > > > + for_each_possible_cpu(cpu) { > > > > + counter = per_cpu_ptr(tag->counters, cpu); > > > > + v.bytes += counter->bytes; > > > > + v.calls += counter->calls; > > > > + } > > > > + > > > > + return v; > > > > +} > > > > + > > > > +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes) > > > > +{ > > > > + struct alloc_tag *tag; > > > > + > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > > > + WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); > > > > +#endif > > > > + if (!ref || !ref->ct) > > > > + return; > > > > + > > > > + tag = ct_to_alloc_tag(ref->ct); > > > > + > > > > + this_cpu_sub(tag->counters->bytes, bytes); > > > > + this_cpu_dec(tag->counters->calls); > > > > + > > > > + ref->ct = NULL; > > > > +} > > > > + > > > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) > > > > +{ > > > > + __alloc_tag_sub(ref, bytes); > > > > +} > > > > + > > > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, 
size_t bytes) > > > > +{ > > > > + __alloc_tag_sub(ref, bytes); > > > > +} > > > > + > > > > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) > > > > +{ > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > > > + WARN_ONCE(ref && ref->ct, > > > > + "alloc_tag was not cleared (got tag for %s:%u)\n",\ > > > > + ref->ct->filename, ref->ct->lineno); > > > > + > > > > + WARN_ONCE(!tag, "current->alloc_tag not set"); > > > > +#endif > > > > + if (!ref || !tag) > > > > + return; > > > > + > > > > + ref->ct = &tag->ct; > > > > + this_cpu_add(tag->counters->bytes, bytes); > > > > + this_cpu_inc(tag->counters->calls); > > > > +} > > > > + > > > > +#else > > > > + > > > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) > > > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} > > > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {} > > > > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, > > > > + size_t bytes) {} > > > > + > > > > +#endif > > > > + > > > > +#endif /* _LINUX_ALLOC_TAG_H */ > > > > diff --git a/include/linux/sched.h b/include/linux/sched.h > > > > index ffe8f618ab86..da68a10517c8 100644 > > > > --- a/include/linux/sched.h > > > > +++ b/include/linux/sched.h > > > > @@ -770,6 +770,10 @@ struct task_struct { > > > > unsigned int flags; > > > > unsigned int ptrace; > > > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > + struct alloc_tag *alloc_tag; > > > > +#endif > > > > > > Normally scheduling is very sensitive to having anything early in > > > task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef > > > area. > > > > Thanks for the warning! We will look into that. 
> > > > > > > > > + > > > > #ifdef CONFIG_SMP > > > > int on_cpu; > > > > struct __call_single_node wake_entry; > > > > @@ -810,6 +814,7 @@ struct task_struct { > > > > struct task_group *sched_task_group; > > > > #endif > > > > > > > > + > > > > #ifdef CONFIG_UCLAMP_TASK > > > > /* > > > > * Clamp values requested for a scheduling entity. > > > > @@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } > > > > > > > > extern void sched_set_stop_task(int cpu, struct task_struct *stop); > > > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) > > > > +{ > > > > + swap(current->alloc_tag, tag); > > > > + return tag; > > > > +} > > > > + > > > > +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) > > > > +{ > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > > > + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); > > > > +#endif > > > > + current->alloc_tag = old; > > > > +} > > > > +#else > > > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; } > > > > +#define alloc_tag_restore(_tag, _old) > > > > +#endif > > > > + > > > > #endif > > > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > > > > index 0be2d00c3696..78d258ca508f 100644 > > > > --- a/lib/Kconfig.debug > > > > +++ b/lib/Kconfig.debug > > > > @@ -972,6 +972,31 @@ config CODE_TAGGING > > > > bool > > > > select KALLSYMS > > > > > > > > +config MEM_ALLOC_PROFILING > > > > + bool "Enable memory allocation profiling" > > > > + default n > > > > + depends on PROC_FS > > > > + depends on !DEBUG_FORCE_WEAK_PER_CPU > > > > + select CODE_TAGGING > > > > + help > > > > + Track allocation source code and record total allocation size > > > > + initiated at that code location. The mechanism can be used to track > > > > + memory leaks with a low performance and memory impact. 
> > > > + > > > > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > > > > + bool "Enable memory allocation profiling by default" > > > > + default y > > > > + depends on MEM_ALLOC_PROFILING > > > > + > > > > +config MEM_ALLOC_PROFILING_DEBUG > > > > + bool "Memory allocation profiler debugging" > > > > + default n > > > > + depends on MEM_ALLOC_PROFILING > > > > + select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > > > > + help > > > > + Adds warnings with helpful error messages for memory allocation > > > > + profiling. > > > > + > > > > source "lib/Kconfig.kasan" > > > > source "lib/Kconfig.kfence" > > > > source "lib/Kconfig.kmsan" > > > > diff --git a/lib/Makefile b/lib/Makefile > > > > index 6b48b22fdfac..859112f09bf5 100644 > > > > --- a/lib/Makefile > > > > +++ b/lib/Makefile > > > > @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ > > > > obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o > > > > > > > > obj-$(CONFIG_CODE_TAGGING) += codetag.o > > > > +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o > > > > + > > > > lib-$(CONFIG_GENERIC_BUG) += bug.o > > > > > > > > obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o > > > > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c > > > > new file mode 100644 > > > > index 000000000000..4fc031f9cefd > > > > --- /dev/null > > > > +++ b/lib/alloc_tag.c > > > > @@ -0,0 +1,158 @@ > > > > +// SPDX-License-Identifier: GPL-2.0-only > > > > +#include <linux/alloc_tag.h> > > > > +#include <linux/fs.h> > > > > +#include <linux/gfp.h> > > > > +#include <linux/module.h> > > > > +#include <linux/proc_fs.h> > > > > +#include <linux/seq_buf.h> > > > > +#include <linux/seq_file.h> > > > > + > > > > +static struct codetag_type *alloc_tag_cttype; > > > > + > > > > +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, > > > > + mem_alloc_profiling_key); > > > > + > > > > +static void *allocinfo_start(struct seq_file *m, loff_t *pos) > > > > +{ > > > > + struct codetag_iterator *iter; > > > > + 
struct codetag *ct; > > > > + loff_t node = *pos; > > > > + > > > > + iter = kzalloc(sizeof(*iter), GFP_KERNEL); > > > > + m->private = iter; > > > > + if (!iter) > > > > + return NULL; > > > > + > > > > + codetag_lock_module_list(alloc_tag_cttype, true); > > > > + *iter = codetag_get_ct_iter(alloc_tag_cttype); > > > > + while ((ct = codetag_next_ct(iter)) != NULL && node) > > > > + node--; > > > > + > > > > + return ct ? iter : NULL; > > > > +} > > > > + > > > > +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) > > > > +{ > > > > + struct codetag_iterator *iter = (struct codetag_iterator *)arg; > > > > + struct codetag *ct = codetag_next_ct(iter); > > > > + > > > > + (*pos)++; > > > > + if (!ct) > > > > + return NULL; > > > > + > > > > + return iter; > > > > +} > > > > + > > > > +static void allocinfo_stop(struct seq_file *m, void *arg) > > > > +{ > > > > + struct codetag_iterator *iter = (struct codetag_iterator *)m->private; > > > > + > > > > + if (iter) { > > > > + codetag_lock_module_list(alloc_tag_cttype, false); > > > > + kfree(iter); > > > > + } > > > > +} > > > > + > > > > +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) > > > > +{ > > > > + struct alloc_tag *tag = ct_to_alloc_tag(ct); > > > > + struct alloc_tag_counters counter = alloc_tag_read(tag); > > > > + s64 bytes = counter.bytes; > > > > + char val[10], *p = val; > > > > + > > > > + if (bytes < 0) { > > > > + *p++ = '-'; > > > > + bytes = -bytes; > > > > + } > > > > + > > > > + string_get_size(bytes, 1, > > > > + STRING_SIZE_BASE2|STRING_SIZE_NOSPACE, > > > > + p, val + ARRAY_SIZE(val) - p); > > > > + > > > > + seq_buf_printf(out, "%8s %8llu ", val, counter.calls); > > > > + codetag_to_text(out, ct); > > > > + seq_buf_putc(out, ' '); > > > > + seq_buf_putc(out, '\n'); > > > > +} > > > > > > /me does happy seq_buf dance! 
> > > > > > > + > > > > +static int allocinfo_show(struct seq_file *m, void *arg) > > > > +{ > > > > + struct codetag_iterator *iter = (struct codetag_iterator *)arg; > > > > + char *bufp; > > > > + size_t n = seq_get_buf(m, &bufp); > > > > + struct seq_buf buf; > > > > + > > > > + seq_buf_init(&buf, bufp, n); > > > > + alloc_tag_to_text(&buf, iter->ct); > > > > + seq_commit(m, seq_buf_used(&buf)); > > > > + return 0; > > > > +} > > > > + > > > > +static const struct seq_operations allocinfo_seq_op = { > > > > + .start = allocinfo_start, > > > > + .next = allocinfo_next, > > > > + .stop = allocinfo_stop, > > > > + .show = allocinfo_show, > > > > +}; > > > > + > > > > +static void __init procfs_init(void) > > > > +{ > > > > + proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); > > > > +} > > > > > > As mentioned, this really should be in /sys somewhere. > > > > Ack. > > > > > > > > > + > > > > +static bool alloc_tag_module_unload(struct codetag_type *cttype, > > > > + struct codetag_module *cmod) > > > > +{ > > > > + struct codetag_iterator iter = codetag_get_ct_iter(cttype); > > > > + struct alloc_tag_counters counter; > > > > + bool module_unused = true; > > > > + struct alloc_tag *tag; > > > > + struct codetag *ct; > > > > + > > > > + for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { > > > > + if (iter.cmod != cmod) > > > > + continue; > > > > + > > > > + tag = ct_to_alloc_tag(ct); > > > > + counter = alloc_tag_read(tag); > > > > + > > > > + if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload", > > > > + ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) > > > > + module_unused = false; > > > > + } > > > > + > > > > + return module_unused; > > > > +} > > > > + > > > > +static struct ctl_table memory_allocation_profiling_sysctls[] = { > > > > + { > > > > + .procname = "mem_profiling", > > > > + .data = &mem_alloc_profiling_key, > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG > > > > + 
.mode = 0444, > > > > +#else > > > > + .mode = 0644, > > > > +#endif > > > > + .proc_handler = proc_do_static_key, > > > > + }, > > > > + { } > > > > +}; > > > > + > > > > +static int __init alloc_tag_init(void) > > > > +{ > > > > + const struct codetag_type_desc desc = { > > > > + .section = "alloc_tags", > > > > + .tag_size = sizeof(struct alloc_tag), > > > > + .module_unload = alloc_tag_module_unload, > > > > + }; > > > > + > > > > + alloc_tag_cttype = codetag_register_type(&desc); > > > > + if (IS_ERR_OR_NULL(alloc_tag_cttype)) > > > > + return PTR_ERR(alloc_tag_cttype); > > > > + > > > > + register_sysctl_init("vm", memory_allocation_profiling_sysctls); > > > > + procfs_init(); > > > > + > > > > + return 0; > > > > +} > > > > +module_init(alloc_tag_init); > > > > diff --git a/scripts/module.lds.S b/scripts/module.lds.S > > > > index bf5bcf2836d8..45c67a0994f3 100644 > > > > --- a/scripts/module.lds.S > > > > +++ b/scripts/module.lds.S > > > > @@ -9,6 +9,8 @@ > > > > #define DISCARD_EH_FRAME *(.eh_frame) > > > > #endif > > > > > > > > +#include <asm-generic/codetag.lds.h> > > > > + > > > > SECTIONS { > > > > /DISCARD/ : { > > > > *(.discard) > > > > @@ -47,12 +49,17 @@ SECTIONS { > > > > .data : { > > > > *(.data .data.[0-9a-zA-Z_]*) > > > > *(.data..L*) > > > > + CODETAG_SECTIONS() > > > > } > > > > > > > > .rodata : { > > > > *(.rodata .rodata.[0-9a-zA-Z_]*) > > > > *(.rodata..L*) > > > > } > > > > +#else > > > > + .data : { > > > > + CODETAG_SECTIONS() > > > > + } > > > > #endif > > > > } > > > > > > Otherwise, looks good. > > > > Thanks! > > > > > > > > -- > > > Kees Cook > >
On Tue, Feb 13, 2024 at 02:35:29PM -0800, Suren Baghdasaryan wrote: > On Tue, Feb 13, 2024 at 2:29 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote: > > > On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote: > > > > > > > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote: > > > > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily > > > > > instrument memory allocators. It registers an "alloc_tags" codetag type > > > > > with /proc/allocinfo interface to output allocation tag information when > > > > > > > > Please don't add anything new to the top-level /proc directory. This > > > > should likely live in /sys. > > > > > > Ack. I'll find a more appropriate place for it then. > > > It just seemed like such generic information which would belong next > > > to meminfo/zoneinfo and such... > > > > Save yourself a cycle of "rework the whole fs interface only to have > > someone else tell you no" and put it in debugfs, not sysfs. Wrangling > > with debugfs is easier than all the macro-happy sysfs stuff; you don't > > have to integrate with the "device" model; and there is no 'one value > > per file' rule. > > Thanks for the input. This file used to be in debugfs but reviewers > felt it belonged in /proc if it's to be used in production > environments. Some distros (like Android) disable debugfs in > production. FWIW, I agree debugfs is not right. If others feel it's right in /proc, I certainly won't NAK -- it's just been that we've traditionally been trying to avoid continuing to pollute the top-level /proc and instead associate new things with something in /sys.
On Tue, 13 Feb 2024 14:38:16 -0800 Kees Cook <keescook@chromium.org> wrote: > > > Save yourself a cycle of "rework the whole fs interface only to have > > > someone else tell you no" and put it in debugfs, not sysfs. Wrangling > > > with debugfs is easier than all the macro-happy sysfs stuff; you don't > > > have to integrate with the "device" model; and there is no 'one value > > > per file' rule. > > > > Thanks for the input. This file used to be in debugfs but reviewers > > felt it belonged in /proc if it's to be used in production > > environments. Some distros (like Android) disable debugfs in > > production. > > FWIW, I agree debugfs is not right. If others feel it's right in /proc, > I certainly won't NAK -- it's just been that we've traditionally been > trying to avoid continuing to pollute the top-level /proc and instead > associate new things with something in /sys. You can create your own file system, but I would suggest using kernfs for it ;-) If you look in /sys/kernel/ you'll see a bunch of kernel file systems already there: ~# mount |grep kernel securityfs on /sys/kernel/security type securityfs (rw,nosuid,nodev,noexec,relatime) debugfs on /sys/kernel/debug type debugfs (rw,nosuid,nodev,noexec,relatime) tracefs on /sys/kernel/tracing type tracefs (rw,nosuid,nodev,noexec,relatime) configfs on /sys/kernel/config type configfs (rw,nosuid,nodev,noexec,relatime) -- Steve
On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote: > +Example output. > + > +:: > + > + > cat /proc/allocinfo > + > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one I don't really like the fancy MiB stuff. Wouldn't it be better to just present the amount of memory in plain old bytes, so people can use sort -n on it? And it's easier to tell big-from-small at a glance because big has more digits. Also, the first thing any sort of downstream processing of this data is going to have to do is to convert the fancified output back into plain-old-bytes. So why not just emit plain-old-bytes? If someone wants the fancy output (and nobody does) then that can be done in userspace.
On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote: > On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote: > > > +Example output. > > + > > +:: > > + > > + > cat /proc/allocinfo > > + > > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > > I don't really like the fancy MiB stuff. Wouldn't it be better to just > present the amount of memory in plain old bytes, so people can use sort > -n on it? They can use sort -h on it; the string_get_size() patch was specifically so that we could make the output compatible with sort -h > And it's easier to tell big-from-small at a glance because > big has more digits. > > Also, the first thing any sort of downstream processing of this data is > going to have to do is to convert the fancified output back into > plain-old-bytes. So why not just emit plain-old-bytes? > > If someone wants the fancy output (and nobody does) then that can be > done in userspace. I like simpler, more discoverable tools; e.g. we've got a bunch of interesting stuff in scripts/ but it doesn't get used nearly as much - not as accessible as cat'ing a file, definitely not going to be installed by default. I'm just optimizing for the most common use case. I doubt there's going to be nearly as much consumption by tools, and I'm ok with making them do the conversion back to bytes if they really need it.
On Thu, Feb 15, 2024 at 8:00 PM Kent Overstreet <kent.overstreet@linux.dev> wrote: > > On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote: > > On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote: > > > > > +Example output. > > > + > > > +:: > > > + > > > + > cat /proc/allocinfo > > > + > > > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > > > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > > > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > > > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > > > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > > > > I don't really like the fancy MiB stuff. Wouldn't it be better to just > > present the amount of memory in plain old bytes, so people can use sort > > -n on it? > > They can use sort -h on it; the string_get_size() patch was specifically > so that we could make the output compatible with sort -h > > > And it's easier to tell big-from-small at a glance because > > big has more digits. > > > > Also, the first thing any sort of downstream processing of this data is > > going to have to do is to convert the fancified output back into > > plain-old-bytes. So why not just emit plain-old-bytes? > > > > If someone wants the fancy output (and nobody does) then that can be > > done in userspace. > > I like simpler, more discoverable tools; e.g. we've got a bunch of > interesting stuff in scripts/ but it doesn't get used nearly as much - > not as accessible as cat'ing a file, definitely not going to be > installed by default. I also prefer plain bytes instead of MiB. A driver developer that wants to verify up-to the byte allocations for a new data structure that they added is going to be disappointed by the rounded MiB numbers. The data contained in this file is not consumable without at least "sort -h -r", so why not just output bytes instead? 
There is /proc/slabinfo and there is a slabtop tool. For raw /proc/allocinfo we can create an alloctop tool that would parse, sort and show data in human readable format based on various criteria. We should also add at the top of this file "allocinfo - version: 1.0", to allow future extensions (i.e. column for proc name). Pasha
On Thu, Feb 15, 2024 at 08:22:44PM -0500, Pasha Tatashin wrote: > On Thu, Feb 15, 2024 at 8:00 PM Kent Overstreet > <kent.overstreet@linux.dev> wrote: > > > > On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote: > > > On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote: > > > > > > > +Example output. > > > > + > > > > +:: > > > > + > > > > + > cat /proc/allocinfo > > > > + > > > > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > > > > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > > > > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > > > > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > > > > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > > > > > > I don't really like the fancy MiB stuff. Wouldn't it be better to just > > > present the amount of memory in plain old bytes, so people can use sort > > > -n on it? > > > > They can use sort -h on it; the string_get_size() patch was specifically > > so that we could make the output compatible with sort -h > > > > > And it's easier to tell big-from-small at a glance because > > > big has more digits. > > > > > > Also, the first thing any sort of downstream processing of this data is > > > going to have to do is to convert the fancified output back into > > > plain-old-bytes. So why not just emit plain-old-bytes? > > > > > > If someone wants the fancy output (and nobody does) then that can be > > > done in userspace. > > > > I like simpler, more discoverable tools; e.g. we've got a bunch of > > interesting stuff in scripts/ but it doesn't get used nearly as much - > > not as accessible as cat'ing a file, definitely not going to be > > installed by default. > > I also prefer plain bytes instead of MiB. 
A driver developer that > wants to verify up-to the byte allocations for a new data structure > that they added is going to be disappointed by the rounded MiB > numbers. That's a fair point. > The data contained in this file is not consumable without at least > "sort -h -r", so why not just output bytes instead? > > There is /proc/slabinfo and there is a slabtop tool. > For raw /proc/allocinfo we can create an alloctop tool that would > parse, sort and show data in human readable format based on various > criteria. > > We should also add at the top of this file "allocinfo - version: 1.0", > to allow future extensions (i.e. column for proc name). How would we feel about exposing two different versions in /proc? It should be a pretty minimal addition to .text. Personally, I hate trying to count long strings of digits by eyeball...
On 2/13/24 23:38, Kees Cook wrote: > On Tue, Feb 13, 2024 at 02:35:29PM -0800, Suren Baghdasaryan wrote: >> On Tue, Feb 13, 2024 at 2:29 PM Darrick J. Wong <djwong@kernel.org> wrote: >> > >> > On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote: >> > > On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote: >> > > > >> > > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote: >> > > > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily >> > > > > instrument memory allocators. It registers an "alloc_tags" codetag type >> > > > > with /proc/allocinfo interface to output allocation tag information when >> > > > >> > > > Please don't add anything new to the top-level /proc directory. This >> > > > should likely live in /sys. >> > > >> > > Ack. I'll find a more appropriate place for it then. >> > > It just seemed like such generic information which would belong next >> > > to meminfo/zoneinfo and such... >> > >> > Save yourself a cycle of "rework the whole fs interface only to have >> > someone else tell you no" and put it in debugfs, not sysfs. Wrangling >> > with debugfs is easier than all the macro-happy sysfs stuff; you don't >> > have to integrate with the "device" model; and there is no 'one value >> > per file' rule. >> >> Thanks for the input. This file used to be in debugfs but reviewers >> felt it belonged in /proc if it's to be used in production >> environments. Some distros (like Android) disable debugfs in >> production. > > FWIW, I agree debugfs is not right. If others feel it's right in /proc, > I certainly won't NAK -- it's just been that we've traditionally been > trying to avoid continuing to pollute the top-level /proc and instead > associate new things with something in /sys. Sysfs is really a "one value per file" thing though. /proc might be ok for a single overview file.
On Fri, Feb 16, 2024 at 12:50 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > On 2/13/24 23:38, Kees Cook wrote: > > On Tue, Feb 13, 2024 at 02:35:29PM -0800, Suren Baghdasaryan wrote: > >> On Tue, Feb 13, 2024 at 2:29 PM Darrick J. Wong <djwong@kernel.org> wrote: > >> > > >> > On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote: > >> > > On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote: > >> > > > > >> > > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote: > >> > > > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily > >> > > > > instrument memory allocators. It registers an "alloc_tags" codetag type > >> > > > > with /proc/allocinfo interface to output allocation tag information when > >> > > > > >> > > > Please don't add anything new to the top-level /proc directory. This > >> > > > should likely live in /sys. > >> > > > >> > > Ack. I'll find a more appropriate place for it then. > >> > > It just seemed like such generic information which would belong next > >> > > to meminfo/zoneinfo and such... > >> > > >> > Save yourself a cycle of "rework the whole fs interface only to have > >> > someone else tell you no" and put it in debugfs, not sysfs. Wrangling > >> > with debugfs is easier than all the macro-happy sysfs stuff; you don't > >> > have to integrate with the "device" model; and there is no 'one value > >> > per file' rule. > >> > >> Thanks for the input. This file used to be in debugfs but reviewers > >> felt it belonged in /proc if it's to be used in production > >> environments. Some distros (like Android) disable debugfs in > >> production. > > > > FWIW, I agree debugfs is not right. If others feel it's right in /proc, > > I certainly won't NAK -- it's just been that we've traditionally been > > trying to avoid continuing to pollute the top-level /proc and instead > > associate new things with something in /sys. > > Sysfs is really a "one value per file" thing though. 
/proc might be ok for a > single overview file. I'm preparing v4 and will keep the file under /proc for now unless there are strong objections.
On 2/12/24 22:38, Suren Baghdasaryan wrote: > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily > instrument memory allocators. It registers an "alloc_tags" codetag type > with /proc/allocinfo interface to output allocation tag information when > the feature is enabled. > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory > allocation profiling instrumentation. > Memory allocation profiling can be enabled or disabled at runtime using > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n. > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation > profiling by default. > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev> > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> > --- > Documentation/admin-guide/sysctl/vm.rst | 16 +++ > Documentation/filesystems/proc.rst | 28 +++++ > include/asm-generic/codetag.lds.h | 14 +++ > include/asm-generic/vmlinux.lds.h | 3 + > include/linux/alloc_tag.h | 133 ++++++++++++++++++++ > include/linux/sched.h | 24 ++++ > lib/Kconfig.debug | 25 ++++ > lib/Makefile | 2 + > lib/alloc_tag.c | 158 ++++++++++++++++++++++++ > scripts/module.lds.S | 7 ++ > 10 files changed, 410 insertions(+) > create mode 100644 include/asm-generic/codetag.lds.h > create mode 100644 include/linux/alloc_tag.h > create mode 100644 lib/alloc_tag.c > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst > index c59889de122b..a214719492ea 100644 > --- a/Documentation/admin-guide/sysctl/vm.rst > +++ b/Documentation/admin-guide/sysctl/vm.rst > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: > - legacy_va_layout > - lowmem_reserve_ratio > - max_map_count > +- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) > - memory_failure_early_kill > - memory_failure_recovery > - min_free_kbytes > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. 
> The default value is 65530. > > > +mem_profiling > +============== > + > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) > + > +1: Enable memory profiling. > + > +0: Disabld memory profiling. Disable ... > +allocinfo > +~~~~~~~ > + > +Provides information about memory allocations at all locations in the code > +base. Each allocation in the code is identified by its source file, line > +number, module and the function calling the allocation. The number of bytes > +allocated at each location is reported. See, it even says "number of bytes" :) > + > +Example output. > + > +:: > + > + > cat /proc/allocinfo > + > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page Is "module" meant in the usual kernel module sense? In that case IIRC is more common to annotate things e.g. [xfs] in case it's really a module, and nothing if it's built it, such as slub. Is that "slub" simply derived from "mm/slub.c"? Then it's just redundant? > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > + 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc > + 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare > + 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc > + 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func > + 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf > + ... > + > + > meminfo ... 
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > index 0be2d00c3696..78d258ca508f 100644 > --- a/lib/Kconfig.debug > +++ b/lib/Kconfig.debug > @@ -972,6 +972,31 @@ config CODE_TAGGING > bool > select KALLSYMS > > +config MEM_ALLOC_PROFILING > + bool "Enable memory allocation profiling" > + default n > + depends on PROC_FS > + depends on !DEBUG_FORCE_WEAK_PER_CPU > + select CODE_TAGGING > + help > + Track allocation source code and record total allocation size > + initiated at that code location. The mechanism can be used to track > + memory leaks with a low performance and memory impact. > + > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > + bool "Enable memory allocation profiling by default" > + default y I'd go with default n, as that's what I'd select for a general distro. > + depends on MEM_ALLOC_PROFILING > + > +config MEM_ALLOC_PROFILING_DEBUG > + bool "Memory allocation profiler debugging" > + default n > + depends on MEM_ALLOC_PROFILING > + select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > + help > + Adds warnings with helpful error messages for memory allocation > + profiling. > +
On Thu, Feb 15, 2024 at 5:27 PM Kent Overstreet <kent.overstreet@linux.dev> wrote: > > On Thu, Feb 15, 2024 at 08:22:44PM -0500, Pasha Tatashin wrote: > > On Thu, Feb 15, 2024 at 8:00 PM Kent Overstreet > > <kent.overstreet@linux.dev> wrote: > > > > > > On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote: > > > > On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote: > > > > > > > > > +Example output. > > > > > + > > > > > +:: > > > > > + > > > > > + > cat /proc/allocinfo > > > > > + > > > > > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > > > > > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > > > > > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > > > > > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > > > > > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > > > > > > > > I don't really like the fancy MiB stuff. Wouldn't it be better to just > > > > present the amount of memory in plain old bytes, so people can use sort > > > > -n on it? > > > > > > They can use sort -h on it; the string_get_size() patch was specifically > > > so that we could make the output compatible with sort -h > > > > > > > And it's easier to tell big-from-small at a glance because > > > > big has more digits. > > > > > > > > Also, the first thing any sort of downstream processing of this data is > > > > going to have to do is to convert the fancified output back into > > > > plain-old-bytes. So why not just emit plain-old-bytes? > > > > > > > > If someone wants the fancy output (and nobody does) then that can be > > > > done in userspace. > > > > > > I like simpler, more discoverable tools; e.g. we've got a bunch of > > > interesting stuff in scripts/ but it doesn't get used nearly as much - > > > not as accessible as cat'ing a file, definitely not going to be > > > installed by default. 
> > > > I also prefer plain bytes instead of MiB. A driver developer that > > wants to verify up-to the byte allocations for a new data structure > > that they added is going to be disappointed by the rounded MiB > > numbers. > > That's a fair point. > > > The data contained in this file is not consumable without at least > > "sort -h -r", so why not just output bytes instead? > > > > There is /proc/slabinfo and there is a slabtop tool. > > For raw /proc/allocinfo we can create an alloctop tool that would > > parse, sort and show data in human readable format based on various > > criteria. > > > > We should also add at the top of this file "allocinfo - version: 1.0", > > to allow future extensions (i.e. column for proc name). > > How would we feel about exposing two different versions in /proc? It > should be a pretty minimal addition to .text. > > Personally, I hate trying to count long strings digits by eyeball... Maybe something like this work for everyone then?: 160432128 (153MiB) mm/slub.c:1826 module:slub func:alloc_slab_page
On Fri, Feb 16, 2024 at 1:02 AM Suren Baghdasaryan <surenb@google.com> wrote: > > On Thu, Feb 15, 2024 at 5:27 PM Kent Overstreet > <kent.overstreet@linux.dev> wrote: > > > > On Thu, Feb 15, 2024 at 08:22:44PM -0500, Pasha Tatashin wrote: > > > On Thu, Feb 15, 2024 at 8:00 PM Kent Overstreet > > > <kent.overstreet@linux.dev> wrote: > > > > > > > > On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote: > > > > > On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote: > > > > > > > > > > > +Example output. > > > > > > + > > > > > > +:: > > > > > > + > > > > > > + > cat /proc/allocinfo > > > > > > + > > > > > > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > > > > > > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > > > > > > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > > > > > > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > > > > > > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > > > > > > > > > > I don't really like the fancy MiB stuff. Wouldn't it be better to just > > > > > present the amount of memory in plain old bytes, so people can use sort > > > > > -n on it? > > > > > > > > They can use sort -h on it; the string_get_size() patch was specifically > > > > so that we could make the output compatible with sort -h > > > > > > > > > And it's easier to tell big-from-small at a glance because > > > > > big has more digits. > > > > > > > > > > Also, the first thing any sort of downstream processing of this data is > > > > > going to have to do is to convert the fancified output back into > > > > > plain-old-bytes. So why not just emit plain-old-bytes? > > > > > > > > > > If someone wants the fancy output (and nobody does) then that can be > > > > > done in userspace. > > > > > > > > I like simpler, more discoverable tools; e.g. 
we've got a bunch of > > > > interesting stuff in scripts/ but it doesn't get used nearly as much - > > > > not as accessible as cat'ing a file, definitely not going to be > > > > installed by default. > > > > > > I also prefer plain bytes instead of MiB. A driver developer that > > > wants to verify up-to the byte allocations for a new data structure > > > that they added is going to be disappointed by the rounded MiB > > > numbers. > > > > That's a fair point. > > > > > The data contained in this file is not consumable without at least > > > "sort -h -r", so why not just output bytes instead? > > > > > > There is /proc/slabinfo and there is a slabtop tool. > > > For raw /proc/allocinfo we can create an alloctop tool that would > > > parse, sort and show data in human readable format based on various > > > criteria. > > > > > > We should also add at the top of this file "allocinfo - version: 1.0", > > > to allow future extensions (i.e. column for proc name). > > > > How would we feel about exposing two different versions in /proc? It > > should be a pretty minimal addition to .text. > > > > Personally, I hate trying to count long strings digits by eyeball... > > Maybe something like this work for everyone then?: s/work/would work making too many mistakes. time for bed... > > 160432128 (153MiB) mm/slub.c:1826 module:slub func:alloc_slab_page
> > Personally, I hate trying to count long strings digits by eyeball... > > Maybe something like this work for everyone then?: > > 160432128 (153MiB) mm/slub.c:1826 module:slub func:alloc_slab_page That would be even harder to parse. This one-liner should convert bytes to human-readable size: sort -rn /proc/allocinfo | numfmt --to=iec Also, an "alloctop" script that would auto-update the current top allocators would be useful to put in tools/mm/ Pasha
On Mon, Feb 12, 2024 at 02:40:12PM -0800, Kees Cook wrote: > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote: > > diff --git a/include/linux/sched.h b/include/linux/sched.h > > index ffe8f618ab86..da68a10517c8 100644 > > --- a/include/linux/sched.h > > +++ b/include/linux/sched.h > > @@ -770,6 +770,10 @@ struct task_struct { > > unsigned int flags; > > unsigned int ptrace; > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > + struct alloc_tag *alloc_tag; > > +#endif > > Normally scheduling is very sensitive to having anything early in > task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef > area. This is even hotter than the scheduler members; we actually do want it up front.
On Fri, Feb 16, 2024 at 06:26:06PM -0500, Kent Overstreet wrote: > On Mon, Feb 12, 2024 at 02:40:12PM -0800, Kees Cook wrote: > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote: > > > diff --git a/include/linux/sched.h b/include/linux/sched.h > > > index ffe8f618ab86..da68a10517c8 100644 > > > --- a/include/linux/sched.h > > > +++ b/include/linux/sched.h > > > @@ -770,6 +770,10 @@ struct task_struct { > > > unsigned int flags; > > > unsigned int ptrace; > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > + struct alloc_tag *alloc_tag; > > > +#endif > > > > Normally scheduling is very sensitive to having anything early in > > task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef > > area. > > This is even hotter than the scheduler members; we actually do want it > up front. It is? I would imagine the scheduler would touch stuff more than the allocator, but whatever works. :)
On Fri, Feb 16, 2024 at 12:18:09PM -0500, Pasha Tatashin wrote: > > > Personally, I hate trying to count long strings digits by eyeball... > > > > Maybe something like this work for everyone then?: > > > > 160432128 (153MiB) mm/slub.c:1826 module:slub func:alloc_slab_page > > That would be even harder to parse. > > This one liner should converts bytes to human readable size: > sort -rn /proc/allocinfo | numfmt --to=iec I like this, it doesn't print out that godawful kibibytes crap
On Fri, Feb 16, 2024 at 8:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > On 2/12/24 22:38, Suren Baghdasaryan wrote: > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily > > instrument memory allocators. It registers an "alloc_tags" codetag type > > with /proc/allocinfo interface to output allocation tag information when > > the feature is enabled. > > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory > > allocation profiling instrumentation. > > Memory allocation profiling can be enabled or disabled at runtime using > > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n. > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation > > profiling by default. > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev> > > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> > > --- > > Documentation/admin-guide/sysctl/vm.rst | 16 +++ > > Documentation/filesystems/proc.rst | 28 +++++ > > include/asm-generic/codetag.lds.h | 14 +++ > > include/asm-generic/vmlinux.lds.h | 3 + > > include/linux/alloc_tag.h | 133 ++++++++++++++++++++ > > include/linux/sched.h | 24 ++++ > > lib/Kconfig.debug | 25 ++++ > > lib/Makefile | 2 + > > lib/alloc_tag.c | 158 ++++++++++++++++++++++++ > > scripts/module.lds.S | 7 ++ > > 10 files changed, 410 insertions(+) > > create mode 100644 include/asm-generic/codetag.lds.h > > create mode 100644 include/linux/alloc_tag.h > > create mode 100644 lib/alloc_tag.c > > > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst > > index c59889de122b..a214719492ea 100644 > > --- a/Documentation/admin-guide/sysctl/vm.rst > > +++ b/Documentation/admin-guide/sysctl/vm.rst > > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: > > - legacy_va_layout > > - lowmem_reserve_ratio > > - max_map_count > > +- mem_profiling (only if 
CONFIG_MEM_ALLOC_PROFILING=y) > > - memory_failure_early_kill > > - memory_failure_recovery > > - min_free_kbytes > > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. > > The default value is 65530. > > > > > > +mem_profiling > > +============== > > + > > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) > > + > > +1: Enable memory profiling. > > + > > +0: Disabld memory profiling. > > Disable Ack. > > ... > > > +allocinfo > > +~~~~~~~ > > + > > +Provides information about memory allocations at all locations in the code > > +base. Each allocation in the code is identified by its source file, line > > +number, module and the function calling the allocation. The number of bytes > > +allocated at each location is reported. > > See, it even says "number of bytes" :) Yes, we are changing the output to bytes. > > > + > > +Example output. > > + > > +:: > > + > > + > cat /proc/allocinfo > > + > > + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page > > Is "module" meant in the usual kernel module sense? In that case IIRC is > more common to annotate things e.g. [xfs] in case it's really a module, and > nothing if it's built it, such as slub. Is that "slub" simply derived from > "mm/slub.c"? Then it's just redundant? Sounds good. 
The new example would look like this: > sort -rn /proc/allocinfo 127664128 31168 mm/page_ext.c:270 func:alloc_page_ext 56373248 4737 mm/slub.c:2259 func:alloc_slab_page 14880768 3633 mm/readahead.c:247 func:page_cache_ra_unbounded 14417920 3520 mm/mm_init.c:2530 func:alloc_large_system_hash 13377536 234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs 11718656 2861 mm/filemap.c:1919 func:__filemap_get_folio 9192960 2800 kernel/fork.c:307 func:alloc_thread_stack_node 4206592 4 net/netfilter/nf_conntrack_core.c:2567 func:nf_ct_alloc_hashtable 4136960 1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod] func:ctagmod_start 3940352 962 mm/memory.c:4214 func:alloc_anon_folio 2894464 22613 fs/kernfs/dir.c:615 func:__kernfs_new_node ... Note that [ctagmod] is the only allocation from a module in this example. > > > + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order > > + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts > > + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact > > + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one > > + 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc > > + 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare > > + 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc > > + 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func > > + 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf > > + ... > > + > > + > > meminfo > > ... 
> > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > > index 0be2d00c3696..78d258ca508f 100644 > > --- a/lib/Kconfig.debug > > +++ b/lib/Kconfig.debug > > @@ -972,6 +972,31 @@ config CODE_TAGGING > > bool > > select KALLSYMS > > > > +config MEM_ALLOC_PROFILING > > + bool "Enable memory allocation profiling" > > + default n > > + depends on PROC_FS > > + depends on !DEBUG_FORCE_WEAK_PER_CPU > > + select CODE_TAGGING > > + help > > + Track allocation source code and record total allocation size > > + initiated at that code location. The mechanism can be used to track > > + memory leaks with a low performance and memory impact. > > + > > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > > + bool "Enable memory allocation profiling by default" > > + default y > > I'd go with default n as that I'd select for a general distro. Well, we have MEM_ALLOC_PROFILING=n by default, so if it was switched on manually, that is a strong sign that the user wants it enabled IMO. So, enabling this switch by default seems logical to me. If a distro wants to have the feature compiled in but disabled by default then this is perfectly doable, just need to set both options appropriately. Does my logic make sense? > > > + depends on MEM_ALLOC_PROFILING > > + > > +config MEM_ALLOC_PROFILING_DEBUG > > + bool "Memory allocation profiler debugging" > > + default n > > + depends on MEM_ALLOC_PROFILING > > + select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT > > + help > > + Adds warnings with helpful error messages for memory allocation > > + profiling. > > + >
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index c59889de122b..a214719492ea 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: - legacy_va_layout - lowmem_reserve_ratio - max_map_count +- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) - memory_failure_early_kill - memory_failure_recovery - min_free_kbytes @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. The default value is 65530. +mem_profiling +============== + +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) + +1: Enable memory profiling. + +0: Disabld memory profiling. + +Enabling memory profiling introduces a small performance overhead for all +memory allocations. + +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. + + memory_failure_early_kill: ========================== diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 104c6d047d9b..40d6d18308e4 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -688,6 +688,7 @@ files are there, and which are missing. ============ =============================================================== File Content ============ =============================================================== + allocinfo Memory allocations profiling information apm Advanced power management info bootconfig Kernel command line obtained from boot config, and, if there were kernel parameters from the @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be reclaimed to achieve this. +allocinfo +~~~~~~~ + +Provides information about memory allocations at all locations in the code +base. Each allocation in the code is identified by its source file, line +number, module and the function calling the allocation. The number of bytes +allocated at each location is reported. 
+ +Example output. + +:: + + > cat /proc/allocinfo + + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one + 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc + 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare + 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc + 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func + 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf + ... + + meminfo ~~~~~~~ diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h new file mode 100644 index 000000000000..64f536b80380 --- /dev/null +++ b/include/asm-generic/codetag.lds.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_GENERIC_CODETAG_LDS_H +#define __ASM_GENERIC_CODETAG_LDS_H + +#define SECTION_WITH_BOUNDARIES(_name) \ + . = ALIGN(8); \ + __start_##_name = .; \ + KEEP(*(_name)) \ + __stop_##_name = .; + +#define CODETAG_SECTIONS() \ + SECTION_WITH_BOUNDARIES(alloc_tags) + +#endif /* __ASM_GENERIC_CODETAG_LDS_H */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5dd3a61d673d..c9997dc50c50 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -50,6 +50,8 @@ * [__nosave_begin, __nosave_end] for the nosave data */ +#include <asm-generic/codetag.lds.h> + #ifndef LOAD_OFFSET #define LOAD_OFFSET 0 #endif @@ -366,6 +368,7 @@ . 
= ALIGN(8); \ BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ + CODETAG_SECTIONS() \ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h new file mode 100644 index 000000000000..cf55a149fa84 --- /dev/null +++ b/include/linux/alloc_tag.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * allocation tagging + */ +#ifndef _LINUX_ALLOC_TAG_H +#define _LINUX_ALLOC_TAG_H + +#include <linux/bug.h> +#include <linux/codetag.h> +#include <linux/container_of.h> +#include <linux/preempt.h> +#include <asm/percpu.h> +#include <linux/cpumask.h> +#include <linux/static_key.h> + +struct alloc_tag_counters { + u64 bytes; + u64 calls; +}; + +/* + * An instance of this structure is created in a special ELF section at every + * allocation callsite. At runtime, the special section is treated as + * an array of these. Embedded codetag utilizes codetag framework. + */ +struct alloc_tag { + struct codetag ct; + struct alloc_tag_counters __percpu *counters; +} __aligned(8); + +#ifdef CONFIG_MEM_ALLOC_PROFILING + +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) +{ + return container_of(ct, struct alloc_tag, ct); +} + +#ifdef ARCH_NEEDS_WEAK_PER_CPU +/* + * When percpu variables are required to be defined as weak, static percpu + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION). 
+ */ +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU" +#endif + +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) \ + static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ + static struct alloc_tag _alloc_tag __used __aligned(8) \ + __section("alloc_tags") = { \ + .ct = CODE_TAG_INIT, \ + .counters = &_alloc_tag_cntr }; \ + struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag) + +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, + mem_alloc_profiling_key); + +static inline bool mem_alloc_profiling_enabled(void) +{ + return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, + &mem_alloc_profiling_key); +} + +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) +{ + struct alloc_tag_counters v = { 0, 0 }; + struct alloc_tag_counters *counter; + int cpu; + + for_each_possible_cpu(cpu) { + counter = per_cpu_ptr(tag->counters, cpu); + v.bytes += counter->bytes; + v.calls += counter->calls; + } + + return v; +} + +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes) +{ + struct alloc_tag *tag; + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); +#endif + if (!ref || !ref->ct) + return; + + tag = ct_to_alloc_tag(ref->ct); + + this_cpu_sub(tag->counters->bytes, bytes); + this_cpu_dec(tag->counters->calls); + + ref->ct = NULL; +} + +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) +{ + __alloc_tag_sub(ref, bytes); +} + +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) +{ + __alloc_tag_sub(ref, bytes); +} + +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) +{ +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN_ONCE(ref && ref->ct, + "alloc_tag was not cleared (got tag for %s:%u)\n",\ + ref->ct->filename, ref->ct->lineno); + + WARN_ONCE(!tag, "current->alloc_tag not set"); +#endif + if (!ref || 
!tag) + return; + + ref->ct = &tag->ct; + this_cpu_add(tag->counters->bytes, bytes); + this_cpu_inc(tag->counters->calls); +} + +#else + +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {} +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, + size_t bytes) {} + +#endif + +#endif /* _LINUX_ALLOC_TAG_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..da68a10517c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -770,6 +770,10 @@ struct task_struct { unsigned int flags; unsigned int ptrace; +#ifdef CONFIG_MEM_ALLOC_PROFILING + struct alloc_tag *alloc_tag; +#endif + #ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; @@ -810,6 +814,7 @@ struct task_struct { struct task_group *sched_task_group; #endif + #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. 
@@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } extern void sched_set_stop_task(int cpu, struct task_struct *stop); +#ifdef CONFIG_MEM_ALLOC_PROFILING +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) +{ + swap(current->alloc_tag, tag); + return tag; +} + +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) +{ +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); +#endif + current->alloc_tag = old; +} +#else +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; } +#define alloc_tag_restore(_tag, _old) +#endif + #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0be2d00c3696..78d258ca508f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -972,6 +972,31 @@ config CODE_TAGGING bool select KALLSYMS +config MEM_ALLOC_PROFILING + bool "Enable memory allocation profiling" + default n + depends on PROC_FS + depends on !DEBUG_FORCE_WEAK_PER_CPU + select CODE_TAGGING + help + Track allocation source code and record total allocation size + initiated at that code location. The mechanism can be used to track + memory leaks with a low performance and memory impact. + +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT + bool "Enable memory allocation profiling by default" + default y + depends on MEM_ALLOC_PROFILING + +config MEM_ALLOC_PROFILING_DEBUG + bool "Memory allocation profiler debugging" + default n + depends on MEM_ALLOC_PROFILING + select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT + help + Adds warnings with helpful error messages for memory allocation + profiling. 
+ source "lib/Kconfig.kasan" source "lib/Kconfig.kfence" source "lib/Kconfig.kmsan" diff --git a/lib/Makefile b/lib/Makefile index 6b48b22fdfac..859112f09bf5 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-$(CONFIG_CODE_TAGGING) += codetag.o +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o + lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c new file mode 100644 index 000000000000..4fc031f9cefd --- /dev/null +++ b/lib/alloc_tag.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/alloc_tag.h> +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_buf.h> +#include <linux/seq_file.h> + +static struct codetag_type *alloc_tag_cttype; + +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, + mem_alloc_profiling_key); + +static void *allocinfo_start(struct seq_file *m, loff_t *pos) +{ + struct codetag_iterator *iter; + struct codetag *ct; + loff_t node = *pos; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + m->private = iter; + if (!iter) + return NULL; + + codetag_lock_module_list(alloc_tag_cttype, true); + *iter = codetag_get_ct_iter(alloc_tag_cttype); + while ((ct = codetag_next_ct(iter)) != NULL && node) + node--; + + return ct ? 
iter : NULL; +} + +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) +{ + struct codetag_iterator *iter = (struct codetag_iterator *)arg; + struct codetag *ct = codetag_next_ct(iter); + + (*pos)++; + if (!ct) + return NULL; + + return iter; +} + +static void allocinfo_stop(struct seq_file *m, void *arg) +{ + struct codetag_iterator *iter = (struct codetag_iterator *)m->private; + + if (iter) { + codetag_lock_module_list(alloc_tag_cttype, false); + kfree(iter); + } +} + +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) +{ + struct alloc_tag *tag = ct_to_alloc_tag(ct); + struct alloc_tag_counters counter = alloc_tag_read(tag); + s64 bytes = counter.bytes; + char val[10], *p = val; + + if (bytes < 0) { + *p++ = '-'; + bytes = -bytes; + } + + string_get_size(bytes, 1, + STRING_SIZE_BASE2|STRING_SIZE_NOSPACE, + p, val + ARRAY_SIZE(val) - p); + + seq_buf_printf(out, "%8s %8llu ", val, counter.calls); + codetag_to_text(out, ct); + seq_buf_putc(out, ' '); + seq_buf_putc(out, '\n'); +} + +static int allocinfo_show(struct seq_file *m, void *arg) +{ + struct codetag_iterator *iter = (struct codetag_iterator *)arg; + char *bufp; + size_t n = seq_get_buf(m, &bufp); + struct seq_buf buf; + + seq_buf_init(&buf, bufp, n); + alloc_tag_to_text(&buf, iter->ct); + seq_commit(m, seq_buf_used(&buf)); + return 0; +} + +static const struct seq_operations allocinfo_seq_op = { + .start = allocinfo_start, + .next = allocinfo_next, + .stop = allocinfo_stop, + .show = allocinfo_show, +}; + +static void __init procfs_init(void) +{ + proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); +} + +static bool alloc_tag_module_unload(struct codetag_type *cttype, + struct codetag_module *cmod) +{ + struct codetag_iterator iter = codetag_get_ct_iter(cttype); + struct alloc_tag_counters counter; + bool module_unused = true; + struct alloc_tag *tag; + struct codetag *ct; + + for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { + if 
(iter.cmod != cmod) + continue; + + tag = ct_to_alloc_tag(ct); + counter = alloc_tag_read(tag); + + if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload", + ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) + module_unused = false; + } + + return module_unused; +} + +static struct ctl_table memory_allocation_profiling_sysctls[] = { + { + .procname = "mem_profiling", + .data = &mem_alloc_profiling_key, +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + .mode = 0444, +#else + .mode = 0644, +#endif + .proc_handler = proc_do_static_key, + }, + { } +}; + +static int __init alloc_tag_init(void) +{ + const struct codetag_type_desc desc = { + .section = "alloc_tags", + .tag_size = sizeof(struct alloc_tag), + .module_unload = alloc_tag_module_unload, + }; + + alloc_tag_cttype = codetag_register_type(&desc); + if (IS_ERR_OR_NULL(alloc_tag_cttype)) + return PTR_ERR(alloc_tag_cttype); + + register_sysctl_init("vm", memory_allocation_profiling_sysctls); + procfs_init(); + + return 0; +} +module_init(alloc_tag_init); diff --git a/scripts/module.lds.S b/scripts/module.lds.S index bf5bcf2836d8..45c67a0994f3 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -9,6 +9,8 @@ #define DISCARD_EH_FRAME *(.eh_frame) #endif +#include <asm-generic/codetag.lds.h> + SECTIONS { /DISCARD/ : { *(.discard) @@ -47,12 +49,17 @@ SECTIONS { .data : { *(.data .data.[0-9a-zA-Z_]*) *(.data..L*) + CODETAG_SECTIONS() } .rodata : { *(.rodata .rodata.[0-9a-zA-Z_]*) *(.rodata..L*) } +#else + .data : { + CODETAG_SECTIONS() + } #endif }