Message ID | 20240212213922.783301-32-surenb@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Memory allocation profiling | expand |
On Mon, Feb 12, 2024 at 01:39:17PM -0800, Suren Baghdasaryan wrote: > Include allocations in show_mem reports. > > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > --- > include/linux/alloc_tag.h | 2 ++ > lib/alloc_tag.c | 38 ++++++++++++++++++++++++++++++++++++++ > mm/show_mem.c | 15 +++++++++++++++ > 3 files changed, 55 insertions(+) > > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h > index 3fe51e67e231..0a5973c4ad77 100644 > --- a/include/linux/alloc_tag.h > +++ b/include/linux/alloc_tag.h > @@ -30,6 +30,8 @@ struct alloc_tag { > > #ifdef CONFIG_MEM_ALLOC_PROFILING > > +void alloc_tags_show_mem_report(struct seq_buf *s); > + > static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) > { > return container_of(ct, struct alloc_tag, ct); > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c > index 2d5226d9262d..54312c213860 100644 > --- a/lib/alloc_tag.c > +++ b/lib/alloc_tag.c > @@ -96,6 +96,44 @@ static const struct seq_operations allocinfo_seq_op = { > .show = allocinfo_show, > }; > > +void alloc_tags_show_mem_report(struct seq_buf *s) > +{ > + struct codetag_iterator iter; > + struct codetag *ct; > + struct { > + struct codetag *tag; > + size_t bytes; > + } tags[10], n; > + unsigned int i, nr = 0; > + > + codetag_lock_module_list(alloc_tag_cttype, true); > + iter = codetag_get_ct_iter(alloc_tag_cttype); > + while ((ct = codetag_next_ct(&iter))) { > + struct alloc_tag_counters counter = alloc_tag_read(ct_to_alloc_tag(ct)); > + > + n.tag = ct; > + n.bytes = counter.bytes; > + > + for (i = 0; i < nr; i++) > + if (n.bytes > tags[i].bytes) > + break; > + > + if (i < ARRAY_SIZE(tags)) { > + nr -= nr == ARRAY_SIZE(tags); > + memmove(&tags[i + 1], > + &tags[i], > + sizeof(tags[0]) * (nr - i)); > + nr++; > + tags[i] = n; > + } > + } > + > + for (i = 0; i < nr; i++) > + alloc_tag_to_text(s, tags[i].tag); > + > + codetag_lock_module_list(alloc_tag_cttype, false); > +} > + > static void __init procfs_init(void) > { > proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); > diff --git a/mm/show_mem.c b/mm/show_mem.c > index 8dcfafbd283c..d514c15ca076 100644 > --- a/mm/show_mem.c > +++ b/mm/show_mem.c > @@ -12,6 +12,7 @@ > #include <linux/hugetlb.h> > #include <linux/mm.h> > #include <linux/mmzone.h> > +#include <linux/seq_buf.h> > #include <linux/swap.h> > #include <linux/vmstat.h> > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > #ifdef CONFIG_MEMORY_FAILURE > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > #endif > +#ifdef CONFIG_MEM_ALLOC_PROFILING > + { > + struct seq_buf s; > + char *buf = kmalloc(4096, GFP_ATOMIC); Why 4096? Maybe use PAGE_SIZE instead? > + > + if (buf) { > + printk("Memory allocations:\n"); This needs a printk prefix, or better yet, just use pr_info() or similar. > + seq_buf_init(&s, buf, 4096); > + alloc_tags_show_mem_report(&s); > + printk("%s", buf); Once a seq_buf "consumes" a char *, please don't use any directly any more. This should be: pr_info("%s", seq_buf_str(s)); Otherwise %NUL termination isn't certain. Very likely, given the use case here, but let's use good hygiene. :) > + kfree(buf); > + } > + } > +#endif > } > -- > 2.43.0.687.g38aa6559b0-goog >
On Mon, 12 Feb 2024 16:10:02 -0800 Kees Cook <keescook@chromium.org> wrote: > > #endif > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > + { > > + struct seq_buf s; > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > Why 4096? Maybe use PAGE_SIZE instead? Will it make a difference for architectures that don't have 4096 PAGE_SIZE? Like PowerPC which has PAGE_SIZE of anywhere between 4K to 256K! -- Steve
On Mon, Feb 12, 2024 at 07:22:42PM -0500, Steven Rostedt wrote: > On Mon, 12 Feb 2024 16:10:02 -0800 > Kees Cook <keescook@chromium.org> wrote: > > > > #endif > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > + { > > > + struct seq_buf s; > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > > > Why 4096? Maybe use PAGE_SIZE instead? > > Will it make a difference for architectures that don't have 4096 PAGE_SIZE? > Like PowerPC which has PAGE_SIZE of anywhere between 4K to 256K! it's just a string buffer
On Mon, Feb 12, 2024 at 8:33 PM Kent Overstreet <kent.overstreet@linux.dev> wrote: > > On Mon, Feb 12, 2024 at 07:22:42PM -0500, Steven Rostedt wrote: > > On Mon, 12 Feb 2024 16:10:02 -0800 > > Kees Cook <keescook@chromium.org> wrote: > > > > > > #endif > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > + { > > > > + struct seq_buf s; > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > > > > > Why 4096? Maybe use PAGE_SIZE instead? > > > > Will it make a difference for architectures that don't have 4096 PAGE_SIZE? > > Like PowerPC which has PAGE_SIZE of anywhere between 4K to 256K! > > it's just a string buffer We should document that __show_mem() prints only the top 10 largest allocations, therefore as long as this buffer is large enough to hold 10 records we should be good. Technically we could simply print one record at a time and then the buffer can be smaller.
On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: [...] > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > #ifdef CONFIG_MEMORY_FAILURE > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > #endif > +#ifdef CONFIG_MEM_ALLOC_PROFILING > + { > + struct seq_buf s; > + char *buf = kmalloc(4096, GFP_ATOMIC); > + > + if (buf) { > + printk("Memory allocations:\n"); > + seq_buf_init(&s, buf, 4096); > + alloc_tags_show_mem_report(&s); > + printk("%s", buf); > + kfree(buf); > + } > + } > +#endif I am pretty sure I have already objected to this. Memory allocations in the oom path are simply no go unless there is absolutely no other way around that. In this case the buffer could be preallocated.
On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > [...] > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > > #ifdef CONFIG_MEMORY_FAILURE > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > > #endif > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > + { > > + struct seq_buf s; > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > + > > + if (buf) { > > + printk("Memory allocations:\n"); > > + seq_buf_init(&s, buf, 4096); > > + alloc_tags_show_mem_report(&s); > > + printk("%s", buf); > > + kfree(buf); > > + } > > + } > > +#endif > > I am pretty sure I have already objected to this. Memory allocations in > the oom path are simply no go unless there is absolutely no other way > around that. In this case the buffer could be preallocated. Good point. We will change this to a smaller buffer allocated on the stack and will print records one-by-one. Thanks! > > -- > Michal Hocko > SUSE Labs
On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > > [...] > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > > > #ifdef CONFIG_MEMORY_FAILURE > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > > > #endif > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > + { > > > + struct seq_buf s; > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > > + > > > + if (buf) { > > > + printk("Memory allocations:\n"); > > > + seq_buf_init(&s, buf, 4096); > > > + alloc_tags_show_mem_report(&s); > > > + printk("%s", buf); > > > + kfree(buf); > > > + } > > > + } > > > +#endif > > > > I am pretty sure I have already objected to this. Memory allocations in > > the oom path are simply no go unless there is absolutely no other way > > around that. In this case the buffer could be preallocated. > > Good point. We will change this to a smaller buffer allocated on the > stack and will print records one-by-one. Thanks! __show_mem could be called with a very deep call chains. A single pre-allocated buffer should just do ok.
On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote: > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > > > [...] > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > > > > #ifdef CONFIG_MEMORY_FAILURE > > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > > > > #endif > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > + { > > > > + struct seq_buf s; > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > > > + > > > > + if (buf) { > > > > + printk("Memory allocations:\n"); > > > > + seq_buf_init(&s, buf, 4096); > > > > + alloc_tags_show_mem_report(&s); > > > > + printk("%s", buf); > > > > + kfree(buf); > > > > + } > > > > + } > > > > +#endif > > > > > > I am pretty sure I have already objected to this. Memory allocations in > > > the oom path are simply no go unless there is absolutely no other way > > > around that. In this case the buffer could be preallocated. > > > > Good point. We will change this to a smaller buffer allocated on the > > stack and will print records one-by-one. Thanks! > > __show_mem could be called with a very deep call chains. A single > pre-allocated buffer should just do ok. Ack. Will do. > > -- > Michal Hocko > SUSE Labs
On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote: > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote: > > > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: > > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > > > > [...] > > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > > > > > #ifdef CONFIG_MEMORY_FAILURE > > > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > > > > > #endif > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > > + { > > > > > + struct seq_buf s; > > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > > > > + > > > > > + if (buf) { > > > > > + printk("Memory allocations:\n"); > > > > > + seq_buf_init(&s, buf, 4096); > > > > > + alloc_tags_show_mem_report(&s); > > > > > + printk("%s", buf); > > > > > + kfree(buf); > > > > > + } > > > > > + } > > > > > +#endif > > > > > > > > I am pretty sure I have already objected to this. Memory allocations in > > > > the oom path are simply no go unless there is absolutely no other way > > > > around that. In this case the buffer could be preallocated. > > > > > > Good point. We will change this to a smaller buffer allocated on the > > > stack and will print records one-by-one. Thanks! > > > > __show_mem could be called with a very deep call chains. A single > > pre-allocated buffer should just do ok. > > Ack. Will do. No, we're not going to permanently burn 4k here. It's completely fine if the allocation fails, there's nothing "unsafe" about doing a GFP_ATOMIC allocation here.
On Thu, Feb 15, 2024 at 10:29 AM Kent Overstreet <kent.overstreet@linux.dev> wrote: > > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote: > > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: > > > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > > > > > [...] > > > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > > > > > > #ifdef CONFIG_MEMORY_FAILURE > > > > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > > > > > > #endif > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > > > + { > > > > > > + struct seq_buf s; > > > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > > > > > + > > > > > > + if (buf) { > > > > > > + printk("Memory allocations:\n"); > > > > > > + seq_buf_init(&s, buf, 4096); > > > > > > + alloc_tags_show_mem_report(&s); > > > > > > + printk("%s", buf); > > > > > > + kfree(buf); > > > > > > + } > > > > > > + } > > > > > > +#endif > > > > > > > > > > I am pretty sure I have already objected to this. Memory allocations in > > > > > the oom path are simply no go unless there is absolutely no other way > > > > > around that. In this case the buffer could be preallocated. > > > > > > > > Good point. We will change this to a smaller buffer allocated on the > > > > stack and will print records one-by-one. Thanks! > > > > > > __show_mem could be called with a very deep call chains. A single > > > pre-allocated buffer should just do ok. > > > > Ack. Will do. > > No, we're not going to permanently burn 4k here. We don't need 4K here. Just enough to store one line and then print these 10 highest allocations one line at a time. This way we can also change that 10 to any higher number we like without any side effects. > > It's completely fine if the allocation fails, there's nothing "unsafe" > about doing a GFP_ATOMIC allocation here.
On Thu, Feb 15, 2024 at 10:33:53AM -0800, Suren Baghdasaryan wrote: > On Thu, Feb 15, 2024 at 10:29 AM Kent Overstreet > <kent.overstreet@linux.dev> wrote: > > > > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote: > > > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: > > > > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > > > > > > [...] > > > > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > > > > > > > #ifdef CONFIG_MEMORY_FAILURE > > > > > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > > > > > > > #endif > > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > > > > + { > > > > > > > + struct seq_buf s; > > > > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > > > > > > + > > > > > > > + if (buf) { > > > > > > > + printk("Memory allocations:\n"); > > > > > > > + seq_buf_init(&s, buf, 4096); > > > > > > > + alloc_tags_show_mem_report(&s); > > > > > > > + printk("%s", buf); > > > > > > > + kfree(buf); > > > > > > > + } > > > > > > > + } > > > > > > > +#endif > > > > > > > > > > > > I am pretty sure I have already objected to this. Memory allocations in > > > > > > the oom path are simply no go unless there is absolutely no other way > > > > > > around that. In this case the buffer could be preallocated. > > > > > > > > > > Good point. We will change this to a smaller buffer allocated on the > > > > > stack and will print records one-by-one. Thanks! > > > > > > > > __show_mem could be called with a very deep call chains. A single > > > > pre-allocated buffer should just do ok. > > > > > > Ack. Will do. > > > > No, we're not going to permanently burn 4k here. > > We don't need 4K here. Just enough to store one line and then print > these 10 highest allocations one line at a time. This way we can also > change that 10 to any higher number we like without any side effects. There's no reason to make the change at all. If Michal thinks there's something "dangerous" about allocating a buffer here, he needs to able to explain what it is.
On Thu 15-02-24 13:29:40, Kent Overstreet wrote: > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote: > > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: > > > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > > > > > [...] > > > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > > > > > > #ifdef CONFIG_MEMORY_FAILURE > > > > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > > > > > > #endif > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > > > + { > > > > > > + struct seq_buf s; > > > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > > > > > + > > > > > > + if (buf) { > > > > > > + printk("Memory allocations:\n"); > > > > > > + seq_buf_init(&s, buf, 4096); > > > > > > + alloc_tags_show_mem_report(&s); > > > > > > + printk("%s", buf); > > > > > > + kfree(buf); > > > > > > + } > > > > > > + } > > > > > > +#endif > > > > > > > > > > I am pretty sure I have already objected to this. Memory allocations in > > > > > the oom path are simply no go unless there is absolutely no other way > > > > > around that. In this case the buffer could be preallocated. > > > > > > > > Good point. We will change this to a smaller buffer allocated on the > > > > stack and will print records one-by-one. Thanks! > > > > > > __show_mem could be called with a very deep call chains. A single > > > pre-allocated buffer should just do ok. > > > > Ack. Will do. > > No, we're not going to permanently burn 4k here. > > It's completely fine if the allocation fails, there's nothing "unsafe" > about doing a GFP_ATOMIC allocation here. Nobody is talking about safety. This is just a wrong thing to do when you are likely under OOM situation. This is a situation when you GFP_ATOMIC allocation is _likely_ to fail. Yes, yes you will get some additional memory reservers head room, but you shouldn't rely on that because that will make the output unreliable. Not something you want in situation when you really want to know that information. More over you do not need to preallocate a full page.
On Thu, Feb 15, 2024 at 10:41 AM Michal Hocko <mhocko@suse.com> wrote: > > On Thu 15-02-24 13:29:40, Kent Overstreet wrote: > > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote: > > > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: > > > > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > > > > > > [...] > > > > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > > > > > > > #ifdef CONFIG_MEMORY_FAILURE > > > > > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > > > > > > > #endif > > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > > > > > > + { > > > > > > > + struct seq_buf s; > > > > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > > > > > > + > > > > > > > + if (buf) { > > > > > > > + printk("Memory allocations:\n"); > > > > > > > + seq_buf_init(&s, buf, 4096); > > > > > > > + alloc_tags_show_mem_report(&s); > > > > > > > + printk("%s", buf); > > > > > > > + kfree(buf); > > > > > > > + } > > > > > > > + } > > > > > > > +#endif > > > > > > > > > > > > I am pretty sure I have already objected to this. Memory allocations in > > > > > > the oom path are simply no go unless there is absolutely no other way > > > > > > around that. In this case the buffer could be preallocated. > > > > > > > > > > Good point. We will change this to a smaller buffer allocated on the > > > > > stack and will print records one-by-one. Thanks! > > > > > > > > __show_mem could be called with a very deep call chains. A single > > > > pre-allocated buffer should just do ok. > > > > > > Ack. Will do. > > > > No, we're not going to permanently burn 4k here. > > > > It's completely fine if the allocation fails, there's nothing "unsafe" > > about doing a GFP_ATOMIC allocation here. > > Nobody is talking about safety. This is just a wrong thing to do when > you are likely under OOM situation. This is a situation when you > GFP_ATOMIC allocation is _likely_ to fail. Yes, yes you will get some > additional memory reservers head room, but you shouldn't rely on that > because that will make the output unreliable. Not something you want in > situation when you really want to know that information. > > More over you do not need to preallocate a full page. Folks, please stop arguing about it. We have more important things to do. I'll fix it to use a small preallocated buffer. > > -- > Michal Hocko > SUSE Labs
On 2/15/24 19:29, Kent Overstreet wrote: > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote: >> On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote: >> > >> > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: >> > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: >> > > > >> > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: >> > > > [...] >> > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) >> > > > > #ifdef CONFIG_MEMORY_FAILURE >> > > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); >> > > > > #endif >> > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING >> > > > > + { >> > > > > + struct seq_buf s; >> > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); >> > > > > + >> > > > > + if (buf) { >> > > > > + printk("Memory allocations:\n"); >> > > > > + seq_buf_init(&s, buf, 4096); >> > > > > + alloc_tags_show_mem_report(&s); >> > > > > + printk("%s", buf); >> > > > > + kfree(buf); >> > > > > + } >> > > > > + } >> > > > > +#endif >> > > > >> > > > I am pretty sure I have already objected to this. Memory allocations in >> > > > the oom path are simply no go unless there is absolutely no other way >> > > > around that. In this case the buffer could be preallocated. >> > > >> > > Good point. We will change this to a smaller buffer allocated on the >> > > stack and will print records one-by-one. Thanks! >> > >> > __show_mem could be called with a very deep call chains. A single >> > pre-allocated buffer should just do ok. >> >> Ack. Will do. > > No, we're not going to permanently burn 4k here. > > It's completely fine if the allocation fails, there's nothing "unsafe" > about doing a GFP_ATOMIC allocation here. Well, I think without __GFP_NOWARN it will cause a warning and thus recursion into __show_mem(), potentially infinite? Which is of course trivial to fix, but I'd myself rather sacrifice a bit of memory to get this potentially very useful output, if I enabled the profiling. The necessary memory overhead of page_ext and slabobj_ext makes the printing buffer overhead negligible in comparison?
On Thu, Feb 15, 2024 at 09:22:07PM +0100, Vlastimil Babka wrote: > On 2/15/24 19:29, Kent Overstreet wrote: > > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote: > >> On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote: > >> > > >> > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: > >> > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > >> > > > > >> > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > >> > > > [...] > >> > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > >> > > > > #ifdef CONFIG_MEMORY_FAILURE > >> > > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > >> > > > > #endif > >> > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > >> > > > > + { > >> > > > > + struct seq_buf s; > >> > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > >> > > > > + > >> > > > > + if (buf) { > >> > > > > + printk("Memory allocations:\n"); > >> > > > > + seq_buf_init(&s, buf, 4096); > >> > > > > + alloc_tags_show_mem_report(&s); > >> > > > > + printk("%s", buf); > >> > > > > + kfree(buf); > >> > > > > + } > >> > > > > + } > >> > > > > +#endif > >> > > > > >> > > > I am pretty sure I have already objected to this. Memory allocations in > >> > > > the oom path are simply no go unless there is absolutely no other way > >> > > > around that. In this case the buffer could be preallocated. > >> > > > >> > > Good point. We will change this to a smaller buffer allocated on the > >> > > stack and will print records one-by-one. Thanks! > >> > > >> > __show_mem could be called with a very deep call chains. A single > >> > pre-allocated buffer should just do ok. > >> > >> Ack. Will do. > > > > No, we're not going to permanently burn 4k here. > > > > It's completely fine if the allocation fails, there's nothing "unsafe" > > about doing a GFP_ATOMIC allocation here. > > Well, I think without __GFP_NOWARN it will cause a warning and thus > recursion into __show_mem(), potentially infinite? Which is of course > trivial to fix, but I'd myself rather sacrifice a bit of memory to get this > potentially very useful output, if I enabled the profiling. The necessary > memory overhead of page_ext and slabobj_ext makes the printing buffer > overhead negligible in comparison? __GFP_NOWARN is a good point, we should have that. But - and correct me if I'm wrong here - doesn't an OOM kick in well before GFP_ATOMIC 4k allocations are failing? I'd expect the system to be well and truly hosed at that point. If we want this report to be 100% reliable, then yes the preallocated buffer makes sense - but I don't think 100% makes sense here; I think we can accept ~99% and give back that 4k.
On Thu 15-02-24 15:33:30, Kent Overstreet wrote: > On Thu, Feb 15, 2024 at 09:22:07PM +0100, Vlastimil Babka wrote: > > On 2/15/24 19:29, Kent Overstreet wrote: > > > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote: > > >> On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote: > > >> > > > >> > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote: > > >> > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote: > > >> > > > > > >> > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote: > > >> > > > [...] > > >> > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) > > >> > > > > #ifdef CONFIG_MEMORY_FAILURE > > >> > > > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); > > >> > > > > #endif > > >> > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING > > >> > > > > + { > > >> > > > > + struct seq_buf s; > > >> > > > > + char *buf = kmalloc(4096, GFP_ATOMIC); > > >> > > > > + > > >> > > > > + if (buf) { > > >> > > > > + printk("Memory allocations:\n"); > > >> > > > > + seq_buf_init(&s, buf, 4096); > > >> > > > > + alloc_tags_show_mem_report(&s); > > >> > > > > + printk("%s", buf); > > >> > > > > + kfree(buf); > > >> > > > > + } > > >> > > > > + } > > >> > > > > +#endif > > >> > > > > > >> > > > I am pretty sure I have already objected to this. Memory allocations in > > >> > > > the oom path are simply no go unless there is absolutely no other way > > >> > > > around that. In this case the buffer could be preallocated. > > >> > > > > >> > > Good point. We will change this to a smaller buffer allocated on the > > >> > > stack and will print records one-by-one. Thanks! > > >> > > > >> > __show_mem could be called with a very deep call chains. A single > > >> > pre-allocated buffer should just do ok. > > >> > > >> Ack. Will do. > > > > > > No, we're not going to permanently burn 4k here. > > > > > > It's completely fine if the allocation fails, there's nothing "unsafe" > > > about doing a GFP_ATOMIC allocation here. > > > > Well, I think without __GFP_NOWARN it will cause a warning and thus > > recursion into __show_mem(), potentially infinite? Which is of course > > trivial to fix, but I'd myself rather sacrifice a bit of memory to get this > > potentially very useful output, if I enabled the profiling. The necessary > > memory overhead of page_ext and slabobj_ext makes the printing buffer > > overhead negligible in comparison? > > __GFP_NOWARN is a good point, we should have that. > > But - and correct me if I'm wrong here - doesn't an OOM kick in well > before GFP_ATOMIC 4k allocations are failing? Not really, GFP_ATOMIC users can compete with reclaimers and consume those reserves. > I'd expect the system to > be well and truly hosed at that point. It is OOMed... > If we want this report to be 100% reliable, then yes the preallocated > buffer makes sense - but I don't think 100% makes sense here; I think we > can accept ~99% and give back that 4k. Think about that from the memory reserves consumers. The atomic reserve is a scarse resource and now you want to use it for debugging purposes for which you could have preallocated.
On Thu, Feb 15, 2024 at 10:54:53PM +0100, Michal Hocko wrote: > On Thu 15-02-24 15:33:30, Kent Overstreet wrote: > > If we want this report to be 100% reliable, then yes the preallocated > > buffer makes sense - but I don't think 100% makes sense here; I think we > > can accept ~99% and give back that 4k. > > Think about that from the memory reserves consumers. The atomic reserve > is a scarse resource and now you want to use it for debugging purposes > for which you could have preallocated. _Memory_ is a finite resource that we shouldn't be using unnecessarily. We don't need this for the entire time we're under memory pressure; just the short duration it takes to generate the report, then it's back available for other users. You would have us dedicate 4k, from system bootup, that can never be used by other users. Again: this makes no sense. The whole point of having watermarks and shared reserves is so that every codepath doesn't have to have its own dedicated, private reserve, so that we can make better use of a shared finite resource.
On Thu, 15 Feb 2024 15:33:30 -0500 Kent Overstreet <kent.overstreet@linux.dev> wrote: > > Well, I think without __GFP_NOWARN it will cause a warning and thus > > recursion into __show_mem(), potentially infinite? Which is of course > > trivial to fix, but I'd myself rather sacrifice a bit of memory to get > > this potentially very useful output, if I enabled the profiling. The > > necessary memory overhead of page_ext and slabobj_ext makes the > > printing buffer overhead negligible in comparison? > > __GFP_NOWARN is a good point, we should have that. > > But - and correct me if I'm wrong here - doesn't an OOM kick in well > before GFP_ATOMIC 4k allocations are failing? I'd expect the system to > be well and truly hosed at that point. > > If we want this report to be 100% reliable, then yes the preallocated > buffer makes sense - but I don't think 100% makes sense here; I think we > can accept ~99% and give back that 4k. I just compiled v6.8-rc4 vanilla (with a fedora localmodconfig build) and saved it off (vmlinux.orig), then I compiled with the following: Applied the patches but did not enable anything: vmlinux.memtag-off Enabled MEM_ALLOC_PROFILING: vmlinux.memtag Enabled MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT: vmlinux.memtag-default-on Enabled MEM_ALLOC_PROFILING_DEBUG: vmlinux.memtag-debug And here's what I got: text data bss dec hex filename 29161847 18352730 5619716 53134293 32ac3d5 vmlinux.orig 29162286 18382638 5595140 53140064 32ada60 vmlinux.memtag-off (+5771) 29230868 18887662 5275652 53394182 32ebb06 vmlinux.memtag (+259889) 29230746 18887662 5275652 53394060 32eba8c vmlinux.memtag-default-on (+259767) dropped? 29276214 18946374 5177348 53399936 32ed180 vmlinux.memtag-debug (+265643) Just adding the patches increases the size by 5k. But the rest shows an increase of 259k, and you are worried about 4k (and possibly less?)??? -- Steve
On Thu, 15 Feb 2024 18:07:42 -0500 Steven Rostedt <rostedt@goodmis.org> wrote: > text data bss dec hex filename > 29161847 18352730 5619716 53134293 32ac3d5 vmlinux.orig > 29162286 18382638 5595140 53140064 32ada60 vmlinux.memtag-off (+5771) > 29230868 18887662 5275652 53394182 32ebb06 vmlinux.memtag (+259889) > 29230746 18887662 5275652 53394060 32eba8c vmlinux.memtag-default-on (+259767) dropped? > 29276214 18946374 5177348 53399936 32ed180 vmlinux.memtag-debug (+265643) If you plan on running this in production, and this increases the size of the text by 68k, have you measured the I$ pressure that this may induce? That is, what is the full overhead of having this enabled, as it could cause more instruction cache misses? I wonder if there has been measurements of it off. That is, having this configured in but default off still increases the text size by 68k. That can't be good on the instruction cache. -- Steve
On 2/15/24 15:07, Steven Rostedt wrote: > Just adding the patches increases the size by 5k. But the rest shows an > increase of 259k, and you are worried about 4k (and possibly less?)??? Doesn't the new page_ext thingy add a pointer per 'struct page', or ~0.2% of RAM, or ~32MB on a 16GB laptop? I, too, am confused why 4k is even remotely an issue.
On Thu, 15 Feb 2024 18:16:48 -0500 Steven Rostedt <rostedt@goodmis.org> wrote: > On Thu, 15 Feb 2024 18:07:42 -0500 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > text data bss dec hex filename > > 29161847 18352730 5619716 53134293 32ac3d5 vmlinux.orig > > 29162286 18382638 5595140 53140064 32ada60 vmlinux.memtag-off (+5771) > > 29230868 18887662 5275652 53394182 32ebb06 vmlinux.memtag (+259889) > > 29230746 18887662 5275652 53394060 32eba8c vmlinux.memtag-default-on (+259767) dropped? > > 29276214 18946374 5177348 53399936 32ed180 vmlinux.memtag-debug (+265643) > > If you plan on running this in production, and this increases the size of > the text by 68k, have you measured the I$ pressure that this may induce? > That is, what is the full overhead of having this enabled, as it could > cause more instruction cache misses? > > I wonder if there has been measurements of it off. That is, having this > configured in but default off still increases the text size by 68k. That > can't be good on the instruction cache. > I should have read the cover letter ;-) (someone pointed me to that on IRC): > Performance overhead: > To evaluate performance we implemented an in-kernel test executing > multiple get_free_page/free_page and kmalloc/kfree calls with allocation > sizes growing from 8 to 240 bytes with CPU frequency set to max and CPU > affinity set to a specific CPU to minimize the noise. Below are results > from running the test on Ubuntu 22.04.2 LTS with 6.8.0-rc1 kernel on > 56 core Intel Xeon: These are micro benchmarks, were any larger benchmarks taken? As microbenchmarks do not always show I$ issues (because the benchmark itself will warm up the cache). The cache issue could slow down tasks at a bigger picture, as it can cause more cache misses. Running other benchmarks under perf and recording the cache misses between the different configs would be a good picture to show. > > kmalloc pgalloc > (1 baseline) 6.764s 16.902s > (2 default disabled) 6.793s (+0.43%) 17.007s (+0.62%) > (3 default enabled) 7.197s (+6.40%) 23.666s (+40.02%) > (4 runtime enabled) 7.405s (+9.48%) 23.901s (+41.41%) > (5 memcg) 13.388s (+97.94%) 48.460s (+186.71%) > > Memory overhead: > Kernel size: > > text data bss dec diff > (1) 26515311 18890222 17018880 62424413 > (2) 26524728 19423818 16740352 62688898 264485 > (3) 26524724 19423818 16740352 62688894 264481 > (4) 26524728 19423818 16740352 62688898 264485 > (5) 26541782 18964374 16957440 62463596 39183 Similar to my builds. > > Memory consumption on a 56 core Intel CPU with 125GB of memory: > Code tags: 192 kB > PageExts: 262144 kB (256MB) > SlabExts: 9876 kB (9.6MB) > PcpuExts: 512 kB (0.5MB) > > Total overhead is 0.2% of total memory. All this, and we are still worried about 4k for useful debugging :-/ -- Steve
On Thu, Feb 15, 2024 at 06:07:42PM -0500, Steven Rostedt wrote: > On Thu, 15 Feb 2024 15:33:30 -0500 > Kent Overstreet <kent.overstreet@linux.dev> wrote: > > > > Well, I think without __GFP_NOWARN it will cause a warning and thus > > > recursion into __show_mem(), potentially infinite? Which is of course > > > trivial to fix, but I'd myself rather sacrifice a bit of memory to get > > > this potentially very useful output, if I enabled the profiling. The > > > necessary memory overhead of page_ext and slabobj_ext makes the > > > printing buffer overhead negligible in comparison? > > > > __GFP_NOWARN is a good point, we should have that. > > > > But - and correct me if I'm wrong here - doesn't an OOM kick in well > > before GFP_ATOMIC 4k allocations are failing? I'd expect the system to > > be well and truly hosed at that point. > > > > If we want this report to be 100% reliable, then yes the preallocated > > buffer makes sense - but I don't think 100% makes sense here; I think we > > can accept ~99% and give back that 4k. > > I just compiled v6.8-rc4 vanilla (with a fedora localmodconfig build) and > saved it off (vmlinux.orig), then I compiled with the following: > > Applied the patches but did not enable anything: vmlinux.memtag-off > Enabled MEM_ALLOC_PROFILING: vmlinux.memtag > Enabled MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT: vmlinux.memtag-default-on > Enabled MEM_ALLOC_PROFILING_DEBUG: vmlinux.memtag-debug > > And here's what I got: > > text data bss dec hex filename > 29161847 18352730 5619716 53134293 32ac3d5 vmlinux.orig > 29162286 18382638 5595140 53140064 32ada60 vmlinux.memtag-off (+5771) > 29230868 18887662 5275652 53394182 32ebb06 vmlinux.memtag (+259889) > 29230746 18887662 5275652 53394060 32eba8c vmlinux.memtag-default-on (+259767) dropped? > 29276214 18946374 5177348 53399936 32ed180 vmlinux.memtag-debug (+265643) > > Just adding the patches increases the size by 5k. But the rest shows an > increase of 259k, and you are worried about 4k (and possibly less?)??? Most of that is data (505024), not text (68582, or 66k). The data is mostly the alloc tags themselves (one per allocation callsite, and you compiled the entire kernel), so that's expected. Of the text, a lot of that is going to be slowpath stuff - module load and unload hooks, formatt and printing the output, other assorted bits. Then there's Allocation and deallocating obj extensions vectors - not slowpath but not super fast path, not every allocation. The fastpath instruction count overhead is pretty small - actually doing the accounting - the core of slub.c, page_alloc.c, percpu.c - setting/restoring the alloc tag: this is overhead we add to every allocation callsite, so it's the most relevant - but it's just a few instructions. So that's the breakdown. Definitely not zero overhead, but that fixed memory overhead (and additionally, the percpu counters) is the price we pay for very low runtime CPU overhead.
On Thu, Feb 15, 2024 at 03:19:33PM -0800, Dave Hansen wrote: > On 2/15/24 15:07, Steven Rostedt wrote: > > Just adding the patches increases the size by 5k. But the rest shows an > > increase of 259k, and you are worried about 4k (and possibly less?)??? > > Doesn't the new page_ext thingy add a pointer per 'struct page', or > ~0.2% of RAM, or ~32MB on a 16GB laptop? I, too, am confused why 4k is > even remotely an issue. page_ext adds a separate per-page array; it itself does not add a pointer to strugt page, it's an array lookup that uses the page pfn. We do add a pointer to page_ext, and that's (for now) unavoidable overhead - but we'll be looking at referencing code tags by index, and if that works out we'll be able to kill the page_ext dependency and just store the alloc tag index in page bits.
On Thu, Feb 15, 2024 at 06:27:29PM -0500, Steven Rostedt wrote:
> All this, and we are still worried about 4k for useful debugging :-/
Every additional 4k still needs justification. And whether we burn a
reserve on this will have no observable effect on user output in
remotely normal situations; if this allocation ever fails, we've already
been in an OOM situation for awhile and we've already printed out this
report many times, with less memory pressure where the allocation would
have succeeded.
On Thu, 15 Feb 2024 18:51:41 -0500 Kent Overstreet <kent.overstreet@linux.dev> wrote: > Most of that is data (505024), not text (68582, or 66k). > And the 4K extra would have been data too. > The data is mostly the alloc tags themselves (one per allocation > callsite, and you compiled the entire kernel), so that's expected. > > Of the text, a lot of that is going to be slowpath stuff - module load > and unload hooks, formatt and printing the output, other assorted bits. > > Then there's Allocation and deallocating obj extensions vectors - not > slowpath but not super fast path, not every allocation. > > The fastpath instruction count overhead is pretty small > - actually doing the accounting - the core of slub.c, page_alloc.c, > percpu.c > - setting/restoring the alloc tag: this is overhead we add to every > allocation callsite, so it's the most relevant - but it's just a few > instructions. > > So that's the breakdown. Definitely not zero overhead, but that fixed > memory overhead (and additionally, the percpu counters) is the price we > pay for very low runtime CPU overhead. But where are the benchmarks that are not micro-benchmarks. How much overhead does this cause to those? Is it in the noise, or is it noticeable? -- Steve
On Thu, Feb 15, 2024 at 07:21:41PM -0500, Steven Rostedt wrote: > On Thu, 15 Feb 2024 18:51:41 -0500 > Kent Overstreet <kent.overstreet@linux.dev> wrote: > > > Most of that is data (505024), not text (68582, or 66k). > > > > And the 4K extra would have been data too. "It's not that much" isn't an argument for being wasteful. > > The data is mostly the alloc tags themselves (one per allocation > > callsite, and you compiled the entire kernel), so that's expected. > > > > Of the text, a lot of that is going to be slowpath stuff - module load > > and unload hooks, formatt and printing the output, other assorted bits. > > > > Then there's Allocation and deallocating obj extensions vectors - not > > slowpath but not super fast path, not every allocation. > > > > The fastpath instruction count overhead is pretty small > > - actually doing the accounting - the core of slub.c, page_alloc.c, > > percpu.c > > - setting/restoring the alloc tag: this is overhead we add to every > > allocation callsite, so it's the most relevant - but it's just a few > > instructions. > > > > So that's the breakdown. Definitely not zero overhead, but that fixed > > memory overhead (and additionally, the percpu counters) is the price we > > pay for very low runtime CPU overhead. > > But where are the benchmarks that are not micro-benchmarks. How much > overhead does this cause to those? Is it in the noise, or is it noticeable? Microbenchmarks are how we magnify the effect of a change like this to the most we'll ever see. Barring cache effects, it'll be in the noise. Cache effects are a concern here because we're now touching task_struct in the allocation fast path; that is where the "compiled-in-but-turned-off" overhead comes from, because we can't add static keys for that code without doubling the amount of icache footprint, and I don't think that would be a great tradeoff. So: if your code has fastpath allocations where the hot part of task_struct isn't in cache, then this will be noticeable overhead to you, otherwise it won't be.
On Thu, 15 Feb 2024 19:32:38 -0500 Kent Overstreet <kent.overstreet@linux.dev> wrote: > > But where are the benchmarks that are not micro-benchmarks. How much > > overhead does this cause to those? Is it in the noise, or is it noticeable? > > Microbenchmarks are how we magnify the effect of a change like this to > the most we'll ever see. Barring cache effects, it'll be in the noise. > > Cache effects are a concern here because we're now touching task_struct > in the allocation fast path; that is where the > "compiled-in-but-turned-off" overhead comes from, because we can't add > static keys for that code without doubling the amount of icache > footprint, and I don't think that would be a great tradeoff. > > So: if your code has fastpath allocations where the hot part of > task_struct isn't in cache, then this will be noticeable overhead to > you, otherwise it won't be. All nice, but where are the benchmarks? This looks like it will have an affect on cache and you can talk all you want about how it will not be an issue, but without real world benchmarks, it's meaningless. Numbers talk. -- Steve
On Thu, Feb 15, 2024 at 07:39:15PM -0500, Steven Rostedt wrote: > On Thu, 15 Feb 2024 19:32:38 -0500 > Kent Overstreet <kent.overstreet@linux.dev> wrote: > > > > But where are the benchmarks that are not micro-benchmarks. How much > > > overhead does this cause to those? Is it in the noise, or is it noticeable? > > > > Microbenchmarks are how we magnify the effect of a change like this to > > the most we'll ever see. Barring cache effects, it'll be in the noise. > > > > Cache effects are a concern here because we're now touching task_struct > > in the allocation fast path; that is where the > > "compiled-in-but-turned-off" overhead comes from, because we can't add > > static keys for that code without doubling the amount of icache > > footprint, and I don't think that would be a great tradeoff. > > > > So: if your code has fastpath allocations where the hot part of > > task_struct isn't in cache, then this will be noticeable overhead to > > you, otherwise it won't be. > > All nice, but where are the benchmarks? This looks like it will have an > affect on cache and you can talk all you want about how it will not be an > issue, but without real world benchmarks, it's meaningless. Numbers talk. Steve, you're being demanding. We provided sufficient benchmarks to show the overhead is low enough for production, and then I gave you a detailed breakdown of where our overhead is and where it'll show up. I think that's reasonable.
On Thu, 15 Feb 2024 19:50:24 -0500 Kent Overstreet <kent.overstreet@linux.dev> wrote: > > All nice, but where are the benchmarks? This looks like it will have an > > affect on cache and you can talk all you want about how it will not be an > > issue, but without real world benchmarks, it's meaningless. Numbers talk. > > Steve, you're being demanding. We provided sufficient benchmarks to show > the overhead is low enough for production, and then I gave you a > detailed breakdown of where our overhead is and where it'll show up. I > think that's reasonable. It's not unreasonable or demanding to ask for benchmarks. You showed only micro-benchmarks that do not show how cache misses may affect the system. Honestly, it sounds like you did run other benchmarks and didn't like the results and are fighting to not have to produce them. Really, how hard is it? There's lots of benchmarks you can run, like hackbench, stress-ng, dbench. Why is this so difficult for you? -- Steve
On Thu, Feb 15, 2024 at 08:12:39PM -0500, Steven Rostedt wrote: > On Thu, 15 Feb 2024 19:50:24 -0500 > Kent Overstreet <kent.overstreet@linux.dev> wrote: > > > > All nice, but where are the benchmarks? This looks like it will have an > > > affect on cache and you can talk all you want about how it will not be an > > > issue, but without real world benchmarks, it's meaningless. Numbers talk. > > > > Steve, you're being demanding. We provided sufficient benchmarks to show > > the overhead is low enough for production, and then I gave you a > > detailed breakdown of where our overhead is and where it'll show up. I > > think that's reasonable. > > It's not unreasonable or demanding to ask for benchmarks. You showed only > micro-benchmarks that do not show how cache misses may affect the system. > Honestly, it sounds like you did run other benchmarks and didn't like the > results and are fighting to not have to produce them. Really, how hard is > it? There's lots of benchmarks you can run, like hackbench, stress-ng, > dbench. Why is this so difficult for you? Woah, this is verging into paranoid conspiracy territory. No, we haven't done other benchmarks, and if we had we'd be sharing them. And if I had more time to spend on performance of this patchset that's not where I'd be spending it; the next thing I'd be looking at would be assembly output of the hooking code and seeing if I could shave that down. But I already put a ton of work into shaving cycles on this patchset, I'm happy with the results, and I have other responsibilities and other things I need to be working on.
On Thu, Feb 15, 2024 at 5:18 PM Kent Overstreet <kent.overstreet@linux.dev> wrote: > > On Thu, Feb 15, 2024 at 08:12:39PM -0500, Steven Rostedt wrote: > > On Thu, 15 Feb 2024 19:50:24 -0500 > > Kent Overstreet <kent.overstreet@linux.dev> wrote: > > > > > > All nice, but where are the benchmarks? This looks like it will have an > > > > affect on cache and you can talk all you want about how it will not be an > > > > issue, but without real world benchmarks, it's meaningless. Numbers talk. > > > > > > Steve, you're being demanding. We provided sufficient benchmarks to show > > > the overhead is low enough for production, and then I gave you a > > > detailed breakdown of where our overhead is and where it'll show up. I > > > think that's reasonable. > > > > It's not unreasonable or demanding to ask for benchmarks. You showed only > > micro-benchmarks that do not show how cache misses may affect the system. > > Honestly, it sounds like you did run other benchmarks and didn't like the > > results and are fighting to not have to produce them. Really, how hard is > > it? There's lots of benchmarks you can run, like hackbench, stress-ng, > > dbench. Why is this so difficult for you? I'll run these benchmarks and will include the numbers in the next cover letter. > > Woah, this is verging into paranoid conspiracy territory. > > No, we haven't done other benchmarks, and if we had we'd be sharing > them. And if I had more time to spend on performance of this patchset > that's not where I'd be spending it; the next thing I'd be looking at > would be assembly output of the hooking code and seeing if I could shave > that down. > > But I already put a ton of work into shaving cycles on this patchset, > I'm happy with the results, and I have other responsibilities and other > things I need to be working on.
On Thu, Feb 15, 2024 at 3:56 PM Kent Overstreet <kent.overstreet@linux.dev> wrote: > > On Thu, Feb 15, 2024 at 06:27:29PM -0500, Steven Rostedt wrote: > > All this, and we are still worried about 4k for useful debugging :-/ I was planning to refactor this function to print one record at a time with a smaller buffer but after discussing with Kent, he has plans to reuse this function and having the report in one buffer is needed for that. > Every additional 4k still needs justification. And whether we burn a > reserve on this will have no observable effect on user output in > remotely normal situations; if this allocation ever fails, we've already > been in an OOM situation for awhile and we've already printed out this > report many times, with less memory pressure where the allocation would > have succeeded. I'm not sure this claim will always be true, specifically in the case of low-end devices with relatively low amounts of reserves and in the presence of a possible quick memory usage spike. We should also consider a case when panic_on_oom is set. All we get is one OOM report, so we get only one chance to capture this report. In any case, I don't yet have data to prove or disprove this claim but it will be interesting to test it with data from the field once the feature is deployed. For now I think with Vlastimil's __GFP_NOWARN suggestion the code becomes safe and the only risk is to lose this report. If we get cases with reports missing this data, we can easily change to reserved memory.
On Mon 19-02-24 09:17:36, Suren Baghdasaryan wrote: [...] > For now I think with Vlastimil's __GFP_NOWARN suggestion the code > becomes safe and the only risk is to lose this report. If we get cases > with reports missing this data, we can easily change to reserved > memory. This is not just about missing part of the oom report. This is annoying but not earth shattering. Eating into very small reserves (that might be the only usable memory while the system is struggling in OOM situation) could cause functional problems that would be non trivial to test for. All that for debugging purposes is just lame. If you want to reuse the code for a different purpose then abstract it and allocate the buffer when you can afford that and use preallocated on when in OOM situation. We have always went extra mile to avoid potentially disruptive operations from the oom handling code and I do not see any good reason to diverge from that principle.
On Tue, Feb 20, 2024 at 05:23:29PM +0100, Michal Hocko wrote: > On Mon 19-02-24 09:17:36, Suren Baghdasaryan wrote: > [...] > > For now I think with Vlastimil's __GFP_NOWARN suggestion the code > > becomes safe and the only risk is to lose this report. If we get cases > > with reports missing this data, we can easily change to reserved > > memory. > > This is not just about missing part of the oom report. This is annoying > but not earth shattering. Eating into very small reserves (that might be > the only usable memory while the system is struggling in OOM situation) > could cause functional problems that would be non trivial to test for. > All that for debugging purposes is just lame. If you want to reuse the code > for a different purpose then abstract it and allocate the buffer when you > can afford that and use preallocated on when in OOM situation. > > We have always went extra mile to avoid potentially disruptive > operations from the oom handling code and I do not see any good reason > to diverge from that principle. Michal, I gave you the logic between dedicated reserves and system reserves. Please stop repeating these vague what-ifs.
On Tue 20-02-24 12:18:49, Kent Overstreet wrote: > On Tue, Feb 20, 2024 at 05:23:29PM +0100, Michal Hocko wrote: > > On Mon 19-02-24 09:17:36, Suren Baghdasaryan wrote: > > [...] > > > For now I think with Vlastimil's __GFP_NOWARN suggestion the code > > > becomes safe and the only risk is to lose this report. If we get cases > > > with reports missing this data, we can easily change to reserved > > > memory. > > > > This is not just about missing part of the oom report. This is annoying > > but not earth shattering. Eating into very small reserves (that might be > > the only usable memory while the system is struggling in OOM situation) > > could cause functional problems that would be non trivial to test for. > > All that for debugging purposes is just lame. If you want to reuse the code > > for a different purpose then abstract it and allocate the buffer when you > > can afford that and use preallocated on when in OOM situation. > > > > We have always went extra mile to avoid potentially disruptive > > operations from the oom handling code and I do not see any good reason > > to diverge from that principle. > > Michal, I gave you the logic between dedicated reserves and system > reserves. Please stop repeating these vague what-ifs. Your argument makes little sense and it seems that it is impossible to explain that to you. I gave up on discussing this further with you. Consider NAK to any additional allocation from oom path unless you can give very _solid_ arguments this is absolutely necessary. "It's gona be fine and work most of the time" is not a solid argument.
On Tue, Feb 20, 2024 at 06:24:41PM +0100, Michal Hocko wrote: > On Tue 20-02-24 12:18:49, Kent Overstreet wrote: > > On Tue, Feb 20, 2024 at 05:23:29PM +0100, Michal Hocko wrote: > > > On Mon 19-02-24 09:17:36, Suren Baghdasaryan wrote: > > > [...] > > > > For now I think with Vlastimil's __GFP_NOWARN suggestion the code > > > > becomes safe and the only risk is to lose this report. If we get cases > > > > with reports missing this data, we can easily change to reserved > > > > memory. > > > > > > This is not just about missing part of the oom report. This is annoying > > > but not earth shattering. Eating into very small reserves (that might be > > > the only usable memory while the system is struggling in OOM situation) > > > could cause functional problems that would be non trivial to test for. > > > All that for debugging purposes is just lame. If you want to reuse the code > > > for a different purpose then abstract it and allocate the buffer when you > > > can afford that and use preallocated on when in OOM situation. > > > > > > We have always went extra mile to avoid potentially disruptive > > > operations from the oom handling code and I do not see any good reason > > > to diverge from that principle. > > > > Michal, I gave you the logic between dedicated reserves and system > > reserves. Please stop repeating these vague what-ifs. > > Your argument makes little sense and it seems that it is impossible to > explain that to you. I gave up on discussing this further with you. It was your choice to not engage with the technical discussion. And if you're not going to engage, repeating the same arguments that I already responded to 10 or 20 emails later is a pretty dishonest way to argue. You've been doing this kind of grandstanding throughout the entire discussion across every revision of the patchset. Knock it off.
On 2/19/24 18:17, Suren Baghdasaryan wrote: > On Thu, Feb 15, 2024 at 3:56 PM Kent Overstreet > <kent.overstreet@linux.dev> wrote: >> >> On Thu, Feb 15, 2024 at 06:27:29PM -0500, Steven Rostedt wrote: >> > All this, and we are still worried about 4k for useful debugging :-/ > > I was planning to refactor this function to print one record at a time > with a smaller buffer but after discussing with Kent, he has plans to > reuse this function and having the report in one buffer is needed for > that. We are printing to console, AFAICS all the code involved uses plain printk() I think it would be way easier to have a function using printk() for this use case than the seq_buf which is more suitable for /proc and friends. Then all concerns about buffers would be gone. It wouldn't be that much of a code duplication? >> Every additional 4k still needs justification. And whether we burn a >> reserve on this will have no observable effect on user output in >> remotely normal situations; if this allocation ever fails, we've already >> been in an OOM situation for awhile and we've already printed out this >> report many times, with less memory pressure where the allocation would >> have succeeded. > > I'm not sure this claim will always be true, specifically in the case > of low-end devices with relatively low amounts of reserves and in the That's right, GFP_ATOMIC failures can easily happen without prior OOMs. Consider a system where userspace allocations fill the memory as they usually do, up to high watermark. Then a burst of packets is received and handled by GFP_ATOMIC allocations that deplete the reserves and can't cause OOMs (OOM is when we fail to reclaim anything, but we are allocating from a context that can't reclaim), so the very first report would be an GFP_ATOMIC failure and now it can't allocate that buffer for printing. I'm sure more such scenarios exist, Cc: Tetsuo who I recall was an expert on this topic. > presence of a possible quick memory usage spike. We should also > consider a case when panic_on_oom is set. All we get is one OOM > report, so we get only one chance to capture this report. In any case, > I don't yet have data to prove or disprove this claim but it will be > interesting to test it with data from the field once the feature is > deployed. > > For now I think with Vlastimil's __GFP_NOWARN suggestion the code > becomes safe and the only risk is to lose this report. If we get cases > with reports missing this data, we can easily change to reserved > memory.
On Tue, Feb 20, 2024 at 10:27 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > On 2/19/24 18:17, Suren Baghdasaryan wrote: > > On Thu, Feb 15, 2024 at 3:56 PM Kent Overstreet > > <kent.overstreet@linux.dev> wrote: > >> > >> On Thu, Feb 15, 2024 at 06:27:29PM -0500, Steven Rostedt wrote: > >> > All this, and we are still worried about 4k for useful debugging :-/ > > > > I was planning to refactor this function to print one record at a time > > with a smaller buffer but after discussing with Kent, he has plans to > > reuse this function and having the report in one buffer is needed for > > that. > > We are printing to console, AFAICS all the code involved uses plain printk() > I think it would be way easier to have a function using printk() for this > use case than the seq_buf which is more suitable for /proc and friends. Then > all concerns about buffers would be gone. It wouldn't be that much of a code > duplication? Ok, after discussing this with Kent, I'll change this patch to provide a function returning N top consumers (the array and N will be provided by the caller) and then we can print one record at a time with much less memory needed. That should address reusability concerns, will use memory more efficiently and will allow for more flexibility (more/less than 10 records if needed). Thanks for the feedback, everyone! > > >> Every additional 4k still needs justification. And whether we burn a > >> reserve on this will have no observable effect on user output in > >> remotely normal situations; if this allocation ever fails, we've already > >> been in an OOM situation for awhile and we've already printed out this > >> report many times, with less memory pressure where the allocation would > >> have succeeded. > > > > I'm not sure this claim will always be true, specifically in the case > > of low-end devices with relatively low amounts of reserves and in the > > That's right, GFP_ATOMIC failures can easily happen without prior OOMs. > Consider a system where userspace allocations fill the memory as they > usually do, up to high watermark. Then a burst of packets is received and > handled by GFP_ATOMIC allocations that deplete the reserves and can't cause > OOMs (OOM is when we fail to reclaim anything, but we are allocating from a > context that can't reclaim), so the very first report would be an GFP_ATOMIC > failure and now it can't allocate that buffer for printing. > > I'm sure more such scenarios exist, Cc: Tetsuo who I recall was an expert on > this topic. > > > presence of a possible quick memory usage spike. We should also > > consider a case when panic_on_oom is set. All we get is one OOM > > report, so we get only one chance to capture this report. In any case, > > I don't yet have data to prove or disprove this claim but it will be > > interesting to test it with data from the field once the feature is > > deployed. > > > > For now I think with Vlastimil's __GFP_NOWARN suggestion the code > > becomes safe and the only risk is to lose this report. If we get cases > > with reports missing this data, we can easily change to reserved > > memory. >
On 2024/02/21 3:27, Vlastimil Babka wrote: > I'm sure more such scenarios exist, Cc: Tetsuo who I recall was an expert on > this topic. "[PATCH v3 10/35] lib: code tagging framework" says that codetag_lock_module_list() calls down_read() (i.e. sleeping operation), and "[PATCH v3 31/35] lib: add memory allocations report in show_mem()" says that __show_mem() calls alloc_tags_show_mem_report() after kmalloc(GFP_ATOMIC) (i.e. non-sleeping operation) but alloc_tags_show_mem_report() calls down_read() via codetag_lock_module_list() !? If __show_mem() might be called from atomic context (e.g. kmalloc(GFP_ATOMIC)), this will be a sleep in atomic bug. If __show_mem() might be called while semaphore is held for write, this will be a read-lock after write-lock deadlock bug. Not the matter of whether to allocate buffer statically or dynamically. Please don't hold a lock when trying to report memory usage.
On Wed, Feb 21, 2024 at 5:22 AM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote: > > On 2024/02/21 3:27, Vlastimil Babka wrote: > > I'm sure more such scenarios exist, Cc: Tetsuo who I recall was an expert on > > this topic. > > "[PATCH v3 10/35] lib: code tagging framework" says that codetag_lock_module_list() > calls down_read() (i.e. sleeping operation), and > "[PATCH v3 31/35] lib: add memory allocations report in show_mem()" says that > __show_mem() calls alloc_tags_show_mem_report() after kmalloc(GFP_ATOMIC) (i.e. > non-sleeping operation) but alloc_tags_show_mem_report() calls down_read() via > codetag_lock_module_list() !? > > If __show_mem() might be called from atomic context (e.g. kmalloc(GFP_ATOMIC)), > this will be a sleep in atomic bug. > If __show_mem() might be called while semaphore is held for write, > this will be a read-lock after write-lock deadlock bug. > > Not the matter of whether to allocate buffer statically or dynamically. > Please don't hold a lock when trying to report memory usage. Thanks for catching this, Tetsuo! Yes, we take the read-lock here to ensure that the list of modules is stable. I'm thinking I can replace the down_read() with down_read_trylock() and if we fail (there is a race with module load/unload) we will skip generating this report. The probability of racing with module load/unload while in OOM state I think is quite low, so skipping this report should not cause much information loss. > >
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 3fe51e67e231..0a5973c4ad77 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -30,6 +30,8 @@ struct alloc_tag { #ifdef CONFIG_MEM_ALLOC_PROFILING +void alloc_tags_show_mem_report(struct seq_buf *s); + static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) { return container_of(ct, struct alloc_tag, ct); diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 2d5226d9262d..54312c213860 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -96,6 +96,44 @@ static const struct seq_operations allocinfo_seq_op = { .show = allocinfo_show, }; +void alloc_tags_show_mem_report(struct seq_buf *s) +{ + struct codetag_iterator iter; + struct codetag *ct; + struct { + struct codetag *tag; + size_t bytes; + } tags[10], n; + unsigned int i, nr = 0; + + codetag_lock_module_list(alloc_tag_cttype, true); + iter = codetag_get_ct_iter(alloc_tag_cttype); + while ((ct = codetag_next_ct(&iter))) { + struct alloc_tag_counters counter = alloc_tag_read(ct_to_alloc_tag(ct)); + + n.tag = ct; + n.bytes = counter.bytes; + + for (i = 0; i < nr; i++) + if (n.bytes > tags[i].bytes) + break; + + if (i < ARRAY_SIZE(tags)) { + nr -= nr == ARRAY_SIZE(tags); + memmove(&tags[i + 1], + &tags[i], + sizeof(tags[0]) * (nr - i)); + nr++; + tags[i] = n; + } + } + + for (i = 0; i < nr; i++) + alloc_tag_to_text(s, tags[i].tag); + + codetag_lock_module_list(alloc_tag_cttype, false); +} + static void __init procfs_init(void) { proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); diff --git a/mm/show_mem.c b/mm/show_mem.c index 8dcfafbd283c..d514c15ca076 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -12,6 +12,7 @@ #include <linux/hugetlb.h> #include <linux/mm.h> #include <linux/mmzone.h> +#include <linux/seq_buf.h> #include <linux/swap.h> #include <linux/vmstat.h> @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + { + struct seq_buf s; + char *buf = kmalloc(4096, GFP_ATOMIC); + + if (buf) { + printk("Memory allocations:\n"); + seq_buf_init(&s, buf, 4096); + alloc_tags_show_mem_report(&s); + printk("%s", buf); + kfree(buf); + } + } +#endif }