Message ID | 20200323080503.6224-2-jaewon31.kim@samsung.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | meminfo_extra: introduce meminfo extra | expand |
On Mon, Mar 23, 2020 at 05:05:01PM +0900, Jaewon Kim wrote: > Provide APIs to drivers so that they can show its memory usage on > /proc/meminfo_extra. > > int register_meminfo_extra(atomic_long_t *val, int shift, > const char *name); > int unregister_meminfo_extra(atomic_long_t *val); Nit, isn't it nicer to have the subsystem name first: meminfo_extra_register() meminfo_extra_unregister() ? > > Signed-off-by: Jaewon Kim <jaewon31.kim@samsung.com> > --- > v2: move to /proc/meminfo_extra as a new file, meminfo_extra.c > use rcu to reduce lock overhead > v1: print info at /proc/meminfo > --- > fs/proc/Makefile | 1 + > fs/proc/meminfo_extra.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/mm.h | 4 ++ > mm/page_alloc.c | 1 + > 4 files changed, 129 insertions(+) > create mode 100644 fs/proc/meminfo_extra.c > > diff --git a/fs/proc/Makefile b/fs/proc/Makefile > index bd08616ed8ba..83d2f55591c6 100644 > --- a/fs/proc/Makefile > +++ b/fs/proc/Makefile > @@ -19,6 +19,7 @@ proc-y += devices.o > proc-y += interrupts.o > proc-y += loadavg.o > proc-y += meminfo.o > +proc-y += meminfo_extra.o > proc-y += stat.o > proc-y += uptime.o > proc-y += util.o > diff --git a/fs/proc/meminfo_extra.c b/fs/proc/meminfo_extra.c > new file mode 100644 > index 000000000000..bd3f0d2b7fb7 > --- /dev/null > +++ b/fs/proc/meminfo_extra.c > @@ -0,0 +1,123 @@ > +// SPDX-License-Identifier: GPL-2.0 > +#include <linux/mm.h> > +#include <linux/proc_fs.h> > +#include <linux/seq_file.h> > +#include <linux/slab.h> > + > +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) > +{ > + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); > + seq_write(m, " kB\n", 4); > +} > + > +static LIST_HEAD(meminfo_head); > +static DEFINE_SPINLOCK(meminfo_lock); > + > +#define NAME_SIZE 15 > +#define NAME_BUF_SIZE (NAME_SIZE + 2) /* ':' and '\0' */ > + > +struct meminfo_extra { > + struct list_head list; > + atomic_long_t *val; > + int shift_for_page; > + char name[NAME_BUF_SIZE]; > + char name_pad[NAME_BUF_SIZE]; > +}; > + > +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) > +{ > + struct meminfo_extra *meminfo, *memtemp; > + int len; > + int error = 0; > + > + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); > + if (!meminfo) { > + error = -ENOMEM; > + goto out; > + } > + > + meminfo->val = val; > + meminfo->shift_for_page = shift; > + strncpy(meminfo->name, name, NAME_SIZE); > + len = strlen(meminfo->name); > + meminfo->name[len] = ':'; > + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); > + while (++len < NAME_BUF_SIZE - 1) > + meminfo->name_pad[len] = ' '; > + > + spin_lock(&meminfo_lock); > + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > + if (memtemp->val == val) { > + error = -EINVAL; > + break; > + } > + } > + if (!error) > + list_add_tail_rcu(&meminfo->list, &meminfo_head); > + spin_unlock(&meminfo_lock); If you have a lock, why are you needing rcu? > + if (error) > + kfree(meminfo); > +out: > + > + return error; > +} > +EXPORT_SYMBOL(register_meminfo_extra); EXPORT_SYMBOL_GPL()? I have to ask :) thanks, greg k-h
Hi Jaewon, On 03/23/20 at 05:05pm, Jaewon Kim wrote: > Provide APIs to drivers so that they can show its memory usage on > /proc/meminfo_extra. > > int register_meminfo_extra(atomic_long_t *val, int shift, > const char *name); > int unregister_meminfo_extra(atomic_long_t *val); > > Signed-off-by: Jaewon Kim <jaewon31.kim@samsung.com> > --- > v2: move to /proc/meminfo_extra as a new file, meminfo_extra.c > use rcu to reduce lock overhead > v1: print info at /proc/meminfo > --- > fs/proc/Makefile | 1 + > fs/proc/meminfo_extra.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/mm.h | 4 ++ > mm/page_alloc.c | 1 + > 4 files changed, 129 insertions(+) > create mode 100644 fs/proc/meminfo_extra.c > > diff --git a/fs/proc/Makefile b/fs/proc/Makefile > index bd08616ed8ba..83d2f55591c6 100644 > --- a/fs/proc/Makefile > +++ b/fs/proc/Makefile > @@ -19,6 +19,7 @@ proc-y += devices.o > proc-y += interrupts.o > proc-y += loadavg.o > proc-y += meminfo.o > +proc-y += meminfo_extra.o > proc-y += stat.o > proc-y += uptime.o > proc-y += util.o > diff --git a/fs/proc/meminfo_extra.c b/fs/proc/meminfo_extra.c > new file mode 100644 > index 000000000000..bd3f0d2b7fb7 > --- /dev/null > +++ b/fs/proc/meminfo_extra.c > @@ -0,0 +1,123 @@ > +// SPDX-License-Identifier: GPL-2.0 > +#include <linux/mm.h> > +#include <linux/proc_fs.h> > +#include <linux/seq_file.h> > +#include <linux/slab.h> > + > +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) > +{ > + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); > + seq_write(m, " kB\n", 4); > +} > + > +static LIST_HEAD(meminfo_head); > +static DEFINE_SPINLOCK(meminfo_lock); > + > +#define NAME_SIZE 15 > +#define NAME_BUF_SIZE (NAME_SIZE + 2) /* ':' and '\0' */ > + > +struct meminfo_extra { > + struct list_head list; > + atomic_long_t *val; > + int shift_for_page; Can this be simplified to use a bytes value without an extra shift? > + char name[NAME_BUF_SIZE]; > + char name_pad[NAME_BUF_SIZE]; > +}; > + There should be document about below function here. > +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) > +{ > + struct meminfo_extra *meminfo, *memtemp; > + int len; > + int error = 0; > + > + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); > + if (!meminfo) { > + error = -ENOMEM; > + goto out; > + } > + > + meminfo->val = val; > + meminfo->shift_for_page = shift; > + strncpy(meminfo->name, name, NAME_SIZE); > + len = strlen(meminfo->name); > + meminfo->name[len] = ':'; > + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); > + while (++len < NAME_BUF_SIZE - 1) > + meminfo->name_pad[len] = ' '; > + > + spin_lock(&meminfo_lock); > + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > + if (memtemp->val == val) { > + error = -EINVAL; > + break; > + } > + } > + if (!error) > + list_add_tail_rcu(&meminfo->list, &meminfo_head); > + spin_unlock(&meminfo_lock); > + if (error) > + kfree(meminfo); > +out: > + > + return error; > +} > +EXPORT_SYMBOL(register_meminfo_extra); > + > +int unregister_meminfo_extra(atomic_long_t *val) > +{ > + struct meminfo_extra *memtemp; > + int error = -EINVAL; > + > + spin_lock(&meminfo_lock); > + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > + if (memtemp->val == val) { > + list_del_rcu(&memtemp->list); > + error = 0; > + break; > + } > + } > + spin_unlock(&meminfo_lock); > + if (!error) { > + synchronize_rcu(); > + kfree(memtemp); > + } > + > + return error; > +} > +EXPORT_SYMBOL(unregister_meminfo_extra); > + > +static void __meminfo_extra(struct seq_file *m) > +{ > + struct meminfo_extra *memtemp; > + unsigned long nr_page; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > + nr_page = (unsigned long)atomic_long_read(memtemp->val); > + nr_page = nr_page >> memtemp->shift_for_page; > + if (m) > + show_val_kb(m, memtemp->name_pad, nr_page); > + else > + pr_cont("%s%lukB ", memtemp->name, nr_page); nr_page != nr_kb? > + } > + rcu_read_unlock(); > +} > + > +void show_meminfo_extra(void) > +{ > + __meminfo_extra(NULL); > +} > + > +static int meminfo_extra_proc_show(struct seq_file *m, void *v) > +{ > + __meminfo_extra(m); > + > + return 0; > +} > + > +static int __init proc_meminfo_extra_init(void) > +{ > + proc_create_single("meminfo_extra", 0, NULL, meminfo_extra_proc_show); > + return 0; > +} > +fs_initcall(proc_meminfo_extra_init); > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 52269e56c514..55317161ab57 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2898,6 +2898,10 @@ void __init setup_nr_node_ids(void); > static inline void setup_nr_node_ids(void) {} > #endif > > +void show_meminfo_extra(void); > +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name); > +int unregister_meminfo_extra(atomic_long_t *val); > + > extern int memcmp_pages(struct page *page1, struct page *page2); > > static inline int pages_identical(struct page *page1, struct page *page2) > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 3c4eb750a199..db1be9a39783 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -5229,6 +5229,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) > struct zone *zone; > pg_data_t *pgdat; > > + show_meminfo_extra(); > for_each_populated_zone(zone) { > if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) > continue; > -- > 2.13.7 > > > _______________________________________________ > kexec mailing list > kexec@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/kexec > Thanks Dave
On 2020년 03월 23일 18:53, Greg KH wrote: > On Mon, Mar 23, 2020 at 05:05:01PM +0900, Jaewon Kim wrote: >> Provide APIs to drivers so that they can show its memory usage on >> /proc/meminfo_extra. >> >> int register_meminfo_extra(atomic_long_t *val, int shift, >> const char *name); >> int unregister_meminfo_extra(atomic_long_t *val); > Nit, isn't it nicer to have the subsystem name first: > meminfo_extra_register() > meminfo_extra_unregister() > ? OK. Name can be changed. > > >> Signed-off-by: Jaewon Kim <jaewon31.kim@samsung.com> >> --- >> v2: move to /proc/meminfo_extra as a new file, meminfo_extra.c >> use rcu to reduce lock overhead >> v1: print info at /proc/meminfo >> --- >> fs/proc/Makefile | 1 + >> fs/proc/meminfo_extra.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++ >> include/linux/mm.h | 4 ++ >> mm/page_alloc.c | 1 + >> 4 files changed, 129 insertions(+) >> create mode 100644 fs/proc/meminfo_extra.c >> >> diff --git a/fs/proc/Makefile b/fs/proc/Makefile >> index bd08616ed8ba..83d2f55591c6 100644 >> --- a/fs/proc/Makefile >> +++ b/fs/proc/Makefile >> @@ -19,6 +19,7 @@ proc-y += devices.o >> proc-y += interrupts.o >> proc-y += loadavg.o >> proc-y += meminfo.o >> +proc-y += meminfo_extra.o >> proc-y += stat.o >> proc-y += uptime.o >> proc-y += util.o >> diff --git a/fs/proc/meminfo_extra.c b/fs/proc/meminfo_extra.c >> new file mode 100644 >> index 000000000000..bd3f0d2b7fb7 >> --- /dev/null >> +++ b/fs/proc/meminfo_extra.c >> @@ -0,0 +1,123 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> +#include <linux/mm.h> >> +#include <linux/proc_fs.h> >> +#include <linux/seq_file.h> >> +#include <linux/slab.h> >> + >> +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) >> +{ >> + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); >> + seq_write(m, " kB\n", 4); >> +} >> + >> +static LIST_HEAD(meminfo_head); >> +static DEFINE_SPINLOCK(meminfo_lock); >> + >> +#define NAME_SIZE 15 >> +#define NAME_BUF_SIZE (NAME_SIZE + 2) /* ':' and '\0' */ >> + >> +struct meminfo_extra { >> + struct list_head list; >> + atomic_long_t *val; >> + int shift_for_page; >> + char name[NAME_BUF_SIZE]; >> + char name_pad[NAME_BUF_SIZE]; >> +}; >> + >> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) >> +{ >> + struct meminfo_extra *meminfo, *memtemp; >> + int len; >> + int error = 0; >> + >> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); >> + if (!meminfo) { >> + error = -ENOMEM; >> + goto out; >> + } >> + >> + meminfo->val = val; >> + meminfo->shift_for_page = shift; >> + strncpy(meminfo->name, name, NAME_SIZE); >> + len = strlen(meminfo->name); >> + meminfo->name[len] = ':'; >> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); >> + while (++len < NAME_BUF_SIZE - 1) >> + meminfo->name_pad[len] = ' '; >> + >> + spin_lock(&meminfo_lock); >> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { >> + if (memtemp->val == val) { >> + error = -EINVAL; >> + break; >> + } >> + } >> + if (!error) >> + list_add_tail_rcu(&meminfo->list, &meminfo_head); >> + spin_unlock(&meminfo_lock); > If you have a lock, why are you needing rcu? I think _rcu should be removed out of list_for_each_entry_rcu. But I'm confused about what you meant. I used rcu_read_lock on __meminfo_extra, and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. > > > >> + if (error) >> + kfree(meminfo); >> +out: >> + >> + return error; >> +} >> +EXPORT_SYMBOL(register_meminfo_extra); > EXPORT_SYMBOL_GPL()? I have to ask :) I can use EXPORT_SYMBOL_GPL. > > thanks, > > greg k-h > > Hello Thank you for your comment. By the way there was not resolved discussion on v1 patch as I mentioned on cover page. I'd like to hear your opinion on this /proc/meminfo_extra node. Do you think this is meaningful or cannot co-exist with other future sysfs based API.
On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote: > On 2020년 03월 23일 18:53, Greg KH wrote: > >> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) > >> +{ > >> + struct meminfo_extra *meminfo, *memtemp; > >> + int len; > >> + int error = 0; > >> + > >> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); > >> + if (!meminfo) { > >> + error = -ENOMEM; > >> + goto out; > >> + } > >> + > >> + meminfo->val = val; > >> + meminfo->shift_for_page = shift; > >> + strncpy(meminfo->name, name, NAME_SIZE); > >> + len = strlen(meminfo->name); > >> + meminfo->name[len] = ':'; > >> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); > >> + while (++len < NAME_BUF_SIZE - 1) > >> + meminfo->name_pad[len] = ' '; > >> + > >> + spin_lock(&meminfo_lock); > >> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > >> + if (memtemp->val == val) { > >> + error = -EINVAL; > >> + break; > >> + } > >> + } > >> + if (!error) > >> + list_add_tail_rcu(&meminfo->list, &meminfo_head); > >> + spin_unlock(&meminfo_lock); > > If you have a lock, why are you needing rcu? > I think _rcu should be removed out of list_for_each_entry_rcu. > But I'm confused about what you meant. > I used rcu_read_lock on __meminfo_extra, > and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. If that's the case, then that's fine, it just didn't seem like that was needed. Or I might have been reading your rcu logic incorrectly... > >> + if (error) > >> + kfree(meminfo); > >> +out: > >> + > >> + return error; > >> +} > >> +EXPORT_SYMBOL(register_meminfo_extra); > > EXPORT_SYMBOL_GPL()? I have to ask :) > I can use EXPORT_SYMBOL_GPL. > > > > thanks, > > > > greg k-h > > > > > > Hello > Thank you for your comment. > > By the way there was not resolved discussion on v1 patch as I mentioned on cover page. > I'd like to hear your opinion on this /proc/meminfo_extra node. I think it is the propagation of an old and obsolete interface that you will have to support for the next 20+ years and yet not actually be useful :) > Do you think this is meaningful or cannot co-exist with other future > sysfs based API. What sysfs-based API? I still don't know _why_ you want this. The ION stuff is not needed as that code is about to be deleted, so who else wants this? What is the use-case for it that is so desperately needed that parsing yet-another-proc file is going to solve the problem? thanks, greg k-h
On 2020년 03월 24일 19:11, Greg KH wrote: > On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote: >> On 2020년 03월 23일 18:53, Greg KH wrote: >>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) >>>> +{ >>>> + struct meminfo_extra *meminfo, *memtemp; >>>> + int len; >>>> + int error = 0; >>>> + >>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); >>>> + if (!meminfo) { >>>> + error = -ENOMEM; >>>> + goto out; >>>> + } >>>> + >>>> + meminfo->val = val; >>>> + meminfo->shift_for_page = shift; >>>> + strncpy(meminfo->name, name, NAME_SIZE); >>>> + len = strlen(meminfo->name); >>>> + meminfo->name[len] = ':'; >>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); >>>> + while (++len < NAME_BUF_SIZE - 1) >>>> + meminfo->name_pad[len] = ' '; >>>> + >>>> + spin_lock(&meminfo_lock); >>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { >>>> + if (memtemp->val == val) { >>>> + error = -EINVAL; >>>> + break; >>>> + } >>>> + } >>>> + if (!error) >>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head); >>>> + spin_unlock(&meminfo_lock); >>> If you have a lock, why are you needing rcu? >> I think _rcu should be removed out of list_for_each_entry_rcu. >> But I'm confused about what you meant. >> I used rcu_read_lock on __meminfo_extra, >> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. > If that's the case, then that's fine, it just didn't seem like that was > needed. Or I might have been reading your rcu logic incorrectly... > >>>> + if (error) >>>> + kfree(meminfo); >>>> +out: >>>> + >>>> + return error; >>>> +} >>>> +EXPORT_SYMBOL(register_meminfo_extra); >>> EXPORT_SYMBOL_GPL()? I have to ask :) >> I can use EXPORT_SYMBOL_GPL. >>> thanks, >>> >>> greg k-h >>> >>> >> Hello >> Thank you for your comment. >> >> By the way there was not resolved discussion on v1 patch as I mentioned on cover page. >> I'd like to hear your opinion on this /proc/meminfo_extra node. > I think it is the propagation of an old and obsolete interface that you > will have to support for the next 20+ years and yet not actually be > useful :) > >> Do you think this is meaningful or cannot co-exist with other future >> sysfs based API. > What sysfs-based API? Please refer to mail thread on v1 patch set - https://lkml.org/lkml/fancy/2020/3/10/2102 especially discussion with Leon Romanovsky on https://lkml.org/lkml/fancy/2020/3/16/140 > > I still don't know _why_ you want this. The ION stuff is not needed as > that code is about to be deleted, so who else wants this? What is the > use-case for it that is so desperately needed that parsing > yet-another-proc file is going to solve the problem? In my Android device, there are graphic driver memory, zsmalloc memory except ION. I don't know other cases in other platform. Not desperately needed but I think we need one userspace knob to see overall hidden huge memory. Additionally I'd like to see all those hidden memory in OutOfMemory log. This is useful to get clue to find memory hogger. i.e.) show_mem on oom <6>[ 420.856428] Mem-Info: <6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB <6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0 Thank you Jaewon Kim > > thanks, > > greg k-h > >
On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote: > > > On 2020년 03월 24일 19:11, Greg KH wrote: > > On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote: > >> On 2020년 03월 23일 18:53, Greg KH wrote: > >>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) > >>>> +{ > >>>> + struct meminfo_extra *meminfo, *memtemp; > >>>> + int len; > >>>> + int error = 0; > >>>> + > >>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); > >>>> + if (!meminfo) { > >>>> + error = -ENOMEM; > >>>> + goto out; > >>>> + } > >>>> + > >>>> + meminfo->val = val; > >>>> + meminfo->shift_for_page = shift; > >>>> + strncpy(meminfo->name, name, NAME_SIZE); > >>>> + len = strlen(meminfo->name); > >>>> + meminfo->name[len] = ':'; > >>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); > >>>> + while (++len < NAME_BUF_SIZE - 1) > >>>> + meminfo->name_pad[len] = ' '; > >>>> + > >>>> + spin_lock(&meminfo_lock); > >>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > >>>> + if (memtemp->val == val) { > >>>> + error = -EINVAL; > >>>> + break; > >>>> + } > >>>> + } > >>>> + if (!error) > >>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head); > >>>> + spin_unlock(&meminfo_lock); > >>> If you have a lock, why are you needing rcu? > >> I think _rcu should be removed out of list_for_each_entry_rcu. > >> But I'm confused about what you meant. > >> I used rcu_read_lock on __meminfo_extra, > >> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. > > If that's the case, then that's fine, it just didn't seem like that was > > needed. Or I might have been reading your rcu logic incorrectly... > > > >>>> + if (error) > >>>> + kfree(meminfo); > >>>> +out: > >>>> + > >>>> + return error; > >>>> +} > >>>> +EXPORT_SYMBOL(register_meminfo_extra); > >>> EXPORT_SYMBOL_GPL()? I have to ask :) > >> I can use EXPORT_SYMBOL_GPL. > >>> thanks, > >>> > >>> greg k-h > >>> > >>> > >> Hello > >> Thank you for your comment. > >> > >> By the way there was not resolved discussion on v1 patch as I mentioned on cover page. > >> I'd like to hear your opinion on this /proc/meminfo_extra node. > > I think it is the propagation of an old and obsolete interface that you > > will have to support for the next 20+ years and yet not actually be > > useful :) > > > >> Do you think this is meaningful or cannot co-exist with other future > >> sysfs based API. > > What sysfs-based API? > Please refer to mail thread on v1 patch set - https://lkml.org/lkml/fancy/2020/3/10/2102 > especially discussion with Leon Romanovsky on https://lkml.org/lkml/fancy/2020/3/16/140 I really do not understand what you are referring to here, sorry. I do not see any sysfs-based code in that thread. And try to use lore.kernel.org, lkml.org doesn't always work and we have no control over that :( > > I still don't know _why_ you want this. The ION stuff is not needed as > > that code is about to be deleted, so who else wants this? What is the > > use-case for it that is so desperately needed that parsing > > yet-another-proc file is going to solve the problem? > In my Android device, there are graphic driver memory, zsmalloc memory except ION. Ok, so what does Android have to do with this? > I don't know other cases in other platform. > Not desperately needed but I think we need one userspace knob to see overall hidden huge memory. Why? Who wants that? What would userspace do with that? And what exactly do you want to show? Is this just a debugging thing? Then use debugfs for that, not proc. Isn't that what the DRM developers are starting to do? > Additionally I'd like to see all those hidden memory in OutOfMemory log. How is anything hidden, can't you see it in the slab information? > This is useful to get clue to find memory hogger. > i.e.) show_mem on oom > <6>[ 420.856428] Mem-Info: > <6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB > <6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0 So what does this show you? That someone is takign a ton of ION memory for some unknown use? What can you do with that? What would you do with that? And memory is almost never assigned to a "driver", it is assigned to a "device" that uses it. Drivers can handle multiple devices at the same time, so why would you break this down by drivers? Are you assuming that a driver only talks to one piece of hardware? I think you need a much better use case for all of this other than "wouldn't it be nice to see some numbers", as that isn't going to help anyone out in the end. thanks, greg k-h
On 2020년 03월 24일 20:46, Greg KH wrote: > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote: >> >> On 2020년 03월 24일 19:11, Greg KH wrote: >>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote: >>>> On 2020년 03월 23일 18:53, Greg KH wrote: >>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) >>>>>> +{ >>>>>> + struct meminfo_extra *meminfo, *memtemp; >>>>>> + int len; >>>>>> + int error = 0; >>>>>> + >>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); >>>>>> + if (!meminfo) { >>>>>> + error = -ENOMEM; >>>>>> + goto out; >>>>>> + } >>>>>> + >>>>>> + meminfo->val = val; >>>>>> + meminfo->shift_for_page = shift; >>>>>> + strncpy(meminfo->name, name, NAME_SIZE); >>>>>> + len = strlen(meminfo->name); >>>>>> + meminfo->name[len] = ':'; >>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); >>>>>> + while (++len < NAME_BUF_SIZE - 1) >>>>>> + meminfo->name_pad[len] = ' '; >>>>>> + >>>>>> + spin_lock(&meminfo_lock); >>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { >>>>>> + if (memtemp->val == val) { >>>>>> + error = -EINVAL; >>>>>> + break; >>>>>> + } >>>>>> + } >>>>>> + if (!error) >>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head); >>>>>> + spin_unlock(&meminfo_lock); >>>>> If you have a lock, why are you needing rcu? >>>> I think _rcu should be removed out of list_for_each_entry_rcu. >>>> But I'm confused about what you meant. >>>> I used rcu_read_lock on __meminfo_extra, >>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. >>> If that's the case, then that's fine, it just didn't seem like that was >>> needed. Or I might have been reading your rcu logic incorrectly... >>> >>>>>> + if (error) >>>>>> + kfree(meminfo); >>>>>> +out: >>>>>> + >>>>>> + return error; >>>>>> +} >>>>>> +EXPORT_SYMBOL(register_meminfo_extra); >>>>> EXPORT_SYMBOL_GPL()? I have to ask :) >>>> I can use EXPORT_SYMBOL_GPL. >>>>> thanks, >>>>> >>>>> greg k-h >>>>> >>>>> >>>> Hello >>>> Thank you for your comment. >>>> >>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page. >>>> I'd like to hear your opinion on this /proc/meminfo_extra node. >>> I think it is the propagation of an old and obsolete interface that you >>> will have to support for the next 20+ years and yet not actually be >>> useful :) >>> >>>> Do you think this is meaningful or cannot co-exist with other future >>>> sysfs based API. >>> What sysfs-based API? >> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102 >> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140 > I really do not understand what you are referring to here, sorry. I do > not see any sysfs-based code in that thread. Sorry. I also did not see actual code. Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff? > > And try to use lore.kernel.org, lkml.org doesn't always work and we have > no control over that :( > >>> I still don't know _why_ you want this. The ION stuff is not needed as >>> that code is about to be deleted, so who else wants this? What is the >>> use-case for it that is so desperately needed that parsing >>> yet-another-proc file is going to solve the problem? >> In my Android device, there are graphic driver memory, zsmalloc memory except ION. > Ok, so what does Android have to do with this? Some driver in Android platform may use my API to show its memory usage. > >> I don't know other cases in other platform. >> Not desperately needed but I think we need one userspace knob to see overall hidden huge memory. > Why? Who wants that? What would userspace do with that? And what > exactly do you want to show? > > Is this just a debugging thing? Then use debugfs for that, not proc. > Isn't that what the DRM developers are starting to do? > >> Additionally I'd like to see all those hidden memory in OutOfMemory log. > How is anything hidden, can't you see it in the slab information? > Let me explain more. 0. slab As I said in cover page, this is not for memory allocated by slab. I'd like to know where huge memory has gone. Those are directly allocated by alloc_pages instead of slab. /proc/slabinfo does not show this information. 1. /proc/meminfo_extra /proc/meminfo_extra could be debugging thing to see memory status at a certain time. But it, I think, is also basic information rather than just for debugging. It is similar with /proc/meminfo which is in procfs instead of debugfs. 2. oom log oom log in show_mem is more than just debugging. As existing oom log shows much memory information, I think we need the hidden memory info. Without these information, we do NOT know oom reason because other traditional stats are not enough. >> This is useful to get clue to find memory hogger. >> i.e.) show_mem on oom >> <6>[ 420.856428] Mem-Info: >> <6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB >> <6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0 > So what does this show you? That someone is takign a ton of ION memory > for some unknown use? What can you do with that? What would you do > with that? We may not know exact memory owner. But we can narrow down. Anyway I think this is meaningful instead of no clue. > > And memory is almost never assigned to a "driver", it is assigned to a > "device" that uses it. Drivers can handle multiple devices at the same > time, so why would you break this down by drivers? Are you assuming > that a driver only talks to one piece of hardware? Yes a driver may support several devices. I don't know if it same on an embedded device. Anyway I think the idea works even for several devices, although the driver should distinguish memory usage for each device and should register each memory stat. > > I think you need a much better use case for all of this other than > "wouldn't it be nice to see some numbers", as that isn't going to help > anyone out in the end. Sorry. As of now, I do not know other better use case, but I still think memory information should cover most of memory usage. Huge memory consumed by driver or other core logic should be shown in OoM. > > thanks, > > greg k-h > >
On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote: > > > On 2020년 03월 24일 20:46, Greg KH wrote: > > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote: > >> > >> On 2020년 03월 24일 19:11, Greg KH wrote: > >>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote: > >>>> On 2020년 03월 23일 18:53, Greg KH wrote: > >>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) > >>>>>> +{ > >>>>>> + struct meminfo_extra *meminfo, *memtemp; > >>>>>> + int len; > >>>>>> + int error = 0; > >>>>>> + > >>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); > >>>>>> + if (!meminfo) { > >>>>>> + error = -ENOMEM; > >>>>>> + goto out; > >>>>>> + } > >>>>>> + > >>>>>> + meminfo->val = val; > >>>>>> + meminfo->shift_for_page = shift; > >>>>>> + strncpy(meminfo->name, name, NAME_SIZE); > >>>>>> + len = strlen(meminfo->name); > >>>>>> + meminfo->name[len] = ':'; > >>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); > >>>>>> + while (++len < NAME_BUF_SIZE - 1) > >>>>>> + meminfo->name_pad[len] = ' '; > >>>>>> + > >>>>>> + spin_lock(&meminfo_lock); > >>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > >>>>>> + if (memtemp->val == val) { > >>>>>> + error = -EINVAL; > >>>>>> + break; > >>>>>> + } > >>>>>> + } > >>>>>> + if (!error) > >>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head); > >>>>>> + spin_unlock(&meminfo_lock); > >>>>> If you have a lock, why are you needing rcu? > >>>> I think _rcu should be removed out of list_for_each_entry_rcu. > >>>> But I'm confused about what you meant. > >>>> I used rcu_read_lock on __meminfo_extra, > >>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. > >>> If that's the case, then that's fine, it just didn't seem like that was > >>> needed. Or I might have been reading your rcu logic incorrectly... > >>> > >>>>>> + if (error) > >>>>>> + kfree(meminfo); > >>>>>> +out: > >>>>>> + > >>>>>> + return error; > >>>>>> +} > >>>>>> +EXPORT_SYMBOL(register_meminfo_extra); > >>>>> EXPORT_SYMBOL_GPL()? I have to ask :) > >>>> I can use EXPORT_SYMBOL_GPL. > >>>>> thanks, > >>>>> > >>>>> greg k-h > >>>>> > >>>>> > >>>> Hello > >>>> Thank you for your comment. > >>>> > >>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page. > >>>> I'd like to hear your opinion on this /proc/meminfo_extra node. > >>> I think it is the propagation of an old and obsolete interface that you > >>> will have to support for the next 20+ years and yet not actually be > >>> useful :) > >>> > >>>> Do you think this is meaningful or cannot co-exist with other future > >>>> sysfs based API. > >>> What sysfs-based API? > >> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102 > >> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140 > > I really do not understand what you are referring to here, sorry. I do > > not see any sysfs-based code in that thread. > Sorry. I also did not see actual code. > Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff? > > > > And try to use lore.kernel.org, lkml.org doesn't always work and we have > > no control over that :( > > > >>> I still don't know _why_ you want this. The ION stuff is not needed as > >>> that code is about to be deleted, so who else wants this? What is the > >>> use-case for it that is so desperately needed that parsing > >>> yet-another-proc file is going to solve the problem? > >> In my Android device, there are graphic driver memory, zsmalloc memory except ION. > > Ok, so what does Android have to do with this? > Some driver in Android platform may use my API to show its memory usage. I do not understand what this means. > >> I don't know other cases in other platform. > >> Not desperately needed but I think we need one userspace knob to see overall hidden huge memory. > > Why? Who wants that? What would userspace do with that? And what > > exactly do you want to show? > > > > Is this just a debugging thing? Then use debugfs for that, not proc. > > Isn't that what the DRM developers are starting to do? > > > >> Additionally I'd like to see all those hidden memory in OutOfMemory log. > > How is anything hidden, can't you see it in the slab information? > > > Let me explain more. > > 0. slab > As I said in cover page, this is not for memory allocated by slab. Great, then have the subsystem that allocates such memory, be the thing that exports the information. Drivers "on their own" do not grab any memory without asking for it from other parts of the kernel. Modify those "other parts", this isn't a driver-specific thing at all. So, what "other parts" are involved here? > I'd like to know where huge memory has gone. > Those are directly allocated by alloc_pages instead of slab. > /proc/slabinfo does not show this information. Why isn't alloc_pages information exported anywhere? Work on that. > 1. /proc/meminfo_extra > /proc/meminfo_extra could be debugging thing to see memory status at a certain time. If it is debugging, then use debugfs. > But it, I think, is also basic information rather than just for debugging. Who would use that information for anything except debugging? > It is similar with /proc/meminfo which is in procfs instead of debugfs. meminfo is older than debugfs and sysfs, can't change that today. > 2. oom log > oom log in show_mem is more than just debugging. Why? Who sees this? > As existing oom log shows much memory information, I think we need the hidden memory info. > Without these information, we do NOT know oom reason because other traditional stats are not enough. Why not? Kernel users of memory shouldn't be triggering OOM events. > >> This is useful to get clue to find memory hogger. > >> i.e.) show_mem on oom > >> <6>[ 420.856428] Mem-Info: > >> <6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB > >> <6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0 > > So what does this show you? That someone is takign a ton of ION memory > > for some unknown use? What can you do with that? What would you do > > with that? > We may not know exact memory owner. But we can narrow down. > Anyway I think this is meaningful instead of no clue. Again, work on the subsystems that actually allocate the memory, not drivers. And if you want to mess with drivers, do it in a device-specific way, not a driver-specific way. > > And memory is almost never assigned to a "driver", it is assigned to a > > "device" that uses it. Drivers can handle multiple devices at the same > > time, so why would you break this down by drivers? Are you assuming > > that a driver only talks to one piece of hardware? > Yes a driver may support several devices. I don't know if it same on an embedded device. Why wouldn't it be? Is this new interface somehow only acceptable for systems with one-device-per-driver? If so, that's not going to work at all. > Anyway I think the idea works even for several devices, although the driver should > distinguish memory usage for each device and should register each memory stat. And how would that happen? thanks, greg k-h
On Tue, Mar 24, 2020 at 12:46:45PM +0100, Greg KH wrote: > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote: > > I don't know other cases in other platform. > > Not desperately needed but I think we need one userspace knob to see overall hidden huge memory. > > Why? Who wants that? What would userspace do with that? And what > exactly do you want to show? > > Is this just a debugging thing? Then use debugfs for that, not proc. Yes, please use debugfs. Android can ban it in production all at once. > Isn't that what the DRM developers are starting to do? > > > Additionally I'd like to see all those hidden memory in OutOfMemory log. > > How is anything hidden, can't you see it in the slab information? There real usecase for something like that is vmware baloon driver. Two VM instances oversteal pages one from another resulting in guest OOM which looks like one giant get_free_page() memory leak from inside VM. I don't know how often this type of issue happens nowadays but countless OOM support tickets were filed and closed over this nonsense.
On 2020년 03월 24일 22:19, Greg KH wrote: > On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote: >> >> On 2020년 03월 24일 20:46, Greg KH wrote: >>> On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote: >>>> On 2020년 03월 24일 19:11, Greg KH wrote: >>>>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote: >>>>>> On 2020년 03월 23일 18:53, Greg KH wrote: >>>>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) >>>>>>>> +{ >>>>>>>> + struct meminfo_extra *meminfo, *memtemp; >>>>>>>> + int len; >>>>>>>> + int error = 0; >>>>>>>> + >>>>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); >>>>>>>> + if (!meminfo) { >>>>>>>> + error = -ENOMEM; >>>>>>>> + goto out; >>>>>>>> + } >>>>>>>> + >>>>>>>> + meminfo->val = val; >>>>>>>> + meminfo->shift_for_page = shift; >>>>>>>> + strncpy(meminfo->name, name, NAME_SIZE); >>>>>>>> + len = strlen(meminfo->name); >>>>>>>> + meminfo->name[len] = ':'; >>>>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); >>>>>>>> + while (++len < NAME_BUF_SIZE - 1) >>>>>>>> + meminfo->name_pad[len] = ' '; >>>>>>>> + >>>>>>>> + spin_lock(&meminfo_lock); >>>>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { >>>>>>>> + if (memtemp->val == val) { >>>>>>>> + error = -EINVAL; >>>>>>>> + break; >>>>>>>> + } >>>>>>>> + } >>>>>>>> + if (!error) >>>>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head); >>>>>>>> + spin_unlock(&meminfo_lock); >>>>>>> If you have a lock, why are you needing rcu? >>>>>> I think _rcu should be removed out of list_for_each_entry_rcu. >>>>>> But I'm confused about what you meant. >>>>>> I used rcu_read_lock on __meminfo_extra, >>>>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. >>>>> If that's the case, then that's fine, it just didn't seem like that was >>>>> needed. Or I might have been reading your rcu logic incorrectly... >>>>> >>>>>>>> + if (error) >>>>>>>> + kfree(meminfo); >>>>>>>> +out: >>>>>>>> + >>>>>>>> + return error; >>>>>>>> +} >>>>>>>> +EXPORT_SYMBOL(register_meminfo_extra); >>>>>>> EXPORT_SYMBOL_GPL()? I have to ask :) >>>>>> I can use EXPORT_SYMBOL_GPL. >>>>>>> thanks, >>>>>>> >>>>>>> greg k-h >>>>>>> >>>>>>> >>>>>> Hello >>>>>> Thank you for your comment. >>>>>> >>>>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page. >>>>>> I'd like to hear your opinion on this /proc/meminfo_extra node. >>>>> I think it is the propagation of an old and obsolete interface that you >>>>> will have to support for the next 20+ years and yet not actually be >>>>> useful :) >>>>> >>>>>> Do you think this is meaningful or cannot co-exist with other future >>>>>> sysfs based API. >>>>> What sysfs-based API? >>>> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102 >>>> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140 >>> I really do not understand what you are referring to here, sorry. I do >>> not see any sysfs-based code in that thread. >> Sorry. I also did not see actual code. >> Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff? >>> And try to use lore.kernel.org, lkml.org doesn't always work and we have >>> no control over that :( >>> >>>>> I still don't know _why_ you want this. The ION stuff is not needed as >>>>> that code is about to be deleted, so who else wants this? What is the >>>>> use-case for it that is so desperately needed that parsing >>>>> yet-another-proc file is going to solve the problem? >>>> In my Android device, there are graphic driver memory, zsmalloc memory except ION. >>> Ok, so what does Android have to do with this? >> Some driver in Android platform may use my API to show its memory usage. > I do not understand what this means. > >>>> I don't know other cases in other platform. >>>> Not desperately needed but I think we need one userspace knob to see overall hidden huge memory. >>> Why? Who wants that? What would userspace do with that? And what >>> exactly do you want to show? >>> >>> Is this just a debugging thing? Then use debugfs for that, not proc. >>> Isn't that what the DRM developers are starting to do? >>> >>>> Additionally I'd like to see all those hidden memory in OutOfMemory log. >>> How is anything hidden, can't you see it in the slab information? >>> >> Let me explain more. >> >> 0. slab >> As I said in cover page, this is not for memory allocated by slab. > Great, then have the subsystem that allocates such memory, be the thing > that exports the information. Drivers "on their own" do not grab any > memory without asking for it from other parts of the kernel. > > Modify those "other parts", this isn't a driver-specific thing at all. > > So, what "other parts" are involved here? > >> I'd like to know where huge memory has gone. >> Those are directly allocated by alloc_pages instead of slab. >> /proc/slabinfo does not show this information. > Why isn't alloc_pages information exported anywhere? Work on that. > >> 1. /proc/meminfo_extra >> /proc/meminfo_extra could be debugging thing to see memory status at a certain time. > If it is debugging, then use debugfs. > >> But it, I think, is also basic information rather than just for debugging. > Who would use that information for anything except debugging? > >> It is similar with /proc/meminfo which is in procfs instead of debugfs. > meminfo is older than debugfs and sysfs, can't change that today. > >> 2. oom log >> oom log in show_mem is more than just debugging. > Why? Who sees this? > >> As existing oom log shows much memory information, I think we need the hidden memory info. >> Without these information, we do NOT know oom reason because other traditional stats are not enough. > Why not? Kernel users of memory shouldn't be triggering OOM events. > > >>>> This is useful to get clue to find memory hogger. >>>> i.e.) show_mem on oom >>>> <6>[ 420.856428] Mem-Info: >>>> <6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB >>>> <6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0 >>> So what does this show you? That someone is takign a ton of ION memory >>> for some unknown use? What can you do with that? What would you do >>> with that? >> We may not know exact memory owner. But we can narrow down. >> Anyway I think this is meaningful instead of no clue. > Again, work on the subsystems that actually allocate the memory, not > drivers. And if you want to mess with drivers, do it in a > device-specific way, not a driver-specific way. > >>> And memory is almost never assigned to a "driver", it is assigned to a >>> "device" that uses it. Drivers can handle multiple devices at the same >>> time, so why would you break this down by drivers? Are you assuming >>> that a driver only talks to one piece of hardware? >> Yes a driver may support several devices. I don't know if it same on an embedded device. > Why wouldn't it be? Is this new interface somehow only acceptable for > systems with one-device-per-driver? If so, that's not going to work at > all. > >> Anyway I think the idea works even for several devices, although the driver should >> distinguish memory usage for each device and should register each memory stat. > And how would that happen? > > thanks, > > greg k-h > > Thank you for you guys' comment Let me consider more
On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote: > > > On 2020년 03월 24일 20:46, Greg KH wrote: > > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote: > >> > >> On 2020년 03월 24일 19:11, Greg KH wrote: > >>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote: > >>>> On 2020년 03월 23일 18:53, Greg KH wrote: > >>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) > >>>>>> +{ > >>>>>> + struct meminfo_extra *meminfo, *memtemp; > >>>>>> + int len; > >>>>>> + int error = 0; > >>>>>> + > >>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); > >>>>>> + if (!meminfo) { > >>>>>> + error = -ENOMEM; > >>>>>> + goto out; > >>>>>> + } > >>>>>> + > >>>>>> + meminfo->val = val; > >>>>>> + meminfo->shift_for_page = shift; > >>>>>> + strncpy(meminfo->name, name, NAME_SIZE); > >>>>>> + len = strlen(meminfo->name); > >>>>>> + meminfo->name[len] = ':'; > >>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); > >>>>>> + while (++len < NAME_BUF_SIZE - 1) > >>>>>> + meminfo->name_pad[len] = ' '; > >>>>>> + > >>>>>> + spin_lock(&meminfo_lock); > >>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > >>>>>> + if (memtemp->val == val) { > >>>>>> + error = -EINVAL; > >>>>>> + break; > >>>>>> + } > >>>>>> + } > >>>>>> + if (!error) > >>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head); > >>>>>> + spin_unlock(&meminfo_lock); > >>>>> If you have a lock, why are you needing rcu? > >>>> I think _rcu should be removed out of list_for_each_entry_rcu. > >>>> But I'm confused about what you meant. > >>>> I used rcu_read_lock on __meminfo_extra, > >>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. > >>> If that's the case, then that's fine, it just didn't seem like that was > >>> needed. Or I might have been reading your rcu logic incorrectly... > >>> > >>>>>> + if (error) > >>>>>> + kfree(meminfo); > >>>>>> +out: > >>>>>> + > >>>>>> + return error; > >>>>>> +} > >>>>>> +EXPORT_SYMBOL(register_meminfo_extra); > >>>>> EXPORT_SYMBOL_GPL()? I have to ask :) > >>>> I can use EXPORT_SYMBOL_GPL. > >>>>> thanks, > >>>>> > >>>>> greg k-h > >>>>> > >>>>> > >>>> Hello > >>>> Thank you for your comment. > >>>> > >>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page. > >>>> I'd like to hear your opinion on this /proc/meminfo_extra node. > >>> I think it is the propagation of an old and obsolete interface that you > >>> will have to support for the next 20+ years and yet not actually be > >>> useful :) > >>> > >>>> Do you think this is meaningful or cannot co-exist with other future > >>>> sysfs based API. > >>> What sysfs-based API? > >> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102 > >> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140 > > I really do not understand what you are referring to here, sorry. I do > > not see any sysfs-based code in that thread. > Sorry. I also did not see actual code. > Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff? Sorry for being late, I wasn't in "TO:", so missed the whole discussion. Greg, We need the exposed information for the memory optimizations (debug, not production) of our high speed NICs. Our devices (mlx5) allocates a lot of memory, so optimization there can help us to scale in SRIOV mode easier and be less constraint by the memory. I want to emphasize that I don't like idea of extending /proc/* interface because it is going to be painful to grep on large machines with many devices. And I don't like the idea that every driver will need to register into this interface, because it will be abused almost immediately. My proposal was to create new sysfs file by driver/core and put all information automatically there, for example, it can be /sys/devices/pci0000:00/0000:00:0c.0/meminfo ^^^^^^^ Thanks
On Sun, Mar 29, 2020 at 10:19:07AM +0300, Leon Romanovsky wrote: > On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote: > > > > > > On 2020년 03월 24일 20:46, Greg KH wrote: > > > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote: > > >> > > >> On 2020년 03월 24일 19:11, Greg KH wrote: > > >>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote: > > >>>> On 2020년 03월 23일 18:53, Greg KH wrote: > > >>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) > > >>>>>> +{ > > >>>>>> + struct meminfo_extra *meminfo, *memtemp; > > >>>>>> + int len; > > >>>>>> + int error = 0; > > >>>>>> + > > >>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); > > >>>>>> + if (!meminfo) { > > >>>>>> + error = -ENOMEM; > > >>>>>> + goto out; > > >>>>>> + } > > >>>>>> + > > >>>>>> + meminfo->val = val; > > >>>>>> + meminfo->shift_for_page = shift; > > >>>>>> + strncpy(meminfo->name, name, NAME_SIZE); > > >>>>>> + len = strlen(meminfo->name); > > >>>>>> + meminfo->name[len] = ':'; > > >>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); > > >>>>>> + while (++len < NAME_BUF_SIZE - 1) > > >>>>>> + meminfo->name_pad[len] = ' '; > > >>>>>> + > > >>>>>> + spin_lock(&meminfo_lock); > > >>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > > >>>>>> + if (memtemp->val == val) { > > >>>>>> + error = -EINVAL; > > >>>>>> + break; > > >>>>>> + } > > >>>>>> + } > > >>>>>> + if (!error) > > >>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head); > > >>>>>> + spin_unlock(&meminfo_lock); > > >>>>> If you have a lock, why are you needing rcu? > > >>>> I think _rcu should be removed out of list_for_each_entry_rcu. > > >>>> But I'm confused about what you meant. > > >>>> I used rcu_read_lock on __meminfo_extra, > > >>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. > > >>> If that's the case, then that's fine, it just didn't seem like that was > > >>> needed. Or I might have been reading your rcu logic incorrectly... > > >>> > > >>>>>> + if (error) > > >>>>>> + kfree(meminfo); > > >>>>>> +out: > > >>>>>> + > > >>>>>> + return error; > > >>>>>> +} > > >>>>>> +EXPORT_SYMBOL(register_meminfo_extra); > > >>>>> EXPORT_SYMBOL_GPL()? I have to ask :) > > >>>> I can use EXPORT_SYMBOL_GPL. > > >>>>> thanks, > > >>>>> > > >>>>> greg k-h > > >>>>> > > >>>>> > > >>>> Hello > > >>>> Thank you for your comment. > > >>>> > > >>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page. > > >>>> I'd like to hear your opinion on this /proc/meminfo_extra node. > > >>> I think it is the propagation of an old and obsolete interface that you > > >>> will have to support for the next 20+ years and yet not actually be > > >>> useful :) > > >>> > > >>>> Do you think this is meaningful or cannot co-exist with other future > > >>>> sysfs based API. > > >>> What sysfs-based API? > > >> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102 > > >> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140 > > > I really do not understand what you are referring to here, sorry. I do > > > not see any sysfs-based code in that thread. > > Sorry. I also did not see actual code. > > Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff? > > Sorry for being late, I wasn't in "TO:", so missed the whole discussion. > > Greg, > > We need the exposed information for the memory optimizations (debug, not > production) of our high speed NICs. Our devices (mlx5) allocates a lot of > memory, so optimization there can help us to scale in SRIOV mode easier and > be less constraint by the memory. Great, then use debugfs and expose what ever you want in what ever way you want, no restrictions there, you do not need any type of kernel-wide /proc file for that today. > I want to emphasize that I don't like idea of extending /proc/* interface > because it is going to be painful to grep on large machines with many > devices. And I don't like the idea that every driver will need to register > into this interface, because it will be abused almost immediately. I agree. > My proposal was to create new sysfs file by driver/core and put all > information automatically there, for example, it can be > /sys/devices/pci0000:00/0000:00:0c.0/meminfo > ^^^^^^^ Nope, again, use debugfs, as sysfs is only one-value-per-file. thanks, greg k-h
On Sun, Mar 29, 2020 at 09:23:04AM +0200, Greg KH wrote: > On Sun, Mar 29, 2020 at 10:19:07AM +0300, Leon Romanovsky wrote: > > On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote: > > > > > > > > > On 2020년 03월 24일 20:46, Greg KH wrote: > > > > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote: > > > >> > > > >> On 2020년 03월 24일 19:11, Greg KH wrote: > > > >>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote: > > > >>>> On 2020년 03월 23일 18:53, Greg KH wrote: > > > >>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) > > > >>>>>> +{ > > > >>>>>> + struct meminfo_extra *meminfo, *memtemp; > > > >>>>>> + int len; > > > >>>>>> + int error = 0; > > > >>>>>> + > > > >>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); > > > >>>>>> + if (!meminfo) { > > > >>>>>> + error = -ENOMEM; > > > >>>>>> + goto out; > > > >>>>>> + } > > > >>>>>> + > > > >>>>>> + meminfo->val = val; > > > >>>>>> + meminfo->shift_for_page = shift; > > > >>>>>> + strncpy(meminfo->name, name, NAME_SIZE); > > > >>>>>> + len = strlen(meminfo->name); > > > >>>>>> + meminfo->name[len] = ':'; > > > >>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); > > > >>>>>> + while (++len < NAME_BUF_SIZE - 1) > > > >>>>>> + meminfo->name_pad[len] = ' '; > > > >>>>>> + > > > >>>>>> + spin_lock(&meminfo_lock); > > > >>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { > > > >>>>>> + if (memtemp->val == val) { > > > >>>>>> + error = -EINVAL; > > > >>>>>> + break; > > > >>>>>> + } > > > >>>>>> + } > > > >>>>>> + if (!error) > > > >>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head); > > > >>>>>> + spin_unlock(&meminfo_lock); > > > >>>>> If you have a lock, why are you needing rcu? > > > >>>> I think _rcu should be removed out of list_for_each_entry_rcu. > > > >>>> But I'm confused about what you meant. > > > >>>> I used rcu_read_lock on __meminfo_extra, > > > >>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers. > > > >>> If that's the case, then that's fine, it just didn't seem like that was > > > >>> needed. Or I might have been reading your rcu logic incorrectly... > > > >>> > > > >>>>>> + if (error) > > > >>>>>> + kfree(meminfo); > > > >>>>>> +out: > > > >>>>>> + > > > >>>>>> + return error; > > > >>>>>> +} > > > >>>>>> +EXPORT_SYMBOL(register_meminfo_extra); > > > >>>>> EXPORT_SYMBOL_GPL()? I have to ask :) > > > >>>> I can use EXPORT_SYMBOL_GPL. > > > >>>>> thanks, > > > >>>>> > > > >>>>> greg k-h > > > >>>>> > > > >>>>> > > > >>>> Hello > > > >>>> Thank you for your comment. > > > >>>> > > > >>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page. > > > >>>> I'd like to hear your opinion on this /proc/meminfo_extra node. > > > >>> I think it is the propagation of an old and obsolete interface that you > > > >>> will have to support for the next 20+ years and yet not actually be > > > >>> useful :) > > > >>> > > > >>>> Do you think this is meaningful or cannot co-exist with other future > > > >>>> sysfs based API. > > > >>> What sysfs-based API? > > > >> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102 > > > >> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140 > > > > I really do not understand what you are referring to here, sorry. I do > > > > not see any sysfs-based code in that thread. > > > Sorry. I also did not see actual code. > > > Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff? > > > > Sorry for being late, I wasn't in "TO:", so missed the whole discussion. > > > > Greg, > > > > We need the exposed information for the memory optimizations (debug, not > > production) of our high speed NICs. Our devices (mlx5) allocates a lot of > > memory, so optimization there can help us to scale in SRIOV mode easier and > > be less constraint by the memory. > > Great, then use debugfs and expose what ever you want in what ever way > you want, no restrictions there, you do not need any type of kernel-wide > /proc file for that today. No argue here, just gave you an example why Jaewon's idea is worth to explore. > > > I want to emphasize that I don't like idea of extending /proc/* interface > > because it is going to be painful to grep on large machines with many > > devices. And I don't like the idea that every driver will need to register > > into this interface, because it will be abused almost immediately. > > I agree. > > > My proposal was to create new sysfs file by driver/core and put all > > information automatically there, for example, it can be > > /sys/devices/pci0000:00/0000:00:0c.0/meminfo > > ^^^^^^^ > > Nope, again, use debugfs, as sysfs is only one-value-per-file. Everything that is not /proc and one global file for whole kernel is fine by me. Debugfs is more than enough for us. Thanks > > thanks, > > greg k-h
diff --git a/fs/proc/Makefile b/fs/proc/Makefile index bd08616ed8ba..83d2f55591c6 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -19,6 +19,7 @@ proc-y += devices.o proc-y += interrupts.o proc-y += loadavg.o proc-y += meminfo.o +proc-y += meminfo_extra.o proc-y += stat.o proc-y += uptime.o proc-y += util.o diff --git a/fs/proc/meminfo_extra.c b/fs/proc/meminfo_extra.c new file mode 100644 index 000000000000..bd3f0d2b7fb7 --- /dev/null +++ b/fs/proc/meminfo_extra.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> + +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) +{ + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); + seq_write(m, " kB\n", 4); +} + +static LIST_HEAD(meminfo_head); +static DEFINE_SPINLOCK(meminfo_lock); + +#define NAME_SIZE 15 +#define NAME_BUF_SIZE (NAME_SIZE + 2) /* ':' and '\0' */ + +struct meminfo_extra { + struct list_head list; + atomic_long_t *val; + int shift_for_page; + char name[NAME_BUF_SIZE]; + char name_pad[NAME_BUF_SIZE]; +}; + +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name) +{ + struct meminfo_extra *meminfo, *memtemp; + int len; + int error = 0; + + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL); + if (!meminfo) { + error = -ENOMEM; + goto out; + } + + meminfo->val = val; + meminfo->shift_for_page = shift; + strncpy(meminfo->name, name, NAME_SIZE); + len = strlen(meminfo->name); + meminfo->name[len] = ':'; + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE); + while (++len < NAME_BUF_SIZE - 1) + meminfo->name_pad[len] = ' '; + + spin_lock(&meminfo_lock); + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { + if (memtemp->val == val) { + error = -EINVAL; + break; + } + } + if (!error) + list_add_tail_rcu(&meminfo->list, &meminfo_head); + spin_unlock(&meminfo_lock); + if (error) + kfree(meminfo); +out: + + return error; +} +EXPORT_SYMBOL(register_meminfo_extra); + +int unregister_meminfo_extra(atomic_long_t *val) +{ + struct meminfo_extra *memtemp; + int error = -EINVAL; + + spin_lock(&meminfo_lock); + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { + if (memtemp->val == val) { + list_del_rcu(&memtemp->list); + error = 0; + break; + } + } + spin_unlock(&meminfo_lock); + if (!error) { + synchronize_rcu(); + kfree(memtemp); + } + + return error; +} +EXPORT_SYMBOL(unregister_meminfo_extra); + +static void __meminfo_extra(struct seq_file *m) +{ + struct meminfo_extra *memtemp; + unsigned long nr_page; + + rcu_read_lock(); + list_for_each_entry_rcu(memtemp, &meminfo_head, list) { + nr_page = (unsigned long)atomic_long_read(memtemp->val); + nr_page = nr_page >> memtemp->shift_for_page; + if (m) + show_val_kb(m, memtemp->name_pad, nr_page); + else + pr_cont("%s%lukB ", memtemp->name, nr_page); + } + rcu_read_unlock(); +} + +void show_meminfo_extra(void) +{ + __meminfo_extra(NULL); +} + +static int meminfo_extra_proc_show(struct seq_file *m, void *v) +{ + __meminfo_extra(m); + + return 0; +} + +static int __init proc_meminfo_extra_init(void) +{ + proc_create_single("meminfo_extra", 0, NULL, meminfo_extra_proc_show); + return 0; +} +fs_initcall(proc_meminfo_extra_init); diff --git a/include/linux/mm.h b/include/linux/mm.h index 52269e56c514..55317161ab57 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2898,6 +2898,10 @@ void __init setup_nr_node_ids(void); static inline void setup_nr_node_ids(void) {} #endif +void show_meminfo_extra(void); +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name); +int unregister_meminfo_extra(atomic_long_t *val); + extern int memcmp_pages(struct page *page1, struct page *page2); static inline int pages_identical(struct page *page1, struct page *page2) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c4eb750a199..db1be9a39783 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5229,6 +5229,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) struct zone *zone; pg_data_t *pgdat; + show_meminfo_extra(); for_each_populated_zone(zone) { if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue;
Provide APIs to drivers so that they can show its memory usage on /proc/meminfo_extra. int register_meminfo_extra(atomic_long_t *val, int shift, const char *name); int unregister_meminfo_extra(atomic_long_t *val); Signed-off-by: Jaewon Kim <jaewon31.kim@samsung.com> --- v2: move to /proc/meminfo_extra as a new file, meminfo_extra.c use rcu to reduce lock overhead v1: print info at /proc/meminfo --- fs/proc/Makefile | 1 + fs/proc/meminfo_extra.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 4 ++ mm/page_alloc.c | 1 + 4 files changed, 129 insertions(+) create mode 100644 fs/proc/meminfo_extra.c