Message ID | ac070cd90c0d45b7a554366f235262fa5c566435.1622716926.git.legion@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v1] proc: Implement /proc/self/meminfo | expand |
[Cc linux-api] On Thu 03-06-21 12:43:07, legion@kernel.org wrote: > From: Alexey Gladkov <legion@kernel.org> > > The /proc/meminfo contains information regardless of the cgroups > restrictions. This file is still widely used [1]. This means that all > these programs will not work correctly inside container [2][3][4]. Some > programs try to respect the cgroups limits, but not all of them > implement support for all cgroup versions [5]. > > Correct information can be obtained from cgroups, but this requires the > cgroups to be available inside container and the correct version of > cgroups to be supported. > > There is lxcfs [6] that emulates /proc/meminfo using fuse to provide > information regarding cgroups. This patch can help them. > > This patch adds /proc/self/meminfo that contains a subset of > /proc/meminfo respecting cgroup restrictions. > > We cannot just create /proc/self/meminfo and make a symlink at the old > location because this will break the existing apparmor rules [7]. > Therefore, the patch adds a separate file with the same format. 
> > [1] https://codesearch.debian.net/search?q=%2Fproc%2Fmeminfo > [2] https://sources.debian.org/src/erlang/1:23.2.6+dfsg-1/lib/os_mon/c_src/memsup.c#L300 > [3] https://sources.debian.org/src/p7zip/16.02+dfsg-8/CPP/Windows/System.cpp/#L103 > [4] https://sources.debian.org/src/systemd/247.3-5/src/oom/oomd.c/#L138 > [5] https://sources.debian.org/src/nodejs/12.21.0%7Edfsg-4/deps/uv/src/unix/linux-core.c/#L1059 > [6] https://linuxcontainers.org/lxcfs/ > [7] https://gitlab.com/apparmor/apparmor/-/blob/master/profiles/apparmor.d/abstractions/base#L98 > > Signed-off-by: Alexey Gladkov <legion@kernel.org> > --- > fs/proc/base.c | 2 + > fs/proc/internal.h | 6 ++ > fs/proc/meminfo.c | 160 +++++++++++++++++++++++-------------- > include/linux/memcontrol.h | 2 + > include/linux/mm.h | 15 ++++ > mm/memcontrol.c | 80 +++++++++++++++++++ > mm/page_alloc.c | 28 ++++--- > 7 files changed, 222 insertions(+), 71 deletions(-) > > diff --git a/fs/proc/base.c b/fs/proc/base.c > index 58bbf334265b..e95837cf713f 100644 > --- a/fs/proc/base.c > +++ b/fs/proc/base.c > @@ -3269,6 +3269,7 @@ static const struct pid_entry tgid_base_stuff[] = { > #ifdef CONFIG_SECCOMP_CACHE_DEBUG > ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), > #endif > + ONE("meminfo", S_IRUGO, proc_meminfo_show), > }; > > static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) > @@ -3602,6 +3603,7 @@ static const struct pid_entry tid_base_stuff[] = { > #ifdef CONFIG_SECCOMP_CACHE_DEBUG > ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), > #endif > + ONE("meminfo", S_IRUGO, proc_meminfo_show), > }; > > static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) > diff --git a/fs/proc/internal.h b/fs/proc/internal.h > index 03415f3fb3a8..a6e8540afbd3 100644 > --- a/fs/proc/internal.h > +++ b/fs/proc/internal.h > @@ -241,6 +241,12 @@ extern int proc_net_init(void); > static inline int proc_net_init(void) { return 0; } > #endif > > +/* > + * meminfo.c > + */ > +extern int 
proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns, > + struct pid *pid, struct task_struct *tsk); > + > /* > * proc_self.c > */ > diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c > index 6fa761c9cc78..3587a79d4b96 100644 > --- a/fs/proc/meminfo.c > +++ b/fs/proc/meminfo.c > @@ -16,6 +16,9 @@ > #ifdef CONFIG_CMA > #include <linux/cma.h> > #endif > +#ifdef CONFIG_MEMCG > +#include <linux/memcontrol.h> > +#endif > #include <asm/page.h> > #include "internal.h" > > @@ -23,91 +26,112 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) > { > } > > +static void proc_fill_meminfo(struct meminfo *mi) > +{ > + int lru; > + long cached; > + > + si_meminfo(&mi->si); > + si_swapinfo(&mi->si); > + > + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) > + mi->pages[lru] = global_node_page_state(NR_LRU_BASE + lru); > + > + cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - mi->si.bufferram; > + if (cached < 0) > + cached = 0; > + > + mi->cached = cached; > + mi->swapcached = total_swapcache_pages(); > + mi->slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); > + mi->slab_unreclaimable = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); > + mi->anon_pages = global_node_page_state(NR_ANON_MAPPED); > + mi->mapped = global_node_page_state(NR_FILE_MAPPED); > + mi->nr_pagetable = global_node_page_state(NR_PAGETABLE); > + mi->dirty_pages = global_node_page_state(NR_FILE_DIRTY); > + mi->writeback_pages = global_node_page_state(NR_WRITEBACK); > +} > + > +#ifdef CONFIG_MEMCG > +static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task) > +{ > + mem_fill_meminfo(mi, task); > +} > +#else > +static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task) > +{ > + proc_fill_meminfo(mi); > +} > +#endif > + > static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) > { > seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); > seq_write(m, " kB\n", 4); > } > 
> +static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi) > +{ > + show_val_kb(m, "MemTotal: ", mi->si.totalram); > + show_val_kb(m, "MemFree: ", mi->si.freeram); > + show_val_kb(m, "MemAvailable: ", si_mem_available_mi(mi)); > + show_val_kb(m, "Buffers: ", mi->si.bufferram); > + show_val_kb(m, "Cached: ", mi->cached); > + show_val_kb(m, "SwapCached: ", mi->swapcached); > + show_val_kb(m, "Active: ", mi->pages[LRU_ACTIVE_ANON] + mi->pages[LRU_ACTIVE_FILE]); > + show_val_kb(m, "Inactive: ", mi->pages[LRU_INACTIVE_ANON] + mi->pages[LRU_INACTIVE_FILE]); > + show_val_kb(m, "Active(anon): ", mi->pages[LRU_ACTIVE_ANON]); > + show_val_kb(m, "Inactive(anon): ", mi->pages[LRU_INACTIVE_ANON]); > + show_val_kb(m, "Active(file): ", mi->pages[LRU_ACTIVE_FILE]); > + show_val_kb(m, "Inactive(file): ", mi->pages[LRU_INACTIVE_FILE]); > + show_val_kb(m, "Unevictable: ", mi->pages[LRU_UNEVICTABLE]); > + > +#ifdef CONFIG_HIGHMEM > + show_val_kb(m, "HighTotal: ", mi->si.totalhigh); > + show_val_kb(m, "HighFree: ", mi->si.freehigh); > + show_val_kb(m, "LowTotal: ", mi->si.totalram - mi->si.totalhigh); > + show_val_kb(m, "LowFree: ", mi->si.freeram - mi->si.freehigh); > +#endif > + > + show_val_kb(m, "SwapTotal: ", mi->si.totalswap); > + show_val_kb(m, "SwapFree: ", mi->si.freeswap); > + show_val_kb(m, "Dirty: ", mi->dirty_pages); > + show_val_kb(m, "Writeback: ", mi->writeback_pages); > + > + show_val_kb(m, "AnonPages: ", mi->anon_pages); > + show_val_kb(m, "Mapped: ", mi->mapped); > + show_val_kb(m, "Shmem: ", mi->si.sharedram); > + show_val_kb(m, "Slab: ", mi->slab_reclaimable + mi->slab_unreclaimable); > + show_val_kb(m, "SReclaimable: ", mi->slab_reclaimable); > + show_val_kb(m, "SUnreclaim: ", mi->slab_unreclaimable); > + show_val_kb(m, "PageTables: ", mi->nr_pagetable); > + > + return 0; > +} > + > static int meminfo_proc_show(struct seq_file *m, void *v) > { > - struct sysinfo i; > - unsigned long committed; > - long cached; > - long available; > - unsigned long 
pages[NR_LRU_LISTS]; > - unsigned long sreclaimable, sunreclaim; > - int lru; > > - si_meminfo(&i); > - si_swapinfo(&i); > - committed = vm_memory_committed(); > + struct meminfo mi; > > - cached = global_node_page_state(NR_FILE_PAGES) - > - total_swapcache_pages() - i.bufferram; > - if (cached < 0) > - cached = 0; > + proc_fill_meminfo(&mi); > + meminfo_proc_show_mi(m, &mi); > > - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) > - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); > - > - available = si_mem_available(); > - sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); > - sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); > - > - show_val_kb(m, "MemTotal: ", i.totalram); > - show_val_kb(m, "MemFree: ", i.freeram); > - show_val_kb(m, "MemAvailable: ", available); > - show_val_kb(m, "Buffers: ", i.bufferram); > - show_val_kb(m, "Cached: ", cached); > - show_val_kb(m, "SwapCached: ", total_swapcache_pages()); > - show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + > - pages[LRU_ACTIVE_FILE]); > - show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + > - pages[LRU_INACTIVE_FILE]); > - show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); > - show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); > - show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); > - show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); > - show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); > show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); > > -#ifdef CONFIG_HIGHMEM > - show_val_kb(m, "HighTotal: ", i.totalhigh); > - show_val_kb(m, "HighFree: ", i.freehigh); > - show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh); > - show_val_kb(m, "LowFree: ", i.freeram - i.freehigh); > -#endif > - > #ifndef CONFIG_MMU > show_val_kb(m, "MmapCopy: ", > (unsigned long)atomic_long_read(&mmap_pages_allocated)); > #endif > > - show_val_kb(m, "SwapTotal: ", i.totalswap); > - show_val_kb(m, "SwapFree: ", i.freeswap); > - 
show_val_kb(m, "Dirty: ", > - global_node_page_state(NR_FILE_DIRTY)); > - show_val_kb(m, "Writeback: ", > - global_node_page_state(NR_WRITEBACK)); > - show_val_kb(m, "AnonPages: ", > - global_node_page_state(NR_ANON_MAPPED)); > - show_val_kb(m, "Mapped: ", > - global_node_page_state(NR_FILE_MAPPED)); > - show_val_kb(m, "Shmem: ", i.sharedram); > - show_val_kb(m, "KReclaimable: ", sreclaimable + > + show_val_kb(m, "KReclaimable: ", mi.slab_reclaimable + > global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); > - show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); > - show_val_kb(m, "SReclaimable: ", sreclaimable); > - show_val_kb(m, "SUnreclaim: ", sunreclaim); > seq_printf(m, "KernelStack: %8lu kB\n", > global_node_page_state(NR_KERNEL_STACK_KB)); > #ifdef CONFIG_SHADOW_CALL_STACK > seq_printf(m, "ShadowCallStack:%8lu kB\n", > global_node_page_state(NR_KERNEL_SCS_KB)); > #endif > - show_val_kb(m, "PageTables: ", > - global_node_page_state(NR_PAGETABLE)); > > show_val_kb(m, "NFS_Unstable: ", 0); > show_val_kb(m, "Bounce: ", > @@ -115,7 +139,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) > show_val_kb(m, "WritebackTmp: ", > global_node_page_state(NR_WRITEBACK_TEMP)); > show_val_kb(m, "CommitLimit: ", vm_commit_limit()); > - show_val_kb(m, "Committed_AS: ", committed); > + show_val_kb(m, "Committed_AS: ", vm_memory_committed()); > seq_printf(m, "VmallocTotal: %8lu kB\n", > (unsigned long)VMALLOC_TOTAL >> 10); > show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); > @@ -153,6 +177,20 @@ static int meminfo_proc_show(struct seq_file *m, void *v) > return 0; > } > > +int proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns, > + struct pid *pid, struct task_struct *task) > +{ > + struct meminfo mi; > + > + fill_meminfo(&mi, task); > + > + meminfo_proc_show_mi(m, &mi); > + hugetlb_report_meminfo(m); > + arch_report_meminfo(m); > + > + return 0; > +} > + > static int __init proc_meminfo_init(void) > { > proc_create_single("meminfo", 0, NULL, 
meminfo_proc_show); > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index c193be760709..4a7e2894954f 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -1119,6 +1119,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, > gfp_t gfp_mask, > unsigned long *total_scanned); > > +void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task); > + > #else /* CONFIG_MEMCG */ > > #define MEM_CGROUP_ID_SHIFT 0 > diff --git a/include/linux/mm.h b/include/linux/mm.h > index c274f75efcf9..7faeaddd5b88 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2467,6 +2467,20 @@ static inline int early_pfn_to_nid(unsigned long pfn) > extern int __meminit early_pfn_to_nid(unsigned long pfn); > #endif > > +struct meminfo { > + struct sysinfo si; > + unsigned long pages[NR_LRU_LISTS]; > + unsigned long cached; > + unsigned long swapcached; > + unsigned long anon_pages; > + unsigned long mapped; > + unsigned long nr_pagetable; > + unsigned long dirty_pages; > + unsigned long writeback_pages; > + unsigned long slab_reclaimable; > + unsigned long slab_unreclaimable; > +}; > + > extern void set_dma_reserve(unsigned long new_dma_reserve); > extern void memmap_init_range(unsigned long, int, unsigned long, > unsigned long, unsigned long, enum meminit_context, > @@ -2477,6 +2491,7 @@ extern int __meminit init_per_zone_wmark_min(void); > extern void mem_init(void); > extern void __init mmap_init(void); > extern void show_mem(unsigned int flags, nodemask_t *nodemask); > +extern long si_mem_available_mi(struct meminfo *mi); > extern long si_mem_available(void); > extern void si_meminfo(struct sysinfo * val); > extern void si_meminfo_node(struct sysinfo *val, int nid); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 64ada9e650a5..344b546f9e25 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -3750,6 +3750,86 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, > 
return nr; > } > > +static void mem_cgroup_nr_pages(struct mem_cgroup *memcg, int nid, unsigned long *pages) > +{ > + struct mem_cgroup *iter; > + int i; > + > + for_each_mem_cgroup_tree(iter, memcg) { > + for (i = 0; i < NR_LRU_LISTS; i++) > + pages[i] += mem_cgroup_node_nr_lru_pages(iter, nid, BIT(i), false); > + } > +} > + > +static void mem_cgroup_si_meminfo(struct sysinfo *si, struct task_struct *task) > +{ > + unsigned long memtotal, memused, swapsize; > + struct mem_cgroup *memcg; > + struct cgroup_subsys_state *css; > + > + css = task_css(task, memory_cgrp_id); > + memcg = mem_cgroup_from_css(css); > + > + memtotal = READ_ONCE(memcg->memory.max); > + > + if (memtotal != PAGE_COUNTER_MAX) { > + memused = page_counter_read(&memcg->memory); > + > + si->totalram = memtotal; > + si->freeram = (memtotal > memused ? memtotal - memused : 0); > + si->sharedram = memcg_page_state(memcg, NR_SHMEM); > + > + si->bufferram = nr_blockdev_pages(); > + si->totalhigh = totalhigh_pages(); > + si->freehigh = nr_free_highpages(); > + si->mem_unit = PAGE_SIZE; > + } else { > + si_meminfo(si); > + memused = si->totalram - si->freeram; > + } > + > + swapsize = READ_ONCE(memcg->memsw.max); > + > + if (swapsize != PAGE_COUNTER_MAX) { > + unsigned long swaptotal, swapused; > + > + swaptotal = swapsize - memtotal; > + swapused = page_counter_read(&memcg->memsw) - memused; > + si->totalswap = swaptotal; > + /* Due to global reclaim, memory.memsw.usage can be greater than > + * (memory.memsw.max - memory.max). */ > + si->freeswap = (swaptotal > swapused ? 
swaptotal - swapused : 0); > + } else { > + si_swapinfo(si); > + } > + > + css_put(css); > +} > + > +void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task) > +{ > + struct cgroup_subsys_state *memcg_css = task_css(task, memory_cgrp_id); > + struct mem_cgroup *memcg = mem_cgroup_from_css(memcg_css); > + int nid; > + > + memset(&mi->pages, 0, sizeof(mi->pages)); > + > + mem_cgroup_si_meminfo(&mi->si, task); > + > + for_each_online_node(nid) > + mem_cgroup_nr_pages(memcg, nid, mi->pages); > + > + mi->slab_reclaimable = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B); > + mi->slab_unreclaimable = memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); > + mi->cached = memcg_page_state(memcg, NR_FILE_PAGES); > + mi->swapcached = memcg_page_state(memcg, NR_SWAPCACHE); > + mi->anon_pages = memcg_page_state(memcg, NR_ANON_MAPPED); > + mi->mapped = memcg_page_state(memcg, NR_FILE_MAPPED); > + mi->nr_pagetable = memcg_page_state(memcg, NR_PAGETABLE); > + mi->dirty_pages = memcg_page_state(memcg, NR_FILE_DIRTY); > + mi->writeback_pages = memcg_page_state(memcg, NR_WRITEBACK); > +} > + > static int memcg_numa_stat_show(struct seq_file *m, void *v) > { > struct numa_stat { > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index aaa1655cf682..0a3c9dcd2c13 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -5551,18 +5551,13 @@ static inline void show_node(struct zone *zone) > printk("Node %d ", zone_to_nid(zone)); > } > > -long si_mem_available(void) > +long si_mem_available_mi(struct meminfo *mi) > { > long available; > unsigned long pagecache; > unsigned long wmark_low = 0; > - unsigned long pages[NR_LRU_LISTS]; > unsigned long reclaimable; > struct zone *zone; > - int lru; > - > - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) > - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); > > for_each_zone(zone) > wmark_low += low_wmark_pages(zone); > @@ -5571,14 +5566,14 @@ long si_mem_available(void) > * Estimate the amount of memory available for userspace 
allocations, > * without causing swapping. > */ > - available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; > + available = mi->si.freeram - totalreserve_pages; > > /* > * Not all the page cache can be freed, otherwise the system will > * start swapping. Assume at least half of the page cache, or the > * low watermark worth of cache, needs to stay. > */ > - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; > + pagecache = mi->pages[LRU_ACTIVE_FILE] + mi->pages[LRU_INACTIVE_FILE]; > pagecache -= min(pagecache / 2, wmark_low); > available += pagecache; > > @@ -5587,14 +5582,27 @@ long si_mem_available(void) > * items that are in use, and cannot be freed. Cap this estimate at the > * low watermark. > */ > - reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + > - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); > + reclaimable = mi->slab_reclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); > available += reclaimable - min(reclaimable / 2, wmark_low); > > if (available < 0) > available = 0; > return available; > } > + > +long si_mem_available(void) > +{ > + struct meminfo mi; > + int lru; > + > + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) > + mi.pages[lru] = global_node_page_state(NR_LRU_BASE + lru); > + > + mi.si.freeram = global_zone_page_state(NR_FREE_PAGES); > + mi.slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); > + > + return si_mem_available_mi(&mi); > +} > EXPORT_SYMBOL_GPL(si_mem_available); > > void si_meminfo(struct sysinfo *val) > -- > 2.29.3
Hi Alexey, legion@kernel.org writes: >From: Alexey Gladkov <legion@kernel.org> >The /proc/meminfo contains information regardless of the cgroups >restrictions. This file is still widely used [1]. This means that all >these programs will not work correctly inside container [2][3][4]. Some >programs try to respect the cgroups limits, but not all of them >implement support for all cgroup versions [5]. > >Correct information can be obtained from cgroups, but this requires the >cgroups to be available inside container and the correct version of >cgroups to be supported. Then they should add support for it. We already export these metrics as part of cgroups and plenty of applications like Docker, podman, containerd, systemd, runc, etc already support it. Putting stuff in /proc to get around the problem of "some other metric I need might not be exported to a container" is not a very compelling argument. If they want it, then export it to the container... Ultimately, if they're going to have to add support for a new /proc/self/meminfo file anyway, these use cases should just do it properly through the already supported APIs. >+ for_each_online_node(nid) >+ mem_cgroup_nr_pages(memcg, nid, mi->pages); >+ >+ mi->slab_reclaimable = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B); >+ mi->slab_unreclaimable = memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); >+ mi->cached = memcg_page_state(memcg, NR_FILE_PAGES); >+ mi->swapcached = memcg_page_state(memcg, NR_SWAPCACHE); >+ mi->anon_pages = memcg_page_state(memcg, NR_ANON_MAPPED); >+ mi->mapped = memcg_page_state(memcg, NR_FILE_MAPPED); >+ mi->nr_pagetable = memcg_page_state(memcg, NR_PAGETABLE); >+ mi->dirty_pages = memcg_page_state(memcg, NR_FILE_DIRTY); >+ mi->writeback_pages = memcg_page_state(memcg, NR_WRITEBACK); >+} This presents an extraordinarily confusing API. 
A cgroup can contain more than one process, so it's not right to present this information as "meminfo" in /proc/self when these statistics may not have any relation to the current task in question.
On 03.06.21 13:33, Chris Down wrote: Hi folks, > Putting stuff in /proc to get around the problem of "some other metric I > need might not be exported to a container" is not a very compelling > argument. If they want it, then export it to the container... > > Ultimately, if they're going to have to add support for a new > /proc/self/meminfo file anyway, these use cases should just do it > properly through the already supported APIs. It's even a bit more complex ... /proc/meminfo always tells what the *machine* has available, not what a process can eat up. That has been this way even long before cgroups. (eg. ulimits). Even if you want a container look more like a VM - /proc/meminfo showing what the container (instead of the machine) has available - just looking at the calling task's cgroup is also wrong. Because there're cgroups outside containers (that really shouldn't be affected) and there're even other cgroups inside the container (that further restrict below the container's limits). BTW: applications trying to autotune themselves by looking at /proc/meminfo are broken-by-design anyways. This never has been a valid metric on how much memory individual processes can or should eat. --mtx
"Enrico Weigelt, metux IT consult" <lkml@metux.net> writes: > On 03.06.21 13:33, Chris Down wrote: > > Hi folks, > > >> Putting stuff in /proc to get around the problem of "some other metric I need >> might not be exported to a container" is not a very compelling argument. If >> they want it, then export it to the container... >> >> Ultimately, if they're going to have to add support for a new >> /proc/self/meminfo file anyway, these use cases should just do it properly >> through the already supported APIs. > > It's even a bit more complex ... > > /proc/meminfo always tells what the *machine* has available, not what a > process can eat up. That has been this way even long before cgroups. > (eg. ulimits). > > Even if you want a container look more like a VM - /proc/meminfo showing > what the container (instead of the machine) has available - just looking > at the calling task's cgroup is also wrong. Because there're cgroups > outside containers (that really shouldn't be affected) and there're even > other cgroups inside the container (that further restrict below the > container's limits). > > BTW: applications trying to autotune themselves by looking at > /proc/meminfo are broken-by-design anyways. This never has been a valid > metric on how much memory invididual processes can or should eat. Which brings us to the problem. Using /proc/meminfo is not valid unless your application can know it has the machine to itself. Something that is becoming increasing less common. Unless something has changed in the last couple of years, reading values out of the cgroup filesystem is both difficult (v1 and v2 have some gratuitous differences) and is actively discouraged. So what should applications do? Alex has found applications that are trying to do something with meminfo, and the fields that those applications care about. I don't see anyone making the case that specifically what the applications are trying to do is buggy. 
Alex's suggestion is to have a /proc/self/meminfo that has the information that applications want, which would be something that would be easy to switch applications to. The patch to userspace at that point is as simple as 3 lines of code. I can imagine people take that patch into their userspace programs. The simple fact that people are using /proc/meminfo when it doesn't make sense for anything except system monitoring tools is a pretty solid bug report on the existing linux apis. So how do people suggest these applications get the information they need? Alex perhaps I missed it, but I didn't see a lot of description on why the individual applications are using meminfo. Can you provide a bit more detail? At least for several of them, so we can see the trends and say yes this kind of information makes sense to provide to applications. I think that would help move forward the discussion about the best way to provide that information to userspace. Eric
On Wed, Jun 09, 2021 at 02:14:16PM -0500, Eric W. Biederman wrote: > "Enrico Weigelt, metux IT consult" <lkml@metux.net> writes: > > > On 03.06.21 13:33, Chris Down wrote: > > > > Hi folks, > > > > > >> Putting stuff in /proc to get around the problem of "some other metric I need > >> might not be exported to a container" is not a very compelling argument. If > >> they want it, then export it to the container... > >> > >> Ultimately, if they're going to have to add support for a new > >> /proc/self/meminfo file anyway, these use cases should just do it properly > >> through the already supported APIs. > > > > It's even a bit more complex ... > > > > /proc/meminfo always tells what the *machine* has available, not what a > > process can eat up. That has been this way even long before cgroups. > > (eg. ulimits). > > > > Even if you want a container look more like a VM - /proc/meminfo showing > > what the container (instead of the machine) has available - just looking > > at the calling task's cgroup is also wrong. Because there're cgroups > > outside containers (that really shouldn't be affected) and there're even > > other cgroups inside the container (that further restrict below the > > container's limits). > > > > BTW: applications trying to autotune themselves by looking at > > /proc/meminfo are broken-by-design anyways. This never has been a valid > > metric on how much memory invididual processes can or should eat. > > Which brings us to the problem. > > Using /proc/meminfo is not valid unless your application can know it has > the machine to itself. Something that is becoming increasing less > common. > > Unless something has changed in the last couple of years, reading values > out of the cgroup filesystem is both difficult (v1 and v2 have some > gratuitous differences) and is actively discouraged. > > So what should applications do? 
> > Alex has found applications that are trying to do something with > meminfo, and the fields that those applications care about. I don't see > anyone making the case that specifically what the applications are > trying to do is buggy. > > Alex's suggest is to have a /proc/self/meminfo that has the information > that applications want, which would be something that would be easy > to switch applications to. The patch to userspace at that point is > as simple as 3 lines of code. I can imagine people take that patch into > their userspace programs. But is it actually what applications want? Not all the information at the system level translates well to the container level. Things like available memory require a hierarchical assessment rather than just a look at the local level, since there could be limits higher up the tree. Not all items in meminfo have a container equivalent, either. The familiar format is likely a liability rather than an asset. > The simple fact that people are using /proc/meminfo when it doesn't make > sense for anything except system monitoring tools is a pretty solid bug > report on the existing linux apis. I agree that we likely need a better interface for applications to query the memory state of their container. But I don't think we should try to emulate a format that is a poor fit for this. We should also not speculate what users intended to do with the meminfo data right now. There is a surprising amount of misconception around what these values actually mean. I'd rather have users show up on the mailing list directly and outline the broader usecase.
Johannes Weiner <hannes@cmpxchg.org> writes: > On Wed, Jun 09, 2021 at 02:14:16PM -0500, Eric W. Biederman wrote: >> "Enrico Weigelt, metux IT consult" <lkml@metux.net> writes: >> >> > On 03.06.21 13:33, Chris Down wrote: >> > >> > Hi folks, >> > >> > >> >> Putting stuff in /proc to get around the problem of "some other metric I need >> >> might not be exported to a container" is not a very compelling argument. If >> >> they want it, then export it to the container... >> >> >> >> Ultimately, if they're going to have to add support for a new >> >> /proc/self/meminfo file anyway, these use cases should just do it properly >> >> through the already supported APIs. >> > >> > It's even a bit more complex ... >> > >> > /proc/meminfo always tells what the *machine* has available, not what a >> > process can eat up. That has been this way even long before cgroups. >> > (eg. ulimits). >> > >> > Even if you want a container look more like a VM - /proc/meminfo showing >> > what the container (instead of the machine) has available - just looking >> > at the calling task's cgroup is also wrong. Because there're cgroups >> > outside containers (that really shouldn't be affected) and there're even >> > other cgroups inside the container (that further restrict below the >> > container's limits). >> > >> > BTW: applications trying to autotune themselves by looking at >> > /proc/meminfo are broken-by-design anyways. This never has been a valid >> > metric on how much memory invididual processes can or should eat. >> >> Which brings us to the problem. >> >> Using /proc/meminfo is not valid unless your application can know it has >> the machine to itself. Something that is becoming increasing less >> common. >> >> Unless something has changed in the last couple of years, reading values >> out of the cgroup filesystem is both difficult (v1 and v2 have some >> gratuitous differences) and is actively discouraged. >> >> So what should applications do? 
>> >> Alex has found applications that are trying to do something with >> meminfo, and the fields that those applications care about. I don't see >> anyone making the case that specifically what the applications are >> trying to do is buggy. >> >> Alex's suggest is to have a /proc/self/meminfo that has the information >> that applications want, which would be something that would be easy >> to switch applications to. The patch to userspace at that point is >> as simple as 3 lines of code. I can imagine people take that patch into >> their userspace programs. > > But is it actually what applications want? > > Not all the information at the system level translates well to the > container level. Things like available memory require a hierarchical > assessment rather than just a look at the local level, since there > could be limits higher up the tree. That sounds like a bug in the implementation of /proc/self/meminfo. It certainly is a legitimate question to ask what are the limits from my perspective. > Not all items in meminfo have a container equivalent, either. Not all items in meminfo were implemented. > The familiar format is likely a liability rather than an asset. It could be. At the same time that is the only format anyone has proposed so we good counter proposal would be appreciated if you don't like the code that has been written. >> The simple fact that people are using /proc/meminfo when it doesn't make >> sense for anything except system monitoring tools is a pretty solid bug >> report on the existing linux apis. > > I agree that we likely need a better interface for applications to > query the memory state of their container. But I don't think we should > try to emulate a format that is a poor fit for this. I don't think it is the container that we care about (except for maybe system managment tools). I think the truly interesting case is applications asking what do I have available to me. 
> We should also not speculate what users intended to do with the > meminfo data right now. There is a surprising amount of misconception > around what these values actually mean. I'd rather have users show up > on the mailing list directly and outline the broader usecase. We are kernel developers, we can read code. We don't need to speculate. We can read the userspace code. If things are not clear we can ask their developers. Eric
On 6/9/21 16:56, Eric W. Biederman wrote: > Johannes Weiner <hannes@cmpxchg.org> writes: > >> On Wed, Jun 09, 2021 at 02:14:16PM -0500, Eric W. Biederman wrote: >>> "Enrico Weigelt, metux IT consult" <lkml@metux.net> writes: >>> >>>> On 03.06.21 13:33, Chris Down wrote: >>>> >>>> Hi folks, >>>> >>>> >>>>> Putting stuff in /proc to get around the problem of "some other metric I need >>>>> might not be exported to a container" is not a very compelling argument. If >>>>> they want it, then export it to the container... >>>>> >>>>> Ultimately, if they're going to have to add support for a new >>>>> /proc/self/meminfo file anyway, these use cases should just do it properly >>>>> through the already supported APIs. >>>> It's even a bit more complex ... >>>> >>>> /proc/meminfo always tells what the *machine* has available, not what a >>>> process can eat up. That has been this way even long before cgroups. >>>> (eg. ulimits). >>>> >>>> Even if you want a container look more like a VM - /proc/meminfo showing >>>> what the container (instead of the machine) has available - just looking >>>> at the calling task's cgroup is also wrong. Because there're cgroups >>>> outside containers (that really shouldn't be affected) and there're even >>>> other cgroups inside the container (that further restrict below the >>>> container's limits). >>>> >>>> BTW: applications trying to autotune themselves by looking at >>>> /proc/meminfo are broken-by-design anyways. This never has been a valid >>>> metric on how much memory invididual processes can or should eat. >>> Which brings us to the problem. >>> >>> Using /proc/meminfo is not valid unless your application can know it has >>> the machine to itself. Something that is becoming increasing less >>> common. >>> >>> Unless something has changed in the last couple of years, reading values >>> out of the cgroup filesystem is both difficult (v1 and v2 have some >>> gratuitous differences) and is actively discouraged. 
>>> >>> So what should applications do? >>> >>> Alex has found applications that are trying to do something with >>> meminfo, and the fields that those applications care about. I don't see >>> anyone making the case that specifically what the applications are >>> trying to do is buggy. >>> >>> Alex's suggest is to have a /proc/self/meminfo that has the information >>> that applications want, which would be something that would be easy >>> to switch applications to. The patch to userspace at that point is >>> as simple as 3 lines of code. I can imagine people take that patch into >>> their userspace programs. >> But is it actually what applications want? >> >> Not all the information at the system level translates well to the >> container level. Things like available memory require a hierarchical >> assessment rather than just a look at the local level, since there >> could be limits higher up the tree. > That sounds like a bug in the implementation of /proc/self/meminfo. > > It certainly is a legitimate question to ask what are the limits > from my perspective. > >> Not all items in meminfo have a container equivalent, either. > Not all items in meminfo were implemented. > >> The familiar format is likely a liability rather than an asset. > It could be. At the same time that is the only format anyone has > proposed so we good counter proposal would be appreciated if you don't > like the code that has been written. > >>> The simple fact that people are using /proc/meminfo when it doesn't make >>> sense for anything except system monitoring tools is a pretty solid bug >>> report on the existing linux apis. >> I agree that we likely need a better interface for applications to >> query the memory state of their container. But I don't think we should >> try to emulate a format that is a poor fit for this. > I don't think it is the container that we care about (except for maybe > system managment tools). 
I think the truly interesting case is applications asking what do I have available to me. I have heard that the JRE makes assumptions on the number of threads to use based on memory. Lots of humans use top and vmstat to try to figure out what is available in their environment. Debugging tools trying to figure out why an application is running poorly. We would like to not need to mount the cgroup file system into a container at all, and as Eric stated processes trying to differentiate between cgroupv1 and cgroupv2. >> We should also not speculate what users intended to do with the >> meminfo data right now. There is a surprising amount of misconception >> around what these values actually mean. I'd rather have users show up >> on the mailing list directly and outline the broader usecase. > We are kernel developers, we can read code. We don't need to speculate. > We can read the userspace code. If things are not clear we can ask > their developers. > > Eric > >
On Thu, Jun 03, 2021 at 12:43:07PM +0200, legion@kernel.org wrote: > From: Alexey Gladkov <legion@kernel.org> > > The /proc/meminfo contains information regardless of the cgroups > restrictions. This file is still widely used [1]. This means that all > these programs will not work correctly inside container [2][3][4]. Some > programs try to respect the cgroups limits, but not all of them > implement support for all cgroup versions [5]. > > Correct information can be obtained from cgroups, but this requires the > cgroups to be available inside container and the correct version of > cgroups to be supported. > > There is lxcfs [6] that emulates /proc/meminfo using fuse to provide > information regarding cgroups. This patch can help them. > > This patch adds /proc/self/meminfo that contains a subset of > /proc/meminfo respecting cgroup restrictions. > > We cannot just create /proc/self/meminfo and make a symlink at the old > location because this will break the existing apparmor rules [7]. > Therefore, the patch adds a separate file with the same format. Interesting work. Thanks. This is basically a variant of what I suggested at Plumbers and in [1]. Judging from the patches sent by Waiman Long in [2] to also virtualize /proc/cpuinfo and /sys/devices/system/cpu this is a larger push to provide virtualized system information to containers. Although somewhere in the thread here this veered off into apparently just being a way for a process to gather information about it's own resources. At which point I'm confused why looking at its cgroups isn't enough. So /proc/self/meminfo seems to just be the start. And note the two approaches seem to diverge too. This provides a new file while the other patchset virtualizes existing proc files/folders. In any case it seems you might want to talk since afaict you're all at the same company but don't seem to be aware of each others work (Which happens of course.). 
For the sake of history such patchsets have been pushed for before by the Siteground people. Chris and Johannes made a good point that the information provided in this file can be gathered from cgroups already. So applications should probably switch to reading those out of their cgroup and most are doing that already. And reading values out of cgroups is pretty straightforward even with the differences between cgroup v1 and v2. Userspace is doing it all over the place all of the time and the code has now existed for years so the cgroup interface is not a problem. And with cgroup v2 it keeps growing so many more useful metrics that looking at meminfo isn't really cutting it anyway. So I think the argument that applications should start looking at their cgroup info if they want to find out detailed info is a solid argument that shouldn't be easily brushed aside. What might be worthwhile is knowing exactly what applications are looking at /proc/meminfo and /proc/cpuinfo and make a decision based on that info. None of that is clearly outlined in the thread unfortunately. So I immediately see two types of applications that could benefit from this patchset. The first ones are legacy applications that aren't aware of cgroups and aren't actively maintained. Introducing such functionality for these applications seems a weak argument. The second type is new and maintained applications that look at global info such as /proc/meminfo and /proc/cpuinfo. So such applications have ignored cgroups for a decade now. This makes it very unconvincing that they will suddenly switch to a newly introduced file. Especially if the entries in a new file aren't a 1:1 mapping of the old file. Johannes made another good point about it not being clear what applications actually want. And he's very right in that. It seems straightforward to virtualize things like meminfo but it actually isn't. And it's something you quite often discover after the fact. 
We have extensive experience implementing it in LXCFS in userspace. People kept and keep arguing what information exactly is supposed to go into calculating those values based on what best helps their use-case. Swap was an especially contentious point. In fact, sometimes users want to turn off swap even though it exists on the host and there's a command line switch in LXCFS to control that behavior. Another example supporting Johannes' worry is virtualizing /proc/cpuinfo where some people wanted to virtualize cpu counts based on cpu shares. So we have two modes to virtualize cpus: based on cpuset alone or based on cpuset and cpu shares. And both modes are actively used. And that all really depends on application and workload. Finally, although LXCFS is briefly referenced in the commit message, it isn't explained very well what it does. And we should consider it since this is a full existing userspace solution to the problem solved in this patchset including Dan's JRE use-case. This is a project started in 2014 and it has been in production use since 2014 and it delivers the features of this patchset here and more. For example, it's used in the Linux subsystem of Chromebooks, it's used by Alibaba (see [3]) and it is used for the JRE use-case by Google's Anthos when migrating such legacy applications (see [4]). At first, I was convinced we could make use of /proc/self/meminfo in LXCFS which is why I held back but we can't. We can't simply bind-mount it over /proc/meminfo because it's not a 1:1 correspondence between all fields. We could potentially read some values we now calculate and display it in /proc/meminfo but we can't stop virtualizing /proc/meminfo itself. So we don't gain anything from this. When Alex asked me about it I tried to come up with good ways to integrate this but the gain is just too little for us. Because our experience tells us that applications that want this type of virtualization don't really care about their own resources. 
They care about a virtualized view of the system's resources. And the system in question is often a container. But it gets very tricky since we don't really define what a container is. So what data the user wants to see depends on the used container runtime, type of container, and workload. An application container has very different needs than a system container that boots systemd. LXCFS can be very flexible here and virtualize according to the user's preferences (see the split between cpuset and cpuset + cpu shares virtualization for cpu counts). In any case, LXCFS is a tiny FUSE filesystem which virtualizes various procfs and sysfs files for a container: /proc/cpuinfo /proc/diskstats /proc/meminfo /proc/stat /proc/swaps /proc/uptime /proc/slabinfo /sys/devices/system/cpu/* /sys/devices/system/cpu/online If you call top in a container that makes use of this it will display everything virtualized to the container (See [5] for an example of /proc/cpuinfo and /sys/devices/system/cpu/*.). And JRE will not overallocate resources. It's actively used for all of that. Below at [5] you can find an example where 2 cpus out of 8 have been assigned to the container's cpuset. The container values are virtualized as you can see. 
[1]: https://lkml.org/lkml/2020/6/4/951 [2]: https://lore.kernel.org/lkml/YMe/cGV4JPbzFRk0@slm.duckdns.org [3]: https://www.alibabacloud.com/blog/kubernetes-demystified-using-lxcfs-to-improve-container-resource-visibility_594109 [4]: https://cloud.google.com/blog/products/containers-kubernetes/migrate-for-anthos-streamlines-legacy-java-app-modernization [5]: ## /proc/cpuinfo #### Host brauner@wittgenstein|~ > ls -al /sys/devices/system/cpu/ | grep cpu[[:digit:]] drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu0 drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu1 drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu2 drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu3 drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu4 drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu5 drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu6 drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu7 #### Container brauner@wittgenstein|~ > lxc exec f1 -- ls -al /sys/devices/system/cpu/ | grep cpu[[:digit:]] drwxr-xr-x 2 nobody nogroup 0 Jun 15 10:22 cpu3 drwxr-xr-x 2 nobody nogroup 0 Jun 15 10:22 cpu4 ## /sys/devices/system/cpu/* #### Host brauner@wittgenstein|~ > grep ^processor /proc/cpuinfo processor : 0 processor : 1 processor : 2 processor : 3 processor : 4 processor : 5 processor : 6 processor : 7 #### Container brauner@wittgenstein|~ > lxc exec f1 -- grep ^processor /proc/cpuinfo processor : 0 processor : 1 ## top #### Host top - 13:16:47 up 15:54, 39 users, load average: 0,76, 0,47, 0,40 Tasks: 434 total, 1 running, 433 sleeping, 0 stopped, 0 zombie %Cpu0 : 2,7 us, 2,4 sy, 0,0 ni, 94,5 id, 0,0 wa, 0,0 hi, 0,3 si, 0,0 st %Cpu1 : 3,3 us, 1,3 sy, 0,0 ni, 95,3 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st %Cpu2 : 1,6 us, 9,1 sy, 0,0 ni, 89,3 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st %Cpu3 : 2,3 us, 1,3 sy, 0,0 ni, 96,4 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st %Cpu4 : 2,7 us, 1,7 sy, 0,0 ni, 95,7 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st %Cpu5 : 2,9 us, 2,9 sy, 0,0 ni, 94,1 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st %Cpu6 : 2,3 us, 1,0 sy, 0,0 ni, 96,3 id, 0,0 wa, 0,0 hi, 0,3 si, 0,0 st %Cpu7 
: 3,3 us, 1,3 sy, 0,0 ni, 95,4 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st #### Container top - 11:16:13 up 2:08, 0 users, load average: 0.27, 0.36, 0.36 Tasks: 24 total, 1 running, 23 sleeping, 0 stopped, 0 zombie %Cpu0 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu1 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
On Tue, Jun 15, 2021 at 01:32:22PM +0200, Christian Brauner wrote: > On Thu, Jun 03, 2021 at 12:43:07PM +0200, legion@kernel.org wrote: > > From: Alexey Gladkov <legion@kernel.org> > > > > The /proc/meminfo contains information regardless of the cgroups > > restrictions. This file is still widely used [1]. This means that all > > these programs will not work correctly inside container [2][3][4]. Some > > programs try to respect the cgroups limits, but not all of them > > implement support for all cgroup versions [5]. > > > > Correct information can be obtained from cgroups, but this requires the > > cgroups to be available inside container and the correct version of > > cgroups to be supported. > > > > There is lxcfs [6] that emulates /proc/meminfo using fuse to provide > > information regarding cgroups. This patch can help them. > > > > This patch adds /proc/self/meminfo that contains a subset of > > /proc/meminfo respecting cgroup restrictions. > > > > We cannot just create /proc/self/meminfo and make a symlink at the old > > location because this will break the existing apparmor rules [7]. > > Therefore, the patch adds a separate file with the same format. > > Interesting work. Thanks. This is basically a variant of what I > suggested at Plumbers and in [1]. I made the second version of the patch [1], but then I had a conversation with Eric W. Biederman offlist. He convinced me that it is a bad idea to change all the values in meminfo to accommodate cgroups. But we agreed that MemAvailable in /proc/meminfo should respect cgroups limits. This field was created to hide implementation details when calculating available memory. You can see that it is quite widely used [2]. So I want to try to move in that direction. 
[1] https://git.kernel.org/pub/scm/linux/kernel/git/legion/linux.git/log/?h=patchset/meminfo/v2.0 [2] https://codesearch.debian.net/search?q=MemAvailable%3A > Judging from the patches sent by Waiman Long in [2] to also virtualize > /proc/cpuinfo and /sys/devices/system/cpu this is a larger push to > provide virtualized system information to containers. > > Although somewhere in the thread here this veered off into apparently > just being a way for a process to gather information about it's own > resources. At which point I'm confused why looking at its cgroups > isn't enough. I think it's not enough. As an example: $ mount -t cgroup2 none /sys/fs/cgroup $ echo +memory > /sys/fs/cgroup/cgroup.subtree_control $ mkdir /sys/fs/cgroup/mem0 $ echo +memory > /sys/fs/cgroup/mem0/cgroup.subtree_control $ mkdir /sys/fs/cgroup/mem0/mem1 $ echo $$ > /sys/fs/cgroup/mem0/mem1/cgroup.procs I didn't set a limit and just added the shell to the group. $ cat /proc/self/cgroup 0::/mem0/mem1 $ cat /sys/fs/cgroup/mem0/mem1/memory.max max $ cat /sys/fs/cgroup/mem0/memory.max max In this case we need to use MemAvailable from /proc/meminfo. Another example: $ mount -t cgroup2 none /sys/fs/cgroup $ echo +memory > /sys/fs/cgroup/cgroup.subtree_control $ mkdir /sys/fs/cgroup/mem0 $ echo $(( 3 * 1024 * 1024 )) > /sys/fs/cgroup/mem0/memory.max $ echo +memory > /sys/fs/cgroup/mem0/cgroup.subtree_control $ mkdir /sys/fs/cgroup/mem0/mem1 $ echo $(( 3 * 1024 * 1024 * 1024 * 1024 )) > /sys/fs/cgroup/mem0/mem1/memory.max $ echo $$ > /sys/fs/cgroup/mem0/mem1/cgroup.procs $ head -3 /proc/meminfo MemTotal: 1002348 kB MemFree: 972712 kB MemAvailable: 968100 kB $ cat /sys/fs/cgroup/mem0{,/mem1}/memory.max 3145728 3298534883328 Now, I have cgroup limits, but you can write absolutely any value as a limit. So how much memory is available to shell in this case? To get this value, you need to take the minimum of MemAvailable and **/memory.max. ... or I fundamentally don't understand something. 
> So /proc/self/meminfo seems to just be the start. And note the two > approaches seem to diverge too. This provides a new file while the other > patchset virtualizes existing proc files/folders. > > In any case it seems you might want to talk since afaict you're all at > the same company but don't seem to be aware of each others work (Which > happens of course.). > > For the sake of history such patchsets have been pushed for before by > the Siteground people. > > Chris and Johannes made a good point that the information provided in > this file can be gathered from cgroups already. So applications should > probably switch to reading those out of their cgroup and most are doing > that already. > > And reading values out of cgroups is pretty straightforward even with > the differences between cgroup v1 and v2. Userspace is doing it all over > the place all of the time and the code has now existed for years so the > cgroup interface is a problem. And with cgroup v2 it keeps growing so > much more useful metrics that looking at meminfo isn't really cutting it > anyway. > > So I think the argument that applications should start looking at their > cgroup info if they want to find out detailed info is a solid argument > that shouldn't be easily brushed aside. > > What might be worth is knowing exactly what applications are looking at > /proc/meminfo and /proc/cpuinfo and make decision based on that info. > None of that is clearly outlined in the thread unfortunately. > > So I immediately see two types of applications that could benefit from > this patchset. The first ones are legacy applications that aren't aware > of cgroups and aren't actively maintained. Introducing such > functionality for these applications seems a weak argument. > > The second type is new and maintained applications that look at global > info such as /proc/meminfo and /proc/cpuinfo. So such applications have > ignored cgroups for a decade now. 
This makes it very unconvincing that > they will suddenly switch to a newly introduced file. Especially if the > entries in a new file aren't a 1:1 mapping of the old file. > > Johannes made another good point about it not being clear what > applications actually want. And he's very right in that. It seems > straightforward to virtualize things like meminfo but it actually isn't. > And it's something you quite often discover after the fact. We have > extensive experience implementing it in LXCFS in userspace. People kept > and keep arguing what information exactly is supposed to go into > calculating those values based on what best helps their use-case. > > Swap was an especially contentious point. In fact, sometimes users want > to turn of swap even though it exists on the host and there's a command > line switch in LXCFS to control that behavior. > > Another example supporting Johannes worry is virtualizing /proc/cpuinfo > where some people wanted to virtualize cpu counts based on cpu shares. > So we have two modes to virtualize cpus: based on cpuset alone or based > on cpuset and cpu shares. And both modes are actively used. And that all > really depends on application and workload. > > Finally, although LXCFS is briefly referenced in the commit message but > it isn't explained very well and what it does. > > And we should consider it since this is a full existing userspace > solution to the problem solved in this patchset including Dan's JRE > use-case. > > This is a project started in 2014 and it is in production use since 2014 > and it delivers the features of this patchset here and more. > > For example, it's used in the Linux susbystem of Chromebooks, it's used > by Alibaba (see [3]) and it is used for the JRE use-case by Google's > Anthos when migrating such legacy applications (see [4]). > > At first, I was convinced we could make use of /proc/self/meminfo in > LXCFS which is why I held back but we can't. 
We can't simply bind-mount > it over /proc/meminfo because it's not a 1:1 correspondence between all > fields. We could potentially read some values we now calculate and > display it in /proc/meminfo but we can't stop virtualizing /proc/meminfo > itself. So we don't gain anything from this. When Alex asked me about it > I tried to come up with good ways to integrate this but the gain is just > too little for us. > > Because our experience tells us that applications that want this type of > virtualization don't really care about heir own resources. They care > about a virtualized view of the system's resources. And the system in > question is often a container. But it get's very tricky since we don't > really define what a container is. So what data the user wants to see > depends on the used container runtime, type of container, and workload. > An application container has very different needs than a system > container that boots systemd. LXCFS can be very flexible here and > virtualize according to the users preferences (see the split between > cpuset and cpuset + cpu shares virtualization for cpu counts). > > In any case, LXCFS is a tiny FUSE filesystem which virtualizes various > procfs and sysfs files for a container: > > /proc/cpuinfo > /proc/diskstats > /proc/meminfo > /proc/stat > /proc/swaps > /proc/uptime > /proc/slabinfo > /sys/devices/system/cpu/* > /sys/devices/system/cpu/online > > If you call top in a container that makes use of this it will display > everything virtualized to the container (See [5] for an example of > /proc/cpuinfo and /sys/devices/system/cpu/*.). And JRE will not > overallocate resources. It's actively used for all of that. > > Below at [5] you can find an example where 2 cpus out of 8 have been > assigned to the container's cpuset. The container values are virtualized > as you can see. 
> > [1]: https://lkml.org/lkml/2020/6/4/951 > [2]: https://lore.kernel.org/lkml/YMe/cGV4JPbzFRk0@slm.duckdns.org > [3]: https://www.alibabacloud.com/blog/kubernetes-demystified-using-lxcfs-to-improve-container-resource-visibility_594109 > [4]: https://cloud.google.com/blog/products/containers-kubernetes/migrate-for-anthos-streamlines-legacy-java-app-modernization > [5]: ## /proc/cpuinfo > #### Host > brauner@wittgenstein|~ > > ls -al /sys/devices/system/cpu/ | grep cpu[[:digit:]] > drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu0 > drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu1 > drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu2 > drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu3 > drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu4 > drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu5 > drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu6 > drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu7 > > #### Container > brauner@wittgenstein|~ > > lxc exec f1 -- ls -al /sys/devices/system/cpu/ | grep cpu[[:digit:]] > drwxr-xr-x 2 nobody nogroup 0 Jun 15 10:22 cpu3 > drwxr-xr-x 2 nobody nogroup 0 Jun 15 10:22 cpu4 > > ## /sys/devices/system/cpu/* > #### Host > brauner@wittgenstein|~ > > grep ^processor /proc/cpuinfo > processor : 0 > processor : 1 > processor : 2 > processor : 3 > processor : 4 > processor : 5 > processor : 6 > processor : 7 > > #### Container > brauner@wittgenstein|~ > > lxc exec f1 -- grep ^processor /proc/cpuinfo > processor : 0 > processor : 1 > > ## top > #### Host > top - 13:16:47 up 15:54, 39 users, load average: 0,76, 0,47, 0,40 > Tasks: 434 total, 1 running, 433 sleeping, 0 stopped, 0 zombie > %Cpu0 : 2,7 us, 2,4 sy, 0,0 ni, 94,5 id, 0,0 wa, 0,0 hi, 0,3 si, 0,0 st > %Cpu1 : 3,3 us, 1,3 sy, 0,0 ni, 95,3 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st > %Cpu2 : 1,6 us, 9,1 sy, 0,0 ni, 89,3 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st > %Cpu3 : 2,3 us, 1,3 sy, 0,0 ni, 96,4 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st > %Cpu4 : 2,7 us, 1,7 sy, 0,0 ni, 95,7 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st > %Cpu5 : 2,9 us, 2,9 sy, 0,0 ni, 94,1 id, 0,0 
wa, 0,0 hi, 0,0 si, 0,0 st > %Cpu6 : 2,3 us, 1,0 sy, 0,0 ni, 96,3 id, 0,0 wa, 0,0 hi, 0,3 si, 0,0 st > %Cpu7 : 3,3 us, 1,3 sy, 0,0 ni, 95,4 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st > > #### Container > top - 11:16:13 up 2:08, 0 users, load average: 0.27, 0.36, 0.36 > Tasks: 24 total, 1 running, 23 sleeping, 0 stopped, 0 zombie > %Cpu0 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st > %Cpu1 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st >
On Tue, Jun 15, 2021 at 5:47 AM Alexey Gladkov <legion@kernel.org> wrote: > [...] > > I made the second version of the patch [1], but then I had a conversation > with Eric W. Biederman offlist. He convinced me that it is a bad idea to > change all the values in meminfo to accommodate cgroups. But we agreed > that MemAvailable in /proc/meminfo should respect cgroups limits. This > field was created to hide implementation details when calculating > available memory. You can see that it is quite widely used [2]. > So I want to try to move in that direction. > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/legion/linux.git/log/?h=patchset/meminfo/v2.0 > [2] https://codesearch.debian.net/search?q=MemAvailable%3A > Please see following two links on the previous discussion on having per-memcg MemAvailable stat. [1] https://lore.kernel.org/linux-mm/alpine.DEB.2.22.394.2006281445210.855265@chino.kir.corp.google.com/ [2] https://lore.kernel.org/linux-mm/alpine.DEB.2.23.453.2007142018150.2667860@chino.kir.corp.google.com/ MemAvailable itself is an imprecise metric and involving memcg makes this metric even more weird. The difference of semantics of swap accounting of v1 and v2 is one source of this weirdness (I have not checked your patch if it is handling this weirdness). The lazyfree and deferred split pages are another source. So, I am not sure if complicating an already imprecise metric will make it more useful.
Shakeel Butt <shakeelb@google.com> writes: > On Tue, Jun 15, 2021 at 5:47 AM Alexey Gladkov <legion@kernel.org> wrote: >> > [...] >> >> I made the second version of the patch [1], but then I had a conversation >> with Eric W. Biederman offlist. He convinced me that it is a bad idea to >> change all the values in meminfo to accommodate cgroups. But we agreed >> that MemAvailable in /proc/meminfo should respect cgroups limits. This >> field was created to hide implementation details when calculating >> available memory. You can see that it is quite widely used [2]. >> So I want to try to move in that direction. >> >> [1] https://git.kernel.org/pub/scm/linux/kernel/git/legion/linux.git/log/?h=patchset/meminfo/v2.0 >> [2] https://codesearch.debian.net/search?q=MemAvailable%3A >> > > Please see following two links on the previous discussion on having > per-memcg MemAvailable stat. > > [1] https://lore.kernel.org/linux-mm/alpine.DEB.2.22.394.2006281445210.855265@chino.kir.corp.google.com/ > [2] https://lore.kernel.org/linux-mm/alpine.DEB.2.23.453.2007142018150.2667860@chino.kir.corp.google.com/ > > MemAvailable itself is an imprecise metric and involving memcg makes > this metric even more weird. The difference of semantics of swap > accounting of v1 and v2 is one source of this weirdness (I have not > checked your patch if it is handling this weirdness). The lazyfree and > deferred split pages are another source. > > So, I am not sure if complicating an already imprecise metric will > make it more useful. Making a good guess at how much memory can be allocated without triggering swapping or otherwise stressing the system is something that requires understanding our mm internals. To be able to continue changing the mm or even mm policy without introducing regressions in userspace we need to export values that userspace can use. At a first approximation that seems to look like MemAvailable. MemAvailable seems to have a good definition. 
Roughly the amount of memory that can be allocated without triggering swapping. Updated to include not triggering memory cgroup based swapping and it sounds good. I don't know if it will work in practice but I think it is worth exploring. I do know that hiding the implementation details and providing userspace with information it can directly use seems like the programming model that needs to be explored. Most programs should not care if they are in a memory cgroup, etc. Programs, load management systems, and even balloon drivers have a legitimate interest in how much additional load can be placed on a system's memory. A version of this that I remember working fairly well is free space on compressed filesystems. As I recall compressed filesystems report the amount of uncompressed space that is available (an underestimate). This results in the amount of space consumed going up faster than the free space goes down. We can't do exactly the same thing with our memory usability estimate, but having our estimate be a reliable underestimate might be enough to avoid problems with reporting too much memory as available to userspace. I know that MemAvailable already does that /2 so maybe it is already aiming at being an underestimate. Perhaps we need some additional accounting to help create a useful metric for userspace as well. I don't know the final answer. I do know that not designing an interface that userspace can use to deal with its legitimate concerns is sticking our collective heads in the sand and wishing the problem will go away. Eric
On Wed 16-06-21 11:17:38, Eric W. Biederman wrote: [...] > MemAvailable seems to have a good definition. Roughly the amount of > memory that can be allocated without triggering swapping. Updated > to include not trigger memory cgroup based swapping and I sounds good. yes this definition is at least understandable but how do you want to define it in the memcg scope? There are two different sources of memory pressure when dealing with memcgs. An internal one when a limit is hit and an external one when the source of the reclaim comes from higher up the hierarchy (including the global memory pressure). The former would be quite easy to mimic with the global semantic but the latter will get much more complex very quickly - a) you would need a snapshot of the whole cgroup tree and evaluate it against the global memory state b) you would have to consider memory reclaim protection c) the external memory pressure is distributed proportionally to the size most of the time which is yet another complication. And other challenges that have been already discussed. That being said, this might be possible to implement but I am not really sure this is viable and I strongly suspect that it will get unreliable in many situations in the context of "how much you can allocate without swapping".
On Wed, Jun 16, 2021 at 9:17 AM Eric W. Biederman <ebiederm@xmission.com> wrote: > > Shakeel Butt <shakeelb@google.com> writes: > > > On Tue, Jun 15, 2021 at 5:47 AM Alexey Gladkov <legion@kernel.org> wrote: > >> > > [...] > >> > >> I made the second version of the patch [1], but then I had a conversation > >> with Eric W. Biederman offlist. He convinced me that it is a bad idea to > >> change all the values in meminfo to accommodate cgroups. But we agreed > >> that MemAvailable in /proc/meminfo should respect cgroups limits. This > >> field was created to hide implementation details when calculating > >> available memory. You can see that it is quite widely used [2]. > >> So I want to try to move in that direction. > >> > >> [1] https://git.kernel.org/pub/scm/linux/kernel/git/legion/linux.git/log/?h=patchset/meminfo/v2.0 > >> [2] https://codesearch.debian.net/search?q=MemAvailable%3A > >> > > > > Please see following two links on the previous discussion on having > > per-memcg MemAvailable stat. > > > > [1] https://lore.kernel.org/linux-mm/alpine.DEB.2.22.394.2006281445210.855265@chino.kir.corp.google.com/ > > [2] https://lore.kernel.org/linux-mm/alpine.DEB.2.23.453.2007142018150.2667860@chino.kir.corp.google.com/ > > > > MemAvailable itself is an imprecise metric and involving memcg makes > > this metric even more weird. The difference of semantics of swap > > accounting of v1 and v2 is one source of this weirdness (I have not > > checked your patch if it is handling this weirdness). The lazyfree and > > deferred split pages are another source. > > > > So, I am not sure if complicating an already imprecise metric will > > make it more useful. > > Making a good guess at how much memory can be allocated without > triggering swapping or otherwise stressing the system is something that > requires understanding our mm internals. 
> > To be able to continue changing the mm or even mm policy without > introducing regressions in userspace we need to export values that > userspace can use. The issue is the dependence of such exported values on mm internals. MM internal code and policy changes will change this value and there is a potential of userspace regression. > > At a first approximation that seems to look like MemAvailable. > > MemAvailable seems to have a good definition. Roughly the amount of > memory that can be allocated without triggering swapping. Nowadays, I don't think MemAvailable giving "amount of memory that can be allocated without triggering swapping" is even roughly accurate. Actually IMO "without triggering swap" is not something an application should concern itself with where refaults from some swap types (zswap/swap-on-zram) are much faster than refaults from disk. > Updated > to include not trigger memory cgroup based swapping and I sounds good. > > I don't know if it will work in practice but I think it is worth > exploring. I agree. > > I do know that hiding the implementation details and providing userspace > with information it can directly use seems like the programming model > that needs to be explored. Most programs should not care if they are in > a memory cgroup, etc. Programs, load management systems, and even > balloon drivers have a legitimately interest in how much additional load > can be placed on a systems memory. > How much additional load can be placed on a system *until what*. I think we should focus more on the "until" part to make the problem more tractable. > > A version of this that I remember working fairly well is free space > on compressed filesystems. As I recall compressed filesystems report > the amount of uncompressed space that is available (an underestimate). > This results in the amount of space consumed going up faster than the > free space goes down. 
> > We can't do exactly the same thing with our memory usability estimate, > but having our estimate be a reliable underestimate might be enough > to avoid problems with reporting too much memory as available to > userspace. > > I know that MemAvailable already does that /2 so maybe it is already > aiming at being an underestimate. Perhaps we need some additional > accounting to help create a useful metric for userspace as well. > The real challenge here is that we are not 100% sure if a page is reclaimable until we try to reclaim it. For example we might have file lrus filled with lazyfree pages which might have been accessed. MemAvailable will show half the size of file lrus but once we try to reclaim them, we have to move them back to the anon lru and there is a drastic drop in MemAvailable. > > I don't know the final answer. I do know that not designing an > interface that userspace can use to deal with it's legitimate concerns > is sticking our collective heads in the sand and wishing the problem > will go away. I am a bit skeptical that a single interface would be enough but first we should formalize what exactly the application wants with some concrete use-cases. More specifically, are the applications interested in avoiding swapping or OOM or stall? Second, is the reactive approach acceptable? Instead of an upfront number representing the room for growth, how about just grow and backoff when some event (oom or stall) which we want to avoid is about to happen? This is achievable today for oom and stall with PSI and memory.high and it avoids the hard problem of reliably estimating the reclaimable memory.
On 19.06.21 01:38, Shakeel Butt wrote: > Nowadays, I don't think MemAvailable giving "amount of memory that can > be allocated without triggering swapping" is even roughly accurate. > Actually IMO "without triggering swap" is not something an application > should concern itself with where refaults from some swap types > (zswap/swap-on-zram) are much faster than refaults from disk. If we're talking about things like database workloads, there IMHO isn't anything really better than doing measurements with the actual loads and tuning incrementally. But: what is the actual optimization goal, why might an application want to know where swapping begins ? Computing performance ? Caching + IO Latency or throughput ? Network traffic (e.g. w/ iscsi) ? Power consumption ? >> I do know that hiding the implementation details and providing userspace >> with information it can directly use seems like the programming model >> that needs to be explored. Most programs should not care if they are in >> a memory cgroup, etc. Programs, load management systems, and even >> balloon drivers have a legitimately interest in how much additional load >> can be placed on a systems memory. What kind of load exactly ? CPU ? disk IO ? network ? > How much additional load can be placed on a system *until what*. I > think we should focus more on the "until" part to make the problem > more tractable. ACK. The interesting question is what to do in that case. An obvious move by a database system could be eg. filling only as much cache as there's spare physical RAM, in order to avoid useless swapping (since we'd potentially produce more IO load when a cache is written out to swap, instead of just discarding it) But, this also depends ... #1: the application doesn't know the actual performance of the swap device, eg. the already mentioned zswap+friends, or some fast nvmem for swap vs disk for storage.
#2: caches might also be implemented indirectly by mmap()ing the storage file/device and so using the kernel's cache here. in that case, the kernel would automatically discard the pages w/o going to swap. of course that only works if the cache is nothing but copying pages from storage into ram. A completely different scenario would be load management on a cluster like k8s. Here we usually care about cluster performance (don't care about individual nodes so much), but wanna prevent individual nodes from being overloaded. Since we usually don't know much about the individual workload, we probably don't have much other chance than continuous monitoring and acting when a node is getting too busy - or trying to balance when new workloads are started, on current system load (and other metrics). In that case, I don't see where this new proc file should be of much help. > Second, is the reactive approach acceptable? Instead of an upfront > number representing the room for growth, how about just grow and > backoff when some event (oom or stall) which we want to avoid is about > to happen? This is achievable today for oom and stall with PSI and > memory.high and it avoids the hard problem of reliably estimating the > reclaimable memory. I tend to believe that for certain use cases it would be helpful if an application gets notified if some of its pages are soon getting swapped out due to memory pressure. Then it could decide on its own whether it should drop certain caches in order to prevent swapping. --mtx
diff --git a/fs/proc/base.c b/fs/proc/base.c index 58bbf334265b..e95837cf713f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3269,6 +3269,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif + ONE("meminfo", S_IRUGO, proc_meminfo_show), }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3602,6 +3603,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif + ONE("meminfo", S_IRUGO, proc_meminfo_show), }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 03415f3fb3a8..a6e8540afbd3 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -241,6 +241,12 @@ extern int proc_net_init(void); static inline int proc_net_init(void) { return 0; } #endif +/* + * meminfo.c + */ +extern int proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk); + /* * proc_self.c */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6fa761c9cc78..3587a79d4b96 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -16,6 +16,9 @@ #ifdef CONFIG_CMA #include <linux/cma.h> #endif +#ifdef CONFIG_MEMCG +#include <linux/memcontrol.h> +#endif #include <asm/page.h> #include "internal.h" @@ -23,91 +26,112 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) { } +static void proc_fill_meminfo(struct meminfo *mi) +{ + int lru; + long cached; + + si_meminfo(&mi->si); + si_swapinfo(&mi->si); + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + mi->pages[lru] = global_node_page_state(NR_LRU_BASE + lru); + + cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - mi->si.bufferram; + if (cached < 0) + cached = 0; + + mi->cached = cached; + mi->swapcached = total_swapcache_pages(); + 
mi->slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + mi->slab_unreclaimable = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); + mi->anon_pages = global_node_page_state(NR_ANON_MAPPED); + mi->mapped = global_node_page_state(NR_FILE_MAPPED); + mi->nr_pagetable = global_node_page_state(NR_PAGETABLE); + mi->dirty_pages = global_node_page_state(NR_FILE_DIRTY); + mi->writeback_pages = global_node_page_state(NR_WRITEBACK); +} + +#ifdef CONFIG_MEMCG +static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task) +{ + mem_fill_meminfo(mi, task); +} +#else +static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task) +{ + proc_fill_meminfo(mi); +} +#endif + static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } +static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi) +{ + show_val_kb(m, "MemTotal: ", mi->si.totalram); + show_val_kb(m, "MemFree: ", mi->si.freeram); + show_val_kb(m, "MemAvailable: ", si_mem_available_mi(mi)); + show_val_kb(m, "Buffers: ", mi->si.bufferram); + show_val_kb(m, "Cached: ", mi->cached); + show_val_kb(m, "SwapCached: ", mi->swapcached); + show_val_kb(m, "Active: ", mi->pages[LRU_ACTIVE_ANON] + mi->pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive: ", mi->pages[LRU_INACTIVE_ANON] + mi->pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Active(anon): ", mi->pages[LRU_ACTIVE_ANON]); + show_val_kb(m, "Inactive(anon): ", mi->pages[LRU_INACTIVE_ANON]); + show_val_kb(m, "Active(file): ", mi->pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive(file): ", mi->pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Unevictable: ", mi->pages[LRU_UNEVICTABLE]); + +#ifdef CONFIG_HIGHMEM + show_val_kb(m, "HighTotal: ", mi->si.totalhigh); + show_val_kb(m, "HighFree: ", mi->si.freehigh); + show_val_kb(m, "LowTotal: ", mi->si.totalram - mi->si.totalhigh); + show_val_kb(m, "LowFree: ", 
mi->si.freeram - mi->si.freehigh); +#endif + + show_val_kb(m, "SwapTotal: ", mi->si.totalswap); + show_val_kb(m, "SwapFree: ", mi->si.freeswap); + show_val_kb(m, "Dirty: ", mi->dirty_pages); + show_val_kb(m, "Writeback: ", mi->writeback_pages); + + show_val_kb(m, "AnonPages: ", mi->anon_pages); + show_val_kb(m, "Mapped: ", mi->mapped); + show_val_kb(m, "Shmem: ", mi->si.sharedram); + show_val_kb(m, "Slab: ", mi->slab_reclaimable + mi->slab_unreclaimable); + show_val_kb(m, "SReclaimable: ", mi->slab_reclaimable); + show_val_kb(m, "SUnreclaim: ", mi->slab_unreclaimable); + show_val_kb(m, "PageTables: ", mi->nr_pagetable); + + return 0; +} + static int meminfo_proc_show(struct seq_file *m, void *v) { - struct sysinfo i; - unsigned long committed; - long cached; - long available; - unsigned long pages[NR_LRU_LISTS]; - unsigned long sreclaimable, sunreclaim; - int lru; - si_meminfo(&i); - si_swapinfo(&i); - committed = vm_memory_committed(); + struct meminfo mi; - cached = global_node_page_state(NR_FILE_PAGES) - - total_swapcache_pages() - i.bufferram; - if (cached < 0) - cached = 0; + proc_fill_meminfo(&mi); + meminfo_proc_show_mi(m, &mi); - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); - - available = si_mem_available(); - sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); - sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); - - show_val_kb(m, "MemTotal: ", i.totalram); - show_val_kb(m, "MemFree: ", i.freeram); - show_val_kb(m, "MemAvailable: ", available); - show_val_kb(m, "Buffers: ", i.bufferram); - show_val_kb(m, "Cached: ", cached); - show_val_kb(m, "SwapCached: ", total_swapcache_pages()); - show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + - pages[LRU_ACTIVE_FILE]); - show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + - pages[LRU_INACTIVE_FILE]); - show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); - show_val_kb(m, "Inactive(anon): ", 
pages[LRU_INACTIVE_ANON]); - show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); - show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); - show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); -#ifdef CONFIG_HIGHMEM - show_val_kb(m, "HighTotal: ", i.totalhigh); - show_val_kb(m, "HighFree: ", i.freehigh); - show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh); - show_val_kb(m, "LowFree: ", i.freeram - i.freehigh); -#endif - #ifndef CONFIG_MMU show_val_kb(m, "MmapCopy: ", (unsigned long)atomic_long_read(&mmap_pages_allocated)); #endif - show_val_kb(m, "SwapTotal: ", i.totalswap); - show_val_kb(m, "SwapFree: ", i.freeswap); - show_val_kb(m, "Dirty: ", - global_node_page_state(NR_FILE_DIRTY)); - show_val_kb(m, "Writeback: ", - global_node_page_state(NR_WRITEBACK)); - show_val_kb(m, "AnonPages: ", - global_node_page_state(NR_ANON_MAPPED)); - show_val_kb(m, "Mapped: ", - global_node_page_state(NR_FILE_MAPPED)); - show_val_kb(m, "Shmem: ", i.sharedram); - show_val_kb(m, "KReclaimable: ", sreclaimable + + show_val_kb(m, "KReclaimable: ", mi.slab_reclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); - show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); - show_val_kb(m, "SReclaimable: ", sreclaimable); - show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_node_page_state(NR_KERNEL_STACK_KB)); #ifdef CONFIG_SHADOW_CALL_STACK seq_printf(m, "ShadowCallStack:%8lu kB\n", global_node_page_state(NR_KERNEL_SCS_KB)); #endif - show_val_kb(m, "PageTables: ", - global_node_page_state(NR_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", @@ -115,7 +139,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); - show_val_kb(m, "Committed_AS: ", committed); + show_val_kb(m, "Committed_AS: ", 
vm_memory_committed()); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); @@ -153,6 +177,20 @@ static int meminfo_proc_show(struct seq_file *m, void *v) return 0; } +int proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct meminfo mi; + + fill_meminfo(&mi, task); + + meminfo_proc_show_mi(m, &mi); + hugetlb_report_meminfo(m); + arch_report_meminfo(m); + + return 0; +} + static int __init proc_meminfo_init(void) { proc_create_single("meminfo", 0, NULL, meminfo_proc_show); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c193be760709..4a7e2894954f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1119,6 +1119,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); +void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 diff --git a/include/linux/mm.h b/include/linux/mm.h index c274f75efcf9..7faeaddd5b88 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2467,6 +2467,20 @@ static inline int early_pfn_to_nid(unsigned long pfn) extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif +struct meminfo { + struct sysinfo si; + unsigned long pages[NR_LRU_LISTS]; + unsigned long cached; + unsigned long swapcached; + unsigned long anon_pages; + unsigned long mapped; + unsigned long nr_pagetable; + unsigned long dirty_pages; + unsigned long writeback_pages; + unsigned long slab_reclaimable; + unsigned long slab_unreclaimable; +}; + extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, @@ -2477,6 +2491,7 @@ extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init 
mmap_init(void); extern void show_mem(unsigned int flags, nodemask_t *nodemask); +extern long si_mem_available_mi(struct meminfo *mi); extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 64ada9e650a5..344b546f9e25 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3750,6 +3750,86 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, return nr; } +static void mem_cgroup_nr_pages(struct mem_cgroup *memcg, int nid, unsigned long *pages) +{ + struct mem_cgroup *iter; + int i; + + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < NR_LRU_LISTS; i++) + pages[i] += mem_cgroup_node_nr_lru_pages(iter, nid, BIT(i), false); + } +} + +static void mem_cgroup_si_meminfo(struct sysinfo *si, struct task_struct *task) +{ + unsigned long memtotal, memused, swapsize; + struct mem_cgroup *memcg; + struct cgroup_subsys_state *css; + + css = task_css(task, memory_cgrp_id); + memcg = mem_cgroup_from_css(css); + + memtotal = READ_ONCE(memcg->memory.max); + + if (memtotal != PAGE_COUNTER_MAX) { + memused = page_counter_read(&memcg->memory); + + si->totalram = memtotal; + si->freeram = (memtotal > memused ? memtotal - memused : 0); + si->sharedram = memcg_page_state(memcg, NR_SHMEM); + + si->bufferram = nr_blockdev_pages(); + si->totalhigh = totalhigh_pages(); + si->freehigh = nr_free_highpages(); + si->mem_unit = PAGE_SIZE; + } else { + si_meminfo(si); + memused = si->totalram - si->freeram; + } + + swapsize = READ_ONCE(memcg->memsw.max); + + if (swapsize != PAGE_COUNTER_MAX) { + unsigned long swaptotal, swapused; + + swaptotal = swapsize - memtotal; + swapused = page_counter_read(&memcg->memsw) - memused; + si->totalswap = swaptotal; + /* Due to global reclaim, memory.memsw.usage can be greater than + * (memory.memsw.max - memory.max). */ + si->freeswap = (swaptotal > swapused ? 
swaptotal - swapused : 0); + } else { + si_swapinfo(si); + } + + css_put(css); +} + +void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task) +{ + struct cgroup_subsys_state *memcg_css = task_css(task, memory_cgrp_id); + struct mem_cgroup *memcg = mem_cgroup_from_css(memcg_css); + int nid; + + memset(&mi->pages, 0, sizeof(mi->pages)); + + mem_cgroup_si_meminfo(&mi->si, task); + + for_each_online_node(nid) + mem_cgroup_nr_pages(memcg, nid, mi->pages); + + mi->slab_reclaimable = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B); + mi->slab_unreclaimable = memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); + mi->cached = memcg_page_state(memcg, NR_FILE_PAGES); + mi->swapcached = memcg_page_state(memcg, NR_SWAPCACHE); + mi->anon_pages = memcg_page_state(memcg, NR_ANON_MAPPED); + mi->mapped = memcg_page_state(memcg, NR_FILE_MAPPED); + mi->nr_pagetable = memcg_page_state(memcg, NR_PAGETABLE); + mi->dirty_pages = memcg_page_state(memcg, NR_FILE_DIRTY); + mi->writeback_pages = memcg_page_state(memcg, NR_WRITEBACK); +} + static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aaa1655cf682..0a3c9dcd2c13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5551,18 +5551,13 @@ static inline void show_node(struct zone *zone) printk("Node %d ", zone_to_nid(zone)); } -long si_mem_available(void) +long si_mem_available_mi(struct meminfo *mi) { long available; unsigned long pagecache; unsigned long wmark_low = 0; - unsigned long pages[NR_LRU_LISTS]; unsigned long reclaimable; struct zone *zone; - int lru; - - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) wmark_low += low_wmark_pages(zone); @@ -5571,14 +5566,14 @@ long si_mem_available(void) * Estimate the amount of memory available for userspace allocations, * without causing swapping. 
*/ - available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; + available = mi->si.freeram - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will * start swapping. Assume at least half of the page cache, or the * low watermark worth of cache, needs to stay. */ - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache = mi->pages[LRU_ACTIVE_FILE] + mi->pages[LRU_INACTIVE_FILE]; pagecache -= min(pagecache / 2, wmark_low); available += pagecache; @@ -5587,14 +5582,27 @@ long si_mem_available(void) * items that are in use, and cannot be freed. Cap this estimate at the * low watermark. */ - reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + reclaimable = mi->slab_reclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) available = 0; return available; } + +long si_mem_available(void) +{ + struct meminfo mi; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + mi.pages[lru] = global_node_page_state(NR_LRU_BASE + lru); + + mi.si.freeram = global_zone_page_state(NR_FREE_PAGES); + mi.slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + + return si_mem_available_mi(&mi); +} EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val)