Message ID | 20200608230819.832349-4-guro@fb.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | mm: memcg accounting of percpu memory | expand |
On Mon, Jun 08, 2020 at 04:08:17PM -0700, Roman Gushchin wrote: > Percpu memory can represent a noticeable chunk of the total > memory consumption, especially on big machines with many CPUs. > Let's track percpu memory usage for each memcg and display > it in memory.stat. > > A percpu allocation is usually scattered over multiple pages > (and nodes), and can be significantly smaller than a page. > So let's add a byte-sized counter on the memcg level: > MEMCG_PERCPU_B. Byte-sized vmstat infra created for slabs > can be perfectly reused for percpu case. > > Signed-off-by: Roman Gushchin <guro@fb.com> > Acked-by: Dennis Zhou <dennis@kernel.org> > --- > Documentation/admin-guide/cgroup-v2.rst | 4 ++++ > include/linux/memcontrol.h | 8 ++++++++ > mm/memcontrol.c | 4 +++- > mm/percpu.c | 10 ++++++++++ > 4 files changed, 25 insertions(+), 1 deletion(-) > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > index ce3e05e41724..7c1e784239bf 100644 > --- a/Documentation/admin-guide/cgroup-v2.rst > +++ b/Documentation/admin-guide/cgroup-v2.rst > @@ -1274,6 +1274,10 @@ PAGE_SIZE multiple when read back. > Amount of memory used for storing in-kernel data > structures. > > + percpu > + Amount of memory used for storing per-cpu kernel > + data structures. > + > sock > Amount of memory used in network transmission buffers > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index eede46c43573..7ed3af71a6fb 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -32,11 +32,19 @@ struct kmem_cache; > enum memcg_stat_item { > MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, > MEMCG_SOCK, > + MEMCG_PERCPU_B, > /* XXX: why are these zone and not node counters? 
*/ > MEMCG_KERNEL_STACK_KB, > MEMCG_NR_STAT, > }; > > +static __always_inline bool memcg_stat_item_in_bytes(enum memcg_stat_item item) > +{ > + if (item == MEMCG_PERCPU_B) > + return true; > + return vmstat_item_in_bytes(item); This patch is now in -next and this line causes a warning from clang, which shows up in every translation unit that includes this header, which is a lot: include/linux/memcontrol.h:45:30: warning: implicit conversion from enumeration type 'enum memcg_stat_item' to different enumeration type 'enum node_stat_item' [-Wenum-conversion] return vmstat_item_in_bytes(item); ~~~~~~~~~~~~~~~~~~~~ ^~~~ 1 warning generated. I assume this conversion is intentional; if so, it seems like expecting a specific enum is misleading. Perhaps this should be applied on top? Cheers, Nathan diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2499f78cf32d..bddeb4ce7a4f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -38,7 +38,7 @@ enum memcg_stat_item { MEMCG_NR_STAT, }; -static __always_inline bool memcg_stat_item_in_bytes(enum memcg_stat_item item) +static __always_inline bool memcg_stat_item_in_bytes(int item) { if (item == MEMCG_PERCPU_B) return true; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 084ee1c17160..52d7961a24f0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -211,7 +211,7 @@ enum node_stat_item { * measured in pages). This defines the API part, the internal representation * might be different. */ -static __always_inline bool vmstat_item_in_bytes(enum node_stat_item item) +static __always_inline bool vmstat_item_in_bytes(int item) { /* * Global and per-node slab counters track slab pages.
On Sun, Jun 21, 2020 at 06:48:03PM -0700, Nathan Chancellor wrote: > On Mon, Jun 08, 2020 at 04:08:17PM -0700, Roman Gushchin wrote: > > Percpu memory can represent a noticeable chunk of the total > > memory consumption, especially on big machines with many CPUs. > > Let's track percpu memory usage for each memcg and display > > it in memory.stat. > > > > A percpu allocation is usually scattered over multiple pages > > (and nodes), and can be significantly smaller than a page. > > So let's add a byte-sized counter on the memcg level: > > MEMCG_PERCPU_B. Byte-sized vmstat infra created for slabs > > can be perfectly reused for percpu case. > > > > Signed-off-by: Roman Gushchin <guro@fb.com> > > Acked-by: Dennis Zhou <dennis@kernel.org> > > --- > > Documentation/admin-guide/cgroup-v2.rst | 4 ++++ > > include/linux/memcontrol.h | 8 ++++++++ > > mm/memcontrol.c | 4 +++- > > mm/percpu.c | 10 ++++++++++ > > 4 files changed, 25 insertions(+), 1 deletion(-) > > > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > > index ce3e05e41724..7c1e784239bf 100644 > > --- a/Documentation/admin-guide/cgroup-v2.rst > > +++ b/Documentation/admin-guide/cgroup-v2.rst > > @@ -1274,6 +1274,10 @@ PAGE_SIZE multiple when read back. > > Amount of memory used for storing in-kernel data > > structures. > > > > + percpu > > + Amount of memory used for storing per-cpu kernel > > + data structures. > > + > > sock > > Amount of memory used in network transmission buffers > > > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > > index eede46c43573..7ed3af71a6fb 100644 > > --- a/include/linux/memcontrol.h > > +++ b/include/linux/memcontrol.h > > @@ -32,11 +32,19 @@ struct kmem_cache; > > enum memcg_stat_item { > > MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, > > MEMCG_SOCK, > > + MEMCG_PERCPU_B, > > /* XXX: why are these zone and not node counters? 
*/ > > MEMCG_KERNEL_STACK_KB, > > MEMCG_NR_STAT, > > }; > > > > +static __always_inline bool memcg_stat_item_in_bytes(enum memcg_stat_item item) > > +{ > > + if (item == MEMCG_PERCPU_B) > > + return true; > > + return vmstat_item_in_bytes(item); > > This patch is now in -next and this line causes a warning from clang, > which shows up in every translation unit that includes this header, > which is a lot: > > include/linux/memcontrol.h:45:30: warning: implicit conversion from > enumeration type 'enum memcg_stat_item' to different enumeration type > 'enum node_stat_item' [-Wenum-conversion] > return vmstat_item_in_bytes(item); > ~~~~~~~~~~~~~~~~~~~~ ^~~~ > 1 warning generated. > > I assume this conversion is intentional; if so, it seems like expecting > a specific enum is misleading. Perhaps this should be applied on top? Hi Nathan! Yeah, these enums are kind of stacked on each other, so memcg_stat values extend node_stat values. And I think your patch is correct. I'm going to refresh the series with some small fixups. If you're not against it, I'll merge your patch into the corresponding patches. And thank you for reporting the problem! > > Cheers, > Nathan > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 2499f78cf32d..bddeb4ce7a4f 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -38,7 +38,7 @@ enum memcg_stat_item { > MEMCG_NR_STAT, > }; > > -static __always_inline bool memcg_stat_item_in_bytes(enum memcg_stat_item item) > +static __always_inline bool memcg_stat_item_in_bytes(int item) > { > if (item == MEMCG_PERCPU_B) > return true; > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 084ee1c17160..52d7961a24f0 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -211,7 +211,7 @@ enum node_stat_item { > * measured in pages). This defines the API part, the internal representation > * might be different. 
> */ > -static __always_inline bool vmstat_item_in_bytes(enum node_stat_item item) > +static __always_inline bool vmstat_item_in_bytes(int item) > { > /* > * Global and per-node slab counters track slab pages.
On Sun, Jun 21, 2020 at 08:26:35PM -0700, Roman Gushchin wrote: > On Sun, Jun 21, 2020 at 06:48:03PM -0700, Nathan Chancellor wrote: > > On Mon, Jun 08, 2020 at 04:08:17PM -0700, Roman Gushchin wrote: > > > Percpu memory can represent a noticeable chunk of the total > > > memory consumption, especially on big machines with many CPUs. > > > Let's track percpu memory usage for each memcg and display > > > it in memory.stat. > > > > > > A percpu allocation is usually scattered over multiple pages > > > (and nodes), and can be significantly smaller than a page. > > > So let's add a byte-sized counter on the memcg level: > > > MEMCG_PERCPU_B. Byte-sized vmstat infra created for slabs > > > can be perfectly reused for percpu case. > > > > > > Signed-off-by: Roman Gushchin <guro@fb.com> > > > Acked-by: Dennis Zhou <dennis@kernel.org> > > > --- > > > Documentation/admin-guide/cgroup-v2.rst | 4 ++++ > > > include/linux/memcontrol.h | 8 ++++++++ > > > mm/memcontrol.c | 4 +++- > > > mm/percpu.c | 10 ++++++++++ > > > 4 files changed, 25 insertions(+), 1 deletion(-) > > > > > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > > > index ce3e05e41724..7c1e784239bf 100644 > > > --- a/Documentation/admin-guide/cgroup-v2.rst > > > +++ b/Documentation/admin-guide/cgroup-v2.rst > > > @@ -1274,6 +1274,10 @@ PAGE_SIZE multiple when read back. > > > Amount of memory used for storing in-kernel data > > > structures. > > > > > > + percpu > > > + Amount of memory used for storing per-cpu kernel > > > + data structures. 
> > > + > > > sock > > > Amount of memory used in network transmission buffers > > > > > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > > > index eede46c43573..7ed3af71a6fb 100644 > > > --- a/include/linux/memcontrol.h > > > +++ b/include/linux/memcontrol.h > > > @@ -32,11 +32,19 @@ struct kmem_cache; > > > enum memcg_stat_item { > > > MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, > > > MEMCG_SOCK, > > > + MEMCG_PERCPU_B, > > > /* XXX: why are these zone and not node counters? */ > > > MEMCG_KERNEL_STACK_KB, > > > MEMCG_NR_STAT, > > > }; > > > > > > +static __always_inline bool memcg_stat_item_in_bytes(enum memcg_stat_item item) > > > +{ > > > + if (item == MEMCG_PERCPU_B) > > > + return true; > > > + return vmstat_item_in_bytes(item); > > > > This patch is now in -next and this line causes a warning from clang, > > which shows up in every translation unit that includes this header, > > which is a lot: > > > > include/linux/memcontrol.h:45:30: warning: implicit conversion from > > enumeration type 'enum memcg_stat_item' to different enumeration type > > 'enum node_stat_item' [-Wenum-conversion] > > return vmstat_item_in_bytes(item); > > ~~~~~~~~~~~~~~~~~~~~ ^~~~ > > 1 warning generated. > > > > I assume this conversion is intentional; if so, it seems like expecting > > a specific enum is misleading. Perhaps this should be applied on top? > > Hi Nathan! > > Yeah, these enums are kind of stacked on each other, so memcg_stat values > extend node_stat values. And I think your patch is correct. Yeah, I figured. It happens in the kernel in a couple of different places that we have seen so far. So far, my suggestion seems to be the best one that we have uncovered when dealing with one enum that supplements or extends another. These functions are fairly self-explanatory so I don't think that blowing away the type safety here is a big deal. > I'm going to refresh the series with some small fixups. 
If you're not against > it, I'll merge your patch into the corresponding patches. Please do! Thanks for the quick response. Cheers, Nathan > And thank you for reporting the problem! > > > > > Cheers, > > Nathan > > > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > > index 2499f78cf32d..bddeb4ce7a4f 100644 > > --- a/include/linux/memcontrol.h > > +++ b/include/linux/memcontrol.h > > @@ -38,7 +38,7 @@ enum memcg_stat_item { > > MEMCG_NR_STAT, > > }; > > > > -static __always_inline bool memcg_stat_item_in_bytes(enum memcg_stat_item item) > > +static __always_inline bool memcg_stat_item_in_bytes(int item) > > { > > if (item == MEMCG_PERCPU_B) > > return true; > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > > index 084ee1c17160..52d7961a24f0 100644 > > --- a/include/linux/mmzone.h > > +++ b/include/linux/mmzone.h > > @@ -211,7 +211,7 @@ enum node_stat_item { > > * measured in pages). This defines the API part, the internal representation > > * might be different. > > */ > > -static __always_inline bool vmstat_item_in_bytes(enum node_stat_item item) > > +static __always_inline bool vmstat_item_in_bytes(int item) > > { > > /* > > * Global and per-node slab counters track slab pages.
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index ce3e05e41724..7c1e784239bf 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1274,6 +1274,10 @@ PAGE_SIZE multiple when read back. Amount of memory used for storing in-kernel data structures. + percpu + Amount of memory used for storing per-cpu kernel + data structures. + sock Amount of memory used in network transmission buffers diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index eede46c43573..7ed3af71a6fb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -32,11 +32,19 @@ struct kmem_cache; enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, + MEMCG_PERCPU_B, /* XXX: why are these zone and not node counters? */ MEMCG_KERNEL_STACK_KB, MEMCG_NR_STAT, }; +static __always_inline bool memcg_stat_item_in_bytes(enum memcg_stat_item item) +{ + if (item == MEMCG_PERCPU_B) + return true; + return vmstat_item_in_bytes(item); +} + enum memcg_memory_event { MEMCG_LOW, MEMCG_HIGH, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93b2e73ef2f7..839b4d890a90 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -783,7 +783,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) if (mem_cgroup_disabled()) return; - if (vmstat_item_in_bytes(idx)) + if (memcg_stat_item_in_bytes(idx)) threshold <<= PAGE_SHIFT; x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); @@ -1490,6 +1490,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg) seq_buf_printf(&s, "slab %llu\n", (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B))); + seq_buf_printf(&s, "percpu %llu\n", + (u64)memcg_page_state(memcg, MEMCG_PERCPU_B)); seq_buf_printf(&s, "sock %llu\n", (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE); diff --git a/mm/percpu.c b/mm/percpu.c index 8ebd9fe30430..18d3d049bf91 100644 --- 
a/mm/percpu.c +++ b/mm/percpu.c @@ -1608,6 +1608,11 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, if (chunk) { chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; + + rcu_read_lock(); + mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, + size * num_possible_cpus()); + rcu_read_unlock(); } else { obj_cgroup_uncharge(objcg, size * num_possible_cpus()); obj_cgroup_put(objcg); @@ -1626,6 +1631,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + rcu_read_lock(); + mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, + -(size * num_possible_cpus())); + rcu_read_unlock(); + obj_cgroup_put(objcg); }