Message ID | 20200127173453.2089565-17-guro@fb.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | The new cgroup slab memory controller | expand |
On Mon, Jan 27, 2020 at 09:34:41AM -0800, Roman Gushchin wrote: > Allocate and release memory to store obj_cgroup pointers for each > non-root slab page. Reuse page->mem_cgroup pointer to store a pointer > to the allocated space. > > To distinguish between obj_cgroups and memcg pointers in case > when it's not obvious which one is used (as in page_cgroup_ino()), > let's always set the lowest bit in the obj_cgroup case. > > Signed-off-by: Roman Gushchin <guro@fb.com> > --- > include/linux/mm.h | 25 ++++++++++++++++++-- > include/linux/mm_types.h | 5 +++- > mm/memcontrol.c | 5 ++-- > mm/slab.c | 3 ++- > mm/slab.h | 51 +++++++++++++++++++++++++++++++++++++++- > mm/slub.c | 2 +- > 6 files changed, 83 insertions(+), 8 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 080f8ac8bfb7..65224becc4ca 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1264,12 +1264,33 @@ static inline void set_page_links(struct page *page, enum zone_type zone, > #ifdef CONFIG_MEMCG > static inline struct mem_cgroup *page_memcg(struct page *page) > { > - return page->mem_cgroup; > + struct mem_cgroup *memcg = page->mem_cgroup; > + > + /* > + * The lowest bit set means that memcg isn't a valid memcg pointer, > + * but a obj_cgroups pointer. In this case the page is shared and > + * isn't charged to any specific memory cgroup. Return NULL. > + */ > + if ((unsigned long) memcg & 0x1UL) > + memcg = NULL; > + > + return memcg; That should really WARN instead of silently returning NULL. Which callsite optimistically asks a page's cgroup when it has no idea whether that page is actually a userpage or not? > static inline struct mem_cgroup *page_memcg_rcu(struct page *page) > { > + struct mem_cgroup *memcg = READ_ONCE(page->mem_cgroup); > + > WARN_ON_ONCE(!rcu_read_lock_held()); > - return READ_ONCE(page->mem_cgroup); > + > + /* > + * The lowest bit set means that memcg isn't a valid memcg pointer, > + * but a obj_cgroups pointer. 
In this case the page is shared and > + * isn't charged to any specific memory cgroup. Return NULL. > + */ > + if ((unsigned long) memcg & 0x1UL) > + memcg = NULL; > + > + return memcg; Same here. > } > #else > static inline struct mem_cgroup *page_memcg(struct page *page) > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 270aa8fd2800..5102f00f3336 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -198,7 +198,10 @@ struct page { > atomic_t _refcount; > > #ifdef CONFIG_MEMCG > - struct mem_cgroup *mem_cgroup; > + union { > + struct mem_cgroup *mem_cgroup; > + struct obj_cgroup **obj_cgroups; > + }; Since you need the casts in both cases anyway, it's safer (and simpler) to do unsigned long mem_cgroup; to prevent accidental direct derefs in future code. Otherwise, this patch looks good to me!
On Mon, Feb 03, 2020 at 01:27:56PM -0500, Johannes Weiner wrote: > On Mon, Jan 27, 2020 at 09:34:41AM -0800, Roman Gushchin wrote: > > Allocate and release memory to store obj_cgroup pointers for each > > non-root slab page. Reuse page->mem_cgroup pointer to store a pointer > > to the allocated space. > > > > To distinguish between obj_cgroups and memcg pointers in case > > when it's not obvious which one is used (as in page_cgroup_ino()), > > let's always set the lowest bit in the obj_cgroup case. > > > > Signed-off-by: Roman Gushchin <guro@fb.com> > > --- > > include/linux/mm.h | 25 ++++++++++++++++++-- > > include/linux/mm_types.h | 5 +++- > > mm/memcontrol.c | 5 ++-- > > mm/slab.c | 3 ++- > > mm/slab.h | 51 +++++++++++++++++++++++++++++++++++++++- > > mm/slub.c | 2 +- > > 6 files changed, 83 insertions(+), 8 deletions(-) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 080f8ac8bfb7..65224becc4ca 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -1264,12 +1264,33 @@ static inline void set_page_links(struct page *page, enum zone_type zone, > > #ifdef CONFIG_MEMCG > > static inline struct mem_cgroup *page_memcg(struct page *page) > > { > > - return page->mem_cgroup; > > + struct mem_cgroup *memcg = page->mem_cgroup; > > + > > + /* > > + * The lowest bit set means that memcg isn't a valid memcg pointer, > > + * but a obj_cgroups pointer. In this case the page is shared and > > + * isn't charged to any specific memory cgroup. Return NULL. > > + */ > > + if ((unsigned long) memcg & 0x1UL) > > + memcg = NULL; > > + > > + return memcg; > > That should really WARN instead of silently returning NULL. Which > callsite optimistically asks a page's cgroup when it has no idea > whether that page is actually a userpage or not? For instance, look at page_cgroup_ino() called from the reading /proc/kpageflags. 
> > > static inline struct mem_cgroup *page_memcg_rcu(struct page *page) > > { > > + struct mem_cgroup *memcg = READ_ONCE(page->mem_cgroup); > > + > > WARN_ON_ONCE(!rcu_read_lock_held()); > > - return READ_ONCE(page->mem_cgroup); > > + > > + /* > > + * The lowest bit set means that memcg isn't a valid memcg pointer, > > + * but a obj_cgroups pointer. In this case the page is shared and > > + * isn't charged to any specific memory cgroup. Return NULL. > > + */ > > + if ((unsigned long) memcg & 0x1UL) > > + memcg = NULL; > > + > > + return memcg; > > Same here. > > > } > > #else > > static inline struct mem_cgroup *page_memcg(struct page *page) > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > > index 270aa8fd2800..5102f00f3336 100644 > > --- a/include/linux/mm_types.h > > +++ b/include/linux/mm_types.h > > @@ -198,7 +198,10 @@ struct page { > > atomic_t _refcount; > > > > #ifdef CONFIG_MEMCG > > - struct mem_cgroup *mem_cgroup; > > + union { > > + struct mem_cgroup *mem_cgroup; > > + struct obj_cgroup **obj_cgroups; > > + }; > > Since you need the casts in both cases anyway, it's safer (and > simpler) to do > > unsigned long mem_cgroup; > > to prevent accidental direct derefs in future code. Agree. Maybe even mem_cgroup_data? > > Otherwise, this patch looks good to me! Thanks!
On Mon, Feb 03, 2020 at 10:34:52AM -0800, Roman Gushchin wrote: > On Mon, Feb 03, 2020 at 01:27:56PM -0500, Johannes Weiner wrote: > > On Mon, Jan 27, 2020 at 09:34:41AM -0800, Roman Gushchin wrote: > > > Allocate and release memory to store obj_cgroup pointers for each > > > non-root slab page. Reuse page->mem_cgroup pointer to store a pointer > > > to the allocated space. > > > > > > To distinguish between obj_cgroups and memcg pointers in case > > > when it's not obvious which one is used (as in page_cgroup_ino()), > > > let's always set the lowest bit in the obj_cgroup case. > > > > > > Signed-off-by: Roman Gushchin <guro@fb.com> > > > --- > > > include/linux/mm.h | 25 ++++++++++++++++++-- > > > include/linux/mm_types.h | 5 +++- > > > mm/memcontrol.c | 5 ++-- > > > mm/slab.c | 3 ++- > > > mm/slab.h | 51 +++++++++++++++++++++++++++++++++++++++- > > > mm/slub.c | 2 +- > > > 6 files changed, 83 insertions(+), 8 deletions(-) > > > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > > index 080f8ac8bfb7..65224becc4ca 100644 > > > --- a/include/linux/mm.h > > > +++ b/include/linux/mm.h > > > @@ -1264,12 +1264,33 @@ static inline void set_page_links(struct page *page, enum zone_type zone, > > > #ifdef CONFIG_MEMCG > > > static inline struct mem_cgroup *page_memcg(struct page *page) > > > { > > > - return page->mem_cgroup; > > > + struct mem_cgroup *memcg = page->mem_cgroup; > > > + > > > + /* > > > + * The lowest bit set means that memcg isn't a valid memcg pointer, > > > + * but a obj_cgroups pointer. In this case the page is shared and > > > + * isn't charged to any specific memory cgroup. Return NULL. > > > + */ > > > + if ((unsigned long) memcg & 0x1UL) > > > + memcg = NULL; > > > + > > > + return memcg; > > > > That should really WARN instead of silently returning NULL. Which > > callsite optimistically asks a page's cgroup when it has no idea > > whether that page is actually a userpage or not? 
> > For instance, look at page_cgroup_ino() called from the > reading /proc/kpageflags. But that checks PageSlab() and implements memcg_from_slab_page() to handle that case properly. And that's what we expect all callsites to do: make sure that the question asked actually makes sense, instead of having the interface paper over bogus requests. If that function is completely racy and PageSlab isn't stable, then it should really just open-code the lookup, rather than require weakening the interface for everybody else. > > > static inline struct mem_cgroup *page_memcg_rcu(struct page *page) > > > { > > > + struct mem_cgroup *memcg = READ_ONCE(page->mem_cgroup); > > > + > > > WARN_ON_ONCE(!rcu_read_lock_held()); > > > - return READ_ONCE(page->mem_cgroup); > > > + > > > + /* > > > + * The lowest bit set means that memcg isn't a valid memcg pointer, > > > + * but a obj_cgroups pointer. In this case the page is shared and > > > + * isn't charged to any specific memory cgroup. Return NULL. > > > + */ > > > + if ((unsigned long) memcg & 0x1UL) > > > + memcg = NULL; > > > + > > > + return memcg; > > > > Same here. > > > > > } > > > #else > > > static inline struct mem_cgroup *page_memcg(struct page *page) > > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > > > index 270aa8fd2800..5102f00f3336 100644 > > > --- a/include/linux/mm_types.h > > > +++ b/include/linux/mm_types.h > > > @@ -198,7 +198,10 @@ struct page { > > > atomic_t _refcount; > > > > > > #ifdef CONFIG_MEMCG > > > - struct mem_cgroup *mem_cgroup; > > > + union { > > > + struct mem_cgroup *mem_cgroup; > > > + struct obj_cgroup **obj_cgroups; > > > + }; > > > > Since you need the casts in both cases anyway, it's safer (and > > simpler) to do > > > > unsigned long mem_cgroup; > > > > to prevent accidental direct derefs in future code. > > Agree. Maybe even mem_cgroup_data? Personally, I don't think the suffix adds much. 
The type makes it so the compiler catches any accidental use, and access is very centralized so greppability doesn't matter much.
On Mon, Feb 03, 2020 at 03:46:27PM -0500, Johannes Weiner wrote: > On Mon, Feb 03, 2020 at 10:34:52AM -0800, Roman Gushchin wrote: > > On Mon, Feb 03, 2020 at 01:27:56PM -0500, Johannes Weiner wrote: > > > On Mon, Jan 27, 2020 at 09:34:41AM -0800, Roman Gushchin wrote: > > > > Allocate and release memory to store obj_cgroup pointers for each > > > > non-root slab page. Reuse page->mem_cgroup pointer to store a pointer > > > > to the allocated space. > > > > > > > > To distinguish between obj_cgroups and memcg pointers in case > > > > when it's not obvious which one is used (as in page_cgroup_ino()), > > > > let's always set the lowest bit in the obj_cgroup case. > > > > > > > > Signed-off-by: Roman Gushchin <guro@fb.com> > > > > --- > > > > include/linux/mm.h | 25 ++++++++++++++++++-- > > > > include/linux/mm_types.h | 5 +++- > > > > mm/memcontrol.c | 5 ++-- > > > > mm/slab.c | 3 ++- > > > > mm/slab.h | 51 +++++++++++++++++++++++++++++++++++++++- > > > > mm/slub.c | 2 +- > > > > 6 files changed, 83 insertions(+), 8 deletions(-) > > > > > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > > > index 080f8ac8bfb7..65224becc4ca 100644 > > > > --- a/include/linux/mm.h > > > > +++ b/include/linux/mm.h > > > > @@ -1264,12 +1264,33 @@ static inline void set_page_links(struct page *page, enum zone_type zone, > > > > #ifdef CONFIG_MEMCG > > > > static inline struct mem_cgroup *page_memcg(struct page *page) > > > > { > > > > - return page->mem_cgroup; > > > > + struct mem_cgroup *memcg = page->mem_cgroup; > > > > + > > > > + /* > > > > + * The lowest bit set means that memcg isn't a valid memcg pointer, > > > > + * but a obj_cgroups pointer. In this case the page is shared and > > > > + * isn't charged to any specific memory cgroup. Return NULL. > > > > + */ > > > > + if ((unsigned long) memcg & 0x1UL) > > > > + memcg = NULL; > > > > + > > > > + return memcg; > > > > > > That should really WARN instead of silently returning NULL. 
Which > > > callsite optimistically asks a page's cgroup when it has no idea > > > whether that page is actually a userpage or not? > > > > For instance, look at page_cgroup_ino() called from the > > reading /proc/kpageflags. > > But that checks PageSlab() and implements memcg_from_slab_page() to > handle that case properly. And that's what we expect all callsites to > do: make sure that the question asked actually makes sense, instead of > having the interface paper over bogus requests. > > If that function is completely racy and PageSlab isn't stable, then it > should really just open-code the lookup, rather than require weakening > the interface for everybody else. Why though? Another example: a process stack can be, depending on the machine config and platform, a vmalloc allocation, a slab allocation or a "high-order slab allocation", which is executed by the page allocator directly. It's kinda nice to have a function that hides accounting details and returns a valid memcg pointer for any kind of object. To me it seems to be a valid question: for a given kernel object give me a pointer to the memory cgroup. Why is it weakening? Moreover, open-coding of this question leads to bugs like the one fixed by ec9f02384f60 ("mm: workingset: fix vmstat counters for shadow nodes"). > > > > > static inline struct mem_cgroup *page_memcg_rcu(struct page *page) > > > > { > > > > + struct mem_cgroup *memcg = READ_ONCE(page->mem_cgroup); > > > > + > > > > WARN_ON_ONCE(!rcu_read_lock_held()); > > > > - return READ_ONCE(page->mem_cgroup); > > > > + > > > > + /* > > > > + * The lowest bit set means that memcg isn't a valid memcg pointer, > > > > + * but a obj_cgroups pointer. In this case the page is shared and > > > > + * isn't charged to any specific memory cgroup. Return NULL. > > > > + */ > > > > + if ((unsigned long) memcg & 0x1UL) > > > > + memcg = NULL; > > > > + > > > > + return memcg; > > > > > > Same here. 
> > > > > > > } > > > > #else > > > > static inline struct mem_cgroup *page_memcg(struct page *page) > > > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > > > > index 270aa8fd2800..5102f00f3336 100644 > > > > --- a/include/linux/mm_types.h > > > > +++ b/include/linux/mm_types.h > > > > @@ -198,7 +198,10 @@ struct page { > > > > atomic_t _refcount; > > > > > > > > #ifdef CONFIG_MEMCG > > > > - struct mem_cgroup *mem_cgroup; > > > > + union { > > > > + struct mem_cgroup *mem_cgroup; > > > > + struct obj_cgroup **obj_cgroups; > > > > + }; > > > > > > Since you need the casts in both cases anyway, it's safer (and > > > simpler) to do > > > > > > unsigned long mem_cgroup; > > > > > > to prevent accidental direct derefs in future code. > > > > Agree. Maybe even mem_cgroup_data? > > Personally, I don't think the suffix adds much. The type makes it so > the compiler catches any accidental use, and access is very > centralized so greppability doesn't matter much.
On Mon, Feb 03, 2020 at 01:19:15PM -0800, Roman Gushchin wrote: > On Mon, Feb 03, 2020 at 03:46:27PM -0500, Johannes Weiner wrote: > > On Mon, Feb 03, 2020 at 10:34:52AM -0800, Roman Gushchin wrote: > > > On Mon, Feb 03, 2020 at 01:27:56PM -0500, Johannes Weiner wrote: > > > > On Mon, Jan 27, 2020 at 09:34:41AM -0800, Roman Gushchin wrote: > > > > > Allocate and release memory to store obj_cgroup pointers for each > > > > > non-root slab page. Reuse page->mem_cgroup pointer to store a pointer > > > > > to the allocated space. > > > > > > > > > > To distinguish between obj_cgroups and memcg pointers in case > > > > > when it's not obvious which one is used (as in page_cgroup_ino()), > > > > > let's always set the lowest bit in the obj_cgroup case. > > > > > > > > > > Signed-off-by: Roman Gushchin <guro@fb.com> > > > > > --- > > > > > include/linux/mm.h | 25 ++++++++++++++++++-- > > > > > include/linux/mm_types.h | 5 +++- > > > > > mm/memcontrol.c | 5 ++-- > > > > > mm/slab.c | 3 ++- > > > > > mm/slab.h | 51 +++++++++++++++++++++++++++++++++++++++- > > > > > mm/slub.c | 2 +- > > > > > 6 files changed, 83 insertions(+), 8 deletions(-) > > > > > > > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > > > > index 080f8ac8bfb7..65224becc4ca 100644 > > > > > --- a/include/linux/mm.h > > > > > +++ b/include/linux/mm.h > > > > > @@ -1264,12 +1264,33 @@ static inline void set_page_links(struct page *page, enum zone_type zone, > > > > > #ifdef CONFIG_MEMCG > > > > > static inline struct mem_cgroup *page_memcg(struct page *page) > > > > > { > > > > > - return page->mem_cgroup; > > > > > + struct mem_cgroup *memcg = page->mem_cgroup; > > > > > + > > > > > + /* > > > > > + * The lowest bit set means that memcg isn't a valid memcg pointer, > > > > > + * but a obj_cgroups pointer. In this case the page is shared and > > > > > + * isn't charged to any specific memory cgroup. Return NULL. 
> > > > > + */ > > > > > + if ((unsigned long) memcg & 0x1UL) > > > > > + memcg = NULL; > > > > > + > > > > > + return memcg; > > > > > > > > That should really WARN instead of silently returning NULL. Which > > > > callsite optimistically asks a page's cgroup when it has no idea > > > > whether that page is actually a userpage or not? > > > > > > For instance, look at page_cgroup_ino() called from the > > > reading /proc/kpageflags. > > > > But that checks PageSlab() and implements memcg_from_slab_page() to > > handle that case properly. And that's what we expect all callsites to > > do: make sure that the question asked actually makes sense, instead of > > having the interface paper over bogus requests. > > > > If that function is completely racy and PageSlab isn't stable, then it > > should really just open-code the lookup, rather than require weakening > > the interface for everybody else. > > Why though? > > Another example: process stack can be depending on the machine config and > platform a vmalloc allocation, a slab allocation or a "high-order slab allocation", > which is executed by the page allocator directly. > > It's kinda nice to have a function that hides accounting details > and returns a valid memcg pointer for any kind of objects. Hm? I'm not objecting to that, memcg_from_obj() makes perfect sense to me, to use with kvmalloc() objects for example. I'm objecting to page_memcg() silently swallowing bogus inputs. That function shouldn't silently say "there is no cgroup associated with this page" when the true answer is "this page has MANY cgroups associated with it, this question doesn't make any sense". It's not exactly hard to imagine how this could cause bugs, is it? Where a caller should implement a slab case (exactly like page_cgroup_ino()) but is confused about the type of page it has, whether it's charged or not etc.?
diff --git a/include/linux/mm.h b/include/linux/mm.h index 080f8ac8bfb7..65224becc4ca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1264,12 +1264,33 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #ifdef CONFIG_MEMCG static inline struct mem_cgroup *page_memcg(struct page *page) { - return page->mem_cgroup; + struct mem_cgroup *memcg = page->mem_cgroup; + + /* + * The lowest bit set means that memcg isn't a valid memcg pointer, + * but a obj_cgroups pointer. In this case the page is shared and + * isn't charged to any specific memory cgroup. Return NULL. + */ + if ((unsigned long) memcg & 0x1UL) + memcg = NULL; + + return memcg; } static inline struct mem_cgroup *page_memcg_rcu(struct page *page) { + struct mem_cgroup *memcg = READ_ONCE(page->mem_cgroup); + WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(page->mem_cgroup); + + /* + * The lowest bit set means that memcg isn't a valid memcg pointer, + * but a obj_cgroups pointer. In this case the page is shared and + * isn't charged to any specific memory cgroup. Return NULL. 
+ */ + if ((unsigned long) memcg & 0x1UL) + memcg = NULL; + + return memcg; } #else static inline struct mem_cgroup *page_memcg(struct page *page) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 270aa8fd2800..5102f00f3336 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -198,7 +198,10 @@ struct page { atomic_t _refcount; #ifdef CONFIG_MEMCG - struct mem_cgroup *mem_cgroup; + union { + struct mem_cgroup *mem_cgroup; + struct obj_cgroup **obj_cgroups; + }; #endif /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9aa37bc61db5..94337ab1ebe9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -545,7 +545,8 @@ ino_t page_cgroup_ino(struct page *page) if (PageSlab(page) && !PageTail(page)) memcg = memcg_from_slab_page(page); else - memcg = READ_ONCE(page->mem_cgroup); + memcg = page_memcg_rcu(page); + while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); if (memcg) @@ -2783,7 +2784,7 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) return memcg_from_slab_page(page); /* All other pages use page->mem_cgroup */ - return page->mem_cgroup; + return page_memcg(page); } static int memcg_alloc_cache_id(void) diff --git a/mm/slab.c b/mm/slab.c index a89633603b2d..22e161b57367 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1370,7 +1370,8 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - if (charge_slab_page(page, flags, cachep->gfporder, cachep)) { + if (charge_slab_page(page, flags, cachep->gfporder, cachep, + cachep->num)) { __free_pages(page, cachep->gfporder); return NULL; } diff --git a/mm/slab.h b/mm/slab.h index 7925f7005161..8ee8c3a250ac 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -319,6 +319,18 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s->memcg_params.root_cache; } +static inline struct obj_cgroup **page_obj_cgroups(struct page *page) +{ + /* + * page->mem_cgroup and page->obj_cgroups are sharing the same + * 
space. To distinguish between them in case we don't know for sure + * that the page is a slab page (e.g. page_cgroup_ino()), let's + * always set the lowest bit of obj_cgroups. + */ + return (struct obj_cgroup **) + ((unsigned long)page->obj_cgroups & ~0x1UL); +} + /* * Expects a pointer to a slab page. Please note, that PageSlab() check * isn't sufficient, as it returns true also for tail compound slab pages, @@ -406,6 +418,25 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, percpu_ref_put_many(&s->memcg_params.refcnt, nr_pages); } +static inline int memcg_alloc_page_obj_cgroups(struct page *page, gfp_t gfp, + unsigned int objects) +{ + void *vec; + + vec = kcalloc(objects, sizeof(struct obj_cgroup *), gfp); + if (!vec) + return -ENOMEM; + + page->obj_cgroups = (struct obj_cgroup **) ((unsigned long)vec | 0x1UL); + return 0; +} + +static inline void memcg_free_page_obj_cgroups(struct page *page) +{ + kfree(page_obj_cgroups(page)); + page->obj_cgroups = NULL; +} + extern void slab_init_memcg_params(struct kmem_cache *); extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg); @@ -455,6 +486,16 @@ static inline void memcg_uncharge_slab(struct page *page, int order, { } +static inline int memcg_alloc_page_obj_cgroups(struct page *page, gfp_t gfp, + unsigned int objects) +{ + return 0; +} + +static inline void memcg_free_page_obj_cgroups(struct page *page) +{ +} + static inline void slab_init_memcg_params(struct kmem_cache *s) { } @@ -479,14 +520,21 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) static __always_inline int charge_slab_page(struct page *page, gfp_t gfp, int order, - struct kmem_cache *s) + struct kmem_cache *s, + unsigned int objects) { + int ret; + if (is_root_cache(s)) { mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), PAGE_SIZE << order); return 0; } + ret = memcg_alloc_page_obj_cgroups(page, gfp, objects); + if (ret) + return ret; + return memcg_charge_slab(page, gfp, 
order, s); } @@ -499,6 +547,7 @@ static __always_inline void uncharge_slab_page(struct page *page, int order, return; } + memcg_free_page_obj_cgroups(page); memcg_uncharge_slab(page, order, s); } diff --git a/mm/slub.c b/mm/slub.c index ed6aea234400..165e43076c8b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1516,7 +1516,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, else page = __alloc_pages_node(node, flags, order); - if (page && charge_slab_page(page, flags, order, s)) { + if (page && charge_slab_page(page, flags, order, s, oo_objects(oo))) { __free_pages(page, order); page = NULL; }
Allocate and release memory to store obj_cgroup pointers for each non-root slab page. Reuse page->mem_cgroup pointer to store a pointer to the allocated space. To distinguish between obj_cgroups and memcg pointers in case when it's not obvious which one is used (as in page_cgroup_ino()), let's always set the lowest bit in the obj_cgroup case. Signed-off-by: Roman Gushchin <guro@fb.com> --- include/linux/mm.h | 25 ++++++++++++++++++-- include/linux/mm_types.h | 5 +++- mm/memcontrol.c | 5 ++-- mm/slab.c | 3 ++- mm/slab.h | 51 +++++++++++++++++++++++++++++++++++++++- mm/slub.c | 2 +- 6 files changed, 83 insertions(+), 8 deletions(-)