diff mbox series

[v3,3/4] mm/page_owner: Print memcg information

Message ID 20220131192308.608837-4-longman@redhat.com (mailing list archive)
State New
Headers show
Series mm/page_owner: Extend page_owner to show memcg information | expand

Commit Message

Waiman Long Jan. 31, 2022, 7:23 p.m. UTC
It was found that a number of offlined memcgs were not freed because
they were pinned by some charged pages that were present. Even "echo
1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These
offlined but not freed memcgs tend to increase in number over time with
the side effect that percpu memory consumption as shown in /proc/meminfo
also increases over time.

In order to find out more information about those pages that pin
offlined memcgs, the page_owner feature is extended to print memory
cgroup information especially whether the cgroup is offlined or not.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
---
 mm/page_owner.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

Comments

Mike Rapoport Jan. 31, 2022, 8:51 p.m. UTC | #1
On Mon, Jan 31, 2022 at 02:23:07PM -0500, Waiman Long wrote:
> It was found that a number of offlined memcgs were not freed because
> they were pinned by some charged pages that were present. Even "echo
> 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These
> offlined but not freed memcgs tend to increase in number over time with
> the side effect that percpu memory consumption as shown in /proc/meminfo
> also increases over time.
> 
> In order to find out more information about those pages that pin
> offlined memcgs, the page_owner feature is extended to print memory
> cgroup information especially whether the cgroup is offlined or not.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> Acked-by: David Rientjes <rientjes@google.com>
> ---
>  mm/page_owner.c | 39 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 39 insertions(+)
> 
> diff --git a/mm/page_owner.c b/mm/page_owner.c
> index 28dac73e0542..a471c74c7fe0 100644
> --- a/mm/page_owner.c
> +++ b/mm/page_owner.c
> @@ -10,6 +10,7 @@
>  #include <linux/migrate.h>
>  #include <linux/stackdepot.h>
>  #include <linux/seq_file.h>
> +#include <linux/memcontrol.h>
>  #include <linux/sched/clock.h>
>  
>  #include "internal.h"
> @@ -325,6 +326,42 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
>  	seq_putc(m, '\n');
>  }
>  
> +#ifdef CONFIG_MEMCG
> +/*
> + * Looking for memcg information and print it out
> + */
> +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
> +					  struct page *page)
> +{
> +	unsigned long memcg_data = READ_ONCE(page->memcg_data);
> +	struct mem_cgroup *memcg;
> +	bool onlined;
> +	char name[80];
> +
> +	if (!memcg_data)
> +		return;
> +
> +	if (memcg_data & MEMCG_DATA_OBJCGS)
> +		*pret += scnprintf(kbuf + *pret, count - *pret,
> +				"Slab cache page\n");

Don't we need to check for overflow here?

> +
> +	memcg = page_memcg_check(page);
> +	if (!memcg)
> +		return;
> +
> +	onlined = (memcg->css.flags & CSS_ONLINE);
> +	cgroup_name(memcg->css.cgroup, name, sizeof(name));
> +	*pret += scnprintf(kbuf + *pret, count - *pret,
> +			"Charged %sto %smemcg %s\n",
> +			PageMemcgKmem(page) ? "(via objcg) " : "",
> +			onlined ? "" : "offlined ",
> +			name);

Ditto

> +}
> +#else /* CONFIG_MEMCG */
> +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
> +					  struct page *page) { }

I think #ifdef inside the print_page_owner_memcg() functions will be
simpler and clearer.

> +#endif /* CONFIG_MEMCG */
> +
>  static ssize_t
>  print_page_owner(char __user *buf, size_t count, unsigned long pfn,
>  		struct page *page, struct page_owner *page_owner,
> @@ -365,6 +402,8 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
>  			migrate_reason_names[page_owner->last_migrate_reason]);
>  	}
>  
> +	print_page_owner_memcg(kbuf, count, &ret, page);
> +

ret can go over count here.
Why not make print_page_owner_memcg() an int so that the call will be
consistent with other calls in print_page_owner():

	ret += print_page_owner_memcg(kbuf, count, page);
	if (ret >= count)
		goto err;

>  	ret += snprintf(kbuf + ret, count - ret, "\n");
>  	if (ret >= count)
>  		goto err;
> -- 
> 2.27.0
>
Roman Gushchin Jan. 31, 2022, 8:51 p.m. UTC | #2
On Mon, Jan 31, 2022 at 02:23:07PM -0500, Waiman Long wrote:
> It was found that a number of offlined memcgs were not freed because
> they were pinned by some charged pages that were present. Even "echo
> 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These
> offlined but not freed memcgs tend to increase in number over time with
> the side effect that percpu memory consumption as shown in /proc/meminfo
> also increases over time.
> 
> In order to find out more information about those pages that pin
> offlined memcgs, the page_owner feature is extended to print memory
> cgroup information especially whether the cgroup is offlined or not.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> Acked-by: David Rientjes <rientjes@google.com>

Acked-by: Roman Gushchin <guro@fb.com>

Thanks!
Waiman Long Jan. 31, 2022, 9:43 p.m. UTC | #3
On 1/31/22 15:51, Mike Rapoport wrote:
> On Mon, Jan 31, 2022 at 02:23:07PM -0500, Waiman Long wrote:
>> It was found that a number of offlined memcgs were not freed because
>> they were pinned by some charged pages that were present. Even "echo
>> 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These
>> offlined but not freed memcgs tend to increase in number over time with
>> the side effect that percpu memory consumption as shown in /proc/meminfo
>> also increases over time.
>>
>> In order to find out more information about those pages that pin
>> offlined memcgs, the page_owner feature is extended to print memory
>> cgroup information especially whether the cgroup is offlined or not.
>>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> Acked-by: David Rientjes <rientjes@google.com>
>> ---
>>   mm/page_owner.c | 39 +++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 39 insertions(+)
>>
>> diff --git a/mm/page_owner.c b/mm/page_owner.c
>> index 28dac73e0542..a471c74c7fe0 100644
>> --- a/mm/page_owner.c
>> +++ b/mm/page_owner.c
>> @@ -10,6 +10,7 @@
>>   #include <linux/migrate.h>
>>   #include <linux/stackdepot.h>
>>   #include <linux/seq_file.h>
>> +#include <linux/memcontrol.h>
>>   #include <linux/sched/clock.h>
>>   
>>   #include "internal.h"
>> @@ -325,6 +326,42 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
>>   	seq_putc(m, '\n');
>>   }
>>   
>> +#ifdef CONFIG_MEMCG
>> +/*
>> + * Looking for memcg information and print it out
>> + */
>> +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
>> +					  struct page *page)
>> +{
>> +	unsigned long memcg_data = READ_ONCE(page->memcg_data);
>> +	struct mem_cgroup *memcg;
>> +	bool onlined;
>> +	char name[80];
>> +
>> +	if (!memcg_data)
>> +		return;
>> +
>> +	if (memcg_data & MEMCG_DATA_OBJCGS)
>> +		*pret += scnprintf(kbuf + *pret, count - *pret,
>> +				"Slab cache page\n");
> Don't we need to check for overflow here?

See my previous patch 2. The reason I used scnprintf() is that it
never returns a length that is >= the given size, so overflow won't
happen. The final snprintf() in print_page_owner() will detect buffer
overflow.


>
>> +
>> +	memcg = page_memcg_check(page);
>> +	if (!memcg)
>> +		return;
>> +
>> +	onlined = (memcg->css.flags & CSS_ONLINE);
>> +	cgroup_name(memcg->css.cgroup, name, sizeof(name));
>> +	*pret += scnprintf(kbuf + *pret, count - *pret,
>> +			"Charged %sto %smemcg %s\n",
>> +			PageMemcgKmem(page) ? "(via objcg) " : "",
>> +			onlined ? "" : "offlined ",
>> +			name);
> Ditto
>
>> +}
>> +#else /* CONFIG_MEMCG */
>> +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
>> +					  struct page *page) { }
> I think #ifdef inside the print_page_owner_memcg() functions will be
> simpler and clearer.
Yes, I see both styles used in kernel code though this style is probably 
more common. I will keep this unless there is a good reason to do otherwise.
>
>> +#endif /* CONFIG_MEMCG */
>> +
>>   static ssize_t
>>   print_page_owner(char __user *buf, size_t count, unsigned long pfn,
>>   		struct page *page, struct page_owner *page_owner,
>> @@ -365,6 +402,8 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
>>   			migrate_reason_names[page_owner->last_migrate_reason]);
>>   	}
>>   
>> +	print_page_owner_memcg(kbuf, count, &ret, page);
>> +
> ret can go over count here.
> Why not make print_page_owner_memcg() an int so that the call will be
> consistent with other calls in print_page_owner():
>
> 	ret += print_page_owner_memcg(kbuf, count, page);
> 	if (ret >= count)
> 		goto err;

See my comments above.

Cheers,
Longman
Mike Rapoport Feb. 1, 2022, 6:23 a.m. UTC | #4
On Mon, Jan 31, 2022 at 04:43:32PM -0500, Waiman Long wrote:
> 
> On 1/31/22 15:51, Mike Rapoport wrote:
> > On Mon, Jan 31, 2022 at 02:23:07PM -0500, Waiman Long wrote:
> > > It was found that a number of offlined memcgs were not freed because
> > > they were pinned by some charged pages that were present. Even "echo
> > > 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These
> > > offlined but not freed memcgs tend to increase in number over time with
> > > the side effect that percpu memory consumption as shown in /proc/meminfo
> > > also increases over time.
> > > 
> > > In order to find out more information about those pages that pin
> > > offlined memcgs, the page_owner feature is extended to print memory
> > > cgroup information especially whether the cgroup is offlined or not.
> > > 
> > > Signed-off-by: Waiman Long <longman@redhat.com>
> > > Acked-by: David Rientjes <rientjes@google.com>
> > > ---
> > >   mm/page_owner.c | 39 +++++++++++++++++++++++++++++++++++++++
> > >   1 file changed, 39 insertions(+)
> > > 
> > > diff --git a/mm/page_owner.c b/mm/page_owner.c
> > > index 28dac73e0542..a471c74c7fe0 100644
> > > --- a/mm/page_owner.c
> > > +++ b/mm/page_owner.c
> > > @@ -10,6 +10,7 @@
> > >   #include <linux/migrate.h>
> > >   #include <linux/stackdepot.h>
> > >   #include <linux/seq_file.h>
> > > +#include <linux/memcontrol.h>
> > >   #include <linux/sched/clock.h>
> > >   #include "internal.h"
> > > @@ -325,6 +326,42 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
> > >   	seq_putc(m, '\n');
> > >   }
> > > +#ifdef CONFIG_MEMCG
> > > +/*
> > > + * Looking for memcg information and print it out
> > > + */
> > > +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
> > > +					  struct page *page)
> > > +{
> > > +	unsigned long memcg_data = READ_ONCE(page->memcg_data);
> > > +	struct mem_cgroup *memcg;
> > > +	bool onlined;
> > > +	char name[80];
> > > +
> > > +	if (!memcg_data)
> > > +		return;
> > > +
> > > +	if (memcg_data & MEMCG_DATA_OBJCGS)
> > > +		*pret += scnprintf(kbuf + *pret, count - *pret,
> > > +				"Slab cache page\n");
> > Don't we need to check for overflow here?
> 
> See my previous patch 2 and the reason I used scnprintf() is that it never
> return a length that is >= the given size. So overflow won't happen. The
> final snprintf() in print_page_owner() will detect buffer overflow.
 
Right, I've missed that 
 
> > > +
> > > +	memcg = page_memcg_check(page);
> > > +	if (!memcg)
> > > +		return;
> > > +
> > > +	onlined = (memcg->css.flags & CSS_ONLINE);
> > > +	cgroup_name(memcg->css.cgroup, name, sizeof(name));
> > > +	*pret += scnprintf(kbuf + *pret, count - *pret,
> > > +			"Charged %sto %smemcg %s\n",
> > > +			PageMemcgKmem(page) ? "(via objcg) " : "",
> > > +			onlined ? "" : "offlined ",
> > > +			name);
> > Ditto
> > 
> > > +}
> > > +#else /* CONFIG_MEMCG */
> > > +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
> > > +					  struct page *page) { }

> > I think #ifdef inside the print_page_owner_memcg() functions will be
> > simpler and clearer.
>
> Yes, I see both styles used in kernel code though this style is probably
> more common. I will keep this unless there is a good reason to do otherwise.

Having the #ifdef inside the function is safer with respect to future
updates. It often happens that the non-default arm of an #ifdef is
forgotten. Besides, it is several lines shorter.
 
> > > +#endif /* CONFIG_MEMCG */
> > > +
> > >   static ssize_t
> > >   print_page_owner(char __user *buf, size_t count, unsigned long pfn,
> > >   		struct page *page, struct page_owner *page_owner,
> > > @@ -365,6 +402,8 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
> > >   			migrate_reason_names[page_owner->last_migrate_reason]);
> > >   	}
> > > +	print_page_owner_memcg(kbuf, count, &ret, page);
> > > +
> > ret can go over count here.
> > Why not make print_page_owner_memcg() an int so that the call will be
> > consistent with other calls in print_page_owner():
> > 
> > 	ret += print_page_owner_memcg(kbuf, count, page);
> > 	if (ret >= count)
> > 		goto err;

I still think that 'int print_page_owner_memcg()' is clearer and more
readable.
 
> See my comments above.
> 
> Cheers,
> Longman
>
Michal Hocko Feb. 1, 2022, 10:54 a.m. UTC | #5
On Mon 31-01-22 14:23:07, Waiman Long wrote:
> It was found that a number of offlined memcgs were not freed because
> they were pinned by some charged pages that were present. Even "echo
> 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These
> offlined but not freed memcgs tend to increase in number over time with
> the side effect that percpu memory consumption as shown in /proc/meminfo
> also increases over time.
> 
> In order to find out more information about those pages that pin
> offlined memcgs, the page_owner feature is extended to print memory
> cgroup information especially whether the cgroup is offlined or not.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> Acked-by: David Rientjes <rientjes@google.com>
> ---
>  mm/page_owner.c | 39 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 39 insertions(+)
> 
> diff --git a/mm/page_owner.c b/mm/page_owner.c
> index 28dac73e0542..a471c74c7fe0 100644
> --- a/mm/page_owner.c
> +++ b/mm/page_owner.c
> @@ -10,6 +10,7 @@
>  #include <linux/migrate.h>
>  #include <linux/stackdepot.h>
>  #include <linux/seq_file.h>
> +#include <linux/memcontrol.h>
>  #include <linux/sched/clock.h>
>  
>  #include "internal.h"
> @@ -325,6 +326,42 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
>  	seq_putc(m, '\n');
>  }
>  
> +#ifdef CONFIG_MEMCG
> +/*
> + * Looking for memcg information and print it out
> + */
> +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
> +					  struct page *page)
> +{
> +	unsigned long memcg_data = READ_ONCE(page->memcg_data);
> +	struct mem_cgroup *memcg;
> +	bool onlined;
> +	char name[80];
> +
> +	if (!memcg_data)
> +		return;
> +
> +	if (memcg_data & MEMCG_DATA_OBJCGS)
> +		*pret += scnprintf(kbuf + *pret, count - *pret,
> +				"Slab cache page\n");
> +
> +	memcg = page_memcg_check(page);
> +	if (!memcg)
> +		return;
> +
> +	onlined = (memcg->css.flags & CSS_ONLINE);
> +	cgroup_name(memcg->css.cgroup, name, sizeof(name));
> +	*pret += scnprintf(kbuf + *pret, count - *pret,
> +			"Charged %sto %smemcg %s\n",
> +			PageMemcgKmem(page) ? "(via objcg) " : "",
> +			onlined ? "" : "offlined ",
> +			name);

I have asked in the previous version already but what makes the memcg
stable (why it cannot go away and be reallocated for something else)
while you are trying to get its name?
Waiman Long Feb. 1, 2022, 5:04 p.m. UTC | #6
On 2/1/22 05:54, Michal Hocko wrote:
> On Mon 31-01-22 14:23:07, Waiman Long wrote:
>> It was found that a number of offlined memcgs were not freed because
>> they were pinned by some charged pages that were present. Even "echo
>> 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These
>> offlined but not freed memcgs tend to increase in number over time with
>> the side effect that percpu memory consumption as shown in /proc/meminfo
>> also increases over time.
>>
>> In order to find out more information about those pages that pin
>> offlined memcgs, the page_owner feature is extended to print memory
>> cgroup information especially whether the cgroup is offlined or not.
>>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> Acked-by: David Rientjes <rientjes@google.com>
>> ---
>>   mm/page_owner.c | 39 +++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 39 insertions(+)
>>
>> diff --git a/mm/page_owner.c b/mm/page_owner.c
>> index 28dac73e0542..a471c74c7fe0 100644
>> --- a/mm/page_owner.c
>> +++ b/mm/page_owner.c
>> @@ -10,6 +10,7 @@
>>   #include <linux/migrate.h>
>>   #include <linux/stackdepot.h>
>>   #include <linux/seq_file.h>
>> +#include <linux/memcontrol.h>
>>   #include <linux/sched/clock.h>
>>   
>>   #include "internal.h"
>> @@ -325,6 +326,42 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
>>   	seq_putc(m, '\n');
>>   }
>>   
>> +#ifdef CONFIG_MEMCG
>> +/*
>> + * Looking for memcg information and print it out
>> + */
>> +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
>> +					  struct page *page)
>> +{
>> +	unsigned long memcg_data = READ_ONCE(page->memcg_data);
>> +	struct mem_cgroup *memcg;
>> +	bool onlined;
>> +	char name[80];
>> +
>> +	if (!memcg_data)
>> +		return;
>> +
>> +	if (memcg_data & MEMCG_DATA_OBJCGS)
>> +		*pret += scnprintf(kbuf + *pret, count - *pret,
>> +				"Slab cache page\n");
>> +
>> +	memcg = page_memcg_check(page);
>> +	if (!memcg)
>> +		return;
>> +
>> +	onlined = (memcg->css.flags & CSS_ONLINE);
>> +	cgroup_name(memcg->css.cgroup, name, sizeof(name));
>> +	*pret += scnprintf(kbuf + *pret, count - *pret,
>> +			"Charged %sto %smemcg %s\n",
>> +			PageMemcgKmem(page) ? "(via objcg) " : "",
>> +			onlined ? "" : "offlined ",
>> +			name);
> I have asked in the previous version already but what makes the memcg
> stable (why it cannot go away and be reallocated for something else)
> while you are trying to get its name?

The memcg is not going away as long as the page isn't freed, unless it
is indirectly connected via objcg. Of course, there can be a race
where the page is being freed while the page_owner information
is being displayed. One solution is to add a simple bit lock to each
page_owner structure and acquire the lock when it is being written
to or read from. Anyway, a lot of these debugging aids or tools don't
eliminate all the race conditions that affect the accuracy of the
displayed information. I can add a patch to eliminate this direct memcg
race if you think this is necessary.

Cheers,
Longman
Michal Hocko Feb. 2, 2022, 8:49 a.m. UTC | #7
On Tue 01-02-22 12:04:37, Waiman Long wrote:
> On 2/1/22 05:54, Michal Hocko wrote:
> > On Mon 31-01-22 14:23:07, Waiman Long wrote:
> > > It was found that a number of offlined memcgs were not freed because
> > > they were pinned by some charged pages that were present. Even "echo
> > > 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These
> > > offlined but not freed memcgs tend to increase in number over time with
> > > the side effect that percpu memory consumption as shown in /proc/meminfo
> > > also increases over time.
> > > 
> > > In order to find out more information about those pages that pin
> > > offlined memcgs, the page_owner feature is extended to print memory
> > > cgroup information especially whether the cgroup is offlined or not.
> > > 
> > > Signed-off-by: Waiman Long <longman@redhat.com>
> > > Acked-by: David Rientjes <rientjes@google.com>
> > > ---
> > >   mm/page_owner.c | 39 +++++++++++++++++++++++++++++++++++++++
> > >   1 file changed, 39 insertions(+)
> > > 
> > > diff --git a/mm/page_owner.c b/mm/page_owner.c
> > > index 28dac73e0542..a471c74c7fe0 100644
> > > --- a/mm/page_owner.c
> > > +++ b/mm/page_owner.c
> > > @@ -10,6 +10,7 @@
> > >   #include <linux/migrate.h>
> > >   #include <linux/stackdepot.h>
> > >   #include <linux/seq_file.h>
> > > +#include <linux/memcontrol.h>
> > >   #include <linux/sched/clock.h>
> > >   #include "internal.h"
> > > @@ -325,6 +326,42 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
> > >   	seq_putc(m, '\n');
> > >   }
> > > +#ifdef CONFIG_MEMCG
> > > +/*
> > > + * Looking for memcg information and print it out
> > > + */
> > > +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
> > > +					  struct page *page)
> > > +{
> > > +	unsigned long memcg_data = READ_ONCE(page->memcg_data);
> > > +	struct mem_cgroup *memcg;
> > > +	bool onlined;
> > > +	char name[80];
> > > +
> > > +	if (!memcg_data)
> > > +		return;
> > > +
> > > +	if (memcg_data & MEMCG_DATA_OBJCGS)
> > > +		*pret += scnprintf(kbuf + *pret, count - *pret,
> > > +				"Slab cache page\n");
> > > +
> > > +	memcg = page_memcg_check(page);
> > > +	if (!memcg)
> > > +		return;
> > > +
> > > +	onlined = (memcg->css.flags & CSS_ONLINE);
> > > +	cgroup_name(memcg->css.cgroup, name, sizeof(name));
> > > +	*pret += scnprintf(kbuf + *pret, count - *pret,
> > > +			"Charged %sto %smemcg %s\n",
> > > +			PageMemcgKmem(page) ? "(via objcg) " : "",
> > > +			onlined ? "" : "offlined ",
> > > +			name);
> > I have asked in the previous version already but what makes the memcg
> > stable (why it cannot go away and be reallocated for something else)
> > while you are trying to get its name?
> 
> The memcg is not going away as long as the page isn't freed unless if it is
> indirectly connected via objcg. Of course, there can be a race between the
> page is going to be freed while the page_owner information is being
> displayed.

Right. And that means that cgroup_name() can go off the rails and wander
through memory, correct?

> One solution is to add a simple bit lock to each of the
> page_owner structure and acquire the lock when it is being written to or
> read from.

I do not really see how a bit lock could prevent memcg from going away.
On the other hand I think RCU read lock should be sufficient to keep the
memcg from going away completely.

> Anyway a lot of these debugging aids or tools don't eliminate all
> the race conditions that affect the accuracy of the displayed information. I
> can add a patch to eliminate this direct memcg race if you think this is
> necessary.

I do not mind inaccurate information. That is natural, but reading
through freed memory can be really harmful. So this really needs to be
sorted out.
Waiman Long Feb. 2, 2022, 4:12 p.m. UTC | #8
On 2/2/22 03:49, Michal Hocko wrote:
> On Tue 01-02-22 12:04:37, Waiman Long wrote:
>> On 2/1/22 05:54, Michal Hocko wrote:
>>> On Mon 31-01-22 14:23:07, Waiman Long wrote:
>>>> It was found that a number of offlined memcgs were not freed because
>>>> they were pinned by some charged pages that were present. Even "echo
>>>> 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These
>>>> offlined but not freed memcgs tend to increase in number over time with
>>>> the side effect that percpu memory consumption as shown in /proc/meminfo
>>>> also increases over time.
>>>>
>>>> In order to find out more information about those pages that pin
>>>> offlined memcgs, the page_owner feature is extended to print memory
>>>> cgroup information especially whether the cgroup is offlined or not.
>>>>
>>>> Signed-off-by: Waiman Long <longman@redhat.com>
>>>> Acked-by: David Rientjes <rientjes@google.com>
>>>> ---
>>>>    mm/page_owner.c | 39 +++++++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 39 insertions(+)
>>>>
>>>> diff --git a/mm/page_owner.c b/mm/page_owner.c
>>>> index 28dac73e0542..a471c74c7fe0 100644
>>>> --- a/mm/page_owner.c
>>>> +++ b/mm/page_owner.c
>>>> @@ -10,6 +10,7 @@
>>>>    #include <linux/migrate.h>
>>>>    #include <linux/stackdepot.h>
>>>>    #include <linux/seq_file.h>
>>>> +#include <linux/memcontrol.h>
>>>>    #include <linux/sched/clock.h>
>>>>    #include "internal.h"
>>>> @@ -325,6 +326,42 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
>>>>    	seq_putc(m, '\n');
>>>>    }
>>>> +#ifdef CONFIG_MEMCG
>>>> +/*
>>>> + * Looking for memcg information and print it out
>>>> + */
>>>> +static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
>>>> +					  struct page *page)
>>>> +{
>>>> +	unsigned long memcg_data = READ_ONCE(page->memcg_data);
>>>> +	struct mem_cgroup *memcg;
>>>> +	bool onlined;
>>>> +	char name[80];
>>>> +
>>>> +	if (!memcg_data)
>>>> +		return;
>>>> +
>>>> +	if (memcg_data & MEMCG_DATA_OBJCGS)
>>>> +		*pret += scnprintf(kbuf + *pret, count - *pret,
>>>> +				"Slab cache page\n");
>>>> +
>>>> +	memcg = page_memcg_check(page);
>>>> +	if (!memcg)
>>>> +		return;
>>>> +
>>>> +	onlined = (memcg->css.flags & CSS_ONLINE);
>>>> +	cgroup_name(memcg->css.cgroup, name, sizeof(name));
>>>> +	*pret += scnprintf(kbuf + *pret, count - *pret,
>>>> +			"Charged %sto %smemcg %s\n",
>>>> +			PageMemcgKmem(page) ? "(via objcg) " : "",
>>>> +			onlined ? "" : "offlined ",
>>>> +			name);
>>> I have asked in the previous version already but what makes the memcg
>>> stable (why it cannot go away and be reallocated for something else)
>>> while you are trying to get its name?
>> The memcg is not going away as long as the page isn't freed unless if it is
>> indirectly connected via objcg. Of course, there can be a race between the
>> page is going to be freed while the page_owner information is being
>> displayed.
> Right. And that means that cgtoup_name can go off the rail and wander
> through memory correct?
>
>> One solution is to add a simple bit lock to each of the
>> page_owner structure and acquire the lock when it is being written to or
>> read from.
> I do not really see how a bit lock could prevent memcg from going away.
> On the other hand I think RCU read lock should be sufficient to keep the
> memcg from going away completely.
Using rcu_read_lock() is also what I have been thinking of doing. So I 
will update the patch to add that for safety.
>
>> Anyway a lot of these debugging aids or tools don't eliminate all
>> the race conditions that affect the accuracy of the displayed information. I
>> can add a patch to eliminate this direct memcg race if you think this is
>> necessary.
> I do not mind inaccurate information. That is natural but reading
> through a freed memory can be really harmfull. So this really need to be
> sorted out.

Thanks for the review.

Cheers,
Longman
diff mbox series

Patch

diff --git a/mm/page_owner.c b/mm/page_owner.c
index 28dac73e0542..a471c74c7fe0 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -10,6 +10,7 @@ 
 #include <linux/migrate.h>
 #include <linux/stackdepot.h>
 #include <linux/seq_file.h>
+#include <linux/memcontrol.h>
 #include <linux/sched/clock.h>
 
 #include "internal.h"
@@ -325,6 +326,42 @@  void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 	seq_putc(m, '\n');
 }
 
+#ifdef CONFIG_MEMCG
+/*
+ * Looking for memcg information and print it out
+ */
+static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
+					  struct page *page)
+{
+	unsigned long memcg_data = READ_ONCE(page->memcg_data);
+	struct mem_cgroup *memcg;
+	bool onlined;
+	char name[80];
+
+	if (!memcg_data)
+		return;
+
+	if (memcg_data & MEMCG_DATA_OBJCGS)
+		*pret += scnprintf(kbuf + *pret, count - *pret,
+				"Slab cache page\n");
+
+	memcg = page_memcg_check(page);
+	if (!memcg)
+		return;
+
+	onlined = (memcg->css.flags & CSS_ONLINE);
+	cgroup_name(memcg->css.cgroup, name, sizeof(name));
+	*pret += scnprintf(kbuf + *pret, count - *pret,
+			"Charged %sto %smemcg %s\n",
+			PageMemcgKmem(page) ? "(via objcg) " : "",
+			onlined ? "" : "offlined ",
+			name);
+}
+#else /* CONFIG_MEMCG */
+static inline void print_page_owner_memcg(char *kbuf, size_t count, int *pret,
+					  struct page *page) { }
+#endif /* CONFIG_MEMCG */
+
 static ssize_t
 print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 		struct page *page, struct page_owner *page_owner,
@@ -365,6 +402,8 @@  print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 			migrate_reason_names[page_owner->last_migrate_reason]);
 	}
 
+	print_page_owner_memcg(kbuf, count, &ret, page);
+
 	ret += snprintf(kbuf + ret, count - ret, "\n");
 	if (ret >= count)
 		goto err;