
mm, slab: Extend slab/shrink to shrink all the memcg caches

Message ID 20190702183730.14461-1-longman@redhat.com
State New, archived

Commit Message

Waiman Long July 2, 2019, 6:37 p.m. UTC
Currently, a value of '1" is written to /sys/kernel/slab/<slab>/shrink
file to shrink the slab by flushing all the per-cpu slabs and free
slabs in partial lists. This applies only to the root caches, though.

Extends this capability by shrinking all the child memcg caches and
the root cache when a value of '2' is written to the shrink sysfs file.
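
For example, to shrink the task_struct cache and all its child memcg
caches:

 # echo 2 > /sys/kernel/slab/task_struct/shrink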

On a 4-socket 112-core 224-thread x86-64 system after a parallel kernel
build, the amount of memory occupied by slabs before shrinking was:

 # grep task_struct /proc/slabinfo
 task_struct         7114   7296   7744    4    8 : tunables    0    0
 0 : slabdata   1824   1824      0
 # grep "^S[lRU]" /proc/meminfo
 Slab:            1310444 kB
 SReclaimable:     377604 kB
 SUnreclaim:       932840 kB

After shrinking slabs:

 # grep "^S[lRU]" /proc/meminfo
 Slab:             695652 kB
 SReclaimable:     322796 kB
 SUnreclaim:       372856 kB
 # grep task_struct /proc/slabinfo
 task_struct         2262   2572   7744    4    8 : tunables    0    0
 0 : slabdata    643    643      0

Signed-off-by: Waiman Long <longman@redhat.com>
---
 Documentation/ABI/testing/sysfs-kernel-slab | 10 +++--
 mm/slab.h                                   |  1 +
 mm/slab_common.c                            | 43 +++++++++++++++++++++
 mm/slub.c                                   |  2 +
 4 files changed, 52 insertions(+), 4 deletions(-)

Comments

Waiman Long July 2, 2019, 6:39 p.m. UTC | #1
On 7/2/19 2:37 PM, Waiman Long wrote:
> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> file to shrink the slab by flushing all the per-cpu slabs and free
> slabs in partial lists. This applies only to the root caches, though.
>
> Extend this capability by shrinking all the child memcg caches and
> the root cache when a value of '2' is written to the shrink sysfs file.
>
> On a 4-socket 112-core 224-thread x86-64 system after a parallel kernel
> build, the amount of memory occupied by slabs before shrinking was:
>
>  # grep task_struct /proc/slabinfo
>  task_struct         7114   7296   7744    4    8 : tunables    0    0
>  0 : slabdata   1824   1824      0
>  # grep "^S[lRU]" /proc/meminfo
>  Slab:            1310444 kB
>  SReclaimable:     377604 kB
>  SUnreclaim:       932840 kB
>
> After shrinking slabs:
>
>  # grep "^S[lRU]" /proc/meminfo
>  Slab:             695652 kB
>  SReclaimable:     322796 kB
>  SUnreclaim:       372856 kB
>  # grep task_struct /proc/slabinfo
>  task_struct         2262   2572   7744    4    8 : tunables    0    0
>  0 : slabdata    643    643      0
>
> Signed-off-by: Waiman Long <longman@redhat.com>

This is a follow-up to my previous patch "mm, slab: Extend
vm/drop_caches to shrink kmem slabs". It is based on the linux-next tree.

-Longman

> ---
>  Documentation/ABI/testing/sysfs-kernel-slab | 10 +++--
>  mm/slab.h                                   |  1 +
>  mm/slab_common.c                            | 43 +++++++++++++++++++++
>  mm/slub.c                                   |  2 +
>  4 files changed, 52 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
> index 29601d93a1c2..2a3d0fc4b4ac 100644
> --- a/Documentation/ABI/testing/sysfs-kernel-slab
> +++ b/Documentation/ABI/testing/sysfs-kernel-slab
> @@ -429,10 +429,12 @@ KernelVersion:	2.6.22
>  Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
>  		Christoph Lameter <cl@linux-foundation.org>
>  Description:
> -		The shrink file is written when memory should be reclaimed from
> -		a cache.  Empty partial slabs are freed and the partial list is
> -		sorted so the slabs with the fewest available objects are used
> -		first.
> +		A value of '1' is written to the shrink file when memory should
> +		be reclaimed from a cache.  Empty partial slabs are freed and
> +		the partial list is sorted so the slabs with the fewest
> +		available objects are used first.  When a value of '2' is
> +		written, all the corresponding child memory cgroup caches
> +		should be shrunk as well.  All other values are invalid.
>  
>  What:		/sys/kernel/slab/cache/slab_size
>  Date:		May 2007
> diff --git a/mm/slab.h b/mm/slab.h
> index 3b22931bb557..a16b2c7ff4dd 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -174,6 +174,7 @@ int __kmem_cache_shrink(struct kmem_cache *);
>  void __kmemcg_cache_deactivate(struct kmem_cache *s);
>  void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
>  void slab_kmem_cache_release(struct kmem_cache *);
> +int kmem_cache_shrink_all(struct kmem_cache *s);
>  
>  struct seq_file;
>  struct file;
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 464faaa9fd81..493697ba1da5 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -981,6 +981,49 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
>  }
>  EXPORT_SYMBOL(kmem_cache_shrink);
>  
> +/**
> + * kmem_cache_shrink_all - shrink a cache and all its memcg children
> + * @s: The root cache to shrink.
> + *
> + * Return: 0 if successful, -EINVAL if not a root cache
> + */
> +int kmem_cache_shrink_all(struct kmem_cache *s)
> +{
> +	struct kmem_cache *c;
> +
> +	if (!IS_ENABLED(CONFIG_MEMCG_KMEM)) {
> +		kmem_cache_shrink(s);
> +		return 0;
> +	}
> +	if (!is_root_cache(s))
> +		return -EINVAL;
> +
> +	/*
> +	 * The caller should have a reference to the root cache and so
> +	 * we don't need to take the slab_mutex. We have to take the
> +	 * slab_mutex, however, to iterate the memcg caches.
> +	 */
> +	get_online_cpus();
> +	get_online_mems();
> +	kasan_cache_shrink(s);
> +	__kmem_cache_shrink(s);
> +
> +	mutex_lock(&slab_mutex);
> +	for_each_memcg_cache(c, s) {
> +		/*
> +		 * Don't need to shrink deactivated memcg caches.
> +		 */
> +		if (c->flags & SLAB_DEACTIVATED)
> +			continue;
> +		kasan_cache_shrink(c);
> +		__kmem_cache_shrink(c);
> +	}
> +	mutex_unlock(&slab_mutex);
> +	put_online_mems();
> +	put_online_cpus();
> +	return 0;
> +}
> +
>  bool slab_is_available(void)
>  {
>  	return slab_state >= UP;
> diff --git a/mm/slub.c b/mm/slub.c
> index a384228ff6d3..5d7b0004c51f 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -5298,6 +5298,8 @@ static ssize_t shrink_store(struct kmem_cache *s,
>  {
>  	if (buf[0] == '1')
>  		kmem_cache_shrink(s);
> +	else if (buf[0] == '2')
> +		kmem_cache_shrink_all(s);
>  	else
>  		return -EINVAL;
>  	return length;
David Rientjes July 2, 2019, 7:09 p.m. UTC | #2
On Tue, 2 Jul 2019, Waiman Long wrote:

> diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
> index 29601d93a1c2..2a3d0fc4b4ac 100644
> --- a/Documentation/ABI/testing/sysfs-kernel-slab
> +++ b/Documentation/ABI/testing/sysfs-kernel-slab
> @@ -429,10 +429,12 @@ KernelVersion:	2.6.22
>  Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
>  		Christoph Lameter <cl@linux-foundation.org>
>  Description:
> -		The shrink file is written when memory should be reclaimed from
> -		a cache.  Empty partial slabs are freed and the partial list is
> -		sorted so the slabs with the fewest available objects are used
> -		first.
> +		A value of '1' is written to the shrink file when memory should
> +		be reclaimed from a cache.  Empty partial slabs are freed and
> +		the partial list is sorted so the slabs with the fewest
> +		available objects are used first.  When a value of '2' is
> +		written, all the corresponding child memory cgroup caches
> +		should be shrunk as well.  All other values are invalid.
>  

This should likely call out that '2' also does '1'; that might not be
clear enough.

>  What:		/sys/kernel/slab/cache/slab_size
>  Date:		May 2007
> diff --git a/mm/slab.h b/mm/slab.h
> index 3b22931bb557..a16b2c7ff4dd 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -174,6 +174,7 @@ int __kmem_cache_shrink(struct kmem_cache *);
>  void __kmemcg_cache_deactivate(struct kmem_cache *s);
>  void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
>  void slab_kmem_cache_release(struct kmem_cache *);
> +int kmem_cache_shrink_all(struct kmem_cache *s);
>  
>  struct seq_file;
>  struct file;
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 464faaa9fd81..493697ba1da5 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -981,6 +981,49 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
>  }
>  EXPORT_SYMBOL(kmem_cache_shrink);
>  
> +/**
> + * kmem_cache_shrink_all - shrink a cache and all its memcg children
> + * @s: The root cache to shrink.
> + *
> + * Return: 0 if successful, -EINVAL if not a root cache
> + */
> +int kmem_cache_shrink_all(struct kmem_cache *s)
> +{
> +	struct kmem_cache *c;
> +
> +	if (!IS_ENABLED(CONFIG_MEMCG_KMEM)) {
> +		kmem_cache_shrink(s);
> +		return 0;
> +	}
> +	if (!is_root_cache(s))
> +		return -EINVAL;
> +
> +	/*
> +	 * The caller should have a reference to the root cache and so
> +	 * we don't need to take the slab_mutex. We have to take the
> +	 * slab_mutex, however, to iterate the memcg caches.
> +	 */
> +	get_online_cpus();
> +	get_online_mems();
> +	kasan_cache_shrink(s);
> +	__kmem_cache_shrink(s);
> +
> +	mutex_lock(&slab_mutex);
> +	for_each_memcg_cache(c, s) {
> +		/*
> +		 * Don't need to shrink deactivated memcg caches.
> +		 */
> +	if (c->flags & SLAB_DEACTIVATED)
> +			continue;
> +		kasan_cache_shrink(c);
> +		__kmem_cache_shrink(c);
> +	}
> +	mutex_unlock(&slab_mutex);
> +	put_online_mems();
> +	put_online_cpus();
> +	return 0;
> +}
> +
>  bool slab_is_available(void)
>  {
>  	return slab_state >= UP;

I'm wondering how long this could take, i.e. how long we hold slab_mutex 
while we traverse each cache and shrink it.

Acked-by: David Rientjes <rientjes@google.com>
Waiman Long July 2, 2019, 7:15 p.m. UTC | #3
On 7/2/19 3:09 PM, David Rientjes wrote:
> On Tue, 2 Jul 2019, Waiman Long wrote:
>
>> diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
>> index 29601d93a1c2..2a3d0fc4b4ac 100644
>> --- a/Documentation/ABI/testing/sysfs-kernel-slab
>> +++ b/Documentation/ABI/testing/sysfs-kernel-slab
>> @@ -429,10 +429,12 @@ KernelVersion:	2.6.22
>>  Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
>>  		Christoph Lameter <cl@linux-foundation.org>
>>  Description:
>> -		The shrink file is written when memory should be reclaimed from
>> -		a cache.  Empty partial slabs are freed and the partial list is
>> -		sorted so the slabs with the fewest available objects are used
>> -		first.
>> +		A value of '1' is written to the shrink file when memory should
>> +		be reclaimed from a cache.  Empty partial slabs are freed and
>> +		the partial list is sorted so the slabs with the fewest
>> +		available objects are used first.  When a value of '2' is
>> +		written, all the corresponding child memory cgroup caches
>> +		should be shrunk as well.  All other values are invalid.
>>  
> This should likely call out that '2' also does '1'; that might not be 
> clear enough.

You are right. I will reword the text to make it clearer.


>>  What:		/sys/kernel/slab/cache/slab_size
>>  Date:		May 2007
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 3b22931bb557..a16b2c7ff4dd 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -174,6 +174,7 @@ int __kmem_cache_shrink(struct kmem_cache *);
>>  void __kmemcg_cache_deactivate(struct kmem_cache *s);
>>  void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
>>  void slab_kmem_cache_release(struct kmem_cache *);
>> +int kmem_cache_shrink_all(struct kmem_cache *s);
>>  
>>  struct seq_file;
>>  struct file;
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index 464faaa9fd81..493697ba1da5 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -981,6 +981,49 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
>>  }
>>  EXPORT_SYMBOL(kmem_cache_shrink);
>>  
>> +/**
>> + * kmem_cache_shrink_all - shrink a cache and all its memcg children
>> + * @s: The root cache to shrink.
>> + *
>> + * Return: 0 if successful, -EINVAL if not a root cache
>> + */
>> +int kmem_cache_shrink_all(struct kmem_cache *s)
>> +{
>> +	struct kmem_cache *c;
>> +
>> +	if (!IS_ENABLED(CONFIG_MEMCG_KMEM)) {
>> +		kmem_cache_shrink(s);
>> +		return 0;
>> +	}
>> +	if (!is_root_cache(s))
>> +		return -EINVAL;
>> +
>> +	/*
>> +	 * The caller should have a reference to the root cache and so
>> +	 * we don't need to take the slab_mutex. We have to take the
>> +	 * slab_mutex, however, to iterate the memcg caches.
>> +	 */
>> +	get_online_cpus();
>> +	get_online_mems();
>> +	kasan_cache_shrink(s);
>> +	__kmem_cache_shrink(s);
>> +
>> +	mutex_lock(&slab_mutex);
>> +	for_each_memcg_cache(c, s) {
>> +		/*
>> +		 * Don't need to shrink deactivated memcg caches.
>> +		 */
>> +	if (c->flags & SLAB_DEACTIVATED)
>> +			continue;
>> +		kasan_cache_shrink(c);
>> +		__kmem_cache_shrink(c);
>> +	}
>> +	mutex_unlock(&slab_mutex);
>> +	put_online_mems();
>> +	put_online_cpus();
>> +	return 0;
>> +}
>> +
>>  bool slab_is_available(void)
>>  {
>>  	return slab_state >= UP;
> I'm wondering how long this could take, i.e. how long we hold slab_mutex 
> while we traverse each cache and shrink it.

It will depend on how many memcg caches there are. Actually, I have
been thinking about using the show method to report the time spent in the
last shrink operation. I am just not sure if it is worth doing. What do
you think?

-Longman
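
A rough sketch of the timing idea floated above, assuming a new u64
last_shrink_ms field in struct kmem_cache (the field is hypothetical
and not part of any posted patch):

	static ssize_t shrink_show(struct kmem_cache *s, char *buf)
	{
		/* report how long the last shrink of this cache took */
		return sprintf(buf, "%llu\n", s->last_shrink_ms);
	}

	static ssize_t shrink_store(struct kmem_cache *s,
				    const char *buf, size_t length)
	{
		u64 start = ktime_get_ns();

		if (buf[0] == '1')
			kmem_cache_shrink(s);
		else if (buf[0] == '2')
			kmem_cache_shrink_all(s);
		else
			return -EINVAL;
		/* remember the duration of the shrink, in milliseconds */
		s->last_shrink_ms = div_u64(ktime_get_ns() - start,
					    NSEC_PER_MSEC);
		return length;
	}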
Roman Gushchin July 2, 2019, 7:30 p.m. UTC | #4
On Tue, Jul 02, 2019 at 02:37:30PM -0400, Waiman Long wrote:
> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> file to shrink the slab by flushing all the per-cpu slabs and free
> slabs in partial lists. This applies only to the root caches, though.
>
> Extend this capability by shrinking all the child memcg caches and
> the root cache when a value of '2' is written to the shrink sysfs file.
>
> On a 4-socket 112-core 224-thread x86-64 system after a parallel kernel
> build, the amount of memory occupied by slabs before shrinking was:
> 
>  # grep task_struct /proc/slabinfo
>  task_struct         7114   7296   7744    4    8 : tunables    0    0
>  0 : slabdata   1824   1824      0
>  # grep "^S[lRU]" /proc/meminfo
>  Slab:            1310444 kB
>  SReclaimable:     377604 kB
>  SUnreclaim:       932840 kB
> 
> After shrinking slabs:
> 
>  # grep "^S[lRU]" /proc/meminfo
>  Slab:             695652 kB
>  SReclaimable:     322796 kB
>  SUnreclaim:       372856 kB
>  # grep task_struct /proc/slabinfo
>  task_struct         2262   2572   7744    4    8 : tunables    0    0
>  0 : slabdata    643    643      0
> 
> Signed-off-by: Waiman Long <longman@redhat.com>

Acked-by: Roman Gushchin <guro@fb.com>

Thanks, Waiman!
Andrew Morton July 2, 2019, 8:03 p.m. UTC | #5
On Tue,  2 Jul 2019 14:37:30 -0400 Waiman Long <longman@redhat.com> wrote:

> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> file to shrink the slab by flushing all the per-cpu slabs and free
> slabs in partial lists. This applies only to the root caches, though.
>
> Extend this capability by shrinking all the child memcg caches and
> the root cache when a value of '2' is written to the shrink sysfs file.

Why?

Please fully describe the value of the proposed feature to our users.
Always.

> 
> ...
>
> --- a/Documentation/ABI/testing/sysfs-kernel-slab
> +++ b/Documentation/ABI/testing/sysfs-kernel-slab
> @@ -429,10 +429,12 @@ KernelVersion:	2.6.22
>  Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
>  		Christoph Lameter <cl@linux-foundation.org>
>  Description:
> -		The shrink file is written when memory should be reclaimed from
> -		a cache.  Empty partial slabs are freed and the partial list is
> -		sorted so the slabs with the fewest available objects are used
> -		first.
> +		A value of '1' is written to the shrink file when memory should
> +		be reclaimed from a cache.  Empty partial slabs are freed and
> +		the partial list is sorted so the slabs with the fewest
> +		available objects are used first.  When a value of '2' is
> +		written, all the corresponding child memory cgroup caches
> +		should be shrunk as well.  All other values are invalid.

One would expect this to be a bitfield, like /proc/sys/vm/drop_caches. 
So writing 3 does both forms of shrinking.

Yes, it happens to be the case that 2 is a superset of 1, but what
happens if we add "4"?
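
A minimal sketch of that bitfield parsing, reusing the
kmem_cache_shrink_all() helper from this patch (illustration only, not
posted code):

	static ssize_t shrink_store(struct kmem_cache *s,
				    const char *buf, size_t length)
	{
		unsigned long mask;

		if (kstrtoul(buf, 10, &mask) || !mask || (mask & ~3UL))
			return -EINVAL;
		if (mask & 2)		/* bit 1: root cache plus memcg children */
			kmem_cache_shrink_all(s);
		else if (mask & 1)	/* bit 0: root cache only */
			kmem_cache_shrink(s);
		return length;
	}

Since kmem_cache_shrink_all() already covers the root cache, bit 1
subsumes bit 0 here; a strict bitfield would need a children-only
helper instead.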
Waiman Long July 2, 2019, 8:44 p.m. UTC | #6
On 7/2/19 4:03 PM, Andrew Morton wrote:
> On Tue,  2 Jul 2019 14:37:30 -0400 Waiman Long <longman@redhat.com> wrote:
>
>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
>> file to shrink the slab by flushing all the per-cpu slabs and free
>> slabs in partial lists. This applies only to the root caches, though.
>>
>> Extend this capability by shrinking all the child memcg caches and
>> the root cache when a value of '2' is written to the shrink sysfs file.
> Why?
>
> Please fully describe the value of the proposed feature to our users.
> Always.

Sure. Essentially, the sysfs shrink interface is not complete. It allows
the root cache to be shrunk, but not any of the memcg caches. 

The same can also be said for other slab sysfs files which show current
cache status. I don't think sysfs files are created for the memcg
caches, but I may be wrong. In many cases, the information is available
elsewhere, such as in the slabinfo file. The shrink operation, however,
has no alternative available.

>> ...
>>
>> --- a/Documentation/ABI/testing/sysfs-kernel-slab
>> +++ b/Documentation/ABI/testing/sysfs-kernel-slab
>> @@ -429,10 +429,12 @@ KernelVersion:	2.6.22
>>  Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
>>  		Christoph Lameter <cl@linux-foundation.org>
>>  Description:
>> -		The shrink file is written when memory should be reclaimed from
>> -		a cache.  Empty partial slabs are freed and the partial list is
>> -		sorted so the slabs with the fewest available objects are used
>> -		first.
>> +		A value of '1' is written to the shrink file when memory should
>> +		be reclaimed from a cache.  Empty partial slabs are freed and
>> +		the partial list is sorted so the slabs with the fewest
>> +		available objects are used first.  When a value of '2' is
>> +		written, all the corresponding child memory cgroup caches
>> +		should be shrunk as well.  All other values are invalid.
> One would expect this to be a bitfield, like /proc/sys/vm/drop_caches. 
> So writing 3 does both forms of shrinking.
>
> Yes, it happens to be the case that 2 is a superset of 1, but what
> about if we add "4"?
>
Yes, I can make it a bit field of 2 bits, just like
/proc/sys/vm/drop_caches.

Cheers,
Longman
Andrew Morton July 2, 2019, 9:33 p.m. UTC | #7
On Tue, 2 Jul 2019 16:44:24 -0400 Waiman Long <longman@redhat.com> wrote:

> On 7/2/19 4:03 PM, Andrew Morton wrote:
> > On Tue,  2 Jul 2019 14:37:30 -0400 Waiman Long <longman@redhat.com> wrote:
> >
> >> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> >> file to shrink the slab by flushing all the per-cpu slabs and free
> >> slabs in partial lists. This applies only to the root caches, though.
> >>
> >> Extend this capability by shrinking all the child memcg caches and
> >> the root cache when a value of '2' is written to the shrink sysfs file.
> > Why?
> >
> > Please fully describe the value of the proposed feature to our users.
> > Always.
> 
> Sure. Essentially, the sysfs shrink interface is not complete. It allows
> the root cache to be shrunk, but not any of the memcg caches. 

But that doesn't describe anything of value.  Who wants to use this,
and why?  How will it be used?  What are the use-cases?
Michal Hocko July 3, 2019, 6:56 a.m. UTC | #8
On Tue 02-07-19 14:37:30, Waiman Long wrote:
> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> file to shrink the slab by flushing all the per-cpu slabs and free
> slabs in partial lists. This applies only to the root caches, though.
>
> Extend this capability by shrinking all the child memcg caches and
> the root cache when a value of '2' is written to the shrink sysfs file.

Why do we need a new value for this functionality? I would tend to think
that skipping memcg caches is a bug/incomplete implementation. Or is it
a deliberate decision to cover root caches only?
Waiman Long July 3, 2019, 1:12 p.m. UTC | #9
On 7/3/19 2:56 AM, Michal Hocko wrote:
> On Tue 02-07-19 14:37:30, Waiman Long wrote:
>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
>> file to shrink the slab by flushing all the per-cpu slabs and free
>> slabs in partial lists. This applies only to the root caches, though.
>>
>> Extend this capability by shrinking all the child memcg caches and
>> the root cache when a value of '2' is written to the shrink sysfs file.
> Why do we need a new value for this functionality? I would tend to think
> that skipping memcg caches is a bug/incomplete implementation. Or is it
> a deliberate decision to cover root caches only?

It is just that I don't want to change the existing behavior of the
current code. It will definitely take longer to shrink both the root
cache and the memcg caches. If we all agree that the only sensible
operation is to shrink the root cache and the memcg caches together,
I am fine with just adding memcg shrink without changing the sysfs
interface definition and being done with it.

Cheers,
Longman
Michal Hocko July 3, 2019, 2:37 p.m. UTC | #10
On Wed 03-07-19 09:12:13, Waiman Long wrote:
> On 7/3/19 2:56 AM, Michal Hocko wrote:
> > On Tue 02-07-19 14:37:30, Waiman Long wrote:
> >> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> >> file to shrink the slab by flushing all the per-cpu slabs and free
> >> slabs in partial lists. This applies only to the root caches, though.
> >>
> >> Extend this capability by shrinking all the child memcg caches and
> >> the root cache when a value of '2' is written to the shrink sysfs file.
> > Why do we need a new value for this functionality? I would tend to think
> > that skipping memcg caches is a bug/incomplete implementation. Or is it
> > a deliberate decision to cover root caches only?
> 
> It is just that I don't want to change the existing behavior of the
> current code. It will definitely take longer to shrink both the root
> cache and the memcg caches.

Does that matter? To whom and why? I do not expect this interface to be
used heavily.

> If we all agree that the only sensible
> operation is to shrink the root cache and the memcg caches together,
> I am fine with just adding memcg shrink without changing the sysfs
> interface definition and being done with it.

The existing documentation is really modest on the actual semantics:
Description:
                The shrink file is written when memory should be reclaimed from
                a cache.  Empty partial slabs are freed and the partial list is
                sorted so the slabs with the fewest available objects are used
                first.

which to me sounds like all slabs are freed and nobody should really be
thinking of memcgs. This is simply a drop_caches kind of thing. We surely
do not want to drop caches only for the root memcg for
/proc/sys/vm/drop_caches, right?
Waiman Long July 3, 2019, 3:14 p.m. UTC | #11
On 7/3/19 10:37 AM, Michal Hocko wrote:
> On Wed 03-07-19 09:12:13, Waiman Long wrote:
>> On 7/3/19 2:56 AM, Michal Hocko wrote:
>>> On Tue 02-07-19 14:37:30, Waiman Long wrote:
>>>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
>>>> file to shrink the slab by flushing all the per-cpu slabs and free
>>>> slabs in partial lists. This applies only to the root caches, though.
>>>>
>>>> Extend this capability by shrinking all the child memcg caches and
>>>> the root cache when a value of '2' is written to the shrink sysfs file.
>>> Why do we need a new value for this functionality? I would tend to think
>>> that skipping memcg caches is a bug/incomplete implementation. Or is it
>>> a deliberate decision to cover root caches only?
>> It is just that I don't want to change the existing behavior of the
>> current code. It will definitely take longer to shrink both the root
>> cache and the memcg caches.
> Does that matter? To whom and why? I do not expect this interface to be
> used heavily.
The only concern that I can see is the fact that I need to take the
slab_mutex when iterating the memcg list to prevent concurrent
modification. That may have some impact on other applications running in
the system. However, I can put a cautionary statement in the user
documentation to discuss the potential performance impact.
>> If we all agree that the only sensible
>> operation is to shrink the root cache and the memcg caches together,
>> I am fine with just adding memcg shrink without changing the sysfs
>> interface definition and being done with it.
> The existing documentation is really modest on the actual semantic:
> Description:
>                 The shrink file is written when memory should be reclaimed from
>                 a cache.  Empty partial slabs are freed and the partial list is
>                 sorted so the slabs with the fewest available objects are used
>                 first.
>
> which to me sounds like all slabs are free and nobody should be really
> thinking of memcgs. This is simply drop_caches kinda thing. We surely do
> not want to drop caches only for the root memcg for /proc/sys/vm/drop_caches
> right?
>
I am planning to reword the document to make the effect of using this
sysfs file more explicit.

Cheers,
Longman
Waiman Long July 3, 2019, 3:21 p.m. UTC | #12
On 7/2/19 5:33 PM, Andrew Morton wrote:
> On Tue, 2 Jul 2019 16:44:24 -0400 Waiman Long <longman@redhat.com> wrote:
>
>> On 7/2/19 4:03 PM, Andrew Morton wrote:
>>> On Tue,  2 Jul 2019 14:37:30 -0400 Waiman Long <longman@redhat.com> wrote:
>>>
>>>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
>>>> file to shrink the slab by flushing all the per-cpu slabs and free
>>>> slabs in partial lists. This applies only to the root caches, though.
>>>>
>>>> Extend this capability by shrinking all the child memcg caches and
>>>> the root cache when a value of '2' is written to the shrink sysfs file.
>>> Why?
>>>
>>> Please fully describe the value of the proposed feature to our users.
>>> Always.
>> Sure. Essentially, the sysfs shrink interface is not complete. It allows
>> the root cache to be shrunk, but not any of the memcg caches. 
> But that doesn't describe anything of value.  Who wants to use this,
> and why?  How will it be used?  What are the use-cases?
>
For me, the primary motivation for posting this patch is to have a way to
make the number of active objects reported in /proc/slabinfo more
accurately reflect the number of objects that are actually being used by
the kernel. When measuring changes in slab object consumption between
successive runs of a certain workload, I can more easily see the amount
of increase. Without that, the data will have much more noise and it
will be harder to see a pattern.

Cheers,
Longman
Michal Hocko July 3, 2019, 3:53 p.m. UTC | #13
On Wed 03-07-19 11:21:16, Waiman Long wrote:
> On 7/2/19 5:33 PM, Andrew Morton wrote:
> > On Tue, 2 Jul 2019 16:44:24 -0400 Waiman Long <longman@redhat.com> wrote:
> >
> >> On 7/2/19 4:03 PM, Andrew Morton wrote:
> >>> On Tue,  2 Jul 2019 14:37:30 -0400 Waiman Long <longman@redhat.com> wrote:
> >>>
> >>>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> >>>> file to shrink the slab by flushing all the per-cpu slabs and free
> >>>> slabs in partial lists. This applies only to the root caches, though.
> >>>>
> >>>> Extend this capability by shrinking all the child memcg caches and
> >>>> the root cache when a value of '2' is written to the shrink sysfs file.
> >>> Why?
> >>>
> >>> Please fully describe the value of the proposed feature to our users.
> >>> Always.
> >> Sure. Essentially, the sysfs shrink interface is not complete. It allows
> >> the root cache to be shrunk, but not any of the memcg caches. 
> > But that doesn't describe anything of value.  Who wants to use this,
> > and why?  How will it be used?  What are the use-cases?
> >
> For me, the primary motivation of posting this patch is to have a way to
> make the number of active objects reported in /proc/slabinfo more
> accurately reflect the number of objects that are actually being used by
> the kernel.

I believe we have been through that. If the number is inexact due to
caching then let's fix slabinfo rather than trick around it and teach
people to do a magic write to some file that will "solve" a problem.
This is exactly what drop_caches turned out to be in fact. People just
got used to dropping caches because they were told to by $random web page.
So really, think about the underlying problem and try to fix it.

It is true that you could argue that this patch is actually fixing the
existing interface because it doesn't really do what it is documented to
do, and on those grounds I would agree with the change. But do not teach
people that they have to write to some file to get proper numbers,
because that is just a bad idea and it will kick back the same way
drop_caches did.
Christoph Lameter (Ampere) July 3, 2019, 4:10 p.m. UTC | #14
On Wed, 3 Jul 2019, Waiman Long wrote:

> On 7/3/19 2:56 AM, Michal Hocko wrote:
> > On Tue 02-07-19 14:37:30, Waiman Long wrote:
> >> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> >> file to shrink the slab by flushing all the per-cpu slabs and free
> >> slabs in partial lists. This applies only to the root caches, though.
> >>
> >> Extend this capability by shrinking all the child memcg caches and
> >> the root cache when a value of '2' is written to the shrink sysfs file.
> > Why do we need a new value for this functionality? I would tend to think
> > that skipping memcg caches is a bug/incomplete implementation. Or is it
> > a deliberate decision to cover root caches only?
>
> It is just that I don't want to change the existing behavior of the
> current code. It will definitely take longer to shrink both the root
> cache and the memcg caches. If we all agree that the only sensible
> operation is to shrink the root cache and the memcg caches together,
> I am fine with just adding memcg shrink without changing the sysfs
> interface definition and being done with it.

I think it's best and consistent behavior to shrink all memcg caches
with the root cache. This looks like an oversight and thus a bugfix.
Waiman Long July 3, 2019, 4:13 p.m. UTC | #15
On 7/3/19 12:10 PM, Christopher Lameter wrote:
> On Wed, 3 Jul 2019, Waiman Long wrote:
>
>> On 7/3/19 2:56 AM, Michal Hocko wrote:
>>> On Tue 02-07-19 14:37:30, Waiman Long wrote:
>>>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
>>>> file to shrink the slab by flushing all the per-cpu slabs and free
>>>> slabs in partial lists. This applies only to the root caches, though.
>>>>
>>>> Extend this capability by shrinking all the child memcg caches and
>>>> the root cache when a value of '2' is written to the shrink sysfs file.
>>> Why do we need a new value for this functionality? I would tend to think
>>> that skipping memcg caches is a bug/incomplete implementation. Or is it
>>> a deliberate decision to cover root caches only?
>> It is just that I don't want to change the existing behavior of the
>> current code. It will definitely take longer to shrink both the root
>> cache and the memcg caches. If we all agree that the only sensible
>> operation is to shrink the root cache and the memcg caches together,
>> I am fine with just adding memcg shrink without changing the sysfs
>> interface definition and being done with it.
> I think it's best and consistent behavior to shrink all memcg caches
> with the root cache. This looks like an oversight and thus a bugfix.
>
Yes, that is what I am now planning to do for the next version of the patch.

Cheers,
Longman
Waiman Long July 3, 2019, 4:16 p.m. UTC | #16
On 7/3/19 11:53 AM, Michal Hocko wrote:
> On Wed 03-07-19 11:21:16, Waiman Long wrote:
>> On 7/2/19 5:33 PM, Andrew Morton wrote:
>>> On Tue, 2 Jul 2019 16:44:24 -0400 Waiman Long <longman@redhat.com> wrote:
>>>
>>>> On 7/2/19 4:03 PM, Andrew Morton wrote:
>>>>> On Tue,  2 Jul 2019 14:37:30 -0400 Waiman Long <longman@redhat.com> wrote:
>>>>>
>>>>>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
>>>>>> file to shrink the slab by flushing all the per-cpu slabs and free
>>>>>> slabs in partial lists. This applies only to the root caches, though.
>>>>>>
>>>>>> Extend this capability by shrinking all the child memcg caches and
>>>>>> the root cache when a value of '2' is written to the shrink sysfs file.
>>>>> Why?
>>>>>
>>>>> Please fully describe the value of the proposed feature to our users.
>>>>> Always.
>>>> Sure. Essentially, the sysfs shrink interface is not complete. It allows
>>>> the root cache to be shrunk, but not any of the memcg caches. 
>>> But that doesn't describe anything of value.  Who wants to use this,
>>> and why?  How will it be used?  What are the use-cases?
>>>
>> For me, the primary motivation of posting this patch is to have a way to
>> make the number of active objects reported in /proc/slabinfo more
>> accurately reflect the number of objects that are actually being used by
>> the kernel.
> I believe we have been through that. If the number is inexact due to
> caching then let's fix slabinfo rather than trick around it and teach
> people to do a magic write to some file that will "solve" a problem.
> This is exactly what drop_caches turned out to be in fact. People just
> got used to drop caches because they were told so by $random web page.
> So really, think about the underlying problem and try to fix it.
>
> It is true that you could argue that this patch is actually fixing the
> existing interface because it doesn't really do what it is documented to
> do and on those grounds I would agree with the change.

I do think that we should correct the shrink file to do what it is
designed to do and include the memcg caches as well.


>  But do not teach
> people that they have to write to some file to get proper numbers.
> Because that is just a bad idea and it will kick back the same way
> drop_caches.

The /proc/slabinfo file is a well-known file that is probably used
relatively extensively. Making it scan through all the per-cpu
structures will probably cause performance issues, as the slab_mutex has
to be taken for the whole duration of the scan. That could have
undesirable side effects.

Instead, I am thinking about extending the slab/objects sysfs file to
also show the number of objects held by the per-cpu structures, so that
we can get an accurate count by subtracting it from the reported
active objects. That will have a more limited performance impact, as it
is just one kmem cache instead of all the kmem caches in the system.
Also, the sysfs files are not as commonly used as slabinfo. That will be
another patch in the near future.

Cheers,
Longman
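
A rough sketch of the objects-file extension described above, patterned
after the per-cpu walk that show_slab_objects() in mm/slub.c already
does (illustration only; this counts every object slot on the per-cpu
slabs, and a finer-grained version would subtract page->inuse):

	static unsigned long count_cpu_slab_objects(struct kmem_cache *s)
	{
		unsigned long x = 0;
		int cpu;

		for_each_online_cpu(cpu) {
			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
			struct page *page;

			page = READ_ONCE(c->page);
			if (page)	/* objects on the active cpu slab */
				x += page->objects;

			page = slub_percpu_partial_read_once(c);
			if (page)	/* objects on the cpu partial list */
				x += page->pobjects;
		}
		return x;
	}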
Michal Hocko July 4, 2019, 7:37 a.m. UTC | #17
On Wed 03-07-19 12:16:09, Waiman Long wrote:
> On 7/3/19 11:53 AM, Michal Hocko wrote:
> > On Wed 03-07-19 11:21:16, Waiman Long wrote:
> >> On 7/2/19 5:33 PM, Andrew Morton wrote:
> >>> On Tue, 2 Jul 2019 16:44:24 -0400 Waiman Long <longman@redhat.com> wrote:
> >>>
> >>>> On 7/2/19 4:03 PM, Andrew Morton wrote:
> >>>>> On Tue,  2 Jul 2019 14:37:30 -0400 Waiman Long <longman@redhat.com> wrote:
> >>>>>
> >>>>>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> >>>>>> file to shrink the slab by flushing all the per-cpu slabs and free
> >>>>>> slabs in partial lists. This applies only to the root caches, though.
> >>>>>>
> >>>>>> Extend this capability by shrinking all the child memcg caches and
> >>>>>> the root cache when a value of '2' is written to the shrink sysfs file.
> >>>>> Why?
> >>>>>
> >>>>> Please fully describe the value of the proposed feature to our users.
> >>>>> Always.
> >>>> Sure. Essentially, the sysfs shrink interface is not complete. It allows
> >>>> the root cache to be shrunk, but not any of the memcg caches. 
> >>> But that doesn't describe anything of value.  Who wants to use this,
> >>> and why?  How will it be used?  What are the use-cases?
> >>>
> >> For me, the primary motivation of posting this patch is to have a way to
> >> make the number of active objects reported in /proc/slabinfo more
> >> accurately reflect the number of objects that are actually being used by
> >> the kernel.
> > I believe we have been through that. If the number is inexact due to
> > caching then let's fix slabinfo rather than trick around it and teach
> > people to do a magic write to some file that will "solve" a problem.
> > This is exactly what drop_caches turned out to be in fact. People just
> > got used to drop caches because they were told so by $random web page.
> > So really, think about the underlying problem and try to fix it.
> >
> > It is true that you could argue that this patch is actually fixing the
> > existing interface because it doesn't really do what it is documented to
> > do and on those grounds I would agree with the change.
> 
> I do think that we should correct the shrink file to do what it is
> designed to do and include the memcg caches as well.
> 
> 
> >  But do not teach
> > people that they have to write to some file to get proper numbers.
> > Because that is just a bad idea and it will kick back the same way
> > drop_caches.
> 
> The /proc/slabinfo file is a well-known file that is probably used
> relatively extensively. Making it scan through all the per-cpu
> structures will probably cause performance issues as the slab_mutex has
> to be taken during the whole duration of the scan. That could have
> undesirable side effects.

Please be more specific, ideally with some numbers. Also, if collecting
the data is too expensive, why can we not simply account the cached
object count in a per-cpu manner?
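
For illustration, that per-cpu accounting could look roughly like the
sketch below; the cached_objects counter in struct kmem_cache_cpu is
hypothetical, and a real implementation would have to cope with
page->inuse changing while a slab sits in the per-cpu state:

	/* called when a slab enters (cached == true) or leaves the per-cpu state */
	static void account_cpu_cached(struct kmem_cache *s,
				       struct page *page, bool cached)
	{
		unsigned int nr_free = page->objects - page->inuse;

		if (cached)
			this_cpu_add(s->cpu_slab->cached_objects, nr_free);
		else
			this_cpu_sub(s->cpu_slab->cached_objects, nr_free);
	}

A reader such as /proc/slabinfo could then sum the counters with
for_each_possible_cpu() instead of scanning the per-cpu structures
directly.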

> Instead, I am thinking about extending the slab/objects sysfs file to
> also show the number of objects held by the per-cpu structures and
> thus we can get an accurate count by subtracting it from the reported
> active objects. That will have a more limited performance impact as it
> is just one kmem cache instead of all the kmem caches in the system.
> Also the sysfs files are not as commonly used as slabinfo. That will be
> another patch in the near future.

Both are root only, and once it is widespread that slabinfo doesn't
provide precise data you can expect tools will try to fix that by adding
other file(s), and we are back to square one, no? In other words
slabinfo
Peter Enderborg July 22, 2019, 12:46 p.m. UTC | #18
On 7/2/19 8:37 PM, Waiman Long wrote:
> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
> file to shrink the slab by flushing all the per-cpu slabs and free
> slabs in partial lists. This applies only to the root caches, though.
>
> Extend this capability by shrinking all the child memcg caches and
> the root cache when a value of '2' is written to the shrink sysfs file.
>
> On a 4-socket 112-core 224-thread x86-64 system after a parallel kernel
> build, the amount of memory occupied by slabs before shrinking was:
>
>  # grep task_struct /proc/slabinfo
>  task_struct         7114   7296   7744    4    8 : tunables    0    0
>  0 : slabdata   1824   1824      0
>  # grep "^S[lRU]" /proc/meminfo
>  Slab:            1310444 kB
>  SReclaimable:     377604 kB
>  SUnreclaim:       932840 kB
>
> After shrinking slabs:
>
>  # grep "^S[lRU]" /proc/meminfo
>  Slab:             695652 kB
>  SReclaimable:     322796 kB
>  SUnreclaim:       372856 kB
>  # grep task_struct /proc/slabinfo
>  task_struct         2262   2572   7744    4    8 : tunables    0    0
>  0 : slabdata    643    643      0


What is the time between these measurement points? Shouldn't the shrunken memory show up as reclaimable?


> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  Documentation/ABI/testing/sysfs-kernel-slab | 10 +++--
>  mm/slab.h                                   |  1 +
>  mm/slab_common.c                            | 43 +++++++++++++++++++++
>  mm/slub.c                                   |  2 +
>  4 files changed, 52 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
> index 29601d93a1c2..2a3d0fc4b4ac 100644
> --- a/Documentation/ABI/testing/sysfs-kernel-slab
> +++ b/Documentation/ABI/testing/sysfs-kernel-slab
> @@ -429,10 +429,12 @@ KernelVersion:	2.6.22
>  Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
>  		Christoph Lameter <cl@linux-foundation.org>
>  Description:
> -		The shrink file is written when memory should be reclaimed from
> -		a cache.  Empty partial slabs are freed and the partial list is
> -		sorted so the slabs with the fewest available objects are used
> -		first.
> +		A value of '1' is written to the shrink file when memory should
> +		be reclaimed from a cache.  Empty partial slabs are freed and
> +		the partial list is sorted so the slabs with the fewest
> +		available objects are used first.  When a value of '2' is
> +		written, all the corresponding child memory cgroup caches
> +		should be shrunk as well.  All other values are invalid.
>  
>  What:		/sys/kernel/slab/cache/slab_size
>  Date:		May 2007
> diff --git a/mm/slab.h b/mm/slab.h
> index 3b22931bb557..a16b2c7ff4dd 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -174,6 +174,7 @@ int __kmem_cache_shrink(struct kmem_cache *);
>  void __kmemcg_cache_deactivate(struct kmem_cache *s);
>  void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
>  void slab_kmem_cache_release(struct kmem_cache *);
> +int kmem_cache_shrink_all(struct kmem_cache *s);
>  
>  struct seq_file;
>  struct file;
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 464faaa9fd81..493697ba1da5 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -981,6 +981,49 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
>  }
>  EXPORT_SYMBOL(kmem_cache_shrink);
>  
> +/**
> + * kmem_cache_shrink_all - shrink a cache and all its memcg children
> + * @s: The root cache to shrink.
> + *
> + * Return: 0 if successful, -EINVAL if not a root cache
> + */
> +int kmem_cache_shrink_all(struct kmem_cache *s)
> +{
> +	struct kmem_cache *c;
> +
> +	if (!IS_ENABLED(CONFIG_MEMCG_KMEM)) {
> +		kmem_cache_shrink(s);
> +		return 0;
> +	}
> +	if (!is_root_cache(s))
> +		return -EINVAL;
> +
> +	/*
> +	 * The caller should have a reference to the root cache and so
> +	 * we don't need to take the slab_mutex. We have to take the
> +	 * slab_mutex, however, to iterate the memcg caches.
> +	 */
> +	get_online_cpus();
> +	get_online_mems();
> +	kasan_cache_shrink(s);
> +	__kmem_cache_shrink(s);
> +
> +	mutex_lock(&slab_mutex);
> +	for_each_memcg_cache(c, s) {
> +		/*
> +		 * Don't need to shrink deactivated memcg caches.
> +		 */
> +	if (c->flags & SLAB_DEACTIVATED)
> +			continue;
> +		kasan_cache_shrink(c);
> +		__kmem_cache_shrink(c);
> +	}
> +	mutex_unlock(&slab_mutex);
> +	put_online_mems();
> +	put_online_cpus();
> +	return 0;
> +}
> +
>  bool slab_is_available(void)
>  {
>  	return slab_state >= UP;
> diff --git a/mm/slub.c b/mm/slub.c
> index a384228ff6d3..5d7b0004c51f 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -5298,6 +5298,8 @@ static ssize_t shrink_store(struct kmem_cache *s,
>  {
>  	if (buf[0] == '1')
>  		kmem_cache_shrink(s);
> +	else if (buf[0] == '2')
> +		kmem_cache_shrink_all(s);
>  	else
>  		return -EINVAL;
>  	return length;
Waiman Long July 23, 2019, 2:30 p.m. UTC | #19
On 7/22/19 8:46 AM, peter enderborg wrote:
> On 7/2/19 8:37 PM, Waiman Long wrote:
>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
>> file to shrink the slab by flushing all the per-cpu slabs and free
>> slabs in partial lists. This applies only to the root caches, though.
>>
>> Extend this capability by shrinking all the child memcg caches and
>> the root cache when a value of '2' is written to the shrink sysfs file.
>>
>> On a 4-socket 112-core 224-thread x86-64 system after a parallel kernel
>> build, the amount of memory occupied by slabs before shrinking was:
>>
>>  # grep task_struct /proc/slabinfo
>>  task_struct         7114   7296   7744    4    8 : tunables    0    0
>>  0 : slabdata   1824   1824      0
>>  # grep "^S[lRU]" /proc/meminfo
>>  Slab:            1310444 kB
>>  SReclaimable:     377604 kB
>>  SUnreclaim:       932840 kB
>>
>> After shrinking slabs:
>>
>>  # grep "^S[lRU]" /proc/meminfo
>>  Slab:             695652 kB
>>  SReclaimable:     322796 kB
>>  SUnreclaim:       372856 kB
>>  # grep task_struct /proc/slabinfo
>>  task_struct         2262   2572   7744    4    8 : tunables    0    0
>>  0 : slabdata    643    643      0
>
> What is the time between these measurement points? Shouldn't the shrunken memory show up as reclaimable?

In this case, I echoed '2' to all the shrink sysfs files under
/sys/kernel/slab. The purpose of shrinking caches is to reclaim as many
unused slabs as possible from all the caches, irrespective of whether
they are reclaimable or not. We do not reclaim any used objects. That is
why we see the numbers reduced in both cases.

Cheers,
Longman
Vlastimil Babka Aug. 7, 2019, 4:33 p.m. UTC | #20
On 7/23/19 4:30 PM, Waiman Long wrote:
> On 7/22/19 8:46 AM, peter enderborg wrote:
>> On 7/2/19 8:37 PM, Waiman Long wrote:
>>> Currently, a value of '1' is written to the /sys/kernel/slab/<slab>/shrink
>>> file to shrink the slab by flushing all the per-cpu slabs and free
>>> slabs in partial lists. This applies only to the root caches, though.
>>>
>>> Extend this capability by shrinking all the child memcg caches and
>>> the root cache when a value of '2' is written to the shrink sysfs file.
>>>
>>> On a 4-socket 112-core 224-thread x86-64 system after a parallel kernel
>>> build, the amount of memory occupied by slabs before shrinking was:
>>>
>>>  # grep task_struct /proc/slabinfo
>>>  task_struct         7114   7296   7744    4    8 : tunables    0    0
>>>  0 : slabdata   1824   1824      0
>>>  # grep "^S[lRU]" /proc/meminfo
>>>  Slab:            1310444 kB
>>>  SReclaimable:     377604 kB
>>>  SUnreclaim:       932840 kB
>>>
>>> After shrinking slabs:
>>>
>>>  # grep "^S[lRU]" /proc/meminfo
>>>  Slab:             695652 kB
>>>  SReclaimable:     322796 kB
>>>  SUnreclaim:       372856 kB
>>>  # grep task_struct /proc/slabinfo
>>>  task_struct         2262   2572   7744    4    8 : tunables    0    0
>>>  0 : slabdata    643    643      0
>>
>> What is the time between these measurement points? Shouldn't the shrunken memory show up as reclaimable?
> 
> In this case, I echoed '2' to all the shrink sysfs files under
> /sys/kernel/slab. The purpose of shrinking caches is to reclaim as many
> unused slabs as possible from all the caches, irrespective of whether
> they are reclaimable or not.

Well, SReclaimable counts pages allocated by kmem caches with the
SLAB_RECLAIM_ACCOUNT flag, which should match those that have a shrinker
associated and can thus actually reclaim objects. That shrinking slabs
affected SReclaimable just a bit while reducing SUnreclaim by more than
50% certainly looks odd.
For example, the task_struct cache is not a reclaimable one, yet it shows
a massive reduction. It could be that the reclaimable objects were pinning
non-reclaimable ones, so the shrinking had secondary effects in
non-reclaimable caches.
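
For reference, reclaimability is declared when a cache is created: only
caches registered with the SLAB_RECLAIM_ACCOUNT flag are counted in
SReclaimable. A made-up example:

	struct kmem_cache *foo_cache;	/* hypothetical cache */

	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				      SLAB_RECLAIM_ACCOUNT, NULL);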

> We do not reclaim any used objects. That is why we
> see the numbers reduced in both cases.
> 
> Cheers,
> Longman
>

Patch

diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 29601d93a1c2..2a3d0fc4b4ac 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -429,10 +429,12 @@  KernelVersion:	2.6.22
 Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
 		Christoph Lameter <cl@linux-foundation.org>
 Description:
-		The shrink file is written when memory should be reclaimed from
-		a cache.  Empty partial slabs are freed and the partial list is
-		sorted so the slabs with the fewest available objects are used
-		first.
+		A value of '1' is written to the shrink file when memory should
+		be reclaimed from a cache.  Empty partial slabs are freed and
+		the partial list is sorted so the slabs with the fewest
+		available objects are used first.  When a value of '2' is
+		written, all the corresponding child memory cgroup caches
+		should be shrunk as well.  All other values are invalid.
 
 What:		/sys/kernel/slab/cache/slab_size
 Date:		May 2007
diff --git a/mm/slab.h b/mm/slab.h
index 3b22931bb557..a16b2c7ff4dd 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -174,6 +174,7 @@  int __kmem_cache_shrink(struct kmem_cache *);
 void __kmemcg_cache_deactivate(struct kmem_cache *s);
 void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
 void slab_kmem_cache_release(struct kmem_cache *);
+int kmem_cache_shrink_all(struct kmem_cache *s);
 
 struct seq_file;
 struct file;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 464faaa9fd81..493697ba1da5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -981,6 +981,49 @@  int kmem_cache_shrink(struct kmem_cache *cachep)
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
+/**
+ * kmem_cache_shrink_all - shrink a cache and all its memcg children
+ * @s: The root cache to shrink.
+ *
+ * Return: 0 if successful, -EINVAL if not a root cache
+ */
+int kmem_cache_shrink_all(struct kmem_cache *s)
+{
+	struct kmem_cache *c;
+
+	if (!IS_ENABLED(CONFIG_MEMCG_KMEM)) {
+		kmem_cache_shrink(s);
+		return 0;
+	}
+	if (!is_root_cache(s))
+		return -EINVAL;
+
+	/*
+	 * The caller should have a reference to the root cache and so
+	 * we don't need to take the slab_mutex. We have to take the
+	 * slab_mutex, however, to iterate the memcg caches.
+	 */
+	get_online_cpus();
+	get_online_mems();
+	kasan_cache_shrink(s);
+	__kmem_cache_shrink(s);
+
+	mutex_lock(&slab_mutex);
+	for_each_memcg_cache(c, s) {
+		/*
+		 * Don't need to shrink deactivated memcg caches.
+		 */
+	if (c->flags & SLAB_DEACTIVATED)
+			continue;
+		kasan_cache_shrink(c);
+		__kmem_cache_shrink(c);
+	}
+	mutex_unlock(&slab_mutex);
+	put_online_mems();
+	put_online_cpus();
+	return 0;
+}
+
 bool slab_is_available(void)
 {
 	return slab_state >= UP;
diff --git a/mm/slub.c b/mm/slub.c
index a384228ff6d3..5d7b0004c51f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5298,6 +5298,8 @@  static ssize_t shrink_store(struct kmem_cache *s,
 {
 	if (buf[0] == '1')
 		kmem_cache_shrink(s);
+	else if (buf[0] == '2')
+		kmem_cache_shrink_all(s);
 	else
 		return -EINVAL;
 	return length;