diff mbox series

[v3,2/6] mm: shrinkers: introduce debugfs interface for memory shrinkers

Message ID 20220509183820.573666-3-roman.gushchin@linux.dev (mailing list archive)
State New
Headers show
Series mm: introduce shrinker debugfs interface | expand

Commit Message

Roman Gushchin May 9, 2022, 6:38 p.m. UTC
This commit introduces the /sys/kernel/debug/shrinker debugfs
interface which provides an ability to observe the state of
individual kernel memory shrinkers.

Because the feature adds some memory overhead (which shouldn't be
large unless there is a huge amount of registered shrinkers), it's
guarded by a config option (enabled by default).

This commit introduces the "count" interface for each shrinker
registered in the system.

The output is in the following format:
<cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
<cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
...

To reduce the size of output on machines with many thousands cgroups,
if the total number of objects on all nodes is 0, the line is omitted.

If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
printed for all nodes except the first one.

This commit gives debugfs entries simple numeric names, which are not
very convenient. The following commit in the series will provide
shrinkers with more meaningful names.

Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
 include/linux/shrinker.h |  19 ++++-
 lib/Kconfig.debug        |   9 +++
 mm/Makefile              |   1 +
 mm/shrinker_debug.c      | 171 +++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c              |   6 +-
 5 files changed, 203 insertions(+), 3 deletions(-)
 create mode 100644 mm/shrinker_debug.c

Comments

Kent Overstreet May 20, 2022, 4:45 p.m. UTC | #1
On Mon, May 09, 2022 at 11:38:16AM -0700, Roman Gushchin wrote:
> This commit introduces the /sys/kernel/debug/shrinker debugfs
> interface which provides an ability to observe the state of
> individual kernel memory shrinkers.
> 
> Because the feature adds some memory overhead (which shouldn't be
> large unless there is a huge amount of registered shrinkers), it's
> guarded by a config option (enabled by default).
> 
> This commit introduces the "count" interface for each shrinker
> registered in the system.
> 
> The output is in the following format:
> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> ...
> 
> To reduce the size of output on machines with many thousands cgroups,
> if the total number of objects on all nodes is 0, the line is omitted.
> 
> If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
> printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
> printed for all nodes except the first one.
> 
> This commit gives debugfs entries simple numeric names, which are not
> very convenient. The following commit in the series will provide
> shrinkers with more meaningful names.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>

I think this looks reasonable

Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Christophe JAILLET May 20, 2022, 4:58 p.m. UTC | #2
Le 09/05/2022 à 20:38, Roman Gushchin a écrit :
> This commit introduces the /sys/kernel/debug/shrinker debugfs
> interface which provides an ability to observe the state of
> individual kernel memory shrinkers.
> 
> Because the feature adds some memory overhead (which shouldn't be
> large unless there is a huge amount of registered shrinkers), it's
> guarded by a config option (enabled by default).
> 
> This commit introduces the "count" interface for each shrinker
> registered in the system.
> 
> The output is in the following format:
> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> ...
> 
> To reduce the size of output on machines with many thousands cgroups,
> if the total number of objects on all nodes is 0, the line is omitted.
> 
> If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
> printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
> printed for all nodes except the first one.
> 
> This commit gives debugfs entries simple numeric names, which are not
> very convenient. The following commit in the series will provide
> shrinkers with more meaningful names.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> ---
>   include/linux/shrinker.h |  19 ++++-
>   lib/Kconfig.debug        |   9 +++
>   mm/Makefile              |   1 +
>   mm/shrinker_debug.c      | 171 +++++++++++++++++++++++++++++++++++++++
>   mm/vmscan.c              |   6 +-
>   5 files changed, 203 insertions(+), 3 deletions(-)
>   create mode 100644 mm/shrinker_debug.c
> 
> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> index 76fbf92b04d9..2ced8149c513 100644
> --- a/include/linux/shrinker.h
> +++ b/include/linux/shrinker.h
> @@ -72,6 +72,10 @@ struct shrinker {
>   #ifdef CONFIG_MEMCG
>   	/* ID in shrinker_idr */
>   	int id;
> +#endif
> +#ifdef CONFIG_SHRINKER_DEBUG
> +	int debugfs_id;
> +	struct dentry *debugfs_entry;
>   #endif
>   	/* objs pending delete, per node */
>   	atomic_long_t *nr_deferred;
> @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker);
>   extern void unregister_shrinker(struct shrinker *shrinker);
>   extern void free_prealloced_shrinker(struct shrinker *shrinker);
>   extern void synchronize_shrinkers(void);
> -#endif
> +
> +#ifdef CONFIG_SHRINKER_DEBUG
> +extern int shrinker_debugfs_add(struct shrinker *shrinker);
> +extern void shrinker_debugfs_remove(struct shrinker *shrinker);
> +#else /* CONFIG_SHRINKER_DEBUG */
> +static inline int shrinker_debugfs_add(struct shrinker *shrinker)
> +{
> +	return 0;
> +}
> +static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
> +{
> +}
> +#endif /* CONFIG_SHRINKER_DEBUG */
> +#endif /* _LINUX_SHRINKER_H */
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 3fd7a2e9eaf1..5fa65a649798 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -733,6 +733,15 @@ config SLUB_STATS
>   	  out which slabs are relevant to a particular load.
>   	  Try running: slabinfo -DA
>   
> +config SHRINKER_DEBUG
> +	default y

The previous version of the series had default 'n'.
Is it intentional to have it now activated by default? It looked more
like a tuning functionality when fine-grained management of shrinkers is
needed.


> +	bool "Enable shrinker debugging support"
> +	depends on DEBUG_FS
> +	help
> +	  Say Y to enable the shrinker debugfs interface which provides
> +	  visibility into the kernel memory shrinkers subsystem.
> +	  Disable it to avoid an extra memory footprint.
> +

[...]
Kent Overstreet May 20, 2022, 5 p.m. UTC | #3
On Fri, May 20, 2022 at 06:58:12PM +0200, Christophe JAILLET wrote:
> Le 09/05/2022 à 20:38, Roman Gushchin a écrit :
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index 3fd7a2e9eaf1..5fa65a649798 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -733,6 +733,15 @@ config SLUB_STATS
> >   	  out which slabs are relevant to a particular load.
> >   	  Try running: slabinfo -DA
> > +config SHRINKER_DEBUG
> > +	default y
> 
> The previous version of the series had default 'n'.
> Is it intentional to have it now activated by default? It looked more like a
> tuning functionality when fine-grained management of shrinkers is needed.

I think having this on by default if you've already enabled debugfs is smart -
it doesn't add runtime overhead, just a bit of code, and things that make the
system more observable are great to have on by default.
Roman Gushchin May 21, 2022, 12:27 a.m. UTC | #4
On Fri, May 20, 2022 at 06:58:12PM +0200, Christophe JAILLET wrote:
> Le 09/05/2022 à 20:38, Roman Gushchin a écrit :
> > This commit introduces the /sys/kernel/debug/shrinker debugfs
> > interface which provides an ability to observe the state of
> > individual kernel memory shrinkers.
> > 
> > Because the feature adds some memory overhead (which shouldn't be
> > large unless there is a huge amount of registered shrinkers), it's
> > guarded by a config option (enabled by default).
> > 
> > This commit introduces the "count" interface for each shrinker
> > registered in the system.
> > 
> > The output is in the following format:
> > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> > ...
> > 
> > To reduce the size of output on machines with many thousands cgroups,
> > if the total number of objects on all nodes is 0, the line is omitted.
> > 
> > If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
> > printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
> > printed for all nodes except the first one.
> > 
> > This commit gives debugfs entries simple numeric names, which are not
> > very convenient. The following commit in the series will provide
> > shrinkers with more meaningful names.
> > 
> > Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> > ---
> >   include/linux/shrinker.h |  19 ++++-
> >   lib/Kconfig.debug        |   9 +++
> >   mm/Makefile              |   1 +
> >   mm/shrinker_debug.c      | 171 +++++++++++++++++++++++++++++++++++++++
> >   mm/vmscan.c              |   6 +-
> >   5 files changed, 203 insertions(+), 3 deletions(-)
> >   create mode 100644 mm/shrinker_debug.c
> > 
> > diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> > index 76fbf92b04d9..2ced8149c513 100644
> > --- a/include/linux/shrinker.h
> > +++ b/include/linux/shrinker.h
> > @@ -72,6 +72,10 @@ struct shrinker {
> >   #ifdef CONFIG_MEMCG
> >   	/* ID in shrinker_idr */
> >   	int id;
> > +#endif
> > +#ifdef CONFIG_SHRINKER_DEBUG
> > +	int debugfs_id;
> > +	struct dentry *debugfs_entry;
> >   #endif
> >   	/* objs pending delete, per node */
> >   	atomic_long_t *nr_deferred;
> > @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker);
> >   extern void unregister_shrinker(struct shrinker *shrinker);
> >   extern void free_prealloced_shrinker(struct shrinker *shrinker);
> >   extern void synchronize_shrinkers(void);
> > -#endif
> > +
> > +#ifdef CONFIG_SHRINKER_DEBUG
> > +extern int shrinker_debugfs_add(struct shrinker *shrinker);
> > +extern void shrinker_debugfs_remove(struct shrinker *shrinker);
> > +#else /* CONFIG_SHRINKER_DEBUG */
> > +static inline int shrinker_debugfs_add(struct shrinker *shrinker)
> > +{
> > +	return 0;
> > +}
> > +static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
> > +{
> > +}
> > +#endif /* CONFIG_SHRINKER_DEBUG */
> > +#endif /* _LINUX_SHRINKER_H */
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index 3fd7a2e9eaf1..5fa65a649798 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -733,6 +733,15 @@ config SLUB_STATS
> >   	  out which slabs are relevant to a particular load.
> >   	  Try running: slabinfo -DA
> > +config SHRINKER_DEBUG
> > +	default y
> 
> The previous version of the series had default 'n'.
> Is it intentional to have it now activated by default? It looked more like a
> tuning functionality when fine-grained management of shrinkers is needed.

Yes, it is intentional.
The overhead is small, so I don't think we have a good reason to hide it
by default behind a config option. In my opinion, enabling it by default
will increase the chances of gathering useful data.
This was the feedback I received for one of the previous versions of
the patchset, and I think it's totally valid.
And preserving the config option still allows zero overhead on
really constrained systems.

Thanks!
Roman Gushchin May 21, 2022, 12:27 a.m. UTC | #5
On Fri, May 20, 2022 at 12:45:12PM -0400, Kent Overstreet wrote:
> On Mon, May 09, 2022 at 11:38:16AM -0700, Roman Gushchin wrote:
> > This commit introduces the /sys/kernel/debug/shrinker debugfs
> > interface which provides an ability to observe the state of
> > individual kernel memory shrinkers.
> > 
> > Because the feature adds some memory overhead (which shouldn't be
> > large unless there is a huge amount of registered shrinkers), it's
> > guarded by a config option (enabled by default).
> > 
> > This commit introduces the "count" interface for each shrinker
> > registered in the system.
> > 
> > The output is in the following format:
> > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> > ...
> > 
> > To reduce the size of output on machines with many thousands cgroups,
> > if the total number of objects on all nodes is 0, the line is omitted.
> > 
> > If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
> > printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
> > printed for all nodes except the first one.
> > 
> > This commit gives debugfs entries simple numeric names, which are not
> > very convenient. The following commit in the series will provide
> > shrinkers with more meaningful names.
> > 
> > Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> 
> I think this looks reasonable
> 
> Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>

Thank you!
Muchun Song May 22, 2022, 10:36 a.m. UTC | #6
On Mon, May 09, 2022 at 11:38:16AM -0700, Roman Gushchin wrote:
> This commit introduces the /sys/kernel/debug/shrinker debugfs
> interface which provides an ability to observe the state of
> individual kernel memory shrinkers.
> 
> Because the feature adds some memory overhead (which shouldn't be
> large unless there is a huge amount of registered shrinkers), it's
> guarded by a config option (enabled by default).
> 
> This commit introduces the "count" interface for each shrinker
> registered in the system.
> 
> The output is in the following format:

Hi Roman,

Should we print a title to show what those numbers mean?  That would make
the output more understandable.

> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> ...
> 
> To reduce the size of output on machines with many thousands cgroups,
> if the total number of objects on all nodes is 0, the line is omitted.
> 
> If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
> printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
> printed for all nodes except the first one.
> 
> This commit gives debugfs entries simple numeric names, which are not
> very convenient. The following commit in the series will provide
> shrinkers with more meaningful names.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> ---
>  include/linux/shrinker.h |  19 ++++-
>  lib/Kconfig.debug        |   9 +++
>  mm/Makefile              |   1 +
>  mm/shrinker_debug.c      | 171 +++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c              |   6 +-
>  5 files changed, 203 insertions(+), 3 deletions(-)
>  create mode 100644 mm/shrinker_debug.c
> 
> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> index 76fbf92b04d9..2ced8149c513 100644
> --- a/include/linux/shrinker.h
> +++ b/include/linux/shrinker.h
> @@ -72,6 +72,10 @@ struct shrinker {
>  #ifdef CONFIG_MEMCG
>  	/* ID in shrinker_idr */
>  	int id;
> +#endif
> +#ifdef CONFIG_SHRINKER_DEBUG
> +	int debugfs_id;
> +	struct dentry *debugfs_entry;
>  #endif
>  	/* objs pending delete, per node */
>  	atomic_long_t *nr_deferred;
> @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker);
>  extern void unregister_shrinker(struct shrinker *shrinker);
>  extern void free_prealloced_shrinker(struct shrinker *shrinker);
>  extern void synchronize_shrinkers(void);
> -#endif
> +
> +#ifdef CONFIG_SHRINKER_DEBUG
> +extern int shrinker_debugfs_add(struct shrinker *shrinker);
> +extern void shrinker_debugfs_remove(struct shrinker *shrinker);
> +#else /* CONFIG_SHRINKER_DEBUG */
> +static inline int shrinker_debugfs_add(struct shrinker *shrinker)
> +{
> +	return 0;
> +}
> +static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
> +{
> +}
> +#endif /* CONFIG_SHRINKER_DEBUG */
> +#endif /* _LINUX_SHRINKER_H */
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 3fd7a2e9eaf1..5fa65a649798 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -733,6 +733,15 @@ config SLUB_STATS
>  	  out which slabs are relevant to a particular load.
>  	  Try running: slabinfo -DA
>  
> +config SHRINKER_DEBUG
> +	default y
> +	bool "Enable shrinker debugging support"
> +	depends on DEBUG_FS
> +	help
> +	  Say Y to enable the shrinker debugfs interface which provides
> +	  visibility into the kernel memory shrinkers subsystem.
> +	  Disable it to avoid an extra memory footprint.
> +
>  config HAVE_DEBUG_KMEMLEAK
>  	bool
>  
> diff --git a/mm/Makefile b/mm/Makefile
> index 298c9991ab75..8083fa85a348 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
>  obj-$(CONFIG_IO_MAPPING) += io-mapping.o
>  obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
>  obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
> +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
> new file mode 100644
> index 000000000000..fd1f805a581a
> --- /dev/null
> +++ b/mm/shrinker_debug.c
> @@ -0,0 +1,171 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/idr.h>
> +#include <linux/slab.h>
> +#include <linux/debugfs.h>
> +#include <linux/seq_file.h>
> +#include <linux/shrinker.h>
> +#include <linux/memcontrol.h>
> +
> +/* defined in vmscan.c */
> +extern struct rw_semaphore shrinker_rwsem;
> +extern struct list_head shrinker_list;
> +
> +static DEFINE_IDA(shrinker_debugfs_ida);
> +static struct dentry *shrinker_debugfs_root;
> +
> +static unsigned long shrinker_count_objects(struct shrinker *shrinker,
> +					    struct mem_cgroup *memcg,
> +					    unsigned long *count_per_node)
> +{
> +	unsigned long nr, total = 0;
> +	int nid;
> +
> +	for_each_node(nid) {
> +		if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
> +			struct shrink_control sc = {
> +				.gfp_mask = GFP_KERNEL,
> +				.nid = nid,
> +				.memcg = memcg,
> +			};
> +
> +			nr = shrinker->count_objects(shrinker, &sc);
> +			if (nr == SHRINK_EMPTY)
> +				nr = 0;
> +		} else {
> +			nr = 0;

For efficiency, we could break here, right?

> +		}
> +
> +		count_per_node[nid] = nr;
> +		total += nr;
> +	}
> +
> +	return total;
> +}
> +
> +static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
> +{
> +	struct shrinker *shrinker = (struct shrinker *)m->private;

Maybe we could drop the cast since m->private is a void * type.

> +	unsigned long *count_per_node = NULL;

Do not need to be initialized, right?

> +	struct mem_cgroup *memcg;
> +	unsigned long total;
> +	bool memcg_aware;
> +	int ret, nid;
> +
> +	count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
> +	if (!count_per_node)
> +		return -ENOMEM;
> +
> +	ret = down_read_killable(&shrinker_rwsem);
> +	if (ret) {
> +		kfree(count_per_node);
> +		return ret;
> +	}
> +	rcu_read_lock();
> +
> +	memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
> +
> +	memcg = mem_cgroup_iter(NULL, NULL, NULL);
> +	do {
> +		if (memcg && !mem_cgroup_online(memcg))
> +			continue;
> +
> +		total = shrinker_count_objects(shrinker,
> +					       memcg_aware ? memcg : NULL,
> +					       count_per_node);
> +		if (total) {
> +			seq_printf(m, "%lu", mem_cgroup_ino(memcg));
> +			for_each_node(nid)
> +				seq_printf(m, " %lu", count_per_node[nid]);
> +			seq_puts(m, "\n");

seq_putc(m, '\n') is more efficient.

> +		}
> +
> +		if (!memcg_aware) {
> +			mem_cgroup_iter_break(NULL, memcg);
> +			break;
> +		}
> +
> +		if (signal_pending(current)) {
> +			mem_cgroup_iter_break(NULL, memcg);
> +			ret = -EINTR;
> +			break;
> +		}
> +
> +		cond_resched();

We are in rcu read lock, cannot be scheduled, right?

> +	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
> +
> +	rcu_read_unlock();
> +	up_read(&shrinker_rwsem);
> +
> +	kfree(count_per_node);
> +	return ret;
> +}
> +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
> +
> +int shrinker_debugfs_add(struct shrinker *shrinker)
> +{
> +	struct dentry *entry;
> +	char buf[16];
> +	int id;
> +
> +	lockdep_assert_held(&shrinker_rwsem);
> +
> +	/* debugfs isn't initialized yet, add debugfs entries later. */
> +	if (!shrinker_debugfs_root)
> +		return 0;
> +
> +	id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
> +	if (id < 0)
> +		return id;
> +	shrinker->debugfs_id = id;
> +
> +	snprintf(buf, sizeof(buf), "%d", id);
> +
> +	/* create debugfs entry */
> +	entry = debugfs_create_dir(buf, shrinker_debugfs_root);
> +	if (IS_ERR(entry)) {
> +		ida_free(&shrinker_debugfs_ida, id);
> +		return PTR_ERR(entry);
> +	}
> +	shrinker->debugfs_entry = entry;
> +
> +	debugfs_create_file("count", 0220, entry, shrinker,
> +			    &shrinker_debugfs_count_fops);
> +	return 0;
> +}
> +
> +void shrinker_debugfs_remove(struct shrinker *shrinker)
> +{
> +	lockdep_assert_held(&shrinker_rwsem);
> +
> +	if (!shrinker->debugfs_entry)
> +		return;
> +
> +	debugfs_remove_recursive(shrinker->debugfs_entry);
> +	ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
> +}
> +
> +static int __init shrinker_debugfs_init(void)
> +{
> +	struct shrinker *shrinker;
> +	int ret;
> +
> +	if (!debugfs_initialized())
> +		return -ENODEV;
> +

Redundant check since it is checked in debugfs_create_dir().
So I think we could remove this.

> +	shrinker_debugfs_root = debugfs_create_dir("shrinker", NULL);

We should use IS_ERR() to detect the error code.  So the following
check is wrong.

> +	if (!shrinker_debugfs_root)
> +		return -ENOMEM;
> +
> +	/* Create debugfs entries for shrinkers registered at boot */
> +	ret = down_write_killable(&shrinker_rwsem);

How could we kill this process?  IIUC, late_initcall() is called
from early init process, there is no way to kill this. Right?
If yes, I think we could just use down_write().

Thanks.

> +	if (ret)
> +		return ret;
> +
> +	list_for_each_entry(shrinker, &shrinker_list, list)
> +		if (!shrinker->debugfs_entry)
> +			ret = shrinker_debugfs_add(shrinker);
> +	up_write(&shrinker_rwsem);
> +
> +	return ret;
> +}
> +late_initcall(shrinker_debugfs_init);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c6918fff06e1..024f7056b98c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -190,8 +190,8 @@ static void set_task_reclaim_state(struct task_struct *task,
>  	task->reclaim_state = rs;
>  }
>  
> -static LIST_HEAD(shrinker_list);
> -static DECLARE_RWSEM(shrinker_rwsem);
> +LIST_HEAD(shrinker_list);
> +DECLARE_RWSEM(shrinker_rwsem);
>  
>  #ifdef CONFIG_MEMCG
>  static int shrinker_nr_max;
> @@ -655,6 +655,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
>  	down_write(&shrinker_rwsem);
>  	list_add_tail(&shrinker->list, &shrinker_list);
>  	shrinker->flags |= SHRINKER_REGISTERED;
> +	WARN_ON_ONCE(shrinker_debugfs_add(shrinker));
>  	up_write(&shrinker_rwsem);
>  }
>  
> @@ -682,6 +683,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>  	shrinker->flags &= ~SHRINKER_REGISTERED;
>  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>  		unregister_memcg_shrinker(shrinker);
> +	shrinker_debugfs_remove(shrinker);
>  	up_write(&shrinker_rwsem);
>  
>  	kfree(shrinker->nr_deferred);
> -- 
> 2.35.3
> 
>
Roman Gushchin May 23, 2022, 6:24 p.m. UTC | #7
On Sun, May 22, 2022 at 06:36:56PM +0800, Muchun Song wrote:
> On Mon, May 09, 2022 at 11:38:16AM -0700, Roman Gushchin wrote:
> > This commit introduces the /sys/kernel/debug/shrinker debugfs
> > interface which provides an ability to observe the state of
> > individual kernel memory shrinkers.
> > 
> > Because the feature adds some memory overhead (which shouldn't be
> > large unless there is a huge amount of registered shrinkers), it's
> > guarded by a config option (enabled by default).
> > 
> > This commit introduces the "count" interface for each shrinker
> > registered in the system.
> > 
> > The output is in the following format:
> 
> Hi Roman,

Hi Muchun!

Thank you for taking a look!

> 
> Should we print a title to show what those numbers mean?  That would make
> the output more understandable.

No, I don't think so: this interface is not supposed to be used by
an average user and those who will be using it can refer to the provided
documentation. Printing the header each time will add some overhead for
no good reason.

> > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> > ...
> > 
> > To reduce the size of output on machines with many thousands cgroups,
> > if the total number of objects on all nodes is 0, the line is omitted.
> > 
> > If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
> > printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
> > printed for all nodes except the first one.
> > 
> > This commit gives debugfs entries simple numeric names, which are not
> > very convenient. The following commit in the series will provide
> > shrinkers with more meaningful names.
> > 
> > Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> > ---
> >  include/linux/shrinker.h |  19 ++++-
> >  lib/Kconfig.debug        |   9 +++
> >  mm/Makefile              |   1 +
> >  mm/shrinker_debug.c      | 171 +++++++++++++++++++++++++++++++++++++++
> >  mm/vmscan.c              |   6 +-
> >  5 files changed, 203 insertions(+), 3 deletions(-)
> >  create mode 100644 mm/shrinker_debug.c
> > 
> > diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> > index 76fbf92b04d9..2ced8149c513 100644
> > --- a/include/linux/shrinker.h
> > +++ b/include/linux/shrinker.h
> > @@ -72,6 +72,10 @@ struct shrinker {
> >  #ifdef CONFIG_MEMCG
> >  	/* ID in shrinker_idr */
> >  	int id;
> > +#endif
> > +#ifdef CONFIG_SHRINKER_DEBUG
> > +	int debugfs_id;
> > +	struct dentry *debugfs_entry;
> >  #endif
> >  	/* objs pending delete, per node */
> >  	atomic_long_t *nr_deferred;
> > @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker);
> >  extern void unregister_shrinker(struct shrinker *shrinker);
> >  extern void free_prealloced_shrinker(struct shrinker *shrinker);
> >  extern void synchronize_shrinkers(void);
> > -#endif
> > +
> > +#ifdef CONFIG_SHRINKER_DEBUG
> > +extern int shrinker_debugfs_add(struct shrinker *shrinker);
> > +extern void shrinker_debugfs_remove(struct shrinker *shrinker);
> > +#else /* CONFIG_SHRINKER_DEBUG */
> > +static inline int shrinker_debugfs_add(struct shrinker *shrinker)
> > +{
> > +	return 0;
> > +}
> > +static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
> > +{
> > +}
> > +#endif /* CONFIG_SHRINKER_DEBUG */
> > +#endif /* _LINUX_SHRINKER_H */
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index 3fd7a2e9eaf1..5fa65a649798 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -733,6 +733,15 @@ config SLUB_STATS
> >  	  out which slabs are relevant to a particular load.
> >  	  Try running: slabinfo -DA
> >  
> > +config SHRINKER_DEBUG
> > +	default y
> > +	bool "Enable shrinker debugging support"
> > +	depends on DEBUG_FS
> > +	help
> > +	  Say Y to enable the shrinker debugfs interface which provides
> > +	  visibility into the kernel memory shrinkers subsystem.
> > +	  Disable it to avoid an extra memory footprint.
> > +
> >  config HAVE_DEBUG_KMEMLEAK
> >  	bool
> >  
> > diff --git a/mm/Makefile b/mm/Makefile
> > index 298c9991ab75..8083fa85a348 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
> >  obj-$(CONFIG_IO_MAPPING) += io-mapping.o
> >  obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
> >  obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
> > +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> > diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
> > new file mode 100644
> > index 000000000000..fd1f805a581a
> > --- /dev/null
> > +++ b/mm/shrinker_debug.c
> > @@ -0,0 +1,171 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <linux/idr.h>
> > +#include <linux/slab.h>
> > +#include <linux/debugfs.h>
> > +#include <linux/seq_file.h>
> > +#include <linux/shrinker.h>
> > +#include <linux/memcontrol.h>
> > +
> > +/* defined in vmscan.c */
> > +extern struct rw_semaphore shrinker_rwsem;
> > +extern struct list_head shrinker_list;
> > +
> > +static DEFINE_IDA(shrinker_debugfs_ida);
> > +static struct dentry *shrinker_debugfs_root;
> > +
> > +static unsigned long shrinker_count_objects(struct shrinker *shrinker,
> > +					    struct mem_cgroup *memcg,
> > +					    unsigned long *count_per_node)
> > +{
> > +	unsigned long nr, total = 0;
> > +	int nid;
> > +
> > +	for_each_node(nid) {
> > +		if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
> > +			struct shrink_control sc = {
> > +				.gfp_mask = GFP_KERNEL,
> > +				.nid = nid,
> > +				.memcg = memcg,
> > +			};
> > +
> > +			nr = shrinker->count_objects(shrinker, &sc);
> > +			if (nr == SHRINK_EMPTY)
> > +				nr = 0;
> > +		} else {
> > +			nr = 0;
> 
> For efficiency, we could break here, right?

Not really, we need to fill count_per_node[] with zeros.

> 
> > +		}
> > +
> > +		count_per_node[nid] = nr;
> > +		total += nr;
> > +	}
> > +
> > +	return total;
> > +}
> > +
> > +static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
> > +{
> > +	struct shrinker *shrinker = (struct shrinker *)m->private;
> 
> > Maybe we could drop the cast since m->private is a void * type.

Ok.

> 
> > +	unsigned long *count_per_node = NULL;
> 
> Do not need to be initialized, right?

Right, will fix in v4.

> 
> > +	struct mem_cgroup *memcg;
> > +	unsigned long total;
> > +	bool memcg_aware;
> > +	int ret, nid;
> > +
> > +	count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
> > +	if (!count_per_node)
> > +		return -ENOMEM;
> > +
> > +	ret = down_read_killable(&shrinker_rwsem);
> > +	if (ret) {
> > +		kfree(count_per_node);
> > +		return ret;
> > +	}
> > +	rcu_read_lock();
> > +
> > +	memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
> > +
> > +	memcg = mem_cgroup_iter(NULL, NULL, NULL);
> > +	do {
> > +		if (memcg && !mem_cgroup_online(memcg))
> > +			continue;
> > +
> > +		total = shrinker_count_objects(shrinker,
> > +					       memcg_aware ? memcg : NULL,
> > +					       count_per_node);
> > +		if (total) {
> > +			seq_printf(m, "%lu", mem_cgroup_ino(memcg));
> > +			for_each_node(nid)
> > +				seq_printf(m, " %lu", count_per_node[nid]);
> > +			seq_puts(m, "\n");
> 
> seq_putc(m, '\n') is more efficient.

Ok.

> 
> > +		}
> > +
> > +		if (!memcg_aware) {
> > +			mem_cgroup_iter_break(NULL, memcg);
> > +			break;
> > +		}
> > +
> > +		if (signal_pending(current)) {
> > +			mem_cgroup_iter_break(NULL, memcg);
> > +			ret = -EINTR;
> > +			break;
> > +		}
> > +
> > +		cond_resched();
> 
> We are in rcu read lock, cannot be scheduled, right?

This is a good one, thanks. Fixed.

> 
> > +	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
> > +
> > +	rcu_read_unlock();
> > +	up_read(&shrinker_rwsem);
> > +
> > +	kfree(count_per_node);
> > +	return ret;
> > +}
> > +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
> > +
> > +int shrinker_debugfs_add(struct shrinker *shrinker)
> > +{
> > +	struct dentry *entry;
> > +	char buf[16];
> > +	int id;
> > +
> > +	lockdep_assert_held(&shrinker_rwsem);
> > +
> > +	/* debugfs isn't initialized yet, add debugfs entries later. */
> > +	if (!shrinker_debugfs_root)
> > +		return 0;
> > +
> > +	id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
> > +	if (id < 0)
> > +		return id;
> > +	shrinker->debugfs_id = id;
> > +
> > +	snprintf(buf, sizeof(buf), "%d", id);
> > +
> > +	/* create debugfs entry */
> > +	entry = debugfs_create_dir(buf, shrinker_debugfs_root);
> > +	if (IS_ERR(entry)) {
> > +		ida_free(&shrinker_debugfs_ida, id);
> > +		return PTR_ERR(entry);
> > +	}
> > +	shrinker->debugfs_entry = entry;
> > +
> > +	debugfs_create_file("count", 0220, entry, shrinker,
> > +			    &shrinker_debugfs_count_fops);
> > +	return 0;
> > +}
> > +
> > +void shrinker_debugfs_remove(struct shrinker *shrinker)
> > +{
> > +	lockdep_assert_held(&shrinker_rwsem);
> > +
> > +	if (!shrinker->debugfs_entry)
> > +		return;
> > +
> > +	debugfs_remove_recursive(shrinker->debugfs_entry);
> > +	ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
> > +}
> > +
> > +static int __init shrinker_debugfs_init(void)
> > +{
> > +	struct shrinker *shrinker;
> > +	int ret;
> > +
> > +	if (!debugfs_initialized())
> > +		return -ENODEV;
> > +
> 
> Redundant check since it is checked in debugfs_create_dir().
> So I think we could remove this.
> 
> > +	shrinker_debugfs_root = debugfs_create_dir("shrinker", NULL);
> 
> We should use IS_ERR() to detect the error code.  So the following
> check is wrong.

Right, will fix in the next version.

> 
> > +	if (!shrinker_debugfs_root)
> > +		return -ENOMEM;
> > +
> > +	/* Create debugfs entries for shrinkers registered at boot */
> > +	ret = down_write_killable(&shrinker_rwsem);
> 
> How could we kill this process?  IIUC, late_initcall() is called
> from early init process, there is no way to kill this. Right?
> If yes, I think we could just use down_write().

Ok, agree.

Thanks!
Muchun Song May 24, 2022, 2:06 a.m. UTC | #8
On Mon, May 23, 2022 at 11:24:10AM -0700, Roman Gushchin wrote:
> On Sun, May 22, 2022 at 06:36:56PM +0800, Muchun Song wrote:
> > On Mon, May 09, 2022 at 11:38:16AM -0700, Roman Gushchin wrote:
> > > This commit introduces the /sys/kernel/debug/shrinker debugfs
> > > interface which provides an ability to observe the state of
> > > individual kernel memory shrinkers.
> > > 
> > > Because the feature adds some memory overhead (which shouldn't be
> > > large unless there is a huge amount of registered shrinkers), it's
> > > guarded by a config option (enabled by default).
> > > 
> > > This commit introduces the "count" interface for each shrinker
> > > registered in the system.
> > > 
> > > The output is in the following format:
> > 
> > Hi Roman,
> 
> Hi Muchun!
> 
> Thank you for taking a look!
> 
> > 
> > Shoud we print a title to show what those numbers mean?  In this case,
> > it is more understandable.
> 
> No, I don't think so: this interface is not supposed to be used by
> an average user and those who will be using it can refer to the provided
> documentation. Printing the header each time will add some overhead for
> no good reason.
>

Got it. Makes sense.
 
> > > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> > > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> > > ...
> > > 
> > > To reduce the size of output on machines with many thousands cgroups,
> > > if the total number of objects on all nodes is 0, the line is omitted.
> > > 
> > > If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
> > > printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
> > > printed for all nodes except the first one.
> > > 
> > > This commit gives debugfs entries simple numeric names, which are not
> > > very convenient. The following commit in the series will provide
> > > shrinkers with more meaningful names.
> > > 
> > > Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> > > ---
> > >  include/linux/shrinker.h |  19 ++++-
> > >  lib/Kconfig.debug        |   9 +++
> > >  mm/Makefile              |   1 +
> > >  mm/shrinker_debug.c      | 171 +++++++++++++++++++++++++++++++++++++++
> > >  mm/vmscan.c              |   6 +-
> > >  5 files changed, 203 insertions(+), 3 deletions(-)
> > >  create mode 100644 mm/shrinker_debug.c
> > > 
> > > diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> > > index 76fbf92b04d9..2ced8149c513 100644
> > > --- a/include/linux/shrinker.h
> > > +++ b/include/linux/shrinker.h
> > > @@ -72,6 +72,10 @@ struct shrinker {
> > >  #ifdef CONFIG_MEMCG
> > >  	/* ID in shrinker_idr */
> > >  	int id;
> > > +#endif
> > > +#ifdef CONFIG_SHRINKER_DEBUG
> > > +	int debugfs_id;
> > > +	struct dentry *debugfs_entry;
> > >  #endif
> > >  	/* objs pending delete, per node */
> > >  	atomic_long_t *nr_deferred;
> > > @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker);
> > >  extern void unregister_shrinker(struct shrinker *shrinker);
> > >  extern void free_prealloced_shrinker(struct shrinker *shrinker);
> > >  extern void synchronize_shrinkers(void);
> > > -#endif
> > > +
> > > +#ifdef CONFIG_SHRINKER_DEBUG
> > > +extern int shrinker_debugfs_add(struct shrinker *shrinker);
> > > +extern void shrinker_debugfs_remove(struct shrinker *shrinker);
> > > +#else /* CONFIG_SHRINKER_DEBUG */
> > > +static inline int shrinker_debugfs_add(struct shrinker *shrinker)
> > > +{
> > > +	return 0;
> > > +}
> > > +static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
> > > +{
> > > +}
> > > +#endif /* CONFIG_SHRINKER_DEBUG */
> > > +#endif /* _LINUX_SHRINKER_H */
> > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > > index 3fd7a2e9eaf1..5fa65a649798 100644
> > > --- a/lib/Kconfig.debug
> > > +++ b/lib/Kconfig.debug
> > > @@ -733,6 +733,15 @@ config SLUB_STATS
> > >  	  out which slabs are relevant to a particular load.
> > >  	  Try running: slabinfo -DA
> > >  
> > > +config SHRINKER_DEBUG
> > > +	default y
> > > +	bool "Enable shrinker debugging support"
> > > +	depends on DEBUG_FS
> > > +	help
> > > +	  Say Y to enable the shrinker debugfs interface which provides
> > > +	  visibility into the kernel memory shrinkers subsystem.
> > > +	  Disable it to avoid an extra memory footprint.
> > > +
> > >  config HAVE_DEBUG_KMEMLEAK
> > >  	bool
> > >  
> > > diff --git a/mm/Makefile b/mm/Makefile
> > > index 298c9991ab75..8083fa85a348 100644
> > > --- a/mm/Makefile
> > > +++ b/mm/Makefile
> > > @@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
> > >  obj-$(CONFIG_IO_MAPPING) += io-mapping.o
> > >  obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
> > >  obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
> > > +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> > > diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
> > > new file mode 100644
> > > index 000000000000..fd1f805a581a
> > > --- /dev/null
> > > +++ b/mm/shrinker_debug.c
> > > @@ -0,0 +1,171 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +#include <linux/idr.h>
> > > +#include <linux/slab.h>
> > > +#include <linux/debugfs.h>
> > > +#include <linux/seq_file.h>
> > > +#include <linux/shrinker.h>
> > > +#include <linux/memcontrol.h>
> > > +
> > > +/* defined in vmscan.c */
> > > +extern struct rw_semaphore shrinker_rwsem;
> > > +extern struct list_head shrinker_list;
> > > +
> > > +static DEFINE_IDA(shrinker_debugfs_ida);
> > > +static struct dentry *shrinker_debugfs_root;
> > > +
> > > +static unsigned long shrinker_count_objects(struct shrinker *shrinker,
> > > +					    struct mem_cgroup *memcg,
> > > +					    unsigned long *count_per_node)
> > > +{
> > > +	unsigned long nr, total = 0;
> > > +	int nid;
> > > +
> > > +	for_each_node(nid) {
> > > +		if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
> > > +			struct shrink_control sc = {
> > > +				.gfp_mask = GFP_KERNEL,
> > > +				.nid = nid,
> > > +				.memcg = memcg,
> > > +			};
> > > +
> > > +			nr = shrinker->count_objects(shrinker, &sc);
> > > +			if (nr == SHRINK_EMPTY)
> > > +				nr = 0;
> > > +		} else {
> > > +			nr = 0;
> > 
> > For efficiency, we could break here, right?
> 
> Not really, we need to fill count_per_node[] with zeros.
>

I thought count_per_node[] was zero-initialized by the caller when it
was allocated.  However, I was wrong: the array is reused on each loop
iteration, so it has to be refilled every time.  You are right.

> > 
> > > +		}
> > > +
> > > +		count_per_node[nid] = nr;
> > > +		total += nr;
> > > +	}
> > > +
> > > +	return total;
> > > +}
> > > +
> > > +static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
> > > +{
> > > +	struct shrinker *shrinker = (struct shrinker *)m->private;
> > 
> > Maybe we cound drop the cast since m->private is a void * type.
> 
> Ok.
> 
> > 
> > > +	unsigned long *count_per_node = NULL;
> > 
> > Do not need to be initialized, right?
> 
> Right, will fix in v4.
> 
> > 
> > > +	struct mem_cgroup *memcg;
> > > +	unsigned long total;
> > > +	bool memcg_aware;
> > > +	int ret, nid;
> > > +
> > > +	count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
> > > +	if (!count_per_node)
> > > +		return -ENOMEM;
> > > +
> > > +	ret = down_read_killable(&shrinker_rwsem);
> > > +	if (ret) {
> > > +		kfree(count_per_node);
> > > +		return ret;
> > > +	}
> > > +	rcu_read_lock();
> > > +
> > > +	memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
> > > +
> > > +	memcg = mem_cgroup_iter(NULL, NULL, NULL);
> > > +	do {
> > > +		if (memcg && !mem_cgroup_online(memcg))
> > > +			continue;
> > > +
> > > +		total = shrinker_count_objects(shrinker,
> > > +					       memcg_aware ? memcg : NULL,
> > > +					       count_per_node);
> > > +		if (total) {
> > > +			seq_printf(m, "%lu", mem_cgroup_ino(memcg));
> > > +			for_each_node(nid)
> > > +				seq_printf(m, " %lu", count_per_node[nid]);
> > > +			seq_puts(m, "\n");
> > 
> > seq_putc(m, '\n') is more efficient.
> 
> Ok.
> 
> > 
> > > +		}
> > > +
> > > +		if (!memcg_aware) {
> > > +			mem_cgroup_iter_break(NULL, memcg);
> > > +			break;
> > > +		}
> > > +
> > > +		if (signal_pending(current)) {
> > > +			mem_cgroup_iter_break(NULL, memcg);
> > > +			ret = -EINTR;
> > > +			break;
> > > +		}
> > > +
> > > +		cond_resched();
> > 
> > We are in rcu read lock, cannot be scheduled, right?
> 
> This is a good one, thanks. Fixed.
> 
> > 
> > > +	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
> > > +
> > > +	rcu_read_unlock();
> > > +	up_read(&shrinker_rwsem);
> > > +
> > > +	kfree(count_per_node);
> > > +	return ret;
> > > +}
> > > +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
> > > +
> > > +int shrinker_debugfs_add(struct shrinker *shrinker)
> > > +{
> > > +	struct dentry *entry;
> > > +	char buf[16];
> > > +	int id;
> > > +
> > > +	lockdep_assert_held(&shrinker_rwsem);
> > > +
> > > +	/* debugfs isn't initialized yet, add debugfs entries later. */
> > > +	if (!shrinker_debugfs_root)
> > > +		return 0;
> > > +
> > > +	id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
> > > +	if (id < 0)
> > > +		return id;
> > > +	shrinker->debugfs_id = id;
> > > +
> > > +	snprintf(buf, sizeof(buf), "%d", id);
> > > +
> > > +	/* create debugfs entry */
> > > +	entry = debugfs_create_dir(buf, shrinker_debugfs_root);
> > > +	if (IS_ERR(entry)) {
> > > +		ida_free(&shrinker_debugfs_ida, id);
> > > +		return PTR_ERR(entry);
> > > +	}
> > > +	shrinker->debugfs_entry = entry;
> > > +
> > > +	debugfs_create_file("count", 0220, entry, shrinker,
> > > +			    &shrinker_debugfs_count_fops);
> > > +	return 0;
> > > +}
> > > +
> > > +void shrinker_debugfs_remove(struct shrinker *shrinker)
> > > +{
> > > +	lockdep_assert_held(&shrinker_rwsem);
> > > +
> > > +	if (!shrinker->debugfs_entry)
> > > +		return;
> > > +
> > > +	debugfs_remove_recursive(shrinker->debugfs_entry);
> > > +	ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
> > > +}
> > > +
> > > +static int __init shrinker_debugfs_init(void)
> > > +{
> > > +	struct shrinker *shrinker;
> > > +	int ret;
> > > +
> > > +	if (!debugfs_initialized())
> > > +		return -ENODEV;
> > > +
> > 
> > Redundant check since it is checked in debugfs_create_dir().
> > So I think we could remove this.
> > 
> > > +	shrinker_debugfs_root = debugfs_create_dir("shrinker", NULL);
> > 
> > We should use IS_ERR() to detect the error code.  So the following
> > check is wrong.
> 
> Right, will fix in the next version.
> 
> > 
> > > +	if (!shrinker_debugfs_root)
> > > +		return -ENOMEM;
> > > +
> > > +	/* Create debugfs entries for shrinkers registered at boot */
> > > +	ret = down_write_killable(&shrinker_rwsem);
> > 
> > How could we kill this process?  IIUC, late_initcall() is called
> > from early init process, there is no way to kill this. Right?
> > If yes, I think we could just use down_write().
> 
> Ok, agree.
> 
> Thanks!
>
diff mbox series

Patch

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 76fbf92b04d9..2ced8149c513 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -72,6 +72,10 @@  struct shrinker {
 #ifdef CONFIG_MEMCG
 	/* ID in shrinker_idr */
 	int id;
+#endif
+#ifdef CONFIG_SHRINKER_DEBUG
+	int debugfs_id;
+	struct dentry *debugfs_entry;
 #endif
 	/* objs pending delete, per node */
 	atomic_long_t *nr_deferred;
@@ -94,4 +98,17 @@  extern int register_shrinker(struct shrinker *shrinker);
 extern void unregister_shrinker(struct shrinker *shrinker);
 extern void free_prealloced_shrinker(struct shrinker *shrinker);
 extern void synchronize_shrinkers(void);
-#endif
+
+#ifdef CONFIG_SHRINKER_DEBUG
+extern int shrinker_debugfs_add(struct shrinker *shrinker);
+extern void shrinker_debugfs_remove(struct shrinker *shrinker);
+#else /* CONFIG_SHRINKER_DEBUG */
+static inline int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+	return 0;
+}
+static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+}
+#endif /* CONFIG_SHRINKER_DEBUG */
+#endif /* _LINUX_SHRINKER_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 3fd7a2e9eaf1..5fa65a649798 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -733,6 +733,15 @@  config SLUB_STATS
 	  out which slabs are relevant to a particular load.
 	  Try running: slabinfo -DA
 
+config SHRINKER_DEBUG
+	default y
+	bool "Enable shrinker debugging support"
+	depends on DEBUG_FS
+	help
+	  Say Y to enable the shrinker debugfs interface which provides
+	  visibility into the kernel memory shrinkers subsystem.
+	  Disable it to avoid an extra memory footprint.
+
 config HAVE_DEBUG_KMEMLEAK
 	bool
 
diff --git a/mm/Makefile b/mm/Makefile
index 298c9991ab75..8083fa85a348 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@  obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
 obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
+obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
new file mode 100644
index 000000000000..fd1f805a581a
--- /dev/null
+++ b/mm/shrinker_debug.c
@@ -0,0 +1,171 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/shrinker.h>
+#include <linux/memcontrol.h>
+
+/* defined in vmscan.c */
+extern struct rw_semaphore shrinker_rwsem;
+extern struct list_head shrinker_list;
+
+static DEFINE_IDA(shrinker_debugfs_ida);
+static struct dentry *shrinker_debugfs_root;
+
+static unsigned long shrinker_count_objects(struct shrinker *shrinker,
+					    struct mem_cgroup *memcg,
+					    unsigned long *count_per_node)
+{
+	unsigned long nr, total = 0;
+	int nid;
+
+	for_each_node(nid) {
+		if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
+			struct shrink_control sc = {
+				.gfp_mask = GFP_KERNEL,
+				.nid = nid,
+				.memcg = memcg,
+			};
+
+			nr = shrinker->count_objects(shrinker, &sc);
+			if (nr == SHRINK_EMPTY)
+				nr = 0;
+		} else {
+			nr = 0;
+		}
+
+		count_per_node[nid] = nr;
+		total += nr;
+	}
+
+	return total;
+}
+
+static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
+{
+	struct shrinker *shrinker = (struct shrinker *)m->private;
+	unsigned long *count_per_node = NULL;
+	struct mem_cgroup *memcg;
+	unsigned long total;
+	bool memcg_aware;
+	int ret, nid;
+
+	count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
+	if (!count_per_node)
+		return -ENOMEM;
+
+	ret = down_read_killable(&shrinker_rwsem);
+	if (ret) {
+		kfree(count_per_node);
+		return ret;
+	}
+	rcu_read_lock();
+
+	memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		if (memcg && !mem_cgroup_online(memcg))
+			continue;
+
+		total = shrinker_count_objects(shrinker,
+					       memcg_aware ? memcg : NULL,
+					       count_per_node);
+		if (total) {
+			seq_printf(m, "%lu", mem_cgroup_ino(memcg));
+			for_each_node(nid)
+				seq_printf(m, " %lu", count_per_node[nid]);
+			seq_puts(m, "\n");
+		}
+
+		if (!memcg_aware) {
+			mem_cgroup_iter_break(NULL, memcg);
+			break;
+		}
+
+		if (signal_pending(current)) {
+			mem_cgroup_iter_break(NULL, memcg);
+			ret = -EINTR;
+			break;
+		}
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+
+	rcu_read_unlock();
+	up_read(&shrinker_rwsem);
+
+	kfree(count_per_node);
+	return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
+
+int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+	struct dentry *entry;
+	char buf[16];
+	int id;
+
+	lockdep_assert_held(&shrinker_rwsem);
+
+	/* debugfs isn't initialized yet, add debugfs entries later. */
+	if (!shrinker_debugfs_root)
+		return 0;
+
+	id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
+	if (id < 0)
+		return id;
+	shrinker->debugfs_id = id;
+
+	snprintf(buf, sizeof(buf), "%d", id);
+
+	/* create debugfs entry */
+	entry = debugfs_create_dir(buf, shrinker_debugfs_root);
+	if (IS_ERR(entry)) {
+		ida_free(&shrinker_debugfs_ida, id);
+		return PTR_ERR(entry);
+	}
+	shrinker->debugfs_entry = entry;
+
+	debugfs_create_file("count", 0220, entry, shrinker,
+			    &shrinker_debugfs_count_fops);
+	return 0;
+}
+
+void shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+	lockdep_assert_held(&shrinker_rwsem);
+
+	if (!shrinker->debugfs_entry)
+		return;
+
+	debugfs_remove_recursive(shrinker->debugfs_entry);
+	ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+}
+
+static int __init shrinker_debugfs_init(void)
+{
+	struct shrinker *shrinker;
+	int ret;
+
+	if (!debugfs_initialized())
+		return -ENODEV;
+
+	shrinker_debugfs_root = debugfs_create_dir("shrinker", NULL);
+	if (!shrinker_debugfs_root)
+		return -ENOMEM;
+
+	/* Create debugfs entries for shrinkers registered at boot */
+	ret = down_write_killable(&shrinker_rwsem);
+	if (ret)
+		return ret;
+
+	list_for_each_entry(shrinker, &shrinker_list, list)
+		if (!shrinker->debugfs_entry)
+			ret = shrinker_debugfs_add(shrinker);
+	up_write(&shrinker_rwsem);
+
+	return ret;
+}
+late_initcall(shrinker_debugfs_init);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c6918fff06e1..024f7056b98c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -190,8 +190,8 @@  static void set_task_reclaim_state(struct task_struct *task,
 	task->reclaim_state = rs;
 }
 
-static LIST_HEAD(shrinker_list);
-static DECLARE_RWSEM(shrinker_rwsem);
+LIST_HEAD(shrinker_list);
+DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
@@ -655,6 +655,7 @@  void register_shrinker_prepared(struct shrinker *shrinker)
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	shrinker->flags |= SHRINKER_REGISTERED;
+	WARN_ON_ONCE(shrinker_debugfs_add(shrinker));
 	up_write(&shrinker_rwsem);
 }
 
@@ -682,6 +683,7 @@  void unregister_shrinker(struct shrinker *shrinker)
 	shrinker->flags &= ~SHRINKER_REGISTERED;
 	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
 		unregister_memcg_shrinker(shrinker);
+	shrinker_debugfs_remove(shrinker);
 	up_write(&shrinker_rwsem);
 
 	kfree(shrinker->nr_deferred);