diff mbox series

[v2] mm/vmscan: expose cgroup_ino for memcg reclaim tracepoints

Message ID 1557649528-11676-1-git-send-email-laoar.shao@gmail.com (mailing list archive)
State New, archived
Headers show
Series [v2] mm/vmscan: expose cgroup_ino for memcg reclaim tracepoints | expand

Commit Message

Yafang Shao May 12, 2019, 8:25 a.m. UTC
We can use the exposed cgroup_ino to trace specified cgroup.

For example,
step 1, get the inode of the specified cgroup
	$ ls -di /tmp/cgroupv2/foo
step 2, set this inode into tracepoint filter to trace this cgroup only
	(assume the inode is 11)
	$ cd /sys/kernel/debug/tracing/events/vmscan/
	$ echo 'cgroup_ino == 11' > mm_vmscan_memcg_reclaim_begin/filter
	$ echo 'cgroup_ino == 11' > mm_vmscan_memcg_reclaim_end/filter

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>

---
v2: rebase it against the latest -mmotm
---
 include/trace/events/vmscan.h | 71 +++++++++++++++++++++++++++++++++++--------
 mm/vmscan.c                   | 18 ++++++++---
 2 files changed, 72 insertions(+), 17 deletions(-)

Comments

Andrew Morton May 22, 2019, 7:33 p.m. UTC | #1
On Sun, 12 May 2019 16:25:28 +0800 Yafang Shao <laoar.shao@gmail.com> wrote:

> We can use the exposed cgroup_ino to trace specified cgroup.
> 
> For example,
> step 1, get the inode of the specified cgroup
> 	$ ls -di /tmp/cgroupv2/foo
> step 2, set this inode into tracepoint filter to trace this cgroup only
> 	(assume the inode is 11)
> 	$ cd /sys/kernel/debug/tracing/events/vmscan/
> 	$ echo 'cgroup_ino == 11' > mm_vmscan_memcg_reclaim_begin/filter
> 	$ echo 'cgroup_ino == 11' > mm_vmscan_memcg_reclaim_end/filter

Seems straightforward enough.

But please explain the value of such a change.  What is wrong with the
current situation and how does this change improve things?  A simple
use-case scenario would be good.

I can guess why it is beneficial, but I'd rather not guess!

Thanks.
Yafang Shao May 23, 2019, 1:24 a.m. UTC | #2
On Thu, May 23, 2019 at 3:33 AM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Sun, 12 May 2019 16:25:28 +0800 Yafang Shao <laoar.shao@gmail.com> wrote:
>
> > We can use the exposed cgroup_ino to trace specified cgroup.
> >
> > For example,
> > step 1, get the inode of the specified cgroup
> >       $ ls -di /tmp/cgroupv2/foo
> > step 2, set this inode into tracepoint filter to trace this cgroup only
> >       (assume the inode is 11)
> >       $ cd /sys/kernel/debug/tracing/events/vmscan/
> >       $ echo 'cgroup_ino == 11' > mm_vmscan_memcg_reclaim_begin/filter
> >       $ echo 'cgroup_ino == 11' > mm_vmscan_memcg_reclaim_end/filter
>
> Seems straightforward enough.
>
> But please explain the value of such a change.  What is wrong with the
> current situation and how does this change improve things?  A simple
> use-case scenario would be good.
>
> I can guess why it is beneficial, but I'd rather not guess!
>

Got it.
The reason I made this change is to trace a specific container.

Sometimes there are lots of containers on one host.
Some of them are not important at all, so we don't care whether they
are under memory pressure.
Others are important, so we want to know whether these
containers are doing memcg reclaim and
how long this reclaim takes.

Without this change, we don't know which container the memcg reclaim
happened in.

Thanks
Yafang
diff mbox series

Patch

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index a5ab297..c37e228 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -127,18 +127,43 @@ 
 );
 
 #ifdef CONFIG_MEMCG
-DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
+DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_begin_template,
 
-	TP_PROTO(int order, gfp_t gfp_flags),
+	TP_PROTO(unsigned int cgroup_ino, int order, gfp_t gfp_flags),
 
-	TP_ARGS(order, gfp_flags)
+	TP_ARGS(cgroup_ino, order, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, cgroup_ino)
+		__field(int, order)
+		__field(gfp_t, gfp_flags)
+	),
+
+	TP_fast_assign(
+		__entry->cgroup_ino	= cgroup_ino;
+		__entry->order		= order;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("cgroup_ino=%u order=%d gfp_flags=%s",
+		__entry->cgroup_ino, __entry->order,
+		show_gfp_flags(__entry->gfp_flags))
 );
 
-DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template,
+	     mm_vmscan_memcg_reclaim_begin,
 
-	TP_PROTO(int order, gfp_t gfp_flags),
+	TP_PROTO(unsigned int cgroup_ino, int order, gfp_t gfp_flags),
 
-	TP_ARGS(order, gfp_flags)
+	TP_ARGS(cgroup_ino, order, gfp_flags)
+);
+
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template,
+	     mm_vmscan_memcg_softlimit_reclaim_begin,
+
+	TP_PROTO(unsigned int cgroup_ino, int order, gfp_t gfp_flags),
+
+	TP_ARGS(cgroup_ino, order, gfp_flags)
 );
 #endif /* CONFIG_MEMCG */
 
@@ -167,18 +192,40 @@ 
 );
 
 #ifdef CONFIG_MEMCG
-DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end,
+DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_end_template,
 
-	TP_PROTO(unsigned long nr_reclaimed),
+	TP_PROTO(unsigned int cgroup_ino, unsigned long nr_reclaimed),
 
-	TP_ARGS(nr_reclaimed)
+	TP_ARGS(cgroup_ino, nr_reclaimed),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, cgroup_ino)
+		__field(unsigned long, nr_reclaimed)
+	),
+
+	TP_fast_assign(
+		__entry->cgroup_ino	= cgroup_ino;
+		__entry->nr_reclaimed	= nr_reclaimed;
+	),
+
+	TP_printk("cgroup_ino=%u nr_reclaimed=%lu",
+		__entry->cgroup_ino, __entry->nr_reclaimed)
 );
 
-DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end,
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template,
+	     mm_vmscan_memcg_reclaim_end,
 
-	TP_PROTO(unsigned long nr_reclaimed),
+	TP_PROTO(unsigned int cgroup_ino, unsigned long nr_reclaimed),
 
-	TP_ARGS(nr_reclaimed)
+	TP_ARGS(cgroup_ino, nr_reclaimed)
+);
+
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template,
+	     mm_vmscan_memcg_softlimit_reclaim_end,
+
+	TP_PROTO(unsigned int cgroup_ino, unsigned long nr_reclaimed),
+
+	TP_ARGS(cgroup_ino, nr_reclaimed)
 );
 #endif /* CONFIG_MEMCG */
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d9c3e87..91c50dc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3244,8 +3244,10 @@  unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
-	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
-						      sc.gfp_mask);
+	trace_mm_vmscan_memcg_softlimit_reclaim_begin(
+					cgroup_ino(memcg->css.cgroup),
+					sc.order,
+					sc.gfp_mask);
 
 	/*
 	 * NOTE: Although we can get the priority field, using it
@@ -3256,7 +3258,9 @@  unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	 */
 	shrink_node_memcg(pgdat, memcg, &sc);
 
-	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+	trace_mm_vmscan_memcg_softlimit_reclaim_end(
+					cgroup_ino(memcg->css.cgroup),
+					sc.nr_reclaimed);
 
 	*nr_scanned = sc.nr_scanned;
 	return sc.nr_reclaimed;
@@ -3294,7 +3298,9 @@  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 
 	zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
 
-	trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
+	trace_mm_vmscan_memcg_reclaim_begin(
+				cgroup_ino(memcg->css.cgroup),
+				0, sc.gfp_mask);
 
 	psi_memstall_enter(&pflags);
 	noreclaim_flag = memalloc_noreclaim_save();
@@ -3304,7 +3310,9 @@  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 	memalloc_noreclaim_restore(noreclaim_flag);
 	psi_memstall_leave(&pflags);
 
-	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+	trace_mm_vmscan_memcg_reclaim_end(
+				cgroup_ino(memcg->css.cgroup),
+				nr_reclaimed);
 
 	return nr_reclaimed;
 }