diff mbox series

[5/6] perf record: Add cgroup support for off-cpu profiling

Message ID 20220518224725.742882-6-namhyung@kernel.org (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series perf record: Implement off-cpu profiling with BPF (v3) | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-15
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for Kernel LATEST on z15 with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
netdev/tree_selection success Not a local patch

Commit Message

Namhyung Kim May 18, 2022, 10:47 p.m. UTC
This covers two different use cases.  The first one is cgroup
filtering given by -G/--cgroup option which controls the off-cpu
profiling for tasks in the given cgroups only.

The other use case is cgroup sampling which is enabled by
--all-cgroups option and it adds PERF_SAMPLE_CGROUP to the sample_type
to set the cgroup id of the task in the sample data.

Example output.

  $ sudo perf record -a --off-cpu --all-cgroups sleep 1

  $ sudo perf report --stdio -s comm,cgroup --call-graph=no
  ...
  # Samples: 144  of event 'offcpu-time'
  # Event count (approx.): 48452045427
  #
  # Children      Self  Command          Cgroup
  # ........  ........  ...............  ..........................................
  #
      61.57%     5.60%  Chrome_ChildIOT  /user.slice/user-657345.slice/user@657345.service/app.slice/...
      29.51%     7.38%  Web Content      /user.slice/user-657345.slice/user@657345.service/app.slice/...
      17.48%     1.59%  Chrome_IOThread  /user.slice/user-657345.slice/user@657345.service/app.slice/...
      16.48%     4.12%  pipewire-pulse   /user.slice/user-657345.slice/user@657345.service/session.slice/...
      14.48%     2.07%  perf             /user.slice/user-657345.slice/user@657345.service/app.slice/...
      14.30%     7.15%  CompositorTileW  /user.slice/user-657345.slice/user@657345.service/app.slice/...
      13.33%     6.67%  Timer            /user.slice/user-657345.slice/user@657345.service/app.slice/...
  ...

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-record.c            |  2 +-
 tools/perf/util/bpf_off_cpu.c          | 48 ++++++++++++++++++++++++--
 tools/perf/util/bpf_skel/off_cpu.bpf.c | 33 ++++++++++++++++++
 tools/perf/util/off_cpu.h              |  7 ++--
 4 files changed, 85 insertions(+), 5 deletions(-)

Comments

Ian Rogers May 19, 2022, 4:07 a.m. UTC | #1
On Wed, May 18, 2022 at 3:47 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> This covers two different use cases.  The first one is cgroup
> filtering given by -G/--cgroup option which controls the off-cpu
> profiling for tasks in the given cgroups only.
>
> The other use case is cgroup sampling which is enabled by
> --all-cgroups option and it adds PERF_SAMPLE_CGROUP to the sample_type
> to set the cgroup id of the task in the sample data.
>
> Example output.
>
>   $ sudo perf record -a --off-cpu --all-cgroups sleep 1
>
>   $ sudo perf report --stdio -s comm,cgroup --call-graph=no
>   ...
>   # Samples: 144  of event 'offcpu-time'
>   # Event count (approx.): 48452045427
>   #
>   # Children      Self  Command          Cgroup
>   # ........  ........  ...............  ..........................................
>   #
>       61.57%     5.60%  Chrome_ChildIOT  /user.slice/user-657345.slice/user@657345.service/app.slice/...
>       29.51%     7.38%  Web Content      /user.slice/user-657345.slice/user@657345.service/app.slice/...
>       17.48%     1.59%  Chrome_IOThread  /user.slice/user-657345.slice/user@657345.service/app.slice/...
>       16.48%     4.12%  pipewire-pulse   /user.slice/user-657345.slice/user@657345.service/session.slice/...
>       14.48%     2.07%  perf             /user.slice/user-657345.slice/user@657345.service/app.slice/...
>       14.30%     7.15%  CompositorTileW  /user.slice/user-657345.slice/user@657345.service/app.slice/...
>       13.33%     6.67%  Timer            /user.slice/user-657345.slice/user@657345.service/app.slice/...
>   ...
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Acked-by: Ian Rogers <irogers@google.com>

Thanks,
Ian

> ---
>  tools/perf/builtin-record.c            |  2 +-
>  tools/perf/util/bpf_off_cpu.c          | 48 ++++++++++++++++++++++++--
>  tools/perf/util/bpf_skel/off_cpu.bpf.c | 33 ++++++++++++++++++
>  tools/perf/util/off_cpu.h              |  7 ++--
>  4 files changed, 85 insertions(+), 5 deletions(-)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 7f60d2eac0b4..77fa21c2c69f 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -907,7 +907,7 @@ static int record__config_text_poke(struct evlist *evlist)
>
>  static int record__config_off_cpu(struct record *rec)
>  {
> -       return off_cpu_prepare(rec->evlist, &rec->opts.target);
> +       return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
>  }
>
>  static bool record__kcore_readable(struct machine *machine)
> diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
> index 874856c55101..b73e84a02264 100644
> --- a/tools/perf/util/bpf_off_cpu.c
> +++ b/tools/perf/util/bpf_off_cpu.c
> @@ -5,10 +5,12 @@
>  #include "util/evlist.h"
>  #include "util/off_cpu.h"
>  #include "util/perf-hooks.h"
> +#include "util/record.h"
>  #include "util/session.h"
>  #include "util/target.h"
>  #include "util/cpumap.h"
>  #include "util/thread_map.h"
> +#include "util/cgroup.h"
>  #include <bpf/bpf.h>
>
>  #include "bpf_skel/off_cpu.skel.h"
> @@ -24,6 +26,7 @@ struct off_cpu_key {
>         u32 tgid;
>         u32 stack_id;
>         u32 state;
> +       u64 cgroup_id;
>  };
>
>  union off_cpu_data {
> @@ -116,10 +119,11 @@ static void check_sched_switch_args(void)
>         }
>  }
>
> -int off_cpu_prepare(struct evlist *evlist, struct target *target)
> +int off_cpu_prepare(struct evlist *evlist, struct target *target,
> +                   struct record_opts *opts)
>  {
>         int err, fd, i;
> -       int ncpus = 1, ntasks = 1;
> +       int ncpus = 1, ntasks = 1, ncgrps = 1;
>
>         if (off_cpu_config(evlist) < 0) {
>                 pr_err("Failed to config off-cpu BPF event\n");
> @@ -143,6 +147,21 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target)
>                 bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
>         }
>
> +       if (evlist__first(evlist)->cgrp) {
> +               ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */
> +               bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);
> +
> +               if (!cgroup_is_v2("perf_event"))
> +                       skel->rodata->uses_cgroup_v1 = true;
> +       }
> +
> +       if (opts->record_cgroup) {
> +               skel->rodata->needs_cgroup = true;
> +
> +               if (!cgroup_is_v2("perf_event"))
> +                       skel->rodata->uses_cgroup_v1 = true;
> +       }
> +
>         set_max_rlimit();
>         check_sched_switch_args();
>
> @@ -178,6 +197,29 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target)
>                 }
>         }
>
> +       if (evlist__first(evlist)->cgrp) {
> +               struct evsel *evsel;
> +               u8 val = 1;
> +
> +               skel->bss->has_cgroup = 1;
> +               fd = bpf_map__fd(skel->maps.cgroup_filter);
> +
> +               evlist__for_each_entry(evlist, evsel) {
> +                       struct cgroup *cgrp = evsel->cgrp;
> +
> +                       if (cgrp == NULL)
> +                               continue;
> +
> +                       if (!cgrp->id && read_cgroup_id(cgrp) < 0) {
> +                               pr_err("Failed to read cgroup id of %s\n",
> +                                      cgrp->name);
> +                               goto out;
> +                       }
> +
> +                       bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY);
> +               }
> +       }
> +
>         err = off_cpu_bpf__attach(skel);
>         if (err) {
>                 pr_err("Failed to attach off-cpu BPF skeleton\n");
> @@ -275,6 +317,8 @@ int off_cpu_write(struct perf_session *session)
>                         /* calculate sample callchain data array length */
>                         n += len + 2;
>                 }
> +               if (sample_type & PERF_SAMPLE_CGROUP)
> +                       data.array[n++] = key.cgroup_id;
>                 /* TODO: handle more sample types */
>
>                 size = n * sizeof(u64);
> diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
> index 986d7db6e75d..792ae2847080 100644
> --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
> +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
> @@ -26,6 +26,7 @@ struct offcpu_key {
>         __u32 tgid;
>         __u32 stack_id;
>         __u32 state;
> +       __u64 cgroup_id;
>  };
>
>  struct {
> @@ -63,6 +64,13 @@ struct {
>         __uint(max_entries, 1);
>  } task_filter SEC(".maps");
>
> +struct {
> +       __uint(type, BPF_MAP_TYPE_HASH);
> +       __uint(key_size, sizeof(__u64));
> +       __uint(value_size, sizeof(__u8));
> +       __uint(max_entries, 1);
> +} cgroup_filter SEC(".maps");
> +
>  /* old kernel task_struct definition */
>  struct task_struct___old {
>         long state;
> @@ -71,8 +79,11 @@ struct task_struct___old {
>  int enabled = 0;
>  int has_cpu = 0;
>  int has_task = 0;
> +int has_cgroup = 0;
>
>  const volatile bool has_prev_state = false;
> +const volatile bool needs_cgroup = false;
> +const volatile bool uses_cgroup_v1 = false;
>
>  /*
>   * Old kernel used to call it task_struct->state and now it's '__state'.
> @@ -92,6 +103,18 @@ static inline int get_task_state(struct task_struct *t)
>         return BPF_CORE_READ(t_old, state);
>  }
>
> +static inline __u64 get_cgroup_id(struct task_struct *t)
> +{
> +       struct cgroup *cgrp;
> +
> +       if (uses_cgroup_v1)
> +               cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
> +       else
> +               cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);
> +
> +       return BPF_CORE_READ(cgrp, kn, id);
> +}
> +
>  static inline int can_record(struct task_struct *t, int state)
>  {
>         /* kernel threads don't have user stack */
> @@ -120,6 +143,15 @@ static inline int can_record(struct task_struct *t, int state)
>                         return 0;
>         }
>
> +       if (has_cgroup) {
> +               __u8 *ok;
> +               __u64 cgrp_id = get_cgroup_id(t);
> +
> +               ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
> +               if (!ok)
> +                       return 0;
> +       }
> +
>         return 1;
>  }
>
> @@ -156,6 +188,7 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>                         .tgid = next->tgid,
>                         .stack_id = pelem->stack_id,
>                         .state = pelem->state,
> +                       .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
>                 };
>                 __u64 delta = ts - pelem->timestamp;
>                 __u64 *total;
> diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h
> index f47af0232e55..548008f74d42 100644
> --- a/tools/perf/util/off_cpu.h
> +++ b/tools/perf/util/off_cpu.h
> @@ -4,15 +4,18 @@
>  struct evlist;
>  struct target;
>  struct perf_session;
> +struct record_opts;
>
>  #define OFFCPU_EVENT  "offcpu-time"
>
>  #ifdef HAVE_BPF_SKEL
> -int off_cpu_prepare(struct evlist *evlist, struct target *target);
> +int off_cpu_prepare(struct evlist *evlist, struct target *target,
> +                   struct record_opts *opts);
>  int off_cpu_write(struct perf_session *session);
>  #else
>  static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused,
> -                                 struct target *target __maybe_unused)
> +                                 struct target *target __maybe_unused,
> +                                 struct record_opts *opts __maybe_unused)
>  {
>         return -1;
>  }
> --
> 2.36.1.124.g0e6072fb45-goog
>
diff mbox series

Patch

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 7f60d2eac0b4..77fa21c2c69f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -907,7 +907,7 @@  static int record__config_text_poke(struct evlist *evlist)
 
 static int record__config_off_cpu(struct record *rec)
 {
-	return off_cpu_prepare(rec->evlist, &rec->opts.target);
+	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
 }
 
 static bool record__kcore_readable(struct machine *machine)
diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
index 874856c55101..b73e84a02264 100644
--- a/tools/perf/util/bpf_off_cpu.c
+++ b/tools/perf/util/bpf_off_cpu.c
@@ -5,10 +5,12 @@ 
 #include "util/evlist.h"
 #include "util/off_cpu.h"
 #include "util/perf-hooks.h"
+#include "util/record.h"
 #include "util/session.h"
 #include "util/target.h"
 #include "util/cpumap.h"
 #include "util/thread_map.h"
+#include "util/cgroup.h"
 #include <bpf/bpf.h>
 
 #include "bpf_skel/off_cpu.skel.h"
@@ -24,6 +26,7 @@  struct off_cpu_key {
 	u32 tgid;
 	u32 stack_id;
 	u32 state;
+	u64 cgroup_id;
 };
 
 union off_cpu_data {
@@ -116,10 +119,11 @@  static void check_sched_switch_args(void)
 	}
 }
 
-int off_cpu_prepare(struct evlist *evlist, struct target *target)
+int off_cpu_prepare(struct evlist *evlist, struct target *target,
+		    struct record_opts *opts)
 {
 	int err, fd, i;
-	int ncpus = 1, ntasks = 1;
+	int ncpus = 1, ntasks = 1, ncgrps = 1;
 
 	if (off_cpu_config(evlist) < 0) {
 		pr_err("Failed to config off-cpu BPF event\n");
@@ -143,6 +147,21 @@  int off_cpu_prepare(struct evlist *evlist, struct target *target)
 		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
 	}
 
+	if (evlist__first(evlist)->cgrp) {
+		ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */
+		bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);
+
+		if (!cgroup_is_v2("perf_event"))
+			skel->rodata->uses_cgroup_v1 = true;
+	}
+
+	if (opts->record_cgroup) {
+		skel->rodata->needs_cgroup = true;
+
+		if (!cgroup_is_v2("perf_event"))
+			skel->rodata->uses_cgroup_v1 = true;
+	}
+
 	set_max_rlimit();
 	check_sched_switch_args();
 
@@ -178,6 +197,29 @@  int off_cpu_prepare(struct evlist *evlist, struct target *target)
 		}
 	}
 
+	if (evlist__first(evlist)->cgrp) {
+		struct evsel *evsel;
+		u8 val = 1;
+
+		skel->bss->has_cgroup = 1;
+		fd = bpf_map__fd(skel->maps.cgroup_filter);
+
+		evlist__for_each_entry(evlist, evsel) {
+			struct cgroup *cgrp = evsel->cgrp;
+
+			if (cgrp == NULL)
+				continue;
+
+			if (!cgrp->id && read_cgroup_id(cgrp) < 0) {
+				pr_err("Failed to read cgroup id of %s\n",
+				       cgrp->name);
+				goto out;
+			}
+
+			bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY);
+		}
+	}
+
 	err = off_cpu_bpf__attach(skel);
 	if (err) {
 		pr_err("Failed to attach off-cpu BPF skeleton\n");
@@ -275,6 +317,8 @@  int off_cpu_write(struct perf_session *session)
 			/* calculate sample callchain data array length */
 			n += len + 2;
 		}
+		if (sample_type & PERF_SAMPLE_CGROUP)
+			data.array[n++] = key.cgroup_id;
 		/* TODO: handle more sample types */
 
 		size = n * sizeof(u64);
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
index 986d7db6e75d..792ae2847080 100644
--- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -26,6 +26,7 @@  struct offcpu_key {
 	__u32 tgid;
 	__u32 stack_id;
 	__u32 state;
+	__u64 cgroup_id;
 };
 
 struct {
@@ -63,6 +64,13 @@  struct {
 	__uint(max_entries, 1);
 } task_filter SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64));
+	__uint(value_size, sizeof(__u8));
+	__uint(max_entries, 1);
+} cgroup_filter SEC(".maps");
+
 /* old kernel task_struct definition */
 struct task_struct___old {
 	long state;
@@ -71,8 +79,11 @@  struct task_struct___old {
 int enabled = 0;
 int has_cpu = 0;
 int has_task = 0;
+int has_cgroup = 0;
 
 const volatile bool has_prev_state = false;
+const volatile bool needs_cgroup = false;
+const volatile bool uses_cgroup_v1 = false;
 
 /*
  * Old kernel used to call it task_struct->state and now it's '__state'.
@@ -92,6 +103,18 @@  static inline int get_task_state(struct task_struct *t)
 	return BPF_CORE_READ(t_old, state);
 }
 
+static inline __u64 get_cgroup_id(struct task_struct *t)
+{
+	struct cgroup *cgrp;
+
+	if (uses_cgroup_v1)
+		cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
+	else
+		cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);
+
+	return BPF_CORE_READ(cgrp, kn, id);
+}
+
 static inline int can_record(struct task_struct *t, int state)
 {
 	/* kernel threads don't have user stack */
@@ -120,6 +143,15 @@  static inline int can_record(struct task_struct *t, int state)
 			return 0;
 	}
 
+	if (has_cgroup) {
+		__u8 *ok;
+		__u64 cgrp_id = get_cgroup_id(t);
+
+		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
+		if (!ok)
+			return 0;
+	}
+
 	return 1;
 }
 
@@ -156,6 +188,7 @@  static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
 			.tgid = next->tgid,
 			.stack_id = pelem->stack_id,
 			.state = pelem->state,
+			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
 		};
 		__u64 delta = ts - pelem->timestamp;
 		__u64 *total;
diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h
index f47af0232e55..548008f74d42 100644
--- a/tools/perf/util/off_cpu.h
+++ b/tools/perf/util/off_cpu.h
@@ -4,15 +4,18 @@ 
 struct evlist;
 struct target;
 struct perf_session;
+struct record_opts;
 
 #define OFFCPU_EVENT  "offcpu-time"
 
 #ifdef HAVE_BPF_SKEL
-int off_cpu_prepare(struct evlist *evlist, struct target *target);
+int off_cpu_prepare(struct evlist *evlist, struct target *target,
+		    struct record_opts *opts);
 int off_cpu_write(struct perf_session *session);
 #else
 static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused,
-				  struct target *target __maybe_unused)
+				  struct target *target __maybe_unused,
+				  struct record_opts *opts __maybe_unused)
 {
 	return -1;
 }