Message ID | 20220518224725.742882-6-namhyung@kernel.org (mailing list archive) |
---|---|
State | RFC |
Delegated to: | BPF |
Headers | show |
Series | perf record: Implement off-cpu profiling with BPF (v3) | expand |
Context | Check | Description |
---|---|---|
bpf/vmtest-bpf-next-VM_Test-2 | success | Logs for Kernel LATEST on ubuntu-latest with llvm-15 |
bpf/vmtest-bpf-next-VM_Test-3 | fail | Logs for Kernel LATEST on z15 with gcc |
bpf/vmtest-bpf-next-PR | fail | PR summary |
bpf/vmtest-bpf-next-VM_Test-1 | success | Logs for Kernel LATEST on ubuntu-latest with gcc |
netdev/tree_selection | success | Not a local patch |
On Wed, May 18, 2022 at 3:47 PM Namhyung Kim <namhyung@kernel.org> wrote: > > This covers two different use cases. The first one is cgroup > filtering given by -G/--cgroup option which controls the off-cpu > profiling for tasks in the given cgroups only. > > The other use case is cgroup sampling which is enabled by > --all-cgroups option and it adds PERF_SAMPLE_CGROUP to the sample_type > to set the cgroup id of the task in the sample data. > > Example output. > > $ sudo perf record -a --off-cpu --all-cgroups sleep 1 > > $ sudo perf report --stdio -s comm,cgroup --call-graph=no > ... > # Samples: 144 of event 'offcpu-time' > # Event count (approx.): 48452045427 > # > # Children Self Command Cgroup > # ........ ........ ............... .......................................... > # > 61.57% 5.60% Chrome_ChildIOT /user.slice/user-657345.slice/user@657345.service/app.slice/... > 29.51% 7.38% Web Content /user.slice/user-657345.slice/user@657345.service/app.slice/... > 17.48% 1.59% Chrome_IOThread /user.slice/user-657345.slice/user@657345.service/app.slice/... > 16.48% 4.12% pipewire-pulse /user.slice/user-657345.slice/user@657345.service/session.slice/... > 14.48% 2.07% perf /user.slice/user-657345.slice/user@657345.service/app.slice/... > 14.30% 7.15% CompositorTileW /user.slice/user-657345.slice/user@657345.service/app.slice/... > 13.33% 6.67% Timer /user.slice/user-657345.slice/user@657345.service/app.slice/... > ... 
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Ian Rogers <irogers@google.com> Thanks, Ian > --- > tools/perf/builtin-record.c | 2 +- > tools/perf/util/bpf_off_cpu.c | 48 ++++++++++++++++++++++++-- > tools/perf/util/bpf_skel/off_cpu.bpf.c | 33 ++++++++++++++++++ > tools/perf/util/off_cpu.h | 7 ++-- > 4 files changed, 85 insertions(+), 5 deletions(-) > > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c > index 7f60d2eac0b4..77fa21c2c69f 100644 > --- a/tools/perf/builtin-record.c > +++ b/tools/perf/builtin-record.c > @@ -907,7 +907,7 @@ static int record__config_text_poke(struct evlist *evlist) > > static int record__config_off_cpu(struct record *rec) > { > - return off_cpu_prepare(rec->evlist, &rec->opts.target); > + return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts); > } > > static bool record__kcore_readable(struct machine *machine) > diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c > index 874856c55101..b73e84a02264 100644 > --- a/tools/perf/util/bpf_off_cpu.c > +++ b/tools/perf/util/bpf_off_cpu.c > @@ -5,10 +5,12 @@ > #include "util/evlist.h" > #include "util/off_cpu.h" > #include "util/perf-hooks.h" > +#include "util/record.h" > #include "util/session.h" > #include "util/target.h" > #include "util/cpumap.h" > #include "util/thread_map.h" > +#include "util/cgroup.h" > #include <bpf/bpf.h> > > #include "bpf_skel/off_cpu.skel.h" > @@ -24,6 +26,7 @@ struct off_cpu_key { > u32 tgid; > u32 stack_id; > u32 state; > + u64 cgroup_id; > }; > > union off_cpu_data { > @@ -116,10 +119,11 @@ static void check_sched_switch_args(void) > } > } > > -int off_cpu_prepare(struct evlist *evlist, struct target *target) > +int off_cpu_prepare(struct evlist *evlist, struct target *target, > + struct record_opts *opts) > { > int err, fd, i; > - int ncpus = 1, ntasks = 1; > + int ncpus = 1, ntasks = 1, ncgrps = 1; > > if (off_cpu_config(evlist) < 0) { > pr_err("Failed to config off-cpu BPF event\n"); > @@ 
-143,6 +147,21 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target) > bpf_map__set_max_entries(skel->maps.task_filter, ntasks); > } > > + if (evlist__first(evlist)->cgrp) { > + ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */ > + bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps); > + > + if (!cgroup_is_v2("perf_event")) > + skel->rodata->uses_cgroup_v1 = true; > + } > + > + if (opts->record_cgroup) { > + skel->rodata->needs_cgroup = true; > + > + if (!cgroup_is_v2("perf_event")) > + skel->rodata->uses_cgroup_v1 = true; > + } > + > set_max_rlimit(); > check_sched_switch_args(); > > @@ -178,6 +197,29 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target) > } > } > > + if (evlist__first(evlist)->cgrp) { > + struct evsel *evsel; > + u8 val = 1; > + > + skel->bss->has_cgroup = 1; > + fd = bpf_map__fd(skel->maps.cgroup_filter); > + > + evlist__for_each_entry(evlist, evsel) { > + struct cgroup *cgrp = evsel->cgrp; > + > + if (cgrp == NULL) > + continue; > + > + if (!cgrp->id && read_cgroup_id(cgrp) < 0) { > + pr_err("Failed to read cgroup id of %s\n", > + cgrp->name); > + goto out; > + } > + > + bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY); > + } > + } > + > err = off_cpu_bpf__attach(skel); > if (err) { > pr_err("Failed to attach off-cpu BPF skeleton\n"); > @@ -275,6 +317,8 @@ int off_cpu_write(struct perf_session *session) > /* calculate sample callchain data array length */ > n += len + 2; > } > + if (sample_type & PERF_SAMPLE_CGROUP) > + data.array[n++] = key.cgroup_id; > /* TODO: handle more sample types */ > > size = n * sizeof(u64); > diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c > index 986d7db6e75d..792ae2847080 100644 > --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c > +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c > @@ -26,6 +26,7 @@ struct offcpu_key { > __u32 tgid; > __u32 stack_id; > __u32 state; > + __u64 cgroup_id; > }; > > struct { > @@ -63,6 +64,13 @@ struct 
{ > __uint(max_entries, 1); > } task_filter SEC(".maps"); > > +struct { > + __uint(type, BPF_MAP_TYPE_HASH); > + __uint(key_size, sizeof(__u64)); > + __uint(value_size, sizeof(__u8)); > + __uint(max_entries, 1); > +} cgroup_filter SEC(".maps"); > + > /* old kernel task_struct definition */ > struct task_struct___old { > long state; > @@ -71,8 +79,11 @@ struct task_struct___old { > int enabled = 0; > int has_cpu = 0; > int has_task = 0; > +int has_cgroup = 0; > > const volatile bool has_prev_state = false; > +const volatile bool needs_cgroup = false; > +const volatile bool uses_cgroup_v1 = false; > > /* > * Old kernel used to call it task_struct->state and now it's '__state'. > @@ -92,6 +103,18 @@ static inline int get_task_state(struct task_struct *t) > return BPF_CORE_READ(t_old, state); > } > > +static inline __u64 get_cgroup_id(struct task_struct *t) > +{ > + struct cgroup *cgrp; > + > + if (uses_cgroup_v1) > + cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup); > + else > + cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp); > + > + return BPF_CORE_READ(cgrp, kn, id); > +} > + > static inline int can_record(struct task_struct *t, int state) > { > /* kernel threads don't have user stack */ > @@ -120,6 +143,15 @@ static inline int can_record(struct task_struct *t, int state) > return 0; > } > > + if (has_cgroup) { > + __u8 *ok; > + __u64 cgrp_id = get_cgroup_id(t); > + > + ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); > + if (!ok) > + return 0; > + } > + > return 1; > } > > @@ -156,6 +188,7 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev, > .tgid = next->tgid, > .stack_id = pelem->stack_id, > .state = pelem->state, > + .cgroup_id = needs_cgroup ? 
get_cgroup_id(next) : 0, > }; > __u64 delta = ts - pelem->timestamp; > __u64 *total; > diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h > index f47af0232e55..548008f74d42 100644 > --- a/tools/perf/util/off_cpu.h > +++ b/tools/perf/util/off_cpu.h > @@ -4,15 +4,18 @@ > struct evlist; > struct target; > struct perf_session; > +struct record_opts; > > #define OFFCPU_EVENT "offcpu-time" > > #ifdef HAVE_BPF_SKEL > -int off_cpu_prepare(struct evlist *evlist, struct target *target); > +int off_cpu_prepare(struct evlist *evlist, struct target *target, > + struct record_opts *opts); > int off_cpu_write(struct perf_session *session); > #else > static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused, > - struct target *target __maybe_unused) > + struct target *target __maybe_unused, > + struct record_opts *opts __maybe_unused) > { > return -1; > } > -- > 2.36.1.124.g0e6072fb45-goog >
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 7f60d2eac0b4..77fa21c2c69f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -907,7 +907,7 @@ static int record__config_text_poke(struct evlist *evlist) static int record__config_off_cpu(struct record *rec) { - return off_cpu_prepare(rec->evlist, &rec->opts.target); + return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts); } static bool record__kcore_readable(struct machine *machine) diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c index 874856c55101..b73e84a02264 100644 --- a/tools/perf/util/bpf_off_cpu.c +++ b/tools/perf/util/bpf_off_cpu.c @@ -5,10 +5,12 @@ #include "util/evlist.h" #include "util/off_cpu.h" #include "util/perf-hooks.h" +#include "util/record.h" #include "util/session.h" #include "util/target.h" #include "util/cpumap.h" #include "util/thread_map.h" +#include "util/cgroup.h" #include <bpf/bpf.h> #include "bpf_skel/off_cpu.skel.h" @@ -24,6 +26,7 @@ struct off_cpu_key { u32 tgid; u32 stack_id; u32 state; + u64 cgroup_id; }; union off_cpu_data { @@ -116,10 +119,11 @@ static void check_sched_switch_args(void) } } -int off_cpu_prepare(struct evlist *evlist, struct target *target) +int off_cpu_prepare(struct evlist *evlist, struct target *target, + struct record_opts *opts) { int err, fd, i; - int ncpus = 1, ntasks = 1; + int ncpus = 1, ntasks = 1, ncgrps = 1; if (off_cpu_config(evlist) < 0) { pr_err("Failed to config off-cpu BPF event\n"); @@ -143,6 +147,21 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target) bpf_map__set_max_entries(skel->maps.task_filter, ntasks); } + if (evlist__first(evlist)->cgrp) { + ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */ + bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps); + + if (!cgroup_is_v2("perf_event")) + skel->rodata->uses_cgroup_v1 = true; + } + + if (opts->record_cgroup) { + skel->rodata->needs_cgroup = true; + + if 
(!cgroup_is_v2("perf_event")) + skel->rodata->uses_cgroup_v1 = true; + } + set_max_rlimit(); check_sched_switch_args(); @@ -178,6 +197,29 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target) } } + if (evlist__first(evlist)->cgrp) { + struct evsel *evsel; + u8 val = 1; + + skel->bss->has_cgroup = 1; + fd = bpf_map__fd(skel->maps.cgroup_filter); + + evlist__for_each_entry(evlist, evsel) { + struct cgroup *cgrp = evsel->cgrp; + + if (cgrp == NULL) + continue; + + if (!cgrp->id && read_cgroup_id(cgrp) < 0) { + pr_err("Failed to read cgroup id of %s\n", + cgrp->name); + goto out; + } + + bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY); + } + } + err = off_cpu_bpf__attach(skel); if (err) { pr_err("Failed to attach off-cpu BPF skeleton\n"); @@ -275,6 +317,8 @@ int off_cpu_write(struct perf_session *session) /* calculate sample callchain data array length */ n += len + 2; } + if (sample_type & PERF_SAMPLE_CGROUP) + data.array[n++] = key.cgroup_id; /* TODO: handle more sample types */ size = n * sizeof(u64); diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c index 986d7db6e75d..792ae2847080 100644 --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c @@ -26,6 +26,7 @@ struct offcpu_key { __u32 tgid; __u32 stack_id; __u32 state; + __u64 cgroup_id; }; struct { @@ -63,6 +64,13 @@ struct { __uint(max_entries, 1); } task_filter SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cgroup_filter SEC(".maps"); + /* old kernel task_struct definition */ struct task_struct___old { long state; @@ -71,8 +79,11 @@ struct task_struct___old { int enabled = 0; int has_cpu = 0; int has_task = 0; +int has_cgroup = 0; const volatile bool has_prev_state = false; +const volatile bool needs_cgroup = false; +const volatile bool uses_cgroup_v1 = false; /* * Old kernel used to call it 
task_struct->state and now it's '__state'. @@ -92,6 +103,18 @@ static inline int get_task_state(struct task_struct *t) return BPF_CORE_READ(t_old, state); } +static inline __u64 get_cgroup_id(struct task_struct *t) +{ + struct cgroup *cgrp; + + if (uses_cgroup_v1) + cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup); + else + cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp); + + return BPF_CORE_READ(cgrp, kn, id); +} + static inline int can_record(struct task_struct *t, int state) { /* kernel threads don't have user stack */ @@ -120,6 +143,15 @@ static inline int can_record(struct task_struct *t, int state) return 0; } + if (has_cgroup) { + __u8 *ok; + __u64 cgrp_id = get_cgroup_id(t); + + ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); + if (!ok) + return 0; + } + return 1; } @@ -156,6 +188,7 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev, .tgid = next->tgid, .stack_id = pelem->stack_id, .state = pelem->state, + .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0, }; __u64 delta = ts - pelem->timestamp; __u64 *total; diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h index f47af0232e55..548008f74d42 100644 --- a/tools/perf/util/off_cpu.h +++ b/tools/perf/util/off_cpu.h @@ -4,15 +4,18 @@ struct evlist; struct target; struct perf_session; +struct record_opts; #define OFFCPU_EVENT "offcpu-time" #ifdef HAVE_BPF_SKEL -int off_cpu_prepare(struct evlist *evlist, struct target *target); +int off_cpu_prepare(struct evlist *evlist, struct target *target, + struct record_opts *opts); int off_cpu_write(struct perf_session *session); #else static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused, - struct target *target __maybe_unused) + struct target *target __maybe_unused, + struct record_opts *opts __maybe_unused) { return -1; }
This patch covers two different use cases. The first is cgroup filtering, requested with the -G/--cgroup option, which restricts off-cpu profiling to tasks in the given cgroups only. The second is cgroup sampling, enabled by the --all-cgroups option, which adds PERF_SAMPLE_CGROUP to the sample_type so that the cgroup id of the task is set in the sample data. Example output: $ sudo perf record -a --off-cpu --all-cgroups sleep 1 $ sudo perf report --stdio -s comm,cgroup --call-graph=no ... # Samples: 144 of event 'offcpu-time' # Event count (approx.): 48452045427 # # Children Self Command Cgroup # ........ ........ ............... .......................................... # 61.57% 5.60% Chrome_ChildIOT /user.slice/user-657345.slice/user@657345.service/app.slice/... 29.51% 7.38% Web Content /user.slice/user-657345.slice/user@657345.service/app.slice/... 17.48% 1.59% Chrome_IOThread /user.slice/user-657345.slice/user@657345.service/app.slice/... 16.48% 4.12% pipewire-pulse /user.slice/user-657345.slice/user@657345.service/session.slice/... 14.48% 2.07% perf /user.slice/user-657345.slice/user@657345.service/app.slice/... 14.30% 7.15% CompositorTileW /user.slice/user-657345.slice/user@657345.service/app.slice/... 13.33% 6.67% Timer /user.slice/user-657345.slice/user@657345.service/app.slice/... ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> --- tools/perf/builtin-record.c | 2 +- tools/perf/util/bpf_off_cpu.c | 48 ++++++++++++++++++++++++-- tools/perf/util/bpf_skel/off_cpu.bpf.c | 33 ++++++++++++++++++ tools/perf/util/off_cpu.h | 7 ++-- 4 files changed, 85 insertions(+), 5 deletions(-)