Message ID | 20230314234237.3008956-4-namhyung@kernel.org (mailing list archive) |
---|---|
State | Handled Elsewhere |
Delegated to: | BPF |
Headers | show |
Series | perf record: Implement BPF sample filter (v5) | expand |
Em Tue, Mar 14, 2023 at 04:42:30PM -0700, Namhyung Kim escreveu: > Use --filter option to set BPF filter for generic events other than the > tracepoints or Intel PT. The BPF program will check the sample data and > filter according to the expression. > > For example, the below is the typical perf record for frequency mode. > The sample period started from 1 and increased gradually. > > $ sudo ./perf record -e cycles true > $ sudo ./perf script > perf-exec 2272336 546683.916875: 1 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > perf-exec 2272336 546683.916892: 1 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > perf-exec 2272336 546683.916899: 3 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > perf-exec 2272336 546683.916905: 17 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > perf-exec 2272336 546683.916911: 100 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > perf-exec 2272336 546683.916917: 589 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > perf-exec 2272336 546683.916924: 3470 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > perf-exec 2272336 546683.916930: 20465 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > true 2272336 546683.916940: 119873 cycles: ffffffff8283afdd perf_iterate_ctx+0x2d ([kernel.kallsyms]) > true 2272336 546683.917003: 461349 cycles: ffffffff82892517 vma_interval_tree_insert+0x37 ([kernel.kallsyms]) > true 2272336 546683.917237: 635778 cycles: ffffffff82a11400 security_mmap_file+0x20 ([kernel.kallsyms]) > > When you add a BPF filter to get samples having periods greater than 1000, > the output would look like below: Had to add: diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index be336f1b2b689602..153a13cdca9df1ea 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -19,6 +19,7 @@ #include "mmap.h" #include "stat.h" #include "metricgroup.h" +#include "util/bpf-filter.h" #include "util/env.h" #include "util/pmu.h" #include <internal/lib.h> @@ -135,6 +136,18 @@ int bpf_counter__disable(struct evsel *evsel __maybe_unused) return 0; } +// not to drag util/bpf-filter.c + +int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused) +{ + return 0; +} + +int perf_bpf_filter__destroy(struct evsel *evsel __maybe_unused) +{ + return 0; +} + /* * Support debug printing even though util/debug.c is not linked. That means * implementing 'verbose' and 'eprintf'. Please run 'perf test' before submitting patches, - Arnaldo > $ sudo ./perf record -e cycles --filter 'period > 1000' true > $ sudo ./perf script > perf-exec 2273949 546850.708501: 5029 cycles: ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms]) > perf-exec 2273949 546850.708508: 32409 cycles: ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms]) > perf-exec 2273949 546850.708526: 143369 cycles: ffffffff82b4cdbf xas_start+0x5f ([kernel.kallsyms]) > perf-exec 2273949 546850.708600: 372650 cycles: ffffffff8286b8f7 __pagevec_lru_add+0x117 ([kernel.kallsyms]) > perf-exec 2273949 546850.708791: 482953 cycles: ffffffff829190de __mod_memcg_lruvec_state+0x4e ([kernel.kallsyms]) > true 2273949 546850.709036: 501985 cycles: ffffffff828add7c tlb_gather_mmu+0x4c ([kernel.kallsyms]) > true 2273949 546850.709292: 503065 cycles: 7f2446d97c03 _dl_map_object_deps+0x973 (/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2) > > Acked-by: Jiri Olsa <jolsa@kernel.org> > Signed-off-by: Namhyung Kim <namhyung@kernel.org> > --- > tools/perf/Documentation/perf-record.txt | 15 +++++++++++--- > tools/perf/util/bpf_counter.c | 3 +-- > tools/perf/util/evlist.c | 25 +++++++++++++++++------- > tools/perf/util/evsel.c | 2 ++ > tools/perf/util/parse-events.c | 8 +++----- > 5 files changed, 36 insertions(+), 17 deletions(-) > > diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt > index ff815c2f67e8..122f71726eaa 100644 > --- a/tools/perf/Documentation/perf-record.txt > +++ b/tools/perf/Documentation/perf-record.txt > @@ -119,9 +119,12 @@ OPTIONS > "perf report" to view group events together. > > --filter=<filter>:: > - Event filter. This option should follow an event selector (-e) which > - selects either tracepoint event(s) or a hardware trace PMU > - (e.g. Intel PT or CoreSight). > + Event filter. This option should follow an event selector (-e). > + If the event is a tracepoint, the filter string will be parsed by > + the kernel. If the event is a hardware trace PMU (e.g. Intel PT > + or CoreSight), it'll be processed as an address filter. Otherwise > + it means a general filter using BPF which can be applied for any > + kind of event. > > - tracepoint filters > > @@ -176,6 +179,12 @@ OPTIONS > > Multiple filters can be separated with space or comma. > > + - bpf filters > + > + A BPF filter can access the sample data and make a decision based on the > + data. Users need to set an appropriate sample type to use the BPF > + filter. > + > --exclude-perf:: > Don't record events issued by perf itself. This option should follow > an event selector (-e) which selects tracepoint event(s). It adds a > diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c > index aa78a15a6f0a..1b77436e067e 100644 > --- a/tools/perf/util/bpf_counter.c > +++ b/tools/perf/util/bpf_counter.c > @@ -763,8 +763,7 @@ extern struct bpf_counter_ops bperf_cgrp_ops; > > static inline bool bpf_counter_skip(struct evsel *evsel) > { > - return list_empty(&evsel->bpf_counter_list) && > - evsel->follower_skel == NULL; > + return evsel->bpf_counter_ops == NULL; > } > > int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd) > diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c > index b74e12239aec..cc491a037836 100644 > --- a/tools/perf/util/evlist.c > +++ b/tools/perf/util/evlist.c > @@ -31,6 +31,7 @@ > #include "util/evlist-hybrid.h" > #include "util/pmu.h" > #include "util/sample.h" > +#include "util/bpf-filter.h" > #include <signal.h> > #include <unistd.h> > #include <sched.h> > @@ -1086,17 +1087,27 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) > int err = 0; > > evlist__for_each_entry(evlist, evsel) { > - if (evsel->filter == NULL) > - continue; > - > /* > * filters only work for tracepoint event, which doesn't have cpu limit. > * So evlist and evsel should always be same. > */ > - err = perf_evsel__apply_filter(&evsel->core, evsel->filter); > - if (err) { > - *err_evsel = evsel; > - break; > + if (evsel->filter) { > + err = perf_evsel__apply_filter(&evsel->core, evsel->filter); > + if (err) { > + *err_evsel = evsel; > + break; > + } > + } > + > + /* > + * non-tracepoint events can have BPF filters. > + */ > + if (!list_empty(&evsel->bpf_filters)) { > + err = perf_bpf_filter__prepare(evsel); > + if (err) { > + *err_evsel = evsel; > + break; > + } > } > } > > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c > index a83d8cd5eb51..dc3faf005c3b 100644 > --- a/tools/perf/util/evsel.c > +++ b/tools/perf/util/evsel.c > @@ -50,6 +50,7 @@ > #include "off_cpu.h" > #include "../perf-sys.h" > #include "util/parse-branch-options.h" > +#include "util/bpf-filter.h" > #include <internal/xyarray.h> > #include <internal/lib.h> > #include <internal/threadmap.h> > @@ -1517,6 +1518,7 @@ void evsel__exit(struct evsel *evsel) > assert(list_empty(&evsel->core.node)); > assert(evsel->evlist == NULL); > bpf_counter__destroy(evsel); > + perf_bpf_filter__destroy(evsel); > evsel__free_counts(evsel); > perf_evsel__free_fd(&evsel->core); > perf_evsel__free_id(&evsel->core); > diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c > index 3b2e5bb3e852..6c5cf5244486 100644 > --- a/tools/perf/util/parse-events.c > +++ b/tools/perf/util/parse-events.c > @@ -28,6 +28,7 @@ > #include "perf.h" > #include "util/parse-events-hybrid.h" > #include "util/pmu-hybrid.h" > +#include "util/bpf-filter.h" > #include "tracepoint.h" > #include "thread_map.h" > > @@ -2542,11 +2543,8 @@ static int set_filter(struct evsel *evsel, const void *arg) > perf_pmu__scan_file(pmu, "nr_addr_filters", > "%d", &nr_addr_filters); > > - if (!nr_addr_filters) { > - fprintf(stderr, > - "This CPU does not support address filtering\n"); > - return -1; > - } > + if (!nr_addr_filters) > + return perf_bpf_filter__parse(&evsel->bpf_filters, str); > > if (evsel__append_addr_filter(evsel, str) < 0) { > fprintf(stderr, > -- > 2.40.0.rc1.284.g88254d51c5-goog >
Hi Arnaldo, On Wed, Mar 15, 2023 at 6:47 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote: > > Em Tue, Mar 14, 2023 at 04:42:30PM -0700, Namhyung Kim escreveu: > > Use --filter option to set BPF filter for generic events other than the > > tracepoints or Intel PT. The BPF program will check the sample data and > > filter according to the expression. > > > > For example, the below is the typical perf record for frequency mode. > > The sample period started from 1 and increased gradually. > > > > $ sudo ./perf record -e cycles true > > $ sudo ./perf script > > perf-exec 2272336 546683.916875: 1 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > > perf-exec 2272336 546683.916892: 1 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > > perf-exec 2272336 546683.916899: 3 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > > perf-exec 2272336 546683.916905: 17 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > > perf-exec 2272336 546683.916911: 100 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > > perf-exec 2272336 546683.916917: 589 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > > perf-exec 2272336 546683.916924: 3470 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > > perf-exec 2272336 546683.916930: 20465 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) > > true 2272336 546683.916940: 119873 cycles: ffffffff8283afdd perf_iterate_ctx+0x2d ([kernel.kallsyms]) > > true 2272336 546683.917003: 461349 cycles: ffffffff82892517 vma_interval_tree_insert+0x37 ([kernel.kallsyms]) > > true 2272336 546683.917237: 635778 cycles: ffffffff82a11400 security_mmap_file+0x20 ([kernel.kallsyms]) > > > > When you add a BPF filter to get samples having periods greater than 1000, > > the output would look like below: > > Had to add: > > diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c > index be336f1b2b689602..153a13cdca9df1ea 100644 > --- a/tools/perf/util/python.c > +++ b/tools/perf/util/python.c > @@ -19,6 +19,7 @@ > #include "mmap.h" > #include "stat.h" > #include "metricgroup.h" > +#include "util/bpf-filter.h" > #include "util/env.h" > #include "util/pmu.h" > #include <internal/lib.h> > @@ -135,6 +136,18 @@ int bpf_counter__disable(struct evsel *evsel __maybe_unused) > return 0; > } > > +// not to drag util/bpf-filter.c > + > +int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused) > +{ > + return 0; > +} > + > +int perf_bpf_filter__destroy(struct evsel *evsel __maybe_unused) > +{ > + return 0; > +} > + > /* > * Support debug printing even though util/debug.c is not linked. That means > * implementing 'verbose' and 'eprintf'. > > > Please run 'perf test' before submitting patches, Ugh, sorry. I think I ran it at some point but missed the python test :-p Anyway, I'm afraid you need to enclose with #ifndef HAVE_BPF_SKEL. Thanks, Namhyung > > - Arnaldo > > > $ sudo ./perf record -e cycles --filter 'period > 1000' true > > $ sudo ./perf script > > perf-exec 2273949 546850.708501: 5029 cycles: ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms]) > > perf-exec 2273949 546850.708508: 32409 cycles: ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms]) > > perf-exec 2273949 546850.708526: 143369 cycles: ffffffff82b4cdbf xas_start+0x5f ([kernel.kallsyms]) > > perf-exec 2273949 546850.708600: 372650 cycles: ffffffff8286b8f7 __pagevec_lru_add+0x117 ([kernel.kallsyms]) > > perf-exec 2273949 546850.708791: 482953 cycles: ffffffff829190de __mod_memcg_lruvec_state+0x4e ([kernel.kallsyms]) > > true 2273949 546850.709036: 501985 cycles: ffffffff828add7c tlb_gather_mmu+0x4c ([kernel.kallsyms]) > > true 2273949 546850.709292: 503065 cycles: 7f2446d97c03 _dl_map_object_deps+0x973 (/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2) > > > > Acked-by: Jiri Olsa <jolsa@kernel.org> > > Signed-off-by: Namhyung Kim <namhyung@kernel.org> > > --- > > tools/perf/Documentation/perf-record.txt | 15 +++++++++++--- > > tools/perf/util/bpf_counter.c | 3 +-- > > tools/perf/util/evlist.c | 25 +++++++++++++++++------- > > tools/perf/util/evsel.c | 2 ++ > > tools/perf/util/parse-events.c | 8 +++----- > > 5 files changed, 36 insertions(+), 17 deletions(-) > > > > diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt > > index ff815c2f67e8..122f71726eaa 100644 > > --- a/tools/perf/Documentation/perf-record.txt > > +++ b/tools/perf/Documentation/perf-record.txt > > @@ -119,9 +119,12 @@ OPTIONS > > "perf report" to view group events together. > > > > --filter=<filter>:: > > - Event filter. This option should follow an event selector (-e) which > > - selects either tracepoint event(s) or a hardware trace PMU > > - (e.g. Intel PT or CoreSight). > > + Event filter. This option should follow an event selector (-e). > > + If the event is a tracepoint, the filter string will be parsed by > > + the kernel. If the event is a hardware trace PMU (e.g. Intel PT > > + or CoreSight), it'll be processed as an address filter. Otherwise > > + it means a general filter using BPF which can be applied for any > > + kind of event. > > > > - tracepoint filters > > > > @@ -176,6 +179,12 @@ OPTIONS > > > > Multiple filters can be separated with space or comma. > > > > + - bpf filters > > + > > + A BPF filter can access the sample data and make a decision based on the > > + data. Users need to set an appropriate sample type to use the BPF > > + filter. > > + > > --exclude-perf:: > > Don't record events issued by perf itself. This option should follow > > an event selector (-e) which selects tracepoint event(s). It adds a > > diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c > > index aa78a15a6f0a..1b77436e067e 100644 > > --- a/tools/perf/util/bpf_counter.c > > +++ b/tools/perf/util/bpf_counter.c > > @@ -763,8 +763,7 @@ extern struct bpf_counter_ops bperf_cgrp_ops; > > > > static inline bool bpf_counter_skip(struct evsel *evsel) > > { > > - return list_empty(&evsel->bpf_counter_list) && > > - evsel->follower_skel == NULL; > > + return evsel->bpf_counter_ops == NULL; > > } > > > > int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd) > > diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c > > index b74e12239aec..cc491a037836 100644 > > --- a/tools/perf/util/evlist.c > > +++ b/tools/perf/util/evlist.c > > @@ -31,6 +31,7 @@ > > #include "util/evlist-hybrid.h" > > #include "util/pmu.h" > > #include "util/sample.h" > > +#include "util/bpf-filter.h" > > #include <signal.h> > > #include <unistd.h> > > #include <sched.h> > > @@ -1086,17 +1087,27 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) > > int err = 0; > > > > evlist__for_each_entry(evlist, evsel) { > > - if (evsel->filter == NULL) > > - continue; > > - > > /* > > * filters only work for tracepoint event, which doesn't have cpu limit. > > * So evlist and evsel should always be same. > > */ > > - err = perf_evsel__apply_filter(&evsel->core, evsel->filter); > > - if (err) { > > - *err_evsel = evsel; > > - break; > > + if (evsel->filter) { > > + err = perf_evsel__apply_filter(&evsel->core, evsel->filter); > > + if (err) { > > + *err_evsel = evsel; > > + break; > > + } > > + } > > + > > + /* > > + * non-tracepoint events can have BPF filters. > > + */ > > + if (!list_empty(&evsel->bpf_filters)) { > > + err = perf_bpf_filter__prepare(evsel); > > + if (err) { > > + *err_evsel = evsel; > > + break; > > + } > > } > > } > > > > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c > > index a83d8cd5eb51..dc3faf005c3b 100644 > > --- a/tools/perf/util/evsel.c > > +++ b/tools/perf/util/evsel.c > > @@ -50,6 +50,7 @@ > > #include "off_cpu.h" > > #include "../perf-sys.h" > > #include "util/parse-branch-options.h" > > +#include "util/bpf-filter.h" > > #include <internal/xyarray.h> > > #include <internal/lib.h> > > #include <internal/threadmap.h> > > @@ -1517,6 +1518,7 @@ void evsel__exit(struct evsel *evsel) > > assert(list_empty(&evsel->core.node)); > > assert(evsel->evlist == NULL); > > bpf_counter__destroy(evsel); > > + perf_bpf_filter__destroy(evsel); > > evsel__free_counts(evsel); > > perf_evsel__free_fd(&evsel->core); > > perf_evsel__free_id(&evsel->core); > > diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c > > index 3b2e5bb3e852..6c5cf5244486 100644 > > --- a/tools/perf/util/parse-events.c > > +++ b/tools/perf/util/parse-events.c > > @@ -28,6 +28,7 @@ > > #include "perf.h" > > #include "util/parse-events-hybrid.h" > > #include "util/pmu-hybrid.h" > > +#include "util/bpf-filter.h" > > #include "tracepoint.h" > > #include "thread_map.h" > > > > @@ -2542,11 +2543,8 @@ static int set_filter(struct evsel *evsel, const void *arg) > > perf_pmu__scan_file(pmu, "nr_addr_filters", > > "%d", &nr_addr_filters); > > > > - if (!nr_addr_filters) { > > - fprintf(stderr, > > - "This CPU does not support address filtering\n"); > > - return -1; > > - } > > + if (!nr_addr_filters) > > + return perf_bpf_filter__parse(&evsel->bpf_filters, str); > > > > if (evsel__append_addr_filter(evsel, str) < 0) { > > fprintf(stderr, > > -- > > 2.40.0.rc1.284.g88254d51c5-goog > > > > -- > > - Arnaldo
On March 15, 2023 1:41:29 PM GMT-03:00, Namhyung Kim <namhyung@kernel.org> wrote: >Hi Arnaldo, > >On Wed, Mar 15, 2023 at 6:47 AM Arnaldo Carvalho de Melo ><acme@kernel.org> wrote: >> >> Em Tue, Mar 14, 2023 at 04:42:30PM -0700, Namhyung Kim escreveu: >> > Use --filter option to set BPF filter for generic events other than the >> > tracepoints or Intel PT. The BPF program will check the sample data and >> > filter according to the expression. >> > >> > For example, the below is the typical perf record for frequency mode. >> > The sample period started from 1 and increased gradually. >> > >> > $ sudo ./perf record -e cycles true >> > $ sudo ./perf script >> > perf-exec 2272336 546683.916875: 1 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) >> > perf-exec 2272336 546683.916892: 1 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) >> > perf-exec 2272336 546683.916899: 3 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) >> > perf-exec 2272336 546683.916905: 17 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) >> > perf-exec 2272336 546683.916911: 100 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) >> > perf-exec 2272336 546683.916917: 589 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) >> > perf-exec 2272336 546683.916924: 3470 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) >> > perf-exec 2272336 546683.916930: 20465 cycles: ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms]) >> > true 2272336 546683.916940: 119873 cycles: ffffffff8283afdd perf_iterate_ctx+0x2d ([kernel.kallsyms]) >> > true 2272336 546683.917003: 461349 cycles: ffffffff82892517 vma_interval_tree_insert+0x37 ([kernel.kallsyms]) >> > true 2272336 546683.917237: 635778 cycles: ffffffff82a11400 security_mmap_file+0x20 ([kernel.kallsyms]) >> > >> > When you add a BPF filter to get samples having periods greater than 1000, >> > the output would look like below: >> >> Had to add: >> >> diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c >> index be336f1b2b689602..153a13cdca9df1ea 100644 >> --- a/tools/perf/util/python.c >> +++ b/tools/perf/util/python.c >> @@ -19,6 +19,7 @@ >> #include "mmap.h" >> #include "stat.h" >> #include "metricgroup.h" >> +#include "util/bpf-filter.h" >> #include "util/env.h" >> #include "util/pmu.h" >> #include <internal/lib.h> >> @@ -135,6 +136,18 @@ int bpf_counter__disable(struct evsel *evsel __maybe_unused) >> return 0; >> } >> >> +// not to drag util/bpf-filter.c >> + >> +int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused) >> +{ >> + return 0; >> +} >> + >> +int perf_bpf_filter__destroy(struct evsel *evsel __maybe_unused) >> +{ >> + return 0; >> +} >> + >> /* >> * Support debug printing even though util/debug.c is not linked. That means >> * implementing 'verbose' and 'eprintf'. >> >> >> Please run 'perf test' before submitting patches, > >Ugh, sorry. I think I ran it at some point but missed the python test :-p > >Anyway, I'm afraid you need to enclose with #ifndef HAVE_BPF_SKEL. Right, I noticed that > >Thanks, >Namhyung > > >> >> - Arnaldo >> >> > $ sudo ./perf record -e cycles --filter 'period > 1000' true >> > $ sudo ./perf script >> > perf-exec 2273949 546850.708501: 5029 cycles: ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms]) >> > perf-exec 2273949 546850.708508: 32409 cycles: ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms]) >> > perf-exec 2273949 546850.708526: 143369 cycles: ffffffff82b4cdbf xas_start+0x5f ([kernel.kallsyms]) >> > perf-exec 2273949 546850.708600: 372650 cycles: ffffffff8286b8f7 __pagevec_lru_add+0x117 ([kernel.kallsyms]) >> > perf-exec 2273949 546850.708791: 482953 cycles: ffffffff829190de __mod_memcg_lruvec_state+0x4e ([kernel.kallsyms]) >> > true 2273949 546850.709036: 501985 cycles: ffffffff828add7c tlb_gather_mmu+0x4c ([kernel.kallsyms]) >> > true 2273949 546850.709292: 503065 cycles: 7f2446d97c03 _dl_map_object_deps+0x973 (/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2) >> > >> > Acked-by: Jiri Olsa <jolsa@kernel.org> >> > Signed-off-by: Namhyung Kim <namhyung@kernel.org> >> > --- >> > tools/perf/Documentation/perf-record.txt | 15 +++++++++++--- >> > tools/perf/util/bpf_counter.c | 3 +-- >> > tools/perf/util/evlist.c | 25 +++++++++++++++++------- >> > tools/perf/util/evsel.c | 2 ++ >> > tools/perf/util/parse-events.c | 8 +++----- >> > 5 files changed, 36 insertions(+), 17 deletions(-) >> > >> > diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt >> > index ff815c2f67e8..122f71726eaa 100644 >> > --- a/tools/perf/Documentation/perf-record.txt >> > +++ b/tools/perf/Documentation/perf-record.txt >> > @@ -119,9 +119,12 @@ OPTIONS >> > "perf report" to view group events together. >> > >> > --filter=<filter>:: >> > - Event filter. This option should follow an event selector (-e) which >> > - selects either tracepoint event(s) or a hardware trace PMU >> > - (e.g. Intel PT or CoreSight). >> > + Event filter. This option should follow an event selector (-e). >> > + If the event is a tracepoint, the filter string will be parsed by >> > + the kernel. If the event is a hardware trace PMU (e.g. Intel PT >> > + or CoreSight), it'll be processed as an address filter. Otherwise >> > + it means a general filter using BPF which can be applied for any >> > + kind of event. >> > >> > - tracepoint filters >> > >> > @@ -176,6 +179,12 @@ OPTIONS >> > >> > Multiple filters can be separated with space or comma. >> > >> > + - bpf filters >> > + >> > + A BPF filter can access the sample data and make a decision based on the >> > + data. Users need to set an appropriate sample type to use the BPF >> > + filter. >> > + >> > --exclude-perf:: >> > Don't record events issued by perf itself. This option should follow >> > an event selector (-e) which selects tracepoint event(s). It adds a >> > diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c >> > index aa78a15a6f0a..1b77436e067e 100644 >> > --- a/tools/perf/util/bpf_counter.c >> > +++ b/tools/perf/util/bpf_counter.c >> > @@ -763,8 +763,7 @@ extern struct bpf_counter_ops bperf_cgrp_ops; >> > >> > static inline bool bpf_counter_skip(struct evsel *evsel) >> > { >> > - return list_empty(&evsel->bpf_counter_list) && >> > - evsel->follower_skel == NULL; >> > + return evsel->bpf_counter_ops == NULL; >> > } >> > >> > int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd) >> > diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c >> > index b74e12239aec..cc491a037836 100644 >> > --- a/tools/perf/util/evlist.c >> > +++ b/tools/perf/util/evlist.c >> > @@ -31,6 +31,7 @@ >> > #include "util/evlist-hybrid.h" >> > #include "util/pmu.h" >> > #include "util/sample.h" >> > +#include "util/bpf-filter.h" >> > #include <signal.h> >> > #include <unistd.h> >> > #include <sched.h> >> > @@ -1086,17 +1087,27 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) >> > int err = 0; >> > >> > evlist__for_each_entry(evlist, evsel) { >> > - if (evsel->filter == NULL) >> > - continue; >> > - >> > /* >> > * filters only work for tracepoint event, which doesn't have cpu limit. >> > * So evlist and evsel should always be same. >> > */ >> > - err = perf_evsel__apply_filter(&evsel->core, evsel->filter); >> > - if (err) { >> > - *err_evsel = evsel; >> > - break; >> > + if (evsel->filter) { >> > + err = perf_evsel__apply_filter(&evsel->core, evsel->filter); >> > + if (err) { >> > + *err_evsel = evsel; >> > + break; >> > + } >> > + } >> > + >> > + /* >> > + * non-tracepoint events can have BPF filters. >> > + */ >> > + if (!list_empty(&evsel->bpf_filters)) { >> > + err = perf_bpf_filter__prepare(evsel); >> > + if (err) { >> > + *err_evsel = evsel; >> > + break; >> > + } >> > } >> > } >> > >> > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c >> > index a83d8cd5eb51..dc3faf005c3b 100644 >> > --- a/tools/perf/util/evsel.c >> > +++ b/tools/perf/util/evsel.c >> > @@ -50,6 +50,7 @@ >> > #include "off_cpu.h" >> > #include "../perf-sys.h" >> > #include "util/parse-branch-options.h" >> > +#include "util/bpf-filter.h" >> > #include <internal/xyarray.h> >> > #include <internal/lib.h> >> > #include <internal/threadmap.h> >> > @@ -1517,6 +1518,7 @@ void evsel__exit(struct evsel *evsel) >> > assert(list_empty(&evsel->core.node)); >> > assert(evsel->evlist == NULL); >> > bpf_counter__destroy(evsel); >> > + perf_bpf_filter__destroy(evsel); >> > evsel__free_counts(evsel); >> > perf_evsel__free_fd(&evsel->core); >> > perf_evsel__free_id(&evsel->core); >> > diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c >> > index 3b2e5bb3e852..6c5cf5244486 100644 >> > --- a/tools/perf/util/parse-events.c >> > +++ b/tools/perf/util/parse-events.c >> > @@ -28,6 +28,7 @@ >> > #include "perf.h" >> > #include "util/parse-events-hybrid.h" >> > #include "util/pmu-hybrid.h" >> > +#include "util/bpf-filter.h" >> > #include "tracepoint.h" >> > #include "thread_map.h" >> > >> > @@ -2542,11 +2543,8 @@ static int set_filter(struct evsel *evsel, const void *arg) >> > perf_pmu__scan_file(pmu, "nr_addr_filters", >> > "%d", &nr_addr_filters); >> > >> > - if (!nr_addr_filters) { >> > - fprintf(stderr, >> > - "This CPU does not support address filtering\n"); >> > - return -1; >> > - } >> > + if (!nr_addr_filters) >> > + return perf_bpf_filter__parse(&evsel->bpf_filters, str); >> > >> > if (evsel__append_addr_filter(evsel, str) < 0) { >> > fprintf(stderr, >> > -- >> > 2.40.0.rc1.284.g88254d51c5-goog >> > >> >> -- >> >> - Arnaldo
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index ff815c2f67e8..122f71726eaa 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -119,9 +119,12 @@ OPTIONS "perf report" to view group events together. --filter=<filter>:: - Event filter. This option should follow an event selector (-e) which - selects either tracepoint event(s) or a hardware trace PMU - (e.g. Intel PT or CoreSight). + Event filter. This option should follow an event selector (-e). + If the event is a tracepoint, the filter string will be parsed by + the kernel. If the event is a hardware trace PMU (e.g. Intel PT + or CoreSight), it'll be processed as an address filter. Otherwise + it means a general filter using BPF which can be applied for any + kind of event. - tracepoint filters @@ -176,6 +179,12 @@ OPTIONS Multiple filters can be separated with space or comma. + - bpf filters + + A BPF filter can access the sample data and make a decision based on the + data. Users need to set an appropriate sample type to use the BPF + filter. + --exclude-perf:: Don't record events issued by perf itself. This option should follow an event selector (-e) which selects tracepoint event(s). It adds a diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index aa78a15a6f0a..1b77436e067e 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -763,8 +763,7 @@ extern struct bpf_counter_ops bperf_cgrp_ops; static inline bool bpf_counter_skip(struct evsel *evsel) { - return list_empty(&evsel->bpf_counter_list) && - evsel->follower_skel == NULL; + return evsel->bpf_counter_ops == NULL; } int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index b74e12239aec..cc491a037836 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -31,6 +31,7 @@ #include "util/evlist-hybrid.h" #include "util/pmu.h" #include "util/sample.h" +#include "util/bpf-filter.h" #include <signal.h> #include <unistd.h> #include <sched.h> @@ -1086,17 +1087,27 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) int err = 0; evlist__for_each_entry(evlist, evsel) { - if (evsel->filter == NULL) - continue; - /* * filters only work for tracepoint event, which doesn't have cpu limit. * So evlist and evsel should always be same. */ - err = perf_evsel__apply_filter(&evsel->core, evsel->filter); - if (err) { - *err_evsel = evsel; - break; + if (evsel->filter) { + err = perf_evsel__apply_filter(&evsel->core, evsel->filter); + if (err) { + *err_evsel = evsel; + break; + } + } + + /* + * non-tracepoint events can have BPF filters. + */ + if (!list_empty(&evsel->bpf_filters)) { + err = perf_bpf_filter__prepare(evsel); + if (err) { + *err_evsel = evsel; + break; + } } } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a83d8cd5eb51..dc3faf005c3b 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -50,6 +50,7 @@ #include "off_cpu.h" #include "../perf-sys.h" #include "util/parse-branch-options.h" +#include "util/bpf-filter.h" #include <internal/xyarray.h> #include <internal/lib.h> #include <internal/threadmap.h> @@ -1517,6 +1518,7 @@ void evsel__exit(struct evsel *evsel) assert(list_empty(&evsel->core.node)); assert(evsel->evlist == NULL); bpf_counter__destroy(evsel); + perf_bpf_filter__destroy(evsel); evsel__free_counts(evsel); perf_evsel__free_fd(&evsel->core); perf_evsel__free_id(&evsel->core); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 3b2e5bb3e852..6c5cf5244486 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -28,6 +28,7 @@ #include "perf.h" #include "util/parse-events-hybrid.h" #include "util/pmu-hybrid.h" +#include "util/bpf-filter.h" #include "tracepoint.h" #include "thread_map.h" @@ -2542,11 +2543,8 @@ static int set_filter(struct evsel *evsel, const void *arg) perf_pmu__scan_file(pmu, "nr_addr_filters", "%d", &nr_addr_filters); - if (!nr_addr_filters) { - fprintf(stderr, - "This CPU does not support address filtering\n"); - return -1; - } + if (!nr_addr_filters) + return perf_bpf_filter__parse(&evsel->bpf_filters, str); if (evsel__append_addr_filter(evsel, str) < 0) { fprintf(stderr,