[-next,v5,1/2] perf stat: Support inherit events during fork() for bperf

Message ID 20241021110201.325617-2-wutengda@huaweicloud.com (mailing list archive)
State Not Applicable
Delegated to: BPF
Series perf stat: Support inherit events for bperf

Checks

Context                Check    Description
netdev/tree_selection  success  Not a local patch

Commit Message

Tengda Wu Oct. 21, 2024, 11:02 a.m. UTC
bperf has a nice ability to share PMUs, but it still does not support
inheriting events across fork(), resulting in some deviations in its
stat results compared with perf.

perf stat result:
$ ./perf stat -e cycles,instructions -- ./perf test -w sqrtloop
   Performance counter stats for './perf test -w sqrtloop':

       2,316,038,116      cycles
       2,859,350,725      instructions

         1.009603637 seconds time elapsed

         1.004196000 seconds user
         0.003950000 seconds sys

bperf stat result:
$ ./perf stat --bpf-counters -e cycles,instructions -- \
      ./perf test -w sqrtloop

   Performance counter stats for './perf test -w sqrtloop':

          18,762,093      cycles
          23,487,766      instructions

         1.008913769 seconds time elapsed

         1.003248000 seconds user
         0.004069000 seconds sys

In order to support event inheritance, two new bpf programs are added
to monitor task fork and task exit respectively. When a task is
created, it is added to the filter map to enable counting, and it
reuses the `accum_key` of its parent task so that parent and child are
counted together. When a task exits, it is removed from the filter map
to disable counting.
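
As in regular perf stat, inheritance is enabled by default and can be
turned off with --no-inherit (-i); for example, the following
invocation would deliberately keep the old parent-only counts:

$ ./perf stat --bpf-counters --no-inherit -e cycles,instructions -- \
      ./perf test -w sqrtloop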

After support:
$ ./perf stat --bpf-counters -e cycles,instructions -- \
      ./perf test -w sqrtloop

 Performance counter stats for './perf test -w sqrtloop':

     2,316,252,189      cycles
     2,859,946,547      instructions

       1.009422314 seconds time elapsed

       1.003597000 seconds user
       0.004270000 seconds sys

Signed-off-by: Tengda Wu <wutengda@huaweicloud.com>
---
 tools/perf/builtin-stat.c                     |  1 +
 tools/perf/util/bpf_counter.c                 | 35 +++++--
 tools/perf/util/bpf_skel/bperf_follower.bpf.c | 98 +++++++++++++++++--
 tools/perf/util/bpf_skel/bperf_u.h            |  5 +
 tools/perf/util/target.h                      |  1 +
 5 files changed, 126 insertions(+), 14 deletions(-)

Comments

Namhyung Kim Oct. 22, 2024, 4:08 a.m. UTC | #1
Hello,

On Mon, Oct 21, 2024 at 11:02:00AM +0000, Tengda Wu wrote:
> [...]
> 
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 3e6b9f216e80..8bc880479417 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -2620,6 +2620,7 @@ int cmd_stat(int argc, const char **argv)
>  	} else if (big_num_opt == 0) /* User passed --no-big-num */
>  		stat_config.big_num = false;
>  
> +	target.inherit = !stat_config.no_inherit;
>  	err = target__validate(&target);
>  	if (err) {
>  		target__strerror(&target, err, errbuf, BUFSIZ);
> diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
> index 7a8af60e0f51..73fcafbffc6a 100644
> --- a/tools/perf/util/bpf_counter.c
> +++ b/tools/perf/util/bpf_counter.c
> @@ -394,6 +394,7 @@ static int bperf_check_target(struct evsel *evsel,
>  }
>  
>  static	struct perf_cpu_map *all_cpu_map;
> +static __u32 filter_entry_cnt;
>  
>  static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
>  				       struct perf_event_attr_map_entry *entry)
> @@ -444,12 +445,32 @@ static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
>  	return err;
>  }
>  
> +static int bperf_attach_follower_program(struct bperf_follower_bpf *skel,
> +					 enum bperf_filter_type filter_type,
> +					 bool inherit)
> +{
> +	struct bpf_link *link;
> +	int err = 0;
> +
> +	if ((filter_type == BPERF_FILTER_PID ||
> +	    filter_type == BPERF_FILTER_TGID) && inherit)
> +		/* attach all follower bpf progs to enable event inheritance */
> +		err = bperf_follower_bpf__attach(skel);
> +	else {
> +		link = bpf_program__attach(skel->progs.fexit_XXX);
> +		if (IS_ERR(link))
> +			err = PTR_ERR(link);
> +	}
> +
> +	return err;
> +}
> +
>  static int bperf__load(struct evsel *evsel, struct target *target)
>  {
>  	struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
>  	int attr_map_fd, diff_map_fd = -1, err;
>  	enum bperf_filter_type filter_type;
> -	__u32 filter_entry_cnt, i;
> +	__u32 i;
>  
>  	if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
>  		return -1;
> @@ -529,9 +550,6 @@ static int bperf__load(struct evsel *evsel, struct target *target)
>  	/* set up reading map */
>  	bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
>  				 filter_entry_cnt);
> -	/* set up follower filter based on target */
> -	bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
> -				 filter_entry_cnt);
>  	err = bperf_follower_bpf__load(evsel->follower_skel);
>  	if (err) {
>  		pr_err("Failed to load follower skeleton\n");
> @@ -543,6 +561,7 @@ static int bperf__load(struct evsel *evsel, struct target *target)
>  	for (i = 0; i < filter_entry_cnt; i++) {
>  		int filter_map_fd;
>  		__u32 key;
> +		struct bperf_filter_value fval = { i, 0 };
>  
>  		if (filter_type == BPERF_FILTER_PID ||
>  		    filter_type == BPERF_FILTER_TGID)
> @@ -553,12 +572,14 @@ static int bperf__load(struct evsel *evsel, struct target *target)
>  			break;
>  
>  		filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
> -		bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
> +		bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY);
>  	}
>  
>  	evsel->follower_skel->bss->type = filter_type;
> +	evsel->follower_skel->bss->inherit = target->inherit;
>  
> -	err = bperf_follower_bpf__attach(evsel->follower_skel);
> +	err = bperf_attach_follower_program(evsel->follower_skel, filter_type,
> +					    target->inherit);
>  
>  out:
>  	if (err && evsel->bperf_leader_link_fd >= 0)
> @@ -623,7 +644,7 @@ static int bperf__read(struct evsel *evsel)
>  	bperf_sync_counters(evsel);
>  	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
>  
> -	for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
> +	for (i = 0; i < filter_entry_cnt; i++) {
>  		struct perf_cpu entry;
>  		__u32 cpu;
>  
> diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
> index f193998530d4..0595063139a3 100644
> --- a/tools/perf/util/bpf_skel/bperf_follower.bpf.c
> +++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
> @@ -5,6 +5,8 @@
>  #include <bpf/bpf_tracing.h>
>  #include "bperf_u.h"
>  
> +#define MAX_ENTRIES 102400
> +
>  struct {
>  	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>  	__uint(key_size, sizeof(__u32));
> @@ -22,25 +24,29 @@ struct {
>  struct {
>  	__uint(type, BPF_MAP_TYPE_HASH);
>  	__uint(key_size, sizeof(__u32));
> -	__uint(value_size, sizeof(__u32));
> +	__uint(value_size, sizeof(struct bperf_filter_value));
> +	__uint(max_entries, MAX_ENTRIES);
> +	__uint(map_flags, BPF_F_NO_PREALLOC);
>  } filter SEC(".maps");
>  
>  enum bperf_filter_type type = 0;
>  int enabled = 0;
> +int inherit;
>  
>  SEC("fexit/XXX")
>  int BPF_PROG(fexit_XXX)
>  {
>  	struct bpf_perf_event_value *diff_val, *accum_val;
>  	__u32 filter_key, zero = 0;
> -	__u32 *accum_key;
> +	__u32 accum_key;
> +	struct bperf_filter_value *fval;
>  
>  	if (!enabled)
>  		return 0;
>  
>  	switch (type) {
>  	case BPERF_FILTER_GLOBAL:
> -		accum_key = &zero;
> +		accum_key = zero;
>  		goto do_add;
>  	case BPERF_FILTER_CPU:
>  		filter_key = bpf_get_smp_processor_id();
> @@ -49,22 +55,34 @@ int BPF_PROG(fexit_XXX)
>  		filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
>  		break;
>  	case BPERF_FILTER_TGID:
> -		filter_key = bpf_get_current_pid_tgid() >> 32;
> +		/* Use pid as the filter_key to exclude new task counts
> +		 * when inherit is disabled. Don't worry about the existing
> +		 * children in TGID losing their counts, bpf_counter has
> +		 * already added them to the filter map via perf_thread_map
> +		 * before this bpf prog runs.
> +		 */
> +		filter_key = inherit ?
> +			     bpf_get_current_pid_tgid() >> 32 :
> +			     bpf_get_current_pid_tgid() & 0xffffffff;

I'm not sure why this is needed.  Isn't the existing code fine?

Thanks,
Namhyung


>  		break;
>  	default:
>  		return 0;
>  	}
>  
> -	accum_key = bpf_map_lookup_elem(&filter, &filter_key);
> -	if (!accum_key)
> +	fval = bpf_map_lookup_elem(&filter, &filter_key);
> +	if (!fval)
>  		return 0;
>  
> +	accum_key = fval->accum_key;
> +	if (fval->exited)
> +		bpf_map_delete_elem(&filter, &filter_key);
> +
>  do_add:
>  	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
>  	if (!diff_val)
>  		return 0;
>  
> -	accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
> +	accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key);
>  	if (!accum_val)
>  		return 0;
>  
> @@ -75,4 +93,70 @@ int BPF_PROG(fexit_XXX)
>  	return 0;
>  }
>  
> +/* The program is only used for PID or TGID filter types. */
> +SEC("tp_btf/task_newtask")
> +int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags)
> +{
> +	__u32 parent_key, child_key;
> +	struct bperf_filter_value *parent_fval;
> +	struct bperf_filter_value child_fval = { 0 };
> +
> +	if (!enabled)
> +		return 0;
> +
> +	switch (type) {
> +	case BPERF_FILTER_PID:
> +		parent_key = bpf_get_current_pid_tgid() & 0xffffffff;
> +		child_key = task->pid;
> +		break;
> +	case BPERF_FILTER_TGID:
> +		parent_key = bpf_get_current_pid_tgid() >> 32;
> +		child_key = task->tgid;
> +		if (child_key == parent_key)
> +			return 0;
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	/* Check if the current task is one of the target tasks to be counted */
> +	parent_fval = bpf_map_lookup_elem(&filter, &parent_key);
> +	if (!parent_fval)
> +		return 0;
> +
> +	/* Start counting for the new task by adding it into filter map,
> +	 * inherit the accum key of its parent task so that they can be
> +	 * counted together.
> +	 */
> +	child_fval.accum_key = parent_fval->accum_key;
> +	child_fval.exited = 0;
> +	bpf_map_update_elem(&filter, &child_key, &child_fval, BPF_NOEXIST);
> +
> +	return 0;
> +}
> +
> +/* The program is only used for PID or TGID filter types. */
> +SEC("tp_btf/sched_process_exit")
> +int BPF_PROG(on_exittask, struct task_struct *task)
> +{
> +	__u32 pid;
> +	struct bperf_filter_value *fval;
> +
> +	if (!enabled)
> +		return 0;
> +
> +	/* Stop counting for this task by removing it from filter map.
> +	 * For TGID type, if the pid can be found in the map, it means that
> +	 * this pid belongs to the leader task. After the task exits, the
> +	 * tgid of its child tasks (if any) will be 1, so the pid can be
> +	 * safely removed.
> +	 */
> +	pid = task->pid;
> +	fval = bpf_map_lookup_elem(&filter, &pid);
> +	if (fval)
> +		fval->exited = 1;
> +
> +	return 0;
> +}
> +
>  char LICENSE[] SEC("license") = "Dual BSD/GPL";
> diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h
> index 1ce0c2c905c1..4a4a753980be 100644
> --- a/tools/perf/util/bpf_skel/bperf_u.h
> +++ b/tools/perf/util/bpf_skel/bperf_u.h
> @@ -11,4 +11,9 @@ enum bperf_filter_type {
>  	BPERF_FILTER_TGID,
>  };
>  
> +struct bperf_filter_value {
> +	__u32 accum_key;
> +	__u8 exited;
> +};
> +
>  #endif /* __BPERF_STAT_U_H */
> diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h
> index d582cae8e105..2ee2cc30340f 100644
> --- a/tools/perf/util/target.h
> +++ b/tools/perf/util/target.h
> @@ -17,6 +17,7 @@ struct target {
>  	bool	     default_per_cpu;
>  	bool	     per_thread;
>  	bool	     use_bpf;
> +	bool	     inherit;
>  	int	     initial_delay;
>  	const char   *attr_map;
>  };
> -- 
> 2.34.1
>
Tengda Wu Oct. 22, 2024, 9:39 a.m. UTC | #2
On 2024/10/22 12:08, Namhyung Kim wrote:
> Hello,
> 
> On Mon, Oct 21, 2024 at 11:02:00AM +0000, Tengda Wu wrote:
>> [...]
>> @@ -49,22 +55,34 @@ int BPF_PROG(fexit_XXX)
>>  		filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
>>  		break;
>>  	case BPERF_FILTER_TGID:
>> -		filter_key = bpf_get_current_pid_tgid() >> 32;
>> +		/* Use pid as the filter_key to exclude new task counts
>> +		 * when inherit is disabled. Don't worry about the existing
>> +		 * children in TGID losing their counts, bpf_counter has
>> +		 * already added them to the filter map via perf_thread_map
>> +		 * before this bpf prog runs.
>> +		 */
>> +		filter_key = inherit ?
>> +			     bpf_get_current_pid_tgid() >> 32 :
>> +			     bpf_get_current_pid_tgid() & 0xffffffff;
> 
> I'm not sure why this is needed.  Isn't the existing code fine?

No, it's not. If I don't modify it here, all child threads will always be
counted even when inherit is disabled.


Before explaining this modification, we may need to first clarify what is included
in the filter map.

1. The fexit_XXX prog determines whether to count by filter_key during each
   context switch. If the key is found in the filter map, it will be counted,
   otherwise not.
2. The keys in the filter map are synchronized from the perf_thread_map when
   bperf__load().
3. The threads in perf_thread_map are added through cmd_stat()->evlist__create_maps()
   before bperf__load().
4. evlist__create_maps() fills perf_thread_map by traversing the /proc/%d/task
   directory, and these pids belong to the same tgid.

Therefore, when the bperf command is issued, the filter map already holds all
existing threads with the same tgid as the specified process.
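
For illustration (hypothetical tids), for a target process whose tgid
is 1234:

  $ ls /proc/1234/task
  1234  1235  1236

and each of these tids ends up as a key in the filter map.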


Now, let's take a look at the TGID case. We hope the behavior is as follows:

 * TGID w/ inherit : specified process + all children from the processes
 * TGID w/o inherit: specified process (all threads in the process) only

Assuming that a new thread is created during bperf stats, the new thread should
exhibit the following behavior in the fexit_XXX prog:

 * TGID w/ inherit : do_add
 * TGID w/o inherit: skip and return

Let's now test the code before and after modification.

Before modification: (filter_key = tgid)

 * TGID w/ inherit:
      create  : new thread
      enter   : fexit_XXX prog
      assign  : filter_key = new thread's tgid
      match   : bpf_map_lookup_elem(&filter, &filter_key)
      do_add
   (PASS)

 * TGID w/o inherit:
      [...]   /* like above */
      do_add
   (FAILED, expect skip and return)

After modification: (filter_key = inherit ? tgid : pid)

 * TGID w/ inherit:
      create  : new thread
      enter   : fexit_XXX prog
      assign  : filter_key = new thread's tgid
      match   : bpf_map_lookup_elem(&filter, &filter_key)
      do_add
   (PASS)

 * TGID w/o inherit:
      create  : new thread
      enter   : fexit_XXX prog
      assign  : filter_key = new thread's pid
      mismatch: bpf_map_lookup_elem(&filter, &filter_key)
      skip and return
   (PASS)

As we can see, filter_key=tgid counts incorrectly in the TGID w/o inherit
case, and we need to change it to filter_key=pid to fix it.
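
To restate the key selection in code (a paraphrase of the hunk above;
bpf_get_current_pid_tgid() returns the tgid in the upper 32 bits and
the pid in the lower 32 bits):

	__u64 id = bpf_get_current_pid_tgid();
	__u32 pid  = (__u32)id;		/* thread id */
	__u32 tgid = (__u32)(id >> 32);	/* thread group (process) id */

	filter_key = inherit ? tgid : pid;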

Thanks,
Tengda


> 
> Thanks,
> Namhyung
Namhyung Kim Oct. 23, 2024, 10:45 p.m. UTC | #3
On Tue, Oct 22, 2024 at 05:39:23PM +0800, Tengda Wu wrote:
> 
> 
> On 2024/10/22 12:08, Namhyung Kim wrote:
> > [...]
> > 
> > I'm not sure why this is needed.  Isn't the existing code fine?
> 
> No, it's not. If I don't modify it here, all child threads will always be
> counted even when inherit is disabled.
> 
> 
> Before explaining this modification, we may need to first clarify what is included
> in the filter map.
> 
> 1. The fexit_XXX prog determines whether to count by filter_key during each
>    context switch. If the key is found in the filter map, it will be counted,
>    otherwise not.
> 2. The keys in the filter map are synchronized from the perf_thread_map when
>    bperf__load().
> 3. The threads in perf_thread_map are added through cmd_stat()->evlist__create_maps()
>    before bperf__load().
> 4. evlist__create_maps() fills perf_thread_map by traversing the /proc/%d/task
>    directory, and these pids belong to the same tgid.
> 
> Therefore, when the bperf command is issued, the filter map already holds all
> existing threads with the same tgid as the specified process.
> 
> 
> Now, let's take a look at the TGID case. We hope the behavior is as follows:
> 
>  * TGID w/ inherit : specified process + all children from the processes
>  * TGID w/o inherit: specified process (all threads in the process) only
> 
> Assuming that a new thread is created during bperf stats, the new thread should
> exhibit the following behavior in the fexit_XXX prog:
> 
>  * TGID w/ inherit : do_add
>  * TGID w/o inherit: skip and return
> 
> Let's now test the code before and after modification.
> 
> Before modification: (filter_key = tgid)
> 
>  * TGID w/ inherit:
>       create  : new thread
>       enter   : fexit_XXX prog
>       assign  : filter_key = new thread's tgid
>       match   : bpf_map_lookup_elem(&filter, &filter_key)
>       do_add
>    (PASS)
> 
>  * TGID w/o inherit:
>       [...]   /* like above */
>       do_add
>    (FAILED, expect skip and return)
> 
> After modification: (filter_key = inherit ? tgid : pid)
> 
>  * TGID w/ inherit:
>       create  : new thread
>       enter   : fexit_XXX prog
>       assign  : filter_key = new thread's tgid
>       match   : bpf_map_lookup_elem(&filter, &filter_key)
>       do_add
>    (PASS)
> 
>  * TGID w/o inherit:
>       create  : new thread
>       enter   : fexit_XXX prog
>       assign  : filter_key = new thread's pid
>       mismatch: bpf_map_lookup_elem(&filter, &filter_key)
>       skip and return
>    (PASS)
> 
> As we can see, filter_key=tgid counts incorrectly in the TGID w/o inherit
> case, and we need to change it to filter_key=pid to fix it.

I'm sorry but I don't think I'm following.  A new thread in TGID mode
(regardless of inherit) should always be counted, right?  Why do you
expect to skip it?

Thanks,
Namhyung
Tengda Wu Oct. 24, 2024, 2:10 a.m. UTC | #4
On 2024/10/24 6:45, Namhyung Kim wrote:
> On Tue, Oct 22, 2024 at 05:39:23PM +0800, Tengda Wu wrote:
>>
>>
>> On 2024/10/22 12:08, Namhyung Kim wrote:
>>> [...]
>>
>> [...]
> 
> I'm sorry but I don't think I'm following.  A new thread in TGID mode
> (regardless of inherit) should always be counted, right?  Why do you
> expect to skip it?

This is how perf originally behaves. To confirm this, I wrote a workload
that creates one new thread per second and then ran perf stat on it, as
shown below. You can see that in the 'TGID w/o inherit' case, perf does
not count the newly created threads.
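
For reference, a minimal sketch of such a workload (a reconstruction
for illustration; the actual new_thread_per_second source is not part
of this thread):

	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	static void *spin(void *arg)
	{
		(void)arg;
		printf("thread %ld: start\n", syscall(SYS_gettid));
		for (;;)
			;	/* burn cpu-clock so new threads show up in the counts */
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		printf("thread %ld: start [main]\n", syscall(SYS_gettid));
		for (;;) {
			sleep(1);	/* one new thread per second */
			pthread_create(&t, NULL, spin, NULL);
		}
	}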

Perf TGID w/ inherit:
---
  $ ./perf stat -e cpu-clock --timeout 5000 -- ./new_thread_per_second
  thread 367444: start [main]
  thread 367448: start
  thread 367455: start
  thread 367462: start
  thread 367466: start
  thread 367473: start
  ./new_thread_per_second: Terminated

  Performance counter stats for './new_thread_per_second':

          10,017.71 msec cpu-clock

        5.005538048 seconds time elapsed

        10.018777000 seconds user
        0.000000000 seconds sys

Perf TGID w/o inherit:
---
  $ ./perf stat -e cpu-clock --timeout 5000 -i -- ./new_thread_per_second
  thread 366679: start [main]
  thread 366686: start
  thread 366693: start
  thread 366697: start
  thread 366704: start
  thread 366708: start
  ./new_thread_per_second: Terminated

  Performance counter stats for './new_thread_per_second':

                4.29 msec cpu-clock

        5.005539338 seconds time elapsed

        10.019673000 seconds user
        0.000000000 seconds sys


Therefore, we also need to make this distinction in bperf so that its
results stay consistent with perf.

Bperf TGID w/o inherit: (BEFORE FIX)
---
  $ ./perf stat --bpf-counters -e cpu-clock --timeout 5000 -i -- ./new_thread_per_second
  thread 369127: start [main]
  thread 369134: start
  thread 369141: start
  thread 369145: start
  thread 369152: start
  thread 369156: start
  ./new_thread_per_second: Terminated

  Performance counter stats for './new_thread_per_second':

          10,019.05 msec cpu-clock

        5.005567266 seconds time elapsed

        10.018528000 seconds user
        0.000000000 seconds sys

Bperf TGID w/o inherit: (AFTER FIX)
---
  $ ./perf stat --bpf-counters -e cpu-clock --timeout 5000 -i -- ./new_thread_per_second
  thread 366616: start [main]
  thread 366623: start
  thread 366627: start
  thread 366634: start
  thread 366638: start
  thread 366645: start
  ./new_thread_per_second: Terminated

  Performance counter stats for './new_thread_per_second':

                4.95 msec cpu-clock

        5.005511173 seconds time elapsed

        10.018790000 seconds user
        0.000000000 seconds sys


Thanks,
Tengda
Namhyung Kim Oct. 28, 2024, 6:23 a.m. UTC | #5
On Wed, Oct 23, 2024 at 7:11 PM Tengda Wu <wutengda@huaweicloud.com> wrote:
>
>
>
> On 2024/10/24 6:45, Namhyung Kim wrote:
> > On Tue, Oct 22, 2024 at 05:39:23PM +0800, Tengda Wu wrote:
> >>
> >>
> >> On 2024/10/22 12:08, Namhyung Kim wrote:
> >>> Hello,
> >>>
> >>> On Mon, Oct 21, 2024 at 11:02:00AM +0000, Tengda Wu wrote:
> >>>> [...]
> >>>> @@ -49,22 +55,34 @@ int BPF_PROG(fexit_XXX)
> >>>>            filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
> >>>>            break;
> >>>>    case BPERF_FILTER_TGID:
> >>>> -          filter_key = bpf_get_current_pid_tgid() >> 32;
> >>>> +          /* Use pid as the filter_key to exclude new task counts
> >>>> +           * when inherit is disabled. Don't worry about the existing
> >>>> +           * children in TGID losing their counts, bpf_counter has
> >>>> +           * already added them to the filter map via perf_thread_map
> >>>> +           * before this bpf prog runs.
> >>>> +           */
> >>>> +          filter_key = inherit ?
> >>>> +                       bpf_get_current_pid_tgid() >> 32 :
> >>>> +                       bpf_get_current_pid_tgid() & 0xffffffff;
> >>>
> >>> I'm not sure why this is needed.  Isn't the existing code fine?
> >>
> >> No, it's not. Without this change, all child threads would always be
> >> counted even when inherit is disabled.
> >>
> >>
> >> Before explaining this modification, we should first clarify what the
> >> filter map contains.
> >>
> >> 1. The fexit_XXX prog decides whether to count based on filter_key during
> >>    each context switch. If the key is found in the filter map, the event
> >>    is counted, otherwise not.
> >> 2. The keys in the filter map are synchronized from the perf_thread_map in
> >>    bperf__load().
> >> 3. The threads in perf_thread_map are added through cmd_stat()->evlist__create_maps()
> >>    before bperf__load().
> >> 4. evlist__create_maps() fills perf_thread_map by traversing the /proc/%d/task
> >>    directory, and these pids all belong to the same tgid.
> >>
> >> Therefore, when the bperf command is issued, the filter map already holds all
> >> existing threads with the same tgid as the specified process.
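
   (A minimal user-space sketch of that seeding step, assuming libbpf's
   bpf_map_update_elem() and the bperf_filter_value layout added by the
   patch; seed_filter() is a made-up name, not code from the patch:)

	#include <bpf/bpf.h>		/* bpf_map_update_elem(), BPF_ANY */
	#include <linux/types.h>

	struct bperf_filter_value {
		__u32 accum_key;
		__u8 exited;
	};

	/* Seed one filter entry per pre-existing thread of the target,
	 * mirroring the bperf__load() loop in the patch. */
	static int seed_filter(int filter_fd, const __u32 *tids, __u32 nr)
	{
		for (__u32 i = 0; i < nr; i++) {
			struct bperf_filter_value fval = { .accum_key = i };

			if (bpf_map_update_elem(filter_fd, &tids[i],
						&fval, BPF_ANY))
				return -1;
		}
		return 0;
	}
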
> >>
> >>
> >> Now, let's take a look at the TGID case. The expected behavior is:
> >>
> >>  * TGID w/ inherit : specified process + all children of the process
> >>  * TGID w/o inherit: specified process (all threads in the process) only
> >>
> >> Assuming a new thread is created while bperf stat is running, the new
> >> thread should exhibit the following behavior in the fexit_XXX prog:
> >>
> >>  * TGID w/ inherit : do_add
> >>  * TGID w/o inherit: skip and return
> >>
> >> Let's now walk through the code before and after the modification.
> >>
> >> Before modification: (filter_key = tgid)
> >>
> >>  * TGID w/ inherit:
> >>       create  : new thread
> >>       enter   : fexit_XXX prog
> >>       assign  : filter_key = new thread's tgid
> >>       match   : bpf_map_lookup_elem(&filter, &filter_key)
> >>       do_add
> >>    (PASS)
> >>
> >>  * TGID w/o inherit:
> >>       [...]   /* like above */
> >>       do_add
> >>    (FAILED, expect skip and return)
> >>
> >> After modification: (filter_key = inherit ? tgid : pid)
> >>
> >>  * TGID w/ inherit:
> >>       create  : new thread
> >>       enter   : fexit_XXX prog
> >>       assign  : filter_key = new thread's tgid
> >>       match   : bpf_map_lookup_elem(&filter, &filter_key)
> >>       do_add
> >>    (PASS)
> >>
> >>  * TGID w/o inherit:
> >>       create  : new thread
> >>       enter   : fexit_XXX prog
> >>       assign  : filter_key = new thread's pid
> >>       mismatch: bpf_map_lookup_elem(&filter, &filter_key)
> >>       skip and return
> >>    (PASS)
> >>
> >> As we can see, filter_key=tgid counts incorrectly in the TGID w/o inherit
> >> case, and we need to change it to filter_key=pid to fix it.
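
   (The key selection that implements this, condensed from the patch's
   fexit_XXX TGID case:)

	__u64 pid_tgid = bpf_get_current_pid_tgid();

	/* inherit: look up by tgid so forked children match the seeded
	 * entry; no inherit: look up by pid so only the threads seeded
	 * at startup match. */
	__u32 filter_key = inherit ? (__u32)(pid_tgid >> 32)	/* tgid */
				   : (__u32)pid_tgid;		/* pid  */
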
> >
> >>> I'm sorry, but I don't think I'm following.  A new thread in TGID mode
> >>> (regardless of inherit) should always be counted, right?  Why do you
> >>> expect to skip it?
>
> This is how perf originally behaves. To confirm this, I wrote a workload
> that creates one new thread per second and then ran stat on it, as shown
> below. You can see that in the 'TGID w/o inherit' case, perf does not
> count the newly created threads.
>
> Perf TGID w/ inherit:
> ---
>   $ ./perf stat -e cpu-clock --timeout 5000 -- ./new_thread_per_second
>   thread 367444: start [main]
>   thread 367448: start
>   thread 367455: start
>   thread 367462: start
>   thread 367466: start
>   thread 367473: start
>   ./new_thread_per_second: Terminated
>
>   Performance counter stats for './new_thread_per_second':
>
>           10,017.71 msec cpu-clock
>
>         5.005538048 seconds time elapsed
>
>         10.018777000 seconds user
>         0.000000000 seconds sys
>
> Perf TGID w/o inherit:
> ---
>   $ ./perf stat -e cpu-clock --timeout 5000 -i -- ./new_thread_per_second
>   thread 366679: start [main]
>   thread 366686: start
>   thread 366693: start
>   thread 366697: start
>   thread 366704: start
>   thread 366708: start
>   ./new_thread_per_second: Terminated
>
>   Performance counter stats for './new_thread_per_second':
>
>                 4.29 msec cpu-clock
>
>         5.005539338 seconds time elapsed
>
>         10.019673000 seconds user
>         0.000000000 seconds sys
>
>
> Therefore, we also need to distinguish this case in bperf so that its
> results stay consistent with perf.
>
> Bperf TGID w/o inherit: (BEFORE FIX)
> ---
>   $ ./perf stat --bpf-counters -e cpu-clock --timeout 5000 -i -- ./new_thread_per_second
>   thread 369127: start [main]
>   thread 369134: start
>   thread 369141: start
>   thread 369145: start
>   thread 369152: start
>   thread 369156: start
>   ./new_thread_per_second: Terminated
>
>   Performance counter stats for './new_thread_per_second':
>
>           10,019.05 msec cpu-clock
>
>         5.005567266 seconds time elapsed
>
>         10.018528000 seconds user
>         0.000000000 seconds sys
>
> Bperf TGID w/o inherit: (AFTER FIX)
> ---
>   $ ./perf stat --bpf-counters -e cpu-clock --timeout 5000 -i -- ./new_thread_per_second
>   thread 366616: start [main]
>   thread 366623: start
>   thread 366627: start
>   thread 366634: start
>   thread 366638: start
>   thread 366645: start
>   ./new_thread_per_second: Terminated
>
>   Performance counter stats for './new_thread_per_second':
>
>                 4.95 msec cpu-clock
>
>         5.005511173 seconds time elapsed
>
>         10.018790000 seconds user
>         0.000000000 seconds sys
>
>
> Thanks,
> Tengda
>

Thanks for the explanation.  OK, I think it's a limitation of the current
perf_event implementation, which works at thread level.  Even though we
could count events at process level with bperf, it might be important to
keep compatibility with the existing behavior.

Thanks,
Namhyung
Tengda Wu Oct. 28, 2024, 7:50 a.m. UTC | #6
On 2024/10/28 14:23, Namhyung Kim wrote:
> [...]
> 
> Thanks for the explanation.  OK, I think it's a limitation of the current
> perf_event implementation, which works at thread level.  Even though we
> could count events at process level with bperf, it might be important to
> keep compatibility with the existing behavior.
> 
> Thanks,
> Namhyung

Yeah, thanks for all your reviews!

Best regards,
Tengda
Namhyung Kim Oct. 28, 2024, 4:30 p.m. UTC | #7
Hi Song,

On Mon, Oct 21, 2024 at 11:02:00AM +0000, Tengda Wu wrote:
> bperf has a nice ability to share PMUs, but it still does not support
> inherit events during fork(), resulting in some deviations in its stat
> results compared with perf.
> [...]

Are you ok with this now?

Thanks,
Namhyung


diff mbox series

Patch

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 3e6b9f216e80..8bc880479417 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -2620,6 +2620,7 @@  int cmd_stat(int argc, const char **argv)
 	} else if (big_num_opt == 0) /* User passed --no-big-num */
 		stat_config.big_num = false;
 
+	target.inherit = !stat_config.no_inherit;
 	err = target__validate(&target);
 	if (err) {
 		target__strerror(&target, err, errbuf, BUFSIZ);
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index 7a8af60e0f51..73fcafbffc6a 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -394,6 +394,7 @@  static int bperf_check_target(struct evsel *evsel,
 }
 
 static	struct perf_cpu_map *all_cpu_map;
+static __u32 filter_entry_cnt;
 
 static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
 				       struct perf_event_attr_map_entry *entry)
@@ -444,12 +445,32 @@  static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
 	return err;
 }
 
+static int bperf_attach_follower_program(struct bperf_follower_bpf *skel,
+					 enum bperf_filter_type filter_type,
+					 bool inherit)
+{
+	struct bpf_link *link;
+	int err = 0;
+
+	if ((filter_type == BPERF_FILTER_PID ||
+	    filter_type == BPERF_FILTER_TGID) && inherit)
+		/* attach all follower bpf progs to enable event inheritance */
+		err = bperf_follower_bpf__attach(skel);
+	else {
+		link = bpf_program__attach(skel->progs.fexit_XXX);
+		if (IS_ERR(link))
+			err = PTR_ERR(link);
+	}
+
+	return err;
+}
+
 static int bperf__load(struct evsel *evsel, struct target *target)
 {
 	struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
 	int attr_map_fd, diff_map_fd = -1, err;
 	enum bperf_filter_type filter_type;
-	__u32 filter_entry_cnt, i;
+	__u32 i;
 
 	if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
 		return -1;
@@ -529,9 +550,6 @@  static int bperf__load(struct evsel *evsel, struct target *target)
 	/* set up reading map */
 	bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
 				 filter_entry_cnt);
-	/* set up follower filter based on target */
-	bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
-				 filter_entry_cnt);
 	err = bperf_follower_bpf__load(evsel->follower_skel);
 	if (err) {
 		pr_err("Failed to load follower skeleton\n");
@@ -543,6 +561,7 @@  static int bperf__load(struct evsel *evsel, struct target *target)
 	for (i = 0; i < filter_entry_cnt; i++) {
 		int filter_map_fd;
 		__u32 key;
+		struct bperf_filter_value fval = { i, 0 };
 
 		if (filter_type == BPERF_FILTER_PID ||
 		    filter_type == BPERF_FILTER_TGID)
@@ -553,12 +572,14 @@  static int bperf__load(struct evsel *evsel, struct target *target)
 			break;
 
 		filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
-		bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
+		bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY);
 	}
 
 	evsel->follower_skel->bss->type = filter_type;
+	evsel->follower_skel->bss->inherit = target->inherit;
 
-	err = bperf_follower_bpf__attach(evsel->follower_skel);
+	err = bperf_attach_follower_program(evsel->follower_skel, filter_type,
+					    target->inherit);
 
 out:
 	if (err && evsel->bperf_leader_link_fd >= 0)
@@ -623,7 +644,7 @@  static int bperf__read(struct evsel *evsel)
 	bperf_sync_counters(evsel);
 	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
 
-	for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
+	for (i = 0; i < filter_entry_cnt; i++) {
 		struct perf_cpu entry;
 		__u32 cpu;
 
diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
index f193998530d4..0595063139a3 100644
--- a/tools/perf/util/bpf_skel/bperf_follower.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
@@ -5,6 +5,8 @@ 
 #include <bpf/bpf_tracing.h>
 #include "bperf_u.h"
 
+#define MAX_ENTRIES 102400
+
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__uint(key_size, sizeof(__u32));
@@ -22,25 +24,29 @@  struct {
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
 	__uint(key_size, sizeof(__u32));
-	__uint(value_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bperf_filter_value));
+	__uint(max_entries, MAX_ENTRIES);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
 } filter SEC(".maps");
 
 enum bperf_filter_type type = 0;
 int enabled = 0;
+int inherit;
 
 SEC("fexit/XXX")
 int BPF_PROG(fexit_XXX)
 {
 	struct bpf_perf_event_value *diff_val, *accum_val;
 	__u32 filter_key, zero = 0;
-	__u32 *accum_key;
+	__u32 accum_key;
+	struct bperf_filter_value *fval;
 
 	if (!enabled)
 		return 0;
 
 	switch (type) {
 	case BPERF_FILTER_GLOBAL:
-		accum_key = &zero;
+		accum_key = zero;
 		goto do_add;
 	case BPERF_FILTER_CPU:
 		filter_key = bpf_get_smp_processor_id();
@@ -49,22 +55,34 @@  int BPF_PROG(fexit_XXX)
 		filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
 		break;
 	case BPERF_FILTER_TGID:
-		filter_key = bpf_get_current_pid_tgid() >> 32;
+		/* Use pid as the filter_key to exclude new task counts
+		 * when inherit is disabled. Don't worry about the existing
+		 * children in TGID losing their counts, bpf_counter has
+		 * already added them to the filter map via perf_thread_map
+		 * before this bpf prog runs.
+		 */
+		filter_key = inherit ?
+			     bpf_get_current_pid_tgid() >> 32 :
+			     bpf_get_current_pid_tgid() & 0xffffffff;
 		break;
 	default:
 		return 0;
 	}
 
-	accum_key = bpf_map_lookup_elem(&filter, &filter_key);
-	if (!accum_key)
+	fval = bpf_map_lookup_elem(&filter, &filter_key);
+	if (!fval)
 		return 0;
 
+	accum_key = fval->accum_key;
+	if (fval->exited)
+		bpf_map_delete_elem(&filter, &filter_key);
+
 do_add:
 	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
 	if (!diff_val)
 		return 0;
 
-	accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
+	accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key);
 	if (!accum_val)
 		return 0;
 
@@ -75,4 +93,70 @@  int BPF_PROG(fexit_XXX)
 	return 0;
 }
 
+/* The program is only used for PID or TGID filter types. */
+SEC("tp_btf/task_newtask")
+int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags)
+{
+	__u32 parent_key, child_key;
+	struct bperf_filter_value *parent_fval;
+	struct bperf_filter_value child_fval = { 0 };
+
+	if (!enabled)
+		return 0;
+
+	switch (type) {
+	case BPERF_FILTER_PID:
+		parent_key = bpf_get_current_pid_tgid() & 0xffffffff;
+		child_key = task->pid;
+		break;
+	case BPERF_FILTER_TGID:
+		parent_key = bpf_get_current_pid_tgid() >> 32;
+		child_key = task->tgid;
+		if (child_key == parent_key)
+			return 0;
+		break;
+	default:
+		return 0;
+	}
+
+	/* Check if the current task is one of the target tasks to be counted */
+	parent_fval = bpf_map_lookup_elem(&filter, &parent_key);
+	if (!parent_fval)
+		return 0;
+
+	/* Start counting for the new task by adding it into filter map,
+	 * inherit the accum key of its parent task so that they can be
+	 * counted together.
+	 */
+	child_fval.accum_key = parent_fval->accum_key;
+	child_fval.exited = 0;
+	bpf_map_update_elem(&filter, &child_key, &child_fval, BPF_NOEXIST);
+
+	return 0;
+}
+
+/* The program is only used for PID or TGID filter types. */
+SEC("tp_btf/sched_process_exit")
+int BPF_PROG(on_exittask, struct task_struct *task)
+{
+	__u32 pid;
+	struct bperf_filter_value *fval;
+
+	if (!enabled)
+		return 0;
+
+	/* Stop counting: mark this task for removal from the filter map;
+	 * fexit_XXX deletes the entry after the final delta is added. For
+	 * TGID type, a pid found in the map belongs to the leader task;
+	 * after it exits, its child tasks (if any) are reparented to init
+	 * (pid 1), so the pid can be safely removed.
+	 */
+	pid = task->pid;
+	fval = bpf_map_lookup_elem(&filter, &pid);
+	if (fval)
+		fval->exited = 1;
+
+	return 0;
+}
+
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h
index 1ce0c2c905c1..4a4a753980be 100644
--- a/tools/perf/util/bpf_skel/bperf_u.h
+++ b/tools/perf/util/bpf_skel/bperf_u.h
@@ -11,4 +11,9 @@  enum bperf_filter_type {
 	BPERF_FILTER_TGID,
 };
 
+struct bperf_filter_value {
+	__u32 accum_key;
+	__u8 exited;
+};
+
 #endif /* __BPERF_STAT_U_H */
diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h
index d582cae8e105..2ee2cc30340f 100644
--- a/tools/perf/util/target.h
+++ b/tools/perf/util/target.h
@@ -17,6 +17,7 @@  struct target {
 	bool	     default_per_cpu;
 	bool	     per_thread;
 	bool	     use_bpf;
+	bool	     inherit;
 	int	     initial_delay;
 	const char   *attr_map;
 };
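
   (For reference, the exit path defers map deletion: tp_btf/sched_process_exit
   only marks the entry, and the task's next fexit_XXX still reaches the
   accumulation step via the saved accum_key before the entry disappears.
   Condensed from the two programs above:)

	/* on_exittask: mark only, so the delta since the last
	 * context switch is not lost */
	fval = bpf_map_lookup_elem(&filter, &pid);
	if (fval)
		fval->exited = 1;

	/* fexit_XXX: save the accum key and drop the entry; the
	 * final delta is still added under the saved key */
	accum_key = fval->accum_key;
	if (fval->exited)
		bpf_map_delete_elem(&filter, &filter_key);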