
[v3,bpf-next,2/3] bpf: introduce helper bpf_get_branch_snapshot

Message ID 20210830214106.4142056-3-songliubraving@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf: introduce bpf_get_branch_snapshot

Checks

Context Check Description
bpf/vmtest-bpf-next success Kernel LATEST + selftests
netdev/cover_letter success
netdev/fixes_present success
netdev/patch_count success
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success
netdev/cc_maintainers warning 14 maintainers not CCed: joe@cilium.io andrii@kernel.org daniel@iogearbox.net kpsingh@kernel.org netdev@vger.kernel.org yhs@fb.com jackmanb@google.com brouer@redhat.com haoluo@google.com ast@kernel.org quentin@isovalent.com rostedt@goodmis.org kafai@fb.com john.fastabend@gmail.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 11923 this patch: 11923
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success
netdev/checkpatch warning CHECK: Lines should not end with a '(' WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 11445 this patch: 11445
netdev/header_inline success

Commit Message

Song Liu Aug. 30, 2021, 9:41 p.m. UTC
Introduce bpf_get_branch_snapshot(), which allows a tracing program to get
branch trace from hardware (e.g. Intel LBR). To use the feature, the
user needs to create a perf_event with proper branch_record filtering
on each cpu, and then call bpf_get_branch_snapshot in the BPF program.
On Intel CPUs, the VLBR event (raw event 0x1b00) can be used for this.
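
A rough usage sketch (the attach point, buffer size and branch filter below
are only illustrative, and error handling is omitted). User space opens the
raw VLBR event on every CPU:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static void open_vlbr_events(int nr_cpus)
{
	struct perf_event_attr attr = {
		.size = sizeof(struct perf_event_attr),
		.type = PERF_TYPE_RAW,
		.config = 0x1b00,		/* Intel VLBR event */
		.sample_type = PERF_SAMPLE_BRANCH_STACK,
		.branch_sample_type = PERF_SAMPLE_BRANCH_KERNEL |
				      PERF_SAMPLE_BRANCH_ANY,
	};
	int cpu;

	/* keep the returned fds open for as long as branch data is needed */
	for (cpu = 0; cpu < nr_cpus; cpu++)
		syscall(__NR_perf_event_open, &attr, -1 /* pid */, cpu,
			-1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
}

and the BPF program calls the new helper (assuming libbpf's generated
bpf_helper_defs.h already carries the bpf_get_branch_snapshot declaration):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct perf_branch_entry entries[16];

SEC("fexit/bpf_fentry_test1")		/* attach point is just an example */
int BPF_PROG(snapshot_branches)
{
	long nr;

	nr = bpf_get_branch_snapshot(entries, sizeof(entries));
	if (nr < 0)	/* e.g. -EOPNOTSUPP when no snapshot is available */
		return 0;

	/* with this revision, nr is the number of valid entries */
	return 0;
}

char _license[] SEC("license") = "GPL";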

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/bpf.h            |  2 ++
 include/linux/filter.h         |  3 ++-
 include/uapi/linux/bpf.h       | 16 +++++++++++++
 kernel/bpf/trampoline.c        | 13 ++++++++++
 kernel/bpf/verifier.c          | 12 ++++++++++
 kernel/trace/bpf_trace.c       | 43 ++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h | 16 +++++++++++++
 7 files changed, 104 insertions(+), 1 deletion(-)

Comments

Andrii Nakryiko Aug. 30, 2021, 10:14 p.m. UTC | #1
On Mon, Aug 30, 2021 at 2:42 PM Song Liu <songliubraving@fb.com> wrote:
>
> Introduce bpf_get_branch_snapshot(), which allows a tracing program to get
> branch trace from hardware (e.g. Intel LBR). To use the feature, the
> user needs to create a perf_event with proper branch_record filtering
> on each cpu, and then call bpf_get_branch_snapshot in the BPF program.
> On Intel CPUs, the VLBR event (raw event 0x1b00) can be used for this.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  include/linux/bpf.h            |  2 ++
>  include/linux/filter.h         |  3 ++-
>  include/uapi/linux/bpf.h       | 16 +++++++++++++
>  kernel/bpf/trampoline.c        | 13 ++++++++++
>  kernel/bpf/verifier.c          | 12 ++++++++++
>  kernel/trace/bpf_trace.c       | 43 ++++++++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h | 16 +++++++++++++
>  7 files changed, 104 insertions(+), 1 deletion(-)
>

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 206c221453cfa..72e8b49da0bf9 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -6446,6 +6446,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
>                 env->prog->call_get_func_ip = true;
>         }
>
> +       if (func_id == BPF_FUNC_get_branch_snapshot) {
> +               if (env->prog->aux->sleepable) {
> +                       verbose(env, "sleepable progs cannot call get_branch_snapshot\n");
> +                       return -ENOTSUPP;
> +               }
> +               if (!IS_ENABLED(CONFIG_PERF_EVENTS)) {
> +                       verbose(env, "func %s#%d not supported without CONFIG_PERF_EVENTS\n",
> +                               func_id_name(func_id), func_id);
> +                       return -ENOTSUPP;
> +               }
> +               env->prog->call_get_branch = true;
> +       }
>         if (changes_data)
>                 clear_all_pkt_pointers(env);
>         return 0;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 8e2eb950aa829..a01f26b7877e6 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1017,6 +1017,33 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
>         .arg1_type      = ARG_PTR_TO_CTX,
>  };
>
> +BPF_CALL_2(bpf_get_branch_snapshot, void *, buf, u32, size)

I bet we'll need u64 flags over time, let's add it right now. It's
similar to bpf_read_branch_records().

> +{
> +#ifdef CONFIG_PERF_EVENTS
> +       u32 max_size;
> +
> +       if (this_cpu_ptr(&bpf_perf_branch_snapshot)->nr == 0)
> +               return -EOPNOTSUPP;
> +
> +       max_size = this_cpu_ptr(&bpf_perf_branch_snapshot)->nr *
> +               sizeof(struct perf_branch_entry);
> +       memcpy(buf, this_cpu_ptr(&bpf_perf_branch_snapshot)->entries,
> +              min_t(u32, size, max_size));
> +

Check the bpf_read_branch_records() implementation and its argument
validation logic. Let's keep them consistent (e.g., it enforces that
size is a multiple of sizeof(struct perf_branch_entry)). Another
difference is that bpf_read_branch_records() returns number of bytes
filled, not number of records. That's consistent with accepting size
as number of bytes. Let's stick to this convention then, so bytes
everywhere.
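
Putting those two points together, the helper body could end up looking
roughly like this (a sketch of the suggestion, not Song's code; the exact
error codes are a guess):

BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
#ifdef CONFIG_PERF_EVENTS
	u32 entry_size = sizeof(struct perf_branch_entry);
	u32 to_copy;

	if (unlikely(flags))	/* no flags defined yet */
		return -EINVAL;

	if (unlikely(size % entry_size))
		return -EINVAL;

	if (unlikely(!this_cpu_ptr(&bpf_perf_branch_snapshot)->nr))
		return -ENOENT;

	to_copy = min_t(u32, size,
			this_cpu_ptr(&bpf_perf_branch_snapshot)->nr * entry_size);
	memcpy(buf, this_cpu_ptr(&bpf_perf_branch_snapshot)->entries, to_copy);

	return to_copy;		/* bytes, like bpf_read_branch_records() */
#else
	return -EOPNOTSUPP;
#endif
}

The proto would then also grow a third argument (ARG_ANYTHING for flags).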


> +       return this_cpu_ptr(&bpf_perf_branch_snapshot)->nr;
> +#else
> +       return -EOPNOTSUPP;
> +#endif
> +}
> +
> +static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
> +       .func           = bpf_get_branch_snapshot,
> +       .gpl_only       = true,
> +       .ret_type       = RET_INTEGER,
> +       .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> +       .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
> +};
> +

[...]
kernel test robot Aug. 31, 2021, 11:16 a.m. UTC | #2
Hi Song,

I love your patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Song-Liu/bpf-introduce-bpf_get_branch_snapshot/20210831-054455
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: x86_64-randconfig-a003-20210831 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/09548da8ea3e9a75d0d969b18c148ce17ed1c97d
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Song-Liu/bpf-introduce-bpf_get_branch_snapshot/20210831-054455
        git checkout 09548da8ea3e9a75d0d969b18c148ce17ed1c97d
        # save the attached .config to linux build tree
        mkdir build_dir
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: kernel/bpf/trampoline.o: in function `__bpf_prog_enter':
>> kernel/bpf/trampoline.c:578: undefined reference to `bpf_perf_branch_snapshot'


vim +578 kernel/bpf/trampoline.c

   551	
   552	/* The logic is similar to bpf_prog_run(), but with an explicit
   553	 * rcu_read_lock() and migrate_disable() which are required
   554	 * for the trampoline. The macro is split into
   555	 * call __bpf_prog_enter
   556	 * call prog->bpf_func
   557	 * call __bpf_prog_exit
   558	 *
   559	 * __bpf_prog_enter returns:
   560	 * 0 - skip execution of the bpf prog
   561	 * 1 - execute bpf prog
   562	 * [2..MAX_U64] - execute bpf prog and record execution time.
   563	 *     This is start time.
   564	 */
   565	u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
   566		__acquires(RCU)
   567	{
   568	#ifdef CONFIG_PERF_EVENTS
   569		/* Calling migrate_disable costs two entries in the LBR. To save
   570		 * some entries, we call perf_snapshot_branch_stack before
   571		 * migrate_disable to save some entries. This is OK because we
   572		 * care about the branch trace before entering the BPF program.
   573		 * If migrate happens exactly here, there isn't much we can do to
   574		 * preserve the data.
   575		 */
   576		if (prog->call_get_branch)
   577			static_call(perf_snapshot_branch_stack)(
 > 578				this_cpu_ptr(&bpf_perf_branch_snapshot));
   579	#endif
   580		rcu_read_lock();
   581		migrate_disable();
   582		if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
   583			inc_misses_counter(prog);
   584			return 0;
   585		}
   586		return bpf_prog_start_time();
   587	}
   588	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Peter Zijlstra Aug. 31, 2021, 3:32 p.m. UTC | #3
On Mon, Aug 30, 2021 at 02:41:05PM -0700, Song Liu wrote:

> @@ -564,6 +565,18 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
>  u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
>  	__acquires(RCU)
>  {
	preempt_disable_notrace();

> +#ifdef CONFIG_PERF_EVENTS
> +	/* Calling migrate_disable costs two entries in the LBR. To save
> +	 * some entries, we call perf_snapshot_branch_stack before
> +	 * migrate_disable to save some entries. This is OK because we
> +	 * care about the branch trace before entering the BPF program.
> +	 * If migrate happens exactly here, there isn't much we can do to
> +	 * preserve the data.
> +	 */
> +	if (prog->call_get_branch)
> +		static_call(perf_snapshot_branch_stack)(
> +			this_cpu_ptr(&bpf_perf_branch_snapshot));

Here the comment is accurate, but if you recall the calling context
requirements of perf_snapshot_branch_stack from the last patch, you'll
see it requires you have at the very least preemption disabled, which
you just violated.

I think you'll find that (on x86 at least) the suggested
preempt_disable_notrace() incurs no additional branches.
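
Concretely, if I read the suggestion right (together with the matching
preempt_enable_notrace() placement noted a bit further down),
__bpf_prog_enter() would end up roughly like this; a sketch, not a tested
patch:

u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
	__acquires(RCU)
{
	preempt_disable_notrace();
#ifdef CONFIG_PERF_EVENTS
	/* take the snapshot with preemption disabled, as
	 * perf_snapshot_branch_stack requires; on x86 the notrace
	 * preempt toggles add no extra branches to the LBR
	 */
	if (prog->call_get_branch)
		static_call(perf_snapshot_branch_stack)(
			this_cpu_ptr(&bpf_perf_branch_snapshot));
#endif
	rcu_read_lock();
	migrate_disable();
	preempt_enable_notrace();
	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
		inc_misses_counter(prog);
		return 0;
	}
	return bpf_prog_start_time();
}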

Still, there is the next point to consider...

> +#endif
>  	rcu_read_lock();
>  	migrate_disable();

	preempt_enable_notrace();

>  	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {

> @@ -1863,9 +1892,23 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
>  	preempt_enable();
>  }
>  
> +DEFINE_PER_CPU(struct perf_branch_snapshot, bpf_perf_branch_snapshot);
> +
>  static __always_inline
>  void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
>  {
> +#ifdef CONFIG_PERF_EVENTS
> +	/* Calling migrate_disable costs two entries in the LBR. To save
> +	 * some entries, we call perf_snapshot_branch_stack before
> +	 * migrate_disable to save some entries. This is OK because we
> +	 * care about the branch trace before entering the BPF program.
> +	 * If migrate happens exactly here, there isn't much we can do to
> +	 * preserve the data.
> +	 */
> +	if (prog->call_get_branch)
> +		static_call(perf_snapshot_branch_stack)(
> +			this_cpu_ptr(&bpf_perf_branch_snapshot));
> +#endif
>  	cant_sleep();

In the face of ^^^^^^ the comment makes no sense. Still, what are the
nesting rules for __bpf_trace_run() and __bpf_prog_enter() ? I'm
thinking the trace one can nest inside an occurrence of prog, at which
point you have pieces.

>  	rcu_read_lock();
>  	(void) bpf_prog_run(prog, args);
Song Liu Aug. 31, 2021, 4:41 p.m. UTC | #4
> On Aug 31, 2021, at 8:32 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Mon, Aug 30, 2021 at 02:41:05PM -0700, Song Liu wrote:
> 
>> @@ -564,6 +565,18 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
>> u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
>> 	__acquires(RCU)
>> {
> 	preempt_disable_notrace();
> 
>> +#ifdef CONFIG_PERF_EVENTS
>> +	/* Calling migrate_disable costs two entries in the LBR. To save
>> +	 * some entries, we call perf_snapshot_branch_stack before
>> +	 * migrate_disable to save some entries. This is OK because we
>> +	 * care about the branch trace before entering the BPF program.
>> +	 * If migrate happens exactly here, there isn't much we can do to
>> +	 * preserve the data.
>> +	 */
>> +	if (prog->call_get_branch)
>> +		static_call(perf_snapshot_branch_stack)(
>> +			this_cpu_ptr(&bpf_perf_branch_snapshot));
> 
> Here the comment is accurate, but if you recall the calling context
> requirements of perf_snapshot_branch_stack from the last patch, you'll
> see it requires you have at the very least preemption disabled, which
> you just violated.

> 
> I think you'll find that (on x86 at least) the suggested
> preempt_disable_notrace() incurs no additional branches.
> 
> Still, there is the next point to consider...
> 
>> +#endif
>> 	rcu_read_lock();
>> 	migrate_disable();
> 
> 	preempt_enable_notrace();

Do we want preempt_enable_notrace() after migrate_disable()? It feels a 
little weird to me.

> 
>> 	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
> 
>> @@ -1863,9 +1892,23 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
>> 	preempt_enable();
>> }
>> 
>> +DEFINE_PER_CPU(struct perf_branch_snapshot, bpf_perf_branch_snapshot);
>> +
>> static __always_inline
>> void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
>> {
>> +#ifdef CONFIG_PERF_EVENTS
>> +	/* Calling migrate_disable costs two entries in the LBR. To save
>> +	 * some entries, we call perf_snapshot_branch_stack before
>> +	 * migrate_disable to save some entries. This is OK because we
>> +	 * care about the branch trace before entering the BPF program.
>> +	 * If migrate happens exactly here, there isn't much we can do to
>> +	 * preserve the data.
>> +	 */
>> +	if (prog->call_get_branch)
>> +		static_call(perf_snapshot_branch_stack)(
>> +			this_cpu_ptr(&bpf_perf_branch_snapshot));
>> +#endif
>> 	cant_sleep();
> 
> In the face of ^^^^^^ the comment makes no sense. Still, what are the
> nesting rules for __bpf_trace_run() and __bpf_prog_enter() ? I'm
> thinking the trace one can nest inside an occurrence of prog, at which
> point you have pieces.

I think broken LBR records are something we cannot really avoid in the case
of nesting. OTOH, these should be rare cases and will not hurt the results
in most use cases.

I should probably tighten the rules in the verifier to only apply it to
__bpf_prog_enter (where we have the primary use case). We can enable it
for other program types when there are other use cases.

Thanks,
Song
Song Liu Aug. 31, 2021, 9:24 p.m. UTC | #5
> On Aug 31, 2021, at 9:41 AM, Song Liu <songliubraving@fb.com> wrote:
> 
> 
> 
>> On Aug 31, 2021, at 8:32 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> 
>> On Mon, Aug 30, 2021 at 02:41:05PM -0700, Song Liu wrote:
>> 
>>> @@ -564,6 +565,18 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
>>> u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
>>> 	__acquires(RCU)
>>> {
>> 	preempt_disable_notrace();
>> 
>>> +#ifdef CONFIG_PERF_EVENTS
>>> +	/* Calling migrate_disable costs two entries in the LBR. To save
>>> +	 * some entries, we call perf_snapshot_branch_stack before
>>> +	 * migrate_disable to save some entries. This is OK because we
>>> +	 * care about the branch trace before entering the BPF program.
>>> +	 * If migrate happens exactly here, there isn't much we can do to
>>> +	 * preserve the data.
>>> +	 */
>>> +	if (prog->call_get_branch)
>>> +		static_call(perf_snapshot_branch_stack)(
>>> +			this_cpu_ptr(&bpf_perf_branch_snapshot));
>> 
>> Here the comment is accurate, but if you recall the calling context
>> requirements of perf_snapshot_branch_stack from the last patch, you'll
>> see it requires you have at the very least preemption disabled, which
>> you just violated.
> 
>> 
>> I think you'll find that (on x86 at least) the suggested
>> preempt_disable_notrace() incurs no additional branches.
>> 
>> Still, there is the next point to consider...
>> 
>>> +#endif
>>> 	rcu_read_lock();
>>> 	migrate_disable();
>> 
>> 	preempt_enable_notrace();
> 
> Do we want preempt_enable_notrace() after migrate_disable()? It feels a 
> little weird to me.
> 
>> 
>>> 	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
>> 
>>> @@ -1863,9 +1892,23 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
>>> 	preempt_enable();
>>> }
>>> 
>>> +DEFINE_PER_CPU(struct perf_branch_snapshot, bpf_perf_branch_snapshot);
>>> +
>>> static __always_inline
>>> void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
>>> {
>>> +#ifdef CONFIG_PERF_EVENTS
>>> +	/* Calling migrate_disable costs two entries in the LBR. To save
>>> +	 * some entries, we call perf_snapshot_branch_stack before
>>> +	 * migrate_disable to save some entries. This is OK because we
>>> +	 * care about the branch trace before entering the BPF program.
>>> +	 * If migrate happens exactly here, there isn't much we can do to
>>> +	 * preserve the data.
>>> +	 */
>>> +	if (prog->call_get_branch)
>>> +		static_call(perf_snapshot_branch_stack)(
>>> +			this_cpu_ptr(&bpf_perf_branch_snapshot));
>>> +#endif
>>> 	cant_sleep();
>> 
>> In the face of ^^^^^^ the comment makes no sense. Still, what are the
>> nesting rules for __bpf_trace_run() and __bpf_prog_enter() ? I'm
>> thinking the trace one can nest inside an occurrence of prog, at which
>> point you have pieces.
> 
> I think broken LBR records are something we cannot really avoid in the case
> of nesting. OTOH, these should be rare cases and will not hurt the results
> in most use cases.
> 
> I should probably tighten the rules in the verifier to only apply it to
> __bpf_prog_enter (where we have the primary use case). We can enable it
> for other program types when there are other use cases.

Update about some offline discussion with Alexei and Andrii. We are planning
to move static_call(perf_snapshot_branch_stack) to inside the helper 
bpf_get_branch_snapshot. This change has a few benefits:

1. No need for an extra check (prog->call_get_branch) before every program (even
   when the program doesn't use the helper).

2. No need to duplicate code across the different BPF program hooks.
3. BPF programs always run with migrate_disable(), so it is not necessary to
   add an extra preempt_disable_notrace (see the sketch below).
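
A rough sketch of that direction (my reading of the plan above; details in
the next revision may differ), with the snapshot taken inside the helper
itself in kernel/trace/bpf_trace.c (the flags/size validation discussed
earlier is left out for brevity):

#ifdef CONFIG_PERF_EVENTS
static DEFINE_PER_CPU(struct perf_branch_snapshot, bpf_perf_branch_snapshot);
#endif

BPF_CALL_2(bpf_get_branch_snapshot, void *, buf, u32, size)
{
#ifdef CONFIG_PERF_EVENTS
	struct perf_branch_snapshot *br = this_cpu_ptr(&bpf_perf_branch_snapshot);
	u32 to_copy;

	/* BPF programs already run with migration disabled, so take the
	 * snapshot right here instead of in every program prologue
	 */
	static_call(perf_snapshot_branch_stack)(br);

	if (!br->nr)
		return -EOPNOTSUPP;

	to_copy = min_t(u32, size, br->nr * sizeof(struct perf_branch_entry));
	memcpy(buf, br->entries, to_copy);
	return to_copy;
#else
	return -EOPNOTSUPP;
#endif
}

With the per-cpu buffer static to bpf_trace.c and the prog->call_get_branch
blocks dropped from __bpf_prog_enter() and __bpf_trace_run(), the kernel test
robot's undefined-reference report above should also go away.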

It does flush a few more LBR entries. But the result seems ok:

ID: 0 from intel_pmu_lbr_disable_all+58 to intel_pmu_lbr_disable_all+93
ID: 1 from intel_pmu_lbr_disable_all+54 to intel_pmu_lbr_disable_all+58
ID: 2 from intel_pmu_snapshot_branch_stack+88 to intel_pmu_lbr_disable_all+0
ID: 3 from bpf_get_branch_snapshot+28 to intel_pmu_snapshot_branch_stack+0
ID: 4 from <bpf_tramepoline> to bpf_get_branch_snapshot+0
ID: 5 from <bpf_tramepoline> to <bpf_tramepoline>
ID: 6 from __bpf_prog_enter+34 to <bpf_tramepoline>
ID: 7 from migrate_disable+60 to __bpf_prog_enter+9
ID: 8 from __bpf_prog_enter+4 to migrate_disable+0
ID: 9 from __bpf_prog_enter+4 to __bpf_prog_enter+0
ID: 10 from bpf_fexit_loop_test1+22 to __bpf_prog_enter+0
ID: 11 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13
ID: 12 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13
ID: 13 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13
ID: 14 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13
ID: 15 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13

We can save more by inlining intel_pmu_lbr_disable_all(). But it is probably
not necessary at the moment. 

Thanks,
Song
Alexei Starovoitov Aug. 31, 2021, 9:37 p.m. UTC | #6
On 8/31/21 2:24 PM, Song Liu wrote:
> 
> 
>> On Aug 31, 2021, at 9:41 AM, Song Liu <songliubraving@fb.com> wrote:
>>
>>
>>
>>> On Aug 31, 2021, at 8:32 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> On Mon, Aug 30, 2021 at 02:41:05PM -0700, Song Liu wrote:
>>>
>>>> @@ -564,6 +565,18 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
>>>> u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
>>>> 	__acquires(RCU)
>>>> {
>>> 	preempt_disable_notrace();
>>>
>>>> +#ifdef CONFIG_PERF_EVENTS
>>>> +	/* Calling migrate_disable costs two entries in the LBR. To save
>>>> +	 * some entries, we call perf_snapshot_branch_stack before
>>>> +	 * migrate_disable to save some entries. This is OK because we
>>>> +	 * care about the branch trace before entering the BPF program.
>>>> +	 * If migrate happens exactly here, there isn't much we can do to
>>>> +	 * preserve the data.
>>>> +	 */
>>>> +	if (prog->call_get_branch)
>>>> +		static_call(perf_snapshot_branch_stack)(
>>>> +			this_cpu_ptr(&bpf_perf_branch_snapshot));
>>>
>>> Here the comment is accurate, but if you recall the calling context
>>> requirements of perf_snapshot_branch_stack from the last patch, you'll
>>> see it requires you have at the very least preemption disabled, which
>>> you just violated.
>>
>>>
>>> I think you'll find that (on x86 at least) the suggested
>>> preempt_disable_notrace() incurs no additional branches.
>>>
>>> Still, there is the next point to consider...
>>>
>>>> +#endif
>>>> 	rcu_read_lock();
>>>> 	migrate_disable();
>>>
>>> 	preempt_enable_notrace();
>>
>> Do we want preempt_enable_notrace() after migrate_disable()? It feels a
>> little weird to me.
>>
>>>
>>>> 	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
>>>
>>>> @@ -1863,9 +1892,23 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
>>>> 	preempt_enable();
>>>> }
>>>>
>>>> +DEFINE_PER_CPU(struct perf_branch_snapshot, bpf_perf_branch_snapshot);
>>>> +
>>>> static __always_inline
>>>> void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
>>>> {
>>>> +#ifdef CONFIG_PERF_EVENTS
>>>> +	/* Calling migrate_disable costs two entries in the LBR. To save
>>>> +	 * some entries, we call perf_snapshot_branch_stack before
>>>> +	 * migrate_disable to save some entries. This is OK because we
>>>> +	 * care about the branch trace before entering the BPF program.
>>>> +	 * If migrate happens exactly here, there isn't much we can do to
>>>> +	 * preserve the data.
>>>> +	 */
>>>> +	if (prog->call_get_branch)
>>>> +		static_call(perf_snapshot_branch_stack)(
>>>> +			this_cpu_ptr(&bpf_perf_branch_snapshot));
>>>> +#endif
>>>> 	cant_sleep();
>>>
>>> In the face of ^^^^^^ the comment makes no sense. Still, what are the
>>> nesting rules for __bpf_trace_run() and __bpf_prog_enter() ? I'm
>>> thinking the trace one can nest inside an occurrence of prog, at which
>>> point you have pieces.
>>
>> I think broken LBR records are something we cannot really avoid in the case
>> of nesting. OTOH, these should be rare cases and will not hurt the results
>> in most use cases.
>>
>> I should probably tighten the rules in the verifier to only apply it to
>> __bpf_prog_enter (where we have the primary use case). We can enable it
>> for other program types when there are other use cases.
> 
> Update about some offline discussion with Alexei and Andrii. We are planning
> to move static_call(perf_snapshot_branch_stack) to inside the helper
> bpf_get_branch_snapshot. This change has a few benefits:
> 
> 1. No need for an extra check (prog->call_get_branch) before every program (even
>     when the program doesn't use the helper).
> 
> 2. No need to duplicate code across the different BPF program hooks.
> 3. BPF programs always run with migrate_disable(), so it is not necessary to
>     add an extra preempt_disable_notrace.
> 
> It does flush a few more LBR entries. But the result seems ok:
> 
> ID: 0 from intel_pmu_lbr_disable_all+58 to intel_pmu_lbr_disable_all+93
> ID: 1 from intel_pmu_lbr_disable_all+54 to intel_pmu_lbr_disable_all+58
> ID: 2 from intel_pmu_snapshot_branch_stack+88 to intel_pmu_lbr_disable_all+0
> ID: 3 from bpf_get_branch_snapshot+28 to intel_pmu_snapshot_branch_stack+0
> ID: 4 from <bpf_tramepoline> to bpf_get_branch_snapshot+0
> ID: 5 from <bpf_tramepoline> to <bpf_tramepoline>
> ID: 6 from __bpf_prog_enter+34 to <bpf_tramepoline>
> ID: 7 from migrate_disable+60 to __bpf_prog_enter+9
> ID: 8 from __bpf_prog_enter+4 to migrate_disable+0

If we make migrate_disable 'static inline' it will save these 2 entries.
It's probably worth doing regardless, since it will be an immediate
performance benefit for all bpf programs.

> ID: 9 from __bpf_prog_enter+4 to __bpf_prog_enter+0
> ID: 10 from bpf_fexit_loop_test1+22 to __bpf_prog_enter+0
> ID: 11 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13
> ID: 12 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13
> ID: 13 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13
> ID: 14 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13
> ID: 15 from bpf_fexit_loop_test1+20 to bpf_fexit_loop_test1+13
> 
> We can save more by inlining intel_pmu_lbr_disable_all(). But it is probably
> not necessary at the moment.
> 
> Thanks,
> Song
> 
> 
>

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f4c16f19f83e3..1868434dc519a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2220,4 +2220,6 @@  int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 			u32 **bin_buf, u32 num_args);
 void bpf_bprintf_cleanup(void);
 
+DECLARE_PER_CPU(struct perf_branch_snapshot, bpf_perf_branch_snapshot);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7d248941ecea3..75aa541c8a134 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -575,7 +575,8 @@  struct bpf_prog {
 				has_callchain_buf:1, /* callchain buffer allocated? */
 				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
 				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
-				call_get_func_ip:1; /* Do we call get_func_ip() */
+				call_get_func_ip:1, /* Do we call get_func_ip() */
+				call_get_branch:1; /* Do we call get_branch_snapshot() */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 791f31dd0abee..1fd4e85d123da 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4877,6 +4877,21 @@  union bpf_attr {
  *		Get the struct pt_regs associated with **task**.
  *	Return
  *		A pointer to struct pt_regs.
+ *
+ * long bpf_get_branch_snapshot(void *entries, u32 size)
+ *	Description
+ *		Get branch trace from hardware engines like Intel LBR. The
+ *		branch trace is taken soon after the trigger point of the
+ *		BPF program, so it may contain some entries after the
+ *		trigger point. The user need to filter these entries
+ *		accordingly.
+ *
+ *		The data is stored as struct perf_branch_entry into output
+ *		buffer *entries*. *size* is the size of *entries* in bytes.
+ *
+ *	Return
+ *		> 0, number of valid output entries.
+ *		**-EOPNOTSUPP**, the hardware/kernel does not support this function
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5055,6 +5070,7 @@  union bpf_attr {
 	FN(get_func_ip),		\
 	FN(get_attach_cookie),		\
 	FN(task_pt_regs),		\
+	FN(get_branch_snapshot),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index fe1e857324e66..137293fcceed7 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -10,6 +10,7 @@ 
 #include <linux/rcupdate_trace.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/module.h>
+#include <linux/static_call.h>
 
 /* dummy _ops. The verifier will operate on target program's ops. */
 const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -564,6 +565,18 @@  static void notrace inc_misses_counter(struct bpf_prog *prog)
 u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
 	__acquires(RCU)
 {
+#ifdef CONFIG_PERF_EVENTS
+	/* Calling migrate_disable costs two entries in the LBR. To save
+	 * some entries, we call perf_snapshot_branch_stack before
+	 * migrate_disable to save some entries. This is OK because we
+	 * care about the branch trace before entering the BPF program.
+	 * If migrate happens exactly here, there isn't much we can do to
+	 * preserve the data.
+	 */
+	if (prog->call_get_branch)
+		static_call(perf_snapshot_branch_stack)(
+			this_cpu_ptr(&bpf_perf_branch_snapshot));
+#endif
 	rcu_read_lock();
 	migrate_disable();
 	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 206c221453cfa..72e8b49da0bf9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6446,6 +6446,18 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		env->prog->call_get_func_ip = true;
 	}
 
+	if (func_id == BPF_FUNC_get_branch_snapshot) {
+		if (env->prog->aux->sleepable) {
+			verbose(env, "sleepable progs cannot call get_branch_snapshot\n");
+			return -ENOTSUPP;
+		}
+		if (!IS_ENABLED(CONFIG_PERF_EVENTS)) {
+			verbose(env, "func %s#%d not supported without CONFIG_PERF_EVENTS\n",
+				func_id_name(func_id), func_id);
+			return -ENOTSUPP;
+		}
+		env->prog->call_get_branch = true;
+	}
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8e2eb950aa829..a01f26b7877e6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1017,6 +1017,33 @@  static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_2(bpf_get_branch_snapshot, void *, buf, u32, size)
+{
+#ifdef CONFIG_PERF_EVENTS
+	u32 max_size;
+
+	if (this_cpu_ptr(&bpf_perf_branch_snapshot)->nr == 0)
+		return -EOPNOTSUPP;
+
+	max_size = this_cpu_ptr(&bpf_perf_branch_snapshot)->nr *
+		sizeof(struct perf_branch_entry);
+	memcpy(buf, this_cpu_ptr(&bpf_perf_branch_snapshot)->entries,
+	       min_t(u32, size, max_size));
+
+	return this_cpu_ptr(&bpf_perf_branch_snapshot)->nr;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
+	.func		= bpf_get_branch_snapshot,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+};
+
 static const struct bpf_func_proto *
 bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1132,6 +1159,8 @@  bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_snprintf_proto;
 	case BPF_FUNC_get_func_ip:
 		return &bpf_get_func_ip_proto_tracing;
+	case BPF_FUNC_get_branch_snapshot:
+		return &bpf_get_branch_snapshot_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -1863,9 +1892,23 @@  void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 	preempt_enable();
 }
 
+DEFINE_PER_CPU(struct perf_branch_snapshot, bpf_perf_branch_snapshot);
+
 static __always_inline
 void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 {
+#ifdef CONFIG_PERF_EVENTS
+	/* Calling migrate_disable costs two entries in the LBR. To save
+	 * some entries, we call perf_snapshot_branch_stack before
+	 * migrate_disable to save some entries. This is OK because we
+	 * care about the branch trace before entering the BPF program.
+	 * If migrate happens exactly here, there isn't much we can do to
+	 * preserve the data.
+	 */
+	if (prog->call_get_branch)
+		static_call(perf_snapshot_branch_stack)(
+			this_cpu_ptr(&bpf_perf_branch_snapshot));
+#endif
 	cant_sleep();
 	rcu_read_lock();
 	(void) bpf_prog_run(prog, args);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 791f31dd0abee..1fd4e85d123da 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4877,6 +4877,21 @@  union bpf_attr {
  *		Get the struct pt_regs associated with **task**.
  *	Return
  *		A pointer to struct pt_regs.
+ *
+ * long bpf_get_branch_snapshot(void *entries, u32 size)
+ *	Description
+ *		Get branch trace from hardware engines like Intel LBR. The
+ *		branch trace is taken soon after the trigger point of the
+ *		BPF program, so it may contain some entries after the
+ *		trigger point. The user need to filter these entries
+ *		accordingly.
+ *
+ *		The data is stored as struct perf_branch_entry into output
+ *		buffer *entries*. *size* is the size of *entries* in bytes.
+ *
+ *	Return
+ *		> 0, number of valid output entries.
+ *		**-EOPNOTSUPP**, the hardware/kernel does not support this function
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5055,6 +5070,7 @@  union bpf_attr {
 	FN(get_func_ip),		\
 	FN(get_attach_cookie),		\
 	FN(task_pt_regs),		\
+	FN(get_branch_snapshot),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper