diff mbox series

[v6,bpf-next,1/3] perf: enable branch record for software events

Message ID 20210907202802.3675104-2-songliubraving@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: introduce bpf_get_branch_snapshot | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 15 maintainers not CCed: x86@kernel.org kpsingh@kernel.org jolsa@redhat.com yhs@fb.com daniel@iogearbox.net namhyung@kernel.org mark.rutland@arm.com tglx@linutronix.de bp@alien8.de hpa@zytor.com linux-perf-users@vger.kernel.org kafai@fb.com netdev@vger.kernel.org alexander.shishkin@linux.intel.com ast@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 2386 this patch: 2386
netdev/kdoc success Errors and warnings before: 181 this patch: 181
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 2413 this patch: 2413
netdev/header_inline success Link
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next success VM_Test

Commit Message

Song Liu Sept. 7, 2021, 8:28 p.m. UTC
The typical way to access branch record (e.g. Intel LBR) is via hardware
perf_event. For CPUs with FREEZE_LBRS_ON_PMI support, PMI could capture
reliable LBR. On the other hand, LBR could also be useful in non-PMI
scenario. For example, in kretprobe or bpf fexit program, LBR could
provide a lot of information on what happened with the function. Add API
to use branch record for software use.

Note that, when the software event triggers, it is necessary to stop the
branch record hardware asap. Therefore, static_call is used to remove some
branch instructions in this process.

Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 arch/x86/events/intel/core.c | 29 ++++++++++++++++++++++++++---
 arch/x86/events/intel/ds.c   |  8 --------
 arch/x86/events/perf_event.h | 10 ++++++++--
 include/linux/perf_event.h   | 23 +++++++++++++++++++++++
 kernel/events/core.c         |  2 ++
 5 files changed, 59 insertions(+), 13 deletions(-)

Comments

Peter Zijlstra Sept. 10, 2021, 10:40 a.m. UTC | #1
On Tue, Sep 07, 2021 at 01:28:00PM -0700, Song Liu wrote:

> +static int
> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	intel_pmu_disable_all();
> +	intel_pmu_lbr_read();
> +	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
> +
> +	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
> +	intel_pmu_enable_all(0);
> +	local_irq_restore(flags);
> +	return cnt;
> +}

So elsewhere you state:

+       /* Given we stop LBR in software, we will waste a few entries.
+        * But we should try to waste as few as possible entries. We are at
+        * about 11 on x86_64 systems.
+        * Add a check for < 15 so that we get heads-up when something
+        * changes and wastes too many entries.
+        */
+       ASSERT_LT(skel->bss->wasted_entries, 15, "check_wasted_entries");

Which is atrocious.. so I disassembled the new function to get horrible
crap. The below seems to cure that.

---
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2143,19 +2143,19 @@ static __initconst const u64 knl_hw_cach
  * However, there are some cases which may change PEBS status, e.g. PMI
  * throttle. The PEBS_ENABLE should be updated where the status changes.
  */
-static __always_inline void __intel_pmu_disable_all(void)
+static __always_inline void __intel_pmu_disable_all(bool bts)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
-	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+	if (bts && test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
 		intel_pmu_disable_bts();
 }
 
 static __always_inline void intel_pmu_disable_all(void)
 {
-	__intel_pmu_disable_all();
+	__intel_pmu_disable_all(true);
 	intel_pmu_pebs_disable_all();
 	intel_pmu_lbr_disable_all();
 }
@@ -2186,14 +2186,12 @@ static void intel_pmu_enable_all(int add
 	__intel_pmu_enable_all(added, false);
 }
 
-static int
-intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+static noinline int
+__intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries,
+				  unsigned int cnt, unsigned long flags)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	unsigned long flags;
 
-	local_irq_save(flags);
-	intel_pmu_disable_all();
 	intel_pmu_lbr_read();
 	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
 
@@ -2203,6 +2201,36 @@ intel_pmu_snapshot_branch_stack(struct p
 	return cnt;
 }
 
+static int
+intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+	unsigned long flags;
+
+	/* must not have branches... */
+	local_irq_save(flags);
+	__intel_pmu_disable_all(false); /* we don't care about BTS */
+	__intel_pmu_pebs_disable_all();
+	__intel_pmu_lbr_disable();
+	/*            ... until here */
+
+	return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
+}
+
+static int
+intel_pmu_snapshot_arch_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+	unsigned long flags;
+
+	/* must not have branches... */
+	local_irq_save(flags);
+	__intel_pmu_disable_all(false); /* we don't care about BTS */
+	__intel_pmu_pebs_disable_all();
+	__intel_pmu_arch_lbr_disable();
+	/*            ... until here */
+
+	return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
+}
+
 /*
  * Workaround for:
  *   Intel Errata AAK100 (model 26)
@@ -2946,7 +2974,7 @@ static int intel_pmu_handle_irq(struct p
 		apic_write(APIC_LVTPC, APIC_DM_NMI);
 	intel_bts_disable_local();
 	cpuc->enabled = 0;
-	__intel_pmu_disable_all();
+	__intel_pmu_disable_all(true);
 	handled = intel_pmu_drain_bts_buffer();
 	handled += intel_bts_interrupt();
 	status = intel_pmu_get_status();
@@ -6304,9 +6332,15 @@ __init int intel_pmu_init(void)
 		pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
 
 		/* only support branch_stack snapshot for perfmon >= v2 */
-		if (x86_pmu.disable_all == intel_pmu_disable_all)
-			static_call_update(perf_snapshot_branch_stack,
-					   intel_pmu_snapshot_branch_stack);
+		if (x86_pmu.disable_all == intel_pmu_disable_all) {
+			if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) {
+				static_call_update(perf_snapshot_branch_stack,
+						   intel_pmu_snapshot_arch_branch_stack);
+			} else {
+				static_call_update(perf_snapshot_branch_stack,
+						   intel_pmu_snapshot_branch_stack);
+			}
+		}
 	}
 
 	intel_pmu_check_extra_regs(x86_pmu.extra_regs);
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1296,6 +1296,14 @@ void intel_pmu_pebs_enable_all(void)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 }
 
+void intel_pmu_pebs_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (cpuc->pebs_enabled)
+		__intel_pmu_pebs_disable_all();
+}
+
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1240,12 +1240,23 @@ static inline bool intel_pmu_has_bts(str
 	return intel_pmu_has_bts_period(event, hwc->sample_period);
 }
 
-static __always_inline void intel_pmu_pebs_disable_all(void)
+static __always_inline void __intel_pmu_pebs_disable_all(void)
 {
-	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
 
-	if (cpuc->pebs_enabled)
-		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+static __always_inline void __intel_pmu_arch_lbr_disable(void)
+{
+	wrmsrl(MSR_ARCH_LBR_CTL, 0);
+}
+
+static __always_inline void __intel_pmu_lbr_disable(void)
+{
+	u64 debugctl;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
 int intel_pmu_save_and_restart(struct perf_event *event);
@@ -1322,6 +1333,8 @@ void intel_pmu_pebs_disable(struct perf_
 
 void intel_pmu_pebs_enable_all(void);
 
+void intel_pmu_pebs_disable_all(void);
+
 void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
 
 void intel_pmu_auto_reload_read(struct perf_event *event);
Peter Zijlstra Sept. 10, 2021, 1:54 p.m. UTC | #2
On Fri, Sep 10, 2021 at 12:40:51PM +0200, Peter Zijlstra wrote:

> The below seems to cure that.

Seems I lost a hunk, fold below.

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 9e6d6eaeb4cb..6b72e9b55c69 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -228,20 +228,6 @@ static void __intel_pmu_lbr_enable(bool pmi)
 		wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN);
 }
 
-static void __intel_pmu_lbr_disable(void)
-{
-	u64 debugctl;
-
-	if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
-		wrmsrl(MSR_ARCH_LBR_CTL, 0);
-		return;
-	}
-
-	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
-	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
 void intel_pmu_lbr_reset_32(void)
 {
 	int i;
@@ -779,8 +765,12 @@ void intel_pmu_lbr_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-	if (cpuc->lbr_users && !vlbr_exclude_host())
+	if (cpuc->lbr_users && !vlbr_exclude_host()) {
+		if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+			return __intel_pmu_arch_lbr_disable();
+
 		__intel_pmu_lbr_disable();
+	}
 }
 
 void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
Song Liu Sept. 10, 2021, 6:27 p.m. UTC | #3
> On Sep 10, 2021, at 6:54 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Fri, Sep 10, 2021 at 12:40:51PM +0200, Peter Zijlstra wrote:
> 
>> The below seems to cure that.
> 
> Seems I lost a hunk, fold below.
> 
> diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
> index 9e6d6eaeb4cb..6b72e9b55c69 100644
> --- a/arch/x86/events/intel/lbr.c
> +++ b/arch/x86/events/intel/lbr.c
> @@ -228,20 +228,6 @@ static void __intel_pmu_lbr_enable(bool pmi)
> 		wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN);
> }
> 
> -static void __intel_pmu_lbr_disable(void)
> -{
> -	u64 debugctl;
> -
> -	if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
> -		wrmsrl(MSR_ARCH_LBR_CTL, 0);
> -		return;
> -	}
> -
> -	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> -	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
> -	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> -}
> -
> void intel_pmu_lbr_reset_32(void)
> {
> 	int i;
> @@ -779,8 +765,12 @@ void intel_pmu_lbr_disable_all(void)
> {
> 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> 
> -	if (cpuc->lbr_users && !vlbr_exclude_host())
> +	if (cpuc->lbr_users && !vlbr_exclude_host()) {
> +		if (static_cpu_has(X86_FEATURE_ARCH_LBR))
> +			return __intel_pmu_arch_lbr_disable();
> +
> 		__intel_pmu_lbr_disable();
> +	}
> }
> 
> void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)

This works great and saves 3 entries! We have the following now:

ID: 0 from bpf_get_branch_snapshot+18 to intel_pmu_snapshot_branch_stack+0
ID: 1 from __brk_limit+477143934 to bpf_get_branch_snapshot+0
ID: 2 from __brk_limit+477192263 to __brk_limit+477143880  # trampoline 
ID: 3 from __bpf_prog_enter+34 to __brk_limit+477192251
ID: 4 from migrate_disable+60 to __bpf_prog_enter+9
ID: 5 from __bpf_prog_enter+4 to migrate_disable+0
ID: 6 from bpf_testmod_loop_test+20 to __bpf_prog_enter+0
ID: 7 from bpf_testmod_loop_test+20 to bpf_testmod_loop_test+13
ID: 8 from bpf_testmod_loop_test+20 to bpf_testmod_loop_test+13

I will fold this in and send v7. 

Thanks,
Song
Peter Zijlstra Sept. 10, 2021, 6:40 p.m. UTC | #4
On Fri, Sep 10, 2021 at 06:27:36PM +0000, Song Liu wrote:

> This works great and saves 3 entries! We have the following now:

Yay!

> ID: 0 from bpf_get_branch_snapshot+18 to intel_pmu_snapshot_branch_stack+0

is unavoidable, we need to end up in intel_pmu_snapshot_branch_stack()
eventually.

> ID: 1 from __brk_limit+477143934 to bpf_get_branch_snapshot+0

could be elided by having the JIT emit the call to
intel_pmu_snapshot_branch_stack directly, instead of laundering it
through that helper I suppose.

> ID: 2 from __brk_limit+477192263 to __brk_limit+477143880  # trampoline 
> ID: 3 from __bpf_prog_enter+34 to __brk_limit+477192251

-ENOCLUE

> ID: 4 from migrate_disable+60 to __bpf_prog_enter+9
> ID: 5 from __bpf_prog_enter+4 to migrate_disable+0

I suppose we can reduce that to a single branch if we inline
migrate_disable() here, that thing unfortunately needs one branch
itself.

> ID: 6 from bpf_testmod_loop_test+20 to __bpf_prog_enter+0

And this is the first branch out of the test program, giving 7 entries
now, of which we can remove at least 2 more with a bit of elbow greace,
right?

> ID: 7 from bpf_testmod_loop_test+20 to bpf_testmod_loop_test+13
> ID: 8 from bpf_testmod_loop_test+20 to bpf_testmod_loop_test+13
> 
> I will fold this in and send v7. 

Excellent.
Peter Zijlstra Sept. 10, 2021, 6:50 p.m. UTC | #5
On Fri, Sep 10, 2021 at 08:40:27PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 10, 2021 at 06:27:36PM +0000, Song Liu wrote:
> 
> > This works great and saves 3 entries! We have the following now:
> 
> Yay!
> 
> > ID: 0 from bpf_get_branch_snapshot+18 to intel_pmu_snapshot_branch_stack+0
> 
> is unavoidable, we need to end up in intel_pmu_snapshot_branch_stack()
> eventually.
> 
> > ID: 1 from __brk_limit+477143934 to bpf_get_branch_snapshot+0
> 
> could be elided by having the JIT emit the call to
> intel_pmu_snapshot_branch_stack directly, instead of laundering it
> through that helper I suppose.
> 
> > ID: 2 from __brk_limit+477192263 to __brk_limit+477143880  # trampoline 
> > ID: 3 from __bpf_prog_enter+34 to __brk_limit+477192251
> 
> -ENOCLUE
> 
> > ID: 4 from migrate_disable+60 to __bpf_prog_enter+9
> > ID: 5 from __bpf_prog_enter+4 to migrate_disable+0
> 
> I suppose we can reduce that to a single branch if we inline
> migrate_disable() here, that thing unfortunately needs one branch
> itself.

Oooh, since we put local_irq_save/restore() in
intel_pmu_snapshot_branch_stack(), we no longer need to be after
migrate_disable(). You could go back to placing it earlier!
Song Liu Sept. 10, 2021, 6:51 p.m. UTC | #6
> On Sep 10, 2021, at 11:40 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Fri, Sep 10, 2021 at 06:27:36PM +0000, Song Liu wrote:
> 
>> This works great and saves 3 entries! We have the following now:
> 
> Yay!
> 
>> ID: 0 from bpf_get_branch_snapshot+18 to intel_pmu_snapshot_branch_stack+0
> 
> is unavoidable, we need to end up in intel_pmu_snapshot_branch_stack()
> eventually.
> 
>> ID: 1 from __brk_limit+477143934 to bpf_get_branch_snapshot+0
> 
> could be elided by having the JIT emit the call to
> intel_pmu_snapshot_branch_stack directly, instead of laundering it
> through that helper I suppose.

Yep, some JIT magic could save one entry here. 

> 
>> ID: 2 from __brk_limit+477192263 to __brk_limit+477143880  # trampoline 
>> ID: 3 from __bpf_prog_enter+34 to __brk_limit+477192251
> 
> -ENOCLUE
> 
>> ID: 4 from migrate_disable+60 to __bpf_prog_enter+9
>> ID: 5 from __bpf_prog_enter+4 to migrate_disable+0
> 
> I suppose we can reduce that to a single branch if we inline
> migrate_disable() here, that thing unfortunately needs one branch
> itself.

To inline migrate_disable, we may need expose this_rq() in include/, or 
use some other alternatives. I am planning to optimize that after this
set gets in.

Thanks,
Song

> 
>> ID: 6 from bpf_testmod_loop_test+20 to __bpf_prog_enter+0
> 
> And this is the first branch out of the test program, giving 7 entries
> now, of which we can remove at least 2 more with a bit of elbow greace,
> right?
> 
>> ID: 7 from bpf_testmod_loop_test+20 to bpf_testmod_loop_test+13
>> ID: 8 from bpf_testmod_loop_test+20 to bpf_testmod_loop_test+13
>> 
>> I will fold this in and send v7. 
> 
> Excellent.
Song Liu Sept. 10, 2021, 7 p.m. UTC | #7
> On Sep 10, 2021, at 11:50 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Fri, Sep 10, 2021 at 08:40:27PM +0200, Peter Zijlstra wrote:
>> On Fri, Sep 10, 2021 at 06:27:36PM +0000, Song Liu wrote:
>> 
>>> This works great and saves 3 entries! We have the following now:
>> 
>> Yay!
>> 
>>> ID: 0 from bpf_get_branch_snapshot+18 to intel_pmu_snapshot_branch_stack+0
>> 
>> is unavoidable, we need to end up in intel_pmu_snapshot_branch_stack()
>> eventually.
>> 
>>> ID: 1 from __brk_limit+477143934 to bpf_get_branch_snapshot+0
>> 
>> could be elided by having the JIT emit the call to
>> intel_pmu_snapshot_branch_stack directly, instead of laundering it
>> through that helper I suppose.
>> 
>>> ID: 2 from __brk_limit+477192263 to __brk_limit+477143880  # trampoline 
>>> ID: 3 from __bpf_prog_enter+34 to __brk_limit+477192251
>> 
>> -ENOCLUE
>> 
>>> ID: 4 from migrate_disable+60 to __bpf_prog_enter+9
>>> ID: 5 from __bpf_prog_enter+4 to migrate_disable+0
>> 
>> I suppose we can reduce that to a single branch if we inline
>> migrate_disable() here, that thing unfortunately needs one branch
>> itself.
> 
> Oooh, since we put local_irq_save/restore() in
> intel_pmu_snapshot_branch_stack(), we no longer need to be after
> migrate_disable(). You could go back to placing it earlier!

Hmm.. not really. We call migrate_disable() before entering the BPF program. 
And the helper calls snapshot_branch_stack() inside the BPF program. To move
it to before migrate_disable(), we will have to add a "whether to snapshot
branch stack" check before entering the BPF program. This check, while is
cheap, is added to all BPF programs on this hook, even when the program does 
not use snapshot at all. So we would rather keep all logic inside the helper, 
and not touch the common path. 

Thanks,
Song
Peter Zijlstra Sept. 10, 2021, 7:08 p.m. UTC | #8
On Fri, Sep 10, 2021 at 07:00:08PM +0000, Song Liu wrote:

> Hmm.. not really. We call migrate_disable() before entering the BPF program. 
> And the helper calls snapshot_branch_stack() inside the BPF program. To move
> it to before migrate_disable(), we will have to add a "whether to snapshot
> branch stack" check before entering the BPF program. This check, while is
> cheap, is added to all BPF programs on this hook, even when the program does 
> not use snapshot at all. So we would rather keep all logic inside the helper, 
> and not touch the common path. 

Moo :/ Because I also really don't want to expose struct rq, it's
currently nicely squirelled away in kernel/sched/ and doesn't get
anywhere near include/.

A well, maybe we can do something clever with migrate_disable() itself.
I'll put it on this endless todo list ;-)
Song Liu Sept. 10, 2021, 7:11 p.m. UTC | #9
> On Sep 10, 2021, at 12:08 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Fri, Sep 10, 2021 at 07:00:08PM +0000, Song Liu wrote:
> 
>> Hmm.. not really. We call migrate_disable() before entering the BPF program. 
>> And the helper calls snapshot_branch_stack() inside the BPF program. To move
>> it to before migrate_disable(), we will have to add a "whether to snapshot
>> branch stack" check before entering the BPF program. This check, while is
>> cheap, is added to all BPF programs on this hook, even when the program does 
>> not use snapshot at all. So we would rather keep all logic inside the helper, 
>> and not touch the common path. 
> 
> Moo :/ Because I also really don't want to expose struct rq, it's
> currently nicely squirelled away in kernel/sched/ and doesn't get
> anywhere near include/.

This matches my guess, so I didn't go too far on that direction. :-)

> 
> A well, maybe we can do something clever with migrate_disable() itself.
> I'll put it on this endless todo list ;-)

Awesome!

Thanks,
Song
diff mbox series

Patch

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7011e87be6d03..2e318fb8d649b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2143,7 +2143,7 @@  static __initconst const u64 knl_hw_cache_extra_regs
  * However, there are some cases which may change PEBS status, e.g. PMI
  * throttle. The PEBS_ENABLE should be updated where the status changes.
  */
-static void __intel_pmu_disable_all(void)
+static __always_inline void __intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
@@ -2153,7 +2153,7 @@  static void __intel_pmu_disable_all(void)
 		intel_pmu_disable_bts();
 }
 
-static void intel_pmu_disable_all(void)
+static __always_inline void intel_pmu_disable_all(void)
 {
 	__intel_pmu_disable_all();
 	intel_pmu_pebs_disable_all();
@@ -2186,6 +2186,23 @@  static void intel_pmu_enable_all(int added)
 	__intel_pmu_enable_all(added, false);
 }
 
+static int
+intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	intel_pmu_disable_all();
+	intel_pmu_lbr_read();
+	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
+
+	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+	intel_pmu_enable_all(0);
+	local_irq_restore(flags);
+	return cnt;
+}
+
 /*
  * Workaround for:
  *   Intel Errata AAK100 (model 26)
@@ -6283,9 +6300,15 @@  __init int intel_pmu_init(void)
 			x86_pmu.lbr_nr = 0;
 	}
 
-	if (x86_pmu.lbr_nr)
+	if (x86_pmu.lbr_nr) {
 		pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
 
+		/* only support branch_stack snapshot for perfmon >= v2 */
+		if (x86_pmu.disable_all == intel_pmu_disable_all)
+			static_call_update(perf_snapshot_branch_stack,
+					   intel_pmu_snapshot_branch_stack);
+	}
+
 	intel_pmu_check_extra_regs(x86_pmu.extra_regs);
 
 	/* Support full width counters using alternative MSR range */
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8647713276a73..8a832986578a9 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1296,14 +1296,6 @@  void intel_pmu_pebs_enable_all(void)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 }
 
-void intel_pmu_pebs_disable_all(void)
-{
-	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-	if (cpuc->pebs_enabled)
-		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
-}
-
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e3ac05c97b5e5..171abbb359fe5 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1240,6 +1240,14 @@  static inline bool intel_pmu_has_bts(struct perf_event *event)
 	return intel_pmu_has_bts_period(event, hwc->sample_period);
 }
 
+static __always_inline void intel_pmu_pebs_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (cpuc->pebs_enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
 int intel_pmu_save_and_restart(struct perf_event *event);
 
 struct event_constraint *
@@ -1314,8 +1322,6 @@  void intel_pmu_pebs_disable(struct perf_event *event);
 
 void intel_pmu_pebs_enable_all(void);
 
-void intel_pmu_pebs_disable_all(void);
-
 void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
 
 void intel_pmu_auto_reload_read(struct perf_event *event);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fe156a8170aa3..0cbc5dfe11102 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -57,6 +57,7 @@  struct perf_guest_info_callbacks {
 #include <linux/cgroup.h>
 #include <linux/refcount.h>
 #include <linux/security.h>
+#include <linux/static_call.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -1612,4 +1613,26 @@  extern void __weak arch_perf_update_userpage(struct perf_event *event,
 extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr);
 #endif
 
+/*
+ * Snapshot branch stack on software events.
+ *
+ * Branch stack can be very useful in understanding software events. For
+ * example, when a long function, e.g. sys_perf_event_open, returns an
+ * errno, it is not obvious why the function failed. Branch stack could
+ * provide very helpful information in this type of scenarios.
+ *
+ * On software event, it is necessary to stop the hardware branch recorder
+ * fast. Otherwise, the hardware register/buffer will be flushed with
+ * entries of the triggering event. Therefore, static call is used to
+ * stop the hardware recorder.
+ */
+
+/*
+ * cnt is the number of entries allocated for entries.
+ * Return number of entries copied to .
+ */
+typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries,
+					   unsigned int cnt);
+DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
+
 #endif /* _LINUX_PERF_EVENT_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 744e8726c5b2f..349f80aa9e7d8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13435,3 +13435,5 @@  struct cgroup_subsys perf_event_cgrp_subsys = {
 	.threaded	= true,
 };
 #endif /* CONFIG_CGROUP_PERF */
+
+DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);