diff mbox series

[v5,bpf-next,1/3] perf: enable branch record for software events

Message ID 20210902165706.2812867-2-songliubraving@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: introduce bpf_get_branch_snapshot | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next fail VM_Test
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 17 maintainers not CCed: linux-perf-users@vger.kernel.org hpa@zytor.com alexander.shishkin@linux.intel.com jolsa@redhat.com andrii@kernel.org daniel@iogearbox.net tglx@linutronix.de x86@kernel.org kpsingh@kernel.org netdev@vger.kernel.org yhs@fb.com ast@kernel.org bp@alien8.de mark.rutland@arm.com kafai@fb.com john.fastabend@gmail.com namhyung@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 2360 this patch: 2360
netdev/kdoc success Errors and warnings before: 181 this patch: 181
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 2386 this patch: 2386
netdev/header_inline success Link

Commit Message

Song Liu Sept. 2, 2021, 4:57 p.m. UTC
The typical way to access branch record (e.g. Intel LBR) is via hardware
perf_event. For CPUs with FREEZE_LBRS_ON_PMI support, PMI could capture
reliable LBR. On the other hand, LBR could also be useful in non-PMI
scenario. For example, in kretprobe or bpf fexit program, LBR could
provide a lot of information on what happened with the function. Add API
to use branch record for software use.

Note that, when the software event triggers, it is necessary to stop the
branch record hardware asap. Therefore, static_call is used to remove some
branch instructions in this process.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 arch/x86/events/intel/core.c | 26 +++++++++++++++++++++++---
 arch/x86/events/intel/ds.c   |  8 --------
 arch/x86/events/perf_event.h | 10 ++++++++--
 include/linux/perf_event.h   | 23 +++++++++++++++++++++++
 kernel/events/core.c         |  2 ++
 5 files changed, 56 insertions(+), 13 deletions(-)

Comments

John Fastabend Sept. 2, 2021, 8:49 p.m. UTC | #1
Song Liu wrote:
> The typical way to access branch record (e.g. Intel LBR) is via hardware
> perf_event. For CPUs with FREEZE_LBRS_ON_PMI support, PMI could capture
> reliable LBR. On the other hand, LBR could also be useful in non-PMI
> scenario. For example, in kretprobe or bpf fexit program, LBR could
> provide a lot of information on what happened with the function. Add API
> to use branch record for software use.
> 
> Note that, when the software event triggers, it is necessary to stop the
> branch record hardware asap. Therefore, static_call is used to remove some
> branch instructions in this process.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---

[...]

>  void intel_pmu_auto_reload_read(struct perf_event *event);
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index fe156a8170aa3..4fe11f4f896b1 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -57,6 +57,7 @@ struct perf_guest_info_callbacks {
>  #include <linux/cgroup.h>
>  #include <linux/refcount.h>
>  #include <linux/security.h>
> +#include <linux/static_call.h>
>  #include <asm/local.h>
>  
>  struct perf_callchain_entry {
> @@ -1612,4 +1613,26 @@ extern void __weak arch_perf_update_userpage(struct perf_event *event,
>  extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr);
>  #endif
>  
> +/*
> + * Snapshot branch stack on software events.
> + *
> + * Branch stack can be very useful in understanding software events. For
> + * example, when a long function, e.g. sys_perf_event_open, returns an
> + * errno, it is not obvious why the function failed. Branch stack could
> + * provide very helpful information in this type of scenarios.
> + *
> + * On software event, it is necessary to stop the hardware branch recorder
> + * fast. Otherwise, the hardware register/buffer will be flushed with
> + * entries af the triggering event. Therefore, static call is used to
              ^^
nit, af->of

> + * stop the hardware recorder.
> + */
> +
> +/*
> + * cnt is the number of entries allocated for entries.
> + * Return number of entries copied to .
> + */

A bit out of scope, but LGTM.

Acked-by: John Fastabend <john.fastabend@gmail.com>
Peter Zijlstra Sept. 3, 2021, 8:02 a.m. UTC | #2
On Thu, Sep 02, 2021 at 09:57:04AM -0700, Song Liu wrote:
> +static int
> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +
> +	intel_pmu_disable_all();
> +	intel_pmu_lbr_read();
> +	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
> +
> +	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
> +	intel_pmu_enable_all(0);
> +	return cnt;
> +}

Would something like the below help get rid of that memcpy() ?

(compile tested only)

---
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7011e87be6d0..c508ba6099d2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2938,7 +2938,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 
 	loops = 0;
 again:
-	intel_pmu_lbr_read();
+	intel_pmu_lbr_read(&cpuc->lbr_stack);
 	intel_pmu_ack_status(status);
 	if (++loops > 100) {
 		static bool warned;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 9e6d6eaeb4cb..0a5bacba86c2 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -170,7 +170,7 @@ enum {
 
 #define ARCH_LBR_CTL_MASK			0x7f000e
 
-static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 
 static __always_inline bool is_lbr_call_stack_bit_set(u64 config)
 {
@@ -783,7 +783,7 @@ void intel_pmu_lbr_disable_all(void)
 		__intel_pmu_lbr_disable();
 }
 
-void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
 	u64 tos = intel_pmu_lbr_tos();
@@ -801,18 +801,18 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
 
-		cpuc->lbr_entries[i].from	= msr_lastbranch.from;
-		cpuc->lbr_entries[i].to		= msr_lastbranch.to;
-		cpuc->lbr_entries[i].mispred	= 0;
-		cpuc->lbr_entries[i].predicted	= 0;
-		cpuc->lbr_entries[i].in_tx	= 0;
-		cpuc->lbr_entries[i].abort	= 0;
-		cpuc->lbr_entries[i].cycles	= 0;
-		cpuc->lbr_entries[i].type	= 0;
-		cpuc->lbr_entries[i].reserved	= 0;
+		bs->entries[i].from	= msr_lastbranch.from;
+		bs->entries[i].to	= msr_lastbranch.to;
+		bs->entries[i].mispred	= 0;
+		bs->entries[i].predicted= 0;
+		bs->entries[i].in_tx	= 0;
+		bs->entries[i].abort	= 0;
+		bs->entries[i].cycles	= 0;
+		bs->entries[i].type	= 0;
+		bs->entries[i].reserved	= 0;
 	}
-	cpuc->lbr_stack.nr = i;
-	cpuc->lbr_stack.hw_idx = tos;
+	bs->nr = i;
+	bs->hw_idx = tos;
 }
 
 /*
@@ -820,7 +820,7 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
  * is the same as the linear address, allowing us to merge the LIP and EIP
  * LBR formats.
  */
-void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	bool need_info = false, call_stack = false;
 	unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -896,19 +896,19 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		if (abort && x86_pmu.lbr_double_abort && out > 0)
 			out--;
 
-		cpuc->lbr_entries[out].from	 = from;
-		cpuc->lbr_entries[out].to	 = to;
-		cpuc->lbr_entries[out].mispred	 = mis;
-		cpuc->lbr_entries[out].predicted = pred;
-		cpuc->lbr_entries[out].in_tx	 = in_tx;
-		cpuc->lbr_entries[out].abort	 = abort;
-		cpuc->lbr_entries[out].cycles	 = cycles;
-		cpuc->lbr_entries[out].type	 = 0;
-		cpuc->lbr_entries[out].reserved	 = 0;
+		bs->entries[out].from	 = from;
+		bs->entries[out].to	 = to;
+		bs->entries[out].mispred = mis;
+		bs->entries[out].predicted = pred;
+		bs->entries[out].in_tx	 = in_tx;
+		bs->entries[out].abort	 = abort;
+		bs->entries[out].cycles	 = cycles;
+		bs->entries[out].type	 = 0;
+		bs->entries[out].reserved = 0;
 		out++;
 	}
-	cpuc->lbr_stack.nr = out;
-	cpuc->lbr_stack.hw_idx = tos;
+	bs->nr = out;
+	bs->hw_idx = tos;
 }
 
 static __always_inline int get_lbr_br_type(u64 info)
@@ -945,7 +945,8 @@ static __always_inline u16 get_lbr_cycles(u64 info)
 }
 
 static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
-				struct lbr_entry *entries)
+				struct lbr_entry *entries,
+				struct perf_branch_stack *bs)
 {
 	struct perf_branch_entry *e;
 	struct lbr_entry *lbr;
@@ -954,7 +955,7 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
 
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		lbr = entries ? &entries[i] : NULL;
-		e = &cpuc->lbr_entries[i];
+		e = &bs->entries[i];
 
 		from = rdlbr_from(i, lbr);
 		/*
@@ -977,28 +978,28 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
 		e->reserved	= 0;
 	}
 
-	cpuc->lbr_stack.nr = i;
+	bs->nr = i;
 }
 
-static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
+static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
-	intel_pmu_store_lbr(cpuc, NULL);
+	intel_pmu_store_lbr(cpuc, NULL, bs);
 }
 
-static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
+static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave;
 
 	if (!xsave) {
-		intel_pmu_store_lbr(cpuc, NULL);
+		intel_pmu_store_lbr(cpuc, NULL, bs);
 		return;
 	}
 	xsaves(&xsave->xsave, XFEATURE_MASK_LBR);
 
-	intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
+	intel_pmu_store_lbr(cpuc, xsave->lbr.entries, bs);
 }
 
-void intel_pmu_lbr_read(void)
+void intel_pmu_lbr_read(struct perf_branch_stack *bs)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
@@ -1012,9 +1013,8 @@ void intel_pmu_lbr_read(void)
 	    cpuc->lbr_users == cpuc->lbr_pebs_users)
 		return;
 
-	x86_pmu.lbr_read(cpuc);
-
-	intel_pmu_lbr_filter(cpuc);
+	x86_pmu.lbr_read(cpuc, bs);
+	intel_pmu_lbr_filter(cpuc, bs);
 }
 
 /*
@@ -1402,7 +1402,7 @@ static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = {
  * in PERF_SAMPLE_BRANCH_STACK sample may vary.
  */
 static void
-intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	u64 from, to;
 	int br_sel = cpuc->br_sel;
@@ -1414,11 +1414,11 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 	    ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
 		return;
 
-	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+	for (i = 0; i < bs->nr; i++) {
 
-		from = cpuc->lbr_entries[i].from;
-		to = cpuc->lbr_entries[i].to;
-		type = cpuc->lbr_entries[i].type;
+		from = bs->entries[i].from;
+		to = bs->entries[i].to;
+		type = bs->entries[i].type;
 
 		/*
 		 * Parse the branch type recorded in LBR_x_INFO MSR.
@@ -1430,9 +1430,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 			to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
 			type = arch_lbr_br_type_map[type] | to_plm;
 		} else
-			type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+			type = branch_type(from, to, bs->entries[i].abort);
 		if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
-			if (cpuc->lbr_entries[i].in_tx)
+			if (bs->entries[i].in_tx)
 				type |= X86_BR_IN_TX;
 			else
 				type |= X86_BR_NO_TX;
@@ -1440,25 +1440,25 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 
 		/* if type does not correspond, then discard */
 		if (type == X86_BR_NONE || (br_sel & type) != type) {
-			cpuc->lbr_entries[i].from = 0;
+			bs->entries[i].from = 0;
 			compress = true;
 		}
 
 		if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
-			cpuc->lbr_entries[i].type = common_branch_type(type);
+			bs->entries[i].type = common_branch_type(type);
 	}
 
 	if (!compress)
 		return;
 
 	/* remove all entries with from=0 */
-	for (i = 0; i < cpuc->lbr_stack.nr; ) {
-		if (!cpuc->lbr_entries[i].from) {
+	for (i = 0; i < bs->nr; ) {
+		if (!bs->entries[i].from) {
 			j = i;
-			while (++j < cpuc->lbr_stack.nr)
-				cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
-			cpuc->lbr_stack.nr--;
-			if (!cpuc->lbr_entries[i].from)
+			while (++j < bs->nr)
+				bs->entries[j-1] = bs->entries[j];
+			bs->nr--;
+			if (!bs->entries[i].from)
 				continue;
 		}
 		i++;
@@ -1476,8 +1476,8 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
 	else
 		cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();
 
-	intel_pmu_store_lbr(cpuc, lbr);
-	intel_pmu_lbr_filter(cpuc);
+	intel_pmu_store_lbr(cpuc, lbr, &cpuc->lbr_stack);
+	intel_pmu_lbr_filter(cpuc, &cpuc->lbr_stack);
 }
 
 /*
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e3ac05c97b5e..9700fb1f47c3 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -852,7 +852,7 @@ struct x86_pmu {
 	unsigned int	lbr_br_type:1;
 
 	void		(*lbr_reset)(void);
-	void		(*lbr_read)(struct cpu_hw_events *cpuc);
+	void		(*lbr_read)(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 	void		(*lbr_save)(void *ctx);
 	void		(*lbr_restore)(void *ctx);
 
@@ -1345,11 +1345,11 @@ void intel_pmu_lbr_enable_all(bool pmi);
 
 void intel_pmu_lbr_disable_all(void);
 
-void intel_pmu_lbr_read(void);
+void intel_pmu_lbr_read(struct perf_branch_stack *bs);
 
-void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc);
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 
-void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 
 void intel_pmu_lbr_save(void *ctx);
Peter Zijlstra Sept. 3, 2021, 8:27 a.m. UTC | #3
On Thu, Sep 02, 2021 at 09:57:04AM -0700, Song Liu wrote:

> +static int
> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +
> +	intel_pmu_disable_all();
> +	intel_pmu_lbr_read();
> +	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
> +
> +	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
> +	intel_pmu_enable_all(0);
> +	return cnt;
> +}

Given this disables the PMI from 'random' contexts, should we not add
IRQ disabling around this to avoid really bad behaviour?
Song Liu Sept. 3, 2021, 4:45 p.m. UTC | #4
Hi Peter,

> On Sep 3, 2021, at 1:27 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Thu, Sep 02, 2021 at 09:57:04AM -0700, Song Liu wrote:
> 
>> +static int
>> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
>> +{
>> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>> +
>> +	intel_pmu_disable_all();
>> +	intel_pmu_lbr_read();
>> +	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
>> +
>> +	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
>> +	intel_pmu_enable_all(0);
>> +	return cnt;
>> +}
> 
> Given this disables the PMI from 'random' contexts, should we not add
> IRQ disabling around this to avoid really bad behaviour?

Do you mean we should add (instead of not add) IRQ disable?

Thanks,
Song
Song Liu Sept. 3, 2021, 4:50 p.m. UTC | #5
> On Sep 3, 2021, at 1:02 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Thu, Sep 02, 2021 at 09:57:04AM -0700, Song Liu wrote:
>> +static int
>> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
>> +{
>> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>> +
>> +	intel_pmu_disable_all();
>> +	intel_pmu_lbr_read();
>> +	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
>> +
>> +	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
>> +	intel_pmu_enable_all(0);
>> +	return cnt;
>> +}
> 
> Would something like the below help get rid of that memcpy() ?
> 
> (compile tested only)

We can get rid of the memcpy. But we will need an extra "size" or "num_entries" 
parameter for intel_pmu_lbr_read. I can add this change in the next version. 

Thanks,
Song
Peter Zijlstra Sept. 4, 2021, 10:18 a.m. UTC | #6
On Fri, Sep 03, 2021 at 04:45:29PM +0000, Song Liu wrote:
> Hi Peter,
> 
> > On Sep 3, 2021, at 1:27 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > On Thu, Sep 02, 2021 at 09:57:04AM -0700, Song Liu wrote:
> > 
> >> +static int
> >> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
> >> +{
> >> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> >> +
> >> +	intel_pmu_disable_all();
> >> +	intel_pmu_lbr_read();
> >> +	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
> >> +
> >> +	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
> >> +	intel_pmu_enable_all(0);
> >> +	return cnt;
> >> +}
> > 
> > Given this disables the PMI from 'random' contexts, should we not add
> > IRQ disabling around this to avoid really bad behaviour?
> 
> Do you mean we should add (instead of not add) IRQ disable?

Yeah, I tihnk we want local_irq_save()/restore() here.
Song Liu Sept. 7, 2021, 6:59 p.m. UTC | #7
> On Sep 3, 2021, at 9:50 AM, Song Liu <songliubraving@fb.com> wrote:
> 
> 
> 
>> On Sep 3, 2021, at 1:02 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> 
>> On Thu, Sep 02, 2021 at 09:57:04AM -0700, Song Liu wrote:
>>> +static int
>>> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
>>> +{
>>> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>>> +
>>> +	intel_pmu_disable_all();
>>> +	intel_pmu_lbr_read();
>>> +	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
>>> +
>>> +	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
>>> +	intel_pmu_enable_all(0);
>>> +	return cnt;
>>> +}
>> 
>> Would something like the below help get rid of that memcpy() ?
>> 
>> (compile tested only)
> 
> We can get rid of the memcpy. But we will need an extra "size" or "num_entries" 
> parameter for intel_pmu_lbr_read. I can add this change in the next version. 
> 

This is trickier than I thought. As current lbr_read() function works with 
perf_branch_stack, while the BPF helper side uses array of perf_branch_entry. 
And the array is passed into the helper by the BPF program. Therefore, to 
really get rid of the memcpy, we need to refactor the lbr driver code more. 
How about we keep the memcpy for now, and add the optimization later (if we 
think it is necessary)?

Thanks,
Song
Andrii Nakryiko Sept. 7, 2021, 8:50 p.m. UTC | #8
On Tue, Sep 7, 2021 at 12:02 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Sep 3, 2021, at 9:50 AM, Song Liu <songliubraving@fb.com> wrote:
> >
> >
> >
> >> On Sep 3, 2021, at 1:02 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >>
> >> On Thu, Sep 02, 2021 at 09:57:04AM -0700, Song Liu wrote:
> >>> +static int
> >>> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
> >>> +{
> >>> +   struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> >>> +
> >>> +   intel_pmu_disable_all();
> >>> +   intel_pmu_lbr_read();
> >>> +   cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
> >>> +
> >>> +   memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
> >>> +   intel_pmu_enable_all(0);
> >>> +   return cnt;
> >>> +}
> >>
> >> Would something like the below help get rid of that memcpy() ?
> >>
> >> (compile tested only)
> >
> > We can get rid of the memcpy. But we will need an extra "size" or "num_entries"
> > parameter for intel_pmu_lbr_read. I can add this change in the next version.
> >
>
> This is trickier than I thought. As current lbr_read() function works with
> perf_branch_stack, while the BPF helper side uses array of perf_branch_entry.
> And the array is passed into the helper by the BPF program. Therefore, to
> really get rid of the memcpy, we need to refactor the lbr driver code more.
> How about we keep the memcpy for now, and add the optimization later (if we
> think it is necessary)?
>

Sounds good to me!

> Thanks,
> Song
>
diff mbox series

Patch

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7011e87be6d03..d3422915969b9 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2143,7 +2143,7 @@  static __initconst const u64 knl_hw_cache_extra_regs
  * However, there are some cases which may change PEBS status, e.g. PMI
  * throttle. The PEBS_ENABLE should be updated where the status changes.
  */
-static void __intel_pmu_disable_all(void)
+static __always_inline void __intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
@@ -2153,7 +2153,7 @@  static void __intel_pmu_disable_all(void)
 		intel_pmu_disable_bts();
 }
 
-static void intel_pmu_disable_all(void)
+static __always_inline void intel_pmu_disable_all(void)
 {
 	__intel_pmu_disable_all();
 	intel_pmu_pebs_disable_all();
@@ -2186,6 +2186,20 @@  static void intel_pmu_enable_all(int added)
 	__intel_pmu_enable_all(added, false);
 }
 
+static int
+intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	intel_pmu_disable_all();
+	intel_pmu_lbr_read();
+	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
+
+	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+	intel_pmu_enable_all(0);
+	return cnt;
+}
+
 /*
  * Workaround for:
  *   Intel Errata AAK100 (model 26)
@@ -6283,9 +6297,15 @@  __init int intel_pmu_init(void)
 			x86_pmu.lbr_nr = 0;
 	}
 
-	if (x86_pmu.lbr_nr)
+	if (x86_pmu.lbr_nr) {
 		pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
 
+		/* only support branch_stack snapshot for perfmon >= v2 */
+		if (x86_pmu.disable_all == intel_pmu_disable_all)
+			static_call_update(perf_snapshot_branch_stack,
+					   intel_pmu_snapshot_branch_stack);
+	}
+
 	intel_pmu_check_extra_regs(x86_pmu.extra_regs);
 
 	/* Support full width counters using alternative MSR range */
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8647713276a73..8a832986578a9 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1296,14 +1296,6 @@  void intel_pmu_pebs_enable_all(void)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 }
 
-void intel_pmu_pebs_disable_all(void)
-{
-	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-	if (cpuc->pebs_enabled)
-		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
-}
-
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e3ac05c97b5e5..171abbb359fe5 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1240,6 +1240,14 @@  static inline bool intel_pmu_has_bts(struct perf_event *event)
 	return intel_pmu_has_bts_period(event, hwc->sample_period);
 }
 
+static __always_inline void intel_pmu_pebs_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (cpuc->pebs_enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
 int intel_pmu_save_and_restart(struct perf_event *event);
 
 struct event_constraint *
@@ -1314,8 +1322,6 @@  void intel_pmu_pebs_disable(struct perf_event *event);
 
 void intel_pmu_pebs_enable_all(void);
 
-void intel_pmu_pebs_disable_all(void);
-
 void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
 
 void intel_pmu_auto_reload_read(struct perf_event *event);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fe156a8170aa3..4fe11f4f896b1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -57,6 +57,7 @@  struct perf_guest_info_callbacks {
 #include <linux/cgroup.h>
 #include <linux/refcount.h>
 #include <linux/security.h>
+#include <linux/static_call.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -1612,4 +1613,26 @@  extern void __weak arch_perf_update_userpage(struct perf_event *event,
 extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr);
 #endif
 
+/*
+ * Snapshot branch stack on software events.
+ *
+ * Branch stack can be very useful in understanding software events. For
+ * example, when a long function, e.g. sys_perf_event_open, returns an
+ * errno, it is not obvious why the function failed. Branch stack could
+ * provide very helpful information in this type of scenarios.
+ *
+ * On software event, it is necessary to stop the hardware branch recorder
+ * fast. Otherwise, the hardware register/buffer will be flushed with
+ * entries af the triggering event. Therefore, static call is used to
+ * stop the hardware recorder.
+ */
+
+/*
+ * cnt is the number of entries allocated for entries.
+ * Return number of entries copied to .
+ */
+typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries,
+					   unsigned int cnt);
+DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
+
 #endif /* _LINUX_PERF_EVENT_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 011cc5069b7ba..d32a3cf37eb90 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13437,3 +13437,5 @@  struct cgroup_subsys perf_event_cgrp_subsys = {
 	.threaded	= true,
 };
 #endif /* CONFIG_CGROUP_PERF */
+
+DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);