diff mbox series

[V13,-,RESEND,09/10] arm64/perf: Implement branch records save on task sched out

Message ID 20230711082455.215983-10-anshuman.khandual@arm.com (mailing list archive)
State New, archived
Headers show
Series arm64/perf: Enable branch stack sampling | expand

Commit Message

Anshuman Khandual July 11, 2023, 8:24 a.m. UTC
This modifies the current armv8pmu_sched_task() to implement a branch records
save mechanism via armv8pmu_branch_save() when a task is scheduled out of a
CPU. BRBE is paused and disabled for all exception levels before the branch
records are captured; the captured records are then concatenated with any
existing stored records present in the task context, maintaining contiguity.
The final length of the concatenated buffer never exceeds the implemented
BRBE length.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Tested-by: James Clark <james.clark@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm/include/asm/arm_pmuv3.h    |  1 +
 arch/arm64/include/asm/perf_event.h |  2 ++
 drivers/perf/arm_brbe.c             | 30 +++++++++++++++++++++++++++++
 drivers/perf/arm_pmuv3.c            | 14 ++++++++++++--
 4 files changed, 45 insertions(+), 2 deletions(-)

Comments

Rajnesh Kanwal Aug. 2, 2023, 11:59 a.m. UTC | #1
>diff --git a/drivers/perf/arm_brbe.c b/drivers/perf/arm_brbe.c
>index 203cd4f350d5..2177632befa6 100644
>--- a/drivers/perf/arm_brbe.c
>+++ b/drivers/perf/arm_brbe.c
>@@ -165,6 +165,36 @@ static int stitch_stored_live_entries(struct brbe_regset *stored,
> 	return min(nr_live + nr_stored, nr_max);
> }
> 
>+static int brbe_branch_save(struct brbe_regset *live, int nr_hw_entries)
>+{
>+	u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>+	int nr_live;
>+
>+	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>+	isb();
>+
>+	nr_live = capture_brbe_regset(live, nr_hw_entries);
>+
>+	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>+	isb();
>+
>+	return nr_live;
>+}
>+
>+void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
>+{
>+	struct arm64_perf_task_context *task_ctx = ctx;
>+	struct brbe_regset live[BRBE_MAX_ENTRIES];
>+	int nr_live, nr_store, nr_hw_entries;
>+
>+	nr_hw_entries = brbe_get_numrec(arm_pmu->reg_brbidr);
>+	nr_live = brbe_branch_save(live, nr_hw_entries);
>+	nr_store = task_ctx->nr_brbe_records;
>+	nr_store = stitch_stored_live_entries(task_ctx->store, live, nr_store,
>+					      nr_live, nr_hw_entries);
>+	task_ctx->nr_brbe_records = nr_store;
>+}

Asking out of curiosity: have you thought about the virtualization use
case? The current LBR implementation creates an event for the guest,
and save/restore happens using the sched_task callback (correct me
if I am wrong). Given that you are only saving and processing those
saved entries, how do you plan to expose the entries to the guest?

Thanks
Rajnesh

>+
> /*
>  * Generic perf branch filters supported on BRBE
>  *
>diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
>index 408974d5c57b..aa3c7b3dcdd6 100644
>--- a/drivers/perf/arm_pmuv3.c
>+++ b/drivers/perf/arm_pmuv3.c
>@@ -923,9 +923,19 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
> static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> {
> 	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>+	void *task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
> 
>-	if (sched_in && armpmu->has_branch_stack)
>-		armv8pmu_branch_reset();
>+	if (armpmu->has_branch_stack) {
>+		/* Save branch records in task_ctx on sched out */
>+		if (task_ctx && !sched_in) {
>+			armv8pmu_branch_save(armpmu, task_ctx);
>+			return;
>+		}
>+
>+		/* Reset branch records on sched in */
>+		if (sched_in)
>+			armv8pmu_branch_reset();
>+	}
> }
Marc Zyngier Aug. 2, 2023, 7:16 p.m. UTC | #2
On Wed, 02 Aug 2023 12:59:31 +0100,
Rajnesh Kanwal <rkanwal@rivosinc.com> wrote:
> 
> >diff --git a/drivers/perf/arm_brbe.c b/drivers/perf/arm_brbe.c
> >index 203cd4f350d5..2177632befa6 100644
> >--- a/drivers/perf/arm_brbe.c
> >+++ b/drivers/perf/arm_brbe.c
> >@@ -165,6 +165,36 @@ static int stitch_stored_live_entries(struct brbe_regset *stored,
> > 	return min(nr_live + nr_stored, nr_max);
> > }
> > 
> >+static int brbe_branch_save(struct brbe_regset *live, int nr_hw_entries)
> >+{
> >+	u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> >+	int nr_live;
> >+
> >+	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> >+	isb();
> >+
> >+	nr_live = capture_brbe_regset(live, nr_hw_entries);
> >+
> >+	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> >+	isb();
> >+
> >+	return nr_live;
> >+}
> >+
> >+void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
> >+{
> >+	struct arm64_perf_task_context *task_ctx = ctx;
> >+	struct brbe_regset live[BRBE_MAX_ENTRIES];
> >+	int nr_live, nr_store, nr_hw_entries;
> >+
> >+	nr_hw_entries = brbe_get_numrec(arm_pmu->reg_brbidr);
> >+	nr_live = brbe_branch_save(live, nr_hw_entries);
> >+	nr_store = task_ctx->nr_brbe_records;
> >+	nr_store = stitch_stored_live_entries(task_ctx->store, live, nr_store,
> >+					      nr_live, nr_hw_entries);
> >+	task_ctx->nr_brbe_records = nr_store;
> >+}
> 
> Asking out of curiosity: have you thought about the virtualization use
> case? The current LBR implementation creates an event for the guest,
> and save/restore happens using the sched_task callback (correct me
> if I am wrong). Given that you are only saving and processing those
> saved entries, how do you plan to expose the entries to the guest?

Two possibilities:

- either we perform a full save/restore of the registers so that host
  and guest have isolated states

- or we trap all BRBE accesses and piggy-back on the perf framework
  for that, much like we already do for the PMU.

	M.
diff mbox series

Patch

diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h
index 3d8faf4200dc..3d047d2c9430 100644
--- a/arch/arm/include/asm/arm_pmuv3.h
+++ b/arch/arm/include/asm/arm_pmuv3.h
@@ -259,5 +259,6 @@  static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
 static inline void armv8pmu_branch_reset(void) { }
 static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu) { return 0; }
 static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu) { }
+static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx) { }
 #endif
 #endif
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index b0c12a5882df..36e7dfb466a6 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -40,6 +40,7 @@  void armv8pmu_branch_probe(struct arm_pmu *arm_pmu);
 void armv8pmu_branch_reset(void);
 int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu);
 void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu);
+void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx);
 #else
 static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
 {
@@ -66,6 +67,7 @@  static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
 static inline void armv8pmu_branch_reset(void) { }
 static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu) { return 0; }
 static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu) { }
+static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx) { }
 #endif
 #endif
 #endif
diff --git a/drivers/perf/arm_brbe.c b/drivers/perf/arm_brbe.c
index 203cd4f350d5..2177632befa6 100644
--- a/drivers/perf/arm_brbe.c
+++ b/drivers/perf/arm_brbe.c
@@ -165,6 +165,36 @@  static int stitch_stored_live_entries(struct brbe_regset *stored,
 	return min(nr_live + nr_stored, nr_max);
 }
 
+static int brbe_branch_save(struct brbe_regset *live, int nr_hw_entries)
+{
+	u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+	int nr_live;
+
+	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
+	isb();
+
+	nr_live = capture_brbe_regset(live, nr_hw_entries);
+
+	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
+	isb();
+
+	return nr_live;
+}
+
+void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
+{
+	struct arm64_perf_task_context *task_ctx = ctx;
+	struct brbe_regset live[BRBE_MAX_ENTRIES];
+	int nr_live, nr_store, nr_hw_entries;
+
+	nr_hw_entries = brbe_get_numrec(arm_pmu->reg_brbidr);
+	nr_live = brbe_branch_save(live, nr_hw_entries);
+	nr_store = task_ctx->nr_brbe_records;
+	nr_store = stitch_stored_live_entries(task_ctx->store, live, nr_store,
+					      nr_live, nr_hw_entries);
+	task_ctx->nr_brbe_records = nr_store;
+}
+
 /*
  * Generic perf branch filters supported on BRBE
  *
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 408974d5c57b..aa3c7b3dcdd6 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -923,9 +923,19 @@  static int armv8pmu_user_event_idx(struct perf_event *event)
 static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
+	void *task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
 
-	if (sched_in && armpmu->has_branch_stack)
-		armv8pmu_branch_reset();
+	if (armpmu->has_branch_stack) {
+		/* Save branch records in task_ctx on sched out */
+		if (task_ctx && !sched_in) {
+			armv8pmu_branch_save(armpmu, task_ctx);
+			return;
+		}
+
+		/* Reset branch records on sched in */
+		if (sched_in)
+			armv8pmu_branch_reset();
+	}
 }
 
 /*