Message ID | 20230711082455.215983-10-anshuman.khandual@arm.com (mailing list archive)
---|---
State | New, archived
Series | arm64/perf: Enable branch stack sampling
>diff --git a/drivers/perf/arm_brbe.c b/drivers/perf/arm_brbe.c
>index 203cd4f350d5..2177632befa6 100644
>--- a/drivers/perf/arm_brbe.c
>+++ b/drivers/perf/arm_brbe.c
>@@ -165,6 +165,36 @@ static int stitch_stored_live_entries(struct brbe_regset *stored,
> 	return min(nr_live + nr_stored, nr_max);
> }
>
>+static int brbe_branch_save(struct brbe_regset *live, int nr_hw_entries)
>+{
>+	u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>+	int nr_live;
>+
>+	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>+	isb();
>+
>+	nr_live = capture_brbe_regset(live, nr_hw_entries);
>+
>+	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>+	isb();
>+
>+	return nr_live;
>+}
>+
>+void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
>+{
>+	struct arm64_perf_task_context *task_ctx = ctx;
>+	struct brbe_regset live[BRBE_MAX_ENTRIES];
>+	int nr_live, nr_store, nr_hw_entries;
>+
>+	nr_hw_entries = brbe_get_numrec(arm_pmu->reg_brbidr);
>+	nr_live = brbe_branch_save(live, nr_hw_entries);
>+	nr_store = task_ctx->nr_brbe_records;
>+	nr_store = stitch_stored_live_entries(task_ctx->store, live, nr_store,
>+					      nr_live, nr_hw_entries);
>+	task_ctx->nr_brbe_records = nr_store;
>+}

Asking out of curiosity: have you thought about the virtualization use
case? The current LBR implementation creates an event for the guest, and
save/restore happens using the sched_task callback. (Correct me if I am
wrong.) Given that you are only saving and processing those saved
entries, how do you plan to expose the entries to the guest?

Thanks
Rajnesh

>+
> /*
>  * Generic perf branch filters supported on BRBE
>  *
>diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
>index 408974d5c57b..aa3c7b3dcdd6 100644
>--- a/drivers/perf/arm_pmuv3.c
>+++ b/drivers/perf/arm_pmuv3.c
>@@ -923,9 +923,19 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
> static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> {
> 	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>+	void *task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
>
>-	if (sched_in && armpmu->has_branch_stack)
>-		armv8pmu_branch_reset();
>+	if (armpmu->has_branch_stack) {
>+		/* Save branch records in task_ctx on sched out */
>+		if (task_ctx && !sched_in) {
>+			armv8pmu_branch_save(armpmu, task_ctx);
>+			return;
>+		}
>+
>+		/* Reset branch records on sched in */
>+		if (sched_in)
>+			armv8pmu_branch_reset();
>+	}
> }
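For context, the sched_task-driven save/restore pattern Rajnesh refers to looks roughly like the sketch below. This is a minimal illustration only: lbr_task_ctx, hw_save_lbr() and hw_restore_lbr() are hypothetical names, not the actual x86 code (which lives in arch/x86/events/intel/lbr.c).

/*
 * Minimal sketch of an LBR-style sched_task callback: records are
 * restored into the hardware on sched in and snapshotted on sched out.
 * All names here (lbr_task_ctx, hw_save_lbr, hw_restore_lbr, MAX_LBR)
 * are hypothetical, for illustration only.
 */
#define MAX_LBR	32

struct lbr_task_ctx {
	int	nr;
	u64	from[MAX_LBR];
	u64	to[MAX_LBR];
};

static void hw_save_lbr(struct lbr_task_ctx *ctx);	/* hypothetical */
static void hw_restore_lbr(struct lbr_task_ctx *ctx);	/* hypothetical */

static void example_sched_task(struct perf_event_pmu_context *pmu_ctx,
			       bool sched_in)
{
	struct lbr_task_ctx *ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;

	if (!ctx)
		return;

	if (sched_in)
		hw_restore_lbr(ctx);	/* reload the task's saved records */
	else
		hw_save_lbr(ctx);	/* snapshot live records for the task */
}

Note the contrast with the BRBE patch above: on sched in the arm64 code resets the records rather than restoring them, which is part of what makes the question of exposing saved entries to a guest interesting.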
On Wed, 02 Aug 2023 12:59:31 +0100,
Rajnesh Kanwal <rkanwal@rivosinc.com> wrote:
>
> [...]
>
> Asking out of curiosity: have you thought about the virtualization use
> case? The current LBR implementation creates an event for the guest, and
> save/restore happens using the sched_task callback. (Correct me if I am
> wrong.) Given that you are only saving and processing those saved
> entries, how do you plan to expose the entries to the guest?

Two possibilities:

- either we perform a full save/restore of the registers, so that host
  and guest have isolated states;

- or we trap all BRBE accesses and piggy-back on the perf framework for
  that, much like we already do for the PMU.

	M.
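A rough sketch of the first option follows, reusing capture_brbe_regset() from the patch above. struct brbe_guest_state and brbe_restore_regset() are hypothetical here; a real implementation would also have to save/restore the control registers (BRBFCR_EL1, BRBCR_EL1) and deal with records generated during the world switch itself.

/*
 * Sketch of option 1: give host and guest isolated BRBE states by
 * doing a full record save/restore across the world switch.
 * struct brbe_guest_state and brbe_restore_regset() are hypothetical.
 */
struct brbe_guest_state {
	int			nr;
	struct brbe_regset	regs[BRBE_MAX_ENTRIES];
};

/* Write saved records back into the hardware (hypothetical helper) */
static void brbe_restore_regset(struct brbe_regset *regs, int nr);

/* On guest exit: stash the guest's records, bring back the host's */
static void brbe_switch_to_host(struct brbe_guest_state *guest,
				struct brbe_guest_state *host,
				int nr_hw_entries)
{
	guest->nr = capture_brbe_regset(guest->regs, nr_hw_entries);
	brbe_restore_regset(host->regs, host->nr);
}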
diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h
index 3d8faf4200dc..3d047d2c9430 100644
--- a/arch/arm/include/asm/arm_pmuv3.h
+++ b/arch/arm/include/asm/arm_pmuv3.h
@@ -259,5 +259,6 @@ static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
 static inline void armv8pmu_branch_reset(void) { }
 static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu) { return 0; }
 static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu) { }
+static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx) { }
 #endif
 #endif
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index b0c12a5882df..36e7dfb466a6 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -40,6 +40,7 @@ void armv8pmu_branch_probe(struct arm_pmu *arm_pmu);
 void armv8pmu_branch_reset(void);
 int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu);
 void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu);
+void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx);
 #else
 static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
 {
@@ -66,6 +67,7 @@ static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
 static inline void armv8pmu_branch_reset(void) { }
 static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu) { return 0; }
 static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu) { }
+static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx) { }
 #endif
 #endif
 #endif
diff --git a/drivers/perf/arm_brbe.c b/drivers/perf/arm_brbe.c
index 203cd4f350d5..2177632befa6 100644
--- a/drivers/perf/arm_brbe.c
+++ b/drivers/perf/arm_brbe.c
@@ -165,6 +165,36 @@ static int stitch_stored_live_entries(struct brbe_regset *stored,
 	return min(nr_live + nr_stored, nr_max);
 }
 
+static int brbe_branch_save(struct brbe_regset *live, int nr_hw_entries)
+{
+	u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+	int nr_live;
+
+	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
+	isb();
+
+	nr_live = capture_brbe_regset(live, nr_hw_entries);
+
+	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
+	isb();
+
+	return nr_live;
+}
+
+void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
+{
+	struct arm64_perf_task_context *task_ctx = ctx;
+	struct brbe_regset live[BRBE_MAX_ENTRIES];
+	int nr_live, nr_store, nr_hw_entries;
+
+	nr_hw_entries = brbe_get_numrec(arm_pmu->reg_brbidr);
+	nr_live = brbe_branch_save(live, nr_hw_entries);
+	nr_store = task_ctx->nr_brbe_records;
+	nr_store = stitch_stored_live_entries(task_ctx->store, live, nr_store,
+					      nr_live, nr_hw_entries);
+	task_ctx->nr_brbe_records = nr_store;
+}
+
 /*
  * Generic perf branch filters supported on BRBE
  *
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 408974d5c57b..aa3c7b3dcdd6 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -923,9 +923,19 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
 static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
+	void *task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
 
-	if (sched_in && armpmu->has_branch_stack)
-		armv8pmu_branch_reset();
+	if (armpmu->has_branch_stack) {
+		/* Save branch records in task_ctx on sched out */
+		if (task_ctx && !sched_in) {
+			armv8pmu_branch_save(armpmu, task_ctx);
+			return;
+		}
+
+		/* Reset branch records on sched in */
+		if (sched_in)
+			armv8pmu_branch_reset();
+	}
 }
 
 /*
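On how the saved and live records are combined: stitch_stored_live_entries() is only partially visible in the hunks above, but a plausible body, consistent with the visible return statement and with index 0 holding the most recent record, is sketched below. This is a reconstruction for illustration, not necessarily the driver's actual code.

/*
 * Sketch of the stitching step: live records are newer than the
 * previously stored ones, so they go in front, and the older stored
 * records are shifted back, truncated at nr_max. Reconstructed body;
 * only the return statement is visible in the patch above.
 */
static int stitch_stored_live_entries(struct brbe_regset *stored,
				      struct brbe_regset *live,
				      int nr_stored, int nr_live,
				      int nr_max)
{
	/* How many older stored records still fit behind the live ones */
	int nr_keep = min(nr_stored, nr_max - nr_live);

	if (nr_keep > 0)
		memmove(&stored[nr_live], &stored[0], nr_keep * sizeof(*stored));
	memcpy(&stored[0], live, nr_live * sizeof(*stored));

	return min(nr_live + nr_stored, nr_max);
}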