[V14,3/8] drivers: perf: arm_pmuv3: Enable branch stack sampling framework

Message ID 20231114051329.327572-4-anshuman.khandual@arm.com (mailing list archive)
State New, archived
Series: arm64/perf: Enable branch stack sampling

Commit Message

Anshuman Khandual Nov. 14, 2023, 5:13 a.m. UTC
Branch stack sampling support, i.e. capturing branch records during execution
in core perf, rides along with normal HW events being scheduled on the PMU.
This prepares the ARMV8 PMU framework for branch stack support on relevant
PMUs with the required HW implementation.

ARMV8 PMU hardware support for branch stack sampling is indicated via a new
feature flag called 'has_branch_stack' that can be ascertained via probing.
This modifies the current gate in armpmu_event_init(), which unconditionally
blocks branch stack sampling based perf events, instead allowing such perf
events to be initialized on supporting PMU hardware.

Branch stack sampling is enabled and disabled along with regular PMU events.
This adds the required function callbacks, in armv8pmu_branch_xxx() format,
to drive the PMU branch stack hardware when supported. It also adds fallback
stub definitions of these callbacks for PMUs which lack the required
support.

If a task gets scheduled out, the current branch records get saved in the
task's context data, which can later be used to fill in the records upon an
event overflow. Hence, we enable PERF_ATTACH_TASK_DATA (an event->attach_state
based flag) for branch stack requesting perf events. But this also requires
adding support for the pmu::sched_task() callback to arm_pmu.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
Changes in V14:

- Folded the following patches from the V13 series into this single patch:

  drivers: perf: arm_pmu: Add new sched_task() callback
  arm64/perf: Add branch stack support in struct arm_pmu
  arm64/perf: Add branch stack support in struct pmu_hw_events
  arm64/perf: Add branch stack support in ARMV8 PMU

- All armv8pmu_branch_xxxx() stub definitions have been moved inside
  include/linux/perf/arm_pmuv3.h for easy access from both arm32 and
  arm64 platforms

- Added brbe_users, brbe_context and brbe_sample_type to struct pmu_hw_events
- Added branch_reset() and sched_task() callbacks

- Changed and optimized branch records processing during a PMU IRQ
- No branch records get captured for events with a mismatched brbe_sample_type

- Branch record context is tracked from armpmu_del() & armpmu_add()
- Branch record hardware is driven from armv8pmu_start() & armv8pmu_stop()

 drivers/perf/arm_pmu.c         |  41 +++++++++-
 drivers/perf/arm_pmuv3.c       | 141 ++++++++++++++++++++++++++++++++-
 include/linux/perf/arm_pmu.h   |  29 ++++++-
 include/linux/perf/arm_pmuv3.h |  46 +++++++++++
 4 files changed, 253 insertions(+), 4 deletions(-)

Comments

James Clark Nov. 14, 2023, 9:58 a.m. UTC | #1
On 14/11/2023 05:13, Anshuman Khandual wrote:
> Branch stack sampling support, i.e. capturing branch records during execution
> in core perf, rides along with normal HW events being scheduled on the PMU.
> This prepares the ARMV8 PMU framework for branch stack support on relevant
> PMUs with the required HW implementation.
> 

[...]

> - All armv8pmu_branch_xxxx() stub definitions have been moved inside
>   include/linux/perf/arm_pmuv3.h for easy access from both arm32 and
>   arm64 platforms
> 

This causes lots of W=1 build errors because the prototypes are in
arm_pmuv3.h, but arm_brbe.c doesn't include it.

It seems like the main reason you can't include arm_brbe.h in arm32 code
is because there are a load of inline functions and references to
registers in there. But these are only used in arm_brbe.c, so they don't
need to be in the header anyway.

If you removed the code from the header and moved it to the source file
you could move the brbe prototypes to the brbe header and it would be a
bit cleaner and more idiomatic.
James Clark Nov. 14, 2023, 12:14 p.m. UTC | #2
On 14/11/2023 05:13, Anshuman Khandual wrote:
[...]

> +/*
> + * This is a read only constant and safe during multi threaded access
> + */
> +static struct perf_branch_stack zero_branch_stack = { .nr = 0, .hw_idx = -1ULL};
> +
> +static void read_branch_records(struct pmu_hw_events *cpuc,
> +				struct perf_event *event,
> +				struct perf_sample_data *data,
> +				bool *branch_captured)
> +{
> +	/*
> +	 * CPU specific branch records buffer must have been allocated already
> +	 * for the hardware records to be captured and processed further.
> +	 */
> +	if (WARN_ON(!cpuc->branches))
> +		return;
> +
> +	/*
> +	 * Overflowed event's branch_sample_type does not match the configured
> +	 * branch filters in the BRBE HW. So the captured branch records here
> +	 * cannot be co-related to the overflowed event. Report to the user as
> +	 * if no branch records have been captured, and flush branch records.
> +	 * The same scenario is applicable when the current task context does
> +	 * not match with overflown event.
> +	 */
> +	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
> +	    (event->ctx->task && cpuc->brbe_context != event->ctx)) {
> +		perf_sample_save_brstack(data, event, &zero_branch_stack);

Is there any benefit to outputting a zero size stack vs not outputting
anything at all?

> +		return;
> +	}
> +
> +	/*
> +	 * Read the branch records from the hardware once after the PMU IRQ
> +	 * has been triggered but subsequently same records can be used for
> +	 * other events that might have been overflowed simultaneously thus
> +	 * saving much CPU cycles.
> +	 */
> +	if (!*branch_captured) {
> +		armv8pmu_branch_read(cpuc, event);
> +		*branch_captured = true;
> +	}
> +	perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack);
> +}
> +
>  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>  {
>  	u32 pmovsr;
> @@ -766,6 +815,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>  	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
>  	struct pt_regs *regs;
>  	int idx;
> +	bool branch_captured = false;
>  
>  	/*
>  	 * Get and reset the IRQ flags
> @@ -809,6 +859,13 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>  		if (!armpmu_event_set_period(event))
>  			continue;
>  
> +		/*
> +		 * PMU IRQ should remain asserted until all branch records
> +		 * are captured and processed into struct perf_sample_data.
> +		 */
> +		if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
> +			read_branch_records(cpuc, event, &data, &branch_captured);

You could return instead of using the out param, not really any
different, but maybe a bit more normal:

  branch_captured |= read_branch_records(cpuc, event, &data,
branch_captured);

> +
>  		/*
>  		 * Perf event overflow will queue the processing of the event as
>  		 * an irq_work which will be taken care of in the handling of
> @@ -818,6 +875,8 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>  			cpu_pmu->disable(event);
>  	}
>  	armv8pmu_start(cpu_pmu);
> +	if (cpu_pmu->has_branch_stack)
> +		armv8pmu_branch_reset();
>  
>  	return IRQ_HANDLED;
>  }
> @@ -907,6 +966,24 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>  	return event->hw.idx;
>  }
>  
> +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> +{
> +	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
> +	void *task_ctx = pmu_ctx->task_ctx_data;
> +
> +	if (armpmu->has_branch_stack) {
> +		/* Save branch records in task_ctx on sched out */
> +		if (task_ctx && !sched_in) {
> +			armv8pmu_branch_save(armpmu, task_ctx);
> +			return;
> +		}
> +
> +		/* Reset branch records on sched in */
> +		if (sched_in)
> +			armv8pmu_branch_reset();
> +	}
> +}
> +
>  /*
>   * Add an event filter to a given event.
>   */
> @@ -977,6 +1054,9 @@ static void armv8pmu_reset(void *info)
>  		pmcr |= ARMV8_PMU_PMCR_LP;
>  
>  	armv8pmu_pmcr_write(pmcr);
> +
> +	if (cpu_pmu->has_branch_stack)
> +		armv8pmu_branch_reset();
>  }
>  
>  static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
> @@ -1014,6 +1094,20 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>  
>  	hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>  
> +	if (has_branch_stack(event)) {
> +		if (!armv8pmu_branch_attr_valid(event))
> +			return -EOPNOTSUPP;
> +
> +		/*
> +		 * If a task gets scheduled out, the current branch records
> +		 * get saved in the task's context data, which can be later
> +		 * used to fill in the records upon an event overflow. Let's
> +		 * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
> +		 * all branch stack sampling perf events.
> +		 */
> +		event->attach_state |= PERF_ATTACH_TASK_DATA;
> +	}
> +
>  	/*
>  	 * CHAIN events only work when paired with an adjacent counter, and it
>  	 * never makes sense for a user to open one in isolation, as they'll be
> @@ -1130,6 +1224,35 @@ static void __armv8pmu_probe_pmu(void *info)
>  		cpu_pmu->reg_pmmir = read_pmmir();
>  	else
>  		cpu_pmu->reg_pmmir = 0;
> +	armv8pmu_branch_probe(cpu_pmu);

I'm not sure if this is splitting hairs or not, but
__armv8pmu_probe_pmu() is run on only one of 'any' of the supported CPUs
for this PMU.

Is it not possible to have some of those CPUs support and some not
support BRBE, even though they are all the same PMU type? Maybe we could
wait for it to explode with some weird system, or change it so that the
BRBE probe is run on every CPU, with a second 'supported_brbe_mask' field.
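
Something like this, maybe (untested; brbe_present() below stands in for
whatever the real per-CPU feature check would be):

	/* new field in struct arm_pmu */
	cpumask_t	supported_brbe_mask;

	static void __armv8pmu_branch_probe_cpu(void *info)
	{
		struct arm_pmu *cpu_pmu = info;

		/* Runs on each CPU and records whether it implements BRBE */
		if (brbe_present())
			cpumask_set_cpu(smp_processor_id(),
					&cpu_pmu->supported_brbe_mask);
	}

	/* at probe time, instead of probing on a single CPU */
	on_each_cpu_mask(&cpu_pmu->supported_cpus,
			 __armv8pmu_branch_probe_cpu, cpu_pmu, 1);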

> +}
> +
> +static int branch_records_alloc(struct arm_pmu *armpmu)
> +{
> +	struct branch_records __percpu *records;
> +	int cpu;
> +
> +	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
> +	if (!records)
> +		return -ENOMEM;
> +

Doesn't this technically need to take the CPU mask where BRBE is
supported into account? Otherwise you are allocating for cores that
never use it.

Also it's done per-CPU _and_ per-PMU type, multiplying the number of
BRBE buffers allocated, even if they can only ever be used per-CPU.

> +	/*
> +	 * percpu memory allocated for 'records' gets completely consumed
> +	 * here, and never required to be freed up later. So permanently
> +	 * losing access to this anchor i.e 'records' is acceptable.
> +	 *
> +	 * Otherwise this allocation handle would have to be saved up for
> +	 * free_percpu() release later if required.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		struct pmu_hw_events *events_cpu;
> +		struct branch_records *records_cpu;
> +
> +		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
> +		records_cpu = per_cpu_ptr(records, cpu);
> +		events_cpu->branches = records_cpu;
> +	}
> +	return 0;
>  }
>  
>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
> @@ -1146,7 +1269,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>  	if (ret)
>  		return ret;
>  
> -	return probe.present ? 0 : -ENODEV;
> +	if (!probe.present)
> +		return -ENODEV;
> +
> +	if (cpu_pmu->has_branch_stack) {
> +		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
> +		if (ret)
> +			return ret;
> +
> +		ret = branch_records_alloc(cpu_pmu);
> +		if (ret) {
> +			armv8pmu_task_ctx_cache_free(cpu_pmu);
> +			return ret;
> +		}
> +	}
> +	return 0;
>  }
>  

[...]
> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
> index 9c226adf938a..72da4522397c 100644
> --- a/include/linux/perf/arm_pmuv3.h
> +++ b/include/linux/perf/arm_pmuv3.h
> @@ -303,4 +303,50 @@
>  		}						\
>  	} while (0)
>  
> +struct pmu_hw_events;
> +struct arm_pmu;
> +struct perf_event;
> +
> +#ifdef CONFIG_PERF_EVENTS

Very minor nit, but if you end up moving the stubs to the brbe header
you probably don't need the #ifdef CONFIG_PERF_EVENTS because it just
won't be included in that case.

> +static inline void armv8pmu_branch_reset(void)
> +{
> +}
> +
> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
> +{
> +}
> +
> +static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
> +{
> +	WARN_ON_ONCE(!has_branch_stack(event));
> +	return false;
> +}
> +
> +static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
> +{
> +}
> +
> +static inline void armv8pmu_branch_disable(void)
> +{
> +}
> +
> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
> +					struct perf_event *event)
> +{
> +	WARN_ON_ONCE(!has_branch_stack(event));
> +}
> +
> +static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
> +{
> +}
> +
> +static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
> +{
> +	return 0;
> +}
> +
> +static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
> +{
> +}
> +#endif /* CONFIG_PERF_EVENTS */
>  #endif
James Clark Nov. 14, 2023, 5:10 p.m. UTC | #3
On 14/11/2023 05:13, Anshuman Khandual wrote:
[...]
> 
> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
> index d712a19e47ac..76f1376ae594 100644
> --- a/drivers/perf/arm_pmu.c
> +++ b/drivers/perf/arm_pmu.c
> @@ -317,6 +317,15 @@ armpmu_del(struct perf_event *event, int flags)
>  	struct hw_perf_event *hwc = &event->hw;
>  	int idx = hwc->idx;
>  
> +	if (has_branch_stack(event)) {
> +		WARN_ON_ONCE(!hw_events->brbe_users);
> +		hw_events->brbe_users--;
> +		if (!hw_events->brbe_users) {
> +			hw_events->brbe_context = NULL;
> +			hw_events->brbe_sample_type = 0;
> +		}
> +	}
> +
>  	armpmu_stop(event, PERF_EF_UPDATE);
>  	hw_events->events[idx] = NULL;
>  	armpmu->clear_event_idx(hw_events, event);
> @@ -333,6 +342,22 @@ armpmu_add(struct perf_event *event, int flags)
>  	struct hw_perf_event *hwc = &event->hw;
>  	int idx;
>  
> +	if (has_branch_stack(event)) {
> +		/*
> +		 * Reset branch records buffer if a new task event gets
> +		 * scheduled on a PMU which might have existing records.
> +		 * Otherwise older branch records present in the buffer
> +		 * might leak into the new task event.
> +		 */
> +		if (event->ctx->task && hw_events->brbe_context != event->ctx) {
> +			hw_events->brbe_context = event->ctx;
> +			if (armpmu->branch_reset)
> +				armpmu->branch_reset();

What about a per-thread event following a per-cpu event? Doesn't that
also need to branch_reset()? If hw_events->brbe_context was already
previously assigned, once the per-thread event is switched in it skips
this reset following a per-cpu event on the same core.

I think it should be possible to add a test for this scenario by creating
simultaneous per-cpu and per-thread events and checking for leakage, as
sketched below.
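
For example, something along these lines (hypothetical invocations, not an
actual test from the series):

  # per-cpu event with branch sampling running in the background
  perf record -a -j any -e cycles -o percpu.data -- sleep 10 &

  # per-thread event pinned to the same core; its first samples should
  # not contain branches recorded before it was switched in
  taskset -c 0 perf record -j any -e cycles -o task.data -- ./workload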

> +		}
> +		hw_events->brbe_users++;
> +		hw_events->brbe_sample_type = event->attr.branch_sample_type;
> +	}
> +
>  	/* An event following a process won't be stopped earlier */
>  	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
>  		return -ENOENT;
> @@ -512,13 +537,24 @@ static int armpmu_event_init(struct perf_event *event)
>  		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
>  		return -ENOENT;
>  
> -	/* does not support taken branch sampling */
> -	if (has_branch_stack(event))
> +	/*
> +	 * Branch stack sampling events are allowed
> +	 * only on PMU which has required support.
> +	 */
> +	if (has_branch_stack(event) && !armpmu->has_branch_stack)
>  		return -EOPNOTSUPP;
>  
>  	return __hw_perf_event_init(event);
>  }
>  

[...]
> +/*
> + * This is a read only constant and safe during multi threaded access
> + */
> +static struct perf_branch_stack zero_branch_stack = { .nr = 0, .hw_idx = -1ULL};
> +
> +static void read_branch_records(struct pmu_hw_events *cpuc,
> +				struct perf_event *event,
> +				struct perf_sample_data *data,
> +				bool *branch_captured)
> +{
> +	/*
> +	 * CPU specific branch records buffer must have been allocated already
> +	 * for the hardware records to be captured and processed further.
> +	 */
> +	if (WARN_ON(!cpuc->branches))
> +		return;
> +
> +	/*
> +	 * Overflowed event's branch_sample_type does not match the configured
> +	 * branch filters in the BRBE HW. So the captured branch records here
> +	 * cannot be co-related to the overflowed event. Report to the user as
> +	 * if no branch records have been captured, and flush branch records.
> +	 * The same scenario is applicable when the current task context does
> +	 * not match with overflown event.
> +	 */
> +	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
> +	    (event->ctx->task && cpuc->brbe_context != event->ctx)) {
> +		perf_sample_save_brstack(data, event, &zero_branch_stack);
> +		return;
> +	}

I think we should probably add a test for this scenario too. Like that
the second event opened on the same thread as another event with
different brbe settings always produces zero records.
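
For example, roughly (hypothetical invocations):

  ./workload &
  WPID=$!
  perf record -j any_call -e cycles -p $WPID -o first.data &
  perf record -j any_ret -e cycles -p $WPID -o second.data

where whichever event doesn't match the currently programmed BRBE filters
would be expected to produce only zero-length branch stacks.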

I actually tried to reproduce this behaviour but couldn't. Not sure if I
did something wrong though.
Anshuman Khandual Nov. 15, 2023, 5:44 a.m. UTC | #4
On 11/14/23 15:28, James Clark wrote:
> 
> 
> On 14/11/2023 05:13, Anshuman Khandual wrote:
>> Branch stack sampling support, i.e. capturing branch records during execution
>> in core perf, rides along with normal HW events being scheduled on the PMU.
>> This prepares the ARMV8 PMU framework for branch stack support on relevant
>> PMUs with the required HW implementation.
>>
> 
> [...]
> 
>> - All armv8pmu_branch_xxxx() stub definitions have been moved inside
>>   include/linux/perf/arm_pmuv3.h for easy access from both arm32 and
>>   arm64 platforms
>>
> 
> This causes lots of W=1 build errors because the prototypes are in
> arm_pmuv3.h, but arm_brbe.c doesn't include it.

I guess these are the W=1 warnings you mentioned above.

drivers/perf/arm_brbe.c:11:6: warning: no previous prototype for ‘armv8pmu_branch_reset’ [-Wmissing-prototypes]
   11 | void armv8pmu_branch_reset(void)                                                                                                                                                                   
      |      ^~~~~~~~~~~~~~~~~~~~~                                                                                                                                                                         
drivers/perf/arm_brbe.c:190:6: warning: no previous prototype for ‘armv8pmu_branch_save’ [-Wmissing-prototypes]      
  190 | void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)                      
      |      ^~~~~~~~~~~~~~~~~~~~                                                                                                                                                                          
drivers/perf/arm_brbe.c:236:6: warning: no previous prototype for ‘armv8pmu_branch_attr_valid’ [-Wmissing-prototypes]
  236 | bool armv8pmu_branch_attr_valid(struct perf_event *event)                                                                                                                                          
      |      ^~~~~~~~~~~~~~~~~~~~~~~~~~                                                                                                                                                                    
drivers/perf/arm_brbe.c:269:5: warning: no previous prototype for ‘armv8pmu_task_ctx_cache_alloc’ [-Wmissing-prototypes]
  269 | int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)                            
      |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~                                                            
drivers/perf/arm_brbe.c:279:6: warning: no previous prototype for ‘armv8pmu_task_ctx_cache_free’ [-Wmissing-prototypes]
  279 | void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)                       
      |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~                                                            
drivers/perf/arm_brbe.c:303:6: warning: no previous prototype for ‘armv8pmu_branch_probe’ [-Wmissing-prototypes]
  303 | void armv8pmu_branch_probe(struct arm_pmu *armpmu)                                
      |      ^~~~~~~~~~~~~~~~~~~~~                                                                   
drivers/perf/arm_brbe.c:449:6: warning: no previous prototype for ‘armv8pmu_branch_enable’ [-Wmissing-prototypes]
  449 | void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)                             
      |      ^~~~~~~~~~~~~~~~~~~~~~                                                                  
drivers/perf/arm_brbe.c:474:6: warning: no previous prototype for ‘armv8pmu_branch_disable’ [-Wmissing-prototypes]
  474 | void armv8pmu_branch_disable(void)                                                           
      |      ^~~~~~~~~~~~~~~~~~~~~~~                                                                 
drivers/perf/arm_brbe.c:717:6: warning: no previous prototype for ‘armv8pmu_branch_read’ [-Wmissing-prototypes]
  717 | void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)

Branch helpers are used in the ARM PMU V3 driver, i.e. drivers/perf/arm_pmuv3.c.
Either the actual BRBE helper definitions, or their fallback stubs (when
CONFIG_ARM64_BRBE is not enabled), need to be accessible from the arm_pmuv3.c
driver, not from the arm_brbe.c implementation itself.

> 
> It seems like the main reason you can't include arm_brbe.h in arm32 code
> is because there are a load of inline functions and references to
> registers in there. But these are only used in arm_brbe.c, so they don't

Right, arm32 should not be exposed to BRBE internals via arm_brbe.h header.

> need to be in the header anyway.

Right, these are only used in arm_brbe.c

> 
> If you removed the code from the header and moved it to the source file
> you could move the brbe prototypes to the brbe header and it would be a
> bit cleaner and more idiomatic.

Alright, how about the following changes - build tested on arm32 and arm64.

- Move BRBE helpers from arm_brbe.h into arm_brbe.c
- Move armv8_pmu_xxx() declaration inside arm_brbe.h for arm64 (CONFIG_ARM64_BRBE)
- Move armv8_pmu_xxx() stub definitions inside arm_pmuv3.c for arm32 (!CONFIG_ARM64_BRBE)
- Include arm_brbe.h header both in arm_pmuv3.c and arm_brbe.c
Anshuman Khandual Nov. 15, 2023, 7:22 a.m. UTC | #5
On 11/14/23 17:44, James Clark wrote:
> 
> 
> On 14/11/2023 05:13, Anshuman Khandual wrote:
> [...]
> 
>> +/*
>> + * This is a read only constant and safe during multi threaded access
>> + */
>> +static struct perf_branch_stack zero_branch_stack = { .nr = 0, .hw_idx = -1ULL};
>> +
>> +static void read_branch_records(struct pmu_hw_events *cpuc,
>> +				struct perf_event *event,
>> +				struct perf_sample_data *data,
>> +				bool *branch_captured)
>> +{
>> +	/*
>> +	 * CPU specific branch records buffer must have been allocated already
>> +	 * for the hardware records to be captured and processed further.
>> +	 */
>> +	if (WARN_ON(!cpuc->branches))
>> +		return;
>> +
>> +	/*
>> +	 * Overflowed event's branch_sample_type does not match the configured
>> +	 * branch filters in the BRBE HW. So the captured branch records here
>> +	 * cannot be co-related to the overflowed event. Report to the user as
>> +	 * if no branch records have been captured, and flush branch records.
>> +	 * The same scenario is applicable when the current task context does
>> +	 * not match with overflown event.
>> +	 */
>> +	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
>> +	    (event->ctx->task && cpuc->brbe_context != event->ctx)) {
>> +		perf_sample_save_brstack(data, event, &zero_branch_stack);
> 
> Is there any benefit to outputting a zero size stack vs not outputting
> anything at all?

The event has got PERF_SAMPLE_BRANCH_STACK marked and hence perf_sample_data
must have PERF_SAMPLE_BRANCH_STACK with its br_stack pointing to the branch
records. Hence without assigning a zeroed struct perf_branch_stack, there is
a chance that perf_sample_data will pass on some garbage branch records to
the ring buffer.

> 
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * Read the branch records from the hardware once after the PMU IRQ
>> +	 * has been triggered but subsequently same records can be used for
>> +	 * other events that might have been overflowed simultaneously thus
>> +	 * saving much CPU cycles.
>> +	 */
>> +	if (!*branch_captured) {
>> +		armv8pmu_branch_read(cpuc, event);
>> +		*branch_captured = true;
>> +	}
>> +	perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack);
>> +}
>> +
>>  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>  {
>>  	u32 pmovsr;
>> @@ -766,6 +815,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>  	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
>>  	struct pt_regs *regs;
>>  	int idx;
>> +	bool branch_captured = false;
>>  
>>  	/*
>>  	 * Get and reset the IRQ flags
>> @@ -809,6 +859,13 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>  		if (!armpmu_event_set_period(event))
>>  			continue;
>>  
>> +		/*
>> +		 * PMU IRQ should remain asserted until all branch records
>> +		 * are captured and processed into struct perf_sample_data.
>> +		 */
>> +		if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
>> +			read_branch_records(cpuc, event, &data, &branch_captured);
> 
> You could return instead of using the out param, not really any
> different, but maybe a bit more normal:
> 
>   branch_captured |= read_branch_records(cpuc, event, &data,
> branch_captured);

I am just wondering - how would that be any better?

> 
>> +
>>  		/*
>>  		 * Perf event overflow will queue the processing of the event as
>>  		 * an irq_work which will be taken care of in the handling of
>> @@ -818,6 +875,8 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>  			cpu_pmu->disable(event);
>>  	}
>>  	armv8pmu_start(cpu_pmu);
>> +	if (cpu_pmu->has_branch_stack)
>> +		armv8pmu_branch_reset();
>>  
>>  	return IRQ_HANDLED;
>>  }
>> @@ -907,6 +966,24 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>>  	return event->hw.idx;
>>  }
>>  
>> +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>> +{
>> +	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>> +	void *task_ctx = pmu_ctx->task_ctx_data;
>> +
>> +	if (armpmu->has_branch_stack) {
>> +		/* Save branch records in task_ctx on sched out */
>> +		if (task_ctx && !sched_in) {
>> +			armv8pmu_branch_save(armpmu, task_ctx);
>> +			return;
>> +		}
>> +
>> +		/* Reset branch records on sched in */
>> +		if (sched_in)
>> +			armv8pmu_branch_reset();
>> +	}
>> +}
>> +
>>  /*
>>   * Add an event filter to a given event.
>>   */
>> @@ -977,6 +1054,9 @@ static void armv8pmu_reset(void *info)
>>  		pmcr |= ARMV8_PMU_PMCR_LP;
>>  
>>  	armv8pmu_pmcr_write(pmcr);
>> +
>> +	if (cpu_pmu->has_branch_stack)
>> +		armv8pmu_branch_reset();
>>  }
>>  
>>  static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
>> @@ -1014,6 +1094,20 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>>  
>>  	hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>>  
>> +	if (has_branch_stack(event)) {
>> +		if (!armv8pmu_branch_attr_valid(event))
>> +			return -EOPNOTSUPP;
>> +
>> +		/*
>> +		 * If a task gets scheduled out, the current branch records
>> +		 * get saved in the task's context data, which can be later
>> +		 * used to fill in the records upon an event overflow. Let's
>> +		 * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
>> +		 * all branch stack sampling perf events.
>> +		 */
>> +		event->attach_state |= PERF_ATTACH_TASK_DATA;
>> +	}
>> +
>>  	/*
>>  	 * CHAIN events only work when paired with an adjacent counter, and it
>>  	 * never makes sense for a user to open one in isolation, as they'll be
>> @@ -1130,6 +1224,35 @@ static void __armv8pmu_probe_pmu(void *info)
>>  		cpu_pmu->reg_pmmir = read_pmmir();
>>  	else
>>  		cpu_pmu->reg_pmmir = 0;
>> +	armv8pmu_branch_probe(cpu_pmu);
> 
> I'm not sure if this is splitting hairs or not, but
> __armv8pmu_probe_pmu() is run on only one of 'any' of the supported CPUs
> for this PMU.

Right.

> 
> Is it not possible to have some of those CPUs support and some not
> support BRBE, even though they are all the same PMU type? Maybe we could

I am not sure, but not something I have come across.

> wait for it to explode with some weird system, or change it so that the
> BRBE probe is run on every CPU, with a second 'supported_brbe_mask' field.

Right, but for now, the current solution looks sufficient.

> 
>> +}
>> +
>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>> +{
>> +	struct branch_records __percpu *records;
>> +	int cpu;
>> +
>> +	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
>> +	if (!records)
>> +		return -ENOMEM;
>> +
> 
> Doesn't this technically need to take the CPU mask where BRBE is
> supported into account? Otherwise you are allocating for cores that
> never use it.
> 
> Also it's done per-CPU _and_ per-PMU type, multiplying the number of
> BRBE buffers allocated, even if they can only ever be used per-CPU.

Agreed, but I believe we have already been through this discussion, and
settled on this method - for being a simpler approach.

> 
>> +	/*
>> +	 * percpu memory allocated for 'records' gets completely consumed
>> +	 * here, and never required to be freed up later. So permanently
>> +	 * losing access to this anchor i.e 'records' is acceptable.
>> +	 *
>> +	 * Otherwise this allocation handle would have to be saved up for
>> +	 * free_percpu() release later if required.
>> +	 */
>> +	for_each_possible_cpu(cpu) {
>> +		struct pmu_hw_events *events_cpu;
>> +		struct branch_records *records_cpu;
>> +
>> +		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
>> +		records_cpu = per_cpu_ptr(records, cpu);
>> +		events_cpu->branches = records_cpu;
>> +	}
>> +	return 0;
>>  }
>>  
>>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>> @@ -1146,7 +1269,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>  	if (ret)
>>  		return ret;
>>  
>> -	return probe.present ? 0 : -ENODEV;
>> +	if (!probe.present)
>> +		return -ENODEV;
>> +
>> +	if (cpu_pmu->has_branch_stack) {
>> +		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
>> +		if (ret)
>> +			return ret;
>> +
>> +		ret = branch_records_alloc(cpu_pmu);
>> +		if (ret) {
>> +			armv8pmu_task_ctx_cache_free(cpu_pmu);
>> +			return ret;
>> +		}
>> +	}
>> +	return 0;
>>  }
>>  
> 
> [...]
>> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
>> index 9c226adf938a..72da4522397c 100644
>> --- a/include/linux/perf/arm_pmuv3.h
>> +++ b/include/linux/perf/arm_pmuv3.h
>> @@ -303,4 +303,50 @@
>>  		}						\
>>  	} while (0)
>>  
>> +struct pmu_hw_events;
>> +struct arm_pmu;
>> +struct perf_event;
>> +
>> +#ifdef CONFIG_PERF_EVENTS
> 
> Very minor nit, but if you end up moving the stubs to the brbe header
> you probably don't need the #ifdef CONFIG_PERF_EVENTS because it just
> won't be included in that case.

Right, will drop CONFIG_PERF_EVENTS wrapper.

> 
>> +static inline void armv8pmu_branch_reset(void)
>> +{
>> +}
>> +
>> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
>> +{
>> +}
>> +
>> +static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
>> +{
>> +	WARN_ON_ONCE(!has_branch_stack(event));
>> +	return false;
>> +}
>> +
>> +static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
>> +{
>> +}
>> +
>> +static inline void armv8pmu_branch_disable(void)
>> +{
>> +}
>> +
>> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
>> +					struct perf_event *event)
>> +{
>> +	WARN_ON_ONCE(!has_branch_stack(event));
>> +}
>> +
>> +static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
>> +{
>> +}
>> +
>> +static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
>> +{
>> +}
>> +#endif /* CONFIG_PERF_EVENTS */
>>  #endif
James Clark Nov. 15, 2023, 9:37 a.m. UTC | #6
On 15/11/2023 05:44, Anshuman Khandual wrote:
> On 11/14/23 15:28, James Clark wrote:
>>
>>
>> On 14/11/2023 05:13, Anshuman Khandual wrote:
>>> Branch stack sampling support, i.e. capturing branch records during execution
>>> in core perf, rides along with normal HW events being scheduled on the PMU.
>>> This prepares the ARMV8 PMU framework for branch stack support on relevant
>>> PMUs with the required HW implementation.
>>>
>>
>> [...]
>>
>>> - All armv8pmu_branch_xxxx() stub definitions have been moved inside
>>>   include/linux/perf/arm_pmuv3.h for easy access from both arm32 and
>>>   arm64 platforms
>>>
>>
>> This causes lots of W=1 build errors because the prototypes are in
>> arm_pmuv3.h, but arm_brbe.c doesn't include it.
> 
> I guess these are the W=1 warnings you mentioned above.
> 
> drivers/perf/arm_brbe.c:11:6: warning: no previous prototype for ‘armv8pmu_branch_reset’ [-Wmissing-prototypes]
>    11 | void armv8pmu_branch_reset(void)                                                                                                                                                                   
>       |      ^~~~~~~~~~~~~~~~~~~~~                                                                                                                                                                         
> drivers/perf/arm_brbe.c:190:6: warning: no previous prototype for ‘armv8pmu_branch_save’ [-Wmissing-prototypes]      
>   190 | void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)                      
>       |      ^~~~~~~~~~~~~~~~~~~~                                                                                                                                                                          
> drivers/perf/arm_brbe.c:236:6: warning: no previous prototype for ‘armv8pmu_branch_attr_valid’ [-Wmissing-prototypes]
>   236 | bool armv8pmu_branch_attr_valid(struct perf_event *event)                                                                                                                                          
>       |      ^~~~~~~~~~~~~~~~~~~~~~~~~~                                                                                                                                                                    
> drivers/perf/arm_brbe.c:269:5: warning: no previous prototype for ‘armv8pmu_task_ctx_cache_alloc’ [-Wmissing-prototypes]
>   269 | int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)                            
>       |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~                                                            
> drivers/perf/arm_brbe.c:279:6: warning: no previous prototype for ‘armv8pmu_task_ctx_cache_free’ [-Wmissing-prototypes]
>   279 | void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)                       
>       |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~                                                            
> drivers/perf/arm_brbe.c:303:6: warning: no previous prototype for ‘armv8pmu_branch_probe’ [-Wmissing-prototypes]
>   303 | void armv8pmu_branch_probe(struct arm_pmu *armpmu)                                
>       |      ^~~~~~~~~~~~~~~~~~~~~                                                                   
> drivers/perf/arm_brbe.c:449:6: warning: no previous prototype for ‘armv8pmu_branch_enable’ [-Wmissing-prototypes]
>   449 | void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)                             
>       |      ^~~~~~~~~~~~~~~~~~~~~~                                                                  
> drivers/perf/arm_brbe.c:474:6: warning: no previous prototype for ‘armv8pmu_branch_disable’ [-Wmissing-prototypes]
>   474 | void armv8pmu_branch_disable(void)                                                           
>       |      ^~~~~~~~~~~~~~~~~~~~~~~                                                                 
> drivers/perf/arm_brbe.c:717:6: warning: no previous prototype for ‘armv8pmu_branch_read’ [-Wmissing-prototypes]
>   717 | void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> 
> Branch helpers are used in ARM PMU V3 driver i.e drivers/perf/arm_pmuv3.c.
> Whether the actual BRBE helper definitions, or their fallback stubs (when
> CONFIG_ARM64_BRBE is not enabled), need to be accessible from arm_pmuv3.c
> driver not from brbe.c implementations itself.
> 
>>
>> It seems like the main reason you can't include arm_brbe.h in arm32 code
>> is because there are a load of inline functions and references to
>> registers in there. But these are only used in arm_brbe.c, so they don't
> 
> Right, arm32 should not be exposed to BRBE internals via arm_brbe.h header.
> 
>> need to be in the header anyway.
> 
> Right, these are only used in arm_brbe.c
> 
>>
>> If you removed the code from the header and moved it to the source file
>> you could move the brbe prototypes to the brbe header and it would be a
>> bit cleaner and more idiomatic.
> 
> Alright, how about the following changes - build tested on arm32 and arm64.
> 
> - Move BRBE helpers from arm_brbe.h into arm_brbe.c
> - Move armv8_pmu_xxx() declaration inside arm_brbe.h for arm64 (CONFIG_ARM64_BRBE)
> - Move armv8_pmu_xxx() stub definitions inside arm_pmuv3.c for arm32 (!CONFIG_ARM64_BRBE)
> - Include arm_brbe.h header both in arm_pmuv3.c and arm_brbe.c

Agree to them all except:

  - Move armv8_pmu_xxx() stub definitions inside arm_pmuv3.c for arm32
(!CONFIG_ARM64_BRBE)

Normally you put the stubs right next to the prototypes with #else, so
in this case both would be in arm_brbe.h. Not sure what the reason for
splitting them here is? You already said "include arm_brbe.h in
arm_pmuv3.c", so that covers arm32 too.
James Clark Nov. 15, 2023, 10:07 a.m. UTC | #7
On 15/11/2023 07:22, Anshuman Khandual wrote:
> On 11/14/23 17:44, James Clark wrote:
>>
>>
>> On 14/11/2023 05:13, Anshuman Khandual wrote:
>> [...]
>>
>>> +/*
>>> + * This is a read only constant and safe during multi threaded access
>>> + */
>>> +static struct perf_branch_stack zero_branch_stack = { .nr = 0, .hw_idx = -1ULL};
>>> +
>>> +static void read_branch_records(struct pmu_hw_events *cpuc,
>>> +				struct perf_event *event,
>>> +				struct perf_sample_data *data,
>>> +				bool *branch_captured)
>>> +{
>>> +	/*
>>> +	 * CPU specific branch records buffer must have been allocated already
>>> +	 * for the hardware records to be captured and processed further.
>>> +	 */
>>> +	if (WARN_ON(!cpuc->branches))
>>> +		return;
>>> +
>>> +	/*
>>> +	 * Overflowed event's branch_sample_type does not match the configured
>>> +	 * branch filters in the BRBE HW. So the captured branch records here
>>> +	 * cannot be co-related to the overflowed event. Report to the user as
>>> +	 * if no branch records have been captured, and flush branch records.
>>> +	 * The same scenario is applicable when the current task context does
>>> +	 * not match with overflown event.
>>> +	 */
>>> +	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
>>> +	    (event->ctx->task && cpuc->brbe_context != event->ctx)) {
>>> +		perf_sample_save_brstack(data, event, &zero_branch_stack);
>>
>> Is there any benefit to outputting a zero size stack vs not outputting
>> anything at all?
> 
> The event has got PERF_SAMPLE_BRANCH_STACK marked and hence perf_sample_data
> must have PERF_SAMPLE_BRANCH_STACK with its br_stack pointing to the branch
> records. Hence without assigning a zeroed struct perf_branch_stack, there is
> a chance that perf_sample_data will pass on some garbage branch records to
> the ring buffer.
> 

I don't think that's an issue, the perf core code handles the case where
no branch stack exists on a sample. It even outputs the zero length for
you, but there is other stuff that can be skipped if you just never call
perf_sample_save_brstack():

from kernel/events/core.c:

 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
  if (data->br_stack) {
    size_t size;

    size = data->br_stack->nr
      * sizeof(struct perf_branch_entry);

    perf_output_put(handle, data->br_stack->nr);
    if (branch_sample_hw_index(event))
      perf_output_put(handle, data->br_stack->hw_idx);
    perf_output_copy(handle, data->br_stack->entries, size);
  } else {
    /*
     * we always store at least the value of nr
     */
    u64 nr = 0;
    perf_output_put(handle, nr);
  }
}


>>
>>> +		return;
>>> +	}
>>> +
>>> +	/*
>>> +	 * Read the branch records from the hardware once after the PMU IRQ
>>> +	 * has been triggered but subsequently same records can be used for
>>> +	 * other events that might have been overflowed simultaneously thus
>>> +	 * saving much CPU cycles.
>>> +	 */
>>> +	if (!*branch_captured) {
>>> +		armv8pmu_branch_read(cpuc, event);
>>> +		*branch_captured = true;
>>> +	}
>>> +	perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack);
>>> +}
>>> +
>>>  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>  {
>>>  	u32 pmovsr;
>>> @@ -766,6 +815,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>  	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
>>>  	struct pt_regs *regs;
>>>  	int idx;
>>> +	bool branch_captured = false;
>>>  
>>>  	/*
>>>  	 * Get and reset the IRQ flags
>>> @@ -809,6 +859,13 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>  		if (!armpmu_event_set_period(event))
>>>  			continue;
>>>  
>>> +		/*
>>> +		 * PMU IRQ should remain asserted until all branch records
>>> +		 * are captured and processed into struct perf_sample_data.
>>> +		 */
>>> +		if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
>>> +			read_branch_records(cpuc, event, &data, &branch_captured);
>>
>> You could return instead of using the out param, not really any
>> different, but maybe a bit more normal:
>>
>>   branch_captured |= read_branch_records(cpuc, event, &data,
>> branch_captured);
> 
> I am just wondering - how would that be any better?
> 

Maybe it wouldn't, but I suppose it's just the same way you don't write
returns like:

  armv8pmu_task_ctx_cache_alloc(cpu_pmu, &ret);

instead of:

  ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);

Out params can be hard to reason about sometimes. Maybe not in this case
though.

>>
>>> +
>>>  		/*
>>>  		 * Perf event overflow will queue the processing of the event as
>>>  		 * an irq_work which will be taken care of in the handling of
>>> @@ -818,6 +875,8 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>  			cpu_pmu->disable(event);
>>>  	}
>>>  	armv8pmu_start(cpu_pmu);
>>> +	if (cpu_pmu->has_branch_stack)
>>> +		armv8pmu_branch_reset();
>>>  
>>>  	return IRQ_HANDLED;
>>>  }
>>> @@ -907,6 +966,24 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>>>  	return event->hw.idx;
>>>  }
>>>  
>>> +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>>> +{
>>> +	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>>> +	void *task_ctx = pmu_ctx->task_ctx_data;
>>> +
>>> +	if (armpmu->has_branch_stack) {
>>> +		/* Save branch records in task_ctx on sched out */
>>> +		if (task_ctx && !sched_in) {
>>> +			armv8pmu_branch_save(armpmu, task_ctx);
>>> +			return;
>>> +		}
>>> +
>>> +		/* Reset branch records on sched in */
>>> +		if (sched_in)
>>> +			armv8pmu_branch_reset();
>>> +	}
>>> +}
>>> +
>>>  /*
>>>   * Add an event filter to a given event.
>>>   */
>>> @@ -977,6 +1054,9 @@ static void armv8pmu_reset(void *info)
>>>  		pmcr |= ARMV8_PMU_PMCR_LP;
>>>  
>>>  	armv8pmu_pmcr_write(pmcr);
>>> +
>>> +	if (cpu_pmu->has_branch_stack)
>>> +		armv8pmu_branch_reset();
>>>  }
>>>  
>>>  static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
>>> @@ -1014,6 +1094,20 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>>>  
>>>  	hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>>>  
>>> +	if (has_branch_stack(event)) {
>>> +		if (!armv8pmu_branch_attr_valid(event))
>>> +			return -EOPNOTSUPP;
>>> +
>>> +		/*
>>> +		 * If a task gets scheduled out, the current branch records
>>> +		 * get saved in the task's context data, which can be later
>>> +		 * used to fill in the records upon an event overflow. Let's
>>> +		 * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
>>> +		 * all branch stack sampling perf events.
>>> +		 */
>>> +		event->attach_state |= PERF_ATTACH_TASK_DATA;
>>> +	}
>>> +
>>>  	/*
>>>  	 * CHAIN events only work when paired with an adjacent counter, and it
>>>  	 * never makes sense for a user to open one in isolation, as they'll be
>>> @@ -1130,6 +1224,35 @@ static void __armv8pmu_probe_pmu(void *info)
>>>  		cpu_pmu->reg_pmmir = read_pmmir();
>>>  	else
>>>  		cpu_pmu->reg_pmmir = 0;
>>> +	armv8pmu_branch_probe(cpu_pmu);
>>
>> I'm not sure if this is splitting hairs or not, but
>> __armv8pmu_probe_pmu() is run on only one of 'any' of the supported CPUs
>> for this PMU.
> 
> Right.
> 
>>
>> Is it not possible to have some of those CPUs support and some not
>> support BRBE, even though they are all the same PMU type? Maybe we could
> 
> I am not sure, but not something I have come across.
> 
>> wait for it to explode with some weird system, or change it so that the
>> BRBE probe is run on every CPU, with a second 'supported_brbe_mask' field.
> 
> Right, but for now, the current solution looks sufficient.
> 

I suppose it means people will have to split their PMUs into ones that do
and don't support BRBE. I'm not sure if that's worth adding a comment in
the docs or if it's too obscure.

>>
>>> +}
>>> +
>>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>>> +{
>>> +	struct branch_records __percpu *records;
>>> +	int cpu;
>>> +
>>> +	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
>>> +	if (!records)
>>> +		return -ENOMEM;
>>> +
>>
>> Doesn't this technically need to take the CPU mask where BRBE is
>> supported into account? Otherwise you are allocating for cores that
>> never use it.
>>
>> Also it's done per-CPU _and_ per-PMU type, multiplying the number of
>> BRBE buffers allocated, even if they can only ever be used per-CPU.
> 
> Agreed, but I believe we have already been through this discussion, and
> settled on this method - for being a simpler approach.
> 
>>
>>> +	/*
>>> +	 * percpu memory allocated for 'records' gets completely consumed
>>> +	 * here, and never required to be freed up later. So permanently
>>> +	 * losing access to this anchor i.e 'records' is acceptable.
>>> +	 *
>>> +	 * Otherwise this allocation handle would have to be saved up for
>>> +	 * free_percpu() release later if required.
>>> +	 */
>>> +	for_each_possible_cpu(cpu) {
>>> +		struct pmu_hw_events *events_cpu;
>>> +		struct branch_records *records_cpu;
>>> +
>>> +		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
>>> +		records_cpu = per_cpu_ptr(records, cpu);
>>> +		events_cpu->branches = records_cpu;
>>> +	}
>>> +	return 0;
>>>  }
>>>  
>>>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>> @@ -1146,7 +1269,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>  	if (ret)
>>>  		return ret;
>>>  
>>> -	return probe.present ? 0 : -ENODEV;
>>> +	if (!probe.present)
>>> +		return -ENODEV;
>>> +
>>> +	if (cpu_pmu->has_branch_stack) {
>>> +		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
>>> +		if (ret)
>>> +			return ret;
>>> +
>>> +		ret = branch_records_alloc(cpu_pmu);
>>> +		if (ret) {
>>> +			armv8pmu_task_ctx_cache_free(cpu_pmu);
>>> +			return ret;
>>> +		}
>>> +	}
>>> +	return 0;
>>>  }
>>>  
>>
>> [...]
>>> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
>>> index 9c226adf938a..72da4522397c 100644
>>> --- a/include/linux/perf/arm_pmuv3.h
>>> +++ b/include/linux/perf/arm_pmuv3.h
>>> @@ -303,4 +303,50 @@
>>>  		}						\
>>>  	} while (0)
>>>  
>>> +struct pmu_hw_events;
>>> +struct arm_pmu;
>>> +struct perf_event;
>>> +
>>> +#ifdef CONFIG_PERF_EVENTS
>>
>> Very minor nit, but if you end up moving the stubs to the brbe header
>> you probably don't need the #ifdef CONFIG_PERF_EVENTS because it just
>> won't be included in that case.
> 
> Right, will drop CONFIG_PERF_EVENTS wrapper.
> 
>>
>>> +static inline void armv8pmu_branch_reset(void)
>>> +{
>>> +}
>>> +
>>> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
>>> +{
>>> +}
>>> +
>>> +static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
>>> +{
>>> +	WARN_ON_ONCE(!has_branch_stack(event));
>>> +	return false;
>>> +}
>>> +
>>> +static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
>>> +{
>>> +}
>>> +
>>> +static inline void armv8pmu_branch_disable(void)
>>> +{
>>> +}
>>> +
>>> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
>>> +					struct perf_event *event)
>>> +{
>>> +	WARN_ON_ONCE(!has_branch_stack(event));
>>> +}
>>> +
>>> +static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
>>> +{
>>> +}
>>> +
>>> +static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
>>> +{
>>> +	return 0;
>>> +}
>>> +
>>> +static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
>>> +{
>>> +}
>>> +#endif /* CONFIG_PERF_EVENTS */
>>>  #endif
Anshuman Khandual Nov. 21, 2023, 9:13 a.m. UTC | #8
On 11/15/23 15:07, James Clark wrote:
> 
> 
> On 15/11/2023 05:44, Anshuman Khandual wrote:
>> On 11/14/23 15:28, James Clark wrote:
>>>
>>>
>>> On 14/11/2023 05:13, Anshuman Khandual wrote:
>>>> Branch stack sampling support, i.e. capturing branch records during execution
>>>> in core perf, rides along with normal HW events being scheduled on the PMU.
>>>> This prepares the ARMV8 PMU framework for branch stack support on relevant
>>>> PMUs with the required HW implementation.
>>>>
>>>
>>> [...]
>>>
>>>> - All armv8pmu_branch_xxxx() stub definitions have been moved inside
>>>>   include/linux/perf/arm_pmuv3.h for easy access from both arm32 and
>>>>   arm64 platforms
>>>>
>>>
>>> This causes lots of W=1 build errors because the prototypes are in
>>> arm_pmuv3.h, but arm_brbe.c doesn't include it.
>>
>> I guess these are the W=1 warnings you mentioned above.
>>
>> drivers/perf/arm_brbe.c:11:6: warning: no previous prototype for ‘armv8pmu_branch_reset’ [-Wmissing-prototypes]
>>    11 | void armv8pmu_branch_reset(void)                                                                                                                                                                   
>>       |      ^~~~~~~~~~~~~~~~~~~~~                                                                                                                                                                         
>> drivers/perf/arm_brbe.c:190:6: warning: no previous prototype for ‘armv8pmu_branch_save’ [-Wmissing-prototypes]      
>>   190 | void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)                      
>>       |      ^~~~~~~~~~~~~~~~~~~~                                                                                                                                                                          
>> drivers/perf/arm_brbe.c:236:6: warning: no previous prototype for ‘armv8pmu_branch_attr_valid’ [-Wmissing-prototypes]
>>   236 | bool armv8pmu_branch_attr_valid(struct perf_event *event)                                                                                                                                          
>>       |      ^~~~~~~~~~~~~~~~~~~~~~~~~~                                                                                                                                                                    
>> drivers/perf/arm_brbe.c:269:5: warning: no previous prototype for ‘armv8pmu_task_ctx_cache_alloc’ [-Wmissing-prototypes]
>>   269 | int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)                            
>>       |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~                                                            
>> drivers/perf/arm_brbe.c:279:6: warning: no previous prototype for ‘armv8pmu_task_ctx_cache_free’ [-Wmissing-prototypes]
>>   279 | void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)                       
>>       |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~                                                            
>> drivers/perf/arm_brbe.c:303:6: warning: no previous prototype for ‘armv8pmu_branch_probe’ [-Wmissing-prototypes]
>>   303 | void armv8pmu_branch_probe(struct arm_pmu *armpmu)                                
>>       |      ^~~~~~~~~~~~~~~~~~~~~                                                                   
>> drivers/perf/arm_brbe.c:449:6: warning: no previous prototype for ‘armv8pmu_branch_enable’ [-Wmissing-prototypes]
>>   449 | void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)                             
>>       |      ^~~~~~~~~~~~~~~~~~~~~~                                                                  
>> drivers/perf/arm_brbe.c:474:6: warning: no previous prototype for ‘armv8pmu_branch_disable’ [-Wmissing-prototypes]
>>   474 | void armv8pmu_branch_disable(void)                                                           
>>       |      ^~~~~~~~~~~~~~~~~~~~~~~                                                                 
>> drivers/perf/arm_brbe.c:717:6: warning: no previous prototype for ‘armv8pmu_branch_read’ [-Wmissing-prototypes]
>>   717 | void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>>
>> Branch helpers are used in the ARM PMU V3 driver, i.e. drivers/perf/arm_pmuv3.c.
>> Either the actual BRBE helper definitions, or their fallback stubs (when
>> CONFIG_ARM64_BRBE is not enabled), need to be accessible from the arm_pmuv3.c
>> driver, not from the arm_brbe.c implementation itself.
>>
>>>
>>> It seems like the main reason you can't include arm_brbe.h in arm32 code
>>> is because there are a load of inline functions and references to
>>> registers in there. But these are only used in arm_brbe.c, so they don't
>>
>> Right, arm32 should not be exposed to BRBE internals via arm_brbe.h header.
>>
>>> need to be in the header anyway.
>>
>> Right, these are only used in arm_brbe.c
>>
>>>
>>> If you removed the code from the header and moved it to the source file
>>> you could move the brbe prototypes to the brbe header and it would be a
>>> bit cleaner and more idiomatic.
>>
>> Alright, how about the following changes - build tested on arm32 and arm64.
>>
>> - Move BRBE helpers from arm_brbe.h into arm_brbe.c
>> - Move armv8_pmu_xxx() declaration inside arm_brbe.h for arm64 (CONFIG_ARM64_BRBE)
>> - Move armv8_pmu_xxx() stub definitions inside arm_pmuv3.c for arm32 (!CONFIG_ARM64_BRBE)
>> - Include arm_brbe.h header both in arm_pmuv3.c and arm_brbe.c
> 
> Agree to them all except:
> 
>   - Move armv8_pmu_xxx() stub definitions inside arm_pmuv3.c for arm32
> (!CONFIG_ARM64_BRBE)
> 
> Normally you put the stubs right next to the prototypes with #else, so
> in this case both would be in arm_brbe.h. Not sure what the reason for
> splitting them here is? You already said "include arm_brbe.h in
> arm_pmuv3.c", so that covers arm32 too.

No particularly strong reason for the split as such; will move these
stubs to the header as well. The BRBE header includes <linux/perf/arm_pmu.h>,
which causes the following redefinition warning for the pr_fmt().

drivers/perf/arm_brbe.c:11: warning: "pr_fmt" redefined
   11 | #define pr_fmt(fmt) "brbe: " fmt
      | 
In file included from ./include/linux/kernel.h:31,
                 from ./include/linux/interrupt.h:6,
                 from ./include/linux/perf/arm_pmu.h:11,
                 from drivers/perf/arm_brbe.h:10,
                 from drivers/perf/arm_brbe.c:9:
./include/linux/printk.h:345: note: this is the location of the previous definition
  345 | #define pr_fmt(fmt) fmt

Although it should be okay to just drop this custom pr_fmt() from BRBE.
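
Alternatively, since <linux/printk.h> only defines pr_fmt() when it is not
already defined, moving the custom definition above all the #includes in
arm_brbe.c should also avoid the warning, i.e.:

	#define pr_fmt(fmt) "brbe: " fmt

	#include "arm_brbe.h"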
Anshuman Khandual Nov. 21, 2023, 9:57 a.m. UTC | #9
On 11/15/23 15:37, James Clark wrote:
> 
> 
> On 15/11/2023 07:22, Anshuman Khandual wrote:
>> On 11/14/23 17:44, James Clark wrote:
>>>
>>>
>>> On 14/11/2023 05:13, Anshuman Khandual wrote:
>>> [...]
>>>
>>>> +/*
>>>> + * This is a read only constant and safe during multi threaded access
>>>> + */
>>>> +static struct perf_branch_stack zero_branch_stack = { .nr = 0, .hw_idx = -1ULL};
>>>> +
>>>> +static void read_branch_records(struct pmu_hw_events *cpuc,
>>>> +				struct perf_event *event,
>>>> +				struct perf_sample_data *data,
>>>> +				bool *branch_captured)
>>>> +{
>>>> +	/*
>>>> +	 * CPU specific branch records buffer must have been allocated already
>>>> +	 * for the hardware records to be captured and processed further.
>>>> +	 */
>>>> +	if (WARN_ON(!cpuc->branches))
>>>> +		return;
>>>> +
>>>> +	/*
>>>> +	 * Overflowed event's branch_sample_type does not match the configured
>>>> +	 * branch filters in the BRBE HW. So the captured branch records here
>>>> +	 * cannot be co-related to the overflowed event. Report to the user as
>>>> +	 * if no branch records have been captured, and flush branch records.
>>>> +	 * The same scenario is applicable when the current task context does
>>>> +	 * not match with overflown event.
>>>> +	 */
>>>> +	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
>>>> +	    (event->ctx->task && cpuc->brbe_context != event->ctx)) {
>>>> +		perf_sample_save_brstack(data, event, &zero_branch_stack);
>>>
>>> Is there any benefit to outputting a zero size stack vs not outputting
>>> anything at all?
>>
>> The event has got PERF_SAMPLE_BRANCH_STACK marked and hence perf_sample_data
>> must have PERF_SAMPLE_BRANCH_STACK with it's br_stack pointing to the branch
>> records. Hence without assigning a zeroed struct perf_branch_stack, there is
>> a chance, that perf_sample_data will pass on some garbage branch records to
>> the ring buffer.
>>
> 
> I don't think that's an issue, the perf core code handles the case where
> no branch stack exists on a sample. It even outputs the zero length for
> you, but there is other stuff that can be skipped if you just never call
> perf_sample_save_brstack():

Sending out perf_sample_data without a valid data->br_stack seems problematic,
which would be the case when perf_sample_save_brstack() never gets called on
the perf_sample_data being prepared, depending instead on the below 'else'
case for pushing out zero records.

Alternatively - temporarily zeroing out cpuc->branches->branch_stack.nr
for the immediate perf_sample_save_brstack(), and then restoring it back to
its original value, might work as well. Remember it still holds valid records
for other qualifying events.

> 
> from kernel/events/core.c:
> 
>  if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
>   if (data->br_stack) {
>     size_t size;
> 
>     size = data->br_stack->nr
>       * sizeof(struct perf_branch_entry);
> 
>     perf_output_put(handle, data->br_stack->nr);
>     if (branch_sample_hw_index(event))
>       perf_output_put(handle, data->br_stack->hw_idx);
>     perf_output_copy(handle, data->br_stack->entries, size);
>   } else {
>     /*
>      * we always store at least the value of nr
>      */
>     u64 nr = 0;
>     perf_output_put(handle, nr);
>   }
> }
> 
> 
>>>
>>>> +		return;
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Read the branch records from the hardware once after the PMU IRQ
>>>> +	 * has been triggered but subsequently same records can be used for
>>>> +	 * other events that might have been overflowed simultaneously thus
>>>> +	 * saving much CPU cycles.
>>>> +	 */
>>>> +	if (!*branch_captured) {
>>>> +		armv8pmu_branch_read(cpuc, event);
>>>> +		*branch_captured = true;
>>>> +	}
>>>> +	perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack);
>>>> +}
>>>> +
>>>>  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>>  {
>>>>  	u32 pmovsr;
>>>> @@ -766,6 +815,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>>  	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
>>>>  	struct pt_regs *regs;
>>>>  	int idx;
>>>> +	bool branch_captured = false;
>>>>  
>>>>  	/*
>>>>  	 * Get and reset the IRQ flags
>>>> @@ -809,6 +859,13 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>>  		if (!armpmu_event_set_period(event))
>>>>  			continue;
>>>>  
>>>> +		/*
>>>> +		 * PMU IRQ should remain asserted until all branch records
>>>> +		 * are captured and processed into struct perf_sample_data.
>>>> +		 */
>>>> +		if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
>>>> +			read_branch_records(cpuc, event, &data, &branch_captured);
>>>
>>> You could return instead of using the out param, not really any
>>> different, but maybe a bit more normal:
>>>
>>>   branch_captured |= read_branch_records(cpuc, event, &data,
>>> branch_captured);
>>
>> I am just wondering - how that would be any better ?
>>
> 
> Maybe it wouldn't, but I suppose it's just the same way you don't write
> returns like:
> 
>   armv8pmu_task_ctx_cache_alloc(cpu_pmu, &ret);
> 
> instead of:
> 
>   ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
> 
> Out params can be hard to reason about sometimes. Maybe not in this case
> though.

The out parameter 'branch_captured' is checked inside read_branch_records()
to ascertain whether the BRBE records have already been captured inside the
buffer i.e cpuc->branches->branch_stack, in which case the processing can be
skipped (an optimization) for subsequent events in the session. Keeping this
parameter branch_captured just inside the caller i.e armv8pmu_handle_irq()
would not achieve that objective.

>>>
>>>> +
>>>>  		/*
>>>>  		 * Perf event overflow will queue the processing of the event as
>>>>  		 * an irq_work which will be taken care of in the handling of
>>>> @@ -818,6 +875,8 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>>  			cpu_pmu->disable(event);
>>>>  	}
>>>>  	armv8pmu_start(cpu_pmu);
>>>> +	if (cpu_pmu->has_branch_stack)
>>>> +		armv8pmu_branch_reset();
>>>>  
>>>>  	return IRQ_HANDLED;
>>>>  }
>>>> @@ -907,6 +966,24 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>>>>  	return event->hw.idx;
>>>>  }
>>>>  
>>>> +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>>>> +{
>>>> +	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>>>> +	void *task_ctx = pmu_ctx->task_ctx_data;
>>>> +
>>>> +	if (armpmu->has_branch_stack) {
>>>> +		/* Save branch records in task_ctx on sched out */
>>>> +		if (task_ctx && !sched_in) {
>>>> +			armv8pmu_branch_save(armpmu, task_ctx);
>>>> +			return;
>>>> +		}
>>>> +
>>>> +		/* Reset branch records on sched in */
>>>> +		if (sched_in)
>>>> +			armv8pmu_branch_reset();
>>>> +	}
>>>> +}
>>>> +
>>>>  /*
>>>>   * Add an event filter to a given event.
>>>>   */
>>>> @@ -977,6 +1054,9 @@ static void armv8pmu_reset(void *info)
>>>>  		pmcr |= ARMV8_PMU_PMCR_LP;
>>>>  
>>>>  	armv8pmu_pmcr_write(pmcr);
>>>> +
>>>> +	if (cpu_pmu->has_branch_stack)
>>>> +		armv8pmu_branch_reset();
>>>>  }
>>>>  
>>>>  static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
>>>> @@ -1014,6 +1094,20 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>>>>  
>>>>  	hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>>>>  
>>>> +	if (has_branch_stack(event)) {
>>>> +		if (!armv8pmu_branch_attr_valid(event))
>>>> +			return -EOPNOTSUPP;
>>>> +
>>>> +		/*
>>>> +		 * If a task gets scheduled out, the current branch records
>>>> +		 * get saved in the task's context data, which can be later
>>>> +		 * used to fill in the records upon an event overflow. Let's
>>>> +		 * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
>>>> +		 * all branch stack sampling perf events.
>>>> +		 */
>>>> +		event->attach_state |= PERF_ATTACH_TASK_DATA;
>>>> +	}
>>>> +
>>>>  	/*
>>>>  	 * CHAIN events only work when paired with an adjacent counter, and it
>>>>  	 * never makes sense for a user to open one in isolation, as they'll be
>>>> @@ -1130,6 +1224,35 @@ static void __armv8pmu_probe_pmu(void *info)
>>>>  		cpu_pmu->reg_pmmir = read_pmmir();
>>>>  	else
>>>>  		cpu_pmu->reg_pmmir = 0;
>>>> +	armv8pmu_branch_probe(cpu_pmu);
>>>
>>> I'm not sure if this is splitting hairs or not, but
>>> __armv8pmu_probe_pmu() is run on only one of 'any' of the supported CPUs
>>> for this PMU.
>>
>> Right.
>>
>>>
>>> Is it not possible to have some of those CPUs support and some not
>>> support BRBE, even though they are all the same PMU type? Maybe we could
>>
>> I am not sure, but not something I have come across.
>>
>>> wait for it to explode with some weird system, or change it so that the
>>> BRBE probe is run on every CPU, with a second 'supported_brbe_mask' field.
>>
>> Right, but for now, the current solutions looks sufficient.
>>
> 
> I suppose it means people will have to split their PMUs into ones that do
> and don't support BRBE. I'm not sure if that's worth adding a comment in
> the docs, or if it's too obscure.

Sure, can add that comment in brbe.rst. Also with debug enabled i.e wrapped
inside some debug config, it can be ascertained that all CPUs on a given ARM
PMU have BRBE with exactly the same properties.
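
A rough sketch of what such a debug-only check might look like - the
CONFIG_ARM64_BRBE_DEBUG symbol and the helper below are hypothetical,
and using BRBIDR0_EL1 as the property to compare is an assumption:

#ifdef CONFIG_ARM64_BRBE_DEBUG	/* hypothetical debug config */
static void brbe_check_homogeneity(void *info)
{
	u64 *probed_brbidr = info;

	/*
	 * BRBIDR0_EL1 describes the BRBE implementation on this CPU.
	 * Every CPU backing the ARM PMU is expected to match the value
	 * captured on the CPU that ran __armv8pmu_probe_pmu().
	 */
	WARN_ON(*probed_brbidr != read_sysreg_s(SYS_BRBIDR0_EL1));
}
#endif

.. invoked once after probing, something like:

	on_each_cpu_mask(&cpu_pmu->supported_cpus, brbe_check_homogeneity,
			 &probed_brbidr, true);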

> 
>>>
>>>> +}
>>>> +
>>>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>>>> +{
>>>> +	struct branch_records __percpu *records;
>>>> +	int cpu;
>>>> +
>>>> +	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
>>>> +	if (!records)
>>>> +		return -ENOMEM;
>>>> +
>>>
>>> Doesn't this technically need to take the CPU mask where BRBE is
>>> supported into account? Otherwise you are allocating for cores that
>>> never use it.
>>>
>>> Also it's done per-CPU _and_ per-PMU type, multiplying the number of
>>> BRBE buffers allocated, even if they can only ever be used per-CPU.
>>
>> Agreed, but I believe we have already been though this discussion, and
>> settled for this method - for being a simpler approach.
>>
>>>
>>>> +	/*
>>>> +	 * percpu memory allocated for 'records' gets completely consumed
>>>> +	 * here, and never required to be freed up later. So permanently
>>>> +	 * losing access to this anchor i.e 'records' is acceptable.
>>>> +	 *
>>>> +	 * Otherwise this allocation handle would have to be saved up for
>>>> +	 * free_percpu() release later if required.
>>>> +	 */
>>>> +	for_each_possible_cpu(cpu) {
>>>> +		struct pmu_hw_events *events_cpu;
>>>> +		struct branch_records *records_cpu;
>>>> +
>>>> +		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
>>>> +		records_cpu = per_cpu_ptr(records, cpu);
>>>> +		events_cpu->branches = records_cpu;
>>>> +	}
>>>> +	return 0;
>>>>  }
>>>>  
>>>>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>> @@ -1146,7 +1269,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>>  	if (ret)
>>>>  		return ret;
>>>>  
>>>> -	return probe.present ? 0 : -ENODEV;
>>>> +	if (!probe.present)
>>>> +		return -ENODEV;
>>>> +
>>>> +	if (cpu_pmu->has_branch_stack) {
>>>> +		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
>>>> +		if (ret)
>>>> +			return ret;
>>>> +
>>>> +		ret = branch_records_alloc(cpu_pmu);
>>>> +		if (ret) {
>>>> +			armv8pmu_task_ctx_cache_free(cpu_pmu);
>>>> +			return ret;
>>>> +		}
>>>> +	}
>>>> +	return 0;
>>>>  }
>>>>  
>>>
>>> [...]
>>>> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
>>>> index 9c226adf938a..72da4522397c 100644
>>>> --- a/include/linux/perf/arm_pmuv3.h
>>>> +++ b/include/linux/perf/arm_pmuv3.h
>>>> @@ -303,4 +303,50 @@
>>>>  		}						\
>>>>  	} while (0)
>>>>  
>>>> +struct pmu_hw_events;
>>>> +struct arm_pmu;
>>>> +struct perf_event;
>>>> +
>>>> +#ifdef CONFIG_PERF_EVENTS
>>>
>>> Very minor nit, but if you end up moving the stubs to the brbe header
>>> you probably don't need the #ifdef CONFIG_PERF_EVENTS because it just
>>> won't be included in that case.
>>
>> Right, will drop CONFIG_PERF_EVENTS wrapper.
>>
>>>
>>>> +static inline void armv8pmu_branch_reset(void)
>>>> +{
>>>> +}
>>>> +
>>>> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
>>>> +{
>>>> +}
>>>> +
>>>> +static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
>>>> +{
>>>> +	WARN_ON_ONCE(!has_branch_stack(event));
>>>> +	return false;
>>>> +}
>>>> +
>>>> +static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
>>>> +{
>>>> +}
>>>> +
>>>> +static inline void armv8pmu_branch_disable(void)
>>>> +{
>>>> +}
>>>> +
>>>> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
>>>> +					struct perf_event *event)
>>>> +{
>>>> +	WARN_ON_ONCE(!has_branch_stack(event));
>>>> +}
>>>> +
>>>> +static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
>>>> +{
>>>> +}
>>>> +
>>>> +static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
>>>> +{
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
>>>> +{
>>>> +}
>>>> +#endif /* CONFIG_PERF_EVENTS */
>>>>  #endif
James Clark Nov. 23, 2023, 12:35 p.m. UTC | #10
On 21/11/2023 09:57, Anshuman Khandual wrote:
> 
> 
> On 11/15/23 15:37, James Clark wrote:
>>
>>
>> On 15/11/2023 07:22, Anshuman Khandual wrote:
>>> On 11/14/23 17:44, James Clark wrote:
>>>>
>>>>
>>>> On 14/11/2023 05:13, Anshuman Khandual wrote:
>>>> [...]
>>>>
>>>>> +/*
>>>>> + * This is a read only constant and safe during multi threaded access
>>>>> + */
>>>>> +static struct perf_branch_stack zero_branch_stack = { .nr = 0, .hw_idx = -1ULL};
>>>>> +
>>>>> +static void read_branch_records(struct pmu_hw_events *cpuc,
>>>>> +				struct perf_event *event,
>>>>> +				struct perf_sample_data *data,
>>>>> +				bool *branch_captured)
>>>>> +{
>>>>> +	/*
>>>>> +	 * CPU specific branch records buffer must have been allocated already
>>>>> +	 * for the hardware records to be captured and processed further.
>>>>> +	 */
>>>>> +	if (WARN_ON(!cpuc->branches))
>>>>> +		return;
>>>>> +
>>>>> +	/*
>>>>> +	 * Overflowed event's branch_sample_type does not match the configured
>>>>> +	 * branch filters in the BRBE HW. So the captured branch records here
>>>>> +	 * cannot be co-related to the overflowed event. Report to the user as
>>>>> +	 * if no branch records have been captured, and flush branch records.
>>>>> +	 * The same scenario is applicable when the current task context does
>>>>> +	 * not match with overflown event.
>>>>> +	 */
>>>>> +	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
>>>>> +	    (event->ctx->task && cpuc->brbe_context != event->ctx)) {
>>>>> +		perf_sample_save_brstack(data, event, &zero_branch_stack);
>>>>
>>>> Is there any benefit to outputting a zero size stack vs not outputting
>>>> anything at all?
>>>
>>> The event has got PERF_SAMPLE_BRANCH_STACK marked and hence perf_sample_data
>>> must have PERF_SAMPLE_BRANCH_STACK with it's br_stack pointing to the branch
>>> records. Hence without assigning a zeroed struct perf_branch_stack, there is
>>> a chance, that perf_sample_data will pass on some garbage branch records to
>>> the ring buffer.
>>>
>>
>> I don't think that's an issue, the perf core code handles the case where
>> no branch stack exists on a sample. It even outputs the zero length for
>> you, but there is other stuff that can be skipped if you just never call
>> perf_sample_save_brstack():
> 
> Sending out perf_sample_data without a valid data->br_stack seems problematic,
> which would be the case when perf_sample_save_brstack() never gets called on
> the perf_sample_data being prepared, depending instead on the below 'else'
> case for pushing out zero records.
> 

I'm not following why it would be problematic. data->br_stack is
initialised to NULL in perf_prepare_sample() and the core code
specifically has a path that was added for the case where
perf_sample_save_brstack() was never called.

> Alternatively - temporarily zeroing out cpuc->branches->branch_stack.nr
> for the immediate perf_sample_save_brstack(), and then restoring it back to
> its original value, might work as well. Remember it still holds valid records
> for other qualifying events.
> 

Zeroing isn't required, br_stack is already zero initialised.

Not sure what you mean by valid records for other qualifying events? But
this is a per sample thing, and the output struct is wiped per sample.

>>
>> from kernel/events/core.c:
>>
>>  if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
>>   if (data->br_stack) {
>>     size_t size;
>>
>>     size = data->br_stack->nr
>>       * sizeof(struct perf_branch_entry);
>>
>>     perf_output_put(handle, data->br_stack->nr);
>>     if (branch_sample_hw_index(event))
>>       perf_output_put(handle, data->br_stack->hw_idx);
>>     perf_output_copy(handle, data->br_stack->entries, size);
>>   } else {
>>     /*
>>      * we always store at least the value of nr
>>      */
>>     u64 nr = 0;
>>     perf_output_put(handle, nr);
>>   }
>> }
>>
>>
>>>>
>>>>> +		return;
>>>>> +	}
>>>>> +
>>>>> +	/*
>>>>> +	 * Read the branch records from the hardware once after the PMU IRQ
>>>>> +	 * has been triggered but subsequently same records can be used for
>>>>> +	 * other events that might have been overflowed simultaneously thus
>>>>> +	 * saving much CPU cycles.
>>>>> +	 */
>>>>> +	if (!*branch_captured) {
>>>>> +		armv8pmu_branch_read(cpuc, event);
>>>>> +		*branch_captured = true;
>>>>> +	}
>>>>> +	perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack);
>>>>> +}
>>>>> +
>>>>>  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>>>  {
>>>>>  	u32 pmovsr;
>>>>> @@ -766,6 +815,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>>>  	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
>>>>>  	struct pt_regs *regs;
>>>>>  	int idx;
>>>>> +	bool branch_captured = false;
>>>>>  
>>>>>  	/*
>>>>>  	 * Get and reset the IRQ flags
>>>>> @@ -809,6 +859,13 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>>>  		if (!armpmu_event_set_period(event))
>>>>>  			continue;
>>>>>  
>>>>> +		/*
>>>>> +		 * PMU IRQ should remain asserted until all branch records
>>>>> +		 * are captured and processed into struct perf_sample_data.
>>>>> +		 */
>>>>> +		if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
>>>>> +			read_branch_records(cpuc, event, &data, &branch_captured);
>>>>
>>>> You could return instead of using the out param, not really any
>>>> different, but maybe a bit more normal:
>>>>
>>>>   branch_captured |= read_branch_records(cpuc, event, &data,
>>>> branch_captured);
>>>
>>> I am just wondering - how that would be any better ?
>>>
>>
>> Maybe it wouldn't, but I suppose it's just the same way you don't write
>> returns like:
>>
>>   armv8pmu_task_ctx_cache_alloc(cpu_pmu, &ret);
>>
>> instead of:
>>
>>   ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
>>
>> Out params can be hard to reason about sometimes. Maybe not in this case
>> though.
> 
> The out parameter 'branch_captured' is checked inside read_branch_records()
> to ascertain whether the BRBE records have already been captured inside the
> buffer i.e cpuc->branches->branch_stack, in which case the processing can be
> skipped (an optimization) for subsequent events in the session. Keeping this
> parameter branch_captured just inside the caller i.e armv8pmu_handle_irq()
> would not achieve that objective.
> 

No, it would achieve the objective, and it's exactly the same. The flag is
also arguably two different things: one is whether any output was generated,
and the other is whether to skip, so having a return value allows you to
give the variables two different names.

 skip_output |= read_branch_records(cpuc, event, &data, skip_output);

And on the inside:

 bool read_branch_records(..., bool skip_output)
 {
    bool records_output = false;

    if (thing && !skip_output) {
      output_records();
      records_output = true;
    }

   return records_output;
  }

Either way, I'm not that bothered about this one, I was just mentioning
that the out parameter was a bit weird. But to say you can't accomplish
the same thing isn't right.

>>>>
>>>>> +
>>>>>  		/*
>>>>>  		 * Perf event overflow will queue the processing of the event as
>>>>>  		 * an irq_work which will be taken care of in the handling of
>>>>> @@ -818,6 +875,8 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>>>>  			cpu_pmu->disable(event);
>>>>>  	}
>>>>>  	armv8pmu_start(cpu_pmu);
>>>>> +	if (cpu_pmu->has_branch_stack)
>>>>> +		armv8pmu_branch_reset();
>>>>>  
>>>>>  	return IRQ_HANDLED;
>>>>>  }
>>>>> @@ -907,6 +966,24 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>>>>>  	return event->hw.idx;
>>>>>  }
>>>>>  
>>>>> +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>>>>> +{
>>>>> +	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>>>>> +	void *task_ctx = pmu_ctx->task_ctx_data;
>>>>> +
>>>>> +	if (armpmu->has_branch_stack) {
>>>>> +		/* Save branch records in task_ctx on sched out */
>>>>> +		if (task_ctx && !sched_in) {
>>>>> +			armv8pmu_branch_save(armpmu, task_ctx);
>>>>> +			return;
>>>>> +		}
>>>>> +
>>>>> +		/* Reset branch records on sched in */
>>>>> +		if (sched_in)
>>>>> +			armv8pmu_branch_reset();
>>>>> +	}
>>>>> +}
>>>>> +
>>>>>  /*
>>>>>   * Add an event filter to a given event.
>>>>>   */
>>>>> @@ -977,6 +1054,9 @@ static void armv8pmu_reset(void *info)
>>>>>  		pmcr |= ARMV8_PMU_PMCR_LP;
>>>>>  
>>>>>  	armv8pmu_pmcr_write(pmcr);
>>>>> +
>>>>> +	if (cpu_pmu->has_branch_stack)
>>>>> +		armv8pmu_branch_reset();
>>>>>  }
>>>>>  
>>>>>  static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
>>>>> @@ -1014,6 +1094,20 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>>>>>  
>>>>>  	hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>>>>>  
>>>>> +	if (has_branch_stack(event)) {
>>>>> +		if (!armv8pmu_branch_attr_valid(event))
>>>>> +			return -EOPNOTSUPP;
>>>>> +
>>>>> +		/*
>>>>> +		 * If a task gets scheduled out, the current branch records
>>>>> +		 * get saved in the task's context data, which can be later
>>>>> +		 * used to fill in the records upon an event overflow. Let's
>>>>> +		 * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
>>>>> +		 * all branch stack sampling perf events.
>>>>> +		 */
>>>>> +		event->attach_state |= PERF_ATTACH_TASK_DATA;
>>>>> +	}
>>>>> +
>>>>>  	/*
>>>>>  	 * CHAIN events only work when paired with an adjacent counter, and it
>>>>>  	 * never makes sense for a user to open one in isolation, as they'll be
>>>>> @@ -1130,6 +1224,35 @@ static void __armv8pmu_probe_pmu(void *info)
>>>>>  		cpu_pmu->reg_pmmir = read_pmmir();
>>>>>  	else
>>>>>  		cpu_pmu->reg_pmmir = 0;
>>>>> +	armv8pmu_branch_probe(cpu_pmu);
>>>>
>>>> I'm not sure if this is splitting hairs or not, but
>>>> __armv8pmu_probe_pmu() is run on only one of 'any' of the supported CPUs
>>>> for this PMU.
>>>
>>> Right.
>>>
>>>>
>>>> Is it not possible to have some of those CPUs support and some not
>>>> support BRBE, even though they are all the same PMU type? Maybe we could
>>>
>>> I am not sure, but not something I have come across.
>>>
>>>> wait for it to explode with some weird system, or change it so that the
>>>> BRBE probe is run on every CPU, with a second 'supported_brbe_mask' field.
>>>
>>> Right, but for now, the current solutions looks sufficient.
>>>
>>
>> I suppose it means people will have to split their PMUs into ones that do
>> and don't support BRBE. I'm not sure if that's worth adding a comment in
>> the docs, or if it's too obscure.
> 
> Sure, can add that comment in brbe.rst. Also with debug enabled i.e wrapped
> inside some debug config, it can be ascertained that all CPUs on a given ARM
> PMU have BRBE with exactly the same properties.
> 
>>
>>>>
>>>>> +}
>>>>> +
>>>>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>>>>> +{
>>>>> +	struct branch_records __percpu *records;
>>>>> +	int cpu;
>>>>> +
>>>>> +	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
>>>>> +	if (!records)
>>>>> +		return -ENOMEM;
>>>>> +
>>>>
>>>> Doesn't this technically need to take the CPU mask where BRBE is
>>>> supported into account? Otherwise you are allocating for cores that
>>>> never use it.
>>>>
>>>> Also it's done per-CPU _and_ per-PMU type, multiplying the number of
>>>> BRBE buffers allocated, even if they can only ever be used per-CPU.
>>>
>>> Agreed, but I believe we have already been though this discussion, and
>>> settled for this method - for being a simpler approach.
>>>
>>>>
>>>>> +	/*
>>>>> +	 * percpu memory allocated for 'records' gets completely consumed
>>>>> +	 * here, and never required to be freed up later. So permanently
>>>>> +	 * losing access to this anchor i.e 'records' is acceptable.
>>>>> +	 *
>>>>> +	 * Otherwise this allocation handle would have to be saved up for
>>>>> +	 * free_percpu() release later if required.
>>>>> +	 */
>>>>> +	for_each_possible_cpu(cpu) {
>>>>> +		struct pmu_hw_events *events_cpu;
>>>>> +		struct branch_records *records_cpu;
>>>>> +
>>>>> +		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
>>>>> +		records_cpu = per_cpu_ptr(records, cpu);
>>>>> +		events_cpu->branches = records_cpu;
>>>>> +	}
>>>>> +	return 0;
>>>>>  }
>>>>>  
>>>>>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>>> @@ -1146,7 +1269,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>>>  	if (ret)
>>>>>  		return ret;
>>>>>  
>>>>> -	return probe.present ? 0 : -ENODEV;
>>>>> +	if (!probe.present)
>>>>> +		return -ENODEV;
>>>>> +
>>>>> +	if (cpu_pmu->has_branch_stack) {
>>>>> +		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
>>>>> +		if (ret)
>>>>> +			return ret;
>>>>> +
>>>>> +		ret = branch_records_alloc(cpu_pmu);
>>>>> +		if (ret) {
>>>>> +			armv8pmu_task_ctx_cache_free(cpu_pmu);
>>>>> +			return ret;
>>>>> +		}
>>>>> +	}
>>>>> +	return 0;
>>>>>  }
>>>>>  
>>>>
>>>> [...]
>>>>> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
>>>>> index 9c226adf938a..72da4522397c 100644
>>>>> --- a/include/linux/perf/arm_pmuv3.h
>>>>> +++ b/include/linux/perf/arm_pmuv3.h
>>>>> @@ -303,4 +303,50 @@
>>>>>  		}						\
>>>>>  	} while (0)
>>>>>  
>>>>> +struct pmu_hw_events;
>>>>> +struct arm_pmu;
>>>>> +struct perf_event;
>>>>> +
>>>>> +#ifdef CONFIG_PERF_EVENTS
>>>>
>>>> Very minor nit, but if you end up moving the stubs to the brbe header
>>>> you probably don't need the #ifdef CONFIG_PERF_EVENTS because it just
>>>> won't be included in that case.
>>>
>>> Right, will drop CONFIG_PERF_EVENTS wrapper.
>>>
>>>>
>>>>> +static inline void armv8pmu_branch_reset(void)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
>>>>> +{
>>>>> +	WARN_ON_ONCE(!has_branch_stack(event));
>>>>> +	return false;
>>>>> +}
>>>>> +
>>>>> +static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static inline void armv8pmu_branch_disable(void)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
>>>>> +					struct perf_event *event)
>>>>> +{
>>>>> +	WARN_ON_ONCE(!has_branch_stack(event));
>>>>> +}
>>>>> +
>>>>> +static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
>>>>> +{
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> +static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
>>>>> +{
>>>>> +}
>>>>> +#endif /* CONFIG_PERF_EVENTS */
>>>>>  #endif
Anshuman Khandual Nov. 27, 2023, 8:06 a.m. UTC | #11
On 11/23/23 18:05, James Clark wrote:
> 
> 
> On 21/11/2023 09:57, Anshuman Khandual wrote:
>>
>>
>> On 11/15/23 15:37, James Clark wrote:
>>>
>>>
>>> On 15/11/2023 07:22, Anshuman Khandual wrote:
>>>> On 11/14/23 17:44, James Clark wrote:
>>>>>
>>>>>
>>>>> On 14/11/2023 05:13, Anshuman Khandual wrote:
>>>>> [...]
>>>>>
>>>>>> +/*
>>>>>> + * This is a read only constant and safe during multi threaded access
>>>>>> + */
>>>>>> +static struct perf_branch_stack zero_branch_stack = { .nr = 0, .hw_idx = -1ULL};
>>>>>> +
>>>>>> +static void read_branch_records(struct pmu_hw_events *cpuc,
>>>>>> +				struct perf_event *event,
>>>>>> +				struct perf_sample_data *data,
>>>>>> +				bool *branch_captured)
>>>>>> +{
>>>>>> +	/*
>>>>>> +	 * CPU specific branch records buffer must have been allocated already
>>>>>> +	 * for the hardware records to be captured and processed further.
>>>>>> +	 */
>>>>>> +	if (WARN_ON(!cpuc->branches))
>>>>>> +		return;
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * Overflowed event's branch_sample_type does not match the configured
>>>>>> +	 * branch filters in the BRBE HW. So the captured branch records here
>>>>>> +	 * cannot be co-related to the overflowed event. Report to the user as
>>>>>> +	 * if no branch records have been captured, and flush branch records.
>>>>>> +	 * The same scenario is applicable when the current task context does
>>>>>> +	 * not match with overflown event.
>>>>>> +	 */
>>>>>> +	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
>>>>>> +	    (event->ctx->task && cpuc->brbe_context != event->ctx)) {
>>>>>> +		perf_sample_save_brstack(data, event, &zero_branch_stack);
>>>>>
>>>>> Is there any benefit to outputting a zero size stack vs not outputting
>>>>> anything at all?
>>>>
>>>> The event has got PERF_SAMPLE_BRANCH_STACK marked and hence perf_sample_data
>>>> must have PERF_SAMPLE_BRANCH_STACK with it's br_stack pointing to the branch
>>>> records. Hence without assigning a zeroed struct perf_branch_stack, there is
>>>> a chance, that perf_sample_data will pass on some garbage branch records to
>>>> the ring buffer.
>>>>
>>>
>>> I don't think that's an issue, the perf core code handles the case where
>>> no branch stack exists on a sample. It even outputs the zero length for
>>> you, but there is other stuff that can be skipped if you just never call
>>> perf_sample_save_brstack():
>>
>> Sending out perf_sample_data without a valid data->br_stack seems problematic,
>> which would be the case when perf_sample_save_brstack() never gets called on
>> the perf_sample_data being prepared, depending instead on the below 'else'
>> case for pushing out zero records.
>>
> 
> I'm not following why it would be problematic. data->br_stack is
> initialised to NULL in perf_prepare_sample() and the core code
> specifically has a path that was added for the case where
> perf_sample_save_brstack() was never called.

Without perf_sample_save_brstack() being called, the perf sample data will
preserve 'data->br_stack' unchanged as NULL from perf_prepare_sample().
The 'data->br_stack' contents are then simply skipped in perf_output_sample(),
which only emits nr = 0.

void perf_prepare_sample(struct perf_sample_data *data,
                         struct perf_event *event,
                         struct pt_regs *regs)
{
	....
        if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
                data->br_stack = NULL;
                data->dyn_size += sizeof(u64);
                data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
        }
	....
}

void perf_output_sample(struct perf_output_handle *handle,
                        struct perf_event_header *header,
                        struct perf_sample_data *data,
                        struct perf_event *event)
{
	....
        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
                if (data->br_stack) {
                        size_t size;

                        size = data->br_stack->nr
                             * sizeof(struct perf_branch_entry);

                        perf_output_put(handle, data->br_stack->nr);
                        if (branch_sample_hw_index(event))
                                perf_output_put(handle, data->br_stack->hw_idx);
                        perf_output_copy(handle, data->br_stack->entries, size);
                } else {
                        /*
                         * we always store at least the value of nr
                         */
                        u64 nr = 0;
                        perf_output_put(handle, nr);
                }
        }
	....
}
Anshuman Khandual Nov. 30, 2023, 3:58 a.m. UTC | #12
On 11/14/23 22:40, James Clark wrote:
> 
> On 14/11/2023 05:13, Anshuman Khandual wrote:
> [...]
>> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
>> index d712a19e47ac..76f1376ae594 100644
>> --- a/drivers/perf/arm_pmu.c
>> +++ b/drivers/perf/arm_pmu.c
>> @@ -317,6 +317,15 @@ armpmu_del(struct perf_event *event, int flags)
>>  	struct hw_perf_event *hwc = &event->hw;
>>  	int idx = hwc->idx;
>>  
>> +	if (has_branch_stack(event)) {
>> +		WARN_ON_ONCE(!hw_events->brbe_users);
>> +		hw_events->brbe_users--;
>> +		if (!hw_events->brbe_users) {
>> +			hw_events->brbe_context = NULL;
>> +			hw_events->brbe_sample_type = 0;
>> +		}
>> +	}
>> +
>>  	armpmu_stop(event, PERF_EF_UPDATE);
>>  	hw_events->events[idx] = NULL;
>>  	armpmu->clear_event_idx(hw_events, event);
>> @@ -333,6 +342,22 @@ armpmu_add(struct perf_event *event, int flags)
>>  	struct hw_perf_event *hwc = &event->hw;
>>  	int idx;
>>  
>> +	if (has_branch_stack(event)) {
>> +		/*
>> +		 * Reset branch records buffer if a new task event gets
>> +		 * scheduled on a PMU which might have existing records.
>> +		 * Otherwise older branch records present in the buffer
>> +		 * might leak into the new task event.
>> +		 */
>> +		if (event->ctx->task && hw_events->brbe_context != event->ctx) {
>> +			hw_events->brbe_context = event->ctx;
>> +			if (armpmu->branch_reset)
>> +				armpmu->branch_reset();
> What about a per-thread event following a per-cpu event? Doesn't that
> also need to branch_reset()? If hw_events->brbe_context was already
> previously assigned, once the per-thread event is switched in it skips
> this reset following a per-cpu event on the same core.

Right, guess it is a real possibility. How about folding in something like ..

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 76f1376ae594..15bb80823ae6 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -343,6 +343,22 @@ armpmu_add(struct perf_event *event, int flags)
        int idx;
 
        if (has_branch_stack(event)) {
+               /*
+                * Reset branch records buffer if a new CPU bound event
+                * gets scheduled on a PMU. Otherwise existing branch
+                * records present in the buffer might just leak into
+                * such events.
+                *
+                * Also reset current 'hw_events->brbe_context' because
+                * any previous task bound event now would have lost an
+                * opportunity for continuous branch records.
+                */
+               if (!event->ctx->task) {
+                       hw_events->brbe_context = NULL;
+                       if (armpmu->branch_reset)
+                               armpmu->branch_reset();
+               }
+
                /*
                 * Reset branch records buffer if a new task event gets
                 * scheduled on a PMU which might have existing records.

Patch

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index d712a19e47ac..76f1376ae594 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -317,6 +317,15 @@  armpmu_del(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
 
+	if (has_branch_stack(event)) {
+		WARN_ON_ONCE(!hw_events->brbe_users);
+		hw_events->brbe_users--;
+		if (!hw_events->brbe_users) {
+			hw_events->brbe_context = NULL;
+			hw_events->brbe_sample_type = 0;
+		}
+	}
+
 	armpmu_stop(event, PERF_EF_UPDATE);
 	hw_events->events[idx] = NULL;
 	armpmu->clear_event_idx(hw_events, event);
@@ -333,6 +342,22 @@  armpmu_add(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 	int idx;
 
+	if (has_branch_stack(event)) {
+		/*
+		 * Reset branch records buffer if a new task event gets
+		 * scheduled on a PMU which might have existing records.
+		 * Otherwise older branch records present in the buffer
+		 * might leak into the new task event.
+		 */
+		if (event->ctx->task && hw_events->brbe_context != event->ctx) {
+			hw_events->brbe_context = event->ctx;
+			if (armpmu->branch_reset)
+				armpmu->branch_reset();
+		}
+		hw_events->brbe_users++;
+		hw_events->brbe_sample_type = event->attr.branch_sample_type;
+	}
+
 	/* An event following a process won't be stopped earlier */
 	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
 		return -ENOENT;
@@ -512,13 +537,24 @@  static int armpmu_event_init(struct perf_event *event)
 		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
 		return -ENOENT;
 
-	/* does not support taken branch sampling */
-	if (has_branch_stack(event))
+	/*
+	 * Branch stack sampling events are allowed
+	 * only on PMU which has required support.
+	 */
+	if (has_branch_stack(event) && !armpmu->has_branch_stack)
 		return -EOPNOTSUPP;
 
 	return __hw_perf_event_init(event);
 }
 
+static void armpmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
+
+	if (armpmu->sched_task)
+		armpmu->sched_task(pmu_ctx, sched_in);
+}
+
 static void armpmu_enable(struct pmu *pmu)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(pmu);
@@ -865,6 +901,7 @@  struct arm_pmu *armpmu_alloc(void)
 	}
 
 	pmu->pmu = (struct pmu) {
+		.sched_task	= armpmu_sched_task,
 		.pmu_enable	= armpmu_enable,
 		.pmu_disable	= armpmu_disable,
 		.event_init	= armpmu_event_init,
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 6ca7be05229c..7f973c77a95e 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -751,14 +751,63 @@  static void armv8pmu_start(struct arm_pmu *cpu_pmu)
 	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
 
 	kvm_vcpu_pmu_resync_el0();
+	if (cpu_pmu->has_branch_stack)
+		armv8pmu_branch_enable(cpu_pmu);
 }
 
 static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
 {
+	if (cpu_pmu->has_branch_stack)
+		armv8pmu_branch_disable();
+
 	/* Disable all counters */
 	armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
 }
 
+/*
+ * This is a read only constant and safe during multi threaded access
+ */
+static struct perf_branch_stack zero_branch_stack = { .nr = 0, .hw_idx = -1ULL};
+
+static void read_branch_records(struct pmu_hw_events *cpuc,
+				struct perf_event *event,
+				struct perf_sample_data *data,
+				bool *branch_captured)
+{
+	/*
+	 * CPU specific branch records buffer must have been allocated already
+	 * for the hardware records to be captured and processed further.
+	 */
+	if (WARN_ON(!cpuc->branches))
+		return;
+
+	/*
+	 * Overflowed event's branch_sample_type does not match the configured
+	 * branch filters in the BRBE HW. So the captured branch records here
+	 * cannot be co-related to the overflowed event. Report to the user as
+	 * if no branch records have been captured, and flush branch records.
+	 * The same scenario is applicable when the current task context does
+	 * not match with overflown event.
+	 */
+	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
+	    (event->ctx->task && cpuc->brbe_context != event->ctx)) {
+		perf_sample_save_brstack(data, event, &zero_branch_stack);
+		return;
+	}
+
+	/*
+	 * Read the branch records from the hardware once after the PMU IRQ
+	 * has been triggered but subsequently same records can be used for
+	 * other events that might have been overflowed simultaneously thus
+	 * saving much CPU cycles.
+	 */
+	if (!*branch_captured) {
+		armv8pmu_branch_read(cpuc, event);
+		*branch_captured = true;
+	}
+	perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack);
+}
+
 static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 {
 	u32 pmovsr;
@@ -766,6 +815,7 @@  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
 	struct pt_regs *regs;
 	int idx;
+	bool branch_captured = false;
 
 	/*
 	 * Get and reset the IRQ flags
@@ -809,6 +859,13 @@  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 		if (!armpmu_event_set_period(event))
 			continue;
 
+		/*
+		 * PMU IRQ should remain asserted until all branch records
+		 * are captured and processed into struct perf_sample_data.
+		 */
+		if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
+			read_branch_records(cpuc, event, &data, &branch_captured);
+
 		/*
 		 * Perf event overflow will queue the processing of the event as
 		 * an irq_work which will be taken care of in the handling of
@@ -818,6 +875,8 @@  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 			cpu_pmu->disable(event);
 	}
 	armv8pmu_start(cpu_pmu);
+	if (cpu_pmu->has_branch_stack)
+		armv8pmu_branch_reset();
 
 	return IRQ_HANDLED;
 }
@@ -907,6 +966,24 @@  static int armv8pmu_user_event_idx(struct perf_event *event)
 	return event->hw.idx;
 }
 
+static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
+	void *task_ctx = pmu_ctx->task_ctx_data;
+
+	if (armpmu->has_branch_stack) {
+		/* Save branch records in task_ctx on sched out */
+		if (task_ctx && !sched_in) {
+			armv8pmu_branch_save(armpmu, task_ctx);
+			return;
+		}
+
+		/* Reset branch records on sched in */
+		if (sched_in)
+			armv8pmu_branch_reset();
+	}
+}
+
 /*
  * Add an event filter to a given event.
  */
@@ -977,6 +1054,9 @@  static void armv8pmu_reset(void *info)
 		pmcr |= ARMV8_PMU_PMCR_LP;
 
 	armv8pmu_pmcr_write(pmcr);
+
+	if (cpu_pmu->has_branch_stack)
+		armv8pmu_branch_reset();
 }
 
 static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
@@ -1014,6 +1094,20 @@  static int __armv8_pmuv3_map_event(struct perf_event *event,
 
 	hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
 
+	if (has_branch_stack(event)) {
+		if (!armv8pmu_branch_attr_valid(event))
+			return -EOPNOTSUPP;
+
+		/*
+		 * If a task gets scheduled out, the current branch records
+		 * get saved in the task's context data, which can be later
+		 * used to fill in the records upon an event overflow. Let's
+		 * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
+		 * all branch stack sampling perf events.
+		 */
+		event->attach_state |= PERF_ATTACH_TASK_DATA;
+	}
+
 	/*
 	 * CHAIN events only work when paired with an adjacent counter, and it
 	 * never makes sense for a user to open one in isolation, as they'll be
@@ -1130,6 +1224,35 @@  static void __armv8pmu_probe_pmu(void *info)
 		cpu_pmu->reg_pmmir = read_pmmir();
 	else
 		cpu_pmu->reg_pmmir = 0;
+	armv8pmu_branch_probe(cpu_pmu);
+}
+
+static int branch_records_alloc(struct arm_pmu *armpmu)
+{
+	struct branch_records __percpu *records;
+	int cpu;
+
+	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
+	if (!records)
+		return -ENOMEM;
+
+	/*
+	 * percpu memory allocated for 'records' gets completely consumed
+	 * here, and never required to be freed up later. So permanently
+	 * losing access to this anchor i.e 'records' is acceptable.
+	 *
+	 * Otherwise this allocation handle would have to be saved up for
+	 * free_percpu() release later if required.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct pmu_hw_events *events_cpu;
+		struct branch_records *records_cpu;
+
+		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
+		records_cpu = per_cpu_ptr(records, cpu);
+		events_cpu->branches = records_cpu;
+	}
+	return 0;
 }
 
 static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
@@ -1146,7 +1269,21 @@  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
 	if (ret)
 		return ret;
 
-	return probe.present ? 0 : -ENODEV;
+	if (!probe.present)
+		return -ENODEV;
+
+	if (cpu_pmu->has_branch_stack) {
+		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
+		if (ret)
+			return ret;
+
+		ret = branch_records_alloc(cpu_pmu);
+		if (ret) {
+			armv8pmu_task_ctx_cache_free(cpu_pmu);
+			return ret;
+		}
+	}
+	return 0;
 }
 
 static void armv8pmu_disable_user_access_ipi(void *unused)
@@ -1205,6 +1342,8 @@  static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
 
 	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
+	cpu_pmu->sched_task		= armv8pmu_sched_task;
+	cpu_pmu->branch_reset		= armv8pmu_branch_reset;
 
 	cpu_pmu->name			= name;
 	cpu_pmu->map_event		= map_event;
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 143fbc10ecfe..a489fdf163b4 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -46,6 +46,18 @@  static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_63BIT) == ARMPMU_EVT_63BIT);
 	},								\
 }
 
+/*
+ * Maximum branch record entries which could be processed
+ * for core perf branch stack sampling support, regardless
+ * of the hardware support available on a given ARM PMU.
+ */
+#define MAX_BRANCH_RECORDS 64
+
+struct branch_records {
+	struct perf_branch_stack	branch_stack;
+	struct perf_branch_entry	branch_entries[MAX_BRANCH_RECORDS];
+};
+
 /* The events for a given PMU register set. */
 struct pmu_hw_events {
 	/*
@@ -72,6 +84,17 @@  struct pmu_hw_events {
 	struct arm_pmu		*percpu_pmu;
 
 	int irq;
+
+	struct branch_records	*branches;
+
+	/* Active context for task events */
+	void			*brbe_context;
+
+	/* Active events requesting branch records */
+	unsigned int		brbe_users;
+
+	/* Active branch sample type filters */
+	unsigned long		brbe_sample_type;
 };
 
 enum armpmu_attr_groups {
@@ -102,8 +125,12 @@  struct arm_pmu {
 	void		(*stop)(struct arm_pmu *);
 	void		(*reset)(void *);
 	int		(*map_event)(struct perf_event *event);
+	void		(*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
+	void		(*branch_reset)(void);
 	int		num_events;
-	bool		secure_access; /* 32-bit ARM only */
+	unsigned int	secure_access	: 1, /* 32-bit ARM only */
+			has_branch_stack: 1, /* 64-bit ARM only */
+			reserved	: 30;
 #define ARMV8_PMUV3_MAX_COMMON_EVENTS		0x40
 	DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
 #define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE	0x4000
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 9c226adf938a..72da4522397c 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -303,4 +303,50 @@ 
 		}						\
 	} while (0)
 
+struct pmu_hw_events;
+struct arm_pmu;
+struct perf_event;
+
+#ifdef CONFIG_PERF_EVENTS
+static inline void armv8pmu_branch_reset(void)
+{
+}
+
+static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
+{
+}
+
+static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
+{
+	WARN_ON_ONCE(!has_branch_stack(event));
+	return false;
+}
+
+static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
+{
+}
+
+static inline void armv8pmu_branch_disable(void)
+{
+}
+
+static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
+					struct perf_event *event)
+{
+	WARN_ON_ONCE(!has_branch_stack(event));
+}
+
+static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
+{
+}
+
+static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
+{
+	return 0;
+}
+
+static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
+{
+}
+#endif /* CONFIG_PERF_EVENTS */
 #endif