
[v7,1/5] drm/panthor: introduce job cycle and timestamp accounting

Message ID 20240920234436.207563-2-adrian.larumbe@collabora.com (mailing list archive)
State New, archived
Series: Support fdinfo runtime and memory stats on Panthor

Commit Message

Adrián Larumbe Sept. 20, 2024, 11:43 p.m. UTC
Enable calculations of job submission times in clock cycles and wall
time. This is done by expanding the boilerplate command stream when running
a job to include instructions that compute said times right before and
after a user CS.

A separate kernel BO is created per queue to store those values. Jobs can
access their sampled data through an index different from that of the
queue's ringbuffer. The reason for this is to save memory in the profiling
information kernel BO, since the number of profiled jobs that fit
simultaneously in the queue's ringbuffer might be much smaller than the
number of regular jobs, as the former take more CSF instructions.
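
As an illustration of the two indices involved, here is a minimal sketch (condensed
from the queue_run_job() and get_job_cs_params() hunks below, not a verbatim excerpt)
of how a job picks its profiling slot and the GPU address that slot resolves to:

	/* The ring-buffer position and the profiling slot advance independently. */
	job->profiling.slot = queue->profiling.seqno++;
	if (queue->profiling.seqno == queue->profiling.slot_count)
		queue->profiling.seqno = 0;

	/* GPU address the sampling instructions will store their values to. */
	u64 times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) +
			 job->profiling.slot * sizeof(struct panthor_job_profiling_data);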

This commit is done in preparation for enabling DRM fdinfo support in the
Panthor driver, which depends on the numbers calculated herein.
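
For context, the fdinfo changes later in this series consume these samples; a
hypothetical sketch of the arithmetic involved (only struct panthor_job_profiling_data
and the kernel BO kmap come from this patch, the rest is illustrative):

	struct panthor_job_profiling_data *data = queue->profiling.slots->kmap;
	struct panthor_job_profiling_data *slot = &data[job->profiling.slot];
	u64 cycles_spent = slot->cycles.after - slot->cycles.before;
	u64 time_spent = slot->time.after - slot->time.before;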

A profiling mask has been added that will, in a future commit, allow UM to
toggle performance metric sampling behaviour, which is disabled by default
to save power. When a ringbuffer CS is constructed, timestamp and cycle
sampling instructions are added depending on the flags enabled in the
profiling mask.
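
One detail worth spelling out: instructions tagged with PANTHOR_DEVICE_PROFILING_DISABLED
(i.e. 0) are unconditional, so the skip test in prepare_job_instrs() below amounts to
(with instr standing for one entry of instr_seq):

	/* Emit unless the instruction is tied to a profiling flag that is
	 * not currently enabled; a zero tag means "always emit".
	 */
	bool emit = !instr->profile_mask ||
		    (instr->profile_mask & params->profile_mask);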

A helper is provided that calculates the number of instructions for a given
enablement mask, and this count is passed as the number of credits when
initialising a DRM scheduler job.
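
By my count of the instruction table in prepare_job_instrs() below, the sequence has
11 unconditional instructions plus 4 for cycle sampling and 4 for timestamp sampling,
and the total is padded up to a cache line (NUM_INSTRS_PER_CACHE_LINE = 8):

	no profiling:       ALIGN(11, 8) = 16 credits
	one flag enabled:   ALIGN(11 + 4, 8) = 16 credits
	both flags enabled: ALIGN(11 + 4 + 4, 8) = 24 credits (MAX_INSTRS_PER_JOB)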

Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
---
 drivers/gpu/drm/panthor/panthor_device.h |  22 ++
 drivers/gpu/drm/panthor/panthor_sched.c  | 328 +++++++++++++++++++----
 2 files changed, 301 insertions(+), 49 deletions(-)

Comments

Steven Price Sept. 23, 2024, 9:07 a.m. UTC | #1
On 21/09/2024 00:43, Adrián Larumbe wrote:
> Enable calculations of job submission times in clock cycles and wall
> time. This is done by expanding the boilerplate command stream when running
> a job to include instructions that compute said times right before and
> after a user CS.
> 
> A separate kernel BO is created per queue to store those values. Jobs can
> access their sampled data through an index different from that of the
> queue's ringbuffer. The reason for this is saving memory on the profiling
> information kernel BO, since the amount of simultaneous profiled jobs we
> can write into the queue's ringbuffer might be much smaller than for
> regular jobs, as the former take more CSF instructions.
> 
> This commit is done in preparation for enabling DRM fdinfo support in the
> Panthor driver, which depends on the numbers calculated herein.
> 
> A profile mode mask has been added that will in a future commit allow UM to
> toggle performance metric sampling behaviour, which is disabled by default
> to save power. When a ringbuffer CS is constructed, timestamp and cycling
> sampling instructions are added depending on the enabled flags in the
> profiling mask.
> 
> A helper was provided that calculates the number of instructions for a
> given set of enablement mask, and these are passed as the number of credits
> when initialising a DRM scheduler job.
> 
> Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
> Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>

I think just one bug remaining - see below...

> ---
>  drivers/gpu/drm/panthor/panthor_device.h |  22 ++
>  drivers/gpu/drm/panthor/panthor_sched.c  | 328 +++++++++++++++++++----
>  2 files changed, 301 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> index e388c0472ba7..a48e30d0af30 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.h
> +++ b/drivers/gpu/drm/panthor/panthor_device.h
> @@ -66,6 +66,25 @@ struct panthor_irq {
>  	atomic_t suspended;
>  };
>  
> +/**
> + * enum panthor_device_profiling_mode - Profiling state
> + */
> +enum panthor_device_profiling_flags {
> +	/** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */
> +	PANTHOR_DEVICE_PROFILING_DISABLED = 0,
> +
> +	/** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */
> +	PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0),
> +
> +	/** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */
> +	PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1),
> +
> +	/** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */
> +	PANTHOR_DEVICE_PROFILING_ALL =
> +	PANTHOR_DEVICE_PROFILING_CYCLES |
> +	PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> +};
> +
>  /**
>   * struct panthor_device - Panthor device
>   */
> @@ -162,6 +181,9 @@ struct panthor_device {
>  		 */
>  		struct page *dummy_latest_flush;
>  	} pm;
> +
> +	/** @profile_mask: User-set profiling flags for job accounting. */
> +	u32 profile_mask;
>  };
>  
>  /**
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 42afdf0ddb7e..6da5c3d0015e 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -93,6 +93,9 @@
>  #define MIN_CSGS				3
>  #define MAX_CSG_PRIO				0xf
>  
> +#define NUM_INSTRS_PER_CACHE_LINE		(64 / sizeof(u64))
> +#define MAX_INSTRS_PER_JOB			24
> +
>  struct panthor_group;
>  
>  /**
> @@ -476,6 +479,18 @@ struct panthor_queue {
>  		 */
>  		struct list_head in_flight_jobs;
>  	} fence_ctx;
> +
> +	/** @profiling: Job profiling data slots and access information. */
> +	struct {
> +		/** @slots: Kernel BO holding the slots. */
> +		struct panthor_kernel_bo *slots;
> +
> +		/** @slot_count: Number of jobs ringbuffer can hold at once. */
> +		u32 slot_count;
> +
> +		/** @seqno: Index of the next available profiling information slot. */
> +		u32 seqno;
> +	} profiling;
>  };
>  
>  /**
> @@ -661,6 +676,18 @@ struct panthor_group {
>  	struct list_head wait_node;
>  };
>  
> +struct panthor_job_profiling_data {
> +	struct {
> +		u64 before;
> +		u64 after;
> +	} cycles;
> +
> +	struct {
> +		u64 before;
> +		u64 after;
> +	} time;
> +};
> +
>  /**
>   * group_queue_work() - Queue a group work
>   * @group: Group to queue the work for.
> @@ -774,6 +801,15 @@ struct panthor_job {
>  
>  	/** @done_fence: Fence signaled when the job is finished or cancelled. */
>  	struct dma_fence *done_fence;
> +
> +	/** @profiling: Job profiling information. */
> +	struct {
> +		/** @mask: Current device job profiling enablement bitmask. */
> +		u32 mask;
> +
> +		/** @slot: Job index in the profiling slots BO. */
> +		u32 slot;
> +	} profiling;
>  };
>  
>  static void
> @@ -838,6 +874,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>  
>  	panthor_kernel_bo_destroy(queue->ringbuf);
>  	panthor_kernel_bo_destroy(queue->iface.mem);
> +	panthor_kernel_bo_destroy(queue->profiling.slots);
>  
>  	/* Release the last_fence we were holding, if any. */
>  	dma_fence_put(queue->fence_ctx.last_fence);
> @@ -1982,8 +2019,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
>  	}
>  }
>  
> -#define NUM_INSTRS_PER_SLOT		16
> -
>  static void
>  group_term_post_processing(struct panthor_group *group)
>  {
> @@ -2815,65 +2850,192 @@ static void group_sync_upd_work(struct work_struct *work)
>  	group_put(group);
>  }
>  
> -static struct dma_fence *
> -queue_run_job(struct drm_sched_job *sched_job)
> +struct panthor_job_ringbuf_instrs {
> +	u64 buffer[MAX_INSTRS_PER_JOB];
> +	u32 count;
> +};
> +
> +struct panthor_job_instr {
> +	u32 profile_mask;
> +	u64 instr;
> +};
> +
> +#define JOB_INSTR(__prof, __instr) \
> +	{ \
> +		.profile_mask = __prof, \
> +		.instr = __instr, \
> +	}
> +
> +static void
> +copy_instrs_to_ringbuf(struct panthor_queue *queue,
> +		       struct panthor_job *job,
> +		       struct panthor_job_ringbuf_instrs *instrs)
> +{
> +	u64 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> +	u64 start = job->ringbuf.start & (ringbuf_size - 1);
> +	u64 size, written;
> +
> +	/*
> +	 * We need to write a whole slot, including any trailing zeroes
> +	 * that may come at the end of it. Also, because instrs.buffer has
> +	 * been zero-initialised, there's no need to pad it with 0's
> +	 */
> +	instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> +	size = instrs->count * sizeof(u64);
> +	WARN_ON(size > ringbuf_size);
> +	written = min(ringbuf_size - start, size);
> +
> +	memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
> +
> +	if (written < size)
> +		memcpy(queue->ringbuf->kmap,
> +		       &instrs->buffer[written/sizeof(u64)],
> +		       size - written);
> +}
> +
> +struct panthor_job_cs_params {
> +	u32 profile_mask;
> +	u64 addr_reg; u64 val_reg;
> +	u64 cycle_reg; u64 time_reg;
> +	u64 sync_addr; u64 times_addr;
> +	u64 cs_start; u64 cs_size;
> +	u32 last_flush; u32 waitall_mask;
> +};
> +
> +static void
> +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
>  {
> -	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
>  	struct panthor_group *group = job->group;
>  	struct panthor_queue *queue = group->queues[job->queue_idx];
>  	struct panthor_device *ptdev = group->ptdev;
>  	struct panthor_scheduler *sched = ptdev->scheduler;
> -	u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> -	u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
> -	u64 addr_reg = ptdev->csif_info.cs_reg_count -
> -		       ptdev->csif_info.unpreserved_cs_reg_count;
> -	u64 val_reg = addr_reg + 2;
> -	u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> -			job->queue_idx * sizeof(struct panthor_syncobj_64b);
> -	u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> -	struct dma_fence *done_fence;
> -	int ret;
>  
> -	u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
> -		/* MOV32 rX+2, cs.latest_flush */
> -		(2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
> +	params->addr_reg = ptdev->csif_info.cs_reg_count -
> +			   ptdev->csif_info.unpreserved_cs_reg_count;
> +	params->val_reg = params->addr_reg + 2;
> +	params->cycle_reg = params->addr_reg;
> +	params->time_reg = params->val_reg;
>  
> -		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> -		(36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
> +	params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> +			    job->queue_idx * sizeof(struct panthor_syncobj_64b);
> +	params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) +
> +			     (job->profiling.slot * sizeof(struct panthor_job_profiling_data));
> +	params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
>  
> -		/* MOV48 rX:rX+1, cs.start */
> -		(1ull << 56) | (addr_reg << 48) | job->call_info.start,
> +	params->cs_start = job->call_info.start;
> +	params->cs_size = job->call_info.size;
> +	params->last_flush = job->call_info.latest_flush;
>  
> -		/* MOV32 rX+2, cs.size */
> -		(2ull << 56) | (val_reg << 48) | job->call_info.size,
> +	params->profile_mask = job->profiling.mask;
> +}
>  
> -		/* WAIT(0) => waits for FLUSH_CACHE2 instruction */
> -		(3ull << 56) | (1 << 16),
> +#define JOB_INSTR_ALWAYS(instr) \
> +	JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (instr))
> +#define JOB_INSTR_TIMESTAMP(instr) \
> +	JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, (instr))
> +#define JOB_INSTR_CYCLES(instr) \
> +	JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, (instr))
>  
> +static void
> +prepare_job_instrs(const struct panthor_job_cs_params *params,
> +		   struct panthor_job_ringbuf_instrs *instrs)
> +{
> +	const struct panthor_job_instr instr_seq[] = {
> +		/* MOV32 rX+2, cs.latest_flush */
> +		JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->last_flush),
> +		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> +		JOB_INSTR_ALWAYS((36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233),
> +		/* MOV48 rX:rX+1, cycles_offset */
> +		JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
> +				 (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.before))),
> +		/* STORE_STATE cycles */
> +		JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> +		/* MOV48 rX:rX+1, time_offset */
> +		JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | (params->times_addr +
> +			   offsetof(struct panthor_job_profiling_data, time.before))),
> +		/* STORE_STATE timer */
> +		JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
> +		/* MOV48 rX:rX+1, cs.start */
> +		JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->cs_start),
> +		/* MOV32 rX+2, cs.size */
> +		JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->cs_size),
> +		/* WAIT(0) => waits for FLUSH_CACHE2 instruction */
> +		JOB_INSTR_ALWAYS((3ull << 56) | (1 << 16)),
>  		/* CALL rX:rX+1, rX+2 */
> -		(32ull << 56) | (addr_reg << 40) | (val_reg << 32),
> -
> +		JOB_INSTR_ALWAYS((32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)),
> +		/* MOV48 rX:rX+1, cycles_offset */
> +		JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
> +				 (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.after))),
> +		/* STORE_STATE cycles */
> +		JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> +		/* MOV48 rX:rX+1, time_offset */
> +		JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) |
> +			  (params->times_addr + offsetof(struct panthor_job_profiling_data, time.after))),
> +		/* STORE_STATE timer */
> +		JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
>  		/* MOV48 rX:rX+1, sync_addr */
> -		(1ull << 56) | (addr_reg << 48) | sync_addr,
> -
> +		JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
>  		/* MOV48 rX+2, #1 */
> -		(1ull << 56) | (val_reg << 48) | 1,
> -
> +		JOB_INSTR_ALWAYS((1ull << 56) | (params->val_reg << 48) | 1),
>  		/* WAIT(all) */
> -		(3ull << 56) | (waitall_mask << 16),
> -
> +		JOB_INSTR_ALWAYS((3ull << 56) | (params->waitall_mask << 16)),
>  		/* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
> -		(51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
> +		JOB_INSTR_ALWAYS((51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
> +				 (params->val_reg << 32) | (0 << 16) | 1),
> +		/* ERROR_BARRIER, so we can recover from faults at job boundaries. */
> +		JOB_INSTR_ALWAYS((47ull << 56)),
> +	};
> +	u32 pad;
>  
> -		/* ERROR_BARRIER, so we can recover from faults at job
> -		 * boundaries.
> -		 */
> -		(47ull << 56),
> +	/* NEED to be cacheline aligned to please the prefetcher. */
> +	static_assert(sizeof(instrs->buffer) % 64 == 0,
> +		      "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
> +
> +	/* Make sure we have enough storage to store the whole sequence. */
> +	static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) ==
> +		      ARRAY_SIZE(instrs->buffer),
> +		      "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
> +
> +	for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) {
> +		/* If the profile mask of this instruction is not enabled, skip it. */
> +		if (instr_seq[i].profile_mask &&
> +		    !(instr_seq[i].profile_mask & params->profile_mask))
> +			continue;
> +
> +		instrs->buffer[instrs->count++] = instr_seq[i].instr;
> +	}
> +
> +	pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> +	memset(&instrs->buffer[instrs->count], 0,
> +	       (pad - instrs->count) * sizeof(instrs->buffer[0]));
> +	instrs->count = pad;
> +}
> +
> +static u32 calc_job_credits(u32 profile_mask)
> +{
> +	struct panthor_job_ringbuf_instrs instrs = {
> +		.count = 0,
> +	};
> +	struct panthor_job_cs_params params = {
> +		.profile_mask = profile_mask,
>  	};
>  
> -	/* Need to be cacheline aligned to please the prefetcher. */
> -	static_assert(sizeof(call_instrs) % 64 == 0,
> -		      "call_instrs is not aligned on a cacheline");
> +	prepare_job_instrs(&params, &instrs);
> +	return instrs.count;
> +}
> +
> +static struct dma_fence *
> +queue_run_job(struct drm_sched_job *sched_job)
> +{
> +	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> +	struct panthor_group *group = job->group;
> +	struct panthor_queue *queue = group->queues[job->queue_idx];
> +	struct panthor_device *ptdev = group->ptdev;
> +	struct panthor_scheduler *sched = ptdev->scheduler;
> +	struct panthor_job_ringbuf_instrs instrs;

instrs isn't initialised...

> +	struct panthor_job_cs_params cs_params;
> +	struct dma_fence *done_fence;
> +	int ret;
>  
>  	/* Stream size is zero, nothing to do except making sure all previously
>  	 * submitted jobs are done before we signal the
> @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job)
>  		       queue->fence_ctx.id,
>  		       atomic64_inc_return(&queue->fence_ctx.seqno));
>  
> -	memcpy(queue->ringbuf->kmap + ringbuf_insert,
> -	       call_instrs, sizeof(call_instrs));
> +	job->profiling.slot = queue->profiling.seqno++;
> +	if (queue->profiling.seqno == queue->profiling.slot_count)
> +		queue->profiling.seqno = 0;
> +
> +	job->ringbuf.start = queue->iface.input->insert;
> +
> +	get_job_cs_params(job, &cs_params);
> +	prepare_job_instrs(&cs_params, &instrs);

...but it's passed into prepare_job_instrs() which depends on
instrs.count (same bug as was in calc_job_credits()) - sorry I didn't
spot it last review.

Initializing instrs makes everything work for me.

I'm not sure quite what kernel configuration you are using but I wonder
if you've got a 'hardening' option enabled which is causing the stack to
be zero-initialised. It's worth turning it off for testing purposes ;)

Steve
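
A minimal sketch of the fix being suggested here, zero-initialising the on-stack
struct so its count member starts at 0 (an alternative is discussed further down
the thread):

-	struct panthor_job_ringbuf_instrs instrs;
+	struct panthor_job_ringbuf_instrs instrs = { };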

> +	copy_instrs_to_ringbuf(queue, job, &instrs);
> +
> +	job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
>  
>  	panthor_job_get(&job->base);
>  	spin_lock(&queue->fence_ctx.lock);
>  	list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
>  	spin_unlock(&queue->fence_ctx.lock);
>  
> -	job->ringbuf.start = queue->iface.input->insert;
> -	job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
> -
>  	/* Make sure the ring buffer is updated before the INSERT
>  	 * register.
>  	 */
> @@ -3003,6 +3171,34 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
>  	.free_job = queue_free_job,
>  };
>  
> +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev,
> +				       u32 cs_ringbuf_size)
> +{
> +	u32 min_profiled_job_instrs = U32_MAX;
> +	u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL);
> +
> +	/*
> +	 * We want to calculate the minimum size of a profiled job's CS,
> +	 * because since they need additional instructions for the sampling
> +	 * of performance metrics, they might take up further slots in
> +	 * the queue's ringbuffer. This means we might not need as many job
> +	 * slots for keeping track of their profiling information. What we
> +	 * need is the maximum number of slots we should allocate to this end,
> +	 * which matches the maximum number of profiled jobs we can place
> +	 * simultaneously in the queue's ring buffer.
> +	 * That has to be calculated separately for every single job profiling
> +	 * flag, but not in the case job profiling is disabled, since unprofiled
> +	 * jobs don't need to keep track of this at all.
> +	 */
> +	for (u32 i = 0; i < last_flag; i++) {
> +		if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL)
> +			min_profiled_job_instrs =
> +				min(min_profiled_job_instrs, calc_job_credits(BIT(i)));
> +	}
> +
> +	return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64));
> +}
> +
>  static struct panthor_queue *
>  group_create_queue(struct panthor_group *group,
>  		   const struct drm_panthor_queue_create *args)
> @@ -3056,9 +3252,35 @@ group_create_queue(struct panthor_group *group,
>  		goto err_free_queue;
>  	}
>  
> +	queue->profiling.slot_count =
> +		calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size);
> +
> +	queue->profiling.slots =
> +		panthor_kernel_bo_create(group->ptdev, group->vm,
> +					 queue->profiling.slot_count *
> +					 sizeof(struct panthor_job_profiling_data),
> +					 DRM_PANTHOR_BO_NO_MMAP,
> +					 DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
> +					 DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> +					 PANTHOR_VM_KERNEL_AUTO_VA);
> +
> +	if (IS_ERR(queue->profiling.slots)) {
> +		ret = PTR_ERR(queue->profiling.slots);
> +		goto err_free_queue;
> +	}
> +
> +	ret = panthor_kernel_bo_vmap(queue->profiling.slots);
> +	if (ret)
> +		goto err_free_queue;
> +
> +	/*
> +	 * Credit limit argument tells us the total number of instructions
> +	 * across all CS slots in the ringbuffer, with some jobs requiring
> +	 * twice as many as others, depending on their profiling status.
> +	 */
>  	ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
>  			     group->ptdev->scheduler->wq, 1,
> -			     args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
> +			     args->ringbuf_size / sizeof(u64),
>  			     0, msecs_to_jiffies(JOB_TIMEOUT_MS),
>  			     group->ptdev->reset.wq,
>  			     NULL, "panthor-queue", group->ptdev->base.dev);
> @@ -3354,6 +3576,7 @@ panthor_job_create(struct panthor_file *pfile,
>  {
>  	struct panthor_group_pool *gpool = pfile->groups;
>  	struct panthor_job *job;
> +	u32 credits;
>  	int ret;
>  
>  	if (qsubmit->pad)
> @@ -3407,9 +3630,16 @@ panthor_job_create(struct panthor_file *pfile,
>  		}
>  	}
>  
> +	job->profiling.mask = pfile->ptdev->profile_mask;
> +	credits = calc_job_credits(job->profiling.mask);
> +	if (credits == 0) {
> +		ret = -EINVAL;
> +		goto err_put_job;
> +	}
> +
>  	ret = drm_sched_job_init(&job->base,
>  				 &job->group->queues[job->queue_idx]->entity,
> -				 1, job->group);
> +				 credits, job->group);
>  	if (ret)
>  		goto err_put_job;
>
Boris Brezillon Sept. 23, 2024, 10:18 a.m. UTC | #2
On Mon, 23 Sep 2024 10:07:14 +0100
Steven Price <steven.price@arm.com> wrote:

> > +static struct dma_fence *
> > +queue_run_job(struct drm_sched_job *sched_job)
> > +{
> > +	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> > +	struct panthor_group *group = job->group;
> > +	struct panthor_queue *queue = group->queues[job->queue_idx];
> > +	struct panthor_device *ptdev = group->ptdev;
> > +	struct panthor_scheduler *sched = ptdev->scheduler;
> > +	struct panthor_job_ringbuf_instrs instrs;  
> 
> instrs isn't initialised...
> 
> > +	struct panthor_job_cs_params cs_params;
> > +	struct dma_fence *done_fence;
> > +	int ret;
> >  
> >  	/* Stream size is zero, nothing to do except making sure all previously
> >  	 * submitted jobs are done before we signal the
> > @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job)
> >  		       queue->fence_ctx.id,
> >  		       atomic64_inc_return(&queue->fence_ctx.seqno));
> >  
> > -	memcpy(queue->ringbuf->kmap + ringbuf_insert,
> > -	       call_instrs, sizeof(call_instrs));
> > +	job->profiling.slot = queue->profiling.seqno++;
> > +	if (queue->profiling.seqno == queue->profiling.slot_count)
> > +		queue->profiling.seqno = 0;
> > +
> > +	job->ringbuf.start = queue->iface.input->insert;
> > +
> > +	get_job_cs_params(job, &cs_params);
> > +	prepare_job_instrs(&cs_params, &instrs);  
> 
> ...but it's passed into prepare_job_instrs() which depends on
> instrs.count (same bug as was in calc_job_credits()) - sorry I didn't
> spot it last review.

Hm, can't we initialize instr::count to zero in prepare_job_instrs()
instead?
Steven Price Sept. 23, 2024, 10:24 a.m. UTC | #3
On 23/09/2024 11:18, Boris Brezillon wrote:
> On Mon, 23 Sep 2024 10:07:14 +0100
> Steven Price <steven.price@arm.com> wrote:
> 
>>> +static struct dma_fence *
>>> +queue_run_job(struct drm_sched_job *sched_job)
>>> +{
>>> +	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
>>> +	struct panthor_group *group = job->group;
>>> +	struct panthor_queue *queue = group->queues[job->queue_idx];
>>> +	struct panthor_device *ptdev = group->ptdev;
>>> +	struct panthor_scheduler *sched = ptdev->scheduler;
>>> +	struct panthor_job_ringbuf_instrs instrs;  
>>
>> instrs isn't initialised...
>>
>>> +	struct panthor_job_cs_params cs_params;
>>> +	struct dma_fence *done_fence;
>>> +	int ret;
>>>  
>>>  	/* Stream size is zero, nothing to do except making sure all previously
>>>  	 * submitted jobs are done before we signal the
>>> @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job)
>>>  		       queue->fence_ctx.id,
>>>  		       atomic64_inc_return(&queue->fence_ctx.seqno));
>>>  
>>> -	memcpy(queue->ringbuf->kmap + ringbuf_insert,
>>> -	       call_instrs, sizeof(call_instrs));
>>> +	job->profiling.slot = queue->profiling.seqno++;
>>> +	if (queue->profiling.seqno == queue->profiling.slot_count)
>>> +		queue->profiling.seqno = 0;
>>> +
>>> +	job->ringbuf.start = queue->iface.input->insert;
>>> +
>>> +	get_job_cs_params(job, &cs_params);
>>> +	prepare_job_instrs(&cs_params, &instrs);  
>>
>> ...but it's passed into prepare_job_instrs() which depends on
>> instrs.count (same bug as was in calc_job_credits()) - sorry I didn't
>> spot it last review.
> 
> Hm, can't we initialize instr::count to zero in prepare_job_instrs()
> instead?

Indeed that would probably be better! I hadn't noticed there were two
places in the previous review.

Steve
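
A sketch of the alternative being agreed on here — resetting the count at the top of
prepare_job_instrs() so both of its callers are covered (illustrative hunk, not part
of the posted patch):

 	u32 pad;
 
+	/* Both callers pass an on-stack struct; don't rely on it being zeroed. */
+	instrs->count = 0;
+
 	/* NEED to be cacheline aligned to please the prefetcher. */
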
Adrián Larumbe Sept. 23, 2024, 8:48 p.m. UTC | #4
On 23.09.2024 10:07, Steven Price wrote:
>[...]
>
>> +	struct panthor_job_ringbuf_instrs instrs;
>
>instrs isn't initialised...
>
>[...]
>
>> +	get_job_cs_params(job, &cs_params);
>> +	prepare_job_instrs(&cs_params, &instrs);
>
>...but it's passed into prepare_job_instrs() which depends on
>instrs.count (same bug as was in calc_job_credits()) - sorry I didn't
>spot it last review.
>
>Initializing instrs makes everything work for me.
>
>I'm not sure quite what kernel configuration you are using but I wonder
>if you've got a 'hardening' option enabled which is causing the stack to
>be zero-initialised. It's worth turning it off for testing purposes ;)

Thanks for catching this, it went completely unnoticed by me. Delving into my kernel
config, I found this option: CONFIG_INIT_STACK_ALL_ZERO. When I unset it, the driver
triggers an invalid memory reference in the kernel, I guess because of the
uninitialised stack variable.

I don't know why I had this option enabled, but come to think of it, it seems like a
terrible idea. Thanks for bringing this up.
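
As an aside, the reason that option masks the bug: with CONFIG_INIT_STACK_ALL_ZERO the
compiler zero-fills every on-stack local, so the effect is roughly as if queue_run_job()
had written

	struct panthor_job_ringbuf_instrs instrs = { };

and the count member conveniently started at 0; without it the field is indeterminate
and prepare_job_instrs() indexes instrs.buffer[] with garbage.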


Patch

diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index e388c0472ba7..a48e30d0af30 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -66,6 +66,25 @@  struct panthor_irq {
 	atomic_t suspended;
 };
 
+/**
+ * enum panthor_device_profiling_mode - Profiling state
+ */
+enum panthor_device_profiling_flags {
+	/** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */
+	PANTHOR_DEVICE_PROFILING_DISABLED = 0,
+
+	/** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */
+	PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0),
+
+	/** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */
+	PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1),
+
+	/** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */
+	PANTHOR_DEVICE_PROFILING_ALL =
+	PANTHOR_DEVICE_PROFILING_CYCLES |
+	PANTHOR_DEVICE_PROFILING_TIMESTAMP,
+};
+
 /**
  * struct panthor_device - Panthor device
  */
@@ -162,6 +181,9 @@  struct panthor_device {
 		 */
 		struct page *dummy_latest_flush;
 	} pm;
+
+	/** @profile_mask: User-set profiling flags for job accounting. */
+	u32 profile_mask;
 };
 
 /**
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 42afdf0ddb7e..6da5c3d0015e 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -93,6 +93,9 @@ 
 #define MIN_CSGS				3
 #define MAX_CSG_PRIO				0xf
 
+#define NUM_INSTRS_PER_CACHE_LINE		(64 / sizeof(u64))
+#define MAX_INSTRS_PER_JOB			24
+
 struct panthor_group;
 
 /**
@@ -476,6 +479,18 @@  struct panthor_queue {
 		 */
 		struct list_head in_flight_jobs;
 	} fence_ctx;
+
+	/** @profiling: Job profiling data slots and access information. */
+	struct {
+		/** @slots: Kernel BO holding the slots. */
+		struct panthor_kernel_bo *slots;
+
+		/** @slot_count: Number of jobs ringbuffer can hold at once. */
+		u32 slot_count;
+
+		/** @seqno: Index of the next available profiling information slot. */
+		u32 seqno;
+	} profiling;
 };
 
 /**
@@ -661,6 +676,18 @@  struct panthor_group {
 	struct list_head wait_node;
 };
 
+struct panthor_job_profiling_data {
+	struct {
+		u64 before;
+		u64 after;
+	} cycles;
+
+	struct {
+		u64 before;
+		u64 after;
+	} time;
+};
+
 /**
  * group_queue_work() - Queue a group work
  * @group: Group to queue the work for.
@@ -774,6 +801,15 @@  struct panthor_job {
 
 	/** @done_fence: Fence signaled when the job is finished or cancelled. */
 	struct dma_fence *done_fence;
+
+	/** @profiling: Job profiling information. */
+	struct {
+		/** @mask: Current device job profiling enablement bitmask. */
+		u32 mask;
+
+		/** @slot: Job index in the profiling slots BO. */
+		u32 slot;
+	} profiling;
 };
 
 static void
@@ -838,6 +874,7 @@  static void group_free_queue(struct panthor_group *group, struct panthor_queue *
 
 	panthor_kernel_bo_destroy(queue->ringbuf);
 	panthor_kernel_bo_destroy(queue->iface.mem);
+	panthor_kernel_bo_destroy(queue->profiling.slots);
 
 	/* Release the last_fence we were holding, if any. */
 	dma_fence_put(queue->fence_ctx.last_fence);
@@ -1982,8 +2019,6 @@  tick_ctx_init(struct panthor_scheduler *sched,
 	}
 }
 
-#define NUM_INSTRS_PER_SLOT		16
-
 static void
 group_term_post_processing(struct panthor_group *group)
 {
@@ -2815,65 +2850,192 @@  static void group_sync_upd_work(struct work_struct *work)
 	group_put(group);
 }
 
-static struct dma_fence *
-queue_run_job(struct drm_sched_job *sched_job)
+struct panthor_job_ringbuf_instrs {
+	u64 buffer[MAX_INSTRS_PER_JOB];
+	u32 count;
+};
+
+struct panthor_job_instr {
+	u32 profile_mask;
+	u64 instr;
+};
+
+#define JOB_INSTR(__prof, __instr) \
+	{ \
+		.profile_mask = __prof, \
+		.instr = __instr, \
+	}
+
+static void
+copy_instrs_to_ringbuf(struct panthor_queue *queue,
+		       struct panthor_job *job,
+		       struct panthor_job_ringbuf_instrs *instrs)
+{
+	u64 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
+	u64 start = job->ringbuf.start & (ringbuf_size - 1);
+	u64 size, written;
+
+	/*
+	 * We need to write a whole slot, including any trailing zeroes
+	 * that may come at the end of it. Also, because instrs.buffer has
+	 * been zero-initialised, there's no need to pad it with 0's
+	 */
+	instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
+	size = instrs->count * sizeof(u64);
+	WARN_ON(size > ringbuf_size);
+	written = min(ringbuf_size - start, size);
+
+	memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
+
+	if (written < size)
+		memcpy(queue->ringbuf->kmap,
+		       &instrs->buffer[written/sizeof(u64)],
+		       size - written);
+}
+
+struct panthor_job_cs_params {
+	u32 profile_mask;
+	u64 addr_reg; u64 val_reg;
+	u64 cycle_reg; u64 time_reg;
+	u64 sync_addr; u64 times_addr;
+	u64 cs_start; u64 cs_size;
+	u32 last_flush; u32 waitall_mask;
+};
+
+static void
+get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
 {
-	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
 	struct panthor_group *group = job->group;
 	struct panthor_queue *queue = group->queues[job->queue_idx];
 	struct panthor_device *ptdev = group->ptdev;
 	struct panthor_scheduler *sched = ptdev->scheduler;
-	u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
-	u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
-	u64 addr_reg = ptdev->csif_info.cs_reg_count -
-		       ptdev->csif_info.unpreserved_cs_reg_count;
-	u64 val_reg = addr_reg + 2;
-	u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
-			job->queue_idx * sizeof(struct panthor_syncobj_64b);
-	u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
-	struct dma_fence *done_fence;
-	int ret;
 
-	u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
-		/* MOV32 rX+2, cs.latest_flush */
-		(2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
+	params->addr_reg = ptdev->csif_info.cs_reg_count -
+			   ptdev->csif_info.unpreserved_cs_reg_count;
+	params->val_reg = params->addr_reg + 2;
+	params->cycle_reg = params->addr_reg;
+	params->time_reg = params->val_reg;
 
-		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
-		(36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
+	params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
+			    job->queue_idx * sizeof(struct panthor_syncobj_64b);
+	params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) +
+			     (job->profiling.slot * sizeof(struct panthor_job_profiling_data));
+	params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
 
-		/* MOV48 rX:rX+1, cs.start */
-		(1ull << 56) | (addr_reg << 48) | job->call_info.start,
+	params->cs_start = job->call_info.start;
+	params->cs_size = job->call_info.size;
+	params->last_flush = job->call_info.latest_flush;
 
-		/* MOV32 rX+2, cs.size */
-		(2ull << 56) | (val_reg << 48) | job->call_info.size,
+	params->profile_mask = job->profiling.mask;
+}
 
-		/* WAIT(0) => waits for FLUSH_CACHE2 instruction */
-		(3ull << 56) | (1 << 16),
+#define JOB_INSTR_ALWAYS(instr) \
+	JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (instr))
+#define JOB_INSTR_TIMESTAMP(instr) \
+	JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, (instr))
+#define JOB_INSTR_CYCLES(instr) \
+	JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, (instr))
 
+static void
+prepare_job_instrs(const struct panthor_job_cs_params *params,
+		   struct panthor_job_ringbuf_instrs *instrs)
+{
+	const struct panthor_job_instr instr_seq[] = {
+		/* MOV32 rX+2, cs.latest_flush */
+		JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->last_flush),
+		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
+		JOB_INSTR_ALWAYS((36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233),
+		/* MOV48 rX:rX+1, cycles_offset */
+		JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
+				 (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.before))),
+		/* STORE_STATE cycles */
+		JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
+		/* MOV48 rX:rX+1, time_offset */
+		JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | (params->times_addr +
+			   offsetof(struct panthor_job_profiling_data, time.before))),
+		/* STORE_STATE timer */
+		JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
+		/* MOV48 rX:rX+1, cs.start */
+		JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->cs_start),
+		/* MOV32 rX+2, cs.size */
+		JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->cs_size),
+		/* WAIT(0) => waits for FLUSH_CACHE2 instruction */
+		JOB_INSTR_ALWAYS((3ull << 56) | (1 << 16)),
 		/* CALL rX:rX+1, rX+2 */
-		(32ull << 56) | (addr_reg << 40) | (val_reg << 32),
-
+		JOB_INSTR_ALWAYS((32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)),
+		/* MOV48 rX:rX+1, cycles_offset */
+		JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
+				 (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.after))),
+		/* STORE_STATE cycles */
+		JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
+		/* MOV48 rX:rX+1, time_offset */
+		JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) |
+			  (params->times_addr + offsetof(struct panthor_job_profiling_data, time.after))),
+		/* STORE_STATE timer */
+		JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
 		/* MOV48 rX:rX+1, sync_addr */
-		(1ull << 56) | (addr_reg << 48) | sync_addr,
-
+		JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
 		/* MOV48 rX+2, #1 */
-		(1ull << 56) | (val_reg << 48) | 1,
-
+		JOB_INSTR_ALWAYS((1ull << 56) | (params->val_reg << 48) | 1),
 		/* WAIT(all) */
-		(3ull << 56) | (waitall_mask << 16),
-
+		JOB_INSTR_ALWAYS((3ull << 56) | (params->waitall_mask << 16)),
 		/* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
-		(51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
+		JOB_INSTR_ALWAYS((51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
+				 (params->val_reg << 32) | (0 << 16) | 1),
+		/* ERROR_BARRIER, so we can recover from faults at job boundaries. */
+		JOB_INSTR_ALWAYS((47ull << 56)),
+	};
+	u32 pad;
 
-		/* ERROR_BARRIER, so we can recover from faults at job
-		 * boundaries.
-		 */
-		(47ull << 56),
+	/* NEED to be cacheline aligned to please the prefetcher. */
+	static_assert(sizeof(instrs->buffer) % 64 == 0,
+		      "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
+
+	/* Make sure we have enough storage to store the whole sequence. */
+	static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) ==
+		      ARRAY_SIZE(instrs->buffer),
+		      "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
+
+	for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) {
+		/* If the profile mask of this instruction is not enabled, skip it. */
+		if (instr_seq[i].profile_mask &&
+		    !(instr_seq[i].profile_mask & params->profile_mask))
+			continue;
+
+		instrs->buffer[instrs->count++] = instr_seq[i].instr;
+	}
+
+	pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
+	memset(&instrs->buffer[instrs->count], 0,
+	       (pad - instrs->count) * sizeof(instrs->buffer[0]));
+	instrs->count = pad;
+}
+
+static u32 calc_job_credits(u32 profile_mask)
+{
+	struct panthor_job_ringbuf_instrs instrs = {
+		.count = 0,
+	};
+	struct panthor_job_cs_params params = {
+		.profile_mask = profile_mask,
 	};
 
-	/* Need to be cacheline aligned to please the prefetcher. */
-	static_assert(sizeof(call_instrs) % 64 == 0,
-		      "call_instrs is not aligned on a cacheline");
+	prepare_job_instrs(&params, &instrs);
+	return instrs.count;
+}
+
+static struct dma_fence *
+queue_run_job(struct drm_sched_job *sched_job)
+{
+	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
+	struct panthor_group *group = job->group;
+	struct panthor_queue *queue = group->queues[job->queue_idx];
+	struct panthor_device *ptdev = group->ptdev;
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_job_ringbuf_instrs instrs;
+	struct panthor_job_cs_params cs_params;
+	struct dma_fence *done_fence;
+	int ret;
 
 	/* Stream size is zero, nothing to do except making sure all previously
 	 * submitted jobs are done before we signal the
@@ -2900,17 +3062,23 @@  queue_run_job(struct drm_sched_job *sched_job)
 		       queue->fence_ctx.id,
 		       atomic64_inc_return(&queue->fence_ctx.seqno));
 
-	memcpy(queue->ringbuf->kmap + ringbuf_insert,
-	       call_instrs, sizeof(call_instrs));
+	job->profiling.slot = queue->profiling.seqno++;
+	if (queue->profiling.seqno == queue->profiling.slot_count)
+		queue->profiling.seqno = 0;
+
+	job->ringbuf.start = queue->iface.input->insert;
+
+	get_job_cs_params(job, &cs_params);
+	prepare_job_instrs(&cs_params, &instrs);
+	copy_instrs_to_ringbuf(queue, job, &instrs);
+
+	job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
 
 	panthor_job_get(&job->base);
 	spin_lock(&queue->fence_ctx.lock);
 	list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
 	spin_unlock(&queue->fence_ctx.lock);
 
-	job->ringbuf.start = queue->iface.input->insert;
-	job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
-
 	/* Make sure the ring buffer is updated before the INSERT
 	 * register.
 	 */
@@ -3003,6 +3171,34 @@  static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
 	.free_job = queue_free_job,
 };
 
+static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev,
+				       u32 cs_ringbuf_size)
+{
+	u32 min_profiled_job_instrs = U32_MAX;
+	u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL);
+
+	/*
+	 * We want to calculate the minimum size of a profiled job's CS,
+	 * because since they need additional instructions for the sampling
+	 * of performance metrics, they might take up further slots in
+	 * the queue's ringbuffer. This means we might not need as many job
+	 * slots for keeping track of their profiling information. What we
+	 * need is the maximum number of slots we should allocate to this end,
+	 * which matches the maximum number of profiled jobs we can place
+	 * simultaneously in the queue's ring buffer.
+	 * That has to be calculated separately for every single job profiling
+	 * flag, but not in the case job profiling is disabled, since unprofiled
+	 * jobs don't need to keep track of this at all.
+	 */
+	for (u32 i = 0; i < last_flag; i++) {
+		if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL)
+			min_profiled_job_instrs =
+				min(min_profiled_job_instrs, calc_job_credits(BIT(i)));
+	}
+
+	return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64));
+}
+
 static struct panthor_queue *
 group_create_queue(struct panthor_group *group,
 		   const struct drm_panthor_queue_create *args)
@@ -3056,9 +3252,35 @@  group_create_queue(struct panthor_group *group,
 		goto err_free_queue;
 	}
 
+	queue->profiling.slot_count =
+		calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size);
+
+	queue->profiling.slots =
+		panthor_kernel_bo_create(group->ptdev, group->vm,
+					 queue->profiling.slot_count *
+					 sizeof(struct panthor_job_profiling_data),
+					 DRM_PANTHOR_BO_NO_MMAP,
+					 DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
+					 DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
+					 PANTHOR_VM_KERNEL_AUTO_VA);
+
+	if (IS_ERR(queue->profiling.slots)) {
+		ret = PTR_ERR(queue->profiling.slots);
+		goto err_free_queue;
+	}
+
+	ret = panthor_kernel_bo_vmap(queue->profiling.slots);
+	if (ret)
+		goto err_free_queue;
+
+	/*
+	 * Credit limit argument tells us the total number of instructions
+	 * across all CS slots in the ringbuffer, with some jobs requiring
+	 * twice as many as others, depending on their profiling status.
+	 */
 	ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
 			     group->ptdev->scheduler->wq, 1,
-			     args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
+			     args->ringbuf_size / sizeof(u64),
 			     0, msecs_to_jiffies(JOB_TIMEOUT_MS),
 			     group->ptdev->reset.wq,
 			     NULL, "panthor-queue", group->ptdev->base.dev);
@@ -3354,6 +3576,7 @@  panthor_job_create(struct panthor_file *pfile,
 {
 	struct panthor_group_pool *gpool = pfile->groups;
 	struct panthor_job *job;
+	u32 credits;
 	int ret;
 
 	if (qsubmit->pad)
@@ -3407,9 +3630,16 @@  panthor_job_create(struct panthor_file *pfile,
 		}
 	}
 
+	job->profiling.mask = pfile->ptdev->profile_mask;
+	credits = calc_job_credits(job->profiling.mask);
+	if (credits == 0) {
+		ret = -EINVAL;
+		goto err_put_job;
+	}
+
 	ret = drm_sched_job_init(&job->base,
 				 &job->group->queues[job->queue_idx]->entity,
-				 1, job->group);
+				 credits, job->group);
 	if (ret)
 		goto err_put_job;