Message ID | 20240920234436.207563-2-adrian.larumbe@collabora.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Support fdinfo runtime and memory stats on Panthor | expand |
On 21/09/2024 00:43, Adrián Larumbe wrote: > Enable calculations of job submission times in clock cycles and wall > time. This is done by expanding the boilerplate command stream when running > a job to include instructions that compute said times right before and > after a user CS. > > A separate kernel BO is created per queue to store those values. Jobs can > access their sampled data through an index different from that of the > queue's ringbuffer. The reason for this is saving memory on the profiling > information kernel BO, since the amount of simultaneous profiled jobs we > can write into the queue's ringbuffer might be much smaller than for > regular jobs, as the former take more CSF instructions. > > This commit is done in preparation for enabling DRM fdinfo support in the > Panthor driver, which depends on the numbers calculated herein. > > A profile mode mask has been added that will in a future commit allow UM to > toggle performance metric sampling behaviour, which is disabled by default > to save power. When a ringbuffer CS is constructed, timestamp and cycling > sampling instructions are added depending on the enabled flags in the > profiling mask. > > A helper was provided that calculates the number of instructions for a > given set of enablement mask, and these are passed as the number of credits > when initialising a DRM scheduler job. > > Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com> > Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> > Reviewed-by: Liviu Dudau <liviu.dudau@arm.com> I think just one bug remaining - see below... 
> --- > drivers/gpu/drm/panthor/panthor_device.h | 22 ++ > drivers/gpu/drm/panthor/panthor_sched.c | 328 +++++++++++++++++++---- > 2 files changed, 301 insertions(+), 49 deletions(-) > > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h > index e388c0472ba7..a48e30d0af30 100644 > --- a/drivers/gpu/drm/panthor/panthor_device.h > +++ b/drivers/gpu/drm/panthor/panthor_device.h > @@ -66,6 +66,25 @@ struct panthor_irq { > atomic_t suspended; > }; > > +/** > + * enum panthor_device_profiling_mode - Profiling state > + */ > +enum panthor_device_profiling_flags { > + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */ > + PANTHOR_DEVICE_PROFILING_DISABLED = 0, > + > + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */ > + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0), > + > + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */ > + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1), > + > + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */ > + PANTHOR_DEVICE_PROFILING_ALL = > + PANTHOR_DEVICE_PROFILING_CYCLES | > + PANTHOR_DEVICE_PROFILING_TIMESTAMP, > +}; > + > /** > * struct panthor_device - Panthor device > */ > @@ -162,6 +181,9 @@ struct panthor_device { > */ > struct page *dummy_latest_flush; > } pm; > + > + /** @profile_mask: User-set profiling flags for job accounting. 
*/ > + u32 profile_mask; > }; > > /** > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c > index 42afdf0ddb7e..6da5c3d0015e 100644 > --- a/drivers/gpu/drm/panthor/panthor_sched.c > +++ b/drivers/gpu/drm/panthor/panthor_sched.c > @@ -93,6 +93,9 @@ > #define MIN_CSGS 3 > #define MAX_CSG_PRIO 0xf > > +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64)) > +#define MAX_INSTRS_PER_JOB 24 > + > struct panthor_group; > > /** > @@ -476,6 +479,18 @@ struct panthor_queue { > */ > struct list_head in_flight_jobs; > } fence_ctx; > + > + /** @profiling: Job profiling data slots and access information. */ > + struct { > + /** @slots: Kernel BO holding the slots. */ > + struct panthor_kernel_bo *slots; > + > + /** @slot_count: Number of jobs ringbuffer can hold at once. */ > + u32 slot_count; > + > + /** @seqno: Index of the next available profiling information slot. */ > + u32 seqno; > + } profiling; > }; > > /** > @@ -661,6 +676,18 @@ struct panthor_group { > struct list_head wait_node; > }; > > +struct panthor_job_profiling_data { > + struct { > + u64 before; > + u64 after; > + } cycles; > + > + struct { > + u64 before; > + u64 after; > + } time; > +}; > + > /** > * group_queue_work() - Queue a group work > * @group: Group to queue the work for. > @@ -774,6 +801,15 @@ struct panthor_job { > > /** @done_fence: Fence signaled when the job is finished or cancelled. */ > struct dma_fence *done_fence; > + > + /** @profiling: Job profiling information. */ > + struct { > + /** @mask: Current device job profiling enablement bitmask. */ > + u32 mask; > + > + /** @slot: Job index in the profiling slots BO. 
*/ > + u32 slot; > + } profiling; > }; > > static void > @@ -838,6 +874,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue * > > panthor_kernel_bo_destroy(queue->ringbuf); > panthor_kernel_bo_destroy(queue->iface.mem); > + panthor_kernel_bo_destroy(queue->profiling.slots); > > /* Release the last_fence we were holding, if any. */ > dma_fence_put(queue->fence_ctx.last_fence); > @@ -1982,8 +2019,6 @@ tick_ctx_init(struct panthor_scheduler *sched, > } > } > > -#define NUM_INSTRS_PER_SLOT 16 > - > static void > group_term_post_processing(struct panthor_group *group) > { > @@ -2815,65 +2850,192 @@ static void group_sync_upd_work(struct work_struct *work) > group_put(group); > } > > -static struct dma_fence * > -queue_run_job(struct drm_sched_job *sched_job) > +struct panthor_job_ringbuf_instrs { > + u64 buffer[MAX_INSTRS_PER_JOB]; > + u32 count; > +}; > + > +struct panthor_job_instr { > + u32 profile_mask; > + u64 instr; > +}; > + > +#define JOB_INSTR(__prof, __instr) \ > + { \ > + .profile_mask = __prof, \ > + .instr = __instr, \ > + } > + > +static void > +copy_instrs_to_ringbuf(struct panthor_queue *queue, > + struct panthor_job *job, > + struct panthor_job_ringbuf_instrs *instrs) > +{ > + u64 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); > + u64 start = job->ringbuf.start & (ringbuf_size - 1); > + u64 size, written; > + > + /* > + * We need to write a whole slot, including any trailing zeroes > + * that may come at the end of it. 
Also, because instrs.buffer has > + * been zero-initialised, there's no need to pad it with 0's > + */ > + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); > + size = instrs->count * sizeof(u64); > + WARN_ON(size > ringbuf_size); > + written = min(ringbuf_size - start, size); > + > + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written); > + > + if (written < size) > + memcpy(queue->ringbuf->kmap, > + &instrs->buffer[written/sizeof(u64)], > + size - written); > +} > + > +struct panthor_job_cs_params { > + u32 profile_mask; > + u64 addr_reg; u64 val_reg; > + u64 cycle_reg; u64 time_reg; > + u64 sync_addr; u64 times_addr; > + u64 cs_start; u64 cs_size; > + u32 last_flush; u32 waitall_mask; > +}; > + > +static void > +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params) > { > - struct panthor_job *job = container_of(sched_job, struct panthor_job, base); > struct panthor_group *group = job->group; > struct panthor_queue *queue = group->queues[job->queue_idx]; > struct panthor_device *ptdev = group->ptdev; > struct panthor_scheduler *sched = ptdev->scheduler; > - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); > - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1); > - u64 addr_reg = ptdev->csif_info.cs_reg_count - > - ptdev->csif_info.unpreserved_cs_reg_count; > - u64 val_reg = addr_reg + 2; > - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + > - job->queue_idx * sizeof(struct panthor_syncobj_64b); > - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); > - struct dma_fence *done_fence; > - int ret; > > - u64 call_instrs[NUM_INSTRS_PER_SLOT] = { > - /* MOV32 rX+2, cs.latest_flush */ > - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush, > + params->addr_reg = ptdev->csif_info.cs_reg_count - > + ptdev->csif_info.unpreserved_cs_reg_count; > + params->val_reg = params->addr_reg + 2; > + params->cycle_reg = params->addr_reg; > + params->time_reg = params->val_reg; > 
> - /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ > - (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233, > + params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + > + job->queue_idx * sizeof(struct panthor_syncobj_64b); > + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) + > + (job->profiling.slot * sizeof(struct panthor_job_profiling_data)); > + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); > > - /* MOV48 rX:rX+1, cs.start */ > - (1ull << 56) | (addr_reg << 48) | job->call_info.start, > + params->cs_start = job->call_info.start; > + params->cs_size = job->call_info.size; > + params->last_flush = job->call_info.latest_flush; > > - /* MOV32 rX+2, cs.size */ > - (2ull << 56) | (val_reg << 48) | job->call_info.size, > + params->profile_mask = job->profiling.mask; > +} > > - /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ > - (3ull << 56) | (1 << 16), > +#define JOB_INSTR_ALWAYS(instr) \ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (instr)) > +#define JOB_INSTR_TIMESTAMP(instr) \ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, (instr)) > +#define JOB_INSTR_CYCLES(instr) \ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, (instr)) > > +static void > +prepare_job_instrs(const struct panthor_job_cs_params *params, > + struct panthor_job_ringbuf_instrs *instrs) > +{ > + const struct panthor_job_instr instr_seq[] = { > + /* MOV32 rX+2, cs.latest_flush */ > + JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->last_flush), > + /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ > + JOB_INSTR_ALWAYS((36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233), > + /* MOV48 rX:rX+1, cycles_offset */ > + JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) | > + (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.before))), > + /* STORE_STATE cycles */ > + JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), > + /* MOV48 
rX:rX+1, time_offset */ > + JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | (params->times_addr + > + offsetof(struct panthor_job_profiling_data, time.before))), > + /* STORE_STATE timer */ > + JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)), > + /* MOV48 rX:rX+1, cs.start */ > + JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->cs_start), > + /* MOV32 rX+2, cs.size */ > + JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->cs_size), > + /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ > + JOB_INSTR_ALWAYS((3ull << 56) | (1 << 16)), > /* CALL rX:rX+1, rX+2 */ > - (32ull << 56) | (addr_reg << 40) | (val_reg << 32), > - > + JOB_INSTR_ALWAYS((32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)), > + /* MOV48 rX:rX+1, cycles_offset */ > + JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) | > + (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.after))), > + /* STORE_STATE cycles */ > + JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), > + /* MOV48 rX:rX+1, time_offset */ > + JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | > + (params->times_addr + offsetof(struct panthor_job_profiling_data, time.after))), > + /* STORE_STATE timer */ > + JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)), > /* MOV48 rX:rX+1, sync_addr */ > - (1ull << 56) | (addr_reg << 48) | sync_addr, > - > + JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->sync_addr), > /* MOV48 rX+2, #1 */ > - (1ull << 56) | (val_reg << 48) | 1, > - > + JOB_INSTR_ALWAYS((1ull << 56) | (params->val_reg << 48) | 1), > /* WAIT(all) */ > - (3ull << 56) | (waitall_mask << 16), > - > + JOB_INSTR_ALWAYS((3ull << 56) | (params->waitall_mask << 16)), > /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/ > - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1, > + JOB_INSTR_ALWAYS((51ull 
<< 56) | (0ull << 48) | (params->addr_reg << 40) | > + (params->val_reg << 32) | (0 << 16) | 1), > + /* ERROR_BARRIER, so we can recover from faults at job boundaries. */ > + JOB_INSTR_ALWAYS((47ull << 56)), > + }; > + u32 pad; > > - /* ERROR_BARRIER, so we can recover from faults at job > - * boundaries. > - */ > - (47ull << 56), > + /* NEED to be cacheline aligned to please the prefetcher. */ > + static_assert(sizeof(instrs->buffer) % 64 == 0, > + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline"); > + > + /* Make sure we have enough storage to store the whole sequence. */ > + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) == > + ARRAY_SIZE(instrs->buffer), > + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch"); > + > + for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) { > + /* If the profile mask of this instruction is not enabled, skip it. */ > + if (instr_seq[i].profile_mask && > + !(instr_seq[i].profile_mask & params->profile_mask)) > + continue; > + > + instrs->buffer[instrs->count++] = instr_seq[i].instr; > + } > + > + pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); > + memset(&instrs->buffer[instrs->count], 0, > + (pad - instrs->count) * sizeof(instrs->buffer[0])); > + instrs->count = pad; > +} > + > +static u32 calc_job_credits(u32 profile_mask) > +{ > + struct panthor_job_ringbuf_instrs instrs = { > + .count = 0, > + }; > + struct panthor_job_cs_params params = { > + .profile_mask = profile_mask, > }; > > - /* Need to be cacheline aligned to please the prefetcher. 
*/ > - static_assert(sizeof(call_instrs) % 64 == 0, > - "call_instrs is not aligned on a cacheline"); > + prepare_job_instrs(&params, &instrs); > + return instrs.count; > +} > + > +static struct dma_fence * > +queue_run_job(struct drm_sched_job *sched_job) > +{ > + struct panthor_job *job = container_of(sched_job, struct panthor_job, base); > + struct panthor_group *group = job->group; > + struct panthor_queue *queue = group->queues[job->queue_idx]; > + struct panthor_device *ptdev = group->ptdev; > + struct panthor_scheduler *sched = ptdev->scheduler; > + struct panthor_job_ringbuf_instrs instrs; instrs isn't initialised... > + struct panthor_job_cs_params cs_params; > + struct dma_fence *done_fence; > + int ret; > > /* Stream size is zero, nothing to do except making sure all previously > * submitted jobs are done before we signal the > @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job) > queue->fence_ctx.id, > atomic64_inc_return(&queue->fence_ctx.seqno)); > > - memcpy(queue->ringbuf->kmap + ringbuf_insert, > - call_instrs, sizeof(call_instrs)); > + job->profiling.slot = queue->profiling.seqno++; > + if (queue->profiling.seqno == queue->profiling.slot_count) > + queue->profiling.seqno = 0; > + > + job->ringbuf.start = queue->iface.input->insert; > + > + get_job_cs_params(job, &cs_params); > + prepare_job_instrs(&cs_params, &instrs); ...but it's passed into prepare_job_instrs() which depends on instrs.count (same bug as was in calc_job_credits()) - sorry I didn't spot it last review. Initializing instrs makes everything work for me. I'm not sure quite what kernel configuration you are using but I wonder if you've got a 'hardening' option enabled which is causing the stack to be zero-initialised.
It's worth turning it off for testing purposes ;) Steve > + copy_instrs_to_ringbuf(queue, job, &instrs); > + > + job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64)); > > panthor_job_get(&job->base); > spin_lock(&queue->fence_ctx.lock); > list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs); > spin_unlock(&queue->fence_ctx.lock); > > - job->ringbuf.start = queue->iface.input->insert; > - job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs); > - > /* Make sure the ring buffer is updated before the INSERT > * register. > */ > @@ -3003,6 +3171,34 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = { > .free_job = queue_free_job, > }; > > +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev, > + u32 cs_ringbuf_size) > +{ > + u32 min_profiled_job_instrs = U32_MAX; > + u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL); > + > + /* > + * We want to calculate the minimum size of a profiled job's CS, > + * because since they need additional instructions for the sampling > + * of performance metrics, they might take up further slots in > + * the queue's ringbuffer. This means we might not need as many job > + * slots for keeping track of their profiling information. What we > + * need is the maximum number of slots we should allocate to this end, > + * which matches the maximum number of profiled jobs we can place > + * simultaneously in the queue's ring buffer. > + * That has to be calculated separately for every single job profiling > + * flag, but not in the case job profiling is disabled, since unprofiled > + * jobs don't need to keep track of this at all. 
> + */ > + for (u32 i = 0; i < last_flag; i++) { > + if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL) > + min_profiled_job_instrs = > + min(min_profiled_job_instrs, calc_job_credits(BIT(i))); > + } > + > + return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64)); > +} > + > static struct panthor_queue * > group_create_queue(struct panthor_group *group, > const struct drm_panthor_queue_create *args) > @@ -3056,9 +3252,35 @@ group_create_queue(struct panthor_group *group, > goto err_free_queue; > } > > + queue->profiling.slot_count = > + calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size); > + > + queue->profiling.slots = > + panthor_kernel_bo_create(group->ptdev, group->vm, > + queue->profiling.slot_count * > + sizeof(struct panthor_job_profiling_data), > + DRM_PANTHOR_BO_NO_MMAP, > + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | > + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, > + PANTHOR_VM_KERNEL_AUTO_VA); > + > + if (IS_ERR(queue->profiling.slots)) { > + ret = PTR_ERR(queue->profiling.slots); > + goto err_free_queue; > + } > + > + ret = panthor_kernel_bo_vmap(queue->profiling.slots); > + if (ret) > + goto err_free_queue; > + > + /* > + * Credit limit argument tells us the total number of instructions > + * across all CS slots in the ringbuffer, with some jobs requiring > + * twice as many as others, depending on their profiling status. 
> + */ > ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops, > group->ptdev->scheduler->wq, 1, > - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)), > + args->ringbuf_size / sizeof(u64), > 0, msecs_to_jiffies(JOB_TIMEOUT_MS), > group->ptdev->reset.wq, > NULL, "panthor-queue", group->ptdev->base.dev); > @@ -3354,6 +3576,7 @@ panthor_job_create(struct panthor_file *pfile, > { > struct panthor_group_pool *gpool = pfile->groups; > struct panthor_job *job; > + u32 credits; > int ret; > > if (qsubmit->pad) > @@ -3407,9 +3630,16 @@ panthor_job_create(struct panthor_file *pfile, > } > } > > + job->profiling.mask = pfile->ptdev->profile_mask; > + credits = calc_job_credits(job->profiling.mask); > + if (credits == 0) { > + ret = -EINVAL; > + goto err_put_job; > + } > + > ret = drm_sched_job_init(&job->base, > &job->group->queues[job->queue_idx]->entity, > - 1, job->group); > + credits, job->group); > if (ret) > goto err_put_job; >
On Mon, 23 Sep 2024 10:07:14 +0100 Steven Price <steven.price@arm.com> wrote: > > +static struct dma_fence * > > +queue_run_job(struct drm_sched_job *sched_job) > > +{ > > + struct panthor_job *job = container_of(sched_job, struct panthor_job, base); > > + struct panthor_group *group = job->group; > > + struct panthor_queue *queue = group->queues[job->queue_idx]; > > + struct panthor_device *ptdev = group->ptdev; > > + struct panthor_scheduler *sched = ptdev->scheduler; > > + struct panthor_job_ringbuf_instrs instrs; > > instrs isn't initialised... > > > + struct panthor_job_cs_params cs_params; > > + struct dma_fence *done_fence; > > + int ret; > > > > /* Stream size is zero, nothing to do except making sure all previously > > * submitted jobs are done before we signal the > > @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job) > > queue->fence_ctx.id, > > atomic64_inc_return(&queue->fence_ctx.seqno)); > > > > - memcpy(queue->ringbuf->kmap + ringbuf_insert, > > - call_instrs, sizeof(call_instrs)); > > + job->profiling.slot = queue->profiling.seqno++; > > + if (queue->profiling.seqno == queue->profiling.slot_count) > > + queue->profiling.seqno = 0; > > + > > + job->ringbuf.start = queue->iface.input->insert; > > + > > + get_job_cs_params(job, &cs_params); > > + prepare_job_instrs(&cs_params, &instrs); > > ...but it's passed into prepare_job_instrs() which depends on > instrs.count (same bug as was in calc_job_credits()) - sorry I didn't > spot it last review. Hm, can't we initialize instr::count to zero in prepare_job_instrs() instead?
On 23/09/2024 11:18, Boris Brezillon wrote: > On Mon, 23 Sep 2024 10:07:14 +0100 > Steven Price <steven.price@arm.com> wrote: > >>> +static struct dma_fence * >>> +queue_run_job(struct drm_sched_job *sched_job) >>> +{ >>> + struct panthor_job *job = container_of(sched_job, struct panthor_job, base); >>> + struct panthor_group *group = job->group; >>> + struct panthor_queue *queue = group->queues[job->queue_idx]; >>> + struct panthor_device *ptdev = group->ptdev; >>> + struct panthor_scheduler *sched = ptdev->scheduler; >>> + struct panthor_job_ringbuf_instrs instrs; >> >> instrs isn't initialised... >> >>> + struct panthor_job_cs_params cs_params; >>> + struct dma_fence *done_fence; >>> + int ret; >>> >>> /* Stream size is zero, nothing to do except making sure all previously >>> * submitted jobs are done before we signal the >>> @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job) >>> queue->fence_ctx.id, >>> atomic64_inc_return(&queue->fence_ctx.seqno)); >>> >>> - memcpy(queue->ringbuf->kmap + ringbuf_insert, >>> - call_instrs, sizeof(call_instrs)); >>> + job->profiling.slot = queue->profiling.seqno++; >>> + if (queue->profiling.seqno == queue->profiling.slot_count) >>> + queue->profiling.seqno = 0; >>> + >>> + job->ringbuf.start = queue->iface.input->insert; >>> + >>> + get_job_cs_params(job, &cs_params); >>> + prepare_job_instrs(&cs_params, &instrs); >> >> ...but it's passed into prepare_job_instrs() which depends on >> instrs.count (same bug as was in calc_job_credits()) - sorry I didn't >> spot it last review. > > Hm, can't we initialize instr::count to zero in prepare_job_instrs() > instead? Indeed that would probably be better! I hadn't noticed there were two places in the previous review. Steve
On 23.09.2024 10:07, Steven Price wrote: >On 21/09/2024 00:43, Adrián Larumbe wrote: >> Enable calculations of job submission times in clock cycles and wall >> time. This is done by expanding the boilerplate command stream when running >> a job to include instructions that compute said times right before and >> after a user CS. >> >> A separate kernel BO is created per queue to store those values. Jobs can >> access their sampled data through an index different from that of the >> queue's ringbuffer. The reason for this is saving memory on the profiling >> information kernel BO, since the amount of simultaneous profiled jobs we >> can write into the queue's ringbuffer might be much smaller than for >> regular jobs, as the former take more CSF instructions. >> >> This commit is done in preparation for enabling DRM fdinfo support in the >> Panthor driver, which depends on the numbers calculated herein. >> >> A profile mode mask has been added that will in a future commit allow UM to >> toggle performance metric sampling behaviour, which is disabled by default >> to save power. When a ringbuffer CS is constructed, timestamp and cycling >> sampling instructions are added depending on the enabled flags in the >> profiling mask. >> >> A helper was provided that calculates the number of instructions for a >> given set of enablement mask, and these are passed as the number of credits >> when initialising a DRM scheduler job. >> >> Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com> >> Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> >> Reviewed-by: Liviu Dudau <liviu.dudau@arm.com> > >I think just one bug remaining - see below... 
> >> --- >> drivers/gpu/drm/panthor/panthor_device.h | 22 ++ >> drivers/gpu/drm/panthor/panthor_sched.c | 328 +++++++++++++++++++---- >> 2 files changed, 301 insertions(+), 49 deletions(-) >> >> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h >> index e388c0472ba7..a48e30d0af30 100644 >> --- a/drivers/gpu/drm/panthor/panthor_device.h >> +++ b/drivers/gpu/drm/panthor/panthor_device.h >> @@ -66,6 +66,25 @@ struct panthor_irq { >> atomic_t suspended; >> }; >> >> +/** >> + * enum panthor_device_profiling_mode - Profiling state >> + */ >> +enum panthor_device_profiling_flags { >> + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */ >> + PANTHOR_DEVICE_PROFILING_DISABLED = 0, >> + >> + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */ >> + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0), >> + >> + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */ >> + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1), >> + >> + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */ >> + PANTHOR_DEVICE_PROFILING_ALL = >> + PANTHOR_DEVICE_PROFILING_CYCLES | >> + PANTHOR_DEVICE_PROFILING_TIMESTAMP, >> +}; >> + >> /** >> * struct panthor_device - Panthor device >> */ >> @@ -162,6 +181,9 @@ struct panthor_device { >> */ >> struct page *dummy_latest_flush; >> } pm; >> + >> + /** @profile_mask: User-set profiling flags for job accounting. 
*/ >> + u32 profile_mask; >> }; >> >> /** >> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c >> index 42afdf0ddb7e..6da5c3d0015e 100644 >> --- a/drivers/gpu/drm/panthor/panthor_sched.c >> +++ b/drivers/gpu/drm/panthor/panthor_sched.c >> @@ -93,6 +93,9 @@ >> #define MIN_CSGS 3 >> #define MAX_CSG_PRIO 0xf >> >> +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64)) >> +#define MAX_INSTRS_PER_JOB 24 >> + >> struct panthor_group; >> >> /** >> @@ -476,6 +479,18 @@ struct panthor_queue { >> */ >> struct list_head in_flight_jobs; >> } fence_ctx; >> + >> + /** @profiling: Job profiling data slots and access information. */ >> + struct { >> + /** @slots: Kernel BO holding the slots. */ >> + struct panthor_kernel_bo *slots; >> + >> + /** @slot_count: Number of jobs ringbuffer can hold at once. */ >> + u32 slot_count; >> + >> + /** @seqno: Index of the next available profiling information slot. */ >> + u32 seqno; >> + } profiling; >> }; >> >> /** >> @@ -661,6 +676,18 @@ struct panthor_group { >> struct list_head wait_node; >> }; >> >> +struct panthor_job_profiling_data { >> + struct { >> + u64 before; >> + u64 after; >> + } cycles; >> + >> + struct { >> + u64 before; >> + u64 after; >> + } time; >> +}; >> + >> /** >> * group_queue_work() - Queue a group work >> * @group: Group to queue the work for. >> @@ -774,6 +801,15 @@ struct panthor_job { >> >> /** @done_fence: Fence signaled when the job is finished or cancelled. */ >> struct dma_fence *done_fence; >> + >> + /** @profiling: Job profiling information. */ >> + struct { >> + /** @mask: Current device job profiling enablement bitmask. */ >> + u32 mask; >> + >> + /** @slot: Job index in the profiling slots BO. 
*/ >> + u32 slot; >> + } profiling; >> }; >> >> static void >> @@ -838,6 +874,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue * >> >> panthor_kernel_bo_destroy(queue->ringbuf); >> panthor_kernel_bo_destroy(queue->iface.mem); >> + panthor_kernel_bo_destroy(queue->profiling.slots); >> >> /* Release the last_fence we were holding, if any. */ >> dma_fence_put(queue->fence_ctx.last_fence); >> @@ -1982,8 +2019,6 @@ tick_ctx_init(struct panthor_scheduler *sched, >> } >> } >> >> -#define NUM_INSTRS_PER_SLOT 16 >> - >> static void >> group_term_post_processing(struct panthor_group *group) >> { >> @@ -2815,65 +2850,192 @@ static void group_sync_upd_work(struct work_struct *work) >> group_put(group); >> } >> >> -static struct dma_fence * >> -queue_run_job(struct drm_sched_job *sched_job) >> +struct panthor_job_ringbuf_instrs { >> + u64 buffer[MAX_INSTRS_PER_JOB]; >> + u32 count; >> +}; >> + >> +struct panthor_job_instr { >> + u32 profile_mask; >> + u64 instr; >> +}; >> + >> +#define JOB_INSTR(__prof, __instr) \ >> + { \ >> + .profile_mask = __prof, \ >> + .instr = __instr, \ >> + } >> + >> +static void >> +copy_instrs_to_ringbuf(struct panthor_queue *queue, >> + struct panthor_job *job, >> + struct panthor_job_ringbuf_instrs *instrs) >> +{ >> + u64 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); >> + u64 start = job->ringbuf.start & (ringbuf_size - 1); >> + u64 size, written; >> + >> + /* >> + * We need to write a whole slot, including any trailing zeroes >> + * that may come at the end of it. 
Also, because instrs.buffer has >> + * been zero-initialised, there's no need to pad it with 0's >> + */ >> + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); >> + size = instrs->count * sizeof(u64); >> + WARN_ON(size > ringbuf_size); >> + written = min(ringbuf_size - start, size); >> + >> + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written); >> + >> + if (written < size) >> + memcpy(queue->ringbuf->kmap, >> + &instrs->buffer[written/sizeof(u64)], >> + size - written); >> +} >> + >> +struct panthor_job_cs_params { >> + u32 profile_mask; >> + u64 addr_reg; u64 val_reg; >> + u64 cycle_reg; u64 time_reg; >> + u64 sync_addr; u64 times_addr; >> + u64 cs_start; u64 cs_size; >> + u32 last_flush; u32 waitall_mask; >> +}; >> + >> +static void >> +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params) >> { >> - struct panthor_job *job = container_of(sched_job, struct panthor_job, base); >> struct panthor_group *group = job->group; >> struct panthor_queue *queue = group->queues[job->queue_idx]; >> struct panthor_device *ptdev = group->ptdev; >> struct panthor_scheduler *sched = ptdev->scheduler; >> - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); >> - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1); >> - u64 addr_reg = ptdev->csif_info.cs_reg_count - >> - ptdev->csif_info.unpreserved_cs_reg_count; >> - u64 val_reg = addr_reg + 2; >> - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + >> - job->queue_idx * sizeof(struct panthor_syncobj_64b); >> - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); >> - struct dma_fence *done_fence; >> - int ret; >> >> - u64 call_instrs[NUM_INSTRS_PER_SLOT] = { >> - /* MOV32 rX+2, cs.latest_flush */ >> - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush, >> + params->addr_reg = ptdev->csif_info.cs_reg_count - >> + ptdev->csif_info.unpreserved_cs_reg_count; >> + params->val_reg = params->addr_reg + 2; >> + params->cycle_reg = 
params->addr_reg; >> + params->time_reg = params->val_reg; >> >> - /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ >> - (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233, >> + params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + >> + job->queue_idx * sizeof(struct panthor_syncobj_64b); >> + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) + >> + (job->profiling.slot * sizeof(struct panthor_job_profiling_data)); >> + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); >> >> - /* MOV48 rX:rX+1, cs.start */ >> - (1ull << 56) | (addr_reg << 48) | job->call_info.start, >> + params->cs_start = job->call_info.start; >> + params->cs_size = job->call_info.size; >> + params->last_flush = job->call_info.latest_flush; >> >> - /* MOV32 rX+2, cs.size */ >> - (2ull << 56) | (val_reg << 48) | job->call_info.size, >> + params->profile_mask = job->profiling.mask; >> +} >> >> - /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ >> - (3ull << 56) | (1 << 16), >> +#define JOB_INSTR_ALWAYS(instr) \ >> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (instr)) >> +#define JOB_INSTR_TIMESTAMP(instr) \ >> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, (instr)) >> +#define JOB_INSTR_CYCLES(instr) \ >> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, (instr)) >> >> +static void >> +prepare_job_instrs(const struct panthor_job_cs_params *params, >> + struct panthor_job_ringbuf_instrs *instrs) >> +{ >> + const struct panthor_job_instr instr_seq[] = { >> + /* MOV32 rX+2, cs.latest_flush */ >> + JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->last_flush), >> + /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ >> + JOB_INSTR_ALWAYS((36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233), >> + /* MOV48 rX:rX+1, cycles_offset */ >> + JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) | >> + (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.before))), >> + /* 
STORE_STATE cycles */ >> + JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), >> + /* MOV48 rX:rX+1, time_offset */ >> + JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | (params->times_addr + >> + offsetof(struct panthor_job_profiling_data, time.before))), >> + /* STORE_STATE timer */ >> + JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)), >> + /* MOV48 rX:rX+1, cs.start */ >> + JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->cs_start), >> + /* MOV32 rX+2, cs.size */ >> + JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->cs_size), >> + /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ >> + JOB_INSTR_ALWAYS((3ull << 56) | (1 << 16)), >> /* CALL rX:rX+1, rX+2 */ >> - (32ull << 56) | (addr_reg << 40) | (val_reg << 32), >> - >> + JOB_INSTR_ALWAYS((32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)), >> + /* MOV48 rX:rX+1, cycles_offset */ >> + JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) | >> + (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.after))), >> + /* STORE_STATE cycles */ >> + JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), >> + /* MOV48 rX:rX+1, time_offset */ >> + JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | >> + (params->times_addr + offsetof(struct panthor_job_profiling_data, time.after))), >> + /* STORE_STATE timer */ >> + JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)), >> /* MOV48 rX:rX+1, sync_addr */ >> - (1ull << 56) | (addr_reg << 48) | sync_addr, >> - >> + JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->sync_addr), >> /* MOV48 rX+2, #1 */ >> - (1ull << 56) | (val_reg << 48) | 1, >> - >> + JOB_INSTR_ALWAYS((1ull << 56) | (params->val_reg << 48) | 1), >> /* WAIT(all) */ >> - (3ull << 56) | (waitall_mask << 16), >> - >> + JOB_INSTR_ALWAYS((3ull << 56) | (params->waitall_mask << 16)), >> /* 
SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/ >> - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1, >> + JOB_INSTR_ALWAYS((51ull << 56) | (0ull << 48) | (params->addr_reg << 40) | >> + (params->val_reg << 32) | (0 << 16) | 1), >> + /* ERROR_BARRIER, so we can recover from faults at job boundaries. */ >> + JOB_INSTR_ALWAYS((47ull << 56)), >> + }; >> + u32 pad; >> >> - /* ERROR_BARRIER, so we can recover from faults at job >> - * boundaries. >> - */ >> - (47ull << 56), >> + /* NEED to be cacheline aligned to please the prefetcher. */ >> + static_assert(sizeof(instrs->buffer) % 64 == 0, >> + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline"); >> + >> + /* Make sure we have enough storage to store the whole sequence. */ >> + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) == >> + ARRAY_SIZE(instrs->buffer), >> + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch"); >> + >> + for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) { >> + /* If the profile mask of this instruction is not enabled, skip it. */ >> + if (instr_seq[i].profile_mask && >> + !(instr_seq[i].profile_mask & params->profile_mask)) >> + continue; >> + >> + instrs->buffer[instrs->count++] = instr_seq[i].instr; >> + } >> + >> + pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); >> + memset(&instrs->buffer[instrs->count], 0, >> + (pad - instrs->count) * sizeof(instrs->buffer[0])); >> + instrs->count = pad; >> +} >> + >> +static u32 calc_job_credits(u32 profile_mask) >> +{ >> + struct panthor_job_ringbuf_instrs instrs = { >> + .count = 0, >> + }; >> + struct panthor_job_cs_params params = { >> + .profile_mask = profile_mask, >> }; >> >> - /* Need to be cacheline aligned to please the prefetcher. 
*/ >> - static_assert(sizeof(call_instrs) % 64 == 0, >> - "call_instrs is not aligned on a cacheline"); >> + prepare_job_instrs(&params, &instrs); >> + return instrs.count; >> +} >> + >> +static struct dma_fence * >> +queue_run_job(struct drm_sched_job *sched_job) >> +{ >> + struct panthor_job *job = container_of(sched_job, struct panthor_job, base); >> + struct panthor_group *group = job->group; >> + struct panthor_queue *queue = group->queues[job->queue_idx]; >> + struct panthor_device *ptdev = group->ptdev; >> + struct panthor_scheduler *sched = ptdev->scheduler; >> + struct panthor_job_ringbuf_instrs instrs; > >instrs isn't initialised... > >> + struct panthor_job_cs_params cs_params; >> + struct dma_fence *done_fence; >> + int ret; >> >> /* Stream size is zero, nothing to do except making sure all previously >> * submitted jobs are done before we signal the >> @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job) >> queue->fence_ctx.id, >> atomic64_inc_return(&queue->fence_ctx.seqno)); >> >> - memcpy(queue->ringbuf->kmap + ringbuf_insert, >> - call_instrs, sizeof(call_instrs)); >> + job->profiling.slot = queue->profiling.seqno++; >> + if (queue->profiling.seqno == queue->profiling.slot_count) >> + queue->profiling.seqno = 0; >> + >> + job->ringbuf.start = queue->iface.input->insert; >> + >> + get_job_cs_params(job, &cs_params); >> + prepare_job_instrs(&cs_params, &instrs); > >...but it's passed into prepare_job_instrs() which depends on >instrs.count (same bug as was in calc_job_credits()) - sorry I didn't >spot it last review. > >Initializing instrs makes everything work for me. > >I'm not sure quite what kernel configuration you are using but I wonder >if you've got a 'hardening' option enabled which is causing the stack to >be zero-initialised. It's worth turning it off for testing purposes ;) Thanks for catching this, it went completely unnoticed to me. Delving into my kernel config, I found this option: CONFIG_INIT_STACK_ALL_ZERO. 
When I unset it, it triggers an invalid memory reference in the kernel, I guess because of uninitialised stack variables. I don't know why I had this option enabled, but come to think of it seems like a terrible idea. Thanks for bringing this up. >Steve > >> + copy_instrs_to_ringbuf(queue, job, &instrs); >> + >> + job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64)); >> >> panthor_job_get(&job->base); >> spin_lock(&queue->fence_ctx.lock); >> list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs); >> spin_unlock(&queue->fence_ctx.lock); >> >> - job->ringbuf.start = queue->iface.input->insert; >> - job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs); >> - >> /* Make sure the ring buffer is updated before the INSERT >> * register. >> */ >> @@ -3003,6 +3171,34 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = { >> .free_job = queue_free_job, >> }; >> >> +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev, >> + u32 cs_ringbuf_size) >> +{ >> + u32 min_profiled_job_instrs = U32_MAX; >> + u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL); >> + >> + /* >> + * We want to calculate the minimum size of a profiled job's CS, >> + * because since they need additional instructions for the sampling >> + * of performance metrics, they might take up further slots in >> + * the queue's ringbuffer. This means we might not need as many job >> + * slots for keeping track of their profiling information. What we >> + * need is the maximum number of slots we should allocate to this end, >> + * which matches the maximum number of profiled jobs we can place >> + * simultaneously in the queue's ring buffer. >> + * That has to be calculated separately for every single job profiling >> + * flag, but not in the case job profiling is disabled, since unprofiled >> + * jobs don't need to keep track of this at all. 
>> + */ >> + for (u32 i = 0; i < last_flag; i++) { >> + if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL) >> + min_profiled_job_instrs = >> + min(min_profiled_job_instrs, calc_job_credits(BIT(i))); >> + } >> + >> + return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64)); >> +} >> + >> static struct panthor_queue * >> group_create_queue(struct panthor_group *group, >> const struct drm_panthor_queue_create *args) >> @@ -3056,9 +3252,35 @@ group_create_queue(struct panthor_group *group, >> goto err_free_queue; >> } >> >> + queue->profiling.slot_count = >> + calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size); >> + >> + queue->profiling.slots = >> + panthor_kernel_bo_create(group->ptdev, group->vm, >> + queue->profiling.slot_count * >> + sizeof(struct panthor_job_profiling_data), >> + DRM_PANTHOR_BO_NO_MMAP, >> + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | >> + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, >> + PANTHOR_VM_KERNEL_AUTO_VA); >> + >> + if (IS_ERR(queue->profiling.slots)) { >> + ret = PTR_ERR(queue->profiling.slots); >> + goto err_free_queue; >> + } >> + >> + ret = panthor_kernel_bo_vmap(queue->profiling.slots); >> + if (ret) >> + goto err_free_queue; >> + >> + /* >> + * Credit limit argument tells us the total number of instructions >> + * across all CS slots in the ringbuffer, with some jobs requiring >> + * twice as many as others, depending on their profiling status. 
>> + */ >> ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops, >> group->ptdev->scheduler->wq, 1, >> - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)), >> + args->ringbuf_size / sizeof(u64), >> 0, msecs_to_jiffies(JOB_TIMEOUT_MS), >> group->ptdev->reset.wq, >> NULL, "panthor-queue", group->ptdev->base.dev); >> @@ -3354,6 +3576,7 @@ panthor_job_create(struct panthor_file *pfile, >> { >> struct panthor_group_pool *gpool = pfile->groups; >> struct panthor_job *job; >> + u32 credits; >> int ret; >> >> if (qsubmit->pad) >> @@ -3407,9 +3630,16 @@ panthor_job_create(struct panthor_file *pfile, >> } >> } >> >> + job->profiling.mask = pfile->ptdev->profile_mask; >> + credits = calc_job_credits(job->profiling.mask); >> + if (credits == 0) { >> + ret = -EINVAL; >> + goto err_put_job; >> + } >> + >> ret = drm_sched_job_init(&job->base, >> &job->group->queues[job->queue_idx]->entity, >> - 1, job->group); >> + credits, job->group); >> if (ret) >> goto err_put_job; >>
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h index e388c0472ba7..a48e30d0af30 100644 --- a/drivers/gpu/drm/panthor/panthor_device.h +++ b/drivers/gpu/drm/panthor/panthor_device.h @@ -66,6 +66,25 @@ struct panthor_irq { atomic_t suspended; }; +/** + * enum panthor_device_profiling_mode - Profiling state + */ +enum panthor_device_profiling_flags { + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */ + PANTHOR_DEVICE_PROFILING_DISABLED = 0, + + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */ + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0), + + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */ + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1), + + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */ + PANTHOR_DEVICE_PROFILING_ALL = + PANTHOR_DEVICE_PROFILING_CYCLES | + PANTHOR_DEVICE_PROFILING_TIMESTAMP, +}; + /** * struct panthor_device - Panthor device */ @@ -162,6 +181,9 @@ struct panthor_device { */ struct page *dummy_latest_flush; } pm; + + /** @profile_mask: User-set profiling flags for job accounting. */ + u32 profile_mask; }; /** diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index 42afdf0ddb7e..6da5c3d0015e 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -93,6 +93,9 @@ #define MIN_CSGS 3 #define MAX_CSG_PRIO 0xf +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64)) +#define MAX_INSTRS_PER_JOB 24 + struct panthor_group; /** @@ -476,6 +479,18 @@ struct panthor_queue { */ struct list_head in_flight_jobs; } fence_ctx; + + /** @profiling: Job profiling data slots and access information. */ + struct { + /** @slots: Kernel BO holding the slots. */ + struct panthor_kernel_bo *slots; + + /** @slot_count: Number of jobs ringbuffer can hold at once. */ + u32 slot_count; + + /** @seqno: Index of the next available profiling information slot. 
*/ + u32 seqno; + } profiling; }; /** @@ -661,6 +676,18 @@ struct panthor_group { struct list_head wait_node; }; +struct panthor_job_profiling_data { + struct { + u64 before; + u64 after; + } cycles; + + struct { + u64 before; + u64 after; + } time; +}; + /** * group_queue_work() - Queue a group work * @group: Group to queue the work for. @@ -774,6 +801,15 @@ struct panthor_job { /** @done_fence: Fence signaled when the job is finished or cancelled. */ struct dma_fence *done_fence; + + /** @profiling: Job profiling information. */ + struct { + /** @mask: Current device job profiling enablement bitmask. */ + u32 mask; + + /** @slot: Job index in the profiling slots BO. */ + u32 slot; + } profiling; }; static void @@ -838,6 +874,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue * panthor_kernel_bo_destroy(queue->ringbuf); panthor_kernel_bo_destroy(queue->iface.mem); + panthor_kernel_bo_destroy(queue->profiling.slots); /* Release the last_fence we were holding, if any. 
*/ dma_fence_put(queue->fence_ctx.last_fence); @@ -1982,8 +2019,6 @@ tick_ctx_init(struct panthor_scheduler *sched, } } -#define NUM_INSTRS_PER_SLOT 16 - static void group_term_post_processing(struct panthor_group *group) { @@ -2815,65 +2850,192 @@ static void group_sync_upd_work(struct work_struct *work) group_put(group); } -static struct dma_fence * -queue_run_job(struct drm_sched_job *sched_job) +struct panthor_job_ringbuf_instrs { + u64 buffer[MAX_INSTRS_PER_JOB]; + u32 count; +}; + +struct panthor_job_instr { + u32 profile_mask; + u64 instr; +}; + +#define JOB_INSTR(__prof, __instr) \ + { \ + .profile_mask = __prof, \ + .instr = __instr, \ + } + +static void +copy_instrs_to_ringbuf(struct panthor_queue *queue, + struct panthor_job *job, + struct panthor_job_ringbuf_instrs *instrs) +{ + u64 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); + u64 start = job->ringbuf.start & (ringbuf_size - 1); + u64 size, written; + + /* + * We need to write a whole slot, including any trailing zeroes + * that may come at the end of it. 
Also, because instrs.buffer has + * been zero-initialised, there's no need to pad it with 0's + */ + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); + size = instrs->count * sizeof(u64); + WARN_ON(size > ringbuf_size); + written = min(ringbuf_size - start, size); + + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written); + + if (written < size) + memcpy(queue->ringbuf->kmap, + &instrs->buffer[written/sizeof(u64)], + size - written); +} + +struct panthor_job_cs_params { + u32 profile_mask; + u64 addr_reg; u64 val_reg; + u64 cycle_reg; u64 time_reg; + u64 sync_addr; u64 times_addr; + u64 cs_start; u64 cs_size; + u32 last_flush; u32 waitall_mask; +}; + +static void +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params) { - struct panthor_job *job = container_of(sched_job, struct panthor_job, base); struct panthor_group *group = job->group; struct panthor_queue *queue = group->queues[job->queue_idx]; struct panthor_device *ptdev = group->ptdev; struct panthor_scheduler *sched = ptdev->scheduler; - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1); - u64 addr_reg = ptdev->csif_info.cs_reg_count - - ptdev->csif_info.unpreserved_cs_reg_count; - u64 val_reg = addr_reg + 2; - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + - job->queue_idx * sizeof(struct panthor_syncobj_64b); - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); - struct dma_fence *done_fence; - int ret; - u64 call_instrs[NUM_INSTRS_PER_SLOT] = { - /* MOV32 rX+2, cs.latest_flush */ - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush, + params->addr_reg = ptdev->csif_info.cs_reg_count - + ptdev->csif_info.unpreserved_cs_reg_count; + params->val_reg = params->addr_reg + 2; + params->cycle_reg = params->addr_reg; + params->time_reg = params->val_reg; - /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ - (36ull << 56) | (0ull << 48) | (val_reg << 
40) | (0 << 16) | 0x233, + params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + + job->queue_idx * sizeof(struct panthor_syncobj_64b); + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) + + (job->profiling.slot * sizeof(struct panthor_job_profiling_data)); + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); - /* MOV48 rX:rX+1, cs.start */ - (1ull << 56) | (addr_reg << 48) | job->call_info.start, + params->cs_start = job->call_info.start; + params->cs_size = job->call_info.size; + params->last_flush = job->call_info.latest_flush; - /* MOV32 rX+2, cs.size */ - (2ull << 56) | (val_reg << 48) | job->call_info.size, + params->profile_mask = job->profiling.mask; +} - /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ - (3ull << 56) | (1 << 16), +#define JOB_INSTR_ALWAYS(instr) \ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (instr)) +#define JOB_INSTR_TIMESTAMP(instr) \ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, (instr)) +#define JOB_INSTR_CYCLES(instr) \ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, (instr)) +static void +prepare_job_instrs(const struct panthor_job_cs_params *params, + struct panthor_job_ringbuf_instrs *instrs) +{ + const struct panthor_job_instr instr_seq[] = { + /* MOV32 rX+2, cs.latest_flush */ + JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->last_flush), + /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ + JOB_INSTR_ALWAYS((36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233), + /* MOV48 rX:rX+1, cycles_offset */ + JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) | + (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.before))), + /* STORE_STATE cycles */ + JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), + /* MOV48 rX:rX+1, time_offset */ + JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | (params->times_addr + + offsetof(struct panthor_job_profiling_data, time.before))), + /* STORE_STATE 
timer */ + JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)), + /* MOV48 rX:rX+1, cs.start */ + JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->cs_start), + /* MOV32 rX+2, cs.size */ + JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->cs_size), + /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ + JOB_INSTR_ALWAYS((3ull << 56) | (1 << 16)), /* CALL rX:rX+1, rX+2 */ - (32ull << 56) | (addr_reg << 40) | (val_reg << 32), - + JOB_INSTR_ALWAYS((32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)), + /* MOV48 rX:rX+1, cycles_offset */ + JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) | + (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.after))), + /* STORE_STATE cycles */ + JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), + /* MOV48 rX:rX+1, time_offset */ + JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | + (params->times_addr + offsetof(struct panthor_job_profiling_data, time.after))), + /* STORE_STATE timer */ + JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)), /* MOV48 rX:rX+1, sync_addr */ - (1ull << 56) | (addr_reg << 48) | sync_addr, - + JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->sync_addr), /* MOV48 rX+2, #1 */ - (1ull << 56) | (val_reg << 48) | 1, - + JOB_INSTR_ALWAYS((1ull << 56) | (params->val_reg << 48) | 1), /* WAIT(all) */ - (3ull << 56) | (waitall_mask << 16), - + JOB_INSTR_ALWAYS((3ull << 56) | (params->waitall_mask << 16)), /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/ - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1, + JOB_INSTR_ALWAYS((51ull << 56) | (0ull << 48) | (params->addr_reg << 40) | + (params->val_reg << 32) | (0 << 16) | 1), + /* ERROR_BARRIER, so we can recover from faults at job boundaries. 
*/ + JOB_INSTR_ALWAYS((47ull << 56)), + }; + u32 pad; - /* ERROR_BARRIER, so we can recover from faults at job - * boundaries. - */ - (47ull << 56), + /* NEED to be cacheline aligned to please the prefetcher. */ + static_assert(sizeof(instrs->buffer) % 64 == 0, + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline"); + + /* Make sure we have enough storage to store the whole sequence. */ + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) == + ARRAY_SIZE(instrs->buffer), + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch"); + + for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) { + /* If the profile mask of this instruction is not enabled, skip it. */ + if (instr_seq[i].profile_mask && + !(instr_seq[i].profile_mask & params->profile_mask)) + continue; + + instrs->buffer[instrs->count++] = instr_seq[i].instr; + } + + pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); + memset(&instrs->buffer[instrs->count], 0, + (pad - instrs->count) * sizeof(instrs->buffer[0])); + instrs->count = pad; +} + +static u32 calc_job_credits(u32 profile_mask) +{ + struct panthor_job_ringbuf_instrs instrs = { + .count = 0, + }; + struct panthor_job_cs_params params = { + .profile_mask = profile_mask, }; - /* Need to be cacheline aligned to please the prefetcher. 
*/ - static_assert(sizeof(call_instrs) % 64 == 0, - "call_instrs is not aligned on a cacheline"); + prepare_job_instrs(&params, &instrs); + return instrs.count; +} + +static struct dma_fence * +queue_run_job(struct drm_sched_job *sched_job) +{ + struct panthor_job *job = container_of(sched_job, struct panthor_job, base); + struct panthor_group *group = job->group; + struct panthor_queue *queue = group->queues[job->queue_idx]; + struct panthor_device *ptdev = group->ptdev; + struct panthor_scheduler *sched = ptdev->scheduler; + struct panthor_job_ringbuf_instrs instrs; + struct panthor_job_cs_params cs_params; + struct dma_fence *done_fence; + int ret; /* Stream size is zero, nothing to do except making sure all previously * submitted jobs are done before we signal the @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job) queue->fence_ctx.id, atomic64_inc_return(&queue->fence_ctx.seqno)); - memcpy(queue->ringbuf->kmap + ringbuf_insert, - call_instrs, sizeof(call_instrs)); + job->profiling.slot = queue->profiling.seqno++; + if (queue->profiling.seqno == queue->profiling.slot_count) + queue->profiling.seqno = 0; + + job->ringbuf.start = queue->iface.input->insert; + + get_job_cs_params(job, &cs_params); + prepare_job_instrs(&cs_params, &instrs); + copy_instrs_to_ringbuf(queue, job, &instrs); + + job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64)); panthor_job_get(&job->base); spin_lock(&queue->fence_ctx.lock); list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs); spin_unlock(&queue->fence_ctx.lock); - job->ringbuf.start = queue->iface.input->insert; - job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs); - /* Make sure the ring buffer is updated before the INSERT * register. 
*/ @@ -3003,6 +3171,34 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = { .free_job = queue_free_job, }; +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev, + u32 cs_ringbuf_size) +{ + u32 min_profiled_job_instrs = U32_MAX; + u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL); + + /* + * We want to calculate the minimum size of a profiled job's CS, + * because since they need additional instructions for the sampling + * of performance metrics, they might take up further slots in + * the queue's ringbuffer. This means we might not need as many job + * slots for keeping track of their profiling information. What we + * need is the maximum number of slots we should allocate to this end, + * which matches the maximum number of profiled jobs we can place + * simultaneously in the queue's ring buffer. + * That has to be calculated separately for every single job profiling + * flag, but not in the case job profiling is disabled, since unprofiled + * jobs don't need to keep track of this at all. 
+ */ + for (u32 i = 0; i < last_flag; i++) { + if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL) + min_profiled_job_instrs = + min(min_profiled_job_instrs, calc_job_credits(BIT(i))); + } + + return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64)); +} + static struct panthor_queue * group_create_queue(struct panthor_group *group, const struct drm_panthor_queue_create *args) @@ -3056,9 +3252,35 @@ group_create_queue(struct panthor_group *group, goto err_free_queue; } + queue->profiling.slot_count = + calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size); + + queue->profiling.slots = + panthor_kernel_bo_create(group->ptdev, group->vm, + queue->profiling.slot_count * + sizeof(struct panthor_job_profiling_data), + DRM_PANTHOR_BO_NO_MMAP, + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, + PANTHOR_VM_KERNEL_AUTO_VA); + + if (IS_ERR(queue->profiling.slots)) { + ret = PTR_ERR(queue->profiling.slots); + goto err_free_queue; + } + + ret = panthor_kernel_bo_vmap(queue->profiling.slots); + if (ret) + goto err_free_queue; + + /* + * Credit limit argument tells us the total number of instructions + * across all CS slots in the ringbuffer, with some jobs requiring + * twice as many as others, depending on their profiling status. 
+ */ ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops, group->ptdev->scheduler->wq, 1, - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)), + args->ringbuf_size / sizeof(u64), 0, msecs_to_jiffies(JOB_TIMEOUT_MS), group->ptdev->reset.wq, NULL, "panthor-queue", group->ptdev->base.dev); @@ -3354,6 +3576,7 @@ panthor_job_create(struct panthor_file *pfile, { struct panthor_group_pool *gpool = pfile->groups; struct panthor_job *job; + u32 credits; int ret; if (qsubmit->pad) @@ -3407,9 +3630,16 @@ panthor_job_create(struct panthor_file *pfile, } } + job->profiling.mask = pfile->ptdev->profile_mask; + credits = calc_job_credits(job->profiling.mask); + if (credits == 0) { + ret = -EINVAL; + goto err_put_job; + } + ret = drm_sched_job_init(&job->base, &job->group->queues[job->queue_idx]->entity, - 1, job->group); + credits, job->group); if (ret) goto err_put_job;