diff mbox series

[v2,3/6] drm/xe: Add helper to accumulate exec queue runtime

Message ID 20240423235652.1959945-4-lucas.demarchi@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/xe: Per client usage | expand

Commit Message

Lucas De Marchi April 23, 2024, 11:56 p.m. UTC
From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>

Add a helper to accumulate per-client runtime of all its
exec queues. Currently that is done in 2 places:

	1. when the exec_queue is destroyed
	2. when the sched job is completed

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_device_types.h |  9 +++++++
 drivers/gpu/drm/xe/xe_exec_queue.c   | 37 ++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
 drivers/gpu/drm/xe/xe_sched_job.c    |  2 ++
 4 files changed, 49 insertions(+)

Comments

Matthew Brost April 24, 2024, 4:08 a.m. UTC | #1
On Tue, Apr 23, 2024 at 04:56:48PM -0700, Lucas De Marchi wrote:
> From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> 
> Add a helper to accumulate per-client runtime of all its
> exec queues. Currently that is done in 2 places:
> 
> 	1. when the exec_queue is destroyed
> 	2. when the sched job is completed
> 
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device_types.h |  9 +++++++
>  drivers/gpu/drm/xe/xe_exec_queue.c   | 37 ++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
>  drivers/gpu/drm/xe/xe_sched_job.c    |  2 ++
>  4 files changed, 49 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 2e62450d86e1..33d3bf93a2f1 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -547,6 +547,15 @@ struct xe_file {
>  		struct mutex lock;
>  	} exec_queue;
>  
> +	/**
> +	 * @runtime: hw engine class runtime in ticks for this drm client
> +	 *
> +	 * Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
> +	 * case, since all jobs run in parallel on the engines, only the stats
> +	 * from lrc[0] are sufficient.
> +	 */
> +	u64 runtime[XE_ENGINE_CLASS_MAX];
> +
>  	/** @client: drm client */
>  	struct xe_drm_client *client;
>  };
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 395de93579fa..b7b6256cb96a 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -214,6 +214,8 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
>  {
>  	int i;
>  
> +	xe_exec_queue_update_runtime(q);
> +
>  	for (i = 0; i < q->width; ++i)
>  		xe_lrc_finish(q->lrc + i);
>  	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & EXEC_QUEUE_FLAG_VM || !q->vm))
> @@ -769,6 +771,41 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
>  		q->lrc[0].fence_ctx.next_seqno - 1;
>  }
>  
> +/**
> + * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw
> + * @q: The exec queue
> + *
> + * Update the timestamp saved by HW for this exec queue and save runtime
> + * calculated by using the delta from last update. On multi-lrc case, only the
> + * first is considered.
> + */
> +void xe_exec_queue_update_runtime(struct xe_exec_queue *q)
> +{
> +	struct xe_file *xef;
> +	struct xe_lrc *lrc;
> +	u32 old_ts, new_ts;
> +
> +	/*
> +	 * Jobs that are run during driver load may use an exec_queue, but are
> +	 * not associated with a user xe file, so avoid accumulating busyness
> +	 * for kernel specific work.
> +	 */
> +	if (!q->vm || !q->vm->xef)
> +		return;
> +
> +	xef = q->vm->xef;
> +	lrc = &q->lrc[0];
> +
> +	new_ts = xe_lrc_update_timestamp(lrc, &old_ts);
> +
> +	/*
> +	 * Special case the very first timestamp: we don't want the
> +	 * initial delta to be a huge value
> +	 */
> +	if (old_ts)
> +		xef->runtime[q->class] += new_ts - old_ts;
> +}
> +
>  void xe_exec_queue_kill(struct xe_exec_queue *q)
>  {
>  	struct xe_exec_queue *eq = q, *next;
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> index 02ce8d204622..45b72daa2db3 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -66,5 +66,6 @@ struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *e,
>  					       struct xe_vm *vm);
>  void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
>  				  struct dma_fence *fence);
> +void xe_exec_queue_update_runtime(struct xe_exec_queue *q);
>  
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> index cd8a2fba5438..6a081a4fa190 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.c
> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> @@ -242,6 +242,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
>  {

This seems like the wrong place. xe_sched_job_completed is a helper
which determines *if* a job completed it *not* when it is completed. The
DRM scheduler free_job callback is probably the right place
(guc_exec_queue_free_job or execlist_job_free). So just call
xe_exec_queue_update_runtime there?

Matt

>  	struct xe_lrc *lrc = job->q->lrc;
>  
> +	xe_exec_queue_update_runtime(job->q);
> +
>  	/*
>  	 * Can safely check just LRC[0] seqno as that is last seqno written when
>  	 * parallel handshake is done.
> -- 
> 2.43.0
>
Lucas De Marchi April 24, 2024, 2:51 p.m. UTC | #2
On Wed, Apr 24, 2024 at 04:08:29AM GMT, Matthew Brost wrote:
>On Tue, Apr 23, 2024 at 04:56:48PM -0700, Lucas De Marchi wrote:
>> From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>
>> Add a helper to accumulate per-client runtime of all its
>> exec queues. Currently that is done in 2 places:
>>
>> 	1. when the exec_queue is destroyed
>> 	2. when the sched job is completed
>>
>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>> Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
>> ---
>>  drivers/gpu/drm/xe/xe_device_types.h |  9 +++++++
>>  drivers/gpu/drm/xe/xe_exec_queue.c   | 37 ++++++++++++++++++++++++++++
>>  drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
>>  drivers/gpu/drm/xe/xe_sched_job.c    |  2 ++
>>  4 files changed, 49 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> index 2e62450d86e1..33d3bf93a2f1 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -547,6 +547,15 @@ struct xe_file {
>>  		struct mutex lock;
>>  	} exec_queue;
>>
>> +	/**
>> +	 * @runtime: hw engine class runtime in ticks for this drm client
>> +	 *
>> +	 * Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
>> +	 * case, since all jobs run in parallel on the engines, only the stats
>> +	 * from lrc[0] are sufficient.
>> +	 */
>> +	u64 runtime[XE_ENGINE_CLASS_MAX];
>> +
>>  	/** @client: drm client */
>>  	struct xe_drm_client *client;
>>  };
>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
>> index 395de93579fa..b7b6256cb96a 100644
>> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>> @@ -214,6 +214,8 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
>>  {
>>  	int i;
>>
>> +	xe_exec_queue_update_runtime(q);
>> +
>>  	for (i = 0; i < q->width; ++i)
>>  		xe_lrc_finish(q->lrc + i);
>>  	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & EXEC_QUEUE_FLAG_VM || !q->vm))
>> @@ -769,6 +771,41 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
>>  		q->lrc[0].fence_ctx.next_seqno - 1;
>>  }
>>
>> +/**
>> + * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw
>> + * @q: The exec queue
>> + *
>> + * Update the timestamp saved by HW for this exec queue and save runtime
>> + * calculated by using the delta from last update. On multi-lrc case, only the
>> + * first is considered.
>> + */
>> +void xe_exec_queue_update_runtime(struct xe_exec_queue *q)
>> +{
>> +	struct xe_file *xef;
>> +	struct xe_lrc *lrc;
>> +	u32 old_ts, new_ts;
>> +
>> +	/*
>> +	 * Jobs that are run during driver load may use an exec_queue, but are
>> +	 * not associated with a user xe file, so avoid accumulating busyness
>> +	 * for kernel specific work.
>> +	 */
>> +	if (!q->vm || !q->vm->xef)
>> +		return;
>> +
>> +	xef = q->vm->xef;
>> +	lrc = &q->lrc[0];
>> +
>> +	new_ts = xe_lrc_update_timestamp(lrc, &old_ts);
>> +
>> +	/*
>> +	 * Special case the very first timestamp: we don't want the
>> +	 * initial delta to be a huge value
>> +	 */
>> +	if (old_ts)
>> +		xef->runtime[q->class] += new_ts - old_ts;
>> +}
>> +
>>  void xe_exec_queue_kill(struct xe_exec_queue *q)
>>  {
>>  	struct xe_exec_queue *eq = q, *next;
>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
>> index 02ce8d204622..45b72daa2db3 100644
>> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
>> @@ -66,5 +66,6 @@ struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *e,
>>  					       struct xe_vm *vm);
>>  void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
>>  				  struct dma_fence *fence);
>> +void xe_exec_queue_update_runtime(struct xe_exec_queue *q);
>>
>>  #endif
>> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
>> index cd8a2fba5438..6a081a4fa190 100644
>> --- a/drivers/gpu/drm/xe/xe_sched_job.c
>> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
>> @@ -242,6 +242,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
>>  {
>
>This seems like the wrong place. xe_sched_job_completed is a helper
>which determines *if* a job completed it *not* when it is completed. The

indeed, not the right place.


>DRM scheduler free_job callback is probably the right place
>(guc_exec_queue_free_job or execlist_job_free). So just call
>xe_exec_queue_update_runtime there?

yeah, I will add it there and do some tests.

thanks for catching this.

Lucas De Marchi

>
>Matt
>
>>  	struct xe_lrc *lrc = job->q->lrc;
>>
>> +	xe_exec_queue_update_runtime(job->q);
>> +
>>  	/*
>>  	 * Can safely check just LRC[0] seqno as that is last seqno written when
>>  	 * parallel handshake is done.
>> --
>> 2.43.0
>>
Tvrtko Ursulin April 26, 2024, 10:49 a.m. UTC | #3
On 24/04/2024 00:56, Lucas De Marchi wrote:
> From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> 
> Add a helper to accumulate per-client runtime of all its
> exec queues. Currently that is done in 2 places:
> 
> 	1. when the exec_queue is destroyed
> 	2. when the sched job is completed
> 
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_device_types.h |  9 +++++++
>   drivers/gpu/drm/xe/xe_exec_queue.c   | 37 ++++++++++++++++++++++++++++
>   drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
>   drivers/gpu/drm/xe/xe_sched_job.c    |  2 ++
>   4 files changed, 49 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 2e62450d86e1..33d3bf93a2f1 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -547,6 +547,15 @@ struct xe_file {
>   		struct mutex lock;
>   	} exec_queue;
>   
> +	/**
> +	 * @runtime: hw engine class runtime in ticks for this drm client
> +	 *
> +	 * Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
> +	 * case, since all jobs run in parallel on the engines, only the stats
> +	 * from lrc[0] are sufficient.

Out of curiousity doesn't this mean multi-lrc jobs will be incorrectly 
accounted for? (When capacity is considered.)

Regards,

Tvrtko

> +	 */
> +	u64 runtime[XE_ENGINE_CLASS_MAX];
> +
>   	/** @client: drm client */
>   	struct xe_drm_client *client;
>   };
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 395de93579fa..b7b6256cb96a 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -214,6 +214,8 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
>   {
>   	int i;
>   
> +	xe_exec_queue_update_runtime(q);
> +
>   	for (i = 0; i < q->width; ++i)
>   		xe_lrc_finish(q->lrc + i);
>   	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & EXEC_QUEUE_FLAG_VM || !q->vm))
> @@ -769,6 +771,41 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
>   		q->lrc[0].fence_ctx.next_seqno - 1;
>   }
>   
> +/**
> + * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw
> + * @q: The exec queue
> + *
> + * Update the timestamp saved by HW for this exec queue and save runtime
> + * calculated by using the delta from last update. On multi-lrc case, only the
> + * first is considered.
> + */
> +void xe_exec_queue_update_runtime(struct xe_exec_queue *q)
> +{
> +	struct xe_file *xef;
> +	struct xe_lrc *lrc;
> +	u32 old_ts, new_ts;
> +
> +	/*
> +	 * Jobs that are run during driver load may use an exec_queue, but are
> +	 * not associated with a user xe file, so avoid accumulating busyness
> +	 * for kernel specific work.
> +	 */
> +	if (!q->vm || !q->vm->xef)
> +		return;
> +
> +	xef = q->vm->xef;
> +	lrc = &q->lrc[0];
> +
> +	new_ts = xe_lrc_update_timestamp(lrc, &old_ts);
> +
> +	/*
> +	 * Special case the very first timestamp: we don't want the
> +	 * initial delta to be a huge value
> +	 */
> +	if (old_ts)
> +		xef->runtime[q->class] += new_ts - old_ts;
> +}
> +
>   void xe_exec_queue_kill(struct xe_exec_queue *q)
>   {
>   	struct xe_exec_queue *eq = q, *next;
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> index 02ce8d204622..45b72daa2db3 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -66,5 +66,6 @@ struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *e,
>   					       struct xe_vm *vm);
>   void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
>   				  struct dma_fence *fence);
> +void xe_exec_queue_update_runtime(struct xe_exec_queue *q);
>   
>   #endif
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> index cd8a2fba5438..6a081a4fa190 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.c
> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> @@ -242,6 +242,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
>   {
>   	struct xe_lrc *lrc = job->q->lrc;
>   
> +	xe_exec_queue_update_runtime(job->q);
> +
>   	/*
>   	 * Can safely check just LRC[0] seqno as that is last seqno written when
>   	 * parallel handshake is done.
Umesh Nerlige Ramappa April 26, 2024, 6:59 p.m. UTC | #4
On Fri, Apr 26, 2024 at 11:49:32AM +0100, Tvrtko Ursulin wrote:
>
>On 24/04/2024 00:56, Lucas De Marchi wrote:
>>From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>
>>Add a helper to accumulate per-client runtime of all its
>>exec queues. Currently that is done in 2 places:
>>
>>	1. when the exec_queue is destroyed
>>	2. when the sched job is completed
>>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
>>---
>>  drivers/gpu/drm/xe/xe_device_types.h |  9 +++++++
>>  drivers/gpu/drm/xe/xe_exec_queue.c   | 37 ++++++++++++++++++++++++++++
>>  drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
>>  drivers/gpu/drm/xe/xe_sched_job.c    |  2 ++
>>  4 files changed, 49 insertions(+)
>>
>>diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>>index 2e62450d86e1..33d3bf93a2f1 100644
>>--- a/drivers/gpu/drm/xe/xe_device_types.h
>>+++ b/drivers/gpu/drm/xe/xe_device_types.h
>>@@ -547,6 +547,15 @@ struct xe_file {
>>  		struct mutex lock;
>>  	} exec_queue;
>>+	/**
>>+	 * @runtime: hw engine class runtime in ticks for this drm client
>>+	 *
>>+	 * Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
>>+	 * case, since all jobs run in parallel on the engines, only the stats
>>+	 * from lrc[0] are sufficient.
>
>Out of curiousity doesn't this mean multi-lrc jobs will be incorrectly 
>accounted for? (When capacity is considered.)

TBH, I am not sure what the user would like to see here for multi-lrc.  
If reporting the capacity, then we may need to use width as a 
multiplication factor for multi-lrc. How was this done in i915?

Regards,
Umesh


>
>Regards,
>
>Tvrtko
>
>>+	 */
>>+	u64 runtime[XE_ENGINE_CLASS_MAX];
>>+
>>  	/** @client: drm client */
>>  	struct xe_drm_client *client;
>>  };
>>diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
>>index 395de93579fa..b7b6256cb96a 100644
>>--- a/drivers/gpu/drm/xe/xe_exec_queue.c
>>+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>>@@ -214,6 +214,8 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
>>  {
>>  	int i;
>>+	xe_exec_queue_update_runtime(q);
>>+
>>  	for (i = 0; i < q->width; ++i)
>>  		xe_lrc_finish(q->lrc + i);
>>  	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & EXEC_QUEUE_FLAG_VM || !q->vm))
>>@@ -769,6 +771,41 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
>>  		q->lrc[0].fence_ctx.next_seqno - 1;
>>  }
>>+/**
>>+ * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw
>>+ * @q: The exec queue
>>+ *
>>+ * Update the timestamp saved by HW for this exec queue and save runtime
>>+ * calculated by using the delta from last update. On multi-lrc case, only the
>>+ * first is considered.
>>+ */
>>+void xe_exec_queue_update_runtime(struct xe_exec_queue *q)
>>+{
>>+	struct xe_file *xef;
>>+	struct xe_lrc *lrc;
>>+	u32 old_ts, new_ts;
>>+
>>+	/*
>>+	 * Jobs that are run during driver load may use an exec_queue, but are
>>+	 * not associated with a user xe file, so avoid accumulating busyness
>>+	 * for kernel specific work.
>>+	 */
>>+	if (!q->vm || !q->vm->xef)
>>+		return;
>>+
>>+	xef = q->vm->xef;
>>+	lrc = &q->lrc[0];
>>+
>>+	new_ts = xe_lrc_update_timestamp(lrc, &old_ts);
>>+
>>+	/*
>>+	 * Special case the very first timestamp: we don't want the
>>+	 * initial delta to be a huge value
>>+	 */
>>+	if (old_ts)
>>+		xef->runtime[q->class] += new_ts - old_ts;
>>+}
>>+
>>  void xe_exec_queue_kill(struct xe_exec_queue *q)
>>  {
>>  	struct xe_exec_queue *eq = q, *next;
>>diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
>>index 02ce8d204622..45b72daa2db3 100644
>>--- a/drivers/gpu/drm/xe/xe_exec_queue.h
>>+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
>>@@ -66,5 +66,6 @@ struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *e,
>>  					       struct xe_vm *vm);
>>  void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
>>  				  struct dma_fence *fence);
>>+void xe_exec_queue_update_runtime(struct xe_exec_queue *q);
>>  #endif
>>diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
>>index cd8a2fba5438..6a081a4fa190 100644
>>--- a/drivers/gpu/drm/xe/xe_sched_job.c
>>+++ b/drivers/gpu/drm/xe/xe_sched_job.c
>>@@ -242,6 +242,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
>>  {
>>  	struct xe_lrc *lrc = job->q->lrc;
>>+	xe_exec_queue_update_runtime(job->q);
>>+
>>  	/*
>>  	 * Can safely check just LRC[0] seqno as that is last seqno written when
>>  	 * parallel handshake is done.
Tvrtko Ursulin April 29, 2024, 8:07 a.m. UTC | #5
On 26/04/2024 19:59, Umesh Nerlige Ramappa wrote:
> On Fri, Apr 26, 2024 at 11:49:32AM +0100, Tvrtko Ursulin wrote:
>>
>> On 24/04/2024 00:56, Lucas De Marchi wrote:
>>> From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>
>>> Add a helper to accumulate per-client runtime of all its
>>> exec queues. Currently that is done in 2 places:
>>>
>>>     1. when the exec_queue is destroyed
>>>     2. when the sched job is completed
>>>
>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>> Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
>>> ---
>>>  drivers/gpu/drm/xe/xe_device_types.h |  9 +++++++
>>>  drivers/gpu/drm/xe/xe_exec_queue.c   | 37 ++++++++++++++++++++++++++++
>>>  drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
>>>  drivers/gpu/drm/xe/xe_sched_job.c    |  2 ++
>>>  4 files changed, 49 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
>>> b/drivers/gpu/drm/xe/xe_device_types.h
>>> index 2e62450d86e1..33d3bf93a2f1 100644
>>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>>> @@ -547,6 +547,15 @@ struct xe_file {
>>>          struct mutex lock;
>>>      } exec_queue;
>>> +    /**
>>> +     * @runtime: hw engine class runtime in ticks for this drm client
>>> +     *
>>> +     * Only stats from xe_exec_queue->lrc[0] are accumulated. For 
>>> multi-lrc
>>> +     * case, since all jobs run in parallel on the engines, only the 
>>> stats
>>> +     * from lrc[0] are sufficient.
>>
>> Out of curiousity doesn't this mean multi-lrc jobs will be incorrectly 
>> accounted for? (When capacity is considered.)
> 
> TBH, I am not sure what the user would like to see here for multi-lrc. 
> If reporting the capacity, then we may need to use width as a 
> multiplication factor for multi-lrc. How was this done in i915?

IMO user has to see the real utilisation - so if there are two VCS and 
both are busy, 100% should be reported and not 50%. Latter would be 
misleading, either with or without cross-checking with physical utilisation.

In i915 with execlists this works correctly and with GuC you would 
probably know the answer better than me.

Regards,

Tvrtko

> 
> Regards,
> Umesh
> 
> 
>>
>> Regards,
>>
>> Tvrtko
>>
>>> +     */
>>> +    u64 runtime[XE_ENGINE_CLASS_MAX];
>>> +
>>>      /** @client: drm client */
>>>      struct xe_drm_client *client;
>>>  };
>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c 
>>> b/drivers/gpu/drm/xe/xe_exec_queue.c
>>> index 395de93579fa..b7b6256cb96a 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>>> @@ -214,6 +214,8 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
>>>  {
>>>      int i;
>>> +    xe_exec_queue_update_runtime(q);
>>> +
>>>      for (i = 0; i < q->width; ++i)
>>>          xe_lrc_finish(q->lrc + i);
>>>      if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & 
>>> EXEC_QUEUE_FLAG_VM || !q->vm))
>>> @@ -769,6 +771,41 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
>>>          q->lrc[0].fence_ctx.next_seqno - 1;
>>>  }
>>> +/**
>>> + * xe_exec_queue_update_runtime() - Update runtime for this exec 
>>> queue from hw
>>> + * @q: The exec queue
>>> + *
>>> + * Update the timestamp saved by HW for this exec queue and save 
>>> runtime
>>> + * calculated by using the delta from last update. On multi-lrc 
>>> case, only the
>>> + * first is considered.
>>> + */
>>> +void xe_exec_queue_update_runtime(struct xe_exec_queue *q)
>>> +{
>>> +    struct xe_file *xef;
>>> +    struct xe_lrc *lrc;
>>> +    u32 old_ts, new_ts;
>>> +
>>> +    /*
>>> +     * Jobs that are run during driver load may use an exec_queue, 
>>> but are
>>> +     * not associated with a user xe file, so avoid accumulating 
>>> busyness
>>> +     * for kernel specific work.
>>> +     */
>>> +    if (!q->vm || !q->vm->xef)
>>> +        return;
>>> +
>>> +    xef = q->vm->xef;
>>> +    lrc = &q->lrc[0];
>>> +
>>> +    new_ts = xe_lrc_update_timestamp(lrc, &old_ts);
>>> +
>>> +    /*
>>> +     * Special case the very first timestamp: we don't want the
>>> +     * initial delta to be a huge value
>>> +     */
>>> +    if (old_ts)
>>> +        xef->runtime[q->class] += new_ts - old_ts;
>>> +}
>>> +
>>>  void xe_exec_queue_kill(struct xe_exec_queue *q)
>>>  {
>>>      struct xe_exec_queue *eq = q, *next;
>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h 
>>> b/drivers/gpu/drm/xe/xe_exec_queue.h
>>> index 02ce8d204622..45b72daa2db3 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
>>> @@ -66,5 +66,6 @@ struct dma_fence 
>>> *xe_exec_queue_last_fence_get(struct xe_exec_queue *e,
>>>                             struct xe_vm *vm);
>>>  void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct 
>>> xe_vm *vm,
>>>                    struct dma_fence *fence);
>>> +void xe_exec_queue_update_runtime(struct xe_exec_queue *q);
>>>  #endif
>>> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c 
>>> b/drivers/gpu/drm/xe/xe_sched_job.c
>>> index cd8a2fba5438..6a081a4fa190 100644
>>> --- a/drivers/gpu/drm/xe/xe_sched_job.c
>>> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
>>> @@ -242,6 +242,8 @@ bool xe_sched_job_completed(struct xe_sched_job 
>>> *job)
>>>  {
>>>      struct xe_lrc *lrc = job->q->lrc;
>>> +    xe_exec_queue_update_runtime(job->q);
>>> +
>>>      /*
>>>       * Can safely check just LRC[0] seqno as that is last seqno 
>>> written when
>>>       * parallel handshake is done.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 2e62450d86e1..33d3bf93a2f1 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -547,6 +547,15 @@  struct xe_file {
 		struct mutex lock;
 	} exec_queue;
 
+	/**
+	 * @runtime: hw engine class runtime in ticks for this drm client
+	 *
+	 * Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
+	 * case, since all jobs run in parallel on the engines, only the stats
+	 * from lrc[0] are sufficient.
+	 */
+	u64 runtime[XE_ENGINE_CLASS_MAX];
+
 	/** @client: drm client */
 	struct xe_drm_client *client;
 };
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 395de93579fa..b7b6256cb96a 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -214,6 +214,8 @@  void xe_exec_queue_fini(struct xe_exec_queue *q)
 {
 	int i;
 
+	xe_exec_queue_update_runtime(q);
+
 	for (i = 0; i < q->width; ++i)
 		xe_lrc_finish(q->lrc + i);
 	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & EXEC_QUEUE_FLAG_VM || !q->vm))
@@ -769,6 +771,41 @@  bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
 		q->lrc[0].fence_ctx.next_seqno - 1;
 }
 
+/**
+ * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw
+ * @q: The exec queue
+ *
+ * Update the timestamp saved by HW for this exec queue and save runtime
+ * calculated by using the delta from last update. On multi-lrc case, only the
+ * first is considered.
+ */
+void xe_exec_queue_update_runtime(struct xe_exec_queue *q)
+{
+	struct xe_file *xef;
+	struct xe_lrc *lrc;
+	u32 old_ts, new_ts;
+
+	/*
+	 * Jobs that are run during driver load may use an exec_queue, but are
+	 * not associated with a user xe file, so avoid accumulating busyness
+	 * for kernel specific work.
+	 */
+	if (!q->vm || !q->vm->xef)
+		return;
+
+	xef = q->vm->xef;
+	lrc = &q->lrc[0];
+
+	new_ts = xe_lrc_update_timestamp(lrc, &old_ts);
+
+	/*
+	 * Special case the very first timestamp: we don't want the
+	 * initial delta to be a huge value
+	 */
+	if (old_ts)
+		xef->runtime[q->class] += new_ts - old_ts;
+}
+
 void xe_exec_queue_kill(struct xe_exec_queue *q)
 {
 	struct xe_exec_queue *eq = q, *next;
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
index 02ce8d204622..45b72daa2db3 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
@@ -66,5 +66,6 @@  struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *e,
 					       struct xe_vm *vm);
 void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
 				  struct dma_fence *fence);
+void xe_exec_queue_update_runtime(struct xe_exec_queue *q);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
index cd8a2fba5438..6a081a4fa190 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.c
+++ b/drivers/gpu/drm/xe/xe_sched_job.c
@@ -242,6 +242,8 @@  bool xe_sched_job_completed(struct xe_sched_job *job)
 {
 	struct xe_lrc *lrc = job->q->lrc;
 
+	xe_exec_queue_update_runtime(job->q);
+
 	/*
 	 * Can safely check just LRC[0] seqno as that is last seqno written when
 	 * parallel handshake is done.