diff mbox series

drm/xe: Add per-engine pagefault and reset counts

Message ID 20250210193545.96003-1-jonathan.cavitt@intel.com (mailing list archive)
State New
Headers show
Series drm/xe: Add per-engine pagefault and reset counts | expand

Commit Message

Jonathan Cavitt Feb. 10, 2025, 7:35 p.m. UTC
Add counters to all engines that count the number of pagefaults and
engine resets that have been triggered on them.  Report these values
during an engine reset.

Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
CC: Tomasz Mistat <tomasz.mistat@intel.com>
CC: Ayaz A Siddiqui <ayaz.siddiqui@intel.com>
CC: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
---
 drivers/gpu/drm/xe/xe_gt_pagefault.c    | 6 ++++++
 drivers/gpu/drm/xe/xe_guc_submit.c      | 9 +++++++--
 drivers/gpu/drm/xe/xe_hw_engine.c       | 3 +++
 drivers/gpu/drm/xe/xe_hw_engine_types.h | 4 ++++
 4 files changed, 20 insertions(+), 2 deletions(-)

Comments

Jonathan Cavitt Feb. 10, 2025, 7:37 p.m. UTC | #1
Wrong mailing list.  Sorry.  Please ignore this email.
-Jonathan Cavitt

-----Original Message-----
From: Cavitt, Jonathan <jonathan.cavitt@intel.com> 
Sent: Monday, February 10, 2025 11:36 AM
To: intel-gfx@lists.freedesktop.org
Cc: Gupta, saurabhg <saurabhg.gupta@intel.com>; Zuo, Alex <alex.zuo@intel.com>; Cavitt, Jonathan <jonathan.cavitt@intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Siddiqui, Ayaz A <ayaz.siddiqui@intel.com>; Mistat, Tomasz <tomasz.mistat@intel.com>
Subject: [PATCH] drm/xe: Add per-engine pagefault and reset counts
> 
> Add counters to all engines that count the number of pagefaults and
> engine resets that have been triggered on them.  Report these values
> during an engine reset.
> 
> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> CC: Tomasz Mistat <tomasz.mistat@intel.com>
> CC: Ayaz A Siddiqui <ayaz.siddiqui@intel.com>
> CC: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_gt_pagefault.c    | 6 ++++++
>  drivers/gpu/drm/xe/xe_guc_submit.c      | 9 +++++++--
>  drivers/gpu/drm/xe/xe_hw_engine.c       | 3 +++
>  drivers/gpu/drm/xe/xe_hw_engine_types.h | 4 ++++
>  4 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index 46701ca11ce0..04e973b20019 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -130,6 +130,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
>  {
>  	struct xe_vm *vm = xe_vma_vm(vma);
>  	struct xe_tile *tile = gt_to_tile(gt);
> +	struct xe_hw_engine *hwe = NULL;
>  	struct drm_exec exec;
>  	struct dma_fence *fence;
>  	ktime_t end = 0;
> @@ -140,6 +141,11 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
>  	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, xe_vma_size(vma));
>  
>  	trace_xe_vma_pagefault(vma);
> +
> +	hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
> +	if (hwe)
> +		atomic_inc(&hwe->pagefault_count);
> +
>  	atomic = access_is_atomic(pf->access_type);
>  
>  	/* Check if VMA is valid */
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 913c74d6e2ae..6f5d74340319 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1972,6 +1972,7 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  {
>  	struct xe_gt *gt = guc_to_gt(guc);
>  	struct xe_exec_queue *q;
> +	struct xe_hw_engine *hwe;
>  	u32 guc_id;
>  
>  	if (unlikely(len < 1))
> @@ -1983,8 +1984,12 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  	if (unlikely(!q))
>  		return -EPROTO;
>  
> -	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
> -		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
> +	hwe = q->hwe;
> +	atomic_inc(&hwe->reset_count);
> +
> +	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d, pagefault_count=%u, reset_count=%u",
> +		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id,
> +		   atomic_read(&hwe->pagefault_count), atomic_read(&hwe->reset_count));
>  
>  	trace_xe_exec_queue_reset(q);
>  
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index fc447751fe78..0be6c38fe2a4 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -516,6 +516,9 @@ static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
>  	hwe->fence_irq = &gt->fence_irq[info->class];
>  	hwe->engine_id = id;
>  
> +	atomic_set(&hwe->pagefault_count, 0);
> +	atomic_set(&hwe->reset_count, 0);
> +
>  	hwe->eclass = &gt->eclass[hwe->class];
>  	if (!hwe->eclass->sched_props.job_timeout_ms) {
>  		hwe->eclass->sched_props.job_timeout_ms = 5 * 1000;
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> index e4191a7a2c31..635dc3da6523 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> @@ -150,6 +150,10 @@ struct xe_hw_engine {
>  	struct xe_oa_unit *oa_unit;
>  	/** @hw_engine_group: the group of hw engines this one belongs to */
>  	struct xe_hw_engine_group *hw_engine_group;
> +	/** @pagefault_count: number of pagefaults associated with this engine */
> +	atomic_t pagefault_count;
> +	/** @reset_count: number of engine resets associated with this engine */
> +	atomic_t reset_count;
>  };
>  
>  enum xe_hw_engine_snapshot_source_id {
> -- 
> 2.43.0
> 
>
Matthew Brost Feb. 11, 2025, 3:54 a.m. UTC | #2
On Mon, Feb 10, 2025 at 07:35:44PM +0000, Jonathan Cavitt wrote:
> Add counters to all engines that count the number of pagefaults and
> engine resets that have been triggered on them.  Report these values
> during an engine reset.
>

I'm not opposed to adding stats for engines, but if we do, I think it
should be done in a generic way. See xe_gt_stats* for an example—that
is, do not use specific variables for counters; instead, use an array
indexed by an enum. I'd also probably wire to debugfs while here using
the aforementioned example as a template.

> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> CC: Tomasz Mistat <tomasz.mistat@intel.com>
> CC: Ayaz A Siddiqui <ayaz.siddiqui@intel.com>
> CC: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_gt_pagefault.c    | 6 ++++++
>  drivers/gpu/drm/xe/xe_guc_submit.c      | 9 +++++++--
>  drivers/gpu/drm/xe/xe_hw_engine.c       | 3 +++
>  drivers/gpu/drm/xe/xe_hw_engine_types.h | 4 ++++
>  4 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index 46701ca11ce0..04e973b20019 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -130,6 +130,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
>  {
>  	struct xe_vm *vm = xe_vma_vm(vma);
>  	struct xe_tile *tile = gt_to_tile(gt);
> +	struct xe_hw_engine *hwe = NULL;
>  	struct drm_exec exec;
>  	struct dma_fence *fence;
>  	ktime_t end = 0;
> @@ -140,6 +141,11 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
>  	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, xe_vma_size(vma));
>  
>  	trace_xe_vma_pagefault(vma);
> +
> +	hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
> +	if (hwe)
> +		atomic_inc(&hwe->pagefault_count);
> +

Page faults are a critical path so anything that is debug related (e.g.
stats) likely should compile out in non-debug builds.

>  	atomic = access_is_atomic(pf->access_type);
>  
>  	/* Check if VMA is valid */
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 913c74d6e2ae..6f5d74340319 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1972,6 +1972,7 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  {
>  	struct xe_gt *gt = guc_to_gt(guc);
>  	struct xe_exec_queue *q;
> +	struct xe_hw_engine *hwe;
>  	u32 guc_id;
>  
>  	if (unlikely(len < 1))
> @@ -1983,8 +1984,12 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  	if (unlikely(!q))
>  		return -EPROTO;
>  
> -	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
> -		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
> +	hwe = q->hwe;
> +	atomic_inc(&hwe->reset_count);
> +
> +	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d, pagefault_count=%u, reset_count=%u",
> +		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id,
> +		   atomic_read(&hwe->pagefault_count), atomic_read(&hwe->reset_count));

Can you explain why this information is helpful? A better option than
dmesg might be a devcoredump too. I'll need a little more background
here I guess.

Matt

>  
>  	trace_xe_exec_queue_reset(q);
>  
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index fc447751fe78..0be6c38fe2a4 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -516,6 +516,9 @@ static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
>  	hwe->fence_irq = &gt->fence_irq[info->class];
>  	hwe->engine_id = id;
>  
> +	atomic_set(&hwe->pagefault_count, 0);
> +	atomic_set(&hwe->reset_count, 0);
> +
>  	hwe->eclass = &gt->eclass[hwe->class];
>  	if (!hwe->eclass->sched_props.job_timeout_ms) {
>  		hwe->eclass->sched_props.job_timeout_ms = 5 * 1000;
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> index e4191a7a2c31..635dc3da6523 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> @@ -150,6 +150,10 @@ struct xe_hw_engine {
>  	struct xe_oa_unit *oa_unit;
>  	/** @hw_engine_group: the group of hw engines this one belongs to */
>  	struct xe_hw_engine_group *hw_engine_group;
> +	/** @pagefault_count: number of pagefaults associated with this engine */
> +	atomic_t pagefault_count;
> +	/** @reset_count: number of engine resets associated with this engine */
> +	atomic_t reset_count;
>  };
>  
>  enum xe_hw_engine_snapshot_source_id {
> -- 
> 2.43.0
>
Matthew Brost Feb. 11, 2025, 4:12 a.m. UTC | #3
On Mon, Feb 10, 2025 at 07:35:44PM +0000, Jonathan Cavitt wrote:
> Add counters to all engines that count the number of pagefaults and
> engine resets that have been triggered on them.  Report these values
> during an engine reset.
> 
> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> CC: Tomasz Mistat <tomasz.mistat@intel.com>
> CC: Ayaz A Siddiqui <ayaz.siddiqui@intel.com>
> CC: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_gt_pagefault.c    | 6 ++++++
>  drivers/gpu/drm/xe/xe_guc_submit.c      | 9 +++++++--
>  drivers/gpu/drm/xe/xe_hw_engine.c       | 3 +++
>  drivers/gpu/drm/xe/xe_hw_engine_types.h | 4 ++++
>  4 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index 46701ca11ce0..04e973b20019 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -130,6 +130,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
>  {
>  	struct xe_vm *vm = xe_vma_vm(vma);
>  	struct xe_tile *tile = gt_to_tile(gt);
> +	struct xe_hw_engine *hwe = NULL;
>  	struct drm_exec exec;
>  	struct dma_fence *fence;
>  	ktime_t end = 0;
> @@ -140,6 +141,11 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
>  	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, xe_vma_size(vma));
>  
>  	trace_xe_vma_pagefault(vma);
> +
> +	hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
> +	if (hwe)
> +		atomic_inc(&hwe->pagefault_count);
> +
>  	atomic = access_is_atomic(pf->access_type);
>  
>  	/* Check if VMA is valid */
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 913c74d6e2ae..6f5d74340319 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1972,6 +1972,7 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  {
>  	struct xe_gt *gt = guc_to_gt(guc);
>  	struct xe_exec_queue *q;
> +	struct xe_hw_engine *hwe;
>  	u32 guc_id;
>  
>  	if (unlikely(len < 1))
> @@ -1983,8 +1984,12 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  	if (unlikely(!q))
>  		return -EPROTO;
>  
> -	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
> -		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
> +	hwe = q->hwe;
> +	atomic_inc(&hwe->reset_count);
> +
> +	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d, pagefault_count=%u, reset_count=%u",
> +		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id,
> +		   atomic_read(&hwe->pagefault_count), atomic_read(&hwe->reset_count));
>  
>  	trace_xe_exec_queue_reset(q);
>  
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index fc447751fe78..0be6c38fe2a4 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -516,6 +516,9 @@ static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
>  	hwe->fence_irq = &gt->fence_irq[info->class];
>  	hwe->engine_id = id;
>  
> +	atomic_set(&hwe->pagefault_count, 0);
> +	atomic_set(&hwe->reset_count, 0);

Missed this in my previous reply: we zalloc basically everything in Xe,
certainly heavyweight things like engines, so there is no need to explicitly
set these to zero during init.

Matt  

> +
>  	hwe->eclass = &gt->eclass[hwe->class];
>  	if (!hwe->eclass->sched_props.job_timeout_ms) {
>  		hwe->eclass->sched_props.job_timeout_ms = 5 * 1000;
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> index e4191a7a2c31..635dc3da6523 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> @@ -150,6 +150,10 @@ struct xe_hw_engine {
>  	struct xe_oa_unit *oa_unit;
>  	/** @hw_engine_group: the group of hw engines this one belongs to */
>  	struct xe_hw_engine_group *hw_engine_group;
> +	/** @pagefault_count: number of pagefaults associated with this engine */
> +	atomic_t pagefault_count;
> +	/** @reset_count: number of engine resets associated with this engine */
> +	atomic_t reset_count;
>  };
>  
>  enum xe_hw_engine_snapshot_source_id {
> -- 
> 2.43.0
>
Matthew Brost Feb. 11, 2025, 4:28 a.m. UTC | #4
On Mon, Feb 10, 2025 at 07:37:36PM +0000, Cavitt, Jonathan wrote:
> Wrong mailing list.  Sorry.  Please ignore this email.
> -Jonathan Cavitt
> 

I replied here... I guess continue this conversation here? Almost
everyone is on both lists...

Matt

> -----Original Message-----
> From: Cavitt, Jonathan <jonathan.cavitt@intel.com> 
> Sent: Monday, February 10, 2025 11:36 AM
> To: intel-gfx@lists.freedesktop.org
> Cc: Gupta, saurabhg <saurabhg.gupta@intel.com>; Zuo, Alex <alex.zuo@intel.com>; Cavitt, Jonathan <jonathan.cavitt@intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Siddiqui, Ayaz A <ayaz.siddiqui@intel.com>; Mistat, Tomasz <tomasz.mistat@intel.com>
> Subject: [PATCH] drm/xe: Add per-engine pagefault and reset counts
> > 
> > Add counters to all engines that count the number of pagefaults and
> > engine resets that have been triggered on them.  Report these values
> > during an engine reset.
> > 
> > Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> > CC: Tomasz Mistat <tomasz.mistat@intel.com>
> > CC: Ayaz A Siddiqui <ayaz.siddiqui@intel.com>
> > CC: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_gt_pagefault.c    | 6 ++++++
> >  drivers/gpu/drm/xe/xe_guc_submit.c      | 9 +++++++--
> >  drivers/gpu/drm/xe/xe_hw_engine.c       | 3 +++
> >  drivers/gpu/drm/xe/xe_hw_engine_types.h | 4 ++++
> >  4 files changed, 20 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > index 46701ca11ce0..04e973b20019 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > @@ -130,6 +130,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
> >  {
> >  	struct xe_vm *vm = xe_vma_vm(vma);
> >  	struct xe_tile *tile = gt_to_tile(gt);
> > +	struct xe_hw_engine *hwe = NULL;
> >  	struct drm_exec exec;
> >  	struct dma_fence *fence;
> >  	ktime_t end = 0;
> > @@ -140,6 +141,11 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
> >  	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, xe_vma_size(vma));
> >  
> >  	trace_xe_vma_pagefault(vma);
> > +
> > +	hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
> > +	if (hwe)
> > +		atomic_inc(&hwe->pagefault_count);
> > +
> >  	atomic = access_is_atomic(pf->access_type);
> >  
> >  	/* Check if VMA is valid */
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 913c74d6e2ae..6f5d74340319 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -1972,6 +1972,7 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
> >  {
> >  	struct xe_gt *gt = guc_to_gt(guc);
> >  	struct xe_exec_queue *q;
> > +	struct xe_hw_engine *hwe;
> >  	u32 guc_id;
> >  
> >  	if (unlikely(len < 1))
> > @@ -1983,8 +1984,12 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
> >  	if (unlikely(!q))
> >  		return -EPROTO;
> >  
> > -	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
> > -		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
> > +	hwe = q->hwe;
> > +	atomic_inc(&hwe->reset_count);
> > +
> > +	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d, pagefault_count=%u, reset_count=%u",
> > +		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id,
> > +		   atomic_read(&hwe->pagefault_count), atomic_read(&hwe->reset_count));
> >  
> >  	trace_xe_exec_queue_reset(q);
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> > index fc447751fe78..0be6c38fe2a4 100644
> > --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> > +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> > @@ -516,6 +516,9 @@ static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
> >  	hwe->fence_irq = &gt->fence_irq[info->class];
> >  	hwe->engine_id = id;
> >  
> > +	atomic_set(&hwe->pagefault_count, 0);
> > +	atomic_set(&hwe->reset_count, 0);
> > +
> >  	hwe->eclass = &gt->eclass[hwe->class];
> >  	if (!hwe->eclass->sched_props.job_timeout_ms) {
> >  		hwe->eclass->sched_props.job_timeout_ms = 5 * 1000;
> > diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> > index e4191a7a2c31..635dc3da6523 100644
> > --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> > +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> > @@ -150,6 +150,10 @@ struct xe_hw_engine {
> >  	struct xe_oa_unit *oa_unit;
> >  	/** @hw_engine_group: the group of hw engines this one belongs to */
> >  	struct xe_hw_engine_group *hw_engine_group;
> > +	/** @pagefault_count: number of pagefaults associated with this engine */
> > +	atomic_t pagefault_count;
> > +	/** @reset_count: number of engine resets associated with this engine */
> > +	atomic_t reset_count;
> >  };
> >  
> >  enum xe_hw_engine_snapshot_source_id {
> > -- 
> > 2.43.0
> > 
> >
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index 46701ca11ce0..04e973b20019 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -130,6 +130,7 @@  static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
 {
 	struct xe_vm *vm = xe_vma_vm(vma);
 	struct xe_tile *tile = gt_to_tile(gt);
+	struct xe_hw_engine *hwe = NULL;
 	struct drm_exec exec;
 	struct dma_fence *fence;
 	ktime_t end = 0;
@@ -140,6 +141,11 @@  static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
 	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, xe_vma_size(vma));
 
 	trace_xe_vma_pagefault(vma);
+
+	hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
+	if (hwe)
+		atomic_inc(&hwe->pagefault_count);
+
 	atomic = access_is_atomic(pf->access_type);
 
 	/* Check if VMA is valid */
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 913c74d6e2ae..6f5d74340319 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1972,6 +1972,7 @@  int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
 {
 	struct xe_gt *gt = guc_to_gt(guc);
 	struct xe_exec_queue *q;
+	struct xe_hw_engine *hwe;
 	u32 guc_id;
 
 	if (unlikely(len < 1))
@@ -1983,8 +1984,12 @@  int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
 	if (unlikely(!q))
 		return -EPROTO;
 
-	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
-		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
+	hwe = q->hwe;
+	atomic_inc(&hwe->reset_count);
+
+	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d, pagefault_count=%u, reset_count=%u",
+		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id,
+		   atomic_read(&hwe->pagefault_count), atomic_read(&hwe->reset_count));
 
 	trace_xe_exec_queue_reset(q);
 
diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
index fc447751fe78..0be6c38fe2a4 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine.c
+++ b/drivers/gpu/drm/xe/xe_hw_engine.c
@@ -516,6 +516,9 @@  static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
 	hwe->fence_irq = &gt->fence_irq[info->class];
 	hwe->engine_id = id;
 
+	atomic_set(&hwe->pagefault_count, 0);
+	atomic_set(&hwe->reset_count, 0);
+
 	hwe->eclass = &gt->eclass[hwe->class];
 	if (!hwe->eclass->sched_props.job_timeout_ms) {
 		hwe->eclass->sched_props.job_timeout_ms = 5 * 1000;
diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
index e4191a7a2c31..635dc3da6523 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
+++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
@@ -150,6 +150,10 @@  struct xe_hw_engine {
 	struct xe_oa_unit *oa_unit;
 	/** @hw_engine_group: the group of hw engines this one belongs to */
 	struct xe_hw_engine_group *hw_engine_group;
+	/** @pagefault_count: number of pagefaults associated with this engine */
+	atomic_t pagefault_count;
+	/** @reset_count: number of engine resets associated with this engine */
+	atomic_t reset_count;
 };
 
 enum xe_hw_engine_snapshot_source_id {