diff mbox

[6/9] drm/i915/tdr: Add engine reset count to error state

Message ID 20161216202010.7983-7-michel.thierry@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michel Thierry Dec. 16, 2016, 8:20 p.m. UTC
From: Arun Siluvery <arun.siluvery@linux.intel.com>

Driver maintains count of how many times a given engine is reset, useful to
capture this in error state also. It gives an idea of how engine is coping
up with the workloads it is executing before this error state.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c       | 1 +
 drivers/gpu/drm/i915/i915_drv.h       | 9 +++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c | 3 +++
 3 files changed, 13 insertions(+)

Comments

Chris Wilson Dec. 16, 2016, 8:37 p.m. UTC | #1
On Fri, Dec 16, 2016 at 12:20:07PM -0800, Michel Thierry wrote:
> From: Arun Siluvery <arun.siluvery@linux.intel.com>
> 
> Driver maintains count of how many times a given engine is reset, useful to
> capture this in error state also. It gives an idea of how engine is coping
> up with the workloads it is executing before this error state.

Also i915_engine_info().
 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c       | 1 +
>  drivers/gpu/drm/i915/i915_drv.h       | 9 +++++++++
>  drivers/gpu/drm/i915/i915_gpu_error.c | 3 +++
>  3 files changed, 13 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index a034793bc246..1c706a082d60 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1877,6 +1877,7 @@ int i915_reset_engine(struct intel_engine_cs *engine)
>  	intel_engine_reset_cancel(engine);
>  	intel_execlists_restart_submission(engine);
>  
> +	dev_priv->gpu_error.engine_reset_count[engine->id]++;
>  	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>  	return 0;
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a97cc8f50ade..25183762ed94 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -937,6 +937,7 @@ struct drm_i915_error_state {
>  		enum intel_engine_hangcheck_action hangcheck_action;
>  		struct i915_address_space *vm;
>  		int num_requests;
> +		u32 reset_count;
>  
>  		/* position of active request inside the ring */
>  		u32 rq_head, rq_post, rq_tail;
> @@ -1629,6 +1630,8 @@ struct i915_gpu_error {
>  #define I915_RESET_IN_PROGRESS	0
>  #define I915_WEDGED		(BITS_PER_LONG - 1)
>  
> +	unsigned long engine_reset_count[I915_NUM_ENGINES];
> +
>  	/**
>  	 * Waitqueue to signal when a hang is detected. Used to for waiters
>  	 * to release the struct_mutex for the reset to procede.
> @@ -3397,6 +3400,12 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
>  	return READ_ONCE(error->reset_count);
>  }
>  
> +static inline u32 i915_engine_reset_count(struct i915_gpu_error *error,

That's a weird hodgepodge. We first think of intel_engine and are
confused, then we spot the reset. i915_reset_engine_count?

> +					  struct intel_engine_cs *engine)
> +{
> +	return READ_ONCE(error->engine_reset_count[engine->id]);
> +}
> +
>  void i915_gem_reset(struct drm_i915_private *dev_priv);
>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
>  void i915_gem_reset_engine(struct intel_engine_cs *engine);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index e16037d1b0ba..f168ad873521 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -453,6 +453,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
>  	err_printf(m, "  hangcheck action timestamp: %lu, %u ms ago\n",
>  		   ee->hangcheck_timestamp,
>  		   jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
> +	err_printf(m, "  engine reset count: %u\n", ee->reset_count);
>  
>  	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
>  	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
> @@ -1170,6 +1171,8 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
>  	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
>  	ee->hangcheck_action = engine->hangcheck.action;
>  	ee->hangcheck_stalled = engine->hangcheck.stalled;
> +	ee->reset_count = i915_engine_reset_count(&dev_priv->gpu_error,
> +						  engine);
>  
>  	if (USES_PPGTT(dev_priv)) {
>  		int i;
> -- 
> 2.11.0
>
Michel Thierry Dec. 16, 2016, 9:19 p.m. UTC | #2
On 16/12/16 12:37, Chris Wilson wrote:
> On Fri, Dec 16, 2016 at 12:20:07PM -0800, Michel Thierry wrote:
>> From: Arun Siluvery <arun.siluvery@linux.intel.com>
>>
>> Driver maintains count of how many times a given engine is reset, useful to
>> capture this in error state also. It gives an idea of how engine is coping
>> up with the workloads it is executing before this error state.
>
> Also i915_engine_info().
>
I'll add it there too.

>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.c       | 1 +
>>  drivers/gpu/drm/i915/i915_drv.h       | 9 +++++++++
>>  drivers/gpu/drm/i915/i915_gpu_error.c | 3 +++
>>  3 files changed, 13 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>> index a034793bc246..1c706a082d60 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.c
>> +++ b/drivers/gpu/drm/i915/i915_drv.c
>> @@ -1877,6 +1877,7 @@ int i915_reset_engine(struct intel_engine_cs *engine)
>>  	intel_engine_reset_cancel(engine);
>>  	intel_execlists_restart_submission(engine);
>>
>> +	dev_priv->gpu_error.engine_reset_count[engine->id]++;
>>  	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>>  	return 0;
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index a97cc8f50ade..25183762ed94 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -937,6 +937,7 @@ struct drm_i915_error_state {
>>  		enum intel_engine_hangcheck_action hangcheck_action;
>>  		struct i915_address_space *vm;
>>  		int num_requests;
>> +		u32 reset_count;
>>
>>  		/* position of active request inside the ring */
>>  		u32 rq_head, rq_post, rq_tail;
>> @@ -1629,6 +1630,8 @@ struct i915_gpu_error {
>>  #define I915_RESET_IN_PROGRESS	0
>>  #define I915_WEDGED		(BITS_PER_LONG - 1)
>>
>> +	unsigned long engine_reset_count[I915_NUM_ENGINES];
>> +
>>  	/**
>>  	 * Waitqueue to signal when a hang is detected. Used to for waiters
>>  	 * to release the struct_mutex for the reset to procede.
>> @@ -3397,6 +3400,12 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
>>  	return READ_ONCE(error->reset_count);
>>  }
>>
>> +static inline u32 i915_engine_reset_count(struct i915_gpu_error *error,
>
> That's a weird hodgepodge. We first think of intel_engine and are
> confused, then we spot the reset. i915_reset_engine_count?
>

Agreed i915_reset_engine_count is better (plus it's called from 
i915_reset_info).

>> +					  struct intel_engine_cs *engine)
>> +{
>> +	return READ_ONCE(error->engine_reset_count[engine->id]);
>> +}
>> +

Then the same rename here, reset_engine_count[].

Thanks
>>  void i915_gem_reset(struct drm_i915_private *dev_priv);
>>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
>>  void i915_gem_reset_engine(struct intel_engine_cs *engine);
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>> index e16037d1b0ba..f168ad873521 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>> @@ -453,6 +453,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
>>  	err_printf(m, "  hangcheck action timestamp: %lu, %u ms ago\n",
>>  		   ee->hangcheck_timestamp,
>>  		   jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
>> +	err_printf(m, "  engine reset count: %u\n", ee->reset_count);
>>
>>  	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
>>  	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
>> @@ -1170,6 +1171,8 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
>>  	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
>>  	ee->hangcheck_action = engine->hangcheck.action;
>>  	ee->hangcheck_stalled = engine->hangcheck.stalled;
>> +	ee->reset_count = i915_engine_reset_count(&dev_priv->gpu_error,
>> +						  engine);
>>
>>  	if (USES_PPGTT(dev_priv)) {
>>  		int i;
>> --
>> 2.11.0
>>
>
Tvrtko Ursulin Dec. 19, 2016, 8:27 a.m. UTC | #3
On 16/12/2016 20:20, Michel Thierry wrote:
> From: Arun Siluvery <arun.siluvery@linux.intel.com>
>
> Driver maintains count of how many times a given engine is reset, useful to
> capture this in error state also. It gives an idea of how engine is coping
> up with the workloads it is executing before this error state.
>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c       | 1 +
>  drivers/gpu/drm/i915/i915_drv.h       | 9 +++++++++
>  drivers/gpu/drm/i915/i915_gpu_error.c | 3 +++
>  3 files changed, 13 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index a034793bc246..1c706a082d60 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1877,6 +1877,7 @@ int i915_reset_engine(struct intel_engine_cs *engine)
>  	intel_engine_reset_cancel(engine);
>  	intel_execlists_restart_submission(engine);
>
> +	dev_priv->gpu_error.engine_reset_count[engine->id]++;
>  	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>  	return 0;
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a97cc8f50ade..25183762ed94 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -937,6 +937,7 @@ struct drm_i915_error_state {
>  		enum intel_engine_hangcheck_action hangcheck_action;
>  		struct i915_address_space *vm;
>  		int num_requests;
> +		u32 reset_count;
>
>  		/* position of active request inside the ring */
>  		u32 rq_head, rq_post, rq_tail;
> @@ -1629,6 +1630,8 @@ struct i915_gpu_error {
>  #define I915_RESET_IN_PROGRESS	0
>  #define I915_WEDGED		(BITS_PER_LONG - 1)
>
> +	unsigned long engine_reset_count[I915_NUM_ENGINES];
> +
>  	/**
>  	 * Waitqueue to signal when a hang is detected. Used to for waiters
>  	 * to release the struct_mutex for the reset to procede.
> @@ -3397,6 +3400,12 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
>  	return READ_ONCE(error->reset_count);
>  }
>
> +static inline u32 i915_engine_reset_count(struct i915_gpu_error *error,
> +					  struct intel_engine_cs *engine)
> +{
> +	return READ_ONCE(error->engine_reset_count[engine->id]);
> +}

Accidentally spotted a type width mismatch here between u32 and unsigned 
long. Not sure which one would be best, maybe unsigned int?

Regards,

Tvrtko

> +
>  void i915_gem_reset(struct drm_i915_private *dev_priv);
>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
>  void i915_gem_reset_engine(struct intel_engine_cs *engine);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index e16037d1b0ba..f168ad873521 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -453,6 +453,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
>  	err_printf(m, "  hangcheck action timestamp: %lu, %u ms ago\n",
>  		   ee->hangcheck_timestamp,
>  		   jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
> +	err_printf(m, "  engine reset count: %u\n", ee->reset_count);
>
>  	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
>  	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
> @@ -1170,6 +1171,8 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
>  	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
>  	ee->hangcheck_action = engine->hangcheck.action;
>  	ee->hangcheck_stalled = engine->hangcheck.stalled;
> +	ee->reset_count = i915_engine_reset_count(&dev_priv->gpu_error,
> +						  engine);
>
>  	if (USES_PPGTT(dev_priv)) {
>  		int i;
>
Michel Thierry Dec. 19, 2016, 6:09 p.m. UTC | #4
On 19/12/16 00:27, Tvrtko Ursulin wrote:
>
> On 16/12/2016 20:20, Michel Thierry wrote:
>> From: Arun Siluvery <arun.siluvery@linux.intel.com>
>> @@ -937,6 +937,7 @@ struct drm_i915_error_state {
>>          enum intel_engine_hangcheck_action hangcheck_action;
>>          struct i915_address_space *vm;
>>          int num_requests;
>> +        u32 reset_count;
>>
>>          /* position of active request inside the ring */
>>          u32 rq_head, rq_post, rq_tail;
>> @@ -1629,6 +1630,8 @@ struct i915_gpu_error {
>>  #define I915_RESET_IN_PROGRESS    0
>>  #define I915_WEDGED        (BITS_PER_LONG - 1)
>>
>> +    unsigned long engine_reset_count[I915_NUM_ENGINES];
>> +
>>      /**
>>       * Waitqueue to signal when a hang is detected. Used to for waiters
>>       * to release the struct_mutex for the reset to procede.
>> @@ -3397,6 +3400,12 @@ static inline u32 i915_reset_count(struct
>> i915_gpu_error *error)
>>      return READ_ONCE(error->reset_count);
>>  }
>>
>> +static inline u32 i915_engine_reset_count(struct i915_gpu_error *error,
>> +                      struct intel_engine_cs *engine)
>> +{
>> +    return READ_ONCE(error->engine_reset_count[engine->id]);
>> +}
>
> Accidentally spotted a type width mismatch here between u32 and unsigned
> long. Not sure which one would be best, maybe unsigned int?
>
> Regards,
>

Thanks, I'll change engine_reset_count to u32...  I don't think we 
should worry too much if it overflows.

> Tvrtko
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index a034793bc246..1c706a082d60 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1877,6 +1877,7 @@  int i915_reset_engine(struct intel_engine_cs *engine)
 	intel_engine_reset_cancel(engine);
 	intel_execlists_restart_submission(engine);
 
+	dev_priv->gpu_error.engine_reset_count[engine->id]++;
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 	return 0;
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a97cc8f50ade..25183762ed94 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -937,6 +937,7 @@  struct drm_i915_error_state {
 		enum intel_engine_hangcheck_action hangcheck_action;
 		struct i915_address_space *vm;
 		int num_requests;
+		u32 reset_count;
 
 		/* position of active request inside the ring */
 		u32 rq_head, rq_post, rq_tail;
@@ -1629,6 +1630,8 @@  struct i915_gpu_error {
 #define I915_RESET_IN_PROGRESS	0
 #define I915_WEDGED		(BITS_PER_LONG - 1)
 
+	unsigned long engine_reset_count[I915_NUM_ENGINES];
+
 	/**
 	 * Waitqueue to signal when a hang is detected. Used to for waiters
 	 * to release the struct_mutex for the reset to procede.
@@ -3397,6 +3400,12 @@  static inline u32 i915_reset_count(struct i915_gpu_error *error)
 	return READ_ONCE(error->reset_count);
 }
 
+static inline u32 i915_engine_reset_count(struct i915_gpu_error *error,
+					  struct intel_engine_cs *engine)
+{
+	return READ_ONCE(error->engine_reset_count[engine->id]);
+}
+
 void i915_gem_reset(struct drm_i915_private *dev_priv);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
 void i915_gem_reset_engine(struct intel_engine_cs *engine);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index e16037d1b0ba..f168ad873521 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -453,6 +453,7 @@  static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  hangcheck action timestamp: %lu, %u ms ago\n",
 		   ee->hangcheck_timestamp,
 		   jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
+	err_printf(m, "  engine reset count: %u\n", ee->reset_count);
 
 	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
 	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
@@ -1170,6 +1171,8 @@  static void error_record_engine_registers(struct drm_i915_error_state *error,
 	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->hangcheck_action = engine->hangcheck.action;
 	ee->hangcheck_stalled = engine->hangcheck.stalled;
+	ee->reset_count = i915_engine_reset_count(&dev_priv->gpu_error,
+						  engine);
 
 	if (USES_PPGTT(dev_priv)) {
 		int i;