diff mbox series

[2/2] drm/i915: Use internal class when counting engine resets

Message ID 20231201122109.729006-2-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/i915/selftests: Fix engine reset count storage for multi-tile | expand

Commit Message

Tvrtko Ursulin Dec. 1, 2023, 12:21 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Commit 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with reserved uabi class")
made the GSC0 engine not have a valid uabi class and so broke the engine
reset counting, which in turn was made class based in cb823ed9915b ("drm/i915/gt: Use intel_gt as the primary object for handling resets").

Despite the title and commit text of the latter is not mentioning it (and
has left the storage array incorrectly sized), tracking by class, despite
it adding aliasing in hypthotetical multi-tile systems, is handy for
virtual engines which for instance do not have a valid engine->id.

Therefore we keep that but just change it to use the internal class which
is always valid. We also add a helper to increment the count, which
aligns with the existing getter.

What was broken without this fix were out of bounds reads every time a
reset would happen on the GSC0 engine, or during selftests when storing
and cross-checking the counts in igt_live_test_begin and
igt_live_test_end.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Fixes: 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with reserved uabi class")
Reported-by: Alan Previn Teres Alexis <alan.previn.teres.alexis@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_reset.c             |  2 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c |  5 +++--
 drivers/gpu/drm/i915/i915_gpu_error.h             | 12 ++++++++++--
 3 files changed, 14 insertions(+), 5 deletions(-)

Comments

Daniele Ceraolo Spurio Dec. 6, 2023, 12:52 a.m. UTC | #1
On 12/1/2023 4:21 AM, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> Commit 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with reserved uabi class")
> made the GSC0 engine not have a valid uabi class and so broke the engine
> reset counting, which in turn was made class based in cb823ed9915b ("drm/i915/gt: Use intel_gt as the primary object for handling resets").
>
> Despite the title and commit text of the latter is not mentioning it (and
> has left the storage array incorrectly sized), tracking by class, despite
> it adding aliasing in hypthotetical multi-tile systems, is handy for
> virtual engines which for instance do not have a valid engine->id.
>
> Therefore we keep that but just change it to use the internal class which
> is always valid. We also add a helper to increment the count, which
> aligns with the existing getter.
>
> What was broken without this fix were out of bounds reads every time a
> reset would happen on the GSC0 engine, or during selftests when storing
> and cross-checking the counts in igt_live_test_begin and
> igt_live_test_end.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Fixes: 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with reserved uabi class")
> Reported-by: Alan Previn Teres Alexis <alan.previn.teres.alexis@intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele

> ---
>   drivers/gpu/drm/i915/gt/intel_reset.c             |  2 +-
>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c |  5 +++--
>   drivers/gpu/drm/i915/i915_gpu_error.h             | 12 ++++++++++--
>   3 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index d5ed904f355d..6801f8b95c53 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1293,7 +1293,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
>   	if (msg)
>   		drm_notice(&engine->i915->drm,
>   			   "Resetting %s for %s\n", engine->name, msg);
> -	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
> +	i915_increase_reset_engine_count(&engine->i915->gpu_error, engine);
>   
>   	ret = intel_gt_reset_engine(engine);
>   	if (ret) {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 04f8377fd7a3..58ea285c51d4 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -5003,7 +5003,8 @@ static void capture_error_state(struct intel_guc *guc,
>   			if (match) {
>   				intel_engine_set_hung_context(e, ce);
>   				engine_mask |= e->mask;
> -				atomic_inc(&i915->gpu_error.reset_engine_count[e->uabi_class]);
> +				i915_increase_reset_engine_count(&i915->gpu_error,
> +								 e);
>   			}
>   		}
>   
> @@ -5015,7 +5016,7 @@ static void capture_error_state(struct intel_guc *guc,
>   	} else {
>   		intel_engine_set_hung_context(ce->engine, ce);
>   		engine_mask = ce->engine->mask;
> -		atomic_inc(&i915->gpu_error.reset_engine_count[ce->engine->uabi_class]);
> +		i915_increase_reset_engine_count(&i915->gpu_error, ce->engine);
>   	}
>   
>   	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index fa886620d3f8..7c255bb1c319 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -17,6 +17,7 @@
>   #include "display/intel_display_device.h"
>   #include "display/intel_display_params.h"
>   #include "gt/intel_engine.h"
> +#include "gt/intel_engine_types.h"
>   #include "gt/intel_gt_types.h"
>   #include "gt/uc/intel_uc_fw.h"
>   
> @@ -234,7 +235,7 @@ struct i915_gpu_error {
>   	atomic_t reset_count;
>   
>   	/** Number of times an engine has been reset */
> -	atomic_t reset_engine_count[I915_NUM_ENGINES];
> +	atomic_t reset_engine_count[MAX_ENGINE_CLASS];
>   };
>   
>   struct drm_i915_error_state_buf {
> @@ -257,7 +258,14 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
>   static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
>   					  const struct intel_engine_cs *engine)
>   {
> -	return atomic_read(&error->reset_engine_count[engine->uabi_class]);
> +	return atomic_read(&error->reset_engine_count[engine->class]);
> +}
> +
> +static inline void
> +i915_increase_reset_engine_count(struct i915_gpu_error *error,
> +				 const struct intel_engine_cs *engine)
> +{
> +	atomic_inc(&error->reset_engine_count[engine->class]);
>   }
>   
>   #define CORE_DUMP_FLAG_NONE           0x0
Tvrtko Ursulin Dec. 7, 2023, 11:12 a.m. UTC | #2
On 06/12/2023 00:52, Daniele Ceraolo Spurio wrote:
> 
> 
> On 12/1/2023 4:21 AM, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Commit 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with 
>> reserved uabi class")
>> made the GSC0 engine not have a valid uabi class and so broke the engine
>> reset counting, which in turn was made class based in cb823ed9915b 
>> ("drm/i915/gt: Use intel_gt as the primary object for handling resets").
>>
>> Despite the title and commit text of the latter is not mentioning it (and
>> has left the storage array incorrectly sized), tracking by class, despite
>> it adding aliasing in hypthotetical multi-tile systems, is handy for
>> virtual engines which for instance do not have a valid engine->id.
>>
>> Therefore we keep that but just change it to use the internal class which
>> is always valid. We also add a helper to increment the count, which
>> aligns with the existing getter.
>>
>> What was broken without this fix were out of bounds reads every time a
>> reset would happen on the GSC0 engine, or during selftests when storing
>> and cross-checking the counts in igt_live_test_begin and
>> igt_live_test_end.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Fixes: 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with 
>> reserved uabi class")
>> Reported-by: Alan Previn Teres Alexis 
>> <alan.previn.teres.alexis@intel.com>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> 
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Thanks! Lets see if 1/2 gets some attention so I don't have to split out 
2/2 just for CI.

Regards,

Tvrtko

> 
> Daniele
> 
>> ---
>>   drivers/gpu/drm/i915/gt/intel_reset.c             |  2 +-
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c |  5 +++--
>>   drivers/gpu/drm/i915/i915_gpu_error.h             | 12 ++++++++++--
>>   3 files changed, 14 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
>> b/drivers/gpu/drm/i915/gt/intel_reset.c
>> index d5ed904f355d..6801f8b95c53 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
>> @@ -1293,7 +1293,7 @@ int __intel_engine_reset_bh(struct 
>> intel_engine_cs *engine, const char *msg)
>>       if (msg)
>>           drm_notice(&engine->i915->drm,
>>                  "Resetting %s for %s\n", engine->name, msg);
>> -    
>> atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
>> +    i915_increase_reset_engine_count(&engine->i915->gpu_error, engine);
>>       ret = intel_gt_reset_engine(engine);
>>       if (ret) {
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> index 04f8377fd7a3..58ea285c51d4 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> @@ -5003,7 +5003,8 @@ static void capture_error_state(struct intel_guc 
>> *guc,
>>               if (match) {
>>                   intel_engine_set_hung_context(e, ce);
>>                   engine_mask |= e->mask;
>> -                
>> atomic_inc(&i915->gpu_error.reset_engine_count[e->uabi_class]);
>> +                i915_increase_reset_engine_count(&i915->gpu_error,
>> +                                 e);
>>               }
>>           }
>> @@ -5015,7 +5016,7 @@ static void capture_error_state(struct intel_guc 
>> *guc,
>>       } else {
>>           intel_engine_set_hung_context(ce->engine, ce);
>>           engine_mask = ce->engine->mask;
>> -        
>> atomic_inc(&i915->gpu_error.reset_engine_count[ce->engine->uabi_class]);
>> +        i915_increase_reset_engine_count(&i915->gpu_error, ce->engine);
>>       }
>>       with_intel_runtime_pm(&i915->runtime_pm, wakeref)
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
>> b/drivers/gpu/drm/i915/i915_gpu_error.h
>> index fa886620d3f8..7c255bb1c319 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
>> @@ -17,6 +17,7 @@
>>   #include "display/intel_display_device.h"
>>   #include "display/intel_display_params.h"
>>   #include "gt/intel_engine.h"
>> +#include "gt/intel_engine_types.h"
>>   #include "gt/intel_gt_types.h"
>>   #include "gt/uc/intel_uc_fw.h"
>> @@ -234,7 +235,7 @@ struct i915_gpu_error {
>>       atomic_t reset_count;
>>       /** Number of times an engine has been reset */
>> -    atomic_t reset_engine_count[I915_NUM_ENGINES];
>> +    atomic_t reset_engine_count[MAX_ENGINE_CLASS];
>>   };
>>   struct drm_i915_error_state_buf {
>> @@ -257,7 +258,14 @@ static inline u32 i915_reset_count(struct 
>> i915_gpu_error *error)
>>   static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
>>                         const struct intel_engine_cs *engine)
>>   {
>> -    return atomic_read(&error->reset_engine_count[engine->uabi_class]);
>> +    return atomic_read(&error->reset_engine_count[engine->class]);
>> +}
>> +
>> +static inline void
>> +i915_increase_reset_engine_count(struct i915_gpu_error *error,
>> +                 const struct intel_engine_cs *engine)
>> +{
>> +    atomic_inc(&error->reset_engine_count[engine->class]);
>>   }
>>   #define CORE_DUMP_FLAG_NONE           0x0
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index d5ed904f355d..6801f8b95c53 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1293,7 +1293,7 @@  int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
 	if (msg)
 		drm_notice(&engine->i915->drm,
 			   "Resetting %s for %s\n", engine->name, msg);
-	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
+	i915_increase_reset_engine_count(&engine->i915->gpu_error, engine);
 
 	ret = intel_gt_reset_engine(engine);
 	if (ret) {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 04f8377fd7a3..58ea285c51d4 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -5003,7 +5003,8 @@  static void capture_error_state(struct intel_guc *guc,
 			if (match) {
 				intel_engine_set_hung_context(e, ce);
 				engine_mask |= e->mask;
-				atomic_inc(&i915->gpu_error.reset_engine_count[e->uabi_class]);
+				i915_increase_reset_engine_count(&i915->gpu_error,
+								 e);
 			}
 		}
 
@@ -5015,7 +5016,7 @@  static void capture_error_state(struct intel_guc *guc,
 	} else {
 		intel_engine_set_hung_context(ce->engine, ce);
 		engine_mask = ce->engine->mask;
-		atomic_inc(&i915->gpu_error.reset_engine_count[ce->engine->uabi_class]);
+		i915_increase_reset_engine_count(&i915->gpu_error, ce->engine);
 	}
 
 	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index fa886620d3f8..7c255bb1c319 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -17,6 +17,7 @@ 
 #include "display/intel_display_device.h"
 #include "display/intel_display_params.h"
 #include "gt/intel_engine.h"
+#include "gt/intel_engine_types.h"
 #include "gt/intel_gt_types.h"
 #include "gt/uc/intel_uc_fw.h"
 
@@ -234,7 +235,7 @@  struct i915_gpu_error {
 	atomic_t reset_count;
 
 	/** Number of times an engine has been reset */
-	atomic_t reset_engine_count[I915_NUM_ENGINES];
+	atomic_t reset_engine_count[MAX_ENGINE_CLASS];
 };
 
 struct drm_i915_error_state_buf {
@@ -257,7 +258,14 @@  static inline u32 i915_reset_count(struct i915_gpu_error *error)
 static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
 					  const struct intel_engine_cs *engine)
 {
-	return atomic_read(&error->reset_engine_count[engine->uabi_class]);
+	return atomic_read(&error->reset_engine_count[engine->class]);
+}
+
+static inline void
+i915_increase_reset_engine_count(struct i915_gpu_error *error,
+				 const struct intel_engine_cs *engine)
+{
+	atomic_inc(&error->reset_engine_count[engine->class]);
 }
 
 #define CORE_DUMP_FLAG_NONE           0x0