diff mbox series

drm/i915/selftests: Allow engine reset failure to do a GT reset in hangcheck selftest

Message ID 20211011234705.30853-1-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/selftests: Allow engine reset failure to do a GT reset in hangcheck selftest | expand

Commit Message

Matthew Brost Oct. 11, 2021, 11:47 p.m. UTC
The hangcheck selftest blocks per engine resets by setting magic bits in
the reset flags. This is incorrect for GuC submission because if the GuC
fails to reset an engine we would like to do a full GT reset. Do not set
these magic bits when using GuC submission.

Side note: this lockless algorithm with magic bits to block resets
should be ripped out.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

Comments

Thomas Hellstrom Oct. 21, 2021, 6:15 a.m. UTC | #1
Hi, Matthew,

On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
> The hangcheck selftest blocks per engine resets by setting magic bits
> in
> the reset flags. This is incorrect for GuC submission because if the
> GuC
> fails to reset an engine we would like to do a full GT reset. Do no
> set
> these magic bits when using GuC submission.
> 
> Side note this lockless algorithm with magic bits to block resets
> really
> should be ripped out.
> 

Lockless algorithm aside, from a quick look at the code in
intel_reset.c it appears to me like the interface that falls back to a
full GT reset is intel_gt_handle_error() whereas intel_engine_reset()
is explicitly intended to not do that, so is there a discrepancy
between GuC and non-GuC here?

/Thomas


> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> index 7e2d99dd012d..90a03c60c80c 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> @@ -734,7 +734,8 @@ static int __igt_reset_engine(struct intel_gt
> *gt, bool active)
>                 reset_engine_count = i915_reset_engine_count(global,
> engine);
>  
>                 st_engine_heartbeat_disable(engine);
> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> +               if (!using_guc)
> +                       set_bit(I915_RESET_ENGINE + id, &gt-
> >reset.flags);
>                 count = 0;
>                 do {
>                         struct i915_request *rq = NULL;
> @@ -824,7 +825,8 @@ static int __igt_reset_engine(struct intel_gt
> *gt, bool active)
>                         if (err)
>                                 break;
>                 } while (time_before(jiffies, end_time));
> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> +               if (!using_guc)
> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
> >reset.flags);
>                 st_engine_heartbeat_enable(engine);
>                 pr_info("%s: Completed %lu %s resets\n",
>                         engine->name, count, active ? "active" :
> "idle");
> @@ -1042,7 +1044,8 @@ static int __igt_reset_engines(struct intel_gt
> *gt,
>                 yield(); /* start all threads before we begin */
>  
>                 st_engine_heartbeat_disable_no_pm(engine);
> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> +               if (!using_guc)
> +                       set_bit(I915_RESET_ENGINE + id, &gt-
> >reset.flags);
>                 do {
>                         struct i915_request *rq = NULL;
>                         struct intel_selftest_saved_policy saved;
> @@ -1165,7 +1168,8 @@ static int __igt_reset_engines(struct intel_gt
> *gt,
>                         if (err)
>                                 break;
>                 } while (time_before(jiffies, end_time));
> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> +               if (!using_guc)
> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
> >reset.flags);
>                 st_engine_heartbeat_enable_no_pm(engine);
>  
>                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
Matthew Brost Oct. 21, 2021, 8:37 p.m. UTC | #2
On Thu, Oct 21, 2021 at 08:15:49AM +0200, Thomas Hellström wrote:
> Hi, Matthew,
> 
> On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
> > The hangcheck selftest blocks per engine resets by setting magic bits
> > in
> > the reset flags. This is incorrect for GuC submission because if the
> > GuC
> > fails to reset an engine we would like to do a full GT reset. Do no
> > set
> > these magic bits when using GuC submission.
> > 
> > Side note this lockless algorithm with magic bits to block resets
> > really
> > should be ripped out.
> > 
> 
> Lockless algorithm aside, from a quick look at the code in
> intel_reset.c it appears to me like the interface that falls back to a
> full GT reset is intel_gt_handle_error() whereas intel_engine_reset()
> is explicitly intended to not do that, so is there a discrepancy
> between GuC and non-GuC here?
> 

With GuC submission when an engine reset fails, we get an engine reset
failure notification which triggers a full GT reset
(intel_guc_engine_failure_process_msg in intel_guc_submission.c). That
reset is blocked by setting these magic bits. Clearing the bits in this
function doesn't seem to unblock that reset either; the driver tries to
unload with a worker blocked, and results in the blow up. Something with
this lockless algorithm could be wrong, as clearing the bit should
unblock the reset but it doesn't. We can look into that, but in the
meantime we need to fix this test to be able to fail gracefully and not
crash CI.

Matt

> /Thomas
> 
> 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 12 ++++++++----
> >  1 file changed, 8 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> > b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> > index 7e2d99dd012d..90a03c60c80c 100644
> > --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> > @@ -734,7 +734,8 @@ static int __igt_reset_engine(struct intel_gt
> > *gt, bool active)
> >                 reset_engine_count = i915_reset_engine_count(global,
> > engine);
> >  
> >                 st_engine_heartbeat_disable(engine);
> > -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> > +               if (!using_guc)
> > +                       set_bit(I915_RESET_ENGINE + id, &gt-
> > >reset.flags);
> >                 count = 0;
> >                 do {
> >                         struct i915_request *rq = NULL;
> > @@ -824,7 +825,8 @@ static int __igt_reset_engine(struct intel_gt
> > *gt, bool active)
> >                         if (err)
> >                                 break;
> >                 } while (time_before(jiffies, end_time));
> > -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> > +               if (!using_guc)
> > +                       clear_bit(I915_RESET_ENGINE + id, &gt-
> > >reset.flags);
> >                 st_engine_heartbeat_enable(engine);
> >                 pr_info("%s: Completed %lu %s resets\n",
> >                         engine->name, count, active ? "active" :
> > "idle");
> > @@ -1042,7 +1044,8 @@ static int __igt_reset_engines(struct intel_gt
> > *gt,
> >                 yield(); /* start all threads before we begin */
> >  
> >                 st_engine_heartbeat_disable_no_pm(engine);
> > -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> > +               if (!using_guc)
> > +                       set_bit(I915_RESET_ENGINE + id, &gt-
> > >reset.flags);
> >                 do {
> >                         struct i915_request *rq = NULL;
> >                         struct intel_selftest_saved_policy saved;
> > @@ -1165,7 +1168,8 @@ static int __igt_reset_engines(struct intel_gt
> > *gt,
> >                         if (err)
> >                                 break;
> >                 } while (time_before(jiffies, end_time));
> > -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> > +               if (!using_guc)
> > +                       clear_bit(I915_RESET_ENGINE + id, &gt-
> > >reset.flags);
> >                 st_engine_heartbeat_enable_no_pm(engine);
> >  
> >                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
> 
>
Thomas Hellstrom Oct. 22, 2021, 6:23 a.m. UTC | #3
On 10/21/21 22:37, Matthew Brost wrote:
> On Thu, Oct 21, 2021 at 08:15:49AM +0200, Thomas Hellström wrote:
>> Hi, Matthew,
>>
>> On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
>>> The hangcheck selftest blocks per engine resets by setting magic bits
>>> in
>>> the reset flags. This is incorrect for GuC submission because if the
>>> GuC
>>> fails to reset an engine we would like to do a full GT reset. Do no
>>> set
>>> these magic bits when using GuC submission.
>>>
>>> Side note this lockless algorithm with magic bits to block resets
>>> really
>>> should be ripped out.
>>>
>> Lockless algorithm aside, from a quick look at the code in
>> intel_reset.c it appears to me like the interface that falls back to a
>> full GT reset is intel_gt_handle_error() whereas intel_engine_reset()
>> is explicitly intended to not do that, so is there a discrepancy
>> between GuC and non-GuC here?
>>
> With GuC submission when an engine reset fails, we get an engine reset
> failure notification which triggers a full GT reset
> (intel_guc_engine_failure_process_msg in intel_guc_submission.c). That
> reset is blocking by setting these magic bits. Clearing the bits in this
> function doesn't seem to unblock that reset either, the driver tries to
> unload with a worker blocked, and results in the blow up. Something with
> this lockless algorithm could be wrong as clear of the bit should
> unlblock the reset but it is doesn't. We can look into that but in the
> meantime we need to fix this test to be able to fail gracefully and not
> crash CI.

Yeah, for that lockless algorithm if needed, we might want to use a 
ww_mutex per engine or something,
but point was that AFAICT at least one of the tests that set those flags 
explicitly tested the functionality that no other engines than the 
intended one was reset when the intel_engine_reset() function was used, 
and then if GuC submission doesn't honor that, wouldn't a better 
approach be to make a code comment around intel_engine_reset() to 
explain the differences and disable that particular test for GuC? Also 
wouldn't we, for example, see a duplicated full GT reset with GuC if 
intel_engine_reset() fails as part of the intel_gt_handle_error() function?

I guess we could live with the hangcheck test being disabled for guc 
submission until this is sorted out?

/Thomas


>
> Matt
>
>> /Thomas
>>
>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 12 ++++++++----
>>>   1 file changed, 8 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>> b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>> index 7e2d99dd012d..90a03c60c80c 100644
>>> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>> @@ -734,7 +734,8 @@ static int __igt_reset_engine(struct intel_gt
>>> *gt, bool active)
>>>                  reset_engine_count = i915_reset_engine_count(global,
>>> engine);
>>>   
>>>                  st_engine_heartbeat_disable(engine);
>>> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>> +               if (!using_guc)
>>> +                       set_bit(I915_RESET_ENGINE + id, &gt-
>>>> reset.flags);
>>>                  count = 0;
>>>                  do {
>>>                          struct i915_request *rq = NULL;
>>> @@ -824,7 +825,8 @@ static int __igt_reset_engine(struct intel_gt
>>> *gt, bool active)
>>>                          if (err)
>>>                                  break;
>>>                  } while (time_before(jiffies, end_time));
>>> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>> +               if (!using_guc)
>>> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
>>>> reset.flags);
>>>                  st_engine_heartbeat_enable(engine);
>>>                  pr_info("%s: Completed %lu %s resets\n",
>>>                          engine->name, count, active ? "active" :
>>> "idle");
>>> @@ -1042,7 +1044,8 @@ static int __igt_reset_engines(struct intel_gt
>>> *gt,
>>>                  yield(); /* start all threads before we begin */
>>>   
>>>                  st_engine_heartbeat_disable_no_pm(engine);
>>> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>> +               if (!using_guc)
>>> +                       set_bit(I915_RESET_ENGINE + id, &gt-
>>>> reset.flags);
>>>                  do {
>>>                          struct i915_request *rq = NULL;
>>>                          struct intel_selftest_saved_policy saved;
>>> @@ -1165,7 +1168,8 @@ static int __igt_reset_engines(struct intel_gt
>>> *gt,
>>>                          if (err)
>>>                                  break;
>>>                  } while (time_before(jiffies, end_time));
>>> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>> +               if (!using_guc)
>>> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
>>>> reset.flags);
>>>                  st_engine_heartbeat_enable_no_pm(engine);
>>>   
>>>                  pr_info("i915_reset_engine(%s:%s): %lu resets\n",
>>
Matthew Brost Oct. 22, 2021, 5:03 p.m. UTC | #4
On Fri, Oct 22, 2021 at 08:23:55AM +0200, Thomas Hellström wrote:
> 
> On 10/21/21 22:37, Matthew Brost wrote:
> > On Thu, Oct 21, 2021 at 08:15:49AM +0200, Thomas Hellström wrote:
> > > Hi, Matthew,
> > > 
> > > On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
> > > > The hangcheck selftest blocks per engine resets by setting magic bits
> > > > in
> > > > the reset flags. This is incorrect for GuC submission because if the
> > > > GuC
> > > > fails to reset an engine we would like to do a full GT reset. Do no
> > > > set
> > > > these magic bits when using GuC submission.
> > > > 
> > > > Side note this lockless algorithm with magic bits to block resets
> > > > really
> > > > should be ripped out.
> > > > 
> > > Lockless algorithm aside, from a quick look at the code in
> > > intel_reset.c it appears to me like the interface that falls back to a
> > > full GT reset is intel_gt_handle_error() whereas intel_engine_reset()
> > > is explicitly intended to not do that, so is there a discrepancy
> > > between GuC and non-GuC here?
> > > 
> > With GuC submission when an engine reset fails, we get an engine reset
> > failure notification which triggers a full GT reset
> > (intel_guc_engine_failure_process_msg in intel_guc_submission.c). That
> > reset is blocking by setting these magic bits. Clearing the bits in this
> > function doesn't seem to unblock that reset either, the driver tries to
> > unload with a worker blocked, and results in the blow up. Something with
> > this lockless algorithm could be wrong as clear of the bit should
> > unlblock the reset but it is doesn't. We can look into that but in the
> > meantime we need to fix this test to be able to fail gracefully and not
> > crash CI.
> 
> Yeah, for that lockless algorithm if needed, we might want to use a ww_mutex
> per engine or something,

Do ww_mutex sleep? From what I can tell this lockless algorithm was
added because even though resets are protected by mutex, there are some
places in the IRQ context where we need to prevent resets from
happening, hence the lockless protection + the mutex - what a mess. Long
term this needs to be rethought.

> but point was that AFAICT at least one of the tests that set those flags
> explicitly tested the functionality that no other engines than the intended
> one was reset when the intel_engine_reset() function was used, and then if
> GuC submission doesn't honor that, wouldn't a better approach be to make a

No. In execlists this test explicitly calls the engine reset function and
explicitly prevents other parts of the i915 from calling the engine reset
function - this is why it sets that bit.

In GuC submission the i915 can't do engine resets, the GuC does. In this
case the engine reset fails which triggers a G2H message which tells the
i915 to do a GT reset. If this bit is set the worker blocks on this bit
in the GT reset and the driver blows up on unload as this worker isn't
complete (believe it has a PM ref or something).

> code comment around intel_engine_reset() to explain the differences and

intel_engine_reset() returns -ENODEV in GuC submission as the i915 isn't
allowed to do engine resets.

> disable that particular test for GuC?. Also wouldn't we for example we see a
> duplicated full GT reset with GuC if intel_engine_reset() fails as part of
> the intel_gt_handle_error() function?
>

Yes, but the GT reset in this test is done after clearing the bits by
the test. In the case of the GuC the GT reset is async operation done by
a worker that receives the G2H message saying the engine reset failed.

> I guess we could live with the hangcheck test being disabled for guc
> submission until this is sorted out?
>

Wouldn't help. See above, this is an async operation from a G2H message. We
can't disable the async G2H handler as without other G2H messages the
world breaks. The only other possible fix would be add an IGT only
variable that if set skips the handling this G2H only.

I can assure with this patch, if the test fails, it fails gracefully
which is what we want.

Matt

> /Thomas
> 
> 
> > 
> > Matt
> > 
> > > /Thomas
> > > 
> > > 
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >   drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 12 ++++++++----
> > > >   1 file changed, 8 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> > > > b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> > > > index 7e2d99dd012d..90a03c60c80c 100644
> > > > --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> > > > +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> > > > @@ -734,7 +734,8 @@ static int __igt_reset_engine(struct intel_gt
> > > > *gt, bool active)
> > > >                  reset_engine_count = i915_reset_engine_count(global,
> > > > engine);
> > > >                  st_engine_heartbeat_disable(engine);
> > > > -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> > > > +               if (!using_guc)
> > > > +                       set_bit(I915_RESET_ENGINE + id, &gt-
> > > > > reset.flags);
> > > >                  count = 0;
> > > >                  do {
> > > >                          struct i915_request *rq = NULL;
> > > > @@ -824,7 +825,8 @@ static int __igt_reset_engine(struct intel_gt
> > > > *gt, bool active)
> > > >                          if (err)
> > > >                                  break;
> > > >                  } while (time_before(jiffies, end_time));
> > > > -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> > > > +               if (!using_guc)
> > > > +                       clear_bit(I915_RESET_ENGINE + id, &gt-
> > > > > reset.flags);
> > > >                  st_engine_heartbeat_enable(engine);
> > > >                  pr_info("%s: Completed %lu %s resets\n",
> > > >                          engine->name, count, active ? "active" :
> > > > "idle");
> > > > @@ -1042,7 +1044,8 @@ static int __igt_reset_engines(struct intel_gt
> > > > *gt,
> > > >                  yield(); /* start all threads before we begin */
> > > >                  st_engine_heartbeat_disable_no_pm(engine);
> > > > -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> > > > +               if (!using_guc)
> > > > +                       set_bit(I915_RESET_ENGINE + id, &gt-
> > > > > reset.flags);
> > > >                  do {
> > > >                          struct i915_request *rq = NULL;
> > > >                          struct intel_selftest_saved_policy saved;
> > > > @@ -1165,7 +1168,8 @@ static int __igt_reset_engines(struct intel_gt
> > > > *gt,
> > > >                          if (err)
> > > >                                  break;
> > > >                  } while (time_before(jiffies, end_time));
> > > > -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> > > > +               if (!using_guc)
> > > > +                       clear_bit(I915_RESET_ENGINE + id, &gt-
> > > > > reset.flags);
> > > >                  st_engine_heartbeat_enable_no_pm(engine);
> > > >                  pr_info("i915_reset_engine(%s:%s): %lu resets\n",
> > >
John Harrison Oct. 22, 2021, 6:09 p.m. UTC | #5
On 10/22/2021 10:03, Matthew Brost wrote:
> On Fri, Oct 22, 2021 at 08:23:55AM +0200, Thomas Hellström wrote:
>> On 10/21/21 22:37, Matthew Brost wrote:
>>> On Thu, Oct 21, 2021 at 08:15:49AM +0200, Thomas Hellström wrote:
>>>> Hi, Matthew,
>>>>
>>>> On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
>>>>> The hangcheck selftest blocks per engine resets by setting magic bits
>>>>> in
>>>>> the reset flags. This is incorrect for GuC submission because if the
>>>>> GuC
>>>>> fails to reset an engine we would like to do a full GT reset. Do no
>>>>> set
>>>>> these magic bits when using GuC submission.
>>>>>
>>>>> Side note this lockless algorithm with magic bits to block resets
>>>>> really
>>>>> should be ripped out.
>>>>>
>>>> Lockless algorithm aside, from a quick look at the code in
>>>> intel_reset.c it appears to me like the interface that falls back to a
>>>> full GT reset is intel_gt_handle_error() whereas intel_engine_reset()
>>>> is explicitly intended to not do that, so is there a discrepancy
>>>> between GuC and non-GuC here?
>>>>
>>> With GuC submission when an engine reset fails, we get an engine reset
>>> failure notification which triggers a full GT reset
>>> (intel_guc_engine_failure_process_msg in intel_guc_submission.c). That
>>> reset is blocking by setting these magic bits. Clearing the bits in this
>>> function doesn't seem to unblock that reset either, the driver tries to
>>> unload with a worker blocked, and results in the blow up. Something with
>>> this lockless algorithm could be wrong as clear of the bit should
>>> unlblock the reset but it is doesn't. We can look into that but in the
>>> meantime we need to fix this test to be able to fail gracefully and not
>>> crash CI.
>> Yeah, for that lockless algorithm if needed, we might want to use a ww_mutex
>> per engine or something,
> Do ww_mutex sleep? From what I can tell this lockless algorithm was
> added because even though resets are protected by mutex, there are some
> places in the IRQ context where we need to prevent resets from
> happening, hence the lockless protection + the mutex - what a mess. Long
> term this needs to rethought.
>
>> but point was that AFAICT at least one of the tests that set those flags
>> explicitly tested the functionality that no other engines than the intended
>> one was reset when the intel_engine_reset() function was used, and then if
>> GuC submission doesn't honor that, wouldn't a better approach be to make a
> No. In execlists this test explictly calls the engine reset function and
> explictly prevents other parts of the i915 from calling the engine reset
> function - this is why it sets that bit.
>
> In GuC submission the i915 can't do engine resets, the GuC does. In this
> case the engine reset fails which triggers a G2H message which tells the
> i915 to do a GT reset. If this bit is set the worker blocks on this bit
> in the GT reset and the driver blows up on unload as this worker isn't
> complete (believe it has a PM ref or something).
>
>> code comment around intel_engine_reset() to explain the differences and
> intel_engine_reset() return -ENODEV in GuC submission as the i915 isn't
> allowed to engine resets.
>
>> disable that particular test for GuC?. Also wouldn't we for example we see a
>> duplicated full GT reset with GuC if intel_engine_reset() fails as part of
>> the intel_gt_handle_error() function?
>>
> Yes, but the GT reset in this test is done after clearing the bits by
> the test. In the case of the GuC the GT reset is async operation done by
> a worker that receives the G2H message saying the engine reset failed.
>
>> I guess we could live with the hangcheck test being disabled for guc
>> submission until this is sorted out?
>>
> Wouldn't help. See above this an async operation from G2H message. We
> can't disable the async G2H handler as without other G2H messages the
> world breaks. The only other possible fix would be add an IGT only
> variable that if set skips the handling this G2H only.
And to be clear, the engine reset is not supposed to fail. Whether 
issued by GuC or i915, the GDRST register is supposed to self clear 
according to the bspec. If we are being sent the G2H notification for an 
engine reset failure then the assumption is that the hardware is broken. 
This is not a situation that is ever intended to occur in a production 
system. Therefore, it is not something we should spend huge amounts of 
effort on making a perfect selftest for.

The current theory is that the timeout in GuC is not quite long enough 
for DG1. Given that the bspec does not specify any kind of timeout, it 
is only a best guess anyway! Once that has been tuned correctly, we 
should never hit this case again. Not ever, not in a selftest, not in an 
end user use case, just not ever.

John.


>
> I can assure with this patch, if the test fails, it fails gracefully
> which is what we want.
>
> Matt
>
>> /Thomas
>>
>>
>>> Matt
>>>
>>>> /Thomas
>>>>
>>>>
>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 12 ++++++++----
>>>>>    1 file changed, 8 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>>>> b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>>>> index 7e2d99dd012d..90a03c60c80c 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>>>> @@ -734,7 +734,8 @@ static int __igt_reset_engine(struct intel_gt
>>>>> *gt, bool active)
>>>>>                   reset_engine_count = i915_reset_engine_count(global,
>>>>> engine);
>>>>>                   st_engine_heartbeat_disable(engine);
>>>>> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>>>> +               if (!using_guc)
>>>>> +                       set_bit(I915_RESET_ENGINE + id, &gt-
>>>>>> reset.flags);
>>>>>                   count = 0;
>>>>>                   do {
>>>>>                           struct i915_request *rq = NULL;
>>>>> @@ -824,7 +825,8 @@ static int __igt_reset_engine(struct intel_gt
>>>>> *gt, bool active)
>>>>>                           if (err)
>>>>>                                   break;
>>>>>                   } while (time_before(jiffies, end_time));
>>>>> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>>>> +               if (!using_guc)
>>>>> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
>>>>>> reset.flags);
>>>>>                   st_engine_heartbeat_enable(engine);
>>>>>                   pr_info("%s: Completed %lu %s resets\n",
>>>>>                           engine->name, count, active ? "active" :
>>>>> "idle");
>>>>> @@ -1042,7 +1044,8 @@ static int __igt_reset_engines(struct intel_gt
>>>>> *gt,
>>>>>                   yield(); /* start all threads before we begin */
>>>>>                   st_engine_heartbeat_disable_no_pm(engine);
>>>>> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>>>> +               if (!using_guc)
>>>>> +                       set_bit(I915_RESET_ENGINE + id, &gt-
>>>>>> reset.flags);
>>>>>                   do {
>>>>>                           struct i915_request *rq = NULL;
>>>>>                           struct intel_selftest_saved_policy saved;
>>>>> @@ -1165,7 +1168,8 @@ static int __igt_reset_engines(struct intel_gt
>>>>> *gt,
>>>>>                           if (err)
>>>>>                                   break;
>>>>>                   } while (time_before(jiffies, end_time));
>>>>> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>>>> +               if (!using_guc)
>>>>> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
>>>>>> reset.flags);
>>>>>                   st_engine_heartbeat_enable_no_pm(engine);
>>>>>                   pr_info("i915_reset_engine(%s:%s): %lu resets\n",
Thomas Hellstrom Oct. 23, 2021, 5:46 p.m. UTC | #6
On 10/22/21 20:09, John Harrison wrote:
> And to be clear, the engine reset is not supposed to fail. Whether 
> issued by GuC or i915, the GDRST register is supposed to self clear 
> according to the bspec. If we are being sent the G2H notification for 
> an engine reset failure then the assumption is that the hardware is 
> broken. This is not a situation that is ever intended to occur in a 
> production system. Therefore, it is not something we should spend huge 
> amounts of effort on making a perfect selftest for.

I don't agree. Selftests are there to verify that assumptions made and 
contracts in the code hold and that hardware behaves as intended / 
assumed. No selftest should ideally trigger in a production driver / 
system. That doesn't mean we can remove all selftests or ignore updating 
them for altered assumptions / contracts. I think it's important here to 
acknowledge the fact that this and the perf selftest have found two 
problems that need consideration for fixing for a production system.

>
> The current theory is that the timeout in GuC is not quite long enough 
> for DG1. Given that the bspec does not specify any kind of timeout, it 
> is only a best guess anyway! Once that has been tuned correctly, we 
> should never hit this case again. Not ever, Not in a selftest, not in 
> an end user use case, just not ever.

..until we introduce new hardware for which the tuning doesn't hold 
anymore or somebody in two years wants to lower the timeout wondering 
why it was set so long?

/Thomas
Matthew Brost Oct. 23, 2021, 6:18 p.m. UTC | #7
On Sat, Oct 23, 2021 at 07:46:48PM +0200, Thomas Hellström wrote:
> 
> On 10/22/21 20:09, John Harrison wrote:
> > And to be clear, the engine reset is not supposed to fail. Whether
> > issued by GuC or i915, the GDRST register is supposed to self clear
> > according to the bspec. If we are being sent the G2H notification for an
> > engine reset failure then the assumption is that the hardware is broken.
> > This is not a situation that is ever intended to occur in a production
> > system. Therefore, it is not something we should spend huge amounts of
> > effort on making a perfect selftest for.
> 
> I don't agree. Selftests are there to verify that assumptions made and
> contracts in the code hold and that hardware behaves as intended / assumed.
> No selftest should ideally trigger in a production driver / system. That
> doesn't mean we can remove all selftests or ignore updating them for altered
> assumptions / contracts. I think it's important here to acknowledge the fact
> that this and the perf selftest have found two problems that need
> consideration for fixing for a production system.
> 

I'm confused - we are going down the rabbit hole here.

Back to this patch. This test was written for very specific execlists
behavior. It was updated to also support the GuC. In that update we
missed fixing the failure path, well because it always passed. Now it
has failed, we see that it doesn't fail gracefully, and takes down the
machine. This patch fixes that. It also opened my eyes to the horror
show reset locking that needs to be fixed long term.

> > 
> > The current theory is that the timeout in GuC is not quite long enough
> > for DG1. Given that the bspec does not specify any kind of timeout, it
> > is only a best guess anyway! Once that has been tuned correctly, we
> > should never hit this case again. Not ever, Not in a selftest, not in an
> > end user use case, just not ever.
> 
> ..until we introduce new hardware for which the tuning doesn't hold anymore
> or somebody in a two years wants to lower the timeout wondering why it was
> set so long?

If an engine reset fails in the GuC, the GuC signals the i915 via a G2H
that the engine reset has failed and i915 initiates a full GT reset.
After this patch (which removes hacky behavior to block foreign,
relative to the test, resets) we can see the i915 behaving correctly and
the GPU recovering. This path in the code is working as designed. Do you
have test for that behavior, no. Can we? No at the moment as we would
need a way for the GuC to intentionally fail a engine reset. Right now
all we have is either flaky HW or GuC isn't waiting long enough. 

As far as why the engine reset fails - I am currently working with the
GuC team to get a firmware with a configurable timeout period so we can
root cause the engine failure. Figures crossed we are just not waiting
long enough rather than a HW issue.

Regardless of everything above, this patch is correct to unblock CI in
the short term and is correct in the long term too as this test
shouldn't bring down CI when it fails.

Matt

> 
> /Thomas
> 
>
Thomas Hellstrom Oct. 23, 2021, 6:36 p.m. UTC | #8
On 10/23/21 20:18, Matthew Brost wrote:
> On Sat, Oct 23, 2021 at 07:46:48PM +0200, Thomas Hellström wrote:
>> On 10/22/21 20:09, John Harrison wrote:
>>> And to be clear, the engine reset is not supposed to fail. Whether
>>> issued by GuC or i915, the GDRST register is supposed to self clear
>>> according to the bspec. If we are being sent the G2H notification for an
>>> engine reset failure then the assumption is that the hardware is broken.
>>> This is not a situation that is ever intended to occur in a production
>>> system. Therefore, it is not something we should spend huge amounts of
>>> effort on making a perfect selftest for.
>> I don't agree. Selftests are there to verify that assumptions made and
>> contracts in the code hold and that hardware behaves as intended / assumed.
>> No selftest should ideally trigger in a production driver / system. That
>> doesn't mean we can remove all selftests or ignore updating them for altered
>> assumptions / contracts. I think it's important here to acknowledge the fact
>> that this and the perf selftest have found two problems that need
>> consideration for fixing for a production system.
>>
> I'm confused - we are going down the rabbit hole here.
>
> Back to this patch. This test was written for very specific execlists
> behavior. It was updated to also support the GuC. In that update we
> missed fixing the failure path, well because it always passed. Now it
> has failed, we see that it doesn't fail gracefully, and takes down the
> machine. This patch fixes that. It also openned my eyes to the horror
> show reset locking that needs to be fixed long term.

Well the email above wasn't really about the correctness of this 
particular patch (I should probably have altered the subject to reflect 
that) but rather about the assumption that failures that should never 
occur in a production system are not worth spending time on selftests for.

For the patch itself, I'll take a deeper look at the patch and get back.

/Thomas
John Harrison Oct. 25, 2021, 5:32 p.m. UTC | #9
On 10/23/2021 11:36, Thomas Hellström wrote:
> On 10/23/21 20:18, Matthew Brost wrote:
>> On Sat, Oct 23, 2021 at 07:46:48PM +0200, Thomas Hellström wrote:
>>> On 10/22/21 20:09, John Harrison wrote:
>>>> And to be clear, the engine reset is not supposed to fail. Whether
>>>> issued by GuC or i915, the GDRST register is supposed to self clear
>>>> according to the bspec. If we are being sent the G2H notification 
>>>> for an
>>>> engine reset failure then the assumption is that the hardware is 
>>>> broken.
>>>> This is not a situation that is ever intended to occur in a production
>>>> system. Therefore, it is not something we should spend huge amounts of
>>>> effort on making a perfect selftest for.
>>> I don't agree. Selftests are there to verify that assumptions made and
>>> contracts in the code hold and that hardware behaves as intended / 
>>> assumed.
>>> No selftest should ideally trigger in a production driver / system. 
>>> That
>>> doesn't mean we can remove all selftests or ignore updating them for 
>>> altered
>>> assumptions / contracts. I think it's important here to acknowledge 
>>> the fact
>>> that this and the perf selftest have found two problems that need
>>> consideration for fixing for a production system.
>>>
>> I'm confused - we are going down the rabbit hole here.
>>
>> Back to this patch. This test was written for very specific execlists
>> behavior. It was updated to also support the GuC. In that update we
>> missed fixing the failure path, well because it always passed. Now it
>> has failed, we see that it doesn't fail gracefully, and takes down the
>> machine. This patch fixes that. It also openned my eyes to the horror
>> show reset locking that needs to be fixed long term.
>
> Well the email above wasn't really about the correctness of this 
> particular patch (I should probably have altered the subject to 
> reflect that) but rather about the assumption that failures that 
> should never occur in a production system are not worth spending time 
> on selftests for.
My point is that we have to make assumptions that the hardware is 
basically functional. Otherwise, where do you stop? Do you write a 
selftest for every conceivable operation of the hardware and prove that 
it still works every single day? No. That is pointless and we don't have 
the resources to test everything that the hardware can possibly do. We 
have to cope as gracefully as possible in the case where the hardware 
does not behave as intended, such as not killing the entire OS when a 
selftest fails. But I don't think we should be spending time on writing 
a perfect test for something that is supposed to be impossible at the 
hardware level. The purpose of the selftests is to test the driver 
behaviour, not the hardware.

John.

>
> For the patch itself, I'll take a deeper look at the patch and get back.
>
> /Thomas
>
>
Thomas Hellstrom Oct. 26, 2021, 8:22 a.m. UTC | #10
Hi,

On 10/21/21 22:37, Matthew Brost wrote:
> On Thu, Oct 21, 2021 at 08:15:49AM +0200, Thomas Hellström wrote:
>> Hi, Matthew,
>>
>> On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
>>> The hangcheck selftest blocks per engine resets by setting magic bits
>>> in
>>> the reset flags. This is incorrect for GuC submission because if the
>>> GuC
>>> fails to reset an engine we would like to do a full GT reset. Do no
>>> set
>>> these magic bits when using GuC submission.
>>>
>>> Side note this lockless algorithm with magic bits to block resets
>>> really
>>> should be ripped out.
>>>
>> Lockless algorithm aside, from a quick look at the code in
>> intel_reset.c it appears to me like the interface that falls back to a
>> full GT reset is intel_gt_handle_error() whereas intel_engine_reset()
>> is explicitly intended to not do that, so is there a discrepancy
>> between GuC and non-GuC here?
>>
> With GuC submission when an engine reset fails, we get an engine reset
> failure notification which triggers a full GT reset
> (intel_guc_engine_failure_process_msg in intel_guc_submission.c). That
> reset is blocking by setting these magic bits. Clearing the bits in this
> function doesn't seem to unblock that reset either, the driver tries to
> unload with a worker blocked, and results in the blow up. Something with
> this lockless algorithm could be wrong as clear of the bit should
unblock the reset, but it doesn't. We can look into that but in the
> meantime we need to fix this test to be able to fail gracefully and not
> crash CI.
>
> Matt

Hmm, OK I think the situation is a bit unfortunate with the selftest 
hangcheck as the code is sprinkled with "using_guc" to disable anything 
that manually does per-engine resets or verifies the per-engine reset 
count, leaving it very difficult to understand what the test actually 
does except perhaps checking that GuC actually did a reset.

A better approach would probably be to disable all tests that don't do 
anything except iterating through the engines with GuC, and for the other 
tests, extract what's left to test into GuC specific tests.

The bit-locks are obviously there to verify that we don't do concurrent 
per-engine resets or global resets while a per-engine reset is 
happening. Even in the GuC case it appears at least the latter is true 
for this particular self-test, but at the same time the selftest doesn't 
assume anything is trying to reset concurrently and therefore doesn't 
use clear_and_wake_up_bit() when releasing the bit-locks.

But as much as I want the selftests to start running again, TBH I don't 
think I can contribute to even more code being conditioned on GuC with 
an R-B here. Could we disable the per-engine reset tests when GuC is 
enabled for now or try a clear_and_wake_up_bit() instead.

/Thomas


>> /Thomas
>>
>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 12 ++++++++----
>>>   1 file changed, 8 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>> b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>> index 7e2d99dd012d..90a03c60c80c 100644
>>> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>> @@ -734,7 +734,8 @@ static int __igt_reset_engine(struct intel_gt
>>> *gt, bool active)
>>>                  reset_engine_count = i915_reset_engine_count(global,
>>> engine);
>>>   
>>>                  st_engine_heartbeat_disable(engine);
>>> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>> +               if (!using_guc)
>>> +                       set_bit(I915_RESET_ENGINE + id, &gt-
>>>> reset.flags);
>>>                  count = 0;
>>>                  do {
>>>                          struct i915_request *rq = NULL;
>>> @@ -824,7 +825,8 @@ static int __igt_reset_engine(struct intel_gt
>>> *gt, bool active)
>>>                          if (err)
>>>                                  break;
>>>                  } while (time_before(jiffies, end_time));
>>> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>> +               if (!using_guc)
>>> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
>>>> reset.flags);
>>>                  st_engine_heartbeat_enable(engine);
>>>                  pr_info("%s: Completed %lu %s resets\n",
>>>                          engine->name, count, active ? "active" :
>>> "idle");
>>> @@ -1042,7 +1044,8 @@ static int __igt_reset_engines(struct intel_gt
>>> *gt,
>>>                  yield(); /* start all threads before we begin */
>>>   
>>>                  st_engine_heartbeat_disable_no_pm(engine);
>>> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>> +               if (!using_guc)
>>> +                       set_bit(I915_RESET_ENGINE + id, &gt-
>>>> reset.flags);
>>>                  do {
>>>                          struct i915_request *rq = NULL;
>>>                          struct intel_selftest_saved_policy saved;
>>> @@ -1165,7 +1168,8 @@ static int __igt_reset_engines(struct intel_gt
>>> *gt,
>>>                          if (err)
>>>                                  break;
>>>                  } while (time_before(jiffies, end_time));
>>> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>> +               if (!using_guc)
>>> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
>>>> reset.flags);
>>>                  st_engine_heartbeat_enable_no_pm(engine);
>>>   
>>>                  pr_info("i915_reset_engine(%s:%s): %lu resets\n",
>>
John Harrison Oct. 26, 2021, 7:48 p.m. UTC | #11
On 10/11/2021 16:47, Matthew Brost wrote:
> The hangcheck selftest blocks per engine resets by setting magic bits in
> the reset flags. This is incorrect for GuC submission because if the GuC
> fails to reset an engine we would like to do a full GT reset. Do no set
> these magic bits when using GuC submission.
>
> Side note this lockless algorithm with magic bits to block resets really
> should be ripped out.
As a first step, I am seeing a pointless BUILD_BUG but no BUILD_BUG at 
all for what really needs to be verified. Specifically, in 
intel_gt_handle_error, inside the engine reset loop, there is:
                         BUILD_BUG_ON(I915_RESET_MODESET >= 
I915_RESET_ENGINE);

Given that the above two values are explicit #defines of '1' and '2'. 
I'm not seeing any value to this assert. On the other hand, what I am 
not seeing anywhere is an assert that 'I915_RESET_ENGINE + max_engine_id 
< I915_WEDGED_ON_INIT'. That being the thing that would actually go 
horribly wrong if the engine count increased too far. Seems like there 
should be one of those in intel_engines_init_mmio, using 
ARRAY_SIZE(intel_engines) as the max id.


It looks like 'busy-reset' and 'reset-idle' parts of 'igt_ctx_sseu' in 
gem/selftests/i915_gem_context.c also mess around with these flags and 
then try to issue a manual engine reset. Presumably those tests are also 
going to have issues with GuC submission.

The workarounds, mocs and reset selftests also do strange things. Those 
ones did get updated to support GuC submission by not attempting manual 
engine resets but using the GuC based hang detection instead. However, 
it seems like they would also suffer the same deadlock scenario if the 
GuC based reset were to fail.

I'm wondering if the better fix is to remove the use of the 
I915_RESET_ENGINE flags completely when using GuC submission. So far as 
I can tell, they are only used (outside of selftest hackery) in 
intel_gt_handle_error to guard against multiple concurrent resets within 
i915. Guarding against multiple concurrent resets in GuC is the GuC's 
job. That leaves GuC based engine reset concurrent with i915 based full 
GT reset. But that is fine because the full GT reset toasts all engines 
AND the GuC. So it doesn't matter what GuC might or might not have been 
doing at the time. The only other issue is multiple concurrent full GT 
resets, but that is protected against by I915_RESET_BACKOFF.

So my thought is to add an 'if(!guc_submission)' wrapper around the set 
and clear of the reset flags immediately before/after the call to 
intel_gt_reset_global().

Fixing it there means the selftests can do what they like with the flags 
without causing problems for GuC submission. It also means being one 
step closer to removing the dodgy lockless locking completely, at least 
when using GuC submission.

John.


>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 12 ++++++++----
>   1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> index 7e2d99dd012d..90a03c60c80c 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> @@ -734,7 +734,8 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
>   		reset_engine_count = i915_reset_engine_count(global, engine);
>   
>   		st_engine_heartbeat_disable(engine);
> -		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> +		if (!using_guc)
> +			set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>   		count = 0;
>   		do {
>   			struct i915_request *rq = NULL;
> @@ -824,7 +825,8 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
>   			if (err)
>   				break;
>   		} while (time_before(jiffies, end_time));
> -		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> +		if (!using_guc)
> +			clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>   		st_engine_heartbeat_enable(engine);
>   		pr_info("%s: Completed %lu %s resets\n",
>   			engine->name, count, active ? "active" : "idle");
> @@ -1042,7 +1044,8 @@ static int __igt_reset_engines(struct intel_gt *gt,
>   		yield(); /* start all threads before we begin */
>   
>   		st_engine_heartbeat_disable_no_pm(engine);
> -		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> +		if (!using_guc)
> +			set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>   		do {
>   			struct i915_request *rq = NULL;
>   			struct intel_selftest_saved_policy saved;
> @@ -1165,7 +1168,8 @@ static int __igt_reset_engines(struct intel_gt *gt,
>   			if (err)
>   				break;
>   		} while (time_before(jiffies, end_time));
> -		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> +		if (!using_guc)
> +			clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>   		st_engine_heartbeat_enable_no_pm(engine);
>   
>   		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
John Harrison Oct. 26, 2021, 7:55 p.m. UTC | #12
On 10/21/2021 23:23, Thomas Hellström wrote:
> On 10/21/21 22:37, Matthew Brost wrote:
>> On Thu, Oct 21, 2021 at 08:15:49AM +0200, Thomas Hellström wrote:
>>> Hi, Matthew,
>>>
>>> On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
>>>> The hangcheck selftest blocks per engine resets by setting magic bits
>>>> in
>>>> the reset flags. This is incorrect for GuC submission because if the
>>>> GuC
>>>> fails to reset an engine we would like to do a full GT reset. Do no
>>>> set
>>>> these magic bits when using GuC submission.
>>>>
>>>> Side note this lockless algorithm with magic bits to block resets
>>>> really
>>>> should be ripped out.
>>>>
>>> Lockless algorithm aside, from a quick look at the code in
>>> intel_reset.c it appears to me like the interface that falls back to a
>>> full GT reset is intel_gt_handle_error() whereas intel_engine_reset()
>>> is explicitly intended to not do that, so is there a discrepancy
>>> between GuC and non-GuC here?
>>>
>> With GuC submission when an engine reset fails, we get an engine reset
>> failure notification which triggers a full GT reset
>> (intel_guc_engine_failure_process_msg in intel_guc_submission.c). That
>> reset is blocking by setting these magic bits. Clearing the bits in this
>> function doesn't seem to unblock that reset either, the driver tries to
>> unload with a worker blocked, and results in the blow up. Something with
>> this lockless algorithm could be wrong as clear of the bit should
>> unlblock the reset but it is doesn't. We can look into that but in the
>> meantime we need to fix this test to be able to fail gracefully and not
>> crash CI.
>
> Yeah, for that lockless algorithm if needed, we might want to use a 
> ww_mutex per engine or something,
> but point was that AFAICT at least one of the tests that set those 
> flags explicitly tested the functionality that no other engines than 
> the intended one was reset when the intel_engine_reset() function was 
> used, and then if GuC submission doesn't honor that, wouldn't a better 
> approach be to make a code comment around intel_engine_reset() to 
> explain the differences and disable that particular test for GuC?. 
> Also wouldn't we for example we see a duplicated full GT reset with 
> GuC if intel_engine_reset() fails as part of the 
> intel_gt_handle_error() function?
Re-reading this thread, I think there is a misunderstanding.

The selftests themselves have already been updated to support GuC based 
engine resets. That is done by submitting a hanging context and letting 
the GuC detect the hang and issue a reset. There is no mechanism 
available for i915 to directly issue or request an engine based reset 
(because i915 does not know what is running on any given engine at any 
given time, being disconnected from the scheduler).

So the tests are already correctly testing per engine resets and do not 
go anywhere near either intel_engine_reset() or intel_gt_handle_error() 
when GuC submission is used. The problem is what happens if the engine 
reset fails (which supposedly can only happen with broken hardware). In 
that scenario, there is an asynchronous message from GuC to i915 to 
notify us of the failure. The KMD receives that notification and then 
(eventually) calls intel_gt_handle_error() to issue a full GT reset. 
However, that is blocked because the selftest is not expecting it and 
has vetoed the possibility. A fix is required to allow that full GT 
reset to proceed and recover the hardware. At that point, the selftest 
should indeed fail because the reset was larger than expected. That 
should be handled by the fact the selftest issued work to other engines 
beside the target and expects those requests to complete successfully. 
In the case of the escalated GT reset, all those requests will be killed 
off as well. Thus the test will correctly fail.

John.


>
> I guess we could live with the hangcheck test being disabled for guc 
> submission until this is sorted out?
>
> /Thomas
>
>
>>
>> Matt
>>
>>> /Thomas
>>>
>>>
>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>> ---
>>>>   drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 12 ++++++++----
>>>>   1 file changed, 8 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>>> b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>>> index 7e2d99dd012d..90a03c60c80c 100644
>>>> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>>> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
>>>> @@ -734,7 +734,8 @@ static int __igt_reset_engine(struct intel_gt
>>>> *gt, bool active)
>>>>                  reset_engine_count = i915_reset_engine_count(global,
>>>> engine);
>>>>                    st_engine_heartbeat_disable(engine);
>>>> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>>> +               if (!using_guc)
>>>> +                       set_bit(I915_RESET_ENGINE + id, &gt-
>>>>> reset.flags);
>>>>                  count = 0;
>>>>                  do {
>>>>                          struct i915_request *rq = NULL;
>>>> @@ -824,7 +825,8 @@ static int __igt_reset_engine(struct intel_gt
>>>> *gt, bool active)
>>>>                          if (err)
>>>>                                  break;
>>>>                  } while (time_before(jiffies, end_time));
>>>> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>>> +               if (!using_guc)
>>>> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
>>>>> reset.flags);
>>>>                  st_engine_heartbeat_enable(engine);
>>>>                  pr_info("%s: Completed %lu %s resets\n",
>>>>                          engine->name, count, active ? "active" :
>>>> "idle");
>>>> @@ -1042,7 +1044,8 @@ static int __igt_reset_engines(struct intel_gt
>>>> *gt,
>>>>                  yield(); /* start all threads before we begin */
>>>>                  st_engine_heartbeat_disable_no_pm(engine);
>>>> -               set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>>> +               if (!using_guc)
>>>> +                       set_bit(I915_RESET_ENGINE + id, &gt-
>>>>> reset.flags);
>>>>                  do {
>>>>                          struct i915_request *rq = NULL;
>>>>                          struct intel_selftest_saved_policy saved;
>>>> @@ -1165,7 +1168,8 @@ static int __igt_reset_engines(struct intel_gt
>>>> *gt,
>>>>                          if (err)
>>>>                                  break;
>>>>                  } while (time_before(jiffies, end_time));
>>>> -               clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
>>>> +               if (!using_guc)
>>>> +                       clear_bit(I915_RESET_ENGINE + id, &gt-
>>>>> reset.flags);
>>>>                  st_engine_heartbeat_enable_no_pm(engine);
>>>>                    pr_info("i915_reset_engine(%s:%s): %lu resets\n",
>>>
Thomas Hellstrom Oct. 27, 2021, 6:36 a.m. UTC | #13
Hi, John,

On 10/26/21 21:55, John Harrison wrote:
> On 10/21/2021 23:23, Thomas Hellström wrote:
>> On 10/21/21 22:37, Matthew Brost wrote:
>>> On Thu, Oct 21, 2021 at 08:15:49AM +0200, Thomas Hellström wrote:
>>>> Hi, Matthew,
>>>>
>>>> On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
>>>>> The hangcheck selftest blocks per engine resets by setting magic bits
>>>>> in
>>>>> the reset flags. This is incorrect for GuC submission because if the
>>>>> GuC
>>>>> fails to reset an engine we would like to do a full GT reset. Do no
>>>>> set
>>>>> these magic bits when using GuC submission.
>>>>>
>>>>> Side note this lockless algorithm with magic bits to block resets
>>>>> really
>>>>> should be ripped out.
>>>>>
>>>> Lockless algorithm aside, from a quick look at the code in
>>>> intel_reset.c it appears to me like the interface that falls back to a
>>>> full GT reset is intel_gt_handle_error() whereas intel_engine_reset()
>>>> is explicitly intended to not do that, so is there a discrepancy
>>>> between GuC and non-GuC here?
>>>>
>>> With GuC submission when an engine reset fails, we get an engine reset
>>> failure notification which triggers a full GT reset
>>> (intel_guc_engine_failure_process_msg in intel_guc_submission.c). That
>>> reset is blocking by setting these magic bits. Clearing the bits in 
>>> this
>>> function doesn't seem to unblock that reset either, the driver tries to
>>> unload with a worker blocked, and results in the blow up. Something 
>>> with
>>> this lockless algorithm could be wrong as clear of the bit should
>>> unlblock the reset but it is doesn't. We can look into that but in the
>>> meantime we need to fix this test to be able to fail gracefully and not
>>> crash CI.
>>
>> Yeah, for that lockless algorithm if needed, we might want to use a 
>> ww_mutex per engine or something,
>> but point was that AFAICT at least one of the tests that set those 
>> flags explicitly tested the functionality that no other engines than 
>> the intended one was reset when the intel_engine_reset() function was 
>> used, and then if GuC submission doesn't honor that, wouldn't a 
>> better approach be to make a code comment around intel_engine_reset() 
>> to explain the differences and disable that particular test for GuC?. 
>> Also wouldn't we for example we see a duplicated full GT reset with 
>> GuC if intel_engine_reset() fails as part of the 
>> intel_gt_handle_error() function?
> Re-reading this thread, I think there is a misunderstanding.
>
> The selftests themselves have already been updated to support GuC 
> based engine resets. That is done by submitting a hanging context and 
> letting the GuC detect the hang and issue a reset. There is no 
> mechanism available for i915 to directly issue or request an engine 
> based reset (because i915 does not know what is running on any given 
> engine at any given time, being disconnected from the scheduler).
>
> So the tests are already correctly testing per engine resets and do 
> not go anywhere near either intel_engine_reset() or 
> intel_gt_handle_error() when GuC submission is used. The problem is 
> what happens if the engine reset fails (which supposedly can only 
> happen with broken hardware). In that scenario, there is an 
> asynchronous message from GuC to i915 to notify us of the failure. The 
> KMD receives that notification and then (eventually) calls 
> intel_gt_handle_error() to issue a full GT reset. However, that is 
> blocked because the selftest is not expecting it and has vetoed the 
> possibility.

This is where my understanding of the discussion differs. According to 
Matthew, the selftest actually proceeds to clear the bits, but the 
worker that calls into intel_gt_handle_error() never wakes up. (and 
that's probably due to clear_bit() being used instead of 
clear_and_wake_up_bit()).

And my problem with this particular patch is that it adds even more "if 
(!guc_submission)" which is already sprinkled all over the place in the 
selftests to the point that it becomes difficult to see what (if 
anything) the tests are really testing. For example 
igt_reset_nop_engine() from a cursory look looks like it's doing 
something but inside the engine loop it becomes clear that the test 
doesn't do *anything* except iterate over engines. Same for 
igt_reset_engines() in the !TEST_ACTIVE case and for 
igt_reset_idle_engine(). For some other tests the reset_count checks are 
gone, leaving only a test that we actually do a reset.

So if possible, as previously mentioned, I think a solution without 
adding more of this in the selftests is preferrable. To me the best 
option is probably be the one you suggest in your previous email: Don't 
wait on the I915_RESET_ENGINE bits with GuC in intel_gt_handle_error(), 
(or perhaps extract what's left in a separate function called from the 
GuC handler).

Thanks,

Thomas
John Harrison Oct. 27, 2021, 8:34 p.m. UTC | #14
On 10/26/2021 23:36, Thomas Hellström wrote:
> Hi, John,
>
> On 10/26/21 21:55, John Harrison wrote:
>> On 10/21/2021 23:23, Thomas Hellström wrote:
>>> On 10/21/21 22:37, Matthew Brost wrote:
>>>> On Thu, Oct 21, 2021 at 08:15:49AM +0200, Thomas Hellström wrote:
>>>>> Hi, Matthew,
>>>>>
>>>>> On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
>>>>>> The hangcheck selftest blocks per engine resets by setting magic 
>>>>>> bits
>>>>>> in
>>>>>> the reset flags. This is incorrect for GuC submission because if the
>>>>>> GuC
>>>>>> fails to reset an engine we would like to do a full GT reset. Do no
>>>>>> set
>>>>>> these magic bits when using GuC submission.
>>>>>>
>>>>>> Side note this lockless algorithm with magic bits to block resets
>>>>>> really
>>>>>> should be ripped out.
>>>>>>
>>>>> Lockless algorithm aside, from a quick look at the code in
>>>>> intel_reset.c it appears to me like the interface that falls back 
>>>>> to a
>>>>> full GT reset is intel_gt_handle_error() whereas intel_engine_reset()
>>>>> is explicitly intended to not do that, so is there a discrepancy
>>>>> between GuC and non-GuC here?
>>>>>
>>>> With GuC submission when an engine reset fails, we get an engine reset
>>>> failure notification which triggers a full GT reset
>>>> (intel_guc_engine_failure_process_msg in intel_guc_submission.c). That
>>>> reset is blocking by setting these magic bits. Clearing the bits in 
>>>> this
>>>> function doesn't seem to unblock that reset either, the driver 
>>>> tries to
>>>> unload with a worker blocked, and results in the blow up. Something 
>>>> with
>>>> this lockless algorithm could be wrong as clear of the bit should
>>>> unlblock the reset but it is doesn't. We can look into that but in the
>>>> meantime we need to fix this test to be able to fail gracefully and 
>>>> not
>>>> crash CI.
>>>
>>> Yeah, for that lockless algorithm if needed, we might want to use a 
>>> ww_mutex per engine or something,
>>> but point was that AFAICT at least one of the tests that set those 
>>> flags explicitly tested the functionality that no other engines than 
>>> the intended one was reset when the intel_engine_reset() function 
>>> was used, and then if GuC submission doesn't honor that, wouldn't a 
>>> better approach be to make a code comment around 
>>> intel_engine_reset() to explain the differences and disable that 
>>> particular test for GuC?. Also wouldn't we for example we see a 
>>> duplicated full GT reset with GuC if intel_engine_reset() fails as 
>>> part of the intel_gt_handle_error() function?
>> Re-reading this thread, I think there is a misunderstanding.
>>
>> The selftests themselves have already been updated to support GuC 
>> based engine resets. That is done by submitting a hanging context and 
>> letting the GuC detect the hang and issue a reset. There is no 
>> mechanism available for i915 to directly issue or request an engine 
>> based reset (because i915 does not know what is running on any given 
>> engine at any given time, being disconnected from the scheduler).
>>
>> So the tests are already correctly testing per engine resets and do 
>> not go anywhere near either intel_engine_reset() or 
>> intel_gt_handle_error() when GuC submission is used. The problem is 
>> what happens if the engine reset fails (which supposedly can only 
>> happen with broken hardware). In that scenario, there is an 
>> asynchronous message from GuC to i915 to notify us of the failure. 
>> The KMD receives that notification and then (eventually) calls 
>> intel_gt_handle_error() to issue a full GT reset. However, that is 
>> blocked because the selftest is not expecting it and has vetoed the 
>> possibility.
>
> This is where my understanding of the discussion differs. According to 
> Matthew, the selftest actually proceeds to clear the bits, but the 
> worker that calls into intel_gt_handle_error() never wakes up. (and 
> that's probably due to clear_bit() being used instead of 
> clear_and_wake_up_bit()).
Hmm, missed that point. Yeah, sounds like the missing wake_up suffix is 
what is causing the deadlock. I can't see any other reason why the reset 
handler would not proceed once the flags are cleared. And it looks like 
the selftest should timeout out waiting for the request and continue on 
to clear the bits just fine.


>
> And my problem with this particular patch is that it adds even more 
> "if (!guc_submission)" which is already sprinkled all over the place 
> in the selftests to the point that it becomes difficult to see what 
> (if anything) the tests are really testing.
I agree with this. Fixing the problem at source seems like a better 
solution than hacking lots of different bits in different tests.


> For example igt_reset_nop_engine() from a cursory look looks like it's 
> doing something but inside the engine loop it becomes clear that the 
> test doesn't do *anything* except iterate over engines. Same for 
> igt_reset_engines() in the !TEST_ACTIVE case and for 
> igt_reset_idle_engine(). For some other tests the reset_count checks 
> are gone, leaving only a test that we actually do a reset.
The nop_engine test is meant to be a no-op. It is testing what happens 
when you reset an idle engine. That is not something we can do with GuC 
based engine resets - there are no debug hooks into GuC specifically to 
trigger an engine reset when there is no hang. The same applies to the 
!TEST_ACTIVE case (which is igt_reset_idle_engine). Hence those are 
skipped for GuC.

The reset_count tests are still present. They are looking for global 
resets occurring when they should not. It is the reset_engine_count 
check that is disabled for GuC. Again, because GuC is handling the reset 
not i915. GuC tells us when a context is reset, it does not report any 
specifics about engines. So this is not a count we can maintain when 
using GuC submission.

>
> So if possible, as previously mentioned, I think a solution without 
> adding more of this in the selftests is preferrable. To me the best 
> option is probably be the one you suggest in your previous email: 
> Don't wait on the I915_RESET_ENGINE bits with GuC in 
> intel_gt_handle_error(), (or perhaps extract what's left in a separate 
> function called from the GuC handler).
I don't like the idea of splitting intel_gt_handle_error() apart. It is 
meant to be the top level reset handler that manages all resets - per 
engine and full GT (and potentially FLR in the future). Certainly, none 
of it should be pulled out into GuC specific code.

One less than ideal aspect of the current reset support is that 
'intel_has_reset_engine()' is ambiguous. With GuC submission, we have 
per engine reset. It's just not managed by i915. So code that wants to 
know if engines can be reset individually wants a true response. But code 
such as intel_gt_handle_error() needs to know *who* is doing that reset. 
Hence it needs the GuC specific check on top to differentiate. Ideally, 
there would be separate intel_has_reset_engine() and 
intel_does_i915_do_engine_reset() queries. Then the reset code itself 
would not need to query GuC submission. However, that seems like 
overkill to remove a couple of GuC tests. Maybe it could be done as 
future work, but it is certainly not in the scope of making this 
selftest not hang the system.

John.


>
> Thanks,
>
> Thomas
>
>
>
Thomas Hellstrom Oct. 27, 2021, 8:47 p.m. UTC | #15
On 10/27/21 22:34, John Harrison wrote:
> On 10/26/2021 23:36, Thomas Hellström wrote:
>> Hi, John,
>>
>> On 10/26/21 21:55, John Harrison wrote:
>>> On 10/21/2021 23:23, Thomas Hellström wrote:
>>>> On 10/21/21 22:37, Matthew Brost wrote:
>>>>> On Thu, Oct 21, 2021 at 08:15:49AM +0200, Thomas Hellström wrote:
>>>>>> Hi, Matthew,
>>>>>>
>>>>>> On Mon, 2021-10-11 at 16:47 -0700, Matthew Brost wrote:
>>>>>>> The hangcheck selftest blocks per engine resets by setting magic 
>>>>>>> bits
>>>>>>> in
>>>>>>> the reset flags. This is incorrect for GuC submission because if 
>>>>>>> the
>>>>>>> GuC
>>>>>>> fails to reset an engine we would like to do a full GT reset. Do no
>>>>>>> set
>>>>>>> these magic bits when using GuC submission.
>>>>>>>
>>>>>>> Side note this lockless algorithm with magic bits to block resets
>>>>>>> really
>>>>>>> should be ripped out.
>>>>>>>
>>>>>> Lockless algorithm aside, from a quick look at the code in
>>>>>> intel_reset.c it appears to me like the interface that falls back 
>>>>>> to a
>>>>>> full GT reset is intel_gt_handle_error() whereas 
>>>>>> intel_engine_reset()
>>>>>> is explicitly intended to not do that, so is there a discrepancy
>>>>>> between GuC and non-GuC here?
>>>>>>
>>>>> With GuC submission when an engine reset fails, we get an engine 
>>>>> reset
>>>>> failure notification which triggers a full GT reset
>>>>> (intel_guc_engine_failure_process_msg in intel_guc_submission.c). 
>>>>> That
>>>>> reset is blocking by setting these magic bits. Clearing the bits 
>>>>> in this
>>>>> function doesn't seem to unblock that reset either, the driver 
>>>>> tries to
>>>>> unload with a worker blocked, and results in the blow up. 
>>>>> Something with
>>>>> this lockless algorithm could be wrong as clear of the bit should
>>>>> unlblock the reset but it is doesn't. We can look into that but in 
>>>>> the
>>>>> meantime we need to fix this test to be able to fail gracefully 
>>>>> and not
>>>>> crash CI.
>>>>
>>>> Yeah, for that lockless algorithm if needed, we might want to use a 
>>>> ww_mutex per engine or something,
>>>> but point was that AFAICT at least one of the tests that set those 
>>>> flags explicitly tested the functionality that no other engines 
>>>> than the intended one was reset when the intel_engine_reset() 
>>>> function was used, and then if GuC submission doesn't honor that, 
>>>> wouldn't a better approach be to make a code comment around 
>>>> intel_engine_reset() to explain the differences and disable that 
>>>> particular test for GuC?. Also wouldn't we for example we see a 
>>>> duplicated full GT reset with GuC if intel_engine_reset() fails as 
>>>> part of the intel_gt_handle_error() function?
>>> Re-reading this thread, I think there is a misunderstanding.
>>>
>>> The selftests themselves have already been updated to support GuC 
>>> based engine resets. That is done by submitting a hanging context 
>>> and letting the GuC detect the hang and issue a reset. There is no 
>>> mechanism available for i915 to directly issue or request an engine 
>>> based reset (because i915 does not know what is running on any given 
>>> engine at any given time, being disconnected from the scheduler).
>>>
>>> So the tests are already correctly testing per engine resets and do 
>>> not go anywhere near either intel_engine_reset() or 
>>> intel_gt_handle_error() when GuC submission is used. The problem is 
>>> what happens if the engine reset fails (which supposedly can only 
>>> happen with broken hardware). In that scenario, there is an 
>>> asynchronous message from GuC to i915 to notify us of the failure. 
>>> The KMD receives that notification and then (eventually) calls 
>>> intel_gt_handle_error() to issue a full GT reset. However, that is 
>>> blocked because the selftest is not expecting it and has vetoed the 
>>> possibility.
>>
>> This is where my understanding of the discussion differs. According 
>> to Matthew, the selftest actually proceeds to clear the bits, but the 
>> worker that calls into intel_gt_handle_error() never wakes up. (and 
>> that's probably due to clear_bit() being used instead of 
>> clear_and_wake_up_bit()).
> Hmm, missed that point. Yeah, sounds like the missing wake_up suffix 
> is what is causing the deadlock. I can't see any other reason why the 
> reset handler would not proceed once the flags are cleared. And it 
> looks like the selftest should timeout out waiting for the request and 
> continue on to clear the bits just fine.
>
>
>>
>> And my problem with this particular patch is that it adds even more 
>> "if (!guc_submission)" which is already sprinkled all over the place 
>> in the selftests to the point that it becomes difficult to see what 
>> (if anything) the tests are really testing.
> I agree with this. Fixing the problem at source seems like a better 
> solution than hacking lots of different bits in different tests.
>
OK, so if we can fix this in intel_gt_handle_error() that'd be great.

Thanks,

Thomas
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
index 7e2d99dd012d..90a03c60c80c 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -734,7 +734,8 @@  static int __igt_reset_engine(struct intel_gt *gt, bool active)
 		reset_engine_count = i915_reset_engine_count(global, engine);
 
 		st_engine_heartbeat_disable(engine);
-		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+		if (!using_guc)
+			set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 		count = 0;
 		do {
 			struct i915_request *rq = NULL;
@@ -824,7 +825,8 @@  static int __igt_reset_engine(struct intel_gt *gt, bool active)
 			if (err)
 				break;
 		} while (time_before(jiffies, end_time));
-		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+		if (!using_guc)
+			clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 		st_engine_heartbeat_enable(engine);
 		pr_info("%s: Completed %lu %s resets\n",
 			engine->name, count, active ? "active" : "idle");
@@ -1042,7 +1044,8 @@  static int __igt_reset_engines(struct intel_gt *gt,
 		yield(); /* start all threads before we begin */
 
 		st_engine_heartbeat_disable_no_pm(engine);
-		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+		if (!using_guc)
+			set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 		do {
 			struct i915_request *rq = NULL;
 			struct intel_selftest_saved_policy saved;
@@ -1165,7 +1168,8 @@  static int __igt_reset_engines(struct intel_gt *gt,
 			if (err)
 				break;
 		} while (time_before(jiffies, end_time));
-		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+		if (!using_guc)
+			clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 		st_engine_heartbeat_enable_no_pm(engine);
 
 		pr_info("i915_reset_engine(%s:%s): %lu resets\n",