
[v4,3/3] drm/i915/gt: Timeout when waiting for idle in suspending

Message ID 20230926190518.105393-4-alan.previn.teres.alexis@intel.com (mailing list archive)
State New, archived
Series Resolve suspend-resume racing with GuC destroy-context-worker

Commit Message

Alan Previn Sept. 26, 2023, 7:05 p.m. UTC
When suspending, add a timeout when calling
intel_gt_pm_wait_for_idle. Otherwise, if we have a lost
G2H event that holds a wakeref (which would be
indicative of a bug elsewhere in the driver), the
driver will at least complete the suspend-resume
cycle (albeit not hitting all the targets for
low-power hw counters) instead of hanging in the kernel.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Tested-by: Mousumi Jana <mousumi.jana@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
 drivers/gpu/drm/i915/gt/intel_gt_pm.c     |  6 +++++-
 drivers/gpu/drm/i915/gt/intel_gt_pm.h     |  7 ++++++-
 drivers/gpu/drm/i915/intel_wakeref.c      | 14 ++++++++++----
 drivers/gpu/drm/i915/intel_wakeref.h      |  6 ++++--
 5 files changed, 26 insertions(+), 9 deletions(-)

Comments

Tvrtko Ursulin Sept. 27, 2023, 9:02 a.m. UTC | #1
On 26/09/2023 20:05, Alan Previn wrote:
> When suspending, add a timeout when calling
> intel_gt_pm_wait_for_idle else if we have a lost
> G2H event that holds a wakeref (which would be
> indicative of a bug elsewhere in the driver),
> driver will at least complete the suspend-resume
> cycle, (albeit not hitting all the targets for
> low power hw counters), instead of hanging in the kernel.
> 
> Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
> Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Tested-by: Mousumi Jana <mousumi.jana@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
>   drivers/gpu/drm/i915/gt/intel_gt_pm.c     |  6 +++++-
>   drivers/gpu/drm/i915/gt/intel_gt_pm.h     |  7 ++++++-
>   drivers/gpu/drm/i915/intel_wakeref.c      | 14 ++++++++++----
>   drivers/gpu/drm/i915/intel_wakeref.h      |  6 ++++--
>   5 files changed, 26 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 84a75c95f3f7..9c6151b78e1d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -687,7 +687,7 @@ void intel_engines_release(struct intel_gt *gt)
>   		if (!engine->release)
>   			continue;
>   
> -		intel_wakeref_wait_for_idle(&engine->wakeref);
> +		intel_wakeref_wait_for_idle(&engine->wakeref, 0);
>   		GEM_BUG_ON(intel_engine_pm_is_awake(engine));
>   
>   		engine->release(engine);
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> index 59b5658a17fb..820217c06dc7 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> @@ -289,6 +289,7 @@ int intel_gt_resume(struct intel_gt *gt)
>   
>   static void wait_for_suspend(struct intel_gt *gt)
>   {
> +	int timeout_ms = CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT ? : 10000;

CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT is in ns so assigning it to _ms is 
a bit too arbitrary.

Why not the existing I915_GT_SUSPEND_IDLE_TIMEOUT for instance?

>   	/*
>   	 * On rare occasions, we've observed the fence completion trigger
>   	 * free_engines asynchronously via rcu_call. Ensure those are done.
> @@ -308,7 +309,10 @@ static void wait_for_suspend(struct intel_gt *gt)
>   		intel_gt_retire_requests(gt);
>   	}
>   
> -	intel_gt_pm_wait_for_idle(gt);
> +	/* we are suspending, so we shouldn't be waiting forever */
> +	if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms) == -ETIMEDOUT)
> +		gt_warn(gt, "bailing from %s after %d milisec timeout\n",
> +			__func__, timeout_ms);

Does the timeout in intel_gt_pm_wait_timeout_for_idle always come in 
a pair with the earlier timeout in intel_gt_wait_for_idle?

Also, is the timeout here hit from the intel_gt_suspend_prepare, 
intel_gt_suspend_late, or can be both?

Main concern is that we need to be sure there are no possible 
ill-effects, like letting the GPU/GuC scribble on some memory we 
unmapped (or will unmap), having let the suspend continue after timing 
out, and not perhaps doing the forced wedge like wait_for_suspend() does 
on the existing timeout path.

Would it be possible to handle the lost G2H events directly in the 
respective component instead of here? Like apply the timeout during the 
step which explicitly idles the CT for suspend (presumably that 
exists?), and clean up from there once an event is declared lost.

Regards,

Tvrtko

>   }
>   
>   void intel_gt_suspend_prepare(struct intel_gt *gt)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> index 6c9a46452364..5358acc2b5b1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> @@ -68,7 +68,12 @@ static inline void intel_gt_pm_might_put(struct intel_gt *gt)
>   
>   static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
>   {
> -	return intel_wakeref_wait_for_idle(&gt->wakeref);
> +	return intel_wakeref_wait_for_idle(&gt->wakeref, 0);
> +}
> +
> +static inline int intel_gt_pm_wait_timeout_for_idle(struct intel_gt *gt, int timeout_ms)
> +{
> +	return intel_wakeref_wait_for_idle(&gt->wakeref, timeout_ms);
>   }
>   
>   void intel_gt_pm_init_early(struct intel_gt *gt);
> diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c
> index 718f2f1b6174..383a37521415 100644
> --- a/drivers/gpu/drm/i915/intel_wakeref.c
> +++ b/drivers/gpu/drm/i915/intel_wakeref.c
> @@ -111,14 +111,20 @@ void __intel_wakeref_init(struct intel_wakeref *wf,
>   			 "wakeref.work", &key->work, 0);
>   }
>   
> -int intel_wakeref_wait_for_idle(struct intel_wakeref *wf)
> +int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms)
>   {
> -	int err;
> +	int err = 0;
>   
>   	might_sleep();
>   
> -	err = wait_var_event_killable(&wf->wakeref,
> -				      !intel_wakeref_is_active(wf));
> +	if (!timeout_ms)
> +		err = wait_var_event_killable(&wf->wakeref,
> +					      !intel_wakeref_is_active(wf));
> +	else if (wait_var_event_timeout(&wf->wakeref,
> +					!intel_wakeref_is_active(wf),
> +					msecs_to_jiffies(timeout_ms)) < 1)
> +		err = -ETIMEDOUT;
> +
>   	if (err)
>   		return err;
>   
> diff --git a/drivers/gpu/drm/i915/intel_wakeref.h b/drivers/gpu/drm/i915/intel_wakeref.h
> index ec881b097368..302694a780d2 100644
> --- a/drivers/gpu/drm/i915/intel_wakeref.h
> +++ b/drivers/gpu/drm/i915/intel_wakeref.h
> @@ -251,15 +251,17 @@ __intel_wakeref_defer_park(struct intel_wakeref *wf)
>   /**
>    * intel_wakeref_wait_for_idle: Wait until the wakeref is idle
>    * @wf: the wakeref
> + * @timeout_ms: Timeout in ms, 0 means never timeout.
>    *
>    * Wait for the earlier asynchronous release of the wakeref. Note
>    * this will wait for any third party as well, so make sure you only wait
>    * when you have control over the wakeref and trust no one else is acquiring
>    * it.
>    *
> - * Return: 0 on success, error code if killed.
> + * Returns 0 on success, -ETIMEDOUT upon a timeout, or the unlikely
> + * error propagation from wait_var_event_killable if timeout_ms is 0.
>    */
> -int intel_wakeref_wait_for_idle(struct intel_wakeref *wf);
> +int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms);
>   
>   struct intel_wakeref_auto {
>   	struct drm_i915_private *i915;
Alan Previn Sept. 27, 2023, 4:36 p.m. UTC | #2
Thanks for taking the time to review this, Tvrtko; replies inline below.

On Wed, 2023-09-27 at 10:02 +0100, Tvrtko Ursulin wrote:
> On 26/09/2023 20:05, Alan Previn wrote:
> > When suspending, add a timeout when calling
> > intel_gt_pm_wait_for_idle else if we have a lost
> > G2H event that holds a wakeref (which would be
> > indicative of a bug elsewhere in the driver),
> > driver will at least complete the suspend-resume
> > cycle, (albeit not hitting all the targets for
> > low power hw counters), instead of hanging in the kernel.
alan:snip

> >   {
> > +	int timeout_ms = CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT ? : 10000;
> 
> CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT is in ns so assigning it to _ms is 
> a bit to arbitrary.
> 
> Why not the existing I915_GT_SUSPEND_IDLE_TIMEOUT for instance?
alan: good catch, my bad. However, I915_GT_SUSPEND_IDLE_TIMEOUT is only half a second,
which I feel is too quick. I guess I could create a new #define or use a multiple
of I915_GT_SUSPEND_IDLE_TIMEOUT?
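A minimal sketch of that option (the macro name and the x20 multiplier are invented
for illustration, and I915_GT_SUSPEND_IDLE_TIMEOUT is assumed to still be (HZ / 2),
i.e. jiffies, so it needs a jiffies_to_msecs() conversion before being used as a
millisecond value):

#define I915_GT_SUSPEND_PM_IDLE_TIMEOUT_MS \
	(20 * jiffies_to_msecs(I915_GT_SUSPEND_IDLE_TIMEOUT))	/* ~10 s with HZ/2 */

/* which would then replace the CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT fallback:
 *	int timeout_ms = I915_GT_SUSPEND_PM_IDLE_TIMEOUT_MS;
 */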

> >   	/*
> >   	 * On rare occasions, we've observed the fence completion trigger
> >   	 * free_engines asynchronously via rcu_call. Ensure those are done.
> > @@ -308,7 +309,10 @@ static void wait_for_suspend(struct intel_gt *gt)
> >   		intel_gt_retire_requests(gt);
> >   	}
> >   
> > -	intel_gt_pm_wait_for_idle(gt);
> > +	/* we are suspending, so we shouldn't be waiting forever */
> > +	if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms) == -ETIMEDOUT)
> > +		gt_warn(gt, "bailing from %s after %d milisec timeout\n",
> > +			__func__, timeout_ms);
> 
> Does the timeout in intel_gt_pm_wait_timeout_for_idle always comes in 
> pair with the timeout first in intel_gt_wait_for_idle?
alan: Not necessarily, ... IIRC in nearly all cases, the first wait call
completes in time (i.e. no more ongoing work) and the second wait
only waits if the last bit of work 'just finished'; then that second
wait may take a touch longer because of possible async gt-pm-put calls
(which appear in several places in the driver as part of regular runtime
operation, including request completion). NOTE: these are not high-end workloads
so the
> 
> Also, is the timeout here hit from the intel_gt_suspend_prepare, 
> intel_gt_suspend_late, or can be both?
> 
> Main concern is that we need to be sure there are no possible 
> ill-effects, like letting the GPU/GuC scribble on some memory we 
> unmapped (or will unmap), having let the suspend continue after timing 
> out, and not perhaps doing the forced wedge like wait_for_suspend() does 
> on the existing timeout path.
alan: this will not happen because the held wakeref is never force-released
after the timeout - so what happens is the kernel would bail the suspend.
> 
> Would it be possible to handle the lost G2H events directly in the 
> respective component instead of here? Like apply the timeout during the 
> step which explicitly idles the CT for suspend (presumably that 
> exists?), and so cleanup from there once declared a lost event.
alan: So yes, we don't solve the problem with this patch - Patch #2
is addressing the root cause - and verification is still ongoing - because it's
hard to thoroughly test / reproduce. (i.e. Patch #2 could get re-rev'd.)

The intent of this patch is to be informed that the gt-pm wait failed in prep for
suspending and that the kernel will eventually bail the suspend, that's all.
Because without this timed-out version of gt-pm-wait, if we did have a leaky
gt-wakeref, the kernel would hang here forever and we would need to get a serial
console or ramoops to eventually discover the kernel's cpu-hung error with a
call-stack pointing to this location. So the goal here is to help speed up
the future debug process (if, let's say, some future code change with another
async gt-pm-put came along and caused new problems after Patch #2 resolved
this issue).

Recap: so in both cases (original vs this patch), if we had a buggy gt-wakeref leak,
we don't get invalid guc-accesses; but without this patch we wait forever,
and with this patch we get some messages and eventually bail the suspend.

alan:snip
Tvrtko Ursulin Sept. 28, 2023, 12:46 p.m. UTC | #3
On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
> Thanks for taking the time to review this Tvrtko, replies inline below.
> 
> On Wed, 2023-09-27 at 10:02 +0100, Tvrtko Ursulin wrote:
>> On 26/09/2023 20:05, Alan Previn wrote:
>>> When suspending, add a timeout when calling
>>> intel_gt_pm_wait_for_idle else if we have a lost
>>> G2H event that holds a wakeref (which would be
>>> indicative of a bug elsewhere in the driver),
>>> driver will at least complete the suspend-resume
>>> cycle, (albeit not hitting all the targets for
>>> low power hw counters), instead of hanging in the kernel.
> alan:snip
> 
>>>    {
>>> +	int timeout_ms = CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT ? : 10000;
>>
>> CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT is in ns so assigning it to _ms is
>> a bit to arbitrary.
>>
>> Why not the existing I915_GT_SUSPEND_IDLE_TIMEOUT for instance?
> alan: good catch, my bad. However I915_GT_SUSPEND_IDLE_TIMEOUT is only half a sec
> which i feel is too quick. I guess i could create a new #define or a multiple
> of I915_GT_SUSPEND_IDLE_TIMEOUT?
> 
>>>    	/*
>>>    	 * On rare occasions, we've observed the fence completion trigger
>>>    	 * free_engines asynchronously via rcu_call. Ensure those are done.
>>> @@ -308,7 +309,10 @@ static void wait_for_suspend(struct intel_gt *gt)
>>>    		intel_gt_retire_requests(gt);
>>>    	}
>>>    
>>> -	intel_gt_pm_wait_for_idle(gt);
>>> +	/* we are suspending, so we shouldn't be waiting forever */
>>> +	if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms) == -ETIMEDOUT)
>>> +		gt_warn(gt, "bailing from %s after %d milisec timeout\n",
>>> +			__func__, timeout_ms);
>>
>> Does the timeout in intel_gt_pm_wait_timeout_for_idle always comes in
>> pair with the timeout first in intel_gt_wait_for_idle?
> alan: Not necessarily, ... IIRC in nearly all cases, the first wait call
> complete in time (i.e. no more ongoing work) and the second wait
> does wait only if the last bit of work 'just-finished', then that second
> wait may take a touch bit longer because of possible async gt-pm-put calls.
> (which appear in several places in the driver as part of regular runtime
> operation including request completion). NOTE: this not high end workloads
> so the
>>
>> Also, is the timeout here hit from the intel_gt_suspend_prepare,
>> intel_gt_suspend_late, or can be both?
>>
>> Main concern is that we need to be sure there are no possible
>> ill-effects, like letting the GPU/GuC scribble on some memory we
>> unmapped (or will unmap), having let the suspend continue after timing
>> out, and not perhaps doing the forced wedge like wait_for_suspend() does
>> on the existing timeout path.
> alan: this will not happen because the held wakeref is never force-released
> after the timeout - so what happens is the kernel would bail the suspend.

How does it know to fail the suspend when there is no error code 
returned with this timeout? Maybe a stupid question.. my knowledge of 
suspend-resume paths was not great even before I forgot it all.

>> Would it be possible to handle the lost G2H events directly in the
>> respective component instead of here? Like apply the timeout during the
>> step which explicitly idles the CT for suspend (presumably that
>> exists?), and so cleanup from there once declared a lost event.
> alan: So yes, we don't solve the problem with this patch - Patch#2
> is addressing the root cause - and verification is still ongoing - because its
> hard to thoroughly test / reproduce. (i.e. Patch 2 could get re-rev'd).
> 
> Intent of this patch, is to be informed that gt-pm wait failed in prep for
> suspending and kernel will eventually bail the suspend, that's all.
> Because without this timed-out version of gt-pm-wait, if we did have a leaky
> gt-wakeref, kernel will hang here forever and we will need to get serial
> console or ramoops to eventually discover the kernel's cpu hung error with
> call-stack pointing to this location. So the goal here is to help speed up
> future debug process (if let's say some future code change with another
> async gt-pm-put came along and caused new problems after Patch #2 resolved
> this time).
> 
> Recap: so in both cases (original vs this patch), if we had a buggy gt-wakeref leak,
> we dont get invalid guc-accesses, but without this patch, we wait forever,
> and with this patch, we get some messages and eventually bail the suspend.

Is it not possible to wait for the lost G2H in something like 
intel_uc_suspend() and simply declare "bad things happened" if it times 
out there, and forcibly clean it all up? (Which would include releasing 
all the abandoned pm refs, so this patch wouldn't be needed.)

Regards,

Tvrtko
Alan Previn Oct. 4, 2023, 5:59 p.m. UTC | #4
On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
> On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
> > Thanks for taking the time to review this Tvrtko, replies inline below.
alan:snip

> > > 
> > > Main concern is that we need to be sure there are no possible
> > > ill-effects, like letting the GPU/GuC scribble on some memory we
> > > unmapped (or will unmap), having let the suspend continue after timing
> > > out, and not perhaps doing the forced wedge like wait_for_suspend() does
> > > on the existing timeout path.
> > alan: this will not happen because the held wakeref is never force-released
> > after the timeout - so what happens is the kernel would bail the suspend.
> 
> How does it know to fail the suspend when there is no error code 
> returned with this timeout? Maybe a stupid question.. my knowledge of 
> suspend-resume paths was not great even before I forgot it all.
alan: Tvrtko, you and I both, sir. (Apologies for the tardy response - yet again a busy week.)
So I did trace back the gt->wakeref before I posted these patches and looked at the
runtime get/put calls: I believe the first 'get' leads to __intel_wakeref_get_first,
which calls intel_runtime_pm_get via the rpm_get helper and eventually executes
pm_runtime_get_sync(rpm->kdev); (hanging off i915). (Of course, there is a corresponding
path for '_put_last'.) So a non-first, non-last get just increases the counter for the gt...
but missing that last put means the kernel knows i915 hasn't 'put' everything.
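To illustrate just that first-get/last-put coupling, a heavily simplified sketch
(not the i915 intel_wakeref code - it ignores the locking and async-put handling
the real implementation needs; pm_runtime_get_sync()/pm_runtime_put() are the real
runtime-PM entry points, everything else is invented for illustration):

#include <linux/atomic.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>

struct sketch_wakeref {
	atomic_t count;
	struct device *dev;	/* the i915 kdev in the real driver */
};

static void sketch_wakeref_get(struct sketch_wakeref *wf)
{
	/* only the first get takes the device-level runtime-PM reference */
	if (atomic_inc_return(&wf->count) == 1)
		pm_runtime_get_sync(wf->dev);
}

static void sketch_wakeref_put(struct sketch_wakeref *wf)
{
	/*
	 * only the last put releases it; a leaked intermediate reference means
	 * this never runs, so the kernel can tell i915 has not 'put' everything
	 */
	if (atomic_dec_and_test(&wf->count))
		pm_runtime_put(wf->dev);
}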

alan:snip
> > 
> > Recap: so in both cases (original vs this patch), if we had a buggy gt-wakeref leak,
> > we dont get invalid guc-accesses, but without this patch, we wait forever,
> > and with this patch, we get some messages and eventually bail the suspend.
> 
> It is not possible to wait for lost G2H in something like 
> intel_uc_suspend() and simply declare "bad things happened" if it times 
> out there, and forcibly clean it all up? (Which would include releasing 
> all the abandoned pm refs, so this patch wouldn't be needed.)
> 
alan: I'm not sure if intel_uc_suspend should be held up by a gt-level wakeref
check unless huc/guc/gsc-uc are the only ones ever taking a gt wakeref.

What we do know from a uc perspective:
- ensure the outstanding guc-related workers are flushed, which we didn't do before
(addressed by patch #1).
- any further late H2G-SchedDisable no longer leaks wakerefs by calling H2G
without realizing it failed (addressed by patch #2).
- we already "forcibly clean it all up" at the end of intel_uc_suspend
when we do the guc reset and clean up all guc-ids (pre-existing upstream code).
- we previously didn't have a coherent guarantee that "this is the end", i.e. no
more new requests after intel_uc_suspend. By code logic we thought we did
(that's why intel_uc_suspend ends with a guc reset), but we now know otherwise.
So we fix that by adding the additional rcu_barrier (also part of patch #2).

That said, patch #3 is NOT fixing a bug in guc - it's about "if we ever have
a future racy gt-wakeref late-leak somewhere - no matter which subsystem
took it (guc is not the only subsystem taking gt wakerefs) - we at
least don't hang forever in this code". Of course, based on that, even without
patch #3 I am confident the issue is resolved anyway.
So we could just drop patch #3 if you prefer?

...alan
Tvrtko Ursulin Oct. 25, 2023, 12:58 p.m. UTC | #5
On 04/10/2023 18:59, Teres Alexis, Alan Previn wrote:
> On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
>> On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
>>> Thanks for taking the time to review this Tvrtko, replies inline below.
> alan:snip
> 
>>>>
>>>> Main concern is that we need to be sure there are no possible
>>>> ill-effects, like letting the GPU/GuC scribble on some memory we
>>>> unmapped (or will unmap), having let the suspend continue after timing
>>>> out, and not perhaps doing the forced wedge like wait_for_suspend() does
>>>> on the existing timeout path.
>>> alan: this will not happen because the held wakeref is never force-released
>>> after the timeout - so what happens is the kernel would bail the suspend.
>>
>> How does it know to fail the suspend when there is no error code
>> returned with this timeout? Maybe a stupid question.. my knowledge of
>> suspend-resume paths was not great even before I forgot it all.
> alan:Tvrtko, you and I both sir. (apologies for the tardy response yet again busy week).
> So i did trace back the gt->wakeref before i posted these patches and discovered that
> runtime get/put call, i believe that the first 'get' leads to __intel_wakeref_get_first
> which calls intel_runtime_pm_get via rpm_get helper and eventually executes a
> pm_runtime_get_sync(rpm->kdev); (hanging off i915). (ofc, there is a corresponding
> for '_put_last') - so non-first, non-last increases the counter for the gt...
> but this last miss will mean kernel knows i915 hasnt 'put' everything.
> 
> alan:snip
>>>
>>> Recap: so in both cases (original vs this patch), if we had a buggy gt-wakeref leak,
>>> we dont get invalid guc-accesses, but without this patch, we wait forever,
>>> and with this patch, we get some messages and eventually bail the suspend.
>>
>> It is not possible to wait for lost G2H in something like
>> intel_uc_suspend() and simply declare "bad things happened" if it times
>> out there, and forcibly clean it all up? (Which would include releasing
>> all the abandoned pm refs, so this patch wouldn't be needed.)
>>
> alan: I'm not sure if intel_uc_suspend should be held up by gt-level wakeref
> check unless huc/guc/gsc-uc are the only ones ever taking a gt wakeref.
> 
> As we already know, what we do know from a uc-perspective:
> -  ensure the outstanding guc related workers is flushed which we didnt before
> (addressed by patch #1).
> - any further late H2G-SchedDisable is not leaking wakerefs when calling H2G
> and not realizing it failed (addressed by patch #2).
> - (we already), "forcibly clean it all up" at the end of the intel_uc_suspend
> when we do the guc reset and cleanup all guc-ids. (pre-existing upstream code)
> - we previously didnt have a coherrent guarantee that "this is the end" i.e. no
> more new request after intel_uc_suspend. I mean by code logic, we thought we did
> (thats why intel_uc_suspend ends wth a guc reset), but we now know otherwise.
> So we that fix by adding the additional rcu_barrier (also part of patch #2).

It is not clear to me from the above if that includes cleaning up the 
outstanding CT replies or not. But anyway..

> 
> That said, patch-3 is NOT fixing a bug in guc -its about "if we ever have
> a future racy gt-wakeref late-leak somewhere - no matter which subsystem
> took it (guc is not the only subsystem taking gt wakerefs), we at
> least don't hang forever in this code. Ofc, based on that, even without
> patch-3 i am confident the issue is resolved anyway.
> So we could just drop patch-3 is you prefer?

.. given this it does sound to me that if you are confident patch 3 
isn't fixing anything today that it should be dropped.

Regards,

Tvrtko
Alan Previn Nov. 13, 2023, 5:57 p.m. UTC | #6
On Wed, 2023-10-25 at 13:58 +0100, Tvrtko Ursulin wrote:
> On 04/10/2023 18:59, Teres Alexis, Alan Previn wrote:
> > On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
> > > On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
alan:snip
> > > It is not possible to wait for lost G2H in something like
> > > intel_uc_suspend() and simply declare "bad things happened" if it times
> > > out there, and forcibly clean it all up? (Which would include releasing
> > > all the abandoned pm refs, so this patch wouldn't be needed.)
> > > 
> > alan: I'm not sure if intel_uc_suspend should be held up by gt-level wakeref
> > check unless huc/guc/gsc-uc are the only ones ever taking a gt wakeref.
> > 
> > As we already know, what we do know from a uc-perspective:
> > -  ensure the outstanding guc related workers is flushed which we didnt before
> > (addressed by patch #1).
> > - any further late H2G-SchedDisable is not leaking wakerefs when calling H2G
> > and not realizing it failed (addressed by patch #2).
> > - (we already), "forcibly clean it all up" at the end of the intel_uc_suspend
> > when we do the guc reset and cleanup all guc-ids. (pre-existing upstream code)
> > - we previously didnt have a coherrent guarantee that "this is the end" i.e. no
> > more new request after intel_uc_suspend. I mean by code logic, we thought we did
> > (thats why intel_uc_suspend ends wth a guc reset), but we now know otherwise.
> > So we that fix by adding the additional rcu_barrier (also part of patch #2).
> 
> It is not clear to me from the above if that includes cleaning up the 
> outstanding CT replies or no. But anyway..
alan: Apologies, I should have made it clearer by citing the actual function
names and calling out the steps of interest: if you look at the function
"intel_gt_suspend_late", it calls "intel_uc_suspend", which in turn calls 
"intel_guc_suspend", which does a soft reset onto the guc firmware - so after that
we can assume all outstanding G2Hs are done. Going back up the stack,
intel_gt_suspend_late finally calls gt_sanitize, which calls "intel_uc_reset_prepare",
which calls "intel_guc_submission_reset_prepare", which calls
"scrub_guc_desc_for_outstanding_g2h", which does what we are looking for for all
types of outstanding G2H. As per prior review comments, besides closing the race
condition, we were missing an rcu_barrier (which caused new requests to come in way
late). So we have resolved both in Patch #2.
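Paraphrasing that description as a rough call-chain sketch (the nesting here is
illustrative, taken from the wording above rather than copied from the source):

/*
 * intel_gt_suspend_late(gt)
 *   -> intel_uc_suspend()
 *        -> intel_guc_suspend()       // soft-resets the GuC fw, so all
 *                                     // outstanding G2Hs are done after this
 *   -> gt_sanitize()
 *        -> intel_uc_reset_prepare()
 *             -> intel_guc_submission_reset_prepare()
 *                  -> scrub_guc_desc_for_outstanding_g2h()
 *                                     // scrubs all types of outstanding G2H
 */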

> > That said, patch-3 is NOT fixing a bug in guc -its about "if we ever have
> > a future racy gt-wakeref late-leak somewhere - no matter which subsystem
> > took it (guc is not the only subsystem taking gt wakerefs), we at
> > least don't hang forever in this code. Ofc, based on that, even without
> > patch-3 i am confident the issue is resolved anyway.
> > So we could just drop patch-3 is you prefer?
> 
> .. given this it does sound to me that if you are confident patch 3 
> isn't fixing anything today that it should be dropped.
alan: I won't say it's NOT fixing anything - I am saying it's not fixing
this specific bug, where we have the outstanding G2H from a context destruction
operation that got dropped. I am okay with dropping this patch in the next rev
but shall post a separate standalone version of Patch #3 - because I believe
all other i915 subsystems that take runtime-pm refs DO NOT wait forever when entering
suspend - but GT is doing this. This means that if there ever was a bug introduced,
it would require serial port or ramoops collection to debug. So I think such a
patch, despite not fixing this specific bug, will be very helpful for debuggability
of future issues. After all, it's better to fail our suspend when we have a bug
rather than to hang the kernel forever.
Tvrtko Ursulin Nov. 14, 2023, 5:27 p.m. UTC | #7
On 13/11/2023 17:57, Teres Alexis, Alan Previn wrote:
> On Wed, 2023-10-25 at 13:58 +0100, Tvrtko Ursulin wrote:
>> On 04/10/2023 18:59, Teres Alexis, Alan Previn wrote:
>>> On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
>>>> On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
> alan:snip
>>>> It is not possible to wait for lost G2H in something like
>>>> intel_uc_suspend() and simply declare "bad things happened" if it times
>>>> out there, and forcibly clean it all up? (Which would include releasing
>>>> all the abandoned pm refs, so this patch wouldn't be needed.)
>>>>
>>> alan: I'm not sure if intel_uc_suspend should be held up by gt-level wakeref
>>> check unless huc/guc/gsc-uc are the only ones ever taking a gt wakeref.
>>>
>>> As we already know, what we do know from a uc-perspective:
>>> -  ensure the outstanding guc related workers is flushed which we didnt before
>>> (addressed by patch #1).
>>> - any further late H2G-SchedDisable is not leaking wakerefs when calling H2G
>>> and not realizing it failed (addressed by patch #2).
>>> - (we already), "forcibly clean it all up" at the end of the intel_uc_suspend
>>> when we do the guc reset and cleanup all guc-ids. (pre-existing upstream code)
>>> - we previously didnt have a coherrent guarantee that "this is the end" i.e. no
>>> more new request after intel_uc_suspend. I mean by code logic, we thought we did
>>> (thats why intel_uc_suspend ends wth a guc reset), but we now know otherwise.
>>> So we that fix by adding the additional rcu_barrier (also part of patch #2).
>>
>> It is not clear to me from the above if that includes cleaning up the
>> outstanding CT replies or no. But anyway..
> alan: Apologies, i should have made it clearer by citing the actual function
> name calling out the steps of interest: So if you look at the function
> "intel_gt_suspend_late", it calls "intel_uc_suspend" which in turn calls
> "intel_guc_suspend" which does a soft reset onto guc firmware - so after that
> we can assume all outstanding G2Hs are done. Going back up the stack,
> intel_gt_suspend_late finally calls gt_sanitize which calls "intel_uc_reset_prepare"
> which calls "intel_guc_submission_reset_prepare" which calls
> "scrub_guc_desc_for_outstanding_g2h" which does what we are looking for for all
> types of outstanding G2H. As per prior review comments, besides closing the race
> condition, we were missing an rcu_barrier (which caused new requests to come in way
> late). So we have resolved both in Patch #2.

Cool, I read this as all known bugs being fixed by the first two patches.

>>> That said, patch-3 is NOT fixing a bug in guc -its about "if we ever have
>>> a future racy gt-wakeref late-leak somewhere - no matter which subsystem
>>> took it (guc is not the only subsystem taking gt wakerefs), we at
>>> least don't hang forever in this code. Ofc, based on that, even without
>>> patch-3 i am confident the issue is resolved anyway.
>>> So we could just drop patch-3 is you prefer?
>>
>> .. given this it does sound to me that if you are confident patch 3
>> isn't fixing anything today that it should be dropped.
> alan: I won't say its NOT fixing anything - i am saying it's not fixing
> this specific bug where we have the outstanding G2H from a context destruction
> operation that got dropped. I am okay with dropping this patch in the next rev
> but shall post a separate stand alone version of Patch3 - because I believe
> all other i915 subsystems that take runtime-pm's DO NOT wait forever when entering
> suspend - but GT is doing this. This means if there ever was a bug introduced,
> it would require serial port or ramoops collection to debug. So i think such a
> patch, despite not fixing this specific bug will be very helpful for debuggability
> of future issues. After all, its better to fail our suspend when we have a bug
> rather than to hang the kernel forever.

Issue I have is that I am not seeing how it fails the suspend when 
nothing is passed out from *void* wait_for_suspend(gt). To me it looks like the 
suspend just carries on. And if so, it sounds dangerous to allow it to 
do that with any future/unknown bugs in the suspend sequence. The existing 
timeout wedges the GPU (and all that entails). The new code just says "meh, 
I'll just carry on regardless".

Regards,

Tvrtko
Rodrigo Vivi Nov. 14, 2023, 5:36 p.m. UTC | #8
On Tue, Nov 14, 2023 at 05:27:18PM +0000, Tvrtko Ursulin wrote:
> 
> On 13/11/2023 17:57, Teres Alexis, Alan Previn wrote:
> > On Wed, 2023-10-25 at 13:58 +0100, Tvrtko Ursulin wrote:
> > > On 04/10/2023 18:59, Teres Alexis, Alan Previn wrote:
> > > > On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
> > > > > On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
> > alan:snip
> > > > > It is not possible to wait for lost G2H in something like
> > > > > intel_uc_suspend() and simply declare "bad things happened" if it times
> > > > > out there, and forcibly clean it all up? (Which would include releasing
> > > > > all the abandoned pm refs, so this patch wouldn't be needed.)
> > > > > 
> > > > alan: I'm not sure if intel_uc_suspend should be held up by gt-level wakeref
> > > > check unless huc/guc/gsc-uc are the only ones ever taking a gt wakeref.
> > > > 
> > > > As we already know, what we do know from a uc-perspective:
> > > > -  ensure the outstanding guc related workers is flushed which we didnt before
> > > > (addressed by patch #1).
> > > > - any further late H2G-SchedDisable is not leaking wakerefs when calling H2G
> > > > and not realizing it failed (addressed by patch #2).
> > > > - (we already), "forcibly clean it all up" at the end of the intel_uc_suspend
> > > > when we do the guc reset and cleanup all guc-ids. (pre-existing upstream code)
> > > > - we previously didnt have a coherrent guarantee that "this is the end" i.e. no
> > > > more new request after intel_uc_suspend. I mean by code logic, we thought we did
> > > > (thats why intel_uc_suspend ends wth a guc reset), but we now know otherwise.
> > > > So we that fix by adding the additional rcu_barrier (also part of patch #2).
> > > 
> > > It is not clear to me from the above if that includes cleaning up the
> > > outstanding CT replies or no. But anyway..
> > alan: Apologies, i should have made it clearer by citing the actual function
> > name calling out the steps of interest: So if you look at the function
> > "intel_gt_suspend_late", it calls "intel_uc_suspend" which in turn calls
> > "intel_guc_suspend" which does a soft reset onto guc firmware - so after that
> > we can assume all outstanding G2Hs are done. Going back up the stack,
> > intel_gt_suspend_late finally calls gt_sanitize which calls "intel_uc_reset_prepare"
> > which calls "intel_guc_submission_reset_prepare" which calls
> > "scrub_guc_desc_for_outstanding_g2h" which does what we are looking for for all
> > types of outstanding G2H. As per prior review comments, besides closing the race
> > condition, we were missing an rcu_barrier (which caused new requests to come in way
> > late). So we have resolved both in Patch #2.
> 
> Cool, I read this as all known bugs are fixed by first two patches.
> 
> > > > That said, patch-3 is NOT fixing a bug in guc -its about "if we ever have
> > > > a future racy gt-wakeref late-leak somewhere - no matter which subsystem
> > > > took it (guc is not the only subsystem taking gt wakerefs), we at
> > > > least don't hang forever in this code. Ofc, based on that, even without
> > > > patch-3 i am confident the issue is resolved anyway.
> > > > So we could just drop patch-3 is you prefer?
> > > 
> > > .. given this it does sound to me that if you are confident patch 3
> > > isn't fixing anything today that it should be dropped.
> > alan: I won't say its NOT fixing anything - i am saying it's not fixing
> > this specific bug where we have the outstanding G2H from a context destruction
> > operation that got dropped. I am okay with dropping this patch in the next rev
> > but shall post a separate stand alone version of Patch3 - because I believe
> > all other i915 subsystems that take runtime-pm's DO NOT wait forever when entering
> > suspend - but GT is doing this. This means if there ever was a bug introduced,
> > it would require serial port or ramoops collection to debug. So i think such a
> > patch, despite not fixing this specific bug will be very helpful for debuggability
> > of future issues. After all, its better to fail our suspend when we have a bug
> > rather than to hang the kernel forever.
> 
> Issue I have is that I am not seeing how it fails the suspend when nothing
> is passed out from *void* wait_suspend(gt). To me it looks the suspend just
> carries on. And if so, it sounds dangerous to allow it to do that with any
> future/unknown bugs in the suspend sequence. Existing timeout wedges the GPU
> (and all that entails). New code just says "meh I'll just carry on
> regardless".

That's a good point.
Well, the current code is already bad and buggy on suspend-resume. We could get
suspend stuck forever without any clue of what happened.
At least the proposed patch adds some gt_warn. But, yes, the right thing
to do would be to entirely abort the suspend in case of a timeout, besides the
gt_warn.

> 
> Regards,
> 
> Tvrtko
>
Alan Previn Nov. 14, 2023, 5:37 p.m. UTC | #9
On Tue, 2023-11-14 at 17:27 +0000, Tvrtko Ursulin wrote:
> On 13/11/2023 17:57, Teres Alexis, Alan Previn wrote:
> > On Wed, 2023-10-25 at 13:58 +0100, Tvrtko Ursulin wrote:
> > > On 04/10/2023 18:59, Teres Alexis, Alan Previn wrote:
> > > > On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
> > > > > On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
alan: snip
> 
> > alan: I won't say its NOT fixing anything - i am saying it's not fixing
> > this specific bug where we have the outstanding G2H from a context destruction
> > operation that got dropped. I am okay with dropping this patch in the next rev
> > but shall post a separate stand alone version of Patch3 - because I believe
> > all other i915 subsystems that take runtime-pm's DO NOT wait forever when entering
> > suspend - but GT is doing this. This means if there ever was a bug introduced,
> > it would require serial port or ramoops collection to debug. So i think such a
> > patch, despite not fixing this specific bug will be very helpful for debuggability
> > of future issues. After all, its better to fail our suspend when we have a bug
> > rather than to hang the kernel forever.
> 
> Issue I have is that I am not seeing how it fails the suspend when 
> nothing is passed out from *void* wait_suspend(gt). To me it looks the 
> suspend just carries on. And if so, it sounds dangerous to allow it to 
> do that with any future/unknown bugs in the suspend sequence. Existing 
> timeout wedges the GPU (and all that entails). New code just says "meh 
> I'll just carry on regardless".

alan: So I did trace back the gt->wakeref before I posted these patches and
see that within these runtime get/put calls, I believe the first 'get' leads
to __intel_wakeref_get_first, which calls intel_runtime_pm_get via the rpm_get
helper and eventually executes pm_runtime_get_sync(rpm->kdev); (hanging off
i915_device). (Naturally there is a corresponding mirror for the '_put_last'.)
So this means the first-get and last-put let the kernel know. That's why, when
I tested this patch, it did actually cause the suspend to abort from the kernel side
and the kernel would print a message indicating i915 was the one that didn't
release all refs.

alan: Anyway, I have pulled this patch out of rev6 of this series and created a
separate standalone patch for this patch #3 that we can review independently.
Tvrtko Ursulin Nov. 14, 2023, 5:52 p.m. UTC | #10
On 14/11/2023 17:37, Teres Alexis, Alan Previn wrote:
> On Tue, 2023-11-14 at 17:27 +0000, Tvrtko Ursulin wrote:
>> On 13/11/2023 17:57, Teres Alexis, Alan Previn wrote:
>>> On Wed, 2023-10-25 at 13:58 +0100, Tvrtko Ursulin wrote:
>>>> On 04/10/2023 18:59, Teres Alexis, Alan Previn wrote:
>>>>> On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
>>>>>> On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
> alan: snip
>>
>>> alan: I won't say its NOT fixing anything - i am saying it's not fixing
>>> this specific bug where we have the outstanding G2H from a context destruction
>>> operation that got dropped. I am okay with dropping this patch in the next rev
>>> but shall post a separate stand alone version of Patch3 - because I believe
>>> all other i915 subsystems that take runtime-pm's DO NOT wait forever when entering
>>> suspend - but GT is doing this. This means if there ever was a bug introduced,
>>> it would require serial port or ramoops collection to debug. So i think such a
>>> patch, despite not fixing this specific bug will be very helpful for debuggability
>>> of future issues. After all, its better to fail our suspend when we have a bug
>>> rather than to hang the kernel forever.
>>
>> Issue I have is that I am not seeing how it fails the suspend when
>> nothing is passed out from *void* wait_suspend(gt). To me it looks the
>> suspend just carries on. And if so, it sounds dangerous to allow it to
>> do that with any future/unknown bugs in the suspend sequence. Existing
>> timeout wedges the GPU (and all that entails). New code just says "meh
>> I'll just carry on regardless".
> 
> alan: So i did trace back the gt->wakeref before i posted these patches and
> see that within these runtime get/put calls, i believe the first 'get' leads
> to __intel_wakeref_get_first which calls intel_runtime_pm_get via rpm_get
> helper and eventually executes a pm_runtime_get_sync(rpm->kdev); (hanging off
> i915_device). (naturally there is a corresponding mirros for the '_put_last').
> So this means the first-get and last-put lets the kernel know. Thats why when
> i tested this patch, it did actually cause the suspend to abort from kernel side
> and the kernel would print a message indicating i915 was the one that didnt
> release all refs.

Ah that would be much better then.

Do you know if everything gets resumed/restored correctly in that case 
or whether we would need some additional work to maybe early-exit from callers 
of wait_for_suspend()?

What I would also ask is to see if something like injecting a probing 
failure is feasible, so we can have this new timeout exit path 
constantly/regularly tested in CI.

Regards,

Tvrtko

> alan: Anyways, i have pulled this patch out of rev6 of this series and created a
> separate standalone patch for this patch #3 that we review independently.
>
Alan Previn Nov. 14, 2023, 7:34 p.m. UTC | #11
On Tue, 2023-11-14 at 12:36 -0500, Vivi, Rodrigo wrote:
> On Tue, Nov 14, 2023 at 05:27:18PM +0000, Tvrtko Ursulin wrote:
> > 
> > On 13/11/2023 17:57, Teres Alexis, Alan Previn wrote:
> > > On Wed, 2023-10-25 at 13:58 +0100, Tvrtko Ursulin wrote:
> > > > On 04/10/2023 18:59, Teres Alexis, Alan Previn wrote:
> > > > > On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
> > > > > > On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
> 
> That's a good point.
> Well, current code is already bad and buggy on suspend-resume. We could get
> suspend stuck forever without any clue of what happened.
> At least the proposed patch add some gt_warn. But, yes, the right thing
> to do should be entirely abort the suspend in case of timeout, besides the
> gt_warn.
alan: yes - that was the whole idea for Patch #3. Only after putting in such
code did we have much better debuggability on real-world production platforms
+ configs that may not have a serial port or ramoops dump by default.

Btw, as per previous comments by Tvrtko - which I agreed with - I have
moved this one single patch into a separate patch on its own.
See here: https://patchwork.freedesktop.org/series/126414/
(I also maintained the RB from you, Rodrigo, because the patch did not change.)
And yes, the gt_warn is still in place :)
Alan Previn Nov. 14, 2023, 7:48 p.m. UTC | #12
On Tue, 2023-11-14 at 17:52 +0000, Tvrtko Ursulin wrote:
> On 14/11/2023 17:37, Teres Alexis, Alan Previn wrote:
> > On Tue, 2023-11-14 at 17:27 +0000, Tvrtko Ursulin wrote:
> > > On 13/11/2023 17:57, Teres Alexis, Alan Previn wrote:
> > > > On Wed, 2023-10-25 at 13:58 +0100, Tvrtko Ursulin wrote:
> > > > > On 04/10/2023 18:59, Teres Alexis, Alan Previn wrote:
> > > > > > On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
> > > > > > > On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
> 
alan:snip

> > alan: So i did trace back the gt->wakeref before i posted these patches and
> > see that within these runtime get/put calls, i believe the first 'get' leads
> > to __intel_wakeref_get_first which calls intel_runtime_pm_get via rpm_get
> > helper and eventually executes a pm_runtime_get_sync(rpm->kdev); (hanging off
> > i915_device). (naturally there is a corresponding mirros for the '_put_last').
> > So this means the first-get and last-put lets the kernel know. Thats why when
> > i tested this patch, it did actually cause the suspend to abort from kernel side
> > and the kernel would print a message indicating i915 was the one that didnt
> > release all refs.
> 
> Ah that would be much better then.
> 
> Do you know if everything gets resumed/restored correctly in that case 
> or we would need some additional work to maybe early exit from callers 
> of wait_for_suspend()?
alan: So assuming we are still discussing a "potentially new
future leaked-wakeref bug" (i.e. putting aside the fact that
Patch #1 + #2 resolve this specific series' bug), based on the
previous testing we did, after this timeout-bail trigger
the suspend flow bails and gt/guc operation does actually continue
as normal. However, it's been a long time since we tested this, so
I am not sure how accidental new future bugs might play into this
assumption, especially if some other subsystem leaked the rpm
wakeref but that subsystem did NOT get reset the way GuC is reset
at the end of suspend.

> 
> What I would also ask is to see if something like injecting a probing 
> failure is feasible, so we can have this new timeout exit path 
> constantly/regularly tested in CI.
alan: That's a good idea. In line with this, I would like to point out that
rev6 of this series has been posted but I removed this Patch #3. However, I did
post this Patch #3 as a standalone patch here: https://patchwork.freedesktop.org/series/126414/
as I anticipate this patch will truly help with future issue debuggability.

That said, I shall post a review on that patch with your suggestion to add
an injected probe error for the suspend-resume flow and follow up on that one
separately.

> 
> Regards,
> 
> Tvrtko
> 
> > alan: Anyways, i have pulled this patch out of rev6 of this series and created a
> > separate standalone patch for this patch #3 that we review independently.
> >
Tvrtko Ursulin Nov. 16, 2023, 10:19 a.m. UTC | #13
On 14/11/2023 19:48, Teres Alexis, Alan Previn wrote:
> On Tue, 2023-11-14 at 17:52 +0000, Tvrtko Ursulin wrote:
>> On 14/11/2023 17:37, Teres Alexis, Alan Previn wrote:
>>> On Tue, 2023-11-14 at 17:27 +0000, Tvrtko Ursulin wrote:
>>>> On 13/11/2023 17:57, Teres Alexis, Alan Previn wrote:
>>>>> On Wed, 2023-10-25 at 13:58 +0100, Tvrtko Ursulin wrote:
>>>>>> On 04/10/2023 18:59, Teres Alexis, Alan Previn wrote:
>>>>>>> On Thu, 2023-09-28 at 13:46 +0100, Tvrtko Ursulin wrote:
>>>>>>>> On 27/09/2023 17:36, Teres Alexis, Alan Previn wrote:
>>
> alan:snip
> 
>>> alan: So i did trace back the gt->wakeref before i posted these patches and
>>> see that within these runtime get/put calls, i believe the first 'get' leads
>>> to __intel_wakeref_get_first which calls intel_runtime_pm_get via rpm_get
>>> helper and eventually executes a pm_runtime_get_sync(rpm->kdev); (hanging off
>>> i915_device). (naturally there is a corresponding mirros for the '_put_last').
>>> So this means the first-get and last-put lets the kernel know. Thats why when
>>> i tested this patch, it did actually cause the suspend to abort from kernel side
>>> and the kernel would print a message indicating i915 was the one that didnt
>>> release all refs.
>>
>> Ah that would be much better then.
>>
>> Do you know if everything gets resumed/restored correctly in that case
>> or we would need some additional work to maybe early exit from callers
>> of wait_for_suspend()?
> alan: So assuming we are still discussing about a "potentially new
> future leaked-wakeref bug" (i.e. putting aside the fact that
> Patch #1 + #2 resolves this specific series' bug), based on the
> previous testing we did, after this timeout-bail trigger,
> the suspend flow bails and gt/guc operation does actually continue
> as normal. However, its been a long time since we tested this so
> i am not sure of how accidental-new-future bugs might play to this
> assumption especially if some other subsystem that leaked the rpm
> wakref but that subsystem did NOT get reset like how GuC is reset
> at the end of suspend.
> 
>>
>> What I would also ask is to see if something like injecting a probing
>> failure is feasible, so we can have this new timeout exit path
>> constantly/regularly tested in CI.
> alan: Thats a good idea. In line with this, i would like to point out that
> rev6 of this series has been posted but i removed this Patch #3. However i did
> post this Patch #3 as a standalone patch here: https://patchwork.freedesktop.org/series/126414/
> as i anticipate this patch will truly help with future issue debuggability.
> 
> That said, i shall post a review on that patch with your suggestion to add
> an injected probe error for the suspend-resume flow and follow up on that one
> separately.

Cool! I don't know exactly how to do it, but if we come up with a way and 
so gain IGT coverage then I am okay with the patch in principle.

Like perhaps some new debugfs api needs to be added to provoke the 
timeout error path on suspend, or something.
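As a purely hypothetical sketch of such a knob (every name below is invented and
the wiring into wait_for_suspend() is only hinted at; this is not a proposal of
the actual interface, just an illustration of the idea):

#include <linux/debugfs.h>

/* set from userspace to force the new timeout path during a suspend test */
static bool i915_force_suspend_idle_timeout;

static void sketch_register_suspend_knob(struct dentry *i915_debugfs_root)
{
	debugfs_create_bool("force_suspend_idle_timeout", 0600,
			    i915_debugfs_root,
			    &i915_force_suspend_idle_timeout);
}

/*
 * wait_for_suspend() could then e.g. shrink timeout_ms to 1 (or skip straight
 * to the gt_warn/bail path) when the knob is set, so an IGT test can exercise
 * the timeout exit without needing a real leaked wakeref.
 */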

Regards,

Tvrtko

> 
>>
>> Regards,
>>
>> Tvrtko
>>
>>> alan: Anyways, i have pulled this patch out of rev6 of this series and created a
>>> separate standalone patch for this patch #3 that we review independently.
>>>
>

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 84a75c95f3f7..9c6151b78e1d 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -687,7 +687,7 @@  void intel_engines_release(struct intel_gt *gt)
 		if (!engine->release)
 			continue;
 
-		intel_wakeref_wait_for_idle(&engine->wakeref);
+		intel_wakeref_wait_for_idle(&engine->wakeref, 0);
 		GEM_BUG_ON(intel_engine_pm_is_awake(engine));
 
 		engine->release(engine);
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 59b5658a17fb..820217c06dc7 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -289,6 +289,7 @@  int intel_gt_resume(struct intel_gt *gt)
 
 static void wait_for_suspend(struct intel_gt *gt)
 {
+	int timeout_ms = CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT ? : 10000;
 	/*
 	 * On rare occasions, we've observed the fence completion trigger
 	 * free_engines asynchronously via rcu_call. Ensure those are done.
@@ -308,7 +309,10 @@  static void wait_for_suspend(struct intel_gt *gt)
 		intel_gt_retire_requests(gt);
 	}
 
-	intel_gt_pm_wait_for_idle(gt);
+	/* we are suspending, so we shouldn't be waiting forever */
+	if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms) == -ETIMEDOUT)
+		gt_warn(gt, "bailing from %s after %d millisec timeout\n",
+			__func__, timeout_ms);
 }
 
 void intel_gt_suspend_prepare(struct intel_gt *gt)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
index 6c9a46452364..5358acc2b5b1 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
@@ -68,7 +68,12 @@  static inline void intel_gt_pm_might_put(struct intel_gt *gt)
 
 static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
 {
-	return intel_wakeref_wait_for_idle(&gt->wakeref);
+	return intel_wakeref_wait_for_idle(&gt->wakeref, 0);
+}
+
+static inline int intel_gt_pm_wait_timeout_for_idle(struct intel_gt *gt, int timeout_ms)
+{
+	return intel_wakeref_wait_for_idle(&gt->wakeref, timeout_ms);
 }
 
 void intel_gt_pm_init_early(struct intel_gt *gt);
diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c
index 718f2f1b6174..383a37521415 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.c
+++ b/drivers/gpu/drm/i915/intel_wakeref.c
@@ -111,14 +111,20 @@  void __intel_wakeref_init(struct intel_wakeref *wf,
 			 "wakeref.work", &key->work, 0);
 }
 
-int intel_wakeref_wait_for_idle(struct intel_wakeref *wf)
+int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms)
 {
-	int err;
+	int err = 0;
 
 	might_sleep();
 
-	err = wait_var_event_killable(&wf->wakeref,
-				      !intel_wakeref_is_active(wf));
+	if (!timeout_ms)
+		err = wait_var_event_killable(&wf->wakeref,
+					      !intel_wakeref_is_active(wf));
+	else if (wait_var_event_timeout(&wf->wakeref,
+					!intel_wakeref_is_active(wf),
+					msecs_to_jiffies(timeout_ms)) < 1)
+		err = -ETIMEDOUT;
+
 	if (err)
 		return err;
 
diff --git a/drivers/gpu/drm/i915/intel_wakeref.h b/drivers/gpu/drm/i915/intel_wakeref.h
index ec881b097368..302694a780d2 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.h
+++ b/drivers/gpu/drm/i915/intel_wakeref.h
@@ -251,15 +251,17 @@  __intel_wakeref_defer_park(struct intel_wakeref *wf)
 /**
  * intel_wakeref_wait_for_idle: Wait until the wakeref is idle
  * @wf: the wakeref
+ * @timeout_ms: Timeout in ms, 0 means never timeout.
  *
  * Wait for the earlier asynchronous release of the wakeref. Note
  * this will wait for any third party as well, so make sure you only wait
  * when you have control over the wakeref and trust no one else is acquiring
  * it.
  *
- * Return: 0 on success, error code if killed.
+ * Returns 0 on success, -ETIMEDOUT upon a timeout, or the unlikely
+ * error propagation from wait_var_event_killable if timeout_ms is 0.
  */
-int intel_wakeref_wait_for_idle(struct intel_wakeref *wf);
+int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms);
 
 struct intel_wakeref_auto {
 	struct drm_i915_private *i915;