diff mbox series

[v1,1/3] drm/i915/guc: Flush context destruction worker at suspend

Message ID 20230802233501.17074-2-alan.previn.teres.alexis@intel.com (mailing list archive)
State New, archived
Headers show
Series Resolve suspend-resume racing with GuC destroy-context-worker | expand

Commit Message

Alan Previn Aug. 2, 2023, 11:34 p.m. UTC
Suspend is not like reset, it can unroll, so we have to properly
flush pending context-guc-id deregistrations to complete before
we return from suspend calls.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gt_pm.c             | 6 +++++-
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 +++++
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h | 2 ++
 drivers/gpu/drm/i915/gt/uc/intel_uc.c             | 7 +++++++
 drivers/gpu/drm/i915/gt/uc/intel_uc.h             | 1 +
 5 files changed, 20 insertions(+), 1 deletion(-)

Comments

Rodrigo Vivi Aug. 7, 2023, 5:52 p.m. UTC | #1
On Wed, Aug 02, 2023 at 04:34:59PM -0700, Alan Previn wrote:
> Suspend is not like reset, it can unroll, so we have to properly
> flush pending context-guc-id deregistrations to complete before
> we return from suspend calls.

But if it 'unrolls' the execution should just continue, no?!
In other words, why is this flush needed? What happens if we
don't flush, but resume doesn't proceed? In which case
of resume are you thinking that it returns without having flushed?

> 
> Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_gt_pm.c             | 6 +++++-
>  drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 +++++
>  drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h | 2 ++
>  drivers/gpu/drm/i915/gt/uc/intel_uc.c             | 7 +++++++
>  drivers/gpu/drm/i915/gt/uc/intel_uc.h             | 1 +
>  5 files changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> index 5a942af0a14e..3162d859ed68 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> @@ -289,8 +289,10 @@ int intel_gt_resume(struct intel_gt *gt)
>  
>  static void wait_for_suspend(struct intel_gt *gt)
>  {
> -	if (!intel_gt_pm_is_awake(gt))
> +	if (!intel_gt_pm_is_awake(gt)) {
> +		intel_uc_suspend_prepare(&gt->uc);

why only on idle?

Well, I know, if we are in idle it is because all the requests had
already ended and gt will be wedged, but why do we need to do anything
if we are in idle?

And why here and not some upper layer? like in prepare....

>  		return;
> +	}
>  
>  	if (intel_gt_wait_for_idle(gt, I915_GT_SUSPEND_IDLE_TIMEOUT) == -ETIME) {
>  		/*
> @@ -299,6 +301,8 @@ static void wait_for_suspend(struct intel_gt *gt)
>  		 */
>  		intel_gt_set_wedged(gt);
>  		intel_gt_retire_requests(gt);
> +	} else {
> +		intel_uc_suspend_prepare(&gt->uc);
>  	}
>  
>  	intel_gt_pm_wait_for_idle(gt);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index a0e3ef1c65d2..dc7735a19a5a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1578,6 +1578,11 @@ static void guc_flush_submissions(struct intel_guc *guc)
>  	spin_unlock_irqrestore(&sched_engine->lock, flags);
>  }
>  
> +void intel_guc_submission_suspend_prepare(struct intel_guc *guc)
> +{
> +	flush_work(&guc->submission_state.destroyed_worker);
> +}
> +
>  static void guc_flush_destroyed_contexts(struct intel_guc *guc);
>  
>  void intel_guc_submission_reset_prepare(struct intel_guc *guc)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index c57b29cdb1a6..7f0705ece74b 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -38,6 +38,8 @@ int intel_guc_wait_for_pending_msg(struct intel_guc *guc,
>  				   bool interruptible,
>  				   long timeout);
>  
> +void intel_guc_submission_suspend_prepare(struct intel_guc *guc);
> +
>  static inline bool intel_guc_submission_is_supported(struct intel_guc *guc)
>  {
>  	return guc->submission_supported;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> index 18250fb64bd8..468d7b397927 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> @@ -679,6 +679,13 @@ void intel_uc_runtime_suspend(struct intel_uc *uc)
>  	guc_disable_communication(guc);
>  }
>  
> +void intel_uc_suspend_prepare(struct intel_uc *uc)
> +{
> +	struct intel_guc *guc = &uc->guc;
> +
> +	intel_guc_submission_suspend_prepare(guc);
> +}
> +
>  void intel_uc_suspend(struct intel_uc *uc)
>  {
>  	struct intel_guc *guc = &uc->guc;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.h b/drivers/gpu/drm/i915/gt/uc/intel_uc.h
> index 014bb7d83689..036877a07261 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.h
> @@ -49,6 +49,7 @@ void intel_uc_reset_prepare(struct intel_uc *uc);
>  void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled);
>  void intel_uc_reset_finish(struct intel_uc *uc);
>  void intel_uc_cancel_requests(struct intel_uc *uc);
> +void intel_uc_suspend_prepare(struct intel_uc *uc);
>  void intel_uc_suspend(struct intel_uc *uc);
>  void intel_uc_runtime_suspend(struct intel_uc *uc);
>  int intel_uc_resume(struct intel_uc *uc);
> -- 
> 2.39.0
>
Alan Previn Aug. 9, 2023, 9:09 p.m. UTC | #2
Thanks Rodrigo for reviewing this.

On Mon, 2023-08-07 at 13:52 -0400, Vivi, Rodrigo wrote:
> On Wed, Aug 02, 2023 at 04:34:59PM -0700, Alan Previn wrote:
> > Suspend is not like reset, it can unroll, so we have to properly
> > flush pending context-guc-id deregistrations to complete before
> > we return from suspend calls.
> 
> But if it 'unrolls' the execution should just continue, no?!
> In other words, why is this flush needed? What happens if we
> don't flush, but resume doesn't proceed? In which case
> of resume are you thinking that it returns without having flushed?

alan: I apologize, I realize I should have explained it better.
The flush is needed for when we DON'T unroll. I wanted to point
out that in intel_gt_suspend_foo we don't actually know
if the suspend is going to unroll or not, so we should always flush
properly in that case. I will re-rev with a better comment on this patch.

alan:snip
> 
> >  
> >  static void wait_for_suspend(struct intel_gt *gt)
> >  {
> > -	if (!intel_gt_pm_is_awake(gt))
> > +	if (!intel_gt_pm_is_awake(gt)) {
> > +		intel_uc_suspend_prepare(&gt->uc);
> 
> why only on idle?

alan: actually no - I am flushing for both idle and non-idle cases (see further
below), but for the non-idle case, I am skipping the flush if we decide to wedge
(since the wedge will clear up everything -> all guc-contexts that are in flight
are nuked and freed).
> 
> Well, I know, if we are in idle it is because all the requests had
> already ended and gt will be wedged, but why do we need to do anything
> if we are in idle?

Because the issue is a very late context-deregister that
is triggered by an orthogonal thread via the following callstack:
a drm-fence signal triggers a FENCE_FREE via __i915_sw_fence_notify that
connects to engines_notify -> free_engines_rcu -> (thread) ->
intel_context_put -> kref_put(&ce->ref..) that queues the
context-destruction worker. That said, even though the gt is idle because
the last workload has been retired, a context-destruction worker
may have just gotten queued.

> 
> And why here and not some upper layer? like in prepare....
alan: wait_for_suspend does both the checking for idle as well as the potential
wedging if the guc or hw has hung at this late stage. If I call it from the upper
layer before this wait_for_suspend, it might be too early because the
wait-for-idle could experience workloads completing and new context-deregistrations
being queued. If I call it from the upper layer after wait_for_suspend, then it would
be unnecessary if wait_for_suspend decided to nuke everything. Hmmm.. but I guess
the latter could work too - since the nuke case would have emptied out the linked list
of that worker, so it would either run and do nothing or would not even be queued.
Would you rather I go that way? (I'll recheck with my team mates for an i-dotting/t-crossing
discussion.)
> 
> >  		return;
> > +	}
> >  
> >  	if (intel_gt_wait_for_idle(gt, I915_GT_SUSPEND_IDLE_TIMEOUT) == -ETIME) {
> >  		/*
> > @@ -299,6 +301,8 @@ static void wait_for_suspend(struct intel_gt *gt)
> >  		 */
> >  		intel_gt_set_wedged(gt);
> >  		intel_gt_retire_requests(gt);
> > +	} else {
> > +		intel_uc_suspend_prepare(&gt->uc);
> >  	}
> >  
> >  	intel_gt_pm_wait_for_idle(gt);
alan:snip
Alan Previn Aug. 15, 2023, 12:49 a.m. UTC | #3
> 

> > Rodrigo: And why here and not some upper layer? like in prepare....
> alan: wait_for_suspend does both the checking for idle as well as the potential
> wedging if the guc or hw has hung at this late stage. If I call it from the upper
> layer before this wait_for_suspend, it might be too early because the
> wait-for-idle could experience workloads completing and new context-deregistrations
> being queued. If I call it from the upper layer after wait_for_suspend, then it would
> be unnecessary if wait_for_suspend decided to nuke everything. Hmmm.. but I guess
> the latter could work too - since the nuke case would have emptied out the linked list
> of that worker, so it would either run and do nothing or would not even be queued.
> Would you rather I go that way? (I'll recheck with my team mates for an i-dotting/t-crossing
> discussion.)

Actually, after going up a layer, I realize the right place might be to insert
late-stage worker-flushing into intel_uc_suspend (called from intel_gt_suspend_late),
which is also where the gsc worker is flushed. This will also mean we don't need to
create intel_uc_suspend_prepare for new plumbing.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 5a942af0a14e..3162d859ed68 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -289,8 +289,10 @@  int intel_gt_resume(struct intel_gt *gt)
 
 static void wait_for_suspend(struct intel_gt *gt)
 {
-	if (!intel_gt_pm_is_awake(gt))
+	if (!intel_gt_pm_is_awake(gt)) {
+		intel_uc_suspend_prepare(&gt->uc);
 		return;
+	}
 
 	if (intel_gt_wait_for_idle(gt, I915_GT_SUSPEND_IDLE_TIMEOUT) == -ETIME) {
 		/*
@@ -299,6 +301,8 @@  static void wait_for_suspend(struct intel_gt *gt)
 		 */
 		intel_gt_set_wedged(gt);
 		intel_gt_retire_requests(gt);
+	} else {
+		intel_uc_suspend_prepare(&gt->uc);
 	}
 
 	intel_gt_pm_wait_for_idle(gt);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..dc7735a19a5a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1578,6 +1578,11 @@  static void guc_flush_submissions(struct intel_guc *guc)
 	spin_unlock_irqrestore(&sched_engine->lock, flags);
 }
 
+void intel_guc_submission_suspend_prepare(struct intel_guc *guc)
+{
+	flush_work(&guc->submission_state.destroyed_worker);
+}
+
 static void guc_flush_destroyed_contexts(struct intel_guc *guc);
 
 void intel_guc_submission_reset_prepare(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c57b29cdb1a6..7f0705ece74b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -38,6 +38,8 @@  int intel_guc_wait_for_pending_msg(struct intel_guc *guc,
 				   bool interruptible,
 				   long timeout);
 
+void intel_guc_submission_suspend_prepare(struct intel_guc *guc);
+
 static inline bool intel_guc_submission_is_supported(struct intel_guc *guc)
 {
 	return guc->submission_supported;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 18250fb64bd8..468d7b397927 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -679,6 +679,13 @@  void intel_uc_runtime_suspend(struct intel_uc *uc)
 	guc_disable_communication(guc);
 }
 
+void intel_uc_suspend_prepare(struct intel_uc *uc)
+{
+	struct intel_guc *guc = &uc->guc;
+
+	intel_guc_submission_suspend_prepare(guc);
+}
+
 void intel_uc_suspend(struct intel_uc *uc)
 {
 	struct intel_guc *guc = &uc->guc;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.h b/drivers/gpu/drm/i915/gt/uc/intel_uc.h
index 014bb7d83689..036877a07261 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.h
@@ -49,6 +49,7 @@  void intel_uc_reset_prepare(struct intel_uc *uc);
 void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled);
 void intel_uc_reset_finish(struct intel_uc *uc);
 void intel_uc_cancel_requests(struct intel_uc *uc);
+void intel_uc_suspend_prepare(struct intel_uc *uc);
 void intel_uc_suspend(struct intel_uc *uc);
 void intel_uc_runtime_suspend(struct intel_uc *uc);
 int intel_uc_resume(struct intel_uc *uc);