diff mbox

drm/i915: Look for active requests earlier in the reset path

Message ID 20170518182257.23241-1-michel.thierry@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michel Thierry May 18, 2017, 6:22 p.m. UTC
And store the active request so that we only search for it once; this
applies for reset-engine and full reset.

v2: Check for request completion inside _prepare_engine, don't use
ECANCELED, remove unnecessary null checks (Chris).

v3: Capture active requests during reset_prepare and store it the
engine hangcheck obj.

v4: Rename commit, change i915_gem_reset_request to just confirm the
active_request is still incomplete, instead of engine_stalled (Chris).

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>

fixes

Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c         | 11 ++++++-----
 drivers/gpu/drm/i915/i915_drv.h         |  3 ++-
 drivers/gpu/drm/i915/i915_gem.c         | 34 +++++++++++++++++++++------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
 4 files changed, 31 insertions(+), 18 deletions(-)

Comments

Michel Thierry May 18, 2017, 6:26 p.m. UTC | #1
On 5/18/2017 11:22 AM, Michel Thierry wrote:
> fixes
>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>

rebase mistake
Chris Wilson May 18, 2017, 7:55 p.m. UTC | #2
On Thu, May 18, 2017 at 11:22:57AM -0700, Michel Thierry wrote:
> And store the active request so that we only search for it once; this
> applies for reset-engine and full reset.
> 
> v2: Check for request completion inside _prepare_engine, don't use
> ECANCELED, remove unnecessary null checks (Chris).
> 
> v3: Capture active requests during reset_prepare and store it the
> engine hangcheck obj.
> 
> v4: Rename commit, change i915_gem_reset_request to just confirm the
> active_request is still incomplete, instead of engine_stalled (Chris).
> 
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> 
> fixes
> 
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         | 11 ++++++-----
>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++-
>  drivers/gpu/drm/i915/i915_gem.c         | 34 +++++++++++++++++++++------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>  4 files changed, 31 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index d62793805794..ec719376fc24 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1900,15 +1900,16 @@ int i915_reset_engine(struct intel_engine_cs *engine)
>  
>  	DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
>  
> -	ret = i915_gem_reset_prepare_engine(engine);
> -	if (ret) {
> -		DRM_ERROR("Previous reset failed - promote to full reset\n");
> +	engine->hangcheck.active_request = i915_gem_reset_prepare_engine(engine);

Whilst this is not wrong (since we are serialising the per-engine and
global resets), I would suggest we avoid storing the request in the
hangcheck here and just pass the request along to
i915_gem_request_engine.

No strong reason, just less magic state passing between functions.

> +	if (IS_ERR(engine->hangcheck.active_request)) {
> +		DRM_DEBUG_DRIVER("Previous reset failed, promote to full reset\n");
> +		ret = PTR_ERR(engine->hangcheck.active_request);
>  		goto out;
>  	}
>  
> index b5dc073a5ddc..c9f139b322d2 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2793,12 +2793,14 @@ static bool engine_stalled(struct intel_engine_cs *engine)
>  	return true;
>  }
>  
> -/* Ensure irq handler finishes, and not run again. */
> -int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> +/*
> + * Ensure irq handler finishes, and not run again.
> + * Also store the active request so that we only search for it once.
> + */
> +struct drm_i915_gem_request *
> +i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
>  {
> -	struct drm_i915_gem_request *request;
> -	int err = 0;
> -
> +	struct drm_i915_gem_request *request = NULL;
>  
>  	/* Prevent the signaler thread from updating the request
>  	 * state (by calling dma_fence_signal) as we are processing
> @@ -2827,21 +2829,30 @@ int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
>  
>  	if (engine_stalled(engine)) {
>  		request = i915_gem_find_active_request(engine);
> +

If we neuter the return beneath the if, this blank line can also go.

>  		if (request && request->fence.error == -EIO)
> -			err = -EIO; /* Previous reset failed! */
> +			return ERR_PTR(-EIO); /* Previous reset failed! */

request = ERR_PTR(-EIO); and then keep the single return.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index d62793805794..ec719376fc24 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1900,15 +1900,16 @@  int i915_reset_engine(struct intel_engine_cs *engine)
 
 	DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
 
-	ret = i915_gem_reset_prepare_engine(engine);
-	if (ret) {
-		DRM_ERROR("Previous reset failed - promote to full reset\n");
+	engine->hangcheck.active_request = i915_gem_reset_prepare_engine(engine);
+	if (IS_ERR(engine->hangcheck.active_request)) {
+		DRM_DEBUG_DRIVER("Previous reset failed, promote to full reset\n");
+		ret = PTR_ERR(engine->hangcheck.active_request);
 		goto out;
 	}
 
 	/*
-	 * the request that caused the hang is stuck on elsp, identify the
-	 * active request and drop it, adjust head to skip the offending
+	 * the request that caused the hang is stuck on elsp, we know the
+	 * active request and can drop it, adjust head to skip the offending
 	 * request to resume executing remaining requests in the queue.
 	 */
 	i915_gem_reset_engine(engine);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a5b9c666b3bf..6cbfeaa02246 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3370,7 +3370,8 @@  static inline u32 i915_reset_count(struct i915_gpu_error *error)
 	return READ_ONCE(error->reset_count);
 }
 
-int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
+struct drm_i915_gem_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
 void i915_gem_reset(struct drm_i915_private *dev_priv);
 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b5dc073a5ddc..c9f139b322d2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2793,12 +2793,14 @@  static bool engine_stalled(struct intel_engine_cs *engine)
 	return true;
 }
 
-/* Ensure irq handler finishes, and not run again. */
-int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
+/*
+ * Ensure irq handler finishes, and not run again.
+ * Also store the active request so that we only search for it once.
+ */
+struct drm_i915_gem_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *request;
-	int err = 0;
-
+	struct drm_i915_gem_request *request = NULL;
 
 	/* Prevent the signaler thread from updating the request
 	 * state (by calling dma_fence_signal) as we are processing
@@ -2827,21 +2829,30 @@  int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 
 	if (engine_stalled(engine)) {
 		request = i915_gem_find_active_request(engine);
+
 		if (request && request->fence.error == -EIO)
-			err = -EIO; /* Previous reset failed! */
+			return ERR_PTR(-EIO); /* Previous reset failed! */
 	}
 
-	return err;
+	return request;
 }
 
 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
 {
 	struct intel_engine_cs *engine;
+	struct drm_i915_gem_request *request;
 	enum intel_engine_id id;
 	int err = 0;
 
-	for_each_engine(engine, dev_priv, id)
-		err = i915_gem_reset_prepare_engine(engine);
+	for_each_engine(engine, dev_priv, id) {
+		request = i915_gem_reset_prepare_engine(engine);
+		if (IS_ERR(request)) {
+			err = PTR_ERR(request);
+			break;
+		}
+
+		engine->hangcheck.active_request = request;
+	}
 
 	i915_gem_revoke_fences(dev_priv);
 
@@ -2894,7 +2905,7 @@  static void engine_skip_context(struct drm_i915_gem_request *request)
 static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
 {
 	/* Read once and return the resolution */
-	const bool guilty = engine_stalled(request->engine);
+	const bool guilty = !i915_gem_request_completed(request);
 
 	/* The guilty request will get skipped on a hung engine.
 	 *
@@ -2930,9 +2941,8 @@  static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
 
 void i915_gem_reset_engine(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *request;
+	struct drm_i915_gem_request *request = engine->hangcheck.active_request;
 
-	request = i915_gem_find_active_request(engine);
 	if (request && i915_gem_reset_request(request)) {
 		DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
 				 engine->name, request->global_seqno);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index ec16fb6fde62..f850c4b12337 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -121,6 +121,7 @@  struct intel_engine_hangcheck {
 	unsigned long action_timestamp;
 	int deadlock;
 	struct intel_instdone instdone;
+	struct drm_i915_gem_request *active_request;
 	bool stalled;
 };