diff mbox

[05/20] drm/i915: Extending i915_gem_check_wedge to check engine reset in progress

Message ID 1452706112-8617-6-git-send-email-arun.siluvery@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

arun.siluvery@linux.intel.com Jan. 13, 2016, 5:28 p.m. UTC
From: Tomas Elf <tomas.elf@intel.com>

i915_gem_check_wedge now returns a non-zero result in three different cases:

1. Legacy: A hang has been detected and full GPU reset is in progress.

2. Per-engine recovery:

	a. A single engine reference can be passed to the function, in which
	case only that engine will be checked. If that particular engine is
	detected to be hung and is to be reset this will yield a non-zero
	result but not if reset is in progress for any other engine.

	b. No engine reference is passed to the function, in which case all
	engines are checked for ongoing per-engine hang recovery.

Also, i915_wait_request was updated to take advantage of this new
functionality. This is important since the TDR hang recovery mechanism needs a
way to force waiting threads that hold the struct_mutex to give up the
struct_mutex and try again after the hang recovery has completed. If
i915_wait_request does not take per-engine hang recovery into account there is
no way for a waiting thread to know that a per-engine recovery is about to
happen and that it needs to back off.

Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@intel.com>
Signed-off-by: Ian Lister <ian.lister@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  3 +-
 drivers/gpu/drm/i915/i915_gem.c         | 60 +++++++++++++++++++++++++++------
 drivers/gpu/drm/i915/intel_lrc.c        |  4 +--
 drivers/gpu/drm/i915/intel_ringbuffer.c |  4 ++-
 4 files changed, 56 insertions(+), 15 deletions(-)

Comments

Chris Wilson Jan. 13, 2016, 8:49 p.m. UTC | #1
On Wed, Jan 13, 2016 at 05:28:17PM +0000, Arun Siluvery wrote:
> From: Tomas Elf <tomas.elf@intel.com>
> 
> i915_gem_wedge now returns a non-zero result in three different cases:
> 
> 1. Legacy: A hang has been detected and full GPU reset is in progress.
> 
> 2. Per-engine recovery:
> 
> 	a. A single engine reference can be passed to the function, in which
> 	case only that engine will be checked. If that particular engine is
> 	detected to be hung and is to be reset this will yield a non-zero
> 	result but not if reset is in progress for any other engine.
> 
> 	b. No engine reference is passed to the function, in which case all
> 	engines are checked for ongoing per-engine hang recovery.
> 
> Also, i915_wait_request was updated to take advantage of this new
> functionality. This is important since the TDR hang recovery mechanism needs a
> way to force waiting threads that hold the struct_mutex to give up the
> struct_mutex and try again after the hang recovery has completed. If
> i915_wait_request does not take per-engine hang recovery into account there is
> no way for a waiting thread to know that a per-engine recovery is about to
> happen and that it needs to back off.
> 
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@intel.com>
> Signed-off-by: Ian Lister <ian.lister@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  3 +-
>  drivers/gpu/drm/i915/i915_gem.c         | 60 +++++++++++++++++++++++++++------
>  drivers/gpu/drm/i915/intel_lrc.c        |  4 +--
>  drivers/gpu/drm/i915/intel_ringbuffer.c |  4 ++-
>  4 files changed, 56 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 85cf692..5be7d3e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3033,7 +3033,8 @@ i915_gem_find_active_request(struct intel_engine_cs *ring);
>  
>  bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_engine_cs *ring);
> -int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> +int __must_check i915_gem_check_wedge(struct drm_i915_private *dev_priv,
> +				      struct intel_engine_cs *engine,
>  				      bool interruptible);
>  
>  static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index e3cfed2..e6eb45d 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -80,12 +80,38 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
>  	spin_unlock(&dev_priv->mm.object_stat_lock);
>  }
>  
> +static inline int
> +i915_engine_reset_in_progress(struct drm_i915_private *dev_priv,
> +	struct intel_engine_cs *engine)
> +{
> +	int ret = 0;
> +
> +	if (engine) {
> +		ret = !!(atomic_read(&dev_priv->ring[engine->id].hangcheck.flags)
> +			& I915_ENGINE_RESET_IN_PROGRESS);
> +	} else {
> +		int i;
> +
> +		for (i = 0; i < I915_NUM_RINGS; i++)
> +			if (atomic_read(&dev_priv->ring[i].hangcheck.flags)
> +				& I915_ENGINE_RESET_IN_PROGRESS) {
> +
> +				ret = 1;
> +				break;
> +			}
> +	}

Since this side will be called far more often than the writer, could you
not make this more convenient for the reader and move it to a global set
of flags in dev_priv->gpu_error?

To avoid regressing on the EIO front, the waiter sequence should look
like

if (req->reset_counter != i915_reset_counter(&req->i915->gpu_error))
	return 0;

if (flags & LOCKED && i915_engine_reset_in_progress(&req->i915->gpu_error, req->engine))
	return -EAGAIN;

Oh, and don't add a second boolean to __i915_wait_request, just
transform the first into flags.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 85cf692..5be7d3e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3033,7 +3033,8 @@  i915_gem_find_active_request(struct intel_engine_cs *ring);
 
 bool i915_gem_retire_requests(struct drm_device *dev);
 void i915_gem_retire_requests_ring(struct intel_engine_cs *ring);
-int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
+int __must_check i915_gem_check_wedge(struct drm_i915_private *dev_priv,
+				      struct intel_engine_cs *engine,
 				      bool interruptible);
 
 static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e3cfed2..e6eb45d 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -80,12 +80,38 @@  static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 	spin_unlock(&dev_priv->mm.object_stat_lock);
 }
 
+static inline int
+i915_engine_reset_in_progress(struct drm_i915_private *dev_priv,
+	struct intel_engine_cs *engine)
+{
+	int ret = 0;
+
+	if (engine) {
+		ret = !!(atomic_read(&dev_priv->ring[engine->id].hangcheck.flags)
+			& I915_ENGINE_RESET_IN_PROGRESS);
+	} else {
+		int i;
+
+		for (i = 0; i < I915_NUM_RINGS; i++)
+			if (atomic_read(&dev_priv->ring[i].hangcheck.flags)
+				& I915_ENGINE_RESET_IN_PROGRESS) {
+
+				ret = 1;
+				break;
+			}
+	}
+
+	return ret;
+}
+
 static int
-i915_gem_wait_for_error(struct i915_gpu_error *error)
+i915_gem_wait_for_error(struct drm_i915_private *dev_priv)
 {
 	int ret;
+	struct i915_gpu_error *error = &dev_priv->gpu_error;
 
 #define EXIT_COND (!i915_reset_in_progress(error) || \
+		   !i915_engine_reset_in_progress(dev_priv, NULL) || \
 		   i915_terminally_wedged(error))
 	if (EXIT_COND)
 		return 0;
@@ -114,7 +140,7 @@  int i915_mutex_lock_interruptible(struct drm_device *dev)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
+	ret = i915_gem_wait_for_error(dev_priv);
 	if (ret)
 		return ret;
 
@@ -1110,10 +1136,15 @@  put_rpm:
 }
 
 int
-i915_gem_check_wedge(struct i915_gpu_error *error,
+i915_gem_check_wedge(struct drm_i915_private *dev_priv,
+		     struct intel_engine_cs *engine,
 		     bool interruptible)
 {
-	if (i915_reset_in_progress(error)) {
+	struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+	if (i915_reset_in_progress(error) ||
+	    i915_engine_reset_in_progress(dev_priv, engine)) {
+
 		/* Non-interruptible callers can't handle -EAGAIN, hence return
 		 * -EIO unconditionally for these. */
 		if (!interruptible)
@@ -1253,6 +1284,7 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 	unsigned long timeout_expire;
 	s64 before, now;
 	int ret;
+	int reset_in_progress = 0;
 
 	WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
 
@@ -1297,11 +1329,17 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 
 		/* We need to check whether any gpu reset happened in between
 		 * the caller grabbing the seqno and now ... */
-		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
+		reset_in_progress =
+			i915_gem_check_wedge(ring->dev->dev_private, NULL, interruptible);
+
+		if ((reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) ||
+		     reset_in_progress) {
+
 			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
 			 * is truely gone. */
-			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
-			if (ret == 0)
+			if (reset_in_progress)
+				ret = reset_in_progress;
+			else
 				ret = -EAGAIN;
 			break;
 		}
@@ -1470,7 +1508,7 @@  i915_wait_request(struct drm_i915_gem_request *req)
 
 	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+	ret = i915_gem_check_wedge(dev_priv, NULL, interruptible);
 	if (ret)
 		return ret;
 
@@ -1560,7 +1598,7 @@  i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 	if (!obj->active)
 		return 0;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, true);
+	ret = i915_gem_check_wedge(dev_priv, NULL, true);
 	if (ret)
 		return ret;
 
@@ -4104,11 +4142,11 @@  i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	unsigned reset_counter;
 	int ret;
 
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
+	ret = i915_gem_wait_for_error(dev_priv);
 	if (ret)
 		return ret;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, false);
+	ret = i915_gem_check_wedge(dev_priv, NULL, false);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index fcec476..a2e56d4 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1066,8 +1066,8 @@  int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 
 	WARN_ON(req == NULL);
 	dev_priv = req->ring->dev->dev_private;
-
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+	ret = i915_gem_check_wedge(dev_priv,
+				   req->ring,
 				   dev_priv->mm.interruptible);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index def0dcf..f959326 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2501,8 +2501,10 @@  int intel_ring_begin(struct drm_i915_gem_request *req,
 	ring = req->ring;
 	dev_priv = ring->dev->dev_private;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+	ret = i915_gem_check_wedge(dev_priv,
+				   ring,
 				   dev_priv->mm.interruptible);
+
 	if (ret)
 		return ret;