[04/20] drm/i915: TDR / per-engine hang detection

Message ID	1452706112-8617-5-git-send-email-arun.siluvery@linux.intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Arun Siluvery <arun.siluvery@linux.intel.com> To: intel-gfx@lists.freedesktop.org Date: Wed, 13 Jan 2016 17:28:16 +0000 Message-Id: <1452706112-8617-5-git-send-email-arun.siluvery@linux.intel.com> In-Reply-To: <1452706112-8617-1-git-send-email-arun.siluvery@linux.intel.com> References: <1452706112-8617-1-git-send-email-arun.siluvery@linux.intel.com> Cc: Tomas Elf <tomas.elf@intel.com> Subject: [Intel-gfx] [PATCH 04/20] drm/i915: TDR / per-engine hang detection Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Message ID

1452706112-8617-5-git-send-email-arun.siluvery@linux.intel.com (mailing list archive)

State

New, archived

Headers

From: Arun Siluvery <arun.siluvery@linux.intel.com>
To: intel-gfx@lists.freedesktop.org
Date: Wed, 13 Jan 2016 17:28:16 +0000
Message-Id: <1452706112-8617-5-git-send-email-arun.siluvery@linux.intel.com>
In-Reply-To: <1452706112-8617-1-git-send-email-arun.siluvery@linux.intel.com>
References: <1452706112-8617-1-git-send-email-arun.siluvery@linux.intel.com>
Cc: Tomas Elf <tomas.elf@intel.com>
Subject: [Intel-gfx] [PATCH 04/20] drm/i915: TDR / per-engine hang detection
Precedence: list
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Commit Message

arun.siluvery@linux.intel.com Jan. 13, 2016, 5:28 p.m. UTC

From: Tomas Elf <tomas.elf@intel.com>

With the per-engine hang recovery path already in place this patch adds
per-engine hang detection by letting the periodic hang checker detect hangs on
individual engines and communicate this to the error handler. During hang
checking every engine is checked and the hang detection status for each engine
is aggregated into a single 32-bit engine flag mask that contains all the
engine flags (1 << ring->id) of all the hung engines or'ed together. The
per-engine path in the error handler then sets up the hangcheck state for each
invidual, hung engine based on the engine flag mask before potentially calling
the per-engine hang recovery path.

This allows the hang detection to happen in lock-step for all engines in
parallel and lets the driver process all hung engines in turn in the error
handler.

Signed-off-by: Tomas Elf <tomas.elf@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c |  2 +-
 drivers/gpu/drm/i915/i915_drv.h     |  4 ++--
 drivers/gpu/drm/i915/i915_irq.c     | 41 +++++++++++++++++++++++--------------
 3 files changed, 29 insertions(+), 18 deletions(-)

Comments

Chris Wilson Jan. 13, 2016, 8:37 p.m. UTC | #1

On Wed, Jan 13, 2016 at 05:28:16PM +0000, Arun Siluvery wrote:
> From: Tomas Elf <tomas.elf@intel.com>
> 
> With the per-engine hang recovery path already in place this patch adds
> per-engine hang detection by letting the periodic hang checker detect hangs on
> individual engines and communicate this to the error handler. During hang
> checking every engine is checked and the hang detection status for each engine
> is aggregated into a single 32-bit engine flag mask that contains all the
> engine flags (1 << ring->id) of all the hung engines or'ed together. The
> per-engine path in the error handler then sets up the hangcheck state for each
> invidual, hung engine based on the engine flag mask before potentially calling
> the per-engine hang recovery path.
> 
> This allows the hang detection to happen in lock-step for all engines in
> parallel and lets the driver process all hung engines in turn in the error
> handler.
> 
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.h     |  4 ++--
>  drivers/gpu/drm/i915/i915_irq.c     | 41 +++++++++++++++++++++++--------------
>  3 files changed, 29 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index e3377ab..6d1b6c3 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4720,7 +4720,7 @@ i915_wedged_set(void *data, u64 val)
>  
>  	intel_runtime_pm_get(dev_priv);
>  
> -	i915_handle_error(dev, val,
> +	i915_handle_error(dev, 0x0, val,
>  			  "Manually setting wedged to %llu", val);
>  
>  	intel_runtime_pm_put(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e866f14..85cf692 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2765,8 +2765,8 @@ static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine)
>  
>  /* i915_irq.c */
>  void i915_queue_hangcheck(struct drm_device *dev);
> -__printf(3, 4)
> -void i915_handle_error(struct drm_device *dev, bool wedged,
> +__printf(4, 5)
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
>  		       const char *fmt, ...);
>  
>  extern void intel_irq_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6a0ec37..fef74cf 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2712,15 +2712,29 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
>  
>  /**
>   * i915_handle_error - handle a gpu error
> - * @dev: drm device
> + * @dev: 		drm device
> + * @engine_mask: 	Bit mask containing the engine flags of all engines
> + *			associated with one or more detected errors.
> + *			May be 0x0.
> + *			If wedged is set to true this implies that one or more
> + *			engine hangs were detected. In this case we will
> + *			attempt to reset all engines that have been detected
> + *			as hung.
> + *			If a previous engine reset was attempted too recently
> + *			or if one of the current engine resets fails we fall
> + *			back to legacy full GPU reset.
> + * @wedged: 		true = Hang detected, invoke hang recovery.

These two look to be tautological. If wedged == 0, we expect engine_mask
to be 0 zero as well since we will not be resetting the engines, just
capturing error state. It's not though. Conversely if wedged == 1, we
expect engine_mask to be set. Again, it's not and I think there the
caller is wrong.

> + * @fmt, ...: 		Error message describing reason for error.
>   *
>   * Do some basic checking of register state at error time and
>   * dump it to the syslog.  Also call i915_capture_error_state() to make
>   * sure we get a record and make it available in debugfs.  Fire a uevent
>   * so userspace knows something bad happened (should trigger collection
> - * of a ring dump etc.).
> + * of a ring dump etc.). If a hang was detected (wedged = true) try to
> + * reset the associated engine. Failing that, try to fall back to legacy
> + * full GPU reset recovery mode.
>   */
> -void i915_handle_error(struct drm_device *dev, bool wedged,
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
>  		       const char *fmt, ...)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -2729,12 +2743,6 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
>  
>  	struct intel_engine_cs *engine;
>  
> -	/*
> -	 * NB: Placeholder until the hang checker supports
> -	 * per-engine hang detection.
> -	 */
> -	u32 engine_mask = 0;
> -
>  	va_start(args, fmt);
>  	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
>  	va_end(args);
> @@ -3162,7 +3170,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
>  	 */
>  	tmp = I915_READ_CTL(ring);
>  	if (tmp & RING_WAIT) {
> -		i915_handle_error(dev, false,
> +		i915_handle_error(dev, intel_ring_flag(ring), false,
>  				  "Kicking stuck wait on %s",
>  				  ring->name);
>  		I915_WRITE_CTL(ring, tmp);
> @@ -3174,7 +3182,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
>  		default:
>  			return HANGCHECK_HUNG;
>  		case 1:
> -			i915_handle_error(dev, false,
> +			i915_handle_error(dev, intel_ring_flag(ring), false,
>  					  "Kicking stuck semaphore on %s",
>  					  ring->name);
>  			I915_WRITE_CTL(ring, tmp);
> @@ -3203,7 +3211,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  	struct drm_device *dev = dev_priv->dev;
>  	struct intel_engine_cs *ring;
>  	int i;
> -	int busy_count = 0, rings_hung = 0;
> +	u32 engine_mask = 0;
> +	int busy_count = 0;
>  	bool stuck[I915_NUM_RINGS] = { 0 };
>  #define BUSY 1
>  #define KICK 5
> @@ -3316,12 +3325,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  			DRM_INFO("%s on %s\n",
>  				 stuck[i] ? "stuck" : "no progress",
>  				 ring->name);
> -			rings_hung++;
> +
> +			engine_mask |= intel_ring_flag(ring);
> +			ring->hangcheck.tdr_count++;

tdr_count was introduced unused in the previous patch and here it has
nothing to do with tdr, but just hangcheck pure and simple. It would
actually be a useful statistic for hangcheck/error-state...
-Chris

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index e3377ab..6d1b6c3 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4720,7 +4720,7 @@  i915_wedged_set(void *data, u64 val)
 
 	intel_runtime_pm_get(dev_priv);
 
-	i915_handle_error(dev, val,
+	i915_handle_error(dev, 0x0, val,
 			  "Manually setting wedged to %llu", val);
 
 	intel_runtime_pm_put(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e866f14..85cf692 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2765,8 +2765,8 @@  static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine)
 
 /* i915_irq.c */
 void i915_queue_hangcheck(struct drm_device *dev);
-__printf(3, 4)
-void i915_handle_error(struct drm_device *dev, bool wedged,
+__printf(4, 5)
+void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
 		       const char *fmt, ...);
 
 extern void intel_irq_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6a0ec37..fef74cf 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2712,15 +2712,29 @@  static void i915_report_and_clear_eir(struct drm_device *dev)
 
 /**
  * i915_handle_error - handle a gpu error
- * @dev: drm device
+ * @dev: 		drm device
+ * @engine_mask: 	Bit mask containing the engine flags of all engines
+ *			associated with one or more detected errors.
+ *			May be 0x0.
+ *			If wedged is set to true this implies that one or more
+ *			engine hangs were detected. In this case we will
+ *			attempt to reset all engines that have been detected
+ *			as hung.
+ *			If a previous engine reset was attempted too recently
+ *			or if one of the current engine resets fails we fall
+ *			back to legacy full GPU reset.
+ * @wedged: 		true = Hang detected, invoke hang recovery.
+ * @fmt, ...: 		Error message describing reason for error.
  *
  * Do some basic checking of register state at error time and
  * dump it to the syslog.  Also call i915_capture_error_state() to make
  * sure we get a record and make it available in debugfs.  Fire a uevent
  * so userspace knows something bad happened (should trigger collection
- * of a ring dump etc.).
+ * of a ring dump etc.). If a hang was detected (wedged = true) try to
+ * reset the associated engine. Failing that, try to fall back to legacy
+ * full GPU reset recovery mode.
  */
-void i915_handle_error(struct drm_device *dev, bool wedged,
+void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
 		       const char *fmt, ...)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -2729,12 +2743,6 @@  void i915_handle_error(struct drm_device *dev, bool wedged,
 
 	struct intel_engine_cs *engine;
 
-	/*
-	 * NB: Placeholder until the hang checker supports
-	 * per-engine hang detection.
-	 */
-	u32 engine_mask = 0;
-
 	va_start(args, fmt);
 	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
 	va_end(args);
@@ -3162,7 +3170,7 @@  ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 	 */
 	tmp = I915_READ_CTL(ring);
 	if (tmp & RING_WAIT) {
-		i915_handle_error(dev, false,
+		i915_handle_error(dev, intel_ring_flag(ring), false,
 				  "Kicking stuck wait on %s",
 				  ring->name);
 		I915_WRITE_CTL(ring, tmp);
@@ -3174,7 +3182,7 @@  ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 		default:
 			return HANGCHECK_HUNG;
 		case 1:
-			i915_handle_error(dev, false,
+			i915_handle_error(dev, intel_ring_flag(ring), false,
 					  "Kicking stuck semaphore on %s",
 					  ring->name);
 			I915_WRITE_CTL(ring, tmp);
@@ -3203,7 +3211,8 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 	struct drm_device *dev = dev_priv->dev;
 	struct intel_engine_cs *ring;
 	int i;
-	int busy_count = 0, rings_hung = 0;
+	u32 engine_mask = 0;
+	int busy_count = 0;
 	bool stuck[I915_NUM_RINGS] = { 0 };
 #define BUSY 1
 #define KICK 5
@@ -3316,12 +3325,14 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 			DRM_INFO("%s on %s\n",
 				 stuck[i] ? "stuck" : "no progress",
 				 ring->name);
-			rings_hung++;
+
+			engine_mask |= intel_ring_flag(ring);
+			ring->hangcheck.tdr_count++;
 		}
 	}
 
-	if (rings_hung) {
-		i915_handle_error(dev, true, "Ring hung");
+	if (engine_mask) {
+		i915_handle_error(dev, engine_mask, true, "Ring hung (0x%02x)", engine_mask);
 		goto out;
 	}

[04/20] drm/i915: TDR / per-engine hang detection

Commit Message

Comments

Patch