diff mbox

[2/7] drm/i915/tdr: Modify error handler for per engine hang recovery

Message ID 1474299019-26430-2-git-send-email-matthew.auld@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Matthew Auld Sept. 19, 2016, 3:30 p.m. UTC
From: "arun.siluvery@linux.intel.com" <arun.siluvery@linux.intel.com>

This is a preparatory patch which modifies error handler to do per engine
hang recovery. The actual patch which implements this sequence follows
later in the series. The aim is to prepare existing recovery function to
adapt to this new function where applicable (which fails at this point
because core implementation is lacking) and continue recovery using legacy
full gpu reset.

A helper function is also added to query the availability of engine
reset.

The error events behaviour that are used to notify user of reset are
adapted to engine reset such that it doesn't break users listening to these
events. In legacy we report an error event, a reset event before resetting
the gpu and a reset done event marking the completion of reset. The same
behaviour is adapted but reset event is only dispatched once even when
multiple engines are hung. Finally once reset is complete we send reset
done event as usual.

v2
  - rework slightly
  - prefer INTEL_GEN
  - document engine_mask param

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Ian Lister <ian.lister@intel.com>
Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c     | 26 +++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.h     |  2 ++
 drivers/gpu/drm/i915/i915_irq.c     | 50 +++++++++++++++++++++++++++++++++----
 drivers/gpu/drm/i915/intel_uncore.c |  5 ++++
 4 files changed, 78 insertions(+), 5 deletions(-)

Comments

Chris Wilson Sept. 19, 2016, 7:52 p.m. UTC | #1
On Mon, Sep 19, 2016 at 04:30:14PM +0100, Matthew Auld wrote:
> From: "arun.siluvery@linux.intel.com" <arun.siluvery@linux.intel.com>
> 
> This is a preparatory patch which modifies error handler to do per engine
> hang recovery. The actual patch which implements this sequence follows
> later in the series. The aim is to prepare existing recovery function to
> adapt to this new function where applicable (which fails at this point
> because core implementation is lacking) and continue recovery using legacy
> full gpu reset.
> 
> A helper function is also added to query the availability of engine
> reset.
> 
> The error events behaviour that are used to notify user of reset are
> adapted to engine reset such that it doesn't break users listening to these
> events. In legacy we report an error event, a reset event before resetting
> the gpu and a reset done event marking the completion of reset. The same
> behaviour is adapted but reset event is only dispatched once even when
> multiple engines are hung. Finally once reset is complete we send reset
> done event as usual.
> 
> v2
>   - rework slightly
>   - prefer INTEL_GEN
>   - document engine_mask param
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Ian Lister <ian.lister@intel.com>
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Matthew Auld <matthew.auld@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c     | 26 +++++++++++++++++++
>  drivers/gpu/drm/i915/i915_drv.h     |  2 ++
>  drivers/gpu/drm/i915/i915_irq.c     | 50 +++++++++++++++++++++++++++++++++----
>  drivers/gpu/drm/i915/intel_uncore.c |  5 ++++
>  4 files changed, 78 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 7f4e8ad..99fa690 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1804,6 +1804,32 @@ error:
>  	goto wakeup;
>  }
>  
> +/**
> + * i915_reset_engine - reset GPU engine to recover from a hang
> + * @engine: engine to reset
> + *
> + * Reset a specific GPU engine. Useful if a hang is detected.
> + * Returns zero on successful reset or otherwise an error code.
> + *
> + * Procedure is fairly simple:
> + *  - force engine to idle
> + *  - save current state which includes head and current request
> + *  - reset engine
> + *  - restore saved state and resubmit context
> + */
> +int i915_reset_engine(struct intel_engine_cs *engine)
> +{
> +	int ret;
> +	struct drm_i915_private *dev_priv = engine->i915;
> +
> +	/* FIXME: replace me with engine reset sequence */
> +	ret = -ENODEV;
> +
> +	set_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags);
> +
> +	return ret;
> +}
> +
>  static int i915_pm_suspend(struct device *kdev)
>  {
>  	struct pci_dev *pdev = to_pci_dev(kdev);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4dd307e..79de74d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2888,6 +2888,8 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
>  extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
>  extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
>  extern void i915_reset(struct drm_i915_private *dev_priv);
> +extern bool intel_has_engine_reset(struct drm_i915_private *dev_priv);
> +extern int i915_reset_engine(struct intel_engine_cs *engine);
>  extern int intel_guc_reset(struct drm_i915_private *dev_priv);
>  extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
>  extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index c128fdb..45c5a26 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2487,11 +2487,13 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv)
>  /**
>   * i915_reset_and_wakeup - do process context error handling work
>   * @dev_priv: i915 device private
> + * @engine_mask: engines which are marked as hung
>   *
>   * Fire an error uevent so userspace can see that a hang or error
>   * was detected.
>   */
> -static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
> +static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv,
> +				  u32 engine_mask)
>  {
>  	struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
>  	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
> @@ -2501,6 +2503,15 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
>  
>  	DRM_DEBUG_DRIVER("resetting chip\n");
> +	/*
> +	 * This event needs to be sent before performing gpu reset. When
> +	 * engine resets are supported we iterate through all engines and
> +	 * reset hung engines individually. To keep the event dispatch
> +	 * mechanism consistent with full gpu reset, this is only sent once
> +	 * even when multiple engines are hung. It is also safe to move this
> +	 * here because when we are in this function, we will definitely
> +	 * perform gpu reset.
> +	 */
>  	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
>  
>  	/*
> @@ -2511,6 +2522,28 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  	 * simulated reset via debugs, so get an RPM reference.
>  	 */
>  	intel_runtime_pm_get(dev_priv);
> +
> +	/*
> +	 * First attempt to reset the engines which are marked as hung. If that
> +	 * fails then we fallback to doing a full gpu reset.
> +	 */
> +	if (intel_has_engine_reset(dev_priv)) {
> +		struct intel_engine_cs *engine;
> +		unsigned int tmp;
> +		int ret = 0;
> +
> +		for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> +			ret = i915_reset_engine(engine);
> +			if (ret)
> +				break;
> +		}
> +
> +		if (!ret) {
> +			DRM_DEBUG_DRIVER("reset hung engines.\n");
> +			goto reset_done;
> +		}
> +	}
> +
>  	intel_prepare_reset(dev_priv);
>  
>  	do {
> @@ -2532,6 +2565,8 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  				     HZ));
>  
>  	intel_finish_reset(dev_priv);
> +
> +reset_done:
>  	intel_runtime_pm_put(dev_priv);
>  
>  	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
> @@ -2640,6 +2675,8 @@ static void i915_report_and_clear_eir(struct drm_i915_private *dev_priv)
>   * i915_handle_error - handle a gpu error
>   * @dev_priv: i915 device private
>   * @engine_mask: mask representing engines that are hung
> + * @fmt: formatted hang msg that gets logged in captured error state
> + *
>   * Do some basic checking of register state at error time and
>   * dump it to the syslog.  Also call i915_capture_error_state() to make
>   * sure we get a record and make it available in debugfs.  Fire a uevent
> @@ -2664,9 +2701,12 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
>  	if (!engine_mask)
>  		return;
>  
> -	if (test_and_set_bit(I915_RESET_IN_PROGRESS,
> -			     &dev_priv->gpu_error.flags))
> -		return;

No. This is a serialised test for a reason.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 7f4e8ad..99fa690 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1804,6 +1804,32 @@  error:
 	goto wakeup;
 }
 
+/**
+ * i915_reset_engine - reset GPU engine to recover from a hang
+ * @engine: engine to reset
+ *
+ * Reset a specific GPU engine. Useful if a hang is detected.
+ * Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is fairly simple:
+ *  - force engine to idle
+ *  - save current state which includes head and current request
+ *  - reset engine
+ *  - restore saved state and resubmit context
+ */
+int i915_reset_engine(struct intel_engine_cs *engine)
+{
+	int ret;
+	struct drm_i915_private *dev_priv = engine->i915;
+
+	/* FIXME: replace me with engine reset sequence */
+	ret = -ENODEV;
+
+	set_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags);
+
+	return ret;
+}
+
 static int i915_pm_suspend(struct device *kdev)
 {
 	struct pci_dev *pdev = to_pci_dev(kdev);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4dd307e..79de74d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2888,6 +2888,8 @@  extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
 extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
 extern void i915_reset(struct drm_i915_private *dev_priv);
+extern bool intel_has_engine_reset(struct drm_i915_private *dev_priv);
+extern int i915_reset_engine(struct intel_engine_cs *engine);
 extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index c128fdb..45c5a26 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2487,11 +2487,13 @@  static void i915_error_wake_up(struct drm_i915_private *dev_priv)
 /**
  * i915_reset_and_wakeup - do process context error handling work
  * @dev_priv: i915 device private
+ * @engine_mask: engines which are marked as hung
  *
  * Fire an error uevent so userspace can see that a hang or error
  * was detected.
  */
-static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
+static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv,
+				  u32 engine_mask)
 {
 	struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
@@ -2501,6 +2503,15 @@  static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
 
 	DRM_DEBUG_DRIVER("resetting chip\n");
+	/*
+	 * This event needs to be sent before performing gpu reset. When
+	 * engine resets are supported we iterate through all engines and
+	 * reset hung engines individually. To keep the event dispatch
+	 * mechanism consistent with full gpu reset, this is only sent once
+	 * even when multiple engines are hung. It is also safe to move this
+	 * here because when we are in this function, we will definitely
+	 * perform gpu reset.
+	 */
 	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
 
 	/*
@@ -2511,6 +2522,28 @@  static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	 * simulated reset via debugs, so get an RPM reference.
 	 */
 	intel_runtime_pm_get(dev_priv);
+
+	/*
+	 * First attempt to reset the engines which are marked as hung. If that
+	 * fails then we fallback to doing a full gpu reset.
+	 */
+	if (intel_has_engine_reset(dev_priv)) {
+		struct intel_engine_cs *engine;
+		unsigned int tmp;
+		int ret = 0;
+
+		for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
+			ret = i915_reset_engine(engine);
+			if (ret)
+				break;
+		}
+
+		if (!ret) {
+			DRM_DEBUG_DRIVER("reset hung engines.\n");
+			goto reset_done;
+		}
+	}
+
 	intel_prepare_reset(dev_priv);
 
 	do {
@@ -2532,6 +2565,8 @@  static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 				     HZ));
 
 	intel_finish_reset(dev_priv);
+
+reset_done:
 	intel_runtime_pm_put(dev_priv);
 
 	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
@@ -2640,6 +2675,8 @@  static void i915_report_and_clear_eir(struct drm_i915_private *dev_priv)
  * i915_handle_error - handle a gpu error
  * @dev_priv: i915 device private
  * @engine_mask: mask representing engines that are hung
+ * @fmt: formatted hang msg that gets logged in captured error state
+ *
  * Do some basic checking of register state at error time and
  * dump it to the syslog.  Also call i915_capture_error_state() to make
  * sure we get a record and make it available in debugfs.  Fire a uevent
@@ -2664,9 +2701,12 @@  void i915_handle_error(struct drm_i915_private *dev_priv,
 	if (!engine_mask)
 		return;
 
-	if (test_and_set_bit(I915_RESET_IN_PROGRESS,
-			     &dev_priv->gpu_error.flags))
-		return;
+	/*
+	 * Engine reset support is only available from Gen8 onwards so if
+	 * it is not available or explicity disabled, use full gpu reset.
+	 */
+	if (!intel_has_engine_reset(dev_priv))
+		set_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags);
 
 	/*
 	 * Wakeup waiting processes so that the reset function
@@ -2682,7 +2722,7 @@  void i915_handle_error(struct drm_i915_private *dev_priv,
 	 */
 	i915_error_wake_up(dev_priv);
 
-	i915_reset_and_wakeup(dev_priv);
+	i915_reset_and_wakeup(dev_priv, engine_mask);
 }
 
 /* Called from drm generic code, passed 'crtc' which
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index a9b6c93..bfd9b93 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1769,6 +1769,11 @@  bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
 	return intel_get_gpu_reset(dev_priv) != NULL;
 }
 
+bool intel_has_engine_reset(struct drm_i915_private *dev_priv)
+{
+	return (INTEL_GEN(dev_priv) >= 8 && i915.reset == 2);
+}
+
 int intel_guc_reset(struct drm_i915_private *dev_priv)
 {
 	int ret;