@@ -1816,6 +1816,29 @@ error:
return ret;
}
+/**
+ * i915_reset_engine - recover a single hung GPU engine
+ * @engine: the engine to be reset
+ *
+ * Meant to reset only @engine, rather than the whole GPU, once a hang has
+ * been detected on it. The planned sequence is:
+ *  - force the engine to idle
+ *  - save the current state (head and current request)
+ *  - reset the engine
+ *  - restore the saved state and resubmit the context
+ *
+ * That sequence is not implemented yet, so for now every call fails with
+ * -ENODEV.
+ *
+ * Returns zero on a successful reset, otherwise an error code.
+ */
+int i915_reset_engine(struct intel_engine_cs *engine)
+{
+	/* FIXME: replace me with engine reset sequence */
+	return -ENODEV;
+}
+
static int i915_pm_suspend(struct device *dev)
{
struct pci_dev *pdev = to_pci_dev(dev);
@@ -2836,6 +2836,8 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
extern int i915_reset(struct drm_i915_private *dev_priv);
+extern bool intel_has_engine_reset(struct drm_i915_private *dev_priv);
+extern int i915_reset_engine(struct intel_engine_cs *engine);
extern int intel_guc_reset(struct drm_i915_private *dev_priv);
extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
@@ -2492,6 +2492,53 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv)
wake_up_all(&dev_priv->pending_flip_queue);
}
+/*
+ * Reset every engine that currently has an engine reset pending.
+ *
+ * Iterates over the engines of @dev_priv with for_each_engine() and calls
+ * i915_reset_engine() on each one for which
+ * i915_engine_reset_in_progress() is true. After a successful reset the
+ * engine's pending bit in gpu_error.flags is cleared and its entry in
+ * engine_reset_count[] is incremented.
+ *
+ * Returns 0 if every pending engine reset succeeded, otherwise the error
+ * code of the first reset that failed. The walk stops at the first
+ * failure, so the failing engine and any engine not yet visited keep
+ * their pending bits set; the caller has to deal with those.
+ */
+static int i915_reset_engines(struct drm_i915_private *dev_priv)
+{
+	struct i915_gpu_error *error = &dev_priv->gpu_error;
+	struct intel_engine_cs *engine;
+
+	for_each_engine(engine, dev_priv) {
+		int ret;
+
+		if (!i915_engine_reset_in_progress(error, engine))
+			continue;
+
+		ret = i915_reset_engine(engine);
+		if (ret) {
+			DRM_ERROR("Reset of %s failed! ret=%d\n",
+				  engine->name, ret);
+			return ret;
+		}
+
+		/*
+		 * Bit (engine->id + 1) of error->flags is this engine's
+		 * reset-pending bit, the same bit i915_handle_error() sets.
+		 */
+		clear_bit(engine->id + 1, &error->flags);
+		error->engine_reset_count[engine->id]++;
+	}
+
+	return 0;
+}
+
+/*
+ * Perform a full GPU reset, bracketed by intel_prepare_reset() and
+ * intel_finish_reset().
+ *
+ * Returns whatever i915_reset() returned; intel_finish_reset() is called
+ * regardless of whether the reset succeeded.
+ */
+static int i915_reset_full(struct drm_i915_private *dev_priv)
+{
+	int err;
+
+	/* the device has to be awake while it is being reset */
+	assert_rpm_wakelock_held(dev_priv);
+
+	intel_prepare_reset(dev_priv);
+
+	/*
+	 * Every piece of state _must_ be reset before the reset counter
+	 * is updated: otherwise waiters could miss the reset-pending
+	 * state, fail to drop their locks and deadlock against the
+	 * reset work.
+	 */
+	err = i915_reset(dev_priv);
+
+	intel_finish_reset(dev_priv);
+
+	return err;
+}
+
/**
* i915_reset_and_wakeup - do process context error handling work
* @dev_priv: i915 device private
@@ -2501,6 +2548,7 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv)
*/
static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
{
+ struct i915_gpu_error *error = &dev_priv->gpu_error;
struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
@@ -2509,7 +2557,15 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
- DRM_DEBUG_DRIVER("resetting chip\n");
+ /*
+ * This event needs to be sent before performing gpu reset. When
+ * engine resets are supported we iterate through all engines and
+ * reset hung engines individually. To keep the event dispatch
+ * mechanism consistent with full gpu reset, this is only sent once
+ * even when multiple engines are hung. It is also safe to move this
+ * here because when we are in this function, we will definitely
+ * perform gpu reset.
+ */
kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
/*
@@ -2521,29 +2577,57 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
*/
intel_runtime_pm_get(dev_priv);
- intel_prepare_reset(dev_priv);
+ if (!i915_full_gpu_reset_in_progress(error)) {
+ ret = i915_reset_engines(dev_priv);
+ if (ret) {
+ struct intel_engine_cs *e;
- /*
- * All state reset _must_ be completed before we update the
- * reset counter, for otherwise waiters might miss the reset
- * pending state and not properly drop locks, resulting in
- * deadlocks with the reset work.
- */
- ret = i915_reset(dev_priv);
+ /* attempt full gpu reset to recover */
+ set_bit(I915_RESET_IN_PROGRESS, &error->flags);
- intel_finish_reset(dev_priv);
+ /*
+ * when engine reset fails we switch to full gpu
+ * reset which clears everything; In the case where
+ * multiple engines are hung we would've already
+ * scheduled work items and when they attempt to do
+ * engine reset they won't find any active request
+ * (full gpu reset would've cleared it). To make
+ * the work items exit safely, clear engine reset
+ * pending mask.
+ */
+ for_each_engine(e, dev_priv) {
+ if (i915_engine_reset_in_progress(error, e))
+ clear_bit(e->id + 1, &error->flags);
+ }
+ }
+ }
- intel_runtime_pm_put(dev_priv);
+ /*
+ * Note that there's only one work item which does gpu resets, so we
+ * need not worry about concurrent gpu resets potentially incrementing
+ * error->reset_counter twice. We only need to take care of another
+ * racing irq/hangcheck declaring the gpu dead for a second time. A
+ * quick check for that is good enough: schedule_work ensures the
+ * correct ordering between hang detection and this work item, and since
+ * the reset in-progress bit is only ever set by code outside of this
+ * work we don't need to worry about any other races.
+ */
+ if (i915_full_gpu_reset_in_progress(&dev_priv->gpu_error)) {
+ DRM_DEBUG_DRIVER("resetting chip\n");
- if (ret == 0)
- kobject_uevent_env(kobj,
- KOBJ_CHANGE, reset_done_event);
+ ret = i915_reset_full(dev_priv);
+ }
/*
* Note: The wake_up also serves as a memory barrier so that
* waiters see the update value of the reset counter atomic_t.
*/
- i915_error_wake_up(dev_priv);
+ if (!i915_terminally_wedged(error)) {
+ wake_up_all(&dev_priv->gpu_error.reset_queue);
+ kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
+ }
+
+ intel_runtime_pm_put(dev_priv);
}
static void i915_report_and_clear_eir(struct drm_i915_private *dev_priv)
@@ -2641,6 +2725,8 @@ static void i915_report_and_clear_eir(struct drm_i915_private *dev_priv)
* i915_handle_error - handle a gpu error
* @dev_priv: i915 device private
* @engine_mask: mask representing engines that are hung
+ * @fmt: formatted hang msg that gets logged in captured error state
+ *
* Do some basic checking of register state at error time and
* dump it to the syslog. Also call i915_capture_error_state() to make
* sure we get a record and make it available in debugfs. Fire a uevent
@@ -2665,9 +2751,19 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
if (!engine_mask)
return;
- if (test_and_set_bit(I915_RESET_IN_PROGRESS,
- &dev_priv->gpu_error.flags))
- return;
+ if (intel_has_engine_reset(dev_priv)) {
+ struct intel_engine_cs *engine;
+ struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+ for_each_engine_masked(engine, dev_priv, engine_mask) {
+ if (i915_engine_reset_in_progress(error, engine))
+ continue;
+
+ set_bit(engine->id + 1, &error->flags);
+ }
+ } else {
+ set_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags);
+ }
/*
* Wakeup waiting processes so that the reset function
@@ -1775,6 +1775,11 @@ bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
return intel_get_gpu_reset(dev_priv) != NULL;
}
+/*
+ * Report whether per-engine reset is to be used for this device: true
+ * only when the device is gen 8 or newer and i915.reset is set to 2.
+ */
+bool intel_has_engine_reset(struct drm_i915_private *dev_priv)
+{
+	if (INTEL_INFO(dev_priv)->gen < 8)
+		return false;
+
+	return i915.reset == 2;
+}
+
int intel_guc_reset(struct drm_i915_private *dev_priv)
{
int ret;