@@ -1879,11 +1879,55 @@ void i915_reset(struct drm_i915_private *dev_priv)
*
* Reset a specific GPU engine. Useful if a hang is detected.
* Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is:
+ * - identifies the request that caused the hang and it is dropped
+ * - reset engine (which will force the engine to idle)
+ * - re-init/configure engine
*/
int i915_reset_engine(struct intel_engine_cs *engine)
{
- /* FIXME: replace me with engine reset sequence */
- return -ENODEV;
+ int ret;
+ struct drm_i915_private *dev_priv = engine->i915;
+ struct i915_gpu_error *error = &dev_priv->gpu_error;
+ struct drm_i915_gem_request *active_request;
+
+ GEM_BUG_ON(!test_bit(I915_RESET_ENGINE_IN_PROGRESS, &error->flags));
+
+ DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
+
+ active_request = i915_gem_reset_prepare_engine(engine);
+ if (IS_ERR(active_request)) {
+ DRM_DEBUG_DRIVER("Previous reset failed, promote to full reset\n");
+ ret = PTR_ERR(active_request);
+ goto out;
+ }
+
+ /*
+ * the request that caused the hang is stuck on elsp, we know the
+ * active request and can drop it, adjust head to skip the offending
+ * request to resume executing remaining requests in the queue.
+ */
+ i915_gem_reset_engine(engine, active_request);
+
+ /* finally, reset engine */
+ ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
+ if (ret) {
+ DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
+ goto out;
+ }
+
+ i915_gem_reset_finish_engine(engine);
+
+ /*
+ * The engine and its registers (and workarounds in case of render)
+ * have been reset to their default values. Follow the init_ring
+ * process to program RING_MODE, HWSP and re-enable submission.
+ */
+ ret = engine->init_hw(engine);
+
+out:
+ return ret;
}
static int i915_pm_suspend(struct device *kdev)
@@ -3417,11 +3417,16 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
return READ_ONCE(error->reset_count);
}
+struct drm_i915_gem_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
void i915_gem_reset(struct drm_i915_private *dev_priv);
+void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
+void i915_gem_reset_engine(struct intel_engine_cs *engine,
+ struct drm_i915_gem_request *request);
void i915_gem_init_mmio(struct drm_i915_private *i915);
int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
@@ -2793,45 +2793,61 @@ static bool engine_stalled(struct intel_engine_cs *engine)
return true;
}
+/*
+ * Ensure irq handler finishes, and not run again.
+ * Also return the active request so that we only search for it once.
+ */
+struct drm_i915_gem_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
+{
+ struct drm_i915_gem_request *request = NULL;
+
+ /* Prevent the signaler thread from updating the request
+ * state (by calling dma_fence_signal) as we are processing
+ * the reset. The write from the GPU of the seqno is
+ * asynchronous and the signaler thread may see a different
+ * value to us and declare the request complete, even though
+ * the reset routine have picked that request as the active
+ * (incomplete) request. This conflict is not handled
+ * gracefully!
+ */
+ kthread_park(engine->breadcrumbs.signaler);
+
+ /* Prevent request submission to the hardware until we have
+ * completed the reset in i915_gem_reset_finish(). If a request
+ * is completed by one engine, it may then queue a request
+ * to a second via its engine->irq_tasklet *just* as we are
+ * calling engine->init_hw() and also writing the ELSP.
+ * Turning off the engine->irq_tasklet until the reset is over
+ * prevents the race.
+ */
+ tasklet_kill(&engine->irq_tasklet);
+ tasklet_disable(&engine->irq_tasklet);
+
+ if (engine->irq_seqno_barrier)
+ engine->irq_seqno_barrier(engine);
+
+ if (engine_stalled(engine)) {
+ request = i915_gem_find_active_request(engine);
+ if (request && request->fence.error == -EIO)
+ request = ERR_PTR(-EIO); /* Previous reset failed! */
+ }
+
+ return request;
+}
+
int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
{
struct intel_engine_cs *engine;
+ struct drm_i915_gem_request *request;
enum intel_engine_id id;
int err = 0;
- /* Ensure irq handler finishes, and not run again. */
for_each_engine(engine, dev_priv, id) {
- struct drm_i915_gem_request *request = NULL;
-
- /* Prevent the signaler thread from updating the request
- * state (by calling dma_fence_signal) as we are processing
- * the reset. The write from the GPU of the seqno is
- * asynchronous and the signaler thread may see a different
- * value to us and declare the request complete, even though
- * the reset routine have picked that request as the active
- * (incomplete) request. This conflict is not handled
- * gracefully!
- */
- kthread_park(engine->breadcrumbs.signaler);
-
- /* Prevent request submission to the hardware until we have
- * completed the reset in i915_gem_reset_finish(). If a request
- * is completed by one engine, it may then queue a request
- * to a second via its engine->irq_tasklet *just* as we are
- * calling engine->init_hw() and also writing the ELSP.
- * Turning off the engine->irq_tasklet until the reset is over
- * prevents the race.
- */
- tasklet_kill(&engine->irq_tasklet);
- tasklet_disable(&engine->irq_tasklet);
-
- if (engine->irq_seqno_barrier)
- engine->irq_seqno_barrier(engine);
-
- if (engine_stalled(engine)) {
- request = i915_gem_find_active_request(engine);
- if (request && request->fence.error == -EIO)
- err = -EIO; /* Previous reset failed! */
+ request = i915_gem_reset_prepare_engine(engine);
+ if (IS_ERR(request)) {
+ err = PTR_ERR(request);
+ break;
}
engine->hangcheck.active_request = request;
@@ -2922,8 +2938,8 @@ static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
return guilty;
}
-static void i915_gem_reset_engine(struct intel_engine_cs *engine,
- struct drm_i915_gem_request *request)
+void i915_gem_reset_engine(struct intel_engine_cs *engine,
+ struct drm_i915_gem_request *request)
{
if (request && i915_gem_reset_request(request)) {
DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
@@ -2966,6 +2982,12 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
}
}
+void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
+{
+ tasklet_enable(&engine->irq_tasklet);
+ kthread_unpark(engine->breadcrumbs.signaler);
+}
+
void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
{
struct intel_engine_cs *engine;
@@ -2973,10 +2995,8 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
lockdep_assert_held(&dev_priv->drm.struct_mutex);
- for_each_engine(engine, dev_priv, id) {
- tasklet_enable(&engine->irq_tasklet);
- kthread_unpark(engine->breadcrumbs.signaler);
- }
+ for_each_engine(engine, dev_priv, id)
+ i915_gem_reset_finish_engine(engine);
}
static void nop_submit_request(struct drm_i915_gem_request *request)