@@ -961,6 +961,120 @@ int i915_reset(struct drm_device *dev)
}
/**
+ * i915_gem_reset_engine_CSSC_precheck - check/rectify context inconsistency
+ * @dev_priv: ...
+ * @engine: The engine whose state is to be checked.
+ * @req: Output parameter containing the request most recently submitted
+ * to hardware, if any. May be NULL.
+ * @ret: Output parameter containing the error code to be returned to
+ * i915_handle_error().
+ *
+ * Before an engine reset can be attempted it is important that the submission
+ * state of the currently running (i.e. hung) context is verified as
+ * consistent. If the context submission state is inconsistent that means that
+ * the context that the driver thinks is running on hardware is in fact not
+ * running at all. It might be that the hardware is idle or is running another
+ * context altogether. The reason why this is important in the case of engine
+ * reset in particular is because at the end of the engine recovery path the
+ * fixed-up context needs to be resubmitted to hardware in order for the
+ * context changes (HEAD register nudged past the hung batch buffer) to take
+ * effect. Context resubmission requires the same context as is resubmitted to
+ * be running on hardware - otherwise we might cause unexpected preemptions or
+ * submit a context to a GPU engine that is idle, which would not make much
+ * sense. (if the engine is idle why does the driver think that the context in
+ * question is hung etc.)
+ * If an inconsistent state like this is detected then a rectification attempt
+ * is made by faking the presumed lost context event interrupt. The outcome of
+ * this attempt is returned back to the per-engine recovery path: If it was
+ * succesful the hang recovery can be aborted early since we now have resolved
+ * the hang this way. If it was not successful then fail the hang recovery and
+ * let the error handler promote to the next level of hang recovery.
+ *
+ * Returns:
+ * True: Work currently in progress, consistent state.
+ * Proceed with engine reset.
+ * False: No work in progress or work in progress but state irrecoverably
+ * inconsistent (context event IRQ faking attempted but failed).
+ * Do not proceed with engine reset.
+ */
+static bool i915_gem_reset_engine_CSSC_precheck(
+ struct drm_i915_private *dev_priv,
+ struct intel_engine_cs *engine,
+ struct drm_i915_gem_request **req,
+ int *ret)
+{
+ bool precheck_ok = true;
+ enum context_submission_status status;
+
+ WARN_ON(!ret);
+
+ *ret = 0;
+
+ status = intel_execlists_TDR_get_current_request(engine, req);
+
+ if (status == CONTEXT_SUBMISSION_STATUS_NONE_SUBMITTED) {
+ /*
+ * No work in flight, no way to carry out a per-engine hang
+ * recovery in this state. Just do early exit and forget it
+ * happened. If this state persists then the error handler will
+ * be called by the periodic hang checker soon after this and
+ * at that point the hang will hopefully be promoted to full
+ * GPU reset, which will take care of it.
+ */
+ WARN(1, "No work in flight! Aborting recovery on %s\n",
+ engine->name);
+
+ precheck_ok = false;
+ *ret = 0;
+
+ } else if (status == CONTEXT_SUBMISSION_STATUS_INCONSISTENT) {
+ if (!intel_execlists_TDR_force_CSB_check(dev_priv, engine)) {
+ DRM_ERROR("Inconsistency rectification on %s unsuccessful!\n",
+ engine->name);
+
+ /*
+ * Context submission state is inconsistent and
+ * faking a context event IRQ did not help.
+ * Fail and promote to higher level of
+ * recovery!
+ */
+ precheck_ok = false;
+ *ret = -EINVAL;
+ } else {
+ DRM_INFO("Inconsistency rectification on %s successful!\n",
+ engine->name);
+
+ /*
+ * Rectifying the inconsistent context
+ * submission status helped! No reset required,
+ * just exit and move on!
+ */
+ precheck_ok = false;
+ *ret = 0;
+
+ /*
+ * Reset the hangcheck state otherwise the hang checker
+ * will detect another hang immediately. Since the
+ * forced CSB checker resulted in more work being
+ * submitted to hardware we know that we are not hung
+ * anymore so it should be safe to clear any hang
+ * detections for this engine prior to this point.
+ */
+ i915_hangcheck_reinit(engine);
+ }
+
+ } else if (status != CONTEXT_SUBMISSION_STATUS_OK) {
+ WARN(1, "Unexpected context submission status (%u) on %s\n",
+ status, engine->name);
+
+ precheck_ok = false;
+ *ret = -EINVAL;
+ }
+
+ return precheck_ok;
+}
+
+/**
* i915_reset_engine - reset GPU engine after a hang
* @engine: engine to reset
*
@@ -1001,28 +1115,22 @@ int i915_reset_engine(struct intel_engine_cs *engine)
i915_gem_reset_ring_status(dev_priv, engine);
if (i915.enable_execlists) {
- enum context_submission_status status =
- intel_execlists_TDR_get_current_request(engine, NULL);
-
/*
- * If the context submission state in hardware is not
- * consistent with the the corresponding state in the driver or
- * if there for some reason is no current context in the
- * process of being submitted then bail out and try again. Do
- * not proceed unless we have reliable current context state
- * information. The reason why this is important is because
- * per-engine hang recovery relies on context resubmission in
- * order to force the execution to resume following the hung
- * batch buffer. If the hardware is not currently running the
- * same context as the driver thinks is hung then anything can
- * happen at the point of context resubmission, e.g. unexpected
- * preemptions or the previously hung context could be
- * submitted when the hardware is idle which makes no sense.
+ * Check context submission status consistency (CSSC) before
+ * moving on. If the driver and hardware have different
+ * opinions about what is going on and this inconsistency
+ * cannot be rectified then just fail and let TDR escalate to a
+ * higher form of hang recovery.
*/
- if (status != CONTEXT_SUBMISSION_STATUS_OK) {
- ret = -EAGAIN;
+ if (!i915_gem_reset_engine_CSSC_precheck(dev_priv,
+ engine,
+ NULL,
+ &ret)) {
+ DRM_INFO("Aborting hang recovery on %s (%d)\n",
+ engine->name, ret);
+
goto reset_engine_error;
- }
+ }
}
ret = intel_ring_disable(engine);
@@ -1032,27 +1140,25 @@ int i915_reset_engine(struct intel_engine_cs *engine)
}
if (i915.enable_execlists) {
- enum context_submission_status status;
- bool inconsistent;
-
- status = intel_execlists_TDR_get_current_request(engine,
- ¤t_request);
-
- inconsistent = (status != CONTEXT_SUBMISSION_STATUS_OK);
- if (inconsistent) {
- /*
- * If we somehow have reached this point with
- * an inconsistent context submission status then
- * back out of the previously requested reset and
- * retry later.
- */
- WARN(inconsistent,
- "Inconsistent context status on %s: %u\n",
- engine->name, status);
+ /*
+ * Get a hold of the currently executing context.
+ *
+ * Context submission status consistency is done implicitly so
+ * we might as well check it post-engine disablement since we
+ * get that option for free. Also, it's conceivable that the
+ * context submission state might have changed as part of the
+ * reset request on gen8+ so it's not completely devoid of
+ * value to do this.
+ */
+ if (!i915_gem_reset_engine_CSSC_precheck(dev_priv,
+ engine,
+ ¤t_request,
+ &ret)) {
+ DRM_INFO("Aborting hang recovery on %s (%d)\n",
+ engine->name, ret);
- ret = -EAGAIN;
goto reenable_reset_engine_error;
- }
+ }
}
/* Sample the current ring head position */
@@ -36,6 +36,7 @@
#include "i915_drv.h"
#include "i915_trace.h"
#include "intel_drv.h"
+#include "intel_lrc_tdr.h"
/**
* DOC: interrupt handling
@@ -1324,7 +1325,7 @@ gen8_cs_irq_handler(struct intel_engine_cs *ring, u32 iir, int test_shift)
if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
notify_ring(ring);
if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
- intel_lrc_irq_handler(ring);
+ intel_lrc_irq_handler(ring, true);
if (iir & (GT_GEN8_RCS_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT)) {
struct drm_i915_private *dev_priv = ring->dev->dev_private;
@@ -2524,27 +2525,6 @@ static void i915_error_work_func(struct work_struct *work)
ret = i915_reset_engine(ring);
- /*
- * Execlist mode only:
- *
- * -EAGAIN means that between detecting a hang (and
- * also determining that the currently submitted
- * context is stable and valid) and trying to recover
- * from the hang the current context changed state.
- * This means that we are probably not completely hung
- * after all. Just fail and retry by exiting all the
- * way back and wait for the next hang detection. If we
- * have a true hang on our hands then we will detect it
- * again, otherwise we will continue like nothing
- * happened.
- */
- if (ret == -EAGAIN) {
- DRM_ERROR("Reset of %s aborted due to " \
- "change in context submission " \
- "state - retrying!", ring->name);
- ret = 0;
- }
-
if (ret) {
DRM_ERROR("Reset of %s failed! (%d)", ring->name, ret);
@@ -726,11 +726,15 @@ static void get_context_status(struct intel_engine_cs *ring,
/**
* intel_lrc_irq_handler() - handle Context Switch interrupts
* @ring: Engine Command Streamer to handle.
+ * @do_lock: Lock execlist spinlock (if false the caller is responsible for this)
*
* Check the unread Context Status Buffers and manage the submission of new
* contexts to the ELSP accordingly.
+ *
+ * Return:
+ * The number of unqueued contexts.
*/
-void intel_lrc_irq_handler(struct intel_engine_cs *ring)
+int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;
u32 status_pointer;
@@ -740,6 +744,9 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
u32 status_id;
u32 submit_contexts = 0;
+ if (do_lock)
+ spin_lock(&ring->execlist_lock);
+
status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
read_pointer = ring->next_context_status_buffer;
@@ -747,8 +754,6 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
if (read_pointer > write_pointer)
write_pointer += GEN8_CSB_ENTRIES;
- spin_lock(&ring->execlist_lock);
-
while (read_pointer < write_pointer) {
get_context_status(ring, ++read_pointer % GEN8_CSB_ENTRIES,
@@ -781,8 +786,6 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
execlists_context_unqueue(ring, false);
}
- spin_unlock(&ring->execlist_lock);
-
if (unlikely(submit_contexts > 2))
DRM_ERROR("More than two context complete events?\n");
@@ -793,6 +796,11 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
ring->next_context_status_buffer << 8));
+
+ if (do_lock)
+ spin_unlock(&ring->execlist_lock);
+
+ return submit_contexts;
}
static int execlists_context_queue(struct drm_i915_gem_request *request)
@@ -1811,6 +1819,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
struct drm_device *dev = ring->dev;
struct drm_i915_private *dev_priv = dev->dev_private;
u8 next_context_status_buffer_hw;
+ unsigned long flags;
lrc_setup_hardware_status_page(ring,
ring->default_context->engine[ring->id].state);
@@ -1823,6 +1832,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
_MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
POSTING_READ(RING_MODE_GEN7(ring));
+ spin_lock_irqsave(&ring->execlist_lock, flags);
/*
* Instead of resetting the Context Status Buffer (CSB) read pointer to
* zero, we need to read the write pointer from hardware and use its
@@ -1847,6 +1857,8 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
ring->next_context_status_buffer = next_context_status_buffer_hw;
+ spin_unlock_irqrestore(&ring->execlist_lock, flags);
+
DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name);
i915_hangcheck_reinit(ring);
@@ -3232,3 +3244,64 @@ intel_execlists_TDR_get_current_request(struct intel_engine_cs *ring,
return status;
}
+
+/**
+ * execlists_TDR_force_CSB_check() - rectify inconsistency by faking IRQ
+ * @dev_priv: ...
+ * @engine: engine whose CSB is to be checked.
+ *
+ * Context submission status inconsistencies are caused by lost interrupts that
+ * leave CSB events unprocessed and leave contexts in the execlist queues when
+ * they should really have been removed. These stale contexts block further
+ * submissions to the hardware (all the while the hardware is sitting idle) and
+ * thereby cause a software hang. The way to rectify this is by manually
+ * checking the CSB buffer for outstanding context state transition events and
+ * acting on these. The easiest way of doing this is by simply faking the
+ * presumed lost context event interrupt by manually calling the interrupt
+ * handler. If there are indeed outstanding, unprocessed CSB events then these
+ * will be processed by the faked interrupt call and if one of these events is
+ * some form of context completion event then that will purge a stale context
+ * from the execlist queue and submit a new context to hardware from the queue,
+ * thereby resuming execution.
+ *
+ * Returns:
+ * True: Forced CSB check successful, state consistency restored.
+ * False: No CSB events found, forced CSB check unsuccessful, failed
+ * trying to restore consistency.
+ */
+bool intel_execlists_TDR_force_CSB_check(struct drm_i915_private *dev_priv,
+ struct intel_engine_cs *engine)
+{
+ unsigned long flags;
+ bool hw_active;
+ int was_effective;
+
+ hw_active =
+ (I915_READ(RING_EXECLIST_STATUS_LO(engine)) &
+ EXECLIST_STATUS_CURRENT_ACTIVE_ELEMENT_STATUS) ?
+ true : false;
+ if (hw_active) {
+ u32 hw_context;
+
+ hw_context = I915_READ(RING_EXECLIST_STATUS_CTX_ID(engine));
+ WARN(hw_active, "Context (%x) executing on %s - " \
+ "No need for faked IRQ!\n",
+ hw_context, engine->name);
+ return false;
+ }
+
+ spin_lock_irqsave(&engine->execlist_lock, flags);
+
+ WARN(1, "%s: Inconsistent context state - Faking context event IRQ!\n",
+ engine->name);
+
+ if (!(was_effective = intel_lrc_irq_handler(engine, false)))
+ DRM_ERROR("Forced CSB check of %s ineffective!\n", engine->name);
+
+ spin_unlock_irqrestore(&engine->execlist_lock, flags);
+
+ wake_up_all(&engine->irq_queue);
+
+ return !!was_effective;
+}
+
@@ -117,7 +117,7 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
struct list_head *vmas);
u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
-void intel_lrc_irq_handler(struct intel_engine_cs *ring);
+int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock);
void intel_execlists_retire_requests(struct intel_engine_cs *ring);
int intel_execlists_read_tail(struct intel_engine_cs *ring,
@@ -32,5 +32,8 @@ enum context_submission_status
intel_execlists_TDR_get_current_request(struct intel_engine_cs *ring,
struct drm_i915_gem_request **req);
+bool intel_execlists_TDR_force_CSB_check(struct drm_i915_private *dev_priv,
+ struct intel_engine_cs *engine);
+
#endif /* _INTEL_LRC_TDR_H_ */