@@ -1552,6 +1552,9 @@ struct i915_gpu_error {
* acquire the struct_mutex to reset an engine, we need an explicit
* flag to prevent two concurrent reset-engine attempts.
*
+ * #I915_RESET_WATCHDOG - When the hw watchdog detects a hang before
+ * hangcheck does, this bit is set so that the cause of the hang
+ * detection can be reported accurately.
+ *
* #I915_WEDGED - If reset fails and we can no longer use the GPU,
* we set the #I915_WEDGED bit. Prior to command submission, e.g.
* i915_gem_request_alloc(), this bit is checked and the sequence
@@ -1561,6 +1564,7 @@ struct i915_gpu_error {
#define I915_RESET_BACKOFF 0
#define I915_RESET_HANDOFF 1
#define I915_RESET_ENGINE_IN_PROGRESS 2
+#define I915_RESET_WATCHDOG 3
#define I915_WEDGED (BITS_PER_LONG - 1)
/** Number of times an engine has been reset */
@@ -1336,6 +1336,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
if (tasklet)
tasklet_hi_schedule(&engine->irq_tasklet);
+
+ if (iir & (GT_GEN8_WATCHDOG_INTERRUPT << test_shift))
+ tasklet_schedule(&engine->watchdog_tasklet);
}
static irqreturn_t gen8_gt_irq_ack(struct drm_i915_private *dev_priv,
@@ -3415,12 +3418,15 @@ static void gen8_gt_irq_postinstall(struct drm_i915_private *dev_priv)
uint32_t gt_interrupts[] = {
GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
+ GT_GEN8_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT,
GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
+ GT_GEN8_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
- GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT,
+ GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
+ GT_GEN8_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT,
0,
GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT
@@ -3429,6 +3435,10 @@ static void gen8_gt_irq_postinstall(struct drm_i915_private *dev_priv)
if (HAS_L3_DPF(dev_priv))
gt_interrupts[0] |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
+ /* VECS watchdog is only available in skl+ */
+ if (INTEL_GEN(dev_priv) >= 9)
+ gt_interrupts[3] |= GT_GEN8_WATCHDOG_INTERRUPT;
+
dev_priv->pm_ier = 0x0;
dev_priv->pm_imr = ~dev_priv->pm_ier;
GEN8_IRQ_INIT_NDX(GT, 0, ~gt_interrupts[0], gt_interrupts[0]);
@@ -1908,6 +1908,11 @@ enum skl_disp_power_wells {
#define RING_START(base) _MMIO((base)+0x38)
#define RING_CTL(base) _MMIO((base)+0x3c)
#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */
+#define RING_CNTR(base) _MMIO((base) + 0x178)
+#define GEN8_WATCHDOG_ENABLE 0
+#define GEN8_WATCHDOG_DISABLE 1
+#define GEN8_XCS_WATCHDOG_DISABLE 0xFFFFFFFF /* GEN8 & non-render only */
+#define RING_THRESH(base) _MMIO((base) + 0x17C)
#define RING_SYNC_0(base) _MMIO((base)+0x40)
#define RING_SYNC_1(base) _MMIO((base)+0x44)
#define RING_SYNC_2(base) _MMIO((base)+0x48)
@@ -2386,6 +2391,7 @@ enum skl_disp_power_wells {
#define GT_BSD_USER_INTERRUPT (1 << 12)
#define GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1 (1 << 11) /* hsw+; rsvd on snb, ivb, vlv */
#define GT_CONTEXT_SWITCH_INTERRUPT (1 << 8)
+#define GT_GEN8_WATCHDOG_INTERRUPT (1 << 6) /* gen8+ */
#define GT_RENDER_L3_PARITY_ERROR_INTERRUPT (1 << 5) /* !snb */
#define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT (1 << 4)
#define GT_RENDER_CS_MASTER_ERROR_INTERRUPT (1 << 3)
@@ -388,7 +388,8 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
static void hangcheck_declare_hang(struct drm_i915_private *i915,
unsigned int hung,
- unsigned int stuck)
+ unsigned int stuck,
+ unsigned int watchdog)
{
struct intel_engine_cs *engine;
char msg[80];
@@ -401,7 +402,8 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
if (stuck != hung)
hung &= ~stuck;
len = scnprintf(msg, sizeof(msg),
- "%s on ", stuck == hung ? "No progress" : "Hang");
+ "%s on ", watchdog ? "Watchdog timeout" :
+ stuck == hung ? "No progress" : "Hang");
for_each_engine_masked(engine, i915, hung, tmp)
len += scnprintf(msg + len, sizeof(msg) - len,
"%s, ", engine->name);
@@ -425,7 +427,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
gpu_error.hangcheck_work.work);
struct intel_engine_cs *engine;
enum intel_engine_id id;
- unsigned int hung = 0, stuck = 0;
+ unsigned int hung = 0, stuck = 0, watchdog = 0;
int busy_count = 0;
if (!i915.enable_hangcheck)
@@ -437,6 +439,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
if (i915_terminally_wedged(&dev_priv->gpu_error))
return;
+ if (test_and_clear_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags))
+ watchdog = 1;
+
/* As enabling the GPU requires fairly extensive mmio access,
* periodically arm the mmio checker to see if we are triggering
* any invalid access.
@@ -463,7 +468,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
}
if (hung)
- hangcheck_declare_hang(dev_priv, hung, stuck);
+ hangcheck_declare_hang(dev_priv, hung, stuck, watchdog);
/* Reset timer in case GPU hangs without another request being added */
if (busy_count)
@@ -1538,6 +1538,53 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
return 0;
}
+/*
+ * Return the value that disables the hw watchdog counter for @engine.
+ *
+ * From GEN9 onwards, all engines use the same RING_CNTR format
+ * (GEN8_WATCHDOG_DISABLE, i.e. bit 0 set).  On GEN8 only the render
+ * ring uses that format; the other engines need all-ones
+ * (GEN8_XCS_WATCHDOG_DISABLE) to stop the counter.
+ */
+static inline u32 get_watchdog_disable(struct intel_engine_cs *engine)
+{
+	if (engine->id == RCS || INTEL_GEN(engine->i915) >= 9)
+		return GEN8_WATCHDOG_DISABLE;
+	else
+		return GEN8_XCS_WATCHDOG_DISABLE;
+}
+
+/* 0x2ee0 = 12000 ticks; presumably ~1000us at the cs timestamp rate —
+ * XXX: temporary, replace with a helper that derives it from the clock.
+ */
+#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function
+
+/*
+ * gen8_watchdog_irq_handler - tasklet body for engine->watchdog_tasklet
+ * @data: the intel_engine_cs whose hw watchdog counter expired
+ *
+ * Scheduled from gen8_cs_irq_handler() when GT_GEN8_WATCHDOG_INTERRUPT
+ * fires.  Stops the counter, then:
+ *  - if every submitted request has already completed, does nothing
+ *    (the expiry raced with completion);
+ *  - if the engine reports the same seqno as on the previous expiry,
+ *    treats it as a real hang: marks the engine stalled and kicks
+ *    hangcheck_work immediately, with I915_RESET_WATCHDOG set so the
+ *    detection cause is reported accurately;
+ *  - otherwise records the current seqno and re-arms the counter.
+ */
+static void gen8_watchdog_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct drm_i915_private *dev_priv = engine->i915;
+	u32 current_seqno;
+
+	/* All register writes below use I915_WRITE_FW, so take forcewake
+	 * once for the whole handler.
+	 */
+	intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
+
+	/* Stop the counter to prevent further timeout interrupts */
+	I915_WRITE_FW(RING_CNTR(engine->mmio_base), get_watchdog_disable(engine));
+
+	current_seqno = intel_engine_get_seqno(engine);
+
+	/* Did the last request complete after the timer expired? If so,
+	 * there is nothing hung to report.
+	 */
+	if (intel_engine_last_submit(engine) == current_seqno)
+		goto fw_put;
+
+	if (engine->hangcheck.watchdog == current_seqno) {
+		/* No progress since the previous expiry: make sure the
+		 * active request will be marked as guilty.
+		 */
+		engine->hangcheck.stalled = true;
+		engine->hangcheck.seqno = current_seqno;
+
+		/* And try to run the hangcheck_work as soon as possible */
+		set_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags);
+		queue_delayed_work(system_long_wq,
+				   &dev_priv->gpu_error.hangcheck_work, 0);
+	} else {
+		/* Remember where we were for the next expiry. */
+		engine->hangcheck.watchdog = current_seqno;
+		/* Re-start the counter; if really hung, it will expire again */
+		I915_WRITE_FW(RING_THRESH(engine->mmio_base), GEN8_WATCHDOG_1000US);
+		I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE);
+	}
+
+fw_put:
+	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
+}
+
/*
* Reserve space for 2 NOOPs at the end of each request to be
* used as a workaround for not being allowed to do lite
@@ -1631,6 +1678,9 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
if (WARN_ON(test_bit(TASKLET_STATE_SCHED, &engine->irq_tasklet.state)))
tasklet_kill(&engine->irq_tasklet);
+ if (WARN_ON(test_bit(TASKLET_STATE_SCHED, &engine->watchdog_tasklet.state)))
+ tasklet_kill(&engine->watchdog_tasklet);
+
dev_priv = engine->i915;
if (engine->buffer) {
@@ -1689,6 +1739,22 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
unsigned shift = engine->irq_shift;
engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
+
+ switch (engine->id) {
+ default:
+ /* BCS engine does not support hw watchdog */
+ break;
+ case RCS:
+ case VCS:
+ case VCS2:
+ engine->irq_keep_mask |= (GT_GEN8_WATCHDOG_INTERRUPT << shift);
+ break;
+ case VECS:
+ if (INTEL_GEN(engine->i915) >= 9)
+ engine->irq_keep_mask |=
+ (GT_GEN8_WATCHDOG_INTERRUPT << shift);
+ break;
+ }
}
static int
@@ -1737,6 +1803,9 @@ logical_ring_setup(struct intel_engine_cs *engine)
tasklet_init(&engine->irq_tasklet,
intel_lrc_irq_handler, (unsigned long)engine);
+ tasklet_init(&engine->watchdog_tasklet,
+ gen8_watchdog_irq_handler, (unsigned long)engine);
+
logical_ring_default_vfuncs(engine);
logical_ring_default_irqs(engine);
}
@@ -117,6 +117,7 @@ struct intel_instdone {
struct intel_engine_hangcheck {
u64 acthd;
u32 seqno;
+ u32 watchdog;
enum intel_engine_hangcheck_action action;
unsigned long action_timestamp;
int deadlock;
@@ -416,6 +417,9 @@ struct intel_engine_cs {
struct intel_engine_hangcheck hangcheck;
+ /* watchdog_tasklet: stop counter and re-schedule hangcheck_work asap */
+ struct tasklet_struct watchdog_tasklet;
+
bool needs_cmd_parser;
/*