@@ -3120,6 +3120,14 @@ i915_gem_context_lookup(struct drm_i915_file_private *file_priv, u32 id)
return ctx;
}
+static inline u32
+watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us)
+{
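+ /*
+ * Convert a watchdog timeout in microseconds into GPU clock counts
+ * for RING_THRESH. Left as a placeholder that always returns 0 for
+ * now.
+ */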
+ u64 threshold = 0;
+
+ return threshold;
+}
+
int i915_perf_open_ioctl(struct drm_device *dev, void *data,
struct drm_file *file);
int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
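
The watchdog_to_clock_counts() stub above deliberately returns a zero threshold. A minimal sketch of one plausible implementation, assuming the counter ticks at the command-stream timestamp frequency already exposed in the device info; the clock source, the field name and the example helper are assumptions, not part of this patch:

static inline u32
example_watchdog_to_clock_counts(struct drm_i915_private *dev_priv,
				 u64 value_in_us)
{
	u64 threshold;

	/* counts = us * kHz / 1000 (kHz is ticks per millisecond) */
	threshold = value_in_us *
		    INTEL_INFO(dev_priv)->cs_timestamp_frequency_khz;
	do_div(threshold, 1000);

	return lower_32_bits(threshold);
}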
@@ -233,6 +233,9 @@ struct i915_gpu_error {
* i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
* secondary role in preventing two concurrent global reset attempts.
*
+ * #I915_RESET_WATCHDOG - When the hardware watchdog detects a hang
+ * before hangcheck does, this flag tells the hangcheck worker to
+ * report the detection cause accurately.
+ *
* #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
* acquire the struct_mutex to reset an engine, we need an explicit
* flag to prevent two concurrent reset attempts in the same engine.
@@ -248,6 +251,7 @@ struct i915_gpu_error {
#define I915_RESET_BACKOFF 0
#define I915_RESET_MODESET 1
-#define I915_RESET_ENGINE 2
+#define I915_RESET_WATCHDOG 2
+/* I915_RESET_ENGINE occupies num_engines bits, so it must stay last */
+#define I915_RESET_ENGINE 3
#define I915_WEDGED (BITS_PER_LONG - 1)
/** Number of times an engine has been reset */
@@ -1456,6 +1456,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
if (tasklet)
tasklet_hi_schedule(&engine->execlists.tasklet);
+
+ if (iir & GT_GEN8_WATCHDOG_INTERRUPT)
+ tasklet_schedule(&engine->execlists.watchdog_tasklet);
}
static void gen8_gt_irq_ack(struct drm_i915_private *i915,
@@ -3883,17 +3886,24 @@ static void gen8_gt_irq_postinstall(struct drm_i915_private *dev_priv)
u32 gt_interrupts[] = {
GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
+ GT_GEN8_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT,
GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
+ GT_GEN8_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
- GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT,
+ GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
+ GT_GEN8_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT,
0,
GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT
};
+ /* VECS watchdog is only available in skl+ */
+ if (INTEL_GEN(dev_priv) >= 9)
+ gt_interrupts[3] |= GT_GEN8_WATCHDOG_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
+
dev_priv->pm_ier = 0x0;
dev_priv->pm_imr = ~dev_priv->pm_ier;
GEN8_IRQ_INIT_NDX(GT, 0, ~gt_interrupts[0], gt_interrupts[0]);
@@ -2335,6 +2335,11 @@ enum i915_power_well_id {
#define RING_START(base) _MMIO((base) + 0x38)
#define RING_CTL(base) _MMIO((base) + 0x3c)
#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */
+#define RING_CNTR(base) _MMIO((base) + 0x178)
+#define GEN8_WATCHDOG_ENABLE 0 /* a write of 0 starts the counter */
+#define GEN8_WATCHDOG_DISABLE 1 /* render; any engine on gen9+ */
+#define GEN8_XCS_WATCHDOG_DISABLE 0xFFFFFFFF /* GEN8 & non-render only */
+#define RING_THRESH(base) _MMIO((base) + 0x17C)
#define RING_SYNC_0(base) _MMIO((base) + 0x40)
#define RING_SYNC_1(base) _MMIO((base) + 0x44)
#define RING_SYNC_2(base) _MMIO((base) + 0x48)
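
The RING_THRESH/RING_CNTR pair above would typically be armed from the command stream rather than only via mmio, so that the timeout tracks the submitted workload. A hedged sketch of emitting the arm sequence from a request with MI_LOAD_REGISTER_IMM; the helper name and its call site are hypothetical, not part of this patch:

static int example_emit_start_watchdog(struct i915_request *rq, u32 threshold)
{
	struct intel_engine_cs *engine = rq->engine;
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Program the threshold, then write 0 to RING_CNTR to start counting */
	*cs++ = MI_LOAD_REGISTER_IMM(2);
	*cs++ = i915_mmio_reg_offset(RING_THRESH(engine->mmio_base));
	*cs++ = threshold;
	*cs++ = i915_mmio_reg_offset(RING_CNTR(engine->mmio_base));
	*cs++ = GEN8_WATCHDOG_ENABLE;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}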
@@ -2894,6 +2899,7 @@ enum i915_power_well_id {
#define GT_BSD_USER_INTERRUPT (1 << 12)
#define GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1 (1 << 11) /* hsw+; rsvd on snb, ivb, vlv */
#define GT_CONTEXT_SWITCH_INTERRUPT (1 << 8)
+#define GT_GEN8_WATCHDOG_INTERRUPT (1 << 6) /* gen8+ */
#define GT_RENDER_L3_PARITY_ERROR_INTERRUPT (1 << 5) /* !snb */
#define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT (1 << 4)
#define GT_RENDER_CS_MASTER_ERROR_INTERRUPT (1 << 3)
@@ -1106,6 +1106,7 @@ void intel_engines_park(struct drm_i915_private *i915)
/* Flush the residual irq tasklets first. */
intel_engine_disarm_breadcrumbs(engine);
tasklet_kill(&engine->execlists.tasklet);
+ tasklet_kill(&engine->execlists.watchdog_tasklet);
/*
* We are committed now to parking the engines, make sure there
@@ -218,7 +218,8 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
static void hangcheck_declare_hang(struct drm_i915_private *i915,
unsigned int hung,
- unsigned int stuck)
+ unsigned int stuck,
+ unsigned int watchdog)
{
struct intel_engine_cs *engine;
char msg[80];
@@ -231,13 +232,16 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
if (stuck != hung)
hung &= ~stuck;
len = scnprintf(msg, sizeof(msg),
- "%s on ", stuck == hung ? "no progress" : "hang");
+ "%s on ", watchdog ? "watchdog timeout" :
+ stuck == hung ? "no progress" : "hang");
for_each_engine_masked(engine, i915, hung, tmp)
len += scnprintf(msg + len, sizeof(msg) - len,
"%s, ", engine->name);
msg[len-2] = '\0';
- return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
+ return i915_handle_error(i915, hung,
+ watchdog ? 0 : I915_ERROR_CAPTURE,
+ "%s", msg);
}
/*
@@ -255,7 +259,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
gpu_error.hangcheck_work.work);
struct intel_engine_cs *engine;
enum intel_engine_id id;
- unsigned int hung = 0, stuck = 0, wedged = 0;
+ unsigned int hung = 0, stuck = 0, wedged = 0, watchdog = 0;
if (!i915_modparams.enable_hangcheck)
return;
@@ -266,6 +270,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
if (i915_terminally_wedged(&dev_priv->gpu_error))
return;
+ if (test_and_clear_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags))
+ watchdog = 1;
+
/* As enabling the GPU requires fairly extensive mmio access,
* periodically arm the mmio checker to see if we are triggering
* any invalid access.
@@ -311,7 +318,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
}
if (hung)
- hangcheck_declare_hang(dev_priv, hung, stuck);
+ hangcheck_declare_hang(dev_priv, hung, stuck, watchdog);
/* Reset timer in case GPU hangs without another request being added */
i915_queue_hangcheck(dev_priv);
@@ -2352,6 +2352,70 @@ static int gen8_emit_flush_render(struct i915_request *request,
return 0;
}
+/* From GEN9 onwards, all engines use the same RING_CNTR format */
+static inline u32 get_watchdog_disable(struct intel_engine_cs *engine)
+{
+ if (engine->id == RCS || INTEL_GEN(engine->i915) >= 9)
+ return GEN8_WATCHDOG_DISABLE;
+ else
+ return GEN8_XCS_WATCHDOG_DISABLE;
+}
+
+#define GEN8_WATCHDOG_1000US(dev_priv) watchdog_to_clock_counts(dev_priv, 1000)
+static void gen8_watchdog_irq_handler(unsigned long data)
+{
+ struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+ struct drm_i915_private *dev_priv = engine->i915;
+ enum forcewake_domains fw_domains;
+ u32 current_seqno;
+
+ switch (engine->id) {
+ default:
+ MISSING_CASE(engine->id);
+ /* fall through */
+ case RCS:
+ fw_domains = FORCEWAKE_RENDER;
+ break;
+ case VCS:
+ case VCS2:
+ case VECS:
+ fw_domains = FORCEWAKE_MEDIA;
+ break;
+ }
+
+ intel_uncore_forcewake_get(dev_priv, fw_domains);
+
+ /* Stop the counter to prevent further timeout interrupts */
+ I915_WRITE_FW(RING_CNTR(engine->mmio_base), get_watchdog_disable(engine));
+
+ current_seqno = intel_engine_get_hangcheck_seqno(engine);
+
+ /* did the request complete after the timer expired? */
+ if (engine->hangcheck.next_seqno == current_seqno)
+ goto fw_put;
+
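+ /*
+ * Two-strike policy: the first expiry for a given seqno only
+ * records it and re-arms the counter below; a second expiry on
+ * the same seqno escalates to hangcheck.
+ */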
+ if (engine->hangcheck.watchdog == current_seqno) {
+ /* Make sure the active request will be marked as guilty */
+ engine->hangcheck.acthd = intel_engine_get_active_head(engine);
+ engine->hangcheck.last_seqno = current_seqno;
+
+ /* And try to run the hangcheck_work as soon as possible */
+ set_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags);
+ queue_delayed_work(system_long_wq,
+ &dev_priv->gpu_error.hangcheck_work,
+ round_jiffies_up_relative(HZ));
+ } else {
+ engine->hangcheck.watchdog = current_seqno;
+ /* Restart the counter; if the engine is really hung, it will expire again */
+ I915_WRITE_FW(RING_THRESH(engine->mmio_base),
+ GEN8_WATCHDOG_1000US(dev_priv));
+ I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE);
+ }
+
+fw_put:
+ intel_uncore_forcewake_put(dev_priv, fw_domains);
+}
+
/*
* Reserve space for 2 NOOPs at the end of each request to be
* used as a workaround for not being allowed to do lite
@@ -2539,6 +2603,22 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
+
+ switch (engine->id) {
+ default:
+ /* BCS engine does not support hw watchdog */
+ break;
+ case RCS:
+ case VCS:
+ case VCS2:
+ engine->irq_keep_mask |= GT_GEN8_WATCHDOG_INTERRUPT << shift;
+ break;
+ case VECS:
+ if (INTEL_GEN(engine->i915) >= 9)
+ engine->irq_keep_mask |=
+ GT_GEN8_WATCHDOG_INTERRUPT << shift;
+ break;
+ }
}
static int
@@ -2556,6 +2636,9 @@ logical_ring_setup(struct intel_engine_cs *engine)
tasklet_init(&engine->execlists.tasklet,
execlists_submission_tasklet, (unsigned long)engine);
+ tasklet_init(&engine->execlists.watchdog_tasklet,
+ gen8_watchdog_irq_handler, (unsigned long)engine);
+
logical_ring_default_vfuncs(engine);
logical_ring_default_irqs(engine);
@@ -122,6 +122,7 @@ struct intel_engine_hangcheck {
u64 acthd;
u32 last_seqno;
u32 next_seqno;
+ u32 watchdog;
unsigned long action_timestamp;
struct intel_instdone instdone;
};
@@ -222,6 +223,11 @@ struct intel_engine_execlists {
*/
struct tasklet_struct tasklet;
+ /**
+ * @watchdog_tasklet: stop the watchdog counter and schedule
+ * hangcheck_work as soon as possible
+ */
+ struct tasklet_struct watchdog_tasklet;
+
/**
* @default_priolist: priority list for I915_PRIORITY_NORMAL
*/
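
For completeness, a matching sketch of disarming the counter once the watched workload completes, reusing get_watchdog_disable() from the gen8 handler above; again a hypothetical helper, not part of this patch:

static int example_emit_stop_watchdog(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Stop the counter with the per-gen disable value */
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(RING_CNTR(engine->mmio_base));
	*cs++ = get_watchdog_disable(engine);
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}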