Message ID | 20170325013010.36244-16-michel.thierry@intel.com (mailing list archive) |
---|---|
State | New, archived |
On Fri, Mar 24, 2017 at 06:30:07PM -0700, Michel Thierry wrote:
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 87e76ef589b1..d484cbc561eb 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1369,6 +1369,10 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>
>  	if (tasklet)
>  		tasklet_hi_schedule(&engine->irq_tasklet);
> +
> +	if (iir & (GT_GEN8_WATCHDOG_INTERRUPT << test_shift)) {
> +		tasklet_hi_schedule(&engine->watchdog_tasklet);

We don't need to set this as high, we definitely do want to process the
live engines first and so some small latency in detecting the reset is
no deal breaker. We probably don't even want to use a tasklet? (Or
actually we do!)

>  static void hangcheck_declare_hang(struct drm_i915_private *i915,
>  				    unsigned int hung,
> -				    unsigned int stuck)
> +				    unsigned int stuck,
> +				    unsigned int watchdog)

That's a very interesting question as to whether we want to use the very
heavy hangcheck and capture machine at all for the watchdog.

> +#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function
> +static void gen8_watchdog_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	u32 watchdog_disable, current_seqno;
> +
> +	intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
> +
> +	if (engine->id == RCS)
> +		watchdog_disable = GEN8_RCS_WATCHDOG_DISABLE;
> +	else
> +		watchdog_disable = GEN8_XCS_WATCHDOG_DISABLE;
> +
> +	/* Stop the counter to prevent further timeout interrupts */
> +	I915_WRITE_FW(RING_CNTR(engine->mmio_base), watchdog_disable);
> +
> +	/* false-positive, request completed after the timer expired */

False optimism in spotting the false positive. engine_is_idle() means
all requests not the interesting one. Since you are using seqno, just
reject when seqno == intel_engine_last_submit().

> +	if (intel_engine_is_idle(engine))
> +		goto fw_put;
> +
> +	current_seqno = intel_engine_get_seqno(engine);
> +	if (engine->hangcheck.last_watchdog_seqno == current_seqno) {

Or you could just reset the engine directly, once we rid it of that
pesky mutex (which is done in all but name already). Doing that from
inside the tasklet has some advantages -- we don't need to disable the
execlists/guc tasklet.

> +		/* Make sure the active request will be marked as guilty */
> +		engine->hangcheck.stalled = true;
> +		engine->hangcheck.seqno = intel_engine_get_seqno(engine);
> +
> +		/* And try to run the hangcheck_work as soon as possible */
> +		set_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags);
> +		queue_delayed_work(system_long_wq,
> +				   &dev_priv->gpu_error.hangcheck_work, 0);
> +	} else {
> +		engine->hangcheck.last_watchdog_seqno = current_seqno;
> +		/* Re-start the counter, if really hung, it will expire again */
> +		I915_WRITE_FW(RING_THRESH(engine->mmio_base), GEN8_WATCHDOG_1000US);
> +		I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE);
> +	}
> +
> +fw_put:
> +	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
> +}

> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index e8faf2c34c97..fffe69f5aed2 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -128,6 +128,7 @@ struct intel_instdone {
>  struct intel_engine_hangcheck {
>  	u64 acthd;
>  	u32 seqno;
> +	u32 last_watchdog_seqno;

Just watchdog will be enough for its meaning to be clear.
-Chris
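For illustration, the seqno-based rejection Chris suggests might look roughly like the following inside gen8_watchdog_irq_handler(), replacing the intel_engine_is_idle() check. It is only a sketch reusing intel_engine_get_seqno() and intel_engine_last_submit() from the i915 code of that period, not part of the posted patch:

	/* Sketch: if the engine's breadcrumb has caught up with the last
	 * submitted request, the request that armed the watchdog has
	 * already completed and the interrupt can simply be ignored.
	 */
	current_seqno = intel_engine_get_seqno(engine);
	if (current_seqno == intel_engine_last_submit(engine))
		goto fw_put; /* completed after the timer expired */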
On 25/03/17 02:26, Chris Wilson wrote:
> On Fri, Mar 24, 2017 at 06:30:07PM -0700, Michel Thierry wrote:
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index 87e76ef589b1..d484cbc561eb 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -1369,6 +1369,10 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>>
>>  	if (tasklet)
>>  		tasklet_hi_schedule(&engine->irq_tasklet);
>> +
>> +	if (iir & (GT_GEN8_WATCHDOG_INTERRUPT << test_shift)) {
>> +		tasklet_hi_schedule(&engine->watchdog_tasklet);
>
> We don't need to set this as high, we definitely do want to process the
> live engines first and so some small latency in detecting the reset is
> no deal breaker. We probably don't even want to use a tasklet? (Or
> actually we do!)
>
>>  static void hangcheck_declare_hang(struct drm_i915_private *i915,
>>  				    unsigned int hung,
>> -				    unsigned int stuck)
>> +				    unsigned int stuck,
>> +				    unsigned int watchdog)
>
> That's a very interesting question as to whether we want to use the very
> heavy hangcheck and capture machine at all for the watchdog.
>
>> +#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function
>> +static void gen8_watchdog_irq_handler(unsigned long data)
>> +{
>> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
>> +	struct drm_i915_private *dev_priv = engine->i915;
>> +	u32 watchdog_disable, current_seqno;
>> +
>> +	intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
>> +
>> +	if (engine->id == RCS)
>> +		watchdog_disable = GEN8_RCS_WATCHDOG_DISABLE;
>> +	else
>> +		watchdog_disable = GEN8_XCS_WATCHDOG_DISABLE;
>> +
>> +	/* Stop the counter to prevent further timeout interrupts */
>> +	I915_WRITE_FW(RING_CNTR(engine->mmio_base), watchdog_disable);
>> +
>> +	/* false-positive, request completed after the timer expired */
>
> False optimism in spotting the false positive. engine_is_idle() means
> all requests not the interesting one. Since you are using seqno, just
> reject when seqno == intel_engine_last_submit().
>
>> +	if (intel_engine_is_idle(engine))
>> +		goto fw_put;
>> +
>> +	current_seqno = intel_engine_get_seqno(engine);
>> +	if (engine->hangcheck.last_watchdog_seqno == current_seqno) {
>
> Or you could just reset the engine directly, once we rid it of that
> pesky mutex (which is done in all but name already). Doing that from
> inside the tasklet has some advantages -- we don't need to disable the
> execlists/guc tasklet.
>

True, as you said above, we probably don't need to capture the gpu state
in this case. The error state may not even be meaningful (for example if
the threshold was too short and the engine was not really hung).

>> +		/* Make sure the active request will be marked as guilty */
>> +		engine->hangcheck.stalled = true;
>> +		engine->hangcheck.seqno = intel_engine_get_seqno(engine);
>> +
>> +		/* And try to run the hangcheck_work as soon as possible */
>> +		set_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags);
>> +		queue_delayed_work(system_long_wq,
>> +				   &dev_priv->gpu_error.hangcheck_work, 0);
>> +	} else {
>> +		engine->hangcheck.last_watchdog_seqno = current_seqno;
>> +		/* Re-start the counter, if really hung, it will expire again */
>> +		I915_WRITE_FW(RING_THRESH(engine->mmio_base), GEN8_WATCHDOG_1000US);
>> +		I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE);
>> +	}
>> +
>> +fw_put:
>> +	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
>> +}
>
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index e8faf2c34c97..fffe69f5aed2 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -128,6 +128,7 @@ struct intel_instdone {
>>  struct intel_engine_hangcheck {
>>  	u64 acthd;
>>  	u32 seqno;
>> +	u32 last_watchdog_seqno;
>
> Just watchdog will be enough for its meaning to be clear.

watchdog or watchdog_seqno?
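For illustration, resetting the engine straight from the watchdog tasklet, without going through hangcheck or error capture, might look roughly like the sketch below. The per-engine reset helper i915_reset_engine() is an assumption based on where this series is heading, not an existing call at this point, and the sketch glosses over locking against the execlists/guc submission tasklet:

	current_seqno = intel_engine_get_seqno(engine);
	if (current_seqno == engine->hangcheck.last_watchdog_seqno) {
		/* Second expiry with no progress: reset only this engine,
		 * skipping the heavyweight hangcheck/capture machinery.
		 * i915_reset_engine() is hypothetical here.
		 */
		if (i915_reset_engine(engine))
			DRM_ERROR("%s watchdog reset failed\n", engine->name);
		goto fw_put;
	}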
On Mon, Mar 27, 2017 at 02:48:42PM -0700, Michel Thierry wrote:
>
> On 25/03/17 02:26, Chris Wilson wrote:
> > On Fri, Mar 24, 2017 at 06:30:07PM -0700, Michel Thierry wrote:
> >> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> >> index e8faf2c34c97..fffe69f5aed2 100644
> >> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> >> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> >> @@ -128,6 +128,7 @@ struct intel_instdone {
> >>  struct intel_engine_hangcheck {
> >>  	u64 acthd;
> >>  	u32 seqno;
> >> +	u32 last_watchdog_seqno;
> >
> > Just watchdog will be enough for its meaning to be clear.
>
> watchdog or watchdog_seqno?

Here, intel_engine_hangcheck.watchdog is unique enough for it not to be
confusing. If we grow more interesting bits for the watchdog, we break
it out into its own substruct.

Maybe we should task Mika with filling in some kerneldoc for
struct intel_engine_hangcheck?
-Chris
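For illustration, the rename plus some kerneldoc along the lines suggested here could end up looking like the sketch below; the field descriptions are guesses made for the sketch, not actual documentation from the driver:

/**
 * struct intel_engine_hangcheck - per-engine hang detection state
 * @acthd: engine head address sampled at the last hangcheck
 * @seqno: breadcrumb seqno sampled at the last hangcheck
 * @watchdog: seqno sampled when the hw watchdog interrupt last fired
 * @action: classification of the engine's progress (or lack of it)
 * @action_timestamp: jiffies when @action last changed
 * @deadlock: consecutive samples spent stuck without progress
 */
struct intel_engine_hangcheck {
	u64 acthd;
	u32 seqno;
	u32 watchdog;
	enum intel_engine_hangcheck_action action;
	unsigned long action_timestamp;
	int deadlock;
	/* ... remaining members unchanged ... */
};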
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d5c12ddd35b3..b43c37a911bb 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1628,6 +1628,9 @@ struct i915_gpu_error {
 	 * inspect the bit and do the reset directly, otherwise the worker
 	 * waits for the struct_mutex.
 	 *
+	 * #I915_RESET_WATCHDOG - When hw detects a hang before us, we can use
+	 * I915_RESET_WATCHDOG to report the hang detection cause accurately.
+	 *
 	 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
 	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
 	 * i915_gem_request_alloc(), this bit is checked and the sequence
@@ -1636,6 +1639,7 @@ struct i915_gpu_error {
 	unsigned long flags;
 #define I915_RESET_BACKOFF	0
 #define I915_RESET_HANDOFF	1
+#define I915_RESET_WATCHDOG	2
 #define I915_WEDGED		(BITS_PER_LONG - 1)
 
 	/* if available, engine-specific reset is tried before full gpu reset */
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 87e76ef589b1..d484cbc561eb 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1369,6 +1369,10 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 
 	if (tasklet)
 		tasklet_hi_schedule(&engine->irq_tasklet);
+
+	if (iir & (GT_GEN8_WATCHDOG_INTERRUPT << test_shift)) {
+		tasklet_hi_schedule(&engine->watchdog_tasklet);
+	}
 }
 
 static irqreturn_t gen8_gt_irq_ack(struct drm_i915_private *dev_priv,
@@ -3442,12 +3446,15 @@ static void gen8_gt_irq_postinstall(struct drm_i915_private *dev_priv)
 	uint32_t gt_interrupts[] = {
 		GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
+			GT_GEN8_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
 			GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT,
 		GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
+			GT_GEN8_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
 			GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
-			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT,
+			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
+			GT_GEN8_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT,
 		0,
 		GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT
@@ -3456,6 +3463,10 @@ static void gen8_gt_irq_postinstall(struct drm_i915_private *dev_priv)
 	if (HAS_L3_DPF(dev_priv))
 		gt_interrupts[0] |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
 
+	/* VECS watchdog is only available in skl+ */
+	if (INTEL_GEN(dev_priv) >= 9)
+		gt_interrupts[3] |= GT_GEN8_WATCHDOG_INTERRUPT;
+
 	dev_priv->pm_ier = 0x0;
 	dev_priv->pm_imr = ~dev_priv->pm_ier;
 	GEN8_IRQ_INIT_NDX(GT, 0, ~gt_interrupts[0], gt_interrupts[0]);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 04c8f69fcc62..89f5191b2635 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -1900,6 +1900,11 @@ enum skl_disp_power_wells {
 #define RING_START(base)	_MMIO((base)+0x38)
 #define RING_CTL(base)		_MMIO((base)+0x3c)
 #define   RING_CTL_SIZE(size)	((size) - PAGE_SIZE) /* in bytes -> pages */
+#define RING_CNTR(base)		_MMIO((base) + 0x178)
+#define   GEN8_WATCHDOG_ENABLE		0
+#define   GEN8_RCS_WATCHDOG_DISABLE	1
+#define   GEN8_XCS_WATCHDOG_DISABLE	0xFFFFFFFF
+#define RING_THRESH(base)	_MMIO((base) + 0x17C)
 #define RING_SYNC_0(base)	_MMIO((base)+0x40)
 #define RING_SYNC_1(base)	_MMIO((base)+0x44)
 #define RING_SYNC_2(base)	_MMIO((base)+0x48)
@@ -2378,6 +2383,7 @@ enum skl_disp_power_wells {
 #define GT_BSD_USER_INTERRUPT			(1 << 12)
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1	(1 << 11) /* hsw+; rsvd on snb, ivb, vlv */
 #define GT_CONTEXT_SWITCH_INTERRUPT		(1 << 8)
+#define GT_GEN8_WATCHDOG_INTERRUPT		(1 << 6) /* gen8+ */
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT	(1 << 5) /* !snb */
 #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT	(1 << 4)
 #define GT_RENDER_CS_MASTER_ERROR_INTERRUPT	(1 << 3)
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index dce742243ba6..0e9272c97096 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -388,7 +388,8 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
 
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
 				   unsigned int hung,
-				   unsigned int stuck)
+				   unsigned int stuck,
+				   unsigned int watchdog)
 {
 	struct intel_engine_cs *engine;
 	char msg[80];
@@ -401,7 +402,8 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
 	if (stuck != hung)
 		hung &= ~stuck;
 	len = scnprintf(msg, sizeof(msg),
-			"%s on ", stuck == hung ? "No progress" : "Hang");
+			"%s on ", watchdog ? "Watchdog timeout" :
+				  stuck == hung ? "No progress" : "Hang");
 	for_each_engine_masked(engine, i915, hung, tmp)
 		len += scnprintf(msg + len, sizeof(msg) - len,
 				 "%s, ", engine->name);
@@ -425,7 +427,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 			     gpu_error.hangcheck_work.work);
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
-	unsigned int hung = 0, stuck = 0;
+	unsigned int hung = 0, stuck = 0, watchdog = 0;
 	int busy_count = 0;
 
 	if (!i915.enable_hangcheck)
@@ -437,6 +439,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	if (i915_terminally_wedged(&dev_priv->gpu_error))
 		return;
 
+	if (test_and_clear_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags))
+		watchdog = 1;
+
 	/* As enabling the GPU requires fairly extensive mmio access,
 	 * periodically arm the mmio checker to see if we are triggering
 	 * any invalid access.
@@ -463,7 +468,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	}
 
 	if (hung)
-		hangcheck_declare_hang(dev_priv, hung, stuck);
+		hangcheck_declare_hang(dev_priv, hung, stuck, watchdog);
 
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index bc224a24ddad..73f8fbdcf1fb 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1470,6 +1470,48 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
 	return 0;
 }
 
+#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function
+static void gen8_watchdog_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct drm_i915_private *dev_priv = engine->i915;
+	u32 watchdog_disable, current_seqno;
+
+	intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
+
+	if (engine->id == RCS)
+		watchdog_disable = GEN8_RCS_WATCHDOG_DISABLE;
+	else
+		watchdog_disable = GEN8_XCS_WATCHDOG_DISABLE;
+
+	/* Stop the counter to prevent further timeout interrupts */
+	I915_WRITE_FW(RING_CNTR(engine->mmio_base), watchdog_disable);
+
+	/* false-positive, request completed after the timer expired */
+	if (intel_engine_is_idle(engine))
+		goto fw_put;
+
+	current_seqno = intel_engine_get_seqno(engine);
+	if (engine->hangcheck.last_watchdog_seqno == current_seqno) {
+		/* Make sure the active request will be marked as guilty */
+		engine->hangcheck.stalled = true;
+		engine->hangcheck.seqno = intel_engine_get_seqno(engine);
+
+		/* And try to run the hangcheck_work as soon as possible */
+		set_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags);
+		queue_delayed_work(system_long_wq,
+				   &dev_priv->gpu_error.hangcheck_work, 0);
+	} else {
+		engine->hangcheck.last_watchdog_seqno = current_seqno;
+		/* Re-start the counter, if really hung, it will expire again */
+		I915_WRITE_FW(RING_THRESH(engine->mmio_base), GEN8_WATCHDOG_1000US);
+		I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE);
+	}
+
+fw_put:
+	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
+}
+
 /*
  * Reserve space for 2 NOOPs at the end of each request to be
  * used as a workaround for not being allowed to do lite
@@ -1563,6 +1605,9 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
 	if (WARN_ON(test_bit(TASKLET_STATE_SCHED, &engine->irq_tasklet.state)))
 		tasklet_kill(&engine->irq_tasklet);
 
+	if (WARN_ON(test_bit(TASKLET_STATE_SCHED, &engine->watchdog_tasklet.state)))
+		tasklet_kill(&engine->watchdog_tasklet);
+
 	dev_priv = engine->i915;
 
 	if (engine->buffer) {
@@ -1621,6 +1666,22 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
 	unsigned shift = engine->irq_shift;
 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
+
+	switch (engine->id) {
+	default:
+		/* BCS engine does not support hw watchdog */
+		break;
+	case RCS:
+	case VCS:
+	case VCS2:
+		engine->irq_keep_mask |= (GT_GEN8_WATCHDOG_INTERRUPT << shift);
+		break;
+	case VECS:
+		if (INTEL_GEN(engine->i915) >= 9)
+			engine->irq_keep_mask |=
+				(GT_GEN8_WATCHDOG_INTERRUPT << shift);
+		break;
+	}
 }
 
 static int
@@ -1669,6 +1730,9 @@ logical_ring_setup(struct intel_engine_cs *engine)
 	tasklet_init(&engine->irq_tasklet,
 		     intel_lrc_irq_handler, (unsigned long)engine);
 
+	tasklet_init(&engine->watchdog_tasklet,
+		     gen8_watchdog_irq_handler, (unsigned long)engine);
+
 	logical_ring_default_vfuncs(engine);
 	logical_ring_default_irqs(engine);
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index e8faf2c34c97..fffe69f5aed2 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -128,6 +128,7 @@ struct intel_instdone {
 struct intel_engine_hangcheck {
 	u64 acthd;
 	u32 seqno;
+	u32 last_watchdog_seqno;
 	enum intel_engine_hangcheck_action action;
 	unsigned long action_timestamp;
 	int deadlock;
@@ -405,6 +406,9 @@ struct intel_engine_cs {
 
 	struct intel_engine_hangcheck hangcheck;
 
+	/* watchdog_tasklet: stop counter and re-schedule hangcheck_work asap */
+	struct tasklet_struct watchdog_tasklet;
+
 	bool needs_cmd_parser;
 
 	/*
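The GEN8_WATCHDOG_1000US value (0x2ee0, i.e. 12000) is flagged above as a temporary constant to be replaced with a helper. Such a helper could look roughly like the sketch below; the function name is made up, and the 12 MHz command-streamer timestamp base is an assumption, chosen only because it makes 12000 ticks come out at about 1000us; a real helper would need to query the per-platform clock:

static inline u32 watchdog_us_to_ticks(u32 timeout_us)
{
	/* Assumption for this sketch: the watchdog counts at a 12 MHz
	 * command streamer timestamp base, i.e. 12 ticks per microsecond.
	 */
	const u32 ticks_per_us = 12;

	return timeout_us * ticks_per_us; /* 1000us -> 12000 == 0x2ee0 */
}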