Message ID | 1452706112-8617-5-git-send-email-arun.siluvery@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Jan 13, 2016 at 05:28:16PM +0000, Arun Siluvery wrote: > From: Tomas Elf <tomas.elf@intel.com> > > With the per-engine hang recovery path already in place this patch adds > per-engine hang detection by letting the periodic hang checker detect hangs on > individual engines and communicate this to the error handler. During hang > checking every engine is checked and the hang detection status for each engine > is aggregated into a single 32-bit engine flag mask that contains all the > engine flags (1 << ring->id) of all the hung engines or'ed together. The > per-engine path in the error handler then sets up the hangcheck state for each > invidual, hung engine based on the engine flag mask before potentially calling > the per-engine hang recovery path. > > This allows the hang detection to happen in lock-step for all engines in > parallel and lets the driver process all hung engines in turn in the error > handler. > > Signed-off-by: Tomas Elf <tomas.elf@intel.com> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 2 +- > drivers/gpu/drm/i915/i915_drv.h | 4 ++-- > drivers/gpu/drm/i915/i915_irq.c | 41 +++++++++++++++++++++++-------------- > 3 files changed, 29 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index e3377ab..6d1b6c3 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -4720,7 +4720,7 @@ i915_wedged_set(void *data, u64 val) > > intel_runtime_pm_get(dev_priv); > > - i915_handle_error(dev, val, > + i915_handle_error(dev, 0x0, val, > "Manually setting wedged to %llu", val); > > intel_runtime_pm_put(dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index e866f14..85cf692 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2765,8 +2765,8 @@ static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine) > > /* i915_irq.c */ > void i915_queue_hangcheck(struct drm_device *dev); > -__printf(3, 4) > -void i915_handle_error(struct drm_device *dev, bool wedged, > +__printf(4, 5) > +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged, > const char *fmt, ...); > > extern void intel_irq_init(struct drm_i915_private *dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index 6a0ec37..fef74cf 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -2712,15 +2712,29 @@ static void i915_report_and_clear_eir(struct drm_device *dev) > > /** > * i915_handle_error - handle a gpu error > - * @dev: drm device > + * @dev: drm device > + * @engine_mask: Bit mask containing the engine flags of all engines > + * associated with one or more detected errors. > + * May be 0x0. > + * If wedged is set to true this implies that one or more > + * engine hangs were detected. In this case we will > + * attempt to reset all engines that have been detected > + * as hung. > + * If a previous engine reset was attempted too recently > + * or if one of the current engine resets fails we fall > + * back to legacy full GPU reset. > + * @wedged: true = Hang detected, invoke hang recovery. These two look to be tautological. If wedged == 0, we expect engine_mask to be 0 zero as well since we will not be resetting the engines, just capturing error state. It's not though. Conversely if wedged == 1, we expect engine_mask to be set. Again, it's not and I think there the caller is wrong. > + * @fmt, ...: Error message describing reason for error. > * > * Do some basic checking of register state at error time and > * dump it to the syslog. Also call i915_capture_error_state() to make > * sure we get a record and make it available in debugfs. Fire a uevent > * so userspace knows something bad happened (should trigger collection > - * of a ring dump etc.). > + * of a ring dump etc.). If a hang was detected (wedged = true) try to > + * reset the associated engine. Failing that, try to fall back to legacy > + * full GPU reset recovery mode. > */ > -void i915_handle_error(struct drm_device *dev, bool wedged, > +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged, > const char *fmt, ...) > { > struct drm_i915_private *dev_priv = dev->dev_private; > @@ -2729,12 +2743,6 @@ void i915_handle_error(struct drm_device *dev, bool wedged, > > struct intel_engine_cs *engine; > > - /* > - * NB: Placeholder until the hang checker supports > - * per-engine hang detection. > - */ > - u32 engine_mask = 0; > - > va_start(args, fmt); > vscnprintf(error_msg, sizeof(error_msg), fmt, args); > va_end(args); > @@ -3162,7 +3170,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd) > */ > tmp = I915_READ_CTL(ring); > if (tmp & RING_WAIT) { > - i915_handle_error(dev, false, > + i915_handle_error(dev, intel_ring_flag(ring), false, > "Kicking stuck wait on %s", > ring->name); > I915_WRITE_CTL(ring, tmp); > @@ -3174,7 +3182,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd) > default: > return HANGCHECK_HUNG; > case 1: > - i915_handle_error(dev, false, > + i915_handle_error(dev, intel_ring_flag(ring), false, > "Kicking stuck semaphore on %s", > ring->name); > I915_WRITE_CTL(ring, tmp); > @@ -3203,7 +3211,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work) > struct drm_device *dev = dev_priv->dev; > struct intel_engine_cs *ring; > int i; > - int busy_count = 0, rings_hung = 0; > + u32 engine_mask = 0; > + int busy_count = 0; > bool stuck[I915_NUM_RINGS] = { 0 }; > #define BUSY 1 > #define KICK 5 > @@ -3316,12 +3325,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work) > DRM_INFO("%s on %s\n", > stuck[i] ? "stuck" : "no progress", > ring->name); > - rings_hung++; > + > + engine_mask |= intel_ring_flag(ring); > + ring->hangcheck.tdr_count++; tdr_count was introduced unused in the previous patch and here it has nothing to do with tdr, but just hangcheck pure and simple. It would actually be a useful statistic for hangcheck/error-state... -Chris
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index e3377ab..6d1b6c3 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -4720,7 +4720,7 @@ i915_wedged_set(void *data, u64 val) intel_runtime_pm_get(dev_priv); - i915_handle_error(dev, val, + i915_handle_error(dev, 0x0, val, "Manually setting wedged to %llu", val); intel_runtime_pm_put(dev_priv); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e866f14..85cf692 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2765,8 +2765,8 @@ static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine) /* i915_irq.c */ void i915_queue_hangcheck(struct drm_device *dev); -__printf(3, 4) -void i915_handle_error(struct drm_device *dev, bool wedged, +__printf(4, 5) +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged, const char *fmt, ...); extern void intel_irq_init(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 6a0ec37..fef74cf 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2712,15 +2712,29 @@ static void i915_report_and_clear_eir(struct drm_device *dev) /** * i915_handle_error - handle a gpu error - * @dev: drm device + * @dev: drm device + * @engine_mask: Bit mask containing the engine flags of all engines + * associated with one or more detected errors. + * May be 0x0. + * If wedged is set to true this implies that one or more + * engine hangs were detected. In this case we will + * attempt to reset all engines that have been detected + * as hung. + * If a previous engine reset was attempted too recently + * or if one of the current engine resets fails we fall + * back to legacy full GPU reset. + * @wedged: true = Hang detected, invoke hang recovery. + * @fmt, ...: Error message describing reason for error. * * Do some basic checking of register state at error time and * dump it to the syslog. Also call i915_capture_error_state() to make * sure we get a record and make it available in debugfs. Fire a uevent * so userspace knows something bad happened (should trigger collection - * of a ring dump etc.). + * of a ring dump etc.). If a hang was detected (wedged = true) try to + * reset the associated engine. Failing that, try to fall back to legacy + * full GPU reset recovery mode. */ -void i915_handle_error(struct drm_device *dev, bool wedged, +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged, const char *fmt, ...) { struct drm_i915_private *dev_priv = dev->dev_private; @@ -2729,12 +2743,6 @@ void i915_handle_error(struct drm_device *dev, bool wedged, struct intel_engine_cs *engine; - /* - * NB: Placeholder until the hang checker supports - * per-engine hang detection. - */ - u32 engine_mask = 0; - va_start(args, fmt); vscnprintf(error_msg, sizeof(error_msg), fmt, args); va_end(args); @@ -3162,7 +3170,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd) */ tmp = I915_READ_CTL(ring); if (tmp & RING_WAIT) { - i915_handle_error(dev, false, + i915_handle_error(dev, intel_ring_flag(ring), false, "Kicking stuck wait on %s", ring->name); I915_WRITE_CTL(ring, tmp); @@ -3174,7 +3182,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd) default: return HANGCHECK_HUNG; case 1: - i915_handle_error(dev, false, + i915_handle_error(dev, intel_ring_flag(ring), false, "Kicking stuck semaphore on %s", ring->name); I915_WRITE_CTL(ring, tmp); @@ -3203,7 +3211,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work) struct drm_device *dev = dev_priv->dev; struct intel_engine_cs *ring; int i; - int busy_count = 0, rings_hung = 0; + u32 engine_mask = 0; + int busy_count = 0; bool stuck[I915_NUM_RINGS] = { 0 }; #define BUSY 1 #define KICK 5 @@ -3316,12 +3325,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work) DRM_INFO("%s on %s\n", stuck[i] ? "stuck" : "no progress", ring->name); - rings_hung++; + + engine_mask |= intel_ring_flag(ring); + ring->hangcheck.tdr_count++; } } - if (rings_hung) { - i915_handle_error(dev, true, "Ring hung"); + if (engine_mask) { + i915_handle_error(dev, engine_mask, true, "Ring hung (0x%02x)", engine_mask); goto out; }