Message ID | 20170223194421.28463-3-michel.thierry@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Feb 23, 2017 at 11:44:19AM -0800, Michel Thierry wrote: > Final enablement patch for GPU hang detection using watchdog timeout. > Using the gem_context_setparam ioctl, users can specify the desired > timeout value in milliseconds, and the driver will do the conversion to > 'timestamps'. > > The _recommended_ default watchdog threshold for video engines is 60 ms, > since this has been _empirically determined_ to be a good compromise for > low-latency requirements and low rate of false positives. The default > register value is ~106ms and the theoretical max value (all 1s) is > 353 seconds. > > Signed-off-by: Tomas Elf <tomas.elf@intel.com> > Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com> > Signed-off-by: Michel Thierry <michel.thierry@intel.com> > --- > drivers/gpu/drm/i915/i915_gem_context.c | 46 ++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/i915_gem_context.h | 3 ++ > drivers/gpu/drm/i915/i915_gem_execbuffer.c | 8 +----- > include/uapi/drm/i915_drm.h | 1 + > 4 files changed, 51 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c > index 99c46f4dbde6..c3748878e64c 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > @@ -440,6 +440,32 @@ i915_gem_context_create_gvt(struct drm_device *dev) > return ctx; > } > > +void i915_context_watchdog_setup(struct i915_gem_context *ctx, u32 value_in_ms) Ahem. > +{ > + /* > + * Based on time out value (ms) calculate > + * timer count thresholds needed based on core frequency. > + */ > +#define TIMER_MILLISECOND 1000 > + > + /* > + * Timestamp timer resolution = 0.080 uSec, > + * or 12500000 counts per second > + */ > +#define TIMESTAMP_CNTS_PER_SEC_80NS 12500000 > + > + ctx->watchdog_threshold = > + ((value_in_ms) * > + ((TIMESTAMP_CNTS_PER_SEC_80NS) / (TIMER_MILLISECOND))); Rescale to ns (u64 math), check for overflows before assigning to u32. > + /* > + * watchdog register must never be programmed to zero. This would > + * cause the watchdog counter to exceed and not allow the engine to > + * go into IDLE state > + */ > + GEM_BUG_ON(ctx->watchdog_threshold == 0); This throws a bug when the user tries to disable the watchdog afterwards. [ctx->watchdog_threshold = 0 => disable watchdog around this context, default]. > +} > + > int i915_gem_context_init(struct drm_i915_private *dev_priv) > { > struct i915_gem_context *ctx; > @@ -1056,6 +1082,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, > struct drm_i915_file_private *file_priv = file->driver_priv; > struct drm_i915_gem_context_param *args = data; > struct i915_gem_context *ctx; > + struct intel_engine_cs *engine; > int ret; > > ret = i915_mutex_lock_interruptible(dev); > @@ -1090,6 +1117,15 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, > case I915_CONTEXT_PARAM_BANNABLE: > args->value = i915_gem_context_is_bannable(ctx); > break; > + case I915_CONTEXT_PARAM_WATCHDOG: > + engine = to_i915(dev)->engine[VCS]; > + if (!engine->emit_start_watchdog) > + ret = -EINVAL; > + else if (args->value) > + ret = -EINVAL; > + else > + args->value = ctx->watchdog_threshold; Just args->value = watchdog_to_ns(ctx->watchdog_threshold); will be 0 where unset. On setting we complain if not supported. Do we really want the user API to be in clock ticks rather than say ns? I think we want ns, and don't forget to include that information in the error state. Do we want to capture error state? Do we want this to contribute to banning? > + break; > default: > ret = -EINVAL; > break; > @@ -1105,6 +1141,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, > struct drm_i915_file_private *file_priv = file->driver_priv; > struct drm_i915_gem_context_param *args = data; > struct i915_gem_context *ctx; > + struct intel_engine_cs *engine; > int ret; > > ret = i915_mutex_lock_interruptible(dev); > @@ -1147,6 +1184,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, > else > i915_gem_context_clear_bannable(ctx); > break; > + case I915_CONTEXT_PARAM_WATCHDOG: > + engine = to_i915(dev)->engine[VCS]; > + if (!engine->emit_start_watchdog) > + ret = -EINVAL; ret = -ENODEV; > + else if (!args->value) > + ret = -EINVAL; > + else > + i915_context_watchdog_setup(ctx, args->value); I'm pushing for ns. And this should be i915_gem_context_set_watchdog(ctx, args->value); > + break; > default: > ret = -EINVAL; > break; > diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h > index 0ac750b90f3d..133ed7b413aa 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.h > +++ b/drivers/gpu/drm/i915/i915_gem_context.h > @@ -176,6 +176,9 @@ struct i915_gem_context { > /** ban_score: Accumulated score of all hangs caused by this context. */ > int ban_score; > > + /** watchdog_threshold: hw watchdog threshold value, in clock counts */ > + u32 watchdog_threshold; > + > /** remap_slice: Bitmask of cache lines that need remapping */ > u8 remap_slice; > }; > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > index 348d81c40e81..26c50a6d6158 100644 > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > @@ -1419,12 +1419,6 @@ execbuf_submit(struct i915_execbuffer_params *params, > bool watchdog_running = false; > int ret; > > - /* > - * NB: Place-holder until watchdog timeout is enabled through DRM > - * interface > - */ > - bool enable_watchdog = false; > - > ret = i915_gem_execbuffer_move_to_gpu(params->request, vmas); > if (ret) > return ret; > @@ -1488,7 +1482,7 @@ execbuf_submit(struct i915_execbuffer_params *params, > } > > /* Start watchdog timer */ > - if (enable_watchdog) { > + if (params->ctx->watchdog_threshold != 0) { > if (!params->engine->emit_start_watchdog) > return -EINVAL; No. If we set a watchdog on a ctx, this then causes all use of BCS to fail. We need saner API than that - either per-engine thresholds in the ctx, or just document that the watchdog is only supported where available by hw. -Chris
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 99c46f4dbde6..c3748878e64c 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -440,6 +440,32 @@ i915_gem_context_create_gvt(struct drm_device *dev) return ctx; } +void i915_context_watchdog_setup(struct i915_gem_context *ctx, u32 value_in_ms) +{ + /* + * Based on time out value (ms) calculate + * timer count thresholds needed based on core frequency. + */ +#define TIMER_MILLISECOND 1000 + + /* + * Timestamp timer resolution = 0.080 uSec, + * or 12500000 counts per second + */ +#define TIMESTAMP_CNTS_PER_SEC_80NS 12500000 + + ctx->watchdog_threshold = + ((value_in_ms) * + ((TIMESTAMP_CNTS_PER_SEC_80NS) / (TIMER_MILLISECOND))); + + /* + * watchdog register must never be programmed to zero. This would + * cause the watchdog counter to exceed and not allow the engine to + * go into IDLE state + */ + GEM_BUG_ON(ctx->watchdog_threshold == 0); +} + int i915_gem_context_init(struct drm_i915_private *dev_priv) { struct i915_gem_context *ctx; @@ -1056,6 +1082,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, struct drm_i915_file_private *file_priv = file->driver_priv; struct drm_i915_gem_context_param *args = data; struct i915_gem_context *ctx; + struct intel_engine_cs *engine; int ret; ret = i915_mutex_lock_interruptible(dev); @@ -1090,6 +1117,15 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_BANNABLE: args->value = i915_gem_context_is_bannable(ctx); break; + case I915_CONTEXT_PARAM_WATCHDOG: + engine = to_i915(dev)->engine[VCS]; + if (!engine->emit_start_watchdog) + ret = -EINVAL; + else if (args->value) + ret = -EINVAL; + else + args->value = ctx->watchdog_threshold; + break; default: ret = -EINVAL; break; @@ -1105,6 +1141,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, struct drm_i915_file_private *file_priv = file->driver_priv; struct drm_i915_gem_context_param *args = data; struct i915_gem_context *ctx; + struct intel_engine_cs *engine; int ret; ret = i915_mutex_lock_interruptible(dev); @@ -1147,6 +1184,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, else i915_gem_context_clear_bannable(ctx); break; + case I915_CONTEXT_PARAM_WATCHDOG: + engine = to_i915(dev)->engine[VCS]; + if (!engine->emit_start_watchdog) + ret = -EINVAL; + else if (!args->value) + ret = -EINVAL; + else + i915_context_watchdog_setup(ctx, args->value); + break; default: ret = -EINVAL; break; diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 0ac750b90f3d..133ed7b413aa 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -176,6 +176,9 @@ struct i915_gem_context { /** ban_score: Accumulated score of all hangs caused by this context. */ int ban_score; + /** watchdog_threshold: hw watchdog threshold value, in clock counts */ + u32 watchdog_threshold; + /** remap_slice: Bitmask of cache lines that need remapping */ u8 remap_slice; }; diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 348d81c40e81..26c50a6d6158 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1419,12 +1419,6 @@ execbuf_submit(struct i915_execbuffer_params *params, bool watchdog_running = false; int ret; - /* - * NB: Place-holder until watchdog timeout is enabled through DRM - * interface - */ - bool enable_watchdog = false; - ret = i915_gem_execbuffer_move_to_gpu(params->request, vmas); if (ret) return ret; @@ -1488,7 +1482,7 @@ execbuf_submit(struct i915_execbuffer_params *params, } /* Start watchdog timer */ - if (enable_watchdog) { + if (params->ctx->watchdog_threshold != 0) { if (!params->engine->emit_start_watchdog) return -EINVAL; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 3554495bef13..e318c4f53a9e 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1289,6 +1289,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_GTT_SIZE 0x3 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4 #define I915_CONTEXT_PARAM_BANNABLE 0x5 +#define I915_CONTEXT_PARAM_WATCHDOG 0x6 __u64 value; };