Message ID | 20170418202335.35232-19-michel.thierry@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Apr 18, 2017 at 01:23:33PM -0700, Michel Thierry wrote: > Final enablement patch for GPU hang detection using watchdog timeout. > Using the gem_context_setparam ioctl, users can specify the desired > timeout value in microseconds, and the driver will do the conversion to > 'timestamps'. > > The recommended default watchdog threshold for video engines is 60000 us, > since this has been _empirically determined_ to be a good compromise for > low-latency requirements and low rate of false positives. The default > register value is ~106000us and the theoretical max value (all 1s) is > 353 seconds. > > Note, UABI engine ids and i915 engine ids are different, and this patch > uses the i915 ones. Some kind of mapping table [1] is required if we > decide to use the UABI engine ids. > > [1] http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-chris@chris-wilson.co.uk > > v2: Fixed get api to return values in microseconds. Threshold updated to > be per context engine. Check for u32 overflow. Capture ctx threshold > value in error state. > > v3: Add a way to get array size, short-cut to disable all thresholds, > return EFAULT / EINVAL as needed. Move the capture of the threshold > value in the error state into a new patch. BXT has a different > timestamp base (because why not?). > > Signed-off-by: Tomas Elf <tomas.elf@intel.com> > Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com> > Signed-off-by: Michel Thierry <michel.thierry@intel.com> > --- > drivers/gpu/drm/i915/i915_drv.h | 29 +++++++++ > drivers/gpu/drm/i915/i915_gem_context.c | 102 ++++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/intel_lrc.c | 5 +- > include/uapi/drm/i915_drm.h | 1 + > 4 files changed, 135 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 203f2112dd18..f65a236fddef 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -3574,6 +3574,35 @@ i915_gem_context_lookup_timeline(struct i915_gem_context *ctx, > return &vm->timeline.engine[engine->id]; > } > > +/* > + * BDW & SKL+ Timestamp timer resolution = 0.080 uSec, > + * or 12500000 counts per second, or ~12 counts per microsecond. > + * > + * But Broxton Timestamp timer resolution is different, 0.052 uSec, > + * or 19200000 counts per second, or ~19 counts per microsecond. > + */ > +#define SKL_TIMESTAMP_CNTS_PER_USEC 12 > +#define BXT_TIMESTAMP_CNTS_PER_USEC 19 > +#define TIMESTAMP_CNTS_PER_USEC(dev_priv) (IS_BROXTON(dev_priv) ? \ > + BXT_TIMESTAMP_CNTS_PER_USEC : \ > + SKL_TIMESTAMP_CNTS_PER_USEC) > +static inline u32 > +watchdog_to_us(struct drm_i915_private *dev_priv, u32 value_in_clock_counts) > +{ > + return value_in_clock_counts / TIMESTAMP_CNTS_PER_USEC(dev_priv); > +} > + > +static inline u32 > +watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us) > +{ > + u64 threshold = value_in_us * TIMESTAMP_CNTS_PER_USEC(dev_priv); > + > + if (overflows_type(threshold, u32)) > + return -EINVAL; > + > + return threshold; > +} > + > int i915_perf_open_ioctl(struct drm_device *dev, void *data, > struct drm_file *file); > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c > index edbed85a1c88..85a6467a25a6 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > @@ -422,6 +422,102 @@ i915_gem_context_create_gvt(struct drm_device *dev) > return ctx; > } > > +/* Return the timer count threshold in microseconds. */ > +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx, > + struct drm_i915_gem_context_param *args) > +{ > + struct drm_i915_private *dev_priv = ctx->i915; > + struct intel_engine_cs *engine; > + enum intel_engine_id id; > + u32 threshold_in_us[I915_NUM_ENGINES]; > + > + if (args->size == 0) > + goto out; > + > + if (args->size < sizeof(threshold_in_us)) > + return -EFAULT; > + > + if (!dev_priv->engine[VCS]->emit_start_watchdog) > + return -ENODEV; > + > + for_each_engine(engine, dev_priv, id) { > + struct intel_context *ce = &ctx->engine[id]; > + > + threshold_in_us[id] = watchdog_to_us(dev_priv, > + ce->watchdog_threshold); > + } > + > + mutex_unlock(&dev_priv->drm.struct_mutex); > + if (__copy_to_user(u64_to_user_ptr(args->value), > + &threshold_in_us, > + sizeof(threshold_in_us))) { > + mutex_lock(&dev_priv->drm.struct_mutex); > + return -EFAULT; > + } > + mutex_lock(&dev_priv->drm.struct_mutex); > + > +out: > + args->size = sizeof(threshold_in_us); > + > + return 0; > +} > + > +/* > + * Based on time out value in microseconds (us) calculate > + * timer count thresholds needed based on core frequency. > + * Watchdog can be disabled by setting it to 0. > + */ > +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx, > + struct drm_i915_gem_context_param *args) > +{ > + struct drm_i915_private *dev_priv = ctx->i915; > + struct intel_engine_cs *engine; > + enum intel_engine_id id; > + u32 threshold[I915_NUM_ENGINES]; > + > + memset(&threshold, 0, sizeof(threshold)); > + memset(threshold, 0, sizeof(threshold)); > + /* shortcut to disable in all engines */ > + if (args->size == 0) > + goto set_watchdog; > + > + if (args->size < sizeof(threshold)) > + return -EFAULT; > + > + if (!dev_priv->engine[VCS]->emit_start_watchdog) > + return -ENODEV; > + > + mutex_unlock(&dev_priv->drm.struct_mutex); > + if (copy_from_user(&threshold, if (copy_from_user(threshold, > + u64_to_user_ptr(args->value), > + sizeof(threshold))) { > + mutex_lock(&dev_priv->drm.struct_mutex); > + return -EFAULT; > + } > + mutex_lock(&dev_priv->drm.struct_mutex); > + > + /* not supported in blitter engine */ > + if (threshold[BCS] != 0) > + return -EINVAL; > + > + for_each_engine(engine, dev_priv, id) { > + threshold[id] = watchdog_to_clock_counts(dev_priv, > + threshold[id]); > + > + if (threshold[id] == -EINVAL) > + return -EINVAL; > + } > + > +set_watchdog: > + for_each_engine(engine, dev_priv, id) { > + struct intel_context *ce = &ctx->engine[id]; > + > + ce->watchdog_threshold = threshold[id]; > + } > + > + return 0; > +} > + > int i915_gem_context_init(struct drm_i915_private *dev_priv) > { > struct i915_gem_context *ctx; > @@ -1061,6 +1157,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, > case I915_CONTEXT_PARAM_BANNABLE: > args->value = i915_gem_context_is_bannable(ctx); > break; > + case I915_CONTEXT_PARAM_WATCHDOG: > + ret = i915_gem_context_get_watchdog(ctx, args); > + break; > default: > ret = -EINVAL; > break; > @@ -1118,6 +1217,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, > else > i915_gem_context_clear_bannable(ctx); > break; > + case I915_CONTEXT_PARAM_WATCHDOG: > + ret = i915_gem_context_set_watchdog(ctx, args); > + break; > default: > ret = -EINVAL; > break; > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index 7a202e73ce9b..f4e05531a9a0 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -1496,7 +1496,7 @@ static inline u32 get_watchdog_disable(struct intel_engine_cs *engine) > return GEN8_XCS_WATCHDOG_DISABLE; > } > > -#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function > +#define GEN8_WATCHDOG_1000US(dev_priv) watchdog_to_clock_counts(dev_priv, 1000) > static void gen8_watchdog_irq_handler(unsigned long data) > { > struct intel_engine_cs *engine = (struct intel_engine_cs *)data; > @@ -1526,7 +1526,8 @@ static void gen8_watchdog_irq_handler(unsigned long data) > } else { > engine->hangcheck.watchdog = current_seqno; > /* Re-start the counter, if really hung, it will expire again */ > - I915_WRITE_FW(RING_THRESH(engine->mmio_base), GEN8_WATCHDOG_1000US); > + I915_WRITE_FW(RING_THRESH(engine->mmio_base), > + GEN8_WATCHDOG_1000US(dev_priv)); > I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE); > } > > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > index fadedefba6db..18bc0ec618dd 100644 > --- a/include/uapi/drm/i915_drm.h > +++ b/include/uapi/drm/i915_drm.h > @@ -1308,6 +1308,7 @@ struct drm_i915_gem_context_param { > #define I915_CONTEXT_PARAM_GTT_SIZE 0x3 > #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4 > #define I915_CONTEXT_PARAM_BANNABLE 0x5 > +#define I915_CONTEXT_PARAM_WATCHDOG 0x6 > __u64 value; > }; > > -- > 2.11.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
On 18/04/17 13:23, Michel Thierry wrote: > Final enablement patch for GPU hang detection using watchdog timeout. > Using the gem_context_setparam ioctl, users can specify the desired > timeout value in microseconds, and the driver will do the conversion to > 'timestamps'. > > The recommended default watchdog threshold for video engines is 60000 us, > since this has been _empirically determined_ to be a good compromise for > low-latency requirements and low rate of false positives. The default > register value is ~106000us and the theoretical max value (all 1s) is > 353 seconds. > > Note, UABI engine ids and i915 engine ids are different, and this patch > uses the i915 ones. Some kind of mapping table [1] is required if we > decide to use the UABI engine ids. > > [1] http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-chris@chris-wilson.co.uk Now that we have class and instance definitions, could it be worth to use those instead to index the engines? If we pair it with the engine discovery ioctl that Tvrtko proposed we could have something that is reasonably future-proof. Thanks, Daniele
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 203f2112dd18..f65a236fddef 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3574,6 +3574,35 @@ i915_gem_context_lookup_timeline(struct i915_gem_context *ctx, return &vm->timeline.engine[engine->id]; } +/* + * BDW & SKL+ Timestamp timer resolution = 0.080 uSec, + * or 12500000 counts per second, or ~12 counts per microsecond. + * + * But Broxton Timestamp timer resolution is different, 0.052 uSec, + * or 19200000 counts per second, or ~19 counts per microsecond. + */ +#define SKL_TIMESTAMP_CNTS_PER_USEC 12 +#define BXT_TIMESTAMP_CNTS_PER_USEC 19 +#define TIMESTAMP_CNTS_PER_USEC(dev_priv) (IS_BROXTON(dev_priv) ? \ + BXT_TIMESTAMP_CNTS_PER_USEC : \ + SKL_TIMESTAMP_CNTS_PER_USEC) +static inline u32 +watchdog_to_us(struct drm_i915_private *dev_priv, u32 value_in_clock_counts) +{ + return value_in_clock_counts / TIMESTAMP_CNTS_PER_USEC(dev_priv); +} + +static inline u32 +watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us) +{ + u64 threshold = value_in_us * TIMESTAMP_CNTS_PER_USEC(dev_priv); + + if (overflows_type(threshold, u32)) + return -EINVAL; + + return threshold; +} + int i915_perf_open_ioctl(struct drm_device *dev, void *data, struct drm_file *file); diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index edbed85a1c88..85a6467a25a6 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -422,6 +422,102 @@ i915_gem_context_create_gvt(struct drm_device *dev) return ctx; } +/* Return the timer count threshold in microseconds. */ +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx, + struct drm_i915_gem_context_param *args) +{ + struct drm_i915_private *dev_priv = ctx->i915; + struct intel_engine_cs *engine; + enum intel_engine_id id; + u32 threshold_in_us[I915_NUM_ENGINES]; + + if (args->size == 0) + goto out; + + if (args->size < sizeof(threshold_in_us)) + return -EFAULT; + + if (!dev_priv->engine[VCS]->emit_start_watchdog) + return -ENODEV; + + for_each_engine(engine, dev_priv, id) { + struct intel_context *ce = &ctx->engine[id]; + + threshold_in_us[id] = watchdog_to_us(dev_priv, + ce->watchdog_threshold); + } + + mutex_unlock(&dev_priv->drm.struct_mutex); + if (__copy_to_user(u64_to_user_ptr(args->value), + &threshold_in_us, + sizeof(threshold_in_us))) { + mutex_lock(&dev_priv->drm.struct_mutex); + return -EFAULT; + } + mutex_lock(&dev_priv->drm.struct_mutex); + +out: + args->size = sizeof(threshold_in_us); + + return 0; +} + +/* + * Based on time out value in microseconds (us) calculate + * timer count thresholds needed based on core frequency. + * Watchdog can be disabled by setting it to 0. + */ +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx, + struct drm_i915_gem_context_param *args) +{ + struct drm_i915_private *dev_priv = ctx->i915; + struct intel_engine_cs *engine; + enum intel_engine_id id; + u32 threshold[I915_NUM_ENGINES]; + + memset(&threshold, 0, sizeof(threshold)); + + /* shortcut to disable in all engines */ + if (args->size == 0) + goto set_watchdog; + + if (args->size < sizeof(threshold)) + return -EFAULT; + + if (!dev_priv->engine[VCS]->emit_start_watchdog) + return -ENODEV; + + mutex_unlock(&dev_priv->drm.struct_mutex); + if (copy_from_user(&threshold, + u64_to_user_ptr(args->value), + sizeof(threshold))) { + mutex_lock(&dev_priv->drm.struct_mutex); + return -EFAULT; + } + mutex_lock(&dev_priv->drm.struct_mutex); + + /* not supported in blitter engine */ + if (threshold[BCS] != 0) + return -EINVAL; + + for_each_engine(engine, dev_priv, id) { + threshold[id] = watchdog_to_clock_counts(dev_priv, + threshold[id]); + + if (threshold[id] == -EINVAL) + return -EINVAL; + } + +set_watchdog: + for_each_engine(engine, dev_priv, id) { + struct intel_context *ce = &ctx->engine[id]; + + ce->watchdog_threshold = threshold[id]; + } + + return 0; +} + int i915_gem_context_init(struct drm_i915_private *dev_priv) { struct i915_gem_context *ctx; @@ -1061,6 +1157,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_BANNABLE: args->value = i915_gem_context_is_bannable(ctx); break; + case I915_CONTEXT_PARAM_WATCHDOG: + ret = i915_gem_context_get_watchdog(ctx, args); + break; default: ret = -EINVAL; break; @@ -1118,6 +1217,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, else i915_gem_context_clear_bannable(ctx); break; + case I915_CONTEXT_PARAM_WATCHDOG: + ret = i915_gem_context_set_watchdog(ctx, args); + break; default: ret = -EINVAL; break; diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 7a202e73ce9b..f4e05531a9a0 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1496,7 +1496,7 @@ static inline u32 get_watchdog_disable(struct intel_engine_cs *engine) return GEN8_XCS_WATCHDOG_DISABLE; } -#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function +#define GEN8_WATCHDOG_1000US(dev_priv) watchdog_to_clock_counts(dev_priv, 1000) static void gen8_watchdog_irq_handler(unsigned long data) { struct intel_engine_cs *engine = (struct intel_engine_cs *)data; @@ -1526,7 +1526,8 @@ static void gen8_watchdog_irq_handler(unsigned long data) } else { engine->hangcheck.watchdog = current_seqno; /* Re-start the counter, if really hung, it will expire again */ - I915_WRITE_FW(RING_THRESH(engine->mmio_base), GEN8_WATCHDOG_1000US); + I915_WRITE_FW(RING_THRESH(engine->mmio_base), + GEN8_WATCHDOG_1000US(dev_priv)); I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE); } diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index fadedefba6db..18bc0ec618dd 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1308,6 +1308,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_GTT_SIZE 0x3 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4 #define I915_CONTEXT_PARAM_BANNABLE 0x5 +#define I915_CONTEXT_PARAM_WATCHDOG 0x6 __u64 value; };