diff mbox

[v6,18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

Message ID 5fca72c5-3d24-c688-7625-78f45d46ef5b@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michel Thierry April 20, 2017, 1:09 a.m. UTC
On 18/04/17 13:23, Michel Thierry wrote:
> Final enablement patch for GPU hang detection using watchdog timeout.
> Using the gem_context_setparam ioctl, users can specify the desired
> timeout value in microseconds, and the driver will do the conversion to
> 'timestamps'.
>
> The recommended default watchdog threshold for video engines is 60000 us,
> since this has been _empirically determined_ to be a good compromise for
> low-latency requirements and low rate of false positives. The default
> register value is ~106000us and the theoretical max value (all 1s) is
> 353 seconds.
>
> Note, UABI engine ids and i915 engine ids are different, and this patch
> uses the i915 ones. Some kind of mapping table [1] is required if we
> decide to use the UABI engine ids.
>
> [1] http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-chris@chris-wilson.co.uk
>
> v2: Fixed get api to return values in microseconds. Threshold updated to
> be per context engine. Check for u32 overflow. Capture ctx threshold
> value in error state.
>
> v3: Add a way to get array size, short-cut to disable all thresholds,
> return EFAULT / EINVAL as needed. Move the capture of the threshold
> value in the error state into a new patch. BXT has a different
> timestamp base (because why not?).
>
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  29 +++++++++
>  drivers/gpu/drm/i915/i915_gem_context.c | 102 ++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.c        |   5 +-
>  include/uapi/drm/i915_drm.h             |   1 +
>  4 files changed, 135 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 203f2112dd18..f65a236fddef 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3574,6 +3574,35 @@ i915_gem_context_lookup_timeline(struct i915_gem_context *ctx,
>         return &vm->timeline.engine[engine->id];
>  }
>
> +/*
> + * BDW & SKL+ Timestamp timer resolution = 0.080 uSec,
> + * or 12500000 counts per second, or ~12 counts per microsecond.
> + *
> + * But Broxton Timestamp timer resolution is different, 0.052 uSec,
> + * or 19200000 counts per second, or ~19 counts per microsecond.
> + */
> +#define SKL_TIMESTAMP_CNTS_PER_USEC 12
> +#define BXT_TIMESTAMP_CNTS_PER_USEC 19
> +#define TIMESTAMP_CNTS_PER_USEC(dev_priv) (IS_BROXTON(dev_priv) ? \
> +                                          BXT_TIMESTAMP_CNTS_PER_USEC : \
> +                                          SKL_TIMESTAMP_CNTS_PER_USEC)
> +static inline u32
> +watchdog_to_us(struct drm_i915_private *dev_priv, u32 value_in_clock_counts)
> +{
> +       return value_in_clock_counts / TIMESTAMP_CNTS_PER_USEC(dev_priv);
> +}
> +
> +static inline u32
> +watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us)
> +{
> +       u64 threshold = value_in_us * TIMESTAMP_CNTS_PER_USEC(dev_priv);
> +
> +       if (overflows_type(threshold, u32))
> +               return -EINVAL;
> +
> +       return threshold;
> +}
> +
>  int i915_perf_open_ioctl(struct drm_device *dev, void *data,
>                          struct drm_file *file);
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index edbed85a1c88..85a6467a25a6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -422,6 +422,102 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>         return ctx;
>  }
>
> +/* Return the timer count threshold in microseconds. */
> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
> +                                 struct drm_i915_gem_context_param *args)
> +{
> +       struct drm_i915_private *dev_priv = ctx->i915;
> +       struct intel_engine_cs *engine;
> +       enum intel_engine_id id;
> +       u32 threshold_in_us[I915_NUM_ENGINES];
> +
> +       if (args->size == 0)
> +               goto out;
> +
> +       if (args->size < sizeof(threshold_in_us))
> +               return -EFAULT;
> +
> +       if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +               return -ENODEV;
> +
> +       for_each_engine(engine, dev_priv, id) {
> +               struct intel_context *ce = &ctx->engine[id];
> +
> +               threshold_in_us[id] = watchdog_to_us(dev_priv,
> +                                                    ce->watchdog_threshold);
> +       }
> +
> +       mutex_unlock(&dev_priv->drm.struct_mutex);
> +       if (__copy_to_user(u64_to_user_ptr(args->value),
> +                          &threshold_in_us,
> +                          sizeof(threshold_in_us))) {
> +               mutex_lock(&dev_priv->drm.struct_mutex);
> +               return -EFAULT;
> +       }
> +       mutex_lock(&dev_priv->drm.struct_mutex);
> +
> +out:
> +       args->size = sizeof(threshold_in_us);
> +
> +       return 0;
> +}
> +
> +/*
> + * Based on time out value in microseconds (us) calculate
> + * timer count thresholds needed based on core frequency.
> + * Watchdog can be disabled by setting it to 0.
> + */
> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
> +                                 struct drm_i915_gem_context_param *args)
> +{
> +       struct drm_i915_private *dev_priv = ctx->i915;
> +       struct intel_engine_cs *engine;
> +       enum intel_engine_id id;
> +       u32 threshold[I915_NUM_ENGINES];
> +
> +       memset(&threshold, 0, sizeof(threshold));
> +
> +       /* shortcut to disable in all engines */
> +       if (args->size == 0)
> +               goto set_watchdog;
> +
> +       if (args->size < sizeof(threshold))
> +               return -EFAULT;
> +
> +       if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +               return -ENODEV;
> +
> +       mutex_unlock(&dev_priv->drm.struct_mutex);
> +       if (copy_from_user(&threshold,
> +                          u64_to_user_ptr(args->value),
> +                          sizeof(threshold))) {
> +               mutex_lock(&dev_priv->drm.struct_mutex);
> +               return -EFAULT;
> +       }
> +       mutex_lock(&dev_priv->drm.struct_mutex);
> +
> +       /* not supported in blitter engine */
> +       if (threshold[BCS] != 0)
> +               return -EINVAL;
> +
> +       for_each_engine(engine, dev_priv, id) {
> +               threshold[id] = watchdog_to_clock_counts(dev_priv,
> +                                                        threshold[id]);
> +
> +               if (threshold[id] == -EINVAL)
> +                       return -EINVAL;
> +       }
> +
> +set_watchdog:
> +       for_each_engine(engine, dev_priv, id) {
> +               struct intel_context *ce = &ctx->engine[id];
> +
> +               ce->watchdog_threshold = threshold[id];
> +       }
> +
> +       return 0;
> +}
> +
>  int i915_gem_context_init(struct drm_i915_private *dev_priv)
>  {
>         struct i915_gem_context *ctx;

This patch is missing:

  	case I915_CONTEXT_PARAM_BAN_PERIOD:
  		ret = -EINVAL;

Or there will be no way to get the current thresholds (chunk was missed 
due to some TRTT code nearby). I'll be sure to include it in the next 
version.

> @@ -1061,6 +1157,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>         case I915_CONTEXT_PARAM_BANNABLE:
>                 args->value = i915_gem_context_is_bannable(ctx);
>                 break;
> +       case I915_CONTEXT_PARAM_WATCHDOG:
> +               ret = i915_gem_context_get_watchdog(ctx, args);
> +               break;
>         default:
>                 ret = -EINVAL;
>                 break;
> @@ -1118,6 +1217,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>                 else
>                         i915_gem_context_clear_bannable(ctx);
>                 break;
> +       case I915_CONTEXT_PARAM_WATCHDOG:
> +               ret = i915_gem_context_set_watchdog(ctx, args);
> +               break;
>         default:
>                 ret = -EINVAL;
>                 break;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 7a202e73ce9b..f4e05531a9a0 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1496,7 +1496,7 @@ static inline u32 get_watchdog_disable(struct intel_engine_cs *engine)
>                 return GEN8_XCS_WATCHDOG_DISABLE;
>  }
>
> -#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function
> +#define GEN8_WATCHDOG_1000US(dev_priv) watchdog_to_clock_counts(dev_priv, 1000)
>  static void gen8_watchdog_irq_handler(unsigned long data)
>  {
>         struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> @@ -1526,7 +1526,8 @@ static void gen8_watchdog_irq_handler(unsigned long data)
>         } else {
>                 engine->hangcheck.watchdog = current_seqno;
>                 /* Re-start the counter, if really hung, it will expire again */
> -               I915_WRITE_FW(RING_THRESH(engine->mmio_base), GEN8_WATCHDOG_1000US);
> +               I915_WRITE_FW(RING_THRESH(engine->mmio_base),
> +                             GEN8_WATCHDOG_1000US(dev_priv));
>                 I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE);
>         }
>
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index fadedefba6db..18bc0ec618dd 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1308,6 +1308,7 @@ struct drm_i915_gem_context_param {
>  #define I915_CONTEXT_PARAM_GTT_SIZE    0x3
>  #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE    0x4
>  #define I915_CONTEXT_PARAM_BANNABLE    0x5
> +#define I915_CONTEXT_PARAM_WATCHDOG    0x6
>         __u64 value;
>  };
>
> --
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
>

Comments

Chris Wilson April 20, 2017, 8:52 a.m. UTC | #1
On Wed, Apr 19, 2017 at 06:09:00PM -0700, Michel Thierry wrote:
> This patch is missing:
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index c1013af0b910..a8bdea43a217 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -1135,7 +1135,7 @@ int i915_gem_context_getparam_ioctl(struct
> drm_device *dev, void *data,
>  		return PTR_ERR(ctx);
>  	}
> 
> -	args->size = 0;
> +	args->size = (args->param != I915_CONTEXT_PARAM_WATCHDOG) ? 0 :
> args->size;
>  	switch (args->param) {
>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>  		ret = -EINVAL;
> 
> Or there will be no way to get the current thresholds (chunk was
> missed due to some TRTT code nearby). I'll be sure to include it in
> the next version.

No. It is always preset to 0. The PARAM should set it to the actual
struct size (it would write) and *not* the user's size.
-Chris
Michel Thierry April 20, 2017, 5:19 p.m. UTC | #2
On 20/04/17 01:52, Chris Wilson wrote:
> On Wed, Apr 19, 2017 at 06:09:00PM -0700, Michel Thierry wrote:
>> This patch is missing:
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
>> b/drivers/gpu/drm/i915/i915_gem_context.c
>> index c1013af0b910..a8bdea43a217 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -1135,7 +1135,7 @@ int i915_gem_context_getparam_ioctl(struct
>> drm_device *dev, void *data,
>>  		return PTR_ERR(ctx);
>>  	}
>>
>> -	args->size = 0;
>> +	args->size = (args->param != I915_CONTEXT_PARAM_WATCHDOG) ? 0 :
>> args->size;
>>  	switch (args->param) {
>>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>>  		ret = -EINVAL;
>>
>> Or there will be no way to get the current thresholds (chunk was
>> missed due to some TRTT code nearby). I'll be sure to include it in
>> the next version.
>
> No. It is always preset to 0. The PARAM should set it to the actual
> struct size (it would write) and *not* the user's size.
> -Chris
>

Ok, then I'll change the shortcut in get_watchdog, because as it is you 
can query the size, but not the thresholds.

int i915_gem_context_get_watchdog()
{
...
	if (args->size == 0)
		goto out;
...
out:
	args->size = sizeof(threshold_in_us);

	return 0;
}
}
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index c1013af0b910..a8bdea43a217 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1135,7 +1135,7 @@  int i915_gem_context_getparam_ioctl(struct 
drm_device *dev, void *data,
  		return PTR_ERR(ctx);
  	}

-	args->size = 0;
+	args->size = (args->param != I915_CONTEXT_PARAM_WATCHDOG) ? 0 : 
args->size;
  	switch (args->param) {