
drm/i915: optionally ban context on first hang

Message ID 1378819010-5173-1-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived

Commit Message

Mika Kuoppala Sept. 10, 2013, 1:16 p.m. UTC
Current policy is to ban a context if it manages to hang the
GPU within a certain time window. Paul Berry asked whether a
stricter policy could be made available for use cases where
the application doesn't know if the rendering command stream
sent to the GPU is valid or not.

Provide an option, a flag set at context creation time, to let
userspace choose a stricter policy for handling GPU hangs for
this context. If a context with this flag set ever hangs the GPU,
it will be permanently banned from accessing the GPU.
All subsequent batch submissions will return -EIO.

Requested-by: Paul Berry <stereotype441@gmail.com>
Cc: Paul Berry <stereotype441@gmail.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c         |    3 +++
 drivers/gpu/drm/i915/i915_drv.h         |    3 +++
 drivers/gpu/drm/i915/i915_gem.c         |    9 ++++++++-
 drivers/gpu/drm/i915/i915_gem_context.c |   12 +++++++++---
 include/uapi/drm/i915_drm.h             |    5 +++++
 5 files changed, 28 insertions(+), 4 deletions(-)
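
For illustration only (not part of the patch): a minimal userspace sketch of
opting into the stricter policy with the proposed uAPI. It assumes libdrm's
drmIoctl() and an already-open DRM fd, and uses the flag and the extended
struct exactly as added below; the returned id is what gets passed in
execbuffer2.rsvd1.

#include <string.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Create a context that is permanently banned after its first GPU hang.
 * Returns the new context id, or 0 on failure (e.g. -EINVAL from a kernel
 * that rejects unknown creation flags). */
static __u32 create_ban_on_hang_context(int fd)
{
	struct drm_i915_gem_context_create create;

	memset(&create, 0, sizeof(create));
	create.flags = I915_CONTEXT_BAN_ON_HANG;

	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create))
		return 0;

	return create.ctx_id;
}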

Comments

Chris Wilson Sept. 10, 2013, 1:26 p.m. UTC | #1
On Tue, Sep 10, 2013 at 04:16:50PM +0300, Mika Kuoppala wrote:
> Current policy is to ban context if it manages to hang
> gpu in a certain time windows. Paul Berry asked if more
> strict policy could be available for use cases where
> the application doesn't know if the rendering command stream
> sent to gpu is valid or not.
> 
> Provide an option, flag on context creation time, to let
> userspace to set more strict policy for handling gpu hangs for
> this context. If context with this flag set ever hangs the gpu,
> it will be permanently banned from accessing the GPU.
> All subsequent batch submissions will return -EIO.
> 
> Requested-by: Paul Berry <stereotype441@gmail.com>
> Cc: Paul Berry <stereotype441@gmail.com>
> Cc: Ben Widawsky <ben@bwidawsk.net>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c         |    3 +++
>  drivers/gpu/drm/i915/i915_drv.h         |    3 +++
>  drivers/gpu/drm/i915/i915_gem.c         |    9 ++++++++-
>  drivers/gpu/drm/i915/i915_gem_context.c |   12 +++++++++---
>  include/uapi/drm/i915_drm.h             |    5 +++++
>  5 files changed, 28 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 3de6050..4353458 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1003,6 +1003,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
>  	case I915_PARAM_HAS_EXEC_HANDLE_LUT:
>  		value = 1;
>  		break;
> +	case I915_PARAM_HAS_CONTEXT_BAN:
> +		value = 1;
> +		break;

As we add the flags, we have a better method for detecting whether
context creation accepts them (just request that a first-ban context be
created and treat failure as unsupported), and so the getparam is
redundant.
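
Something along those lines, i.e. probing by construction (rough sketch
only, same headers as the example above; note that a kernel predating the
flags field would likely just ignore the extra bytes and succeed, so the
probe is only meaningful once a flags-aware kernel can reject unknown bits
with -EINVAL, as this patch does):

/* Probe for I915_CONTEXT_BAN_ON_HANG by simply asking for such a context
 * and treating failure as unsupported. */
static int has_context_ban_on_hang(int fd)
{
	struct drm_i915_gem_context_create create;
	struct drm_i915_gem_context_destroy destroy;

	memset(&create, 0, sizeof(create));
	create.flags = I915_CONTEXT_BAN_ON_HANG;
	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create))
		return 0;	/* unsupported (or contexts disabled) */

	memset(&destroy, 0, sizeof(destroy));
	destroy.ctx_id = create.ctx_id;
	drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy);
	return 1;
}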

>  struct drm_i915_gem_context_create {
>  	/*  output: id of new context*/
>  	__u32 ctx_id;
>  	__u32 pad;
> +	__u64 flags;
>  };

I thought that the size of the ioctl was part of the ABI, but it does
look like extending it as you have done here is valid. TIL.
-Chris
Ville Syrjala Sept. 10, 2013, 1:59 p.m. UTC | #2
On Tue, Sep 10, 2013 at 02:26:51PM +0100, Chris Wilson wrote:
> On Tue, Sep 10, 2013 at 04:16:50PM +0300, Mika Kuoppala wrote:
> > Current policy is to ban context if it manages to hang
> > gpu in a certain time windows. Paul Berry asked if more
> > strict policy could be available for use cases where
> > the application doesn't know if the rendering command stream
> > sent to gpu is valid or not.
> > 
> > Provide an option, flag on context creation time, to let
> > userspace to set more strict policy for handling gpu hangs for
> > this context. If context with this flag set ever hangs the gpu,
> > it will be permanently banned from accessing the GPU.
> > All subsequent batch submissions will return -EIO.
> > 
> > Requested-by: Paul Berry <stereotype441@gmail.com>
> > Cc: Paul Berry <stereotype441@gmail.com>
> > Cc: Ben Widawsky <ben@bwidawsk.net>
> > Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_dma.c         |    3 +++
> >  drivers/gpu/drm/i915/i915_drv.h         |    3 +++
> >  drivers/gpu/drm/i915/i915_gem.c         |    9 ++++++++-
> >  drivers/gpu/drm/i915/i915_gem_context.c |   12 +++++++++---
> >  include/uapi/drm/i915_drm.h             |    5 +++++
> >  5 files changed, 28 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > index 3de6050..4353458 100644
> > --- a/drivers/gpu/drm/i915/i915_dma.c
> > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > @@ -1003,6 +1003,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
> >  	case I915_PARAM_HAS_EXEC_HANDLE_LUT:
> >  		value = 1;
> >  		break;
> > +	case I915_PARAM_HAS_CONTEXT_BAN:
> > +		value = 1;
> > +		break;
> 
> As we add the flags, we have a better method for detecting whether the
> context accepts the flags (just request that a first-ban context be
> created and mark the failure as unsupported), and so the getparam is
> redundant.
> 
> >  struct drm_i915_gem_context_create {
> >  	/*  output: id of new context*/
> >  	__u32 ctx_id;
> >  	__u32 pad;
> > +	__u64 flags;
> >  };
> 
> I thought that the size of the ioctl was part of the ABI, but it does
> look like extending it as you have done here is valid. TIL.

Yeah, it does look like drm_ioctl() allows it, but only for driver
ioctls. For drm core ioctls the kernel still accepts the ioctl, but it
gets the size from the kernel's ioctl->cmd. So depending on the case the
kernel may read garbage from userspace, overwrite some other userspace
data, not touch some of the data userspace was offering, or just give
back -EFAULT. I guess that's all fine since userspace that does stuff
like that is already buggy.
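
For reference, the size really is baked into the request number by the
standard ioctl encoding, which is why it looks like ABI: DRM_IOWR() folds
sizeof(struct drm_i915_gem_context_create) into
DRM_IOCTL_I915_GEM_CONTEXT_CREATE, so the value changes when the struct
grows, and drm_ioctl() has to reconcile the userspace size with the size
the kernel was built with. A tiny sketch (assuming the uapi headers are on
the include path):

#include <stdio.h>
#include <linux/ioctl.h>
#include <drm/i915_drm.h>

int main(void)
{
	/* _IOC_SIZE() extracts the argument size that DRM_IOWR() encoded into
	 * the request, so old and new userspace pass different request values
	 * for "the same" context-create ioctl. */
	printf("CONTEXT_CREATE argument size: %u bytes\n",
	       (unsigned int)_IOC_SIZE(DRM_IOCTL_I915_GEM_CONTEXT_CREATE));
	return 0;
}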
Paul Berry Sept. 10, 2013, 6:11 p.m. UTC | #3
On 10 September 2013 06:16, Mika Kuoppala <mika.kuoppala@linux.intel.com> wrote:

> Current policy is to ban context if it manages to hang
> gpu in a certain time windows. Paul Berry asked if more
> strict policy could be available for use cases where
> the application doesn't know if the rendering command stream
> sent to gpu is valid or not.
>
> Provide an option, flag on context creation time, to let
> userspace to set more strict policy for handling gpu hangs for
> this context. If context with this flag set ever hangs the gpu,
> it will be permanently banned from accessing the GPU.
> All subsequent batch submissions will return -EIO.
>
> Requested-by: Paul Berry <stereotype441@gmail.com>
> Cc: Paul Berry <stereotype441@gmail.com>
> Cc: Ben Widawsky <ben@bwidawsk.net>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>

(Cc-ing Ian since this impacts ARB_robustness, which he's been working on).

To clarify my reasons for requesting this feature, it's not necessarily for
use cases where the application doesn't know if the rendering command
stream is valid.  Rather, it's for any case where there is the risk of a
GPU hang (this might happen even if the command stream is valid, for
example because of an infinite loop in a WebGL shader).  Since the user
mode application (Mesa in my example) assumes that each batch buffer runs
to completion before the next batch buffer runs, it frequently includes
commands in batch buffer N that rely on state established by commands in
batch buffer N-1.  If batch buffer N-1 was interrupted due to a GPU hang,
then some of its state updates may not have completed, resulting in a
sizeable risk that batch buffer N (and a potentially unlimited number of
subsequent batches) will produce a GPU hang also.  The only reliable way to
recover from this situation is for Mesa to send a new batch buffer that
sets up the GPU state from scratch rather than relying on state established
in previous batch buffers.

Since Mesa doesn't wait for batch buffer N-1 to complete before submitting
batch buffer N, once a GPU hang occurs the kernel must regard any
subsequent buffers as suspect, until it receives some notification from
Mesa that the next batch is going to set up the GPU state from scratch.
When we met in June, we decided that the notification mechanism would be
for Mesa to stop using the context that caused the GPU hang, and create a
new context.  The first batch buffer sent to the new context would (of
necessity) set up the GPU state from scratch.  Consequently, all the kernel
needs to do to implement the new policy is to permanently ban any context
involved in a GPU hang.
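
For what it's worth, the Mesa-side recovery then reduces to roughly the
sketch below (the helper names are made up for illustration, this is not
actual Mesa code): drop the banned context, create a fresh one, and mark
all cached hardware state dirty so the next batch re-emits it from scratch.

static void recover_from_gpu_hang(struct hypothetical_ctx *ctx)
{
	/* The old context is banned; -EIO from execbuffer2 told us so. */
	destroy_hw_context(ctx->fd, ctx->hw_ctx_id);
	ctx->hw_ctx_id = create_hw_context(ctx->fd);

	/* Nothing established by earlier batches may be relied upon. */
	ctx->state_dirty = ~0ull;
}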

Question, since I'm not terribly familiar with the kernel code: is it
possible for the ring buffer to contain batches belonging to multiple
contexts at a time?  If so, then what happens if a GPU hang occurs?  For
instance, let's say that the ring contains batch A1 from context A followed
by batch B2 from context B.  What happens if a GPU hang occurs while
executing batch A1?  Ideally the kernel would consider only context A to
have been involved in the GPU hang, and automatically re-submit batch B2 so
that context B is not affected by the hang.  Less ideally, but still ok,
would be for the kernel to consider both contexts A and B to be involved in
the GPU hang, and apply both contexts' banning policies.  If, however, the
kernel considered only context A to be involved in the GPU hang, but failed
to re-submit batch B2, then that would risk future GPU hangs from context
B, since a future batch B3 from context B would likely rely on state that
should have been established by batch B2.

Assuming the above question has been addressed, this looks reasonable to
me.  Thank you.



Mika Kuoppala Sept. 11, 2013, 2:50 p.m. UTC | #4
Paul Berry <stereotype441@gmail.com> writes:

> On 10 September 2013 06:16, Mika Kuoppala <mika.kuoppala@linux.intel.com>wrote:
>
>> Current policy is to ban context if it manages to hang
>> gpu in a certain time windows. Paul Berry asked if more
>> strict policy could be available for use cases where
>> the application doesn't know if the rendering command stream
>> sent to gpu is valid or not.
>>
>> Provide an option, flag on context creation time, to let
>> userspace to set more strict policy for handling gpu hangs for
>> this context. If context with this flag set ever hangs the gpu,
>> it will be permanently banned from accessing the GPU.
>> All subsequent batch submissions will return -EIO.
>>
>> Requested-by: Paul Berry <stereotype441@gmail.com>
>> Cc: Paul Berry <stereotype441@gmail.com>
>> Cc: Ben Widawsky <ben@bwidawsk.net>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>>
>
> (Cc-ing Ian since this impacts ARB_robustness, which he's been working on).
>
> To clarify my reasons for requesting this feature, it's not necessarily for
> use cases where the application doesn't know if the rendering command
> stream is valid.  Rather, it's for any case where there is the risk of a
> GPU hang (this might happen even if the command stream is valid, for
> example because of an infinite loop in a WebGL shader).  Since the user
> mode application (Mesa in my example) assumes that each batch buffer runs
> to completion before the next batch buffer runs, it frequently includes
> commands in batch buffer N that rely on state established by commands in
> batch buffer N-1.  If batch buffer N-1 was interrupted due to a GPU hang,
> then some of its state updates may not have completed, resulting in a
> sizeable risk that batch buffer N (and a potentially unlimited number of
> subsequent batches) will produce a GPU hang also.  The only reliable way to
> recover from this situation is for Mesa to send a new batch buffer that
> sets up the GPU state from scratch rather than relying on state established
> in previous batch buffers.

Thanks for the clarification. I have updated the commit message.
>
> Since Mesa doesn't wait for batch buffer N-1 to complete before submitting
> batch buffer N, once a GPU hang occurs the kernel must regard any
> subsequent buffers as suspect, until it receives some notification from
> Mesa that the next batch is going to set up the GPU state from scratch.
> When we met in June, we decided that the notification mechanism would be
> for Mesa to stop using the context that caused the GPU hang, and create a
> new context.  The first batch buffer sent to the new context would (of
> necessity) set up the GPU state from scratch.  Consequently, all the kernel
> needs to do to implement the new policy is to permanently ban any context
> involved in a GPU hang.

Involved as in guilty of the hang, or do you mean banning every context that had batches pending?

We could also add an I915_CONTEXT_BAN_ON_PENDING flag; with it, all contexts
that were affected would get -EIO on their next batch submission after the hang.

> Question, since I'm not terribly familiar with the kernel code: is it
> possible for the ring buffer to contain batches belonging to multiple
> contexts at a time?

Yes.

> If so, then what happens if a GPU hang occurs?  For
> instance, let's say that the ring contains batch A1 from context A followed
> by batch B2 from context B.  What happens if a GPU hang occurs while
> executing batch A1?  Ideally the kernel would consider only context A to
> have been involved in the GPU hang, and automatically re-submit batch B2 so
> that context B it not affected by the hang.  Less ideally, but still ok,
> would be for the kernel to consider both contexts A and B to be involved in
> the GPU hang, and apply both contexts' banning policies.  If, however, the
> kernel considered only context A to be involved in the GPU hang, but failed
> to re-submit batch B2, then that would risk future GPU hangs from context
> B, since a future batch B3 from context B would likely rely on state that
> should have been established by batch B2.
>

This patch will only ban the offending context. Other contexts
will lose the batches that were pending, as the request queue will be
cleared on the reset following the hang. As things are now, the kernel
won't re-submit anything by itself.

I have also been working on an ioctl (get_reset_stats, for the ARB
robustness extension) which allows an application to sort out which
contexts were affected by a hang. Here is the planned ioctl for the ARB
robustness extension:
https://github.com/mkuoppal/linux/commit/698a413472edaec78852b8ca9849961cbdc40d78

This then allows applications to detect which contexts need to resubmit
their state, and also tells them whether the context had a batch
active or pending when the GPU hang happened.
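
To sketch the intended usage from the application side (ioctl and field
names as in that proposal; the final uAPI may of course still change):

/* Ask whether this context was involved in a hang, either as the guilty
 * party (batch_active) or by losing pending work (batch_pending). */
static int context_needs_state_reinit(int fd, __u32 ctx_id)
{
	struct drm_i915_reset_stats stats;

	memset(&stats, 0, sizeof(stats));
	stats.ctx_id = ctx_id;

	if (drmIoctl(fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
		return 0;	/* query not available */

	return stats.batch_active || stats.batch_pending;
}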

Thanks,
--Mika

> Assuming the above question has been addressed, this looks reasonable to
> me.  Thank you.
>
>
>
Paul Berry Sept. 11, 2013, 3:49 p.m. UTC | #5
On 11 September 2013 07:50, Mika Kuoppala <mika.kuoppala@linux.intel.com> wrote:

> Paul Berry <stereotype441@gmail.com> writes:
>
> > On 10 September 2013 06:16, Mika Kuoppala <mika.kuoppala@linux.intel.com
> >wrote:
> >
> >> Current policy is to ban context if it manages to hang
> >> gpu in a certain time windows. Paul Berry asked if more
> >> strict policy could be available for use cases where
> >> the application doesn't know if the rendering command stream
> >> sent to gpu is valid or not.
> >>
> >> Provide an option, flag on context creation time, to let
> >> userspace to set more strict policy for handling gpu hangs for
> >> this context. If context with this flag set ever hangs the gpu,
> >> it will be permanently banned from accessing the GPU.
> >> All subsequent batch submissions will return -EIO.
> >>
> >> Requested-by: Paul Berry <stereotype441@gmail.com>
> >> Cc: Paul Berry <stereotype441@gmail.com>
> >> Cc: Ben Widawsky <ben@bwidawsk.net>
> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> >>
> >
> > (Cc-ing Ian since this impacts ARB_robustness, which he's been working
> on).
> >
> > To clarify my reasons for requesting this feature, it's not necessarily
> for
> > use cases where the application doesn't know if the rendering command
> > stream is valid.  Rather, it's for any case where there is the risk of a
> > GPU hang (this might happen even if the command stream is valid, for
> > example because of an infinite loop in a WebGL shader).  Since the user
> > mode application (Mesa in my example) assumes that each batch buffer runs
> > to completion before the next batch buffer runs, it frequently includes
> > commands in batch buffer N that rely on state established by commands in
> > batch buffer N-1.  If batch buffer N-1 was interrupted due to a GPU hang,
> > then some of its state updates may not have completed, resulting in a
> > sizeable risk that batch buffer N (and a potentially unlimited number of
> > subsequent batches) will produce a GPU hang also.  The only reliable way
> to
> > recover from this situation is for Mesa to send a new batch buffer that
> > sets up the GPU state from scratch rather than relying on state
> established
> > in previous batch buffers.
>
> Thanks for the clarification. I have updated the commit message.
> >
> > Since Mesa doesn't wait for batch buffer N-1 to complete before
> submitting
> > batch buffer N, once a GPU hang occurs the kernel must regard any
> > subsequent buffers as suspect, until it receives some notification from
> > Mesa that the next batch is going to set up the GPU state from scratch.
> > When we met in June, we decided that the notification mechanism would be
> > for Mesa to stop using the context that caused the GPU hang, and create a
> > new context.  The first batch buffer sent to the new context would (of
> > necessity) set up the GPU state from scratch.  Consequently, all the
> kernel
> > needs to do to implement the new policy is to permanently ban any context
> > involved in a GPU hang.
>
> Involved as a guilty of hang or ban every context who had batches pending?
>
> We could add I915_CONTEXT_BAN_ON_PENDING flag also and with it all contexts
> that were affected would get -EIO on next batch submission after the hang.
>
> > Question, since I'm not terribly familiar with the kernel code: is it
> > possible for the ring buffer to contain batches belonging to multiple
> > contexts at a time?
>
> Yes.
>
> > If so, then what happens if a GPU hang occurs?  For
> > instance, let's say that the ring contains batch A1 from context A
> followed
> > by batch B2 from context B.  What happens if a GPU hang occurs while
> > executing batch A1?  Ideally the kernel would consider only context A to
> > have been involved in the GPU hang, and automatically re-submit batch B2
> so
> > that context B it not affected by the hang.  Less ideally, but still ok,
> > would be for the kernel to consider both contexts A and B to be involved
> in
> > the GPU hang, and apply both contexts' banning policies.  If, however,
> the
> > kernel considered only context A to be involved in the GPU hang, but
> failed
> > to re-submit batch B2, then that would risk future GPU hangs from context
> > B, since a future batch B3 from context B would likely rely on state that
> > should have been established by batch B2.
> >
>
> This patch will only ban the offending context. Other contexts
> will lose the batches that were pending as the request queue will be
> cleared on reset following the hang. As things are now, kernel wont
> re-submit anything by itself.
>

Thanks for the clarification.

The important thing from Mesa's point of view is to make sure that batch N
submitted to context C will only be executed if batch N-1 has run to
completion.  We would like this invariant to hold even if other contexts
cause GPU hangs.  Under the current state of affairs, where a hang on
context A can cause a batch belonging to context B to be lost, we would
need the I915_CONTEXT_BAN_ON_PENDING flag in order to achieve that
invariant.  But if the kernel ever got changed in the future so that it
automatically re-submitted pending batches upon recovery from a GPU hang*
(a change I would advocate), then we wouldn't need the
I915_CONTEXT_BAN_ON_PENDING flag anymore, and in fact setting it would be
counterproductive.

(*Of course, in order to avoid cascading GPU hangs, the kernel should only
re-submit pending batches from contexts other than the offending context)

So I would be in favor of adding an I915_CONTEXT_BAN_ON_PENDING flag, but
I'd suggest renaming it to something like I915_CONTEXT_BAN_ON_BATCH_LOSS.
That way, if in the future, we add the ability for the kernel to re-submit
pending batches upon recovery from a GPU hang, then it will be clear that
I915_CONTEXT_BAN_ON_BATCH_LOSS doesn't apply to the contexts that had their
batches automatically re-submitted.
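
Flag selection on the Mesa side would then look roughly like this
(I915_CONTEXT_BAN_ON_BATCH_LOSS being the hypothetical flag suggested
above, and the capability check just a placeholder):

static __u64 pick_context_flags(int kernel_resubmits_pending_batches)
{
	__u64 flags = I915_CONTEXT_BAN_ON_HANG;

	/* Only wanted while the kernel drops, rather than re-submits, pending
	 * batches from innocent contexts; once automatic re-submission exists,
	 * setting this would be counterproductive. */
	if (!kernel_resubmits_pending_batches)
		flags |= I915_CONTEXT_BAN_ON_BATCH_LOSS;	/* hypothetical */

	return flags;
}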


>
> I have been also working with ioctl (get_reset_stats, for arb robustness
> extension) which allows application to sort out which contexts were
> affected by hang. Here is the planned ioctl for arb robustness
> extension:
>
> https://github.com/mkuoppal/linux/commit/698a413472edaec78852b8ca9849961cbdc40d78
>
> This allows applications then to detect which contexts need to resubmit
> their state and also will give information if the context had batch
> active or pending when the gpu hang happened.
>

That ioctl seems reasonable to me.  My only comment is that we might want
to consider renaming the "batch_pending" field in drm_i915_reset_stats to
"batch_loss", for similar reasons to what I stated above.

Patch

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 3de6050..4353458 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1003,6 +1003,9 @@  static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_HANDLE_LUT:
 		value = 1;
 		break;
+	case I915_PARAM_HAS_CONTEXT_BAN:
+		value = 1;
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 81ba5bb..9cf7050 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -594,6 +594,9 @@  struct i915_ctx_hang_stats {
 
 	/* This context is banned to submit more work */
 	bool banned;
+
+	/* Instead of default period based ban policy, ban on first hang */
+	bool ban_on_first;
 };
 
 /* This must match up with the value previously used for execbuf2.rsvd1. */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 04e810c..9feaafd2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2190,11 +2190,18 @@  static bool i915_request_guilty(struct drm_i915_gem_request *request,
 
 static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
 {
-	const unsigned long elapsed = get_seconds() - hs->guilty_ts;
+	unsigned long elapsed;
 
 	if (hs->banned)
 		return true;
 
+	if (hs->ban_on_first) {
+		DRM_ERROR("context banned on first hang!\n");
+		return true;
+	}
+
+	elapsed = get_seconds() - hs->guilty_ts;
+
 	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
 		DRM_ERROR("context hanging too fast, declaring banned!\n");
 		return true;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 26c3fcc..8baa1d0 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -135,7 +135,8 @@  void i915_gem_context_free(struct kref *ctx_ref)
 
 static struct i915_hw_context *
 create_hw_context(struct drm_device *dev,
-		  struct drm_i915_file_private *file_priv)
+		  struct drm_i915_file_private *file_priv,
+		  const u64 flags)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct i915_hw_context *ctx;
@@ -178,6 +179,7 @@  create_hw_context(struct drm_device *dev,
 
 	ctx->file_priv = file_priv;
 	ctx->id = ret;
+	ctx->hang_stats.ban_on_first = !!(flags & I915_CONTEXT_BAN_ON_HANG);
 
 	return ctx;
 
@@ -203,7 +205,7 @@  static int create_default_context(struct drm_i915_private *dev_priv)
 
 	BUG_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
 
-	ctx = create_hw_context(dev_priv->dev, NULL);
+	ctx = create_hw_context(dev_priv->dev, NULL, 0);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
@@ -520,11 +522,15 @@  int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 	if (dev_priv->hw_contexts_disabled)
 		return -ENODEV;
 
+	if (args->flags & __I915_CONTEXT_UNKNOWN_FLAGS)
+		return -EINVAL;
+
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
-	ctx = create_hw_context(dev, file_priv);
+	ctx = create_hw_context(dev, file_priv, args->flags);
+
 	mutex_unlock(&dev->struct_mutex);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 55bb572..a020454 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -335,6 +335,7 @@  typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_EXEC_NO_RELOC	 25
 #define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
 #define I915_PARAM_HAS_WT     	 	 27
+#define I915_PARAM_HAS_CONTEXT_BAN	 28
 
 typedef struct drm_i915_getparam {
 	int param;
@@ -1015,10 +1016,14 @@  struct drm_i915_gem_wait {
 	__s64 timeout_ns;
 };
 
+#define I915_CONTEXT_BAN_ON_HANG  (1 << 0)
+#define __I915_CONTEXT_UNKNOWN_FLAGS -(I915_CONTEXT_BAN_ON_HANG<<1)
+
 struct drm_i915_gem_context_create {
 	/*  output: id of new context*/
 	__u32 ctx_id;
 	__u32 pad;
+	__u64 flags;
 };
 
 struct drm_i915_gem_context_destroy {