diff mbox

[08/12] drm/i915: Update context_fini

Message ID 1366784140-2670-9-git-send-email-ben@bwidawsk.net (mailing list archive)
State New, archived
Headers show

Commit Message

Ben Widawsky April 24, 2013, 6:15 a.m. UTC
Make resets optional for fini. fini is only ever called in
module_unload. It was therefore a good assumption that the GPU reset
(see the comment in the function) was what we wanted. With an upcoming
patch, we're going to add a few more callsites, one of which is GPU
reset, where doing the extra reset isn't usual.

On that same note, with more callers it makes sense to make the default
context state consistent with the actual state (via NULLing the
pointer).

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_dma.c         | 2 +-
 drivers/gpu/drm/i915/i915_drv.h         | 2 +-
 drivers/gpu/drm/i915/i915_gem_context.c | 7 +++++--
 3 files changed, 7 insertions(+), 4 deletions(-)

Comments

Mika Kuoppala April 24, 2013, 3:11 p.m. UTC | #1
Ben Widawsky <ben@bwidawsk.net> writes:

> Make resets optional for fini. fini is only ever called in
> module_unload. It was therefore a good assumption that the GPU reset
> (see the comment in the function) was what we wanted. With an upcoming
> patch, we're going to add a few more callsites, one of which is GPU
> reset, where doing the extra reset isn't usual.
>
> On that same note, with more callers it makes sense to make the default
> context state consistent with the actual state (via NULLing the
> pointer).
>
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_dma.c         | 2 +-
>  drivers/gpu/drm/i915/i915_drv.h         | 2 +-
>  drivers/gpu/drm/i915/i915_gem_context.c | 7 +++++--
>  3 files changed, 7 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 3b315ba..a141b8a 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1764,7 +1764,7 @@ int i915_driver_unload(struct drm_device *dev)
>  		mutex_lock(&dev->struct_mutex);
>  		i915_gem_free_all_phys_object(dev);
>  		i915_gem_cleanup_ringbuffer(dev);
> -		i915_gem_context_fini(dev);
> +		i915_gem_context_fini(dev, true);
>  		mutex_unlock(&dev->struct_mutex);
>  		i915_gem_cleanup_aliasing_ppgtt(dev);
>  		i915_gem_cleanup_stolen(dev);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 9717c69..618845e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1698,7 +1698,7 @@ struct dma_buf *i915_gem_prime_export(struct drm_device *dev,
>  
>  /* i915_gem_context.c */
>  void i915_gem_context_init(struct drm_device *dev);
> -void i915_gem_context_fini(struct drm_device *dev);
> +void i915_gem_context_fini(struct drm_device *dev, bool reset);
>  void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
>  int i915_switch_context(struct intel_ring_buffer *ring,
>  			struct drm_file *file, int to_id);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 411ace0..6030d83 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -259,7 +259,7 @@ void i915_gem_context_init(struct drm_device *dev)
>  	DRM_DEBUG_DRIVER("HW context support initialized\n");
>  }
>  
> -void i915_gem_context_fini(struct drm_device *dev)
> +void i915_gem_context_fini(struct drm_device *dev, bool reset)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  
> @@ -269,11 +269,14 @@ void i915_gem_context_fini(struct drm_device *dev)
>  	/* The only known way to stop the gpu from accessing the hw context is
>  	 * to reset it. Do this as the very last operation to avoid confusing
>  	 * other code, leading to spurious errors. */

Should we switch to default context in here to be sure that 
the running context will get unreferenced correctly?...

> -	intel_gpu_reset(dev);
> +	if (reset)
> +		intel_gpu_reset(dev);
>  
>  	i915_gem_object_unpin(dev_priv->ring[RCS].default_context->obj);
>

...and unreference the default context here so that it will get freed
on module unload.

--Mika

>  	do_destroy(dev_priv->ring[RCS].default_context);
> +
> +	dev_priv->ring[RCS].default_context = NULL;
>  }
>  
>  static int context_idr_cleanup(int id, void *p, void *data)
> -- 
> 1.8.2.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Ben Widawsky April 25, 2013, 4:11 a.m. UTC | #2
On Wed, Apr 24, 2013 at 06:11:40PM +0300, Mika Kuoppala wrote:
> Ben Widawsky <ben@bwidawsk.net> writes:
> 
> > Make resets optional for fini. fini is only ever called in
> > module_unload. It was therefore a good assumption that the GPU reset
> > (see the comment in the function) was what we wanted. With an upcoming
> > patch, we're going to add a few more callsites, one of which is GPU
> > reset, where doing the extra reset isn't usual.
> >
> > On that same note, with more callers it makes sense to make the default
> > context state consistent with the actual state (via NULLing the
> > pointer).
> >
> > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> > ---
> >  drivers/gpu/drm/i915/i915_dma.c         | 2 +-
> >  drivers/gpu/drm/i915/i915_drv.h         | 2 +-
> >  drivers/gpu/drm/i915/i915_gem_context.c | 7 +++++--
> >  3 files changed, 7 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > index 3b315ba..a141b8a 100644
> > --- a/drivers/gpu/drm/i915/i915_dma.c
> > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > @@ -1764,7 +1764,7 @@ int i915_driver_unload(struct drm_device *dev)
> >  		mutex_lock(&dev->struct_mutex);
> >  		i915_gem_free_all_phys_object(dev);
> >  		i915_gem_cleanup_ringbuffer(dev);
> > -		i915_gem_context_fini(dev);
> > +		i915_gem_context_fini(dev, true);
> >  		mutex_unlock(&dev->struct_mutex);
> >  		i915_gem_cleanup_aliasing_ppgtt(dev);
> >  		i915_gem_cleanup_stolen(dev);
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 9717c69..618845e 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1698,7 +1698,7 @@ struct dma_buf *i915_gem_prime_export(struct drm_device *dev,
> >  
> >  /* i915_gem_context.c */
> >  void i915_gem_context_init(struct drm_device *dev);
> > -void i915_gem_context_fini(struct drm_device *dev);
> > +void i915_gem_context_fini(struct drm_device *dev, bool reset);
> >  void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
> >  int i915_switch_context(struct intel_ring_buffer *ring,
> >  			struct drm_file *file, int to_id);
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > index 411ace0..6030d83 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -259,7 +259,7 @@ void i915_gem_context_init(struct drm_device *dev)
> >  	DRM_DEBUG_DRIVER("HW context support initialized\n");
> >  }
> >  
> > -void i915_gem_context_fini(struct drm_device *dev)
> > +void i915_gem_context_fini(struct drm_device *dev, bool reset)
> >  {
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >  
> > @@ -269,11 +269,14 @@ void i915_gem_context_fini(struct drm_device *dev)
> >  	/* The only known way to stop the gpu from accessing the hw context is
> >  	 * to reset it. Do this as the very last operation to avoid confusing
> >  	 * other code, leading to spurious errors. */
> 
> Should we switch to default context in here to be sure that 
> the running context will get unreferenced correctly?...

I'm confused if you're talking about the object refcount, or the context
refcount. The latter isn't yet introduced in this part of the series.
But maybe I've missed your point, so let's discuss...

There are 3 ways we can end up at fini():
1. failed during some part of driver initialization
2. unload
  Conceptually, 1 and 2 are the same, the GPU is idle, (explicitly so in
  the latter case) and so therefore we know the default context must be
  the last context context[1]. We know the retire worker has either been
  called, or we've not yet submitted work, and we should therefore be
  able to assert the object refcount is exactly 1.

  Given our discussion and the new reset parameter in fini, I can assert
  last_context_obj == default_context->obj in the !reset case, and that
  the refcount is 1.

3. reset
  In this case, it's a good question. Normally reset is called on a
  hang, and we can't expect to be able to switch. I think
  reset_ring_lists does everything I need, and then it follows into case
  1 and 2. If I have missed something though, could you please explain
  it a bit further fo rme?

> 
> > -	intel_gpu_reset(dev); +	if (reset) +
> > intel_gpu_reset(dev);
> >  
> >  	i915_gem_object_unpin(dev_priv->ring[RCS].default_context->obj);
> >
> 
> ...and unreference the default context here so that it will get freed
> on module unload.

Can you explain what I'm missing. Doesn't do_destroy just below take
care of it?
>

 
> --Mika

For the context reference counting case, I think the questions are still
applicable, and the answers are similar. You deref in reset_ring_lists,
init fail and unload are the same... If you want me to use the
context_unref interface, I can, but I would want to make unref return
the val of kref_put() and do while(!context_unref(...)) just to be safe.


You can take a look here to see how that's progressing:
http://cgit.freedesktop.org/~bwidawsk/drm-intel/log/?h=ppgtt-ctx

It's quite volatile though, so be warned.


> 
> >  	do_destroy(dev_priv->ring[RCS].default_context);
> > +
> > +	dev_priv->ring[RCS].default_context = NULL;
> >  }
> >  
> >  static int context_idr_cleanup(int id, void *p, void *data)
> > -- 
> > 1.8.2.1
> >
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Ben Widawsky April 25, 2013, 5:17 a.m. UTC | #3
On Wed, Apr 24, 2013 at 09:11:02PM -0700, Ben Widawsky wrote:
> On Wed, Apr 24, 2013 at 06:11:40PM +0300, Mika Kuoppala wrote:
> > Ben Widawsky <ben@bwidawsk.net> writes:
> > 
> > > Make resets optional for fini. fini is only ever called in
> > > module_unload. It was therefore a good assumption that the GPU reset
> > > (see the comment in the function) was what we wanted. With an upcoming
> > > patch, we're going to add a few more callsites, one of which is GPU
> > > reset, where doing the extra reset isn't usual.
> > >
> > > On that same note, with more callers it makes sense to make the default
> > > context state consistent with the actual state (via NULLing the
> > > pointer).
> > >
> > > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> > > ---
> > >  drivers/gpu/drm/i915/i915_dma.c         | 2 +-
> > >  drivers/gpu/drm/i915/i915_drv.h         | 2 +-
> > >  drivers/gpu/drm/i915/i915_gem_context.c | 7 +++++--
> > >  3 files changed, 7 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > > index 3b315ba..a141b8a 100644
> > > --- a/drivers/gpu/drm/i915/i915_dma.c
> > > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > > @@ -1764,7 +1764,7 @@ int i915_driver_unload(struct drm_device *dev)
> > >  		mutex_lock(&dev->struct_mutex);
> > >  		i915_gem_free_all_phys_object(dev);
> > >  		i915_gem_cleanup_ringbuffer(dev);
> > > -		i915_gem_context_fini(dev);
> > > +		i915_gem_context_fini(dev, true);
> > >  		mutex_unlock(&dev->struct_mutex);
> > >  		i915_gem_cleanup_aliasing_ppgtt(dev);
> > >  		i915_gem_cleanup_stolen(dev);
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > > index 9717c69..618845e 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > @@ -1698,7 +1698,7 @@ struct dma_buf *i915_gem_prime_export(struct drm_device *dev,
> > >  
> > >  /* i915_gem_context.c */
> > >  void i915_gem_context_init(struct drm_device *dev);
> > > -void i915_gem_context_fini(struct drm_device *dev);
> > > +void i915_gem_context_fini(struct drm_device *dev, bool reset);
> > >  void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
> > >  int i915_switch_context(struct intel_ring_buffer *ring,
> > >  			struct drm_file *file, int to_id);
> > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > > index 411ace0..6030d83 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > > @@ -259,7 +259,7 @@ void i915_gem_context_init(struct drm_device *dev)
> > >  	DRM_DEBUG_DRIVER("HW context support initialized\n");
> > >  }
> > >  
> > > -void i915_gem_context_fini(struct drm_device *dev)
> > > +void i915_gem_context_fini(struct drm_device *dev, bool reset)
> > >  {
> > >  	struct drm_i915_private *dev_priv = dev->dev_private;
> > >  
> > > @@ -269,11 +269,14 @@ void i915_gem_context_fini(struct drm_device *dev)
> > >  	/* The only known way to stop the gpu from accessing the hw context is
> > >  	 * to reset it. Do this as the very last operation to avoid confusing
> > >  	 * other code, leading to spurious errors. */
> > 
> > Should we switch to default context in here to be sure that 
> > the running context will get unreferenced correctly?...
> 
> I'm confused if you're talking about the object refcount, or the context
> refcount. The latter isn't yet introduced in this part of the series.
> But maybe I've missed your point, so let's discuss...
> 
> There are 3 ways we can end up at fini():
> 1. failed during some part of driver initialization
> 2. unload
>   Conceptually, 1 and 2 are the same, the GPU is idle, (explicitly so in
>   the latter case) and so therefore we know the default context must be
>   the last context context[1]. We know the retire worker has either been
>   called, or we've not yet submitted work, and we should therefore be
>   able to assert the object refcount is exactly 1.
> 
>   Given our discussion and the new reset parameter in fini, I can assert
>   last_context_obj == default_context->obj in the !reset case, and that
>   the refcount is 1.

I should add to this statement:
Getting the object reference count isn't possible, and it'd involve
changing drm_gem_object_unreference to return the value of kref_put()
which I'd prefer not to d.

Also the other assertion requires some extra artificial code to actually
assert last_context_obj == default_context->obj because the init case
will only work after enable().

> 
> 3. reset
>   In this case, it's a good question. Normally reset is called on a
>   hang, and we can't expect to be able to switch. I think
>   reset_ring_lists does everything I need, and then it follows into case
>   1 and 2. If I have missed something though, could you please explain
>   it a bit further fo rme?
> 
> > 
> > > -	intel_gpu_reset(dev); +	if (reset) +
> > > intel_gpu_reset(dev);
> > >  
> > >  	i915_gem_object_unpin(dev_priv->ring[RCS].default_context->obj);
> > >
> > 
> > ...and unreference the default context here so that it will get freed
> > on module unload.
> 
> Can you explain what I'm missing. Doesn't do_destroy just below take
> care of it?
> >
> 
>  
> > --Mika
> 
> For the context reference counting case, I think the questions are still
> applicable, and the answers are similar. You deref in reset_ring_lists,
> init fail and unload are the same... If you want me to use the
> context_unref interface, I can, but I would want to make unref return
> the val of kref_put() and do while(!context_unref(...)) just to be safe.
> 
> 
> You can take a look here to see how that's progressing:
> http://cgit.freedesktop.org/~bwidawsk/drm-intel/log/?h=ppgtt-ctx
> 
> It's quite volatile though, so be warned.
> 
> 
> > 
> > >  	do_destroy(dev_priv->ring[RCS].default_context);
> > > +
> > > +	dev_priv->ring[RCS].default_context = NULL;
> > >  }
> > >  
> > >  static int context_idr_cleanup(int id, void *p, void *data)
> > > -- 
> > > 1.8.2.1
> > >
> > > _______________________________________________
> > > Intel-gfx mailing list
> > > Intel-gfx@lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Ben Widawsky, Intel Open Source Technology Center
Mika Kuoppala April 25, 2013, 3:01 p.m. UTC | #4
Ben Widawsky <ben@bwidawsk.net> writes:

> On Wed, Apr 24, 2013 at 06:11:40PM +0300, Mika Kuoppala wrote:
>> Ben Widawsky <ben@bwidawsk.net> writes:
>> 
>> > Make resets optional for fini. fini is only ever called in
>> > module_unload. It was therefore a good assumption that the GPU reset
>> > (see the comment in the function) was what we wanted. With an upcoming
>> > patch, we're going to add a few more callsites, one of which is GPU
>> > reset, where doing the extra reset isn't usual.
>> >
>> > On that same note, with more callers it makes sense to make the default
>> > context state consistent with the actual state (via NULLing the
>> > pointer).
>> >
>> > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
>> > ---
>> >  drivers/gpu/drm/i915/i915_dma.c         | 2 +-
>> >  drivers/gpu/drm/i915/i915_drv.h         | 2 +-
>> >  drivers/gpu/drm/i915/i915_gem_context.c | 7 +++++--
>> >  3 files changed, 7 insertions(+), 4 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
>> > index 3b315ba..a141b8a 100644
>> > --- a/drivers/gpu/drm/i915/i915_dma.c
>> > +++ b/drivers/gpu/drm/i915/i915_dma.c
>> > @@ -1764,7 +1764,7 @@ int i915_driver_unload(struct drm_device *dev)
>> >  		mutex_lock(&dev->struct_mutex);
>> >  		i915_gem_free_all_phys_object(dev);
>> >  		i915_gem_cleanup_ringbuffer(dev);
>> > -		i915_gem_context_fini(dev);
>> > +		i915_gem_context_fini(dev, true);
>> >  		mutex_unlock(&dev->struct_mutex);
>> >  		i915_gem_cleanup_aliasing_ppgtt(dev);
>> >  		i915_gem_cleanup_stolen(dev);
>> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> > index 9717c69..618845e 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> > @@ -1698,7 +1698,7 @@ struct dma_buf *i915_gem_prime_export(struct drm_device *dev,
>> >  
>> >  /* i915_gem_context.c */
>> >  void i915_gem_context_init(struct drm_device *dev);
>> > -void i915_gem_context_fini(struct drm_device *dev);
>> > +void i915_gem_context_fini(struct drm_device *dev, bool reset);
>> >  void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
>> >  int i915_switch_context(struct intel_ring_buffer *ring,
>> >  			struct drm_file *file, int to_id);
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> > index 411ace0..6030d83 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> > @@ -259,7 +259,7 @@ void i915_gem_context_init(struct drm_device *dev)
>> >  	DRM_DEBUG_DRIVER("HW context support initialized\n");
>> >  }
>> >  
>> > -void i915_gem_context_fini(struct drm_device *dev)
>> > +void i915_gem_context_fini(struct drm_device *dev, bool reset)
>> >  {
>> >  	struct drm_i915_private *dev_priv = dev->dev_private;
>> >  
>> > @@ -269,11 +269,14 @@ void i915_gem_context_fini(struct drm_device *dev)
>> >  	/* The only known way to stop the gpu from accessing the hw context is
>> >  	 * to reset it. Do this as the very last operation to avoid confusing
>> >  	 * other code, leading to spurious errors. */
>> 
>> Should we switch to default context in here to be sure that 
>> the running context will get unreferenced correctly?...
>
> I'm confused if you're talking about the object refcount, or the context
> refcount. The latter isn't yet introduced in this part of the series.
> But maybe I've missed your point, so let's discuss...

I should have been more specific. Object refcount it is.

> There are 3 ways we can end up at fini():
> 1. failed during some part of driver initialization
> 2. unload
>   Conceptually, 1 and 2 are the same, the GPU is idle, (explicitly so in
>   the latter case) and so therefore we know the default context must be
>   the last context context[1]. We know the retire worker has either been
>   called, or we've not yet submitted work, and we should therefore be
>   able to assert the object refcount is exactly 1.
>
>   Given our discussion and the new reset parameter in fini, I can assert
>   last_context_obj == default_context->obj in the !reset case, and that
>   the refcount is 1.

Yes. I forgot the gpu_idle which indeed switches to default context. So
no need to switch.

> 3. reset
>   In this case, it's a good question. Normally reset is called on a
>   hang, and we can't expect to be able to switch. I think
>   reset_ring_lists does everything I need, and then it follows into case
>   1 and 2. If I have missed something though, could you please explain
>   it a bit further fo rme?

My only concern was that the drivers bookkeepping of what context it has
currently switched to the hw, is out of sync after reset. And as a
result first call to i915_switch_context might skip the switch even
if it was needed.

>> 
>> > -	intel_gpu_reset(dev); +	if (reset) +
>> > intel_gpu_reset(dev);
>> >  
>> >  	i915_gem_object_unpin(dev_priv->ring[RCS].default_context->obj);
>> >
>> 
>> ...and unreference the default context here so that it will get freed
>> on module unload.
>
> Can you explain what I'm missing. Doesn't do_destroy just below take
> care of it?

do_destroy is enough for all the other contexts that were created
by the ioctl. However the default context is different as we are owning
it, adding to the refcount. So we need to unreference it also once before
calling do destroy.

This is what i think is the cause as i see default context leaking
with the current codebase, on module unload.

Adding
+ drm_gem_object_unreference(&dev_priv->ring[RCS].default_context->obj->base);
after the unpin makes the leak go away.

-Mika

>>
>
>  
>> --Mika
>
> For the context reference counting case, I think the questions are still
> applicable, and the answers are similar. You deref in reset_ring_lists,
> init fail and unload are the same... If you want me to use the
> context_unref interface, I can, but I would want to make unref return
> the val of kref_put() and do while(!context_unref(...)) just to be safe.
>
>
> You can take a look here to see how that's progressing:
> http://cgit.freedesktop.org/~bwidawsk/drm-intel/log/?h=ppgtt-ctx
>
> It's quite volatile though, so be warned.
>
>
>> 
>> >  	do_destroy(dev_priv->ring[RCS].default_context);
>> > +
>> > +	dev_priv->ring[RCS].default_context = NULL;
>> >  }
>> >  
>> >  static int context_idr_cleanup(int id, void *p, void *data)
>> > -- 
>> > 1.8.2.1
>> >
>> > _______________________________________________
>> > Intel-gfx mailing list
>> > Intel-gfx@lists.freedesktop.org
>> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
> -- 
> Ben Widawsky, Intel Open Source Technology Center
Ben Widawsky April 25, 2013, 5:22 p.m. UTC | #5
On Thu, Apr 25, 2013 at 06:01:56PM +0300, Mika Kuoppala wrote:
> Ben Widawsky <ben@bwidawsk.net> writes:
> 
> > On Wed, Apr 24, 2013 at 06:11:40PM +0300, Mika Kuoppala wrote:
> >> Ben Widawsky <ben@bwidawsk.net> writes:
> >> 
> >> > Make resets optional for fini. fini is only ever called in
> >> > module_unload. It was therefore a good assumption that the GPU reset
> >> > (see the comment in the function) was what we wanted. With an upcoming
> >> > patch, we're going to add a few more callsites, one of which is GPU
> >> > reset, where doing the extra reset isn't usual.
> >> >
> >> > On that same note, with more callers it makes sense to make the default
> >> > context state consistent with the actual state (via NULLing the
> >> > pointer).
> >> >
> >> > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> >> > ---
> >> >  drivers/gpu/drm/i915/i915_dma.c         | 2 +-
> >> >  drivers/gpu/drm/i915/i915_drv.h         | 2 +-
> >> >  drivers/gpu/drm/i915/i915_gem_context.c | 7 +++++--
> >> >  3 files changed, 7 insertions(+), 4 deletions(-)
> >> >
> >> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> >> > index 3b315ba..a141b8a 100644
> >> > --- a/drivers/gpu/drm/i915/i915_dma.c
> >> > +++ b/drivers/gpu/drm/i915/i915_dma.c
> >> > @@ -1764,7 +1764,7 @@ int i915_driver_unload(struct drm_device *dev)
> >> >  		mutex_lock(&dev->struct_mutex);
> >> >  		i915_gem_free_all_phys_object(dev);
> >> >  		i915_gem_cleanup_ringbuffer(dev);
> >> > -		i915_gem_context_fini(dev);
> >> > +		i915_gem_context_fini(dev, true);
> >> >  		mutex_unlock(&dev->struct_mutex);
> >> >  		i915_gem_cleanup_aliasing_ppgtt(dev);
> >> >  		i915_gem_cleanup_stolen(dev);
> >> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >> > index 9717c69..618845e 100644
> >> > --- a/drivers/gpu/drm/i915/i915_drv.h
> >> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> >> > @@ -1698,7 +1698,7 @@ struct dma_buf *i915_gem_prime_export(struct drm_device *dev,
> >> >  
> >> >  /* i915_gem_context.c */
> >> >  void i915_gem_context_init(struct drm_device *dev);
> >> > -void i915_gem_context_fini(struct drm_device *dev);
> >> > +void i915_gem_context_fini(struct drm_device *dev, bool reset);
> >> >  void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
> >> >  int i915_switch_context(struct intel_ring_buffer *ring,
> >> >  			struct drm_file *file, int to_id);
> >> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> >> > index 411ace0..6030d83 100644
> >> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> >> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> >> > @@ -259,7 +259,7 @@ void i915_gem_context_init(struct drm_device *dev)
> >> >  	DRM_DEBUG_DRIVER("HW context support initialized\n");
> >> >  }
> >> >  
> >> > -void i915_gem_context_fini(struct drm_device *dev)
> >> > +void i915_gem_context_fini(struct drm_device *dev, bool reset)
> >> >  {
> >> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >> >  
> >> > @@ -269,11 +269,14 @@ void i915_gem_context_fini(struct drm_device *dev)
> >> >  	/* The only known way to stop the gpu from accessing the hw context is
> >> >  	 * to reset it. Do this as the very last operation to avoid confusing
> >> >  	 * other code, leading to spurious errors. */
> >> 
> >> Should we switch to default context in here to be sure that 
> >> the running context will get unreferenced correctly?...
> >
> > I'm confused if you're talking about the object refcount, or the context
> > refcount. The latter isn't yet introduced in this part of the series.
> > But maybe I've missed your point, so let's discuss...
> 
> I should have been more specific. Object refcount it is.
> 
> > There are 3 ways we can end up at fini():
> > 1. failed during some part of driver initialization
> > 2. unload
> >   Conceptually, 1 and 2 are the same, the GPU is idle, (explicitly so in
> >   the latter case) and so therefore we know the default context must be
> >   the last context context[1]. We know the retire worker has either been
> >   called, or we've not yet submitted work, and we should therefore be
> >   able to assert the object refcount is exactly 1.
> >
> >   Given our discussion and the new reset parameter in fini, I can assert
> >   last_context_obj == default_context->obj in the !reset case, and that
> >   the refcount is 1.
> 
> Yes. I forgot the gpu_idle which indeed switches to default context. So
> no need to switch.
> 
> > 3. reset
> >   In this case, it's a good question. Normally reset is called on a
> >   hang, and we can't expect to be able to switch. I think
> >   reset_ring_lists does everything I need, and then it follows into case
> >   1 and 2. If I have missed something though, could you please explain
> >   it a bit further fo rme?
> 
> My only concern was that the drivers bookkeepping of what context it has
> currently switched to the hw, is out of sync after reset. And as a
> result first call to i915_switch_context might skip the switch even
> if it was needed.

If you want to submit something that tries to force-fake a context
switch (ie.  do all the book keeping manually without actually using the
ring), we can see how horrible it looks and decide if we want to keep
it.

> 
> >> 
> >> > -	intel_gpu_reset(dev); +	if (reset) +
> >> > intel_gpu_reset(dev);
> >> >  
> >> >  	i915_gem_object_unpin(dev_priv->ring[RCS].default_context->obj);
> >> >
> >> 
> >> ...and unreference the default context here so that it will get freed
> >> on module unload.
> >
> > Can you explain what I'm missing. Doesn't do_destroy just below take
> > care of it?
> 
> do_destroy is enough for all the other contexts that were created
> by the ioctl. However the default context is different as we are owning
> it, adding to the refcount. So we need to unreference it also once before
> calling do destroy.

Yeah. This changes a bit in the series because refcount doesn't go to 1
until we successfully execute do_switch (i915_gem_context_enable). But I
think with my comment below, it's fine.

> 
> This is what i think is the cause as i see default context leaking
> with the current codebase, on module unload.
> 
> Adding
> + drm_gem_object_unreference(&dev_priv->ring[RCS].default_context->obj->base);
> after the unpin makes the leak go away.
> 
> -Mika


This looks like a bug to me as well:
10:19:04      bwidawsk | mkuoppal: you should submit that patch... maybe add a comment on the first unref that it may
                       | be unsafe to access the object after that point
10:19:11      bwidawsk | the one you added after unpin


> 
> >>
> >
> >  
> >> --Mika
> >
> > For the context reference counting case, I think the questions are still
> > applicable, and the answers are similar. You deref in reset_ring_lists,
> > init fail and unload are the same... If you want me to use the
> > context_unref interface, I can, but I would want to make unref return
> > the val of kref_put() and do while(!context_unref(...)) just to be safe.
> >
> >
> > You can take a look here to see how that's progressing:
> > http://cgit.freedesktop.org/~bwidawsk/drm-intel/log/?h=ppgtt-ctx
> >
> > It's quite volatile though, so be warned.
> >
> >
> >> 
> >> >  	do_destroy(dev_priv->ring[RCS].default_context);
> >> > +
> >> > +	dev_priv->ring[RCS].default_context = NULL;
> >> >  }
> >> >  
> >> >  static int context_idr_cleanup(int id, void *p, void *data)
> >> > -- 
> >> > 1.8.2.1
> >> >
> >> > _______________________________________________
> >> > Intel-gfx mailing list
> >> > Intel-gfx@lists.freedesktop.org
> >> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> >
> > -- 
> > Ben Widawsky, Intel Open Source Technology Center
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 3b315ba..a141b8a 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1764,7 +1764,7 @@  int i915_driver_unload(struct drm_device *dev)
 		mutex_lock(&dev->struct_mutex);
 		i915_gem_free_all_phys_object(dev);
 		i915_gem_cleanup_ringbuffer(dev);
-		i915_gem_context_fini(dev);
+		i915_gem_context_fini(dev, true);
 		mutex_unlock(&dev->struct_mutex);
 		i915_gem_cleanup_aliasing_ppgtt(dev);
 		i915_gem_cleanup_stolen(dev);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9717c69..618845e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1698,7 +1698,7 @@  struct dma_buf *i915_gem_prime_export(struct drm_device *dev,
 
 /* i915_gem_context.c */
 void i915_gem_context_init(struct drm_device *dev);
-void i915_gem_context_fini(struct drm_device *dev);
+void i915_gem_context_fini(struct drm_device *dev, bool reset);
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
 int i915_switch_context(struct intel_ring_buffer *ring,
 			struct drm_file *file, int to_id);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 411ace0..6030d83 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -259,7 +259,7 @@  void i915_gem_context_init(struct drm_device *dev)
 	DRM_DEBUG_DRIVER("HW context support initialized\n");
 }
 
-void i915_gem_context_fini(struct drm_device *dev)
+void i915_gem_context_fini(struct drm_device *dev, bool reset)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
@@ -269,11 +269,14 @@  void i915_gem_context_fini(struct drm_device *dev)
 	/* The only known way to stop the gpu from accessing the hw context is
 	 * to reset it. Do this as the very last operation to avoid confusing
 	 * other code, leading to spurious errors. */
-	intel_gpu_reset(dev);
+	if (reset)
+		intel_gpu_reset(dev);
 
 	i915_gem_object_unpin(dev_priv->ring[RCS].default_context->obj);
 
 	do_destroy(dev_priv->ring[RCS].default_context);
+
+	dev_priv->ring[RCS].default_context = NULL;
 }
 
 static int context_idr_cleanup(int id, void *p, void *data)