
[drm-misc-next,v8,09/12] drm/gpuvm: reference count drm_gpuvm structures

Message ID 20231101233113.8059-10-dakr@redhat.com (mailing list archive)
State New, archived
Series: DRM GPUVM features

Commit Message

Danilo Krummrich Nov. 1, 2023, 11:31 p.m. UTC
Implement reference counting for struct drm_gpuvm.

Signed-off-by: Danilo Krummrich <dakr@redhat.com>
---
 drivers/gpu/drm/drm_gpuvm.c            | 44 +++++++++++++++++++-------
 drivers/gpu/drm/nouveau/nouveau_uvmm.c | 20 +++++++++---
 include/drm/drm_gpuvm.h                | 31 +++++++++++++++++-
 3 files changed, 78 insertions(+), 17 deletions(-)

Comments

kernel test robot Nov. 2, 2023, 10:46 a.m. UTC | #1
Hi Danilo,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 3c6c7ca4508b6cb1a033ac954c50a1b2c97af883]

url:    https://github.com/intel-lab-lkp/linux/commits/Danilo-Krummrich/drm-gpuvm-convert-WARN-to-drm_WARN-variants/20231102-073332
base:   3c6c7ca4508b6cb1a033ac954c50a1b2c97af883
patch link:    https://lore.kernel.org/r/20231101233113.8059-10-dakr%40redhat.com
patch subject: [PATCH drm-misc-next v8 09/12] drm/gpuvm: reference count drm_gpuvm structures
config: arc-allmodconfig (https://download.01.org/0day-ci/archive/20231102/202311021833.q8aYDJnr-lkp@intel.com/config)
compiler: arceb-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231102/202311021833.q8aYDJnr-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311021833.q8aYDJnr-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/drm_gpuvm.c:810: warning: expecting prototype for drm_gpuvm_bo_put(). Prototype was for drm_gpuvm_put() instead


vim +810 drivers/gpu/drm/drm_gpuvm.c

   801	
   802	/**
   803	 * drm_gpuvm_bo_put() - drop a struct drm_gpuvm reference
   804	 * @gpuvm: the &drm_gpuvm to release the reference of
   805	 *
   806	 * This releases a reference to @gpuvm.
   807	 */
   808	void
   809	drm_gpuvm_put(struct drm_gpuvm *gpuvm)
 > 810	{
   811		if (gpuvm)
   812			kref_put(&gpuvm->kref, drm_gpuvm_free);
   813	}
   814	EXPORT_SYMBOL_GPL(drm_gpuvm_put);
   815
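
For reference, the warning is only about the kernel-doc header: the comment
names drm_gpuvm_bo_put() while the function it documents is drm_gpuvm_put().
A minimal sketch of the corrected comment (not taken from a posted patch)
would be:

/**
 * drm_gpuvm_put() - drop a struct drm_gpuvm reference
 * @gpuvm: the &drm_gpuvm to release the reference of
 *
 * This releases a reference to @gpuvm.
 */
void
drm_gpuvm_put(struct drm_gpuvm *gpuvm)
{
        if (gpuvm)
                kref_put(&gpuvm->kref, drm_gpuvm_free);
}
EXPORT_SYMBOL_GPL(drm_gpuvm_put);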
Thomas Hellstrom Nov. 2, 2023, 1:21 p.m. UTC | #2
On Thu, 2023-11-02 at 00:31 +0100, Danilo Krummrich wrote:
> Implement reference counting for struct drm_gpuvm.
> 
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>

Will port the Xe series over to check that it works properly and get
back with review on this one.


> ---
>  drivers/gpu/drm/drm_gpuvm.c            | 44 +++++++++++++++++++-----
> --
>  drivers/gpu/drm/nouveau/nouveau_uvmm.c | 20 +++++++++---
>  include/drm/drm_gpuvm.h                | 31 +++++++++++++++++-
>  3 files changed, 78 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_gpuvm.c
> b/drivers/gpu/drm/drm_gpuvm.c
> index 53e2c406fb04..6a88eafc5229 100644
> --- a/drivers/gpu/drm/drm_gpuvm.c
> +++ b/drivers/gpu/drm/drm_gpuvm.c
> @@ -746,6 +746,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const
> char *name,
>         gpuvm->rb.tree = RB_ROOT_CACHED;
>         INIT_LIST_HEAD(&gpuvm->rb.list);
>  
> +       kref_init(&gpuvm->kref);
> +
>         gpuvm->name = name ? name : "unknown";
>         gpuvm->flags = flags;
>         gpuvm->ops = ops;
> @@ -770,15 +772,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const
> char *name,
>  }
>  EXPORT_SYMBOL_GPL(drm_gpuvm_init);
>  
> -/**
> - * drm_gpuvm_destroy() - cleanup a &drm_gpuvm
> - * @gpuvm: pointer to the &drm_gpuvm to clean up
> - *
> - * Note that it is a bug to call this function on a manager that
> still
> - * holds GPU VA mappings.
> - */
> -void
> -drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
> +static void
> +drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
>  {
>         gpuvm->name = NULL;
>  
> @@ -790,7 +785,33 @@ drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
>  
>         drm_gem_object_put(gpuvm->r_obj);
>  }
> -EXPORT_SYMBOL_GPL(drm_gpuvm_destroy);
> +
> +static void
> +drm_gpuvm_free(struct kref *kref)
> +{
> +       struct drm_gpuvm *gpuvm = container_of(kref, struct
> drm_gpuvm, kref);
> +
> +       if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
> +               return;
> +
> +       drm_gpuvm_fini(gpuvm);
> +
> +       gpuvm->ops->vm_free(gpuvm);
> +}
> +
> +/**
> + * drm_gpuvm_bo_put() - drop a struct drm_gpuvm reference
> + * @gpuvm: the &drm_gpuvm to release the reference of
> + *
> + * This releases a reference to @gpuvm.
> + */
> +void
> +drm_gpuvm_put(struct drm_gpuvm *gpuvm)
> +{
> +       if (gpuvm)
> +               kref_put(&gpuvm->kref, drm_gpuvm_free);
> +}
> +EXPORT_SYMBOL_GPL(drm_gpuvm_put);
>  
>  static int
>  __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
> @@ -843,7 +864,7 @@ drm_gpuva_insert(struct drm_gpuvm *gpuvm,
>         if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
>                 return -EINVAL;
>  
> -       return __drm_gpuva_insert(gpuvm, va);
> +       return __drm_gpuva_insert(drm_gpuvm_get(gpuvm), va);
>  }
>  EXPORT_SYMBOL_GPL(drm_gpuva_insert);
>  
> @@ -876,6 +897,7 @@ drm_gpuva_remove(struct drm_gpuva *va)
>         }
>  
>         __drm_gpuva_remove(va);
> +       drm_gpuvm_put(va->vm);
>  }
>  EXPORT_SYMBOL_GPL(drm_gpuva_remove);
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> index 54be12c1272f..cb2f06565c46 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> @@ -1780,6 +1780,18 @@ nouveau_uvmm_bo_unmap_all(struct nouveau_bo
> *nvbo)
>         }
>  }
>  
> +static void
> +nouveau_uvmm_free(struct drm_gpuvm *gpuvm)
> +{
> +       struct nouveau_uvmm *uvmm = uvmm_from_gpuvm(gpuvm);
> +
> +       kfree(uvmm);
> +}
> +
> +static const struct drm_gpuvm_ops gpuvm_ops = {
> +       .vm_free = nouveau_uvmm_free,
> +};
> +
>  int
>  nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>                            void *data,
> @@ -1830,7 +1842,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device
> *dev,
>                        NOUVEAU_VA_SPACE_END,
>                        init->kernel_managed_addr,
>                        init->kernel_managed_size,
> -                      NULL);
> +                      &gpuvm_ops);
>         /* GPUVM takes care from here on. */
>         drm_gem_object_put(r_obj);
>  
> @@ -1849,8 +1861,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device
> *dev,
>         return 0;
>  
>  out_gpuvm_fini:
> -       drm_gpuvm_destroy(&uvmm->base);
> -       kfree(uvmm);
> +       drm_gpuvm_put(&uvmm->base);
>  out_unlock:
>         mutex_unlock(&cli->mutex);
>         return ret;
> @@ -1902,7 +1913,6 @@ nouveau_uvmm_fini(struct nouveau_uvmm *uvmm)
>  
>         mutex_lock(&cli->mutex);
>         nouveau_vmm_fini(&uvmm->vmm);
> -       drm_gpuvm_destroy(&uvmm->base);
> -       kfree(uvmm);
> +       drm_gpuvm_put(&uvmm->base);
>         mutex_unlock(&cli->mutex);
>  }
> diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
> index 0c2e24155a93..4e6e1fd3485a 100644
> --- a/include/drm/drm_gpuvm.h
> +++ b/include/drm/drm_gpuvm.h
> @@ -247,6 +247,11 @@ struct drm_gpuvm {
>                 struct list_head list;
>         } rb;
>  
> +       /**
> +        * @kref: reference count of this object
> +        */
> +       struct kref kref;
> +
>         /**
>          * @kernel_alloc_node:
>          *
> @@ -273,7 +278,23 @@ void drm_gpuvm_init(struct drm_gpuvm *gpuvm,
> const char *name,
>                     u64 start_offset, u64 range,
>                     u64 reserve_offset, u64 reserve_range,
>                     const struct drm_gpuvm_ops *ops);
> -void drm_gpuvm_destroy(struct drm_gpuvm *gpuvm);
> +
> +/**
> + * drm_gpuvm_get() - acquire a struct drm_gpuvm reference
> + * @gpuvm: the &drm_gpuvm to acquire the reference of
> + *
> + * This function acquires an additional reference to @gpuvm. It is
> illegal to
> + * call this without already holding a reference. No locks required.
> + */
> +static inline struct drm_gpuvm *
> +drm_gpuvm_get(struct drm_gpuvm *gpuvm)
> +{
> +       kref_get(&gpuvm->kref);
> +
> +       return gpuvm;
> +}
> +
> +void drm_gpuvm_put(struct drm_gpuvm *gpuvm);
>  
>  bool drm_gpuvm_range_valid(struct drm_gpuvm *gpuvm, u64 addr, u64
> range);
>  bool drm_gpuvm_interval_empty(struct drm_gpuvm *gpuvm, u64 addr, u64
> range);
> @@ -673,6 +694,14 @@ static inline void drm_gpuva_init_from_op(struct
> drm_gpuva *va,
>   * operations to drivers.
>   */
>  struct drm_gpuvm_ops {
> +       /**
> +        * @vm_free: called when the last reference of a struct
> drm_gpuvm is
> +        * dropped
> +        *
> +        * This callback is mandatory.
> +        */
> +       void (*vm_free)(struct drm_gpuvm *gpuvm);
> +
>         /**
>          * @op_alloc: called when the &drm_gpuvm allocates
>          * a struct drm_gpuva_op
Thomas Hellstrom Nov. 2, 2023, 5:09 p.m. UTC | #3
On Thu, 2023-11-02 at 00:31 +0100, Danilo Krummrich wrote:
> Implement reference counting for struct drm_gpuvm.
> 
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> ---
>  drivers/gpu/drm/drm_gpuvm.c            | 44 +++++++++++++++++++-----
> --
>  drivers/gpu/drm/nouveau/nouveau_uvmm.c | 20 +++++++++---
>  include/drm/drm_gpuvm.h                | 31 +++++++++++++++++-
>  3 files changed, 78 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_gpuvm.c
> b/drivers/gpu/drm/drm_gpuvm.c
> index 53e2c406fb04..6a88eafc5229 100644
> --- a/drivers/gpu/drm/drm_gpuvm.c
> +++ b/drivers/gpu/drm/drm_gpuvm.c
> @@ -746,6 +746,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const
> char *name,
>         gpuvm->rb.tree = RB_ROOT_CACHED;
>         INIT_LIST_HEAD(&gpuvm->rb.list);
>  
> +       kref_init(&gpuvm->kref);
> +
>         gpuvm->name = name ? name : "unknown";
>         gpuvm->flags = flags;
>         gpuvm->ops = ops;
> @@ -770,15 +772,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const
> char *name,
>  }
>  EXPORT_SYMBOL_GPL(drm_gpuvm_init);
>  
> -/**
> - * drm_gpuvm_destroy() - cleanup a &drm_gpuvm
> - * @gpuvm: pointer to the &drm_gpuvm to clean up
> - *
> - * Note that it is a bug to call this function on a manager that
> still
> - * holds GPU VA mappings.
> - */
> -void
> -drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
> +static void
> +drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
>  {
>         gpuvm->name = NULL;
>  
> @@ -790,7 +785,33 @@ drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
>  
>         drm_gem_object_put(gpuvm->r_obj);
>  }
> -EXPORT_SYMBOL_GPL(drm_gpuvm_destroy);
> +
> +static void
> +drm_gpuvm_free(struct kref *kref)
> +{
> +       struct drm_gpuvm *gpuvm = container_of(kref, struct
> drm_gpuvm, kref);
> +
> +       if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
> +               return;
> +
> +       drm_gpuvm_fini(gpuvm);
> +
> +       gpuvm->ops->vm_free(gpuvm);
> +}
> +
> +/**
> + * drm_gpuvm_bo_put() - drop a struct drm_gpuvm reference
copy-paste error in function name.

Also, it appears that Xe might put a VM from IRQ context, so we should
document the context in which this function may be called and, if
applicable, add a might_sleep().

If this function needs to sleep we can work around that in Xe by
keeping an xe-private refcount for the xe vm container, but I'd like to
avoid that if possible and piggy-back on the refcount introduced here.
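
Purely as an illustration of this suggestion (assuming the final put were in
fact allowed to sleep, which is still being discussed here), such an
annotation could look like:

void
drm_gpuvm_put(struct drm_gpuvm *gpuvm)
{
        /* Hypothetical annotation: would assert that callers are in a
         * sleepable context, which only makes sense if ->vm_free() or
         * the resv GEM's free path may sleep.
         */
        might_sleep();

        if (gpuvm)
                kref_put(&gpuvm->kref, drm_gpuvm_free);
}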

> + * @gpuvm: the &drm_gpuvm to release the reference of
> + *
> + * This releases a reference to @gpuvm.
> + */
> +void
> +drm_gpuvm_put(struct drm_gpuvm *gpuvm)
> +{
> +       if (gpuvm)
> +               kref_put(&gpuvm->kref, drm_gpuvm_free);
> +}
> +EXPORT_SYMBOL_GPL(drm_gpuvm_put);
>  
>  static int
>  __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
> @@ -843,7 +864,7 @@ drm_gpuva_insert(struct drm_gpuvm *gpuvm,
>         if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
>                 return -EINVAL;
>  
> -       return __drm_gpuva_insert(gpuvm, va);
> +       return __drm_gpuva_insert(drm_gpuvm_get(gpuvm), va);

Here we leak a reference if __drm_gpuva_insert() fails, and IMO the
reference should be taken where the pointer holding the reference is
assigned (in this case in __drm_gpuva_insert()), or document the
reference transfer from the argument close to the assignment.

But since a va itself is not refcounted it clearly can't outlive the
vm, so is a reference really needed here?

I'd suggest using an accessor that instead of using va->vm uses
va->vm_bo->vm, to avoid needing to worry about the vm->vm refcount
altogether.
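
A rough sketch of the suggested accessor (the helper name is hypothetical,
and it assumes every va has a vm_bo, which GPUVM does not require):

/* Hypothetical accessor illustrating the suggestion above. */
static inline struct drm_gpuvm *
drm_gpuva_vm(struct drm_gpuva *va)
{
        return va->vm_bo->vm;
}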

Thanks,
Thomas
Danilo Krummrich Nov. 2, 2023, 5:32 p.m. UTC | #4
Hi Thomas,

thanks for your timely response on that!

On 11/2/23 18:09, Thomas Hellström wrote:
> On Thu, 2023-11-02 at 00:31 +0100, Danilo Krummrich wrote:
>> Implement reference counting for struct drm_gpuvm.
>>
>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>> ---
>>   drivers/gpu/drm/drm_gpuvm.c            | 44 +++++++++++++++++++-----
>> --
>>   drivers/gpu/drm/nouveau/nouveau_uvmm.c | 20 +++++++++---
>>   include/drm/drm_gpuvm.h                | 31 +++++++++++++++++-
>>   3 files changed, 78 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/drm_gpuvm.c
>> b/drivers/gpu/drm/drm_gpuvm.c
>> index 53e2c406fb04..6a88eafc5229 100644
>> --- a/drivers/gpu/drm/drm_gpuvm.c
>> +++ b/drivers/gpu/drm/drm_gpuvm.c
>> @@ -746,6 +746,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const
>> char *name,
>>          gpuvm->rb.tree = RB_ROOT_CACHED;
>>          INIT_LIST_HEAD(&gpuvm->rb.list);
>>   
>> +       kref_init(&gpuvm->kref);
>> +
>>          gpuvm->name = name ? name : "unknown";
>>          gpuvm->flags = flags;
>>          gpuvm->ops = ops;
>> @@ -770,15 +772,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const
>> char *name,
>>   }
>>   EXPORT_SYMBOL_GPL(drm_gpuvm_init);
>>   
>> -/**
>> - * drm_gpuvm_destroy() - cleanup a &drm_gpuvm
>> - * @gpuvm: pointer to the &drm_gpuvm to clean up
>> - *
>> - * Note that it is a bug to call this function on a manager that
>> still
>> - * holds GPU VA mappings.
>> - */
>> -void
>> -drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
>> +static void
>> +drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
>>   {
>>          gpuvm->name = NULL;
>>   
>> @@ -790,7 +785,33 @@ drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
>>   
>>          drm_gem_object_put(gpuvm->r_obj);
>>   }
>> -EXPORT_SYMBOL_GPL(drm_gpuvm_destroy);
>> +
>> +static void
>> +drm_gpuvm_free(struct kref *kref)
>> +{
>> +       struct drm_gpuvm *gpuvm = container_of(kref, struct
>> drm_gpuvm, kref);
>> +
>> +       if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
>> +               return;
>> +
>> +       drm_gpuvm_fini(gpuvm);
>> +
>> +       gpuvm->ops->vm_free(gpuvm);
>> +}
>> +
>> +/**
>> + * drm_gpuvm_bo_put() - drop a struct drm_gpuvm reference
> copy-paste error in function name.
> 
> Also it appears like xe might put a vm from irq context so we should
> document the context where this function call is allowable, and if
> applicable add a might_sleep().

From the GPUVM PoV I don't see why we can't call this from an IRQ context.
It depends on the driver callbacks of GPUVM (->vm_free) and the resv GEM's
free callback. Both are controlled by the driver. Hence, I don't see the
need for a restriction here.

> 
> If this function needs to sleep we can work around that in Xe by
> keeping an xe-private refcount for the xe vm container, but I'd like to
> avoid that if possible and piggy-back on the refcount introduced here.
> 
>> + * @gpuvm: the &drm_gpuvm to release the reference of
>> + *
>> + * This releases a reference to @gpuvm.
>> + */
>> +void
>> +drm_gpuvm_put(struct drm_gpuvm *gpuvm)
>> +{
>> +       if (gpuvm)
>> +               kref_put(&gpuvm->kref, drm_gpuvm_free);
>> +}
>> +EXPORT_SYMBOL_GPL(drm_gpuvm_put);
>>   
>>   static int
>>   __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
>> @@ -843,7 +864,7 @@ drm_gpuva_insert(struct drm_gpuvm *gpuvm,
>>          if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
>>                  return -EINVAL;
>>   
>> -       return __drm_gpuva_insert(gpuvm, va);
>> +       return __drm_gpuva_insert(drm_gpuvm_get(gpuvm), va);
> 
> Here we leak a reference if __drm_gpuva_insert() fails, and IMO the
> reference should be taken where the pointer holding the reference is
> assigned (in this case in __drm_gpuva_insert()), or document the
> reference transfer from the argument close to the assignment.

Ah, good catch. I had it in __drm_gpuva_insert() originally, but that
doesn't work, because __drm_gpuva_insert() is used to insert the
kernel_alloc_node. And we need to __drm_gpuva_remove() the kernel_alloc_node
from drm_gpuvm_fini(), which is called when the reference count is at zero
already. In fact, the __* variants are only there to handle the
kernel_alloc_node and this one clearly doesn't need reference counting.
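
A minimal sketch of fixing this up directly in drm_gpuva_insert(), taking
the reference only once the insert succeeded (just to illustrate the idea,
not the actual follow-up patch):

int
drm_gpuva_insert(struct drm_gpuvm *gpuvm,
                 struct drm_gpuva *va)
{
        u64 addr = va->va.addr;
        u64 range = va->va.range;
        int ret;

        if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
                return -EINVAL;

        ret = __drm_gpuva_insert(gpuvm, va);
        if (likely(!ret))
                /* Take a reference of the VM for the successfully
                 * inserted mapping only.
                 */
                drm_gpuvm_get(gpuvm);

        return ret;
}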

> 
> But since a va itself is not refcounted it clearly can't outlive the
> vm, so is a reference really needed here?

Well, technically, it can. It just doesn't make any sense and would be
considered a bug. The reference count comes in handy to prevent
that in the first place.

I'd like to keep the reference count and just fix up the code.

> 
> I'd suggest using an accessor that instead of using va->vm uses va-
>> vm_bo->vm, to avoid needing to worry about the vm->vm refcount
> altoghether.

No, I want to keep that optional. Drivers should be able to use GPUVM to
track mappings without being required to implement everything else.

I think PowerVR, for instance, currently uses GPUVM only to track mappings
without everything else.

- Danilo

> 
> Thanks,
> Thomas
>
Thomas Hellstrom Nov. 2, 2023, 6:04 p.m. UTC | #5
On Thu, 2023-11-02 at 18:32 +0100, Danilo Krummrich wrote:
> Hi Thomas,
> 
> thanks for your timely response on that!
> 
> On 11/2/23 18:09, Thomas Hellström wrote:
> > On Thu, 2023-11-02 at 00:31 +0100, Danilo Krummrich wrote:
> > > Implement reference counting for struct drm_gpuvm.
> > > 
> > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > ---
> > >   drivers/gpu/drm/drm_gpuvm.c            | 44
> > > +++++++++++++++++++-----
> > > --
> > >   drivers/gpu/drm/nouveau/nouveau_uvmm.c | 20 +++++++++---
> > >   include/drm/drm_gpuvm.h                | 31 +++++++++++++++++-
> > >   3 files changed, 78 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/drm_gpuvm.c
> > > b/drivers/gpu/drm/drm_gpuvm.c
> > > index 53e2c406fb04..6a88eafc5229 100644
> > > --- a/drivers/gpu/drm/drm_gpuvm.c
> > > +++ b/drivers/gpu/drm/drm_gpuvm.c
> > > @@ -746,6 +746,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const
> > > char *name,
> > >          gpuvm->rb.tree = RB_ROOT_CACHED;
> > >          INIT_LIST_HEAD(&gpuvm->rb.list);
> > >   
> > > +       kref_init(&gpuvm->kref);
> > > +
> > >          gpuvm->name = name ? name : "unknown";
> > >          gpuvm->flags = flags;
> > >          gpuvm->ops = ops;
> > > @@ -770,15 +772,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm,
> > > const
> > > char *name,
> > >   }
> > >   EXPORT_SYMBOL_GPL(drm_gpuvm_init);
> > >   
> > > -/**
> > > - * drm_gpuvm_destroy() - cleanup a &drm_gpuvm
> > > - * @gpuvm: pointer to the &drm_gpuvm to clean up
> > > - *
> > > - * Note that it is a bug to call this function on a manager that
> > > still
> > > - * holds GPU VA mappings.
> > > - */
> > > -void
> > > -drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
> > > +static void
> > > +drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
> > >   {
> > >          gpuvm->name = NULL;
> > >   
> > > @@ -790,7 +785,33 @@ drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
> > >   
> > >          drm_gem_object_put(gpuvm->r_obj);
> > >   }
> > > -EXPORT_SYMBOL_GPL(drm_gpuvm_destroy);
> > > +
> > > +static void
> > > +drm_gpuvm_free(struct kref *kref)
> > > +{
> > > +       struct drm_gpuvm *gpuvm = container_of(kref, struct
> > > drm_gpuvm, kref);
> > > +
> > > +       if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
> > > +               return;
> > > +
> > > +       drm_gpuvm_fini(gpuvm);
> > > +
> > > +       gpuvm->ops->vm_free(gpuvm);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpuvm_bo_put() - drop a struct drm_gpuvm reference
> > copy-paste error in function name.
> > 
> > Also it appears like xe might put a vm from irq context so we
> > should
> > document the context where this function call is allowable, and if
> > applicable add a might_sleep().
> 
>  From GPUVM PoV I don't see why we can't call this from an IRQ
> context.
> It depends on the driver callbacks of GPUVM (->vm_free) and the resv
> GEM's
> free callback. Both are controlled by the driver. Hence, I don't see
> the
> need for a restriction here.

OK. We should keep in mind, though, that if such a restriction is needed
in the future, it might be some work to fix the drivers.

> 
> > 
> > If this function needs to sleep we can work around that in Xe by
> > keeping an xe-private refcount for the xe vm container, but I'd
> > like to
> > avoid that if possible and piggy-back on the refcount introduced
> > here.
> > 
> > > + * @gpuvm: the &drm_gpuvm to release the reference of
> > > + *
> > > + * This releases a reference to @gpuvm.
> > > + */
> > > +void
> > > +drm_gpuvm_put(struct drm_gpuvm *gpuvm)
> > > +{
> > > +       if (gpuvm)
> > > +               kref_put(&gpuvm->kref, drm_gpuvm_free);
> > > +}
> > > +EXPORT_SYMBOL_GPL(drm_gpuvm_put);
> > >   
> > >   static int
> > >   __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
> > > @@ -843,7 +864,7 @@ drm_gpuva_insert(struct drm_gpuvm *gpuvm,
> > >          if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr,
> > > range)))
> > >                  return -EINVAL;
> > >   
> > > -       return __drm_gpuva_insert(gpuvm, va);
> > > +       return __drm_gpuva_insert(drm_gpuvm_get(gpuvm), va);
> > 
> > Here we leak a reference if __drm_gpuva_insert() fails, and IMO the
> > reference should be taken where the pointer holding the reference
> > is
> > assigned (in this case in __drm_gpuva_insert()), or document the
> > reference transfer from the argument close to the assignment.
> 
> Ah, good catch. I had it in __drm_gpuva_insert() originally, but that
> doesn't work, because __drm_gpuva_insert() is used to insert the
> kernel_alloc_node. And we need to __drm_gpuva_remove() the
> kernel_alloc_node
> from drm_gpuvm_fini(), which is called when the reference count is at
> zero
> already. In fact, the __* variants are only there to handle the
> kernel_alloc_node and this one clearly doesn't need reference
> counting.
> 
> > 
> > But since a va itself is not refcounted it clearly can't outlive
> > the
> > vm, so is a reference really needed here?
> 
> Well, technically, it can. It just doesn't make any sense and would
> be
> considered to be a bug. The reference count comes in handy to prevent
> that in the first place.

> 
> I'd like to keep the reference count and just fix up the code.

OK. That's probably being a bit overly cautious IMHO, but I can't see
any major drawbacks either.

> 
> > 
> > I'd suggest using an accessor that instead of using va->vm uses va-
> > > vm_bo->vm, to avoid needing to worry about the vm->vm refcount
> > altoghether.
> 
> No, I want to keep that optional. Drivers should be able to use GPUVM
> to
> track mappings without being required to implement everything else.
> 
> I think PowerVR, for instance, currently uses GPUVM only to track
> mappings
> without everything else.

Yeah, I also realized that userptr is another potential user.
A badly thought-through suggestion...

Thanks,
Thomas


> 
> - Danilo
> 
> > 
> > Thanks,
> > Thomas
> > 
>
Christian König Nov. 3, 2023, 7:18 a.m. UTC | #6
On 02.11.23 at 00:31, Danilo Krummrich wrote:
> Implement reference counting for struct drm_gpuvm.

From the design point of view, what is that good for?

Background is that the most common use case I see is that this object is 
embedded into something else and a reference count is then not really a 
good idea.

Thanks,
Christian.

>
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> ---
>   drivers/gpu/drm/drm_gpuvm.c            | 44 +++++++++++++++++++-------
>   drivers/gpu/drm/nouveau/nouveau_uvmm.c | 20 +++++++++---
>   include/drm/drm_gpuvm.h                | 31 +++++++++++++++++-
>   3 files changed, 78 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c
> index 53e2c406fb04..6a88eafc5229 100644
> --- a/drivers/gpu/drm/drm_gpuvm.c
> +++ b/drivers/gpu/drm/drm_gpuvm.c
> @@ -746,6 +746,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
>   	gpuvm->rb.tree = RB_ROOT_CACHED;
>   	INIT_LIST_HEAD(&gpuvm->rb.list);
>   
> +	kref_init(&gpuvm->kref);
> +
>   	gpuvm->name = name ? name : "unknown";
>   	gpuvm->flags = flags;
>   	gpuvm->ops = ops;
> @@ -770,15 +772,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
>   }
>   EXPORT_SYMBOL_GPL(drm_gpuvm_init);
>   
> -/**
> - * drm_gpuvm_destroy() - cleanup a &drm_gpuvm
> - * @gpuvm: pointer to the &drm_gpuvm to clean up
> - *
> - * Note that it is a bug to call this function on a manager that still
> - * holds GPU VA mappings.
> - */
> -void
> -drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
> +static void
> +drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
>   {
>   	gpuvm->name = NULL;
>   
> @@ -790,7 +785,33 @@ drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
>   
>   	drm_gem_object_put(gpuvm->r_obj);
>   }
> -EXPORT_SYMBOL_GPL(drm_gpuvm_destroy);
> +
> +static void
> +drm_gpuvm_free(struct kref *kref)
> +{
> +	struct drm_gpuvm *gpuvm = container_of(kref, struct drm_gpuvm, kref);
> +
> +	if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
> +		return;
> +
> +	drm_gpuvm_fini(gpuvm);
> +
> +	gpuvm->ops->vm_free(gpuvm);
> +}
> +
> +/**
> + * drm_gpuvm_bo_put() - drop a struct drm_gpuvm reference
> + * @gpuvm: the &drm_gpuvm to release the reference of
> + *
> + * This releases a reference to @gpuvm.
> + */
> +void
> +drm_gpuvm_put(struct drm_gpuvm *gpuvm)
> +{
> +	if (gpuvm)
> +		kref_put(&gpuvm->kref, drm_gpuvm_free);
> +}
> +EXPORT_SYMBOL_GPL(drm_gpuvm_put);
>   
>   static int
>   __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
> @@ -843,7 +864,7 @@ drm_gpuva_insert(struct drm_gpuvm *gpuvm,
>   	if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
>   		return -EINVAL;
>   
> -	return __drm_gpuva_insert(gpuvm, va);
> +	return __drm_gpuva_insert(drm_gpuvm_get(gpuvm), va);
>   }
>   EXPORT_SYMBOL_GPL(drm_gpuva_insert);
>   
> @@ -876,6 +897,7 @@ drm_gpuva_remove(struct drm_gpuva *va)
>   	}
>   
>   	__drm_gpuva_remove(va);
> +	drm_gpuvm_put(va->vm);
>   }
>   EXPORT_SYMBOL_GPL(drm_gpuva_remove);
>   
> diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> index 54be12c1272f..cb2f06565c46 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> @@ -1780,6 +1780,18 @@ nouveau_uvmm_bo_unmap_all(struct nouveau_bo *nvbo)
>   	}
>   }
>   
> +static void
> +nouveau_uvmm_free(struct drm_gpuvm *gpuvm)
> +{
> +	struct nouveau_uvmm *uvmm = uvmm_from_gpuvm(gpuvm);
> +
> +	kfree(uvmm);
> +}
> +
> +static const struct drm_gpuvm_ops gpuvm_ops = {
> +	.vm_free = nouveau_uvmm_free,
> +};
> +
>   int
>   nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>   			   void *data,
> @@ -1830,7 +1842,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>   		       NOUVEAU_VA_SPACE_END,
>   		       init->kernel_managed_addr,
>   		       init->kernel_managed_size,
> -		       NULL);
> +		       &gpuvm_ops);
>   	/* GPUVM takes care from here on. */
>   	drm_gem_object_put(r_obj);
>   
> @@ -1849,8 +1861,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>   	return 0;
>   
>   out_gpuvm_fini:
> -	drm_gpuvm_destroy(&uvmm->base);
> -	kfree(uvmm);
> +	drm_gpuvm_put(&uvmm->base);
>   out_unlock:
>   	mutex_unlock(&cli->mutex);
>   	return ret;
> @@ -1902,7 +1913,6 @@ nouveau_uvmm_fini(struct nouveau_uvmm *uvmm)
>   
>   	mutex_lock(&cli->mutex);
>   	nouveau_vmm_fini(&uvmm->vmm);
> -	drm_gpuvm_destroy(&uvmm->base);
> -	kfree(uvmm);
> +	drm_gpuvm_put(&uvmm->base);
>   	mutex_unlock(&cli->mutex);
>   }
> diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
> index 0c2e24155a93..4e6e1fd3485a 100644
> --- a/include/drm/drm_gpuvm.h
> +++ b/include/drm/drm_gpuvm.h
> @@ -247,6 +247,11 @@ struct drm_gpuvm {
>   		struct list_head list;
>   	} rb;
>   
> +	/**
> +	 * @kref: reference count of this object
> +	 */
> +	struct kref kref;
> +
>   	/**
>   	 * @kernel_alloc_node:
>   	 *
> @@ -273,7 +278,23 @@ void drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
>   		    u64 start_offset, u64 range,
>   		    u64 reserve_offset, u64 reserve_range,
>   		    const struct drm_gpuvm_ops *ops);
> -void drm_gpuvm_destroy(struct drm_gpuvm *gpuvm);
> +
> +/**
> + * drm_gpuvm_get() - acquire a struct drm_gpuvm reference
> + * @gpuvm: the &drm_gpuvm to acquire the reference of
> + *
> + * This function acquires an additional reference to @gpuvm. It is illegal to
> + * call this without already holding a reference. No locks required.
> + */
> +static inline struct drm_gpuvm *
> +drm_gpuvm_get(struct drm_gpuvm *gpuvm)
> +{
> +	kref_get(&gpuvm->kref);
> +
> +	return gpuvm;
> +}
> +
> +void drm_gpuvm_put(struct drm_gpuvm *gpuvm);
>   
>   bool drm_gpuvm_range_valid(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
>   bool drm_gpuvm_interval_empty(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
> @@ -673,6 +694,14 @@ static inline void drm_gpuva_init_from_op(struct drm_gpuva *va,
>    * operations to drivers.
>    */
>   struct drm_gpuvm_ops {
> +	/**
> +	 * @vm_free: called when the last reference of a struct drm_gpuvm is
> +	 * dropped
> +	 *
> +	 * This callback is mandatory.
> +	 */
> +	void (*vm_free)(struct drm_gpuvm *gpuvm);
> +
>   	/**
>   	 * @op_alloc: called when the &drm_gpuvm allocates
>   	 * a struct drm_gpuva_op
Danilo Krummrich Nov. 3, 2023, 1:14 p.m. UTC | #7
On Fri, Nov 03, 2023 at 08:18:35AM +0100, Christian König wrote:
> On 02.11.23 at 00:31, Danilo Krummrich wrote:
> > Implement reference counting for struct drm_gpuvm.
> 
> From the design point of view what is that good for?

It was discussed in this thread [1].

Essentially, the idea is to make sure that vm_bo->vm is always valid without the
driver needing to take extra care. It also ensures that GPUVM can't be
freed with mappings still held.
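
As a minimal sketch of the driver-side pattern this enables (hypothetical
structure and function names; the nouveau hunk in this patch does the
equivalent):

struct my_vm {
        struct drm_gpuvm base;  /* embedded, freed via ->vm_free() */
        /* driver specific state ... */
};

static void my_vm_free(struct drm_gpuvm *gpuvm)
{
        struct my_vm *vm = container_of(gpuvm, struct my_vm, base);

        /* Called once the last reference is dropped, i.e. no mappings
         * (each holds a VM reference) and no extra driver references
         * are left.
         */
        kfree(vm);
}

static const struct drm_gpuvm_ops my_vm_ops = {
        .vm_free = my_vm_free,
};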

> 
> Background is that the most common use case I see is that this object is
> embedded into something else and a reference count is then not really a good
> idea.

Do you have a specific use-case in mind where this would interfere?

> 
> Thanks,
> Christian.

[1] https://lore.kernel.org/dri-devel/6fa058a4-20d3-44b9-af58-755cfb375d75@redhat.com/

> 
> > 
> > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > ---
> >   drivers/gpu/drm/drm_gpuvm.c            | 44 +++++++++++++++++++-------
> >   drivers/gpu/drm/nouveau/nouveau_uvmm.c | 20 +++++++++---
> >   include/drm/drm_gpuvm.h                | 31 +++++++++++++++++-
> >   3 files changed, 78 insertions(+), 17 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c
> > index 53e2c406fb04..6a88eafc5229 100644
> > --- a/drivers/gpu/drm/drm_gpuvm.c
> > +++ b/drivers/gpu/drm/drm_gpuvm.c
> > @@ -746,6 +746,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
> >   	gpuvm->rb.tree = RB_ROOT_CACHED;
> >   	INIT_LIST_HEAD(&gpuvm->rb.list);
> > +	kref_init(&gpuvm->kref);
> > +
> >   	gpuvm->name = name ? name : "unknown";
> >   	gpuvm->flags = flags;
> >   	gpuvm->ops = ops;
> > @@ -770,15 +772,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
> >   }
> >   EXPORT_SYMBOL_GPL(drm_gpuvm_init);
> > -/**
> > - * drm_gpuvm_destroy() - cleanup a &drm_gpuvm
> > - * @gpuvm: pointer to the &drm_gpuvm to clean up
> > - *
> > - * Note that it is a bug to call this function on a manager that still
> > - * holds GPU VA mappings.
> > - */
> > -void
> > -drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
> > +static void
> > +drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
> >   {
> >   	gpuvm->name = NULL;
> > @@ -790,7 +785,33 @@ drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
> >   	drm_gem_object_put(gpuvm->r_obj);
> >   }
> > -EXPORT_SYMBOL_GPL(drm_gpuvm_destroy);
> > +
> > +static void
> > +drm_gpuvm_free(struct kref *kref)
> > +{
> > +	struct drm_gpuvm *gpuvm = container_of(kref, struct drm_gpuvm, kref);
> > +
> > +	if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
> > +		return;
> > +
> > +	drm_gpuvm_fini(gpuvm);
> > +
> > +	gpuvm->ops->vm_free(gpuvm);
> > +}
> > +
> > +/**
> > + * drm_gpuvm_bo_put() - drop a struct drm_gpuvm reference
> > + * @gpuvm: the &drm_gpuvm to release the reference of
> > + *
> > + * This releases a reference to @gpuvm.
> > + */
> > +void
> > +drm_gpuvm_put(struct drm_gpuvm *gpuvm)
> > +{
> > +	if (gpuvm)
> > +		kref_put(&gpuvm->kref, drm_gpuvm_free);
> > +}
> > +EXPORT_SYMBOL_GPL(drm_gpuvm_put);
> >   static int
> >   __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
> > @@ -843,7 +864,7 @@ drm_gpuva_insert(struct drm_gpuvm *gpuvm,
> >   	if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
> >   		return -EINVAL;
> > -	return __drm_gpuva_insert(gpuvm, va);
> > +	return __drm_gpuva_insert(drm_gpuvm_get(gpuvm), va);
> >   }
> >   EXPORT_SYMBOL_GPL(drm_gpuva_insert);
> > @@ -876,6 +897,7 @@ drm_gpuva_remove(struct drm_gpuva *va)
> >   	}
> >   	__drm_gpuva_remove(va);
> > +	drm_gpuvm_put(va->vm);
> >   }
> >   EXPORT_SYMBOL_GPL(drm_gpuva_remove);
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> > index 54be12c1272f..cb2f06565c46 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> > +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> > @@ -1780,6 +1780,18 @@ nouveau_uvmm_bo_unmap_all(struct nouveau_bo *nvbo)
> >   	}
> >   }
> > +static void
> > +nouveau_uvmm_free(struct drm_gpuvm *gpuvm)
> > +{
> > +	struct nouveau_uvmm *uvmm = uvmm_from_gpuvm(gpuvm);
> > +
> > +	kfree(uvmm);
> > +}
> > +
> > +static const struct drm_gpuvm_ops gpuvm_ops = {
> > +	.vm_free = nouveau_uvmm_free,
> > +};
> > +
> >   int
> >   nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
> >   			   void *data,
> > @@ -1830,7 +1842,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
> >   		       NOUVEAU_VA_SPACE_END,
> >   		       init->kernel_managed_addr,
> >   		       init->kernel_managed_size,
> > -		       NULL);
> > +		       &gpuvm_ops);
> >   	/* GPUVM takes care from here on. */
> >   	drm_gem_object_put(r_obj);
> > @@ -1849,8 +1861,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
> >   	return 0;
> >   out_gpuvm_fini:
> > -	drm_gpuvm_destroy(&uvmm->base);
> > -	kfree(uvmm);
> > +	drm_gpuvm_put(&uvmm->base);
> >   out_unlock:
> >   	mutex_unlock(&cli->mutex);
> >   	return ret;
> > @@ -1902,7 +1913,6 @@ nouveau_uvmm_fini(struct nouveau_uvmm *uvmm)
> >   	mutex_lock(&cli->mutex);
> >   	nouveau_vmm_fini(&uvmm->vmm);
> > -	drm_gpuvm_destroy(&uvmm->base);
> > -	kfree(uvmm);
> > +	drm_gpuvm_put(&uvmm->base);
> >   	mutex_unlock(&cli->mutex);
> >   }
> > diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
> > index 0c2e24155a93..4e6e1fd3485a 100644
> > --- a/include/drm/drm_gpuvm.h
> > +++ b/include/drm/drm_gpuvm.h
> > @@ -247,6 +247,11 @@ struct drm_gpuvm {
> >   		struct list_head list;
> >   	} rb;
> > +	/**
> > +	 * @kref: reference count of this object
> > +	 */
> > +	struct kref kref;
> > +
> >   	/**
> >   	 * @kernel_alloc_node:
> >   	 *
> > @@ -273,7 +278,23 @@ void drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
> >   		    u64 start_offset, u64 range,
> >   		    u64 reserve_offset, u64 reserve_range,
> >   		    const struct drm_gpuvm_ops *ops);
> > -void drm_gpuvm_destroy(struct drm_gpuvm *gpuvm);
> > +
> > +/**
> > + * drm_gpuvm_get() - acquire a struct drm_gpuvm reference
> > + * @gpuvm: the &drm_gpuvm to acquire the reference of
> > + *
> > + * This function acquires an additional reference to @gpuvm. It is illegal to
> > + * call this without already holding a reference. No locks required.
> > + */
> > +static inline struct drm_gpuvm *
> > +drm_gpuvm_get(struct drm_gpuvm *gpuvm)
> > +{
> > +	kref_get(&gpuvm->kref);
> > +
> > +	return gpuvm;
> > +}
> > +
> > +void drm_gpuvm_put(struct drm_gpuvm *gpuvm);
> >   bool drm_gpuvm_range_valid(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
> >   bool drm_gpuvm_interval_empty(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
> > @@ -673,6 +694,14 @@ static inline void drm_gpuva_init_from_op(struct drm_gpuva *va,
> >    * operations to drivers.
> >    */
> >   struct drm_gpuvm_ops {
> > +	/**
> > +	 * @vm_free: called when the last reference of a struct drm_gpuvm is
> > +	 * dropped
> > +	 *
> > +	 * This callback is mandatory.
> > +	 */
> > +	void (*vm_free)(struct drm_gpuvm *gpuvm);
> > +
> >   	/**
> >   	 * @op_alloc: called when the &drm_gpuvm allocates
> >   	 * a struct drm_gpuva_op
>
Christian König Nov. 3, 2023, 2:04 p.m. UTC | #8
On 03.11.23 at 14:14, Danilo Krummrich wrote:
> On Fri, Nov 03, 2023 at 08:18:35AM +0100, Christian König wrote:
>> On 02.11.23 at 00:31, Danilo Krummrich wrote:
>>> Implement reference counting for struct drm_gpuvm.
>>  From the design point of view what is that good for?
> It was discussed in this thread [1].
>
> Essentially, the idea is to make sure that vm_bo->vm is always valid without the
> driver having the need to take extra care. It also ensures that GPUVM can't be
> freed with mappings still held.

Well in this case I have some objections to this. The lifetime of the VM 
is driver and use case specific.

In particular, we most likely don't want the VM to live longer than the
application which originally used it. If you make the GPUVM an
independent object, you actually open the door to drivers abusing its
lifetime.

In addition to that, see below for a quite real problem with this.

>> Background is that the most common use case I see is that this object is
>> embedded into something else and a reference count is then not really a good
>> idea.
> Do you have a specific use-case in mind where this would interfere?

Yes, absolutely. For an example see amdgpu_mes_self_test(); here we
initialize a temporary amdgpu VM for an in-kernel unit test which runs
during driver load.

When the function returns I need to guarantee that the VM is destroyed 
or otherwise I will mess up normal operation.

Reference counting is nice when you don't know who else is referring to
your VM, but the cost is that you also don't know when the object is
guaranteed to be destroyed.

I can trivially work around this by saying that the generic GPUVM object
has a different lifetime than the amdgpu specific object, but that opens
the door to use-after-free again.

Regards,
Christian.

>
>> Thanks,
>> Christian.
> [1]https://lore.kernel.org/dri-devel/6fa058a4-20d3-44b9-af58-755cfb375d75@redhat.com/
>
>>> Signed-off-by: Danilo Krummrich<dakr@redhat.com>
>>> ---
>>>    drivers/gpu/drm/drm_gpuvm.c            | 44 +++++++++++++++++++-------
>>>    drivers/gpu/drm/nouveau/nouveau_uvmm.c | 20 +++++++++---
>>>    include/drm/drm_gpuvm.h                | 31 +++++++++++++++++-
>>>    3 files changed, 78 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c
>>> index 53e2c406fb04..6a88eafc5229 100644
>>> --- a/drivers/gpu/drm/drm_gpuvm.c
>>> +++ b/drivers/gpu/drm/drm_gpuvm.c
>>> @@ -746,6 +746,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
>>>    	gpuvm->rb.tree = RB_ROOT_CACHED;
>>>    	INIT_LIST_HEAD(&gpuvm->rb.list);
>>> +	kref_init(&gpuvm->kref);
>>> +
>>>    	gpuvm->name = name ? name : "unknown";
>>>    	gpuvm->flags = flags;
>>>    	gpuvm->ops = ops;
>>> @@ -770,15 +772,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
>>>    }
>>>    EXPORT_SYMBOL_GPL(drm_gpuvm_init);
>>> -/**
>>> - * drm_gpuvm_destroy() - cleanup a &drm_gpuvm
>>> - * @gpuvm: pointer to the &drm_gpuvm to clean up
>>> - *
>>> - * Note that it is a bug to call this function on a manager that still
>>> - * holds GPU VA mappings.
>>> - */
>>> -void
>>> -drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
>>> +static void
>>> +drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
>>>    {
>>>    	gpuvm->name = NULL;
>>> @@ -790,7 +785,33 @@ drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
>>>    	drm_gem_object_put(gpuvm->r_obj);
>>>    }
>>> -EXPORT_SYMBOL_GPL(drm_gpuvm_destroy);
>>> +
>>> +static void
>>> +drm_gpuvm_free(struct kref *kref)
>>> +{
>>> +	struct drm_gpuvm *gpuvm = container_of(kref, struct drm_gpuvm, kref);
>>> +
>>> +	if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
>>> +		return;
>>> +
>>> +	drm_gpuvm_fini(gpuvm);
>>> +
>>> +	gpuvm->ops->vm_free(gpuvm);
>>> +}
>>> +
>>> +/**
>>> + * drm_gpuvm_bo_put() - drop a struct drm_gpuvm reference
>>> + * @gpuvm: the &drm_gpuvm to release the reference of
>>> + *
>>> + * This releases a reference to @gpuvm.
>>> + */
>>> +void
>>> +drm_gpuvm_put(struct drm_gpuvm *gpuvm)
>>> +{
>>> +	if (gpuvm)
>>> +		kref_put(&gpuvm->kref, drm_gpuvm_free);
>>> +}
>>> +EXPORT_SYMBOL_GPL(drm_gpuvm_put);
>>>    static int
>>>    __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
>>> @@ -843,7 +864,7 @@ drm_gpuva_insert(struct drm_gpuvm *gpuvm,
>>>    	if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
>>>    		return -EINVAL;
>>> -	return __drm_gpuva_insert(gpuvm, va);
>>> +	return __drm_gpuva_insert(drm_gpuvm_get(gpuvm), va);
>>>    }
>>>    EXPORT_SYMBOL_GPL(drm_gpuva_insert);
>>> @@ -876,6 +897,7 @@ drm_gpuva_remove(struct drm_gpuva *va)
>>>    	}
>>>    	__drm_gpuva_remove(va);
>>> +	drm_gpuvm_put(va->vm);
>>>    }
>>>    EXPORT_SYMBOL_GPL(drm_gpuva_remove);
>>> diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
>>> index 54be12c1272f..cb2f06565c46 100644
>>> --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
>>> +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
>>> @@ -1780,6 +1780,18 @@ nouveau_uvmm_bo_unmap_all(struct nouveau_bo *nvbo)
>>>    	}
>>>    }
>>> +static void
>>> +nouveau_uvmm_free(struct drm_gpuvm *gpuvm)
>>> +{
>>> +	struct nouveau_uvmm *uvmm = uvmm_from_gpuvm(gpuvm);
>>> +
>>> +	kfree(uvmm);
>>> +}
>>> +
>>> +static const struct drm_gpuvm_ops gpuvm_ops = {
>>> +	.vm_free = nouveau_uvmm_free,
>>> +};
>>> +
>>>    int
>>>    nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>>>    			   void *data,
>>> @@ -1830,7 +1842,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>>>    		       NOUVEAU_VA_SPACE_END,
>>>    		       init->kernel_managed_addr,
>>>    		       init->kernel_managed_size,
>>> -		       NULL);
>>> +		       &gpuvm_ops);
>>>    	/* GPUVM takes care from here on. */
>>>    	drm_gem_object_put(r_obj);
>>> @@ -1849,8 +1861,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>>>    	return 0;
>>>    out_gpuvm_fini:
>>> -	drm_gpuvm_destroy(&uvmm->base);
>>> -	kfree(uvmm);
>>> +	drm_gpuvm_put(&uvmm->base);
>>>    out_unlock:
>>>    	mutex_unlock(&cli->mutex);
>>>    	return ret;
>>> @@ -1902,7 +1913,6 @@ nouveau_uvmm_fini(struct nouveau_uvmm *uvmm)
>>>    	mutex_lock(&cli->mutex);
>>>    	nouveau_vmm_fini(&uvmm->vmm);
>>> -	drm_gpuvm_destroy(&uvmm->base);
>>> -	kfree(uvmm);
>>> +	drm_gpuvm_put(&uvmm->base);
>>>    	mutex_unlock(&cli->mutex);
>>>    }
>>> diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
>>> index 0c2e24155a93..4e6e1fd3485a 100644
>>> --- a/include/drm/drm_gpuvm.h
>>> +++ b/include/drm/drm_gpuvm.h
>>> @@ -247,6 +247,11 @@ struct drm_gpuvm {
>>>    		struct list_head list;
>>>    	} rb;
>>> +	/**
>>> +	 * @kref: reference count of this object
>>> +	 */
>>> +	struct kref kref;
>>> +
>>>    	/**
>>>    	 * @kernel_alloc_node:
>>>    	 *
>>> @@ -273,7 +278,23 @@ void drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
>>>    		    u64 start_offset, u64 range,
>>>    		    u64 reserve_offset, u64 reserve_range,
>>>    		    const struct drm_gpuvm_ops *ops);
>>> -void drm_gpuvm_destroy(struct drm_gpuvm *gpuvm);
>>> +
>>> +/**
>>> + * drm_gpuvm_get() - acquire a struct drm_gpuvm reference
>>> + * @gpuvm: the &drm_gpuvm to acquire the reference of
>>> + *
>>> + * This function acquires an additional reference to @gpuvm. It is illegal to
>>> + * call this without already holding a reference. No locks required.
>>> + */
>>> +static inline struct drm_gpuvm *
>>> +drm_gpuvm_get(struct drm_gpuvm *gpuvm)
>>> +{
>>> +	kref_get(&gpuvm->kref);
>>> +
>>> +	return gpuvm;
>>> +}
>>> +
>>> +void drm_gpuvm_put(struct drm_gpuvm *gpuvm);
>>>    bool drm_gpuvm_range_valid(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
>>>    bool drm_gpuvm_interval_empty(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
>>> @@ -673,6 +694,14 @@ static inline void drm_gpuva_init_from_op(struct drm_gpuva *va,
>>>     * operations to drivers.
>>>     */
>>>    struct drm_gpuvm_ops {
>>> +	/**
>>> +	 * @vm_free: called when the last reference of a struct drm_gpuvm is
>>> +	 * dropped
>>> +	 *
>>> +	 * This callback is mandatory.
>>> +	 */
>>> +	void (*vm_free)(struct drm_gpuvm *gpuvm);
>>> +
>>>    	/**
>>>    	 * @op_alloc: called when the &drm_gpuvm allocates
>>>    	 * a struct drm_gpuva_op
Danilo Krummrich Nov. 3, 2023, 3:34 p.m. UTC | #9
On 11/3/23 15:04, Christian König wrote:
> On 03.11.23 at 14:14, Danilo Krummrich wrote:
>> On Fri, Nov 03, 2023 at 08:18:35AM +0100, Christian König wrote:
>>> On 02.11.23 at 00:31, Danilo Krummrich wrote:
>>>> Implement reference counting for struct drm_gpuvm.
>>>  From the design point of view what is that good for?
>> It was discussed in this thread [1].
>>
>> Essentially, the idea is to make sure that vm_bo->vm is always valid without the
>> driver having the need to take extra care. It also ensures that GPUVM can't be
>> freed with mappings still held.
> 
> Well in this case I have some objections to this. The lifetime of the VM is driver and use case specific.

That's fine, I don't see how this changes with a reference count.

> 
> Especially we most likely don't want the VM to live longer than the application which originally used it. If you make the GPUVM an independent object you actually open up driver abuse for the lifetime of this.

Right, we don't want that. But I don't see how the reference count prevents that.

Independent object is relative. struct drm_gpuvm is still embedded into a driver
specific structure. It's working the same way as with struct drm_gem_object.

> 
> Additional to that see below for a quite real problem with this.
> 
>>> Background is that the most common use case I see is that this object is
>>> embedded into something else and a reference count is then not really a good
>>> idea.
>> Do you have a specific use-case in mind where this would interfere?
> 
> Yes, absolutely. For an example see amdgpu_mes_self_test(), here we initialize a temporary amdgpu VM for an in kernel unit test which runs during driver load.
> 
> When the function returns I need to guarantee that the VM is destroyed or otherwise I will mess up normal operation.

Nothing prevents that. The reference counting is well defined. If the driver did not
take additional references (which is clearly the driver's responsibility) and all
VM_BOs and mappings are cleaned up, the reference count is guaranteed to be 1 at this
point.

Also note that if the driver had not cleaned up all VM_BOs and mappings before
shutting down the VM, it would have been a bug anyway and the driver would potentially
leak memory and run into UAF issues.

Hence, if the VM is still alive at a point where you don't expect it to be, then it's
simply a driver bug.
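
To spell out the ordering, a sketch with hypothetical my_*() helpers and a
hypothetical struct my_vm embedding the &drm_gpuvm as 'base'
(drm_gpuvm_put() and drm_gpuva_remove() are the calls from this patch):

static void my_vm_teardown(struct my_vm *vm)
{
        /* The driver removes all mappings; each drm_gpuva_remove()
         * drops the VM reference taken at insert time.
         */
        my_vm_unmap_all(vm);

        /* The driver drops all remaining drm_gpuvm_bo references. */
        my_vm_put_all_vm_bos(vm);

        /* With no extra references taken by the driver, this is the
         * last reference; ->vm_free() runs synchronously here.
         */
        drm_gpuvm_put(&vm->base);
}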

> 
> Reference counting is nice when you don't know who else is referring to your VM, but the cost is that you also don't know when the object will guardedly be destroyed.
> 
> I can trivially work around this by saying that the generic GPUVM object has a different lifetime than the amdgpu specific object, but that opens up doors for use after free again.

If your driver never touches the VM's reference count and exits the VM with a clean state
(no mappings and no VM_BOs left), effectively, this is the same as having no reference
count.

In the very worst case you could argue that we trade a potential UAF *and* memory leak
(no reference count) for *only* a memory leak (with reference count), which to me seems
reasonable.

> 
> Regards,
> Christian.
> 
>>> Thanks,
>>> Christian.
>> [1]https://lore.kernel.org/dri-devel/6fa058a4-20d3-44b9-af58-755cfb375d75@redhat.com/
>>
>>>> Signed-off-by: Danilo Krummrich<dakr@redhat.com>
>>>> ---
>>>>    drivers/gpu/drm/drm_gpuvm.c            | 44 +++++++++++++++++++-------
>>>>    drivers/gpu/drm/nouveau/nouveau_uvmm.c | 20 +++++++++---
>>>>    include/drm/drm_gpuvm.h                | 31 +++++++++++++++++-
>>>>    3 files changed, 78 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c
>>>> index 53e2c406fb04..6a88eafc5229 100644
>>>> --- a/drivers/gpu/drm/drm_gpuvm.c
>>>> +++ b/drivers/gpu/drm/drm_gpuvm.c
>>>> @@ -746,6 +746,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
>>>>    	gpuvm->rb.tree = RB_ROOT_CACHED;
>>>>    	INIT_LIST_HEAD(&gpuvm->rb.list);
>>>> +	kref_init(&gpuvm->kref);
>>>> +
>>>>    	gpuvm->name = name ? name : "unknown";
>>>>    	gpuvm->flags = flags;
>>>>    	gpuvm->ops = ops;
>>>> @@ -770,15 +772,8 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(drm_gpuvm_init);
>>>> -/**
>>>> - * drm_gpuvm_destroy() - cleanup a &drm_gpuvm
>>>> - * @gpuvm: pointer to the &drm_gpuvm to clean up
>>>> - *
>>>> - * Note that it is a bug to call this function on a manager that still
>>>> - * holds GPU VA mappings.
>>>> - */
>>>> -void
>>>> -drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
>>>> +static void
>>>> +drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
>>>>    {
>>>>    	gpuvm->name = NULL;
>>>> @@ -790,7 +785,33 @@ drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
>>>>    	drm_gem_object_put(gpuvm->r_obj);
>>>>    }
>>>> -EXPORT_SYMBOL_GPL(drm_gpuvm_destroy);
>>>> +
>>>> +static void
>>>> +drm_gpuvm_free(struct kref *kref)
>>>> +{
>>>> +	struct drm_gpuvm *gpuvm = container_of(kref, struct drm_gpuvm, kref);
>>>> +
>>>> +	if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
>>>> +		return;
>>>> +
>>>> +	drm_gpuvm_fini(gpuvm);
>>>> +
>>>> +	gpuvm->ops->vm_free(gpuvm);
>>>> +}
>>>> +
>>>> +/**
>>>> + * drm_gpuvm_bo_put() - drop a struct drm_gpuvm reference
>>>> + * @gpuvm: the &drm_gpuvm to release the reference of
>>>> + *
>>>> + * This releases a reference to @gpuvm.
>>>> + */
>>>> +void
>>>> +drm_gpuvm_put(struct drm_gpuvm *gpuvm)
>>>> +{
>>>> +	if (gpuvm)
>>>> +		kref_put(&gpuvm->kref, drm_gpuvm_free);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(drm_gpuvm_put);
>>>>    static int
>>>>    __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
>>>> @@ -843,7 +864,7 @@ drm_gpuva_insert(struct drm_gpuvm *gpuvm,
>>>>    	if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
>>>>    		return -EINVAL;
>>>> -	return __drm_gpuva_insert(gpuvm, va);
>>>> +	return __drm_gpuva_insert(drm_gpuvm_get(gpuvm), va);
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(drm_gpuva_insert);
>>>> @@ -876,6 +897,7 @@ drm_gpuva_remove(struct drm_gpuva *va)
>>>>    	}
>>>>    	__drm_gpuva_remove(va);
>>>> +	drm_gpuvm_put(va->vm);
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(drm_gpuva_remove);
>>>> diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
>>>> index 54be12c1272f..cb2f06565c46 100644
>>>> --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
>>>> +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
>>>> @@ -1780,6 +1780,18 @@ nouveau_uvmm_bo_unmap_all(struct nouveau_bo *nvbo)
>>>>    	}
>>>>    }
>>>> +static void
>>>> +nouveau_uvmm_free(struct drm_gpuvm *gpuvm)
>>>> +{
>>>> +	struct nouveau_uvmm *uvmm = uvmm_from_gpuvm(gpuvm);
>>>> +
>>>> +	kfree(uvmm);
>>>> +}
>>>> +
>>>> +static const struct drm_gpuvm_ops gpuvm_ops = {
>>>> +	.vm_free = nouveau_uvmm_free,
>>>> +};
>>>> +
>>>>    int
>>>>    nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>>>>    			   void *data,
>>>> @@ -1830,7 +1842,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>>>>    		       NOUVEAU_VA_SPACE_END,
>>>>    		       init->kernel_managed_addr,
>>>>    		       init->kernel_managed_size,
>>>> -		       NULL);
>>>> +		       &gpuvm_ops);
>>>>    	/* GPUVM takes care from here on. */
>>>>    	drm_gem_object_put(r_obj);
>>>> @@ -1849,8 +1861,7 @@ nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
>>>>    	return 0;
>>>>    out_gpuvm_fini:
>>>> -	drm_gpuvm_destroy(&uvmm->base);
>>>> -	kfree(uvmm);
>>>> +	drm_gpuvm_put(&uvmm->base);
>>>>    out_unlock:
>>>>    	mutex_unlock(&cli->mutex);
>>>>    	return ret;
>>>> @@ -1902,7 +1913,6 @@ nouveau_uvmm_fini(struct nouveau_uvmm *uvmm)
>>>>    	mutex_lock(&cli->mutex);
>>>>    	nouveau_vmm_fini(&uvmm->vmm);
>>>> -	drm_gpuvm_destroy(&uvmm->base);
>>>> -	kfree(uvmm);
>>>> +	drm_gpuvm_put(&uvmm->base);
>>>>    	mutex_unlock(&cli->mutex);
>>>>    }
>>>> diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
>>>> index 0c2e24155a93..4e6e1fd3485a 100644
>>>> --- a/include/drm/drm_gpuvm.h
>>>> +++ b/include/drm/drm_gpuvm.h
>>>> @@ -247,6 +247,11 @@ struct drm_gpuvm {
>>>>    		struct list_head list;
>>>>    	} rb;
>>>> +	/**
>>>> +	 * @kref: reference count of this object
>>>> +	 */
>>>> +	struct kref kref;
>>>> +
>>>>    	/**
>>>>    	 * @kernel_alloc_node:
>>>>    	 *
>>>> @@ -273,7 +278,23 @@ void drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
>>>>    		    u64 start_offset, u64 range,
>>>>    		    u64 reserve_offset, u64 reserve_range,
>>>>    		    const struct drm_gpuvm_ops *ops);
>>>> -void drm_gpuvm_destroy(struct drm_gpuvm *gpuvm);
>>>> +
>>>> +/**
>>>> + * drm_gpuvm_get() - acquire a struct drm_gpuvm reference
>>>> + * @gpuvm: the &drm_gpuvm to acquire the reference of
>>>> + *
>>>> + * This function acquires an additional reference to @gpuvm. It is illegal to
>>>> + * call this without already holding a reference. No locks required.
>>>> + */
>>>> +static inline struct drm_gpuvm *
>>>> +drm_gpuvm_get(struct drm_gpuvm *gpuvm)
>>>> +{
>>>> +	kref_get(&gpuvm->kref);
>>>> +
>>>> +	return gpuvm;
>>>> +}
>>>> +
>>>> +void drm_gpuvm_put(struct drm_gpuvm *gpuvm);
>>>>    bool drm_gpuvm_range_valid(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
>>>>    bool drm_gpuvm_interval_empty(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
>>>> @@ -673,6 +694,14 @@ static inline void drm_gpuva_init_from_op(struct drm_gpuva *va,
>>>>     * operations to drivers.
>>>>     */
>>>>    struct drm_gpuvm_ops {
>>>> +	/**
>>>> +	 * @vm_free: called when the last reference of a struct drm_gpuvm is
>>>> +	 * dropped
>>>> +	 *
>>>> +	 * This callback is mandatory.
>>>> +	 */
>>>> +	void (*vm_free)(struct drm_gpuvm *gpuvm);
>>>> +
>>>>    	/**
>>>>    	 * @op_alloc: called when the &drm_gpuvm allocates
>>>>    	 * a struct drm_gpuva_op
>
Christian König Nov. 6, 2023, 9:14 a.m. UTC | #10
On 03.11.23 at 16:34, Danilo Krummrich wrote:
[SNIP]
>>
>> Especially we most likely don't want the VM to live longer than the 
>> application which originally used it. If you make the GPUVM an 
>> independent object you actually open up driver abuse for the lifetime 
>> of this.
>
> Right, we don't want that. But I don't see how the reference count 
> prevents that.

It doesn't prevent that, it's just not the most defensive approach.

>
> Independant object is relative. struct drm_gpuvm is still embedded 
> into a driver
> specific structure. It's working the same way as with struct 
> drm_gem_obejct.
>
>>
>> Additional to that see below for a quite real problem with this.
>>
>>>> Background is that the most common use case I see is that this 
>>>> object is
>>>> embedded into something else and a reference count is then not 
>>>> really a good
>>>> idea.
>>> Do you have a specific use-case in mind where this would interfere?
>>
>> Yes, absolutely. For an example see amdgpu_mes_self_test(), here we 
>> initialize a temporary amdgpu VM for an in kernel unit test which 
>> runs during driver load.
>>
>> When the function returns I need to guarantee that the VM is 
>> destroyed or otherwise I will mess up normal operation.
>
> Nothing prevents that. The reference counting is well defined. If the 
> driver did not
> take additional references (which is clearly up to the driver taking 
> care of) and all
> VM_BOs and mappings are cleaned up, the reference count is guaranteed 
> to be 1 at this
> point.
>
> Also note that if the driver would have not cleaned up all VM_BOs and 
> mappings before
> shutting down the VM, it would have been a bug anyways and the driver 
> would potentially
> leak memory and UAF issues.

That's exactly what I'm talking about, and why I think this is an
extremely bad idea.

It's a perfectly normal operation to shut down the VM while there are still
mappings. This is just what happens when you kill an application.

Because of this the mappings should *never* hold a reference to the VM;
rather, the VM should destroy all mappings when it is destroyed itself.

> Hence, If the VM is still alive at a point where you don't expect it 
> to be, then it's
> simply a driver bug.

Driver bugs are just what I try to prevent here. When individual mappings
keep the VM structure alive, drivers are responsible for cleaning them up;
if the VM cleans up after itself, we don't need to worry about it in the
driver.

When the mappings are destroyed with the VM, drivers can't mess this common
operation up. That's why this is more defensive.

A possible requirement is that external code needs to keep references to
the VM, but the VM must *never* reference itself through its mappings. I
would consider that a major bug in the component.

Regards,
Christian.

>
>>
>> Reference counting is nice when you don't know who else is referring 
>> to your VM, but the cost is that you also don't know when the object 
>> will guardedly be destroyed.
>>
>> I can trivially work around this by saying that the generic GPUVM 
>> object has a different lifetime than the amdgpu specific object, but 
>> that opens up doors for use after free again.
>
> If your driver never touches the VM's reference count and exits the VM 
> with a clean state
> (no mappings and no VM_BOs left), effectively, this is the same as 
> having no reference
> count.
>
> In the very worst case you could argue that we trade a potential UAF 
> *and* memroy leak
> (no reference count) with *only* a memory leak (with reference count), 
> which to me seems
> reasonable.
>
>>
>> Regards,
>> Christian.
>>
>>>> Thanks,
>>>> Christian.
>>> [1]https://lore.kernel.org/dri-devel/6fa058a4-20d3-44b9-af58-755cfb375d75@redhat.com/ 
>>>
>>>
>>>>> [SNIP]
>>
>
Danilo Krummrich Nov. 6, 2023, 12:16 p.m. UTC | #11
On Mon, Nov 06, 2023 at 10:14:29AM +0100, Christian König wrote:
> On 03.11.23 at 16:34, Danilo Krummrich wrote:
> [SNIP]
> > > 
> > > Especially we most likely don't want the VM to live longer than the
> > > application which originally used it. If you make the GPUVM an
> > > independent object you actually open up driver abuse for the
> > > lifetime of this.
> > 
> > Right, we don't want that. But I don't see how the reference count
> > prevents that.
> 
> It doesn't prevents that, it's just not the most defensive approach.
> 
> > 
> > Independant object is relative. struct drm_gpuvm is still embedded into
> > a driver
> > specific structure. It's working the same way as with struct
> > drm_gem_obejct.
> > 
> > > 
> > > Additional to that see below for a quite real problem with this.
> > > 
> > > > > Background is that the most common use case I see is that
> > > > > this object is
> > > > > embedded into something else and a reference count is then
> > > > > not really a good
> > > > > idea.
> > > > Do you have a specific use-case in mind where this would interfere?
> > > 
> > > Yes, absolutely. For an example see amdgpu_mes_self_test(), here we
> > > initialize a temporary amdgpu VM for an in kernel unit test which
> > > runs during driver load.
> > > 
> > > When the function returns I need to guarantee that the VM is
> > > destroyed or otherwise I will mess up normal operation.
> > 
> > Nothing prevents that. The reference counting is well defined. If the
> > driver did not
> > take additional references (which is clearly up to the driver taking
> > care of) and all
> > VM_BOs and mappings are cleaned up, the reference count is guaranteed to
> > be 1 at this
> > point.
> > 
> > Also note that if the driver would have not cleaned up all VM_BOs and
> > mappings before
> > shutting down the VM, it would have been a bug anyways and the driver
> > would potentially
> > leak memory and UAF issues.
> 
> Exactly that's what I'm talking about why I think this is an extremely bad
> idea.
> 
> It's a perfect normal operation to shutdown the VM while there are still
> mappings. This is just what happens when you kill an application.

Shut down the VM in terms of removing existing mappings, doing driver-specific
cleanup operations, etc. - but don't free the VM structure yet. That's what you
do after you've cleaned up everything.

So, when your application gets killed, you just call your driver_vm_destroy()
function, cleaning up all mappings etc., and then you call drm_gpuvm_put(). And
if you did a decent job cleaning things up (removing existing mappings etc.),
this drm_gpuvm_put() call will result in the vm_free() callback being called.

This reference count just prevents the VM from being freed as long as other
resources that carry a VM pointer are attached to it, such as mappings and
VM_BOs. The motivation for that is VM_BOs. For mappings it's indeed a bit
paranoid, but it doesn't hurt either and keeps it consistent.
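
To illustrate the intended order, here is a minimal sketch. It assumes a
hypothetical struct driver_vm embedding struct drm_gpuvm and a made-up
driver_vm_unmap_all() helper; only drm_gpuvm_put(), drm_gpuva_remove() and
the vm_free() callback are GPUVM API from this series.

#include <drm/drm_gpuvm.h>

/* Hypothetical embedding driver structure, for illustration only. */
struct driver_vm {
	struct drm_gpuvm base;
	/* ... driver-specific state, page table roots, etc. ... */
};

/* Hypothetical teardown path, e.g. when the application gets killed. */
static void driver_vm_destroy(struct driver_vm *dvm)
{
	/* Driver-specific cleanup: zap PTEs and remove all mappings and
	 * VM_BOs; drm_gpuva_remove() drops the reference each mapping
	 * holds on the VM.
	 */
	driver_vm_unmap_all(dvm);

	/* Drop the initial reference; with no mappings or VM_BOs left this
	 * hits refcount zero and ends up in ops->vm_free(), which frees the
	 * structure embedding the drm_gpuvm.
	 */
	drm_gpuvm_put(&dvm->base);
}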

> 
> Because of this the mapping should *never* have a reference to the VM, but
> rather the VM destroys all mapping when it is destroyed itself.
> 
> > Hence, If the VM is still alive at a point where you don't expect it to
> > be, then it's
> > simply a driver bug.
> 
> Driver bugs is just what I try to prevent here. When individual mappings
> keep the VM structure alive then drivers are responsible to clean them up,
> if the VM cleans up after itself then we don't need to worry about it in the
> driver.

Drivers are *always* responsible for that. This has nothing to do with whether
the VM is reference counted or not. GPUVM can't clean up mappings after itself.
If the driver left mappings, GPUVM would just leak them without a reference
count. It doesn't know about the driver's surrounding structures, nor does it
know about attached resources such as PT(E)s.

> 
> When the mapping is destroyed with the VM drivers can't mess this common
> operation up. That's why this is more defensive.
> 
> What is a possible requirement is that external code needs to keep
> references to the VM, but *never* the VM to itself through the mappings. I
> would consider that a major bug in the component.

Obviously, you just (want to) apply different semantics to this reference
count. It is meant to reflect that the VM structure can be freed, not that the
VM can be cleaned up. If you want the latter, you can have a driver-specific
reference count for that in exactly the same way as before this patch.
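
For instance, such a driver-side count could look roughly like the sketch
below. The cleanup_ref member and driver_vm_cleanup() are made up for
illustration (as is driver_vm_unmap_all()); only drm_gpuvm_put() is GPUVM
API.

/* Hypothetical: extend the driver_vm sketch above with a second,
 * driver-private refcount 'cleanup_ref' expressing "the VM can be cleaned
 * up", independent of the structural drm_gpuvm.kref.
 */
static void driver_vm_cleanup(struct kref *kref)
{
	struct driver_vm *dvm = container_of(kref, struct driver_vm,
					     cleanup_ref);

	/* Remove all mappings and VM_BOs, tear down page tables, ... */
	driver_vm_unmap_all(dvm);

	/* ... and only then drop the structural GPUVM reference. */
	drm_gpuvm_put(&dvm->base);
}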

> 
> Regards,
> Christian.
> 
> > 
> > > 
> > > Reference counting is nice when you don't know who else is referring
> > > to your VM, but the cost is that you also don't know when the object
> > > will guardedly be destroyed.
> > > 
> > > I can trivially work around this by saying that the generic GPUVM
> > > object has a different lifetime than the amdgpu specific object, but
> > > that opens up doors for use after free again.
> > 
> > If your driver never touches the VM's reference count and exits the VM
> > with a clean state
> > (no mappings and no VM_BOs left), effectively, this is the same as
> > having no reference
> > count.
> > 
> > In the very worst case you could argue that we trade a potential UAF
> > *and* memroy leak
> > (no reference count) with *only* a memory leak (with reference count),
> > which to me seems
> > reasonable.
> > 
> > > 
> > > Regards,
> > > Christian.
> > > 
> > > > > Thanks,
> > > > > Christian.
> > > > [1]https://lore.kernel.org/dri-devel/6fa058a4-20d3-44b9-af58-755cfb375d75@redhat.com/
> > > > 
> > > > 
> > > > > > [SNIP]
> > > 
> > 
>
Christian König Nov. 6, 2023, 1:05 p.m. UTC | #12
On 06.11.23 at 13:16, Danilo Krummrich wrote:
> [SNIP]
> This reference count just prevents that the VM is freed as long as other
> ressources are attached to it that carry a VM pointer, such as mappings and
> VM_BOs. The motivation for that are VM_BOs. For mappings it's indeed a bit
> paranoid, but it doesn't hurt either and keeps it consistant.

Ah! Yeah, we have similar semantics in amdgpu as well.

But we keep the reference to the root GEM object and not the VM.

Ok, that makes much more sense than keeping one reference for each mapping.

>> Because of this the mapping should *never* have a reference to the VM, but
>> rather the VM destroys all mapping when it is destroyed itself.
>>
>>> Hence, If the VM is still alive at a point where you don't expect it to
>>> be, then it's
>>> simply a driver bug.
>> Driver bugs is just what I try to prevent here. When individual mappings
>> keep the VM structure alive then drivers are responsible to clean them up,
>> if the VM cleans up after itself then we don't need to worry about it in the
>> driver.
> Drivers are *always* responsible for that. This has nothing to do with whether
> the VM is reference counted or not. GPUVM can't clean up mappings after itself.

Why not?

At least in amdgpu we have it exactly like that. E.g. the higher level can
clean up the BO_VM structure at any time, even when there are mappings. The
VM then keeps track of which areas still need to be invalidated in the
physical representation of the page tables.

I would expect that the generalized GPU VM handling would need something
similar. If we leave that to the driver, each driver would have to implement
that stuff on its own again.

> If the driver left mappings, GPUVM would just leak them without reference count.
> It doesn't know about the drivers surrounding structures, nor does it know about
> attached ressources such as PT(E)s.

What are we talking about with the word "mapping"? The BO_VM structure? Or
each individual mapping?

E.g. what we need to prevent is that the VM structure (or the root GEM
object) is released while VM_BOs are still around. That's what I totally
agree on.

But each individual mapping is a different story. Userspace can create so
many of them that we probably could even overrun a 32-bit counter quite
easily.

>> When the mapping is destroyed with the VM drivers can't mess this common
>> operation up. That's why this is more defensive.
>>
>> What is a possible requirement is that external code needs to keep
>> references to the VM, but *never* the VM to itself through the mappings. I
>> would consider that a major bug in the component.
> Obviously, you just (want to) apply a different semantics to this reference
> count. It is meant to reflect that the VM structure can be freed, instead of the
> VM can be cleaned up. If you want to latter, you can have a driver specifc
> reference count for that in the exact same way as it was before this patch.

Yeah, it becomes clear that you're trying to solve a different problem than
I expected.

Regards,
Christian.

>
>> Regards,
>> Christian.
>>
>>>> Reference counting is nice when you don't know who else is referring
>>>> to your VM, but the cost is that you also don't know when the object
>>>> will guardedly be destroyed.
>>>>
>>>> I can trivially work around this by saying that the generic GPUVM
>>>> object has a different lifetime than the amdgpu specific object, but
>>>> that opens up doors for use after free again.
>>> If your driver never touches the VM's reference count and exits the VM
>>> with a clean state
>>> (no mappings and no VM_BOs left), effectively, this is the same as
>>> having no reference
>>> count.
>>>
>>> In the very worst case you could argue that we trade a potential UAF
>>> *and* memroy leak
>>> (no reference count) with *only* a memory leak (with reference count),
>>> which to me seems
>>> reasonable.
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>> Thanks,
>>>>>> Christian.
>>>>> [1]https://lore.kernel.org/dri-devel/6fa058a4-20d3-44b9-af58-755cfb375d75@redhat.com/
>>>>>
>>>>>
>>>>>>> [SNIP]
Danilo Krummrich Nov. 6, 2023, 2:11 p.m. UTC | #13
On Mon, Nov 06, 2023 at 02:05:13PM +0100, Christian König wrote:
> On 06.11.23 at 13:16, Danilo Krummrich wrote:
> > [SNIP]
> > This reference count just prevents that the VM is freed as long as other
> > ressources are attached to it that carry a VM pointer, such as mappings and
> > VM_BOs. The motivation for that are VM_BOs. For mappings it's indeed a bit
> > paranoid, but it doesn't hurt either and keeps it consistant.
> 
> Ah! Yeah, we have similar semantics in amdgpu as well.
> 
> But we keep the reference to the root GEM object and not the VM.
> 
> Ok, that makes much more sense then keeping one reference for each mapping.
> 
> > > Because of this the mapping should *never* have a reference to the VM, but
> > > rather the VM destroys all mapping when it is destroyed itself.
> > > 
> > > > Hence, If the VM is still alive at a point where you don't expect it to
> > > > be, then it's
> > > > simply a driver bug.
> > > Driver bugs is just what I try to prevent here. When individual mappings
> > > keep the VM structure alive then drivers are responsible to clean them up,
> > > if the VM cleans up after itself then we don't need to worry about it in the
> > > driver.
> > Drivers are *always* responsible for that. This has nothing to do with whether
> > the VM is reference counted or not. GPUVM can't clean up mappings after itself.
> 
> Why not?

I feel like we're talking past each other here, at least to some extent.
However, I can't yet see where exactly the misunderstanding resides.

> 
> At least in amdgpu we have it exactly like that. E.g. the higher level can
> cleanup the BO_VM structure at any time possible, even when there are
> mappings.

What do you mean by "cleanup the VM_BO structure" exactly?

The VM_BO structure keeps track of all the mappings mapped in the VM_BO's VM
that are backed by the VM_BO's GEM object. And the GEM object keeps a list of
the corresponding VM_BOs.

Hence, as long as there are mappings that this VM_BO keeps track of, this VM_BO
should stay alive.

> The VM then keeps track which areas still need to be invalidated
> in the physical representation of the page tables.

And the VM does that through its tree of mappings (struct drm_gpuva). Hence, if
the VM just removed those structures on cleanup by itself, you'd lose the
ability to clean up the page tables. Unless you track this separately, which
would make the whole tracking of GPUVM itself kinda pointless.
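
To make that concrete, the driver_vm_unmap_all() helper assumed in the
earlier sketches could look roughly like this. Again just a sketch:
driver_unmap_pages() is made up, the iterator macro and the va.addr/va.range
fields are assumptions about the GPUVM API, and how the drm_gpuva itself is
freed is up to the driver.

/* Sketch: driver-side teardown walking GPUVM's tree of mappings, so the
 * same structures that track the VA space also drive PTE invalidation.
 */
static void driver_vm_unmap_all(struct driver_vm *dvm)
{
	struct drm_gpuvm *gpuvm = &dvm->base;
	struct drm_gpuva *va, *next;

	drm_gpuvm_for_each_va_safe(va, next, gpuvm) {
		/* Made-up hook: zap the PTEs covering this mapping. */
		driver_unmap_pages(dvm, va->va.addr, va->va.range);

		/* Removes the mapping from the VA tree and drops the
		 * reference it holds on the VM.
		 */
		drm_gpuva_remove(va);
		kfree(va);	/* assuming the driver kmalloc'd it */
	}
}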

> 
> I would expect that the generalized GPU VM handling would need something
> similar. If we leave that to the driver then each driver would have to
> implement that stuff on it's own again.

Similar to what? What exactly do you think can be generalized here?

> 
> > If the driver left mappings, GPUVM would just leak them without reference count.
> > It doesn't know about the drivers surrounding structures, nor does it know about
> > attached ressources such as PT(E)s.
> 
> What are we talking with the word "mapping"? The BO_VM structure? Or each
> individual mapping?

An individual mapping represented by struct drm_gpuva.

> 
> E.g. what we need to prevent is that VM structure (or the root GEM object)
> is released while VM_BOs are still around. That's what I totally agree on.
> 
> But each individual mapping is a different story. Userspace can create so
> many of them that we probably could even overrun a 32bit counter quite
> easily.

REFCOUNT_MAX is specified as 0x7fff_ffff. I agree there can be a lot of
mappings, but (including the VM_BO references) more than 2.147.483.647 per VM?

> 
> > > When the mapping is destroyed with the VM drivers can't mess this common
> > > operation up. That's why this is more defensive.
> > > 
> > > What is a possible requirement is that external code needs to keep
> > > references to the VM, but *never* the VM to itself through the mappings. I
> > > would consider that a major bug in the component.
> > Obviously, you just (want to) apply a different semantics to this reference
> > count. It is meant to reflect that the VM structure can be freed, instead of the
> > VM can be cleaned up. If you want to latter, you can have a driver specifc
> > reference count for that in the exact same way as it was before this patch.
> 
> Yeah, it becomes clear that you try to solve some different problem than I
> have expected.
> 
> Regards,
> Christian.
> 
> > 
> > > Regards,
> > > Christian.
> > > 
> > > > > Reference counting is nice when you don't know who else is referring
> > > > > to your VM, but the cost is that you also don't know when the object
> > > > > will guardedly be destroyed.
> > > > > 
> > > > > I can trivially work around this by saying that the generic GPUVM
> > > > > object has a different lifetime than the amdgpu specific object, but
> > > > > that opens up doors for use after free again.
> > > > If your driver never touches the VM's reference count and exits the VM
> > > > with a clean state
> > > > (no mappings and no VM_BOs left), effectively, this is the same as
> > > > having no reference
> > > > count.
> > > > 
> > > > In the very worst case you could argue that we trade a potential UAF
> > > > *and* memroy leak
> > > > (no reference count) with *only* a memory leak (with reference count),
> > > > which to me seems
> > > > reasonable.
> > > > 
> > > > > Regards,
> > > > > Christian.
> > > > > 
> > > > > > > Thanks,
> > > > > > > Christian.
> > > > > > [1]https://lore.kernel.org/dri-devel/6fa058a4-20d3-44b9-af58-755cfb375d75@redhat.com/
> > > > > > 
> > > > > > 
> > > > > > > > [SNIP]
Christian König Nov. 6, 2023, 3:10 p.m. UTC | #14
On 06.11.23 at 15:11, Danilo Krummrich wrote:
> On Mon, Nov 06, 2023 at 02:05:13PM +0100, Christian König wrote:
>> On 06.11.23 at 13:16, Danilo Krummrich wrote:
>>> [SNIP]
>>> This reference count just prevents that the VM is freed as long as other
>>> ressources are attached to it that carry a VM pointer, such as mappings and
>>> VM_BOs. The motivation for that are VM_BOs. For mappings it's indeed a bit
>>> paranoid, but it doesn't hurt either and keeps it consistant.
>> Ah! Yeah, we have similar semantics in amdgpu as well.
>>
>> But we keep the reference to the root GEM object and not the VM.
>>
>> Ok, that makes much more sense then keeping one reference for each mapping.
>>
>>>> Because of this the mapping should *never* have a reference to the VM, but
>>>> rather the VM destroys all mapping when it is destroyed itself.
>>>>
>>>>> Hence, If the VM is still alive at a point where you don't expect it to
>>>>> be, then it's
>>>>> simply a driver bug.
>>>> Driver bugs is just what I try to prevent here. When individual mappings
>>>> keep the VM structure alive then drivers are responsible to clean them up,
>>>> if the VM cleans up after itself then we don't need to worry about it in the
>>>> driver.
>>> Drivers are *always* responsible for that. This has nothing to do with whether
>>> the VM is reference counted or not. GPUVM can't clean up mappings after itself.
>> Why not?
> I feel like we're talking past each other here, at least to some extend.
> However, I can't yet see where exactly the misunderstanding resides.

+1

>> At least in amdgpu we have it exactly like that. E.g. the higher level can
>> cleanup the BO_VM structure at any time possible, even when there are
>> mappings.
> What do you mean with "cleanup the VM_BO structue" exactly?
>
> The VM_BO structure keeps track of all the mappings mapped in the VM_BO's VM
> being backed by the VM_BO's GEM object. And the GEM objects keeps a list of
> the corresponding VM_BOs.
>
> Hence, as long as there are mappings that this VM_BO keeps track of, this VM_BO
> should stay alive.

No, exactly the other way around. When the VM_BO structure is destroyed,
the mappings are destroyed with it.

Otherwise you would need to destroy each individual mapping separately
before teardown, which is quite inefficient.

>> The VM then keeps track which areas still need to be invalidated
>> in the physical representation of the page tables.
> And the VM does that through its tree of mappings (struct drm_gpuva). Hence, if
> the VM would just remove those structures on cleanup by itself, you'd loose the
> ability of cleaning up the page tables. Unless, you track this separately, which
> would make the whole tracking of GPUVM itself kinda pointless.

But how do you then keep track of areas which are freed and need to be
updated so that nobody can access the underlying memory any more?

>> I would expect that the generalized GPU VM handling would need something
>> similar. If we leave that to the driver then each driver would have to
>> implement that stuff on it's own again.
> Similar to what? What exactly do you think can be generalized here?

Similar to how amdgpu works.

From what I can see you are basically re-inventing everything we already
have in there and asking the same questions we stumbled over years ago.

>>> If the driver left mappings, GPUVM would just leak them without reference count.
>>> It doesn't know about the drivers surrounding structures, nor does it know about
>>> attached ressources such as PT(E)s.
>> What are we talking with the word "mapping"? The BO_VM structure? Or each
>> individual mapping?
> An individual mapping represented by struct drm_gpuva.

Yeah, then that certainly doesn't work. See below.

>> E.g. what we need to prevent is that VM structure (or the root GEM object)
>> is released while VM_BOs are still around. That's what I totally agree on.
>>
>> But each individual mapping is a different story. Userspace can create so
>> many of them that we probably could even overrun a 32bit counter quite
>> easily.
> REFCOUNT_MAX is specified as 0x7fff_ffff. I agree there can be a lot of
> mappings, but (including the VM_BO references) more than 2.147.483.647 per VM?

IIRC on amdgpu we can create something like 100k mappings per second and 
each takes ~64 bytes.

So you just need 128GiB of memory and approx 20 seconds to let the 
kernel run into a refcount overrun.

The worst I've seen in a real-world game was around 19k mappings, but that
doesn't mean this can't be exploited.

What can be done is to keep one reference per VM_BO structure, but I think
one per mapping is rather unrealistic.

Regards,
Christian.
Danilo Krummrich Nov. 6, 2023, 4:42 p.m. UTC | #15
On Mon, Nov 06, 2023 at 04:10:50PM +0100, Christian König wrote:
> On 06.11.23 at 15:11, Danilo Krummrich wrote:
> > On Mon, Nov 06, 2023 at 02:05:13PM +0100, Christian König wrote:
> > On 06.11.23 at 13:16, Danilo Krummrich wrote:
> > > > [SNIP]
> > > > This reference count just prevents that the VM is freed as long as other
> > > > ressources are attached to it that carry a VM pointer, such as mappings and
> > > > VM_BOs. The motivation for that are VM_BOs. For mappings it's indeed a bit
> > > > paranoid, but it doesn't hurt either and keeps it consistant.
> > > Ah! Yeah, we have similar semantics in amdgpu as well.
> > > 
> > > But we keep the reference to the root GEM object and not the VM.
> > > 
> > > Ok, that makes much more sense then keeping one reference for each mapping.
> > > 
> > > > > Because of this the mapping should *never* have a reference to the VM, but
> > > > > rather the VM destroys all mapping when it is destroyed itself.
> > > > > 
> > > > > > Hence, If the VM is still alive at a point where you don't expect it to
> > > > > > be, then it's
> > > > > > simply a driver bug.
> > > > > Driver bugs is just what I try to prevent here. When individual mappings
> > > > > keep the VM structure alive then drivers are responsible to clean them up,
> > > > > if the VM cleans up after itself then we don't need to worry about it in the
> > > > > driver.
> > > > Drivers are *always* responsible for that. This has nothing to do with whether
> > > > the VM is reference counted or not. GPUVM can't clean up mappings after itself.
> > > Why not?
> > I feel like we're talking past each other here, at least to some extend.
> > However, I can't yet see where exactly the misunderstanding resides.
> 
> +1
> 
> > > At least in amdgpu we have it exactly like that. E.g. the higher level can
> > > cleanup the BO_VM structure at any time possible, even when there are
> > > mappings.
> > What do you mean with "cleanup the VM_BO structue" exactly?
> > 
> > The VM_BO structure keeps track of all the mappings mapped in the VM_BO's VM
> > being backed by the VM_BO's GEM object. And the GEM objects keeps a list of
> > the corresponding VM_BOs.
> > 
> > Hence, as long as there are mappings that this VM_BO keeps track of, this VM_BO
> > should stay alive.
> 
> No, exactly the other way around. When the VM_BO structure is destroyed the
> mappings are destroyed with them.

This seems to be the same misunderstanding as with the VM reference count.

It seems to me that you want to say that for amdgpu it is a use-case to get rid
of all mappings backed by a given BO and mapped in a given VM, hence a VM_BO.
You can do that. There's even a helper for that in GPUVM.

But also in this case you first need to get rid of all mappings before you
*free* the VM_BO - GPUVM ensures that.

> 
> Otherwise you would need to destroy each individual mapping separately
> before teardown which is quite inefficient.

Not sure what you mean, but I don't see a difference between walking all VM_BOs
and removing their mappings and walking the VM's tree of mappings and removing
each of them. It comes down to the same effort in the end. But surely you can go
both ways if you know all the existing VM_BOs.

> 
> > > The VM then keeps track which areas still need to be invalidated
> > > in the physical representation of the page tables.
> > And the VM does that through its tree of mappings (struct drm_gpuva). Hence, if
> > the VM would just remove those structures on cleanup by itself, you'd loose the
> > ability of cleaning up the page tables. Unless, you track this separately, which
> > would make the whole tracking of GPUVM itself kinda pointless.
> 
> But how do you then keep track of areas which are freed and needs to be
> updated so that nobody can access the underlying memory any more?

"areas which are freed", what do refer to? What do yo mean with that?

Do you mean areas of the VA space not containing mappings? Why would I need to
track them explicitly? When the mapping is removed the corresponding page tables
/ page table entries are gone as well, hence no subsequent access to the
underlying memory would be possible.

> 
> > > I would expect that the generalized GPU VM handling would need something
> > > similar. If we leave that to the driver then each driver would have to
> > > implement that stuff on it's own again.
> > Similar to what? What exactly do you think can be generalized here?
> 
> Similar to how amdgpu works.

I don't think it's quite fair to just throw the "look at what amdgpu does"
argument at me. What am I supposed to do? Read and understand *every* detail of
*every* driver?

Did you read through the GPUVM code? That's an honest question and I'm asking it
because I feel like you're picking up some details from commit messages and
start questioning them (and that's perfectly fine and absolutely welcome).

But if the answers don't satisfy you or do not lead to a better understanding it
just seems you ask others to check out amdgpu rather than taking the time to go
through the proposed code yourself, making suggestions to improve it or explicitly
point out the changes you require.

> 
> From what I can see you are basically re-inventing everything we already
> have in there and asking the same questions we stumbled over years ago.

I did not ask any questions in the first place. I came up with something that
Nouveau, Xe, Panthor, PowerVR, etc. required and that works for them.

They also all provided a lot of ideas and contributed code through the review
process.

Of course, I want this to work for amdgpu as well. So, if you think we're
missing something fundamential or if you see something that simply doesn't work
for other drivers, like amdgpu, please educate us. I'm surely willing to learn
and, if required, change the code.

But please don't just tell me I would re-invent amdgpu and assume that I know
all the details of this driver. None of that is reasonable.

> 
> > > > If the driver left mappings, GPUVM would just leak them without reference count.
> > > > It doesn't know about the drivers surrounding structures, nor does it know about
> > > > attached ressources such as PT(E)s.
> > > What are we talking with the word "mapping"? The BO_VM structure? Or each
> > > individual mapping?
> > An individual mapping represented by struct drm_gpuva.
> 
> Yeah than that certainly doesn't work. See below.
> 
> > > E.g. what we need to prevent is that VM structure (or the root GEM object)
> > > is released while VM_BOs are still around. That's what I totally agree on.
> > > 
> > > But each individual mapping is a different story. Userspace can create so
> > > many of them that we probably could even overrun a 32bit counter quite
> > > easily.
> > REFCOUNT_MAX is specified as 0x7fff_ffff. I agree there can be a lot of
> > mappings, but (including the VM_BO references) more than 2.147.483.647 per VM?
> 
> IIRC on amdgpu we can create something like 100k mappings per second and
> each takes ~64 bytes.
> 
> So you just need 128GiB of memory and approx 20 seconds to let the kernel
> run into a refcount overrun.

100.000 * 20 = 2.000.000

That's pretty far from REFCOUNT_MAX at 2.147.483.647. So, it's more like
20.000s (REFCOUNT_MAX / 100.000 mappings per second is roughly 21.500s) if we can
keep the pace and have enough memory. Also, it's not only the mapping structures
themselves, it's also page tables, userspace structures, etc.

Again, is the number of ~2.15 Billion mappings something we really need to worry
about?

I'm still not convinced about that. But I think we can also just cap GPUVM at,
let's say, 1 Billion mappings?
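
For illustration, a rough sketch of what such a cap could look like (purely
hypothetical, not part of this series; "va_count" and DRM_GPUVM_MAX_VAS are
made-up names). The insert path would call this before taking the per-mapping
reference and the remove path would decrement the counter again:

#define DRM_GPUVM_MAX_VAS	(1u << 30)	/* ~1 billion mappings */

static int drm_gpuvm_check_va_limit(struct drm_gpuvm *gpuvm)
{
	/* hypothetical atomic_t counter of live mappings in this GPUVM */
	if (atomic_inc_return(&gpuvm->va_count) > DRM_GPUVM_MAX_VAS) {
		atomic_dec(&gpuvm->va_count);
		return -ENOSPC;
	}

	return 0;
}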

> 
> The worst I've seen in a real world game was around 19k mappings, but that
> doesn't mean that this here can't be exploited.
> 
> What can be done is to keep one reference per VM_BO structure, but I think
> per mapping is rather unrealistic.
> 
> Regards,
> Christian.
> 
>
Thomas Hellstrom Nov. 9, 2023, 3:50 p.m. UTC | #16
Danilo, Christian

On 11/6/23 17:42, Danilo Krummrich wrote:
> On Mon, Nov 06, 2023 at 04:10:50PM +0100, Christian König wrote:
>> Am 06.11.23 um 15:11 schrieb Danilo Krummrich:
>>> On Mon, Nov 06, 2023 at 02:05:13PM +0100, Christian König wrote:
>>>> Am 06.11.23 um 13:16 schrieb Danilo Krummrich:
>>>>> [SNIP]
>>>>> This reference count just prevents that the VM is freed as long as other
>>>>> ressources are attached to it that carry a VM pointer, such as mappings and
>>>>> VM_BOs. The motivation for that are VM_BOs. For mappings it's indeed a bit
>>>>> paranoid, but it doesn't hurt either and keeps it consistant.
>>>> Ah! Yeah, we have similar semantics in amdgpu as well.
>>>>
>>>> But we keep the reference to the root GEM object and not the VM.
>>>>
>>>> Ok, that makes much more sense then keeping one reference for each mapping.
>>>>
>>>>>> Because of this the mapping should *never* have a reference to the VM, but
>>>>>> rather the VM destroys all mapping when it is destroyed itself.
>>>>>>
>>>>>>> Hence, If the VM is still alive at a point where you don't expect it to
>>>>>>> be, then it's
>>>>>>> simply a driver bug.
>>>>>> Driver bugs is just what I try to prevent here. When individual mappings
>>>>>> keep the VM structure alive then drivers are responsible to clean them up,
>>>>>> if the VM cleans up after itself then we don't need to worry about it in the
>>>>>> driver.
>>>>> Drivers are *always* responsible for that. This has nothing to do with whether
>>>>> the VM is reference counted or not. GPUVM can't clean up mappings after itself.
>>>> Why not?
>>> I feel like we're talking past each other here, at least to some extend.
>>> However, I can't yet see where exactly the misunderstanding resides.
>> +1
>>
>>>> At least in amdgpu we have it exactly like that. E.g. the higher level can
>>>> cleanup the BO_VM structure at any time possible, even when there are
>>>> mappings.
>>> What do you mean with "cleanup the VM_BO structue" exactly?
>>>
>>> The VM_BO structure keeps track of all the mappings mapped in the VM_BO's VM
>>> being backed by the VM_BO's GEM object. And the GEM objects keeps a list of
>>> the corresponding VM_BOs.
>>>
>>> Hence, as long as there are mappings that this VM_BO keeps track of, this VM_BO
>>> should stay alive.
>> No, exactly the other way around. When the VM_BO structure is destroyed the
>> mappings are destroyed with them.
> This seems to be the same misunderstanding as with the VM reference count.
>
> It seems to me that you want to say that for amdgpu it seems to be a use-case
> to get rid of all mappings backed by a given BO and mapped in a given VM, hence
> a VM_BO. You can do that. Thers's even a helper for that in GPUVM.
>
> But also in this case you first need to get rid of all mappings before you
> *free* the VM_BO - GPUVM ensures that.
>
>> Otherwise you would need to destroy each individual mapping separately
>> before teardown which is quite inefficient.
> Not sure what you mean, but I don't see a difference between walking all VM_BOs
> and removing their mappings and walking the VM's tree of mappings and removing
> each of them. Comes down to the same effort in the end. But surely can go both
> ways if you know all the existing VM_BOs.
>
>>>> The VM then keeps track which areas still need to be invalidated
>>>> in the physical representation of the page tables.
>>> And the VM does that through its tree of mappings (struct drm_gpuva). Hence, if
>>> the VM would just remove those structures on cleanup by itself, you'd loose the
>>> ability of cleaning up the page tables. Unless, you track this separately, which
>>> would make the whole tracking of GPUVM itself kinda pointless.
>> But how do you then keep track of areas which are freed and needs to be
>> updated so that nobody can access the underlying memory any more?
> "areas which are freed", what do refer to? What do yo mean with that?
>
> Do you mean areas of the VA space not containing mappings? Why would I need to
> track them explicitly? When the mapping is removed the corresponding page tables
> / page table entries are gone as well, hence no subsequent access to the
> underlaying memory would be possible.
>
>>>> I would expect that the generalized GPU VM handling would need something
>>>> similar. If we leave that to the driver then each driver would have to
>>>> implement that stuff on it's own again.
>>> Similar to what? What exactly do you think can be generalized here?
>> Similar to how amdgpu works.
> I don't think it's quite fair to just throw the "look at what amdgpu does"
> argument at me. What am I supposed to do? Read and understand *every* detail of
> *every* driver?
>
> Did you read through the GPUVM code? That's a honest question and I'm asking it
> because I feel like you're picking up some details from commit messages and
> start questioning them (and that's perfectly fine and absolutely welcome).
>
> But if the answers don't satisfy you or do not lead to a better understanding it
> just seems you ask others to check out amdgpu rather than taking the time to go
> though the proposed code yourself making suggestions to improve it or explicitly
> point out the changes you require.
>
>>  From what I can see you are basically re-inventing everything we already
>> have in there and asking the same questions we stumbled over years ago.
> I did not ask any questions in the first place. I came up with something that
> Nouveau, Xe, Panthor, PowerVR, etc. required and that works for them.
>
> They also all provided a lot of ideas and contributed code through the review
> process.
>
> Of course, I want this to work for amdgpu as well. So, if you think we're
> missing something fundamential or if you see something that simply doesn't work
> for other drivers, like amdgpu, please educate us. I'm surely willing to learn
> and, if required, change the code.
>
> But please don't just tell me I would re-invent amdgpu and assume that I know
> all the details of this driver. None of that is reasonable.
>
>>>>> If the driver left mappings, GPUVM would just leak them without reference count.
>>>>> It doesn't know about the drivers surrounding structures, nor does it know about
>>>>> attached ressources such as PT(E)s.
>>>> What are we talking with the word "mapping"? The BO_VM structure? Or each
>>>> individual mapping?
>>> An individual mapping represented by struct drm_gpuva.
>> Yeah than that certainly doesn't work. See below.
>>
>>>> E.g. what we need to prevent is that VM structure (or the root GEM object)
>>>> is released while VM_BOs are still around. That's what I totally agree on.
>>>>
>>>> But each individual mapping is a different story. Userspace can create so
>>>> many of them that we probably could even overrun a 32bit counter quite
>>>> easily.
>>> REFCOUNT_MAX is specified as 0x7fff_ffff. I agree there can be a lot of
>>> mappings, but (including the VM_BO references) more than 2.147.483.647 per VM?
>> IIRC on amdgpu we can create something like 100k mappings per second and
>> each takes ~64 bytes.
>>
>> So you just need 128GiB of memory and approx 20 seconds to let the kernel
>> run into a refcount overrun.
> 100.000 * 20 = 2.000.000
>
> That's pretty far from REFCOUNT_MAX with 2.147.483.647. So, it's more like
> 20.000s if we can keep the pace and have enough memory. Also, it's not only the
> mapping structures itself, it's also page tables, userspace structures, etc.
>
> Again, is the number of ~2.15 Billion mappings something we really need to worry
> about?
>
> I'm still not convinced about that. But I think we can also just cap GPUVM at,
> let's say, 1 Billion mappings?
>
>> The worst I've seen in a real world game was around 19k mappings, but that
>> doesn't mean that this here can't be exploited.
>>
>> What can be done is to keep one reference per VM_BO structure, but I think
>> per mapping is rather unrealistic.
>>
>> Regards,
>> Christian.
>>
>>
Did we get any resolution on this?

FWIW, my take on this is that it would be possible to get GPUVM to work
both with and without internal refcounting; if with, the driver needs a
vm close to resolve cyclic references, if without, that's not necessary.
If GPUVM is allowed to refcount mappings and vm_bos, that comes with a
slight performance drop, but as Danilo pointed out, the VM lifetime
problem when iterating over a vm_bo's mappings becomes much easier to
handle and the code thus becomes easier to maintain moving forward. That
convinced me it's a good thing.

Another issue Christian brought up is that something intended to be 
embeddable (a base class) shouldn't really have its own refcount. I 
think that's a valid point. If you at some point need to derive from 
multiple such structs each having its own refcount, things will start to 
get weird. One way to resolve that would be to have the driver's 
subclass provide get() and put() ops, and export a destructor for the 
base-class, rather than to have the base-class provide the refcount and 
a destructor op.

That would also make it possible for the driver to decide the context 
for the put() call: If the driver needs to be able to call put() from 
irq / atomic context but the base-class's destructor doesn't allow 
atomic context, the driver can push freeing out to a work item if needed.
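
Something like the following, as a rough sketch of that alternative
(hypothetical names throughout; drm_gpuvm_destroy() stands in for an
exported base-class destructor that doesn't exist in this form, and
foo_vm_init() is assumed to have done kref_init() and INIT_WORK()):

struct foo_vm {
	struct drm_gpuvm base;
	struct kref kref;
	struct work_struct free_work;
};

static void foo_vm_free_work(struct work_struct *work)
{
	struct foo_vm *vm = container_of(work, struct foo_vm, free_work);

	drm_gpuvm_destroy(&vm->base);	/* hypothetical exported destructor */
	kfree(vm);
}

static void foo_vm_release(struct kref *kref)
{
	struct foo_vm *vm = container_of(kref, struct foo_vm, kref);

	/* put() may come from atomic context, defer the actual teardown */
	queue_work(system_wq, &vm->free_work);
}

static void foo_vm_put(struct foo_vm *vm)
{
	kref_put(&vm->kref, foo_vm_release);
}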

Finally, the refcount overflow Christian pointed out. Limiting the 
number of mappings sounds like a reasonable remedy to me.

But I think all of this is fixable as follow-ups if needed, unless I'm 
missing something crucial.

Just my 2 cents.

/Thomas
Christian König Nov. 9, 2023, 4:03 p.m. UTC | #17
Am 09.11.23 um 16:50 schrieb Thomas Hellström:
> [SNIP]
>>>
> Did we get any resolution on this?
>
> FWIW, my take on this is that it would be possible to get GPUVM to 
> work both with and without internal refcounting; If with, the driver 
> needs a vm close to resolve cyclic references, if without that's not 
> necessary. If GPUVM is allowed to refcount in mappings and vm_bos, 
> that comes with a slight performance drop but as Danilo pointed out, 
> the VM lifetime problem iterating over a vm_bo's mapping becomes much 
> easier and the code thus becomes easier to maintain moving forward. 
> That convinced me it's a good thing.

I strongly believe you guys stumbled over one of the core problems with 
the VM here and I think that reference counting is the right answer to 
solving this.

The big question is what is reference counted and in which 
direction the dependency points, e.g. we have here VM, BO, BO_VM 
and Mapping objects.

Those patches here suggest a counted Mapping -> VM reference and I'm 
pretty sure that this isn't a good idea. What we should rather really 
have is a BO -> VM or BO_VM -> VM reference. In other words, each BO 
which is part of the VM keeps a reference to the VM.

BTW: At least in amdgpu we can have BOs which (temporarily) don't have 
any mappings, but are still considered part of the VM.

>
> Another issue Christian brought up is that something intended to be 
> embeddable (a base class) shouldn't really have its own refcount. I 
> think that's a valid point. If you at some point need to derive from 
> multiple such structs each having its own refcount, things will start 
> to get weird. One way to resolve that would be to have the driver's 
> subclass provide get() and put() ops, and export a destructor for the 
> base-class, rather than to have the base-class provide the refcount 
> and a destructor  ops.

Well, I have never seen stuff like that in the kernel. Might be that 
this works, but I would rather not try if avoidable.

>
> That would also make it possible for the driver to decide the context 
> for the put() call: If the driver needs to be able to call put() from 
> irq / atomic context but the base-class'es destructor doesn't allow 
> atomic context, the driver can push freeing out to a work item if needed.
>
> Finally, the refcount overflow Christian pointed out. Limiting the 
> number of mapping sounds like a reasonable remedy to me.

Well that depends, I would rather avoid having a dependency for mappings.

Taking the CPU VM handling as an example: as far as I know, vm_area_structs 
don't grab a reference to their mm_struct either. Instead they get 
automatically destroyed when the mm_struct is destroyed.

Which makes sense in that case because when the mm_struct is gone the 
vm_area_struct doesn't make sense any more either.

What we clearly need is a reference to prevent the VM or at least the 
shared resv from going away too early.

Regards,
Christian.

>
> But I think all of this is fixable as follow-ups if needed, unless I'm 
> missing something crucial.
>
> Just my 2 cents.
>
> /Thomas
>
>
Danilo Krummrich Nov. 9, 2023, 6:34 p.m. UTC | #18
On 11/9/23 17:03, Christian König wrote:
> Am 09.11.23 um 16:50 schrieb Thomas Hellström:
>> [SNIP]
>>>>
>> Did we get any resolution on this?
>>
>> FWIW, my take on this is that it would be possible to get GPUVM to work both with and without internal refcounting; If with, the driver needs a vm close to resolve cyclic references, if without that's not necessary. If GPUVM is allowed to refcount in mappings and vm_bos, that comes with a slight performance drop but as Danilo pointed out, the VM lifetime problem iterating over a vm_bo's mapping becomes much easier and the code thus becomes easier to maintain moving forward. That convinced me it's a good thing.
> 
> I strongly believe you guys stumbled over one of the core problems with the VM here and I think that reference counting is the right answer to solving this.
> 
> The big question is that what is reference counted and in which direction does the dependency points, e.g. we have here VM, BO, BO_VM and Mapping objects.
> 
> Those patches here suggest a counted Mapping -> VM reference and I'm pretty sure that this isn't a good idea. What we should rather really have is a BO -> VM or BO_VM ->VM reference. In other words that each BO which is part of the VM keeps a reference to the VM.

We have both. Please see the subsequent patch introducing VM_BO structures for that.

As I explained, mappings (struct drm_gpuva) keep a pointer to the VM they're mapped
in, and besides that it doesn't make sense to free a VM that still contains mappings;
the reference count ensures that. This simply ensures memory safety.
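
To make that concrete, a simplified illustration of how this patch takes that
reference per mapping (the actual hunks differ in detail; the subsequent VM_BO
patch takes an analogous reference per VM_BO):

int drm_gpuva_insert(struct drm_gpuvm *gpuvm, struct drm_gpuva *va)
{
	int ret;

	ret = __drm_gpuva_insert(gpuvm, va);
	if (!ret)
		drm_gpuvm_get(gpuvm);	/* mapping -> VM reference */

	return ret;
}

void drm_gpuva_remove(struct drm_gpuva *va)
{
	struct drm_gpuvm *gpuvm = va->vm;

	__drm_gpuva_remove(va);
	drm_gpuvm_put(gpuvm);	/* dropped when the mapping is removed */
}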

> 
> BTW: At least in amdgpu we can have BOs which (temporary) doesn't have any mappings, but are still considered part of the VM.

That should be possible.

> 
>>
>> Another issue Christian brought up is that something intended to be embeddable (a base class) shouldn't really have its own refcount. I think that's a valid point. If you at some point need to derive from multiple such structs each having its own refcount, things will start to get weird. One way to resolve that would be to have the driver's subclass provide get() and put() ops, and export a destructor for the base-class, rather than to have the base-class provide the refcount and a destructor  ops.

GPUVM simply follows the same pattern we have with drm_gem_objects. And I think it makes
sense. Why would we want to embed two struct drm_gpuvm in a single driver structure?

> 
> Well, I have never seen stuff like that in the kernel. Might be that this works, but I would rather not try if avoidable.
> 
>>
>> That would also make it possible for the driver to decide the context for the put() call: If the driver needs to be able to call put() from irq / atomic context but the base-class'es destructor doesn't allow atomic context, the driver can push freeing out to a work item if needed.
>>
>> Finally, the refcount overflow Christian pointed out. Limiting the number of mapping sounds like a reasonable remedy to me.
> 
> Well that depends, I would rather avoid having a dependency for mappings.
> 
> Taking the CPU VM handling as example as far as I know vm_area_structs doesn't grab a reference to their mm_struct either. Instead they get automatically destroyed when the mm_struct is destroyed.

Certainly, that would be possible. However, thinking about it, this might call for
huge trouble.

First of all, we'd still need to reference count a GPUVM and take a reference for each
VM_BO, as we do already. Now instead of simply increasing the reference count for each
mapping as well, we'd need a *mandatory* driver callback that is called when the GPUVM
reference count drops to zero. Maybe something like vm_destroy().

The reason is that GPUVM can't just remove all mappings from the tree nor can it free them
by itself, since drivers might use them for tracking their allocated page tables and/or
other stuff.

Now, let's think about the context this callback might be called from. When a VM_BO is destroyed
the driver might hold a couple of locks (for Xe it would be the VM's shared dma-resv lock and
potentially the corresponding object's dma-resv lock if they're not the same already). If
destroying this VM_BO leads to the VM being destroyed, the driver's vm_destroy() callback would
be called with those locks being held as well.

I feel like doing this finally opens the doors of the locking hell entirely. I think we should
really avoid that.

> 
> Which makes sense in that case because when the mm_struct is gone the vm_area_struct doesn't make sense any more either.
> 
> What we clearly need is a reference to prevent the VM or at least the shared resv to go away to early.

Yeah, that was a good hint and we've covered that.

> 
> Regards,
> Christian.
> 
>>
>> But I think all of this is fixable as follow-ups if needed, unless I'm missing something crucial.

Fully agree, I think at this point we should go ahead and land this series.

>>
>> Just my 2 cents.
>>
>> /Thomas
>>
>>
>
Christian König Nov. 10, 2023, 8:50 a.m. UTC | #19
Am 09.11.23 um 19:34 schrieb Danilo Krummrich:
> On 11/9/23 17:03, Christian König wrote:
>> Am 09.11.23 um 16:50 schrieb Thomas Hellström:
>>> [SNIP]
>>>>>
>>> Did we get any resolution on this?
>>>
>>> FWIW, my take on this is that it would be possible to get GPUVM to 
>>> work both with and without internal refcounting; If with, the driver 
>>> needs a vm close to resolve cyclic references, if without that's not 
>>> necessary. If GPUVM is allowed to refcount in mappings and vm_bos, 
>>> that comes with a slight performance drop but as Danilo pointed out, 
>>> the VM lifetime problem iterating over a vm_bo's mapping becomes 
>>> much easier and the code thus becomes easier to maintain moving 
>>> forward. That convinced me it's a good thing.
>>
>> I strongly believe you guys stumbled over one of the core problems 
>> with the VM here and I think that reference counting is the right 
>> answer to solving this.
>>
>> The big question is that what is reference counted and in which 
>> direction does the dependency points, e.g. we have here VM, BO, BO_VM 
>> and Mapping objects.
>>
>> Those patches here suggest a counted Mapping -> VM reference and I'm 
>> pretty sure that this isn't a good idea. What we should rather really 
>> have is a BO -> VM or BO_VM ->VM reference. In other words that each 
>> BO which is part of the VM keeps a reference to the VM.
>
> We have both. Please see the subsequent patch introducing VM_BO 
> structures for that.
>
> As I explained, mappings (struct drm_gpuva) keep a pointer to their VM 
> they're mapped
> in and besides that it doesn't make sense to free a VM that still 
> contains mappings,
> the reference count ensures that. This simply ensures memory safety.
>
>>
>> BTW: At least in amdgpu we can have BOs which (temporary) doesn't 
>> have any mappings, but are still considered part of the VM.
>
> That should be possible.
>
>>
>>>
>>> Another issue Christian brought up is that something intended to be 
>>> embeddable (a base class) shouldn't really have its own refcount. I 
>>> think that's a valid point. If you at some point need to derive from 
>>> multiple such structs each having its own refcount, things will 
>>> start to get weird. One way to resolve that would be to have the 
>>> driver's subclass provide get() and put() ops, and export a 
>>> destructor for the base-class, rather than to have the base-class 
>>> provide the refcount and a destructor  ops.
>
> GPUVM simply follows the same pattern we have with drm_gem_objects. 
> And I think it makes
> sense. Why would we want to embed two struct drm_gpuvm in a single 
> driver structure?

Because you need one drm_gpuvm structure for each application using the 
driver? Or am I missing something?

As far as I can see a driver would want to embed that into its fpriv 
structure, which is allocated during the drm_driver.open callback.
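
As an illustration of that (hypothetical driver names; only struct drm_gpuvm
and the vm_free op are from this series):

struct foo_fpriv {
	struct drm_gpuvm base;	/* one GPUVM per open file */
	/* other per-client driver state */
};

static void foo_vm_free(struct drm_gpuvm *gpuvm)
{
	struct foo_fpriv *fpriv = container_of(gpuvm, struct foo_fpriv, base);

	kfree(fpriv);	/* the embedding object goes away with the last ref */
}

The fpriv would be allocated in drm_driver.open, drm_gpuvm_init() would be
called on &fpriv->base, and the postclose callback would do a final
drm_gpuvm_put() instead of freeing the fpriv directly.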

>
>>
>> Well, I have never seen stuff like that in the kernel. Might be that 
>> this works, but I would rather not try if avoidable.
>>
>>>
>>> That would also make it possible for the driver to decide the 
>>> context for the put() call: If the driver needs to be able to call 
>>> put() from irq / atomic context but the base-class'es destructor 
>>> doesn't allow atomic context, the driver can push freeing out to a 
>>> work item if needed.
>>>
>>> Finally, the refcount overflow Christian pointed out. Limiting the 
>>> number of mapping sounds like a reasonable remedy to me.
>>
>> Well that depends, I would rather avoid having a dependency for 
>> mappings.
>>
>> Taking the CPU VM handling as example as far as I know 
>> vm_area_structs doesn't grab a reference to their mm_struct either. 
>> Instead they get automatically destroyed when the mm_struct is 
>> destroyed.
>
> Certainly, that would be possible. However, thinking about it, this 
> might call for
> huge trouble.
>
> First of all, we'd still need to reference count a GPUVM and take a 
> reference for each
> VM_BO, as we do already. Now instead of simply increasing the 
> reference count for each
> mapping as well, we'd need a *mandatory* driver callback that is 
> called when the GPUVM
> reference count drops to zero. Maybe something like vm_destroy().
>
> The reason is that GPUVM can't just remove all mappings from the tree 
> nor can it free them
> by itself, since drivers might use them for tracking their allocated 
> page tables and/or
> other stuff.
>
> Now, let's think about the scope this callback might be called from. 
> When a VM_BO is destroyed
> the driver might hold a couple of locks (for Xe it would be the VM's 
> shared dma-resv lock and
> potentially the corresponding object's dma-resv lock if they're not 
> the same already). If
> destroying this VM_BO leads to the VM being destroyed, the drivers 
> vm_destroy() callback would
> be called with those locks being held as well.
>
> I feel like doing this finally opens the doors of the locking hell 
> entirely. I think we should
> really avoid that.

That's a really good point, but I fear that's exactly the use case.

I would expect that VM_BO structures are added in the 
drm_gem_object_funcs.open callback and freed in drm_gem_object_funcs.close.

Since it is perfectly legal for userspace to close a BO while there are 
still mappings (it can trivially be that the app is killed), I would expect 
that the drm_gem_object_funcs.close handling is something like asking 
drm_gpuvm to destroy the VM_BO and getting the mappings which should be 
cleared in the page table in return.

In amdgpu we even go a step further and the VM structure keeps track of 
all the mappings of deleted VM_BOs so that higher level can query those 
and clear them later on.

Background is that drm_gem_object_funcs.close can't fail, but it can 
perfectly well be that the app is killed because of an OOM situation and we 
can't do page table updates at that moment because of this.
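
A purely hypothetical driver-side sketch of that scheme (none of these
names exist in the series): gem close only records which VA ranges still
need their page table entries cleared, and the actual update is done
later, e.g. from the next job submission or a worker.

struct foo_zap_range {
	struct list_head entry;
	u64 addr, range;
};

struct foo_zap_list {
	spinlock_t lock;
	struct list_head list;
};

static void foo_defer_zap(struct foo_zap_list *zl, u64 addr, u64 range)
{
	struct foo_zap_range *zap = kzalloc(sizeof(*zap), GFP_KERNEL);

	if (!zap)
		return;	/* sketch only, a real driver needs a fallback here */

	zap->addr = addr;
	zap->range = range;

	spin_lock(&zl->lock);
	list_add_tail(&zap->entry, &zl->list);
	spin_unlock(&zl->lock);
}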

>
>>
>> Which makes sense in that case because when the mm_struct is gone the 
>> vm_area_struct doesn't make sense any more either.
>>
>> What we clearly need is a reference to prevent the VM or at least the 
>> shared resv to go away to early.
>
> Yeah, that was a good hint and we've covered that.
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> But I think all of this is fixable as follow-ups if needed, unless 
>>> I'm missing something crucial.
>
> Fully agree, I think at this point we should go ahead and land this 
> series.

Yeah, agree this is not UAPI so not nailed in stone. Feel free to add my 
acked-by as well if you want.

Only keep in mind that when you give drivers some functionality in a 
common component they usually expect to keep that functionality.

For example changing the dma_resv object to make sure that drivers can't 
cause use-after-free errors any more was an extremely annoying 
experience, since every user of those interfaces had to change at once.

Regards,
Christian.

>
>>>
>>> Just my 2 cents.
>>>
>>> /Thomas
>>>
>>>
>>
>
Thomas Hellstrom Nov. 10, 2023, 9:39 a.m. UTC | #20
On 11/10/23 09:50, Christian König wrote:
> Am 09.11.23 um 19:34 schrieb Danilo Krummrich:
>> On 11/9/23 17:03, Christian König wrote:
>>> Am 09.11.23 um 16:50 schrieb Thomas Hellström:
>>>> [SNIP]
>>>>>>
>>>> Did we get any resolution on this?
>>>>
>>>> FWIW, my take on this is that it would be possible to get GPUVM to 
>>>> work both with and without internal refcounting; If with, the 
>>>> driver needs a vm close to resolve cyclic references, if without 
>>>> that's not necessary. If GPUVM is allowed to refcount in mappings 
>>>> and vm_bos, that comes with a slight performance drop but as Danilo 
>>>> pointed out, the VM lifetime problem iterating over a vm_bo's 
>>>> mapping becomes much easier and the code thus becomes easier to 
>>>> maintain moving forward. That convinced me it's a good thing.
>>>
>>> I strongly believe you guys stumbled over one of the core problems 
>>> with the VM here and I think that reference counting is the right 
>>> answer to solving this.
>>>
>>> The big question is that what is reference counted and in which 
>>> direction does the dependency points, e.g. we have here VM, BO, 
>>> BO_VM and Mapping objects.
>>>
>>> Those patches here suggest a counted Mapping -> VM reference and I'm 
>>> pretty sure that this isn't a good idea. What we should rather 
>>> really have is a BO -> VM or BO_VM ->VM reference. In other words 
>>> that each BO which is part of the VM keeps a reference to the VM.
>>
>> We have both. Please see the subsequent patch introducing VM_BO 
>> structures for that.
>>
>> As I explained, mappings (struct drm_gpuva) keep a pointer to their 
>> VM they're mapped
>> in and besides that it doesn't make sense to free a VM that still 
>> contains mappings,
>> the reference count ensures that. This simply ensures memory safety.
>>
>>>
>>> BTW: At least in amdgpu we can have BOs which (temporary) doesn't 
>>> have any mappings, but are still considered part of the VM.
>>
>> That should be possible.
>>
>>>
>>>>
>>>> Another issue Christian brought up is that something intended to be 
>>>> embeddable (a base class) shouldn't really have its own refcount. I 
>>>> think that's a valid point. If you at some point need to derive 
>>>> from multiple such structs each having its own refcount, things 
>>>> will start to get weird. One way to resolve that would be to have 
>>>> the driver's subclass provide get() and put() ops, and export a 
>>>> destructor for the base-class, rather than to have the base-class 
>>>> provide the refcount and a destructor  ops.
>>
>> GPUVM simply follows the same pattern we have with drm_gem_objects. 
>> And I think it makes
>> sense. Why would we want to embed two struct drm_gpuvm in a single 
>> driver structure?
>
> Because you need one drm_gpuvm structure for each application using 
> the driver? Or am I missing something?
>
> As far as I can see a driver would want to embed that into your fpriv 
> structure which is allocated during drm_driver.open callback.

I was thinking more of the general design of a base-class that needs to 
be refcounted. Say a driver vm that inherits from gpu-vm, gem_object and 
yet another base-class that supplies its own refcount. What's the 
best-practice way to do refcounting? All base-classes supplying a 
refcount of their own, or the subclass supplying a refcount and the 
base-classes supplying destroy helpers?

But to be clear this is nothing I see needing urgent attention.

>
>>
>>>
>>> Well, I have never seen stuff like that in the kernel. Might be that 
>>> this works, but I would rather not try if avoidable.
>>>
>>>>
>>>> That would also make it possible for the driver to decide the 
>>>> context for the put() call: If the driver needs to be able to call 
>>>> put() from irq / atomic context but the base-class'es destructor 
>>>> doesn't allow atomic context, the driver can push freeing out to a 
>>>> work item if needed.
>>>>
>>>> Finally, the refcount overflow Christian pointed out. Limiting the 
>>>> number of mapping sounds like a reasonable remedy to me.
>>>
>>> Well that depends, I would rather avoid having a dependency for 
>>> mappings.
>>>
>>> Taking the CPU VM handling as example as far as I know 
>>> vm_area_structs doesn't grab a reference to their mm_struct either. 
>>> Instead they get automatically destroyed when the mm_struct is 
>>> destroyed.
>>
>> Certainly, that would be possible. However, thinking about it, this 
>> might call for
>> huge trouble.
>>
>> First of all, we'd still need to reference count a GPUVM and take a 
>> reference for each
>> VM_BO, as we do already. Now instead of simply increasing the 
>> reference count for each
>> mapping as well, we'd need a *mandatory* driver callback that is 
>> called when the GPUVM
>> reference count drops to zero. Maybe something like vm_destroy().
>>
>> The reason is that GPUVM can't just remove all mappings from the tree 
>> nor can it free them
>> by itself, since drivers might use them for tracking their allocated 
>> page tables and/or
>> other stuff.
>>
>> Now, let's think about the scope this callback might be called from. 
>> When a VM_BO is destroyed
>> the driver might hold a couple of locks (for Xe it would be the VM's 
>> shared dma-resv lock and
>> potentially the corresponding object's dma-resv lock if they're not 
>> the same already). If
>> destroying this VM_BO leads to the VM being destroyed, the drivers 
>> vm_destroy() callback would
>> be called with those locks being held as well.
>>
>> I feel like doing this finally opens the doors of the locking hell 
>> entirely. I think we should
>> really avoid that.

I don't think we need to worry much about this particular locking hell 
because if we hold, for example a vm and bo resv when putting the vm_bo, 
we need to keep additional strong references for the bo / vm pointer we 
use for unlocking. Hence putting the vm_bo under those locks can never 
lead to the vm getting destroyed.
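
A sketch of that pattern (assuming the VM_BO abstraction from the
subsequent patch in this series; the surrounding lock acquisition is
omitted):

static void foo_vm_bo_drop(struct drm_gpuvm_bo *vm_bo)
{
	struct drm_gpuvm *gpuvm = vm_bo->vm;
	struct drm_gem_object *obj = vm_bo->obj;

	/* strong references for the pointers we still use after the put */
	drm_gpuvm_get(gpuvm);
	drm_gem_object_get(obj);

	/* obj->resv (and possibly the VM resv) is held at this point */
	drm_gpuvm_bo_put(vm_bo);	/* may be the last vm_bo reference */

	dma_resv_unlock(obj->resv);

	drm_gem_object_put(obj);
	drm_gpuvm_put(gpuvm);
}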

Also, don't we already sort of have a mandatory vm_destroy callback?

+	if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
+		return;
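
For reference, roughly how that check sits in the release path of this
series (paraphrased, not the literal patch):

static void drm_gpuvm_free(struct kref *kref)
{
	struct drm_gpuvm *gpuvm = container_of(kref, struct drm_gpuvm, kref);

	/* internal teardown of the VA tree etc. happens here (omitted) */

	if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
		return;

	gpuvm->ops->vm_free(gpuvm);	/* the mandatory driver callback */
}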



>
> That's a really good point, but I fear exactly that's the use case.
>
> I would expect that VM_BO structures are added in the 
> drm_gem_object_funcs.open callback and freed in 
> drm_gem_object_funcs.close.
>
> Since it is perfectly legal for userspace to close a BO while there 
> are still mappings (can trivial be that the app is killed) I would 
> expect that the drm_gem_object_funcs.close handling is something like 
> asking drm_gpuvm destroying the VM_BO and getting the mappings which 
> should be cleared in the page table in return.
>
> In amdgpu we even go a step further and the VM structure keeps track 
> of all the mappings of deleted VM_BOs so that higher level can query 
> those and clear them later on.
>
> Background is that the drm_gem_object_funcs.close can't fail, but it 
> can perfectly be that the app is killed because of an OOM situation 
> and we can't do page tables updates in that moment because of this.
>
>>
>>>
>>> Which makes sense in that case because when the mm_struct is gone 
>>> the vm_area_struct doesn't make sense any more either.
>>>
>>> What we clearly need is a reference to prevent the VM or at least 
>>> the shared resv to go away to early.
>>
>> Yeah, that was a good hint and we've covered that.
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> But I think all of this is fixable as follow-ups if needed, unless 
>>>> I'm missing something crucial.
>>
>> Fully agree, I think at this point we should go ahead and land this 
>> series.

+1.

/Thomas


>>
>
> Yeah, agree this is not UAPI so not nailed in stone. Feel free to add 
> my acked-by as well if you want.
>
> Only keep in mind that when you give drivers some functionality in a 
> common component they usually expect to keep that functionality.
>
> For example changing the dma_resv object to make sure that drivers 
> can't cause use after free errors any more was an extremely annoying 
> experience since every user of those interface had to change at once.
>
> Regards,
> Christian.
>
>>
>>>>
>>>> Just my 2 cents.
>>>>
>>>> /Thomas
>>>>
>>>>
>>>
>>
>
Christian König Nov. 10, 2023, 10:42 a.m. UTC | #21
Am 10.11.23 um 10:39 schrieb Thomas Hellström:
>
> [SNIP]

> I was thinking more of the general design of a base-class that needs 
> to be refcounted. Say a driver vm that inherits from gpu-vm, 
> gem_object and yet another base-class that supplies its own refcount. 
> What's the best-practice way to do refcounting? All base-classes 
> supplying a refcount of its own, or the subclass supplying a refcount 
> and the base-classes supply destroy helpers.

From my experience the most common design pattern in the Linux kernel 
is that you either have reference counted objects which contain a 
private pointer (like struct file, struct inode, etc.), or the lifetime 
is defined by the user of the object instead of by reference counting, 
in which case you can embed it into your own object.

>
> But to be clear this is nothing I see needing urgent attention.
>
>>
>>>
>>>>
>>>> Well, I have never seen stuff like that in the kernel. Might be 
>>>> that this works, but I would rather not try if avoidable.
>>>>
>>>>>
>>>>> That would also make it possible for the driver to decide the 
>>>>> context for the put() call: If the driver needs to be able to call 
>>>>> put() from irq / atomic context but the base-class'es destructor 
>>>>> doesn't allow atomic context, the driver can push freeing out to a 
>>>>> work item if needed.
>>>>>
>>>>> Finally, the refcount overflow Christian pointed out. Limiting the 
>>>>> number of mapping sounds like a reasonable remedy to me.
>>>>
>>>> Well that depends, I would rather avoid having a dependency for 
>>>> mappings.
>>>>
>>>> Taking the CPU VM handling as example as far as I know 
>>>> vm_area_structs doesn't grab a reference to their mm_struct either. 
>>>> Instead they get automatically destroyed when the mm_struct is 
>>>> destroyed.
>>>
>>> Certainly, that would be possible. However, thinking about it, this 
>>> might call for
>>> huge trouble.
>>>
>>> First of all, we'd still need to reference count a GPUVM and take a 
>>> reference for each
>>> VM_BO, as we do already. Now instead of simply increasing the 
>>> reference count for each
>>> mapping as well, we'd need a *mandatory* driver callback that is 
>>> called when the GPUVM
>>> reference count drops to zero. Maybe something like vm_destroy().
>>>
>>> The reason is that GPUVM can't just remove all mappings from the 
>>> tree nor can it free them
>>> by itself, since drivers might use them for tracking their allocated 
>>> page tables and/or
>>> other stuff.
>>>
>>> Now, let's think about the scope this callback might be called from. 
>>> When a VM_BO is destroyed
>>> the driver might hold a couple of locks (for Xe it would be the VM's 
>>> shared dma-resv lock and
>>> potentially the corresponding object's dma-resv lock if they're not 
>>> the same already). If
>>> destroying this VM_BO leads to the VM being destroyed, the drivers 
>>> vm_destroy() callback would
>>> be called with those locks being held as well.
>>>
>>> I feel like doing this finally opens the doors of the locking hell 
>>> entirely. I think we should
>>> really avoid that.
>
> I don't think we need to worry much about this particular locking hell 
> because if we hold

I have to agree with Danilo here. Especially since you have cases where you 
usually lock BO->VM (for example eviction) as well as cases where you 
need to lock VM->BO (command submission).

Because of this in amdgpu we used (or abused?) the dma_resv of the root 
BO as the lock for the VM. Since this is a ww_mutex, locking it in both 
VM, BO as well as BO, VM order works.
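
For illustration, locking both with drm_exec, which does the ww_mutex
back-off for us so either acquisition order is fine (vm_resv_obj stands
for the GEM object backing the VM's shared resv, as introduced earlier
in this series; the helper itself is made up):

static int foo_lock_vm_and_bo(struct drm_exec *exec,
			      struct drm_gem_object *vm_resv_obj,
			      struct drm_gem_object *obj)
{
	int ret = 0;

	drm_exec_init(exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
	drm_exec_until_all_locked(exec) {
		ret = drm_exec_lock_obj(exec, vm_resv_obj);
		drm_exec_retry_on_contention(exec);
		if (ret)
			break;

		ret = drm_exec_lock_obj(exec, obj);
		drm_exec_retry_on_contention(exec);
		if (ret)
			break;
	}
	if (ret)
		drm_exec_fini(exec);

	/* on success both resvs stay held until the caller calls
	 * drm_exec_fini() */
	return ret;
}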

Regards,
Christian.

> , for example a vm and bo resv when putting the vm_bo, we need to keep 
> additional strong references for the bo / vm pointer we use for 
> unlocking. Hence putting the vm_bo under those locks can never lead to 
> the vm getting destroyed.
>
> Also, don't we already sort of have a mandatory vm_destroy callback?
>
> +    if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
> +        return;
>
>
>
>>
>> That's a really good point, but I fear exactly that's the use case.
>>
>> I would expect that VM_BO structures are added in the 
>> drm_gem_object_funcs.open callback and freed in 
>> drm_gem_object_funcs.close.
>>
>> Since it is perfectly legal for userspace to close a BO while there 
>> are still mappings (can trivial be that the app is killed) I would 
>> expect that the drm_gem_object_funcs.close handling is something like 
>> asking drm_gpuvm destroying the VM_BO and getting the mappings which 
>> should be cleared in the page table in return.
>>
>> In amdgpu we even go a step further and the VM structure keeps track 
>> of all the mappings of deleted VM_BOs so that higher level can query 
>> those and clear them later on.
>>
>> Background is that the drm_gem_object_funcs.close can't fail, but it 
>> can perfectly be that the app is killed because of an OOM situation 
>> and we can't do page tables updates in that moment because of this.
>>
>>>
>>>>
>>>> Which makes sense in that case because when the mm_struct is gone 
>>>> the vm_area_struct doesn't make sense any more either.
>>>>
>>>> What we clearly need is a reference to prevent the VM or at least 
>>>> the shared resv to go away to early.
>>>
>>> Yeah, that was a good hint and we've covered that.
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> But I think all of this is fixable as follow-ups if needed, unless 
>>>>> I'm missing something crucial.
>>>
>>> Fully agree, I think at this point we should go ahead and land this 
>>> series.
>
> +1.
>
> /Thomas
>
>
>>>
>>
>> Yeah, agree this is not UAPI so not nailed in stone. Feel free to add 
>> my acked-by as well if you want.
>>
>> Only keep in mind that when you give drivers some functionality in a 
>> common component they usually expect to keep that functionality.
>>
>> For example changing the dma_resv object to make sure that drivers 
>> can't cause use after free errors any more was an extremely annoying 
>> experience since every user of those interface had to change at once.
>>
>> Regards,
>> Christian.
>>
>>>
>>>>>
>>>>> Just my 2 cents.
>>>>>
>>>>> /Thomas
>>>>>
>>>>>
>>>>
>>>
>>
Thomas Hellstrom Nov. 10, 2023, 10:52 a.m. UTC | #22
On 11/10/23 11:42, Christian König wrote:
> Am 10.11.23 um 10:39 schrieb Thomas Hellström:
>>
>> [SNIP]
>
>> I was thinking more of the general design of a base-class that needs 
>> to be refcounted. Say a driver vm that inherits from gpu-vm, 
>> gem_object and yet another base-class that supplies its own refcount. 
>> What's the best-practice way to do refcounting? All base-classes 
>> supplying a refcount of its own, or the subclass supplying a refcount 
>> and the base-classes supply destroy helpers.
>
> From my experience the most common design pattern in the Linux kernel 
> is that you either have reference counted objects which contain a 
> private pointer (like struct file, struct inode etc..) or the lifetime 
> is defined by the user of the object instead of reference counting and 
> in this case you can embed it into your own object.
>
>>
>> But to be clear this is nothing I see needing urgent attention.
>>
>>>
>>>>
>>>>>
>>>>> Well, I have never seen stuff like that in the kernel. Might be 
>>>>> that this works, but I would rather not try if avoidable.
>>>>>
>>>>>>
>>>>>> That would also make it possible for the driver to decide the 
>>>>>> context for the put() call: If the driver needs to be able to 
>>>>>> call put() from irq / atomic context but the base-class'es 
>>>>>> destructor doesn't allow atomic context, the driver can push 
>>>>>> freeing out to a work item if needed.
>>>>>>
>>>>>> Finally, the refcount overflow Christian pointed out. Limiting 
>>>>>> the number of mapping sounds like a reasonable remedy to me.
>>>>>
>>>>> Well that depends, I would rather avoid having a dependency for 
>>>>> mappings.
>>>>>
>>>>> Taking the CPU VM handling as example as far as I know 
>>>>> vm_area_structs doesn't grab a reference to their mm_struct 
>>>>> either. Instead they get automatically destroyed when the 
>>>>> mm_struct is destroyed.
>>>>
>>>> Certainly, that would be possible. However, thinking about it, this 
>>>> might call for
>>>> huge trouble.
>>>>
>>>> First of all, we'd still need to reference count a GPUVM and take a 
>>>> reference for each
>>>> VM_BO, as we do already. Now instead of simply increasing the 
>>>> reference count for each
>>>> mapping as well, we'd need a *mandatory* driver callback that is 
>>>> called when the GPUVM
>>>> reference count drops to zero. Maybe something like vm_destroy().
>>>>
>>>> The reason is that GPUVM can't just remove all mappings from the 
>>>> tree nor can it free them
>>>> by itself, since drivers might use them for tracking their 
>>>> allocated page tables and/or
>>>> other stuff.
>>>>
>>>> Now, let's think about the scope this callback might be called 
>>>> from. When a VM_BO is destroyed
>>>> the driver might hold a couple of locks (for Xe it would be the 
>>>> VM's shared dma-resv lock and
>>>> potentially the corresponding object's dma-resv lock if they're not 
>>>> the same already). If
>>>> destroying this VM_BO leads to the VM being destroyed, the drivers 
>>>> vm_destroy() callback would
>>>> be called with those locks being held as well.
>>>>
>>>> I feel like doing this finally opens the doors of the locking hell 
>>>> entirely. I think we should
>>>> really avoid that.
>>
>> I don't think we need to worry much about this particular locking 
>> hell because if we hold
>
> I have to agree with Danilo here. Especially you have cases where you 
> usually lock BO->VM (for example eviction) as well as cases where you 
> need to lock VM->BO (command submission).
>
> Because of this in amdgpu we used (or abused?) the dma_resv of the 
> root BO as lock for the VM. Since this is a ww_mutex locking it in 
> both VM, BO as well as BO, VM order works.

Yes, gpuvm is doing the same (although not necessarily using the 
page-table root bo, but any bo of the driver's choice). But I read it as 
Danilo feared the case where the VM destructor was called with a VM resv 
(or possibly bo resv) held. I meant the driver can easily ensure that's 
not happening, and in some cases it can't happen.

Thanks,

Thomas



>
> Regards,
> Christian.
>
>> , for example a vm and bo resv when putting the vm_bo, we need to 
>> keep additional strong references for the bo / vm pointer we use for 
>> unlocking. Hence putting the vm_bo under those locks can never lead 
>> to the vm getting destroyed.
>>
>> Also, don't we already sort of have a mandatory vm_destroy callback?
>>
>> +    if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
>> +        return;
>>
>>
>>
>>>
>>> That's a really good point, but I fear exactly that's the use case.
>>>
>>> I would expect that VM_BO structures are added in the 
>>> drm_gem_object_funcs.open callback and freed in 
>>> drm_gem_object_funcs.close.
>>>
>>> Since it is perfectly legal for userspace to close a BO while there 
>>> are still mappings (can trivial be that the app is killed) I would 
>>> expect that the drm_gem_object_funcs.close handling is something 
>>> like asking drm_gpuvm destroying the VM_BO and getting the mappings 
>>> which should be cleared in the page table in return.
>>>
>>> In amdgpu we even go a step further and the VM structure keeps track 
>>> of all the mappings of deleted VM_BOs so that higher level can query 
>>> those and clear them later on.
>>>
>>> Background is that the drm_gem_object_funcs.close can't fail, but it 
>>> can perfectly be that the app is killed because of an OOM situation 
>>> and we can't do page tables updates in that moment because of this.
>>>
>>>>
>>>>>
>>>>> Which makes sense in that case because when the mm_struct is gone 
>>>>> the vm_area_struct doesn't make sense any more either.
>>>>>
>>>>> What we clearly need is a reference to prevent the VM or at least 
>>>>> the shared resv to go away to early.
>>>>
>>>> Yeah, that was a good hint and we've covered that.
>>>>
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>> But I think all of this is fixable as follow-ups if needed, 
>>>>>> unless I'm missing something crucial.
>>>>
>>>> Fully agree, I think at this point we should go ahead and land this 
>>>> series.
>>
>> +1.
>>
>> /Thomas
>>
>>
>>>>
>>>
>>> Yeah, agree this is not UAPI so not nailed in stone. Feel free to 
>>> add my acked-by as well if you want.
>>>
>>> Only keep in mind that when you give drivers some functionality in a 
>>> common component they usually expect to keep that functionality.
>>>
>>> For example changing the dma_resv object to make sure that drivers 
>>> can't cause use after free errors any more was an extremely annoying 
>>> experience since every user of those interface had to change at once.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>>>>
>>>>>> Just my 2 cents.
>>>>>>
>>>>>> /Thomas
>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>
Danilo Krummrich Nov. 10, 2023, 4:43 p.m. UTC | #23
On 11/10/23 10:39, Thomas Hellström wrote:
> 
> On 11/10/23 09:50, Christian König wrote:
>> Am 09.11.23 um 19:34 schrieb Danilo Krummrich:
>>> On 11/9/23 17:03, Christian König wrote:
>>>> Am 09.11.23 um 16:50 schrieb Thomas Hellström:
>>>>> [SNIP]
>>>>>>>
>>>>> Did we get any resolution on this?
>>>>>
>>>>> FWIW, my take on this is that it would be possible to get GPUVM to work both with and without internal refcounting; If with, the driver needs a vm close to resolve cyclic references, if without that's not necessary. If GPUVM is allowed to refcount in mappings and vm_bos, that comes with a slight performance drop but as Danilo pointed out, the VM lifetime problem iterating over a vm_bo's mapping becomes much easier and the code thus becomes easier to maintain moving forward. That convinced me it's a good thing.
>>>>
>>>> I strongly believe you guys stumbled over one of the core problems with the VM here and I think that reference counting is the right answer to solving this.
>>>>
>>>> The big question is that what is reference counted and in which direction does the dependency points, e.g. we have here VM, BO, BO_VM and Mapping objects.
>>>>
>>>> Those patches here suggest a counted Mapping -> VM reference and I'm pretty sure that this isn't a good idea. What we should rather really have is a BO -> VM or BO_VM ->VM reference. In other words that each BO which is part of the VM keeps a reference to the VM.
>>>
>>> We have both. Please see the subsequent patch introducing VM_BO structures for that.
>>>
>>> As I explained, mappings (struct drm_gpuva) keep a pointer to their VM they're mapped
>>> in and besides that it doesn't make sense to free a VM that still contains mappings,
>>> the reference count ensures that. This simply ensures memory safety.
>>>
>>>>
>>>> BTW: At least in amdgpu we can have BOs which (temporary) doesn't have any mappings, but are still considered part of the VM.
>>>
>>> That should be possible.
>>>
>>>>
>>>>>
>>>>> Another issue Christian brought up is that something intended to be embeddable (a base class) shouldn't really have its own refcount. I think that's a valid point. If you at some point need to derive from multiple such structs each having its own refcount, things will start to get weird. One way to resolve that would be to have the driver's subclass provide get() and put() ops, and export a destructor for the base-class, rather than to have the base-class provide the refcount and a destructor  ops.
>>>
>>> GPUVM simply follows the same pattern we have with drm_gem_objects. And I think it makes
>>> sense. Why would we want to embed two struct drm_gpuvm in a single driver structure?
>>
>> Because you need one drm_gpuvm structure for each application using the driver? Or am I missing something?
>>
>> As far as I can see a driver would want to embed that into your fpriv structure which is allocated during drm_driver.open callback.
> 
> I was thinking more of the general design of a base-class that needs to be refcounted. Say a driver vm that inherits from gpu-vm, gem_object and yet another base-class that supplies its own refcount. What's the best-practice way to do refcounting? All base-classes supplying a refcount of its own, or the subclass supplying a refcount and the base-classes supply destroy helpers.
> 
> But to be clear this is nothing I see needing urgent attention.
> 
>>
>>>
>>>>
>>>> Well, I have never seen stuff like that in the kernel. Might be that this works, but I would rather not try if avoidable.
>>>>
>>>>>
>>>>> That would also make it possible for the driver to decide the context for the put() call: If the driver needs to be able to call put() from irq / atomic context but the base-class'es destructor doesn't allow atomic context, the driver can push freeing out to a work item if needed.
>>>>>
>>>>> Finally, the refcount overflow Christian pointed out. Limiting the number of mapping sounds like a reasonable remedy to me.
>>>>
>>>> Well that depends, I would rather avoid having a dependency for mappings.
>>>>
>>>> Taking the CPU VM handling as example as far as I know vm_area_structs doesn't grab a reference to their mm_struct either. Instead they get automatically destroyed when the mm_struct is destroyed.
>>>
>>> Certainly, that would be possible. However, thinking about it, this might call for
>>> huge trouble.
>>>
>>> First of all, we'd still need to reference count a GPUVM and take a reference for each
>>> VM_BO, as we do already. Now instead of simply increasing the reference count for each
>>> mapping as well, we'd need a *mandatory* driver callback that is called when the GPUVM
>>> reference count drops to zero. Maybe something like vm_destroy().
>>>
>>> The reason is that GPUVM can't just remove all mappings from the tree nor can it free them
>>> by itself, since drivers might use them for tracking their allocated page tables and/or
>>> other stuff.
>>>
>>> Now, let's think about the scope this callback might be called from. When a VM_BO is destroyed
>>> the driver might hold a couple of locks (for Xe it would be the VM's shared dma-resv lock and
>>> potentially the corresponding object's dma-resv lock if they're not the same already). If
>>> destroying this VM_BO leads to the VM being destroyed, the drivers vm_destroy() callback would
>>> be called with those locks being held as well.
>>>
>>> I feel like doing this finally opens the doors of the locking hell entirely. I think we should
>>> really avoid that.
> 
> I don't think we need to worry much about this particular locking hell because if we hold, for example a vm and bo resv when putting the vm_bo, we need to keep additional strong references for the bo / vm pointer we use for unlocking. Hence putting the vm_bo under those locks can never lead to the vm getting destroyed.
> 
> Also, don't we already sort of have a mandatory vm_destroy callback?

Sure, I just wanted to say that we'd then have a mandatory callback where drivers *must* ensure
that *all* mappings are removed before returning from this callback. I could imagine that there could
be some pitfalls with that.

So I'm not worried about the callback itself being mandatory, but about enforcing these semantics
on it. Maybe I didn't phrase this very well.

> 
> +    if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
> +        return;
> 
> 
>>
>> That's a really good point, but I fear exactly that's the use case.
>>
>> I would expect that VM_BO structures are added in the drm_gem_object_funcs.open callback and freed in drm_gem_object_funcs.close.
>>
>> Since it is perfectly legal for userspace to close a BO while there are still mappings (can trivial be that the app is killed) I would expect that the drm_gem_object_funcs.close handling is something like asking drm_gpuvm destroying the VM_BO and getting the mappings which should be cleared in the page table in return.
>>
>> In amdgpu we even go a step further and the VM structure keeps track of all the mappings of deleted VM_BOs so that higher level can query those and clear them later on.
>>
>> Background is that the drm_gem_object_funcs.close can't fail, but it can perfectly be that the app is killed because of an OOM situation and we can't do page table updates at that moment because of this.
>>
>>>
>>>>
>>>> Which makes sense in that case because when the mm_struct is gone the vm_area_struct doesn't make sense any more either.
>>>>
>>>> What we clearly need is a reference to prevent the VM, or at least the shared resv, from going away too early.
>>>
>>> Yeah, that was a good hint and we've covered that.
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> But I think all of this is fixable as follow-ups if needed, unless I'm missing something crucial.
>>>
>>> Fully agree, I think at this point we should go ahead and land this series.
> 
> +1.
> 
> /Thomas
> 
> 
>>>
>>
>> Yeah, agree this is not UAPI so not nailed in stone. Feel free to add my acked-by as well if you want.
>>
>> Only keep in mind that when you give drivers some functionality in a common component they usually expect to keep that functionality.
>>
>> For example, changing the dma_resv object to make sure that drivers can't cause use-after-free errors any more was an extremely annoying experience, since every user of that interface had to change at once.
>>
>> Regards,
>> Christian.
>>
>>>
>>>>>
>>>>> Just my 2 cents.
>>>>>
>>>>> /Thomas
>>>>>
>>>>>
>>>>
>>>
>>
>
Danilo Krummrich Nov. 10, 2023, 4:49 p.m. UTC | #24
On 11/10/23 11:52, Thomas Hellström wrote:
> 
> On 11/10/23 11:42, Christian König wrote:
>> Am 10.11.23 um 10:39 schrieb Thomas Hellström:
>>>
>>> [SNIP]
>>
>>> I was thinking more of the general design of a base-class that needs to be refcounted. Say a driver vm that inherits from gpu-vm, gem_object and yet another base-class that supplies its own refcount. What's the best-practice way to do refcounting? All base-classes supplying a refcount of their own, or the subclass supplying a refcount and the base-classes supplying destroy helpers?
>>
>> From my experience the most common design pattern in the Linux kernel is that you either have reference counted objects which contain a private pointer (like struct file, struct inode etc.), or the lifetime is defined by the user of the object instead of reference counting, and in this case you can embed it into your own object.
>>
>>>
>>> But to be clear this is nothing I see needing urgent attention.
>>>
>>>>
>>>>>
>>>>>>
>>>>>> Well, I have never seen stuff like that in the kernel. Might be that this works, but I would rather not try if avoidable.
>>>>>>
>>>>>>>
>>>>>>> That would also make it possible for the driver to decide the context for the put() call: if the driver needs to be able to call put() from irq / atomic context but the base class's destructor doesn't allow atomic context, the driver can push freeing out to a work item if needed.
>>>>>>>
>>>>>>> Finally, there is the refcount overflow Christian pointed out. Limiting the number of mappings sounds like a reasonable remedy to me.
>>>>>>
>>>>>> Well that depends, I would rather avoid having a dependency for mappings.
>>>>>>
>>>>>> Taking the CPU VM handling as an example, as far as I know vm_area_structs don't grab a reference to their mm_struct either. Instead they get automatically destroyed when the mm_struct is destroyed.
>>>>>
>>>>> Certainly, that would be possible. However, thinking about it, this might call for
>>>>> huge trouble.
>>>>>
>>>>> First of all, we'd still need to reference count a GPUVM and take a reference for each
>>>>> VM_BO, as we do already. Now instead of simply increasing the reference count for each
>>>>> mapping as well, we'd need a *mandatory* driver callback that is called when the GPUVM
>>>>> reference count drops to zero. Maybe something like vm_destroy().
>>>>>
>>>>> The reason is that GPUVM can't just remove all mappings from the tree nor can it free them
>>>>> by itself, since drivers might use them for tracking their allocated page tables and/or
>>>>> other stuff.
>>>>>
>>>>> Now, let's think about the scope this callback might be called from. When a VM_BO is destroyed
>>>>> the driver might hold a couple of locks (for Xe it would be the VM's shared dma-resv lock and
>>>>> potentially the corresponding object's dma-resv lock if they're not the same already). If
>>>>> destroying this VM_BO leads to the VM being destroyed, the driver's vm_destroy() callback would
>>>>> be called with those locks being held as well.
>>>>>
>>>>> I feel like doing this finally opens the doors of the locking hell entirely. I think we should
>>>>> really avoid that.
>>>
>>> I don't think we need to worry much about this particular locking hell because if we hold
>>
>> I have to agree with Danilo here. In particular, you have cases where you usually lock BO->VM (for example eviction) as well as cases where you need to lock VM->BO (command submission).
>>
>> Because of this, in amdgpu we used (or abused?) the dma_resv of the root BO as the lock for the VM. Since this is a ww_mutex, locking it in both VM, BO as well as BO, VM order works.
> 
> Yes, gpuvm is doing the same (although not necessarily using the page-table root bo, but any bo of the driver's choice). But I read it as Danilo fearing the case where the VM destructor was called with a VM resv (or possibly bo resv) held. I meant that the driver can easily ensure that's not happening, and in some cases it can't happen.

Right, that's what I meant. However, this also comes down to what Christian means: when the callback
is called with the resv locks held, we'd potentially have a locking inversion between
VM lock -> resv lock and resv lock -> VM lock.
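
For illustration, here is a minimal sketch of the "additional strong reference" approach Thomas describes. driver_vm_bo_put() and struct driver_vm_bo are hypothetical stand-ins for however the driver drops its vm_bo; drm_gpuvm_get()/drm_gpuvm_put() and gpuvm->r_obj (the object carrying the shared resv) are taken from this series. Because the caller holds its own VM reference across the locked section, the put under the lock can never be the final one, so drm_gpuvm_free() and the vm_free callback can never run with the resv held:

	static void driver_put_vm_bo_locked(struct drm_gpuvm *gpuvm,
					    struct driver_vm_bo *vm_bo)
	{
		/* Keep the VM alive for the duration of the locked section. */
		struct drm_gpuvm *vm = drm_gpuvm_get(gpuvm);
		struct dma_resv *resv = vm->r_obj->resv;	/* shared VM resv */

		dma_resv_lock(resv, NULL);
		driver_vm_bo_put(vm_bo);	/* may drop the vm_bo's VM reference */
		dma_resv_unlock(resv);

		/* Potentially the final reference, dropped outside the lock. */
		drm_gpuvm_put(vm);
	}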

> 
> Thanks,
> 
> Thomas
> 
> 
> 
>>
>> Regards,
>> Christian.
>>
>>> , for example a vm and bo resv when putting the vm_bo, we need to keep additional strong references for the bo / vm pointer we use for unlocking. Hence putting the vm_bo under those locks can never lead to the vm getting destroyed.
>>>
>>> Also, don't we already sort of have a mandatory vm_destroy callback?
>>>
>>> +    if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
>>> +        return;
>>>
>>>
>>>
>>>>
>>>> That's a really good point, but I fear exactly that's the use case.
>>>>
>>>> I would expect that VM_BO structures are added in the drm_gem_object_funcs.open callback and freed in drm_gem_object_funcs.close.
>>>>
>>>> Since it is perfectly legal for userspace to close a BO while there are still mappings (it can trivially be that the app is killed), I would expect that the drm_gem_object_funcs.close handling is something like asking drm_gpuvm to destroy the VM_BO and getting the mappings which should be cleared from the page table in return.
>>>>
>>>> In amdgpu we even go a step further and the VM structure keeps track of all the mappings of deleted VM_BOs so that higher level can query those and clear them later on.
>>>>
>>>> Background is that the drm_gem_object_funcs.close can't fail, but it can perfectly be that the app is killed because of an OOM situation and we can't do page table updates at that moment because of this.
>>>>
>>>>>
>>>>>>
>>>>>> Which makes sense in that case because when the mm_struct is gone the vm_area_struct doesn't make sense any more either.
>>>>>>
>>>>>> What we clearly need is a reference to prevent the VM, or at least the shared resv, from going away too early.
>>>>>
>>>>> Yeah, that was a good hint and we've covered that.
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>> But I think all of this is fixable as follow-ups if needed, unless I'm missing something crucial.
>>>>>
>>>>> Fully agree, I think at this point we should go ahead and land this series.
>>>
>>> +1.
>>>
>>> /Thomas
>>>
>>>
>>>>>
>>>>
>>>> Yeah, agree this is not UAPI so not nailed in stone. Feel free to add my acked-by as well if you want.
>>>>
>>>> Only keep in mind that when you give drivers some functionality in a common component they usually expect to keep that functionality.
>>>>
>>>> For example, changing the dma_resv object to make sure that drivers can't cause use-after-free errors any more was an extremely annoying experience, since every user of that interface had to change at once.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>>>>
>>>>>>> Just my 2 cents.
>>>>>>>
>>>>>>> /Thomas
>>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>
>
Danilo Krummrich Nov. 10, 2023, 4:57 p.m. UTC | #25
On 11/10/23 09:50, Christian König wrote:
> Am 09.11.23 um 19:34 schrieb Danilo Krummrich:
>> On 11/9/23 17:03, Christian König wrote:
>>> Am 09.11.23 um 16:50 schrieb Thomas Hellström:
>>>> [SNIP]
>>>>>>
>>>> Did we get any resolution on this?
>>>>
>>>> FWIW, my take on this is that it would be possible to get GPUVM to work both with and without internal refcounting; if with, the driver needs a vm close to resolve cyclic references, if without, that's not necessary. If GPUVM is allowed to refcount mappings and vm_bos, that comes with a slight performance drop, but as Danilo pointed out, the VM lifetime problem when iterating over a vm_bo's mappings becomes much easier and the code thus becomes easier to maintain moving forward. That convinced me it's a good thing.
>>>
>>> I strongly believe you guys stumbled over one of the core problems with the VM here and I think that reference counting is the right answer to solving this.
>>>
>>> The big question is that what is reference counted and in which direction does the dependency points, e.g. we have here VM, BO, BO_VM and Mapping objects.
>>>
>>> Those patches here suggest a counted Mapping -> VM reference and I'm pretty sure that this isn't a good idea. What we should rather have is a BO -> VM or BO_VM -> VM reference. In other words, each BO which is part of the VM keeps a reference to the VM.
>>
>> We have both. Please see the subsequent patch introducing VM_BO structures for that.
>>
>> As I explained, mappings (struct drm_gpuva) keep a pointer to the VM they're mapped
>> in, and besides that it doesn't make sense to free a VM that still contains mappings;
>> the reference count ensures that. This simply ensures memory safety.
>>
>>>
>>> BTW: At least in amdgpu we can have BOs which (temporarily) don't have any mappings, but are still considered part of the VM.
>>
>> That should be possible.
>>
>>>
>>>>
>>>> Another issue Christian brought up is that something intended to be embeddable (a base class) shouldn't really have its own refcount. I think that's a valid point. If you at some point need to derive from multiple such structs each having its own refcount, things will start to get weird. One way to resolve that would be to have the driver's subclass provide get() and put() ops, and export a destructor for the base-class, rather than to have the base-class provide the refcount and a destructor  ops.
>>
>> GPUVM simply follows the same pattern we have with drm_gem_objects. And I think it makes
>> sense. Why would we want to embed two struct drm_gpuvm in a single driver structure?
> 
> Because you need one drm_gpuvm structure for each application using the driver? Or am I missing something?

Right, *one*, but not more than one. Wasn't that the concern? Maybe I misunderstood something. :)

> 
> As far as I can see a driver would want to embed that into its fpriv structure, which is allocated during the drm_driver.open callback.
> 
>>
>>>
>>> Well, I have never seen stuff like that in the kernel. Might be that this works, but I would rather not try if avoidable.
>>>
>>>>
>>>> That would also make it possible for the driver to decide the context for the put() call: if the driver needs to be able to call put() from irq / atomic context but the base class's destructor doesn't allow atomic context, the driver can push freeing out to a work item if needed.
>>>>
>>>> Finally, there is the refcount overflow Christian pointed out. Limiting the number of mappings sounds like a reasonable remedy to me.
>>>
>>> Well that depends, I would rather avoid having a dependency for mappings.
>>>
>>> Taking the CPU VM handling as an example, as far as I know vm_area_structs don't grab a reference to their mm_struct either. Instead they get automatically destroyed when the mm_struct is destroyed.
>>
>> Certainly, that would be possible. However, thinking about it, this might call for
>> huge trouble.
>>
>> First of all, we'd still need to reference count a GPUVM and take a reference for each
>> VM_BO, as we do already. Now instead of simply increasing the reference count for each
>> mapping as well, we'd need a *mandatory* driver callback that is called when the GPUVM
>> reference count drops to zero. Maybe something like vm_destroy().
>>
>> The reason is that GPUVM can't just remove all mappings from the tree nor can it free them
>> by itself, since drivers might use them for tracking their allocated page tables and/or
>> other stuff.
>>
>> Now, let's think about the scope this callback might be called from. When a VM_BO is destroyed
>> the driver might hold a couple of locks (for Xe it would be the VM's shared dma-resv lock and
>> potentially the corresponding object's dma-resv lock if they're not the same already). If
>> destroying this VM_BO leads to the VM being destroyed, the driver's vm_destroy() callback would
>> be called with those locks being held as well.
>>
>> I feel like doing this finally opens the doors of the locking hell entirely. I think we should
>> really avoid that.
> 
> That's a really good point, but I fear exactly that's the use case.
> 
> I would expect that VM_BO structures are added in the drm_gem_object_funcs.open callback and freed in drm_gem_object_funcs.close.
> 
> Since it is perfectly legal for userspace to close a BO while there are still mappings (it can trivially be that the app is killed), I would expect that the drm_gem_object_funcs.close handling is something like asking drm_gpuvm to destroy the VM_BO and getting the mappings which should be cleared from the page table in return.

Yes, you can do that. You can ask GPUVM to create a set of operations for you to shut down all
mappings of a given VM_BO. But until this is done the VM_BO structure is kept alive. Semantically,
it's exactly the same though.
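
Roughly, such a drm_gem_object_funcs.close handler could look like the sketch below. Only the .close signature and file->driver_priv are taken as given; struct driver_fpriv, driver_vm_bo_lookup() and driver_vm_bo_unmap_async() are hypothetical names for whatever the driver actually uses. The point is only that .close kicks off the teardown, while the VM_BO (and through its reference the VM) stays alive until the page table updates are actually done:

	static void driver_gem_close(struct drm_gem_object *obj, struct drm_file *file)
	{
		struct driver_fpriv *fpriv = file->driver_priv;
		struct driver_vm_bo *vm_bo = driver_vm_bo_lookup(&fpriv->vm, obj);

		if (!vm_bo)
			return;

		/* .close can't fail and can't do page table updates here
		 * (e.g. OOM kill); queue the unmap of all of vm_bo's mappings
		 * instead and let the teardown path drop the final references.
		 */
		driver_vm_bo_unmap_async(vm_bo);
	}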

> 
> In amdgpu we even go a step further and the VM structure keeps track of all the mappings of deleted VM_BOs so that higher level can query those and clear them later on.
> 
> Background is that the drm_gem_object_funcs.close can't fail, but it can perfectly be that the app is killed because of an OOM situation and we can't do page table updates at that moment because of this.
> 
>>
>>>
>>> Which makes sense in that case because when the mm_struct is gone the vm_area_struct doesn't make sense any more either.
>>>
>>> What we clearly need is a reference to prevent the VM, or at least the shared resv, from going away too early.
>>
>> Yeah, that was a good hint and we've covered that.
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> But I think all of this is fixable as follow-ups if needed, unless I'm missing something crucial.
>>
>> Fully agree, I think at this point we should go ahead and land this series.
> 
> Yeah, agree this is not UAPI so not nailed in stone. Feel free to add my acked-by as well if you want.
> 
> Only keep in mind that when you give drivers some functionality in a common component they usually expect to keep that functionality.
> 
> For example, changing the dma_resv object to make sure that drivers can't cause use-after-free errors any more was an extremely annoying experience, since every user of that interface had to change at once.
> 
> Regards,
> Christian.
> 
>>
>>>>
>>>> Just my 2 cents.
>>>>
>>>> /Thomas
>>>>
>>>>
>>>
>>
>
Christian König Nov. 13, 2023, 7:22 a.m. UTC | #26
Am 10.11.23 um 17:57 schrieb Danilo Krummrich:
> On 11/10/23 09:50, Christian König wrote:
>> [SNIP]
>>>>
>>>>>
>>>>> Another issue Christian brought up is that something intended to 
>>>>> be embeddable (a base class) shouldn't really have its own 
>>>>> refcount. I think that's a valid point. If you at some point need 
>>>>> to derive from multiple such structs each having its own refcount, 
>>>>> things will start to get weird. One way to resolve that would be 
>>>>> to have the driver's subclass provide get() and put() ops, and 
>>>>> export a destructor for the base-class, rather than to have the 
>>>>> base-class provide the refcount and a destructor  ops.
>>>
>>> GPUVM simply follows the same pattern we have with drm_gem_objects. 
>>> And I think it makes
>>> sense. Why would we want to embed two struct drm_gpuvm in a single 
>>> driver structure?
>>
>> Because you need one drm_gpuvm structure for each application using 
>> the driver? Or am I missing something?
>
> Right, *one*, but not more than one. Wasn't that the concern? Maybe I 
> misunderstood something. :)

Well, there is the use case of native context with XEN/KVM. In that 
situation QEMU opens tons of driver file descriptors on behalf of the
virtual environment clients.

In this use case you have many drm_gpuvm instances for a single 
application. So you can't assume that you only have one VM per PID/TGID 
or something like that.

AMD already made that mistake with KFD and I strongly suggest not to 
repeat it :)

Regards,
Christian.
Danilo Krummrich Nov. 13, 2023, 12:57 p.m. UTC | #27
On 11/13/23 08:22, Christian König wrote:
> Am 10.11.23 um 17:57 schrieb Danilo Krummrich:
>> On 11/10/23 09:50, Christian König wrote:
>>> [SNIP]
>>>>>
>>>>>>
>>>>>> Another issue Christian brought up is that something intended to be embeddable (a base class) shouldn't really have its own refcount. I think that's a valid point. If you at some point need to derive from multiple such structs each having its own refcount, things will start to get weird. One way to resolve that would be to have the driver's subclass provide get() and put() ops, and export a destructor for the base-class, rather than to have the base-class provide the refcount and a destructor  ops.
>>>>
>>>> GPUVM simply follows the same pattern we have with drm_gem_objects. And I think it makes
>>>> sense. Why would we want to embed two struct drm_gpuvm in a single driver structure?
>>>
>>> Because you need one drm_gpuvm structure for each application using the driver? Or am I missing something?
>>
>> Right, *one*, but not more than one. Wasn't that the concern? Maybe I misunderstood something. :)
> 
> Well, there is the use case of native context with XEN/KVM. In that situation QEMU opens tons of driver file descriptors on behalf of the virtual environment clients.
> 
> In this use case you have many drm_gpuvm instances for a single application. So you can't assume that you only have one VM per PID/TGID or something like that.

Well, that's fine. I think Xe can have multiple VMs per PID as well. In this case you'd keep creating driver VM structures, each with a single GPUVM as base class, but not multiple GPUVMs serving as base classes for a single driver structure, which I thought was the concern here. For the latter I can't see a use case.
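
For reference, a minimal sketch of the pattern this converges on, closely mirroring the nouveau hunk below: one struct drm_gpuvm embedded as the single base class of a driver VM, plus the mandatory vm_free callback. struct driver_vm and the *_async helper are hypothetical; the work item only illustrates Thomas' earlier point that the driver decides the context of the final put and can defer it out of atomic context if needed:

	struct driver_vm {
		struct drm_gpuvm base;		/* single refcounted base class */
		struct work_struct put_work;	/* for puts from atomic context */
		/* driver private state: page table root, MMU handles, ... */
	};

	static void driver_vm_free(struct drm_gpuvm *gpuvm)
	{
		struct driver_vm *dvm = container_of(gpuvm, struct driver_vm, base);

		kfree(dvm);	/* called once the last drm_gpuvm reference is gone */
	}

	static const struct drm_gpuvm_ops driver_gpuvm_ops = {
		.vm_free = driver_vm_free,
	};

	static void driver_vm_put_work(struct work_struct *work)
	{
		struct driver_vm *dvm = container_of(work, struct driver_vm, put_work);

		drm_gpuvm_put(&dvm->base);
	}

	/* Callable from atomic context. Assumes put_work was set up with
	 * INIT_WORK(&dvm->put_work, driver_vm_put_work) at VM creation time.
	 */
	static void driver_vm_put_async(struct driver_vm *dvm)
	{
		schedule_work(&dvm->put_work);
	}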

> 
> AMD already made that mistake with KFD and I strongly suggest not to repeat it :)
> 
> Regards,
> Christian.
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c
index 53e2c406fb04..6a88eafc5229 100644
--- a/drivers/gpu/drm/drm_gpuvm.c
+++ b/drivers/gpu/drm/drm_gpuvm.c
@@ -746,6 +746,8 @@  drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
 	gpuvm->rb.tree = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&gpuvm->rb.list);
 
+	kref_init(&gpuvm->kref);
+
 	gpuvm->name = name ? name : "unknown";
 	gpuvm->flags = flags;
 	gpuvm->ops = ops;
@@ -770,15 +772,8 @@  drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
 }
 EXPORT_SYMBOL_GPL(drm_gpuvm_init);
 
-/**
- * drm_gpuvm_destroy() - cleanup a &drm_gpuvm
- * @gpuvm: pointer to the &drm_gpuvm to clean up
- *
- * Note that it is a bug to call this function on a manager that still
- * holds GPU VA mappings.
- */
-void
-drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
+static void
+drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
 {
 	gpuvm->name = NULL;
 
@@ -790,7 +785,33 @@  drm_gpuvm_destroy(struct drm_gpuvm *gpuvm)
 
 	drm_gem_object_put(gpuvm->r_obj);
 }
-EXPORT_SYMBOL_GPL(drm_gpuvm_destroy);
+
+static void
+drm_gpuvm_free(struct kref *kref)
+{
+	struct drm_gpuvm *gpuvm = container_of(kref, struct drm_gpuvm, kref);
+
+	if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
+		return;
+
+	drm_gpuvm_fini(gpuvm);
+
+	gpuvm->ops->vm_free(gpuvm);
+}
+
+/**
+ * drm_gpuvm_put() - drop a struct drm_gpuvm reference
+ * @gpuvm: the &drm_gpuvm to release the reference of
+ *
+ * This releases a reference to @gpuvm.
+ */
+void
+drm_gpuvm_put(struct drm_gpuvm *gpuvm)
+{
+	if (gpuvm)
+		kref_put(&gpuvm->kref, drm_gpuvm_free);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_put);
 
 static int
 __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
@@ -843,7 +864,7 @@  drm_gpuva_insert(struct drm_gpuvm *gpuvm,
 	if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
 		return -EINVAL;
 
-	return __drm_gpuva_insert(gpuvm, va);
+	return __drm_gpuva_insert(drm_gpuvm_get(gpuvm), va);
 }
 EXPORT_SYMBOL_GPL(drm_gpuva_insert);
 
@@ -876,6 +897,7 @@  drm_gpuva_remove(struct drm_gpuva *va)
 	}
 
 	__drm_gpuva_remove(va);
+	drm_gpuvm_put(va->vm);
 }
 EXPORT_SYMBOL_GPL(drm_gpuva_remove);
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
index 54be12c1272f..cb2f06565c46 100644
--- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
@@ -1780,6 +1780,18 @@  nouveau_uvmm_bo_unmap_all(struct nouveau_bo *nvbo)
 	}
 }
 
+static void
+nouveau_uvmm_free(struct drm_gpuvm *gpuvm)
+{
+	struct nouveau_uvmm *uvmm = uvmm_from_gpuvm(gpuvm);
+
+	kfree(uvmm);
+}
+
+static const struct drm_gpuvm_ops gpuvm_ops = {
+	.vm_free = nouveau_uvmm_free,
+};
+
 int
 nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
 			   void *data,
@@ -1830,7 +1842,7 @@  nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
 		       NOUVEAU_VA_SPACE_END,
 		       init->kernel_managed_addr,
 		       init->kernel_managed_size,
-		       NULL);
+		       &gpuvm_ops);
 	/* GPUVM takes care from here on. */
 	drm_gem_object_put(r_obj);
 
@@ -1849,8 +1861,7 @@  nouveau_uvmm_ioctl_vm_init(struct drm_device *dev,
 	return 0;
 
 out_gpuvm_fini:
-	drm_gpuvm_destroy(&uvmm->base);
-	kfree(uvmm);
+	drm_gpuvm_put(&uvmm->base);
 out_unlock:
 	mutex_unlock(&cli->mutex);
 	return ret;
@@ -1902,7 +1913,6 @@  nouveau_uvmm_fini(struct nouveau_uvmm *uvmm)
 
 	mutex_lock(&cli->mutex);
 	nouveau_vmm_fini(&uvmm->vmm);
-	drm_gpuvm_destroy(&uvmm->base);
-	kfree(uvmm);
+	drm_gpuvm_put(&uvmm->base);
 	mutex_unlock(&cli->mutex);
 }
diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
index 0c2e24155a93..4e6e1fd3485a 100644
--- a/include/drm/drm_gpuvm.h
+++ b/include/drm/drm_gpuvm.h
@@ -247,6 +247,11 @@  struct drm_gpuvm {
 		struct list_head list;
 	} rb;
 
+	/**
+	 * @kref: reference count of this object
+	 */
+	struct kref kref;
+
 	/**
 	 * @kernel_alloc_node:
 	 *
@@ -273,7 +278,23 @@  void drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
 		    u64 start_offset, u64 range,
 		    u64 reserve_offset, u64 reserve_range,
 		    const struct drm_gpuvm_ops *ops);
-void drm_gpuvm_destroy(struct drm_gpuvm *gpuvm);
+
+/**
+ * drm_gpuvm_get() - acquire a struct drm_gpuvm reference
+ * @gpuvm: the &drm_gpuvm to acquire the reference of
+ *
+ * This function acquires an additional reference to @gpuvm. It is illegal to
+ * call this without already holding a reference. No locks required.
+ */
+static inline struct drm_gpuvm *
+drm_gpuvm_get(struct drm_gpuvm *gpuvm)
+{
+	kref_get(&gpuvm->kref);
+
+	return gpuvm;
+}
+
+void drm_gpuvm_put(struct drm_gpuvm *gpuvm);
 
 bool drm_gpuvm_range_valid(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
 bool drm_gpuvm_interval_empty(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
@@ -673,6 +694,14 @@  static inline void drm_gpuva_init_from_op(struct drm_gpuva *va,
  * operations to drivers.
  */
 struct drm_gpuvm_ops {
+	/**
+	 * @vm_free: called when the last reference of a struct drm_gpuvm is
+	 * dropped
+	 *
+	 * This callback is mandatory.
+	 */
+	void (*vm_free)(struct drm_gpuvm *gpuvm);
+
 	/**
 	 * @op_alloc: called when the &drm_gpuvm allocates
 	 * a struct drm_gpuva_op