diff mbox series

vfio: Reuse file f_inode as vfio device inode

Message ID 20240617095332.30543-1-yan.y.zhao@intel.com (mailing list archive)
State New, archived
Headers show
Series vfio: Reuse file f_inode as vfio device inode | expand

Commit Message

Yan Zhao June 17, 2024, 9:53 a.m. UTC
Reuse file f_inode as vfio device inode and associate pseudo path file
directly to inode allocated in vfio fs.

Currently, vfio device is opened via 2 ways:
1) via cdev open
   vfio device is opened with a cdev device with file f_inode and address
   space associated with a cdev inode;
2) via VFIO_GROUP_GET_DEVICE_FD ioctl
   vfio device is opened via a pseudo path file with file f_inode and
   address space associated with an inode in anon_inode_fs.

In commit b7c5e64fecfa ("vfio: Create vfio_fs_type with inode per device"),
an inode in vfio fs is allocated for each vfio device. However, this inode
in vfio fs is only used to assign its address space to that of a file
associated with another cdev inode or an inode in anon_inode_fs.

This patch
- reuses cdev device inode as the vfio device inode when it's opened via
  cdev way;
- allocates an inode in vfio fs, associate it to the pseudo path file,
  and save it as the vfio device inode when the vfio device is opened via
  VFIO_GROUP_GET_DEVICE_FD ioctl.

File address space will then point automatically to the address space of
the vfio device inode. Tools like unmap_mapping_range() can then zap all
vmas associated with the vfio device.

Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 drivers/vfio/device_cdev.c |  9 ++++---
 drivers/vfio/group.c       | 21 ++--------------
 drivers/vfio/vfio.h        |  2 ++
 drivers/vfio/vfio_main.c   | 49 +++++++++++++++++++++++++++-----------
 4 files changed, 43 insertions(+), 38 deletions(-)


base-commit: 6ba59ff4227927d3a8530fc2973b80e94b54d58f

Comments

Yan Zhao June 20, 2024, 10:14 a.m. UTC | #1
On Mon, Jun 17, 2024 at 05:53:32PM +0800, Yan Zhao wrote:
...
> diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
> index ded364588d29..aaef188003b6 100644
> --- a/drivers/vfio/group.c
> +++ b/drivers/vfio/group.c
> @@ -268,31 +268,14 @@ static struct file *vfio_device_open_file(struct vfio_device *device)
>  	if (ret)
>  		goto err_free;
>  
> -	/*
> -	 * We can't use anon_inode_getfd() because we need to modify
> -	 * the f_mode flags directly to allow more than just ioctls
> -	 */
> -	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
> -				   df, O_RDWR);
> +	filep = vfio_device_get_pseudo_file(device);
If getting an inode from vfio_fs_type is not a must, maybe we could use
anon_inode_create_getfile() here?
Then changes to group.c and vfio_main.c can be simplified as below:

diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index ded364588d29..7f2f7871403f 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -269,29 +269,22 @@ static struct file *vfio_device_open_file(struct vfio_device *device)
                goto err_free;

        /*
-        * We can't use anon_inode_getfd() because we need to modify
-        * the f_mode flags directly to allow more than just ioctls
+        * Get a unique inode from anon_inodefs
         */
-       filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
-                                  df, O_RDWR);
+       filep = anon_inode_create_getfile("[vfio-device]", &vfio_device_fops, df,
+                                         O_RDWR, NULL);
        if (IS_ERR(filep)) {
                ret = PTR_ERR(filep);
                goto err_close_device;
        }
-
-       /*
-        * TODO: add an anon_inode interface to do this.
-        * Appears to be missing by lack of need rather than
-        * explicitly prevented.  Now there's need.
-        */
        filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);

        /*
-        * Use the pseudo fs inode on the device to link all mmaps
-        * to the same address space, allowing us to unmap all vmas
-        * associated to this device using unmap_mapping_range().
+        * mmaps are linked to the address space of the filep->f_inode.
+        * Save the inode in device->inode to allow unmap_mapping_range() to
+        * unmap all vmas.
         */
-       filep->f_mapping = device->inode->i_mapping;
+       device->inode = filep->f_inode;

        if (device->group->type == VFIO_NO_IOMMU)
                dev_warn(device->dev, "vfio-noiommu device opened by user "

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index a5a62d9d963f..c9dac788411b 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -192,8 +192,6 @@ static void vfio_device_release(struct device *dev)
        if (device->ops->release)
                device->ops->release(device);

-       iput(device->inode);
-       simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
        kvfree(device);
 }

@@ -248,22 +246,6 @@ static struct file_system_type vfio_fs_type = {
        .kill_sb = kill_anon_super,
 };

-static struct inode *vfio_fs_inode_new(void)
-{
-       struct inode *inode;
-       int ret;
-
-       ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
-       if (ret)
-               return ERR_PTR(ret);
-
-       inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
-       if (IS_ERR(inode))
-               simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
-
-       return inode;
-}
-
 /*
  * Initialize a vfio_device so it can be registered to vfio core.
  */
@@ -282,11 +264,6 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
        init_completion(&device->comp);
        device->dev = dev;
        device->ops = ops;
-       device->inode = vfio_fs_inode_new();
-       if (IS_ERR(device->inode)) {
-               ret = PTR_ERR(device->inode);
-               goto out_inode;
-       }

        if (ops->init) {
                ret = ops->init(device);
@@ -301,9 +278,6 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
        return 0;

 out_uninit:
-       iput(device->inode);
-       simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
-out_inode:
        vfio_release_device_set(device);
        ida_free(&vfio.device_ida, device->index);
        return ret;


> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 50128da18bca..1f8915f79fbb 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -35,6 +35,7 @@ struct vfio_device_file *
>  vfio_allocate_device_file(struct vfio_device *device);
>  
>  extern const struct file_operations vfio_device_fops;
> +struct file *vfio_device_get_pseudo_file(struct vfio_device *device);
>  
>  #ifdef CONFIG_VFIO_NOIOMMU
>  extern bool vfio_noiommu __read_mostly;
> @@ -420,6 +421,7 @@ static inline void vfio_cdev_cleanup(void)
>  {
>  }
>  #endif /* CONFIG_VFIO_DEVICE_CDEV */
> +struct file *vfio_device_get_pseduo_file(struct vfio_device *device);
Sorry, this line was included by mistake.
Tian, Kevin June 26, 2024, 8:36 a.m. UTC | #2
> From: Zhao, Yan Y <yan.y.zhao@intel.com>
> Sent: Thursday, June 20, 2024 6:15 PM
> 
> On Mon, Jun 17, 2024 at 05:53:32PM +0800, Yan Zhao wrote:
> ...
> > diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
> > index ded364588d29..aaef188003b6 100644
> > --- a/drivers/vfio/group.c
> > +++ b/drivers/vfio/group.c
> > @@ -268,31 +268,14 @@ static struct file *vfio_device_open_file(struct
> vfio_device *device)
> >  	if (ret)
> >  		goto err_free;
> >
> > -	/*
> > -	 * We can't use anon_inode_getfd() because we need to modify
> > -	 * the f_mode flags directly to allow more than just ioctls
> > -	 */
> > -	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
> > -				   df, O_RDWR);
> > +	filep = vfio_device_get_pseudo_file(device);
> If getting an inode from vfio_fs_type is not a must, maybe we could use
> anon_inode_create_getfile() here?
> Then changes to group.c and vfio_main.c can be simplified as below:

not familiar with file system, but at a glance the anon_inodefs is similar
to vfio's own pseudo fs so it might work. anyway what is required here
is to have an unique inode per vfio device to hold an unique address space
and anon_inode_create_getfile() appears to achieve it.

> 
> diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
> index ded364588d29..7f2f7871403f 100644
> --- a/drivers/vfio/group.c
> +++ b/drivers/vfio/group.c
> @@ -269,29 +269,22 @@ static struct file *vfio_device_open_file(struct
> vfio_device *device)
>                 goto err_free;
> 
>         /*
> -        * We can't use anon_inode_getfd() because we need to modify
> -        * the f_mode flags directly to allow more than just ioctls
> +        * Get a unique inode from anon_inodefs
>          */
> -       filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
> -                                  df, O_RDWR);
> +       filep = anon_inode_create_getfile("[vfio-device]", &vfio_device_fops, df,
> +                                         O_RDWR, NULL);
>         if (IS_ERR(filep)) {
>                 ret = PTR_ERR(filep);
>                 goto err_close_device;
>         }
> -
> -       /*
> -        * TODO: add an anon_inode interface to do this.
> -        * Appears to be missing by lack of need rather than
> -        * explicitly prevented.  Now there's need.
> -        */

why removing this comment?

>         filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
> 
>         /*
> -        * Use the pseudo fs inode on the device to link all mmaps
> -        * to the same address space, allowing us to unmap all vmas
> -        * associated to this device using unmap_mapping_range().
> +        * mmaps are linked to the address space of the filep->f_inode.
> +        * Save the inode in device->inode to allow unmap_mapping_range() to
> +        * unmap all vmas.
>          */
> -       filep->f_mapping = device->inode->i_mapping;
> +       device->inode = filep->f_inode;
> 
>         if (device->group->type == VFIO_NO_IOMMU)
>                 dev_warn(device->dev, "vfio-noiommu device opened by user "
> 
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index a5a62d9d963f..c9dac788411b 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -192,8 +192,6 @@ static void vfio_device_release(struct device *dev)
>         if (device->ops->release)
>                 device->ops->release(device);
> 
> -       iput(device->inode);
> -       simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
>         kvfree(device);
>  }
> 
> @@ -248,22 +246,6 @@ static struct file_system_type vfio_fs_type = {
>         .kill_sb = kill_anon_super,
>  };

then vfio_fs_type can be removed too.

> 
> -static struct inode *vfio_fs_inode_new(void)
> -{
> -       struct inode *inode;
> -       int ret;
> -
> -       ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
> -       if (ret)
> -               return ERR_PTR(ret);
> -
> -       inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
> -       if (IS_ERR(inode))
> -               simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> -
> -       return inode;
> -}
> -
>  /*
>   * Initialize a vfio_device so it can be registered to vfio core.
>   */
> @@ -282,11 +264,6 @@ static int vfio_init_device(struct vfio_device *device,
> struct device *dev,
>         init_completion(&device->comp);
>         device->dev = dev;
>         device->ops = ops;
> -       device->inode = vfio_fs_inode_new();
> -       if (IS_ERR(device->inode)) {
> -               ret = PTR_ERR(device->inode);
> -               goto out_inode;
> -       }
> 
>         if (ops->init) {
>                 ret = ops->init(device);
> @@ -301,9 +278,6 @@ static int vfio_init_device(struct vfio_device *device,
> struct device *dev,
>         return 0;
> 
>  out_uninit:
> -       iput(device->inode);
> -       simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> -out_inode:
>         vfio_release_device_set(device);
>         ida_free(&vfio.device_ida, device->index);
>         return ret;
> 
> 
> > diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> > index 50128da18bca..1f8915f79fbb 100644
> > --- a/drivers/vfio/vfio.h
> > +++ b/drivers/vfio/vfio.h
> > @@ -35,6 +35,7 @@ struct vfio_device_file *
> >  vfio_allocate_device_file(struct vfio_device *device);
> >
> >  extern const struct file_operations vfio_device_fops;
> > +struct file *vfio_device_get_pseudo_file(struct vfio_device *device);
> >
> >  #ifdef CONFIG_VFIO_NOIOMMU
> >  extern bool vfio_noiommu __read_mostly;
> > @@ -420,6 +421,7 @@ static inline void vfio_cdev_cleanup(void)
> >  {
> >  }
> >  #endif /* CONFIG_VFIO_DEVICE_CDEV */
> > +struct file *vfio_device_get_pseduo_file(struct vfio_device *device);
> Sorry, this line was included by mistake.
Yan Zhao June 26, 2024, 9:11 a.m. UTC | #3
On Wed, Jun 26, 2024 at 04:36:26PM +0800, Tian, Kevin wrote:
> > From: Zhao, Yan Y <yan.y.zhao@intel.com>
> > Sent: Thursday, June 20, 2024 6:15 PM
> > 
> > On Mon, Jun 17, 2024 at 05:53:32PM +0800, Yan Zhao wrote:
> > ...
> > > diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
> > > index ded364588d29..aaef188003b6 100644
> > > --- a/drivers/vfio/group.c
> > > +++ b/drivers/vfio/group.c
> > > @@ -268,31 +268,14 @@ static struct file *vfio_device_open_file(struct
> > vfio_device *device)
> > >  	if (ret)
> > >  		goto err_free;
> > >
> > > -	/*
> > > -	 * We can't use anon_inode_getfd() because we need to modify
> > > -	 * the f_mode flags directly to allow more than just ioctls
> > > -	 */
> > > -	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
> > > -				   df, O_RDWR);
> > > +	filep = vfio_device_get_pseudo_file(device);
> > If getting an inode from vfio_fs_type is not a must, maybe we could use
> > anon_inode_create_getfile() here?
> > Then changes to group.c and vfio_main.c can be simplified as below:
> 
> not familiar with file system, but at a glance the anon_inodefs is similar
> to vfio's own pseudo fs so it might work. anyway what is required here
> is to have an unique inode per vfio device to hold an unique address space
> and anon_inode_create_getfile() appears to achieve it.
Right.
 
> > 
> > diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
> > index ded364588d29..7f2f7871403f 100644
> > --- a/drivers/vfio/group.c
> > +++ b/drivers/vfio/group.c
> > @@ -269,29 +269,22 @@ static struct file *vfio_device_open_file(struct
> > vfio_device *device)
> >                 goto err_free;
> > 
> >         /*
> > -        * We can't use anon_inode_getfd() because we need to modify
> > -        * the f_mode flags directly to allow more than just ioctls
> > +        * Get a unique inode from anon_inodefs
> >          */
> > -       filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
> > -                                  df, O_RDWR);
> > +       filep = anon_inode_create_getfile("[vfio-device]", &vfio_device_fops, df,
> > +                                         O_RDWR, NULL);
> >         if (IS_ERR(filep)) {
> >                 ret = PTR_ERR(filep);
> >                 goto err_close_device;
> >         }
> > -
> > -       /*
> > -        * TODO: add an anon_inode interface to do this.
> > -        * Appears to be missing by lack of need rather than
> > -        * explicitly prevented.  Now there's need.
> > -        */
> 
> why removing this comment?
I found it's confusing, as now an anon_inode is available but
"filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE)" still cannot be avoided.
To avoid it, the file needs be opened through do_dentry_open() interface,
which is not easily achieved.


> >         filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
> > 
> >         /*
> > -        * Use the pseudo fs inode on the device to link all mmaps
> > -        * to the same address space, allowing us to unmap all vmas
> > -        * associated to this device using unmap_mapping_range().
> > +        * mmaps are linked to the address space of the filep->f_inode.
> > +        * Save the inode in device->inode to allow unmap_mapping_range() to
> > +        * unmap all vmas.
> >          */
> > -       filep->f_mapping = device->inode->i_mapping;
> > +       device->inode = filep->f_inode;
> > 
> >         if (device->group->type == VFIO_NO_IOMMU)
> >                 dev_warn(device->dev, "vfio-noiommu device opened by user "
> > 
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > index a5a62d9d963f..c9dac788411b 100644
> > --- a/drivers/vfio/vfio_main.c
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -192,8 +192,6 @@ static void vfio_device_release(struct device *dev)
> >         if (device->ops->release)
> >                 device->ops->release(device);
> > 
> > -       iput(device->inode);
> > -       simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> >         kvfree(device);
> >  }
> > 
> > @@ -248,22 +246,6 @@ static struct file_system_type vfio_fs_type = {
> >         .kill_sb = kill_anon_super,
> >  };
> 
> then vfio_fs_type can be removed too.
Right. Thanks for catching it.

> > 
> > -static struct inode *vfio_fs_inode_new(void)
> > -{
> > -       struct inode *inode;
> > -       int ret;
> > -
> > -       ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
> > -       if (ret)
> > -               return ERR_PTR(ret);
> > -
> > -       inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
> > -       if (IS_ERR(inode))
> > -               simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> > -
> > -       return inode;
> > -}
> > -
> >  /*
> >   * Initialize a vfio_device so it can be registered to vfio core.
> >   */
> > @@ -282,11 +264,6 @@ static int vfio_init_device(struct vfio_device *device,
> > struct device *dev,
> >         init_completion(&device->comp);
> >         device->dev = dev;
> >         device->ops = ops;
> > -       device->inode = vfio_fs_inode_new();
> > -       if (IS_ERR(device->inode)) {
> > -               ret = PTR_ERR(device->inode);
> > -               goto out_inode;
> > -       }
> > 
> >         if (ops->init) {
> >                 ret = ops->init(device);
> > @@ -301,9 +278,6 @@ static int vfio_init_device(struct vfio_device *device,
> > struct device *dev,
> >         return 0;
> > 
> >  out_uninit:
> > -       iput(device->inode);
> > -       simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> > -out_inode:
> >         vfio_release_device_set(device);
> >         ida_free(&vfio.device_ida, device->index);
> >         return ret;
> > 
> > 
> > > diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> > > index 50128da18bca..1f8915f79fbb 100644
> > > --- a/drivers/vfio/vfio.h
> > > +++ b/drivers/vfio/vfio.h
> > > @@ -35,6 +35,7 @@ struct vfio_device_file *
> > >  vfio_allocate_device_file(struct vfio_device *device);
> > >
> > >  extern const struct file_operations vfio_device_fops;
> > > +struct file *vfio_device_get_pseudo_file(struct vfio_device *device);
> > >
> > >  #ifdef CONFIG_VFIO_NOIOMMU
> > >  extern bool vfio_noiommu __read_mostly;
> > > @@ -420,6 +421,7 @@ static inline void vfio_cdev_cleanup(void)
> > >  {
> > >  }
> > >  #endif /* CONFIG_VFIO_DEVICE_CDEV */
> > > +struct file *vfio_device_get_pseduo_file(struct vfio_device *device);
> > Sorry, this line was included by mistake.
Jason Gunthorpe June 26, 2024, 1:35 p.m. UTC | #4
On Mon, Jun 17, 2024 at 05:53:32PM +0800, Yan Zhao wrote:
> Reuse file f_inode as vfio device inode and associate pseudo path file
> directly to inode allocated in vfio fs.
> 
> Currently, vfio device is opened via 2 ways:
> 1) via cdev open
>    vfio device is opened with a cdev device with file f_inode and address
>    space associated with a cdev inode;
> 2) via VFIO_GROUP_GET_DEVICE_FD ioctl
>    vfio device is opened via a pseudo path file with file f_inode and
>    address space associated with an inode in anon_inode_fs.
> 
> In commit b7c5e64fecfa ("vfio: Create vfio_fs_type with inode per device"),
> an inode in vfio fs is allocated for each vfio device. However, this inode
> in vfio fs is only used to assign its address space to that of a file
> associated with another cdev inode or an inode in anon_inode_fs.
> 
> This patch
> - reuses cdev device inode as the vfio device inode when it's opened via
>   cdev way;
> - allocates an inode in vfio fs, associate it to the pseudo path file,
>   and save it as the vfio device inode when the vfio device is opened via
>   VFIO_GROUP_GET_DEVICE_FD ioctl.
> 
> File address space will then point automatically to the address space of
> the vfio device inode. Tools like unmap_mapping_range() can then zap all
> vmas associated with the vfio device.
> 
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> ---
>  drivers/vfio/device_cdev.c |  9 ++++---
>  drivers/vfio/group.c       | 21 ++--------------
>  drivers/vfio/vfio.h        |  2 ++
>  drivers/vfio/vfio_main.c   | 49 +++++++++++++++++++++++++++-----------
>  4 files changed, 43 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> index bb1817bd4ff3..a4eec8e88f5c 100644
> --- a/drivers/vfio/device_cdev.c
> +++ b/drivers/vfio/device_cdev.c
> @@ -40,12 +40,11 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
>  	filep->private_data = df;
>  
>  	/*
> -	 * Use the pseudo fs inode on the device to link all mmaps
> -	 * to the same address space, allowing us to unmap all vmas
> -	 * associated to this device using unmap_mapping_range().
> +	 * mmaps are linked to the address space of the inode of device cdev.
> +	 * Save the inode of device cdev in device->inode to allow
> +	 * unmap_mapping_range() to unmap all vmas.
>  	 */
> -	filep->f_mapping = device->inode->i_mapping;
> -
> +	device->inode = inode;

This doesn't seem right.. There is only one device but multiple file
can be opened on that device.

We expect every open file to have a unique inode otherwise the
unmap_mapping_range() will not function properly.

Jason
Tian, Kevin June 26, 2024, 11:55 p.m. UTC | #5
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, June 26, 2024 9:35 PM
> 
> On Mon, Jun 17, 2024 at 05:53:32PM +0800, Yan Zhao wrote:
> > Reuse file f_inode as vfio device inode and associate pseudo path file
> > directly to inode allocated in vfio fs.
> >
> > Currently, vfio device is opened via 2 ways:
> > 1) via cdev open
> >    vfio device is opened with a cdev device with file f_inode and address
> >    space associated with a cdev inode;
> > 2) via VFIO_GROUP_GET_DEVICE_FD ioctl
> >    vfio device is opened via a pseudo path file with file f_inode and
> >    address space associated with an inode in anon_inode_fs.
> >
> > In commit b7c5e64fecfa ("vfio: Create vfio_fs_type with inode per device"),
> > an inode in vfio fs is allocated for each vfio device. However, this inode
> > in vfio fs is only used to assign its address space to that of a file
> > associated with another cdev inode or an inode in anon_inode_fs.
> >
> > This patch
> > - reuses cdev device inode as the vfio device inode when it's opened via
> >   cdev way;
> > - allocates an inode in vfio fs, associate it to the pseudo path file,
> >   and save it as the vfio device inode when the vfio device is opened via
> >   VFIO_GROUP_GET_DEVICE_FD ioctl.
> >
> > File address space will then point automatically to the address space of
> > the vfio device inode. Tools like unmap_mapping_range() can then zap all
> > vmas associated with the vfio device.
> >
> > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > ---
> >  drivers/vfio/device_cdev.c |  9 ++++---
> >  drivers/vfio/group.c       | 21 ++--------------
> >  drivers/vfio/vfio.h        |  2 ++
> >  drivers/vfio/vfio_main.c   | 49 +++++++++++++++++++++++++++-----------
> >  4 files changed, 43 insertions(+), 38 deletions(-)
> >
> > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > index bb1817bd4ff3..a4eec8e88f5c 100644
> > --- a/drivers/vfio/device_cdev.c
> > +++ b/drivers/vfio/device_cdev.c
> > @@ -40,12 +40,11 @@ int vfio_device_fops_cdev_open(struct inode
> *inode, struct file *filep)
> >  	filep->private_data = df;
> >
> >  	/*
> > -	 * Use the pseudo fs inode on the device to link all mmaps
> > -	 * to the same address space, allowing us to unmap all vmas
> > -	 * associated to this device using unmap_mapping_range().
> > +	 * mmaps are linked to the address space of the inode of device cdev.
> > +	 * Save the inode of device cdev in device->inode to allow
> > +	 * unmap_mapping_range() to unmap all vmas.
> >  	 */
> > -	filep->f_mapping = device->inode->i_mapping;
> > -
> > +	device->inode = inode;
> 
> This doesn't seem right.. There is only one device but multiple file
> can be opened on that device.
> 
> We expect every open file to have a unique inode otherwise the
> unmap_mapping_range() will not function properly.
> 

Does it mean that the existing code is already broken? there is only
one vfio-fs inode per device (allocated at vfio_init_device()).

And if we expect unique inode per open file then there will be a list
of inodes tracked under vfio_pci_core_device for unmap_mapping_range()
but it's also not the case today:

static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
{
	struct vfio_device *core_vdev = &vdev->vdev;
	loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
	loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
	loff_t len = end - start;

	unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true);
}
Tian, Kevin June 27, 2024, 12:17 a.m. UTC | #6
> From: Tian, Kevin
> Sent: Thursday, June 27, 2024 7:56 AM
> 
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, June 26, 2024 9:35 PM
> >
> > On Mon, Jun 17, 2024 at 05:53:32PM +0800, Yan Zhao wrote:
> > > Reuse file f_inode as vfio device inode and associate pseudo path file
> > > directly to inode allocated in vfio fs.
> > >
> > > Currently, vfio device is opened via 2 ways:
> > > 1) via cdev open
> > >    vfio device is opened with a cdev device with file f_inode and address
> > >    space associated with a cdev inode;
> > > 2) via VFIO_GROUP_GET_DEVICE_FD ioctl
> > >    vfio device is opened via a pseudo path file with file f_inode and
> > >    address space associated with an inode in anon_inode_fs.
> > >
> > > In commit b7c5e64fecfa ("vfio: Create vfio_fs_type with inode per
> device"),
> > > an inode in vfio fs is allocated for each vfio device. However, this inode
> > > in vfio fs is only used to assign its address space to that of a file
> > > associated with another cdev inode or an inode in anon_inode_fs.
> > >
> > > This patch
> > > - reuses cdev device inode as the vfio device inode when it's opened via
> > >   cdev way;
> > > - allocates an inode in vfio fs, associate it to the pseudo path file,
> > >   and save it as the vfio device inode when the vfio device is opened via
> > >   VFIO_GROUP_GET_DEVICE_FD ioctl.
> > >
> > > File address space will then point automatically to the address space of
> > > the vfio device inode. Tools like unmap_mapping_range() can then zap all
> > > vmas associated with the vfio device.
> > >
> > > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > > ---
> > >  drivers/vfio/device_cdev.c |  9 ++++---
> > >  drivers/vfio/group.c       | 21 ++--------------
> > >  drivers/vfio/vfio.h        |  2 ++
> > >  drivers/vfio/vfio_main.c   | 49 +++++++++++++++++++++++++++-----------
> > >  4 files changed, 43 insertions(+), 38 deletions(-)
> > >
> > > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > > index bb1817bd4ff3..a4eec8e88f5c 100644
> > > --- a/drivers/vfio/device_cdev.c
> > > +++ b/drivers/vfio/device_cdev.c
> > > @@ -40,12 +40,11 @@ int vfio_device_fops_cdev_open(struct inode
> > *inode, struct file *filep)
> > >  	filep->private_data = df;
> > >
> > >  	/*
> > > -	 * Use the pseudo fs inode on the device to link all mmaps
> > > -	 * to the same address space, allowing us to unmap all vmas
> > > -	 * associated to this device using unmap_mapping_range().
> > > +	 * mmaps are linked to the address space of the inode of device cdev.
> > > +	 * Save the inode of device cdev in device->inode to allow
> > > +	 * unmap_mapping_range() to unmap all vmas.
> > >  	 */
> > > -	filep->f_mapping = device->inode->i_mapping;
> > > -
> > > +	device->inode = inode;
> >
> > This doesn't seem right.. There is only one device but multiple file
> > can be opened on that device.
> >
> > We expect every open file to have a unique inode otherwise the
> > unmap_mapping_range() will not function properly.
> >

Can you elaborate the reason of this expectation? If multiple open's
come from a same process then having them share a single address
space per device still makes sense. Are you considering a scenario
where a vfio device is opened by multiple processes? is it allowed?

btw Yan's patch appears to impose different behaviors between cdev
and group paths. For cdev all open files share the address space of
the cdev inode, same effect as sharing that of the vfio-fs inode today.
But for group open every open file will get a new inode which kind of
matches your expectation but the patch simply overrides
vfio_device->inode instead of tracking a list. That sound incomplete.

> 
> Does it mean that the existing code is already broken? there is only
> one vfio-fs inode per device (allocated at vfio_init_device()).
> 
> And if we expect unique inode per open file then there will be a list
> of inodes tracked under vfio_pci_core_device for unmap_mapping_range()
> but it's also not the case today:
> 
> static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
> {
> 	struct vfio_device *core_vdev = &vdev->vdev;
> 	loff_t start =
> VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
> 	loff_t end =
> VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
> 	loff_t len = end - start;
> 
> 	unmap_mapping_range(core_vdev->inode->i_mapping, start, len,
> true);
> }
Yan Zhao June 27, 2024, 9:51 a.m. UTC | #7
On Thu, Jun 27, 2024 at 08:17:02AM +0800, Tian, Kevin wrote:
> > From: Tian, Kevin
> > Sent: Thursday, June 27, 2024 7:56 AM
> > 
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Wednesday, June 26, 2024 9:35 PM
> > >
> > > On Mon, Jun 17, 2024 at 05:53:32PM +0800, Yan Zhao wrote:
> > > > Reuse file f_inode as vfio device inode and associate pseudo path file
> > > > directly to inode allocated in vfio fs.
> > > >
> > > > Currently, vfio device is opened via 2 ways:
> > > > 1) via cdev open
> > > >    vfio device is opened with a cdev device with file f_inode and address
> > > >    space associated with a cdev inode;
> > > > 2) via VFIO_GROUP_GET_DEVICE_FD ioctl
> > > >    vfio device is opened via a pseudo path file with file f_inode and
> > > >    address space associated with an inode in anon_inode_fs.
> > > >
> > > > In commit b7c5e64fecfa ("vfio: Create vfio_fs_type with inode per
> > device"),
> > > > an inode in vfio fs is allocated for each vfio device. However, this inode
> > > > in vfio fs is only used to assign its address space to that of a file
> > > > associated with another cdev inode or an inode in anon_inode_fs.
> > > >
> > > > This patch
> > > > - reuses cdev device inode as the vfio device inode when it's opened via
> > > >   cdev way;
> > > > - allocates an inode in vfio fs, associate it to the pseudo path file,
> > > >   and save it as the vfio device inode when the vfio device is opened via
> > > >   VFIO_GROUP_GET_DEVICE_FD ioctl.
> > > >
> > > > File address space will then point automatically to the address space of
> > > > the vfio device inode. Tools like unmap_mapping_range() can then zap all
> > > > vmas associated with the vfio device.
> > > >
> > > > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > > > ---
> > > >  drivers/vfio/device_cdev.c |  9 ++++---
> > > >  drivers/vfio/group.c       | 21 ++--------------
> > > >  drivers/vfio/vfio.h        |  2 ++
> > > >  drivers/vfio/vfio_main.c   | 49 +++++++++++++++++++++++++++-----------
> > > >  4 files changed, 43 insertions(+), 38 deletions(-)
> > > >
> > > > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > > > index bb1817bd4ff3..a4eec8e88f5c 100644
> > > > --- a/drivers/vfio/device_cdev.c
> > > > +++ b/drivers/vfio/device_cdev.c
> > > > @@ -40,12 +40,11 @@ int vfio_device_fops_cdev_open(struct inode
> > > *inode, struct file *filep)
> > > >  	filep->private_data = df;
> > > >
> > > >  	/*
> > > > -	 * Use the pseudo fs inode on the device to link all mmaps
> > > > -	 * to the same address space, allowing us to unmap all vmas
> > > > -	 * associated to this device using unmap_mapping_range().
> > > > +	 * mmaps are linked to the address space of the inode of device cdev.
> > > > +	 * Save the inode of device cdev in device->inode to allow
> > > > +	 * unmap_mapping_range() to unmap all vmas.
> > > >  	 */
> > > > -	filep->f_mapping = device->inode->i_mapping;
> > > > -
> > > > +	device->inode = inode;
> > >
> > > This doesn't seem right.. There is only one device but multiple file
> > > can be opened on that device.
Maybe we can put this assignment to vfio_df_ioctl_bind_iommufd() after
vfio_df_open() makes sure device->open_count is 1.

e.g.
-long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
+long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, struct inode *inode,
                                struct vfio_device_bind_iommufd __user *arg)
 {
        struct vfio_device *device = df->device;
@@ -118,6 +111,8 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
                goto out_close_device;

        device->cdev_opened = true;
+
+       device->inode = inode;
        /*
         * Paired with smp_load_acquire() in vfio_device_fops::ioctl/

 @@ -1266,7 +1296,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
        int ret;

        if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
-               return vfio_df_ioctl_bind_iommufd(df, uptr);
+               return vfio_df_ioctl_bind_iommufd(df, filep->f_inode, uptr);


> > >
> > > We expect every open file to have a unique inode otherwise the
> > > unmap_mapping_range() will not function properly.
Even with commit b7c5e64fecfa ("vfio: Create vfio_fs_type with inode per
device"), in group path,
vfio_device_open_file() calls anon_inode_getfile(), from which the inode
returned is always anon_inode_inode, which is no unique even across
different devices.

> 
> Can you elaborate the reason of this expectation? If multiple open's
> come from a same process then having them share a single address
> space per device still makes sense. Are you considering a scenario
> where a vfio device is opened by multiple processes? is it allowed?
> 
> btw Yan's patch appears to impose different behaviors between cdev
> and group paths. For cdev all open files share the address space of
> the cdev inode, same effect as sharing that of the vfio-fs inode today.
> But for group open every open file will get a new inode which kind of
> matches your expectation but the patch simply overrides
> vfio_device->inode instead of tracking a list. That sound incomplete.
Yes, looks it's incomplete for group path.
What about still using inode in vfio fs as below in group path?
(I'll post the complete code which have passed my local tests if you think the
direction is right).

BTW, in group path, what's the benefit of allowing multiple open of device?

struct file *vfio_device_get_pseudo_file(struct vfio_device *device)             
{                                                                                
        const struct file_operations *fops = &vfio_device_fops;                  
        struct inode *inode;                                                     
        struct file *filep;                                                      
        int ret;                                                                 
                                                                                 
        if (!fops_get(fops))                                                     
                return ERR_PTR(-ENODEV);                                         
                                                                                 
        mutex_lock(&device->dev_set->lock);                                      
        if (device->open_count == 1) {                                           
                ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
                if (ret)                                                         
                        goto err_pin_fs;                                         
                                                                                 
                inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);                
                if (IS_ERR(inode)) {                                             
                        ret = PTR_ERR(inode);                                    
                        goto err_inode;                                          
                }                                                                
                device->inode = inode;                                           
        } else {                                                                 
                inode = device->inode;                                           
                ihold(inode);                                                    
        }                                                                        
                                                                                 
        filep = alloc_file_pseudo(inode, vfio.vfs_mount, "[vfio-device]",        
                                  O_RDWR, fops);                                 
                                                                                 
        if (IS_ERR(filep)) {                                                     
                ret = PTR_ERR(filep);                                            
                goto err_file;                                                   
        }                                                                        
        mutex_unlock(&device->dev_set->lock);                                    
        return filep;                                                            
                                                                                 
err_file:                                                                        
        iput(inode);                                                             
err_inode:                                                                       
        if (device->open_count == 1)                                             
                simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);              
err_pin_fs:                                                                      
        mutex_unlock(&device->dev_set->lock);                                    
        fops_put(fops);                                                          
                                                                                 
        return ERR_PTR(ret);                                                     
}
Jason Gunthorpe June 27, 2024, 12:26 p.m. UTC | #8
On Wed, Jun 26, 2024 at 11:55:59PM +0000, Tian, Kevin wrote:

> > > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > > index bb1817bd4ff3..a4eec8e88f5c 100644
> > > --- a/drivers/vfio/device_cdev.c
> > > +++ b/drivers/vfio/device_cdev.c
> > > @@ -40,12 +40,11 @@ int vfio_device_fops_cdev_open(struct inode
> > *inode, struct file *filep)
> > >  	filep->private_data = df;
> > >
> > >  	/*
> > > -	 * Use the pseudo fs inode on the device to link all mmaps
> > > -	 * to the same address space, allowing us to unmap all vmas
> > > -	 * associated to this device using unmap_mapping_range().
> > > +	 * mmaps are linked to the address space of the inode of device cdev.
> > > +	 * Save the inode of device cdev in device->inode to allow
> > > +	 * unmap_mapping_range() to unmap all vmas.
> > >  	 */
> > > -	filep->f_mapping = device->inode->i_mapping;
> > > -
> > > +	device->inode = inode;
> > 
> > This doesn't seem right.. There is only one device but multiple file
> > can be opened on that device.
> > 
> > We expect every open file to have a unique inode otherwise the
> > unmap_mapping_range() will not function properly.
> 
> Does it mean that the existing code is already broken? there is only
> one vfio-fs inode per device (allocated at vfio_init_device()).

I may have gone too far, it is not that we expect evey FD to have a
unique inode, but we expect that there is only one active FD using the
mmap and only that inode will be invalidated.

So changing the inode above before we know that this is an active FD
that can do mmap isn't going to entirely work. ie someone could open
the cdev FD (and fail to make it active) while an active VFIO is using
the legacy path and clobber the invalidation inode.

Having per-FD inodes is just one answer (it is what my similar RDMA
patches were going to do), we could also move the above to the first
mmap so that the one and only active FD also sets the correct inode
for the invalidations.

Jason
Jason Gunthorpe June 27, 2024, 12:42 p.m. UTC | #9
On Thu, Jun 27, 2024 at 05:51:01PM +0800, Yan Zhao wrote:

> > > > This doesn't seem right.. There is only one device but multiple file
> > > > can be opened on that device.
> Maybe we can put this assignment to vfio_df_ioctl_bind_iommufd() after
> vfio_df_open() makes sure device->open_count is 1.

Yeah, that seems better.

Logically it would be best if all places set the inode once the
inode/FD has been made to be the one and only way to access it.

> BTW, in group path, what's the benefit of allowing multiple open of device?

I don't know, the thing that opened the first FD can just dup it, no
idea why two different FDs would be useful. It is something we removed
in the cdev flow

Jason
Yan Zhao June 28, 2024, 5:21 a.m. UTC | #10
On Thu, Jun 27, 2024 at 09:42:09AM -0300, Jason Gunthorpe wrote:
> On Thu, Jun 27, 2024 at 05:51:01PM +0800, Yan Zhao wrote:
> 
> > > > > This doesn't seem right.. There is only one device but multiple file
> > > > > can be opened on that device.
> > Maybe we can put this assignment to vfio_df_ioctl_bind_iommufd() after
> > vfio_df_open() makes sure device->open_count is 1.
> 
> Yeah, that seems better.
> 
> Logically it would be best if all places set the inode once the
> inode/FD has been made to be the one and only way to access it.
For group path, I'm afraid there's no such a place ensuring only one active fd
in kernel.
I tried modifying QEMU to allow two openings and two assignments of the same
device. It works and appears to guest that there were 2 devices, though this
ultimately leads to device malfunctions in guest.

> > BTW, in group path, what's the benefit of allowing multiple open of device?
> 
> I don't know, the thing that opened the first FD can just dup it, no
> idea why two different FDs would be useful. It is something we removed
> in the cdev flow
>
Thanks. However, from the code, it reads like a drawback of the cdev flow :)
I don't understand why the group path is secure though.

        /*
         * Only the group path allows the device to be opened multiple
         * times.  The device cdev path doesn't have a secure way for it.
         */
        if (device->open_count != 0 && !df->group)
                return -EINVAL;
Yi Liu June 28, 2024, 9:48 a.m. UTC | #11
On 2024/6/28 13:21, Yan Zhao wrote:
> On Thu, Jun 27, 2024 at 09:42:09AM -0300, Jason Gunthorpe wrote:
>> On Thu, Jun 27, 2024 at 05:51:01PM +0800, Yan Zhao wrote:
>>
>>>>>> This doesn't seem right.. There is only one device but multiple file
>>>>>> can be opened on that device.
>>> Maybe we can put this assignment to vfio_df_ioctl_bind_iommufd() after
>>> vfio_df_open() makes sure device->open_count is 1.
>>
>> Yeah, that seems better.
>>
>> Logically it would be best if all places set the inode once the
>> inode/FD has been made to be the one and only way to access it.
> For group path, I'm afraid there's no such a place ensuring only one active fd
> in kernel.
> I tried modifying QEMU to allow two openings and two assignments of the same
> device. It works and appears to guest that there were 2 devices, though this
> ultimately leads to device malfunctions in guest.
> 
>>> BTW, in group path, what's the benefit of allowing multiple open of device?
>>
>> I don't know, the thing that opened the first FD can just dup it, no
>> idea why two different FDs would be useful. It is something we removed
>> in the cdev flow
>>
> Thanks. However, from the code, it reads like a drawback of the cdev flow :)
> I don't understand why the group path is secure though.
> 
>          /*
>           * Only the group path allows the device to be opened multiple
>           * times.  The device cdev path doesn't have a secure way for it.
>           */
>          if (device->open_count != 0 && !df->group)
>                  return -EINVAL;
> 
> 

The group path only allow single group open, so the device FDs retrieved
via the group is just within the opener of the group. This secure is built
on top of single open of group.
Yan Zhao June 28, 2024, 3:28 p.m. UTC | #12
On Fri, Jun 28, 2024 at 05:48:11PM +0800, Yi Liu wrote:
> On 2024/6/28 13:21, Yan Zhao wrote:
> > On Thu, Jun 27, 2024 at 09:42:09AM -0300, Jason Gunthorpe wrote:
> > > On Thu, Jun 27, 2024 at 05:51:01PM +0800, Yan Zhao wrote:
> > > 
> > > > > > > This doesn't seem right.. There is only one device but multiple file
> > > > > > > can be opened on that device.
> > > > Maybe we can put this assignment to vfio_df_ioctl_bind_iommufd() after
> > > > vfio_df_open() makes sure device->open_count is 1.
> > > 
> > > Yeah, that seems better.
> > > 
> > > Logically it would be best if all places set the inode once the
> > > inode/FD has been made to be the one and only way to access it.
> > For group path, I'm afraid there's no such a place ensuring only one active fd
> > in kernel.
> > I tried modifying QEMU to allow two openings and two assignments of the same
> > device. It works and appears to guest that there were 2 devices, though this
> > ultimately leads to device malfunctions in guest.
> > 
> > > > BTW, in group path, what's the benefit of allowing multiple open of device?
> > > 
> > > I don't know, the thing that opened the first FD can just dup it, no
> > > idea why two different FDs would be useful. It is something we removed
> > > in the cdev flow
> > > 
> > Thanks. However, from the code, it reads like a drawback of the cdev flow :)
> > I don't understand why the group path is secure though.
> > 
> >          /*
> >           * Only the group path allows the device to be opened multiple
> >           * times.  The device cdev path doesn't have a secure way for it.
> >           */
> >          if (device->open_count != 0 && !df->group)
> >                  return -EINVAL;
> > 
> > 
> 
> The group path only allow single group open, so the device FDs retrieved
> via the group is just within the opener of the group. This secure is built
> on top of single open of group.
What if the group is opened for only once but VFIO_GROUP_GET_DEVICE_FD
ioctl is called for multiple times?
Yi Liu June 30, 2024, 7:06 a.m. UTC | #13
On 2024/6/28 23:28, Yan Zhao wrote:
> On Fri, Jun 28, 2024 at 05:48:11PM +0800, Yi Liu wrote:
>> On 2024/6/28 13:21, Yan Zhao wrote:
>>> On Thu, Jun 27, 2024 at 09:42:09AM -0300, Jason Gunthorpe wrote:
>>>> On Thu, Jun 27, 2024 at 05:51:01PM +0800, Yan Zhao wrote:
>>>>
>>>>>>>> This doesn't seem right.. There is only one device but multiple file
>>>>>>>> can be opened on that device.
>>>>> Maybe we can put this assignment to vfio_df_ioctl_bind_iommufd() after
>>>>> vfio_df_open() makes sure device->open_count is 1.
>>>>
>>>> Yeah, that seems better.
>>>>
>>>> Logically it would be best if all places set the inode once the
>>>> inode/FD has been made to be the one and only way to access it.
>>> For group path, I'm afraid there's no such a place ensuring only one active fd
>>> in kernel.
>>> I tried modifying QEMU to allow two openings and two assignments of the same
>>> device. It works and appears to guest that there were 2 devices, though this
>>> ultimately leads to device malfunctions in guest.
>>>
>>>>> BTW, in group path, what's the benefit of allowing multiple open of device?
>>>>
>>>> I don't know, the thing that opened the first FD can just dup it, no
>>>> idea why two different FDs would be useful. It is something we removed
>>>> in the cdev flow
>>>>
>>> Thanks. However, from the code, it reads like a drawback of the cdev flow :)
>>> I don't understand why the group path is secure though.
>>>
>>>           /*
>>>            * Only the group path allows the device to be opened multiple
>>>            * times.  The device cdev path doesn't have a secure way for it.
>>>            */
>>>           if (device->open_count != 0 && !df->group)
>>>                   return -EINVAL;
>>>
>>>
>>
>> The group path only allow single group open, so the device FDs retrieved
>> via the group is just within the opener of the group. This secure is built
>> on top of single open of group.
> What if the group is opened for only once but VFIO_GROUP_GET_DEVICE_FD
> ioctl is called for multiple times?

this should happen within the process context that has opened the group. it
should be safe, and that would be tracked by the open_count.
Yan Zhao July 1, 2024, 1:47 a.m. UTC | #14
On Sun, Jun 30, 2024 at 03:06:05PM +0800, Yi Liu wrote:
> On 2024/6/28 23:28, Yan Zhao wrote:
> > On Fri, Jun 28, 2024 at 05:48:11PM +0800, Yi Liu wrote:
> > > On 2024/6/28 13:21, Yan Zhao wrote:
> > > > On Thu, Jun 27, 2024 at 09:42:09AM -0300, Jason Gunthorpe wrote:
> > > > > On Thu, Jun 27, 2024 at 05:51:01PM +0800, Yan Zhao wrote:
> > > > > 
> > > > > > > > > This doesn't seem right.. There is only one device but multiple file
> > > > > > > > > can be opened on that device.
> > > > > > Maybe we can put this assignment to vfio_df_ioctl_bind_iommufd() after
> > > > > > vfio_df_open() makes sure device->open_count is 1.
> > > > > 
> > > > > Yeah, that seems better.
> > > > > 
> > > > > Logically it would be best if all places set the inode once the
> > > > > inode/FD has been made to be the one and only way to access it.
> > > > For group path, I'm afraid there's no such a place ensuring only one active fd
> > > > in kernel.
> > > > I tried modifying QEMU to allow two openings and two assignments of the same
> > > > device. It works and appears to guest that there were 2 devices, though this
> > > > ultimately leads to device malfunctions in guest.
> > > > 
> > > > > > BTW, in group path, what's the benefit of allowing multiple open of device?
> > > > > 
> > > > > I don't know, the thing that opened the first FD can just dup it, no
> > > > > idea why two different FDs would be useful. It is something we removed
> > > > > in the cdev flow
> > > > > 
> > > > Thanks. However, from the code, it reads like a drawback of the cdev flow :)
> > > > I don't understand why the group path is secure though.
> > > > 
> > > >           /*
> > > >            * Only the group path allows the device to be opened multiple
> > > >            * times.  The device cdev path doesn't have a secure way for it.
> > > >            */
> > > >           if (device->open_count != 0 && !df->group)
> > > >                   return -EINVAL;
> > > > 
> > > > 
> > > 
> > > The group path only allow single group open, so the device FDs retrieved
> > > via the group is just within the opener of the group. This secure is built
> > > on top of single open of group.
> > What if the group is opened for only once but VFIO_GROUP_GET_DEVICE_FD
> > ioctl is called for multiple times?
> 
> this should happen within the process context that has opened the group. it
> should be safe, and that would be tracked by the open_count.
Thanks for explanation.

Even within a single process, for the group path, it appears that accesses to
the multiple opened device fds still require proper synchronization.
With proper synchronizations, for cdev path, accesses from different processes
can still function correctly.
Additionally, the group fd can also be passed to another process, allowing
device fds to be acquired and accessed from a different process.

On the other hand, cdev path might also support multiple opened fds from a
single process by checking task gid.

The device cdev path simply opts not to do that because it is unnecessary, right?
Yi Liu July 1, 2024, 5:44 a.m. UTC | #15
On 2024/7/1 09:47, Yan Zhao wrote:
> On Sun, Jun 30, 2024 at 03:06:05PM +0800, Yi Liu wrote:
>> On 2024/6/28 23:28, Yan Zhao wrote:
>>> On Fri, Jun 28, 2024 at 05:48:11PM +0800, Yi Liu wrote:
>>>> On 2024/6/28 13:21, Yan Zhao wrote:
>>>>> On Thu, Jun 27, 2024 at 09:42:09AM -0300, Jason Gunthorpe wrote:
>>>>>> On Thu, Jun 27, 2024 at 05:51:01PM +0800, Yan Zhao wrote:
>>>>>>
>>>>>>>>>> This doesn't seem right.. There is only one device but multiple file
>>>>>>>>>> can be opened on that device.
>>>>>>> Maybe we can put this assignment to vfio_df_ioctl_bind_iommufd() after
>>>>>>> vfio_df_open() makes sure device->open_count is 1.
>>>>>>
>>>>>> Yeah, that seems better.
>>>>>>
>>>>>> Logically it would be best if all places set the inode once the
>>>>>> inode/FD has been made to be the one and only way to access it.
>>>>> For group path, I'm afraid there's no such a place ensuring only one active fd
>>>>> in kernel.
>>>>> I tried modifying QEMU to allow two openings and two assignments of the same
>>>>> device. It works and appears to guest that there were 2 devices, though this
>>>>> ultimately leads to device malfunctions in guest.
>>>>>
>>>>>>> BTW, in group path, what's the benefit of allowing multiple open of device?
>>>>>>
>>>>>> I don't know, the thing that opened the first FD can just dup it, no
>>>>>> idea why two different FDs would be useful. It is something we removed
>>>>>> in the cdev flow
>>>>>>
>>>>> Thanks. However, from the code, it reads like a drawback of the cdev flow :)
>>>>> I don't understand why the group path is secure though.
>>>>>
>>>>>            /*
>>>>>             * Only the group path allows the device to be opened multiple
>>>>>             * times.  The device cdev path doesn't have a secure way for it.
>>>>>             */
>>>>>            if (device->open_count != 0 && !df->group)
>>>>>                    return -EINVAL;
>>>>>
>>>>>
>>>>
>>>> The group path only allow single group open, so the device FDs retrieved
>>>> via the group is just within the opener of the group. This secure is built
>>>> on top of single open of group.
>>> What if the group is opened for only once but VFIO_GROUP_GET_DEVICE_FD
>>> ioctl is called for multiple times?
>>
>> this should happen within the process context that has opened the group. it
>> should be safe, and that would be tracked by the open_count.
> Thanks for explanation.
> 
> Even within a single process, for the group path, it appears that accesses to
> the multiple opened device fds still require proper synchronization.

this is for sure as they are accessing the same device.

> With proper synchronizations, for cdev path, accesses from different processes
> can still function correctly.
> Additionally, the group fd can also be passed to another process, allowing
> device fds to be acquired and accessed from a different process.

I think the secure boundary is within a process. If there are multiple
processes accessing a single device, then the boundary is broken.

> On the other hand, cdev path might also support multiple opened fds from a
> single process by checking task gid.
> 
> The device cdev path simply opts not to do that because it is unnecessary, right?

This is part of the reason. The major reason is that the vfio group can be
compiled out. Without the vfio group, it's a bit complicated to ensure all
the devices within the same iommu group been opened by one user. As no
known usage of it, so we didn't explore it very much. Actually, if multiple
FDs are needed, may be dup() is a choice. Do you have such a need?

Regards,
Yi Liu
Yan Zhao July 1, 2024, 5:48 a.m. UTC | #16
On Mon, Jul 01, 2024 at 01:44:01PM +0800, Yi Liu wrote:
> On 2024/7/1 09:47, Yan Zhao wrote:
> > On Sun, Jun 30, 2024 at 03:06:05PM +0800, Yi Liu wrote:
> > > On 2024/6/28 23:28, Yan Zhao wrote:
> > > > On Fri, Jun 28, 2024 at 05:48:11PM +0800, Yi Liu wrote:
> > > > > On 2024/6/28 13:21, Yan Zhao wrote:
> > > > > > On Thu, Jun 27, 2024 at 09:42:09AM -0300, Jason Gunthorpe wrote:
> > > > > > > On Thu, Jun 27, 2024 at 05:51:01PM +0800, Yan Zhao wrote:
> > > > > > > 
> > > > > > > > > > > This doesn't seem right.. There is only one device but multiple file
> > > > > > > > > > > can be opened on that device.
> > > > > > > > Maybe we can put this assignment to vfio_df_ioctl_bind_iommufd() after
> > > > > > > > vfio_df_open() makes sure device->open_count is 1.
> > > > > > > 
> > > > > > > Yeah, that seems better.
> > > > > > > 
> > > > > > > Logically it would be best if all places set the inode once the
> > > > > > > inode/FD has been made to be the one and only way to access it.
> > > > > > For group path, I'm afraid there's no such a place ensuring only one active fd
> > > > > > in kernel.
> > > > > > I tried modifying QEMU to allow two openings and two assignments of the same
> > > > > > device. It works and appears to guest that there were 2 devices, though this
> > > > > > ultimately leads to device malfunctions in guest.
> > > > > > 
> > > > > > > > BTW, in group path, what's the benefit of allowing multiple open of device?
> > > > > > > 
> > > > > > > I don't know, the thing that opened the first FD can just dup it, no
> > > > > > > idea why two different FDs would be useful. It is something we removed
> > > > > > > in the cdev flow
> > > > > > > 
> > > > > > Thanks. However, from the code, it reads like a drawback of the cdev flow :)
> > > > > > I don't understand why the group path is secure though.
> > > > > > 
> > > > > >            /*
> > > > > >             * Only the group path allows the device to be opened multiple
> > > > > >             * times.  The device cdev path doesn't have a secure way for it.
> > > > > >             */
> > > > > >            if (device->open_count != 0 && !df->group)
> > > > > >                    return -EINVAL;
> > > > > > 
> > > > > > 
> > > > > 
> > > > > The group path only allow single group open, so the device FDs retrieved
> > > > > via the group is just within the opener of the group. This secure is built
> > > > > on top of single open of group.
> > > > What if the group is opened for only once but VFIO_GROUP_GET_DEVICE_FD
> > > > ioctl is called for multiple times?
> > > 
> > > this should happen within the process context that has opened the group. it
> > > should be safe, and that would be tracked by the open_count.
> > Thanks for explanation.
> > 
> > Even within a single process, for the group path, it appears that accesses to
> > the multiple opened device fds still require proper synchronization.
> 
> this is for sure as they are accessing the same device.
> 
> > With proper synchronizations, for cdev path, accesses from different processes
> > can still function correctly.
> > Additionally, the group fd can also be passed to another process, allowing
> > device fds to be acquired and accessed from a different process.
> 
> I think the secure boundary is within a process. If there are multiple
> processes accessing a single device, then the boundary is broken.
> 
> > On the other hand, cdev path might also support multiple opened fds from a
> > single process by checking task gid.
> > 
> > The device cdev path simply opts not to do that because it is unnecessary, right?
> 
> This is part of the reason. The major reason is that the vfio group can be
> compiled out. Without the vfio group, it's a bit complicated to ensure all
> the devices within the same iommu group been opened by one user. As no
> known usage of it, so we didn't explore it very much. Actually, if multiple
> FDs are needed, may be dup() is a choice. Do you have such a need?
No, I don't have such a need.
I just find it's confusing to say "Only the group path allows the device to be
opened multiple times. The device cdev path doesn't have a secure way for it",
since it's still doable to achieve the same "secure" level in cdev path and the
group path is not that "secure" :)
Yi Liu July 1, 2024, 7:54 a.m. UTC | #17
On 2024/6/17 17:53, Yan Zhao wrote:
> Reuse file f_inode as vfio device inode and associate pseudo path file
> directly to inode allocated in vfio fs.
> 
> Currently, vfio device is opened via 2 ways:
> 1) via cdev open
>     vfio device is opened with a cdev device with file f_inode and address
>     space associated with a cdev inode;
> 2) via VFIO_GROUP_GET_DEVICE_FD ioctl
>     vfio device is opened via a pseudo path file with file f_inode and
>     address space associated with an inode in anon_inode_fs.
> 

You can simply say the cdev path and group path. :)

> In commit b7c5e64fecfa ("vfio: Create vfio_fs_type with inode per device"),
> an inode in vfio fs is allocated for each vfio device. However, this inode
> in vfio fs is only used to assign its address space to that of a file
> associated with another cdev inode or an inode in anon_inode_fs.
> 
> This patch
> - reuses cdev device inode as the vfio device inode when it's opened via
>    cdev way;
> - allocates an inode in vfio fs, associate it to the pseudo path file,
>    and save it as the vfio device inode when the vfio device is opened via
>    VFIO_GROUP_GET_DEVICE_FD ioctl.

So Alex's prior series only makes use of the i_mapping of the inode instead
of associating the inode with the pseudo path file?

> File address space will then point automatically to the address space of
> the vfio device inode. Tools like unmap_mapping_range() can then zap all
> vmas associated with the vfio device.
> 
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> ---
>   drivers/vfio/device_cdev.c |  9 ++++---
>   drivers/vfio/group.c       | 21 ++--------------
>   drivers/vfio/vfio.h        |  2 ++
>   drivers/vfio/vfio_main.c   | 49 +++++++++++++++++++++++++++-----------
>   4 files changed, 43 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> index bb1817bd4ff3..a4eec8e88f5c 100644
> --- a/drivers/vfio/device_cdev.c
> +++ b/drivers/vfio/device_cdev.c
> @@ -40,12 +40,11 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
>   	filep->private_data = df;
>   
>   	/*
> -	 * Use the pseudo fs inode on the device to link all mmaps
> -	 * to the same address space, allowing us to unmap all vmas
> -	 * associated to this device using unmap_mapping_range().
> +	 * mmaps are linked to the address space of the inode of device cdev.
> +	 * Save the inode of device cdev in device->inode to allow
> +	 * unmap_mapping_range() to unmap all vmas.
>   	 */
> -	filep->f_mapping = device->inode->i_mapping;
> -
> +	device->inode = inode;
>   	return 0;
>   
>   err_put_registration:
> diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
> index ded364588d29..aaef188003b6 100644
> --- a/drivers/vfio/group.c
> +++ b/drivers/vfio/group.c
> @@ -268,31 +268,14 @@ static struct file *vfio_device_open_file(struct vfio_device *device)
>   	if (ret)
>   		goto err_free;
>   
> -	/*
> -	 * We can't use anon_inode_getfd() because we need to modify
> -	 * the f_mode flags directly to allow more than just ioctls
> -	 */
> -	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
> -				   df, O_RDWR);
> +	filep = vfio_device_get_pseudo_file(device);
>   	if (IS_ERR(filep)) {
>   		ret = PTR_ERR(filep);
>   		goto err_close_device;
>   	}
> -
> -	/*
> -	 * TODO: add an anon_inode interface to do this.
> -	 * Appears to be missing by lack of need rather than
> -	 * explicitly prevented.  Now there's need.
> -	 */
> +	filep->private_data = df;
>   	filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
>   
> -	/*
> -	 * Use the pseudo fs inode on the device to link all mmaps
> -	 * to the same address space, allowing us to unmap all vmas
> -	 * associated to this device using unmap_mapping_range().
> -	 */
> -	filep->f_mapping = device->inode->i_mapping;
> -
>   	if (device->group->type == VFIO_NO_IOMMU)
>   		dev_warn(device->dev, "vfio-noiommu device opened by user "
>   			 "(%s:%d)\n", current->comm, task_pid_nr(current));
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 50128da18bca..1f8915f79fbb 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -35,6 +35,7 @@ struct vfio_device_file *
>   vfio_allocate_device_file(struct vfio_device *device);
>   
>   extern const struct file_operations vfio_device_fops;
> +struct file *vfio_device_get_pseudo_file(struct vfio_device *device);
>   
>   #ifdef CONFIG_VFIO_NOIOMMU
>   extern bool vfio_noiommu __read_mostly;
> @@ -420,6 +421,7 @@ static inline void vfio_cdev_cleanup(void)
>   {
>   }
>   #endif /* CONFIG_VFIO_DEVICE_CDEV */
> +struct file *vfio_device_get_pseduo_file(struct vfio_device *device);
>   
>   #if IS_ENABLED(CONFIG_VFIO_VIRQFD)
>   int __init vfio_virqfd_init(void);
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index a5a62d9d963f..e81d0f910c70 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -192,7 +192,6 @@ static void vfio_device_release(struct device *dev)
>   	if (device->ops->release)
>   		device->ops->release(device);
>   
> -	iput(device->inode);
>   	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
>   	kvfree(device);
>   }
> @@ -248,20 +247,50 @@ static struct file_system_type vfio_fs_type = {
>   	.kill_sb = kill_anon_super,
>   };
>   
> -static struct inode *vfio_fs_inode_new(void)
> +/*
> + * Alloc pseudo file from inode associated of vfio.vfs_mount.

nit: s/Alloc/Allocate/ and s/of/with/

> + * This is called when vfio device is opened via pseudo file.

group path might be better. Is this pseudo file only needed for the device
files opened in the group path? If so, might be helpful to move the related
codes into group.c.

> + * mmaps are linked to the address space of the inode of the pseudo file.
> + * Save the inode in device->inode for unmap_mapping_range() to unmap all vmas.
> + */
> +struct file *vfio_device_get_pseudo_file(struct vfio_device *device)
>   {
> +	const struct file_operations *fops = &vfio_device_fops;
>   	struct inode *inode;
> +	struct file *filep;
>   	int ret;
>   
> +	if (!fops_get(fops))
> +		return ERR_PTR(-ENODEV);
> +
>   	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
>   	if (ret)
> -		return ERR_PTR(ret);
> +		goto err_pin_fs;
>   
>   	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
> -	if (IS_ERR(inode))
> -		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> +	if (IS_ERR(inode)) {
> +		ret = PTR_ERR(inode);
> +		goto err_inode;
> +	}
> +
> +	filep = alloc_file_pseudo(inode, vfio.vfs_mount, "[vfio-device]",
> +				  O_RDWR, fops);
> +
> +	if (IS_ERR(filep)) {
> +		ret = PTR_ERR(filep);
> +		goto err_file;
> +	}
> +	device->inode = inode;

The group path allows multiple device fd get, hence this will set the
device->inode multiple times. It does not look good. Setting it once
should be enough?

> +	return filep;
> +
> +err_file:
> +	iput(inode);

If the vfio_device_get_pseudo_file() succeeds, who will put inode? I
noticed all the other iput() of this file are removed.

> +err_inode:
> +	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> +err_pin_fs:
> +	fops_put(fops);
>   
> -	return inode;
> +	return ERR_PTR(ret);
>   }
>   
>   /*
> @@ -282,11 +311,6 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
>   	init_completion(&device->comp);
>   	device->dev = dev;
>   	device->ops = ops;
> -	device->inode = vfio_fs_inode_new();
> -	if (IS_ERR(device->inode)) {
> -		ret = PTR_ERR(device->inode);
> -		goto out_inode;
> -	}
>   
>   	if (ops->init) {
>   		ret = ops->init(device);
> @@ -301,9 +325,6 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
>   	return 0;
>   
>   out_uninit:
> -	iput(device->inode);
> -	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> -out_inode:
>   	vfio_release_device_set(device);
>   	ida_free(&vfio.device_ida, device->index);
>   	return ret;
> 
> base-commit: 6ba59ff4227927d3a8530fc2973b80e94b54d58f
Yan Zhao July 1, 2024, 11:29 a.m. UTC | #18
On Mon, Jul 01, 2024 at 03:54:19PM +0800, Yi Liu wrote:
> On 2024/6/17 17:53, Yan Zhao wrote:
> > Reuse file f_inode as vfio device inode and associate pseudo path file
> > directly to inode allocated in vfio fs.
> > 
> > Currently, vfio device is opened via 2 ways:
> > 1) via cdev open
> >     vfio device is opened with a cdev device with file f_inode and address
> >     space associated with a cdev inode;
> > 2) via VFIO_GROUP_GET_DEVICE_FD ioctl
> >     vfio device is opened via a pseudo path file with file f_inode and
> >     address space associated with an inode in anon_inode_fs.
> > 
> 
> You can simply say the cdev path and group path. :)
Ok, if they are well-known terms.

> 
> > In commit b7c5e64fecfa ("vfio: Create vfio_fs_type with inode per device"),
> > an inode in vfio fs is allocated for each vfio device. However, this inode
> > in vfio fs is only used to assign its address space to that of a file
> > associated with another cdev inode or an inode in anon_inode_fs.
> > 
> > This patch
> > - reuses cdev device inode as the vfio device inode when it's opened via
> >    cdev way;
> > - allocates an inode in vfio fs, associate it to the pseudo path file,
> >    and save it as the vfio device inode when the vfio device is opened via
> >    VFIO_GROUP_GET_DEVICE_FD ioctl.
> 
> So Alex's prior series only makes use of the i_mapping of the inode instead
> of associating the inode with the pseudo path file?
Right.

> > File address space will then point automatically to the address space of
> > the vfio device inode. Tools like unmap_mapping_range() can then zap all
> > vmas associated with the vfio device.
> > 
> > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > ---
> >   drivers/vfio/device_cdev.c |  9 ++++---
> >   drivers/vfio/group.c       | 21 ++--------------
> >   drivers/vfio/vfio.h        |  2 ++
> >   drivers/vfio/vfio_main.c   | 49 +++++++++++++++++++++++++++-----------
> >   4 files changed, 43 insertions(+), 38 deletions(-)
> > 
> > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > index bb1817bd4ff3..a4eec8e88f5c 100644
> > --- a/drivers/vfio/device_cdev.c
> > +++ b/drivers/vfio/device_cdev.c
> > @@ -40,12 +40,11 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
> >   	filep->private_data = df;
> >   	/*
> > -	 * Use the pseudo fs inode on the device to link all mmaps
> > -	 * to the same address space, allowing us to unmap all vmas
> > -	 * associated to this device using unmap_mapping_range().
> > +	 * mmaps are linked to the address space of the inode of device cdev.
> > +	 * Save the inode of device cdev in device->inode to allow
> > +	 * unmap_mapping_range() to unmap all vmas.
> >   	 */
> > -	filep->f_mapping = device->inode->i_mapping;
> > -
> > +	device->inode = inode;
> >   	return 0;
> >   err_put_registration:
> > diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
> > index ded364588d29..aaef188003b6 100644
> > --- a/drivers/vfio/group.c
> > +++ b/drivers/vfio/group.c
> > @@ -268,31 +268,14 @@ static struct file *vfio_device_open_file(struct vfio_device *device)
> >   	if (ret)
> >   		goto err_free;
> > -	/*
> > -	 * We can't use anon_inode_getfd() because we need to modify
> > -	 * the f_mode flags directly to allow more than just ioctls
> > -	 */
> > -	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
> > -				   df, O_RDWR);
> > +	filep = vfio_device_get_pseudo_file(device);
> >   	if (IS_ERR(filep)) {
> >   		ret = PTR_ERR(filep);
> >   		goto err_close_device;
> >   	}
> > -
> > -	/*
> > -	 * TODO: add an anon_inode interface to do this.
> > -	 * Appears to be missing by lack of need rather than
> > -	 * explicitly prevented.  Now there's need.
> > -	 */
> > +	filep->private_data = df;
> >   	filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
> > -	/*
> > -	 * Use the pseudo fs inode on the device to link all mmaps
> > -	 * to the same address space, allowing us to unmap all vmas
> > -	 * associated to this device using unmap_mapping_range().
> > -	 */
> > -	filep->f_mapping = device->inode->i_mapping;
> > -
> >   	if (device->group->type == VFIO_NO_IOMMU)
> >   		dev_warn(device->dev, "vfio-noiommu device opened by user "
> >   			 "(%s:%d)\n", current->comm, task_pid_nr(current));
> > diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> > index 50128da18bca..1f8915f79fbb 100644
> > --- a/drivers/vfio/vfio.h
> > +++ b/drivers/vfio/vfio.h
> > @@ -35,6 +35,7 @@ struct vfio_device_file *
> >   vfio_allocate_device_file(struct vfio_device *device);
> >   extern const struct file_operations vfio_device_fops;
> > +struct file *vfio_device_get_pseudo_file(struct vfio_device *device);
> >   #ifdef CONFIG_VFIO_NOIOMMU
> >   extern bool vfio_noiommu __read_mostly;
> > @@ -420,6 +421,7 @@ static inline void vfio_cdev_cleanup(void)
> >   {
> >   }
> >   #endif /* CONFIG_VFIO_DEVICE_CDEV */
> > +struct file *vfio_device_get_pseduo_file(struct vfio_device *device);
> >   #if IS_ENABLED(CONFIG_VFIO_VIRQFD)
> >   int __init vfio_virqfd_init(void);
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > index a5a62d9d963f..e81d0f910c70 100644
> > --- a/drivers/vfio/vfio_main.c
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -192,7 +192,6 @@ static void vfio_device_release(struct device *dev)
> >   	if (device->ops->release)
> >   		device->ops->release(device);
> > -	iput(device->inode);
> >   	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> >   	kvfree(device);
> >   }
> > @@ -248,20 +247,50 @@ static struct file_system_type vfio_fs_type = {
> >   	.kill_sb = kill_anon_super,
> >   };
> > -static struct inode *vfio_fs_inode_new(void)
> > +/*
> > + * Alloc pseudo file from inode associated of vfio.vfs_mount.
> 
> nit: s/Alloc/Allocate/ and s/of/with/
> 
> > + * This is called when vfio device is opened via pseudo file.
> 
> group path might be better. Is this pseudo file only needed for the device
> files opened in the group path? If so, might be helpful to move the related
> codes into group.c.
Yes. I also planed to move the this to group.c in v2 :)

> 
> > + * mmaps are linked to the address space of the inode of the pseudo file.
> > + * Save the inode in device->inode for unmap_mapping_range() to unmap all vmas.
> > + */
> > +struct file *vfio_device_get_pseudo_file(struct vfio_device *device)
> >   {
> > +	const struct file_operations *fops = &vfio_device_fops;
> >   	struct inode *inode;
> > +	struct file *filep;
> >   	int ret;
> > +	if (!fops_get(fops))
> > +		return ERR_PTR(-ENODEV);
> > +
> >   	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
> >   	if (ret)
> > -		return ERR_PTR(ret);
> > +		goto err_pin_fs;
> >   	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
> > -	if (IS_ERR(inode))
> > -		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> > +	if (IS_ERR(inode)) {
> > +		ret = PTR_ERR(inode);
> > +		goto err_inode;
> > +	}
> > +
> > +	filep = alloc_file_pseudo(inode, vfio.vfs_mount, "[vfio-device]",
> > +				  O_RDWR, fops);
> > +
> > +	if (IS_ERR(filep)) {
> > +		ret = PTR_ERR(filep);
> > +		goto err_file;
> > +	}
> > +	device->inode = inode;
> 
> The group path allows multiple device fd get, hence this will set the
> device->inode multiple times. It does not look good. Setting it once
> should be enough?
Will make the multipl opens to use the same vfio inode, as in [1].

[1] https://lore.kernel.org/all/Zn02BUdJ7kvOg6Vw@yzhao56-desk.sh.intel.com/

> 
> > +	return filep;
> > +
> > +err_file:
> > +	iput(inode);
> 
> If the vfio_device_get_pseudo_file() succeeds, who will put inode? I
> noticed all the other iput() of this file are removed.
On success, the inode ref count is moved to the file and put in
vfio_device_fops_release().
The prevous iput() is only required because that inode is not associated
with any file.

> > +err_inode:
> > +	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> > +err_pin_fs:
> > +	fops_put(fops);
> > -	return inode;
> > +	return ERR_PTR(ret);
> >   }
> >   /*
> > @@ -282,11 +311,6 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
> >   	init_completion(&device->comp);
> >   	device->dev = dev;
> >   	device->ops = ops;
> > -	device->inode = vfio_fs_inode_new();
> > -	if (IS_ERR(device->inode)) {
> > -		ret = PTR_ERR(device->inode);
> > -		goto out_inode;
> > -	}
> >   	if (ops->init) {
> >   		ret = ops->init(device);
> > @@ -301,9 +325,6 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
> >   	return 0;
> >   out_uninit:
> > -	iput(device->inode);
> > -	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
> > -out_inode:
> >   	vfio_release_device_set(device);
> >   	ida_free(&vfio.device_ida, device->index);
> >   	return ret;
> > 
> > base-commit: 6ba59ff4227927d3a8530fc2973b80e94b54d58f
> 
> -- 
> Regards,
> Yi Liu
Jason Gunthorpe July 10, 2024, 2:40 p.m. UTC | #19
On Mon, Jul 01, 2024 at 01:48:07PM +0800, Yan Zhao wrote:

> No, I don't have such a need.
> I just find it's confusing to say "Only the group path allows the device to be
> opened multiple times. The device cdev path doesn't have a secure way for it",
> since it's still doable to achieve the same "secure" level in cdev path and the
> group path is not that "secure" :)

It is more that the group path had an API that allowed for multiple
FDs without an actual need to ever use that. You can always make more
FDs with dup.

There is no reason for this functionality, we just have to keep it
working as a matter of uABI compatability and we are being more strict
in the new APIs.

Jason
Yan Zhao July 12, 2024, 5:19 a.m. UTC | #20
On Wed, Jul 10, 2024 at 11:40:44AM -0300, Jason Gunthorpe wrote:
> On Mon, Jul 01, 2024 at 01:48:07PM +0800, Yan Zhao wrote:
> 
> > No, I don't have such a need.
> > I just find it's confusing to say "Only the group path allows the device to be
> > opened multiple times. The device cdev path doesn't have a secure way for it",
> > since it's still doable to achieve the same "secure" level in cdev path and the
> > group path is not that "secure" :)
> 
> It is more that the group path had an API that allowed for multiple
> FDs without an actual need to ever use that. You can always make more
> FDs with dup.
> 
> There is no reason for this functionality, we just have to keep it
> working as a matter of uABI compatability and we are being more strict
> in the new APIs.
Thanks for clarification.
Regarding to uABI compatability, even after being more strict and returning
error when opening the second FD in group path, the uABI compatability is still
maintained? e.g.
QEMU would correctly reports "Verify all devices in group xxx are bound to
vfio-<bus> or pci-stub and not already in use" in that case.
Given there's no actual users, could we also remove the support of multiple FDs
in group path to simplify code?
Yi Liu July 12, 2024, 6:14 a.m. UTC | #21
On 2024/7/12 13:19, Yan Zhao wrote:
> On Wed, Jul 10, 2024 at 11:40:44AM -0300, Jason Gunthorpe wrote:
>> On Mon, Jul 01, 2024 at 01:48:07PM +0800, Yan Zhao wrote:
>>
>>> No, I don't have such a need.
>>> I just find it's confusing to say "Only the group path allows the device to be
>>> opened multiple times. The device cdev path doesn't have a secure way for it",
>>> since it's still doable to achieve the same "secure" level in cdev path and the
>>> group path is not that "secure" :)
>>
>> It is more that the group path had an API that allowed for multiple
>> FDs without an actual need to ever use that. You can always make more
>> FDs with dup.
>>
>> There is no reason for this functionality, we just have to keep it
>> working as a matter of uABI compatability and we are being more strict
>> in the new APIs.
> Thanks for clarification.
> Regarding to uABI compatability, even after being more strict and returning
> error when opening the second FD in group path, the uABI compatability is still
> maintained? e.g.
> QEMU would correctly reports "Verify all devices in group xxx are bound to
> vfio-<bus> or pci-stub and not already in use" in that case.
> Given there's no actual users, could we also remove the support of multiple FDs
> in group path to simplify code?

QEMU is not the only user of vfio. DPDK or other applications may have
usage for multi-fds.
diff mbox series

Patch

diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index bb1817bd4ff3..a4eec8e88f5c 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -40,12 +40,11 @@  int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
 	filep->private_data = df;
 
 	/*
-	 * Use the pseudo fs inode on the device to link all mmaps
-	 * to the same address space, allowing us to unmap all vmas
-	 * associated to this device using unmap_mapping_range().
+	 * mmaps are linked to the address space of the inode of device cdev.
+	 * Save the inode of device cdev in device->inode to allow
+	 * unmap_mapping_range() to unmap all vmas.
 	 */
-	filep->f_mapping = device->inode->i_mapping;
-
+	device->inode = inode;
 	return 0;
 
 err_put_registration:
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index ded364588d29..aaef188003b6 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -268,31 +268,14 @@  static struct file *vfio_device_open_file(struct vfio_device *device)
 	if (ret)
 		goto err_free;
 
-	/*
-	 * We can't use anon_inode_getfd() because we need to modify
-	 * the f_mode flags directly to allow more than just ioctls
-	 */
-	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
-				   df, O_RDWR);
+	filep = vfio_device_get_pseudo_file(device);
 	if (IS_ERR(filep)) {
 		ret = PTR_ERR(filep);
 		goto err_close_device;
 	}
-
-	/*
-	 * TODO: add an anon_inode interface to do this.
-	 * Appears to be missing by lack of need rather than
-	 * explicitly prevented.  Now there's need.
-	 */
+	filep->private_data = df;
 	filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
 
-	/*
-	 * Use the pseudo fs inode on the device to link all mmaps
-	 * to the same address space, allowing us to unmap all vmas
-	 * associated to this device using unmap_mapping_range().
-	 */
-	filep->f_mapping = device->inode->i_mapping;
-
 	if (device->group->type == VFIO_NO_IOMMU)
 		dev_warn(device->dev, "vfio-noiommu device opened by user "
 			 "(%s:%d)\n", current->comm, task_pid_nr(current));
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 50128da18bca..1f8915f79fbb 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -35,6 +35,7 @@  struct vfio_device_file *
 vfio_allocate_device_file(struct vfio_device *device);
 
 extern const struct file_operations vfio_device_fops;
+struct file *vfio_device_get_pseudo_file(struct vfio_device *device);
 
 #ifdef CONFIG_VFIO_NOIOMMU
 extern bool vfio_noiommu __read_mostly;
@@ -420,6 +421,7 @@  static inline void vfio_cdev_cleanup(void)
 {
 }
 #endif /* CONFIG_VFIO_DEVICE_CDEV */
+struct file *vfio_device_get_pseduo_file(struct vfio_device *device);
 
 #if IS_ENABLED(CONFIG_VFIO_VIRQFD)
 int __init vfio_virqfd_init(void);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index a5a62d9d963f..e81d0f910c70 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -192,7 +192,6 @@  static void vfio_device_release(struct device *dev)
 	if (device->ops->release)
 		device->ops->release(device);
 
-	iput(device->inode);
 	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
 	kvfree(device);
 }
@@ -248,20 +247,50 @@  static struct file_system_type vfio_fs_type = {
 	.kill_sb = kill_anon_super,
 };
 
-static struct inode *vfio_fs_inode_new(void)
+/*
+ * Alloc pseudo file from inode associated of vfio.vfs_mount.
+ * This is called when vfio device is opened via pseudo file.
+ * mmaps are linked to the address space of the inode of the pseudo file.
+ * Save the inode in device->inode for unmap_mapping_range() to unmap all vmas.
+ */
+struct file *vfio_device_get_pseudo_file(struct vfio_device *device)
 {
+	const struct file_operations *fops = &vfio_device_fops;
 	struct inode *inode;
+	struct file *filep;
 	int ret;
 
+	if (!fops_get(fops))
+		return ERR_PTR(-ENODEV);
+
 	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
 	if (ret)
-		return ERR_PTR(ret);
+		goto err_pin_fs;
 
 	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
-	if (IS_ERR(inode))
-		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto err_inode;
+	}
+
+	filep = alloc_file_pseudo(inode, vfio.vfs_mount, "[vfio-device]",
+				  O_RDWR, fops);
+
+	if (IS_ERR(filep)) {
+		ret = PTR_ERR(filep);
+		goto err_file;
+	}
+	device->inode = inode;
+	return filep;
+
+err_file:
+	iput(inode);
+err_inode:
+	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
+err_pin_fs:
+	fops_put(fops);
 
-	return inode;
+	return ERR_PTR(ret);
 }
 
 /*
@@ -282,11 +311,6 @@  static int vfio_init_device(struct vfio_device *device, struct device *dev,
 	init_completion(&device->comp);
 	device->dev = dev;
 	device->ops = ops;
-	device->inode = vfio_fs_inode_new();
-	if (IS_ERR(device->inode)) {
-		ret = PTR_ERR(device->inode);
-		goto out_inode;
-	}
 
 	if (ops->init) {
 		ret = ops->init(device);
@@ -301,9 +325,6 @@  static int vfio_init_device(struct vfio_device *device, struct device *dev,
 	return 0;
 
 out_uninit:
-	iput(device->inode);
-	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
-out_inode:
 	vfio_release_device_set(device);
 	ida_free(&vfio.device_ida, device->index);
 	return ret;