diff mbox series

[1/3] vfio: Introduce vma ops registration and notifier

Message ID 161315805248.7320.13358719859656681660.stgit@gimli.home (mailing list archive)
State New, archived
Headers show
Series vfio: Device memory DMA mapping improvements | expand

Commit Message

Alex Williamson Feb. 12, 2021, 7:27 p.m. UTC
Create an interface through vfio-core where a vfio bus driver (ex.
vfio-pci) can register the vm_operations_struct it uses to map device
memory, along with a set of registration callbacks.  This allows
vfio-core to expose interfaces for IOMMU backends to match a
vm_area_struct to a bus driver and register a notifier for relevant
changes to the device mapping.  For now we define only a notifier
action for closing the device.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c  |  120 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/vfio.h |   20 ++++++++
 2 files changed, 140 insertions(+)

Comments

Jason Gunthorpe Feb. 12, 2021, 9:20 p.m. UTC | #1
On Fri, Feb 12, 2021 at 12:27:39PM -0700, Alex Williamson wrote:
> Create an interface through vfio-core where a vfio bus driver (ex.
> vfio-pci) can register the vm_operations_struct it uses to map device
> memory, along with a set of registration callbacks.  This allows
> vfio-core to expose interfaces for IOMMU backends to match a
> vm_area_struct to a bus driver and register a notifier for relavant
> changes to the device mapping.  For now we define only a notifier
> action for closing the device.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>  drivers/vfio/vfio.c  |  120 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h |   20 ++++++++
>  2 files changed, 140 insertions(+)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 38779e6fd80c..568f5e37a95f 100644
> +++ b/drivers/vfio/vfio.c
> @@ -47,6 +47,8 @@ static struct vfio {
>  	struct cdev			group_cdev;
>  	dev_t				group_devt;
>  	wait_queue_head_t		release_q;
> +	struct list_head		vm_ops_list;
> +	struct mutex			vm_ops_lock;
>  } vfio;
>  
>  struct vfio_iommu_driver {
> @@ -2354,6 +2356,121 @@ struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
>  }
>  EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
>  
> +struct vfio_vma_ops {
> +	const struct vm_operations_struct	*vm_ops;
> +	vfio_register_vma_nb_t			*reg_fn;
> +	vfio_unregister_vma_nb_t		*unreg_fn;
> +	struct list_head			next;
> +};
> +
> +int vfio_register_vma_ops(const struct vm_operations_struct *vm_ops,
> +			  vfio_register_vma_nb_t *reg_fn,
> +			  vfio_unregister_vma_nb_t *unreg_fn)

This just feels a little bit too complicated

I've recently learned from Daniel that we can use the address_space
machinery to drive the zap_vma_ptes() via unmap_mapping_range(). This
technique replaces all the open, close and vma_list logic in vfio_pci

If we don't need open anymore, we could do something like this:

 static const struct vm_operations_struct vfio_pci_mmap_ops = {
        .open = vfio_pfn_open, // implemented in vfio.c
        .close = vfio_pfn_close,
        .fault = vfio_pci_mmap_fault,
 };

Then we could code the function needed:

struct vfio_pfn_range_handle 
{
       struct kref kref;
       struct vfio_device *vfio;
       struct notifier_block invalidation_cb;
       unsigned int flags;
};

struct vfio_pfn_range_handle *get_pfn_range(struct vm_area_struct *vma)
{
       struct vfio_pfn_range_handle *handle;

       if (vma->vm_ops->open != vfio_pfn_open)
              return NULL;
       
       handle = vma->vm_private_data;
       if (test_bit(handle->flags, DMA_STOPPED))
              return NULL;
       kref_get(&handle->kref);
       return handle;
}

Where the common open/close only kref inc/dec the kref and all 'vfio'
VMAs always have a pointer to the same vfio_pfn_range_handle in their
private_data.

The vm_pgoff is already pointing at the physical pfn, so every part of
the system can get the information it needs fairly trivially.

Some stop access function is pretty simple looking

void stop_access(struct vfio_pfn_range_handle *handle)
{
      set_bit(handle->flags, DMA_STOPPED);
      unmap_mapping_range(handle->vfio->[..]->inode, 0, max, false);
      srcu_notifier_call_chain(handle->invalidation_cb, VFIO_VMA_NOTIFY_CLOSE, NULL);
}

(well, have to sort out the locking some more, but that is the
 general idea)

I think that would remove a lot of the code added here and acts a lot
closer to how a future dmabuf could act.

Also, this will need to update the nvlink vmops as well

Jason
Jason Gunthorpe Feb. 18, 2021, 1:12 a.m. UTC | #2
On Fri, Feb 12, 2021 at 05:20:57PM -0400, Jason Gunthorpe wrote:
> On Fri, Feb 12, 2021 at 12:27:39PM -0700, Alex Williamson wrote:
> > Create an interface through vfio-core where a vfio bus driver (ex.
> > vfio-pci) can register the vm_operations_struct it uses to map device
> > memory, along with a set of registration callbacks.  This allows
> > vfio-core to expose interfaces for IOMMU backends to match a
> > vm_area_struct to a bus driver and register a notifier for relavant
> > changes to the device mapping.  For now we define only a notifier
> > action for closing the device.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> >  drivers/vfio/vfio.c  |  120 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/linux/vfio.h |   20 ++++++++
> >  2 files changed, 140 insertions(+)
> > 
> > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> > index 38779e6fd80c..568f5e37a95f 100644
> > +++ b/drivers/vfio/vfio.c
> > @@ -47,6 +47,8 @@ static struct vfio {
> >  	struct cdev			group_cdev;
> >  	dev_t				group_devt;
> >  	wait_queue_head_t		release_q;
> > +	struct list_head		vm_ops_list;
> > +	struct mutex			vm_ops_lock;
> >  } vfio;
> >  
> >  struct vfio_iommu_driver {
> > @@ -2354,6 +2356,121 @@ struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
> >  }
> >  EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
> >  
> > +struct vfio_vma_ops {
> > +	const struct vm_operations_struct	*vm_ops;
> > +	vfio_register_vma_nb_t			*reg_fn;
> > +	vfio_unregister_vma_nb_t		*unreg_fn;
> > +	struct list_head			next;
> > +};
> > +
> > +int vfio_register_vma_ops(const struct vm_operations_struct *vm_ops,
> > +			  vfio_register_vma_nb_t *reg_fn,
> > +			  vfio_unregister_vma_nb_t *unreg_fn)
> 
> This just feels a little bit too complicated
> 
> I've recently learned from Daniel that we can use the address_space
> machinery to drive the zap_vma_ptes() via unmap_mapping_range(). This
> technique replaces all the open, close and vma_list logic in vfio_pci

Here is my effort to make rdma use this, it removes a lot of ugly code:

https://github.com/jgunthorpe/linux/commits/rdma_addr_space

Still needs some more detailed testing.

This gives an option to detect vfio VMAs by checking

   if (vma->vm_file &&
       file_inode(vma->vm_file) &&
       file_inode(vma->vm_file)->i_sb->s_type == vfio_fs_type)

And all vfio VMA's can have some consistent vm_private_data, or at
worst a consistent extended vm operations struct.

Jason
Alex Williamson Feb. 18, 2021, 9:56 p.m. UTC | #3
On Wed, 17 Feb 2021 21:12:09 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Fri, Feb 12, 2021 at 05:20:57PM -0400, Jason Gunthorpe wrote:
> > On Fri, Feb 12, 2021 at 12:27:39PM -0700, Alex Williamson wrote:  
> > > Create an interface through vfio-core where a vfio bus driver (ex.
> > > vfio-pci) can register the vm_operations_struct it uses to map device
> > > memory, along with a set of registration callbacks.  This allows
> > > vfio-core to expose interfaces for IOMMU backends to match a
> > > vm_area_struct to a bus driver and register a notifier for relavant
> > > changes to the device mapping.  For now we define only a notifier
> > > action for closing the device.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > >  drivers/vfio/vfio.c  |  120 ++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  include/linux/vfio.h |   20 ++++++++
> > >  2 files changed, 140 insertions(+)
> > > 
> > > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> > > index 38779e6fd80c..568f5e37a95f 100644
> > > +++ b/drivers/vfio/vfio.c
> > > @@ -47,6 +47,8 @@ static struct vfio {
> > >  	struct cdev			group_cdev;
> > >  	dev_t				group_devt;
> > >  	wait_queue_head_t		release_q;
> > > +	struct list_head		vm_ops_list;
> > > +	struct mutex			vm_ops_lock;
> > >  } vfio;
> > >  
> > >  struct vfio_iommu_driver {
> > > @@ -2354,6 +2356,121 @@ struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
> > >  }
> > >  EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
> > >  
> > > +struct vfio_vma_ops {
> > > +	const struct vm_operations_struct	*vm_ops;
> > > +	vfio_register_vma_nb_t			*reg_fn;
> > > +	vfio_unregister_vma_nb_t		*unreg_fn;
> > > +	struct list_head			next;
> > > +};
> > > +
> > > +int vfio_register_vma_ops(const struct vm_operations_struct *vm_ops,
> > > +			  vfio_register_vma_nb_t *reg_fn,
> > > +			  vfio_unregister_vma_nb_t *unreg_fn)  
> > 
> > This just feels a little bit too complicated
> > 
> > I've recently learned from Daniel that we can use the address_space
> > machinery to drive the zap_vma_ptes() via unmap_mapping_range(). This
> > technique replaces all the open, close and vma_list logic in vfio_pci  
> 
> Here is my effort to make rdma use this, it removes a lot of ugly code:
> 
> https://github.com/jgunthorpe/linux/commits/rdma_addr_space
> 
> Still needs some more detailed testing.
> 
> This gives an option to detect vfio VMAs by checking
> 
>    if (vma->vm_file &&
>        file_inode(vma->vm_file) &&
>        file_inode(vma->vm_file)->i_sb->s_type == vfio_fs_type)
> 
> And all vfio VMA's can have some consistent vm_private_data, or at
> worst a consistent extended vm operations struct.

Looks pretty slick.  I won't claim it's fully gelled in my head yet,
but AIUI you're creating these inodes on your new pseudo fs and
associating it via the actual user fd via the f_mapping pointer, which
allows multiple fds to associate and address space back to this inode
when you want to call unmap_mapping_range().  That clarifies from the
previous email how we'd store the inode on the vfio_device without
introducing yet another tracking list for device fds.  I'll try to
piece together something similar for vfio, especially if we can avoid
that nasty lock switcheroo we copied from
uverbs_user_mmap_disassociate().  Thanks,

Alex
Jason Gunthorpe Feb. 18, 2021, 11:04 p.m. UTC | #4
On Thu, Feb 18, 2021 at 02:56:06PM -0700, Alex Williamson wrote:

> Looks pretty slick.  I won't claim it's fully gelled in my head yet,
> but AIUI you're creating these inodes on your new pseudo fs and
> associating it via the actual user fd via the f_mapping pointer, which
> allows multiple fds to associate and address space back to this inode
> when you want to call unmap_mapping_range().  

Yes, from what I can tell all the fs/inode stuff is just mandatory
overhead to get a unique address_space pointer, as that is the only
thing this is actually using.

I have to check the mmap flow more carefully, I recall pointing to a
existing race here with Daniel, but the general idea should hold.

> That clarifies from the previous email how we'd store the inode on
> the vfio_device without introducing yet another tracking list for
> device fds.

Right, you can tell from the vma what inode it is for, and the inode
can tell you if it is a VFIO VMA or not, so no tracking lists needed
at all.

Jason
Alex Williamson Feb. 19, 2021, 10:02 p.m. UTC | #5
On Thu, 18 Feb 2021 19:04:04 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Thu, Feb 18, 2021 at 02:56:06PM -0700, Alex Williamson wrote:
> 
> > Looks pretty slick.  I won't claim it's fully gelled in my head yet,
> > but AIUI you're creating these inodes on your new pseudo fs and
> > associating it via the actual user fd via the f_mapping pointer, which
> > allows multiple fds to associate and address space back to this inode
> > when you want to call unmap_mapping_range().    
> 
> Yes, from what I can tell all the fs/inode stuff is just mandatory
> overhead to get a unique address_space pointer, as that is the only
> thing this is actually using.
> 
> I have to check the mmap flow more carefully, I recall pointing to a
> existing race here with Daniel, but the general idea should hold.
> 
> > That clarifies from the previous email how we'd store the inode on
> > the vfio_device without introducing yet another tracking list for
> > device fds.  
> 
> Right, you can tell from the vma what inode it is for, and the inode
> can tell you if it is a VFIO VMA or not, so no tracking lists needed
> at all.

Seems to be a nice cleanup for vfio as well, more testing and analysis
required, but here are a few (4) wip commits that re-implement the
current vma zapping scheme following your example: 

https://github.com/awilliam/linux-vfio/commits/vfio-unmap-mapping-range

Thanks,
Alex
diff mbox series

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 38779e6fd80c..568f5e37a95f 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -47,6 +47,8 @@  static struct vfio {
 	struct cdev			group_cdev;
 	dev_t				group_devt;
 	wait_queue_head_t		release_q;
+	struct list_head		vm_ops_list;
+	struct mutex			vm_ops_lock;
 } vfio;
 
 struct vfio_iommu_driver {
@@ -2354,6 +2356,121 @@  struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
 }
 EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
 
+struct vfio_vma_ops {
+	const struct vm_operations_struct	*vm_ops;
+	vfio_register_vma_nb_t			*reg_fn;
+	vfio_unregister_vma_nb_t		*unreg_fn;
+	struct list_head			next;
+};
+
+int vfio_register_vma_ops(const struct vm_operations_struct *vm_ops,
+			  vfio_register_vma_nb_t *reg_fn,
+			  vfio_unregister_vma_nb_t *unreg_fn)
+{
+	struct vfio_vma_ops *ops;
+	int ret = 0;
+
+	mutex_lock(&vfio.vm_ops_lock);
+	list_for_each_entry(ops, &vfio.vm_ops_list, next) {
+		if (ops->vm_ops == vm_ops) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+
+	ops = kmalloc(sizeof(*ops), GFP_KERNEL);
+	if (!ops) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ops->vm_ops = vm_ops;
+	ops->reg_fn = reg_fn;
+	ops->unreg_fn = unreg_fn;
+
+	list_add(&ops->next, &vfio.vm_ops_list);
+out:
+	mutex_unlock(&vfio.vm_ops_lock);
+	return ret;
+
+}
+EXPORT_SYMBOL_GPL(vfio_register_vma_ops);
+
+void vfio_unregister_vma_ops(const struct vm_operations_struct *vm_ops)
+{
+	struct vfio_vma_ops *ops;
+
+	mutex_lock(&vfio.vm_ops_lock);
+	list_for_each_entry(ops, &vfio.vm_ops_list, next) {
+		if (ops->vm_ops == vm_ops) {
+			list_del(&ops->next);
+			kfree(ops);
+			break;
+		}
+	}
+	mutex_unlock(&vfio.vm_ops_lock);
+}
+EXPORT_SYMBOL_GPL(vfio_unregister_vma_ops);
+
+struct vfio_vma_obj {
+	const struct vm_operations_struct *vm_ops;
+	void *opaque;
+};
+
+void *vfio_register_vma_nb(struct vm_area_struct *vma,
+			   struct notifier_block *nb)
+{
+	struct vfio_vma_ops *ops;
+	struct vfio_vma_obj *obj = ERR_PTR(-ENODEV);
+
+	mutex_lock(&vfio.vm_ops_lock);
+	list_for_each_entry(ops, &vfio.vm_ops_list, next) {
+		if (ops->vm_ops == vma->vm_ops) {
+			void *opaque;
+
+			obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+			if (!obj) {
+				obj = ERR_PTR(-ENOMEM);
+				break;
+			}
+
+			obj->vm_ops = ops->vm_ops;
+
+			opaque = ops->reg_fn(vma, nb);
+			if (IS_ERR(opaque)) {
+				kfree(obj);
+				obj = opaque;
+			} else {
+				obj->opaque = opaque;
+			}
+
+			break;
+		}
+	}
+	mutex_unlock(&vfio.vm_ops_lock);
+
+	return obj;
+}
+EXPORT_SYMBOL_GPL(vfio_register_vma_nb);
+
+void vfio_unregister_vma_nb(void *opaque)
+{
+	struct vfio_vma_obj *obj = opaque;
+	struct vfio_vma_ops *ops;
+
+	mutex_lock(&vfio.vm_ops_lock);
+	list_for_each_entry(ops, &vfio.vm_ops_list, next) {
+		if (ops->vm_ops == obj->vm_ops) {
+			ops->unreg_fn(obj->opaque);
+			break;
+		}
+	}
+	mutex_unlock(&vfio.vm_ops_lock);
+
+	kfree(obj);
+}
+EXPORT_SYMBOL_GPL(vfio_unregister_vma_nb);
+
 /**
  * Module/class support
  */
@@ -2377,8 +2494,10 @@  static int __init vfio_init(void)
 	idr_init(&vfio.group_idr);
 	mutex_init(&vfio.group_lock);
 	mutex_init(&vfio.iommu_drivers_lock);
+	mutex_init(&vfio.vm_ops_lock);
 	INIT_LIST_HEAD(&vfio.group_list);
 	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
+	INIT_LIST_HEAD(&vfio.vm_ops_list);
 	init_waitqueue_head(&vfio.release_q);
 
 	ret = misc_register(&vfio_dev);
@@ -2425,6 +2544,7 @@  static int __init vfio_init(void)
 static void __exit vfio_cleanup(void)
 {
 	WARN_ON(!list_empty(&vfio.group_list));
+	WARN_ON(!list_empty(&vfio.vm_ops_list));
 
 #ifdef CONFIG_VFIO_NOIOMMU
 	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index b7e18bde5aa8..1b5c6179d869 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -137,6 +137,26 @@  extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
 
 extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group);
 
+typedef void *(vfio_register_vma_nb_t)(struct vm_area_struct *vma,
+				       struct notifier_block *nb);
+typedef void (vfio_unregister_vma_nb_t)(void *opaque);
+
+extern int vfio_register_vma_ops(const struct vm_operations_struct *vm_ops,
+				 vfio_register_vma_nb_t *reg_fn,
+				 vfio_unregister_vma_nb_t *unreg_fn);
+
+extern void vfio_unregister_vma_ops(const struct vm_operations_struct *vm_ops);
+
+enum vfio_vma_notify_type {
+	VFIO_VMA_NOTIFY_CLOSE = 0,
+};
+
+extern void *vfio_register_vma_nb(struct vm_area_struct *vma,
+				  struct notifier_block *nb);
+
+extern void vfio_unregister_vma_nb(void *opaque);
+
+
 /* each type has independent events */
 enum vfio_notify_type {
 	VFIO_IOMMU_NOTIFY = 0,