diff mbox

[v10,10/19] vfio iommu: Add blocking notifier to notify DMA_UNMAP

Message ID 1477517366-27871-11-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show

Commit Message

Kirti Wankhede Oct. 26, 2016, 9:29 p.m. UTC
Added blocking notifier to IOMMU TYPE1 driver to notify vendor drivers
about DMA_UNMAP.
Exported two APIs vfio_register_notifier() and vfio_unregister_notifier().
Vendor driver should register notifier using these APIs.
Vendor driver should use VFIO_IOMMU_NOTIFY_DMA_UNMAP action to invalidate
mappings.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
---
 drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++
 drivers/vfio/vfio_iommu_type1.c | 89 ++++++++++++++++++++++++++++++++++++-----
 include/linux/vfio.h            | 11 +++++
 3 files changed, 163 insertions(+), 10 deletions(-)

Comments

Jike Song Oct. 28, 2016, 7:33 a.m. UTC | #1
On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
> Added blocking notifier to IOMMU TYPE1 driver to notify vendor drivers
> about DMA_UNMAP.
> Exported two APIs vfio_register_notifier() and vfio_unregister_notifier().
> Vendor driver should register notifer using these APIs.
> Vendor driver should use VFIO_IOMMU_NOTIFY_DMA_UNMAP action to invalidate
> mappings.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
> ---
>  drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++
>  drivers/vfio/vfio_iommu_type1.c | 89 ++++++++++++++++++++++++++++++++++++-----
>  include/linux/vfio.h            | 11 +++++
>  3 files changed, 163 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 28b50ca14c52..ff05ac6b1e90 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1891,6 +1891,79 @@ err_unpin_pages:
>  }
>  EXPORT_SYMBOL(vfio_unpin_pages);
>  
> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret;
> +
> +	if (!dev || !nb)
> +		return -EINVAL;
> +
> +	group = vfio_group_get_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	ret = vfio_group_add_container_user(group);
> +	if (ret)
> +		goto err_register_nb;
> +
> +	container = group->container;
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->register_notifier))
> +		ret = driver->ops->register_notifier(container->iommu_data, nb);
> +	else
> +		ret = -EINVAL;
> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);
> +
> +err_register_nb:
> +	vfio_group_put(group);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_register_notifier);
> +
> +int vfio_unregister_notifier(struct device *dev, struct notifier_block *nb)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret;
> +
> +	if (!dev || !nb)
> +		return -EINVAL;
> +
> +	group = vfio_group_get_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	ret = vfio_group_add_container_user(group);
> +	if (ret)
> +		goto err_unregister_nb;
> +
> +	container = group->container;
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->unregister_notifier))
> +		ret = driver->ops->unregister_notifier(container->iommu_data,
> +						       nb);
> +	else
> +		ret = -EINVAL;
> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);
> +
> +err_unregister_nb:
> +	vfio_group_put(group);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_unregister_notifier);
> +
>  /**
>   * Module/class support
>   */
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 5add11a147e1..a4bd331ac0fd 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -37,6 +37,7 @@
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
>  #include <linux/mdev.h>
> +#include <linux/notifier.h>
>  
>  #define DRIVER_VERSION  "0.2"
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> @@ -59,6 +60,7 @@ struct vfio_iommu {
>  	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
> +	struct blocking_notifier_head notifier;
>  	bool			v2;
>  	bool			nesting;
>  };
> @@ -549,7 +551,8 @@ static long vfio_iommu_type1_pin_pages(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> -	if (!iommu->external_domain) {
> +	/* Fail if notifier list is empty */
> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
>  		ret = -EINVAL;
>  		goto pin_done;
>  	}
> @@ -768,6 +771,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	return bitmap;
>  }
>  
> +/*
> + * This function finds pfn in domain->external_addr_space->pfn_list for given
> + * iova range. If pfn exist, notify pfn to registered notifier list. On
> + * receiving notifier callback, vendor driver should invalidate the mapping and
> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
> + * searching for next vfio_pfn in rb tree, start search from first node again.
> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
> + * from rb tree and so in next search vfio_pfn would be same as previous
> + * vfio_pfn. In that case, exit from loop.
> + */
> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
> +				     struct vfio_iommu_type1_dma_unmap *unmap)
> +{
> +	struct vfio_domain *domain = iommu->external_domain;
> +	struct rb_node *n;
> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
> +
> +	do {
> +		prev_vpfn = vpfn;
> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +
> +		n = rb_first(&domain->external_addr_space->pfn_list);
> +
> +		for (; n; n = rb_next(n), vpfn = NULL) {
> +			vpfn = rb_entry(n, struct vfio_pfn, node);
> +
> +			if ((vpfn->iova >= unmap->iova) &&
> +			    (vpfn->iova < unmap->iova + unmap->size))
> +				break;
> +		}
> +
> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> +
> +		/* Notify any listeners about DMA_UNMAP */
> +		if (vpfn)
> +			blocking_notifier_call_chain(&iommu->notifier,
> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> +						    &vpfn->pfn);

Hi Kirti, 

The information carried by notifier is only a pfn.

Since your pin/unpin interfaces design, it's the vendor driver who should
guarantee pin/unpin same times. To achieve that, the vendor driver must
cache it's iova->pfn mapping on its side, to avoid pinning a same page
for multiple times.

With the notifier carrying only a pfn, to find the iova by this pfn,
the vendor driver must *also* keep a reverse-mapping. That's a bit
too much.

Since the vendor could also suffer from IOMMU-compatible problem,
which means a local cache is always helpful, so I'd like to have the
iova carried to the notifier.

What'd you say?

--
Thanks,
Jike

> +	} while (vpfn && (prev_vpfn != vpfn));
> +
> +	WARN_ON(vpfn);
> +}
> +
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>  {
> @@ -844,6 +891,9 @@ unlock:
>  	/* Report how much was unmapped */
>  	unmap->size = unmapped;
>  
> +	if (unmapped && iommu->external_domain)
> +		vfio_notifier_call_chain(iommu, unmap);
> +
>  	return ret;
>  }
>  
> @@ -1418,6 +1468,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	INIT_LIST_HEAD(&iommu->domain_list);
>  	iommu->dma_list = RB_ROOT;
>  	mutex_init(&iommu->lock);
> +	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
>  
>  	return iommu;
>  }
> @@ -1555,16 +1606,34 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  	return -ENOTTY;
>  }
>  
> +static int vfio_iommu_type1_register_notifier(void *iommu_data,
> +					      struct notifier_block *nb)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +
> +	return blocking_notifier_chain_register(&iommu->notifier, nb);
> +}
> +
> +static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
> +						struct notifier_block *nb)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +
> +	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
> +}
> +
>  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> -	.name		= "vfio-iommu-type1",
> -	.owner		= THIS_MODULE,
> -	.open		= vfio_iommu_type1_open,
> -	.release	= vfio_iommu_type1_release,
> -	.ioctl		= vfio_iommu_type1_ioctl,
> -	.attach_group	= vfio_iommu_type1_attach_group,
> -	.detach_group	= vfio_iommu_type1_detach_group,
> -	.pin_pages	= vfio_iommu_type1_pin_pages,
> -	.unpin_pages	= vfio_iommu_type1_unpin_pages,
> +	.name			= "vfio-iommu-type1",
> +	.owner			= THIS_MODULE,
> +	.open			= vfio_iommu_type1_open,
> +	.release		= vfio_iommu_type1_release,
> +	.ioctl			= vfio_iommu_type1_ioctl,
> +	.attach_group		= vfio_iommu_type1_attach_group,
> +	.detach_group		= vfio_iommu_type1_detach_group,
> +	.pin_pages		= vfio_iommu_type1_pin_pages,
> +	.unpin_pages		= vfio_iommu_type1_unpin_pages,
> +	.register_notifier	= vfio_iommu_type1_register_notifier,
> +	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0609a2052846..4c91ce8bfaeb 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -80,6 +80,10 @@ struct vfio_iommu_driver_ops {
>  				     unsigned long *phys_pfn);
>  	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
>  				       long npage);
> +	int		(*register_notifier)(void *iommu_data,
> +					     struct notifier_block *nb);
> +	int		(*unregister_notifier)(void *iommu_data,
> +					       struct notifier_block *nb);
>  };
>  
>  extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> @@ -137,6 +141,13 @@ extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
>  extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
>  			     long npage);
>  
> +#define VFIO_IOMMU_NOTIFY_DMA_UNMAP	1
> +
> +extern int vfio_register_notifier(struct device *dev,
> +				  struct notifier_block *nb);
> +
> +extern int vfio_unregister_notifier(struct device *dev,
> +				    struct notifier_block *nb);
>  /*
>   * IRQfd - generic
>   */
>
Kirti Wankhede Oct. 28, 2016, 12:38 p.m. UTC | #2
On 10/28/2016 1:03 PM, Jike Song wrote:
> On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
>>  
>> +/*
>> + * This function finds pfn in domain->external_addr_space->pfn_list for given
>> + * iova range. If pfn exist, notify pfn to registered notifier list. On
>> + * receiving notifier callback, vendor driver should invalidate the mapping and
>> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
>> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
>> + * searching for next vfio_pfn in rb tree, start search from first node again.
>> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
>> + * from rb tree and so in next search vfio_pfn would be same as previous
>> + * vfio_pfn. In that case, exit from loop.
>> + */
>> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
>> +				     struct vfio_iommu_type1_dma_unmap *unmap)
>> +{
>> +	struct vfio_domain *domain = iommu->external_domain;
>> +	struct rb_node *n;
>> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
>> +
>> +	do {
>> +		prev_vpfn = vpfn;
>> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
>> +
>> +		n = rb_first(&domain->external_addr_space->pfn_list);
>> +
>> +		for (; n; n = rb_next(n), vpfn = NULL) {
>> +			vpfn = rb_entry(n, struct vfio_pfn, node);
>> +
>> +			if ((vpfn->iova >= unmap->iova) &&
>> +			    (vpfn->iova < unmap->iova + unmap->size))
>> +				break;
>> +		}
>> +
>> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
>> +
>> +		/* Notify any listeners about DMA_UNMAP */
>> +		if (vpfn)
>> +			blocking_notifier_call_chain(&iommu->notifier,
>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>> +						    &vpfn->pfn);
> 
> Hi Kirti, 
> 
> The information carried by notifier is only a pfn.
> 
> Since your pin/unpin interfaces design, it's the vendor driver who should
> guarantee pin/unpin same times. 

That's because there could be multiple mdev devices from different vendor
drivers in a domain, and a pin request for the same page can come from
multiple vendor drivers; in the pin/unpin page interface we don't
keep track of who had pinned which pages. So it's the vendor driver's
responsibility to unpin the pages which it had pinned.

> To achieve that, the vendor driver must
> cache it's iova->pfn mapping on its side, to avoid pinning a same page
> for multiple times.
>

That's right.

> With the notifier carrying only a pfn, to find the iova by this pfn,
> the vendor driver must *also* keep a reverse-mapping. That's a bit
> too much.
> 

Don't see why the reverse mapping is needed: the vendor driver should
already know the pfn that it has previously pinned in order to call unpin
anyway, so there is no need to duplicate that logic here.

Kirti

> Since the vendor could also suffer from IOMMU-compatible problem,
> which means a local cache is always helpful, so I'd like to have the
> iova carried to the notifier.
> 
> What'd you say?
>
Alex Williamson Oct. 28, 2016, 12:40 p.m. UTC | #3
On Fri, 28 Oct 2016 15:33:58 +0800
Jike Song <jike.song@intel.com> wrote:

> On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
> > Added blocking notifier to IOMMU TYPE1 driver to notify vendor drivers
> > about DMA_UNMAP.
> > Exported two APIs vfio_register_notifier() and vfio_unregister_notifier().
> > Vendor driver should register notifer using these APIs.
> > Vendor driver should use VFIO_IOMMU_NOTIFY_DMA_UNMAP action to invalidate
> > mappings.
> > 
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > Signed-off-by: Neo Jia <cjia@nvidia.com>
> > Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
> > ---
> >  drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++
> >  drivers/vfio/vfio_iommu_type1.c | 89 ++++++++++++++++++++++++++++++++++++-----
> >  include/linux/vfio.h            | 11 +++++
> >  3 files changed, 163 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> > index 28b50ca14c52..ff05ac6b1e90 100644
> > --- a/drivers/vfio/vfio.c
> > +++ b/drivers/vfio/vfio.c
> > @@ -1891,6 +1891,79 @@ err_unpin_pages:
> >  }
> >  EXPORT_SYMBOL(vfio_unpin_pages);
> >  
> > +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
> > +{
> > +	struct vfio_container *container;
> > +	struct vfio_group *group;
> > +	struct vfio_iommu_driver *driver;
> > +	ssize_t ret;
> > +
> > +	if (!dev || !nb)
> > +		return -EINVAL;
> > +
> > +	group = vfio_group_get_from_dev(dev);
> > +	if (IS_ERR(group))
> > +		return PTR_ERR(group);
> > +
> > +	ret = vfio_group_add_container_user(group);
> > +	if (ret)
> > +		goto err_register_nb;
> > +
> > +	container = group->container;
> > +	down_read(&container->group_lock);
> > +
> > +	driver = container->iommu_driver;
> > +	if (likely(driver && driver->ops->register_notifier))
> > +		ret = driver->ops->register_notifier(container->iommu_data, nb);
> > +	else
> > +		ret = -EINVAL;
> > +
> > +	up_read(&container->group_lock);
> > +	vfio_group_try_dissolve_container(group);
> > +
> > +err_register_nb:
> > +	vfio_group_put(group);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL(vfio_register_notifier);
> > +
> > +int vfio_unregister_notifier(struct device *dev, struct notifier_block *nb)
> > +{
> > +	struct vfio_container *container;
> > +	struct vfio_group *group;
> > +	struct vfio_iommu_driver *driver;
> > +	ssize_t ret;
> > +
> > +	if (!dev || !nb)
> > +		return -EINVAL;
> > +
> > +	group = vfio_group_get_from_dev(dev);
> > +	if (IS_ERR(group))
> > +		return PTR_ERR(group);
> > +
> > +	ret = vfio_group_add_container_user(group);
> > +	if (ret)
> > +		goto err_unregister_nb;
> > +
> > +	container = group->container;
> > +	down_read(&container->group_lock);
> > +
> > +	driver = container->iommu_driver;
> > +	if (likely(driver && driver->ops->unregister_notifier))
> > +		ret = driver->ops->unregister_notifier(container->iommu_data,
> > +						       nb);
> > +	else
> > +		ret = -EINVAL;
> > +
> > +	up_read(&container->group_lock);
> > +	vfio_group_try_dissolve_container(group);
> > +
> > +err_unregister_nb:
> > +	vfio_group_put(group);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL(vfio_unregister_notifier);
> > +
> >  /**
> >   * Module/class support
> >   */
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 5add11a147e1..a4bd331ac0fd 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -37,6 +37,7 @@
> >  #include <linux/vfio.h>
> >  #include <linux/workqueue.h>
> >  #include <linux/mdev.h>
> > +#include <linux/notifier.h>
> >  
> >  #define DRIVER_VERSION  "0.2"
> >  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> > @@ -59,6 +60,7 @@ struct vfio_iommu {
> >  	struct vfio_domain	*external_domain; /* domain for external user */
> >  	struct mutex		lock;
> >  	struct rb_root		dma_list;
> > +	struct blocking_notifier_head notifier;
> >  	bool			v2;
> >  	bool			nesting;
> >  };
> > @@ -549,7 +551,8 @@ static long vfio_iommu_type1_pin_pages(void *iommu_data,
> >  
> >  	mutex_lock(&iommu->lock);
> >  
> > -	if (!iommu->external_domain) {
> > +	/* Fail if notifier list is empty */
> > +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
> >  		ret = -EINVAL;
> >  		goto pin_done;
> >  	}
> > @@ -768,6 +771,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >  	return bitmap;
> >  }
> >  
> > +/*
> > + * This function finds pfn in domain->external_addr_space->pfn_list for given
> > + * iova range. If pfn exist, notify pfn to registered notifier list. On
> > + * receiving notifier callback, vendor driver should invalidate the mapping and
> > + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
> > + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
> > + * searching for next vfio_pfn in rb tree, start search from first node again.
> > + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
> > + * from rb tree and so in next search vfio_pfn would be same as previous
> > + * vfio_pfn. In that case, exit from loop.
> > + */
> > +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
> > +				     struct vfio_iommu_type1_dma_unmap *unmap)
> > +{
> > +	struct vfio_domain *domain = iommu->external_domain;
> > +	struct rb_node *n;
> > +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
> > +
> > +	do {
> > +		prev_vpfn = vpfn;
> > +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
> > +
> > +		n = rb_first(&domain->external_addr_space->pfn_list);
> > +
> > +		for (; n; n = rb_next(n), vpfn = NULL) {
> > +			vpfn = rb_entry(n, struct vfio_pfn, node);
> > +
> > +			if ((vpfn->iova >= unmap->iova) &&
> > +			    (vpfn->iova < unmap->iova + unmap->size))
> > +				break;
> > +		}
> > +
> > +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> > +
> > +		/* Notify any listeners about DMA_UNMAP */
> > +		if (vpfn)
> > +			blocking_notifier_call_chain(&iommu->notifier,
> > +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> > +						    &vpfn->pfn);  
> 
> Hi Kirti, 
> 
> The information carried by notifier is only a pfn.
> 
> Since your pin/unpin interfaces design, it's the vendor driver who should
> guarantee pin/unpin same times. To achieve that, the vendor driver must
> cache it's iova->pfn mapping on its side, to avoid pinning a same page
> for multiple times.
> 
> With the notifier carrying only a pfn, to find the iova by this pfn,
> the vendor driver must *also* keep a reverse-mapping. That's a bit
> too much.
> 
> Since the vendor could also suffer from IOMMU-compatible problem,
> which means a local cache is always helpful, so I'd like to have the
> iova carried to the notifier.
> 
> What'd you say?

I agree, the pfn is not unique, multiple guest pfns (iovas) might be
backed by the same host pfn.  DMA_UNMAP calls are based on iova, the
notifier through to the vendor driver must be based on the same.
Thanks,

Alex

> > +	} while (vpfn && (prev_vpfn != vpfn));
> > +
> > +	WARN_ON(vpfn);
> > +}
> > +
> >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >  			     struct vfio_iommu_type1_dma_unmap *unmap)
> >  {
> > @@ -844,6 +891,9 @@ unlock:
> >  	/* Report how much was unmapped */
> >  	unmap->size = unmapped;
> >  
> > +	if (unmapped && iommu->external_domain)
> > +		vfio_notifier_call_chain(iommu, unmap);
> > +
> >  	return ret;
> >  }
> >  
> > @@ -1418,6 +1468,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
> >  	INIT_LIST_HEAD(&iommu->domain_list);
> >  	iommu->dma_list = RB_ROOT;
> >  	mutex_init(&iommu->lock);
> > +	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> >  
> >  	return iommu;
> >  }
> > @@ -1555,16 +1606,34 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  	return -ENOTTY;
> >  }
> >  
> > +static int vfio_iommu_type1_register_notifier(void *iommu_data,
> > +					      struct notifier_block *nb)
> > +{
> > +	struct vfio_iommu *iommu = iommu_data;
> > +
> > +	return blocking_notifier_chain_register(&iommu->notifier, nb);
> > +}
> > +
> > +static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
> > +						struct notifier_block *nb)
> > +{
> > +	struct vfio_iommu *iommu = iommu_data;
> > +
> > +	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
> > +}
> > +
> >  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> > -	.name		= "vfio-iommu-type1",
> > -	.owner		= THIS_MODULE,
> > -	.open		= vfio_iommu_type1_open,
> > -	.release	= vfio_iommu_type1_release,
> > -	.ioctl		= vfio_iommu_type1_ioctl,
> > -	.attach_group	= vfio_iommu_type1_attach_group,
> > -	.detach_group	= vfio_iommu_type1_detach_group,
> > -	.pin_pages	= vfio_iommu_type1_pin_pages,
> > -	.unpin_pages	= vfio_iommu_type1_unpin_pages,
> > +	.name			= "vfio-iommu-type1",
> > +	.owner			= THIS_MODULE,
> > +	.open			= vfio_iommu_type1_open,
> > +	.release		= vfio_iommu_type1_release,
> > +	.ioctl			= vfio_iommu_type1_ioctl,
> > +	.attach_group		= vfio_iommu_type1_attach_group,
> > +	.detach_group		= vfio_iommu_type1_detach_group,
> > +	.pin_pages		= vfio_iommu_type1_pin_pages,
> > +	.unpin_pages		= vfio_iommu_type1_unpin_pages,
> > +	.register_notifier	= vfio_iommu_type1_register_notifier,
> > +	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
> >  };
> >  
> >  static int __init vfio_iommu_type1_init(void)
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > index 0609a2052846..4c91ce8bfaeb 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -80,6 +80,10 @@ struct vfio_iommu_driver_ops {
> >  				     unsigned long *phys_pfn);
> >  	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
> >  				       long npage);
> > +	int		(*register_notifier)(void *iommu_data,
> > +					     struct notifier_block *nb);
> > +	int		(*unregister_notifier)(void *iommu_data,
> > +					       struct notifier_block *nb);
> >  };
> >  
> >  extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> > @@ -137,6 +141,13 @@ extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> >  extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
> >  			     long npage);
> >  
> > +#define VFIO_IOMMU_NOTIFY_DMA_UNMAP	1
> > +
> > +extern int vfio_register_notifier(struct device *dev,
> > +				  struct notifier_block *nb);
> > +
> > +extern int vfio_unregister_notifier(struct device *dev,
> > +				    struct notifier_block *nb);
> >  /*
> >   * IRQfd - generic
> >   */
> >   
>
Kirti Wankhede Oct. 28, 2016, 8:02 p.m. UTC | #4
On 10/28/2016 6:10 PM, Alex Williamson wrote:
> On Fri, 28 Oct 2016 15:33:58 +0800
> Jike Song <jike.song@intel.com> wrote:
> 
>> On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
>>> Added blocking notifier to IOMMU TYPE1 driver to notify vendor drivers
>>> about DMA_UNMAP.
>>> Exported two APIs vfio_register_notifier() and vfio_unregister_notifier().
>>> Vendor driver should register notifer using these APIs.
>>> Vendor driver should use VFIO_IOMMU_NOTIFY_DMA_UNMAP action to invalidate
>>> mappings.
>>>
>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>> Signed-off-by: Neo Jia <cjia@nvidia.com>
>>> Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
>>> ---
>>>  drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++
>>>  drivers/vfio/vfio_iommu_type1.c | 89 ++++++++++++++++++++++++++++++++++++-----
>>>  include/linux/vfio.h            | 11 +++++
>>>  3 files changed, 163 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>>> index 28b50ca14c52..ff05ac6b1e90 100644
>>> --- a/drivers/vfio/vfio.c
>>> +++ b/drivers/vfio/vfio.c
>>> @@ -1891,6 +1891,79 @@ err_unpin_pages:
>>>  }
>>>  EXPORT_SYMBOL(vfio_unpin_pages);
>>>  
>>> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
>>> +{
>>> +	struct vfio_container *container;
>>> +	struct vfio_group *group;
>>> +	struct vfio_iommu_driver *driver;
>>> +	ssize_t ret;
>>> +
>>> +	if (!dev || !nb)
>>> +		return -EINVAL;
>>> +
>>> +	group = vfio_group_get_from_dev(dev);
>>> +	if (IS_ERR(group))
>>> +		return PTR_ERR(group);
>>> +
>>> +	ret = vfio_group_add_container_user(group);
>>> +	if (ret)
>>> +		goto err_register_nb;
>>> +
>>> +	container = group->container;
>>> +	down_read(&container->group_lock);
>>> +
>>> +	driver = container->iommu_driver;
>>> +	if (likely(driver && driver->ops->register_notifier))
>>> +		ret = driver->ops->register_notifier(container->iommu_data, nb);
>>> +	else
>>> +		ret = -EINVAL;
>>> +
>>> +	up_read(&container->group_lock);
>>> +	vfio_group_try_dissolve_container(group);
>>> +
>>> +err_register_nb:
>>> +	vfio_group_put(group);
>>> +	return ret;
>>> +}
>>> +EXPORT_SYMBOL(vfio_register_notifier);
>>> +
>>> +int vfio_unregister_notifier(struct device *dev, struct notifier_block *nb)
>>> +{
>>> +	struct vfio_container *container;
>>> +	struct vfio_group *group;
>>> +	struct vfio_iommu_driver *driver;
>>> +	ssize_t ret;
>>> +
>>> +	if (!dev || !nb)
>>> +		return -EINVAL;
>>> +
>>> +	group = vfio_group_get_from_dev(dev);
>>> +	if (IS_ERR(group))
>>> +		return PTR_ERR(group);
>>> +
>>> +	ret = vfio_group_add_container_user(group);
>>> +	if (ret)
>>> +		goto err_unregister_nb;
>>> +
>>> +	container = group->container;
>>> +	down_read(&container->group_lock);
>>> +
>>> +	driver = container->iommu_driver;
>>> +	if (likely(driver && driver->ops->unregister_notifier))
>>> +		ret = driver->ops->unregister_notifier(container->iommu_data,
>>> +						       nb);
>>> +	else
>>> +		ret = -EINVAL;
>>> +
>>> +	up_read(&container->group_lock);
>>> +	vfio_group_try_dissolve_container(group);
>>> +
>>> +err_unregister_nb:
>>> +	vfio_group_put(group);
>>> +	return ret;
>>> +}
>>> +EXPORT_SYMBOL(vfio_unregister_notifier);
>>> +
>>>  /**
>>>   * Module/class support
>>>   */
>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>> index 5add11a147e1..a4bd331ac0fd 100644
>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>> @@ -37,6 +37,7 @@
>>>  #include <linux/vfio.h>
>>>  #include <linux/workqueue.h>
>>>  #include <linux/mdev.h>
>>> +#include <linux/notifier.h>
>>>  
>>>  #define DRIVER_VERSION  "0.2"
>>>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
>>> @@ -59,6 +60,7 @@ struct vfio_iommu {
>>>  	struct vfio_domain	*external_domain; /* domain for external user */
>>>  	struct mutex		lock;
>>>  	struct rb_root		dma_list;
>>> +	struct blocking_notifier_head notifier;
>>>  	bool			v2;
>>>  	bool			nesting;
>>>  };
>>> @@ -549,7 +551,8 @@ static long vfio_iommu_type1_pin_pages(void *iommu_data,
>>>  
>>>  	mutex_lock(&iommu->lock);
>>>  
>>> -	if (!iommu->external_domain) {
>>> +	/* Fail if notifier list is empty */
>>> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
>>>  		ret = -EINVAL;
>>>  		goto pin_done;
>>>  	}
>>> @@ -768,6 +771,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>>>  	return bitmap;
>>>  }
>>>  
>>> +/*
>>> + * This function finds pfn in domain->external_addr_space->pfn_list for given
>>> + * iova range. If pfn exist, notify pfn to registered notifier list. On
>>> + * receiving notifier callback, vendor driver should invalidate the mapping and
>>> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
>>> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
>>> + * searching for next vfio_pfn in rb tree, start search from first node again.
>>> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
>>> + * from rb tree and so in next search vfio_pfn would be same as previous
>>> + * vfio_pfn. In that case, exit from loop.
>>> + */
>>> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
>>> +				     struct vfio_iommu_type1_dma_unmap *unmap)
>>> +{
>>> +	struct vfio_domain *domain = iommu->external_domain;
>>> +	struct rb_node *n;
>>> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
>>> +
>>> +	do {
>>> +		prev_vpfn = vpfn;
>>> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
>>> +
>>> +		n = rb_first(&domain->external_addr_space->pfn_list);
>>> +
>>> +		for (; n; n = rb_next(n), vpfn = NULL) {
>>> +			vpfn = rb_entry(n, struct vfio_pfn, node);
>>> +
>>> +			if ((vpfn->iova >= unmap->iova) &&
>>> +			    (vpfn->iova < unmap->iova + unmap->size))
>>> +				break;
>>> +		}
>>> +
>>> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
>>> +
>>> +		/* Notify any listeners about DMA_UNMAP */
>>> +		if (vpfn)
>>> +			blocking_notifier_call_chain(&iommu->notifier,
>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>> +						    &vpfn->pfn);  
>>
>> Hi Kirti, 
>>
>> The information carried by notifier is only a pfn.
>>
>> Since your pin/unpin interfaces design, it's the vendor driver who should
>> guarantee pin/unpin same times. To achieve that, the vendor driver must
>> cache it's iova->pfn mapping on its side, to avoid pinning a same page
>> for multiple times.
>>
>> With the notifier carrying only a pfn, to find the iova by this pfn,
>> the vendor driver must *also* keep a reverse-mapping. That's a bit
>> too much.
>>
>> Since the vendor could also suffer from IOMMU-compatible problem,
>> which means a local cache is always helpful, so I'd like to have the
>> iova carried to the notifier.
>>
>> What'd you say?
> 
> I agree, the pfn is not unique, multiple guest pfns (iovas) might be
> backed by the same host pfn.  DMA_UNMAP calls are based on iova, the
> notifier through to the vendor driver must be based on the same.

Host pfn should be unique, right?

Kirti.
Alex Williamson Oct. 28, 2016, 8:33 p.m. UTC | #5
On Sat, 29 Oct 2016 01:32:35 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 10/28/2016 6:10 PM, Alex Williamson wrote:
> > On Fri, 28 Oct 2016 15:33:58 +0800
> > Jike Song <jike.song@intel.com> wrote:
> >   
> >> On 10/27/2016 05:29 AM, Kirti Wankhede wrote:  
> >>> Added blocking notifier to IOMMU TYPE1 driver to notify vendor drivers
> >>> about DMA_UNMAP.
> >>> Exported two APIs vfio_register_notifier() and vfio_unregister_notifier().
> >>> Vendor driver should register notifer using these APIs.
> >>> Vendor driver should use VFIO_IOMMU_NOTIFY_DMA_UNMAP action to invalidate
> >>> mappings.
> >>>
> >>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>> Signed-off-by: Neo Jia <cjia@nvidia.com>
> >>> Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
> >>> ---
> >>>  drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++
> >>>  drivers/vfio/vfio_iommu_type1.c | 89 ++++++++++++++++++++++++++++++++++++-----
> >>>  include/linux/vfio.h            | 11 +++++
> >>>  3 files changed, 163 insertions(+), 10 deletions(-)
> >>>
> >>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >>> index 28b50ca14c52..ff05ac6b1e90 100644
> >>> --- a/drivers/vfio/vfio.c
> >>> +++ b/drivers/vfio/vfio.c
> >>> @@ -1891,6 +1891,79 @@ err_unpin_pages:
> >>>  }
> >>>  EXPORT_SYMBOL(vfio_unpin_pages);
> >>>  
> >>> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
> >>> +{
> >>> +	struct vfio_container *container;
> >>> +	struct vfio_group *group;
> >>> +	struct vfio_iommu_driver *driver;
> >>> +	ssize_t ret;
> >>> +
> >>> +	if (!dev || !nb)
> >>> +		return -EINVAL;
> >>> +
> >>> +	group = vfio_group_get_from_dev(dev);
> >>> +	if (IS_ERR(group))
> >>> +		return PTR_ERR(group);
> >>> +
> >>> +	ret = vfio_group_add_container_user(group);
> >>> +	if (ret)
> >>> +		goto err_register_nb;
> >>> +
> >>> +	container = group->container;
> >>> +	down_read(&container->group_lock);
> >>> +
> >>> +	driver = container->iommu_driver;
> >>> +	if (likely(driver && driver->ops->register_notifier))
> >>> +		ret = driver->ops->register_notifier(container->iommu_data, nb);
> >>> +	else
> >>> +		ret = -EINVAL;
> >>> +
> >>> +	up_read(&container->group_lock);
> >>> +	vfio_group_try_dissolve_container(group);
> >>> +
> >>> +err_register_nb:
> >>> +	vfio_group_put(group);
> >>> +	return ret;
> >>> +}
> >>> +EXPORT_SYMBOL(vfio_register_notifier);
> >>> +
> >>> +int vfio_unregister_notifier(struct device *dev, struct notifier_block *nb)
> >>> +{
> >>> +	struct vfio_container *container;
> >>> +	struct vfio_group *group;
> >>> +	struct vfio_iommu_driver *driver;
> >>> +	ssize_t ret;
> >>> +
> >>> +	if (!dev || !nb)
> >>> +		return -EINVAL;
> >>> +
> >>> +	group = vfio_group_get_from_dev(dev);
> >>> +	if (IS_ERR(group))
> >>> +		return PTR_ERR(group);
> >>> +
> >>> +	ret = vfio_group_add_container_user(group);
> >>> +	if (ret)
> >>> +		goto err_unregister_nb;
> >>> +
> >>> +	container = group->container;
> >>> +	down_read(&container->group_lock);
> >>> +
> >>> +	driver = container->iommu_driver;
> >>> +	if (likely(driver && driver->ops->unregister_notifier))
> >>> +		ret = driver->ops->unregister_notifier(container->iommu_data,
> >>> +						       nb);
> >>> +	else
> >>> +		ret = -EINVAL;
> >>> +
> >>> +	up_read(&container->group_lock);
> >>> +	vfio_group_try_dissolve_container(group);
> >>> +
> >>> +err_unregister_nb:
> >>> +	vfio_group_put(group);
> >>> +	return ret;
> >>> +}
> >>> +EXPORT_SYMBOL(vfio_unregister_notifier);
> >>> +
> >>>  /**
> >>>   * Module/class support
> >>>   */
> >>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>> index 5add11a147e1..a4bd331ac0fd 100644
> >>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>> @@ -37,6 +37,7 @@
> >>>  #include <linux/vfio.h>
> >>>  #include <linux/workqueue.h>
> >>>  #include <linux/mdev.h>
> >>> +#include <linux/notifier.h>
> >>>  
> >>>  #define DRIVER_VERSION  "0.2"
> >>>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> >>> @@ -59,6 +60,7 @@ struct vfio_iommu {
> >>>  	struct vfio_domain	*external_domain; /* domain for external user */
> >>>  	struct mutex		lock;
> >>>  	struct rb_root		dma_list;
> >>> +	struct blocking_notifier_head notifier;
> >>>  	bool			v2;
> >>>  	bool			nesting;
> >>>  };
> >>> @@ -549,7 +551,8 @@ static long vfio_iommu_type1_pin_pages(void *iommu_data,
> >>>  
> >>>  	mutex_lock(&iommu->lock);
> >>>  
> >>> -	if (!iommu->external_domain) {
> >>> +	/* Fail if notifier list is empty */
> >>> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
> >>>  		ret = -EINVAL;
> >>>  		goto pin_done;
> >>>  	}
> >>> @@ -768,6 +771,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >>>  	return bitmap;
> >>>  }
> >>>  
> >>> +/*
> >>> + * This function finds pfn in domain->external_addr_space->pfn_list for given
> >>> + * iova range. If pfn exist, notify pfn to registered notifier list. On
> >>> + * receiving notifier callback, vendor driver should invalidate the mapping and
> >>> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
> >>> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
> >>> + * searching for next vfio_pfn in rb tree, start search from first node again.
> >>> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
> >>> + * from rb tree and so in next search vfio_pfn would be same as previous
> >>> + * vfio_pfn. In that case, exit from loop.
> >>> + */
> >>> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
> >>> +				     struct vfio_iommu_type1_dma_unmap *unmap)
> >>> +{
> >>> +	struct vfio_domain *domain = iommu->external_domain;
> >>> +	struct rb_node *n;
> >>> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
> >>> +
> >>> +	do {
> >>> +		prev_vpfn = vpfn;
> >>> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
> >>> +
> >>> +		n = rb_first(&domain->external_addr_space->pfn_list);
> >>> +
> >>> +		for (; n; n = rb_next(n), vpfn = NULL) {
> >>> +			vpfn = rb_entry(n, struct vfio_pfn, node);
> >>> +
> >>> +			if ((vpfn->iova >= unmap->iova) &&
> >>> +			    (vpfn->iova < unmap->iova + unmap->size))
> >>> +				break;
> >>> +		}
> >>> +
> >>> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> >>> +
> >>> +		/* Notify any listeners about DMA_UNMAP */
> >>> +		if (vpfn)
> >>> +			blocking_notifier_call_chain(&iommu->notifier,
> >>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>> +						    &vpfn->pfn);    
> >>
> >> Hi Kirti, 
> >>
> >> The information carried by notifier is only a pfn.
> >>
> >> Since your pin/unpin interfaces design, it's the vendor driver who should
> >> guarantee pin/unpin same times. To achieve that, the vendor driver must
> >> cache it's iova->pfn mapping on its side, to avoid pinning a same page
> >> for multiple times.
> >>
> >> With the notifier carrying only a pfn, to find the iova by this pfn,
> >> the vendor driver must *also* keep a reverse-mapping. That's a bit
> >> too much.
> >>
> >> Since the vendor could also suffer from IOMMU-compatible problem,
> >> which means a local cache is always helpful, so I'd like to have the
> >> iova carried to the notifier.
> >>
> >> What'd you say?  
> > 
> > I agree, the pfn is not unique, multiple guest pfns (iovas) might be
> > backed by the same host pfn.  DMA_UNMAP calls are based on iova, the
> > notifier through to the vendor driver must be based on the same.  
> 
> Host pfn should be unique, right?

Let's say a user does a malloc of a single page and does 100 calls to
MAP_DMA populating 100 pages of IOVA space all backed by the same
malloc'd page.  This is valid, I have unit tests that do essentially
this.  Those will all have the same pfn.  The user then does an
UNMAP_DMA to a single one of those IOVA pages.  Did the user unmap
everything matching that pfn?  Of course not, they only unmapped that
one IOVA page.  There is no guarantee of a 1:1 mapping of pfn to IOVA.
UNMAP_DMA works based on IOVA.  Invalidation broadcasts to the vendor
driver MUST therefore also work based on IOVA.  This is not an academic
problem, address space aliases exist in real VMs, imagine a virtual
IOMMU.  Thanks,

Alex
Kirti Wankhede Oct. 29, 2016, 10:37 a.m. UTC | #6
On 10/29/2016 2:03 AM, Alex Williamson wrote:
> On Sat, 29 Oct 2016 01:32:35 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 10/28/2016 6:10 PM, Alex Williamson wrote:
>>> On Fri, 28 Oct 2016 15:33:58 +0800
>>> Jike Song <jike.song@intel.com> wrote:
>>>   
...
>>>>>  
>>>>> +/*
>>>>> + * This function finds pfn in domain->external_addr_space->pfn_list for given
>>>>> + * iova range. If pfn exist, notify pfn to registered notifier list. On
>>>>> + * receiving notifier callback, vendor driver should invalidate the mapping and
>>>>> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
>>>>> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
>>>>> + * searching for next vfio_pfn in rb tree, start search from first node again.
>>>>> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
>>>>> + * from rb tree and so in next search vfio_pfn would be same as previous
>>>>> + * vfio_pfn. In that case, exit from loop.
>>>>> + */
>>>>> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
>>>>> +				     struct vfio_iommu_type1_dma_unmap *unmap)
>>>>> +{
>>>>> +	struct vfio_domain *domain = iommu->external_domain;
>>>>> +	struct rb_node *n;
>>>>> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
>>>>> +
>>>>> +	do {
>>>>> +		prev_vpfn = vpfn;
>>>>> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
>>>>> +
>>>>> +		n = rb_first(&domain->external_addr_space->pfn_list);
>>>>> +
>>>>> +		for (; n; n = rb_next(n), vpfn = NULL) {
>>>>> +			vpfn = rb_entry(n, struct vfio_pfn, node);
>>>>> +
>>>>> +			if ((vpfn->iova >= unmap->iova) &&
>>>>> +			    (vpfn->iova < unmap->iova + unmap->size))
>>>>> +				break;
>>>>> +		}
>>>>> +
>>>>> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
>>>>> +
>>>>> +		/* Notify any listeners about DMA_UNMAP */
>>>>> +		if (vpfn)
>>>>> +			blocking_notifier_call_chain(&iommu->notifier,
>>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>>>> +						    &vpfn->pfn);    
>>>>
>>>> Hi Kirti, 
>>>>
>>>> The information carried by notifier is only a pfn.
>>>>
>>>> Since your pin/unpin interfaces design, it's the vendor driver who should
>>>> guarantee pin/unpin same times. To achieve that, the vendor driver must
>>>> cache it's iova->pfn mapping on its side, to avoid pinning a same page
>>>> for multiple times.
>>>>
>>>> With the notifier carrying only a pfn, to find the iova by this pfn,
>>>> the vendor driver must *also* keep a reverse-mapping. That's a bit
>>>> too much.
>>>>
>>>> Since the vendor could also suffer from IOMMU-compatible problem,
>>>> which means a local cache is always helpful, so I'd like to have the
>>>> iova carried to the notifier.
>>>>
>>>> What'd you say?  
>>>
>>> I agree, the pfn is not unique, multiple guest pfns (iovas) might be
>>> backed by the same host pfn.  DMA_UNMAP calls are based on iova, the
>>> notifier through to the vendor driver must be based on the same.  
>>
>> Host pfn should be unique, right?
> 
> Let's say a user does a malloc of a single page and does 100 calls to
> MAP_DMA populating 100 pages of IOVA space all backed by the same
> malloc'd page.  This is valid, I have unit tests that do essentially
> this.  Those will all have the same pfn.  The user then does an
> UNMAP_DMA to a single one of those IOVA pages.  Did the user unmap
> everything matching that pfn?  Of course not, they only unmapped that
> one IOVA page.  There is no guarantee of a 1:1 mapping of pfn to IOVA.
> UNMAP_DMA works based on IOVA.  Invalidation broadcasts to the vendor
> driver MUST therefore also work based on IOVA.  This is not an academic
> problem, address space aliases exist in real VMs, imagine a virtual
> IOMMU.  Thanks,
> 


So struct vfio_iommu_type1_dma_unmap should be passed as argument to
notifier callback:

        if (unmapped && iommu->external_domain)
-               vfio_notifier_call_chain(iommu, unmap);
+               blocking_notifier_call_chain(&iommu->notifier,
+                                           VFIO_IOMMU_NOTIFY_DMA_UNMAP,
+                                            unmap);

Then vendor driver should find pfns he has pinned from this range of
iovas, then invalidate and unpin pfns. Right?

Thanks,
Kirti
Alex Williamson Oct. 29, 2016, 2:03 p.m. UTC | #7
On Sat, 29 Oct 2016 16:07:05 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 10/29/2016 2:03 AM, Alex Williamson wrote:
> > On Sat, 29 Oct 2016 01:32:35 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 10/28/2016 6:10 PM, Alex Williamson wrote:  
> >>> On Fri, 28 Oct 2016 15:33:58 +0800
> >>> Jike Song <jike.song@intel.com> wrote:
> >>>     
> ...
> >>>>>  
> >>>>> +/*
> >>>>> + * This function finds pfn in domain->external_addr_space->pfn_list for given
> >>>>> + * iova range. If pfn exist, notify pfn to registered notifier list. On
> >>>>> + * receiving notifier callback, vendor driver should invalidate the mapping and
> >>>>> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
> >>>>> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
> >>>>> + * searching for next vfio_pfn in rb tree, start search from first node again.
> >>>>> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
> >>>>> + * from rb tree and so in next search vfio_pfn would be same as previous
> >>>>> + * vfio_pfn. In that case, exit from loop.
> >>>>> + */
> >>>>> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
> >>>>> +				     struct vfio_iommu_type1_dma_unmap *unmap)
> >>>>> +{
> >>>>> +	struct vfio_domain *domain = iommu->external_domain;
> >>>>> +	struct rb_node *n;
> >>>>> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
> >>>>> +
> >>>>> +	do {
> >>>>> +		prev_vpfn = vpfn;
> >>>>> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
> >>>>> +
> >>>>> +		n = rb_first(&domain->external_addr_space->pfn_list);
> >>>>> +
> >>>>> +		for (; n; n = rb_next(n), vpfn = NULL) {
> >>>>> +			vpfn = rb_entry(n, struct vfio_pfn, node);
> >>>>> +
> >>>>> +			if ((vpfn->iova >= unmap->iova) &&
> >>>>> +			    (vpfn->iova < unmap->iova + unmap->size))
> >>>>> +				break;
> >>>>> +		}
> >>>>> +
> >>>>> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> >>>>> +
> >>>>> +		/* Notify any listeners about DMA_UNMAP */
> >>>>> +		if (vpfn)
> >>>>> +			blocking_notifier_call_chain(&iommu->notifier,
> >>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>>>> +						    &vpfn->pfn);      
> >>>>
> >>>> Hi Kirti, 
> >>>>
> >>>> The information carried by notifier is only a pfn.
> >>>>
> >>>> Since your pin/unpin interfaces design, it's the vendor driver who should
> >>>> guarantee pin/unpin same times. To achieve that, the vendor driver must
> >>>> cache it's iova->pfn mapping on its side, to avoid pinning a same page
> >>>> for multiple times.
> >>>>
> >>>> With the notifier carrying only a pfn, to find the iova by this pfn,
> >>>> the vendor driver must *also* keep a reverse-mapping. That's a bit
> >>>> too much.
> >>>>
> >>>> Since the vendor could also suffer from IOMMU-compatible problem,
> >>>> which means a local cache is always helpful, so I'd like to have the
> >>>> iova carried to the notifier.
> >>>>
> >>>> What'd you say?    
> >>>
> >>> I agree, the pfn is not unique, multiple guest pfns (iovas) might be
> >>> backed by the same host pfn.  DMA_UNMAP calls are based on iova, the
> >>> notifier through to the vendor driver must be based on the same.    
> >>
> >> Host pfn should be unique, right?  
> > 
> > Let's say a user does a malloc of a single page and does 100 calls to
> > MAP_DMA populating 100 pages of IOVA space all backed by the same
> > malloc'd page.  This is valid, I have unit tests that do essentially
> > this.  Those will all have the same pfn.  The user then does an
> > UNMAP_DMA to a single one of those IOVA pages.  Did the user unmap
> > everything matching that pfn?  Of course not, they only unmapped that
> > one IOVA page.  There is no guarantee of a 1:1 mapping of pfn to IOVA.
> > UNMAP_DMA works based on IOVA.  Invalidation broadcasts to the vendor
> > driver MUST therefore also work based on IOVA.  This is not an academic
> > problem, address space aliases exist in real VMs, imagine a virtual
> > IOMMU.  Thanks,
> >   
> 
> 
> So struct vfio_iommu_type1_dma_unmap should be passed as argument to
> notifier callback:
> 
>         if (unmapped && iommu->external_domain)
> -               vfio_notifier_call_chain(iommu, unmap);
> +               blocking_notifier_call_chain(&iommu->notifier,
> +                                           VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> +                                            unmap);
> 
> Then vendor driver should find pfns he has pinned from this range of
> iovas, then invalidate and unpin pfns. Right?

That seems like a valid choice.  It's probably better than calling the
notifier for each page of iova.  Thanks,

Alex
Jike Song Oct. 31, 2016, 3:50 a.m. UTC | #8
On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
> Added blocking notifier to IOMMU TYPE1 driver to notify vendor drivers
> about DMA_UNMAP.
> Exported two APIs vfio_register_notifier() and vfio_unregister_notifier().
> Vendor driver should register notifier using these APIs.
> Vendor driver should use VFIO_IOMMU_NOTIFY_DMA_UNMAP action to invalidate
> mappings.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
> ---
>  drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++
>  drivers/vfio/vfio_iommu_type1.c | 89 ++++++++++++++++++++++++++++++++++++-----
>  include/linux/vfio.h            | 11 +++++
>  3 files changed, 163 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 28b50ca14c52..ff05ac6b1e90 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1891,6 +1891,79 @@ err_unpin_pages:
>  }
>  EXPORT_SYMBOL(vfio_unpin_pages);
>  
> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
> +{

Hi Kirti,

Given that below 4 methods are members of vfio_iommu_driver_ops:

	pin_pages
	unpin_pages
	register_notifier
	unregister_notifier

the names of exposed VFIO APIs could possibly be clearer:

	vfio_iommu_pin_pages
	vfio_iommu_unpin_pages
	vfio_iommu_register_notifier
	vfio_iommu_unregister_notifier

--
Thanks,
Jike

> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret;
> +
> +	if (!dev || !nb)
> +		return -EINVAL;
> +
> +	group = vfio_group_get_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	ret = vfio_group_add_container_user(group);
> +	if (ret)
> +		goto err_register_nb;
> +
> +	container = group->container;
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->register_notifier))
> +		ret = driver->ops->register_notifier(container->iommu_data, nb);
> +	else
> +		ret = -EINVAL;
> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);
> +
> +err_register_nb:
> +	vfio_group_put(group);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_register_notifier);
> +
> +int vfio_unregister_notifier(struct device *dev, struct notifier_block *nb)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret;
> +
> +	if (!dev || !nb)
> +		return -EINVAL;
> +
> +	group = vfio_group_get_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	ret = vfio_group_add_container_user(group);
> +	if (ret)
> +		goto err_unregister_nb;
> +
> +	container = group->container;
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->unregister_notifier))
> +		ret = driver->ops->unregister_notifier(container->iommu_data,
> +						       nb);
> +	else
> +		ret = -EINVAL;
> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);
> +
> +err_unregister_nb:
> +	vfio_group_put(group);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_unregister_notifier);
> +
>  /**
>   * Module/class support
>   */
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 5add11a147e1..a4bd331ac0fd 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -37,6 +37,7 @@
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
>  #include <linux/mdev.h>
> +#include <linux/notifier.h>
>  
>  #define DRIVER_VERSION  "0.2"
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> @@ -59,6 +60,7 @@ struct vfio_iommu {
>  	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
> +	struct blocking_notifier_head notifier;
>  	bool			v2;
>  	bool			nesting;
>  };
> @@ -549,7 +551,8 @@ static long vfio_iommu_type1_pin_pages(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> -	if (!iommu->external_domain) {
> +	/* Fail if notifier list is empty */
> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
>  		ret = -EINVAL;
>  		goto pin_done;
>  	}
> @@ -768,6 +771,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	return bitmap;
>  }
>  
> +/*
> + * This function finds pfn in domain->external_addr_space->pfn_list for given
> + * iova range. If pfn exist, notify pfn to registered notifier list. On
> + * receiving notifier callback, vendor driver should invalidate the mapping and
> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
> + * searching for next vfio_pfn in rb tree, start search from first node again.
> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
> + * from rb tree and so in next search vfio_pfn would be same as previous
> + * vfio_pfn. In that case, exit from loop.
> + */
> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
> +				     struct vfio_iommu_type1_dma_unmap *unmap)
> +{
> +	struct vfio_domain *domain = iommu->external_domain;
> +	struct rb_node *n;
> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
> +
> +	do {
> +		prev_vpfn = vpfn;
> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +
> +		n = rb_first(&domain->external_addr_space->pfn_list);
> +
> +		for (; n; n = rb_next(n), vpfn = NULL) {
> +			vpfn = rb_entry(n, struct vfio_pfn, node);
> +
> +			if ((vpfn->iova >= unmap->iova) &&
> +			    (vpfn->iova < unmap->iova + unmap->size))
> +				break;
> +		}
> +
> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> +
> +		/* Notify any listeners about DMA_UNMAP */
> +		if (vpfn)
> +			blocking_notifier_call_chain(&iommu->notifier,
> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> +						    &vpfn->pfn);
> +	} while (vpfn && (prev_vpfn != vpfn));
> +
> +	WARN_ON(vpfn);
> +}
> +
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>  {
> @@ -844,6 +891,9 @@ unlock:
>  	/* Report how much was unmapped */
>  	unmap->size = unmapped;
>  
> +	if (unmapped && iommu->external_domain)
> +		vfio_notifier_call_chain(iommu, unmap);
> +
>  	return ret;
>  }
>  
> @@ -1418,6 +1468,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	INIT_LIST_HEAD(&iommu->domain_list);
>  	iommu->dma_list = RB_ROOT;
>  	mutex_init(&iommu->lock);
> +	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
>  
>  	return iommu;
>  }
> @@ -1555,16 +1606,34 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  	return -ENOTTY;
>  }
>  
> +static int vfio_iommu_type1_register_notifier(void *iommu_data,
> +					      struct notifier_block *nb)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +
> +	return blocking_notifier_chain_register(&iommu->notifier, nb);
> +}
> +
> +static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
> +						struct notifier_block *nb)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +
> +	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
> +}
> +
>  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> -	.name		= "vfio-iommu-type1",
> -	.owner		= THIS_MODULE,
> -	.open		= vfio_iommu_type1_open,
> -	.release	= vfio_iommu_type1_release,
> -	.ioctl		= vfio_iommu_type1_ioctl,
> -	.attach_group	= vfio_iommu_type1_attach_group,
> -	.detach_group	= vfio_iommu_type1_detach_group,
> -	.pin_pages	= vfio_iommu_type1_pin_pages,
> -	.unpin_pages	= vfio_iommu_type1_unpin_pages,
> +	.name			= "vfio-iommu-type1",
> +	.owner			= THIS_MODULE,
> +	.open			= vfio_iommu_type1_open,
> +	.release		= vfio_iommu_type1_release,
> +	.ioctl			= vfio_iommu_type1_ioctl,
> +	.attach_group		= vfio_iommu_type1_attach_group,
> +	.detach_group		= vfio_iommu_type1_detach_group,
> +	.pin_pages		= vfio_iommu_type1_pin_pages,
> +	.unpin_pages		= vfio_iommu_type1_unpin_pages,
> +	.register_notifier	= vfio_iommu_type1_register_notifier,
> +	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0609a2052846..4c91ce8bfaeb 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -80,6 +80,10 @@ struct vfio_iommu_driver_ops {
>  				     unsigned long *phys_pfn);
>  	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
>  				       long npage);
> +	int		(*register_notifier)(void *iommu_data,
> +					     struct notifier_block *nb);
> +	int		(*unregister_notifier)(void *iommu_data,
> +					       struct notifier_block *nb);
>  };
>  
>  extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> @@ -137,6 +141,13 @@ extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
>  extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
>  			     long npage);
>  
> +#define VFIO_IOMMU_NOTIFY_DMA_UNMAP	1
> +
> +extern int vfio_register_notifier(struct device *dev,
> +				  struct notifier_block *nb);
> +
> +extern int vfio_unregister_notifier(struct device *dev,
> +				    struct notifier_block *nb);
>  /*
>   * IRQfd - generic
>   */
>
Kirti Wankhede Oct. 31, 2016, 5:59 a.m. UTC | #9
On 10/31/2016 9:20 AM, Jike Song wrote:
> On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
>> Added blocking notifier to IOMMU TYPE1 driver to notify vendor drivers
>> about DMA_UNMAP.
>> Exported two APIs vfio_register_notifier() and vfio_unregister_notifier().
>> Vendor driver should register notifier using these APIs.
>> Vendor driver should use VFIO_IOMMU_NOTIFY_DMA_UNMAP action to invalidate
>> mappings.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Signed-off-by: Neo Jia <cjia@nvidia.com>
>> Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
>> ---
>>  drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++
>>  drivers/vfio/vfio_iommu_type1.c | 89 ++++++++++++++++++++++++++++++++++++-----
>>  include/linux/vfio.h            | 11 +++++
>>  3 files changed, 163 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>> index 28b50ca14c52..ff05ac6b1e90 100644
>> --- a/drivers/vfio/vfio.c
>> +++ b/drivers/vfio/vfio.c
>> @@ -1891,6 +1891,79 @@ err_unpin_pages:
>>  }
>>  EXPORT_SYMBOL(vfio_unpin_pages);
>>  
>> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
>> +{
> 
> Hi Kirti,
> 
> Given that below 4 methods are members of vfio_iommu_driver_ops:
> 
> 	pin_pages
> 	unpin_pages
> 	register_notifier
> 	unregister_notifier
> 
> the names of exposed VFIO APIs could possibly be clearer:
> 
> 	vfio_iommu_pin_pages
> 	vfio_iommu_unpin_pages
> 	vfio_iommu_register_notifier
> 	vfio_iommu_unregister_notifier
> 

Hey Jike,

I had followed the same style as other members in this structure:

	attach_group
	detach_group

Thanks,
Kirti
Jike Song Oct. 31, 2016, 6:05 a.m. UTC | #10
On 10/31/2016 01:59 PM, Kirti Wankhede wrote:
> On 10/31/2016 9:20 AM, Jike Song wrote:
>> On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
>>> Added blocking notifier to IOMMU TYPE1 driver to notify vendor drivers
>>> about DMA_UNMAP.
>>> Exported two APIs vfio_register_notifier() and vfio_unregister_notifier().
>>> Vendor driver should register notifier using these APIs.
>>> Vendor driver should use VFIO_IOMMU_NOTIFY_DMA_UNMAP action to invalidate
>>> mappings.
>>>
>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>> Signed-off-by: Neo Jia <cjia@nvidia.com>
>>> Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
>>> ---
>>>  drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++
>>>  drivers/vfio/vfio_iommu_type1.c | 89 ++++++++++++++++++++++++++++++++++++-----
>>>  include/linux/vfio.h            | 11 +++++
>>>  3 files changed, 163 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>>> index 28b50ca14c52..ff05ac6b1e90 100644
>>> --- a/drivers/vfio/vfio.c
>>> +++ b/drivers/vfio/vfio.c
>>> @@ -1891,6 +1891,79 @@ err_unpin_pages:
>>>  }
>>>  EXPORT_SYMBOL(vfio_unpin_pages);
>>>  
>>> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
>>> +{
>>
>> Hi Kirti,
>>
>> Given that below 4 methods are members of vfio_iommu_driver_ops:
>>
>> 	pin_pages
>> 	unpin_pages
>> 	register_notifier
>> 	unregister_notifier
>>
>> the names of exposed VFIO APIs could possibly be clearer:
>>
>> 	vfio_iommu_pin_pages
>> 	vfio_iommu_unpin_pages
>> 	vfio_iommu_register_notifier
>> 	vfio_iommu_unregister_notifier
>>
> 
> Hey Jike,
> 
> I had followed the same style as other members in this structure:
> 
> 	attach_group
> 	detach_group
> 

I mean the exposed APIs. For example, the name vfio_register_notifier() is
somewhat too generic to convey what the notifier is provided for.

--
Thanks,
Jike
Dong Jia Shi Nov. 1, 2016, 3:45 a.m. UTC | #11
* Alex Williamson <alex.williamson@redhat.com> [2016-10-29 08:03:01 -0600]:

> On Sat, 29 Oct 2016 16:07:05 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 10/29/2016 2:03 AM, Alex Williamson wrote:
> > > On Sat, 29 Oct 2016 01:32:35 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > >> On 10/28/2016 6:10 PM, Alex Williamson wrote:  
> > >>> On Fri, 28 Oct 2016 15:33:58 +0800
> > >>> Jike Song <jike.song@intel.com> wrote:
> > >>>     
> > ...
> > >>>>>  
> > >>>>> +/*
> > >>>>> + * This function finds pfn in domain->external_addr_space->pfn_list for given
> > >>>>> + * iova range. If pfn exist, notify pfn to registered notifier list. On
> > >>>>> + * receiving notifier callback, vendor driver should invalidate the mapping and
> > >>>>> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
> > >>>>> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
> > >>>>> + * searching for next vfio_pfn in rb tree, start search from first node again.
> > >>>>> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
> > >>>>> + * from rb tree and so in next search vfio_pfn would be same as previous
> > >>>>> + * vfio_pfn. In that case, exit from loop.
> > >>>>> + */
> > >>>>> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
> > >>>>> +				     struct vfio_iommu_type1_dma_unmap *unmap)
> > >>>>> +{
> > >>>>> +	struct vfio_domain *domain = iommu->external_domain;
> > >>>>> +	struct rb_node *n;
> > >>>>> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
> > >>>>> +
> > >>>>> +	do {
> > >>>>> +		prev_vpfn = vpfn;
> > >>>>> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
> > >>>>> +
> > >>>>> +		n = rb_first(&domain->external_addr_space->pfn_list);
> > >>>>> +
> > >>>>> +		for (; n; n = rb_next(n), vpfn = NULL) {
> > >>>>> +			vpfn = rb_entry(n, struct vfio_pfn, node);
> > >>>>> +
> > >>>>> +			if ((vpfn->iova >= unmap->iova) &&
> > >>>>> +			    (vpfn->iova < unmap->iova + unmap->size))
> > >>>>> +				break;
> > >>>>> +		}
> > >>>>> +
> > >>>>> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> > >>>>> +
> > >>>>> +		/* Notify any listeners about DMA_UNMAP */
> > >>>>> +		if (vpfn)
> > >>>>> +			blocking_notifier_call_chain(&iommu->notifier,
> > >>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> > >>>>> +						    &vpfn->pfn);      
> > >>>>
> > >>>> Hi Kirti, 
> > >>>>
> > >>>> The information carried by notifier is only a pfn.
> > >>>>
> > >>>> Since your pin/unpin interfaces design, it's the vendor driver who should
> > >>>> guarantee pin/unpin same times. To achieve that, the vendor driver must
> > >>>> cache it's iova->pfn mapping on its side, to avoid pinning a same page
> > >>>> for multiple times.
> > >>>>
> > >>>> With the notifier carrying only a pfn, to find the iova by this pfn,
> > >>>> the vendor driver must *also* keep a reverse-mapping. That's a bit
> > >>>> too much.
> > >>>>
> > >>>> Since the vendor could also suffer from IOMMU-compatible problem,
> > >>>> which means a local cache is always helpful, so I'd like to have the
> > >>>> iova carried to the notifier.
> > >>>>
> > >>>> What'd you say?    
> > >>>
> > >>> I agree, the pfn is not unique, multiple guest pfns (iovas) might be
> > >>> backed by the same host pfn.  DMA_UNMAP calls are based on iova, the
> > >>> notifier through to the vendor driver must be based on the same.    
> > >>
> > >> Host pfn should be unique, right?  
> > > 
> > > Let's say a user does a malloc of a single page and does 100 calls to
> > > MAP_DMA populating 100 pages of IOVA space all backed by the same
> > > malloc'd page.  This is valid, I have unit tests that do essentially
> > > this.  Those will all have the same pfn.  The user then does an
> > > UNMAP_DMA to a single one of those IOVA pages.  Did the user unmap
> > > everything matching that pfn?  Of course not, they only unmapped that
> > > one IOVA page.  There is no guarantee of a 1:1 mapping of pfn to IOVA.
> > > UNMAP_DMA works based on IOVA.  Invalidation broadcasts to the vendor
> > > driver MUST therefore also work based on IOVA.  This is not an academic
> > > problem, address space aliases exist in real VMs, imagine a virtual
> > > IOMMU.  Thanks,
> > >   
> > 
> > 
> > So struct vfio_iommu_type1_dma_unmap should be passed as argument to
> > notifier callback:
> > 
> >         if (unmapped && iommu->external_domain)
> > -               vfio_notifier_call_chain(iommu, unmap);
> > +               blocking_notifier_call_chain(&iommu->notifier,
> > +                                           VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> > +                                            unmap);
> > 
> > Then vendor driver should find pfns he has pinned from this range of
> > iovas, then invalidate and unpin pfns. Right?
> 
> That seems like a valid choice.  It's probably better than calling the
> notifier for each page of iova.  Thanks,
> 
> Alex
> 
Hi Kirti,

This version requires the *vendor driver* to call vfio_register_notifier
for an mdev device before any pinning operations. I guess all of the
vendor drivers may have similar code for notifier
registration/unregistration.

My question is: how about letting the mdev framework manage the
notifier registration/unregistration process?

We could add a notifier_fn_t callback to "struct parent_ops"; then the
mdev framework should make sure that the vendor driver has assigned a
value to this callback. The mdev core could initialize a notifier_block
for each parent driver with its callback, and register/unregister it
with vfio at the right time.
Kirti Wankhede Nov. 1, 2016, 7:47 a.m. UTC | #12
On 11/1/2016 9:15 AM, Dong Jia Shi wrote:
> * Alex Williamson <alex.williamson@redhat.com> [2016-10-29 08:03:01 -0600]:
> 
>> On Sat, 29 Oct 2016 16:07:05 +0530
>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>
>>> On 10/29/2016 2:03 AM, Alex Williamson wrote:
>>>> On Sat, 29 Oct 2016 01:32:35 +0530
>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>   
>>>>> On 10/28/2016 6:10 PM, Alex Williamson wrote:  
>>>>>> On Fri, 28 Oct 2016 15:33:58 +0800
>>>>>> Jike Song <jike.song@intel.com> wrote:
>>>>>>     
>>> ...
>>>>>>>>  
>>>>>>>> +/*
>>>>>>>> + * This function finds pfn in domain->external_addr_space->pfn_list for given
>>>>>>>> + * iova range. If pfn exist, notify pfn to registered notifier list. On
>>>>>>>> + * receiving notifier callback, vendor driver should invalidate the mapping and
>>>>>>>> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
>>>>>>>> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
>>>>>>>> + * searching for next vfio_pfn in rb tree, start search from first node again.
>>>>>>>> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
>>>>>>>> + * from rb tree and so in next search vfio_pfn would be same as previous
>>>>>>>> + * vfio_pfn. In that case, exit from loop.
>>>>>>>> + */
>>>>>>>> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
>>>>>>>> +				     struct vfio_iommu_type1_dma_unmap *unmap)
>>>>>>>> +{
>>>>>>>> +	struct vfio_domain *domain = iommu->external_domain;
>>>>>>>> +	struct rb_node *n;
>>>>>>>> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
>>>>>>>> +
>>>>>>>> +	do {
>>>>>>>> +		prev_vpfn = vpfn;
>>>>>>>> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
>>>>>>>> +
>>>>>>>> +		n = rb_first(&domain->external_addr_space->pfn_list);
>>>>>>>> +
>>>>>>>> +		for (; n; n = rb_next(n), vpfn = NULL) {
>>>>>>>> +			vpfn = rb_entry(n, struct vfio_pfn, node);
>>>>>>>> +
>>>>>>>> +			if ((vpfn->iova >= unmap->iova) &&
>>>>>>>> +			    (vpfn->iova < unmap->iova + unmap->size))
>>>>>>>> +				break;
>>>>>>>> +		}
>>>>>>>> +
>>>>>>>> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
>>>>>>>> +
>>>>>>>> +		/* Notify any listeners about DMA_UNMAP */
>>>>>>>> +		if (vpfn)
>>>>>>>> +			blocking_notifier_call_chain(&iommu->notifier,
>>>>>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>>>>>>> +						    &vpfn->pfn);      
>>>>>>>
>>>>>>> Hi Kirti, 
>>>>>>>
>>>>>>> The information carried by notifier is only a pfn.
>>>>>>>
>>>>>>> Since your pin/unpin interfaces design, it's the vendor driver who should
>>>>>>> guarantee pin/unpin same times. To achieve that, the vendor driver must
>>>>>>> cache it's iova->pfn mapping on its side, to avoid pinning a same page
>>>>>>> for multiple times.
>>>>>>>
>>>>>>> With the notifier carrying only a pfn, to find the iova by this pfn,
>>>>>>> the vendor driver must *also* keep a reverse-mapping. That's a bit
>>>>>>> too much.
>>>>>>>
>>>>>>> Since the vendor could also suffer from IOMMU-compatible problem,
>>>>>>> which means a local cache is always helpful, so I'd like to have the
>>>>>>> iova carried to the notifier.
>>>>>>>
>>>>>>> What'd you say?    
>>>>>>
>>>>>> I agree, the pfn is not unique, multiple guest pfns (iovas) might be
>>>>>> backed by the same host pfn.  DMA_UNMAP calls are based on iova, the
>>>>>> notifier through to the vendor driver must be based on the same.    
>>>>>
>>>>> Host pfn should be unique, right?  
>>>>
>>>> Let's say a user does a malloc of a single page and does 100 calls to
>>>> MAP_DMA populating 100 pages of IOVA space all backed by the same
>>>> malloc'd page.  This is valid, I have unit tests that do essentially
>>>> this.  Those will all have the same pfn.  The user then does an
>>>> UNMAP_DMA to a single one of those IOVA pages.  Did the user unmap
>>>> everything matching that pfn?  Of course not, they only unmapped that
>>>> one IOVA page.  There is no guarantee of a 1:1 mapping of pfn to IOVA.
>>>> UNMAP_DMA works based on IOVA.  Invalidation broadcasts to the vendor
>>>> driver MUST therefore also work based on IOVA.  This is not an academic
>>>> problem, address space aliases exist in real VMs, imagine a virtual
>>>> IOMMU.  Thanks,
>>>>   
>>>
>>>
>>> So struct vfio_iommu_type1_dma_unmap should be passed as argument to
>>> notifier callback:
>>>
>>>         if (unmapped && iommu->external_domain)
>>> -               vfio_notifier_call_chain(iommu, unmap);
>>> +               blocking_notifier_call_chain(&iommu->notifier,
>>> +                                           VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>> +                                            unmap);
>>>
>>> Then vendor driver should find pfns he has pinned from this range of
>>> iovas, then invalidate and unpin pfns. Right?
>>
>> That seems like a valid choice.  It's probably better than calling the
>> notifier for each page of iova.  Thanks,
>>
>> Alex
>>
> Hi Kirti,
> 
> This version requires the *vendor driver* call vfio_register_notifier
> for an mdev device before any pinning operations. I guess all of the
> vendor drivers may have some alike code for notifier
> registration/unregistration.
> 
> My question is, how about letting the mdev framework managing the
> notifier registration/unregistration process?
> 
> We could add a notifier_fn_t callback to "struct parent_ops", then the
> mdev framework should make sure that the vendor driver assigned a value
> to this callback. The mdev core could initiate a notifier_block for each
> parent driver with its callback, and register/unregister it to vfio in
> the right time.
> 

The mdev_core module is independent of VFIO so far, and it should
remain independent of the VFIO module.

It's a good suggestion to have a notifier callback in parent_ops, but we
shouldn't call vfio_register_notifier()/vfio_unregister_notifier() from
the mdev core module. The vfio_mdev module would take care of
registering/unregistering the notifier with vfio from its
vfio_mdev_open()/vfio_mdev_release() calls. That looks cleaner to me.

The notifier callback in parent_ops should be optional, since not all
vendor drivers pin/unpin pages — the sample mtty driver, for example.
The notifier would be registered with the vfio module only if the
notifier callback is provided by the vendor driver.

If this looks reasonable, I'll include this patch in the next version of
my patch series.

Thanks,
Kirti
Dong Jia Shi Nov. 1, 2016, 8:33 a.m. UTC | #13
* Kirti Wankhede <kwankhede@nvidia.com> [2016-11-01 13:17:19 +0530]:

> 
> 
> On 11/1/2016 9:15 AM, Dong Jia Shi wrote:
> > * Alex Williamson <alex.williamson@redhat.com> [2016-10-29 08:03:01 -0600]:
> > 
> >> On Sat, 29 Oct 2016 16:07:05 +0530
> >> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>
> >>> On 10/29/2016 2:03 AM, Alex Williamson wrote:
> >>>> On Sat, 29 Oct 2016 01:32:35 +0530
> >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>   
> >>>>> On 10/28/2016 6:10 PM, Alex Williamson wrote:  
> >>>>>> On Fri, 28 Oct 2016 15:33:58 +0800
> >>>>>> Jike Song <jike.song@intel.com> wrote:
> >>>>>>     
> >>> ...
> >>>>>>>>  
> >>>>>>>> +/*
> >>>>>>>> + * This function finds pfn in domain->external_addr_space->pfn_list for given
> >>>>>>>> + * iova range. If pfn exist, notify pfn to registered notifier list. On
> >>>>>>>> + * receiving notifier callback, vendor driver should invalidate the mapping and
> >>>>>>>> + * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
> >>>>>>>> + * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
> >>>>>>>> + * searching for next vfio_pfn in rb tree, start search from first node again.
> >>>>>>>> + * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
> >>>>>>>> + * from rb tree and so in next search vfio_pfn would be same as previous
> >>>>>>>> + * vfio_pfn. In that case, exit from loop.
> >>>>>>>> + */
> >>>>>>>> +static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
> >>>>>>>> +				     struct vfio_iommu_type1_dma_unmap *unmap)
> >>>>>>>> +{
> >>>>>>>> +	struct vfio_domain *domain = iommu->external_domain;
> >>>>>>>> +	struct rb_node *n;
> >>>>>>>> +	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
> >>>>>>>> +
> >>>>>>>> +	do {
> >>>>>>>> +		prev_vpfn = vpfn;
> >>>>>>>> +		mutex_lock(&domain->external_addr_space->pfn_list_lock);
> >>>>>>>> +
> >>>>>>>> +		n = rb_first(&domain->external_addr_space->pfn_list);
> >>>>>>>> +
> >>>>>>>> +		for (; n; n = rb_next(n), vpfn = NULL) {
> >>>>>>>> +			vpfn = rb_entry(n, struct vfio_pfn, node);
> >>>>>>>> +
> >>>>>>>> +			if ((vpfn->iova >= unmap->iova) &&
> >>>>>>>> +			    (vpfn->iova < unmap->iova + unmap->size))
> >>>>>>>> +				break;
> >>>>>>>> +		}
> >>>>>>>> +
> >>>>>>>> +		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> >>>>>>>> +
> >>>>>>>> +		/* Notify any listeners about DMA_UNMAP */
> >>>>>>>> +		if (vpfn)
> >>>>>>>> +			blocking_notifier_call_chain(&iommu->notifier,
> >>>>>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>>>>>>> +						    &vpfn->pfn);      
> >>>>>>>
> >>>>>>> Hi Kirti, 
> >>>>>>>
> >>>>>>> The information carried by notifier is only a pfn.
> >>>>>>>
> >>>>>>> Since your pin/unpin interfaces design, it's the vendor driver who should
> >>>>>>> guarantee pin/unpin same times. To achieve that, the vendor driver must
> >>>>>>> cache it's iova->pfn mapping on its side, to avoid pinning a same page
> >>>>>>> for multiple times.
> >>>>>>>
> >>>>>>> With the notifier carrying only a pfn, to find the iova by this pfn,
> >>>>>>> the vendor driver must *also* keep a reverse-mapping. That's a bit
> >>>>>>> too much.
> >>>>>>>
> >>>>>>> Since the vendor could also suffer from IOMMU-compatible problem,
> >>>>>>> which means a local cache is always helpful, so I'd like to have the
> >>>>>>> iova carried to the notifier.
> >>>>>>>
> >>>>>>> What'd you say?    
> >>>>>>
> >>>>>> I agree, the pfn is not unique, multiple guest pfns (iovas) might be
> >>>>>> backed by the same host pfn.  DMA_UNMAP calls are based on iova, the
> >>>>>> notifier through to the vendor driver must be based on the same.    
> >>>>>
> >>>>> Host pfn should be unique, right?  
> >>>>
> >>>> Let's say a user does a malloc of a single page and does 100 calls to
> >>>> MAP_DMA populating 100 pages of IOVA space all backed by the same
> >>>> malloc'd page.  This is valid, I have unit tests that do essentially
> >>>> this.  Those will all have the same pfn.  The user then does an
> >>>> UNMAP_DMA to a single one of those IOVA pages.  Did the user unmap
> >>>> everything matching that pfn?  Of course not, they only unmapped that
> >>>> one IOVA page.  There is no guarantee of a 1:1 mapping of pfn to IOVA.
> >>>> UNMAP_DMA works based on IOVA.  Invalidation broadcasts to the vendor
> >>>> driver MUST therefore also work based on IOVA.  This is not an academic
> >>>> problem, address space aliases exist in real VMs, imagine a virtual
> >>>> IOMMU.  Thanks,
> >>>>   
> >>>
> >>>
> >>> So struct vfio_iommu_type1_dma_unmap should be passed as argument to
> >>> notifier callback:
> >>>
> >>>         if (unmapped && iommu->external_domain)
> >>> -               vfio_notifier_call_chain(iommu, unmap);
> >>> +               blocking_notifier_call_chain(&iommu->notifier,
> >>> +                                           VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>> +                                            unmap);
> >>>
> >>> Then vendor driver should find pfns he has pinned from this range of
> >>> iovas, then invalidate and unpin pfns. Right?
> >>
> >> That seems like a valid choice.  It's probably better than calling the
> >> notifier for each page of iova.  Thanks,
> >>
> >> Alex
> >>
> > Hi Kirti,
> > 
> > This version requires the *vendor driver* call vfio_register_notifier
> > for an mdev device before any pinning operations. I guess all of the
> > vendor drivers may have some alike code for notifier
> > registration/unregistration.
> > 
> > My question is, how about letting the mdev framework managing the
> > notifier registration/unregistration process?
> > 
> > We could add a notifier_fn_t callback to "struct parent_ops", then the
> > mdev framework should make sure that the vendor driver assigned a value
> > to this callback. The mdev core could initiate a notifier_block for each
> > parent driver with its callback, and register/unregister it to vfio in
> > the right time.
> > 
> 
> Module mdev_core is independent of VFIO so far and it should be
> independent of VFIO module.
> 
> Its a good suggestion to have a notifier callback in parent_ops, but we
> shouldn't call vfio_register_notifier()/ vfio_unregister_notifier() from
> mdev core module. vfio_mdev module would take care to
> register/unregister notifier to vfio from its vfio_mdev_open()/
> vfio_mdev_release() call. That looks cleaner to me.
Nod.

> 
> Notifier callback in parent_ops should be optional since all vendor
> drivers might not pin/unpin pages, for example sample mtty driver.
> Notifier would be registered to vfio module only if the notifier
> callback is provided by vendor driver.
> 
> If this looks reasonable, I'll have this patch in my next version of
> patch series.
Sounds reasonable to me. Thanks,

> 
> Thanks,
> Kirti
>
diff mbox

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 28b50ca14c52..ff05ac6b1e90 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1891,6 +1891,79 @@  err_unpin_pages:
 }
 EXPORT_SYMBOL(vfio_unpin_pages);
 
+int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret;
+
+	if (!dev || !nb)
+		return -EINVAL;
+
+	group = vfio_group_get_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	ret = vfio_group_add_container_user(group);
+	if (ret)
+		goto err_register_nb;
+
+	container = group->container;
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->register_notifier))
+		ret = driver->ops->register_notifier(container->iommu_data, nb);
+	else
+		ret = -EINVAL;
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+err_register_nb:
+	vfio_group_put(group);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_register_notifier);
+
+int vfio_unregister_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret;
+
+	if (!dev || !nb)
+		return -EINVAL;
+
+	group = vfio_group_get_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	ret = vfio_group_add_container_user(group);
+	if (ret)
+		goto err_unregister_nb;
+
+	container = group->container;
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->unregister_notifier))
+		ret = driver->ops->unregister_notifier(container->iommu_data,
+						       nb);
+	else
+		ret = -EINVAL;
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+err_unregister_nb:
+	vfio_group_put(group);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_unregister_notifier);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5add11a147e1..a4bd331ac0fd 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -37,6 +37,7 @@ 
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
 #include <linux/mdev.h>
+#include <linux/notifier.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -59,6 +60,7 @@  struct vfio_iommu {
 	struct vfio_domain	*external_domain; /* domain for external user */
 	struct mutex		lock;
 	struct rb_root		dma_list;
+	struct blocking_notifier_head notifier;
 	bool			v2;
 	bool			nesting;
 };
@@ -549,7 +551,8 @@  static long vfio_iommu_type1_pin_pages(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-	if (!iommu->external_domain) {
+	/* Fail if notifier list is empty */
+	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
 		ret = -EINVAL;
 		goto pin_done;
 	}
@@ -768,6 +771,50 @@  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 	return bitmap;
 }
 
+/*
+ * This function finds pfn in domain->external_addr_space->pfn_list for given
+ * iova range. If pfn exist, notify pfn to registered notifier list. On
+ * receiving notifier callback, vendor driver should invalidate the mapping and
+ * call vfio_unpin_pages() to unpin this pfn. With that vfio_pfn for this pfn
+ * gets removed from rb tree of pfn_list. That re-arranges rb tree, so while
+ * searching for next vfio_pfn in rb tree, start search from first node again.
+ * If any vendor driver doesn't unpin that pfn, vfio_pfn would not get removed
+ * from rb tree and so in next search vfio_pfn would be same as previous
+ * vfio_pfn. In that case, exit from loop.
+ */
+static void vfio_notifier_call_chain(struct vfio_iommu *iommu,
+				     struct vfio_iommu_type1_dma_unmap *unmap)
+{
+	struct vfio_domain *domain = iommu->external_domain;
+	struct rb_node *n;
+	struct vfio_pfn *vpfn = NULL, *prev_vpfn;
+
+	do {
+		prev_vpfn = vpfn;
+		mutex_lock(&domain->external_addr_space->pfn_list_lock);
+
+		n = rb_first(&domain->external_addr_space->pfn_list);
+
+		for (; n; n = rb_next(n), vpfn = NULL) {
+			vpfn = rb_entry(n, struct vfio_pfn, node);
+
+			if ((vpfn->iova >= unmap->iova) &&
+			    (vpfn->iova < unmap->iova + unmap->size))
+				break;
+		}
+
+		mutex_unlock(&domain->external_addr_space->pfn_list_lock);
+
+		/* Notify any listeners about DMA_UNMAP */
+		if (vpfn)
+			blocking_notifier_call_chain(&iommu->notifier,
+						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
+						    &vpfn->pfn);
+	} while (vpfn && (prev_vpfn != vpfn));
+
+	WARN_ON(vpfn);
+}
+
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap)
 {
@@ -844,6 +891,9 @@  unlock:
 	/* Report how much was unmapped */
 	unmap->size = unmapped;
 
+	if (unmapped && iommu->external_domain)
+		vfio_notifier_call_chain(iommu, unmap);
+
 	return ret;
 }
 
@@ -1418,6 +1468,7 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	INIT_LIST_HEAD(&iommu->domain_list);
 	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
+	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
 
 	return iommu;
 }
@@ -1555,16 +1606,34 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 	return -ENOTTY;
 }
 
+static int vfio_iommu_type1_register_notifier(void *iommu_data,
+					      struct notifier_block *nb)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	return blocking_notifier_chain_register(&iommu->notifier, nb);
+}
+
+static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
+						struct notifier_block *nb)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
+}
+
 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
-	.name		= "vfio-iommu-type1",
-	.owner		= THIS_MODULE,
-	.open		= vfio_iommu_type1_open,
-	.release	= vfio_iommu_type1_release,
-	.ioctl		= vfio_iommu_type1_ioctl,
-	.attach_group	= vfio_iommu_type1_attach_group,
-	.detach_group	= vfio_iommu_type1_detach_group,
-	.pin_pages	= vfio_iommu_type1_pin_pages,
-	.unpin_pages	= vfio_iommu_type1_unpin_pages,
+	.name			= "vfio-iommu-type1",
+	.owner			= THIS_MODULE,
+	.open			= vfio_iommu_type1_open,
+	.release		= vfio_iommu_type1_release,
+	.ioctl			= vfio_iommu_type1_ioctl,
+	.attach_group		= vfio_iommu_type1_attach_group,
+	.detach_group		= vfio_iommu_type1_detach_group,
+	.pin_pages		= vfio_iommu_type1_pin_pages,
+	.unpin_pages		= vfio_iommu_type1_unpin_pages,
+	.register_notifier	= vfio_iommu_type1_register_notifier,
+	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
 };
 
 static int __init vfio_iommu_type1_init(void)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0609a2052846..4c91ce8bfaeb 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -80,6 +80,10 @@  struct vfio_iommu_driver_ops {
 				     unsigned long *phys_pfn);
 	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
 				       long npage);
+	int		(*register_notifier)(void *iommu_data,
+					     struct notifier_block *nb);
+	int		(*unregister_notifier)(void *iommu_data,
+					       struct notifier_block *nb);
 };
 
 extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
@@ -137,6 +141,13 @@  extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
 extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
 			     long npage);
 
+#define VFIO_IOMMU_NOTIFY_DMA_UNMAP	1
+
+extern int vfio_register_notifier(struct device *dev,
+				  struct notifier_block *nb);
+
+extern int vfio_unregister_notifier(struct device *dev,
+				    struct notifier_block *nb);
 /*
  * IRQfd - generic
  */