diff mbox

[v12,10/22] vfio iommu type1: Add support for mediated devices

Message ID 1479138156-28905-11-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show

Commit Message

Kirti Wankhede Nov. 14, 2016, 3:42 p.m. UTC
VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
Mediated device only uses IOMMU APIs, the underlying hardware can be
managed by an IOMMU domain.

Aim of this change is:
- To use most of the code of TYPE1 IOMMU driver for mediated devices
- To support direct assigned device and mediated device in single module

This change adds pin and unpin support for mediated device to TYPE1 IOMMU
backend module. More details:
- Domain for external user is tracked separately in vfio_iommu structure.
  It is allocated when group for first mdev device is attached.
- Pages pinned for external domain are tracked in each vfio_dma structure
  for that iova range.
- Page tracking rb-tree in vfio_dma keeps <iova, pfn, ref_count>. Key of
  rb-tree is iova, but it actually aims to track pfns.
- On external pin request for an iova, page is pinned only once, if iova
  is already pinned and tracked, ref_count is incremented.
- External unin request unpins pages only when ref_count is 0.
- Pinned pages list is used to verify unpinning request and to unpin
  remaining pages while detaching the group for that device.
- Page accounting is updated to account in its address space where the
  pages are pinned/unpinned, i.e dma->task
-  Accouting for mdev device is only done if there is no iommu capable
  domain in the container. When there is a direct device assigned to the
  container and that domain is iommu capable, all pages are already pinned
  during DMA_MAP.
- Page accouting is updated on hot plug and unplug mdev device and pass
  through device.

Tested by assigning below combinations of devices to a single VM:
- GPU pass through only
- vGPU device only
- One GPU pass through and one vGPU device
- Linux VM hot plug and unplug vGPU device while GPU pass through device
  exist
- Linux VM hot plug and unplug GPU pass through device while vGPU device
  exist

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
---
 drivers/vfio/vfio_iommu_type1.c | 587 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 526 insertions(+), 61 deletions(-)

Comments

Alex Williamson Nov. 14, 2016, 11:25 p.m. UTC | #1
On Mon, 14 Nov 2016 21:12:24 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - Domain for external user is tracked separately in vfio_iommu structure.
>   It is allocated when group for first mdev device is attached.
> - Pages pinned for external domain are tracked in each vfio_dma structure
>   for that iova range.
> - Page tracking rb-tree in vfio_dma keeps <iova, pfn, ref_count>. Key of
>   rb-tree is iova, but it actually aims to track pfns.
> - On external pin request for an iova, page is pinned only once, if iova
>   is already pinned and tracked, ref_count is incremented.

This is referring only to the external (ie. pfn_list) tracking only,
correct?  In general, a page can be pinned up to twice per iova
referencing it, once for an iommu mapped domain and again in the
pfn_list, right?

> - External unin request unpins pages only when ref_count is 0.
             ^^^^ unpin

> - Pinned pages list is used to verify unpinning request and to unpin
>   remaining pages while detaching the group for that device.
> - Page accounting is updated to account in its address space where the
>   pages are pinned/unpinned, i.e dma->task
> -  Accouting for mdev device is only done if there is no iommu capable
>   domain in the container. When there is a direct device assigned to the
>   container and that domain is iommu capable, all pages are already pinned
>   during DMA_MAP.
> - Page accouting is updated on hot plug and unplug mdev device and pass
>   through device.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>   exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>   exist
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
>  drivers/vfio/vfio_iommu_type1.c | 587 +++++++++++++++++++++++++++++++++++-----
>  1 file changed, 526 insertions(+), 61 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 50aca95cf61e..2697d874dd35 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -37,6 +37,7 @@
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
>  #include <linux/pid_namespace.h>
> +#include <linux/mdev.h>
>  
>  #define DRIVER_VERSION  "0.2"
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> @@ -56,6 +57,7 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	bool			v2;
> @@ -76,7 +78,9 @@ struct vfio_dma {
>  	unsigned long		vaddr;		/* Process virtual addr */
>  	size_t			size;		/* Map size (bytes) */
>  	int			prot;		/* IOMMU_READ/WRITE */
> +	bool			iommu_mapped;
>  	struct task_struct	*task;
> +	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>  };
>  
>  struct vfio_group {
> @@ -85,6 +89,21 @@ struct vfio_group {
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +	struct rb_node		node;
> +	dma_addr_t		iova;		/* Device address */
> +	unsigned long		pfn;		/* Host pfn */
> +	atomic_t		ref_count;
> +};
> +
> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> +					(!list_empty(&iommu->domain_list))
> +
> +static int put_pfn(unsigned long pfn, int prot);
> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -132,6 +151,97 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +/*
> + * Helper Functions for host iova-pfn list
> + */
> +static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
> +{
> +	struct vfio_pfn *vpfn;
> +	struct rb_node *node = dma->pfn_list.rb_node;
> +
> +	while (node) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if (iova < vpfn->iova)
> +			node = node->rb_left;
> +		else if (iova > vpfn->iova)
> +			node = node->rb_right;
> +		else
> +			return vpfn;
> +	}
> +	return NULL;
> +}
> +
> +static void vfio_link_pfn(struct vfio_dma *dma,
> +			  struct vfio_pfn *new)
> +{
> +	struct rb_node **link, *parent = NULL;
> +	struct vfio_pfn *vpfn;
> +
> +	link = &dma->pfn_list.rb_node;
> +	while (*link) {
> +		parent = *link;
> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +		if (new->iova < vpfn->iova)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &dma->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
> +{
> +	rb_erase(&old->node, &dma->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
> +				unsigned long pfn)
> +{
> +	struct vfio_pfn *vpfn;
> +
> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> +	if (!vpfn)
> +		return -ENOMEM;
> +
> +	vpfn->iova = iova;
> +	vpfn->pfn = pfn;
> +	atomic_set(&vpfn->ref_count, 1);
> +	vfio_link_pfn(dma, vpfn);
> +	return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
> +				      struct vfio_pfn *vpfn)
> +{
> +	vfio_unlink_pfn(dma, vpfn);
> +	kfree(vpfn);
> +}
> +
> +static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> +					       unsigned long iova)
> +{
> +	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
> +
> +	if (vpfn)
> +		atomic_inc(&vpfn->ref_count);
> +	return vpfn;
> +}
> +
> +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> +{
> +	int ret = 0;
> +
> +	if (atomic_dec_and_test(&vpfn->ref_count)) {
> +		ret = put_pfn(vpfn->pfn, dma->prot);
> +		vfio_remove_from_pfn_list(dma, vpfn);
> +	}
> +	return ret;
> +}
> +
>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -270,7 +380,6 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>  	}
>  
>  	up_read(&mm->mmap_sem);
> -
>  	return ret;
>  }
>  
> @@ -280,28 +389,35 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>   * first page and all consecutive pages with the same locking.
>   */
>  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> -				  long npage, int prot, unsigned long *pfn_base)
> +				  long npage, unsigned long *pfn_base)
>  {
>  	unsigned long limit;
>  	bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
>  				   CAP_IPC_LOCK);
>  	struct mm_struct *mm;
> -	long ret, i;
> +	long ret, i, lock_acct = 0;
>  	bool rsvd;
> +	struct vfio_pfn *vpfn;
> +	dma_addr_t iova;
>  
>  	mm = get_task_mm(dma->task);
>  	if (!mm)
>  		return -ENODEV;
>  
> -	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
> +	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
>  	if (ret)
>  		goto pin_pg_remote_exit;
>  
> +	iova = vaddr - dma->vaddr + dma->iova;
> +	vpfn = vfio_find_vpfn(dma, iova);
> +	if (!vpfn)
> +		lock_acct = 1;
> +
>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
>  	limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  
> -	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> -		put_pfn(*pfn_base, prot);
> +	if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
> +		put_pfn(*pfn_base, dma->prot);
>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>  			limit << PAGE_SHIFT);
>  		ret = -ENOMEM;
> @@ -310,7 +426,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  
>  	if (unlikely(disable_hugepages)) {
>  		if (!rsvd)
> -			vfio_lock_acct(dma->task, 1);
> +			vfio_lock_acct(dma->task, lock_acct);
>  		ret = 1;
>  		goto pin_pg_remote_exit;
>  	}
> @@ -319,18 +435,24 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>  		unsigned long pfn = 0;
>  
> -		ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
> +		ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn);
>  		if (ret)
>  			break;
>  
>  		if (pfn != *pfn_base + i ||
>  		    rsvd != is_invalid_reserved_pfn(pfn)) {
> -			put_pfn(pfn, prot);
> +			put_pfn(pfn, dma->prot);
>  			break;
>  		}
>  
> -		if (!rsvd && !lock_cap && mm->locked_vm + i + 1 > limit) {
> -			put_pfn(pfn, prot);
> +		iova = vaddr - dma->vaddr + dma->iova;


nit, this could be incremented in the for loop just like vaddr, we
don't need to create it from scratch (iova += PAGE_SIZE).


> +		vpfn = vfio_find_vpfn(dma, iova);
> +		if (!vpfn)
> +			lock_acct++;
> +
> +		if (!rsvd && !lock_cap &&
> +		    mm->locked_vm + lock_acct + 1 > limit) {


lock_acct was incremented just above, why is this lock_acct + 1?  I
think we're off by one here (ie. remove the +1)?

> +			put_pfn(pfn, dma->prot);
>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  				__func__, limit << PAGE_SHIFT);
>  			break;
> @@ -338,7 +460,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  	}
>  
>  	if (!rsvd)
> -		vfio_lock_acct(dma->task, i);
> +		vfio_lock_acct(dma->task, lock_acct);
>  	ret = i;
>  
>  pin_pg_remote_exit:
> @@ -346,14 +468,78 @@ pin_pg_remote_exit:
>  	return ret;
>  }
>  
> -static long vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
> -				    long npage, int prot, bool do_accounting)
> +static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> +				    unsigned long pfn, long npage,
> +				    bool do_accounting)
>  {
> -	unsigned long unlocked = 0;
> +	long unlocked = 0, locked = 0;
>  	long i;
>  
> -	for (i = 0; i < npage; i++)
> -		unlocked += put_pfn(pfn++, prot);
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *vpfn;
> +
> +		unlocked += put_pfn(pfn++, dma->prot);
> +
> +		vpfn = vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT));
> +		if (vpfn)
> +			locked++;

This isn't taking into account reserved pages (ex. device mmaps).  In
the pinning path above we skip accounting of reserved pages, put_pfn
also only increments for non-reserved pages, but vfio_find_vpfn()
doesn't care, so it's possible that (locked - unlocked) could result in
a positive value.  Maybe something like:

if (put_pfn(...)) {
	unlocked++;
	if (vfio_find_vpfn(...))
		locked++;
}

> +	}
> +
> +	if (do_accounting)
> +		vfio_lock_acct(dma->task, locked - unlocked);
> +
> +	return unlocked;
> +}
> +
> +static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> +				  unsigned long *pfn_base, bool do_accounting)
> +{
> +	unsigned long limit;
> +	bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
> +				   CAP_IPC_LOCK);
> +	struct mm_struct *mm;
> +	int ret;
> +	bool rsvd;
> +
> +	mm = get_task_mm(dma->task);
> +	if (!mm)
> +		return -ENODEV;
> +
> +	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
> +	if (ret)
> +		goto pin_page_exit;
> +
> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
> +	limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> +		put_pfn(*pfn_base, dma->prot);
> +		pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
> +			__func__, dma->task->comm, task_pid_nr(dma->task),
> +			limit << PAGE_SHIFT);
> +		ret = -ENOMEM;
> +		goto pin_page_exit;
> +	}
> +
> +	if (!rsvd && do_accounting)
> +		vfio_lock_acct(dma->task, 1);
> +	ret = 1;
> +
> +pin_page_exit:
> +	mmput(mm);
> +	return ret;
> +}
> +
> +static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> +				    bool do_accounting)
> +{
> +	int unlocked;
> +	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
> +
> +	if (!vpfn)
> +		return 0;
> +
> +	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
>  
>  	if (do_accounting)
>  		vfio_lock_acct(dma->task, -unlocked);
> @@ -361,14 +547,148 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
>  	return unlocked;
>  }
>  
> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +static int vfio_iommu_type1_pin_pages(void *iommu_data,
> +				      unsigned long *user_pfn,
> +				      int npage, int prot,
> +				      unsigned long *phys_pfn)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	int i, j, ret;
> +	unsigned long remote_vaddr;
> +	unsigned long *pfn = phys_pfn;

nit, why do we have this variable rather than using phys_pfn directly?
Maybe before we had unwind support we incremented this rather than
indexing it?

> +	struct vfio_dma *dma;
> +	bool do_accounting;
> +
> +	if (!iommu || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	/* Supported for v2 version only */
> +	if (!iommu->v2)
> +		return -EACCES;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->external_domain) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	/*
> +	 * If iommu capable domain exist in the container then all pages are
> +	 * already pinned and accounted. Accouting should be done if there is no
> +	 * iommu capable domain in the container.
> +	 */
> +	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> +
> +	for (i = 0; i < npage; i++) {
> +		dma_addr_t iova;
> +		struct vfio_pfn *vpfn;
> +
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		if (!dma) {
> +			ret = -EINVAL;
> +			goto pin_unwind;
> +		}
> +
> +		if ((dma->prot & prot) != prot) {
> +			pr_info("%s: dma->prot: 0x%x prot: 0x%x\n",
> +				__func__, dma->prot, prot);
> +			ret = -EPERM;
> +			goto pin_unwind;
> +		}
> +
> +		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> +		if (vpfn) {
> +			pfn[i] = vpfn->pfn;
> +			continue;
> +		}
> +
> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> +		ret = vfio_pin_page_external(dma, remote_vaddr, &pfn[i],
> +					     do_accounting);
> +		if (ret <= 0) {
> +			WARN_ON(!ret);
> +			goto pin_unwind;
> +		}
> +
> +		ret = vfio_add_to_pfn_list(dma, iova, pfn[i]);
> +		if (ret) {
> +			vfio_unpin_page_external(dma, iova, do_accounting);
> +			goto pin_unwind;
> +		}
> +	}
> +
> +	ret = i;
> +	goto pin_done;
> +
> +pin_unwind:
> +	pfn[i] = 0;
> +	for (j = 0; j < i; j++) {
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[j] << PAGE_SHIFT;
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		vfio_unpin_page_external(dma, iova, do_accounting);
> +		pfn[j] = 0;
> +	}
> +pin_done:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> +					unsigned long *user_pfn,
> +					int npage)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	bool do_accounting;
> +	int unlocked = 0, i;
> +
> +	if (!iommu || !user_pfn)
> +		return -EINVAL;
> +
> +	/* Supported for v2 version only */
> +	if (!iommu->v2)
> +		return -EACCES;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->external_domain) {
> +		mutex_unlock(&iommu->lock);
> +		return -EINVAL;
> +	}
> +
> +	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_dma *dma;
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		if (!dma)
> +			goto unpin_exit;
> +		unlocked += vfio_unpin_page_external(dma, iova, do_accounting);
> +	}
> +
> +unpin_exit:
> +	mutex_unlock(&iommu->lock);
> +	return unlocked;

This is not returning what it's supposed to.  unlocked is going to be
less than or equal to the number of pages unpinned.  We don't need to
track unlocked, I think we're just tracking where we are in the unpin
array, whether it was partial or complete.  I think we want:

return i > npage ? npage : i;

Or maybe we can make it more obvious if there's an error on the first
unpin entry:

return i > npage ? npage : (i > 0 ? i : -EINVAL);

> +}
> +
> +static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +			     bool do_accounting)
>  {
>  	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
>  	struct vfio_domain *domain, *d;
>  	long unlocked = 0;
>  
>  	if (!dma->size)
> -		return;
> +		return 0;
> +
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		return 0;
> +
>  	/*
>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>  	 * need a much more complicated tracking system.  Unfortunately that
> @@ -410,20 +730,26 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		if (WARN_ON(!unmapped))
>  			break;
>  
> -		unlocked += vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
> +		unlocked += vfio_unpin_pages_remote(dma, iova,
> +						    phys >> PAGE_SHIFT,
>  						    unmapped >> PAGE_SHIFT,
> -						    dma->prot, false);
> +						    false);
>  		iova += unmapped;
>  
>  		cond_resched();
>  	}
>  
> -	vfio_lock_acct(dma->task, -unlocked);
> +	dma->iommu_mapped = false;
> +	if (do_accounting) {
> +		vfio_lock_acct(dma->task, -unlocked);
> +		return 0;
> +	}
> +	return unlocked;
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  {
> -	vfio_unmap_unpin(iommu, dma);
> +	vfio_unmap_unpin(iommu, dma, true);
>  	vfio_unlink_dma(iommu, dma);
>  	put_task_struct(dma->task);
>  	kfree(dma);
> @@ -606,8 +932,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  	while (size) {
>  		/* Pin a contiguous chunk of memory */
>  		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
> -					      size >> PAGE_SHIFT, dma->prot,
> -					      &pfn);
> +					      size >> PAGE_SHIFT, &pfn);
>  		if (npage <= 0) {
>  			WARN_ON(!npage);
>  			ret = (int)npage;
> @@ -618,8 +943,8 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
>  				     dma->prot);
>  		if (ret) {
> -			vfio_unpin_pages_remote(dma, pfn, npage,
> -						 dma->prot, true);
> +			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
> +						npage, true);
>  			break;
>  		}
>  
> @@ -627,6 +952,8 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  		dma->size += npage << PAGE_SHIFT;
>  	}
>  
> +	dma->iommu_mapped = true;
> +
>  	if (ret)
>  		vfio_remove_dma(iommu, dma);
>  
> @@ -682,11 +1009,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	dma->prot = prot;
>  	get_task_struct(current);
>  	dma->task = current;
> +	dma->pfn_list = RB_ROOT;
>  
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
>  
> -	ret = vfio_pin_map_dma(iommu, dma, size);
> +	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		dma->size = size;
> +	else
> +		ret = vfio_pin_map_dma(iommu, dma, size);
>  do_map_err:
>  	mutex_unlock(&iommu->lock);
>  	return ret;
> @@ -715,10 +1047,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
>  	n = rb_first(&iommu->dma_list);
>  
> -	/* If there's not a domain, there better not be any mappings */
> -	if (WARN_ON(n && !d))
> -		return -EINVAL;
> -
>  	for (; n; n = rb_next(n)) {
>  		struct vfio_dma *dma;
>  		dma_addr_t iova;
> @@ -727,21 +1055,49 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  		iova = dma->iova;
>  
>  		while (iova < dma->iova + dma->size) {
> -			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
> +			phys_addr_t phys;
>  			size_t size;
>  
> -			if (WARN_ON(!phys)) {
> -				iova += PAGE_SIZE;
> -				continue;
> +			if (dma->iommu_mapped) {
> +				phys_addr_t p;
> +				dma_addr_t i;
> +
> +				phys = iommu_iova_to_phys(d->domain, iova);
> +
> +				if (WARN_ON(!phys)) {
> +					iova += PAGE_SIZE;
> +					continue;
> +				}
> +
> +				size = PAGE_SIZE;
> +				p = phys + size;
> +				i = iova + size;
> +				while (i < dma->iova + dma->size &&
> +				       p == iommu_iova_to_phys(d->domain, i)) {
> +					size += PAGE_SIZE;
> +					p += PAGE_SIZE;
> +					i += PAGE_SIZE;
> +				}
> +			} else {
> +				unsigned long pfn;
> +				unsigned long vaddr = dma->vaddr +
> +						     (iova - dma->iova);
> +				size_t n = dma->iova + dma->size - iova;
> +				long npage;
> +
> +				npage = vfio_pin_pages_remote(dma, vaddr,
> +							      n >> PAGE_SHIFT,
> +							      &pfn);
> +				if (npage <= 0) {
> +					WARN_ON(!npage);
> +					ret = (int)npage;
> +					return ret;
> +				}
> +
> +				phys = pfn << PAGE_SHIFT;
> +				size = npage << PAGE_SHIFT;
>  			}
>  
> -			size = PAGE_SIZE;
> -
> -			while (iova + size < dma->iova + dma->size &&
> -			       phys + size == iommu_iova_to_phys(d->domain,
> -								 iova + size))
> -				size += PAGE_SIZE;
> -
>  			ret = iommu_map(domain->domain, iova, phys,
>  					size, dma->prot | domain->prot);
>  			if (ret)
> @@ -749,8 +1105,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  
>  			iova += size;
>  		}
> +		dma->iommu_mapped = true;
>  	}
> -
>  	return 0;
>  }
>  
> @@ -806,7 +1162,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_group *group;
>  	struct vfio_domain *domain, *d;
> -	struct bus_type *bus = NULL;
> +	struct bus_type *bus = NULL, *mdev_bus;
>  	int ret;
>  
>  	mutex_lock(&iommu->lock);
> @@ -818,6 +1174,13 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		}
>  	}
>  
> +	if (iommu->external_domain) {
> +		if (find_iommu_group(iommu->external_domain, iommu_group)) {
> +			mutex_unlock(&iommu->lock);
> +			return -EINVAL;
> +		}
> +	}
> +
>  	group = kzalloc(sizeof(*group), GFP_KERNEL);
>  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
>  	if (!group || !domain) {
> @@ -832,6 +1195,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
>  
> +	mdev_bus = symbol_get(mdev_bus_type);
> +
> +	if (mdev_bus) {
> +		if ((bus == mdev_bus) && !iommu_present(bus)) {
> +			symbol_put(mdev_bus_type);
> +			if (!iommu->external_domain) {
> +				INIT_LIST_HEAD(&domain->group_list);
> +				iommu->external_domain = domain;
> +			} else
> +				kfree(domain);
> +
> +			list_add(&group->next,
> +				 &iommu->external_domain->group_list);
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		}
> +		symbol_put(mdev_bus_type);
> +	}
> +
>  	domain->domain = iommu_domain_alloc(bus);
>  	if (!domain->domain) {
>  		ret = -EIO;
> @@ -922,6 +1304,46 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>  }
>  
> +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> +{
> +	struct rb_node *n, *p;
> +
> +	n = rb_first(&iommu->dma_list);
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma;
> +		long locked = 0, unlocked = 0;
> +
> +		dma = rb_entry(n, struct vfio_dma, node);
> +		unlocked += vfio_unmap_unpin(iommu, dma, false);
> +		p = rb_first(&dma->pfn_list);
> +		for (; p; p = rb_next(p))
> +			locked++;

We don't know that these weren't reserved pages.  If the vendor driver
was pinning peer-to-peer ranges vfio_unmap_unpin() might have returned
0 yet we're assuming each is locked RAM, so our accounting can go
upside down.

> +		vfio_lock_acct(dma->task, locked - unlocked);
> +	}
> +}
> +
> +static void vfio_external_unpin_all(struct vfio_iommu *iommu,
> +				    bool do_accounting)
> +{
> +	struct rb_node *n, *p;
> +
> +	n = rb_first(&iommu->dma_list);
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma;
> +
> +		dma = rb_entry(n, struct vfio_dma, node);
> +		while ((p = rb_first(&dma->pfn_list))) {
> +			int unlocked;
> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> +							 node);
> +
> +			unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
> +			if (do_accounting)
> +				vfio_lock_acct(dma->task, -unlocked);

nit, we could batch these further, only updating accounting once per
vfio_dma, or once entirely.

> +		}
> +	}
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -931,6 +1353,26 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> +	if (iommu->external_domain) {
> +		domain = iommu->external_domain;
> +		group = find_iommu_group(domain, iommu_group);
> +		if (group) {
> +			list_del(&group->next);
> +			kfree(group);
> +
> +			if (list_empty(&domain->group_list)) {
> +				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +					vfio_external_unpin_all(iommu, true);
> +					vfio_iommu_unmap_unpin_all(iommu);
> +				} else
> +					vfio_external_unpin_all(iommu, false);
> +				kfree(domain);
> +				iommu->external_domain = NULL;
> +			}
> +			goto detach_group_done;
> +		}
> +	}
> +
>  	list_for_each_entry(domain, &iommu->domain_list, next) {
>  		group = find_iommu_group(domain, iommu_group);
>  		if (!group)
> @@ -940,21 +1382,27 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  		list_del(&group->next);
>  		kfree(group);
>  		/*
> -		 * Group ownership provides privilege, if the group
> -		 * list is empty, the domain goes away.  If it's the
> -		 * last domain, then all the mappings go away too.
> +		 * Group ownership provides privilege, if the group list is
> +		 * empty, the domain goes away. If it's the last domain with
> +		 * iommu and external domain doesn't exist, then all the
> +		 * mappings go away too. If it's the last domain with iommu and
> +		 * external domain exist, update accounting
>  		 */
>  		if (list_empty(&domain->group_list)) {
> -			if (list_is_singular(&iommu->domain_list))
> -				vfio_iommu_unmap_unpin_all(iommu);
> +			if (list_is_singular(&iommu->domain_list)) {
> +				if (!iommu->external_domain)
> +					vfio_iommu_unmap_unpin_all(iommu);
> +				else
> +					vfio_iommu_unmap_unpin_reaccount(iommu);
> +			}
>  			iommu_domain_free(domain->domain);
>  			list_del(&domain->next);
>  			kfree(domain);
>  		}
> -		goto done;
> +		break;
>  	}
>  
> -done:
> +detach_group_done:
>  	mutex_unlock(&iommu->lock);
>  }
>  
> @@ -986,27 +1434,42 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	return iommu;
>  }
>  
> +static void vfio_release_domain(struct vfio_domain *domain, bool external)
> +{
> +	struct vfio_group *group, *group_tmp;
> +
> +	list_for_each_entry_safe(group, group_tmp,
> +				 &domain->group_list, next) {
> +		if (!external)
> +			iommu_detach_group(domain->domain, group->iommu_group);
> +		list_del(&group->next);
> +		kfree(group);
> +	}
> +
> +	if (!external)
> +		iommu_domain_free(domain->domain);
> +}
> +
>  static void vfio_iommu_type1_release(void *iommu_data)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_domain *domain, *domain_tmp;
> -	struct vfio_group *group, *group_tmp;
> +
> +	if (iommu->external_domain) {
> +		vfio_release_domain(iommu->external_domain, true);
> +		vfio_external_unpin_all(iommu, false);
> +		kfree(iommu->external_domain);
> +		iommu->external_domain = NULL;
> +	}
>  
>  	vfio_iommu_unmap_unpin_all(iommu);
>  
>  	list_for_each_entry_safe(domain, domain_tmp,
>  				 &iommu->domain_list, next) {
> -		list_for_each_entry_safe(group, group_tmp,
> -					 &domain->group_list, next) {
> -			iommu_detach_group(domain->domain, group->iommu_group);
> -			list_del(&group->next);
> -			kfree(group);
> -		}
> -		iommu_domain_free(domain->domain);
> +		vfio_release_domain(domain, false);
>  		list_del(&domain->next);
>  		kfree(domain);
>  	}
> -
>  	kfree(iommu);
>  }
>  
> @@ -1110,6 +1573,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
>  	.ioctl		= vfio_iommu_type1_ioctl,
>  	.attach_group	= vfio_iommu_type1_attach_group,
>  	.detach_group	= vfio_iommu_type1_detach_group,
> +	.pin_pages	= vfio_iommu_type1_pin_pages,
> +	.unpin_pages	= vfio_iommu_type1_unpin_pages,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)
Kirti Wankhede Nov. 15, 2016, 3:09 p.m. UTC | #2
On 11/15/2016 4:55 AM, Alex Williamson wrote:
> On Mon, 14 Nov 2016 21:12:24 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>> managed by an IOMMU domain.
>>
>> Aim of this change is:
>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>> - To support direct assigned device and mediated device in single module
>>
>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>> backend module. More details:
>> - Domain for external user is tracked separately in vfio_iommu structure.
>>   It is allocated when group for first mdev device is attached.
>> - Pages pinned for external domain are tracked in each vfio_dma structure
>>   for that iova range.
>> - Page tracking rb-tree in vfio_dma keeps <iova, pfn, ref_count>. Key of
>>   rb-tree is iova, but it actually aims to track pfns.
>> - On external pin request for an iova, page is pinned only once, if iova
>>   is already pinned and tracked, ref_count is incremented.
> 
> This is referring only to the external (ie. pfn_list) tracking only,
> correct?  In general, a page can be pinned up to twice per iova
> referencing it, once for an iommu mapped domain and again in the
> pfn_list, right?
> 

That's right.

...
  		}
>>  
>> -		if (!rsvd && !lock_cap && mm->locked_vm + i + 1 > limit) {
>> -			put_pfn(pfn, prot);
>> +		iova = vaddr - dma->vaddr + dma->iova;
> 
> 
> nit, this could be incremented in the for loop just like vaddr, we
> don't need to create it from scratch (iova += PAGE_SIZE).
> 

Ok.

> 
>> +		vpfn = vfio_find_vpfn(dma, iova);
>> +		if (!vpfn)
>> +			lock_acct++;
>> +
>> +		if (!rsvd && !lock_cap &&
>> +		    mm->locked_vm + lock_acct + 1 > limit) {
> 
> 
> lock_acct was incremented just above, why is this lock_acct + 1?  I
> think we're off by one here (ie. remove the +1)?
> 

Moved this vfio_find_vpfn() check after this check.


...


>> -static long vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
>> -				    long npage, int prot, bool do_accounting)
>> +static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
>> +				    unsigned long pfn, long npage,
>> +				    bool do_accounting)
>>  {
>> -	unsigned long unlocked = 0;
>> +	long unlocked = 0, locked = 0;
>>  	long i;
>>  
>> -	for (i = 0; i < npage; i++)
>> -		unlocked += put_pfn(pfn++, prot);
>> +	for (i = 0; i < npage; i++) {
>> +		struct vfio_pfn *vpfn;
>> +
>> +		unlocked += put_pfn(pfn++, dma->prot);
>> +
>> +		vpfn = vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT));
>> +		if (vpfn)
>> +			locked++;
> 
> This isn't taking into account reserved pages (ex. device mmaps).  In
> the pinning path above we skip accounting of reserved pages, put_pfn
> also only increments for non-reserved pages, but vfio_find_vpfn()
> doesn't care, so it's possible that (locked - unlocked) could result in
> a positive value.  Maybe something like:
> 
> if (put_pfn(...)) {
> 	unlocked++;
> 	if (vfio_find_vpfn(...))
> 		locked++;
> }
> 

Updating.

...
>>  
>> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>> +static int vfio_iommu_type1_pin_pages(void *iommu_data,
>> +				      unsigned long *user_pfn,
>> +				      int npage, int prot,
>> +				      unsigned long *phys_pfn)
>> +{
>> +	struct vfio_iommu *iommu = iommu_data;
>> +	int i, j, ret;
>> +	unsigned long remote_vaddr;
>> +	unsigned long *pfn = phys_pfn;
> 
> nit, why do we have this variable rather than using phys_pfn directly?
> Maybe before we had unwind support we incremented this rather than
> indexing it?
> 

Updating.

...

>> +		iova = user_pfn[i] << PAGE_SHIFT;
>> +		dma = vfio_find_dma(iommu, iova, 0);
>> +		if (!dma)
>> +			goto unpin_exit;
>> +		unlocked += vfio_unpin_page_external(dma, iova, do_accounting);
>> +	}
>> +
>> +unpin_exit:
>> +	mutex_unlock(&iommu->lock);
>> +	return unlocked;
> 
> This is not returning what it's supposed to.  unlocked is going to be
> less than or equal to the number of pages unpinned.  We don't need to
> track unlocked, I think we're just tracking where we are in the unpin
> array, whether it was partial or complete.  I think we want:
> 
> return i > npage ? npage : i;
> 
> Or maybe we can make it more obvious if there's an error on the first
> unpin entry:
> 
> return i > npage ? npage : (i > 0 ? i : -EINVAL);
> 

Thanks for pointing that out. Updating.

...
>> +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
>> +{
>> +	struct rb_node *n, *p;
>> +
>> +	n = rb_first(&iommu->dma_list);
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma;
>> +		long locked = 0, unlocked = 0;
>> +
>> +		dma = rb_entry(n, struct vfio_dma, node);
>> +		unlocked += vfio_unmap_unpin(iommu, dma, false);
>> +		p = rb_first(&dma->pfn_list);
>> +		for (; p; p = rb_next(p))
>> +			locked++;
> 
> We don't know that these weren't reserved pages.  If the vendor driver
> was pinning peer-to-peer ranges vfio_unmap_unpin() might have returned
> 0 yet we're assuming each is locked RAM, so our accounting can go
> upside down.
> 

Adding if rsvd check here.

>> +		vfio_lock_acct(dma->task, locked - unlocked);
>> +	}
>> +}
>> +
>> +static void vfio_external_unpin_all(struct vfio_iommu *iommu,
>> +				    bool do_accounting)
>> +{
>> +	struct rb_node *n, *p;
>> +
>> +	n = rb_first(&iommu->dma_list);
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma;
>> +
>> +		dma = rb_entry(n, struct vfio_dma, node);
>> +		while ((p = rb_first(&dma->pfn_list))) {
>> +			int unlocked;
>> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
>> +							 node);
>> +
>> +			unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
>> +			if (do_accounting)
>> +				vfio_lock_acct(dma->task, -unlocked);
> 
> nit, we could batch these further, only updating accounting once per
> vfio_dma, or once entirely.
> 

Ok.

Thanks,
Kirti
diff mbox

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 50aca95cf61e..2697d874dd35 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -37,6 +37,7 @@ 
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
 #include <linux/pid_namespace.h>
+#include <linux/mdev.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -56,6 +57,7 @@  MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct vfio_domain	*external_domain; /* domain for external user */
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	bool			v2;
@@ -76,7 +78,9 @@  struct vfio_dma {
 	unsigned long		vaddr;		/* Process virtual addr */
 	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
+	bool			iommu_mapped;
 	struct task_struct	*task;
+	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
 };
 
 struct vfio_group {
@@ -85,6 +89,21 @@  struct vfio_group {
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+	struct rb_node		node;
+	dma_addr_t		iova;		/* Device address */
+	unsigned long		pfn;		/* Host pfn */
+	atomic_t		ref_count;
+};
+
+#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
+					(!list_empty(&iommu->domain_list))
+
+static int put_pfn(unsigned long pfn, int prot);
+
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -132,6 +151,97 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+/*
+ * Helper Functions for host iova-pfn list
+ */
+static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
+{
+	struct vfio_pfn *vpfn;
+	struct rb_node *node = dma->pfn_list.rb_node;
+
+	while (node) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if (iova < vpfn->iova)
+			node = node->rb_left;
+		else if (iova > vpfn->iova)
+			node = node->rb_right;
+		else
+			return vpfn;
+	}
+	return NULL;
+}
+
+static void vfio_link_pfn(struct vfio_dma *dma,
+			  struct vfio_pfn *new)
+{
+	struct rb_node **link, *parent = NULL;
+	struct vfio_pfn *vpfn;
+
+	link = &dma->pfn_list.rb_node;
+	while (*link) {
+		parent = *link;
+		vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+		if (new->iova < vpfn->iova)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &dma->pfn_list);
+}
+
+static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
+{
+	rb_erase(&old->node, &dma->pfn_list);
+}
+
+static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
+				unsigned long pfn)
+{
+	struct vfio_pfn *vpfn;
+
+	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
+	if (!vpfn)
+		return -ENOMEM;
+
+	vpfn->iova = iova;
+	vpfn->pfn = pfn;
+	atomic_set(&vpfn->ref_count, 1);
+	vfio_link_pfn(dma, vpfn);
+	return 0;
+}
+
+static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
+				      struct vfio_pfn *vpfn)
+{
+	vfio_unlink_pfn(dma, vpfn);
+	kfree(vpfn);
+}
+
+static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
+					       unsigned long iova)
+{
+	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
+
+	if (vpfn)
+		atomic_inc(&vpfn->ref_count);
+	return vpfn;
+}
+
+static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
+{
+	int ret = 0;
+
+	if (atomic_dec_and_test(&vpfn->ref_count)) {
+		ret = put_pfn(vpfn->pfn, dma->prot);
+		vfio_remove_from_pfn_list(dma, vpfn);
+	}
+	return ret;
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -270,7 +380,6 @@  static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 	}
 
 	up_read(&mm->mmap_sem);
-
 	return ret;
 }
 
@@ -280,28 +389,35 @@  static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
  * first page and all consecutive pages with the same locking.
  */
 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
-				  long npage, int prot, unsigned long *pfn_base)
+				  long npage, unsigned long *pfn_base)
 {
 	unsigned long limit;
 	bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
 				   CAP_IPC_LOCK);
 	struct mm_struct *mm;
-	long ret, i;
+	long ret, i, lock_acct = 0;
 	bool rsvd;
+	struct vfio_pfn *vpfn;
+	dma_addr_t iova;
 
 	mm = get_task_mm(dma->task);
 	if (!mm)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
+	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
 	if (ret)
 		goto pin_pg_remote_exit;
 
+	iova = vaddr - dma->vaddr + dma->iova;
+	vpfn = vfio_find_vpfn(dma, iova);
+	if (!vpfn)
+		lock_acct = 1;
+
 	rsvd = is_invalid_reserved_pfn(*pfn_base);
 	limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
-		put_pfn(*pfn_base, prot);
+	if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
+		put_pfn(*pfn_base, dma->prot);
 		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 			limit << PAGE_SHIFT);
 		ret = -ENOMEM;
@@ -310,7 +426,7 @@  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 
 	if (unlikely(disable_hugepages)) {
 		if (!rsvd)
-			vfio_lock_acct(dma->task, 1);
+			vfio_lock_acct(dma->task, lock_acct);
 		ret = 1;
 		goto pin_pg_remote_exit;
 	}
@@ -319,18 +435,24 @@  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn);
 		if (ret)
 			break;
 
 		if (pfn != *pfn_base + i ||
 		    rsvd != is_invalid_reserved_pfn(pfn)) {
-			put_pfn(pfn, prot);
+			put_pfn(pfn, dma->prot);
 			break;
 		}
 
-		if (!rsvd && !lock_cap && mm->locked_vm + i + 1 > limit) {
-			put_pfn(pfn, prot);
+		iova = vaddr - dma->vaddr + dma->iova;
+		vpfn = vfio_find_vpfn(dma, iova);
+		if (!vpfn)
+			lock_acct++;
+
+		if (!rsvd && !lock_cap &&
+		    mm->locked_vm + lock_acct + 1 > limit) {
+			put_pfn(pfn, dma->prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 				__func__, limit << PAGE_SHIFT);
 			break;
@@ -338,7 +460,7 @@  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	}
 
 	if (!rsvd)
-		vfio_lock_acct(dma->task, i);
+		vfio_lock_acct(dma->task, lock_acct);
 	ret = i;
 
 pin_pg_remote_exit:
@@ -346,14 +468,78 @@  pin_pg_remote_exit:
 	return ret;
 }
 
-static long vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
-				    long npage, int prot, bool do_accounting)
+static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
+				    unsigned long pfn, long npage,
+				    bool do_accounting)
 {
-	unsigned long unlocked = 0;
+	long unlocked = 0, locked = 0;
 	long i;
 
-	for (i = 0; i < npage; i++)
-		unlocked += put_pfn(pfn++, prot);
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *vpfn;
+
+		unlocked += put_pfn(pfn++, dma->prot);
+
+		vpfn = vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT));
+		if (vpfn)
+			locked++;
+	}
+
+	if (do_accounting)
+		vfio_lock_acct(dma->task, locked - unlocked);
+
+	return unlocked;
+}
+
+static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
+				  unsigned long *pfn_base, bool do_accounting)
+{
+	unsigned long limit;
+	bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
+				   CAP_IPC_LOCK);
+	struct mm_struct *mm;
+	int ret;
+	bool rsvd;
+
+	mm = get_task_mm(dma->task);
+	if (!mm)
+		return -ENODEV;
+
+	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
+	if (ret)
+		goto pin_page_exit;
+
+	rsvd = is_invalid_reserved_pfn(*pfn_base);
+	limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
+		put_pfn(*pfn_base, dma->prot);
+		pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
+			__func__, dma->task->comm, task_pid_nr(dma->task),
+			limit << PAGE_SHIFT);
+		ret = -ENOMEM;
+		goto pin_page_exit;
+	}
+
+	if (!rsvd && do_accounting)
+		vfio_lock_acct(dma->task, 1);
+	ret = 1;
+
+pin_page_exit:
+	mmput(mm);
+	return ret;
+}
+
+static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
+				    bool do_accounting)
+{
+	int unlocked;
+	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
+
+	if (!vpfn)
+		return 0;
+
+	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
 
 	if (do_accounting)
 		vfio_lock_acct(dma->task, -unlocked);
@@ -361,14 +547,148 @@  static long vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
 	return unlocked;
 }
 
-static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
+static int vfio_iommu_type1_pin_pages(void *iommu_data,
+				      unsigned long *user_pfn,
+				      int npage, int prot,
+				      unsigned long *phys_pfn)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	int i, j, ret;
+	unsigned long remote_vaddr;
+	unsigned long *pfn = phys_pfn;
+	struct vfio_dma *dma;
+	bool do_accounting;
+
+	if (!iommu || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	/* Supported for v2 version only */
+	if (!iommu->v2)
+		return -EACCES;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->external_domain) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	/*
+	 * If iommu capable domain exist in the container then all pages are
+	 * already pinned and accounted. Accouting should be done if there is no
+	 * iommu capable domain in the container.
+	 */
+	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+
+	for (i = 0; i < npage; i++) {
+		dma_addr_t iova;
+		struct vfio_pfn *vpfn;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+		dma = vfio_find_dma(iommu, iova, 0);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_unwind;
+		}
+
+		if ((dma->prot & prot) != prot) {
+			pr_info("%s: dma->prot: 0x%x prot: 0x%x\n",
+				__func__, dma->prot, prot);
+			ret = -EPERM;
+			goto pin_unwind;
+		}
+
+		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
+		if (vpfn) {
+			pfn[i] = vpfn->pfn;
+			continue;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+		ret = vfio_pin_page_external(dma, remote_vaddr, &pfn[i],
+					     do_accounting);
+		if (ret <= 0) {
+			WARN_ON(!ret);
+			goto pin_unwind;
+		}
+
+		ret = vfio_add_to_pfn_list(dma, iova, pfn[i]);
+		if (ret) {
+			vfio_unpin_page_external(dma, iova, do_accounting);
+			goto pin_unwind;
+		}
+	}
+
+	ret = i;
+	goto pin_done;
+
+pin_unwind:
+	pfn[i] = 0;
+	for (j = 0; j < i; j++) {
+		dma_addr_t iova;
+
+		iova = user_pfn[j] << PAGE_SHIFT;
+		dma = vfio_find_dma(iommu, iova, 0);
+		vfio_unpin_page_external(dma, iova, do_accounting);
+		pfn[j] = 0;
+	}
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_iommu_type1_unpin_pages(void *iommu_data,
+					unsigned long *user_pfn,
+					int npage)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	bool do_accounting;
+	int unlocked = 0, i;
+
+	if (!iommu || !user_pfn)
+		return -EINVAL;
+
+	/* Supported for v2 version only */
+	if (!iommu->v2)
+		return -EACCES;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->external_domain) {
+		mutex_unlock(&iommu->lock);
+		return -EINVAL;
+	}
+
+	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+	for (i = 0; i < npage; i++) {
+		struct vfio_dma *dma;
+		dma_addr_t iova;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+		dma = vfio_find_dma(iommu, iova, 0);
+		if (!dma)
+			goto unpin_exit;
+		unlocked += vfio_unpin_page_external(dma, iova, do_accounting);
+	}
+
+unpin_exit:
+	mutex_unlock(&iommu->lock);
+	return unlocked;
+}
+
+static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			     bool do_accounting)
 {
 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
 	struct vfio_domain *domain, *d;
 	long unlocked = 0;
 
 	if (!dma->size)
-		return;
+		return 0;
+
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		return 0;
+
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -410,20 +730,26 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
+		unlocked += vfio_unpin_pages_remote(dma, iova,
+						    phys >> PAGE_SHIFT,
 						    unmapped >> PAGE_SHIFT,
-						    dma->prot, false);
+						    false);
 		iova += unmapped;
 
 		cond_resched();
 	}
 
-	vfio_lock_acct(dma->task, -unlocked);
+	dma->iommu_mapped = false;
+	if (do_accounting) {
+		vfio_lock_acct(dma->task, -unlocked);
+		return 0;
+	}
+	return unlocked;
 }
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
-	vfio_unmap_unpin(iommu, dma);
+	vfio_unmap_unpin(iommu, dma, true);
 	vfio_unlink_dma(iommu, dma);
 	put_task_struct(dma->task);
 	kfree(dma);
@@ -606,8 +932,7 @@  static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	while (size) {
 		/* Pin a contiguous chunk of memory */
 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
-					      size >> PAGE_SHIFT, dma->prot,
-					      &pfn);
+					      size >> PAGE_SHIFT, &pfn);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
@@ -618,8 +943,8 @@  static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
 				     dma->prot);
 		if (ret) {
-			vfio_unpin_pages_remote(dma, pfn, npage,
-						 dma->prot, true);
+			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
+						npage, true);
 			break;
 		}
 
@@ -627,6 +952,8 @@  static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 		dma->size += npage << PAGE_SHIFT;
 	}
 
+	dma->iommu_mapped = true;
+
 	if (ret)
 		vfio_remove_dma(iommu, dma);
 
@@ -682,11 +1009,16 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	dma->prot = prot;
 	get_task_struct(current);
 	dma->task = current;
+	dma->pfn_list = RB_ROOT;
 
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
-	ret = vfio_pin_map_dma(iommu, dma, size);
+	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		dma->size = size;
+	else
+		ret = vfio_pin_map_dma(iommu, dma, size);
 do_map_err:
 	mutex_unlock(&iommu->lock);
 	return ret;
@@ -715,10 +1047,6 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
 	n = rb_first(&iommu->dma_list);
 
-	/* If there's not a domain, there better not be any mappings */
-	if (WARN_ON(n && !d))
-		return -EINVAL;
-
 	for (; n; n = rb_next(n)) {
 		struct vfio_dma *dma;
 		dma_addr_t iova;
@@ -727,21 +1055,49 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 		iova = dma->iova;
 
 		while (iova < dma->iova + dma->size) {
-			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
+			phys_addr_t phys;
 			size_t size;
 
-			if (WARN_ON(!phys)) {
-				iova += PAGE_SIZE;
-				continue;
+			if (dma->iommu_mapped) {
+				phys_addr_t p;
+				dma_addr_t i;
+
+				phys = iommu_iova_to_phys(d->domain, iova);
+
+				if (WARN_ON(!phys)) {
+					iova += PAGE_SIZE;
+					continue;
+				}
+
+				size = PAGE_SIZE;
+				p = phys + size;
+				i = iova + size;
+				while (i < dma->iova + dma->size &&
+				       p == iommu_iova_to_phys(d->domain, i)) {
+					size += PAGE_SIZE;
+					p += PAGE_SIZE;
+					i += PAGE_SIZE;
+				}
+			} else {
+				unsigned long pfn;
+				unsigned long vaddr = dma->vaddr +
+						     (iova - dma->iova);
+				size_t n = dma->iova + dma->size - iova;
+				long npage;
+
+				npage = vfio_pin_pages_remote(dma, vaddr,
+							      n >> PAGE_SHIFT,
+							      &pfn);
+				if (npage <= 0) {
+					WARN_ON(!npage);
+					ret = (int)npage;
+					return ret;
+				}
+
+				phys = pfn << PAGE_SHIFT;
+				size = npage << PAGE_SHIFT;
 			}
 
-			size = PAGE_SIZE;
-
-			while (iova + size < dma->iova + dma->size &&
-			       phys + size == iommu_iova_to_phys(d->domain,
-								 iova + size))
-				size += PAGE_SIZE;
-
 			ret = iommu_map(domain->domain, iova, phys,
 					size, dma->prot | domain->prot);
 			if (ret)
@@ -749,8 +1105,8 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 
 			iova += size;
 		}
+		dma->iommu_mapped = true;
 	}
-
 	return 0;
 }
 
@@ -806,7 +1162,7 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_group *group;
 	struct vfio_domain *domain, *d;
-	struct bus_type *bus = NULL;
+	struct bus_type *bus = NULL, *mdev_bus;
 	int ret;
 
 	mutex_lock(&iommu->lock);
@@ -818,6 +1174,13 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 		}
 	}
 
+	if (iommu->external_domain) {
+		if (find_iommu_group(iommu->external_domain, iommu_group)) {
+			mutex_unlock(&iommu->lock);
+			return -EINVAL;
+		}
+	}
+
 	group = kzalloc(sizeof(*group), GFP_KERNEL);
 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
 	if (!group || !domain) {
@@ -832,6 +1195,25 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+	mdev_bus = symbol_get(mdev_bus_type);
+
+	if (mdev_bus) {
+		if ((bus == mdev_bus) && !iommu_present(bus)) {
+			symbol_put(mdev_bus_type);
+			if (!iommu->external_domain) {
+				INIT_LIST_HEAD(&domain->group_list);
+				iommu->external_domain = domain;
+			} else
+				kfree(domain);
+
+			list_add(&group->next,
+				 &iommu->external_domain->group_list);
+			mutex_unlock(&iommu->lock);
+			return 0;
+		}
+		symbol_put(mdev_bus_type);
+	}
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -922,6 +1304,46 @@  static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
+{
+	struct rb_node *n, *p;
+
+	n = rb_first(&iommu->dma_list);
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma;
+		long locked = 0, unlocked = 0;
+
+		dma = rb_entry(n, struct vfio_dma, node);
+		unlocked += vfio_unmap_unpin(iommu, dma, false);
+		p = rb_first(&dma->pfn_list);
+		for (; p; p = rb_next(p))
+			locked++;
+		vfio_lock_acct(dma->task, locked - unlocked);
+	}
+}
+
+static void vfio_external_unpin_all(struct vfio_iommu *iommu,
+				    bool do_accounting)
+{
+	struct rb_node *n, *p;
+
+	n = rb_first(&iommu->dma_list);
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma;
+
+		dma = rb_entry(n, struct vfio_dma, node);
+		while ((p = rb_first(&dma->pfn_list))) {
+			int unlocked;
+			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
+							 node);
+
+			unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
+			if (do_accounting)
+				vfio_lock_acct(dma->task, -unlocked);
+		}
+	}
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -931,6 +1353,26 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
+	if (iommu->external_domain) {
+		domain = iommu->external_domain;
+		group = find_iommu_group(domain, iommu_group);
+		if (group) {
+			list_del(&group->next);
+			kfree(group);
+
+			if (list_empty(&domain->group_list)) {
+				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+					vfio_external_unpin_all(iommu, true);
+					vfio_iommu_unmap_unpin_all(iommu);
+				} else
+					vfio_external_unpin_all(iommu, false);
+				kfree(domain);
+				iommu->external_domain = NULL;
+			}
+			goto detach_group_done;
+		}
+	}
+
 	list_for_each_entry(domain, &iommu->domain_list, next) {
 		group = find_iommu_group(domain, iommu_group);
 		if (!group)
@@ -940,21 +1382,27 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 		list_del(&group->next);
 		kfree(group);
 		/*
-		 * Group ownership provides privilege, if the group
-		 * list is empty, the domain goes away.  If it's the
-		 * last domain, then all the mappings go away too.
+		 * Group ownership provides privilege, if the group list is
+		 * empty, the domain goes away. If it's the last domain with
+		 * iommu and external domain doesn't exist, then all the
+		 * mappings go away too. If it's the last domain with iommu and
+		 * external domain exist, update accounting
 		 */
 		if (list_empty(&domain->group_list)) {
-			if (list_is_singular(&iommu->domain_list))
-				vfio_iommu_unmap_unpin_all(iommu);
+			if (list_is_singular(&iommu->domain_list)) {
+				if (!iommu->external_domain)
+					vfio_iommu_unmap_unpin_all(iommu);
+				else
+					vfio_iommu_unmap_unpin_reaccount(iommu);
+			}
 			iommu_domain_free(domain->domain);
 			list_del(&domain->next);
 			kfree(domain);
 		}
-		goto done;
+		break;
 	}
 
-done:
+detach_group_done:
 	mutex_unlock(&iommu->lock);
 }
 
@@ -986,27 +1434,42 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	return iommu;
 }
 
+static void vfio_release_domain(struct vfio_domain *domain, bool external)
+{
+	struct vfio_group *group, *group_tmp;
+
+	list_for_each_entry_safe(group, group_tmp,
+				 &domain->group_list, next) {
+		if (!external)
+			iommu_detach_group(domain->domain, group->iommu_group);
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	if (!external)
+		iommu_domain_free(domain->domain);
+}
+
 static void vfio_iommu_type1_release(void *iommu_data)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_domain *domain, *domain_tmp;
-	struct vfio_group *group, *group_tmp;
+
+	if (iommu->external_domain) {
+		vfio_release_domain(iommu->external_domain, true);
+		vfio_external_unpin_all(iommu, false);
+		kfree(iommu->external_domain);
+		iommu->external_domain = NULL;
+	}
 
 	vfio_iommu_unmap_unpin_all(iommu);
 
 	list_for_each_entry_safe(domain, domain_tmp,
 				 &iommu->domain_list, next) {
-		list_for_each_entry_safe(group, group_tmp,
-					 &domain->group_list, next) {
-			iommu_detach_group(domain->domain, group->iommu_group);
-			list_del(&group->next);
-			kfree(group);
-		}
-		iommu_domain_free(domain->domain);
+		vfio_release_domain(domain, false);
 		list_del(&domain->next);
 		kfree(domain);
 	}
-
 	kfree(iommu);
 }
 
@@ -1110,6 +1573,8 @@  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.ioctl		= vfio_iommu_type1_ioctl,
 	.attach_group	= vfio_iommu_type1_attach_group,
 	.detach_group	= vfio_iommu_type1_detach_group,
+	.pin_pages	= vfio_iommu_type1_pin_pages,
+	.unpin_pages	= vfio_iommu_type1_unpin_pages,
 };
 
 static int __init vfio_iommu_type1_init(void)