diff mbox

[v11,10/22] vfio iommu type1: Add support for mediated devices

Message ID 1478293856-8191-11-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show

Commit Message

Kirti Wankhede Nov. 4, 2016, 9:10 p.m. UTC
VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
Mediated device only uses IOMMU APIs, the underlying hardware can be
managed by an IOMMU domain.

Aim of this change is:
- To use most of the code of TYPE1 IOMMU driver for mediated devices
- To support direct assigned device and mediated device in single module

This change adds pin and unpin support for mediated device to TYPE1 IOMMU
backend module. More details:
- vfio_pin_pages() callback here uses task and address space of vfio_dma,
  that is, of the process who mapped that iova range.
- Added pfn_list tracking logic to address space structure. All pages
  pinned through this interface are trached in its address space.
- Pinned pages list is used to verify unpinning request and to unpin
  remaining pages while detaching the group for that device.
- Page accounting is updated to account in its address space where the
  pages are pinned/unpinned.
-  Accouting for mdev device is only done if there is no iommu capable
  domain in the container. When there is a direct device assigned to the
  container and that domain is iommu capable, all pages are already pinned
  during DMA_MAP.
- Page accouting is updated on hot plug and unplug mdev device and pass
  through device.

Tested by assigning below combinations of devices to a single VM:
- GPU pass through only
- vGPU device only
- One GPU pass through and one vGPU device
- Linux VM hot plug and unplug vGPU device while GPU pass through device
  exist
- Linux VM hot plug and unplug GPU pass through device while vGPU device
  exist

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
---
 drivers/vfio/vfio_iommu_type1.c | 538 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 500 insertions(+), 38 deletions(-)

Comments

Alex Williamson Nov. 7, 2016, 11:16 p.m. UTC | #1
On Sat, 5 Nov 2016 02:40:44 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>   that is, of the process who mapped that iova range.
> - Added pfn_list tracking logic to address space structure. All pages
>   pinned through this interface are trached in its address space.
                                          ^ k
------------------------------------------|

> - Pinned pages list is used to verify unpinning request and to unpin
>   remaining pages while detaching the group for that device.
> - Page accounting is updated to account in its address space where the
>   pages are pinned/unpinned.
> -  Accouting for mdev device is only done if there is no iommu capable
>   domain in the container. When there is a direct device assigned to the
>   container and that domain is iommu capable, all pages are already pinned
>   during DMA_MAP.
> - Page accouting is updated on hot plug and unplug mdev device and pass
>   through device.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>   exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>   exist
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
>  drivers/vfio/vfio_iommu_type1.c | 538 +++++++++++++++++++++++++++++++++++++---
>  1 file changed, 500 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 8d64528dcc22..e511073446a0 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -36,6 +36,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
> +#include <linux/mdev.h>
>  
>  #define DRIVER_VERSION  "0.2"
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> @@ -56,6 +57,7 @@ MODULE_PARM_DESC(disable_hugepages,
>  struct vfio_iommu {
>  	struct list_head	domain_list;
>  	struct list_head	addr_space_list;
> +	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	bool			v2;
> @@ -67,6 +69,9 @@ struct vfio_addr_space {
>  	struct mm_struct	*mm;
>  	struct list_head	next;
>  	atomic_t		ref_count;
> +	/* external user pinned pfns */
> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
>  };
>  
>  struct vfio_domain {
> @@ -83,6 +88,7 @@ struct vfio_dma {
>  	unsigned long		vaddr;		/* Process virtual addr */
>  	size_t			size;		/* Map size (bytes) */
>  	int			prot;		/* IOMMU_READ/WRITE */
> +	bool			iommu_mapped;
>  	struct vfio_addr_space	*addr_space;
>  	struct task_struct	*task;
>  	bool			mlock_cap;
> @@ -94,6 +100,19 @@ struct vfio_group {
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +	struct rb_node		node;
> +	unsigned long		pfn;		/* Host pfn */
> +	int			prot;
> +	atomic_t		ref_count;
> +};
> +
> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> +					(!list_empty(&iommu->domain_list))
> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -153,6 +172,93 @@ static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
>  	return NULL;
>  }
>  
> +/*
> + * Helper Functions for host pfn list
> + */
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_addr_space *addr_space,
> +				      unsigned long pfn)
> +{
> +	struct vfio_pfn *vpfn;
> +	struct rb_node *node = addr_space->pfn_list.rb_node;
> +
> +	while (node) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if (pfn < vpfn->pfn)
> +			node = node->rb_left;
> +		else if (pfn > vpfn->pfn)
> +			node = node->rb_right;
> +		else
> +			return vpfn;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void vfio_link_pfn(struct vfio_addr_space *addr_space,
> +			  struct vfio_pfn *new)
> +{
> +	struct rb_node **link, *parent = NULL;
> +	struct vfio_pfn *vpfn;
> +
> +	link = &addr_space->pfn_list.rb_node;
> +	while (*link) {
> +		parent = *link;
> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +		if (new->pfn < vpfn->pfn)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_addr_space *addr_space,
> +			    struct vfio_pfn *old)
> +{
> +	rb_erase(&old->node, &addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_addr_space *addr_space,
> +				unsigned long pfn, int prot)
> +{
> +	struct vfio_pfn *vpfn;
> +
> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> +	if (!vpfn)
> +		return -ENOMEM;
> +
> +	vpfn->pfn = pfn;
> +	vpfn->prot = prot;
> +	atomic_set(&vpfn->ref_count, 1);
> +	vfio_link_pfn(addr_space, vpfn);
> +	return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_addr_space *addr_space,
> +				      struct vfio_pfn *vpfn)
> +{
> +	vfio_unlink_pfn(addr_space, vpfn);
> +	kfree(vpfn);
> +}
> +
> +static int vfio_pfn_account(struct vfio_addr_space *addr_space,
> +			    unsigned long pfn)
> +{
> +	struct vfio_pfn *p;
> +	int ret = 1;
> +
> +	mutex_lock(&addr_space->pfn_list_lock);
> +	p = vfio_find_pfn(addr_space, pfn);
> +	if (p)
> +		ret = 0;
> +	mutex_unlock(&addr_space->pfn_list_lock);
> +	return ret;
> +}
> +
>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -304,16 +410,18 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = dma->mlock_cap;
>  	struct mm_struct *mm = dma->addr_space->mm;
> -	long ret, i;
> +	long ret, i, lock_acct;
>  	bool rsvd;
>  
>  	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>  	if (ret)
>  		return ret;
>  
> +	lock_acct = vfio_pfn_account(dma->addr_space, *pfn_base);
> +
>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
>  
> -	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> +	if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
>  		put_pfn(*pfn_base, prot);
>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>  			limit << PAGE_SHIFT);
> @@ -340,8 +448,10 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  			break;
>  		}
>  
> +		lock_acct += vfio_pfn_account(dma->addr_space, pfn);
> +
>  		if (!rsvd && !lock_cap &&
> -		    mm->locked_vm + i + 1 > limit) {
> +		    mm->locked_vm + lock_acct + 1 > limit) {
>  			put_pfn(pfn, prot);
>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  				__func__, limit << PAGE_SHIFT);
> @@ -350,7 +460,7 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  	}
>  
>  	if (!rsvd)
> -		vfio_lock_acct(mm, i);
> +		vfio_lock_acct(mm, lock_acct);
>  
>  	return i;
>  }
> @@ -370,14 +480,214 @@ static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
>  	return unlocked;
>  }
>  
> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> +				    int prot, unsigned long *pfn_base,
> +				    bool do_accounting)
> +{
> +	struct task_struct *task = dma->task;
> +	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	bool lock_cap = dma->mlock_cap;
> +	struct mm_struct *mm = dma->addr_space->mm;
> +	int ret;
> +	bool rsvd;
> +
> +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
> +	if (ret)
> +		return ret;
> +
> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
> +
> +	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> +		put_pfn(*pfn_base, prot);
> +		pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
> +			__func__, task->comm, task_pid_nr(task),
> +			limit << PAGE_SHIFT);
> +		return -ENOMEM;
> +	}
> +
> +	if (!rsvd && do_accounting)
> +		vfio_lock_acct(mm, 1);
> +
> +	return 1;
> +}
> +
> +static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
> +				       unsigned long pfn, int prot,
> +				       bool do_accounting)
> +{
> +	put_pfn(pfn, prot);
> +
> +	if (do_accounting)
> +		vfio_lock_acct(addr_space->mm, -1);

Can't we batch this like we do elsewhere?  Intel folks, AIUI you intend
to pin all VM memory through this side channel, have you tested the
scalability and performance of this with larger VMs?  Our vfio_pfn
data structure alone is 40 bytes per pinned page, which means for
each 1GB of VM memory, we have 10MBs worth of struct vfio_pfn!
Additionally, unmapping each 1GB of VM memory will result in 256k
separate vfio_lock_acct() callbacks.  I'm concerned that we're not
being efficient enough in either space or time.

One thought might be whether we really need to save the pfn, we better
always get the same result if we pin it again, or maybe we can just do
a lookup through the mm at that point without re-pinning.  Could we get
to the point where we only need an atomic_t ref count per page in a
linear array relative to the IOVA?  That would give us 1MB per 1GB
overhead. The semantics of the pin and unpin would make more sense then
too, both would take an IOVA range, only pinning would need a return
mechanism. For instance:

int pin_pages(void *iommu_data, dma_addr_t iova_base,
	      int npage, unsigned long *pfn_base);

This would pin physically contiguous pages up to npage, returning the
base pfn and returning the number of pages pinned (<= npage).  The
vendor driver would make multiple calls to fill the necessary range.
Unpin would then simply be:

void unpin_pages(void *iommu_data, dma_addr_t iova_base, int npage);

Hugepage usage would really make such an interface shine (ie. 2MB+
contiguous ranges).  A downside would be the overhead of getting the
group and container reference in vfio for each callback, perhaps we'd
need to figure out how the vendor driver could hold that reference.
The current API of passing around pfn arrays, further increasing the
overhead of the whole ecosystem just makes me cringe though.

> +}
> +
> +static int vfio_unpin_pfn(struct vfio_addr_space *addr_space,
> +			  struct vfio_pfn *vpfn, bool do_accounting)
> +{
> +	__vfio_unpin_page_external(addr_space, vpfn->pfn, vpfn->prot,
> +			do_accounting);
> +
> +	if (atomic_dec_and_test(&vpfn->ref_count))
> +		vfio_remove_from_pfn_list(addr_space, vpfn);
> +
> +	return 1;
> +}
> +
> +static int vfio_iommu_type1_pin_pages(void *iommu_data,
> +				      unsigned long *user_pfn,
> +				      int npage, int prot,
> +				      unsigned long *phys_pfn)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	int i, j, ret;
> +	unsigned long remote_vaddr;
> +	unsigned long *pfn = phys_pfn;
> +	struct vfio_dma *dma;
> +	bool do_accounting;
> +
> +	if (!iommu || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->external_domain) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	/*
> +	 * If iommu capable domain exist in the container then all pages are
> +	 * already pinned and accounted. Accouting should be done if there is no
> +	 * iommu capable domain in the container.
> +	 */
> +	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		if (!dma) {
> +			ret = -EINVAL;
> +			goto pin_unwind;
> +		}
> +
> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +		ret = __vfio_pin_page_external(dma, remote_vaddr, prot,
> +				&pfn[i], do_accounting);

Please wrap such that the following line falls within the relevant ()s
when possible.  Above and below on unpin calls.

> +		if (ret <= 0) {
> +			WARN_ON(!ret);
> +			goto pin_unwind;
> +		}
> +
> +		mutex_lock(&dma->addr_space->pfn_list_lock);
> +
> +		/* search if pfn exist */
> +		p = vfio_find_pfn(dma->addr_space, pfn[i]);
> +		if (p) {
> +			atomic_inc(&p->ref_count);

We never test whether (p->prot == prot), shouldn't we be doing
something in that case?  In fact, why do we allow the side-channel
through the .{un}pin_pages to specify page protection flags that might
be different than the user specified for the DMA_MAP?  If the user
specified read-only, the vendor driver should not be allowed to
override with read-write access.

> +			mutex_unlock(&dma->addr_space->pfn_list_lock);
> +			continue;
> +		}
> +
> +		ret = vfio_add_to_pfn_list(dma->addr_space, pfn[i], prot);
> +		mutex_unlock(&dma->addr_space->pfn_list_lock);
> +
> +		if (ret) {
> +			__vfio_unpin_page_external(dma->addr_space, pfn[i],
> +					prot, do_accounting);
> +			goto pin_unwind;
> +		}
> +	}
> +
> +	ret = i;
> +	goto pin_done;
> +
> +pin_unwind:
> +	pfn[i] = 0;
> +	for (j = 0; j < i; j++) {
> +		struct vfio_pfn *p;
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[j] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0);
> +
> +		mutex_lock(&dma->addr_space->pfn_list_lock);
> +		p = vfio_find_pfn(dma->addr_space, pfn[j]);
> +		if (p)
> +			vfio_unpin_pfn(dma->addr_space, p, do_accounting);
> +
> +		mutex_unlock(&dma->addr_space->pfn_list_lock);
> +		pfn[j] = 0;
> +	}
> +
> +pin_done:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> +					unsigned long *user_pfn,
> +					unsigned long *pfn,
> +					int npage)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	bool do_accounting;
> +	int unlocked = 0, i;
> +
> +	if (!iommu || !user_pfn || !pfn)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->external_domain) {
> +		mutex_unlock(&iommu->lock);
> +		return -EINVAL;
> +	}
> +
> +	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +		struct vfio_dma *dma;
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		if (!dma)
> +			goto unpin_exit;
> +
> +		mutex_lock(&dma->addr_space->pfn_list_lock);
> +		/* verify if pfn exist in pfn_list */
> +		p = vfio_find_pfn(dma->addr_space, pfn[i]);
> +		if (p)
> +			unlocked += vfio_unpin_pfn(dma->addr_space, p,
> +						   do_accounting);
> +		mutex_unlock(&dma->addr_space->pfn_list_lock);
> +	}
> +unpin_exit:
> +	mutex_unlock(&iommu->lock);
> +	return unlocked;
> +}
> +
> +static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +			     bool do_accounting)
>  {
>  	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
>  	struct vfio_domain *domain, *d;
>  	long unlocked = 0;
>  
>  	if (!dma->size)
> -		return;
> +		return 0;
> +
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		return 0;
> +
>  	/*
>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>  	 * need a much more complicated tracking system.  Unfortunately that
> @@ -427,12 +737,17 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		cond_resched();
>  	}
>  
> -	vfio_lock_acct(dma->addr_space->mm, -unlocked);
> +	dma->iommu_mapped = false;
> +	if (do_accounting) {
> +		vfio_lock_acct(dma->addr_space->mm, -unlocked);
> +		return 0;
> +	}
> +	return unlocked;
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  {
> -	vfio_unmap_unpin(iommu, dma);
> +	vfio_unmap_unpin(iommu, dma, true);
>  	vfio_unlink_dma(iommu, dma);
>  
>  	if (atomic_dec_and_test(&dma->addr_space->ref_count)) {
> @@ -642,6 +957,8 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  		dma->size += npage << PAGE_SHIFT;
>  	}
>  
> +	dma->iommu_mapped = true;
> +
>  	if (ret)
>  		vfio_remove_dma(iommu, dma);
>  
> @@ -706,6 +1023,8 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  			goto do_map_err;
>  		}
>  		addr_space->mm = mm;
> +		addr_space->pfn_list = RB_ROOT;
> +		mutex_init(&addr_space->pfn_list_lock);
>  		atomic_set(&addr_space->ref_count, 1);
>  		list_add(&addr_space->next, &iommu->addr_space_list);
>  		free_addr_space_on_err = true;
> @@ -733,7 +1052,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
>  
> -	ret = vfio_pin_map_dma(iommu, dma, size);
> +	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		dma->size = size;
> +	else
> +		ret = vfio_pin_map_dma(iommu, dma, size);
>  do_map_err:
>  	mutex_unlock(&iommu->lock);
>  	return ret;
> @@ -762,10 +1085,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
>  	n = rb_first(&iommu->dma_list);
>  
> -	/* If there's not a domain, there better not be any mappings */
> -	if (WARN_ON(n && !d))
> -		return -EINVAL;
> -
>  	for (; n; n = rb_next(n)) {
>  		struct vfio_dma *dma;
>  		dma_addr_t iova;
> @@ -774,20 +1093,43 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  		iova = dma->iova;
>  
>  		while (iova < dma->iova + dma->size) {
> -			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
> +			phys_addr_t phys;
>  			size_t size;
>  
> -			if (WARN_ON(!phys)) {
> -				iova += PAGE_SIZE;
> -				continue;
> -			}
> +			if (dma->iommu_mapped) {
> +				phys = iommu_iova_to_phys(d->domain, iova);
> +
> +				if (WARN_ON(!phys)) {
> +					iova += PAGE_SIZE;
> +					continue;
> +				}
>  
> -			size = PAGE_SIZE;
> +				size = PAGE_SIZE;
>  
> -			while (iova + size < dma->iova + dma->size &&
> -			       phys + size == iommu_iova_to_phys(d->domain,
> +				while (iova + size < dma->iova + dma->size &&
> +				    phys + size == iommu_iova_to_phys(d->domain,
>  								 iova + size))
> -				size += PAGE_SIZE;
> +					size += PAGE_SIZE;
> +			} else {
> +				unsigned long pfn;
> +				unsigned long vaddr = dma->vaddr +
> +						     (iova - dma->iova);
> +				size_t n = dma->iova + dma->size - iova;
> +				long npage;
> +
> +				npage = __vfio_pin_pages_remote(dma, vaddr,
> +								n >> PAGE_SHIFT,
> +								dma->prot,
> +								&pfn);
> +				if (npage <= 0) {
> +					WARN_ON(!npage);
> +					ret = (int)npage;
> +					return ret;
> +				}
> +
> +				phys = pfn << PAGE_SHIFT;
> +				size = npage << PAGE_SHIFT;
> +			}
>  
>  			ret = iommu_map(domain->domain, iova, phys,
>  					size, dma->prot | domain->prot);
> @@ -796,6 +1138,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  
>  			iova += size;
>  		}
> +
> +		dma->iommu_mapped = true;
>  	}
>  
>  	return 0;
> @@ -853,7 +1197,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_group *group;
>  	struct vfio_domain *domain, *d;
> -	struct bus_type *bus = NULL;
> +	struct bus_type *bus = NULL, *mdev_bus;
>  	int ret;
>  
>  	mutex_lock(&iommu->lock);
> @@ -865,6 +1209,13 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		}
>  	}
>  
> +	if (iommu->external_domain) {
> +		if (find_iommu_group(iommu->external_domain, iommu_group)) {
> +			mutex_unlock(&iommu->lock);
> +			return -EINVAL;
> +		}
> +	}
> +
>  	group = kzalloc(sizeof(*group), GFP_KERNEL);
>  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
>  	if (!group || !domain) {
> @@ -879,6 +1230,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
>  
> +	mdev_bus = symbol_get(mdev_bus_type);
> +
> +	if (mdev_bus) {
> +		if ((bus == mdev_bus) && !iommu_present(bus)) {
> +			symbol_put(mdev_bus_type);
> +			if (!iommu->external_domain) {
> +				INIT_LIST_HEAD(&domain->group_list);
> +				iommu->external_domain = domain;
> +			} else
> +				kfree(domain);
> +
> +			list_add(&group->next,
> +				 &iommu->external_domain->group_list);
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		}
> +		symbol_put(mdev_bus_type);
> +	}
> +
>  	domain->domain = iommu_domain_alloc(bus);
>  	if (!domain->domain) {
>  		ret = -EIO;
> @@ -969,6 +1339,51 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>  }
>  
> +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> +{
> +	struct vfio_addr_space *as;
> +
> +	list_for_each_entry(as, &iommu->addr_space_list, next) {
> +		struct rb_node *n, *p;
> +		long locked = 0, unlocked = 0;
> +
> +		n = rb_first(&iommu->dma_list);
> +		for (; n; n = rb_next(n)) {
> +			struct vfio_dma *dma;
> +
> +			dma = rb_entry(n, struct vfio_dma, node);
> +			if (dma->addr_space == as)
> +				unlocked += vfio_unmap_unpin(iommu, dma, false);
> +		}
> +
> +		mutex_lock(&as->pfn_list_lock);
> +		p = rb_first(&as->pfn_list);
> +		for (; p; p = rb_next(p))
> +			locked++;
> +
> +		mutex_unlock(&as->pfn_list_lock);
> +		vfio_lock_acct(as->mm, locked - unlocked);
> +	}
> +}
> +
> +static void vfio_external_unpin_all(struct vfio_iommu *iommu,
> +				    bool do_accounting)
> +{
> +	struct vfio_addr_space *as;
> +
> +	list_for_each_entry(as, &iommu->addr_space_list, next) {
> +		struct rb_node *node;
> +
> +		mutex_lock(&as->pfn_list_lock);
> +		while ((node = rb_first(&as->pfn_list)))
> +			vfio_unpin_pfn(as,
> +				       rb_entry(node, struct vfio_pfn, node),
> +				       do_accounting);
> +
> +		mutex_unlock(&as->pfn_list_lock);
> +	}
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -978,6 +1393,28 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> +	if (iommu->external_domain) {
> +		domain = iommu->external_domain;
> +		group = find_iommu_group(domain, iommu_group);
> +		if (group) {
> +			list_del(&group->next);
> +			kfree(group);
> +
> +			if (list_empty(&domain->group_list)) {
> +				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +					vfio_external_unpin_all(iommu, true);
> +					vfio_iommu_unmap_unpin_all(iommu);
> +				} else
> +					vfio_external_unpin_all(iommu, false);
> +				kfree(domain);
> +				iommu->external_domain = NULL;
> +			}
> +			goto detach_group_done;
> +		}
> +	}
> +
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		goto detach_group_done;
>  
>  	list_for_each_entry(domain, &iommu->domain_list, next) {
>  		group = find_iommu_group(domain, iommu_group);
> @@ -988,21 +1425,27 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  		list_del(&group->next);
>  		kfree(group);
>  		/*
> -		 * Group ownership provides privilege, if the group
> -		 * list is empty, the domain goes away.  If it's the
> -		 * last domain, then all the mappings go away too.
> +		 * Group ownership provides privilege, if the group list is
> +		 * empty, the domain goes away. If it's the last domain with
> +		 * iommu and external domain doesn't exist, then all the
> +		 * mappings go away too. If it's the last domain with iommu and
> +		 * external domain exist, update accounting
>  		 */
>  		if (list_empty(&domain->group_list)) {
> -			if (list_is_singular(&iommu->domain_list))
> -				vfio_iommu_unmap_unpin_all(iommu);
> +			if (list_is_singular(&iommu->domain_list)) {
> +				if (!iommu->external_domain)
> +					vfio_iommu_unmap_unpin_all(iommu);
> +				else
> +					vfio_iommu_unmap_unpin_reaccount(iommu);
> +			}
>  			iommu_domain_free(domain->domain);
>  			list_del(&domain->next);
>  			kfree(domain);
>  		}
> -		goto done;
> +		break;
>  	}
>  
> -done:
> +detach_group_done:
>  	mutex_unlock(&iommu->lock);
>  }
>  
> @@ -1028,29 +1471,46 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	}
>  
>  	INIT_LIST_HEAD(&iommu->domain_list);
> +	INIT_LIST_HEAD(&iommu->addr_space_list);
>  	iommu->dma_list = RB_ROOT;
>  	mutex_init(&iommu->lock);
>  
>  	return iommu;
>  }
>  
> +static void vfio_release_domain(struct vfio_domain *domain, bool external)
> +{
> +	struct vfio_group *group, *group_tmp;
> +
> +	list_for_each_entry_safe(group, group_tmp,
> +				 &domain->group_list, next) {
> +		if (!external)
> +			iommu_detach_group(domain->domain, group->iommu_group);
> +		list_del(&group->next);
> +		kfree(group);
> +	}
> +
> +	if (!external)
> +		iommu_domain_free(domain->domain);
> +}
> +
>  static void vfio_iommu_type1_release(void *iommu_data)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_domain *domain, *domain_tmp;
> -	struct vfio_group *group, *group_tmp;
> +
> +	if (iommu->external_domain) {
> +		vfio_release_domain(iommu->external_domain, true);
> +		vfio_external_unpin_all(iommu, false);
> +		kfree(iommu->external_domain);
> +		iommu->external_domain = NULL;
> +	}
>  
>  	vfio_iommu_unmap_unpin_all(iommu);
>  
>  	list_for_each_entry_safe(domain, domain_tmp,
>  				 &iommu->domain_list, next) {
> -		list_for_each_entry_safe(group, group_tmp,
> -					 &domain->group_list, next) {
> -			iommu_detach_group(domain->domain, group->iommu_group);
> -			list_del(&group->next);
> -			kfree(group);
> -		}
> -		iommu_domain_free(domain->domain);
> +		vfio_release_domain(domain, false);
>  		list_del(&domain->next);
>  		kfree(domain);
>  	}
> @@ -1158,6 +1618,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
>  	.ioctl		= vfio_iommu_type1_ioctl,
>  	.attach_group	= vfio_iommu_type1_attach_group,
>  	.detach_group	= vfio_iommu_type1_detach_group,
> +	.pin_pages	= vfio_iommu_type1_pin_pages,
> +	.unpin_pages	= vfio_iommu_type1_unpin_pages,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jike Song Nov. 8, 2016, 2:20 a.m. UTC | #2
On 11/08/2016 07:16 AM, Alex Williamson wrote:
> On Sat, 5 Nov 2016 02:40:44 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>> managed by an IOMMU domain.
>>
>> Aim of this change is:
>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>> - To support direct assigned device and mediated device in single module
>>
>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>> backend module. More details:
>> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>>   that is, of the process who mapped that iova range.
>> - Added pfn_list tracking logic to address space structure. All pages
>>   pinned through this interface are trached in its address space.
>                                           ^ k
> ------------------------------------------|
> 
>> - Pinned pages list is used to verify unpinning request and to unpin
>>   remaining pages while detaching the group for that device.
>> - Page accounting is updated to account in its address space where the
>>   pages are pinned/unpinned.
>> -  Accouting for mdev device is only done if there is no iommu capable
>>   domain in the container. When there is a direct device assigned to the
>>   container and that domain is iommu capable, all pages are already pinned
>>   during DMA_MAP.
>> - Page accouting is updated on hot plug and unplug mdev device and pass
>>   through device.
>>
>> Tested by assigning below combinations of devices to a single VM:
>> - GPU pass through only
>> - vGPU device only
>> - One GPU pass through and one vGPU device
>> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>>   exist
>> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>>   exist
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Signed-off-by: Neo Jia <cjia@nvidia.com>
>> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 538 +++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 500 insertions(+), 38 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 8d64528dcc22..e511073446a0 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -36,6 +36,7 @@
>>  #include <linux/uaccess.h>
>>  #include <linux/vfio.h>
>>  #include <linux/workqueue.h>
>> +#include <linux/mdev.h>
>>  
>>  #define DRIVER_VERSION  "0.2"
>>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
>> @@ -56,6 +57,7 @@ MODULE_PARM_DESC(disable_hugepages,
>>  struct vfio_iommu {
>>  	struct list_head	domain_list;
>>  	struct list_head	addr_space_list;
>> +	struct vfio_domain	*external_domain; /* domain for external user */
>>  	struct mutex		lock;
>>  	struct rb_root		dma_list;
>>  	bool			v2;
>> @@ -67,6 +69,9 @@ struct vfio_addr_space {
>>  	struct mm_struct	*mm;
>>  	struct list_head	next;
>>  	atomic_t		ref_count;
>> +	/* external user pinned pfns */
>> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
>> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
>>  };
>>  
>>  struct vfio_domain {
>> @@ -83,6 +88,7 @@ struct vfio_dma {
>>  	unsigned long		vaddr;		/* Process virtual addr */
>>  	size_t			size;		/* Map size (bytes) */
>>  	int			prot;		/* IOMMU_READ/WRITE */
>> +	bool			iommu_mapped;
>>  	struct vfio_addr_space	*addr_space;
>>  	struct task_struct	*task;
>>  	bool			mlock_cap;
>> @@ -94,6 +100,19 @@ struct vfio_group {
>>  };
>>  
>>  /*
>> + * Guest RAM pinning working set or DMA target
>> + */
>> +struct vfio_pfn {
>> +	struct rb_node		node;
>> +	unsigned long		pfn;		/* Host pfn */
>> +	int			prot;
>> +	atomic_t		ref_count;
>> +};
>> +
>> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>> +					(!list_empty(&iommu->domain_list))
>> +
>> +/*
>>   * This code handles mapping and unmapping of user data buffers
>>   * into DMA'ble space using the IOMMU
>>   */
>> @@ -153,6 +172,93 @@ static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
>>  	return NULL;
>>  }
>>  
>> +/*
>> + * Helper Functions for host pfn list
>> + */
>> +static struct vfio_pfn *vfio_find_pfn(struct vfio_addr_space *addr_space,
>> +				      unsigned long pfn)
>> +{
>> +	struct vfio_pfn *vpfn;
>> +	struct rb_node *node = addr_space->pfn_list.rb_node;
>> +
>> +	while (node) {
>> +		vpfn = rb_entry(node, struct vfio_pfn, node);
>> +
>> +		if (pfn < vpfn->pfn)
>> +			node = node->rb_left;
>> +		else if (pfn > vpfn->pfn)
>> +			node = node->rb_right;
>> +		else
>> +			return vpfn;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +static void vfio_link_pfn(struct vfio_addr_space *addr_space,
>> +			  struct vfio_pfn *new)
>> +{
>> +	struct rb_node **link, *parent = NULL;
>> +	struct vfio_pfn *vpfn;
>> +
>> +	link = &addr_space->pfn_list.rb_node;
>> +	while (*link) {
>> +		parent = *link;
>> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
>> +
>> +		if (new->pfn < vpfn->pfn)
>> +			link = &(*link)->rb_left;
>> +		else
>> +			link = &(*link)->rb_right;
>> +	}
>> +
>> +	rb_link_node(&new->node, parent, link);
>> +	rb_insert_color(&new->node, &addr_space->pfn_list);
>> +}
>> +
>> +static void vfio_unlink_pfn(struct vfio_addr_space *addr_space,
>> +			    struct vfio_pfn *old)
>> +{
>> +	rb_erase(&old->node, &addr_space->pfn_list);
>> +}
>> +
>> +static int vfio_add_to_pfn_list(struct vfio_addr_space *addr_space,
>> +				unsigned long pfn, int prot)
>> +{
>> +	struct vfio_pfn *vpfn;
>> +
>> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
>> +	if (!vpfn)
>> +		return -ENOMEM;
>> +
>> +	vpfn->pfn = pfn;
>> +	vpfn->prot = prot;
>> +	atomic_set(&vpfn->ref_count, 1);
>> +	vfio_link_pfn(addr_space, vpfn);
>> +	return 0;
>> +}
>> +
>> +static void vfio_remove_from_pfn_list(struct vfio_addr_space *addr_space,
>> +				      struct vfio_pfn *vpfn)
>> +{
>> +	vfio_unlink_pfn(addr_space, vpfn);
>> +	kfree(vpfn);
>> +}
>> +
>> +static int vfio_pfn_account(struct vfio_addr_space *addr_space,
>> +			    unsigned long pfn)
>> +{
>> +	struct vfio_pfn *p;
>> +	int ret = 1;
>> +
>> +	mutex_lock(&addr_space->pfn_list_lock);
>> +	p = vfio_find_pfn(addr_space, pfn);
>> +	if (p)
>> +		ret = 0;
>> +	mutex_unlock(&addr_space->pfn_list_lock);
>> +	return ret;
>> +}
>> +
>>  struct vwork {
>>  	struct mm_struct	*mm;
>>  	long			npage;
>> @@ -304,16 +410,18 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>  	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>  	bool lock_cap = dma->mlock_cap;
>>  	struct mm_struct *mm = dma->addr_space->mm;
>> -	long ret, i;
>> +	long ret, i, lock_acct;
>>  	bool rsvd;
>>  
>>  	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>>  	if (ret)
>>  		return ret;
>>  
>> +	lock_acct = vfio_pfn_account(dma->addr_space, *pfn_base);
>> +
>>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
>>  
>> -	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>> +	if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
>>  		put_pfn(*pfn_base, prot);
>>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>>  			limit << PAGE_SHIFT);
>> @@ -340,8 +448,10 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>  			break;
>>  		}
>>  
>> +		lock_acct += vfio_pfn_account(dma->addr_space, pfn);
>> +
>>  		if (!rsvd && !lock_cap &&
>> -		    mm->locked_vm + i + 1 > limit) {
>> +		    mm->locked_vm + lock_acct + 1 > limit) {
>>  			put_pfn(pfn, prot);
>>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>>  				__func__, limit << PAGE_SHIFT);
>> @@ -350,7 +460,7 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>  	}
>>  
>>  	if (!rsvd)
>> -		vfio_lock_acct(mm, i);
>> +		vfio_lock_acct(mm, lock_acct);
>>  
>>  	return i;
>>  }
>> @@ -370,14 +480,214 @@ static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
>>  	return unlocked;
>>  }
>>  
>> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>> +static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>> +				    int prot, unsigned long *pfn_base,
>> +				    bool do_accounting)
>> +{
>> +	struct task_struct *task = dma->task;
>> +	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> +	bool lock_cap = dma->mlock_cap;
>> +	struct mm_struct *mm = dma->addr_space->mm;
>> +	int ret;
>> +	bool rsvd;
>> +
>> +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>> +	if (ret)
>> +		return ret;
>> +
>> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
>> +
>> +	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>> +		put_pfn(*pfn_base, prot);
>> +		pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
>> +			__func__, task->comm, task_pid_nr(task),
>> +			limit << PAGE_SHIFT);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	if (!rsvd && do_accounting)
>> +		vfio_lock_acct(mm, 1);
>> +
>> +	return 1;
>> +}
>> +
>> +static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
>> +				       unsigned long pfn, int prot,
>> +				       bool do_accounting)
>> +{
>> +	put_pfn(pfn, prot);
>> +
>> +	if (do_accounting)
>> +		vfio_lock_acct(addr_space->mm, -1);
> 
> Can't we batch this like we do elsewhere?  Intel folks, AIUI you intend
> to pin all VM memory through this side channel, have you tested the
> scalability and performance of this with larger VMs?  Our vfio_pfn
> data structure alone is 40 bytes per pinned page, which means for
> each 1GB of VM memory, we have 10MBs worth of struct vfio_pfn!
> Additionally, unmapping each 1GB of VM memory will result in 256k
> separate vfio_lock_acct() callbacks.  I'm concerned that we're not
> being efficient enough in either space or time.

Hi Alex,

Sorry for being confusing, Intel vGPU actually doesn't necessarily need
to pin all guest memory. A vGPU has its page table (GTT), whose access
is trapped. Whenever guest driver wants to specify a page for DMA, it
writes the GTT entry - thereby we could know the event and pin that
page only.

Performance data will be shared once available. Thanks :)

--
Thanks,
Jike
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexey Kardashevskiy Nov. 8, 2016, 6:52 a.m. UTC | #3
On 05/11/16 08:10, Kirti Wankhede wrote:
> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>   that is, of the process who mapped that iova range.
> - Added pfn_list tracking logic to address space structure. All pages
>   pinned through this interface are trached in its address space.
> - Pinned pages list is used to verify unpinning request and to unpin
>   remaining pages while detaching the group for that device.
> - Page accounting is updated to account in its address space where the
>   pages are pinned/unpinned.
> -  Accouting for mdev device is only done if there is no iommu capable
>   domain in the container. When there is a direct device assigned to the
>   container and that domain is iommu capable, all pages are already pinned
>   during DMA_MAP.
> - Page accouting is updated on hot plug and unplug mdev device and pass
>   through device.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only

This does not require this patchset, right?

> - vGPU device only

Out of curiosity - how exactly did you test this? The exact GPU, how to
create vGPU, what was the QEMU command line and the guest does with this
passed device? Thanks.
Kirti Wankhede Nov. 8, 2016, 3:06 p.m. UTC | #4
On 11/8/2016 4:46 AM, Alex Williamson wrote:
> On Sat, 5 Nov 2016 02:40:44 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
...

>> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>> +static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>> +				    int prot, unsigned long *pfn_base,
>> +				    bool do_accounting)
>> +{
>> +	struct task_struct *task = dma->task;
>> +	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> +	bool lock_cap = dma->mlock_cap;
>> +	struct mm_struct *mm = dma->addr_space->mm;
>> +	int ret;
>> +	bool rsvd;
>> +
>> +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>> +	if (ret)
>> +		return ret;
>> +
>> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
>> +
>> +	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>> +		put_pfn(*pfn_base, prot);
>> +		pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
>> +			__func__, task->comm, task_pid_nr(task),
>> +			limit << PAGE_SHIFT);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	if (!rsvd && do_accounting)
>> +		vfio_lock_acct(mm, 1);
>> +
>> +	return 1;
>> +}
>> +
>> +static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
>> +				       unsigned long pfn, int prot,
>> +				       bool do_accounting)
>> +{
>> +	put_pfn(pfn, prot);
>> +
>> +	if (do_accounting)
>> +		vfio_lock_acct(addr_space->mm, -1);
> 
> Can't we batch this like we do elsewhere?  Intel folks, AIUI you intend
> to pin all VM memory through this side channel, have you tested the
> scalability and performance of this with larger VMs?  Our vfio_pfn
> data structure alone is 40 bytes per pinned page, which means for
> each 1GB of VM memory, we have 10MBs worth of struct vfio_pfn!
> Additionally, unmapping each 1GB of VM memory will result in 256k
> separate vfio_lock_acct() callbacks.  I'm concerned that we're not
> being efficient enough in either space or time.
> 
> One thought might be whether we really need to save the pfn, we better
> always get the same result if we pin it again, or maybe we can just do
> a lookup through the mm at that point without re-pinning.  Could we get
> to the point where we only need an atomic_t ref count per page in a
> linear array relative to the IOVA?

Ok. Is System RAM hot-plug supported? How is system RAM hot-plug
handled? Are there DMA_MAP calls on such hot-plug for additional range?
If we have a linear array/memory, we will have to realloc it on memory
hot-plug?

>  That would give us 1MB per 1GB
> overhead. The semantics of the pin and unpin would make more sense then
> too, both would take an IOVA range, only pinning would need a return
> mechanism. For instance:
> 
> int pin_pages(void *iommu_data, dma_addr_t iova_base,
> 	      int npage, unsigned long *pfn_base);
> 
> This would pin physically contiguous pages up to npage, returning the
> base pfn and returning the number of pages pinned (<= npage).  The
> vendor driver would make multiple calls to fill the necessary range.


With the current patch, input is user_pfn[] array and npages.

int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
                   int npage, int prot, unsigned long *phys_pfn)


When guest allocates memory with malloc(), gfns would not be contiguous,
right? These gfns (user_pfns) are passed as argument here.
Is there any case where we could get pin/unpin request for contiguous pages?


> Unpin would then simply be:
> 
> void unpin_pages(void *iommu_data, dma_addr_t iova_base, int npage);
> 
> Hugepage usage would really make such an interface shine (ie. 2MB+
> contiguous ranges).  A downside would be the overhead of getting the
> group and container reference in vfio for each callback, perhaps we'd
> need to figure out how the vendor driver could hold that reference.

In very initial phases of proposal, I had suggested to keep pointer to
container->iommu_data in struct mdev_device. But that was discarded.

> The current API of passing around pfn arrays, further increasing the
> overhead of the whole ecosystem just makes me cringe though.
> 

...

>> +		if (ret <= 0) {
>> +			WARN_ON(!ret);
>> +			goto pin_unwind;
>> +		}
>> +
>> +		mutex_lock(&dma->addr_space->pfn_list_lock);
>> +
>> +		/* search if pfn exist */
>> +		p = vfio_find_pfn(dma->addr_space, pfn[i]);
>> +		if (p) {
>> +			atomic_inc(&p->ref_count);
> 
> We never test whether (p->prot == prot), shouldn't we be doing
> something in that case?  In fact, why do we allow the side-channel
> through the .{un}pin_pages to specify page protection flags that might
> be different than the user specified for the DMA_MAP?  If the user
> specified read-only, the vendor driver should not be allowed to
> override with read-write access.
> 

If user specified protection flags for DMA_MAP could be
prot = IOMMU_WRITE | IOMMU_READ;

But vendor driver can request to pin page to be readonly, i.e.
IOMMU_READ. In that case, pin pages should be allowed, right?

Then the check should be if (p->prot & prot).

Thanks,
Kirti
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Nov. 8, 2016, 4:18 p.m. UTC | #5
On Tue, 08 Nov 2016 10:20:14 +0800
Jike Song <jike.song@intel.com> wrote:

> On 11/08/2016 07:16 AM, Alex Williamson wrote:
> > On Sat, 5 Nov 2016 02:40:44 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> >> Mediated device only uses IOMMU APIs, the underlying hardware can be
> >> managed by an IOMMU domain.
> >>
> >> Aim of this change is:
> >> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> >> - To support direct assigned device and mediated device in single module
> >>
> >> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> >> backend module. More details:
> >> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
> >>   that is, of the process who mapped that iova range.
> >> - Added pfn_list tracking logic to address space structure. All pages
> >>   pinned through this interface are trached in its address space.  
> >                                           ^ k
> > ------------------------------------------|
> >   
> >> - Pinned pages list is used to verify unpinning request and to unpin
> >>   remaining pages while detaching the group for that device.
> >> - Page accounting is updated to account in its address space where the
> >>   pages are pinned/unpinned.
> >> -  Accouting for mdev device is only done if there is no iommu capable
> >>   domain in the container. When there is a direct device assigned to the
> >>   container and that domain is iommu capable, all pages are already pinned
> >>   during DMA_MAP.
> >> - Page accouting is updated on hot plug and unplug mdev device and pass
> >>   through device.
> >>
> >> Tested by assigning below combinations of devices to a single VM:
> >> - GPU pass through only
> >> - vGPU device only
> >> - One GPU pass through and one vGPU device
> >> - Linux VM hot plug and unplug vGPU device while GPU pass through device
> >>   exist
> >> - Linux VM hot plug and unplug GPU pass through device while vGPU device
> >>   exist
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Signed-off-by: Neo Jia <cjia@nvidia.com>
> >> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 538 +++++++++++++++++++++++++++++++++++++---
> >>  1 file changed, 500 insertions(+), 38 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 8d64528dcc22..e511073446a0 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -36,6 +36,7 @@
> >>  #include <linux/uaccess.h>
> >>  #include <linux/vfio.h>
> >>  #include <linux/workqueue.h>
> >> +#include <linux/mdev.h>
> >>  
> >>  #define DRIVER_VERSION  "0.2"
> >>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> >> @@ -56,6 +57,7 @@ MODULE_PARM_DESC(disable_hugepages,
> >>  struct vfio_iommu {
> >>  	struct list_head	domain_list;
> >>  	struct list_head	addr_space_list;
> >> +	struct vfio_domain	*external_domain; /* domain for external user */
> >>  	struct mutex		lock;
> >>  	struct rb_root		dma_list;
> >>  	bool			v2;
> >> @@ -67,6 +69,9 @@ struct vfio_addr_space {
> >>  	struct mm_struct	*mm;
> >>  	struct list_head	next;
> >>  	atomic_t		ref_count;
> >> +	/* external user pinned pfns */
> >> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
> >> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
> >>  };
> >>  
> >>  struct vfio_domain {
> >> @@ -83,6 +88,7 @@ struct vfio_dma {
> >>  	unsigned long		vaddr;		/* Process virtual addr */
> >>  	size_t			size;		/* Map size (bytes) */
> >>  	int			prot;		/* IOMMU_READ/WRITE */
> >> +	bool			iommu_mapped;
> >>  	struct vfio_addr_space	*addr_space;
> >>  	struct task_struct	*task;
> >>  	bool			mlock_cap;
> >> @@ -94,6 +100,19 @@ struct vfio_group {
> >>  };
> >>  
> >>  /*
> >> + * Guest RAM pinning working set or DMA target
> >> + */
> >> +struct vfio_pfn {
> >> +	struct rb_node		node;
> >> +	unsigned long		pfn;		/* Host pfn */
> >> +	int			prot;
> >> +	atomic_t		ref_count;
> >> +};
> >> +
> >> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >> +					(!list_empty(&iommu->domain_list))
> >> +
> >> +/*
> >>   * This code handles mapping and unmapping of user data buffers
> >>   * into DMA'ble space using the IOMMU
> >>   */
> >> @@ -153,6 +172,93 @@ static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
> >>  	return NULL;
> >>  }
> >>  
> >> +/*
> >> + * Helper Functions for host pfn list
> >> + */
> >> +static struct vfio_pfn *vfio_find_pfn(struct vfio_addr_space *addr_space,
> >> +				      unsigned long pfn)
> >> +{
> >> +	struct vfio_pfn *vpfn;
> >> +	struct rb_node *node = addr_space->pfn_list.rb_node;
> >> +
> >> +	while (node) {
> >> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> >> +
> >> +		if (pfn < vpfn->pfn)
> >> +			node = node->rb_left;
> >> +		else if (pfn > vpfn->pfn)
> >> +			node = node->rb_right;
> >> +		else
> >> +			return vpfn;
> >> +	}
> >> +
> >> +	return NULL;
> >> +}
> >> +
> >> +static void vfio_link_pfn(struct vfio_addr_space *addr_space,
> >> +			  struct vfio_pfn *new)
> >> +{
> >> +	struct rb_node **link, *parent = NULL;
> >> +	struct vfio_pfn *vpfn;
> >> +
> >> +	link = &addr_space->pfn_list.rb_node;
> >> +	while (*link) {
> >> +		parent = *link;
> >> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> >> +
> >> +		if (new->pfn < vpfn->pfn)
> >> +			link = &(*link)->rb_left;
> >> +		else
> >> +			link = &(*link)->rb_right;
> >> +	}
> >> +
> >> +	rb_link_node(&new->node, parent, link);
> >> +	rb_insert_color(&new->node, &addr_space->pfn_list);
> >> +}
> >> +
> >> +static void vfio_unlink_pfn(struct vfio_addr_space *addr_space,
> >> +			    struct vfio_pfn *old)
> >> +{
> >> +	rb_erase(&old->node, &addr_space->pfn_list);
> >> +}
> >> +
> >> +static int vfio_add_to_pfn_list(struct vfio_addr_space *addr_space,
> >> +				unsigned long pfn, int prot)
> >> +{
> >> +	struct vfio_pfn *vpfn;
> >> +
> >> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> >> +	if (!vpfn)
> >> +		return -ENOMEM;
> >> +
> >> +	vpfn->pfn = pfn;
> >> +	vpfn->prot = prot;
> >> +	atomic_set(&vpfn->ref_count, 1);
> >> +	vfio_link_pfn(addr_space, vpfn);
> >> +	return 0;
> >> +}
> >> +
> >> +static void vfio_remove_from_pfn_list(struct vfio_addr_space *addr_space,
> >> +				      struct vfio_pfn *vpfn)
> >> +{
> >> +	vfio_unlink_pfn(addr_space, vpfn);
> >> +	kfree(vpfn);
> >> +}
> >> +
> >> +static int vfio_pfn_account(struct vfio_addr_space *addr_space,
> >> +			    unsigned long pfn)
> >> +{
> >> +	struct vfio_pfn *p;
> >> +	int ret = 1;
> >> +
> >> +	mutex_lock(&addr_space->pfn_list_lock);
> >> +	p = vfio_find_pfn(addr_space, pfn);
> >> +	if (p)
> >> +		ret = 0;
> >> +	mutex_unlock(&addr_space->pfn_list_lock);
> >> +	return ret;
> >> +}
> >> +
> >>  struct vwork {
> >>  	struct mm_struct	*mm;
> >>  	long			npage;
> >> @@ -304,16 +410,18 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >>  	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >>  	bool lock_cap = dma->mlock_cap;
> >>  	struct mm_struct *mm = dma->addr_space->mm;
> >> -	long ret, i;
> >> +	long ret, i, lock_acct;
> >>  	bool rsvd;
> >>  
> >>  	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
> >>  	if (ret)
> >>  		return ret;
> >>  
> >> +	lock_acct = vfio_pfn_account(dma->addr_space, *pfn_base);
> >> +
> >>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
> >>  
> >> -	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> >> +	if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
> >>  		put_pfn(*pfn_base, prot);
> >>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> >>  			limit << PAGE_SHIFT);
> >> @@ -340,8 +448,10 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >>  			break;
> >>  		}
> >>  
> >> +		lock_acct += vfio_pfn_account(dma->addr_space, pfn);
> >> +
> >>  		if (!rsvd && !lock_cap &&
> >> -		    mm->locked_vm + i + 1 > limit) {
> >> +		    mm->locked_vm + lock_acct + 1 > limit) {
> >>  			put_pfn(pfn, prot);
> >>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> >>  				__func__, limit << PAGE_SHIFT);
> >> @@ -350,7 +460,7 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >>  	}
> >>  
> >>  	if (!rsvd)
> >> -		vfio_lock_acct(mm, i);
> >> +		vfio_lock_acct(mm, lock_acct);
> >>  
> >>  	return i;
> >>  }
> >> @@ -370,14 +480,214 @@ static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
> >>  	return unlocked;
> >>  }
> >>  
> >> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >> +static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> >> +				    int prot, unsigned long *pfn_base,
> >> +				    bool do_accounting)
> >> +{
> >> +	struct task_struct *task = dma->task;
> >> +	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >> +	bool lock_cap = dma->mlock_cap;
> >> +	struct mm_struct *mm = dma->addr_space->mm;
> >> +	int ret;
> >> +	bool rsvd;
> >> +
> >> +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
> >> +
> >> +	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> >> +		put_pfn(*pfn_base, prot);
> >> +		pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
> >> +			__func__, task->comm, task_pid_nr(task),
> >> +			limit << PAGE_SHIFT);
> >> +		return -ENOMEM;
> >> +	}
> >> +
> >> +	if (!rsvd && do_accounting)
> >> +		vfio_lock_acct(mm, 1);
> >> +
> >> +	return 1;
> >> +}
> >> +
> >> +static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
> >> +				       unsigned long pfn, int prot,
> >> +				       bool do_accounting)
> >> +{
> >> +	put_pfn(pfn, prot);
> >> +
> >> +	if (do_accounting)
> >> +		vfio_lock_acct(addr_space->mm, -1);  
> > 
> > Can't we batch this like we do elsewhere?  Intel folks, AIUI you intend
> > to pin all VM memory through this side channel, have you tested the
> > scalability and performance of this with larger VMs?  Our vfio_pfn
> > data structure alone is 40 bytes per pinned page, which means for
> > each 1GB of VM memory, we have 10MBs worth of struct vfio_pfn!
> > Additionally, unmapping each 1GB of VM memory will result in 256k
> > separate vfio_lock_acct() callbacks.  I'm concerned that we're not
> > being efficient enough in either space or time.  
> 
> Hi Alex,
> 
> Sorry for being confusing, Intel vGPU actually doesn't necessarily need
> to pin all guest memory. A vGPU has its page table (GTT), whose access
> is trapped. Whenever guest driver wants to specify a page for DMA, it
> writes the GTT entry - thereby we could know the event and pin that
> page only.
> 
> Performance data will be shared once available. Thanks :)

Ok, so maybe if none of the initial users are mapping full VM memory
then scalability can be saved for future optimization.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Nov. 8, 2016, 5:05 p.m. UTC | #6
On Tue, 8 Nov 2016 20:36:34 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/8/2016 4:46 AM, Alex Williamson wrote:
> > On Sat, 5 Nov 2016 02:40:44 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> ...
> 
> >> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >> +static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> >> +				    int prot, unsigned long *pfn_base,
> >> +				    bool do_accounting)
> >> +{
> >> +	struct task_struct *task = dma->task;
> >> +	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >> +	bool lock_cap = dma->mlock_cap;
> >> +	struct mm_struct *mm = dma->addr_space->mm;
> >> +	int ret;
> >> +	bool rsvd;
> >> +
> >> +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
> >> +
> >> +	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> >> +		put_pfn(*pfn_base, prot);
> >> +		pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
> >> +			__func__, task->comm, task_pid_nr(task),
> >> +			limit << PAGE_SHIFT);
> >> +		return -ENOMEM;
> >> +	}
> >> +
> >> +	if (!rsvd && do_accounting)
> >> +		vfio_lock_acct(mm, 1);
> >> +
> >> +	return 1;
> >> +}
> >> +
> >> +static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
> >> +				       unsigned long pfn, int prot,
> >> +				       bool do_accounting)
> >> +{
> >> +	put_pfn(pfn, prot);
> >> +
> >> +	if (do_accounting)
> >> +		vfio_lock_acct(addr_space->mm, -1);  
> > 
> > Can't we batch this like we do elsewhere?  Intel folks, AIUI you intend
> > to pin all VM memory through this side channel, have you tested the
> > scalability and performance of this with larger VMs?  Our vfio_pfn
> > data structure alone is 40 bytes per pinned page, which means for
> > each 1GB of VM memory, we have 10MBs worth of struct vfio_pfn!
> > Additionally, unmapping each 1GB of VM memory will result in 256k
> > separate vfio_lock_acct() callbacks.  I'm concerned that we're not
> > being efficient enough in either space or time.
> > 
> > One thought might be whether we really need to save the pfn, we better
> > always get the same result if we pin it again, or maybe we can just do
> > a lookup through the mm at that point without re-pinning.  Could we get
> > to the point where we only need an atomic_t ref count per page in a
> > linear array relative to the IOVA?  
> 
> Ok. Is System RAM hot-plug supported? How is system RAM hot-plug
> handled? Are there DMA_MAP calls on such hot-plug for additional range?
> If we have a linear array/memory, we will have to realloc it on memory
> hot-plug?

I was thinking a linear array for each IOVA page within a vfio_dma.
The array would track the number of references (pins) of each page.  It
might actually need to be a page table given that a single vfio_dma can
nearly map the entire 64bit address space.  I don't think RAM hotplug
is a factor here, we need to support and properly account for multiple
IOVAs mapping to the same pfn, but the typical case will be a 1:1
mapping, I think that's what we'd optimize for.

> >  That would give us 1MB per 1GB
> > overhead. The semantics of the pin and unpin would make more sense then
> > too, both would take an IOVA range, only pinning would need a return
> > mechanism. For instance:
> > 
> > int pin_pages(void *iommu_data, dma_addr_t iova_base,
> > 	      int npage, unsigned long *pfn_base);
> > 
> > This would pin physically contiguous pages up to npage, returning the
> > base pfn and returning the number of pages pinned (<= npage).  The
> > vendor driver would make multiple calls to fill the necessary range.  
> 
> 
> With the current patch, input is user_pfn[] array and npages.
> 
> int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
>                    int npage, int prot, unsigned long *phys_pfn)
> 
> 
> When guest allocates memory with malloc(), gfns would not be contiguous,
> right? These gfns (user_pfns) are passed as argument here.
> Is there any case where we could get pin/unpin request for contiguous pages?

It would depend on whether the user within the guest is actually
optimizing for hugepages.

> > Unpin would then simply be:
> > 
> > void unpin_pages(void *iommu_data, dma_addr_t iova_base, int npage);
> > 
> > Hugepage usage would really make such an interface shine (ie. 2MB+
> > contiguous ranges).  A downside would be the overhead of getting the
> > group and container reference in vfio for each callback, perhaps we'd
> > need to figure out how the vendor driver could hold that reference.  
> 
> In very initial phases of proposal, I had suggested to keep pointer to
> container->iommu_data in struct mdev_device. But that was discarded.

The referencing is definitely tricky, we run into the same problem that
Alexey had in trying to do that where holding a reference to the
container doesn't actually keep the container.  The user can tear down
the container or remove groups from the container, which automatically
removes the iommu backend.

> > The current API of passing around pfn arrays, further increasing the
> > overhead of the whole ecosystem just makes me cringe though.
> >   
> 
> ...
> 
> >> +		if (ret <= 0) {
> >> +			WARN_ON(!ret);
> >> +			goto pin_unwind;
> >> +		}
> >> +
> >> +		mutex_lock(&dma->addr_space->pfn_list_lock);
> >> +
> >> +		/* search if pfn exist */
> >> +		p = vfio_find_pfn(dma->addr_space, pfn[i]);
> >> +		if (p) {
> >> +			atomic_inc(&p->ref_count);  
> > 
> > We never test whether (p->prot == prot), shouldn't we be doing
> > something in that case?  In fact, why do we allow the side-channel
> > through the .{un}pin_pages to specify page protection flags that might
> > be different than the user specified for the DMA_MAP?  If the user
> > specified read-only, the vendor driver should not be allowed to
> > override with read-write access.
> >   
> 
> If user specified protection flags for DMA_MAP could be
> prot = IOMMU_WRITE | IOMMU_READ;
> 
> But vendor driver can request to pin page to be readonly, i.e.
> IOMMU_READ. In that case, pin pages should be allowed, right?
> 
> Then the check should be if (p->prot & prot).

That doesn't solve the problem, we might have an existing pfn with
prot R and we're asking for RW, (R & RW) is true, but that's not what
we asked for.  I think the solution would be to test the vendor driver
requested prot against vfio_dma.prot, but always pin the pages using
the vfio_dma.prot flags.  Thus if the vendor driver asks for read-only
and the vfio_dma.prot is read-write, these are compatible and the page
is pinned read-write since that's the user mapping.  Otherwise you need
to save prot per page (which you already do), but you're also going to
need to handle promoting pages if we have multiple vendors asking for
different prot attributes.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexey Kardashevskiy Nov. 15, 2016, 5:17 a.m. UTC | #7
On 08/11/16 17:52, Alexey Kardashevskiy wrote:
> On 05/11/16 08:10, Kirti Wankhede wrote:
>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>> managed by an IOMMU domain.
>>
>> Aim of this change is:
>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>> - To support direct assigned device and mediated device in single module
>>
>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>> backend module. More details:
>> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>>   that is, of the process who mapped that iova range.
>> - Added pfn_list tracking logic to address space structure. All pages
>>   pinned through this interface are trached in its address space.
>> - Pinned pages list is used to verify unpinning request and to unpin
>>   remaining pages while detaching the group for that device.
>> - Page accounting is updated to account in its address space where the
>>   pages are pinned/unpinned.
>> -  Accouting for mdev device is only done if there is no iommu capable
>>   domain in the container. When there is a direct device assigned to the
>>   container and that domain is iommu capable, all pages are already pinned
>>   during DMA_MAP.
>> - Page accouting is updated on hot plug and unplug mdev device and pass
>>   through device.
>>
>> Tested by assigning below combinations of devices to a single VM:
>> - GPU pass through only
> 
> This does not require this patchset, right?
> 
>> - vGPU device only
> 
> Out of curiosity - how exactly did you test this? The exact GPU, how to
> create vGPU, what was the QEMU command line and the guest does with this
> passed device? Thanks.


ping?
Kirti Wankhede Nov. 15, 2016, 6:33 a.m. UTC | #8
On 11/15/2016 10:47 AM, Alexey Kardashevskiy wrote:
> On 08/11/16 17:52, Alexey Kardashevskiy wrote:
>> On 05/11/16 08:10, Kirti Wankhede wrote:
>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>> managed by an IOMMU domain.
>>>
>>> Aim of this change is:
>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>> - To support direct assigned device and mediated device in single module
>>>
>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>> backend module. More details:
>>> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>>>   that is, of the process who mapped that iova range.
>>> - Added pfn_list tracking logic to address space structure. All pages
>>>   pinned through this interface are trached in its address space.
>>> - Pinned pages list is used to verify unpinning request and to unpin
>>>   remaining pages while detaching the group for that device.
>>> - Page accounting is updated to account in its address space where the
>>>   pages are pinned/unpinned.
>>> -  Accouting for mdev device is only done if there is no iommu capable
>>>   domain in the container. When there is a direct device assigned to the
>>>   container and that domain is iommu capable, all pages are already pinned
>>>   during DMA_MAP.
>>> - Page accouting is updated on hot plug and unplug mdev device and pass
>>>   through device.
>>>
>>> Tested by assigning below combinations of devices to a single VM:
>>> - GPU pass through only
>>
>> This does not require this patchset, right?
>>

Sorry I missed this earlier.
This testing is required for this patch, because this patch touches code
that is used for direct device assignment. Also for page accounting, all
cases are considered i.e. when there is only pass through device in a
container, when there is pass through device + vGPU device in a
container. Also have to test that pages are pinned properly when device
is hotplugged. In that case vfio_iommu_replay() is called to take
necessary action.

>>> - vGPU device only
>>
>> Out of curiosity - how exactly did you test this? The exact GPU, how to
>> create vGPU, what was the QEMU command line and the guest does with this
>> passed device? Thanks.
> 
> ping?
> 

I'm testing this code with M60, with custom changes in our driver.
Steps how to create mediated device are listed in
Documentation/vfio-mediated-device.txt for sample mtty driver. Same
steps I'm following for GPU. Quoting those steps here for you:

2. Create a mediated device by using the dummy device that you created
in the
   previous step.

   # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" >      \

/sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create

3. Add parameters to qemu-kvm.

   -device vfio-pci,\
    sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001


Thanks,
Kirti

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexey Kardashevskiy Nov. 15, 2016, 7:27 a.m. UTC | #9
On 15/11/16 17:33, Kirti Wankhede wrote:
> 
> 
> On 11/15/2016 10:47 AM, Alexey Kardashevskiy wrote:
>> On 08/11/16 17:52, Alexey Kardashevskiy wrote:
>>> On 05/11/16 08:10, Kirti Wankhede wrote:
>>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>>> managed by an IOMMU domain.
>>>>
>>>> Aim of this change is:
>>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>>> - To support direct assigned device and mediated device in single module
>>>>
>>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>>> backend module. More details:
>>>> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>>>>   that is, of the process who mapped that iova range.
>>>> - Added pfn_list tracking logic to address space structure. All pages
>>>>   pinned through this interface are trached in its address space.
>>>> - Pinned pages list is used to verify unpinning request and to unpin
>>>>   remaining pages while detaching the group for that device.
>>>> - Page accounting is updated to account in its address space where the
>>>>   pages are pinned/unpinned.
>>>> -  Accouting for mdev device is only done if there is no iommu capable
>>>>   domain in the container. When there is a direct device assigned to the
>>>>   container and that domain is iommu capable, all pages are already pinned
>>>>   during DMA_MAP.
>>>> - Page accouting is updated on hot plug and unplug mdev device and pass
>>>>   through device.
>>>>
>>>> Tested by assigning below combinations of devices to a single VM:
>>>> - GPU pass through only
>>>
>>> This does not require this patchset, right?
>>>
> 
> Sorry I missed this earlier.
> This testing is required for this patch, because this patch touches code
> that is used for direct device assignment. Also for page accounting, all
> cases are considered i.e. when there is only pass through device in a
> container, when there is pass through device + vGPU device in a
> container. Also have to test that pages are pinned properly when device
> is hotplugged. In that case vfio_iommu_replay() is called to take
> necessary action.

So in this particular test you are only testing that the patchset did not
break the already existing functionality, is that correct?


> 
>>>> - vGPU device only
>>>
>>> Out of curiosity - how exactly did you test this? The exact GPU, how to
>>> create vGPU, what was the QEMU command line and the guest does with this
>>> passed device? Thanks.
>>
>> ping?
>>
> 
> I'm testing this code with M60, with custom changes in our driver.


Is this shared anywhere? What does the mediated driver do? Can Tesla K80 do
the same thing, or [10de:15fe] (whatever its name is)?


> Steps how to create mediated device are listed in
> Documentation/vfio-mediated-device.txt for sample mtty driver. Same
> steps I'm following for GPU. Quoting those steps here for you:


Nah, I saw this, I was wondering about actual hardware :) Like when you say
"tested with vGPU" - I am wondering what is passed to the guest and how the
guest is actually using it.


> 
> 2. Create a mediated device by using the dummy device that you created
> in the
>    previous step.
> 
>    # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" >      \
> 
> /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create
> 
> 3. Add parameters to qemu-kvm.
> 
>    -device vfio-pci,\
>     sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
Kirti Wankhede Nov. 15, 2016, 7:56 a.m. UTC | #10
On 11/15/2016 12:57 PM, Alexey Kardashevskiy wrote:
> On 15/11/16 17:33, Kirti Wankhede wrote:
>>
>>
>> On 11/15/2016 10:47 AM, Alexey Kardashevskiy wrote:
>>> On 08/11/16 17:52, Alexey Kardashevskiy wrote:
>>>> On 05/11/16 08:10, Kirti Wankhede wrote:
>>>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>>>> managed by an IOMMU domain.
>>>>>
>>>>> Aim of this change is:
>>>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>>>> - To support direct assigned device and mediated device in single module
>>>>>
>>>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>>>> backend module. More details:
>>>>> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>>>>>   that is, of the process who mapped that iova range.
>>>>> - Added pfn_list tracking logic to address space structure. All pages
>>>>>   pinned through this interface are trached in its address space.
>>>>> - Pinned pages list is used to verify unpinning request and to unpin
>>>>>   remaining pages while detaching the group for that device.
>>>>> - Page accounting is updated to account in its address space where the
>>>>>   pages are pinned/unpinned.
>>>>> -  Accouting for mdev device is only done if there is no iommu capable
>>>>>   domain in the container. When there is a direct device assigned to the
>>>>>   container and that domain is iommu capable, all pages are already pinned
>>>>>   during DMA_MAP.
>>>>> - Page accouting is updated on hot plug and unplug mdev device and pass
>>>>>   through device.
>>>>>
>>>>> Tested by assigning below combinations of devices to a single VM:
>>>>> - GPU pass through only
>>>>
>>>> This does not require this patchset, right?
>>>>
>>
>> Sorry I missed this earlier.
>> This testing is required for this patch, because this patch touches code
>> that is used for direct device assignment. Also for page accounting, all
>> cases are considered i.e. when there is only pass through device in a
>> container, when there is pass through device + vGPU device in a
>> container. Also have to test that pages are pinned properly when device
>> is hotplugged. In that case vfio_iommu_replay() is called to take
>> necessary action.
> 
> So in this particular test you are only testing that the patchset did not
> break the already existing functionality, is that correct?
> 
> 

Yes.

>>
>>>>> - vGPU device only
>>>>
>>>> Out of curiosity - how exactly did you test this? The exact GPU, how to
>>>> create vGPU, what was the QEMU command line and the guest does with this
>>>> passed device? Thanks.
>>>
>>> ping?
>>>
>>
>> I'm testing this code with M60, with custom changes in our driver.
> 
> 
> Is this shared anywhere? What does the mediated driver do? Can Tesla K80 do
> the same thing, or [10de:15fe] (whatever its name is)?
> 
> 

No, we are still in the process of upstreaming this patchset. After that
we would prepare official drivers. There is no timeline for the driver
release yet.

>> Steps how to create mediated device are listed in
>> Documentation/vfio-mediated-device.txt for sample mtty driver. Same
>> steps I'm following for GPU. Quoting those steps here for you:
> 
> 
> Nah, I saw this, I was wondering about actual hardware :) Like when you say
> "tested with vGPU" - I am wondering what is passed to the guest and how the
> guest is actually using it.
> 
> 

Its similar to mtty sample driver, that simulates serial port and so you
see serial ports in guest. For vGPU, you see a virtual GPU in guest.


Thanks,
Kirti

>>
>> 2. Create a mediated device by using the dummy device that you created
>> in the
>>    previous step.
>>
>>    # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" >      \
>>
>> /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create
>>
>> 3. Add parameters to qemu-kvm.
>>
>>    -device vfio-pci,\
>>     sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
> 
> 
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 8d64528dcc22..e511073446a0 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,6 +36,7 @@ 
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
+#include <linux/mdev.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -56,6 +57,7 @@  MODULE_PARM_DESC(disable_hugepages,
 struct vfio_iommu {
 	struct list_head	domain_list;
 	struct list_head	addr_space_list;
+	struct vfio_domain	*external_domain; /* domain for external user */
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	bool			v2;
@@ -67,6 +69,9 @@  struct vfio_addr_space {
 	struct mm_struct	*mm;
 	struct list_head	next;
 	atomic_t		ref_count;
+	/* external user pinned pfns */
+	struct rb_root		pfn_list;	/* pinned Host pfn list */
+	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
 };
 
 struct vfio_domain {
@@ -83,6 +88,7 @@  struct vfio_dma {
 	unsigned long		vaddr;		/* Process virtual addr */
 	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
+	bool			iommu_mapped;
 	struct vfio_addr_space	*addr_space;
 	struct task_struct	*task;
 	bool			mlock_cap;
@@ -94,6 +100,19 @@  struct vfio_group {
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+	struct rb_node		node;
+	unsigned long		pfn;		/* Host pfn */
+	int			prot;
+	atomic_t		ref_count;
+};
+
+#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
+					(!list_empty(&iommu->domain_list))
+
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -153,6 +172,93 @@  static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
 	return NULL;
 }
 
+/*
+ * Helper Functions for host pfn list
+ */
+static struct vfio_pfn *vfio_find_pfn(struct vfio_addr_space *addr_space,
+				      unsigned long pfn)
+{
+	struct vfio_pfn *vpfn;
+	struct rb_node *node = addr_space->pfn_list.rb_node;
+
+	while (node) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if (pfn < vpfn->pfn)
+			node = node->rb_left;
+		else if (pfn > vpfn->pfn)
+			node = node->rb_right;
+		else
+			return vpfn;
+	}
+
+	return NULL;
+}
+
+static void vfio_link_pfn(struct vfio_addr_space *addr_space,
+			  struct vfio_pfn *new)
+{
+	struct rb_node **link, *parent = NULL;
+	struct vfio_pfn *vpfn;
+
+	link = &addr_space->pfn_list.rb_node;
+	while (*link) {
+		parent = *link;
+		vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+		if (new->pfn < vpfn->pfn)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &addr_space->pfn_list);
+}
+
+static void vfio_unlink_pfn(struct vfio_addr_space *addr_space,
+			    struct vfio_pfn *old)
+{
+	rb_erase(&old->node, &addr_space->pfn_list);
+}
+
+static int vfio_add_to_pfn_list(struct vfio_addr_space *addr_space,
+				unsigned long pfn, int prot)
+{
+	struct vfio_pfn *vpfn;
+
+	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
+	if (!vpfn)
+		return -ENOMEM;
+
+	vpfn->pfn = pfn;
+	vpfn->prot = prot;
+	atomic_set(&vpfn->ref_count, 1);
+	vfio_link_pfn(addr_space, vpfn);
+	return 0;
+}
+
+static void vfio_remove_from_pfn_list(struct vfio_addr_space *addr_space,
+				      struct vfio_pfn *vpfn)
+{
+	vfio_unlink_pfn(addr_space, vpfn);
+	kfree(vpfn);
+}
+
+static int vfio_pfn_account(struct vfio_addr_space *addr_space,
+			    unsigned long pfn)
+{
+	struct vfio_pfn *p;
+	int ret = 1;
+
+	mutex_lock(&addr_space->pfn_list_lock);
+	p = vfio_find_pfn(addr_space, pfn);
+	if (p)
+		ret = 0;
+	mutex_unlock(&addr_space->pfn_list_lock);
+	return ret;
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -304,16 +410,18 @@  static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = dma->mlock_cap;
 	struct mm_struct *mm = dma->addr_space->mm;
-	long ret, i;
+	long ret, i, lock_acct;
 	bool rsvd;
 
 	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
 	if (ret)
 		return ret;
 
+	lock_acct = vfio_pfn_account(dma->addr_space, *pfn_base);
+
 	rsvd = is_invalid_reserved_pfn(*pfn_base);
 
-	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
+	if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
 		put_pfn(*pfn_base, prot);
 		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 			limit << PAGE_SHIFT);
@@ -340,8 +448,10 @@  static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 			break;
 		}
 
+		lock_acct += vfio_pfn_account(dma->addr_space, pfn);
+
 		if (!rsvd && !lock_cap &&
-		    mm->locked_vm + i + 1 > limit) {
+		    mm->locked_vm + lock_acct + 1 > limit) {
 			put_pfn(pfn, prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 				__func__, limit << PAGE_SHIFT);
@@ -350,7 +460,7 @@  static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	}
 
 	if (!rsvd)
-		vfio_lock_acct(mm, i);
+		vfio_lock_acct(mm, lock_acct);
 
 	return i;
 }
@@ -370,14 +480,214 @@  static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
 	return unlocked;
 }
 
-static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
+static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
+				    int prot, unsigned long *pfn_base,
+				    bool do_accounting)
+{
+	struct task_struct *task = dma->task;
+	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	bool lock_cap = dma->mlock_cap;
+	struct mm_struct *mm = dma->addr_space->mm;
+	int ret;
+	bool rsvd;
+
+	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
+	if (ret)
+		return ret;
+
+	rsvd = is_invalid_reserved_pfn(*pfn_base);
+
+	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
+		put_pfn(*pfn_base, prot);
+		pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
+			__func__, task->comm, task_pid_nr(task),
+			limit << PAGE_SHIFT);
+		return -ENOMEM;
+	}
+
+	if (!rsvd && do_accounting)
+		vfio_lock_acct(mm, 1);
+
+	return 1;
+}
+
+static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
+				       unsigned long pfn, int prot,
+				       bool do_accounting)
+{
+	put_pfn(pfn, prot);
+
+	if (do_accounting)
+		vfio_lock_acct(addr_space->mm, -1);
+}
+
+static int vfio_unpin_pfn(struct vfio_addr_space *addr_space,
+			  struct vfio_pfn *vpfn, bool do_accounting)
+{
+	__vfio_unpin_page_external(addr_space, vpfn->pfn, vpfn->prot,
+			do_accounting);
+
+	if (atomic_dec_and_test(&vpfn->ref_count))
+		vfio_remove_from_pfn_list(addr_space, vpfn);
+
+	return 1;
+}
+
+static int vfio_iommu_type1_pin_pages(void *iommu_data,
+				      unsigned long *user_pfn,
+				      int npage, int prot,
+				      unsigned long *phys_pfn)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	int i, j, ret;
+	unsigned long remote_vaddr;
+	unsigned long *pfn = phys_pfn;
+	struct vfio_dma *dma;
+	bool do_accounting;
+
+	if (!iommu || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->external_domain) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	/*
+	 * If iommu capable domain exist in the container then all pages are
+	 * already pinned and accounted. Accouting should be done if there is no
+	 * iommu capable domain in the container.
+	 */
+	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+		dma_addr_t iova;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+
+		dma = vfio_find_dma(iommu, iova, 0);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_unwind;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		ret = __vfio_pin_page_external(dma, remote_vaddr, prot,
+				&pfn[i], do_accounting);
+		if (ret <= 0) {
+			WARN_ON(!ret);
+			goto pin_unwind;
+		}
+
+		mutex_lock(&dma->addr_space->pfn_list_lock);
+
+		/* search if pfn exist */
+		p = vfio_find_pfn(dma->addr_space, pfn[i]);
+		if (p) {
+			atomic_inc(&p->ref_count);
+			mutex_unlock(&dma->addr_space->pfn_list_lock);
+			continue;
+		}
+
+		ret = vfio_add_to_pfn_list(dma->addr_space, pfn[i], prot);
+		mutex_unlock(&dma->addr_space->pfn_list_lock);
+
+		if (ret) {
+			__vfio_unpin_page_external(dma->addr_space, pfn[i],
+					prot, do_accounting);
+			goto pin_unwind;
+		}
+	}
+
+	ret = i;
+	goto pin_done;
+
+pin_unwind:
+	pfn[i] = 0;
+	for (j = 0; j < i; j++) {
+		struct vfio_pfn *p;
+		dma_addr_t iova;
+
+		iova = user_pfn[j] << PAGE_SHIFT;
+
+		dma = vfio_find_dma(iommu, iova, 0);
+
+		mutex_lock(&dma->addr_space->pfn_list_lock);
+		p = vfio_find_pfn(dma->addr_space, pfn[j]);
+		if (p)
+			vfio_unpin_pfn(dma->addr_space, p, do_accounting);
+
+		mutex_unlock(&dma->addr_space->pfn_list_lock);
+		pfn[j] = 0;
+	}
+
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_iommu_type1_unpin_pages(void *iommu_data,
+					unsigned long *user_pfn,
+					unsigned long *pfn,
+					int npage)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	bool do_accounting;
+	int unlocked = 0, i;
+
+	if (!iommu || !user_pfn || !pfn)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->external_domain) {
+		mutex_unlock(&iommu->lock);
+		return -EINVAL;
+	}
+
+	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+		struct vfio_dma *dma;
+		dma_addr_t iova;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+
+		dma = vfio_find_dma(iommu, iova, 0);
+		if (!dma)
+			goto unpin_exit;
+
+		mutex_lock(&dma->addr_space->pfn_list_lock);
+		/* verify if pfn exist in pfn_list */
+		p = vfio_find_pfn(dma->addr_space, pfn[i]);
+		if (p)
+			unlocked += vfio_unpin_pfn(dma->addr_space, p,
+						   do_accounting);
+		mutex_unlock(&dma->addr_space->pfn_list_lock);
+	}
+unpin_exit:
+	mutex_unlock(&iommu->lock);
+	return unlocked;
+}
+
+static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			     bool do_accounting)
 {
 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
 	struct vfio_domain *domain, *d;
 	long unlocked = 0;
 
 	if (!dma->size)
-		return;
+		return 0;
+
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		return 0;
+
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -427,12 +737,17 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		cond_resched();
 	}
 
-	vfio_lock_acct(dma->addr_space->mm, -unlocked);
+	dma->iommu_mapped = false;
+	if (do_accounting) {
+		vfio_lock_acct(dma->addr_space->mm, -unlocked);
+		return 0;
+	}
+	return unlocked;
 }
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
-	vfio_unmap_unpin(iommu, dma);
+	vfio_unmap_unpin(iommu, dma, true);
 	vfio_unlink_dma(iommu, dma);
 
 	if (atomic_dec_and_test(&dma->addr_space->ref_count)) {
@@ -642,6 +957,8 @@  static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 		dma->size += npage << PAGE_SHIFT;
 	}
 
+	dma->iommu_mapped = true;
+
 	if (ret)
 		vfio_remove_dma(iommu, dma);
 
@@ -706,6 +1023,8 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			goto do_map_err;
 		}
 		addr_space->mm = mm;
+		addr_space->pfn_list = RB_ROOT;
+		mutex_init(&addr_space->pfn_list_lock);
 		atomic_set(&addr_space->ref_count, 1);
 		list_add(&addr_space->next, &iommu->addr_space_list);
 		free_addr_space_on_err = true;
@@ -733,7 +1052,11 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
-	ret = vfio_pin_map_dma(iommu, dma, size);
+	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		dma->size = size;
+	else
+		ret = vfio_pin_map_dma(iommu, dma, size);
 do_map_err:
 	mutex_unlock(&iommu->lock);
 	return ret;
@@ -762,10 +1085,6 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
 	n = rb_first(&iommu->dma_list);
 
-	/* If there's not a domain, there better not be any mappings */
-	if (WARN_ON(n && !d))
-		return -EINVAL;
-
 	for (; n; n = rb_next(n)) {
 		struct vfio_dma *dma;
 		dma_addr_t iova;
@@ -774,20 +1093,43 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 		iova = dma->iova;
 
 		while (iova < dma->iova + dma->size) {
-			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
+			phys_addr_t phys;
 			size_t size;
 
-			if (WARN_ON(!phys)) {
-				iova += PAGE_SIZE;
-				continue;
-			}
+			if (dma->iommu_mapped) {
+				phys = iommu_iova_to_phys(d->domain, iova);
+
+				if (WARN_ON(!phys)) {
+					iova += PAGE_SIZE;
+					continue;
+				}
 
-			size = PAGE_SIZE;
+				size = PAGE_SIZE;
 
-			while (iova + size < dma->iova + dma->size &&
-			       phys + size == iommu_iova_to_phys(d->domain,
+				while (iova + size < dma->iova + dma->size &&
+				    phys + size == iommu_iova_to_phys(d->domain,
 								 iova + size))
-				size += PAGE_SIZE;
+					size += PAGE_SIZE;
+			} else {
+				unsigned long pfn;
+				unsigned long vaddr = dma->vaddr +
+						     (iova - dma->iova);
+				size_t n = dma->iova + dma->size - iova;
+				long npage;
+
+				npage = __vfio_pin_pages_remote(dma, vaddr,
+								n >> PAGE_SHIFT,
+								dma->prot,
+								&pfn);
+				if (npage <= 0) {
+					WARN_ON(!npage);
+					ret = (int)npage;
+					return ret;
+				}
+
+				phys = pfn << PAGE_SHIFT;
+				size = npage << PAGE_SHIFT;
+			}
 
 			ret = iommu_map(domain->domain, iova, phys,
 					size, dma->prot | domain->prot);
@@ -796,6 +1138,8 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 
 			iova += size;
 		}
+
+		dma->iommu_mapped = true;
 	}
 
 	return 0;
@@ -853,7 +1197,7 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_group *group;
 	struct vfio_domain *domain, *d;
-	struct bus_type *bus = NULL;
+	struct bus_type *bus = NULL, *mdev_bus;
 	int ret;
 
 	mutex_lock(&iommu->lock);
@@ -865,6 +1209,13 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 		}
 	}
 
+	if (iommu->external_domain) {
+		if (find_iommu_group(iommu->external_domain, iommu_group)) {
+			mutex_unlock(&iommu->lock);
+			return -EINVAL;
+		}
+	}
+
 	group = kzalloc(sizeof(*group), GFP_KERNEL);
 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
 	if (!group || !domain) {
@@ -879,6 +1230,25 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+	mdev_bus = symbol_get(mdev_bus_type);
+
+	if (mdev_bus) {
+		if ((bus == mdev_bus) && !iommu_present(bus)) {
+			symbol_put(mdev_bus_type);
+			if (!iommu->external_domain) {
+				INIT_LIST_HEAD(&domain->group_list);
+				iommu->external_domain = domain;
+			} else
+				kfree(domain);
+
+			list_add(&group->next,
+				 &iommu->external_domain->group_list);
+			mutex_unlock(&iommu->lock);
+			return 0;
+		}
+		symbol_put(mdev_bus_type);
+	}
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -969,6 +1339,51 @@  static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
+{
+	struct vfio_addr_space *as;
+
+	list_for_each_entry(as, &iommu->addr_space_list, next) {
+		struct rb_node *n, *p;
+		long locked = 0, unlocked = 0;
+
+		n = rb_first(&iommu->dma_list);
+		for (; n; n = rb_next(n)) {
+			struct vfio_dma *dma;
+
+			dma = rb_entry(n, struct vfio_dma, node);
+			if (dma->addr_space == as)
+				unlocked += vfio_unmap_unpin(iommu, dma, false);
+		}
+
+		mutex_lock(&as->pfn_list_lock);
+		p = rb_first(&as->pfn_list);
+		for (; p; p = rb_next(p))
+			locked++;
+
+		mutex_unlock(&as->pfn_list_lock);
+		vfio_lock_acct(as->mm, locked - unlocked);
+	}
+}
+
+static void vfio_external_unpin_all(struct vfio_iommu *iommu,
+				    bool do_accounting)
+{
+	struct vfio_addr_space *as;
+
+	list_for_each_entry(as, &iommu->addr_space_list, next) {
+		struct rb_node *node;
+
+		mutex_lock(&as->pfn_list_lock);
+		while ((node = rb_first(&as->pfn_list)))
+			vfio_unpin_pfn(as,
+				       rb_entry(node, struct vfio_pfn, node),
+				       do_accounting);
+
+		mutex_unlock(&as->pfn_list_lock);
+	}
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -978,6 +1393,28 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
+	if (iommu->external_domain) {
+		domain = iommu->external_domain;
+		group = find_iommu_group(domain, iommu_group);
+		if (group) {
+			list_del(&group->next);
+			kfree(group);
+
+			if (list_empty(&domain->group_list)) {
+				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+					vfio_external_unpin_all(iommu, true);
+					vfio_iommu_unmap_unpin_all(iommu);
+				} else
+					vfio_external_unpin_all(iommu, false);
+				kfree(domain);
+				iommu->external_domain = NULL;
+			}
+			goto detach_group_done;
+		}
+	}
+
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		goto detach_group_done;
 
 	list_for_each_entry(domain, &iommu->domain_list, next) {
 		group = find_iommu_group(domain, iommu_group);
@@ -988,21 +1425,27 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 		list_del(&group->next);
 		kfree(group);
 		/*
-		 * Group ownership provides privilege, if the group
-		 * list is empty, the domain goes away.  If it's the
-		 * last domain, then all the mappings go away too.
+		 * Group ownership provides privilege, if the group list is
+		 * empty, the domain goes away. If it's the last domain with
+		 * iommu and external domain doesn't exist, then all the
+		 * mappings go away too. If it's the last domain with iommu and
+		 * external domain exist, update accounting
 		 */
 		if (list_empty(&domain->group_list)) {
-			if (list_is_singular(&iommu->domain_list))
-				vfio_iommu_unmap_unpin_all(iommu);
+			if (list_is_singular(&iommu->domain_list)) {
+				if (!iommu->external_domain)
+					vfio_iommu_unmap_unpin_all(iommu);
+				else
+					vfio_iommu_unmap_unpin_reaccount(iommu);
+			}
 			iommu_domain_free(domain->domain);
 			list_del(&domain->next);
 			kfree(domain);
 		}
-		goto done;
+		break;
 	}
 
-done:
+detach_group_done:
 	mutex_unlock(&iommu->lock);
 }
 
@@ -1028,29 +1471,46 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	}
 
 	INIT_LIST_HEAD(&iommu->domain_list);
+	INIT_LIST_HEAD(&iommu->addr_space_list);
 	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
 
 	return iommu;
 }
 
+static void vfio_release_domain(struct vfio_domain *domain, bool external)
+{
+	struct vfio_group *group, *group_tmp;
+
+	list_for_each_entry_safe(group, group_tmp,
+				 &domain->group_list, next) {
+		if (!external)
+			iommu_detach_group(domain->domain, group->iommu_group);
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	if (!external)
+		iommu_domain_free(domain->domain);
+}
+
 static void vfio_iommu_type1_release(void *iommu_data)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_domain *domain, *domain_tmp;
-	struct vfio_group *group, *group_tmp;
+
+	if (iommu->external_domain) {
+		vfio_release_domain(iommu->external_domain, true);
+		vfio_external_unpin_all(iommu, false);
+		kfree(iommu->external_domain);
+		iommu->external_domain = NULL;
+	}
 
 	vfio_iommu_unmap_unpin_all(iommu);
 
 	list_for_each_entry_safe(domain, domain_tmp,
 				 &iommu->domain_list, next) {
-		list_for_each_entry_safe(group, group_tmp,
-					 &domain->group_list, next) {
-			iommu_detach_group(domain->domain, group->iommu_group);
-			list_del(&group->next);
-			kfree(group);
-		}
-		iommu_domain_free(domain->domain);
+		vfio_release_domain(domain, false);
 		list_del(&domain->next);
 		kfree(domain);
 	}
@@ -1158,6 +1618,8 @@  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.ioctl		= vfio_iommu_type1_ioctl,
 	.attach_group	= vfio_iommu_type1_attach_group,
 	.detach_group	= vfio_iommu_type1_detach_group,
+	.pin_pages	= vfio_iommu_type1_pin_pages,
+	.unpin_pages	= vfio_iommu_type1_unpin_pages,
 };
 
 static int __init vfio_iommu_type1_init(void)