Message ID: 1478293856-8191-10-git-send-email-kwankhede@nvidia.com (mailing list archive)
State: New, archived
On Sat, 5 Nov 2016 02:40:43 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Add task structure to vfio_dma.
> Add address space structure. Each vfio_dma structure points to the address
> space of the task that mapped it.
> A list of address spaces is maintained in the vfio_iommu structure.
> On a DMA_MAP call, if the address space already exists in the address
> space list, vfio_dma points to it. If it doesn't exist, allocate an
> address space, save the mm pointer in it, and point vfio_dma to it.
> Two tasks can share the same address space, so the address space
> structure is kept separate from the task in the vfio_dma structure;
> vfio_dma keeps a pointer to its corresponding address space.
> During DMA_UNMAP, the task that mapped the range, or another task that
> shares the same address space, is allowed to unmap it; otherwise the
> unmap fails.
> QEMU maps a few iova ranges initially, then forks threads, and a child
> thread calls DMA_UNMAP on a previously mapped iova. Since the child
> shares the same address space, DMA_UNMAP succeeds.
> This address space structure is used to track pages pinned by an
> external user in later changes.
>
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I7600f1bea6b384fd589fa72421ccf031bcfd9ac5
> ---
>  drivers/vfio/vfio_iommu_type1.c | 182 +++++++++++++++++++++++++++++-----------
>  1 file changed, 134 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 422c8d198abb..8d64528dcc22 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -55,12 +55,20 @@ MODULE_PARM_DESC(disable_hugepages,
>
>  struct vfio_iommu {
>      struct list_head domain_list;
> +    struct list_head addr_space_list;
>      struct mutex lock;
>      struct rb_root dma_list;
>      bool v2;
>      bool nesting;
>  };
>
> +/* address space */
> +struct vfio_addr_space {
> +    struct mm_struct *mm;
> +    struct list_head next;
> +    atomic_t ref_count;
> +};
> +
>  struct vfio_domain {
>      struct iommu_domain *domain;
>      struct list_head next;
> @@ -75,6 +83,9 @@ struct vfio_dma {
>      unsigned long vaddr;    /* Process virtual addr */
>      size_t size;            /* Map size (bytes) */
>      int prot;               /* IOMMU_READ/WRITE */
> +    struct vfio_addr_space *addr_space;
> +    struct task_struct *task;
> +    bool mlock_cap;
>  };
>
>  struct vfio_group {
> @@ -130,6 +141,18 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>      rb_erase(&old->node, &iommu->dma_list);
>  }
>
> +static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
> +                                                    struct mm_struct *mm)
> +{
> +    struct vfio_addr_space *as;
> +
> +    list_for_each_entry(as, &iommu->addr_space_list, next) {
> +        if (as->mm == mm)
> +            return as;
> +    }
> +    return NULL;
> +}
> +
>  struct vwork {
>      struct mm_struct *mm;
>      long npage;
> @@ -273,24 +296,24 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
> -                                    int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> +                                    long npage, int prot,
> +                                    unsigned long *pfn_base)
>  {
> -    unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> -    bool lock_cap = capable(CAP_IPC_LOCK);
> +    struct task_struct *task = dma->task;
> +    unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +    bool lock_cap = dma->mlock_cap;
> +    struct mm_struct *mm = dma->addr_space->mm;
>      long ret, i;
>      bool rsvd;
>
> -    if (!current->mm)
> -        return -ENODEV;
> -
> -    ret = vaddr_get_pfn(current->mm, vaddr, prot, pfn_base);
> +    ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>      if (ret)
>          return ret;
>
>      rsvd = is_invalid_reserved_pfn(*pfn_base);
>
> -    if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
> +    if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>          put_pfn(*pfn_base, prot);
>          pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>                  limit << PAGE_SHIFT);
> @@ -299,7 +322,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>
>      if (unlikely(disable_hugepages)) {
>          if (!rsvd)
> -            vfio_lock_acct(current->mm, 1);
> +            vfio_lock_acct(mm, 1);
>          return 1;
>      }
>
> @@ -307,7 +330,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>      for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>          unsigned long pfn = 0;
>
> -        ret = vaddr_get_pfn(current->mm, vaddr, prot, &pfn);
> +        ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
>          if (ret)
>              break;
>
> @@ -318,7 +341,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>          }
>
>          if (!rsvd && !lock_cap &&
> -            current->mm->locked_vm + i + 1 > limit) {
> +            mm->locked_vm + i + 1 > limit) {
>              put_pfn(pfn, prot);
>              pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>                      __func__, limit << PAGE_SHIFT);
> @@ -327,13 +350,13 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>      }
>
>      if (!rsvd)
> -        vfio_lock_acct(current->mm, i);
> +        vfio_lock_acct(mm, i);
>
>      return i;
>  }
>
> -static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
> -                                      int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
> +                                      long npage, int prot, bool do_accounting)
>  {
>      unsigned long unlocked = 0;
>      long i;
> @@ -342,7 +365,7 @@ static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
>          unlocked += put_pfn(pfn++, prot);
>
>      if (do_accounting)
> -        vfio_lock_acct(current->mm, -unlocked);
> +        vfio_lock_acct(dma->addr_space->mm, -unlocked);
>
>      return unlocked;
>  }
> @@ -396,7 +419,7 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>          if (WARN_ON(!unmapped))
>              break;
>
> -        unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
> +        unlocked += __vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
>                                                unmapped >> PAGE_SHIFT,
>                                                dma->prot, false);
>          iova += unmapped;
> @@ -404,13 +427,20 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>          cond_resched();
>      }
>
> -    vfio_lock_acct(current->mm, -unlocked);
> +    vfio_lock_acct(dma->addr_space->mm, -unlocked);
>  }
>
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  {
>      vfio_unmap_unpin(iommu, dma);
>      vfio_unlink_dma(iommu, dma);
> +
> +    if (atomic_dec_and_test(&dma->addr_space->ref_count)) {
> +        mmput(dma->addr_space->mm);
> +        put_task_struct(dma->task);
> +        list_del(&dma->addr_space->next);
> +        kfree(dma->addr_space);
> +    }
>      kfree(dma);
>  }
>
> @@ -506,6 +536,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>      while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
>          if (!iommu->v2 && unmap->iova > dma->iova)
>              break;
> +        /*
> +         * Task with same address space who mapped this iova range is
> +         * allowed to unmap the iova range.
> +         */
> +        if (dma->task->mm != current->mm)
> +            break;
>          unmapped += dma->size;
>          vfio_remove_dma(iommu, dma);
>      }
> @@ -572,17 +608,58 @@ unwind:
>      return ret;
>  }
>
> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +                            size_t map_size)
> +{
> +    dma_addr_t iova = dma->iova;
> +    unsigned long vaddr = dma->vaddr;
> +    size_t size = map_size;
> +    long npage;
> +    unsigned long pfn;
> +    int ret = 0;
> +
> +    while (size) {
> +        /* Pin a contiguous chunk of memory */
> +        npage = __vfio_pin_pages_remote(dma, vaddr + dma->size,
> +                                        size >> PAGE_SHIFT, dma->prot,
> +                                        &pfn);
> +        if (npage <= 0) {
> +            WARN_ON(!npage);
> +            ret = (int)npage;
> +            break;
> +        }
> +
> +        /* Map it! */
> +        ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
> +                             dma->prot);
> +        if (ret) {
> +            __vfio_unpin_pages_remote(dma, pfn, npage, dma->prot,
> +                                      true);
> +            break;
> +        }
> +
> +        size -= npage << PAGE_SHIFT;
> +        dma->size += npage << PAGE_SHIFT;
> +    }
> +
> +    if (ret)
> +        vfio_remove_dma(iommu, dma);
> +
> +    return ret;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>                             struct vfio_iommu_type1_dma_map *map)
>  {
>      dma_addr_t iova = map->iova;
>      unsigned long vaddr = map->vaddr;
>      size_t size = map->size;
> -    long npage;
>      int ret = 0, prot = 0;
>      uint64_t mask;
>      struct vfio_dma *dma;
> -    unsigned long pfn;
> +    struct vfio_addr_space *addr_space;
> +    struct mm_struct *mm;
> +    bool free_addr_space_on_err = false;
>
>      /* Verify that none of our __u64 fields overflow */
>      if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>      mutex_lock(&iommu->lock);
>
>      if (vfio_find_dma(iommu, iova, size)) {
> -        mutex_unlock(&iommu->lock);
> -        return -EEXIST;
> +        ret = -EEXIST;
> +        goto do_map_err;
> +    }
> +
> +    mm = get_task_mm(current);
> +    if (!mm) {
> +        ret = -ENODEV;

-EFAULT?

> +        goto do_map_err;
> +    }
> +
> +    addr_space = vfio_find_addr_space(iommu, mm);
> +    if (addr_space) {
> +        atomic_inc(&addr_space->ref_count);
> +        mmput(mm);
> +    } else {
> +        addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
> +        if (!addr_space) {
> +            ret = -ENOMEM;
> +            goto do_map_err;
> +        }
> +        addr_space->mm = mm;
> +        atomic_set(&addr_space->ref_count, 1);
> +        list_add(&addr_space->next, &iommu->addr_space_list);
> +        free_addr_space_on_err = true;
>      }
>
>      dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>      if (!dma) {
> -        mutex_unlock(&iommu->lock);
> -        return -ENOMEM;
> +        if (free_addr_space_on_err) {
> +            mmput(mm);
> +            list_del(&addr_space->next);
> +            kfree(addr_space);
> +        }
> +        ret = -ENOMEM;
> +        goto do_map_err;
>      }
>
>      dma->iova = iova;
>      dma->vaddr = vaddr;
>      dma->prot = prot;
> +    dma->addr_space = addr_space;
> +    get_task_struct(current);
> +    dma->task = current;
> +    dma->mlock_cap = capable(CAP_IPC_LOCK);

How do you reason we can cache this?  Does the fact that the process
had this capability at the time that it did a DMA_MAP imply that it
necessarily still has this capability when an external user (vendor
driver) tries to pin pages?  I don't see how we can make that
assumption.

>
>      /* Insert zero-sized and grow as we map chunks of it */
>      vfio_link_dma(iommu, dma);
>
> -    while (size) {
> -        /* Pin a contiguous chunk of memory */
> -        npage = __vfio_pin_pages_remote(vaddr + dma->size,
> -                                        size >> PAGE_SHIFT, prot, &pfn);
> -        if (npage <= 0) {
> -            WARN_ON(!npage);
> -            ret = (int)npage;
> -            break;
> -        }
> -
> -        /* Map it! */
> -        ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> -        if (ret) {
> -            __vfio_unpin_pages_remote(pfn, npage, prot, true);
> -            break;
> -        }
> -
> -        size -= npage << PAGE_SHIFT;
> -        dma->size += npage << PAGE_SHIFT;
> -    }
> -
> -    if (ret)
> -        vfio_remove_dma(iommu, dma);
> -
> +    ret = vfio_pin_map_dma(iommu, dma, size);
> +do_map_err:
>      mutex_unlock(&iommu->lock);
>      return ret;
>  }
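[A minimal userspace sketch of the map-then-unmap-from-a-thread scenario the
commit message describes, using the existing type1 UAPI. This is editorial
illustration only, not from the series: it assumes a container fd that has
already been opened, bound to a group, and set to VFIO_TYPE1_IOMMU elsewhere;
the iova/size values and function names are made up, and error handling is
trimmed. Because pthreads share the mapping task's mm, the
dma->task->mm != current->mm check in vfio_dma_do_unmap() passes and the
unmap from the child thread succeeds.]

    #include <pthread.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static int container;   /* /dev/vfio/vfio fd, configured elsewhere */

    static void *unmap_thread(void *arg)
    {
        struct vfio_iommu_type1_dma_unmap unmap = {
            .argsz = sizeof(unmap),
            .iova  = 0x100000,      /* illustrative values */
            .size  = 0x100000,
        };

        /* Same mm as the mapping thread, so the unmap is permitted */
        return (void *)(long)ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
    }

    int map_then_unmap_from_thread(void *buf)
    {
        struct vfio_iommu_type1_dma_map map = {
            .argsz = sizeof(map),
            .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
            .vaddr = (unsigned long)buf,
            .iova  = 0x100000,
            .size  = 0x100000,
        };
        pthread_t t;
        void *ret;

        if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
            return -1;

        pthread_create(&t, NULL, unmap_thread, NULL);
        pthread_join(t, &ret);
        return (int)(long)ret;  /* 0: unmap allowed, same address space */
    }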
On 11/8/2016 2:33 AM, Alex Williamson wrote:
> On Sat, 5 Nov 2016 02:40:43 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>

...

>> static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>                            struct vfio_iommu_type1_dma_map *map)
>> {
>>     dma_addr_t iova = map->iova;
>>     unsigned long vaddr = map->vaddr;
>>     size_t size = map->size;
>> -   long npage;
>>     int ret = 0, prot = 0;
>>     uint64_t mask;
>>     struct vfio_dma *dma;
>> -   unsigned long pfn;
>> +   struct vfio_addr_space *addr_space;
>> +   struct mm_struct *mm;
>> +   bool free_addr_space_on_err = false;
>>
>>     /* Verify that none of our __u64 fields overflow */
>>     if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>> @@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>     mutex_lock(&iommu->lock);
>>
>>     if (vfio_find_dma(iommu, iova, size)) {
>> -       mutex_unlock(&iommu->lock);
>> -       return -EEXIST;
>> +       ret = -EEXIST;
>> +       goto do_map_err;
>> +   }
>> +
>> +   mm = get_task_mm(current);
>> +   if (!mm) {
>> +       ret = -ENODEV;
>
> -EFAULT?
>

-ENODEV return is in original code from vfio_pin_pages()
    if (!current->mm)
        return -ENODEV;

Once I thought of changing it to -EFAULT, but then again changed to
-ENODEV to be consistent with original error code.

Should I still change this return to -EFAULT?

>> +       goto do_map_err;
>> +   }
>> +
>> +   addr_space = vfio_find_addr_space(iommu, mm);
>> +   if (addr_space) {
>> +       atomic_inc(&addr_space->ref_count);
>> +       mmput(mm);
>> +   } else {
>> +       addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
>> +       if (!addr_space) {
>> +           ret = -ENOMEM;
>> +           goto do_map_err;
>> +       }
>> +       addr_space->mm = mm;
>> +       atomic_set(&addr_space->ref_count, 1);
>> +       list_add(&addr_space->next, &iommu->addr_space_list);
>> +       free_addr_space_on_err = true;
>>     }
>>
>>     dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>>     if (!dma) {
>> -       mutex_unlock(&iommu->lock);
>> -       return -ENOMEM;
>> +       if (free_addr_space_on_err) {
>> +           mmput(mm);
>> +           list_del(&addr_space->next);
>> +           kfree(addr_space);
>> +       }
>> +       ret = -ENOMEM;
>> +       goto do_map_err;
>>     }
>>
>>     dma->iova = iova;
>>     dma->vaddr = vaddr;
>>     dma->prot = prot;
>> +   dma->addr_space = addr_space;
>> +   get_task_struct(current);
>> +   dma->task = current;
>> +   dma->mlock_cap = capable(CAP_IPC_LOCK);
>
>
> How do you reason we can cache this?  Does the fact that the process
> had this capability at the time that it did a DMA_MAP imply that it
> necessarily still has this capability when an external user (vendor
> driver) tries to pin pages?  I don't see how we can make that
> assumption.
>
>

Will process change MEMLOCK limit at runtime? I think it shouldn't,
correct me if I'm wrong. QEMU doesn't do that, right?

The function capable() determines current task's capability. But when
vfio_pin_pages() is called, it could come from other task but pages are
pinned from address space of task who mapped it. So we can't use
capable() in vfio_pin_pages()

If this capability shouldn't be cached, we have to use has_capability()
with dma->task as argument in vfio_pin_pages()

bool has_capability(struct task_struct *t, int cap)

Thanks,
Kirti
On Tue, 8 Nov 2016 19:43:25 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/8/2016 2:33 AM, Alex Williamson wrote:
> > On Sat, 5 Nov 2016 02:40:43 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >
>
> ...
>
> >> static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>                            struct vfio_iommu_type1_dma_map *map)
> >> {
> >>     dma_addr_t iova = map->iova;
> >>     unsigned long vaddr = map->vaddr;
> >>     size_t size = map->size;
> >> -   long npage;
> >>     int ret = 0, prot = 0;
> >>     uint64_t mask;
> >>     struct vfio_dma *dma;
> >> -   unsigned long pfn;
> >> +   struct vfio_addr_space *addr_space;
> >> +   struct mm_struct *mm;
> >> +   bool free_addr_space_on_err = false;
> >>
> >>     /* Verify that none of our __u64 fields overflow */
> >>     if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> >> @@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>     mutex_lock(&iommu->lock);
> >>
> >>     if (vfio_find_dma(iommu, iova, size)) {
> >> -       mutex_unlock(&iommu->lock);
> >> -       return -EEXIST;
> >> +       ret = -EEXIST;
> >> +       goto do_map_err;
> >> +   }
> >> +
> >> +   mm = get_task_mm(current);
> >> +   if (!mm) {
> >> +       ret = -ENODEV;
> >
> > -EFAULT?
> >
>
> -ENODEV return is in original code from vfio_pin_pages()
>     if (!current->mm)
>         return -ENODEV;
>
> Once I thought of changing it to -EFAULT, but then again changed to
> -ENODEV to be consistent with original error code.
>
> Should I still change this return to -EFAULT?

Let's keep ENODEV for less code churn, I guess.

> >> +       goto do_map_err;
> >> +   }
> >> +
> >> +   addr_space = vfio_find_addr_space(iommu, mm);
> >> +   if (addr_space) {
> >> +       atomic_inc(&addr_space->ref_count);
> >> +       mmput(mm);
> >> +   } else {
> >> +       addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
> >> +       if (!addr_space) {
> >> +           ret = -ENOMEM;
> >> +           goto do_map_err;
> >> +       }
> >> +       addr_space->mm = mm;
> >> +       atomic_set(&addr_space->ref_count, 1);
> >> +       list_add(&addr_space->next, &iommu->addr_space_list);
> >> +       free_addr_space_on_err = true;
> >>     }
> >>
> >>     dma = kzalloc(sizeof(*dma), GFP_KERNEL);
> >>     if (!dma) {
> >> -       mutex_unlock(&iommu->lock);
> >> -       return -ENOMEM;
> >> +       if (free_addr_space_on_err) {
> >> +           mmput(mm);
> >> +           list_del(&addr_space->next);
> >> +           kfree(addr_space);
> >> +       }
> >> +       ret = -ENOMEM;
> >> +       goto do_map_err;
> >>     }
> >>
> >>     dma->iova = iova;
> >>     dma->vaddr = vaddr;
> >>     dma->prot = prot;
> >> +   dma->addr_space = addr_space;
> >> +   get_task_struct(current);
> >> +   dma->task = current;
> >> +   dma->mlock_cap = capable(CAP_IPC_LOCK);
> >
> >
> > How do you reason we can cache this?  Does the fact that the process
> > had this capability at the time that it did a DMA_MAP imply that it
> > necessarily still has this capability when an external user (vendor
> > driver) tries to pin pages?  I don't see how we can make that
> > assumption.
> >
> >
>
> Will process change MEMLOCK limit at runtime? I think it shouldn't,
> correct me if I'm wrong. QEMU doesn't do that, right?

What QEMU does or doesn't do isn't relevant, the question is could a
process change CAP_IPC_LOCK runtime.  It seems plausible to me.

> The function capable() determines current task's capability. But when
> vfio_pin_pages() is called, it could come from other task but pages are
> pinned from address space of task who mapped it. So we can't use
> capable() in vfio_pin_pages()
>
> If this capability shouldn't be cached, we have to use has_capability()
> with dma->task as argument in vfio_pin_pages()
>
> bool has_capability(struct task_struct *t, int cap)

Yep, that sounds better.  Thanks,

Alex
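[The change the thread converges on would look roughly like the sketch below.
This is an editorial illustration, not a hunk from the posted series: drop
the cached mlock_cap and re-evaluate the capability against the mapping task
each time pages are pinned, using the kernel's existing has_capability()
helper, so a capability dropped after DMA_MAP is honored.]

     static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
                                         long npage, int prot,
                                         unsigned long *pfn_base)
     {
         struct task_struct *task = dma->task;
         unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
    -    bool lock_cap = dma->mlock_cap;
    +    /* re-check at pin time instead of caching at DMA_MAP time */
    +    bool lock_cap = has_capability(task, CAP_IPC_LOCK);
         struct mm_struct *mm = dma->addr_space->mm;

[With this, the mlock_cap field of struct vfio_dma and its initialization in
vfio_dma_do_map() can be removed, at the cost of one capability check per
pin call.]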
* Kirti Wankhede <kwankhede@nvidia.com> [2016-11-05 02:40:43 +0530]:

Hi Kirti,

[...]

> static int vfio_dma_do_map(struct vfio_iommu *iommu,
>                            struct vfio_iommu_type1_dma_map *map)
> {
>     dma_addr_t iova = map->iova;
>     unsigned long vaddr = map->vaddr;
>     size_t size = map->size;
> -   long npage;
>     int ret = 0, prot = 0;
>     uint64_t mask;
>     struct vfio_dma *dma;
> -   unsigned long pfn;
> +   struct vfio_addr_space *addr_space;
> +   struct mm_struct *mm;
> +   bool free_addr_space_on_err = false;
>
>     /* Verify that none of our __u64 fields overflow */
>     if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>     mutex_lock(&iommu->lock);
>
>     if (vfio_find_dma(iommu, iova, size)) {
> -       mutex_unlock(&iommu->lock);
> -       return -EEXIST;
> +       ret = -EEXIST;
> +       goto do_map_err;
> +   }
> +
> +   mm = get_task_mm(current);
> +   if (!mm) {
> +       ret = -ENODEV;
> +       goto do_map_err;
> +   }
> +
> +   addr_space = vfio_find_addr_space(iommu, mm);
> +   if (addr_space) {
> +       atomic_inc(&addr_space->ref_count);
> +       mmput(mm);
> +   } else {
> +       addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
> +       if (!addr_space) {
> +           ret = -ENOMEM;

Don't we need to call mmput(mm) here before the goto?

> +           goto do_map_err;
> +       }
> +       addr_space->mm = mm;
> +       atomic_set(&addr_space->ref_count, 1);
> +       list_add(&addr_space->next, &iommu->addr_space_list);
> +       free_addr_space_on_err = true;
>     }
>
>     dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>     if (!dma) {
> -       mutex_unlock(&iommu->lock);
> -       return -ENOMEM;
> +       if (free_addr_space_on_err) {
> +           mmput(mm);
> +           list_del(&addr_space->next);
> +           kfree(addr_space);
> +       }
> +       ret = -ENOMEM;
> +       goto do_map_err;
>     }
>
>     dma->iova = iova;
>     dma->vaddr = vaddr;
>     dma->prot = prot;
> +   dma->addr_space = addr_space;
> +   get_task_struct(current);
> +   dma->task = current;
> +   dma->mlock_cap = capable(CAP_IPC_LOCK);
>
>     /* Insert zero-sized and grow as we map chunks of it */
>     vfio_link_dma(iommu, dma);
>
> -   while (size) {
> -       /* Pin a contiguous chunk of memory */
> -       npage = __vfio_pin_pages_remote(vaddr + dma->size,
> -                                       size >> PAGE_SHIFT, prot, &pfn);
> -       if (npage <= 0) {
> -           WARN_ON(!npage);
> -           ret = (int)npage;
> -           break;
> -       }
> -
> -       /* Map it! */
> -       ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> -       if (ret) {
> -           __vfio_unpin_pages_remote(pfn, npage, prot, true);
> -           break;
> -       }
> -
> -       size -= npage << PAGE_SHIFT;
> -       dma->size += npage << PAGE_SHIFT;
> -   }
> -
> -   if (ret)
> -       vfio_remove_dma(iommu, dma);
> -
> +   ret = vfio_pin_map_dma(iommu, dma, size);
> +do_map_err:
>     mutex_unlock(&iommu->lock);
>     return ret;
> }
> --
> 2.7.0
>
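[The fix the comment above implies would be one line in the allocation
failure path — a sketch, with the surrounding lines taken from the patch:]

            addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
            if (!addr_space) {
                ret = -ENOMEM;
    +           mmput(mm);      /* drop the get_task_mm() reference */
                goto do_map_err;
            }

[Without it, the mm reference taken by get_task_mm(current) is leaked
whenever the vfio_addr_space allocation fails, since do_map_err only
unlocks the mutex.]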
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 422c8d198abb..8d64528dcc22 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -55,12 +55,20 @@ MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
     struct list_head domain_list;
+    struct list_head addr_space_list;
     struct mutex lock;
     struct rb_root dma_list;
     bool v2;
     bool nesting;
 };
 
+/* address space */
+struct vfio_addr_space {
+    struct mm_struct *mm;
+    struct list_head next;
+    atomic_t ref_count;
+};
+
 struct vfio_domain {
     struct iommu_domain *domain;
     struct list_head next;
@@ -75,6 +83,9 @@ struct vfio_dma {
     unsigned long vaddr;    /* Process virtual addr */
     size_t size;            /* Map size (bytes) */
     int prot;               /* IOMMU_READ/WRITE */
+    struct vfio_addr_space *addr_space;
+    struct task_struct *task;
+    bool mlock_cap;
 };
 
 struct vfio_group {
@@ -130,6 +141,18 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
     rb_erase(&old->node, &iommu->dma_list);
 }
 
+static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
+                                                    struct mm_struct *mm)
+{
+    struct vfio_addr_space *as;
+
+    list_for_each_entry(as, &iommu->addr_space_list, next) {
+        if (as->mm == mm)
+            return as;
+    }
+    return NULL;
+}
+
 struct vwork {
     struct mm_struct *mm;
     long npage;
@@ -273,24 +296,24 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
-                                    int prot, unsigned long *pfn_base)
+static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
+                                    long npage, int prot,
+                                    unsigned long *pfn_base)
 {
-    unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-    bool lock_cap = capable(CAP_IPC_LOCK);
+    struct task_struct *task = dma->task;
+    unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+    bool lock_cap = dma->mlock_cap;
+    struct mm_struct *mm = dma->addr_space->mm;
     long ret, i;
     bool rsvd;
 
-    if (!current->mm)
-        return -ENODEV;
-
-    ret = vaddr_get_pfn(current->mm, vaddr, prot, pfn_base);
+    ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
     if (ret)
         return ret;
 
     rsvd = is_invalid_reserved_pfn(*pfn_base);
 
-    if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
+    if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
         put_pfn(*pfn_base, prot);
         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
                 limit << PAGE_SHIFT);
@@ -299,7 +322,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
 
     if (unlikely(disable_hugepages)) {
         if (!rsvd)
-            vfio_lock_acct(current->mm, 1);
+            vfio_lock_acct(mm, 1);
         return 1;
     }
 
@@ -307,7 +330,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
     for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
         unsigned long pfn = 0;
 
-        ret = vaddr_get_pfn(current->mm, vaddr, prot, &pfn);
+        ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
         if (ret)
             break;
 
@@ -318,7 +341,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
         }
 
         if (!rsvd && !lock_cap &&
-            current->mm->locked_vm + i + 1 > limit) {
+            mm->locked_vm + i + 1 > limit) {
             put_pfn(pfn, prot);
             pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
                     __func__, limit << PAGE_SHIFT);
@@ -327,13 +350,13 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
     }
 
     if (!rsvd)
-        vfio_lock_acct(current->mm, i);
+        vfio_lock_acct(mm, i);
 
     return i;
 }
 
-static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
-                                      int prot, bool do_accounting)
+static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
+                                      long npage, int prot, bool do_accounting)
 {
     unsigned long unlocked = 0;
     long i;
@@ -342,7 +365,7 @@ static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
         unlocked += put_pfn(pfn++, prot);
 
     if (do_accounting)
-        vfio_lock_acct(current->mm, -unlocked);
+        vfio_lock_acct(dma->addr_space->mm, -unlocked);
 
     return unlocked;
 }
@@ -396,7 +419,7 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
         if (WARN_ON(!unmapped))
             break;
 
-        unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
+        unlocked += __vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
                                               unmapped >> PAGE_SHIFT,
                                               dma->prot, false);
         iova += unmapped;
@@ -404,13 +427,20 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
         cond_resched();
     }
 
-    vfio_lock_acct(current->mm, -unlocked);
+    vfio_lock_acct(dma->addr_space->mm, -unlocked);
 }
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
     vfio_unmap_unpin(iommu, dma);
     vfio_unlink_dma(iommu, dma);
+
+    if (atomic_dec_and_test(&dma->addr_space->ref_count)) {
+        mmput(dma->addr_space->mm);
+        put_task_struct(dma->task);
+        list_del(&dma->addr_space->next);
+        kfree(dma->addr_space);
+    }
     kfree(dma);
 }
 
@@ -506,6 +536,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
     while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
         if (!iommu->v2 && unmap->iova > dma->iova)
             break;
+        /*
+         * Task with same address space who mapped this iova range is
+         * allowed to unmap the iova range.
+         */
+        if (dma->task->mm != current->mm)
+            break;
         unmapped += dma->size;
         vfio_remove_dma(iommu, dma);
     }
@@ -572,17 +608,58 @@ unwind:
     return ret;
 }
 
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+                            size_t map_size)
+{
+    dma_addr_t iova = dma->iova;
+    unsigned long vaddr = dma->vaddr;
+    size_t size = map_size;
+    long npage;
+    unsigned long pfn;
+    int ret = 0;
+
+    while (size) {
+        /* Pin a contiguous chunk of memory */
+        npage = __vfio_pin_pages_remote(dma, vaddr + dma->size,
+                                        size >> PAGE_SHIFT, dma->prot,
+                                        &pfn);
+        if (npage <= 0) {
+            WARN_ON(!npage);
+            ret = (int)npage;
+            break;
+        }
+
+        /* Map it! */
+        ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
+                             dma->prot);
+        if (ret) {
+            __vfio_unpin_pages_remote(dma, pfn, npage, dma->prot,
+                                      true);
+            break;
+        }
+
+        size -= npage << PAGE_SHIFT;
+        dma->size += npage << PAGE_SHIFT;
+    }
+
+    if (ret)
+        vfio_remove_dma(iommu, dma);
+
+    return ret;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
                            struct vfio_iommu_type1_dma_map *map)
 {
     dma_addr_t iova = map->iova;
     unsigned long vaddr = map->vaddr;
     size_t size = map->size;
-    long npage;
     int ret = 0, prot = 0;
     uint64_t mask;
     struct vfio_dma *dma;
-    unsigned long pfn;
+    struct vfio_addr_space *addr_space;
+    struct mm_struct *mm;
+    bool free_addr_space_on_err = false;
 
     /* Verify that none of our __u64 fields overflow */
     if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
     mutex_lock(&iommu->lock);
 
     if (vfio_find_dma(iommu, iova, size)) {
-        mutex_unlock(&iommu->lock);
-        return -EEXIST;
+        ret = -EEXIST;
+        goto do_map_err;
+    }
+
+    mm = get_task_mm(current);
+    if (!mm) {
+        ret = -ENODEV;
+        goto do_map_err;
+    }
+
+    addr_space = vfio_find_addr_space(iommu, mm);
+    if (addr_space) {
+        atomic_inc(&addr_space->ref_count);
+        mmput(mm);
+    } else {
+        addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
+        if (!addr_space) {
+            ret = -ENOMEM;
+            goto do_map_err;
+        }
+        addr_space->mm = mm;
+        atomic_set(&addr_space->ref_count, 1);
+        list_add(&addr_space->next, &iommu->addr_space_list);
+        free_addr_space_on_err = true;
     }
 
     dma = kzalloc(sizeof(*dma), GFP_KERNEL);
     if (!dma) {
-        mutex_unlock(&iommu->lock);
-        return -ENOMEM;
+        if (free_addr_space_on_err) {
+            mmput(mm);
+            list_del(&addr_space->next);
+            kfree(addr_space);
+        }
+        ret = -ENOMEM;
+        goto do_map_err;
     }
 
     dma->iova = iova;
     dma->vaddr = vaddr;
     dma->prot = prot;
+    dma->addr_space = addr_space;
+    get_task_struct(current);
+    dma->task = current;
+    dma->mlock_cap = capable(CAP_IPC_LOCK);
 
     /* Insert zero-sized and grow as we map chunks of it */
     vfio_link_dma(iommu, dma);
 
-    while (size) {
-        /* Pin a contiguous chunk of memory */
-        npage = __vfio_pin_pages_remote(vaddr + dma->size,
-                                        size >> PAGE_SHIFT, prot, &pfn);
-        if (npage <= 0) {
-            WARN_ON(!npage);
-            ret = (int)npage;
-            break;
-        }
-
-        /* Map it! */
-        ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
-        if (ret) {
-            __vfio_unpin_pages_remote(pfn, npage, prot, true);
-            break;
-        }
-
-        size -= npage << PAGE_SHIFT;
-        dma->size += npage << PAGE_SHIFT;
-    }
-
-    if (ret)
-        vfio_remove_dma(iommu, dma);
-
+    ret = vfio_pin_map_dma(iommu, dma, size);
+do_map_err:
     mutex_unlock(&iommu->lock);
     return ret;
 }