Message ID | 20170406145250.16956.95264.stgit@gimli.home (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Alex, On 06/04/2017 16:53, Alex Williamson wrote: > If the mmap_sem is contented then the vfio type1 IOMMU backend will > defer locked page accounting updates to a workqueue task. This has > a few problems and depending on which side the user tries to play, > they might be over-penalized for unmaps that haven't yet been > accounted, or able to race the workqueue to enter more mappings > than they're allowed. It's not entirely clear what motivated this > workqueue mechanism in the original vfio design, but it seems to > introduce more problems than it solves, so remove it and update the > callers to allow for failure. We can also now recheck the limit > under write lock to make sure we don't exceed it. > > Cc: stable@vger.kernel.org > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> Looks good to me. Reviewed-by: Eric Auger <eric.auger@redhat.com> Thanks Eric > --- > > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric > > drivers/vfio/vfio_iommu_type1.c | 101 ++++++++++++++++++--------------------- > 1 file changed, 46 insertions(+), 55 deletions(-) > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index 32d2633092a3..b799edbb8c4f 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) > return ret; > } > > -struct vwork { > - struct mm_struct *mm; > - long npage; > - struct work_struct work; > -}; > - > -/* delayed decrement/increment for locked_vm */ > -static void vfio_lock_acct_bg(struct work_struct *work) > +static int vfio_lock_acct(struct task_struct *task, long npage) > { > - struct vwork *vwork = container_of(work, struct vwork, work); > - struct mm_struct *mm; > - > - mm = vwork->mm; > - down_write(&mm->mmap_sem); > - mm->locked_vm += vwork->npage; > - up_write(&mm->mmap_sem); > - mmput(mm); > - kfree(vwork); > -} > - > -static void vfio_lock_acct(struct task_struct *task, long npage) > -{ > - struct vwork *vwork; > struct mm_struct *mm; > bool is_current; > + int ret; > > if (!npage) > - return; > + return 0; > > is_current = (task->mm == current->mm); > > mm = is_current ? task->mm : get_task_mm(task); > if (!mm) > - return; /* process exited */ > + return -ESRCH; /* process exited */ > > - if (down_write_trylock(&mm->mmap_sem)) { > - mm->locked_vm += npage; > - up_write(&mm->mmap_sem); > - if (!is_current) > - mmput(mm); > - return; > - } > + ret = down_write_killable(&mm->mmap_sem); > + if (!ret) { > + if (npage < 0) { > + mm->locked_vm += npage; > + } else { > + unsigned long limit; > + > + limit = is_current ? > + rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT : > + task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; > + > + if (mm->locked_vm + npage <= limit) > + mm->locked_vm += npage; > + else > + ret = -ENOMEM; > + } > > - if (is_current) { > - mm = get_task_mm(task); > - if (!mm) > - return; > + up_write(&mm->mmap_sem); > } > > - /* > - * Couldn't get mmap_sem lock, so must setup to update > - * mm->locked_vm later. If locked_vm were atomic, we > - * wouldn't need this silliness > - */ > - vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); > - if (WARN_ON(!vwork)) { > + if (!is_current) > mmput(mm); > - return; > - } > - INIT_WORK(&vwork->work, vfio_lock_acct_bg); > - vwork->mm = mm; > - vwork->npage = npage; > - schedule_work(&vwork->work); > + > + return ret; > } > > /* > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > long npage, unsigned long *pfn_base) > { > - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > + unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > bool lock_cap = capable(CAP_IPC_LOCK); > long ret, pinned = 0, lock_acct = 0; > bool rsvd; > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > /* Lock all the consecutive pages from pfn_base */ > for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; > pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { > - unsigned long pfn = 0; > - > ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); > if (ret) > break; > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > put_pfn(pfn, dma->prot); > pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", > __func__, limit << PAGE_SHIFT); > - break; > + ret = -ENOMEM; > + goto unpin_out; > } > lock_acct++; > } > } > > out: > - vfio_lock_acct(current, lock_acct); > + ret = vfio_lock_acct(current, lock_acct); > + > +unpin_out: > + if (ret) { > + if (!rsvd) { > + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) > + put_pfn(pfn, dma->prot); > + } > + > + return ret; > + } > > return pinned; > } > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, > goto pin_page_exit; > } > > - if (!rsvd && do_accounting) > - vfio_lock_acct(dma->task, 1); > + if (!rsvd && do_accounting) { > + ret = vfio_lock_acct(dma->task, 1); > + if (ret) { > + put_pfn(*pfn_base, dma->prot); > + goto pin_page_exit; > + } > + } > + > ret = 1; > > pin_page_exit: >
On Thu, 6 Apr 2017 18:53:04 +0200 Auger Eric <eric.auger@redhat.com> wrote: > Hi Alex, > > On 06/04/2017 16:53, Alex Williamson wrote: > > If the mmap_sem is contented then the vfio type1 IOMMU backend will > > defer locked page accounting updates to a workqueue task. This has > > a few problems and depending on which side the user tries to play, > > they might be over-penalized for unmaps that haven't yet been > > accounted, or able to race the workqueue to enter more mappings > > than they're allowed. It's not entirely clear what motivated this > > workqueue mechanism in the original vfio design, but it seems to > > introduce more problems than it solves, so remove it and update the > > callers to allow for failure. We can also now recheck the limit > > under write lock to make sure we don't exceed it. > > > > Cc: stable@vger.kernel.org > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > Looks good to me. > > Reviewed-by: Eric Auger <eric.auger@redhat.com> > One possible enhancement below. > > > --- > > > > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric > > > > drivers/vfio/vfio_iommu_type1.c | 101 ++++++++++++++++++--------------------- > > 1 file changed, 46 insertions(+), 55 deletions(-) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > > index 32d2633092a3..b799edbb8c4f 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) > > return ret; > > } > > > > -struct vwork { > > - struct mm_struct *mm; > > - long npage; > > - struct work_struct work; > > -}; > > - > > -/* delayed decrement/increment for locked_vm */ > > -static void vfio_lock_acct_bg(struct work_struct *work) > > +static int vfio_lock_acct(struct task_struct *task, long npage) > > { > > - struct vwork *vwork = container_of(work, struct vwork, work); > > - struct mm_struct *mm; > > - > > - mm = vwork->mm; > > - down_write(&mm->mmap_sem); > > - mm->locked_vm += vwork->npage; > > - up_write(&mm->mmap_sem); > > - mmput(mm); > > - kfree(vwork); > > -} > > - > > -static void vfio_lock_acct(struct task_struct *task, long npage) > > -{ > > - struct vwork *vwork; > > struct mm_struct *mm; > > bool is_current; > > + int ret; > > > > if (!npage) > > - return; > > + return 0; > > > > is_current = (task->mm == current->mm); > > > > mm = is_current ? task->mm : get_task_mm(task); > > if (!mm) > > - return; /* process exited */ > > + return -ESRCH; /* process exited */ > > > > - if (down_write_trylock(&mm->mmap_sem)) { > > - mm->locked_vm += npage; > > - up_write(&mm->mmap_sem); > > - if (!is_current) > > - mmput(mm); > > - return; > > - } > > + ret = down_write_killable(&mm->mmap_sem); Hmm, since we're already testing current, I wonder if it makes sense to have this bimodal, killable if current, straight down_write() otherwise. I'm not too sure how important it is to use killable regardless, but mlock does, which seemed like a good model to follow. Thanks, Alex > > + if (!ret) { > > + if (npage < 0) { > > + mm->locked_vm += npage; > > + } else { > > + unsigned long limit; > > + > > + limit = is_current ? > > + rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT : > > + task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > + > > + if (mm->locked_vm + npage <= limit) > > + mm->locked_vm += npage; > > + else > > + ret = -ENOMEM; > > + } > > > > - if (is_current) { > > - mm = get_task_mm(task); > > - if (!mm) > > - return; > > + up_write(&mm->mmap_sem); > > } > > > > - /* > > - * Couldn't get mmap_sem lock, so must setup to update > > - * mm->locked_vm later. If locked_vm were atomic, we > > - * wouldn't need this silliness > > - */ > > - vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); > > - if (WARN_ON(!vwork)) { > > + if (!is_current) > > mmput(mm); > > - return; > > - } > > - INIT_WORK(&vwork->work, vfio_lock_acct_bg); > > - vwork->mm = mm; > > - vwork->npage = npage; > > - schedule_work(&vwork->work); > > + > > + return ret; > > } > > > > /* > > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > > static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > long npage, unsigned long *pfn_base) > > { > > - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > + unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > bool lock_cap = capable(CAP_IPC_LOCK); > > long ret, pinned = 0, lock_acct = 0; > > bool rsvd; > > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > /* Lock all the consecutive pages from pfn_base */ > > for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; > > pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { > > - unsigned long pfn = 0; > > - > > ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); > > if (ret) > > break; > > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > put_pfn(pfn, dma->prot); > > pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", > > __func__, limit << PAGE_SHIFT); > > - break; > > + ret = -ENOMEM; > > + goto unpin_out; > > } > > lock_acct++; > > } > > } > > > > out: > > - vfio_lock_acct(current, lock_acct); > > + ret = vfio_lock_acct(current, lock_acct); > > + > > +unpin_out: > > + if (ret) { > > + if (!rsvd) { > > + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) > > + put_pfn(pfn, dma->prot); > > + } > > + > > + return ret; > > + } > > > > return pinned; > > } > > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, > > goto pin_page_exit; > > } > > > > - if (!rsvd && do_accounting) > > - vfio_lock_acct(dma->task, 1); > > + if (!rsvd && do_accounting) { > > + ret = vfio_lock_acct(dma->task, 1); > > + if (ret) { > > + put_pfn(*pfn_base, dma->prot); > > + goto pin_page_exit; > > + } > > + } > > + > > ret = 1; > > > > pin_page_exit: > >
On Thu, 6 Apr 2017 11:05:31 -0600 Alex Williamson <alex.williamson@redhat.com> wrote: > On Thu, 6 Apr 2017 18:53:04 +0200 > Auger Eric <eric.auger@redhat.com> wrote: > > > Hi Alex, > > > > On 06/04/2017 16:53, Alex Williamson wrote: > > > If the mmap_sem is contented then the vfio type1 IOMMU backend will > > > defer locked page accounting updates to a workqueue task. This has > > > a few problems and depending on which side the user tries to play, > > > they might be over-penalized for unmaps that haven't yet been > > > accounted, or able to race the workqueue to enter more mappings > > > than they're allowed. It's not entirely clear what motivated this > > > workqueue mechanism in the original vfio design, but it seems to > > > introduce more problems than it solves, so remove it and update the > > > callers to allow for failure. We can also now recheck the limit > > > under write lock to make sure we don't exceed it. > > > > > > Cc: stable@vger.kernel.org > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > > Looks good to me. > > > > Reviewed-by: Eric Auger <eric.auger@redhat.com> > > > > One possible enhancement below. > > > > > > --- > > > > > > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric > > > > > > drivers/vfio/vfio_iommu_type1.c | 101 ++++++++++++++++++--------------------- > > > 1 file changed, 46 insertions(+), 55 deletions(-) > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > > > index 32d2633092a3..b799edbb8c4f 100644 > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) > > > return ret; > > > } > > > > > > -struct vwork { > > > - struct mm_struct *mm; > > > - long npage; > > > - struct work_struct work; > > > -}; > > > - > > > -/* delayed decrement/increment for locked_vm */ > > > -static void vfio_lock_acct_bg(struct work_struct *work) > > > +static int vfio_lock_acct(struct task_struct *task, long npage) > > > { > > > - struct vwork *vwork = container_of(work, struct vwork, work); > > > - struct mm_struct *mm; > > > - > > > - mm = vwork->mm; > > > - down_write(&mm->mmap_sem); > > > - mm->locked_vm += vwork->npage; > > > - up_write(&mm->mmap_sem); > > > - mmput(mm); > > > - kfree(vwork); > > > -} > > > - > > > -static void vfio_lock_acct(struct task_struct *task, long npage) > > > -{ > > > - struct vwork *vwork; > > > struct mm_struct *mm; > > > bool is_current; > > > + int ret; > > > > > > if (!npage) > > > - return; > > > + return 0; > > > > > > is_current = (task->mm == current->mm); > > > > > > mm = is_current ? task->mm : get_task_mm(task); > > > if (!mm) > > > - return; /* process exited */ > > > + return -ESRCH; /* process exited */ > > > > > > - if (down_write_trylock(&mm->mmap_sem)) { > > > - mm->locked_vm += npage; > > > - up_write(&mm->mmap_sem); > > > - if (!is_current) > > > - mmput(mm); > > > - return; > > > - } > > > + ret = down_write_killable(&mm->mmap_sem); > > > Hmm, since we're already testing current, I wonder if it makes sense to > have this bimodal, killable if current, straight down_write() > otherwise. I'm not too sure how important it is to use killable > regardless, but mlock does, which seemed like a good model to follow. I see other examples in the kernel where killable is used even for remote mm, which would match the external page pinning usage, so I think I'll keep this as is unless there are other comments for v2. Thanks for the review. Alex > > > + if (!ret) { > > > + if (npage < 0) { > > > + mm->locked_vm += npage; > > > + } else { > > > + unsigned long limit; > > > + > > > + limit = is_current ? > > > + rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT : > > > + task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > > + > > > + if (mm->locked_vm + npage <= limit) > > > + mm->locked_vm += npage; > > > + else > > > + ret = -ENOMEM; > > > + } > > > > > > - if (is_current) { > > > - mm = get_task_mm(task); > > > - if (!mm) > > > - return; > > > + up_write(&mm->mmap_sem); > > > } > > > > > > - /* > > > - * Couldn't get mmap_sem lock, so must setup to update > > > - * mm->locked_vm later. If locked_vm were atomic, we > > > - * wouldn't need this silliness > > > - */ > > > - vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); > > > - if (WARN_ON(!vwork)) { > > > + if (!is_current) > > > mmput(mm); > > > - return; > > > - } > > > - INIT_WORK(&vwork->work, vfio_lock_acct_bg); > > > - vwork->mm = mm; > > > - vwork->npage = npage; > > > - schedule_work(&vwork->work); > > > + > > > + return ret; > > > } > > > > > > /* > > > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > > > static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > > long npage, unsigned long *pfn_base) > > > { > > > - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > > + unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > > bool lock_cap = capable(CAP_IPC_LOCK); > > > long ret, pinned = 0, lock_acct = 0; > > > bool rsvd; > > > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > > /* Lock all the consecutive pages from pfn_base */ > > > for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; > > > pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { > > > - unsigned long pfn = 0; > > > - > > > ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); > > > if (ret) > > > break; > > > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > > put_pfn(pfn, dma->prot); > > > pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", > > > __func__, limit << PAGE_SHIFT); > > > - break; > > > + ret = -ENOMEM; > > > + goto unpin_out; > > > } > > > lock_acct++; > > > } > > > } > > > > > > out: > > > - vfio_lock_acct(current, lock_acct); > > > + ret = vfio_lock_acct(current, lock_acct); > > > + > > > +unpin_out: > > > + if (ret) { > > > + if (!rsvd) { > > > + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) > > > + put_pfn(pfn, dma->prot); > > > + } > > > + > > > + return ret; > > > + } > > > > > > return pinned; > > > } > > > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, > > > goto pin_page_exit; > > > } > > > > > > - if (!rsvd && do_accounting) > > > - vfio_lock_acct(dma->task, 1); > > > + if (!rsvd && do_accounting) { > > > + ret = vfio_lock_acct(dma->task, 1); > > > + if (ret) { > > > + put_pfn(*pfn_base, dma->prot); > > > + goto pin_page_exit; > > > + } > > > + } > > > + > > > ret = 1; > > > > > > pin_page_exit: > > > >
On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote: > If the mmap_sem is contented then the vfio type1 IOMMU backend will > defer locked page accounting updates to a workqueue task. This has > a few problems and depending on which side the user tries to play, > they might be over-penalized for unmaps that haven't yet been > accounted, or able to race the workqueue to enter more mappings > than they're allowed. It's not entirely clear what motivated this > workqueue mechanism in the original vfio design, but it seems to > introduce more problems than it solves, so remove it and update the > callers to allow for failure. We can also now recheck the limit > under write lock to make sure we don't exceed it. > > Cc: stable@vger.kernel.org > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > --- > > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric > > drivers/vfio/vfio_iommu_type1.c | 101 ++++++++++++++++++--------------------- > 1 file changed, 46 insertions(+), 55 deletions(-) > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index 32d2633092a3..b799edbb8c4f 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) > return ret; > } > > -struct vwork { > - struct mm_struct *mm; > - long npage; > - struct work_struct work; > -}; > - > -/* delayed decrement/increment for locked_vm */ > -static void vfio_lock_acct_bg(struct work_struct *work) > +static int vfio_lock_acct(struct task_struct *task, long npage) > { > - struct vwork *vwork = container_of(work, struct vwork, work); > - struct mm_struct *mm; > - > - mm = vwork->mm; > - down_write(&mm->mmap_sem); > - mm->locked_vm += vwork->npage; > - up_write(&mm->mmap_sem); > - mmput(mm); > - kfree(vwork); > -} > - > -static void vfio_lock_acct(struct task_struct *task, long npage) > -{ > - struct vwork *vwork; > struct mm_struct *mm; > bool is_current; > + int ret; > > if (!npage) > - return; > + return 0; > > is_current = (task->mm == current->mm); > > mm = is_current ? task->mm : get_task_mm(task); A question besides current patch: could I ask why we need to take special care for is_current? I see that is only used to only try avoid get_task_mm() when proper, but is get_task_mm() a heavy operation? > if (!mm) > - return; /* process exited */ > + return -ESRCH; /* process exited */ > > - if (down_write_trylock(&mm->mmap_sem)) { > - mm->locked_vm += npage; > - up_write(&mm->mmap_sem); > - if (!is_current) > - mmput(mm); > - return; > - } > + ret = down_write_killable(&mm->mmap_sem); > + if (!ret) { > + if (npage < 0) { > + mm->locked_vm += npage; > + } else { > + unsigned long limit; > + > + limit = is_current ? > + rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT : > + task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; Maybe we can directly use task_rlimit() here? Since looks like rlimit() is calling it as well, with "current". > + > + if (mm->locked_vm + npage <= limit) > + mm->locked_vm += npage; > + else > + ret = -ENOMEM; > + } > > - if (is_current) { > - mm = get_task_mm(task); > - if (!mm) > - return; > + up_write(&mm->mmap_sem); > } > > - /* > - * Couldn't get mmap_sem lock, so must setup to update > - * mm->locked_vm later. If locked_vm were atomic, we > - * wouldn't need this silliness > - */ > - vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); > - if (WARN_ON(!vwork)) { > + if (!is_current) > mmput(mm); > - return; > - } > - INIT_WORK(&vwork->work, vfio_lock_acct_bg); > - vwork->mm = mm; > - vwork->npage = npage; > - schedule_work(&vwork->work); > + > + return ret; > } > > /* > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > long npage, unsigned long *pfn_base) > { > - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > + unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > bool lock_cap = capable(CAP_IPC_LOCK); > long ret, pinned = 0, lock_acct = 0; > bool rsvd; > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > /* Lock all the consecutive pages from pfn_base */ > for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; > pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { > - unsigned long pfn = 0; > - > ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); > if (ret) > break; > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > put_pfn(pfn, dma->prot); > pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", > __func__, limit << PAGE_SHIFT); > - break; > + ret = -ENOMEM; > + goto unpin_out; > } > lock_acct++; > } > } > > out: > - vfio_lock_acct(current, lock_acct); > + ret = vfio_lock_acct(current, lock_acct); > + > +unpin_out: > + if (ret) { > + if (!rsvd) { > + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) > + put_pfn(pfn, dma->prot); > + } > + > + return ret; > + } The change in vfio_pin_pages_remote() seems to contain a functional change totally not related to the subject (IIUC, we are going to unpin those pages if the huge page can only be pinned partially, and we are not doing that before)? If so, would it be nice to split current patch into two, or at least mention this behavior change in commit log of this patch? Thanks, > > return pinned; > } > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, > goto pin_page_exit; > } > > - if (!rsvd && do_accounting) > - vfio_lock_acct(dma->task, 1); > + if (!rsvd && do_accounting) { > + ret = vfio_lock_acct(dma->task, 1); > + if (ret) { > + put_pfn(*pfn_base, dma->prot); > + goto pin_page_exit; > + } > + } > + > ret = 1; > > pin_page_exit: >
On Tue, 11 Apr 2017 19:03:14 +0800 Peter Xu <peterx@redhat.com> wrote: > On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote: > > If the mmap_sem is contented then the vfio type1 IOMMU backend will > > defer locked page accounting updates to a workqueue task. This has > > a few problems and depending on which side the user tries to play, > > they might be over-penalized for unmaps that haven't yet been > > accounted, or able to race the workqueue to enter more mappings > > than they're allowed. It's not entirely clear what motivated this > > workqueue mechanism in the original vfio design, but it seems to > > introduce more problems than it solves, so remove it and update the > > callers to allow for failure. We can also now recheck the limit > > under write lock to make sure we don't exceed it. > > > > Cc: stable@vger.kernel.org > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > > --- > > > > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric > > > > drivers/vfio/vfio_iommu_type1.c | 101 ++++++++++++++++++--------------------- > > 1 file changed, 46 insertions(+), 55 deletions(-) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > > index 32d2633092a3..b799edbb8c4f 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) > > return ret; > > } > > > > -struct vwork { > > - struct mm_struct *mm; > > - long npage; > > - struct work_struct work; > > -}; > > - > > -/* delayed decrement/increment for locked_vm */ > > -static void vfio_lock_acct_bg(struct work_struct *work) > > +static int vfio_lock_acct(struct task_struct *task, long npage) > > { > > - struct vwork *vwork = container_of(work, struct vwork, work); > > - struct mm_struct *mm; > > - > > - mm = vwork->mm; > > - down_write(&mm->mmap_sem); > > - mm->locked_vm += vwork->npage; > > - up_write(&mm->mmap_sem); > > - mmput(mm); > > - kfree(vwork); > > -} > > - > > -static void vfio_lock_acct(struct task_struct *task, long npage) > > -{ > > - struct vwork *vwork; > > struct mm_struct *mm; > > bool is_current; > > + int ret; > > > > if (!npage) > > - return; > > + return 0; > > > > is_current = (task->mm == current->mm); > > > > mm = is_current ? task->mm : get_task_mm(task); > > A question besides current patch: could I ask why we need to take > special care for is_current? I see that is only used to only try avoid > get_task_mm() when proper, but is get_task_mm() a heavy operation? Yes, it's slower, performance was significantly degraded when mdev support was introduced and imposed get_task_mm() on all calling paths. > > if (!mm) > > - return; /* process exited */ > > + return -ESRCH; /* process exited */ > > > > - if (down_write_trylock(&mm->mmap_sem)) { > > - mm->locked_vm += npage; > > - up_write(&mm->mmap_sem); > > - if (!is_current) > > - mmput(mm); > > - return; > > - } > > + ret = down_write_killable(&mm->mmap_sem); > > + if (!ret) { > > + if (npage < 0) { > > + mm->locked_vm += npage; > > + } else { > > + unsigned long limit; > > + > > + limit = is_current ? > > + rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT : > > + task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > Maybe we can directly use task_rlimit() here? Since looks like > rlimit() is calling it as well, with "current". We could, but does it actually change anything? rlimit() is static inline, so using task_rlimit() for both just moves the is_current ternary into the task_rlimit() argument. Is this: limit = task_rlimit(is_current ? current : task, RLIMIT_MEMLOCK) >> PAGE_SHIFT); notably cleaner than above? > > + > > + if (mm->locked_vm + npage <= limit) > > + mm->locked_vm += npage; > > + else > > + ret = -ENOMEM; > > + } > > > > - if (is_current) { > > - mm = get_task_mm(task); > > - if (!mm) > > - return; > > + up_write(&mm->mmap_sem); > > } > > > > - /* > > - * Couldn't get mmap_sem lock, so must setup to update > > - * mm->locked_vm later. If locked_vm were atomic, we > > - * wouldn't need this silliness > > - */ > > - vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); > > - if (WARN_ON(!vwork)) { > > + if (!is_current) > > mmput(mm); > > - return; > > - } > > - INIT_WORK(&vwork->work, vfio_lock_acct_bg); > > - vwork->mm = mm; > > - vwork->npage = npage; > > - schedule_work(&vwork->work); > > + > > + return ret; > > } > > > > /* > > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > > static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > long npage, unsigned long *pfn_base) > > { > > - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > + unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > bool lock_cap = capable(CAP_IPC_LOCK); > > long ret, pinned = 0, lock_acct = 0; > > bool rsvd; > > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > /* Lock all the consecutive pages from pfn_base */ > > for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; > > pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { > > - unsigned long pfn = 0; > > - > > ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); > > if (ret) > > break; > > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > put_pfn(pfn, dma->prot); > > pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", > > __func__, limit << PAGE_SHIFT); > > - break; > > + ret = -ENOMEM; > > + goto unpin_out; > > } > > lock_acct++; > > } > > } > > > > out: > > - vfio_lock_acct(current, lock_acct); > > + ret = vfio_lock_acct(current, lock_acct); > > + > > +unpin_out: > > + if (ret) { > > + if (!rsvd) { > > + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) > > + put_pfn(pfn, dma->prot); > > + } > > + > > + return ret; > > + } > > The change in vfio_pin_pages_remote() seems to contain a functional > change totally not related to the subject (IIUC, we are going to unpin > those pages if the huge page can only be pinned partially, and we are > not doing that before)? If so, would it be nice to split current patch > into two, or at least mention this behavior change in commit log of > this patch? This is only tangentially about hugepages, this loop is looking for contiguous pages regardless of the processor or IOMMU page size support. We're trying to make as few calls to iommu_map() as we can and thus we want the maximum range of IOVA to physical address we can pump into the IOMMU driver. It's up to the IOMMU driver to figure out how it can optimize that range with hugepages or superpages in its page tables. So the caller of this function is looping over a range of memory and each time this function returns, it maps that many pages through the iommu. On success we return <=npage. The unpin_out loop here is absolutely related to the change proposed here, vfio_lock_acct() can fail, we cannot return both an error and pin pages, therefore we need to undo anything we've pinned this round. Are you perhaps only referring to the exit path above going straight to this loop rather than attempting to do the accounting for the pages pinned so far? Previously that was our only option because the unwind path was to return a short count, invoking the page accounting and iommu_mapping, while fully expecting the caller to again loop over the excess page, return -ENOMEM, and teardown the entire mapping request. So because we now require an unwind path for the vfio_lock_acct() change, we can now do the teardown w/o the additional pinning here and mapping by the caller. In a very strict sense, we could consider that a separate change and move those 3 lines to a follow-on patch but ultimately the caller did request pinned pages beyond what we believe their limit to be and making use of this new exit path here saves us a useless accounting and mapping iteration. I can note that in the commit log. Thanks, Alex > > > > return pinned; > > } > > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, > > goto pin_page_exit; > > } > > > > - if (!rsvd && do_accounting) > > - vfio_lock_acct(dma->task, 1); > > + if (!rsvd && do_accounting) { > > + ret = vfio_lock_acct(dma->task, 1); > > + if (ret) { > > + put_pfn(*pfn_base, dma->prot); > > + goto pin_page_exit; > > + } > > + } > > + > > ret = 1; > > > > pin_page_exit: > > >
On Tue, 11 Apr 2017 12:27:55 -0600 Alex Williamson <alex.williamson@redhat.com> wrote: > On Tue, 11 Apr 2017 19:03:14 +0800 > Peter Xu <peterx@redhat.com> wrote: > > > On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote: > > > If the mmap_sem is contented then the vfio type1 IOMMU backend will > > > defer locked page accounting updates to a workqueue task. This has > > > a few problems and depending on which side the user tries to play, > > > they might be over-penalized for unmaps that haven't yet been > > > accounted, or able to race the workqueue to enter more mappings > > > than they're allowed. It's not entirely clear what motivated this > > > workqueue mechanism in the original vfio design, but it seems to > > > introduce more problems than it solves, so remove it and update the > > > callers to allow for failure. We can also now recheck the limit > > > under write lock to make sure we don't exceed it. > > > > > > Cc: stable@vger.kernel.org > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > > > --- > > > > > > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric > > > > > > drivers/vfio/vfio_iommu_type1.c | 101 ++++++++++++++++++--------------------- > > > 1 file changed, 46 insertions(+), 55 deletions(-) > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > > > index 32d2633092a3..b799edbb8c4f 100644 > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) > > > return ret; > > > } > > > > > > -struct vwork { > > > - struct mm_struct *mm; > > > - long npage; > > > - struct work_struct work; > > > -}; > > > - > > > -/* delayed decrement/increment for locked_vm */ > > > -static void vfio_lock_acct_bg(struct work_struct *work) > > > +static int vfio_lock_acct(struct task_struct *task, long npage) > > > { > > > - struct vwork *vwork = container_of(work, struct vwork, work); > > > - struct mm_struct *mm; > > > - > > > - mm = vwork->mm; > > > - down_write(&mm->mmap_sem); > > > - mm->locked_vm += vwork->npage; > > > - up_write(&mm->mmap_sem); > > > - mmput(mm); > > > - kfree(vwork); > > > -} > > > - > > > -static void vfio_lock_acct(struct task_struct *task, long npage) > > > -{ > > > - struct vwork *vwork; > > > struct mm_struct *mm; > > > bool is_current; > > > + int ret; > > > > > > if (!npage) > > > - return; > > > + return 0; > > > > > > is_current = (task->mm == current->mm); > > > > > > mm = is_current ? task->mm : get_task_mm(task); > > > > A question besides current patch: could I ask why we need to take > > special care for is_current? I see that is only used to only try avoid > > get_task_mm() when proper, but is get_task_mm() a heavy operation? > > Yes, it's slower, performance was significantly degraded when mdev > support was introduced and imposed get_task_mm() on all calling paths. > > > > if (!mm) > > > - return; /* process exited */ > > > + return -ESRCH; /* process exited */ > > > > > > - if (down_write_trylock(&mm->mmap_sem)) { > > > - mm->locked_vm += npage; > > > - up_write(&mm->mmap_sem); > > > - if (!is_current) > > > - mmput(mm); > > > - return; > > > - } > > > + ret = down_write_killable(&mm->mmap_sem); > > > + if (!ret) { > > > + if (npage < 0) { > > > + mm->locked_vm += npage; > > > + } else { > > > + unsigned long limit; > > > + > > > + limit = is_current ? > > > + rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT : > > > + task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > > > Maybe we can directly use task_rlimit() here? Since looks like > > rlimit() is calling it as well, with "current". > > We could, but does it actually change anything? rlimit() is static > inline, so using task_rlimit() for both just moves the is_current > ternary into the task_rlimit() argument. Is this: > > limit = task_rlimit(is_current ? current : task, > RLIMIT_MEMLOCK) >> PAGE_SHIFT); > > notably cleaner than above? Ah, maybe you were suggesting that we can just use "task" here for both since it's always correct. Thanks, Alex > > > + > > > + if (mm->locked_vm + npage <= limit) > > > + mm->locked_vm += npage; > > > + else > > > + ret = -ENOMEM; > > > + } > > > > > > - if (is_current) { > > > - mm = get_task_mm(task); > > > - if (!mm) > > > - return; > > > + up_write(&mm->mmap_sem); > > > } > > > > > > - /* > > > - * Couldn't get mmap_sem lock, so must setup to update > > > - * mm->locked_vm later. If locked_vm were atomic, we > > > - * wouldn't need this silliness > > > - */ > > > - vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); > > > - if (WARN_ON(!vwork)) { > > > + if (!is_current) > > > mmput(mm); > > > - return; > > > - } > > > - INIT_WORK(&vwork->work, vfio_lock_acct_bg); > > > - vwork->mm = mm; > > > - vwork->npage = npage; > > > - schedule_work(&vwork->work); > > > + > > > + return ret; > > > } > > > > > > /* > > > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > > > static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > > long npage, unsigned long *pfn_base) > > > { > > > - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > > + unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > > bool lock_cap = capable(CAP_IPC_LOCK); > > > long ret, pinned = 0, lock_acct = 0; > > > bool rsvd; > > > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > > /* Lock all the consecutive pages from pfn_base */ > > > for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; > > > pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { > > > - unsigned long pfn = 0; > > > - > > > ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); > > > if (ret) > > > break; > > > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > > > put_pfn(pfn, dma->prot); > > > pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", > > > __func__, limit << PAGE_SHIFT); > > > - break; > > > + ret = -ENOMEM; > > > + goto unpin_out; > > > } > > > lock_acct++; > > > } > > > } > > > > > > out: > > > - vfio_lock_acct(current, lock_acct); > > > + ret = vfio_lock_acct(current, lock_acct); > > > + > > > +unpin_out: > > > + if (ret) { > > > + if (!rsvd) { > > > + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) > > > + put_pfn(pfn, dma->prot); > > > + } > > > + > > > + return ret; > > > + } > > > > The change in vfio_pin_pages_remote() seems to contain a functional > > change totally not related to the subject (IIUC, we are going to unpin > > those pages if the huge page can only be pinned partially, and we are > > not doing that before)? If so, would it be nice to split current patch > > into two, or at least mention this behavior change in commit log of > > this patch? > > > This is only tangentially about hugepages, this loop is looking for > contiguous pages regardless of the processor or IOMMU page size > support. We're trying to make as few calls to iommu_map() as we can > and thus we want the maximum range of IOVA to physical address we can > pump into the IOMMU driver. It's up to the IOMMU driver to figure out > how it can optimize that range with hugepages or superpages in its page > tables. So the caller of this function is looping over a range of > memory and each time this function returns, it maps that many pages > through the iommu. On success we return <=npage. > > The unpin_out loop here is absolutely related to the change proposed > here, vfio_lock_acct() can fail, we cannot return both an error and pin > pages, therefore we need to undo anything we've pinned this round. > > Are you perhaps only referring to the exit path above going straight to > this loop rather than attempting to do the accounting for the pages > pinned so far? Previously that was our only option because the unwind > path was to return a short count, invoking the page accounting and > iommu_mapping, while fully expecting the caller to again loop over the > excess page, return -ENOMEM, and teardown the entire mapping request. > So because we now require an unwind path for the vfio_lock_acct() > change, we can now do the teardown w/o the additional pinning here and > mapping by the caller. In a very strict sense, we could consider that > a separate change and move those 3 lines to a follow-on patch but > ultimately the caller did request pinned pages beyond what we believe > their limit to be and making use of this new exit path here saves us a > useless accounting and mapping iteration. I can note that in the > commit log. Thanks, > > Alex > > > > > > > return pinned; > > > } > > > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, > > > goto pin_page_exit; > > > } > > > > > > - if (!rsvd && do_accounting) > > > - vfio_lock_acct(dma->task, 1); > > > + if (!rsvd && do_accounting) { > > > + ret = vfio_lock_acct(dma->task, 1); > > > + if (ret) { > > > + put_pfn(*pfn_base, dma->prot); > > > + goto pin_page_exit; > > > + } > > > + } > > > + > > > ret = 1; > > > > > > pin_page_exit: > > > > > >
On Tue, Apr 11, 2017 at 12:50:11PM -0600, Alex Williamson wrote: > On Tue, 11 Apr 2017 12:27:55 -0600 > Alex Williamson <alex.williamson@redhat.com> wrote: > > > On Tue, 11 Apr 2017 19:03:14 +0800 > > Peter Xu <peterx@redhat.com> wrote: > > > > > On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote: [...] > > > > -static void vfio_lock_acct(struct task_struct *task, long npage) > > > > -{ > > > > - struct vwork *vwork; > > > > struct mm_struct *mm; > > > > bool is_current; > > > > + int ret; > > > > > > > > if (!npage) > > > > - return; > > > > + return 0; > > > > > > > > is_current = (task->mm == current->mm); > > > > > > > > mm = is_current ? task->mm : get_task_mm(task); > > > > > > A question besides current patch: could I ask why we need to take > > > special care for is_current? I see that is only used to only try avoid > > > get_task_mm() when proper, but is get_task_mm() a heavy operation? > > > > Yes, it's slower, performance was significantly degraded when mdev > > support was introduced and imposed get_task_mm() on all calling paths. I see. Thanks. > > > > > > if (!mm) > > > > - return; /* process exited */ > > > > + return -ESRCH; /* process exited */ > > > > > > > > - if (down_write_trylock(&mm->mmap_sem)) { > > > > - mm->locked_vm += npage; > > > > - up_write(&mm->mmap_sem); > > > > - if (!is_current) > > > > - mmput(mm); > > > > - return; > > > > - } > > > > + ret = down_write_killable(&mm->mmap_sem); > > > > + if (!ret) { > > > > + if (npage < 0) { > > > > + mm->locked_vm += npage; > > > > + } else { > > > > + unsigned long limit; > > > > + > > > > + limit = is_current ? > > > > + rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT : > > > > + task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > > > > > Maybe we can directly use task_rlimit() here? Since looks like > > > rlimit() is calling it as well, with "current". > > > > We could, but does it actually change anything? rlimit() is static > > inline, so using task_rlimit() for both just moves the is_current > > ternary into the task_rlimit() argument. Is this: > > > > limit = task_rlimit(is_current ? current : task, > > RLIMIT_MEMLOCK) >> PAGE_SHIFT); > > > > notably cleaner than above? > > Ah, maybe you were suggesting that we can just use "task" here for > both since it's always correct. Thanks, Yes it is. [...] > > > > out: > > > > - vfio_lock_acct(current, lock_acct); > > > > + ret = vfio_lock_acct(current, lock_acct); > > > > + > > > > +unpin_out: > > > > + if (ret) { > > > > + if (!rsvd) { > > > > + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) > > > > + put_pfn(pfn, dma->prot); > > > > + } > > > > + > > > > + return ret; > > > > + } > > > > > > The change in vfio_pin_pages_remote() seems to contain a functional > > > change totally not related to the subject (IIUC, we are going to unpin > > > those pages if the huge page can only be pinned partially, and we are > > > not doing that before)? If so, would it be nice to split current patch > > > into two, or at least mention this behavior change in commit log of > > > this patch? > > > > > > This is only tangentially about hugepages, this loop is looking for > > contiguous pages regardless of the processor or IOMMU page size > > support. It should somewhat related to huge pages? At least we have disable_hugepages parameter, and as well in vfio_pin_pages_remote() we have: if (unlikely(disable_hugepages)) goto out; So the loop will be skipped if that is specified. > > We're trying to make as few calls to iommu_map() as we can > > and thus we want the maximum range of IOVA to physical address we can > > pump into the IOMMU driver. It's up to the IOMMU driver to figure out > > how it can optimize that range with hugepages or superpages in its page > > tables. So the caller of this function is looping over a range of > > memory and each time this function returns, it maps that many pages > > through the iommu. On success we return <=npage. > > > > The unpin_out loop here is absolutely related to the change proposed > > here, vfio_lock_acct() can fail, we cannot return both an error and pin > > pages, therefore we need to undo anything we've pinned this round. Yes you are right. It's related. > > > > Are you perhaps only referring to the exit path above going straight to > > this loop rather than attempting to do the accounting for the pages > > pinned so far? Previously that was our only option because the unwind > > path was to return a short count, invoking the page accounting and > > iommu_mapping, while fully expecting the caller to again loop over the > > excess page, return -ENOMEM, and teardown the entire mapping request. > > So because we now require an unwind path for the vfio_lock_acct() > > change, we can now do the teardown w/o the additional pinning here and > > mapping by the caller. In a very strict sense, we could consider that > > a separate change and move those 3 lines to a follow-on patch but > > ultimately the caller did request pinned pages beyond what we believe > > their limit to be and making use of this new exit path here saves us a > > useless accounting and mapping iteration. I can note that in the > > commit log. Thanks, I agree that in all cases it's a corner case and trivial, and the userspace caller should anyway do something to release its memory accounting. Thanks for addressing my comments and replied with full detail!
On Wed, 12 Apr 2017 12:13:43 +0800 Peter Xu <peterx@redhat.com> wrote: > On Tue, Apr 11, 2017 at 12:50:11PM -0600, Alex Williamson wrote: > > On Tue, 11 Apr 2017 12:27:55 -0600 > > Alex Williamson <alex.williamson@redhat.com> wrote: > > > > > On Tue, 11 Apr 2017 19:03:14 +0800 > > > Peter Xu <peterx@redhat.com> wrote: > > > > > > > On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote: > > [...] > > > > > > -static void vfio_lock_acct(struct task_struct *task, long npage) > > > > > -{ > > > > > - struct vwork *vwork; > > > > > struct mm_struct *mm; > > > > > bool is_current; > > > > > + int ret; > > > > > > > > > > if (!npage) > > > > > - return; > > > > > + return 0; > > > > > > > > > > is_current = (task->mm == current->mm); > > > > > > > > > > mm = is_current ? task->mm : get_task_mm(task); > > > > > > > > A question besides current patch: could I ask why we need to take > > > > special care for is_current? I see that is only used to only try avoid > > > > get_task_mm() when proper, but is get_task_mm() a heavy operation? > > > > > > Yes, it's slower, performance was significantly degraded when mdev > > > support was introduced and imposed get_task_mm() on all calling paths. > > I see. Thanks. > > > > > > > > > if (!mm) > > > > > - return; /* process exited */ > > > > > + return -ESRCH; /* process exited */ > > > > > > > > > > - if (down_write_trylock(&mm->mmap_sem)) { > > > > > - mm->locked_vm += npage; > > > > > - up_write(&mm->mmap_sem); > > > > > - if (!is_current) > > > > > - mmput(mm); > > > > > - return; > > > > > - } > > > > > + ret = down_write_killable(&mm->mmap_sem); > > > > > + if (!ret) { > > > > > + if (npage < 0) { > > > > > + mm->locked_vm += npage; > > > > > + } else { > > > > > + unsigned long limit; > > > > > + > > > > > + limit = is_current ? > > > > > + rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT : > > > > > + task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > > > > > > > Maybe we can directly use task_rlimit() here? Since looks like > > > > rlimit() is calling it as well, with "current". > > > > > > We could, but does it actually change anything? rlimit() is static > > > inline, so using task_rlimit() for both just moves the is_current > > > ternary into the task_rlimit() argument. Is this: > > > > > > limit = task_rlimit(is_current ? current : task, > > > RLIMIT_MEMLOCK) >> PAGE_SHIFT); > > > > > > notably cleaner than above? > > > > Ah, maybe you were suggesting that we can just use "task" here for > > both since it's always correct. Thanks, > > Yes it is. > > [...] > > > > > > out: > > > > > - vfio_lock_acct(current, lock_acct); > > > > > + ret = vfio_lock_acct(current, lock_acct); > > > > > + > > > > > +unpin_out: > > > > > + if (ret) { > > > > > + if (!rsvd) { > > > > > + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) > > > > > + put_pfn(pfn, dma->prot); > > > > > + } > > > > > + > > > > > + return ret; > > > > > + } > > > > > > > > The change in vfio_pin_pages_remote() seems to contain a functional > > > > change totally not related to the subject (IIUC, we are going to unpin > > > > those pages if the huge page can only be pinned partially, and we are > > > > not doing that before)? If so, would it be nice to split current patch > > > > into two, or at least mention this behavior change in commit log of > > > > this patch? > > > > > > > > > This is only tangentially about hugepages, this loop is looking for > > > contiguous pages regardless of the processor or IOMMU page size > > > support. > > It should somewhat related to huge pages? At least we have > disable_hugepages parameter, and as well in vfio_pin_pages_remote() we > have: > > if (unlikely(disable_hugepages)) > goto out; > > So the loop will be skipped if that is specified. Right, that's why I say tangentially, the IOMMU driver cannot make use of any sort of superpages if we map at PAGE_SIZE granularity, at least not until they implement something like transparent hugepage support for the IOMMU. But the point is that the type1 vfio IOMMU backend isn't directly aware of hugepages. > > > We're trying to make as few calls to iommu_map() as we can > > > and thus we want the maximum range of IOVA to physical address we can > > > pump into the IOMMU driver. It's up to the IOMMU driver to figure out > > > how it can optimize that range with hugepages or superpages in its page > > > tables. So the caller of this function is looping over a range of > > > memory and each time this function returns, it maps that many pages > > > through the iommu. On success we return <=npage. > > > > > > The unpin_out loop here is absolutely related to the change proposed > > > here, vfio_lock_acct() can fail, we cannot return both an error and pin > > > pages, therefore we need to undo anything we've pinned this round. > > Yes you are right. It's related. > > > > > > > Are you perhaps only referring to the exit path above going straight to > > > this loop rather than attempting to do the accounting for the pages > > > pinned so far? Previously that was our only option because the unwind > > > path was to return a short count, invoking the page accounting and > > > iommu_mapping, while fully expecting the caller to again loop over the > > > excess page, return -ENOMEM, and teardown the entire mapping request. > > > So because we now require an unwind path for the vfio_lock_acct() > > > change, we can now do the teardown w/o the additional pinning here and > > > mapping by the caller. In a very strict sense, we could consider that > > > a separate change and move those 3 lines to a follow-on patch but > > > ultimately the caller did request pinned pages beyond what we believe > > > their limit to be and making use of this new exit path here saves us a > > > useless accounting and mapping iteration. I can note that in the > > > commit log. Thanks, > > I agree that in all cases it's a corner case and trivial, and the > userspace caller should anyway do something to release its memory > accounting. There's actually no userspace visible change. Before or after, if the user attempts to map more pages than their locked memory limit, they get -ENOMEM. The difference is only whether we handle the condition lazily by completing pinning and mapping up to the limit, before failing and tearing down at the next pass, or undo the work of the current pass and return error to teardown any previous work. Either path all happens within the mapping ioctl. > Thanks for addressing my comments and replied with full detail! Thanks for reviewing! Alex
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 32d2633092a3..b799edbb8c4f 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) return ret; } -struct vwork { - struct mm_struct *mm; - long npage; - struct work_struct work; -}; - -/* delayed decrement/increment for locked_vm */ -static void vfio_lock_acct_bg(struct work_struct *work) +static int vfio_lock_acct(struct task_struct *task, long npage) { - struct vwork *vwork = container_of(work, struct vwork, work); - struct mm_struct *mm; - - mm = vwork->mm; - down_write(&mm->mmap_sem); - mm->locked_vm += vwork->npage; - up_write(&mm->mmap_sem); - mmput(mm); - kfree(vwork); -} - -static void vfio_lock_acct(struct task_struct *task, long npage) -{ - struct vwork *vwork; struct mm_struct *mm; bool is_current; + int ret; if (!npage) - return; + return 0; is_current = (task->mm == current->mm); mm = is_current ? task->mm : get_task_mm(task); if (!mm) - return; /* process exited */ + return -ESRCH; /* process exited */ - if (down_write_trylock(&mm->mmap_sem)) { - mm->locked_vm += npage; - up_write(&mm->mmap_sem); - if (!is_current) - mmput(mm); - return; - } + ret = down_write_killable(&mm->mmap_sem); + if (!ret) { + if (npage < 0) { + mm->locked_vm += npage; + } else { + unsigned long limit; + + limit = is_current ? + rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT : + task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (mm->locked_vm + npage <= limit) + mm->locked_vm += npage; + else + ret = -ENOMEM; + } - if (is_current) { - mm = get_task_mm(task); - if (!mm) - return; + up_write(&mm->mmap_sem); } - /* - * Couldn't get mmap_sem lock, so must setup to update - * mm->locked_vm later. If locked_vm were atomic, we - * wouldn't need this silliness - */ - vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); - if (WARN_ON(!vwork)) { + if (!is_current) mmput(mm); - return; - } - INIT_WORK(&vwork->work, vfio_lock_acct_bg); - vwork->mm = mm; - vwork->npage = npage; - schedule_work(&vwork->work); + + return ret; } /* @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, long npage, unsigned long *pfn_base) { - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; bool lock_cap = capable(CAP_IPC_LOCK); long ret, pinned = 0, lock_acct = 0; bool rsvd; @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, /* Lock all the consecutive pages from pfn_base */ for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { - unsigned long pfn = 0; - ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); if (ret) break; @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, put_pfn(pfn, dma->prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); - break; + ret = -ENOMEM; + goto unpin_out; } lock_acct++; } } out: - vfio_lock_acct(current, lock_acct); + ret = vfio_lock_acct(current, lock_acct); + +unpin_out: + if (ret) { + if (!rsvd) { + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) + put_pfn(pfn, dma->prot); + } + + return ret; + } return pinned; } @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, goto pin_page_exit; } - if (!rsvd && do_accounting) - vfio_lock_acct(dma->task, 1); + if (!rsvd && do_accounting) { + ret = vfio_lock_acct(dma->task, 1); + if (ret) { + put_pfn(*pfn_base, dma->prot); + goto pin_page_exit; + } + } + ret = 1; pin_page_exit:
If the mmap_sem is contented then the vfio type1 IOMMU backend will defer locked page accounting updates to a workqueue task. This has a few problems and depending on which side the user tries to play, they might be over-penalized for unmaps that haven't yet been accounted, or able to race the workqueue to enter more mappings than they're allowed. It's not entirely clear what motivated this workqueue mechanism in the original vfio design, but it seems to introduce more problems than it solves, so remove it and update the callers to allow for failure. We can also now recheck the limit under write lock to make sure we don't exceed it. Cc: stable@vger.kernel.org Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric drivers/vfio/vfio_iommu_type1.c | 101 ++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 55 deletions(-)