Message ID | 20191028201032.6352-10-jgg@ziepe.ca (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Jason Gunthorpe |
Headers | show |
Series | Consolidate the mmu notifier interval_tree and locking | expand |
On 10/28/19 4:10 PM, Jason Gunthorpe wrote: > From: Jason Gunthorpe <jgg@mellanox.com> > > gntdev simply wants to monitor a specific VMA for any notifier events, > this can be done straightforwardly using mmu_range_notifier_insert() over > the VMA's VA range. > > The notifier should be attached until the original VMA is destroyed. > > It is unclear if any of this is even sane, but at least a lot of duplicate > code is removed. I didn't have a chance to look at the patch itself yet but as a heads-up --- it crashes dom0. -boris > > Cc: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> > Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> > Cc: xen-devel@lists.xenproject.org > Cc: Juergen Gross <jgross@suse.com> > Cc: Stefano Stabellini <sstabellini@kernel.org> > Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> > --- > drivers/xen/gntdev-common.h | 8 +- > drivers/xen/gntdev.c | 180 ++++++++++-------------------------- > 2 files changed, 49 insertions(+), 139 deletions(-) > > diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h > index 2f8b949c3eeb14..b201fdd20b667b 100644 > --- a/drivers/xen/gntdev-common.h > +++ b/drivers/xen/gntdev-common.h > @@ -21,15 +21,8 @@ struct gntdev_dmabuf_priv; > struct gntdev_priv { > /* Maps with visible offsets in the file descriptor. */ > struct list_head maps; > - /* > - * Maps that are not visible; will be freed on munmap. > - * Only populated if populate_freeable_maps == 1 > - */ > - struct list_head freeable_maps; > /* lock protects maps and freeable_maps. */ > struct mutex lock; > - struct mm_struct *mm; > - struct mmu_notifier mn; > > #ifdef CONFIG_XEN_GRANT_DMA_ALLOC > /* Device for which DMA memory is allocated. */ > @@ -49,6 +42,7 @@ struct gntdev_unmap_notify { > }; > > struct gntdev_grant_map { > + struct mmu_range_notifier notifier; > struct list_head next; > struct vm_area_struct *vma; > int index; > diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c > index a446a7221e13e9..12d626670bebbc 100644 > --- a/drivers/xen/gntdev.c > +++ b/drivers/xen/gntdev.c > @@ -65,7 +65,6 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " > static atomic_t pages_mapped = ATOMIC_INIT(0); > > static int use_ptemod; > -#define populate_freeable_maps use_ptemod > > static int unmap_grant_pages(struct gntdev_grant_map *map, > int offset, int pages); > @@ -251,12 +250,6 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map) > evtchn_put(map->notify.event); > } > > - if (populate_freeable_maps && priv) { > - mutex_lock(&priv->lock); > - list_del(&map->next); > - mutex_unlock(&priv->lock); > - } > - > if (map->pages && !use_ptemod) > unmap_grant_pages(map, 0, map->count); > gntdev_free_map(map); > @@ -445,17 +438,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma) > struct gntdev_priv *priv = file->private_data; > > pr_debug("gntdev_vma_close %p\n", vma); > - if (use_ptemod) { > - /* It is possible that an mmu notifier could be running > - * concurrently, so take priv->lock to ensure that the vma won't > - * vanishing during the unmap_grant_pages call, since we will > - * spin here until that completes. Such a concurrent call will > - * not do any unmapping, since that has been done prior to > - * closing the vma, but it may still iterate the unmap_ops list. > - */ > - mutex_lock(&priv->lock); > + if (use_ptemod && map->vma == vma) { > + mmu_range_notifier_remove(&map->notifier); > map->vma = NULL; > - mutex_unlock(&priv->lock); > } > vma->vm_private_data = NULL; > gntdev_put_map(priv, map); > @@ -477,109 +462,44 @@ static const struct vm_operations_struct gntdev_vmops = { > > /* ------------------------------------------------------------------ */ > > -static bool in_range(struct gntdev_grant_map *map, > - unsigned long start, unsigned long end) > -{ > - if (!map->vma) > - return false; > - if (map->vma->vm_start >= end) > - return false; > - if (map->vma->vm_end <= start) > - return false; > - > - return true; > -} > - > -static int unmap_if_in_range(struct gntdev_grant_map *map, > - unsigned long start, unsigned long end, > - bool blockable) > +static bool gntdev_invalidate(struct mmu_range_notifier *mn, > + const struct mmu_notifier_range *range, > + unsigned long cur_seq) > { > + struct gntdev_grant_map *map = > + container_of(mn, struct gntdev_grant_map, notifier); > unsigned long mstart, mend; > int err; > > - if (!in_range(map, start, end)) > - return 0; > + if (!mmu_notifier_range_blockable(range)) > + return false; > > - if (!blockable) > - return -EAGAIN; > + /* > + * If the VMA is split or otherwise changed the notifier is not > + * updated, but we don't want to process VA's outside the modified > + * VMA. FIXME: It would be much more understandable to just prevent > + * modifying the VMA in the first place. > + */ > + if (map->vma->vm_start >= range->end || > + map->vma->vm_end <= range->start) > + return true; > > - mstart = max(start, map->vma->vm_start); > - mend = min(end, map->vma->vm_end); > + mstart = max(range->start, map->vma->vm_start); > + mend = min(range->end, map->vma->vm_end); > pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", > map->index, map->count, > map->vma->vm_start, map->vma->vm_end, > - start, end, mstart, mend); > + range->start, range->end, mstart, mend); > err = unmap_grant_pages(map, > (mstart - map->vma->vm_start) >> PAGE_SHIFT, > (mend - mstart) >> PAGE_SHIFT); > WARN_ON(err); > > - return 0; > -} > - > -static int mn_invl_range_start(struct mmu_notifier *mn, > - const struct mmu_notifier_range *range) > -{ > - struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); > - struct gntdev_grant_map *map; > - int ret = 0; > - > - if (mmu_notifier_range_blockable(range)) > - mutex_lock(&priv->lock); > - else if (!mutex_trylock(&priv->lock)) > - return -EAGAIN; > - > - list_for_each_entry(map, &priv->maps, next) { > - ret = unmap_if_in_range(map, range->start, range->end, > - mmu_notifier_range_blockable(range)); > - if (ret) > - goto out_unlock; > - } > - list_for_each_entry(map, &priv->freeable_maps, next) { > - ret = unmap_if_in_range(map, range->start, range->end, > - mmu_notifier_range_blockable(range)); > - if (ret) > - goto out_unlock; > - } > - > -out_unlock: > - mutex_unlock(&priv->lock); > - > - return ret; > -} > - > -static void mn_release(struct mmu_notifier *mn, > - struct mm_struct *mm) > -{ > - struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); > - struct gntdev_grant_map *map; > - int err; > - > - mutex_lock(&priv->lock); > - list_for_each_entry(map, &priv->maps, next) { > - if (!map->vma) > - continue; > - pr_debug("map %d+%d (%lx %lx)\n", > - map->index, map->count, > - map->vma->vm_start, map->vma->vm_end); > - err = unmap_grant_pages(map, /* offset */ 0, map->count); > - WARN_ON(err); > - } > - list_for_each_entry(map, &priv->freeable_maps, next) { > - if (!map->vma) > - continue; > - pr_debug("map %d+%d (%lx %lx)\n", > - map->index, map->count, > - map->vma->vm_start, map->vma->vm_end); > - err = unmap_grant_pages(map, /* offset */ 0, map->count); > - WARN_ON(err); > - } > - mutex_unlock(&priv->lock); > + return true; > } > > -static const struct mmu_notifier_ops gntdev_mmu_ops = { > - .release = mn_release, > - .invalidate_range_start = mn_invl_range_start, > +static const struct mmu_range_notifier_ops gntdev_mmu_ops = { > + .invalidate = gntdev_invalidate, > }; > > /* ------------------------------------------------------------------ */ > @@ -594,7 +514,6 @@ static int gntdev_open(struct inode *inode, struct file *flip) > return -ENOMEM; > > INIT_LIST_HEAD(&priv->maps); > - INIT_LIST_HEAD(&priv->freeable_maps); > mutex_init(&priv->lock); > > #ifdef CONFIG_XEN_GNTDEV_DMABUF > @@ -606,17 +525,6 @@ static int gntdev_open(struct inode *inode, struct file *flip) > } > #endif > > - if (use_ptemod) { > - priv->mm = get_task_mm(current); > - if (!priv->mm) { > - kfree(priv); > - return -ENOMEM; > - } > - priv->mn.ops = &gntdev_mmu_ops; > - ret = mmu_notifier_register(&priv->mn, priv->mm); > - mmput(priv->mm); > - } > - > if (ret) { > kfree(priv); > return ret; > @@ -653,16 +561,12 @@ static int gntdev_release(struct inode *inode, struct file *flip) > list_del(&map->next); > gntdev_put_map(NULL /* already removed */, map); > } > - WARN_ON(!list_empty(&priv->freeable_maps)); > mutex_unlock(&priv->lock); > > #ifdef CONFIG_XEN_GNTDEV_DMABUF > gntdev_dmabuf_fini(priv->dmabuf_priv); > #endif > > - if (use_ptemod) > - mmu_notifier_unregister(&priv->mn, priv->mm); > - > kfree(priv); > return 0; > } > @@ -723,8 +627,6 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, > map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); > if (map) { > list_del(&map->next); > - if (populate_freeable_maps) > - list_add_tail(&map->next, &priv->freeable_maps); > err = 0; > } > mutex_unlock(&priv->lock); > @@ -1096,11 +998,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) > goto unlock_out; > if (use_ptemod && map->vma) > goto unlock_out; > - if (use_ptemod && priv->mm != vma->vm_mm) { > - pr_warn("Huh? Other mm?\n"); > - goto unlock_out; > - } > - > refcount_inc(&map->users); > > vma->vm_ops = &gntdev_vmops; > @@ -1111,10 +1008,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) > vma->vm_flags |= VM_DONTCOPY; > > vma->vm_private_data = map; > - > - if (use_ptemod) > - map->vma = vma; > - > if (map->flags) { > if ((vma->vm_flags & VM_WRITE) && > (map->flags & GNTMAP_readonly)) > @@ -1125,8 +1018,28 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) > map->flags |= GNTMAP_readonly; > } > > + if (use_ptemod) { > + map->vma = vma; > + err = mmu_range_notifier_insert_locked( > + &map->notifier, vma->vm_start, > + vma->vm_end - vma->vm_start, vma->vm_mm); > + if (err) > + goto out_unlock_put; > + } > mutex_unlock(&priv->lock); > > + /* > + * gntdev takes the address of the PTE in find_grant_ptes() and passes > + * it to the hypervisor in gntdev_map_grant_pages(). The purpose of > + * the notifier is to prevent the hypervisor pointer to the PTE from > + * going stale. > + * > + * Since this vma's mappings can't be touched without the mmap_sem, > + * and we are holding it now, there is no need for the notifier_range > + * locking pattern. > + */ > + mmu_range_read_begin(&map->notifier); > + > if (use_ptemod) { > map->pages_vm_start = vma->vm_start; > err = apply_to_page_range(vma->vm_mm, vma->vm_start, > @@ -1175,8 +1088,11 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) > mutex_unlock(&priv->lock); > out_put_map: > if (use_ptemod) { > - map->vma = NULL; > unmap_grant_pages(map, 0, map->count); > + if (map->vma) { > + mmu_range_notifier_remove(&map->notifier); > + map->vma = NULL; > + } > } > gntdev_put_map(priv, map); > return err;
On Wed, Oct 30, 2019 at 12:55:37PM -0400, Boris Ostrovsky wrote: > On 10/28/19 4:10 PM, Jason Gunthorpe wrote: > > From: Jason Gunthorpe <jgg@mellanox.com> > > > > gntdev simply wants to monitor a specific VMA for any notifier events, > > this can be done straightforwardly using mmu_range_notifier_insert() over > > the VMA's VA range. > > > > The notifier should be attached until the original VMA is destroyed. > > > > It is unclear if any of this is even sane, but at least a lot of duplicate > > code is removed. > > I didn't have a chance to look at the patch itself yet but as a heads-up > --- it crashes dom0. Thanks Boris. I spent a bit of time and got a VM running with a xen 4.9 hypervisor and a kernel with this patch series. It a ubuntu bionic VM with the distro's xen stuff. Can you give some guidance how you made it crash? I see the VM autoloaded gntdev: Module Size Used by xen_gntdev 24576 2 xen_evtchn 16384 1 xenfs 16384 1 xen_privcmd 24576 16 xenfs And lsof says several xen processes have the chardev open: xenstored 819 root 13u CHR 10,53 0t0 19595 /dev/xen/gntdev xenconsol 857 root 8u CHR 10,53 0t0 19595 /dev/xen/gntdev xenconsol 857 860 root 8u CHR 10,53 0t0 19595 /dev/xen/gntdev But no crashing.. However, I wasn't able to get my usual debug kernel .config to boot with the xen hypervisor, it crashes on early boot with: (XEN) Dom0 has maximum 8 VCPUs (XEN) Scrubbing Free RAM on 1 nodes using 8 CPUs (XEN) .done. (XEN) Initial low memory virq threshold set at 0x1000 pages. (XEN) Std. Loglevel: All (XEN) Guest Loglevel: All (XEN) *** Serial input -> DOM0 (type 'CTRL-a' three times to switch input to Xen) (XEN) Freed 468kB init memory (XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0002] (XEN) Pagetable walk from fffffbfff0480fbe: (XEN) L4[0x1f7] = 0000000000000000 ffffffffffffffff (XEN) domain_crash_sync called from entry.S: fault at ffff82d080348a06 entry.o#create_bounce_frame+0x135/0x15f (XEN) Domain 0 (vcpu#0) crashed on cpu#0: (XEN) ----[ Xen-4.9.2 x86_64 debug=n Not tainted ]---- (XEN) CPU: 0 (XEN) RIP: e033:[<ffffffff82b9f731>] (XEN) RFLAGS: 0000000000000296 EM: 1 CONTEXT: pv guest (d0v0) (XEN) rax: fffffbfff0480fbe rbx: 0000000000000000 rcx: 00000000c0000101 (XEN) rdx: 00000000ffffffff rsi: ffffffff84026000 rdi: ffffffff82cb4a20 (XEN) rbp: ffffffff82407ff8 rsp: ffffffff82407da0 r8: 0000000000000000 (XEN) r9: 0000000000000000 r10: 0000000000000000 r11: 0000000000000000 (XEN) r12: 0000000000000000 r13: 1ffffffff0480fbe r14: 0000000000000000 (XEN) r15: 0000000000000000 cr0: 000000008005003b cr4: 00000000003506e0 (XEN) cr3: 0000000034027000 cr2: fffffbfff0480fbe (XEN) fsb: 0000000000000000 gsb: ffffffff82b61000 gss: 0000000000000000 (XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e02b cs: e033 Which is surely some .config issue, but I didn't figure out what. Jason
On 11/1/19 1:48 PM, Jason Gunthorpe wrote: > On Wed, Oct 30, 2019 at 12:55:37PM -0400, Boris Ostrovsky wrote: >> On 10/28/19 4:10 PM, Jason Gunthorpe wrote: >>> From: Jason Gunthorpe <jgg@mellanox.com> >>> >>> gntdev simply wants to monitor a specific VMA for any notifier events, >>> this can be done straightforwardly using mmu_range_notifier_insert() over >>> the VMA's VA range. >>> >>> The notifier should be attached until the original VMA is destroyed. >>> >>> It is unclear if any of this is even sane, but at least a lot of duplicate >>> code is removed. >> I didn't have a chance to look at the patch itself yet but as a heads-up >> --- it crashes dom0. > Thanks Boris. I spent a bit of time and got a VM running with a xen > 4.9 hypervisor and a kernel with this patch series. It a ubuntu bionic > VM with the distro's xen stuff. > > Can you give some guidance how you made it crash? It crashes trying to dereference mrn->ops->invalidate in mn_itree_invalidate() when a guest exits. I don't think you've initialized notifier ops. I don't see you using gntdev_mmu_ops anywhere. -boris > I see the VM > autoloaded gntdev: > > Module Size Used by > xen_gntdev 24576 2 > xen_evtchn 16384 1 > xenfs 16384 1 > xen_privcmd 24576 16 xenfs > > And lsof says several xen processes have the chardev open: > > xenstored 819 root 13u CHR 10,53 0t0 19595 /dev/xen/gntdev > xenconsol 857 root 8u CHR 10,53 0t0 19595 /dev/xen/gntdev > xenconsol 857 860 root 8u CHR 10,53 0t0 19595 /dev/xen/gntdev > > But no crashing.. > > However, I wasn't able to get my usual debug kernel .config to boot > with the xen hypervisor, it crashes on early boot with: > > (XEN) Dom0 has maximum 8 VCPUs > (XEN) Scrubbing Free RAM on 1 nodes using 8 CPUs > (XEN) .done. > (XEN) Initial low memory virq threshold set at 0x1000 pages. > (XEN) Std. Loglevel: All > (XEN) Guest Loglevel: All > (XEN) *** Serial input -> DOM0 (type 'CTRL-a' three times to switch input to Xen) > (XEN) Freed 468kB init memory > (XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0002] > (XEN) Pagetable walk from fffffbfff0480fbe: > (XEN) L4[0x1f7] = 0000000000000000 ffffffffffffffff > (XEN) domain_crash_sync called from entry.S: fault at ffff82d080348a06 entry.o#create_bounce_frame+0x135/0x15f > (XEN) Domain 0 (vcpu#0) crashed on cpu#0: > (XEN) ----[ Xen-4.9.2 x86_64 debug=n Not tainted ]---- > (XEN) CPU: 0 > (XEN) RIP: e033:[<ffffffff82b9f731>] > (XEN) RFLAGS: 0000000000000296 EM: 1 CONTEXT: pv guest (d0v0) > (XEN) rax: fffffbfff0480fbe rbx: 0000000000000000 rcx: 00000000c0000101 > (XEN) rdx: 00000000ffffffff rsi: ffffffff84026000 rdi: ffffffff82cb4a20 > (XEN) rbp: ffffffff82407ff8 rsp: ffffffff82407da0 r8: 0000000000000000 > (XEN) r9: 0000000000000000 r10: 0000000000000000 r11: 0000000000000000 > (XEN) r12: 0000000000000000 r13: 1ffffffff0480fbe r14: 0000000000000000 > (XEN) r15: 0000000000000000 cr0: 000000008005003b cr4: 00000000003506e0 > (XEN) cr3: 0000000034027000 cr2: fffffbfff0480fbe > (XEN) fsb: 0000000000000000 gsb: ffffffff82b61000 gss: 0000000000000000 > (XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e02b cs: e033 > > Which is surely some .config issue, but I didn't figure out what. > > Jason
On Fri, Nov 01, 2019 at 02:51:46PM -0400, Boris Ostrovsky wrote: > On 11/1/19 1:48 PM, Jason Gunthorpe wrote: > > On Wed, Oct 30, 2019 at 12:55:37PM -0400, Boris Ostrovsky wrote: > >> On 10/28/19 4:10 PM, Jason Gunthorpe wrote: > >>> From: Jason Gunthorpe <jgg@mellanox.com> > >>> > >>> gntdev simply wants to monitor a specific VMA for any notifier events, > >>> this can be done straightforwardly using mmu_range_notifier_insert() over > >>> the VMA's VA range. > >>> > >>> The notifier should be attached until the original VMA is destroyed. > >>> > >>> It is unclear if any of this is even sane, but at least a lot of duplicate > >>> code is removed. > >> I didn't have a chance to look at the patch itself yet but as a heads-up > > Thanks Boris. I spent a bit of time and got a VM running with a xen > > 4.9 hypervisor and a kernel with this patch series. It a ubuntu bionic > > VM with the distro's xen stuff. > > > > Can you give some guidance how you made it crash? > > It crashes trying to dereference mrn->ops->invalidate in > mn_itree_invalidate() when a guest exits. > > I don't think you've initialized notifier ops. I don't see you using > gntdev_mmu_ops anywhere. So weird the compiler didn't complain about an unused static... But yes, this is a mistake, it should be: diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 37b278857ad807..0ca35485fd3865 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1011,6 +1011,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) if (use_ptemod) { map->vma = vma; + map->notifier.ops = &gntdev_mmu_ops; err = mmu_range_notifier_insert_locked( &map->notifier, vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_mm); Thanks, Jason
On 10/28/19 4:10 PM, Jason Gunthorpe wrote: > @@ -445,17 +438,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma) > struct gntdev_priv *priv = file->private_data; > > pr_debug("gntdev_vma_close %p\n", vma); > - if (use_ptemod) { > - /* It is possible that an mmu notifier could be running > - * concurrently, so take priv->lock to ensure that the vma won't > - * vanishing during the unmap_grant_pages call, since we will > - * spin here until that completes. Such a concurrent call will > - * not do any unmapping, since that has been done prior to > - * closing the vma, but it may still iterate the unmap_ops list. > - */ > - mutex_lock(&priv->lock); > + if (use_ptemod && map->vma == vma) { Is it possible for map->vma not to be equal to vma? -boris > + mmu_range_notifier_remove(&map->notifier); > map->vma = NULL; > - mutex_unlock(&priv->lock); > } > vma->vm_private_data = NULL; > gntdev_put_map(priv, map); >
On Mon, Nov 04, 2019 at 05:03:31PM -0500, Boris Ostrovsky wrote: > On 10/28/19 4:10 PM, Jason Gunthorpe wrote: > > @@ -445,17 +438,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma) > > struct gntdev_priv *priv = file->private_data; > > > > pr_debug("gntdev_vma_close %p\n", vma); > > - if (use_ptemod) { > > - /* It is possible that an mmu notifier could be running > > - * concurrently, so take priv->lock to ensure that the vma won't > > - * vanishing during the unmap_grant_pages call, since we will > > - * spin here until that completes. Such a concurrent call will > > - * not do any unmapping, since that has been done prior to > > - * closing the vma, but it may still iterate the unmap_ops list. > > - */ > > - mutex_lock(&priv->lock); > > + if (use_ptemod && map->vma == vma) { > > > Is it possible for map->vma not to be equal to vma? It could be NULL at least if use_ptemod is not set. Otherwise, I'm not sure, the confusing bit is that the map comes from here: map = gntdev_find_map_index(priv, index, count); It looks like the intent is that the map->vma is always set to the only vma that has the map as private_data. So, I suppose it can be relaxed to a null test and a WARN_ON that it hasn't changed? Jason
On 11/4/19 9:31 PM, Jason Gunthorpe wrote: > On Mon, Nov 04, 2019 at 05:03:31PM -0500, Boris Ostrovsky wrote: >> On 10/28/19 4:10 PM, Jason Gunthorpe wrote: >>> @@ -445,17 +438,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma) >>> struct gntdev_priv *priv = file->private_data; >>> >>> pr_debug("gntdev_vma_close %p\n", vma); >>> - if (use_ptemod) { >>> - /* It is possible that an mmu notifier could be running >>> - * concurrently, so take priv->lock to ensure that the vma won't >>> - * vanishing during the unmap_grant_pages call, since we will >>> - * spin here until that completes. Such a concurrent call will >>> - * not do any unmapping, since that has been done prior to >>> - * closing the vma, but it may still iterate the unmap_ops list. >>> - */ >>> - mutex_lock(&priv->lock); >>> + if (use_ptemod && map->vma == vma) { >> >> Is it possible for map->vma not to be equal to vma? > It could be NULL at least if use_ptemod is not set. > > Otherwise, I'm not sure, the confusing bit is that the map comes from > here: > > map = gntdev_find_map_index(priv, index, count); > > It looks like the intent is that the map->vma is always set to the > only vma that has the map as private_data. I am not sure how this can work otherwise. We stash map pointer in vm's vm_private_data and vice versa (for use_ptemod) gntdev_mmap() so if they have to match. That's why I was asking you to see if you had something particular in mind when you added this test. > So, I suppose it can be relaxed to a null test and a WARN_ON that it > hasn't changed? You mean if (use_ptemod) { WARN_ON(map->vma != vma); ... Yes, that sounds good. -boris
On Tue, Nov 05, 2019 at 10:16:46AM -0500, Boris Ostrovsky wrote: > > So, I suppose it can be relaxed to a null test and a WARN_ON that it > > hasn't changed? > > You mean > > if (use_ptemod) { > WARN_ON(map->vma != vma); > ... > > > Yes, that sounds good. I amended my copy of the patch with the above, has this rework shown signs of working? @@ -436,7 +436,8 @@ static void gntdev_vma_close(struct vm_area_struct *vma) struct gntdev_priv *priv = file->private_data; pr_debug("gntdev_vma_close %p\n", vma); - if (use_ptemod && map->vma == vma) { + if (use_ptemod) { + WARN_ON(map->vma != vma); mmu_range_notifier_remove(&map->notifier); map->vma = NULL; } Jason
On 11/7/19 3:36 PM, Jason Gunthorpe wrote: > On Tue, Nov 05, 2019 at 10:16:46AM -0500, Boris Ostrovsky wrote: > >>> So, I suppose it can be relaxed to a null test and a WARN_ON that it >>> hasn't changed? >> You mean >> >> if (use_ptemod) { >> WARN_ON(map->vma != vma); >> ... >> >> >> Yes, that sounds good. > I amended my copy of the patch with the above, has this rework shown > signs of working? Yes, it works fine. But please don't forget notifier ops initialization. With those two changes, Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> > > @@ -436,7 +436,8 @@ static void gntdev_vma_close(struct vm_area_struct *vma) > struct gntdev_priv *priv = file->private_data; > > pr_debug("gntdev_vma_close %p\n", vma); > - if (use_ptemod && map->vma == vma) { > + if (use_ptemod) { > + WARN_ON(map->vma != vma); > mmu_range_notifier_remove(&map->notifier); > map->vma = NULL; > } > > Jason
On Thu, Nov 07, 2019 at 05:54:52PM -0500, Boris Ostrovsky wrote: > On 11/7/19 3:36 PM, Jason Gunthorpe wrote: > > On Tue, Nov 05, 2019 at 10:16:46AM -0500, Boris Ostrovsky wrote: > > > >>> So, I suppose it can be relaxed to a null test and a WARN_ON that it > >>> hasn't changed? > >> You mean > >> > >> if (use_ptemod) { > >> WARN_ON(map->vma != vma); > >> ... > >> > >> > >> Yes, that sounds good. > > I amended my copy of the patch with the above, has this rework shown > > signs of working? > > Yes, it works fine. > > But please don't forget notifier ops initialization. > > With those two changes, > > Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Thanks, I got both things. I'll forward this toward linux-next and repost a v3 Jason
diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h index 2f8b949c3eeb14..b201fdd20b667b 100644 --- a/drivers/xen/gntdev-common.h +++ b/drivers/xen/gntdev-common.h @@ -21,15 +21,8 @@ struct gntdev_dmabuf_priv; struct gntdev_priv { /* Maps with visible offsets in the file descriptor. */ struct list_head maps; - /* - * Maps that are not visible; will be freed on munmap. - * Only populated if populate_freeable_maps == 1 - */ - struct list_head freeable_maps; /* lock protects maps and freeable_maps. */ struct mutex lock; - struct mm_struct *mm; - struct mmu_notifier mn; #ifdef CONFIG_XEN_GRANT_DMA_ALLOC /* Device for which DMA memory is allocated. */ @@ -49,6 +42,7 @@ struct gntdev_unmap_notify { }; struct gntdev_grant_map { + struct mmu_range_notifier notifier; struct list_head next; struct vm_area_struct *vma; int index; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index a446a7221e13e9..12d626670bebbc 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -65,7 +65,6 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " static atomic_t pages_mapped = ATOMIC_INIT(0); static int use_ptemod; -#define populate_freeable_maps use_ptemod static int unmap_grant_pages(struct gntdev_grant_map *map, int offset, int pages); @@ -251,12 +250,6 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map) evtchn_put(map->notify.event); } - if (populate_freeable_maps && priv) { - mutex_lock(&priv->lock); - list_del(&map->next); - mutex_unlock(&priv->lock); - } - if (map->pages && !use_ptemod) unmap_grant_pages(map, 0, map->count); gntdev_free_map(map); @@ -445,17 +438,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma) struct gntdev_priv *priv = file->private_data; pr_debug("gntdev_vma_close %p\n", vma); - if (use_ptemod) { - /* It is possible that an mmu notifier could be running - * concurrently, so take priv->lock to ensure that the vma won't - * vanishing during the unmap_grant_pages call, since we will - * spin here until that completes. Such a concurrent call will - * not do any unmapping, since that has been done prior to - * closing the vma, but it may still iterate the unmap_ops list. - */ - mutex_lock(&priv->lock); + if (use_ptemod && map->vma == vma) { + mmu_range_notifier_remove(&map->notifier); map->vma = NULL; - mutex_unlock(&priv->lock); } vma->vm_private_data = NULL; gntdev_put_map(priv, map); @@ -477,109 +462,44 @@ static const struct vm_operations_struct gntdev_vmops = { /* ------------------------------------------------------------------ */ -static bool in_range(struct gntdev_grant_map *map, - unsigned long start, unsigned long end) -{ - if (!map->vma) - return false; - if (map->vma->vm_start >= end) - return false; - if (map->vma->vm_end <= start) - return false; - - return true; -} - -static int unmap_if_in_range(struct gntdev_grant_map *map, - unsigned long start, unsigned long end, - bool blockable) +static bool gntdev_invalidate(struct mmu_range_notifier *mn, + const struct mmu_notifier_range *range, + unsigned long cur_seq) { + struct gntdev_grant_map *map = + container_of(mn, struct gntdev_grant_map, notifier); unsigned long mstart, mend; int err; - if (!in_range(map, start, end)) - return 0; + if (!mmu_notifier_range_blockable(range)) + return false; - if (!blockable) - return -EAGAIN; + /* + * If the VMA is split or otherwise changed the notifier is not + * updated, but we don't want to process VA's outside the modified + * VMA. FIXME: It would be much more understandable to just prevent + * modifying the VMA in the first place. + */ + if (map->vma->vm_start >= range->end || + map->vma->vm_end <= range->start) + return true; - mstart = max(start, map->vma->vm_start); - mend = min(end, map->vma->vm_end); + mstart = max(range->start, map->vma->vm_start); + mend = min(range->end, map->vma->vm_end); pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", map->index, map->count, map->vma->vm_start, map->vma->vm_end, - start, end, mstart, mend); + range->start, range->end, mstart, mend); err = unmap_grant_pages(map, (mstart - map->vma->vm_start) >> PAGE_SHIFT, (mend - mstart) >> PAGE_SHIFT); WARN_ON(err); - return 0; -} - -static int mn_invl_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) -{ - struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); - struct gntdev_grant_map *map; - int ret = 0; - - if (mmu_notifier_range_blockable(range)) - mutex_lock(&priv->lock); - else if (!mutex_trylock(&priv->lock)) - return -EAGAIN; - - list_for_each_entry(map, &priv->maps, next) { - ret = unmap_if_in_range(map, range->start, range->end, - mmu_notifier_range_blockable(range)); - if (ret) - goto out_unlock; - } - list_for_each_entry(map, &priv->freeable_maps, next) { - ret = unmap_if_in_range(map, range->start, range->end, - mmu_notifier_range_blockable(range)); - if (ret) - goto out_unlock; - } - -out_unlock: - mutex_unlock(&priv->lock); - - return ret; -} - -static void mn_release(struct mmu_notifier *mn, - struct mm_struct *mm) -{ - struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); - struct gntdev_grant_map *map; - int err; - - mutex_lock(&priv->lock); - list_for_each_entry(map, &priv->maps, next) { - if (!map->vma) - continue; - pr_debug("map %d+%d (%lx %lx)\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end); - err = unmap_grant_pages(map, /* offset */ 0, map->count); - WARN_ON(err); - } - list_for_each_entry(map, &priv->freeable_maps, next) { - if (!map->vma) - continue; - pr_debug("map %d+%d (%lx %lx)\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end); - err = unmap_grant_pages(map, /* offset */ 0, map->count); - WARN_ON(err); - } - mutex_unlock(&priv->lock); + return true; } -static const struct mmu_notifier_ops gntdev_mmu_ops = { - .release = mn_release, - .invalidate_range_start = mn_invl_range_start, +static const struct mmu_range_notifier_ops gntdev_mmu_ops = { + .invalidate = gntdev_invalidate, }; /* ------------------------------------------------------------------ */ @@ -594,7 +514,6 @@ static int gntdev_open(struct inode *inode, struct file *flip) return -ENOMEM; INIT_LIST_HEAD(&priv->maps); - INIT_LIST_HEAD(&priv->freeable_maps); mutex_init(&priv->lock); #ifdef CONFIG_XEN_GNTDEV_DMABUF @@ -606,17 +525,6 @@ static int gntdev_open(struct inode *inode, struct file *flip) } #endif - if (use_ptemod) { - priv->mm = get_task_mm(current); - if (!priv->mm) { - kfree(priv); - return -ENOMEM; - } - priv->mn.ops = &gntdev_mmu_ops; - ret = mmu_notifier_register(&priv->mn, priv->mm); - mmput(priv->mm); - } - if (ret) { kfree(priv); return ret; @@ -653,16 +561,12 @@ static int gntdev_release(struct inode *inode, struct file *flip) list_del(&map->next); gntdev_put_map(NULL /* already removed */, map); } - WARN_ON(!list_empty(&priv->freeable_maps)); mutex_unlock(&priv->lock); #ifdef CONFIG_XEN_GNTDEV_DMABUF gntdev_dmabuf_fini(priv->dmabuf_priv); #endif - if (use_ptemod) - mmu_notifier_unregister(&priv->mn, priv->mm); - kfree(priv); return 0; } @@ -723,8 +627,6 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); - if (populate_freeable_maps) - list_add_tail(&map->next, &priv->freeable_maps); err = 0; } mutex_unlock(&priv->lock); @@ -1096,11 +998,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) goto unlock_out; if (use_ptemod && map->vma) goto unlock_out; - if (use_ptemod && priv->mm != vma->vm_mm) { - pr_warn("Huh? Other mm?\n"); - goto unlock_out; - } - refcount_inc(&map->users); vma->vm_ops = &gntdev_vmops; @@ -1111,10 +1008,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_flags |= VM_DONTCOPY; vma->vm_private_data = map; - - if (use_ptemod) - map->vma = vma; - if (map->flags) { if ((vma->vm_flags & VM_WRITE) && (map->flags & GNTMAP_readonly)) @@ -1125,8 +1018,28 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) map->flags |= GNTMAP_readonly; } + if (use_ptemod) { + map->vma = vma; + err = mmu_range_notifier_insert_locked( + &map->notifier, vma->vm_start, + vma->vm_end - vma->vm_start, vma->vm_mm); + if (err) + goto out_unlock_put; + } mutex_unlock(&priv->lock); + /* + * gntdev takes the address of the PTE in find_grant_ptes() and passes + * it to the hypervisor in gntdev_map_grant_pages(). The purpose of + * the notifier is to prevent the hypervisor pointer to the PTE from + * going stale. + * + * Since this vma's mappings can't be touched without the mmap_sem, + * and we are holding it now, there is no need for the notifier_range + * locking pattern. + */ + mmu_range_read_begin(&map->notifier); + if (use_ptemod) { map->pages_vm_start = vma->vm_start; err = apply_to_page_range(vma->vm_mm, vma->vm_start, @@ -1175,8 +1088,11 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) mutex_unlock(&priv->lock); out_put_map: if (use_ptemod) { - map->vma = NULL; unmap_grant_pages(map, 0, map->count); + if (map->vma) { + mmu_range_notifier_remove(&map->notifier); + map->vma = NULL; + } } gntdev_put_map(priv, map); return err;