
[v2,09/15] xen/gntdev: use mmu_range_notifier_insert

Message ID 20191028201032.6352-10-jgg@ziepe.ca (mailing list archive)
State Superseded
Delegated to: Jason Gunthorpe
Series Consolidate the mmu notifier interval_tree and locking

Commit Message

Jason Gunthorpe Oct. 28, 2019, 8:10 p.m. UTC
From: Jason Gunthorpe <jgg@mellanox.com>

gntdev simply wants to monitor a specific VMA for any notifier events;
this can be done straightforwardly using mmu_range_notifier_insert() over
the VMA's VA range.

The notifier should be attached until the original VMA is destroyed.

It is unclear if any of this is even sane, but at least a lot of duplicate
code is removed.
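
For reference, the resulting pattern is roughly the sketch below (illustrative
names such as my_invalidate/my_ops; the calls are abridged from the diff, with
error handling trimmed):

static bool my_invalidate(struct mmu_range_notifier *mn,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	if (!mmu_notifier_range_blockable(range))
		return false;
	/* tear down anything overlapping range->start .. range->end */
	return true;
}

static const struct mmu_range_notifier_ops my_ops = {
	.invalidate = my_invalidate,
};

	/* at mmap time, with mmap_sem held: */
	map->notifier.ops = &my_ops;
	err = mmu_range_notifier_insert_locked(&map->notifier, vma->vm_start,
					       vma->vm_end - vma->vm_start,
					       vma->vm_mm);

	/* when the VMA is torn down: */
	mmu_range_notifier_remove(&map->notifier);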

Cc: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: xen-devel@lists.xenproject.org
Cc: Juergen Gross <jgross@suse.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/xen/gntdev-common.h |   8 +-
 drivers/xen/gntdev.c        | 180 ++++++++++--------------------------
 2 files changed, 49 insertions(+), 139 deletions(-)

Comments

Boris Ostrovsky Oct. 30, 2019, 4:55 p.m. UTC | #1
On 10/28/19 4:10 PM, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
>
> gntdev simply wants to monitor a specific VMA for any notifier events,
> this can be done straightforwardly using mmu_range_notifier_insert() over
> the VMA's VA range.
>
> The notifier should be attached until the original VMA is destroyed.
>
> It is unclear if any of this is even sane, but at least a lot of duplicate
> code is removed.

I didn't have a chance to look at the patch itself yet but as a heads-up
--- it crashes dom0.

-boris


>
> Cc: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Cc: xen-devel@lists.xenproject.org
> Cc: Juergen Gross <jgross@suse.com>
> Cc: Stefano Stabellini <sstabellini@kernel.org>
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> ---
>  drivers/xen/gntdev-common.h |   8 +-
>  drivers/xen/gntdev.c        | 180 ++++++++++--------------------------
>  2 files changed, 49 insertions(+), 139 deletions(-)
>
> diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h
> index 2f8b949c3eeb14..b201fdd20b667b 100644
> --- a/drivers/xen/gntdev-common.h
> +++ b/drivers/xen/gntdev-common.h
> @@ -21,15 +21,8 @@ struct gntdev_dmabuf_priv;
>  struct gntdev_priv {
>  	/* Maps with visible offsets in the file descriptor. */
>  	struct list_head maps;
> -	/*
> -	 * Maps that are not visible; will be freed on munmap.
> -	 * Only populated if populate_freeable_maps == 1
> -	 */
> -	struct list_head freeable_maps;
>  	/* lock protects maps and freeable_maps. */
>  	struct mutex lock;
> -	struct mm_struct *mm;
> -	struct mmu_notifier mn;
>  
>  #ifdef CONFIG_XEN_GRANT_DMA_ALLOC
>  	/* Device for which DMA memory is allocated. */
> @@ -49,6 +42,7 @@ struct gntdev_unmap_notify {
>  };
>  
>  struct gntdev_grant_map {
> +	struct mmu_range_notifier notifier;
>  	struct list_head next;
>  	struct vm_area_struct *vma;
>  	int index;
> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
> index a446a7221e13e9..12d626670bebbc 100644
> --- a/drivers/xen/gntdev.c
> +++ b/drivers/xen/gntdev.c
> @@ -65,7 +65,6 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
>  static atomic_t pages_mapped = ATOMIC_INIT(0);
>  
>  static int use_ptemod;
> -#define populate_freeable_maps use_ptemod
>  
>  static int unmap_grant_pages(struct gntdev_grant_map *map,
>  			     int offset, int pages);
> @@ -251,12 +250,6 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
>  		evtchn_put(map->notify.event);
>  	}
>  
> -	if (populate_freeable_maps && priv) {
> -		mutex_lock(&priv->lock);
> -		list_del(&map->next);
> -		mutex_unlock(&priv->lock);
> -	}
> -
>  	if (map->pages && !use_ptemod)
>  		unmap_grant_pages(map, 0, map->count);
>  	gntdev_free_map(map);
> @@ -445,17 +438,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
>  	struct gntdev_priv *priv = file->private_data;
>  
>  	pr_debug("gntdev_vma_close %p\n", vma);
> -	if (use_ptemod) {
> -		/* It is possible that an mmu notifier could be running
> -		 * concurrently, so take priv->lock to ensure that the vma won't
> -		 * vanishing during the unmap_grant_pages call, since we will
> -		 * spin here until that completes. Such a concurrent call will
> -		 * not do any unmapping, since that has been done prior to
> -		 * closing the vma, but it may still iterate the unmap_ops list.
> -		 */
> -		mutex_lock(&priv->lock);
> +	if (use_ptemod && map->vma == vma) {
> +		mmu_range_notifier_remove(&map->notifier);
>  		map->vma = NULL;
> -		mutex_unlock(&priv->lock);
>  	}
>  	vma->vm_private_data = NULL;
>  	gntdev_put_map(priv, map);
> @@ -477,109 +462,44 @@ static const struct vm_operations_struct gntdev_vmops = {
>  
>  /* ------------------------------------------------------------------ */
>  
> -static bool in_range(struct gntdev_grant_map *map,
> -			      unsigned long start, unsigned long end)
> -{
> -	if (!map->vma)
> -		return false;
> -	if (map->vma->vm_start >= end)
> -		return false;
> -	if (map->vma->vm_end <= start)
> -		return false;
> -
> -	return true;
> -}
> -
> -static int unmap_if_in_range(struct gntdev_grant_map *map,
> -			      unsigned long start, unsigned long end,
> -			      bool blockable)
> +static bool gntdev_invalidate(struct mmu_range_notifier *mn,
> +			      const struct mmu_notifier_range *range,
> +			      unsigned long cur_seq)
>  {
> +	struct gntdev_grant_map *map =
> +		container_of(mn, struct gntdev_grant_map, notifier);
>  	unsigned long mstart, mend;
>  	int err;
>  
> -	if (!in_range(map, start, end))
> -		return 0;
> +	if (!mmu_notifier_range_blockable(range))
> +		return false;
>  
> -	if (!blockable)
> -		return -EAGAIN;
> +	/*
> +	 * If the VMA is split or otherwise changed the notifier is not
> +	 * updated, but we don't want to process VA's outside the modified
> +	 * VMA. FIXME: It would be much more understandable to just prevent
> +	 * modifying the VMA in the first place.
> +	 */
> +	if (map->vma->vm_start >= range->end ||
> +	    map->vma->vm_end <= range->start)
> +		return true;
>  
> -	mstart = max(start, map->vma->vm_start);
> -	mend   = min(end,   map->vma->vm_end);
> +	mstart = max(range->start, map->vma->vm_start);
> +	mend = min(range->end, map->vma->vm_end);
>  	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
>  			map->index, map->count,
>  			map->vma->vm_start, map->vma->vm_end,
> -			start, end, mstart, mend);
> +			range->start, range->end, mstart, mend);
>  	err = unmap_grant_pages(map,
>  				(mstart - map->vma->vm_start) >> PAGE_SHIFT,
>  				(mend - mstart) >> PAGE_SHIFT);
>  	WARN_ON(err);
>  
> -	return 0;
> -}
> -
> -static int mn_invl_range_start(struct mmu_notifier *mn,
> -			       const struct mmu_notifier_range *range)
> -{
> -	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
> -	struct gntdev_grant_map *map;
> -	int ret = 0;
> -
> -	if (mmu_notifier_range_blockable(range))
> -		mutex_lock(&priv->lock);
> -	else if (!mutex_trylock(&priv->lock))
> -		return -EAGAIN;
> -
> -	list_for_each_entry(map, &priv->maps, next) {
> -		ret = unmap_if_in_range(map, range->start, range->end,
> -					mmu_notifier_range_blockable(range));
> -		if (ret)
> -			goto out_unlock;
> -	}
> -	list_for_each_entry(map, &priv->freeable_maps, next) {
> -		ret = unmap_if_in_range(map, range->start, range->end,
> -					mmu_notifier_range_blockable(range));
> -		if (ret)
> -			goto out_unlock;
> -	}
> -
> -out_unlock:
> -	mutex_unlock(&priv->lock);
> -
> -	return ret;
> -}
> -
> -static void mn_release(struct mmu_notifier *mn,
> -		       struct mm_struct *mm)
> -{
> -	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
> -	struct gntdev_grant_map *map;
> -	int err;
> -
> -	mutex_lock(&priv->lock);
> -	list_for_each_entry(map, &priv->maps, next) {
> -		if (!map->vma)
> -			continue;
> -		pr_debug("map %d+%d (%lx %lx)\n",
> -				map->index, map->count,
> -				map->vma->vm_start, map->vma->vm_end);
> -		err = unmap_grant_pages(map, /* offset */ 0, map->count);
> -		WARN_ON(err);
> -	}
> -	list_for_each_entry(map, &priv->freeable_maps, next) {
> -		if (!map->vma)
> -			continue;
> -		pr_debug("map %d+%d (%lx %lx)\n",
> -				map->index, map->count,
> -				map->vma->vm_start, map->vma->vm_end);
> -		err = unmap_grant_pages(map, /* offset */ 0, map->count);
> -		WARN_ON(err);
> -	}
> -	mutex_unlock(&priv->lock);
> +	return true;
>  }
>  
> -static const struct mmu_notifier_ops gntdev_mmu_ops = {
> -	.release                = mn_release,
> -	.invalidate_range_start = mn_invl_range_start,
> +static const struct mmu_range_notifier_ops gntdev_mmu_ops = {
> +	.invalidate = gntdev_invalidate,
>  };
>  
>  /* ------------------------------------------------------------------ */
> @@ -594,7 +514,6 @@ static int gntdev_open(struct inode *inode, struct file *flip)
>  		return -ENOMEM;
>  
>  	INIT_LIST_HEAD(&priv->maps);
> -	INIT_LIST_HEAD(&priv->freeable_maps);
>  	mutex_init(&priv->lock);
>  
>  #ifdef CONFIG_XEN_GNTDEV_DMABUF
> @@ -606,17 +525,6 @@ static int gntdev_open(struct inode *inode, struct file *flip)
>  	}
>  #endif
>  
> -	if (use_ptemod) {
> -		priv->mm = get_task_mm(current);
> -		if (!priv->mm) {
> -			kfree(priv);
> -			return -ENOMEM;
> -		}
> -		priv->mn.ops = &gntdev_mmu_ops;
> -		ret = mmu_notifier_register(&priv->mn, priv->mm);
> -		mmput(priv->mm);
> -	}
> -
>  	if (ret) {
>  		kfree(priv);
>  		return ret;
> @@ -653,16 +561,12 @@ static int gntdev_release(struct inode *inode, struct file *flip)
>  		list_del(&map->next);
>  		gntdev_put_map(NULL /* already removed */, map);
>  	}
> -	WARN_ON(!list_empty(&priv->freeable_maps));
>  	mutex_unlock(&priv->lock);
>  
>  #ifdef CONFIG_XEN_GNTDEV_DMABUF
>  	gntdev_dmabuf_fini(priv->dmabuf_priv);
>  #endif
>  
> -	if (use_ptemod)
> -		mmu_notifier_unregister(&priv->mn, priv->mm);
> -
>  	kfree(priv);
>  	return 0;
>  }
> @@ -723,8 +627,6 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
>  	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
>  	if (map) {
>  		list_del(&map->next);
> -		if (populate_freeable_maps)
> -			list_add_tail(&map->next, &priv->freeable_maps);
>  		err = 0;
>  	}
>  	mutex_unlock(&priv->lock);
> @@ -1096,11 +998,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
>  		goto unlock_out;
>  	if (use_ptemod && map->vma)
>  		goto unlock_out;
> -	if (use_ptemod && priv->mm != vma->vm_mm) {
> -		pr_warn("Huh? Other mm?\n");
> -		goto unlock_out;
> -	}
> -
>  	refcount_inc(&map->users);
>  
>  	vma->vm_ops = &gntdev_vmops;
> @@ -1111,10 +1008,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
>  		vma->vm_flags |= VM_DONTCOPY;
>  
>  	vma->vm_private_data = map;
> -
> -	if (use_ptemod)
> -		map->vma = vma;
> -
>  	if (map->flags) {
>  		if ((vma->vm_flags & VM_WRITE) &&
>  				(map->flags & GNTMAP_readonly))
> @@ -1125,8 +1018,28 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
>  			map->flags |= GNTMAP_readonly;
>  	}
>  
> +	if (use_ptemod) {
> +		map->vma = vma;
> +		err = mmu_range_notifier_insert_locked(
> +			&map->notifier, vma->vm_start,
> +			vma->vm_end - vma->vm_start, vma->vm_mm);
> +		if (err)
> +			goto out_unlock_put;
> +	}
>  	mutex_unlock(&priv->lock);
>  
> +	/*
> +	 * gntdev takes the address of the PTE in find_grant_ptes() and passes
> +	 * it to the hypervisor in gntdev_map_grant_pages(). The purpose of
> +	 * the notifier is to prevent the hypervisor pointer to the PTE from
> +	 * going stale.
> +	 *
> +	 * Since this vma's mappings can't be touched without the mmap_sem,
> +	 * and we are holding it now, there is no need for the notifier_range
> +	 * locking pattern.
> +	 */
> +	mmu_range_read_begin(&map->notifier);
> +
>  	if (use_ptemod) {
>  		map->pages_vm_start = vma->vm_start;
>  		err = apply_to_page_range(vma->vm_mm, vma->vm_start,
> @@ -1175,8 +1088,11 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
>  	mutex_unlock(&priv->lock);
>  out_put_map:
>  	if (use_ptemod) {
> -		map->vma = NULL;
>  		unmap_grant_pages(map, 0, map->count);
> +		if (map->vma) {
> +			mmu_range_notifier_remove(&map->notifier);
> +			map->vma = NULL;
> +		}
>  	}
>  	gntdev_put_map(priv, map);
>  	return err;
Jason Gunthorpe Nov. 1, 2019, 5:48 p.m. UTC | #2
On Wed, Oct 30, 2019 at 12:55:37PM -0400, Boris Ostrovsky wrote:
> On 10/28/19 4:10 PM, Jason Gunthorpe wrote:
> > From: Jason Gunthorpe <jgg@mellanox.com>
> >
> > gntdev simply wants to monitor a specific VMA for any notifier events,
> > this can be done straightforwardly using mmu_range_notifier_insert() over
> > the VMA's VA range.
> >
> > The notifier should be attached until the original VMA is destroyed.
> >
> > It is unclear if any of this is even sane, but at least a lot of duplicate
> > code is removed.
> 
> I didn't have a chance to look at the patch itself yet but as a heads-up
> --- it crashes dom0.

Thanks Boris. I spent a bit of time and got a VM running with a xen
4.9 hypervisor and a kernel with this patch series. It's a ubuntu bionic
VM with the distro's xen stuff.

Can you give some guidance on how you made it crash? I see the VM
autoloaded gntdev:

Module                  Size  Used by
xen_gntdev             24576  2
xen_evtchn             16384  1
xenfs                  16384  1
xen_privcmd            24576  16 xenfs

And lsof says several xen processes have the chardev open:

xenstored  819                 root   13u      CHR              10,53      0t0      19595 /dev/xen/gntdev
xenconsol  857                 root    8u      CHR              10,53      0t0      19595 /dev/xen/gntdev
xenconsol  857 860             root    8u      CHR              10,53      0t0      19595 /dev/xen/gntdev

But no crashing..

However, I wasn't able to get my usual debug kernel .config to boot
with the xen hypervisor; it crashes on early boot with:

(XEN) Dom0 has maximum 8 VCPUs
(XEN) Scrubbing Free RAM on 1 nodes using 8 CPUs
(XEN) .done.
(XEN) Initial low memory virq threshold set at 0x1000 pages.
(XEN) Std. Loglevel: All
(XEN) Guest Loglevel: All
(XEN) *** Serial input -> DOM0 (type 'CTRL-a' three times to switch input to Xen)
(XEN) Freed 468kB init memory
(XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0002]
(XEN) Pagetable walk from fffffbfff0480fbe:
(XEN)  L4[0x1f7] = 0000000000000000 ffffffffffffffff
(XEN) domain_crash_sync called from entry.S: fault at ffff82d080348a06 entry.o#create_bounce_frame+0x135/0x15f
(XEN) Domain 0 (vcpu#0) crashed on cpu#0:
(XEN) ----[ Xen-4.9.2  x86_64  debug=n   Not tainted ]----
(XEN) CPU:    0
(XEN) RIP:    e033:[<ffffffff82b9f731>]
(XEN) RFLAGS: 0000000000000296   EM: 1   CONTEXT: pv guest (d0v0)
(XEN) rax: fffffbfff0480fbe   rbx: 0000000000000000   rcx: 00000000c0000101
(XEN) rdx: 00000000ffffffff   rsi: ffffffff84026000   rdi: ffffffff82cb4a20
(XEN) rbp: ffffffff82407ff8   rsp: ffffffff82407da0   r8:  0000000000000000
(XEN) r9:  0000000000000000   r10: 0000000000000000   r11: 0000000000000000
(XEN) r12: 0000000000000000   r13: 1ffffffff0480fbe   r14: 0000000000000000
(XEN) r15: 0000000000000000   cr0: 000000008005003b   cr4: 00000000003506e0
(XEN) cr3: 0000000034027000   cr2: fffffbfff0480fbe
(XEN) fsb: 0000000000000000   gsb: ffffffff82b61000   gss: 0000000000000000
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: e02b   cs: e033

Which is surely some .config issue, but I didn't figure out what.

Jason
Boris Ostrovsky Nov. 1, 2019, 6:51 p.m. UTC | #3
On 11/1/19 1:48 PM, Jason Gunthorpe wrote:
> On Wed, Oct 30, 2019 at 12:55:37PM -0400, Boris Ostrovsky wrote:
>> On 10/28/19 4:10 PM, Jason Gunthorpe wrote:
>>> From: Jason Gunthorpe <jgg@mellanox.com>
>>>
>>> gntdev simply wants to monitor a specific VMA for any notifier events,
>>> this can be done straightforwardly using mmu_range_notifier_insert() over
>>> the VMA's VA range.
>>>
>>> The notifier should be attached until the original VMA is destroyed.
>>>
>>> It is unclear if any of this is even sane, but at least a lot of duplicate
>>> code is removed.
>> I didn't have a chance to look at the patch itself yet but as a heads-up
>> --- it crashes dom0.
> Thanks Boris. I spent a bit of time and got a VM running with a xen
> 4.9 hypervisor and a kernel with this patch series. It a ubuntu bionic
> VM with the distro's xen stuff.
>
> Can you give some guidance how you made it crash? 

It crashes trying to dereference mrn->ops->invalidate in
mn_itree_invalidate() when a guest exits.

I don't think you've initialized notifier ops. I don't see you using
gntdev_mmu_ops anywhere.

-boris


> I see the VM
> autoloaded gntdev:
>
> Module                  Size  Used by
> xen_gntdev             24576  2
> xen_evtchn             16384  1
> xenfs                  16384  1
> xen_privcmd            24576  16 xenfs
>
> And lsof says several xen processes have the chardev open:
>
> xenstored  819                 root   13u      CHR              10,53      0t0      19595 /dev/xen/gntdev
> xenconsol  857                 root    8u      CHR              10,53      0t0      19595 /dev/xen/gntdev
> xenconsol  857 860             root    8u      CHR              10,53      0t0      19595 /dev/xen/gntdev
>
> But no crashing..
>
> However, I wasn't able to get my usual debug kernel .config to boot
> with the xen hypervisor, it crashes on early boot with:
>
> (XEN) Dom0 has maximum 8 VCPUs
> (XEN) Scrubbing Free RAM on 1 nodes using 8 CPUs
> (XEN) .done.
> (XEN) Initial low memory virq threshold set at 0x1000 pages.
> (XEN) Std. Loglevel: All
> (XEN) Guest Loglevel: All
> (XEN) *** Serial input -> DOM0 (type 'CTRL-a' three times to switch input to Xen)
> (XEN) Freed 468kB init memory
> (XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0002]
> (XEN) Pagetable walk from fffffbfff0480fbe:
> (XEN)  L4[0x1f7] = 0000000000000000 ffffffffffffffff
> (XEN) domain_crash_sync called from entry.S: fault at ffff82d080348a06 entry.o#create_bounce_frame+0x135/0x15f
> (XEN) Domain 0 (vcpu#0) crashed on cpu#0:
> (XEN) ----[ Xen-4.9.2  x86_64  debug=n   Not tainted ]----
> (XEN) CPU:    0
> (XEN) RIP:    e033:[<ffffffff82b9f731>]
> (XEN) RFLAGS: 0000000000000296   EM: 1   CONTEXT: pv guest (d0v0)
> (XEN) rax: fffffbfff0480fbe   rbx: 0000000000000000   rcx: 00000000c0000101
> (XEN) rdx: 00000000ffffffff   rsi: ffffffff84026000   rdi: ffffffff82cb4a20
> (XEN) rbp: ffffffff82407ff8   rsp: ffffffff82407da0   r8:  0000000000000000
> (XEN) r9:  0000000000000000   r10: 0000000000000000   r11: 0000000000000000
> (XEN) r12: 0000000000000000   r13: 1ffffffff0480fbe   r14: 0000000000000000
> (XEN) r15: 0000000000000000   cr0: 000000008005003b   cr4: 00000000003506e0
> (XEN) cr3: 0000000034027000   cr2: fffffbfff0480fbe
> (XEN) fsb: 0000000000000000   gsb: ffffffff82b61000   gss: 0000000000000000
> (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: e02b   cs: e033
>
> Which is surely some .config issue, but I didn't figure out what.
>
> Jason
Jason Gunthorpe Nov. 1, 2019, 7:17 p.m. UTC | #4
On Fri, Nov 01, 2019 at 02:51:46PM -0400, Boris Ostrovsky wrote:
> On 11/1/19 1:48 PM, Jason Gunthorpe wrote:
> > On Wed, Oct 30, 2019 at 12:55:37PM -0400, Boris Ostrovsky wrote:
> >> On 10/28/19 4:10 PM, Jason Gunthorpe wrote:
> >>> From: Jason Gunthorpe <jgg@mellanox.com>
> >>>
> >>> gntdev simply wants to monitor a specific VMA for any notifier events,
> >>> this can be done straightforwardly using mmu_range_notifier_insert() over
> >>> the VMA's VA range.
> >>>
> >>> The notifier should be attached until the original VMA is destroyed.
> >>>
> >>> It is unclear if any of this is even sane, but at least a lot of duplicate
> >>> code is removed.
> >> I didn't have a chance to look at the patch itself yet but as a heads-up
> > Thanks Boris. I spent a bit of time and got a VM running with a xen
> > 4.9 hypervisor and a kernel with this patch series. It a ubuntu bionic
> > VM with the distro's xen stuff.
> >
> > Can you give some guidance how you made it crash? 
> 
> It crashes trying to dereference mrn->ops->invalidate in
> mn_itree_invalidate() when a guest exits.
> 
> I don't think you've initialized notifier ops. I don't see you using
> gntdev_mmu_ops anywhere.

So weird the compiler didn't complain about an unused static...

But yes, this is a mistake; it should be:

diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 37b278857ad807..0ca35485fd3865 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -1011,6 +1011,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 
 	if (use_ptemod) {
 		map->vma = vma;
+		map->notifier.ops = &gntdev_mmu_ops;
 		err = mmu_range_notifier_insert_locked(
 			&map->notifier, vma->vm_start,
 			vma->vm_end - vma->vm_start, vma->vm_mm);

Thanks,
Jason
Boris Ostrovsky Nov. 4, 2019, 10:03 p.m. UTC | #5
On 10/28/19 4:10 PM, Jason Gunthorpe wrote:
> @@ -445,17 +438,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
>  	struct gntdev_priv *priv = file->private_data;
>  
>  	pr_debug("gntdev_vma_close %p\n", vma);
> -	if (use_ptemod) {
> -		/* It is possible that an mmu notifier could be running
> -		 * concurrently, so take priv->lock to ensure that the vma won't
> -		 * vanishing during the unmap_grant_pages call, since we will
> -		 * spin here until that completes. Such a concurrent call will
> -		 * not do any unmapping, since that has been done prior to
> -		 * closing the vma, but it may still iterate the unmap_ops list.
> -		 */
> -		mutex_lock(&priv->lock);
> +	if (use_ptemod && map->vma == vma) {


Is it possible for map->vma not to be equal to vma?

-boris


> +		mmu_range_notifier_remove(&map->notifier);
>  		map->vma = NULL;
> -		mutex_unlock(&priv->lock);
>  	}
>  	vma->vm_private_data = NULL;
>  	gntdev_put_map(priv, map);
>
Jason Gunthorpe Nov. 5, 2019, 2:31 a.m. UTC | #6
On Mon, Nov 04, 2019 at 05:03:31PM -0500, Boris Ostrovsky wrote:
> On 10/28/19 4:10 PM, Jason Gunthorpe wrote:
> > @@ -445,17 +438,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
> >  	struct gntdev_priv *priv = file->private_data;
> >  
> >  	pr_debug("gntdev_vma_close %p\n", vma);
> > -	if (use_ptemod) {
> > -		/* It is possible that an mmu notifier could be running
> > -		 * concurrently, so take priv->lock to ensure that the vma won't
> > -		 * vanishing during the unmap_grant_pages call, since we will
> > -		 * spin here until that completes. Such a concurrent call will
> > -		 * not do any unmapping, since that has been done prior to
> > -		 * closing the vma, but it may still iterate the unmap_ops list.
> > -		 */
> > -		mutex_lock(&priv->lock);
> > +	if (use_ptemod && map->vma == vma) {
> 
> 
> Is it possible for map->vma not to be equal to vma?

It could be NULL at least if use_ptemod is not set.

Otherwise, I'm not sure; the confusing bit is that the map comes from
here:

        map = gntdev_find_map_index(priv, index, count);

It looks like the intent is that the map->vma is always set to the
only vma that has the map as private_data.

So, I suppose it can be relaxed to a null test and a WARN_ON that it
hasn't changed?

Jason
Boris Ostrovsky Nov. 5, 2019, 3:16 p.m. UTC | #7
On 11/4/19 9:31 PM, Jason Gunthorpe wrote:
> On Mon, Nov 04, 2019 at 05:03:31PM -0500, Boris Ostrovsky wrote:
>> On 10/28/19 4:10 PM, Jason Gunthorpe wrote:
>>> @@ -445,17 +438,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
>>>  	struct gntdev_priv *priv = file->private_data;
>>>  
>>>  	pr_debug("gntdev_vma_close %p\n", vma);
>>> -	if (use_ptemod) {
>>> -		/* It is possible that an mmu notifier could be running
>>> -		 * concurrently, so take priv->lock to ensure that the vma won't
>>> -		 * vanishing during the unmap_grant_pages call, since we will
>>> -		 * spin here until that completes. Such a concurrent call will
>>> -		 * not do any unmapping, since that has been done prior to
>>> -		 * closing the vma, but it may still iterate the unmap_ops list.
>>> -		 */
>>> -		mutex_lock(&priv->lock);
>>> +	if (use_ptemod && map->vma == vma) {
>>
>> Is it possible for map->vma not to be equal to vma?
> It could be NULL at least if use_ptemod is not set.
>
> Otherwise, I'm not sure, the confusing bit is that the map comes from
> here:
>
>         map = gntdev_find_map_index(priv, index, count);
>
> It looks like the intent is that the map->vma is always set to the
> only vma that has the map as private_data.

I am not sure how this can work otherwise. We stash the map pointer in the
vma's vm_private_data and vice versa (for use_ptemod) in gntdev_mmap(), so
they have to match.
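
I.e., roughly (the relevant lines from gntdev_mmap() and gntdev_vma_close(),
everything else omitted):

	/* gntdev_mmap(), use_ptemod case */
	vma->vm_private_data = map;
	map->vma = vma;

	/* gntdev_vma_close() */
	struct gntdev_grant_map *map = vma->vm_private_data;
	...
	map->vma = NULL;
	vma->vm_private_data = NULL;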

That's why I was asking whether you had something particular in mind when
you added this test.

> So, I suppose it can be relaxed to a null test and a WARN_ON that it
> hasn't changed?

You mean

if (use_ptemod) {
        WARN_ON(map->vma != vma);
        ...


Yes, that sounds good.


-boris
Jason Gunthorpe Nov. 7, 2019, 8:36 p.m. UTC | #8
On Tue, Nov 05, 2019 at 10:16:46AM -0500, Boris Ostrovsky wrote:

> > So, I suppose it can be relaxed to a null test and a WARN_ON that it
> > hasn't changed?
> 
> You mean
> 
> if (use_ptemod) {
>         WARN_ON(map->vma != vma);
>         ...
> 
> 
> Yes, that sounds good.

I amended my copy of the patch with the above; has this rework shown
signs of working?

@@ -436,7 +436,8 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
        struct gntdev_priv *priv = file->private_data;
 
        pr_debug("gntdev_vma_close %p\n", vma);
-       if (use_ptemod && map->vma == vma) {
+       if (use_ptemod) {
+               WARN_ON(map->vma != vma);
                mmu_range_notifier_remove(&map->notifier);
                map->vma = NULL;
        }

Jason
Boris Ostrovsky Nov. 7, 2019, 10:54 p.m. UTC | #9
On 11/7/19 3:36 PM, Jason Gunthorpe wrote:
> On Tue, Nov 05, 2019 at 10:16:46AM -0500, Boris Ostrovsky wrote:
>
>>> So, I suppose it can be relaxed to a null test and a WARN_ON that it
>>> hasn't changed?
>> You mean
>>
>> if (use_ptemod) {
>>         WARN_ON(map->vma != vma);
>>         ...
>>
>>
>> Yes, that sounds good.
> I amended my copy of the patch with the above, has this rework shown
> signs of working?

Yes, it works fine.

But please don't forget notifier ops initialization.

With those two changes,

Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>

>
> @@ -436,7 +436,8 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
>         struct gntdev_priv *priv = file->private_data;
>  
>         pr_debug("gntdev_vma_close %p\n", vma);
> -       if (use_ptemod && map->vma == vma) {
> +       if (use_ptemod) {
> +               WARN_ON(map->vma != vma);
>                 mmu_range_notifier_remove(&map->notifier);
>                 map->vma = NULL;
>         }
>
> Jason
Jason Gunthorpe Nov. 8, 2019, 2:53 p.m. UTC | #10
On Thu, Nov 07, 2019 at 05:54:52PM -0500, Boris Ostrovsky wrote:
> On 11/7/19 3:36 PM, Jason Gunthorpe wrote:
> > On Tue, Nov 05, 2019 at 10:16:46AM -0500, Boris Ostrovsky wrote:
> >
> >>> So, I suppose it can be relaxed to a null test and a WARN_ON that it
> >>> hasn't changed?
> >> You mean
> >>
> >> if (use_ptemod) {
> >>         WARN_ON(map->vma != vma);
> >>         ...
> >>
> >>
> >> Yes, that sounds good.
> > I amended my copy of the patch with the above, has this rework shown
> > signs of working?
> 
> Yes, it works fine.
> 
> But please don't forget notifier ops initialization.
> 
> With those two changes,
> 
> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>

Thanks, I got both things. I'll forward this toward linux-next and
repost a v3.

Jason

Patch

diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h
index 2f8b949c3eeb14..b201fdd20b667b 100644
--- a/drivers/xen/gntdev-common.h
+++ b/drivers/xen/gntdev-common.h
@@ -21,15 +21,8 @@  struct gntdev_dmabuf_priv;
 struct gntdev_priv {
 	/* Maps with visible offsets in the file descriptor. */
 	struct list_head maps;
-	/*
-	 * Maps that are not visible; will be freed on munmap.
-	 * Only populated if populate_freeable_maps == 1
-	 */
-	struct list_head freeable_maps;
 	/* lock protects maps and freeable_maps. */
 	struct mutex lock;
-	struct mm_struct *mm;
-	struct mmu_notifier mn;
 
 #ifdef CONFIG_XEN_GRANT_DMA_ALLOC
 	/* Device for which DMA memory is allocated. */
@@ -49,6 +42,7 @@  struct gntdev_unmap_notify {
 };
 
 struct gntdev_grant_map {
+	struct mmu_range_notifier notifier;
 	struct list_head next;
 	struct vm_area_struct *vma;
 	int index;
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index a446a7221e13e9..12d626670bebbc 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -65,7 +65,6 @@  MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
 static atomic_t pages_mapped = ATOMIC_INIT(0);
 
 static int use_ptemod;
-#define populate_freeable_maps use_ptemod
 
 static int unmap_grant_pages(struct gntdev_grant_map *map,
 			     int offset, int pages);
@@ -251,12 +250,6 @@  void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
 		evtchn_put(map->notify.event);
 	}
 
-	if (populate_freeable_maps && priv) {
-		mutex_lock(&priv->lock);
-		list_del(&map->next);
-		mutex_unlock(&priv->lock);
-	}
-
 	if (map->pages && !use_ptemod)
 		unmap_grant_pages(map, 0, map->count);
 	gntdev_free_map(map);
@@ -445,17 +438,9 @@  static void gntdev_vma_close(struct vm_area_struct *vma)
 	struct gntdev_priv *priv = file->private_data;
 
 	pr_debug("gntdev_vma_close %p\n", vma);
-	if (use_ptemod) {
-		/* It is possible that an mmu notifier could be running
-		 * concurrently, so take priv->lock to ensure that the vma won't
-		 * vanishing during the unmap_grant_pages call, since we will
-		 * spin here until that completes. Such a concurrent call will
-		 * not do any unmapping, since that has been done prior to
-		 * closing the vma, but it may still iterate the unmap_ops list.
-		 */
-		mutex_lock(&priv->lock);
+	if (use_ptemod && map->vma == vma) {
+		mmu_range_notifier_remove(&map->notifier);
 		map->vma = NULL;
-		mutex_unlock(&priv->lock);
 	}
 	vma->vm_private_data = NULL;
 	gntdev_put_map(priv, map);
@@ -477,109 +462,44 @@  static const struct vm_operations_struct gntdev_vmops = {
 
 /* ------------------------------------------------------------------ */
 
-static bool in_range(struct gntdev_grant_map *map,
-			      unsigned long start, unsigned long end)
-{
-	if (!map->vma)
-		return false;
-	if (map->vma->vm_start >= end)
-		return false;
-	if (map->vma->vm_end <= start)
-		return false;
-
-	return true;
-}
-
-static int unmap_if_in_range(struct gntdev_grant_map *map,
-			      unsigned long start, unsigned long end,
-			      bool blockable)
+static bool gntdev_invalidate(struct mmu_range_notifier *mn,
+			      const struct mmu_notifier_range *range,
+			      unsigned long cur_seq)
 {
+	struct gntdev_grant_map *map =
+		container_of(mn, struct gntdev_grant_map, notifier);
 	unsigned long mstart, mend;
 	int err;
 
-	if (!in_range(map, start, end))
-		return 0;
+	if (!mmu_notifier_range_blockable(range))
+		return false;
 
-	if (!blockable)
-		return -EAGAIN;
+	/*
+	 * If the VMA is split or otherwise changed the notifier is not
+	 * updated, but we don't want to process VA's outside the modified
+	 * VMA. FIXME: It would be much more understandable to just prevent
+	 * modifying the VMA in the first place.
+	 */
+	if (map->vma->vm_start >= range->end ||
+	    map->vma->vm_end <= range->start)
+		return true;
 
-	mstart = max(start, map->vma->vm_start);
-	mend   = min(end,   map->vma->vm_end);
+	mstart = max(range->start, map->vma->vm_start);
+	mend = min(range->end, map->vma->vm_end);
 	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
 			map->index, map->count,
 			map->vma->vm_start, map->vma->vm_end,
-			start, end, mstart, mend);
+			range->start, range->end, mstart, mend);
 	err = unmap_grant_pages(map,
 				(mstart - map->vma->vm_start) >> PAGE_SHIFT,
 				(mend - mstart) >> PAGE_SHIFT);
 	WARN_ON(err);
 
-	return 0;
-}
-
-static int mn_invl_range_start(struct mmu_notifier *mn,
-			       const struct mmu_notifier_range *range)
-{
-	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
-	struct gntdev_grant_map *map;
-	int ret = 0;
-
-	if (mmu_notifier_range_blockable(range))
-		mutex_lock(&priv->lock);
-	else if (!mutex_trylock(&priv->lock))
-		return -EAGAIN;
-
-	list_for_each_entry(map, &priv->maps, next) {
-		ret = unmap_if_in_range(map, range->start, range->end,
-					mmu_notifier_range_blockable(range));
-		if (ret)
-			goto out_unlock;
-	}
-	list_for_each_entry(map, &priv->freeable_maps, next) {
-		ret = unmap_if_in_range(map, range->start, range->end,
-					mmu_notifier_range_blockable(range));
-		if (ret)
-			goto out_unlock;
-	}
-
-out_unlock:
-	mutex_unlock(&priv->lock);
-
-	return ret;
-}
-
-static void mn_release(struct mmu_notifier *mn,
-		       struct mm_struct *mm)
-{
-	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
-	struct gntdev_grant_map *map;
-	int err;
-
-	mutex_lock(&priv->lock);
-	list_for_each_entry(map, &priv->maps, next) {
-		if (!map->vma)
-			continue;
-		pr_debug("map %d+%d (%lx %lx)\n",
-				map->index, map->count,
-				map->vma->vm_start, map->vma->vm_end);
-		err = unmap_grant_pages(map, /* offset */ 0, map->count);
-		WARN_ON(err);
-	}
-	list_for_each_entry(map, &priv->freeable_maps, next) {
-		if (!map->vma)
-			continue;
-		pr_debug("map %d+%d (%lx %lx)\n",
-				map->index, map->count,
-				map->vma->vm_start, map->vma->vm_end);
-		err = unmap_grant_pages(map, /* offset */ 0, map->count);
-		WARN_ON(err);
-	}
-	mutex_unlock(&priv->lock);
+	return true;
 }
 
-static const struct mmu_notifier_ops gntdev_mmu_ops = {
-	.release                = mn_release,
-	.invalidate_range_start = mn_invl_range_start,
+static const struct mmu_range_notifier_ops gntdev_mmu_ops = {
+	.invalidate = gntdev_invalidate,
 };
 
 /* ------------------------------------------------------------------ */
@@ -594,7 +514,6 @@  static int gntdev_open(struct inode *inode, struct file *flip)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&priv->maps);
-	INIT_LIST_HEAD(&priv->freeable_maps);
 	mutex_init(&priv->lock);
 
 #ifdef CONFIG_XEN_GNTDEV_DMABUF
@@ -606,17 +525,6 @@  static int gntdev_open(struct inode *inode, struct file *flip)
 	}
 #endif
 
-	if (use_ptemod) {
-		priv->mm = get_task_mm(current);
-		if (!priv->mm) {
-			kfree(priv);
-			return -ENOMEM;
-		}
-		priv->mn.ops = &gntdev_mmu_ops;
-		ret = mmu_notifier_register(&priv->mn, priv->mm);
-		mmput(priv->mm);
-	}
-
 	if (ret) {
 		kfree(priv);
 		return ret;
@@ -653,16 +561,12 @@  static int gntdev_release(struct inode *inode, struct file *flip)
 		list_del(&map->next);
 		gntdev_put_map(NULL /* already removed */, map);
 	}
-	WARN_ON(!list_empty(&priv->freeable_maps));
 	mutex_unlock(&priv->lock);
 
 #ifdef CONFIG_XEN_GNTDEV_DMABUF
 	gntdev_dmabuf_fini(priv->dmabuf_priv);
 #endif
 
-	if (use_ptemod)
-		mmu_notifier_unregister(&priv->mn, priv->mm);
-
 	kfree(priv);
 	return 0;
 }
@@ -723,8 +627,6 @@  static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
 	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
 	if (map) {
 		list_del(&map->next);
-		if (populate_freeable_maps)
-			list_add_tail(&map->next, &priv->freeable_maps);
 		err = 0;
 	}
 	mutex_unlock(&priv->lock);
@@ -1096,11 +998,6 @@  static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 		goto unlock_out;
 	if (use_ptemod && map->vma)
 		goto unlock_out;
-	if (use_ptemod && priv->mm != vma->vm_mm) {
-		pr_warn("Huh? Other mm?\n");
-		goto unlock_out;
-	}
-
 	refcount_inc(&map->users);
 
 	vma->vm_ops = &gntdev_vmops;
@@ -1111,10 +1008,6 @@  static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 		vma->vm_flags |= VM_DONTCOPY;
 
 	vma->vm_private_data = map;
-
-	if (use_ptemod)
-		map->vma = vma;
-
 	if (map->flags) {
 		if ((vma->vm_flags & VM_WRITE) &&
 				(map->flags & GNTMAP_readonly))
@@ -1125,8 +1018,28 @@  static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 			map->flags |= GNTMAP_readonly;
 	}
 
+	if (use_ptemod) {
+		map->vma = vma;
+		err = mmu_range_notifier_insert_locked(
+			&map->notifier, vma->vm_start,
+			vma->vm_end - vma->vm_start, vma->vm_mm);
+		if (err)
+			goto out_unlock_put;
+	}
 	mutex_unlock(&priv->lock);
 
+	/*
+	 * gntdev takes the address of the PTE in find_grant_ptes() and passes
+	 * it to the hypervisor in gntdev_map_grant_pages(). The purpose of
+	 * the notifier is to prevent the hypervisor pointer to the PTE from
+	 * going stale.
+	 *
+	 * Since this vma's mappings can't be touched without the mmap_sem,
+	 * and we are holding it now, there is no need for the notifier_range
+	 * locking pattern.
+	 */
+	mmu_range_read_begin(&map->notifier);
+
 	if (use_ptemod) {
 		map->pages_vm_start = vma->vm_start;
 		err = apply_to_page_range(vma->vm_mm, vma->vm_start,
@@ -1175,8 +1088,11 @@  static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 	mutex_unlock(&priv->lock);
 out_put_map:
 	if (use_ptemod) {
-		map->vma = NULL;
 		unmap_grant_pages(map, 0, map->count);
+		if (map->vma) {
+			mmu_range_notifier_remove(&map->notifier);
+			map->vma = NULL;
+		}
 	}
 	gntdev_put_map(priv, map);
 	return err;