diff mbox series

[12/41] mm: add per-VMA lock and helper functions to control it

Message ID 20230109205336.3665937-13-surenb@google.com (mailing list archive)
State New
Headers show
Series Per-VMA locks | expand

Commit Message

Suren Baghdasaryan Jan. 9, 2023, 8:53 p.m. UTC
Introduce a per-VMA rw_semaphore to be used during page fault handling
instead of mmap_lock. Because there are cases when multiple VMAs need
to be exclusively locked during VMA tree modifications, instead of the
usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
exclusively and setting vma->lock_seq to the current mm->lock_seq. When
mmap_write_lock holder is done with all modifications and drops mmap_lock,
it will increment mm->lock_seq, effectively unlocking all VMAs marked as
locked.
VMA lock is placed on the cache line boundary so that its 'count' field
falls into the first cache line while the rest of the fields fall into
the second cache line. This lets the 'count' field to be cached with
other frequently accessed fields and used quickly in uncontended case
while 'owner' and other fields used in the contended case will not
invalidate the first cache line while waiting on the lock.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h        | 80 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  |  8 ++++
 include/linux/mmap_lock.h | 13 +++++++
 kernel/fork.c             |  4 ++
 mm/init-mm.c              |  3 ++
 5 files changed, 108 insertions(+)

Comments

Michal Hocko Jan. 17, 2023, 3:04 p.m. UTC | #1
On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote:
> Introduce a per-VMA rw_semaphore to be used during page fault handling
> instead of mmap_lock. Because there are cases when multiple VMAs need
> to be exclusively locked during VMA tree modifications, instead of the
> usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> mmap_write_lock holder is done with all modifications and drops mmap_lock,
> it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> locked.

I have to say I was struggling a bit with the above and only understood
what you mean by reading the patch several times. I would phrase it like
this (feel free to use if you consider this to be an improvement).

Introduce a per-VMA rw_semaphore. The lock implementation relies on a
per-vma and per-mm sequence counters to note exclusive locking:
        - read lock - (implemented by vma_read_trylock) requires the the
          vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to
          differ. If they match then there must be a vma exclusive lock
          held somewhere.
        - read unlock - (implemented by vma_read_unlock) is a trivial
          vma->lock unlock.
        - write lock - (vma_write_lock) requires the mmap_lock to be
          held exclusively and the current mm counter is noted to the vma
          side. This will allow multiple vmas to be locked under a single
          mmap_lock write lock (e.g. during vma merging). The vma counter
          is modified under exclusive vma lock.
        - write unlock - (vma_write_unlock_mm) is a batch release of all
          vma locks held. It doesn't pair with a specific
          vma_write_lock! It is done before exclusive mmap_lock is
          released by incrementing mm sequence counter (mm_lock_seq).
	- write downgrade - if the mmap_lock is downgraded to the read
	  lock all vma write locks are released as well (effectivelly
	  same as write unlock).

> VMA lock is placed on the cache line boundary so that its 'count' field
> falls into the first cache line while the rest of the fields fall into
> the second cache line. This lets the 'count' field to be cached with
> other frequently accessed fields and used quickly in uncontended case
> while 'owner' and other fields used in the contended case will not
> invalidate the first cache line while waiting on the lock.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  include/linux/mm.h        | 80 +++++++++++++++++++++++++++++++++++++++
>  include/linux/mm_types.h  |  8 ++++
>  include/linux/mmap_lock.h | 13 +++++++
>  kernel/fork.c             |  4 ++
>  mm/init-mm.c              |  3 ++
>  5 files changed, 108 insertions(+)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f3f196e4d66d..ec2c4c227d51 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -612,6 +612,85 @@ struct vm_operations_struct {
>  					  unsigned long addr);
>  };
>  
> +#ifdef CONFIG_PER_VMA_LOCK
> +static inline void vma_init_lock(struct vm_area_struct *vma)
> +{
> +	init_rwsem(&vma->lock);
> +	vma->vm_lock_seq = -1;
> +}
> +
> +static inline void vma_write_lock(struct vm_area_struct *vma)
> +{
> +	int mm_lock_seq;
> +
> +	mmap_assert_write_locked(vma->vm_mm);
> +
> +	/*
> +	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> +	 * mm->mm_lock_seq can't be concurrently modified.
> +	 */
> +	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
> +	if (vma->vm_lock_seq == mm_lock_seq)
> +		return;
> +
> +	down_write(&vma->lock);
> +	vma->vm_lock_seq = mm_lock_seq;
> +	up_write(&vma->lock);
> +}
> +
> +/*
> + * Try to read-lock a vma. The function is allowed to occasionally yield false
> + * locked result to avoid performance overhead, in which case we fall back to
> + * using mmap_lock. The function should never yield false unlocked result.
> + */
> +static inline bool vma_read_trylock(struct vm_area_struct *vma)
> +{
> +	/* Check before locking. A race might cause false locked result. */
> +	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
> +		return false;
> +
> +	if (unlikely(down_read_trylock(&vma->lock) == 0))
> +		return false;
> +
> +	/*
> +	 * Overflow might produce false locked result.
> +	 * False unlocked result is impossible because we modify and check
> +	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
> +	 * modification invalidates all existing locks.
> +	 */
> +	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
> +		up_read(&vma->lock);
> +		return false;
> +	}
> +	return true;
> +}
> +
> +static inline void vma_read_unlock(struct vm_area_struct *vma)
> +{
> +	up_read(&vma->lock);
> +}
> +
> +static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> +{
> +	mmap_assert_write_locked(vma->vm_mm);
> +	/*
> +	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> +	 * mm->mm_lock_seq can't be concurrently modified.
> +	 */
> +	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
> +}
> +
> +#else /* CONFIG_PER_VMA_LOCK */
> +
> +static inline void vma_init_lock(struct vm_area_struct *vma) {}
> +static inline void vma_write_lock(struct vm_area_struct *vma) {}
> +static inline bool vma_read_trylock(struct vm_area_struct *vma)
> +		{ return false; }
> +static inline void vma_read_unlock(struct vm_area_struct *vma) {}
> +static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
> +
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
>  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  {
>  	static const struct vm_operations_struct dummy_vm_ops = {};
> @@ -620,6 +699,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  	vma->vm_mm = mm;
>  	vma->vm_ops = &dummy_vm_ops;
>  	INIT_LIST_HEAD(&vma->anon_vma_chain);
> +	vma_init_lock(vma);
>  }
>  
>  static inline void vma_set_anonymous(struct vm_area_struct *vma)
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index d5cdec1314fe..5f7c5ca89931 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -555,6 +555,11 @@ struct vm_area_struct {
>  	pgprot_t vm_page_prot;
>  	unsigned long vm_flags;		/* Flags, see mm.h. */
>  
> +#ifdef CONFIG_PER_VMA_LOCK
> +	int vm_lock_seq;
> +	struct rw_semaphore lock;
> +#endif
> +
>  	/*
>  	 * For areas with an address space and backing store,
>  	 * linkage into the address_space->i_mmap interval tree.
> @@ -680,6 +685,9 @@ struct mm_struct {
>  					  * init_mm.mmlist, and are protected
>  					  * by mmlist_lock
>  					  */
> +#ifdef CONFIG_PER_VMA_LOCK
> +		int mm_lock_seq;
> +#endif
>  
>  
>  		unsigned long hiwater_rss; /* High-watermark of RSS usage */
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index e49ba91bb1f0..40facd4c398b 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
>  	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
>  }
>  
> +#ifdef CONFIG_PER_VMA_LOCK
> +static inline void vma_write_unlock_mm(struct mm_struct *mm)
> +{
> +	mmap_assert_write_locked(mm);
> +	/* No races during update due to exclusive mmap_lock being held */
> +	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
> +}
> +#else
> +static inline void vma_write_unlock_mm(struct mm_struct *mm) {}
> +#endif
> +
>  static inline void mmap_init_lock(struct mm_struct *mm)
>  {
>  	init_rwsem(&mm->mmap_lock);
> @@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
>  static inline void mmap_write_unlock(struct mm_struct *mm)
>  {
>  	__mmap_lock_trace_released(mm, true);
> +	vma_write_unlock_mm(mm);
>  	up_write(&mm->mmap_lock);
>  }
>  
>  static inline void mmap_write_downgrade(struct mm_struct *mm)
>  {
>  	__mmap_lock_trace_acquire_returned(mm, false, true);
> +	vma_write_unlock_mm(mm);
>  	downgrade_write(&mm->mmap_lock);
>  }
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 5986817f393c..c026d75108b3 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  		 */
>  		*new = data_race(*orig);
>  		INIT_LIST_HEAD(&new->anon_vma_chain);
> +		vma_init_lock(new);
>  		dup_anon_vma_name(orig, new);
>  	}
>  	return new;
> @@ -1145,6 +1146,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	seqcount_init(&mm->write_protect_seq);
>  	mmap_init_lock(mm);
>  	INIT_LIST_HEAD(&mm->mmlist);
> +#ifdef CONFIG_PER_VMA_LOCK
> +	WRITE_ONCE(mm->mm_lock_seq, 0);
> +#endif
>  	mm_pgtables_bytes_init(mm);
>  	mm->map_count = 0;
>  	mm->locked_vm = 0;
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index c9327abb771c..33269314e060 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -37,6 +37,9 @@ struct mm_struct init_mm = {
>  	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
>  	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
> +#ifdef CONFIG_PER_VMA_LOCK
> +	.mm_lock_seq	= 0,
> +#endif
>  	.user_ns	= &init_user_ns,
>  	.cpu_bitmap	= CPU_BITS_NONE,
>  #ifdef CONFIG_IOMMU_SVA
> -- 
> 2.39.0
Michal Hocko Jan. 17, 2023, 3:07 p.m. UTC | #2
On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote:
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 5986817f393c..c026d75108b3 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  		 */
>  		*new = data_race(*orig);
>  		INIT_LIST_HEAD(&new->anon_vma_chain);
> +		vma_init_lock(new);
>  		dup_anon_vma_name(orig, new);
>  	}
>  	return new;
> @@ -1145,6 +1146,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	seqcount_init(&mm->write_protect_seq);
>  	mmap_init_lock(mm);
>  	INIT_LIST_HEAD(&mm->mmlist);
> +#ifdef CONFIG_PER_VMA_LOCK
> +	WRITE_ONCE(mm->mm_lock_seq, 0);
> +#endif

The mm shouldn't be visible so why WRITE_ONCE?
Michal Hocko Jan. 17, 2023, 3:12 p.m. UTC | #3
On Tue 17-01-23 16:04:26, Michal Hocko wrote:
> On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote:
> > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > instead of mmap_lock. Because there are cases when multiple VMAs need
> > to be exclusively locked during VMA tree modifications, instead of the
> > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > locked.
> 
> I have to say I was struggling a bit with the above and only understood
> what you mean by reading the patch several times. I would phrase it like
> this (feel free to use if you consider this to be an improvement).
> 
> Introduce a per-VMA rw_semaphore. The lock implementation relies on a
> per-vma and per-mm sequence counters to note exclusive locking:
>         - read lock - (implemented by vma_read_trylock) requires the the
>           vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to
>           differ. If they match then there must be a vma exclusive lock
>           held somewhere.
>         - read unlock - (implemented by vma_read_unlock) is a trivial
>           vma->lock unlock.
>         - write lock - (vma_write_lock) requires the mmap_lock to be
>           held exclusively and the current mm counter is noted to the vma
>           side. This will allow multiple vmas to be locked under a single
>           mmap_lock write lock (e.g. during vma merging). The vma counter
>           is modified under exclusive vma lock.

Didn't realize one more thing.
	    Unlike standard write lock this implementation allows to be
	    called multiple times under a single mmap_lock. In a sense
	    it is more of mark_vma_potentially_modified than a lock.

>         - write unlock - (vma_write_unlock_mm) is a batch release of all
>           vma locks held. It doesn't pair with a specific
>           vma_write_lock! It is done before exclusive mmap_lock is
>           released by incrementing mm sequence counter (mm_lock_seq).
> 	- write downgrade - if the mmap_lock is downgraded to the read
> 	  lock all vma write locks are released as well (effectivelly
> 	  same as write unlock).
Jann Horn Jan. 17, 2023, 6:02 p.m. UTC | #4
+locking maintainers

On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> Introduce a per-VMA rw_semaphore to be used during page fault handling
> instead of mmap_lock. Because there are cases when multiple VMAs need
> to be exclusively locked during VMA tree modifications, instead of the
> usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> mmap_write_lock holder is done with all modifications and drops mmap_lock,
> it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> locked.
[...]
> +static inline void vma_read_unlock(struct vm_area_struct *vma)
> +{
> +       up_read(&vma->lock);
> +}

One thing that might be gnarly here is that I think you might not be
allowed to use up_read() to fully release ownership of an object -
from what I remember, I think that up_read() (unlike something like
spin_unlock()) can access the lock object after it's already been
acquired by someone else. So if you want to protect against concurrent
deletion, this might have to be something like:

rcu_read_lock(); /* keeps vma alive */
up_read(&vma->lock);
rcu_read_unlock();

But I'm not entirely sure about that, the locking folks might know better.

Also, it might not matter given that the rw_semaphore part is removed
in the current patch 41/41 anyway...
Suren Baghdasaryan Jan. 17, 2023, 9:08 p.m. UTC | #5
On Tue, Jan 17, 2023 at 7:04 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote:
> > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > instead of mmap_lock. Because there are cases when multiple VMAs need
> > to be exclusively locked during VMA tree modifications, instead of the
> > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > locked.
>
> I have to say I was struggling a bit with the above and only understood
> what you mean by reading the patch several times. I would phrase it like
> this (feel free to use if you consider this to be an improvement).
>
> Introduce a per-VMA rw_semaphore. The lock implementation relies on a
> per-vma and per-mm sequence counters to note exclusive locking:
>         - read lock - (implemented by vma_read_trylock) requires the the
>           vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to
>           differ. If they match then there must be a vma exclusive lock
>           held somewhere.
>         - read unlock - (implemented by vma_read_unlock) is a trivial
>           vma->lock unlock.
>         - write lock - (vma_write_lock) requires the mmap_lock to be
>           held exclusively and the current mm counter is noted to the vma
>           side. This will allow multiple vmas to be locked under a single
>           mmap_lock write lock (e.g. during vma merging). The vma counter
>           is modified under exclusive vma lock.
>         - write unlock - (vma_write_unlock_mm) is a batch release of all
>           vma locks held. It doesn't pair with a specific
>           vma_write_lock! It is done before exclusive mmap_lock is
>           released by incrementing mm sequence counter (mm_lock_seq).
>         - write downgrade - if the mmap_lock is downgraded to the read
>           lock all vma write locks are released as well (effectivelly
>           same as write unlock).

Thanks for the suggestion, Michal. I'll definitely reuse your description.

>
> > VMA lock is placed on the cache line boundary so that its 'count' field
> > falls into the first cache line while the rest of the fields fall into
> > the second cache line. This lets the 'count' field to be cached with
> > other frequently accessed fields and used quickly in uncontended case
> > while 'owner' and other fields used in the contended case will not
> > invalidate the first cache line while waiting on the lock.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  include/linux/mm.h        | 80 +++++++++++++++++++++++++++++++++++++++
> >  include/linux/mm_types.h  |  8 ++++
> >  include/linux/mmap_lock.h | 13 +++++++
> >  kernel/fork.c             |  4 ++
> >  mm/init-mm.c              |  3 ++
> >  5 files changed, 108 insertions(+)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index f3f196e4d66d..ec2c4c227d51 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -612,6 +612,85 @@ struct vm_operations_struct {
> >                                         unsigned long addr);
> >  };
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static inline void vma_init_lock(struct vm_area_struct *vma)
> > +{
> > +     init_rwsem(&vma->lock);
> > +     vma->vm_lock_seq = -1;
> > +}
> > +
> > +static inline void vma_write_lock(struct vm_area_struct *vma)
> > +{
> > +     int mm_lock_seq;
> > +
> > +     mmap_assert_write_locked(vma->vm_mm);
> > +
> > +     /*
> > +      * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> > +      * mm->mm_lock_seq can't be concurrently modified.
> > +      */
> > +     mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
> > +     if (vma->vm_lock_seq == mm_lock_seq)
> > +             return;
> > +
> > +     down_write(&vma->lock);
> > +     vma->vm_lock_seq = mm_lock_seq;
> > +     up_write(&vma->lock);
> > +}
> > +
> > +/*
> > + * Try to read-lock a vma. The function is allowed to occasionally yield false
> > + * locked result to avoid performance overhead, in which case we fall back to
> > + * using mmap_lock. The function should never yield false unlocked result.
> > + */
> > +static inline bool vma_read_trylock(struct vm_area_struct *vma)
> > +{
> > +     /* Check before locking. A race might cause false locked result. */
> > +     if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > +             return false;
> > +
> > +     if (unlikely(down_read_trylock(&vma->lock) == 0))
> > +             return false;
> > +
> > +     /*
> > +      * Overflow might produce false locked result.
> > +      * False unlocked result is impossible because we modify and check
> > +      * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
> > +      * modification invalidates all existing locks.
> > +      */
> > +     if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
> > +             up_read(&vma->lock);
> > +             return false;
> > +     }
> > +     return true;
> > +}
> > +
> > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > +{
> > +     up_read(&vma->lock);
> > +}
> > +
> > +static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > +{
> > +     mmap_assert_write_locked(vma->vm_mm);
> > +     /*
> > +      * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> > +      * mm->mm_lock_seq can't be concurrently modified.
> > +      */
> > +     VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
> > +}
> > +
> > +#else /* CONFIG_PER_VMA_LOCK */
> > +
> > +static inline void vma_init_lock(struct vm_area_struct *vma) {}
> > +static inline void vma_write_lock(struct vm_area_struct *vma) {}
> > +static inline bool vma_read_trylock(struct vm_area_struct *vma)
> > +             { return false; }
> > +static inline void vma_read_unlock(struct vm_area_struct *vma) {}
> > +static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
> > +
> > +#endif /* CONFIG_PER_VMA_LOCK */
> > +
> >  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> >  {
> >       static const struct vm_operations_struct dummy_vm_ops = {};
> > @@ -620,6 +699,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> >       vma->vm_mm = mm;
> >       vma->vm_ops = &dummy_vm_ops;
> >       INIT_LIST_HEAD(&vma->anon_vma_chain);
> > +     vma_init_lock(vma);
> >  }
> >
> >  static inline void vma_set_anonymous(struct vm_area_struct *vma)
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index d5cdec1314fe..5f7c5ca89931 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -555,6 +555,11 @@ struct vm_area_struct {
> >       pgprot_t vm_page_prot;
> >       unsigned long vm_flags;         /* Flags, see mm.h. */
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     int vm_lock_seq;
> > +     struct rw_semaphore lock;
> > +#endif
> > +
> >       /*
> >        * For areas with an address space and backing store,
> >        * linkage into the address_space->i_mmap interval tree.
> > @@ -680,6 +685,9 @@ struct mm_struct {
> >                                         * init_mm.mmlist, and are protected
> >                                         * by mmlist_lock
> >                                         */
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +             int mm_lock_seq;
> > +#endif
> >
> >
> >               unsigned long hiwater_rss; /* High-watermark of RSS usage */
> > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> > index e49ba91bb1f0..40facd4c398b 100644
> > --- a/include/linux/mmap_lock.h
> > +++ b/include/linux/mmap_lock.h
> > @@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
> >       VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
> >  }
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static inline void vma_write_unlock_mm(struct mm_struct *mm)
> > +{
> > +     mmap_assert_write_locked(mm);
> > +     /* No races during update due to exclusive mmap_lock being held */
> > +     WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
> > +}
> > +#else
> > +static inline void vma_write_unlock_mm(struct mm_struct *mm) {}
> > +#endif
> > +
> >  static inline void mmap_init_lock(struct mm_struct *mm)
> >  {
> >       init_rwsem(&mm->mmap_lock);
> > @@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
> >  static inline void mmap_write_unlock(struct mm_struct *mm)
> >  {
> >       __mmap_lock_trace_released(mm, true);
> > +     vma_write_unlock_mm(mm);
> >       up_write(&mm->mmap_lock);
> >  }
> >
> >  static inline void mmap_write_downgrade(struct mm_struct *mm)
> >  {
> >       __mmap_lock_trace_acquire_returned(mm, false, true);
> > +     vma_write_unlock_mm(mm);
> >       downgrade_write(&mm->mmap_lock);
> >  }
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 5986817f393c..c026d75108b3 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> >                */
> >               *new = data_race(*orig);
> >               INIT_LIST_HEAD(&new->anon_vma_chain);
> > +             vma_init_lock(new);
> >               dup_anon_vma_name(orig, new);
> >       }
> >       return new;
> > @@ -1145,6 +1146,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> >       seqcount_init(&mm->write_protect_seq);
> >       mmap_init_lock(mm);
> >       INIT_LIST_HEAD(&mm->mmlist);
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     WRITE_ONCE(mm->mm_lock_seq, 0);
> > +#endif
> >       mm_pgtables_bytes_init(mm);
> >       mm->map_count = 0;
> >       mm->locked_vm = 0;
> > diff --git a/mm/init-mm.c b/mm/init-mm.c
> > index c9327abb771c..33269314e060 100644
> > --- a/mm/init-mm.c
> > +++ b/mm/init-mm.c
> > @@ -37,6 +37,9 @@ struct mm_struct init_mm = {
> >       .page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
> >       .arg_lock       =  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
> >       .mmlist         = LIST_HEAD_INIT(init_mm.mmlist),
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     .mm_lock_seq    = 0,
> > +#endif
> >       .user_ns        = &init_user_ns,
> >       .cpu_bitmap     = CPU_BITS_NONE,
> >  #ifdef CONFIG_IOMMU_SVA
> > --
> > 2.39.0
>
> --
> Michal Hocko
> SUSE Labs
Suren Baghdasaryan Jan. 17, 2023, 9:09 p.m. UTC | #6
On Tue, Jan 17, 2023 at 7:07 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote:
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 5986817f393c..c026d75108b3 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> >                */
> >               *new = data_race(*orig);
> >               INIT_LIST_HEAD(&new->anon_vma_chain);
> > +             vma_init_lock(new);
> >               dup_anon_vma_name(orig, new);
> >       }
> >       return new;
> > @@ -1145,6 +1146,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> >       seqcount_init(&mm->write_protect_seq);
> >       mmap_init_lock(mm);
> >       INIT_LIST_HEAD(&mm->mmlist);
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     WRITE_ONCE(mm->mm_lock_seq, 0);
> > +#endif
>
> The mm shouldn't be visible so why WRITE_ONCE?

True. Will change to a simple assignment.

>
> --
> Michal Hocko
> SUSE Labs
Suren Baghdasaryan Jan. 17, 2023, 9:21 p.m. UTC | #7
On Tue, Jan 17, 2023 at 7:12 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 17-01-23 16:04:26, Michal Hocko wrote:
> > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote:
> > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > to be exclusively locked during VMA tree modifications, instead of the
> > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > locked.
> >
> > I have to say I was struggling a bit with the above and only understood
> > what you mean by reading the patch several times. I would phrase it like
> > this (feel free to use if you consider this to be an improvement).
> >
> > Introduce a per-VMA rw_semaphore. The lock implementation relies on a
> > per-vma and per-mm sequence counters to note exclusive locking:
> >         - read lock - (implemented by vma_read_trylock) requires the the
> >           vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to
> >           differ. If they match then there must be a vma exclusive lock
> >           held somewhere.
> >         - read unlock - (implemented by vma_read_unlock) is a trivial
> >           vma->lock unlock.
> >         - write lock - (vma_write_lock) requires the mmap_lock to be
> >           held exclusively and the current mm counter is noted to the vma
> >           side. This will allow multiple vmas to be locked under a single
> >           mmap_lock write lock (e.g. during vma merging). The vma counter
> >           is modified under exclusive vma lock.
>
> Didn't realize one more thing.
>             Unlike standard write lock this implementation allows to be
>             called multiple times under a single mmap_lock. In a sense
>             it is more of mark_vma_potentially_modified than a lock.

In the RFC it was called vma_mark_locked() originally and renames were
discussed in the email thread ending here:
https://lore.kernel.org/all/621612d7-c537-3971-9520-a3dec7b43cb4@suse.cz/.
If other names are preferable I'm open to changing them.

>
> >         - write unlock - (vma_write_unlock_mm) is a batch release of all
> >           vma locks held. It doesn't pair with a specific
> >           vma_write_lock! It is done before exclusive mmap_lock is
> >           released by incrementing mm sequence counter (mm_lock_seq).
> >       - write downgrade - if the mmap_lock is downgraded to the read
> >         lock all vma write locks are released as well (effectivelly
> >         same as write unlock).
> --
> Michal Hocko
> SUSE Labs
Suren Baghdasaryan Jan. 17, 2023, 9:28 p.m. UTC | #8
On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote:
>
> +locking maintainers

Thanks! I'll CC the locking maintainers in the next posting.

>
> On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > instead of mmap_lock. Because there are cases when multiple VMAs need
> > to be exclusively locked during VMA tree modifications, instead of the
> > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > locked.
> [...]
> > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > +{
> > +       up_read(&vma->lock);
> > +}
>
> One thing that might be gnarly here is that I think you might not be
> allowed to use up_read() to fully release ownership of an object -
> from what I remember, I think that up_read() (unlike something like
> spin_unlock()) can access the lock object after it's already been
> acquired by someone else. So if you want to protect against concurrent
> deletion, this might have to be something like:
>
> rcu_read_lock(); /* keeps vma alive */
> up_read(&vma->lock);
> rcu_read_unlock();

But for deleting VMA one would need to write-lock the vma->lock first,
which I assume can't happen until this up_read() is complete. Is that
assumption wrong?

>
> But I'm not entirely sure about that, the locking folks might know better.
>
> Also, it might not matter given that the rw_semaphore part is removed
> in the current patch 41/41 anyway...

This does matter because Michal suggested dropping that last 41/41
patch for now.
Jann Horn Jan. 17, 2023, 9:45 p.m. UTC | #9
On Tue, Jan 17, 2023 at 10:28 PM Suren Baghdasaryan <surenb@google.com> wrote:
> On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote:
> >
> > +locking maintainers
>
> Thanks! I'll CC the locking maintainers in the next posting.
>
> >
> > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > to be exclusively locked during VMA tree modifications, instead of the
> > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > locked.
> > [...]
> > > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > > +{
> > > +       up_read(&vma->lock);
> > > +}
> >
> > One thing that might be gnarly here is that I think you might not be
> > allowed to use up_read() to fully release ownership of an object -
> > from what I remember, I think that up_read() (unlike something like
> > spin_unlock()) can access the lock object after it's already been
> > acquired by someone else. So if you want to protect against concurrent
> > deletion, this might have to be something like:
> >
> > rcu_read_lock(); /* keeps vma alive */
> > up_read(&vma->lock);
> > rcu_read_unlock();
>
> But for deleting VMA one would need to write-lock the vma->lock first,
> which I assume can't happen until this up_read() is complete. Is that
> assumption wrong?

__up_read() does:

rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
      RWSEM_FLAG_WAITERS)) {
  clear_nonspinnable(sem);
  rwsem_wake(sem);
}

The atomic_long_add_return_release() is the point where we are doing
the main lock-releasing.

So if a reader dropped the read-lock while someone else was waiting on
the lock (RWSEM_FLAG_WAITERS) and no other readers were holding the
lock together with it, the reader also does clear_nonspinnable() and
rwsem_wake() afterwards.
But in rwsem_down_write_slowpath(), after we've set
RWSEM_FLAG_WAITERS, we can return successfully immediately once
rwsem_try_write_lock() sees that there are no active readers or
writers anymore (if RWSEM_LOCK_MASK is unset and the cmpxchg
succeeds). We're not necessarily waiting for the "nonspinnable" bit or
the wake.

So yeah, I think down_write() can return successfully before up_read()
is done with its memory accesses.

(Spinlocks are different - the kernel relies on being able to drop
references via spin_unlock() in some places.)
Matthew Wilcox Jan. 17, 2023, 9:54 p.m. UTC | #10
On Tue, Jan 17, 2023 at 01:21:47PM -0800, Suren Baghdasaryan wrote:
> On Tue, Jan 17, 2023 at 7:12 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Tue 17-01-23 16:04:26, Michal Hocko wrote:
> > > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote:
> > > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > > to be exclusively locked during VMA tree modifications, instead of the
> > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > > locked.
> > >
> > > I have to say I was struggling a bit with the above and only understood
> > > what you mean by reading the patch several times. I would phrase it like
> > > this (feel free to use if you consider this to be an improvement).
> > >
> > > Introduce a per-VMA rw_semaphore. The lock implementation relies on a
> > > per-vma and per-mm sequence counters to note exclusive locking:
> > >         - read lock - (implemented by vma_read_trylock) requires the the
> > >           vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to
> > >           differ. If they match then there must be a vma exclusive lock
> > >           held somewhere.
> > >         - read unlock - (implemented by vma_read_unlock) is a trivial
> > >           vma->lock unlock.
> > >         - write lock - (vma_write_lock) requires the mmap_lock to be
> > >           held exclusively and the current mm counter is noted to the vma
> > >           side. This will allow multiple vmas to be locked under a single
> > >           mmap_lock write lock (e.g. during vma merging). The vma counter
> > >           is modified under exclusive vma lock.
> >
> > Didn't realize one more thing.
> >             Unlike standard write lock this implementation allows to be
> >             called multiple times under a single mmap_lock. In a sense
> >             it is more of mark_vma_potentially_modified than a lock.
> 
> In the RFC it was called vma_mark_locked() originally and renames were
> discussed in the email thread ending here:
> https://lore.kernel.org/all/621612d7-c537-3971-9520-a3dec7b43cb4@suse.cz/.
> If other names are preferable I'm open to changing them.

I don't want to bikeshed this, but rather than locking it seems to be
more:

	vma_start_read()
	vma_end_read()
	vma_start_write()
	vma_end_write()
	vma_downgrade_write()

... and that these are _implemented_ with locks (in part) is an
implementation detail?

Would that reduce people's confusion?

> >
> > >         - write unlock - (vma_write_unlock_mm) is a batch release of all
> > >           vma locks held. It doesn't pair with a specific
> > >           vma_write_lock! It is done before exclusive mmap_lock is
> > >           released by incrementing mm sequence counter (mm_lock_seq).
> > >       - write downgrade - if the mmap_lock is downgraded to the read
> > >         lock all vma write locks are released as well (effectivelly
> > >         same as write unlock).
> > --
> > Michal Hocko
> > SUSE Labs
Suren Baghdasaryan Jan. 17, 2023, 10:33 p.m. UTC | #11
On Tue, Jan 17, 2023 at 1:54 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Tue, Jan 17, 2023 at 01:21:47PM -0800, Suren Baghdasaryan wrote:
> > On Tue, Jan 17, 2023 at 7:12 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Tue 17-01-23 16:04:26, Michal Hocko wrote:
> > > > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote:
> > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > > > to be exclusively locked during VMA tree modifications, instead of the
> > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > > > locked.
> > > >
> > > > I have to say I was struggling a bit with the above and only understood
> > > > what you mean by reading the patch several times. I would phrase it like
> > > > this (feel free to use if you consider this to be an improvement).
> > > >
> > > > Introduce a per-VMA rw_semaphore. The lock implementation relies on a
> > > > per-vma and per-mm sequence counters to note exclusive locking:
> > > >         - read lock - (implemented by vma_read_trylock) requires the the
> > > >           vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to
> > > >           differ. If they match then there must be a vma exclusive lock
> > > >           held somewhere.
> > > >         - read unlock - (implemented by vma_read_unlock) is a trivial
> > > >           vma->lock unlock.
> > > >         - write lock - (vma_write_lock) requires the mmap_lock to be
> > > >           held exclusively and the current mm counter is noted to the vma
> > > >           side. This will allow multiple vmas to be locked under a single
> > > >           mmap_lock write lock (e.g. during vma merging). The vma counter
> > > >           is modified under exclusive vma lock.
> > >
> > > Didn't realize one more thing.
> > >             Unlike standard write lock this implementation allows to be
> > >             called multiple times under a single mmap_lock. In a sense
> > >             it is more of mark_vma_potentially_modified than a lock.
> >
> > In the RFC it was called vma_mark_locked() originally and renames were
> > discussed in the email thread ending here:
> > https://lore.kernel.org/all/621612d7-c537-3971-9520-a3dec7b43cb4@suse.cz/.
> > If other names are preferable I'm open to changing them.
>
> I don't want to bikeshed this, but rather than locking it seems to be
> more:
>
>         vma_start_read()
>         vma_end_read()
>         vma_start_write()
>         vma_end_write()
>         vma_downgrade_write()

Couple corrections, we would have to have vma_start_tryread() and
vma_end_write_all(). Also there is no vma_downgrade_write().
mmap_write_downgrade() simply does vma_end_write_all().

>
> ... and that these are _implemented_ with locks (in part) is an
> implementation detail?
>
> Would that reduce people's confusion?
>
> > >
> > > >         - write unlock - (vma_write_unlock_mm) is a batch release of all
> > > >           vma locks held. It doesn't pair with a specific
> > > >           vma_write_lock! It is done before exclusive mmap_lock is
> > > >           released by incrementing mm sequence counter (mm_lock_seq).
> > > >       - write downgrade - if the mmap_lock is downgraded to the read
> > > >         lock all vma write locks are released as well (effectivelly
> > > >         same as write unlock).
> > > --
> > > Michal Hocko
> > > SUSE Labs
Suren Baghdasaryan Jan. 17, 2023, 10:36 p.m. UTC | #12
On Tue, Jan 17, 2023 at 1:46 PM Jann Horn <jannh@google.com> wrote:
>
> On Tue, Jan 17, 2023 at 10:28 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote:
> > >
> > > +locking maintainers
> >
> > Thanks! I'll CC the locking maintainers in the next posting.
> >
> > >
> > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > > to be exclusively locked during VMA tree modifications, instead of the
> > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > > locked.
> > > [...]
> > > > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > > > +{
> > > > +       up_read(&vma->lock);
> > > > +}
> > >
> > > One thing that might be gnarly here is that I think you might not be
> > > allowed to use up_read() to fully release ownership of an object -
> > > from what I remember, I think that up_read() (unlike something like
> > > spin_unlock()) can access the lock object after it's already been
> > > acquired by someone else. So if you want to protect against concurrent
> > > deletion, this might have to be something like:
> > >
> > > rcu_read_lock(); /* keeps vma alive */
> > > up_read(&vma->lock);
> > > rcu_read_unlock();
> >
> > But for deleting VMA one would need to write-lock the vma->lock first,
> > which I assume can't happen until this up_read() is complete. Is that
> > assumption wrong?
>
> __up_read() does:
>
> rwsem_clear_reader_owned(sem);
> tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
> DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
> if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
>       RWSEM_FLAG_WAITERS)) {
>   clear_nonspinnable(sem);
>   rwsem_wake(sem);
> }
>
> The atomic_long_add_return_release() is the point where we are doing
> the main lock-releasing.
>
> So if a reader dropped the read-lock while someone else was waiting on
> the lock (RWSEM_FLAG_WAITERS) and no other readers were holding the
> lock together with it, the reader also does clear_nonspinnable() and
> rwsem_wake() afterwards.
> But in rwsem_down_write_slowpath(), after we've set
> RWSEM_FLAG_WAITERS, we can return successfully immediately once
> rwsem_try_write_lock() sees that there are no active readers or
> writers anymore (if RWSEM_LOCK_MASK is unset and the cmpxchg
> succeeds). We're not necessarily waiting for the "nonspinnable" bit or
> the wake.
>
> So yeah, I think down_write() can return successfully before up_read()
> is done with its memory accesses.
>
> (Spinlocks are different - the kernel relies on being able to drop
> references via spin_unlock() in some places.)

Thanks for bringing this up. I can add rcu_read_{lock/unlock) as you
suggested and that would fix the issue because we free VMAs from
call_rcu(). However this feels to me as an issue of rw_semaphore
design that this locking pattern is unsafe and might lead to UAF.
Matthew Wilcox Jan. 17, 2023, 11:15 p.m. UTC | #13
On Tue, Jan 17, 2023 at 02:36:47PM -0800, Suren Baghdasaryan wrote:
> On Tue, Jan 17, 2023 at 1:46 PM Jann Horn <jannh@google.com> wrote:
> > On Tue, Jan 17, 2023 at 10:28 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote:
> > > > One thing that might be gnarly here is that I think you might not be
> > > > allowed to use up_read() to fully release ownership of an object -
> > > > from what I remember, I think that up_read() (unlike something like
> > > > spin_unlock()) can access the lock object after it's already been
> > > > acquired by someone else. So if you want to protect against concurrent
> > > > deletion, this might have to be something like:
> > > >
> > > > rcu_read_lock(); /* keeps vma alive */
> > > > up_read(&vma->lock);
> > > > rcu_read_unlock();
> > >
> > > But for deleting VMA one would need to write-lock the vma->lock first,
> > > which I assume can't happen until this up_read() is complete. Is that
> > > assumption wrong?
> >
> > __up_read() does:
> >
> > rwsem_clear_reader_owned(sem);
> > tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
> > DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
> > if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
> >       RWSEM_FLAG_WAITERS)) {
> >   clear_nonspinnable(sem);
> >   rwsem_wake(sem);
> > }
> >
> > The atomic_long_add_return_release() is the point where we are doing
> > the main lock-releasing.
> >
> > So if a reader dropped the read-lock while someone else was waiting on
> > the lock (RWSEM_FLAG_WAITERS) and no other readers were holding the
> > lock together with it, the reader also does clear_nonspinnable() and
> > rwsem_wake() afterwards.
> > But in rwsem_down_write_slowpath(), after we've set
> > RWSEM_FLAG_WAITERS, we can return successfully immediately once
> > rwsem_try_write_lock() sees that there are no active readers or
> > writers anymore (if RWSEM_LOCK_MASK is unset and the cmpxchg
> > succeeds). We're not necessarily waiting for the "nonspinnable" bit or
> > the wake.
> >
> > So yeah, I think down_write() can return successfully before up_read()
> > is done with its memory accesses.
> >
> > (Spinlocks are different - the kernel relies on being able to drop
> > references via spin_unlock() in some places.)
> 
> Thanks for bringing this up. I can add rcu_read_{lock/unlock) as you
> suggested and that would fix the issue because we free VMAs from
> call_rcu(). However this feels to me as an issue of rw_semaphore
> design that this locking pattern is unsafe and might lead to UAF.

We have/had this problem with normal mutexes too.  It was the impetus
for adding the struct completion which is very careful to not touch
anything after the completion is, well, completed.
Michal Hocko Jan. 18, 2023, 9:18 a.m. UTC | #14
On Tue 17-01-23 21:54:58, Matthew Wilcox wrote:
> On Tue, Jan 17, 2023 at 01:21:47PM -0800, Suren Baghdasaryan wrote:
> > On Tue, Jan 17, 2023 at 7:12 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Tue 17-01-23 16:04:26, Michal Hocko wrote:
> > > > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote:
> > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > > > to be exclusively locked during VMA tree modifications, instead of the
> > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > > > locked.
> > > >
> > > > I have to say I was struggling a bit with the above and only understood
> > > > what you mean by reading the patch several times. I would phrase it like
> > > > this (feel free to use if you consider this to be an improvement).
> > > >
> > > > Introduce a per-VMA rw_semaphore. The lock implementation relies on a
> > > > per-vma and per-mm sequence counters to note exclusive locking:
> > > >         - read lock - (implemented by vma_read_trylock) requires the the
> > > >           vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to
> > > >           differ. If they match then there must be a vma exclusive lock
> > > >           held somewhere.
> > > >         - read unlock - (implemented by vma_read_unlock) is a trivial
> > > >           vma->lock unlock.
> > > >         - write lock - (vma_write_lock) requires the mmap_lock to be
> > > >           held exclusively and the current mm counter is noted to the vma
> > > >           side. This will allow multiple vmas to be locked under a single
> > > >           mmap_lock write lock (e.g. during vma merging). The vma counter
> > > >           is modified under exclusive vma lock.
> > >
> > > Didn't realize one more thing.
> > >             Unlike standard write lock this implementation allows to be
> > >             called multiple times under a single mmap_lock. In a sense
> > >             it is more of mark_vma_potentially_modified than a lock.
> > 
> > In the RFC it was called vma_mark_locked() originally and renames were
> > discussed in the email thread ending here:
> > https://lore.kernel.org/all/621612d7-c537-3971-9520-a3dec7b43cb4@suse.cz/.
> > If other names are preferable I'm open to changing them.
> 
> I don't want to bikeshed this, but rather than locking it seems to be
> more:
> 
> 	vma_start_read()
> 	vma_end_read()
> 	vma_start_write()
> 	vma_end_write()
> 	vma_downgrade_write()
> 
> ... and that these are _implemented_ with locks (in part) is an
> implementation detail?

Agreed!

> Would that reduce people's confusion?

Yes I believe that naming it less like a locking primitive will clarify
it. vma_{start,end}_[try]read is better indeed. I am wondering about the
write side of things because that is where things get confusing. There
is no explicit write lock nor unlock. vma_start_write sounds better than
the vma_write_lock but it still lacks that pairing with vma_end_write
which is never the right thing to call. Wouldn't vma_mark_modified and
vma_publish_changes describe the scheme better?

Downgrade case is probably the least interesting one because that is
just one off thing that can be completely hidden from any code besides
mmap_write_downgrade so I wouldn't be too concern about that one.

But as you say, no need to bikeshed this too much. Great naming is hard
and if the scheme is documented properly we can live with a suboptimal
naming as well.
Michal Hocko Jan. 18, 2023, 12:28 p.m. UTC | #15
On Tue 17-01-23 19:02:55, Jann Horn wrote:
> +locking maintainers
> 
> On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > instead of mmap_lock. Because there are cases when multiple VMAs need
> > to be exclusively locked during VMA tree modifications, instead of the
> > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > locked.
> [...]
> > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > +{
> > +       up_read(&vma->lock);
> > +}
> 
> One thing that might be gnarly here is that I think you might not be
> allowed to use up_read() to fully release ownership of an object -
> from what I remember, I think that up_read() (unlike something like
> spin_unlock()) can access the lock object after it's already been
> acquired by someone else.

Yes, I think you are right. From a look into the code it seems that
the UAF is quite unlikely as there is a ton of work to be done between
vma_write_lock used to prepare vma for removal and actual removal.
That doesn't make it less of a problem though.

> So if you want to protect against concurrent
> deletion, this might have to be something like:
> 
> rcu_read_lock(); /* keeps vma alive */
> up_read(&vma->lock);
> rcu_read_unlock();
> 
> But I'm not entirely sure about that, the locking folks might know better.

I am not a locking expert but to me it looks like this should work
because the final cleanup would have to happen rcu_read_unlock.

Thanks, I have completely missed this aspect of the locking when looking
into the code.

Btw. looking at this again I have fully realized how hard it is actually
to see that vm_area_free is guaranteed to sync up with ongoing readers.
vma manipulation functions like __adjust_vma make my head spin. Would it
make more sense to have a rcu style synchronization point in
vm_area_free directly before call_rcu? This would add an overhead of
uncontended down_write of course.
David Laight Jan. 18, 2023, 1:09 p.m. UTC | #16
...
> > One thing that might be gnarly here is that I think you might not be
> > allowed to use up_read() to fully release ownership of an object -
> > from what I remember, I think that up_read() (unlike something like
> > spin_unlock()) can access the lock object after it's already been
> > acquired by someone else.
> 
> Yes, I think you are right. From a look into the code it seems that
> the UAF is quite unlikely as there is a ton of work to be done between
> vma_write_lock used to prepare vma for removal and actual removal.
> That doesn't make it less of a problem though.

All it takes is a hardware interrupt....
Especially if the softint code can also run.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Jann Horn Jan. 18, 2023, 1:23 p.m. UTC | #17
On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote:
> On Tue 17-01-23 19:02:55, Jann Horn wrote:
> > +locking maintainers
> >
> > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > to be exclusively locked during VMA tree modifications, instead of the
> > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > locked.
> > [...]
> > > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > > +{
> > > +       up_read(&vma->lock);
> > > +}
> >
> > One thing that might be gnarly here is that I think you might not be
> > allowed to use up_read() to fully release ownership of an object -
> > from what I remember, I think that up_read() (unlike something like
> > spin_unlock()) can access the lock object after it's already been
> > acquired by someone else.
>
> Yes, I think you are right. From a look into the code it seems that
> the UAF is quite unlikely as there is a ton of work to be done between
> vma_write_lock used to prepare vma for removal and actual removal.
> That doesn't make it less of a problem though.
>
> > So if you want to protect against concurrent
> > deletion, this might have to be something like:
> >
> > rcu_read_lock(); /* keeps vma alive */
> > up_read(&vma->lock);
> > rcu_read_unlock();
> >
> > But I'm not entirely sure about that, the locking folks might know better.
>
> I am not a locking expert but to me it looks like this should work
> because the final cleanup would have to happen rcu_read_unlock.
>
> Thanks, I have completely missed this aspect of the locking when looking
> into the code.
>
> Btw. looking at this again I have fully realized how hard it is actually
> to see that vm_area_free is guaranteed to sync up with ongoing readers.
> vma manipulation functions like __adjust_vma make my head spin. Would it
> make more sense to have a rcu style synchronization point in
> vm_area_free directly before call_rcu? This would add an overhead of
> uncontended down_write of course.

Something along those lines might be a good idea, but I think that
rather than synchronizing the removal, it should maybe be something
that splats (and bails out?) if it detects pending readers. If we get
to vm_area_free() on a VMA that has pending readers, we might already
be in a lot of trouble because the concurrent readers might have been
traversing page tables while we were tearing them down or fun stuff
like that.

I think maybe Suren was already talking about something like that in
another part of this patch series but I don't remember...
Michal Hocko Jan. 18, 2023, 3:11 p.m. UTC | #18
On Wed 18-01-23 14:23:32, Jann Horn wrote:
> On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote:
> > On Tue 17-01-23 19:02:55, Jann Horn wrote:
> > > +locking maintainers
> > >
> > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > > to be exclusively locked during VMA tree modifications, instead of the
> > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > > locked.
> > > [...]
> > > > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > > > +{
> > > > +       up_read(&vma->lock);
> > > > +}
> > >
> > > One thing that might be gnarly here is that I think you might not be
> > > allowed to use up_read() to fully release ownership of an object -
> > > from what I remember, I think that up_read() (unlike something like
> > > spin_unlock()) can access the lock object after it's already been
> > > acquired by someone else.
> >
> > Yes, I think you are right. From a look into the code it seems that
> > the UAF is quite unlikely as there is a ton of work to be done between
> > vma_write_lock used to prepare vma for removal and actual removal.
> > That doesn't make it less of a problem though.
> >
> > > So if you want to protect against concurrent
> > > deletion, this might have to be something like:
> > >
> > > rcu_read_lock(); /* keeps vma alive */
> > > up_read(&vma->lock);
> > > rcu_read_unlock();
> > >
> > > But I'm not entirely sure about that, the locking folks might know better.
> >
> > I am not a locking expert but to me it looks like this should work
> > because the final cleanup would have to happen rcu_read_unlock.
> >
> > Thanks, I have completely missed this aspect of the locking when looking
> > into the code.
> >
> > Btw. looking at this again I have fully realized how hard it is actually
> > to see that vm_area_free is guaranteed to sync up with ongoing readers.
> > vma manipulation functions like __adjust_vma make my head spin. Would it
> > make more sense to have a rcu style synchronization point in
> > vm_area_free directly before call_rcu? This would add an overhead of
> > uncontended down_write of course.
> 
> Something along those lines might be a good idea, but I think that
> rather than synchronizing the removal, it should maybe be something
> that splats (and bails out?) if it detects pending readers. If we get
> to vm_area_free() on a VMA that has pending readers, we might already
> be in a lot of trouble because the concurrent readers might have been
> traversing page tables while we were tearing them down or fun stuff
> like that.
> 
> I think maybe Suren was already talking about something like that in
> another part of this patch series but I don't remember...

This http://lkml.kernel.org/r/20230109205336.3665937-27-surenb@google.com?
Suren Baghdasaryan Jan. 18, 2023, 5:36 p.m. UTC | #19
On Wed, Jan 18, 2023 at 7:11 AM 'Michal Hocko' via kernel-team
<kernel-team@android.com> wrote:
>
> On Wed 18-01-23 14:23:32, Jann Horn wrote:
> > On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote:
> > > On Tue 17-01-23 19:02:55, Jann Horn wrote:
> > > > +locking maintainers
> > > >
> > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > > > to be exclusively locked during VMA tree modifications, instead of the
> > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > > > locked.
> > > > [...]
> > > > > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > > > > +{
> > > > > +       up_read(&vma->lock);
> > > > > +}
> > > >
> > > > One thing that might be gnarly here is that I think you might not be
> > > > allowed to use up_read() to fully release ownership of an object -
> > > > from what I remember, I think that up_read() (unlike something like
> > > > spin_unlock()) can access the lock object after it's already been
> > > > acquired by someone else.
> > >
> > > Yes, I think you are right. From a look into the code it seems that
> > > the UAF is quite unlikely as there is a ton of work to be done between
> > > vma_write_lock used to prepare vma for removal and actual removal.
> > > That doesn't make it less of a problem though.
> > >
> > > > So if you want to protect against concurrent
> > > > deletion, this might have to be something like:
> > > >
> > > > rcu_read_lock(); /* keeps vma alive */
> > > > up_read(&vma->lock);
> > > > rcu_read_unlock();
> > > >
> > > > But I'm not entirely sure about that, the locking folks might know better.
> > >
> > > I am not a locking expert but to me it looks like this should work
> > > because the final cleanup would have to happen rcu_read_unlock.
> > >
> > > Thanks, I have completely missed this aspect of the locking when looking
> > > into the code.
> > >
> > > Btw. looking at this again I have fully realized how hard it is actually
> > > to see that vm_area_free is guaranteed to sync up with ongoing readers.
> > > vma manipulation functions like __adjust_vma make my head spin. Would it
> > > make more sense to have a rcu style synchronization point in
> > > vm_area_free directly before call_rcu? This would add an overhead of
> > > uncontended down_write of course.
> >
> > Something along those lines might be a good idea, but I think that
> > rather than synchronizing the removal, it should maybe be something
> > that splats (and bails out?) if it detects pending readers. If we get
> > to vm_area_free() on a VMA that has pending readers, we might already
> > be in a lot of trouble because the concurrent readers might have been
> > traversing page tables while we were tearing them down or fun stuff
> > like that.
> >
> > I think maybe Suren was already talking about something like that in
> > another part of this patch series but I don't remember...
>
> This http://lkml.kernel.org/r/20230109205336.3665937-27-surenb@google.com?

Yes, I spent a lot of time ensuring that __adjust_vma locks the right
VMAs and that VMAs are freed or isolated under VMA write lock
protection to exclude any readers. If the VM_BUG_ON_VMA in the patch
Michal mentioned gets hit then it's a bug in my design and I'll have
to fix it. But please, let's not add synchronize_rcu() in the
vm_area_free(). That will slow down any path that frees a VMA,
especially the exit path which might be freeing thousands of them. I
had an SPF version with synchronize_rcu() in the vm_area_free() and
phone vendors started yelling at me the very next day. call_rcu() with
CONFIG_RCU_NOCB_CPU (which Android uses for power saving purposes) is
already bad enough to show up in the benchmarks and that's why I had
to add call_rcu() batching in
https://lore.kernel.org/all/20230109205336.3665937-40-surenb@google.com.

>
> --
> Michal Hocko
> SUSE Labs
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Michal Hocko Jan. 18, 2023, 9:28 p.m. UTC | #20
On Wed 18-01-23 09:36:44, Suren Baghdasaryan wrote:
> On Wed, Jan 18, 2023 at 7:11 AM 'Michal Hocko' via kernel-team
> <kernel-team@android.com> wrote:
> >
> > On Wed 18-01-23 14:23:32, Jann Horn wrote:
> > > On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote:
> > > > On Tue 17-01-23 19:02:55, Jann Horn wrote:
> > > > > +locking maintainers
> > > > >
> > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > > > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > > > > to be exclusively locked during VMA tree modifications, instead of the
> > > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > > > > locked.
> > > > > [...]
> > > > > > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > > > > > +{
> > > > > > +       up_read(&vma->lock);
> > > > > > +}
> > > > >
> > > > > One thing that might be gnarly here is that I think you might not be
> > > > > allowed to use up_read() to fully release ownership of an object -
> > > > > from what I remember, I think that up_read() (unlike something like
> > > > > spin_unlock()) can access the lock object after it's already been
> > > > > acquired by someone else.
> > > >
> > > > Yes, I think you are right. From a look into the code it seems that
> > > > the UAF is quite unlikely as there is a ton of work to be done between
> > > > vma_write_lock used to prepare vma for removal and actual removal.
> > > > That doesn't make it less of a problem though.
> > > >
> > > > > So if you want to protect against concurrent
> > > > > deletion, this might have to be something like:
> > > > >
> > > > > rcu_read_lock(); /* keeps vma alive */
> > > > > up_read(&vma->lock);
> > > > > rcu_read_unlock();
> > > > >
> > > > > But I'm not entirely sure about that, the locking folks might know better.
> > > >
> > > > I am not a locking expert but to me it looks like this should work
> > > > because the final cleanup would have to happen rcu_read_unlock.
> > > >
> > > > Thanks, I have completely missed this aspect of the locking when looking
> > > > into the code.
> > > >
> > > > Btw. looking at this again I have fully realized how hard it is actually
> > > > to see that vm_area_free is guaranteed to sync up with ongoing readers.
> > > > vma manipulation functions like __adjust_vma make my head spin. Would it
> > > > make more sense to have a rcu style synchronization point in
> > > > vm_area_free directly before call_rcu? This would add an overhead of
> > > > uncontended down_write of course.
> > >
> > > Something along those lines might be a good idea, but I think that
> > > rather than synchronizing the removal, it should maybe be something
> > > that splats (and bails out?) if it detects pending readers. If we get
> > > to vm_area_free() on a VMA that has pending readers, we might already
> > > be in a lot of trouble because the concurrent readers might have been
> > > traversing page tables while we were tearing them down or fun stuff
> > > like that.
> > >
> > > I think maybe Suren was already talking about something like that in
> > > another part of this patch series but I don't remember...
> >
> > This http://lkml.kernel.org/r/20230109205336.3665937-27-surenb@google.com?
> 
> Yes, I spent a lot of time ensuring that __adjust_vma locks the right
> VMAs and that VMAs are freed or isolated under VMA write lock
> protection to exclude any readers. If the VM_BUG_ON_VMA in the patch
> Michal mentioned gets hit then it's a bug in my design and I'll have
> to fix it. But please, let's not add synchronize_rcu() in the
> vm_area_free().

Just to clarify. I didn't suggest to add synchronize_rcu into
vm_area_free. What I really meant was synchronize_rcu like primitive to
effectivelly synchronize with any potential pending read locker (so
something like vma_write_lock (or whatever it is called). The point is
that vma freeing is an event all readers should be notified about.
This can be done explicitly for each and every vma before vm_area_free
is called but this is just hard to review and easy to break over time.
See my point?
Suren Baghdasaryan Jan. 18, 2023, 9:45 p.m. UTC | #21
On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote:
>
> On Wed 18-01-23 09:36:44, Suren Baghdasaryan wrote:
> > On Wed, Jan 18, 2023 at 7:11 AM 'Michal Hocko' via kernel-team
> > <kernel-team@android.com> wrote:
> > >
> > > On Wed 18-01-23 14:23:32, Jann Horn wrote:
> > > > On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote:
> > > > > On Tue 17-01-23 19:02:55, Jann Horn wrote:
> > > > > > +locking maintainers
> > > > > >
> > > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > > > > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > > > > > to be exclusively locked during VMA tree modifications, instead of the
> > > > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > > > > > locked.
> > > > > > [...]
> > > > > > > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > > > > > > +{
> > > > > > > +       up_read(&vma->lock);
> > > > > > > +}
> > > > > >
> > > > > > One thing that might be gnarly here is that I think you might not be
> > > > > > allowed to use up_read() to fully release ownership of an object -
> > > > > > from what I remember, I think that up_read() (unlike something like
> > > > > > spin_unlock()) can access the lock object after it's already been
> > > > > > acquired by someone else.
> > > > >
> > > > > Yes, I think you are right. From a look into the code it seems that
> > > > > the UAF is quite unlikely as there is a ton of work to be done between
> > > > > vma_write_lock used to prepare vma for removal and actual removal.
> > > > > That doesn't make it less of a problem though.
> > > > >
> > > > > > So if you want to protect against concurrent
> > > > > > deletion, this might have to be something like:
> > > > > >
> > > > > > rcu_read_lock(); /* keeps vma alive */
> > > > > > up_read(&vma->lock);
> > > > > > rcu_read_unlock();
> > > > > >
> > > > > > But I'm not entirely sure about that, the locking folks might know better.
> > > > >
> > > > > I am not a locking expert but to me it looks like this should work
> > > > > because the final cleanup would have to happen rcu_read_unlock.
> > > > >
> > > > > Thanks, I have completely missed this aspect of the locking when looking
> > > > > into the code.
> > > > >
> > > > > Btw. looking at this again I have fully realized how hard it is actually
> > > > > to see that vm_area_free is guaranteed to sync up with ongoing readers.
> > > > > vma manipulation functions like __adjust_vma make my head spin. Would it
> > > > > make more sense to have a rcu style synchronization point in
> > > > > vm_area_free directly before call_rcu? This would add an overhead of
> > > > > uncontended down_write of course.
> > > >
> > > > Something along those lines might be a good idea, but I think that
> > > > rather than synchronizing the removal, it should maybe be something
> > > > that splats (and bails out?) if it detects pending readers. If we get
> > > > to vm_area_free() on a VMA that has pending readers, we might already
> > > > be in a lot of trouble because the concurrent readers might have been
> > > > traversing page tables while we were tearing them down or fun stuff
> > > > like that.
> > > >
> > > > I think maybe Suren was already talking about something like that in
> > > > another part of this patch series but I don't remember...
> > >
> > > This http://lkml.kernel.org/r/20230109205336.3665937-27-surenb@google.com?
> >
> > Yes, I spent a lot of time ensuring that __adjust_vma locks the right
> > VMAs and that VMAs are freed or isolated under VMA write lock
> > protection to exclude any readers. If the VM_BUG_ON_VMA in the patch
> > Michal mentioned gets hit then it's a bug in my design and I'll have
> > to fix it. But please, let's not add synchronize_rcu() in the
> > vm_area_free().
>
> Just to clarify. I didn't suggest to add synchronize_rcu into
> vm_area_free. What I really meant was synchronize_rcu like primitive to
> effectivelly synchronize with any potential pending read locker (so
> something like vma_write_lock (or whatever it is called). The point is
> that vma freeing is an event all readers should be notified about.

I don't think readers need to be notified if we are ensuring that the
VMA is not used by anyone else and is not reachable by the readers.
This is currently done by write-locking the VMA either before removing
it from the tree or before freeing it.

> This can be done explicitly for each and every vma before vm_area_free
> is called but this is just hard to review and easy to break over time.
> See my point?

I understand your point now and if we really need that, one way would
be to have a VMA refcount (like Laurent had in his version of SPF
implementation). I don't think current implementation needs that level
of VMA lifetime control unless I missed some location that should take
the lock and does not.

>
> --
> Michal Hocko
> SUSE Labs
Alexander Gordeev Nov. 22, 2023, 2:04 p.m. UTC | #22
On Tue, Jan 17, 2023 at 10:45:25PM +0100, Jann Horn wrote:

Hi Jann,

> On Tue, Jan 17, 2023 at 10:28 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote:
> > >
> > > +locking maintainers
> >
> > Thanks! I'll CC the locking maintainers in the next posting.
> >
> > >
> > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > > > instead of mmap_lock. Because there are cases when multiple VMAs need
> > > > to be exclusively locked during VMA tree modifications, instead of the
> > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
> > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > > > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > > > locked.
> > > [...]
> > > > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > > > +{
> > > > +       up_read(&vma->lock);
> > > > +}
> > >
> > > One thing that might be gnarly here is that I think you might not be
> > > allowed to use up_read() to fully release ownership of an object -
> > > from what I remember, I think that up_read() (unlike something like
> > > spin_unlock()) can access the lock object after it's already been
> > > acquired by someone else. So if you want to protect against concurrent
> > > deletion, this might have to be something like:
> > >
> > > rcu_read_lock(); /* keeps vma alive */
> > > up_read(&vma->lock);
> > > rcu_read_unlock();
> >
> > But for deleting VMA one would need to write-lock the vma->lock first,
> > which I assume can't happen until this up_read() is complete. Is that
> > assumption wrong?
> 
> __up_read() does:
> 
> rwsem_clear_reader_owned(sem);
> tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
> DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
> if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
>       RWSEM_FLAG_WAITERS)) {
>   clear_nonspinnable(sem);
>   rwsem_wake(sem);
> }

This sequence is covered by preempt_disable()/preempt_enable().
Would not it preserve the RCU grace period until after __up_read()
exited?

> The atomic_long_add_return_release() is the point where we are doing
> the main lock-releasing.
> 
> So if a reader dropped the read-lock while someone else was waiting on
> the lock (RWSEM_FLAG_WAITERS) and no other readers were holding the
> lock together with it, the reader also does clear_nonspinnable() and
> rwsem_wake() afterwards.
> But in rwsem_down_write_slowpath(), after we've set
> RWSEM_FLAG_WAITERS, we can return successfully immediately once
> rwsem_try_write_lock() sees that there are no active readers or
> writers anymore (if RWSEM_LOCK_MASK is unset and the cmpxchg
> succeeds). We're not necessarily waiting for the "nonspinnable" bit or
> the wake.
> 
> So yeah, I think down_write() can return successfully before up_read()
> is done with its memory accesses.

Thanks!
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f3f196e4d66d..ec2c4c227d51 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -612,6 +612,85 @@  struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+static inline void vma_write_lock(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+{
+	/* Check before locking. A race might cause false locked result. */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+		return false;
+
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 */
+	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_read_unlock(struct vm_area_struct *vma)
+{
+	up_read(&vma->lock);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline void vma_write_lock(struct vm_area_struct *vma) {}
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_read_unlock(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -620,6 +699,7 @@  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d5cdec1314fe..5f7c5ca89931 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -555,6 +555,11 @@  struct vm_area_struct {
 	pgprot_t vm_page_prot;
 	unsigned long vm_flags;		/* Flags, see mm.h. */
 
+#ifdef CONFIG_PER_VMA_LOCK
+	int vm_lock_seq;
+	struct rw_semaphore lock;
+#endif
+
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
@@ -680,6 +685,9 @@  struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..40facd4c398b 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@  static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_write_unlock_mm(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_write_unlock_mm(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@  static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_write_unlock_mm(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_write_unlock_mm(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5986817f393c..c026d75108b3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,6 +474,7 @@  struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		*new = data_race(*orig);
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		dup_anon_vma_name(orig, new);
 	}
 	return new;
@@ -1145,6 +1146,9 @@  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	WRITE_ONCE(mm->mm_lock_seq, 0);
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@  struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA