Message ID | 20230109205336.3665937-13-surenb@google.com
---|---
State | New
Series | Per-VMA locks
On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote: > Introduce a per-VMA rw_semaphore to be used during page fault handling > instead of mmap_lock. Because there are cases when multiple VMAs need > to be exclusively locked during VMA tree modifications, instead of the > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > mmap_write_lock holder is done with all modifications and drops mmap_lock, > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > locked. I have to say I was struggling a bit with the above and only understood what you mean by reading the patch several times. I would phrase it like this (feel free to use if you consider this to be an improvement). Introduce a per-VMA rw_semaphore. The lock implementation relies on a per-vma and per-mm sequence counters to note exclusive locking: - read lock - (implemented by vma_read_trylock) requires the the vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ. If they match then there must be a vma exclusive lock held somewhere. - read unlock - (implemented by vma_read_unlock) is a trivial vma->lock unlock. - write lock - (vma_write_lock) requires the mmap_lock to be held exclusively and the current mm counter is noted to the vma side. This will allow multiple vmas to be locked under a single mmap_lock write lock (e.g. during vma merging). The vma counter is modified under exclusive vma lock. - write unlock - (vma_write_unlock_mm) is a batch release of all vma locks held. It doesn't pair with a specific vma_write_lock! It is done before exclusive mmap_lock is released by incrementing mm sequence counter (mm_lock_seq). - write downgrade - if the mmap_lock is downgraded to the read lock all vma write locks are released as well (effectivelly same as write unlock). > VMA lock is placed on the cache line boundary so that its 'count' field > falls into the first cache line while the rest of the fields fall into > the second cache line. This lets the 'count' field to be cached with > other frequently accessed fields and used quickly in uncontended case > while 'owner' and other fields used in the contended case will not > invalidate the first cache line while waiting on the lock. > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > --- > include/linux/mm.h | 80 +++++++++++++++++++++++++++++++++++++++ > include/linux/mm_types.h | 8 ++++ > include/linux/mmap_lock.h | 13 +++++++ > kernel/fork.c | 4 ++ > mm/init-mm.c | 3 ++ > 5 files changed, 108 insertions(+) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index f3f196e4d66d..ec2c4c227d51 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -612,6 +612,85 @@ struct vm_operations_struct { > unsigned long addr); > }; > > +#ifdef CONFIG_PER_VMA_LOCK > +static inline void vma_init_lock(struct vm_area_struct *vma) > +{ > + init_rwsem(&vma->lock); > + vma->vm_lock_seq = -1; > +} > + > +static inline void vma_write_lock(struct vm_area_struct *vma) > +{ > + int mm_lock_seq; > + > + mmap_assert_write_locked(vma->vm_mm); > + > + /* > + * current task is holding mmap_write_lock, both vma->vm_lock_seq and > + * mm->mm_lock_seq can't be concurrently modified. > + */ > + mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); > + if (vma->vm_lock_seq == mm_lock_seq) > + return; > + > + down_write(&vma->lock); > + vma->vm_lock_seq = mm_lock_seq; > + up_write(&vma->lock); > +} > + > +/* > + * Try to read-lock a vma. 
The function is allowed to occasionally yield false > + * locked result to avoid performance overhead, in which case we fall back to > + * using mmap_lock. The function should never yield false unlocked result. > + */ > +static inline bool vma_read_trylock(struct vm_area_struct *vma) > +{ > + /* Check before locking. A race might cause false locked result. */ > + if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) > + return false; > + > + if (unlikely(down_read_trylock(&vma->lock) == 0)) > + return false; > + > + /* > + * Overflow might produce false locked result. > + * False unlocked result is impossible because we modify and check > + * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq > + * modification invalidates all existing locks. > + */ > + if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { > + up_read(&vma->lock); > + return false; > + } > + return true; > +} > + > +static inline void vma_read_unlock(struct vm_area_struct *vma) > +{ > + up_read(&vma->lock); > +} > + > +static inline void vma_assert_write_locked(struct vm_area_struct *vma) > +{ > + mmap_assert_write_locked(vma->vm_mm); > + /* > + * current task is holding mmap_write_lock, both vma->vm_lock_seq and > + * mm->mm_lock_seq can't be concurrently modified. > + */ > + VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma); > +} > + > +#else /* CONFIG_PER_VMA_LOCK */ > + > +static inline void vma_init_lock(struct vm_area_struct *vma) {} > +static inline void vma_write_lock(struct vm_area_struct *vma) {} > +static inline bool vma_read_trylock(struct vm_area_struct *vma) > + { return false; } > +static inline void vma_read_unlock(struct vm_area_struct *vma) {} > +static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} > + > +#endif /* CONFIG_PER_VMA_LOCK */ > + > static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) > { > static const struct vm_operations_struct dummy_vm_ops = {}; > @@ -620,6 +699,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) > vma->vm_mm = mm; > vma->vm_ops = &dummy_vm_ops; > INIT_LIST_HEAD(&vma->anon_vma_chain); > + vma_init_lock(vma); > } > > static inline void vma_set_anonymous(struct vm_area_struct *vma) > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index d5cdec1314fe..5f7c5ca89931 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -555,6 +555,11 @@ struct vm_area_struct { > pgprot_t vm_page_prot; > unsigned long vm_flags; /* Flags, see mm.h. */ > > +#ifdef CONFIG_PER_VMA_LOCK > + int vm_lock_seq; > + struct rw_semaphore lock; > +#endif > + > /* > * For areas with an address space and backing store, > * linkage into the address_space->i_mmap interval tree. 
> @@ -680,6 +685,9 @@ struct mm_struct { > * init_mm.mmlist, and are protected > * by mmlist_lock > */ > +#ifdef CONFIG_PER_VMA_LOCK > + int mm_lock_seq; > +#endif > > > unsigned long hiwater_rss; /* High-watermark of RSS usage */ > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > index e49ba91bb1f0..40facd4c398b 100644 > --- a/include/linux/mmap_lock.h > +++ b/include/linux/mmap_lock.h > @@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) > VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); > } > > +#ifdef CONFIG_PER_VMA_LOCK > +static inline void vma_write_unlock_mm(struct mm_struct *mm) > +{ > + mmap_assert_write_locked(mm); > + /* No races during update due to exclusive mmap_lock being held */ > + WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); > +} > +#else > +static inline void vma_write_unlock_mm(struct mm_struct *mm) {} > +#endif > + > static inline void mmap_init_lock(struct mm_struct *mm) > { > init_rwsem(&mm->mmap_lock); > @@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm) > static inline void mmap_write_unlock(struct mm_struct *mm) > { > __mmap_lock_trace_released(mm, true); > + vma_write_unlock_mm(mm); > up_write(&mm->mmap_lock); > } > > static inline void mmap_write_downgrade(struct mm_struct *mm) > { > __mmap_lock_trace_acquire_returned(mm, false, true); > + vma_write_unlock_mm(mm); > downgrade_write(&mm->mmap_lock); > } > > diff --git a/kernel/fork.c b/kernel/fork.c > index 5986817f393c..c026d75108b3 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) > */ > *new = data_race(*orig); > INIT_LIST_HEAD(&new->anon_vma_chain); > + vma_init_lock(new); > dup_anon_vma_name(orig, new); > } > return new; > @@ -1145,6 +1146,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, > seqcount_init(&mm->write_protect_seq); > mmap_init_lock(mm); > INIT_LIST_HEAD(&mm->mmlist); > +#ifdef CONFIG_PER_VMA_LOCK > + WRITE_ONCE(mm->mm_lock_seq, 0); > +#endif > mm_pgtables_bytes_init(mm); > mm->map_count = 0; > mm->locked_vm = 0; > diff --git a/mm/init-mm.c b/mm/init-mm.c > index c9327abb771c..33269314e060 100644 > --- a/mm/init-mm.c > +++ b/mm/init-mm.c > @@ -37,6 +37,9 @@ struct mm_struct init_mm = { > .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), > .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), > .mmlist = LIST_HEAD_INIT(init_mm.mmlist), > +#ifdef CONFIG_PER_VMA_LOCK > + .mm_lock_seq = 0, > +#endif > .user_ns = &init_user_ns, > .cpu_bitmap = CPU_BITS_NONE, > #ifdef CONFIG_IOMMU_SVA > -- > 2.39.0
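For orientation, a minimal sketch of how a fault path is meant to consume this API; the helper name and the fallback policy below are assumptions for illustration, not taken from this patch:

```c
#include <linux/mm.h>

/*
 * Sketch only: try the per-VMA read lock and make the caller retry under
 * mmap_lock on any failure, including the allowed false "locked" results.
 * handle_fault_locked() is a hypothetical placeholder for the real fault work.
 */
static vm_fault_t handle_fault_locked(struct vm_area_struct *vma,
				      unsigned long addr);

static vm_fault_t do_fault_under_vma_lock(struct vm_area_struct *vma,
					  unsigned long addr)
{
	vm_fault_t ret;

	/* A false "locked" result simply sends us down the slow path. */
	if (!vma_read_trylock(vma))
		return VM_FAULT_RETRY;

	ret = handle_fault_locked(vma, addr);
	vma_read_unlock(vma);
	return ret;
}
```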
On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote: > diff --git a/kernel/fork.c b/kernel/fork.c > index 5986817f393c..c026d75108b3 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) > */ > *new = data_race(*orig); > INIT_LIST_HEAD(&new->anon_vma_chain); > + vma_init_lock(new); > dup_anon_vma_name(orig, new); > } > return new; > @@ -1145,6 +1146,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, > seqcount_init(&mm->write_protect_seq); > mmap_init_lock(mm); > INIT_LIST_HEAD(&mm->mmlist); > +#ifdef CONFIG_PER_VMA_LOCK > + WRITE_ONCE(mm->mm_lock_seq, 0); > +#endif The mm shouldn't be visible so why WRITE_ONCE?
On Tue 17-01-23 16:04:26, Michal Hocko wrote: > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote: > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > instead of mmap_lock. Because there are cases when multiple VMAs need > > to be exclusively locked during VMA tree modifications, instead of the > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > locked. > > I have to say I was struggling a bit with the above and only understood > what you mean by reading the patch several times. I would phrase it like > this (feel free to use if you consider this to be an improvement). > > Introduce a per-VMA rw_semaphore. The lock implementation relies on a > per-vma and per-mm sequence counters to note exclusive locking: > - read lock - (implemented by vma_read_trylock) requires the the > vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to > differ. If they match then there must be a vma exclusive lock > held somewhere. > - read unlock - (implemented by vma_read_unlock) is a trivial > vma->lock unlock. > - write lock - (vma_write_lock) requires the mmap_lock to be > held exclusively and the current mm counter is noted to the vma > side. This will allow multiple vmas to be locked under a single > mmap_lock write lock (e.g. during vma merging). The vma counter > is modified under exclusive vma lock. Didn't realize one more thing. Unlike standard write lock this implementation allows to be called multiple times under a single mmap_lock. In a sense it is more of mark_vma_potentially_modified than a lock. > - write unlock - (vma_write_unlock_mm) is a batch release of all > vma locks held. It doesn't pair with a specific > vma_write_lock! It is done before exclusive mmap_lock is > released by incrementing mm sequence counter (mm_lock_seq). > - write downgrade - if the mmap_lock is downgraded to the read > lock all vma write locks are released as well (effectivelly > same as write unlock).
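In code, the behaviour Michal describes looks roughly like the sketch below; the merge helper is invented purely for illustration, and only the vma_write_lock()/mmap_write_unlock() interplay comes from the patch:

```c
/* Illustrative only: several VMAs "write locked" under one mmap_lock. */
static void merge_adjacent_vmas(struct mm_struct *mm,
				struct vm_area_struct *prev,
				struct vm_area_struct *next)
{
	mmap_assert_write_locked(mm);

	/* Both calls are idempotent within this mmap_write_lock section:
	 * once vm_lock_seq == mm_lock_seq they return immediately. */
	vma_write_lock(prev);
	vma_write_lock(next);

	/* ... adjust boundaries, update the VMA tree ... */

	/* No per-VMA unlock exists; the eventual mmap_write_unlock(mm)
	 * bumps mm_lock_seq and releases both marks at once. */
}
```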
+locking maintainers On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > Introduce a per-VMA rw_semaphore to be used during page fault handling > instead of mmap_lock. Because there are cases when multiple VMAs need > to be exclusively locked during VMA tree modifications, instead of the > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > mmap_write_lock holder is done with all modifications and drops mmap_lock, > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > locked. [...] > +static inline void vma_read_unlock(struct vm_area_struct *vma) > +{ > + up_read(&vma->lock); > +} One thing that might be gnarly here is that I think you might not be allowed to use up_read() to fully release ownership of an object - from what I remember, I think that up_read() (unlike something like spin_unlock()) can access the lock object after it's already been acquired by someone else. So if you want to protect against concurrent deletion, this might have to be something like: rcu_read_lock(); /* keeps vma alive */ up_read(&vma->lock); rcu_read_unlock(); But I'm not entirely sure about that, the locking folks might know better. Also, it might not matter given that the rw_semaphore part is removed in the current patch 41/41 anyway...
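Concretely, the suggested fix amounts to the following, assuming (as the rest of the series does) that VMAs are freed via call_rcu():

```c
static inline void vma_read_unlock(struct vm_area_struct *vma)
{
	/* Keep the vma alive while up_read() may still touch vma->lock
	 * after a racing writer has already been granted the lock. */
	rcu_read_lock();
	up_read(&vma->lock);
	rcu_read_unlock();
}
```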
On Tue, Jan 17, 2023 at 7:04 AM Michal Hocko <mhocko@suse.com> wrote: > > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote: > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > instead of mmap_lock. Because there are cases when multiple VMAs need > > to be exclusively locked during VMA tree modifications, instead of the > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > locked. > > I have to say I was struggling a bit with the above and only understood > what you mean by reading the patch several times. I would phrase it like > this (feel free to use if you consider this to be an improvement). > > Introduce a per-VMA rw_semaphore. The lock implementation relies on a > per-vma and per-mm sequence counters to note exclusive locking: > - read lock - (implemented by vma_read_trylock) requires the the > vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to > differ. If they match then there must be a vma exclusive lock > held somewhere. > - read unlock - (implemented by vma_read_unlock) is a trivial > vma->lock unlock. > - write lock - (vma_write_lock) requires the mmap_lock to be > held exclusively and the current mm counter is noted to the vma > side. This will allow multiple vmas to be locked under a single > mmap_lock write lock (e.g. during vma merging). The vma counter > is modified under exclusive vma lock. > - write unlock - (vma_write_unlock_mm) is a batch release of all > vma locks held. It doesn't pair with a specific > vma_write_lock! It is done before exclusive mmap_lock is > released by incrementing mm sequence counter (mm_lock_seq). > - write downgrade - if the mmap_lock is downgraded to the read > lock all vma write locks are released as well (effectivelly > same as write unlock). Thanks for the suggestion, Michal. I'll definitely reuse your description. > > > VMA lock is placed on the cache line boundary so that its 'count' field > > falls into the first cache line while the rest of the fields fall into > > the second cache line. This lets the 'count' field to be cached with > > other frequently accessed fields and used quickly in uncontended case > > while 'owner' and other fields used in the contended case will not > > invalidate the first cache line while waiting on the lock. 
> > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > --- > > include/linux/mm.h | 80 +++++++++++++++++++++++++++++++++++++++ > > include/linux/mm_types.h | 8 ++++ > > include/linux/mmap_lock.h | 13 +++++++ > > kernel/fork.c | 4 ++ > > mm/init-mm.c | 3 ++ > > 5 files changed, 108 insertions(+) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index f3f196e4d66d..ec2c4c227d51 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -612,6 +612,85 @@ struct vm_operations_struct { > > unsigned long addr); > > }; > > > > +#ifdef CONFIG_PER_VMA_LOCK > > +static inline void vma_init_lock(struct vm_area_struct *vma) > > +{ > > + init_rwsem(&vma->lock); > > + vma->vm_lock_seq = -1; > > +} > > + > > +static inline void vma_write_lock(struct vm_area_struct *vma) > > +{ > > + int mm_lock_seq; > > + > > + mmap_assert_write_locked(vma->vm_mm); > > + > > + /* > > + * current task is holding mmap_write_lock, both vma->vm_lock_seq and > > + * mm->mm_lock_seq can't be concurrently modified. > > + */ > > + mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); > > + if (vma->vm_lock_seq == mm_lock_seq) > > + return; > > + > > + down_write(&vma->lock); > > + vma->vm_lock_seq = mm_lock_seq; > > + up_write(&vma->lock); > > +} > > + > > +/* > > + * Try to read-lock a vma. The function is allowed to occasionally yield false > > + * locked result to avoid performance overhead, in which case we fall back to > > + * using mmap_lock. The function should never yield false unlocked result. > > + */ > > +static inline bool vma_read_trylock(struct vm_area_struct *vma) > > +{ > > + /* Check before locking. A race might cause false locked result. */ > > + if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) > > + return false; > > + > > + if (unlikely(down_read_trylock(&vma->lock) == 0)) > > + return false; > > + > > + /* > > + * Overflow might produce false locked result. > > + * False unlocked result is impossible because we modify and check > > + * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq > > + * modification invalidates all existing locks. > > + */ > > + if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { > > + up_read(&vma->lock); > > + return false; > > + } > > + return true; > > +} > > + > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > +{ > > + up_read(&vma->lock); > > +} > > + > > +static inline void vma_assert_write_locked(struct vm_area_struct *vma) > > +{ > > + mmap_assert_write_locked(vma->vm_mm); > > + /* > > + * current task is holding mmap_write_lock, both vma->vm_lock_seq and > > + * mm->mm_lock_seq can't be concurrently modified. 
> > + */ > > + VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma); > > +} > > + > > +#else /* CONFIG_PER_VMA_LOCK */ > > + > > +static inline void vma_init_lock(struct vm_area_struct *vma) {} > > +static inline void vma_write_lock(struct vm_area_struct *vma) {} > > +static inline bool vma_read_trylock(struct vm_area_struct *vma) > > + { return false; } > > +static inline void vma_read_unlock(struct vm_area_struct *vma) {} > > +static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} > > + > > +#endif /* CONFIG_PER_VMA_LOCK */ > > + > > static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) > > { > > static const struct vm_operations_struct dummy_vm_ops = {}; > > @@ -620,6 +699,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) > > vma->vm_mm = mm; > > vma->vm_ops = &dummy_vm_ops; > > INIT_LIST_HEAD(&vma->anon_vma_chain); > > + vma_init_lock(vma); > > } > > > > static inline void vma_set_anonymous(struct vm_area_struct *vma) > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > > index d5cdec1314fe..5f7c5ca89931 100644 > > --- a/include/linux/mm_types.h > > +++ b/include/linux/mm_types.h > > @@ -555,6 +555,11 @@ struct vm_area_struct { > > pgprot_t vm_page_prot; > > unsigned long vm_flags; /* Flags, see mm.h. */ > > > > +#ifdef CONFIG_PER_VMA_LOCK > > + int vm_lock_seq; > > + struct rw_semaphore lock; > > +#endif > > + > > /* > > * For areas with an address space and backing store, > > * linkage into the address_space->i_mmap interval tree. > > @@ -680,6 +685,9 @@ struct mm_struct { > > * init_mm.mmlist, and are protected > > * by mmlist_lock > > */ > > +#ifdef CONFIG_PER_VMA_LOCK > > + int mm_lock_seq; > > +#endif > > > > > > unsigned long hiwater_rss; /* High-watermark of RSS usage */ > > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > > index e49ba91bb1f0..40facd4c398b 100644 > > --- a/include/linux/mmap_lock.h > > +++ b/include/linux/mmap_lock.h > > @@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) > > VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); > > } > > > > +#ifdef CONFIG_PER_VMA_LOCK > > +static inline void vma_write_unlock_mm(struct mm_struct *mm) > > +{ > > + mmap_assert_write_locked(mm); > > + /* No races during update due to exclusive mmap_lock being held */ > > + WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); > > +} > > +#else > > +static inline void vma_write_unlock_mm(struct mm_struct *mm) {} > > +#endif > > + > > static inline void mmap_init_lock(struct mm_struct *mm) > > { > > init_rwsem(&mm->mmap_lock); > > @@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm) > > static inline void mmap_write_unlock(struct mm_struct *mm) > > { > > __mmap_lock_trace_released(mm, true); > > + vma_write_unlock_mm(mm); > > up_write(&mm->mmap_lock); > > } > > > > static inline void mmap_write_downgrade(struct mm_struct *mm) > > { > > __mmap_lock_trace_acquire_returned(mm, false, true); > > + vma_write_unlock_mm(mm); > > downgrade_write(&mm->mmap_lock); > > } > > > > diff --git a/kernel/fork.c b/kernel/fork.c > > index 5986817f393c..c026d75108b3 100644 > > --- a/kernel/fork.c > > +++ b/kernel/fork.c > > @@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) > > */ > > *new = data_race(*orig); > > INIT_LIST_HEAD(&new->anon_vma_chain); > > + vma_init_lock(new); > > dup_anon_vma_name(orig, new); > > } > > return new; > > @@ -1145,6 +1146,9 @@ static struct mm_struct 
*mm_init(struct mm_struct *mm, struct task_struct *p, > > seqcount_init(&mm->write_protect_seq); > > mmap_init_lock(mm); > > INIT_LIST_HEAD(&mm->mmlist); > > +#ifdef CONFIG_PER_VMA_LOCK > > + WRITE_ONCE(mm->mm_lock_seq, 0); > > +#endif > > mm_pgtables_bytes_init(mm); > > mm->map_count = 0; > > mm->locked_vm = 0; > > diff --git a/mm/init-mm.c b/mm/init-mm.c > > index c9327abb771c..33269314e060 100644 > > --- a/mm/init-mm.c > > +++ b/mm/init-mm.c > > @@ -37,6 +37,9 @@ struct mm_struct init_mm = { > > .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), > > .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), > > .mmlist = LIST_HEAD_INIT(init_mm.mmlist), > > +#ifdef CONFIG_PER_VMA_LOCK > > + .mm_lock_seq = 0, > > +#endif > > .user_ns = &init_user_ns, > > .cpu_bitmap = CPU_BITS_NONE, > > #ifdef CONFIG_IOMMU_SVA > > -- > > 2.39.0 > > -- > Michal Hocko > SUSE Labs
On Tue, Jan 17, 2023 at 7:07 AM Michal Hocko <mhocko@suse.com> wrote: > > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote: > > diff --git a/kernel/fork.c b/kernel/fork.c > > index 5986817f393c..c026d75108b3 100644 > > --- a/kernel/fork.c > > +++ b/kernel/fork.c > > @@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) > > */ > > *new = data_race(*orig); > > INIT_LIST_HEAD(&new->anon_vma_chain); > > + vma_init_lock(new); > > dup_anon_vma_name(orig, new); > > } > > return new; > > @@ -1145,6 +1146,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, > > seqcount_init(&mm->write_protect_seq); > > mmap_init_lock(mm); > > INIT_LIST_HEAD(&mm->mmlist); > > +#ifdef CONFIG_PER_VMA_LOCK > > + WRITE_ONCE(mm->mm_lock_seq, 0); > > +#endif > > The mm shouldn't be visible so why WRITE_ONCE? True. Will change to a simple assignment. > > -- > Michal Hocko > SUSE Labs
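Concretely, the mm_init() hunk would become a plain store, since the new mm is not yet visible to any other thread:

```c
#ifdef CONFIG_PER_VMA_LOCK
	mm->mm_lock_seq = 0;	/* mm not yet shared; WRITE_ONCE is unnecessary */
#endif
```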
On Tue, Jan 17, 2023 at 7:12 AM Michal Hocko <mhocko@suse.com> wrote: > > On Tue 17-01-23 16:04:26, Michal Hocko wrote: > > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote: > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > to be exclusively locked during VMA tree modifications, instead of the > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > locked. > > > > I have to say I was struggling a bit with the above and only understood > > what you mean by reading the patch several times. I would phrase it like > > this (feel free to use if you consider this to be an improvement). > > > > Introduce a per-VMA rw_semaphore. The lock implementation relies on a > > per-vma and per-mm sequence counters to note exclusive locking: > > - read lock - (implemented by vma_read_trylock) requires the the > > vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to > > differ. If they match then there must be a vma exclusive lock > > held somewhere. > > - read unlock - (implemented by vma_read_unlock) is a trivial > > vma->lock unlock. > > - write lock - (vma_write_lock) requires the mmap_lock to be > > held exclusively and the current mm counter is noted to the vma > > side. This will allow multiple vmas to be locked under a single > > mmap_lock write lock (e.g. during vma merging). The vma counter > > is modified under exclusive vma lock. > > Didn't realize one more thing. > Unlike standard write lock this implementation allows to be > called multiple times under a single mmap_lock. In a sense > it is more of mark_vma_potentially_modified than a lock. In the RFC it was called vma_mark_locked() originally and renames were discussed in the email thread ending here: https://lore.kernel.org/all/621612d7-c537-3971-9520-a3dec7b43cb4@suse.cz/. If other names are preferable I'm open to changing them. > > > - write unlock - (vma_write_unlock_mm) is a batch release of all > > vma locks held. It doesn't pair with a specific > > vma_write_lock! It is done before exclusive mmap_lock is > > released by incrementing mm sequence counter (mm_lock_seq). > > - write downgrade - if the mmap_lock is downgraded to the read > > lock all vma write locks are released as well (effectivelly > > same as write unlock). > -- > Michal Hocko > SUSE Labs
On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote: > > +locking maintainers Thanks! I'll CC the locking maintainers in the next posting. > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > instead of mmap_lock. Because there are cases when multiple VMAs need > > to be exclusively locked during VMA tree modifications, instead of the > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > locked. > [...] > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > +{ > > + up_read(&vma->lock); > > +} > > One thing that might be gnarly here is that I think you might not be > allowed to use up_read() to fully release ownership of an object - > from what I remember, I think that up_read() (unlike something like > spin_unlock()) can access the lock object after it's already been > acquired by someone else. So if you want to protect against concurrent > deletion, this might have to be something like: > > rcu_read_lock(); /* keeps vma alive */ > up_read(&vma->lock); > rcu_read_unlock(); But for deleting VMA one would need to write-lock the vma->lock first, which I assume can't happen until this up_read() is complete. Is that assumption wrong? > > But I'm not entirely sure about that, the locking folks might know better. > > Also, it might not matter given that the rw_semaphore part is removed > in the current patch 41/41 anyway... This does matter because Michal suggested dropping that last 41/41 patch for now.
On Tue, Jan 17, 2023 at 10:28 PM Suren Baghdasaryan <surenb@google.com> wrote: > On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote: > > > > +locking maintainers > > Thanks! I'll CC the locking maintainers in the next posting. > > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > to be exclusively locked during VMA tree modifications, instead of the > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > locked. > > [...] > > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > > +{ > > > + up_read(&vma->lock); > > > +} > > > > One thing that might be gnarly here is that I think you might not be > > allowed to use up_read() to fully release ownership of an object - > > from what I remember, I think that up_read() (unlike something like > > spin_unlock()) can access the lock object after it's already been > > acquired by someone else. So if you want to protect against concurrent > > deletion, this might have to be something like: > > > > rcu_read_lock(); /* keeps vma alive */ > > up_read(&vma->lock); > > rcu_read_unlock(); > > But for deleting VMA one would need to write-lock the vma->lock first, > which I assume can't happen until this up_read() is complete. Is that > assumption wrong? __up_read() does: rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) { clear_nonspinnable(sem); rwsem_wake(sem); } The atomic_long_add_return_release() is the point where we are doing the main lock-releasing. So if a reader dropped the read-lock while someone else was waiting on the lock (RWSEM_FLAG_WAITERS) and no other readers were holding the lock together with it, the reader also does clear_nonspinnable() and rwsem_wake() afterwards. But in rwsem_down_write_slowpath(), after we've set RWSEM_FLAG_WAITERS, we can return successfully immediately once rwsem_try_write_lock() sees that there are no active readers or writers anymore (if RWSEM_LOCK_MASK is unset and the cmpxchg succeeds). We're not necessarily waiting for the "nonspinnable" bit or the wake. So yeah, I think down_write() can return successfully before up_read() is done with its memory accesses. (Spinlocks are different - the kernel relies on being able to drop references via spin_unlock() in some places.)
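A condensed view of that interleaving (one possible reading, for illustration only):

```c
/*
 *   reader: vma_read_unlock()          writer: down_write(&vma->lock)
 *   --------------------------         ------------------------------
 *   atomic_long_add_return_release()
 *     last reader gone, but
 *     RWSEM_FLAG_WAITERS still set
 *                                      rwsem_try_write_lock() succeeds
 *                                      down_write() returns
 *                                      ... vma torn down / freed ...
 *   clear_nonspinnable(&vma->lock)     <- touches freed memory
 *   rwsem_wake(&vma->lock)             <- likewise
 */
```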
On Tue, Jan 17, 2023 at 01:21:47PM -0800, Suren Baghdasaryan wrote: > On Tue, Jan 17, 2023 at 7:12 AM Michal Hocko <mhocko@suse.com> wrote: > > > > On Tue 17-01-23 16:04:26, Michal Hocko wrote: > > > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote: > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > > to be exclusively locked during VMA tree modifications, instead of the > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > > locked. > > > > > > I have to say I was struggling a bit with the above and only understood > > > what you mean by reading the patch several times. I would phrase it like > > > this (feel free to use if you consider this to be an improvement). > > > > > > Introduce a per-VMA rw_semaphore. The lock implementation relies on a > > > per-vma and per-mm sequence counters to note exclusive locking: > > > - read lock - (implemented by vma_read_trylock) requires the the > > > vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to > > > differ. If they match then there must be a vma exclusive lock > > > held somewhere. > > > - read unlock - (implemented by vma_read_unlock) is a trivial > > > vma->lock unlock. > > > - write lock - (vma_write_lock) requires the mmap_lock to be > > > held exclusively and the current mm counter is noted to the vma > > > side. This will allow multiple vmas to be locked under a single > > > mmap_lock write lock (e.g. during vma merging). The vma counter > > > is modified under exclusive vma lock. > > > > Didn't realize one more thing. > > Unlike standard write lock this implementation allows to be > > called multiple times under a single mmap_lock. In a sense > > it is more of mark_vma_potentially_modified than a lock. > > In the RFC it was called vma_mark_locked() originally and renames were > discussed in the email thread ending here: > https://lore.kernel.org/all/621612d7-c537-3971-9520-a3dec7b43cb4@suse.cz/. > If other names are preferable I'm open to changing them. I don't want to bikeshed this, but rather than locking it seems to be more: vma_start_read() vma_end_read() vma_start_write() vma_end_write() vma_downgrade_write() ... and that these are _implemented_ with locks (in part) is an implementation detail? Would that reduce people's confusion? > > > > > - write unlock - (vma_write_unlock_mm) is a batch release of all > > > vma locks held. It doesn't pair with a specific > > > vma_write_lock! It is done before exclusive mmap_lock is > > > released by incrementing mm sequence counter (mm_lock_seq). > > > - write downgrade - if the mmap_lock is downgraded to the read > > > lock all vma write locks are released as well (effectivelly > > > same as write unlock). > > -- > > Michal Hocko > > SUSE Labs
On Tue, Jan 17, 2023 at 1:54 PM Matthew Wilcox <willy@infradead.org> wrote: > > On Tue, Jan 17, 2023 at 01:21:47PM -0800, Suren Baghdasaryan wrote: > > On Tue, Jan 17, 2023 at 7:12 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > On Tue 17-01-23 16:04:26, Michal Hocko wrote: > > > > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote: > > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > > > to be exclusively locked during VMA tree modifications, instead of the > > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > > > locked. > > > > > > > > I have to say I was struggling a bit with the above and only understood > > > > what you mean by reading the patch several times. I would phrase it like > > > > this (feel free to use if you consider this to be an improvement). > > > > > > > > Introduce a per-VMA rw_semaphore. The lock implementation relies on a > > > > per-vma and per-mm sequence counters to note exclusive locking: > > > > - read lock - (implemented by vma_read_trylock) requires the the > > > > vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to > > > > differ. If they match then there must be a vma exclusive lock > > > > held somewhere. > > > > - read unlock - (implemented by vma_read_unlock) is a trivial > > > > vma->lock unlock. > > > > - write lock - (vma_write_lock) requires the mmap_lock to be > > > > held exclusively and the current mm counter is noted to the vma > > > > side. This will allow multiple vmas to be locked under a single > > > > mmap_lock write lock (e.g. during vma merging). The vma counter > > > > is modified under exclusive vma lock. > > > > > > Didn't realize one more thing. > > > Unlike standard write lock this implementation allows to be > > > called multiple times under a single mmap_lock. In a sense > > > it is more of mark_vma_potentially_modified than a lock. > > > > In the RFC it was called vma_mark_locked() originally and renames were > > discussed in the email thread ending here: > > https://lore.kernel.org/all/621612d7-c537-3971-9520-a3dec7b43cb4@suse.cz/. > > If other names are preferable I'm open to changing them. > > I don't want to bikeshed this, but rather than locking it seems to be > more: > > vma_start_read() > vma_end_read() > vma_start_write() > vma_end_write() > vma_downgrade_write() Couple corrections, we would have to have vma_start_tryread() and vma_end_write_all(). Also there is no vma_downgrade_write(). mmap_write_downgrade() simply does vma_end_write_all(). > > ... and that these are _implemented_ with locks (in part) is an > implementation detail? > > Would that reduce people's confusion? > > > > > > > > - write unlock - (vma_write_unlock_mm) is a batch release of all > > > > vma locks held. It doesn't pair with a specific > > > > vma_write_lock! It is done before exclusive mmap_lock is > > > > released by incrementing mm sequence counter (mm_lock_seq). > > > > - write downgrade - if the mmap_lock is downgraded to the read > > > > lock all vma write locks are released as well (effectivelly > > > > same as write unlock). > > > -- > > > Michal Hocko > > > SUSE Labs
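Mapped onto this patch, the proposed names (with those corrections) would be roughly the following thin wrappers, shown only to make the correspondence explicit:

```c
/* Names under discussion; the bodies are just this patch's primitives. */
static inline bool vma_start_tryread(struct vm_area_struct *vma)
{
	return vma_read_trylock(vma);
}

static inline void vma_end_read(struct vm_area_struct *vma)
{
	vma_read_unlock(vma);
}

static inline void vma_start_write(struct vm_area_struct *vma)
{
	vma_write_lock(vma);
}

/* Batch release; pairs with mmap_write_unlock()/downgrade, not with a
 * particular vma_start_write(). */
static inline void vma_end_write_all(struct mm_struct *mm)
{
	vma_write_unlock_mm(mm);
}
```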
On Tue, Jan 17, 2023 at 1:46 PM Jann Horn <jannh@google.com> wrote: > > On Tue, Jan 17, 2023 at 10:28 PM Suren Baghdasaryan <surenb@google.com> wrote: > > On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote: > > > > > > +locking maintainers > > > > Thanks! I'll CC the locking maintainers in the next posting. > > > > > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > > to be exclusively locked during VMA tree modifications, instead of the > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > > locked. > > > [...] > > > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > > > +{ > > > > + up_read(&vma->lock); > > > > +} > > > > > > One thing that might be gnarly here is that I think you might not be > > > allowed to use up_read() to fully release ownership of an object - > > > from what I remember, I think that up_read() (unlike something like > > > spin_unlock()) can access the lock object after it's already been > > > acquired by someone else. So if you want to protect against concurrent > > > deletion, this might have to be something like: > > > > > > rcu_read_lock(); /* keeps vma alive */ > > > up_read(&vma->lock); > > > rcu_read_unlock(); > > > > But for deleting VMA one would need to write-lock the vma->lock first, > > which I assume can't happen until this up_read() is complete. Is that > > assumption wrong? > > __up_read() does: > > rwsem_clear_reader_owned(sem); > tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); > DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); > if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == > RWSEM_FLAG_WAITERS)) { > clear_nonspinnable(sem); > rwsem_wake(sem); > } > > The atomic_long_add_return_release() is the point where we are doing > the main lock-releasing. > > So if a reader dropped the read-lock while someone else was waiting on > the lock (RWSEM_FLAG_WAITERS) and no other readers were holding the > lock together with it, the reader also does clear_nonspinnable() and > rwsem_wake() afterwards. > But in rwsem_down_write_slowpath(), after we've set > RWSEM_FLAG_WAITERS, we can return successfully immediately once > rwsem_try_write_lock() sees that there are no active readers or > writers anymore (if RWSEM_LOCK_MASK is unset and the cmpxchg > succeeds). We're not necessarily waiting for the "nonspinnable" bit or > the wake. > > So yeah, I think down_write() can return successfully before up_read() > is done with its memory accesses. > > (Spinlocks are different - the kernel relies on being able to drop > references via spin_unlock() in some places.) Thanks for bringing this up. I can add rcu_read_{lock/unlock) as you suggested and that would fix the issue because we free VMAs from call_rcu(). However this feels to me as an issue of rw_semaphore design that this locking pattern is unsafe and might lead to UAF.
On Tue, Jan 17, 2023 at 02:36:47PM -0800, Suren Baghdasaryan wrote: > On Tue, Jan 17, 2023 at 1:46 PM Jann Horn <jannh@google.com> wrote: > > On Tue, Jan 17, 2023 at 10:28 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote: > > > > One thing that might be gnarly here is that I think you might not be > > > > allowed to use up_read() to fully release ownership of an object - > > > > from what I remember, I think that up_read() (unlike something like > > > > spin_unlock()) can access the lock object after it's already been > > > > acquired by someone else. So if you want to protect against concurrent > > > > deletion, this might have to be something like: > > > > > > > > rcu_read_lock(); /* keeps vma alive */ > > > > up_read(&vma->lock); > > > > rcu_read_unlock(); > > > > > > But for deleting VMA one would need to write-lock the vma->lock first, > > > which I assume can't happen until this up_read() is complete. Is that > > > assumption wrong? > > > > __up_read() does: > > > > rwsem_clear_reader_owned(sem); > > tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); > > DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); > > if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == > > RWSEM_FLAG_WAITERS)) { > > clear_nonspinnable(sem); > > rwsem_wake(sem); > > } > > > > The atomic_long_add_return_release() is the point where we are doing > > the main lock-releasing. > > > > So if a reader dropped the read-lock while someone else was waiting on > > the lock (RWSEM_FLAG_WAITERS) and no other readers were holding the > > lock together with it, the reader also does clear_nonspinnable() and > > rwsem_wake() afterwards. > > But in rwsem_down_write_slowpath(), after we've set > > RWSEM_FLAG_WAITERS, we can return successfully immediately once > > rwsem_try_write_lock() sees that there are no active readers or > > writers anymore (if RWSEM_LOCK_MASK is unset and the cmpxchg > > succeeds). We're not necessarily waiting for the "nonspinnable" bit or > > the wake. > > > > So yeah, I think down_write() can return successfully before up_read() > > is done with its memory accesses. > > > > (Spinlocks are different - the kernel relies on being able to drop > > references via spin_unlock() in some places.) > > Thanks for bringing this up. I can add rcu_read_{lock/unlock) as you > suggested and that would fix the issue because we free VMAs from > call_rcu(). However this feels to me as an issue of rw_semaphore > design that this locking pattern is unsafe and might lead to UAF. We have/had this problem with normal mutexes too. It was the impetus for adding the struct completion which is very careful to not touch anything after the completion is, well, completed.
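For comparison, the completion pattern being referred to; the surrounding structure is invented for illustration, but the "don't touch it after completing" property is exactly what makes the final free safe:

```c
#include <linux/completion.h>
#include <linux/slab.h>

struct work_ctx {
	struct completion done;
	/* ... other state ... */
};

static void finish_work(struct work_ctx *ctx)
{
	/* Safe even if the waiter frees ctx immediately after waking:
	 * complete() does not touch the completion once the waiter can
	 * observe it as done. */
	complete(&ctx->done);
}

static void wait_and_free(struct work_ctx *ctx)
{
	wait_for_completion(&ctx->done);
	kfree(ctx);
}
```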
On Tue 17-01-23 21:54:58, Matthew Wilcox wrote: > On Tue, Jan 17, 2023 at 01:21:47PM -0800, Suren Baghdasaryan wrote: > > On Tue, Jan 17, 2023 at 7:12 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > On Tue 17-01-23 16:04:26, Michal Hocko wrote: > > > > On Mon 09-01-23 12:53:07, Suren Baghdasaryan wrote: > > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > > > to be exclusively locked during VMA tree modifications, instead of the > > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > > > locked. > > > > > > > > I have to say I was struggling a bit with the above and only understood > > > > what you mean by reading the patch several times. I would phrase it like > > > > this (feel free to use if you consider this to be an improvement). > > > > > > > > Introduce a per-VMA rw_semaphore. The lock implementation relies on a > > > > per-vma and per-mm sequence counters to note exclusive locking: > > > > - read lock - (implemented by vma_read_trylock) requires the the > > > > vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to > > > > differ. If they match then there must be a vma exclusive lock > > > > held somewhere. > > > > - read unlock - (implemented by vma_read_unlock) is a trivial > > > > vma->lock unlock. > > > > - write lock - (vma_write_lock) requires the mmap_lock to be > > > > held exclusively and the current mm counter is noted to the vma > > > > side. This will allow multiple vmas to be locked under a single > > > > mmap_lock write lock (e.g. during vma merging). The vma counter > > > > is modified under exclusive vma lock. > > > > > > Didn't realize one more thing. > > > Unlike standard write lock this implementation allows to be > > > called multiple times under a single mmap_lock. In a sense > > > it is more of mark_vma_potentially_modified than a lock. > > > > In the RFC it was called vma_mark_locked() originally and renames were > > discussed in the email thread ending here: > > https://lore.kernel.org/all/621612d7-c537-3971-9520-a3dec7b43cb4@suse.cz/. > > If other names are preferable I'm open to changing them. > > I don't want to bikeshed this, but rather than locking it seems to be > more: > > vma_start_read() > vma_end_read() > vma_start_write() > vma_end_write() > vma_downgrade_write() > > ... and that these are _implemented_ with locks (in part) is an > implementation detail? Agreed! > Would that reduce people's confusion? Yes I believe that naming it less like a locking primitive will clarify it. vma_{start,end}_[try]read is better indeed. I am wondering about the write side of things because that is where things get confusing. There is no explicit write lock nor unlock. vma_start_write sounds better than the vma_write_lock but it still lacks that pairing with vma_end_write which is never the right thing to call. Wouldn't vma_mark_modified and vma_publish_changes describe the scheme better? Downgrade case is probably the least interesting one because that is just one off thing that can be completely hidden from any code besides mmap_write_downgrade so I wouldn't be too concern about that one. But as you say, no need to bikeshed this too much. 
Great naming is hard, and if the scheme is documented properly we can live with suboptimal naming as well.
On Tue 17-01-23 19:02:55, Jann Horn wrote: > +locking maintainers > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > instead of mmap_lock. Because there are cases when multiple VMAs need > > to be exclusively locked during VMA tree modifications, instead of the > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > locked. > [...] > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > +{ > > + up_read(&vma->lock); > > +} > > One thing that might be gnarly here is that I think you might not be > allowed to use up_read() to fully release ownership of an object - > from what I remember, I think that up_read() (unlike something like > spin_unlock()) can access the lock object after it's already been > acquired by someone else. Yes, I think you are right. From a look into the code it seems that the UAF is quite unlikely as there is a ton of work to be done between vma_write_lock used to prepare vma for removal and actual removal. That doesn't make it less of a problem though. > So if you want to protect against concurrent > deletion, this might have to be something like: > > rcu_read_lock(); /* keeps vma alive */ > up_read(&vma->lock); > rcu_read_unlock(); > > But I'm not entirely sure about that, the locking folks might know better. I am not a locking expert but to me it looks like this should work because the final cleanup would have to happen rcu_read_unlock. Thanks, I have completely missed this aspect of the locking when looking into the code. Btw. looking at this again I have fully realized how hard it is actually to see that vm_area_free is guaranteed to sync up with ongoing readers. vma manipulation functions like __adjust_vma make my head spin. Would it make more sense to have a rcu style synchronization point in vm_area_free directly before call_rcu? This would add an overhead of uncontended down_write of course.
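Roughly, that suggestion would look like the sketch below; the rcu head and callback names are assumptions borrowed from the rest of the series, not from this patch:

```c
void vm_area_free(struct vm_area_struct *vma)
{
#ifdef CONFIG_PER_VMA_LOCK
	/* Synchronization point: waits for any reader that is still
	 * inside up_read(&vma->lock) before the free is queued. */
	down_write(&vma->lock);
	up_write(&vma->lock);
#endif
	call_rcu(&vma->vm_rcu, __vm_area_free);	/* names assumed */
}
```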
... > > One thing that might be gnarly here is that I think you might not be > > allowed to use up_read() to fully release ownership of an object - > > from what I remember, I think that up_read() (unlike something like > > spin_unlock()) can access the lock object after it's already been > > acquired by someone else. > > Yes, I think you are right. From a look into the code it seems that > the UAF is quite unlikely as there is a ton of work to be done between > vma_write_lock used to prepare vma for removal and actual removal. > That doesn't make it less of a problem though. All it takes is a hardware interrupt.... Especially if the softint code can also run. David
On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote: > On Tue 17-01-23 19:02:55, Jann Horn wrote: > > +locking maintainers > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > to be exclusively locked during VMA tree modifications, instead of the > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > locked. > > [...] > > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > > +{ > > > + up_read(&vma->lock); > > > +} > > > > One thing that might be gnarly here is that I think you might not be > > allowed to use up_read() to fully release ownership of an object - > > from what I remember, I think that up_read() (unlike something like > > spin_unlock()) can access the lock object after it's already been > > acquired by someone else. > > Yes, I think you are right. From a look into the code it seems that > the UAF is quite unlikely as there is a ton of work to be done between > vma_write_lock used to prepare vma for removal and actual removal. > That doesn't make it less of a problem though. > > > So if you want to protect against concurrent > > deletion, this might have to be something like: > > > > rcu_read_lock(); /* keeps vma alive */ > > up_read(&vma->lock); > > rcu_read_unlock(); > > > > But I'm not entirely sure about that, the locking folks might know better. > > I am not a locking expert but to me it looks like this should work > because the final cleanup would have to happen rcu_read_unlock. > > Thanks, I have completely missed this aspect of the locking when looking > into the code. > > Btw. looking at this again I have fully realized how hard it is actually > to see that vm_area_free is guaranteed to sync up with ongoing readers. > vma manipulation functions like __adjust_vma make my head spin. Would it > make more sense to have a rcu style synchronization point in > vm_area_free directly before call_rcu? This would add an overhead of > uncontended down_write of course. Something along those lines might be a good idea, but I think that rather than synchronizing the removal, it should maybe be something that splats (and bails out?) if it detects pending readers. If we get to vm_area_free() on a VMA that has pending readers, we might already be in a lot of trouble because the concurrent readers might have been traversing page tables while we were tearing them down or fun stuff like that. I think maybe Suren was already talking about something like that in another part of this patch series but I don't remember...
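A sketch of that detect-and-splat idea; whether it matches what the series patch referenced in the follow-up actually does is not verified here, and the rcu head/callback names are again assumed:

```c
void vm_area_free(struct vm_area_struct *vma)
{
#ifdef CONFIG_PER_VMA_LOCK
	/* A VMA with pending readers at free time indicates a locking bug
	 * in the caller; warn loudly rather than quietly waiting.
	 * (Transient false-"locked" readers from vma_read_trylock() would
	 * need to be excluded for this check to be exact.) */
	VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
#endif
	call_rcu(&vma->vm_rcu, __vm_area_free);	/* names assumed as above */
}
```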
On Wed 18-01-23 14:23:32, Jann Horn wrote: > On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote: > > On Tue 17-01-23 19:02:55, Jann Horn wrote: > > > +locking maintainers > > > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > > to be exclusively locked during VMA tree modifications, instead of the > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > > locked. > > > [...] > > > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > > > +{ > > > > + up_read(&vma->lock); > > > > +} > > > > > > One thing that might be gnarly here is that I think you might not be > > > allowed to use up_read() to fully release ownership of an object - > > > from what I remember, I think that up_read() (unlike something like > > > spin_unlock()) can access the lock object after it's already been > > > acquired by someone else. > > > > Yes, I think you are right. From a look into the code it seems that > > the UAF is quite unlikely as there is a ton of work to be done between > > vma_write_lock used to prepare vma for removal and actual removal. > > That doesn't make it less of a problem though. > > > > > So if you want to protect against concurrent > > > deletion, this might have to be something like: > > > > > > rcu_read_lock(); /* keeps vma alive */ > > > up_read(&vma->lock); > > > rcu_read_unlock(); > > > > > > But I'm not entirely sure about that, the locking folks might know better. > > > > I am not a locking expert but to me it looks like this should work > > because the final cleanup would have to happen rcu_read_unlock. > > > > Thanks, I have completely missed this aspect of the locking when looking > > into the code. > > > > Btw. looking at this again I have fully realized how hard it is actually > > to see that vm_area_free is guaranteed to sync up with ongoing readers. > > vma manipulation functions like __adjust_vma make my head spin. Would it > > make more sense to have a rcu style synchronization point in > > vm_area_free directly before call_rcu? This would add an overhead of > > uncontended down_write of course. > > Something along those lines might be a good idea, but I think that > rather than synchronizing the removal, it should maybe be something > that splats (and bails out?) if it detects pending readers. If we get > to vm_area_free() on a VMA that has pending readers, we might already > be in a lot of trouble because the concurrent readers might have been > traversing page tables while we were tearing them down or fun stuff > like that. > > I think maybe Suren was already talking about something like that in > another part of this patch series but I don't remember... This http://lkml.kernel.org/r/20230109205336.3665937-27-surenb@google.com?
On Wed, Jan 18, 2023 at 7:11 AM 'Michal Hocko' via kernel-team <kernel-team@android.com> wrote: > > On Wed 18-01-23 14:23:32, Jann Horn wrote: > > On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote: > > > On Tue 17-01-23 19:02:55, Jann Horn wrote: > > > > +locking maintainers > > > > > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > > > to be exclusively locked during VMA tree modifications, instead of the > > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > > > locked. > > > > [...] > > > > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > > > > +{ > > > > > + up_read(&vma->lock); > > > > > +} > > > > > > > > One thing that might be gnarly here is that I think you might not be > > > > allowed to use up_read() to fully release ownership of an object - > > > > from what I remember, I think that up_read() (unlike something like > > > > spin_unlock()) can access the lock object after it's already been > > > > acquired by someone else. > > > > > > Yes, I think you are right. From a look into the code it seems that > > > the UAF is quite unlikely as there is a ton of work to be done between > > > vma_write_lock used to prepare vma for removal and actual removal. > > > That doesn't make it less of a problem though. > > > > > > > So if you want to protect against concurrent > > > > deletion, this might have to be something like: > > > > > > > > rcu_read_lock(); /* keeps vma alive */ > > > > up_read(&vma->lock); > > > > rcu_read_unlock(); > > > > > > > > But I'm not entirely sure about that, the locking folks might know better. > > > > > > I am not a locking expert but to me it looks like this should work > > > because the final cleanup would have to happen rcu_read_unlock. > > > > > > Thanks, I have completely missed this aspect of the locking when looking > > > into the code. > > > > > > Btw. looking at this again I have fully realized how hard it is actually > > > to see that vm_area_free is guaranteed to sync up with ongoing readers. > > > vma manipulation functions like __adjust_vma make my head spin. Would it > > > make more sense to have a rcu style synchronization point in > > > vm_area_free directly before call_rcu? This would add an overhead of > > > uncontended down_write of course. > > > > Something along those lines might be a good idea, but I think that > > rather than synchronizing the removal, it should maybe be something > > that splats (and bails out?) if it detects pending readers. If we get > > to vm_area_free() on a VMA that has pending readers, we might already > > be in a lot of trouble because the concurrent readers might have been > > traversing page tables while we were tearing them down or fun stuff > > like that. > > > > I think maybe Suren was already talking about something like that in > > another part of this patch series but I don't remember... > > This http://lkml.kernel.org/r/20230109205336.3665937-27-surenb@google.com? 
Yes, I spent a lot of time ensuring that __adjust_vma locks the right VMAs and that VMAs are freed or isolated under VMA write lock protection to exclude any readers. If the VM_BUG_ON_VMA in the patch Michal mentioned gets hit then it's a bug in my design and I'll have to fix it. But please, let's not add synchronize_rcu() in the vm_area_free(). That will slow down any path that frees a VMA, especially the exit path which might be freeing thousands of them. I had an SPF version with synchronize_rcu() in the vm_area_free() and phone vendors started yelling at me the very next day. call_rcu() with CONFIG_RCU_NOCB_CPU (which Android uses for power saving purposes) is already bad enough to show up in the benchmarks and that's why I had to add call_rcu() batching in https://lore.kernel.org/all/20230109205336.3665937-40-surenb@google.com. > > -- > Michal Hocko > SUSE Labs > > -- > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com. >
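To make the cost argument concrete, a rough sketch of the two shapes being compared is below. The function and callback names are made up for illustration; it assumes vm_area_struct carries an rcu_head (called vm_rcu here) and that the vm_area_cachep slab is visible at this point. The batching patch linked above is more involved than this:

/* Blocking variant being objected to: every free waits out a full grace period. */
void vm_area_free_blocking(struct vm_area_struct *vma)
{
	synchronize_rcu();	/* can take milliseconds; exit paths free thousands of VMAs */
	kmem_cache_free(vm_area_cachep, vma);
}

static void vm_area_free_rcu_cb(struct rcu_head *head)
{
	struct vm_area_struct *vma = container_of(head, struct vm_area_struct, vm_rcu);

	kmem_cache_free(vm_area_cachep, vma);
}

/* Deferred variant: returns immediately, the callback runs after a grace period. */
void vm_area_free_deferred(struct vm_area_struct *vma)
{
	call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
}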
On Wed 18-01-23 09:36:44, Suren Baghdasaryan wrote: > On Wed, Jan 18, 2023 at 7:11 AM 'Michal Hocko' via kernel-team > <kernel-team@android.com> wrote: > > > > On Wed 18-01-23 14:23:32, Jann Horn wrote: > > > On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote: > > > > On Tue 17-01-23 19:02:55, Jann Horn wrote: > > > > > +locking maintainers > > > > > > > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > > > > to be exclusively locked during VMA tree modifications, instead of the > > > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > > > > locked. > > > > > [...] > > > > > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > > > > > +{ > > > > > > + up_read(&vma->lock); > > > > > > +} > > > > > > > > > > One thing that might be gnarly here is that I think you might not be > > > > > allowed to use up_read() to fully release ownership of an object - > > > > > from what I remember, I think that up_read() (unlike something like > > > > > spin_unlock()) can access the lock object after it's already been > > > > > acquired by someone else. > > > > > > > > Yes, I think you are right. From a look into the code it seems that > > > > the UAF is quite unlikely as there is a ton of work to be done between > > > > vma_write_lock used to prepare vma for removal and actual removal. > > > > That doesn't make it less of a problem though. > > > > > > > > > So if you want to protect against concurrent > > > > > deletion, this might have to be something like: > > > > > > > > > > rcu_read_lock(); /* keeps vma alive */ > > > > > up_read(&vma->lock); > > > > > rcu_read_unlock(); > > > > > > > > > > But I'm not entirely sure about that, the locking folks might know better. > > > > > > > > I am not a locking expert but to me it looks like this should work > > > > because the final cleanup would have to happen rcu_read_unlock. > > > > > > > > Thanks, I have completely missed this aspect of the locking when looking > > > > into the code. > > > > > > > > Btw. looking at this again I have fully realized how hard it is actually > > > > to see that vm_area_free is guaranteed to sync up with ongoing readers. > > > > vma manipulation functions like __adjust_vma make my head spin. Would it > > > > make more sense to have a rcu style synchronization point in > > > > vm_area_free directly before call_rcu? This would add an overhead of > > > > uncontended down_write of course. > > > > > > Something along those lines might be a good idea, but I think that > > > rather than synchronizing the removal, it should maybe be something > > > that splats (and bails out?) if it detects pending readers. If we get > > > to vm_area_free() on a VMA that has pending readers, we might already > > > be in a lot of trouble because the concurrent readers might have been > > > traversing page tables while we were tearing them down or fun stuff > > > like that. > > > > > > I think maybe Suren was already talking about something like that in > > > another part of this patch series but I don't remember... 
> > > > This http://lkml.kernel.org/r/20230109205336.3665937-27-surenb@google.com? > > Yes, I spent a lot of time ensuring that __adjust_vma locks the right > VMAs and that VMAs are freed or isolated under VMA write lock > protection to exclude any readers. If the VM_BUG_ON_VMA in the patch > Michal mentioned gets hit then it's a bug in my design and I'll have > to fix it. But please, let's not add synchronize_rcu() in the > vm_area_free().

Just to clarify, I didn't suggest adding synchronize_rcu() into vm_area_free. What I really meant was a synchronize_rcu-like primitive to effectively synchronize with any potential pending read locker (so something like vma_write_lock, or whatever it is called). The point is that vma freeing is an event all readers should be notified about. This can be done explicitly for each and every vma before vm_area_free is called, but that is just hard to review and easy to break over time. See my point?
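Concretely, the kind of synchronization point being described could be a small helper called from vm_area_free() right before the VMA is handed to call_rcu(); the helper name below is made up, and the uncontended down_write() is the overhead mentioned earlier:

static void vma_synchronize_readers(struct vm_area_struct *vma)
{
	/*
	 * Any reader still between vma_read_trylock() and vma_read_unlock()
	 * holds vma->lock for read, so this down_write() waits for all of
	 * them to drain before the VMA is queued for freeing.
	 */
	down_write(&vma->lock);
	up_write(&vma->lock);
}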
On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote: > > On Wed 18-01-23 09:36:44, Suren Baghdasaryan wrote: > > On Wed, Jan 18, 2023 at 7:11 AM 'Michal Hocko' via kernel-team > > <kernel-team@android.com> wrote: > > > > > > On Wed 18-01-23 14:23:32, Jann Horn wrote: > > > > On Wed, Jan 18, 2023 at 1:28 PM Michal Hocko <mhocko@suse.com> wrote: > > > > > On Tue 17-01-23 19:02:55, Jann Horn wrote: > > > > > > +locking maintainers > > > > > > > > > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > > > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > > > > > to be exclusively locked during VMA tree modifications, instead of the > > > > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > > > > > locked. > > > > > > [...] > > > > > > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > > > > > > +{ > > > > > > > + up_read(&vma->lock); > > > > > > > +} > > > > > > > > > > > > One thing that might be gnarly here is that I think you might not be > > > > > > allowed to use up_read() to fully release ownership of an object - > > > > > > from what I remember, I think that up_read() (unlike something like > > > > > > spin_unlock()) can access the lock object after it's already been > > > > > > acquired by someone else. > > > > > > > > > > Yes, I think you are right. From a look into the code it seems that > > > > > the UAF is quite unlikely as there is a ton of work to be done between > > > > > vma_write_lock used to prepare vma for removal and actual removal. > > > > > That doesn't make it less of a problem though. > > > > > > > > > > > So if you want to protect against concurrent > > > > > > deletion, this might have to be something like: > > > > > > > > > > > > rcu_read_lock(); /* keeps vma alive */ > > > > > > up_read(&vma->lock); > > > > > > rcu_read_unlock(); > > > > > > > > > > > > But I'm not entirely sure about that, the locking folks might know better. > > > > > > > > > > I am not a locking expert but to me it looks like this should work > > > > > because the final cleanup would have to happen rcu_read_unlock. > > > > > > > > > > Thanks, I have completely missed this aspect of the locking when looking > > > > > into the code. > > > > > > > > > > Btw. looking at this again I have fully realized how hard it is actually > > > > > to see that vm_area_free is guaranteed to sync up with ongoing readers. > > > > > vma manipulation functions like __adjust_vma make my head spin. Would it > > > > > make more sense to have a rcu style synchronization point in > > > > > vm_area_free directly before call_rcu? This would add an overhead of > > > > > uncontended down_write of course. > > > > > > > > Something along those lines might be a good idea, but I think that > > > > rather than synchronizing the removal, it should maybe be something > > > > that splats (and bails out?) if it detects pending readers. 
If we get > > > > to vm_area_free() on a VMA that has pending readers, we might already > > > > be in a lot of trouble because the concurrent readers might have been > > > > traversing page tables while we were tearing them down or fun stuff > > > > like that. > > > > > > > > I think maybe Suren was already talking about something like that in > > > > another part of this patch series but I don't remember... > > > > > > This http://lkml.kernel.org/r/20230109205336.3665937-27-surenb@google.com? > > > > Yes, I spent a lot of time ensuring that __adjust_vma locks the right > > VMAs and that VMAs are freed or isolated under VMA write lock > > protection to exclude any readers. If the VM_BUG_ON_VMA in the patch > > Michal mentioned gets hit then it's a bug in my design and I'll have > > to fix it. But please, let's not add synchronize_rcu() in the > > vm_area_free(). > > Just to clarify. I didn't suggest to add synchronize_rcu into > vm_area_free. What I really meant was synchronize_rcu like primitive to > effectivelly synchronize with any potential pending read locker (so > something like vma_write_lock (or whatever it is called). The point is > that vma freeing is an event all readers should be notified about. I don't think readers need to be notified if we are ensuring that the VMA is not used by anyone else and is not reachable by the readers. This is currently done by write-locking the VMA either before removing it from the tree or before freeing it. > This can be done explicitly for each and every vma before vm_area_free > is called but this is just hard to review and easy to break over time. > See my point? I understand your point now and if we really need that, one way would be to have a VMA refcount (like Laurent had in his version of SPF implementation). I don't think current implementation needs that level of VMA lifetime control unless I missed some location that should take the lock and does not. > > -- > Michal Hocko > SUSE Labs
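For context, the refcount alternative mentioned above would look roughly like the helpers below. The vm_ref_count field and both functions are hypothetical (in the spirit of the earlier SPF proposal) and are not part of this series:

/* Hypothetical: assumes a refcount_t vm_ref_count were added to vm_area_struct. */
static bool vma_get(struct vm_area_struct *vma)
{
	/* Fails once the VMA is already on its way to being freed. */
	return refcount_inc_not_zero(&vma->vm_ref_count);
}

static void vma_put(struct vm_area_struct *vma)
{
	/* Last reference gone: defer the actual free past an RCU grace period. */
	if (refcount_dec_and_test(&vma->vm_ref_count))
		call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);	/* hypothetical callback */
}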
On Tue, Jan 17, 2023 at 10:45:25PM +0100, Jann Horn wrote: Hi Jann, > On Tue, Jan 17, 2023 at 10:28 PM Suren Baghdasaryan <surenb@google.com> wrote: > > On Tue, Jan 17, 2023 at 10:03 AM Jann Horn <jannh@google.com> wrote: > > > > > > +locking maintainers > > > > Thanks! I'll CC the locking maintainers in the next posting. > > > > > > > > On Mon, Jan 9, 2023 at 9:54 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > > Introduce a per-VMA rw_semaphore to be used during page fault handling > > > > instead of mmap_lock. Because there are cases when multiple VMAs need > > > > to be exclusively locked during VMA tree modifications, instead of the > > > > usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock > > > > exclusively and setting vma->lock_seq to the current mm->lock_seq. When > > > > mmap_write_lock holder is done with all modifications and drops mmap_lock, > > > > it will increment mm->lock_seq, effectively unlocking all VMAs marked as > > > > locked. > > > [...] > > > > +static inline void vma_read_unlock(struct vm_area_struct *vma) > > > > +{ > > > > + up_read(&vma->lock); > > > > +} > > > > > > One thing that might be gnarly here is that I think you might not be > > > allowed to use up_read() to fully release ownership of an object - > > > from what I remember, I think that up_read() (unlike something like > > > spin_unlock()) can access the lock object after it's already been > > > acquired by someone else. So if you want to protect against concurrent > > > deletion, this might have to be something like: > > > > > > rcu_read_lock(); /* keeps vma alive */ > > > up_read(&vma->lock); > > > rcu_read_unlock(); > > > > But for deleting VMA one would need to write-lock the vma->lock first, > > which I assume can't happen until this up_read() is complete. Is that > > assumption wrong? > > __up_read() does: > > rwsem_clear_reader_owned(sem); > tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); > DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); > if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == > RWSEM_FLAG_WAITERS)) { > clear_nonspinnable(sem); > rwsem_wake(sem); > } This sequence is covered by preempt_disable()/preempt_enable(). Would not it preserve the RCU grace period until after __up_read() exited? > The atomic_long_add_return_release() is the point where we are doing > the main lock-releasing. > > So if a reader dropped the read-lock while someone else was waiting on > the lock (RWSEM_FLAG_WAITERS) and no other readers were holding the > lock together with it, the reader also does clear_nonspinnable() and > rwsem_wake() afterwards. > But in rwsem_down_write_slowpath(), after we've set > RWSEM_FLAG_WAITERS, we can return successfully immediately once > rwsem_try_write_lock() sees that there are no active readers or > writers anymore (if RWSEM_LOCK_MASK is unset and the cmpxchg > succeeds). We're not necessarily waiting for the "nonspinnable" bit or > the wake. > > So yeah, I think down_write() can return successfully before up_read() > is done with its memory accesses. Thanks!
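Spelled out, the interleaving Jann describes (leaving aside the preempt_disable() question raised above) is roughly:

/*
 *   reader: up_read(&vma->lock)              writer: down_write(&vma->lock)
 *   ---------------------------              ------------------------------
 *   atomic_long_add_return_release()
 *     -> reader count reaches zero
 *                                            rwsem_try_write_lock() sees no
 *                                            active lockers and succeeds;
 *                                            down_write() returns and the
 *                                            writer frees the VMA
 *   tmp still has RWSEM_FLAG_WAITERS set:
 *   clear_nonspinnable(sem)   <- may touch freed memory
 *   rwsem_wake(sem)           <- may touch freed memory
 */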
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f3f196e4d66d..ec2c4c227d51 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -612,6 +612,85 @@ struct vm_operations_struct {
 					unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+static inline void vma_write_lock(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+{
+	/* Check before locking. A race might cause false locked result. */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+		return false;
+
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 */
+	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_read_unlock(struct vm_area_struct *vma)
+{
+	up_read(&vma->lock);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline void vma_write_lock(struct vm_area_struct *vma) {}
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_read_unlock(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -620,6 +699,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d5cdec1314fe..5f7c5ca89931 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -555,6 +555,11 @@ struct vm_area_struct {
 	pgprot_t vm_page_prot;
 	unsigned long vm_flags;		/* Flags, see mm.h. */
 
+#ifdef CONFIG_PER_VMA_LOCK
+	int vm_lock_seq;
+	struct rw_semaphore lock;
+#endif
+
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
@@ -680,6 +685,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..40facd4c398b 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_write_unlock_mm(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_write_unlock_mm(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_write_unlock_mm(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_write_unlock_mm(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5986817f393c..c026d75108b3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		*new = data_race(*orig);
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		dup_anon_vma_name(orig, new);
 	}
 	return new;
@@ -1145,6 +1146,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	WRITE_ONCE(mm->mm_lock_seq, 0);
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
Introduce a per-VMA rw_semaphore to be used during page fault handling instead of mmap_lock. Because there are cases when multiple VMAs need to be exclusively locked during VMA tree modifications, instead of the usual lock/unlock pattern we mark a VMA as locked by taking the per-VMA lock exclusively and setting vma->lock_seq to the current mm->lock_seq. When the mmap_write_lock holder is done with all modifications and drops mmap_lock, it will increment mm->lock_seq, effectively unlocking all VMAs marked as locked.

The VMA lock is placed on a cache line boundary so that its 'count' field falls into the first cache line while the rest of the fields fall into the second cache line. This lets the 'count' field be cached with other frequently accessed fields and used quickly in the uncontended case, while 'owner' and the other fields used in the contended case will not invalidate the first cache line while waiting on the lock.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h        | 80 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  |  8 ++++
 include/linux/mmap_lock.h | 13 +++++++
 kernel/fork.c             |  4 ++
 mm/init-mm.c              |  3 ++
 5 files changed, 108 insertions(+)
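For readers following the series, a rough sketch of how a page fault path is expected to consume this API is below. This is not part of the patch above: the lookup helper is made up and simplified for illustration (the real fault-path integration comes in later patches and performs additional re-validation after taking the lock):

static struct vm_area_struct *lock_vma_for_fault(struct mm_struct *mm,
						 unsigned long addr)
{
	MA_STATE(mas, &mm->mm_mt, addr, addr);
	struct vm_area_struct *vma;

	rcu_read_lock();
	vma = mas_walk(&mas);		/* lockless maple tree walk under RCU */
	if (vma && !vma_read_trylock(vma))
		vma = NULL;		/* write-locked or contended: caller falls back to mmap_lock */
	rcu_read_unlock();

	return vma;			/* caller calls vma_read_unlock() when done */
}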