[v2,10/61] kernel/fork: Use maple tree for dup_mmap() during forking

Message ID: 20210817154651.1570984-11-Liam.Howlett@oracle.com
State: New
Series: Introducing the Maple Tree

Commit Message

Liam R. Howlett Aug. 17, 2021, 3:47 p.m. UTC
From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>

An earlier commit already had the maple tree tracking VMAs in this
function, but the rbtree iterator was still being used to walk the
list.  Change the loop to use a native maple tree iterator under RCU
locking, and switch to the maple tree advanced API to avoid multiple
walks of the tree during insert operations.

anon_vma_fork() may enter the slow path and call schedule(), which is
not safe inside an RCU read-side critical section.  Drop the RCU lock
before such calls and reacquire it afterwards.  There is no harm in this
approach, as the mmap_sem is held for write/read across the schedule()
call, so the VMAs cannot change.

Note that nodes are also bulk allocated here for performance reasons.
The node calculation is done internally by the tree, using the VMA
count and assuming worst-case node requirements.  Because the
VM_DONTCOPY flag rules out the most efficient way of copying the tree,
a bulk loading algorithm is used instead.
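
In short, the new dup_mmap() loop has the following shape (a condensed
sketch of the pattern in the diff below; accounting, error handling and
the actual VMA copy are elided, and copy_one_vma() is only a stand-in
for the existing copy code, not a real helper):

	MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);	/* read side: old mm */
	MA_STATE(mas, &mm->mm_mt, 0, 0);		/* write side: new mm */

	/* Bulk-preallocate worst-case nodes for map_count stores. */
	retval = mas_entry_count(&mas, oldmm->map_count);
	if (retval)
		goto out;

	rcu_read_lock();
	mas_for_each(&old_mas, mpnt, ULONG_MAX) {
		/* Drop RCU: copying may sleep (e.g. anon_vma_fork()). */
		rcu_read_unlock();

		tmp = copy_one_vma(mm, mpnt);	/* stand-in for the copy code */

		/* Single-walk insert using the advanced API. */
		mas_lock(&mas);
		mas.index = tmp->vm_start;
		mas.last = tmp->vm_end - 1;
		mas_store(&mas, tmp);
		mas_unlock(&mas);

		rcu_read_lock();
	}
	rcu_read_unlock();

	/* Return any unused preallocated nodes. */
	mas_destroy(&mas);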

Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
---
 include/linux/mm.h       |  2 --
 include/linux/sched/mm.h |  9 +++++++++
 kernel/fork.c            | 35 +++++++++++++++++++++++++++++------
 mm/mmap.c                |  4 ----
 4 files changed, 38 insertions(+), 12 deletions(-)

Comments

Hillf Danton Aug. 18, 2021, 8:36 a.m. UTC | #1
On Tue, 17 Aug 2021 15:47:11 +0000 Liam R. Howlett wrote:
> 
>  static inline void mmget(struct mm_struct *mm)
>  {
> +	mt_set_in_rcu(&mm->mm_mt);
>  	atomic_inc(&mm->mm_users);
>  }
> 
>  static inline bool mmget_not_zero(struct mm_struct *mm)
>  {
> +	/*
> +	 * There is a race below during task tear down that can cause the maple
> +	 * tree to enter rcu mode with only a single user.  If this race
> +	 * happens, the result would be that the maple tree nodes would remain
> +	 * active for an extra RCU read cycle.
> +	 */
> +	mt_set_in_rcu(&mm->mm_mt);
>  	return atomic_inc_not_zero(&mm->mm_users);
>  }

Nit, leave the mm with zero refcount intact.

 	if (atomic_inc_not_zero(&mm->mm_users)) {
		mt_set_in_rcu(&mm->mm_mt);
		return true;
	}
	return false;
Liam R. Howlett Aug. 18, 2021, 2:54 p.m. UTC | #2
* Hillf Danton <hdanton@sina.com> [210818 04:36]:
> On Tue, 17 Aug 2021 15:47:11 +0000 Liam R. Howlett wrote:
> > 
> >  static inline void mmget(struct mm_struct *mm)
> >  {
> > +	mt_set_in_rcu(&mm->mm_mt);
> >  	atomic_inc(&mm->mm_users);
> >  }
> > 
> >  static inline bool mmget_not_zero(struct mm_struct *mm)
> >  {
> > +	/*
> > +	 * There is a race below during task tear down that can cause the maple
> > +	 * tree to enter rcu mode with only a single user.  If this race
> > +	 * happens, the result would be that the maple tree nodes would remain
> > +	 * active for an extra RCU read cycle.
> > +	 */
> > +	mt_set_in_rcu(&mm->mm_mt);
> >  	return atomic_inc_not_zero(&mm->mm_users);
> >  }
> 
> Nit, leave the mm with zero refcount intact.
> 
>  	if (atomic_inc_not_zero(&mm->mm_users)) {
> 		mt_set_in_rcu(&mm->mm_mt);
> 		return true;
> 	}
> 	return false;

Thanks for looking at this.

I thought about that, but came up with the following scenario:

thread 1	thread 2
mmget(mm)
		mmget_not_zero() enter..
		atomic_inc_not_zero(&mm->mm_users)
mmput(mm)
		mt_set_in_rcu(&mm->mm_mt);
		return true;


So I think the above does not remove the race, but does add instructions
to each call to mmget_not_zero().  I thought the race of having nodes
sitting around for an rcu read cycle was worth the trade off.
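
To make the window concrete, here is the suggested ordering with the
interleaving point marked (illustrative sketch only, not part of the
patch):

	static inline bool mmget_not_zero(struct mm_struct *mm)
	{
		if (atomic_inc_not_zero(&mm->mm_users)) {
			/*
			 * Window: another thread may drop its reference via
			 * mmput() right here, so by the time the flag is set
			 * the tree can still enter RCU mode with only a
			 * single remaining user - the same end state as the
			 * posted ordering.
			 */
			mt_set_in_rcu(&mm->mm_mt);
			return true;
		}
		return false;
	}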

Cheers,
Liam
Hillf Danton Aug. 19, 2021, 2:01 a.m. UTC | #3
On Wed, 18 Aug 2021 14:54:29 +0000 Liam R. Howlett wrote:
>* Hillf Danton <hdanton@sina.com> [210818 04:36]:
>> On Tue, 17 Aug 2021 15:47:11 +0000 Liam R. Howlett wrote:
>> >
>> >  static inline void mmget(struct mm_struct *mm)
>> >  {
>> > +	mt_set_in_rcu(&mm->mm_mt);
>> >  	atomic_inc(&mm->mm_users);
>> >  }
>> >
>> >  static inline bool mmget_not_zero(struct mm_struct *mm)
>> >  {
>> > +	/*
>> > +	 * There is a race below during task tear down that can cause the maple
>> > +	 * tree to enter rcu mode with only a single user.  If this race
>> > +	 * happens, the result would be that the maple tree nodes would remain
>> > +	 * active for an extra RCU read cycle.
>> > +	 */
>> > +	mt_set_in_rcu(&mm->mm_mt);
>> >  	return atomic_inc_not_zero(&mm->mm_users);
>> >  }
>>
>> Nit, leave the mm with zero refcount intact.
>>
>>  	if (atomic_inc_not_zero(&mm->mm_users)) {
>> 		mt_set_in_rcu(&mm->mm_mt);
>> 		return true;
>> 	}
>> 	return false;
>
>Thanks for looking at this.
>
>I thought about that, but came up with the following scenario:
>
>thread 1	thread 2
>mmget(mm)
>		mmget_not_zero() enter..
>		atomic_inc_not_zero(&mm->mm_users)
>mmput(mm)
>		mt_set_in_rcu(&mm->mm_mt);
>		return true;
>

At first glance, given the above mmget, mmput will not hurt anyone.

>
>So I think the above does not remove the race, but does add instructions

If the mm refcount drops to one after mmput then it is one before
atomic_inc_not_zero() which only ensures the mm is stable afterwards
until mmput again.

>to each call to mmget_not_zero().  I thought the race of having nodes
>sitting around for an rcu read cycle was worth the trade off.

Is one ounce of the mm stability worth two pounds? Or three?
Liam R. Howlett Aug. 19, 2021, 1:32 p.m. UTC | #4
* Hillf Danton <hdanton@sina.com> [210818 22:01]:
> On Wed, 18 Aug 2021 14:54:29 +0000 Liam R. Howlett wrote:
> >* Hillf Danton <hdanton@sina.com> [210818 04:36]:
> >> On Tue, 17 Aug 2021 15:47:11 +0000 Liam R. Howlett wrote:
> >> >
> >> >  static inline void mmget(struct mm_struct *mm)
> >> >  {
> >> > +	mt_set_in_rcu(&mm->mm_mt);
> >> >  	atomic_inc(&mm->mm_users);
> >> >  }
> >> >
> >> >  static inline bool mmget_not_zero(struct mm_struct *mm)
> >> >  {
> >> > +	/*
> >> > +	 * There is a race below during task tear down that can cause the maple
> >> > +	 * tree to enter rcu mode with only a single user.  If this race
> >> > +	 * happens, the result would be that the maple tree nodes would remain
> >> > +	 * active for an extra RCU read cycle.
> >> > +	 */
> >> > +	mt_set_in_rcu(&mm->mm_mt);
> >> >  	return atomic_inc_not_zero(&mm->mm_users);
> >> >  }
> >>
> >> Nit, leave the mm with zero refcount intact.
> >>
> >>  	if (atomic_inc_not_zero(&mm->mm_users)) {
> >> 		mt_set_in_rcu(&mm->mm_mt);
> >> 		return true;
> >> 	}
> >> 	return false;
> >
> >Thanks for looking at this.
> >
> >I thought about that, but came up with the following scenario:
> >
> >thread 1	thread 2
> >mmget(mm)
> >		mmget_not_zero() enter..
> >		atomic_inc_not_zero(&mm->mm_users)
> >mmput(mm)
> >		mt_set_in_rcu(&mm->mm_mt);
> >		return true;
> >
> 
> At first glance, given the above mmget, mmput will not hurt anyone.

In the case above, the maple tree enters RCU mode with a single user.
This will have the result of nodes being freed in RCU mode which is the
same result as what happens as it is written, except the racing thread
wins (in this case).  I thought this was what you were trying to fix?

> 
> >
> >So I think the above does not remove the race, but does add instructions
> 
> If the mm refcount drops to one after mmput then it is one before
> atomic_inc_not_zero() which only ensures the mm is stable afterwards
> until mmput again.

Right.  The race we are worried about is the zero referenced mm.  If
mm->mm_users is safe, then mm->mm_mt is also safe.

> 
> >to each call to mmget_not_zero().  I thought the race of having nodes
> >sitting around for an rcu read cycle was worth the trade off.
> 
> Is one ounce of the mm stability worth two pounds? Or three?

I don't see a stability problem with the way it is written.  Your change
does not remove the race.  Can you explain how the stability is affected
negatively by the way it is written?

Thanks,
Liam
Liam R. Howlett Aug. 20, 2021, 5:45 p.m. UTC | #5
* Hillf Danton <hdanton@sina.com> [210820 00:05]:
> On Thu, 19 Aug 2021 13:32:58 +0000 Liam R. Howlett wrote:
> >* Hillf Danton <hdanton@sina.com> [210818 22:01]:
> >> On Wed, 18 Aug 2021 14:54:29 +0000 Liam R. Howlett wrote:
> >> >* Hillf Danton <hdanton@sina.com> [210818 04:36]:
> >> >> On Tue, 17 Aug 2021 15:47:11 +0000 Liam R. Howlett wrote:
> >> >> >
> >> >> >  static inline void mmget(struct mm_struct *mm)
> >> >> >  {
> >> >> > +	mt_set_in_rcu(&mm->mm_mt);
> >> >> >  	atomic_inc(&mm->mm_users);
> >> >> >  }
> >> >> >
> >> >> >  static inline bool mmget_not_zero(struct mm_struct *mm)
> >> >> >  {
> >> >> > +	/*
> >> >> > +	 * There is a race below during task tear down that can cause the maple
> >> >> > +	 * tree to enter rcu mode with only a single user.  If this race
> >> >> > +	 * happens, the result would be that the maple tree nodes would remain
> >> >> > +	 * active for an extra RCU read cycle.
> >> >> > +	 */
> >> >> > +	mt_set_in_rcu(&mm->mm_mt);
> >> >> >  	return atomic_inc_not_zero(&mm->mm_users);
> >> >> >  }
> >> >>
> >> >> Nit, leave the mm with zero refcount intact.
> >> >>
> >> >>  	if (atomic_inc_not_zero(&mm->mm_users)) {
> >> >> 		mt_set_in_rcu(&mm->mm_mt);
> >> >> 		return true;
> >> >> 	}
> >> >> 	return false;
> >> >
> >> >Thanks for looking at this.
> >> >
> >> >I thought about that, but came up with the following scenario:
> >> >
> >> >thread 1	thread 2
> >> >mmget(mm)
> >> >		mmget_not_zero() enter..
> >> >		atomic_inc_not_zero(&mm->mm_users)
> >> >mmput(mm)
> >> >		mt_set_in_rcu(&mm->mm_mt);
> >> >		return true;
> >> >
> >>
> >> At first glance, given the above mmget, mmput will not hurt anyone.
> >
> >In the case above, the maple tree enters RCU mode with a single user.
> >This will have the result of nodes being freed in RCU mode which is the
> >same result as what happens as it is written, except the racing thread
> >wins (in this case).  I thought this was what you were trying to fix?
> >
> >>
> >> >
> >> >So I think the above does not remove the race, but does add instructions
> >>
> >> If the mm refcount drops to one after mmput then it is one before
> >> atomic_inc_not_zero() which only ensures the mm is stable afterwards
> >> until mmput again.
> >
> >Right.  The race we are worried about is the zero referenced mm.  If
> >mm->mm_users is safe, then mm->mm_mt is also safe.
> >
> >>
> >> >to each call to mmget_not_zero().  I thought the race of having nodes
> >> >sitting around for an rcu read cycle was worth the trade off.
> >>
> >> Is one ounce of the mm stability worth two pounds? Or three?
> >
> >I don't see a stability problem with the way it is written.
> 
> On the maple tree side, I see in
>  [PATCH v2 05/61] Maple Tree: Add new data structure
> 
> + * MAPLE_USE_RCU	Operate in read/copy/update mode for multi-readers
> 
> <...>
> 
> +/**
> + * mt_set_in_rcu() - Switch the tree to RCU safe mode.
> + */
> +static inline void mt_set_in_rcu(struct maple_tree *mt)
> +{
> +	if (mt_in_rcu(mt))
> +		return;
> +
> +	mtree_lock(mt);
> +	mt->ma_flags |= MAPLE_USE_RCU;
> +	mtree_unlock(mt);
> +}
> 
> and on the mm side, however, if atomic_inc_not_zero(&mm->mm_users) fails
> then who can be the "RCU multi-readers"?

There wouldn't be one.  But the consequence is that the maple tree nodes
will remain active for one extra RCU cycle.  This is why there is a big
comment above mmget_not_zero() explaining how this race exists but will
cause no issue.

> 
> >Your change does not remove the race.
> 
> If atomic_inc_not_zero() fails then there is no pre-condition in any form
> for race; if it succeeds then the race window is slammed.

My example shows how we can still end up with the tree being in RCU mode
with a single thread.  atomic_inc_not_zero() succeeds, then the other
thread drops the reference counter before the maple tree enters RCU
mode.

> 
> >Can you explain how the stability is affected negatively by the way it
> >is written?
> 
> Hard to find the correct answer without knowing why you prefer to update
> the flags for mm->mm_mt with mm->mm_users dropping down to ground.

I prefer to update the flags for mm->mm_mt first because the fallout is
minimal, whereas the alternative increases the execution time of the
vast majority of calls.  The trade-off just isn't worth it, especially
since the tree may be left in RCU mode with a single user anyway.
There is no stability issue here.

Thank you,
Liam

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80b7af9e725c..ce8fc0fd6d6e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2596,8 +2596,6 @@  extern bool arch_has_descending_max_zone_pfns(void);
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
-/* maple_tree */
-void vma_store(struct mm_struct *mm, struct vm_area_struct *vma);
 
 /* interval_tree.c */
 void vma_interval_tree_insert(struct vm_area_struct *node,
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index fd6e4d14f477..0b8a4f07f3f8 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -8,6 +8,7 @@ 
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
 #include <linux/sync_core.h>
+#include <linux/maple_tree.h>
 
 /*
  * Routines for handling mm_structs
@@ -88,11 +89,19 @@  static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
  */
 static inline void mmget(struct mm_struct *mm)
 {
+	mt_set_in_rcu(&mm->mm_mt);
 	atomic_inc(&mm->mm_users);
 }
 
 static inline bool mmget_not_zero(struct mm_struct *mm)
 {
+	/*
+	 * There is a race below during task tear down that can cause the maple
+	 * tree to enter rcu mode with only a single user.  If this race
+	 * happens, the result would be that the maple tree nodes would remain
+	 * active for an extra RCU read cycle.
+	 */
+	mt_set_in_rcu(&mm->mm_mt);
 	return atomic_inc_not_zero(&mm->mm_users);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1da5c1b20a60..9ef5661abbd1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -477,7 +477,9 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
-	unsigned long charge;
+	unsigned long charge = 0;
+	MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
+	MA_STATE(mas, &mm->mm_mt, 0, 0);
 	LIST_HEAD(uf);
 
 	uprobe_start_dup_mmap();
@@ -511,11 +513,19 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		goto out;
 
 	prev = NULL;
-	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+
+	retval = mas_entry_count(&mas, oldmm->map_count);
+	if (retval)
+		goto out;
+
+	rcu_read_lock();
+	mas_for_each(&old_mas, mpnt, ULONG_MAX) {
 		struct file *file;
 
+		rcu_read_unlock();
 		if (mpnt->vm_flags & VM_DONTCOPY) {
 			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+			rcu_read_lock();
 			continue;
 		}
 		charge = 0;
@@ -525,7 +535,7 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		 */
 		if (fatal_signal_pending(current)) {
 			retval = -EINTR;
-			goto out;
+			goto loop_out;
 		}
 		if (mpnt->vm_flags & VM_ACCOUNT) {
 			unsigned long len = vma_pages(mpnt);
@@ -553,6 +563,7 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 			tmp->anon_vma = NULL;
 		} else if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
+
 		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
 		file = tmp->vm_file;
 		if (file) {
@@ -594,7 +605,11 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		rb_parent = &tmp->vm_rb;
 
 		/* Link the vma into the MT */
-		vma_store(mm, tmp);
+		mas_lock(&mas);
+		mas.index = tmp->vm_start;
+		mas.last = tmp->vm_end - 1;
+		mas_store(&mas, tmp);
+		mas_unlock(&mas);
 
 		mm->map_count++;
 		if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -604,10 +619,17 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 			tmp->vm_ops->open(tmp);
 
 		if (retval)
-			goto out;
+			goto loop_out;
+
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
 	/* a new mm has just been created */
 	retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+	rcu_read_lock();
+	mas_destroy(&mas);
+	rcu_read_unlock();
 out:
 	mmap_write_unlock(mm);
 	flush_tlb_mm(oldmm);
@@ -623,7 +645,7 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 fail_nomem:
 	retval = -ENOMEM;
 	vm_unacct_memory(charge);
-	goto out;
+	goto loop_out;
 }
 
 static inline int mm_alloc_pgd(struct mm_struct *mm)
@@ -1149,6 +1171,7 @@  static inline void __mmput(struct mm_struct *mm)
 {
 	VM_BUG_ON(atomic_read(&mm->mm_users));
 
+	mt_clear_in_rcu(&mm->mm_mt);
 	uprobe_clear_state(mm);
 	exit_aio(mm);
 	ksm_exit(mm);
diff --git a/mm/mmap.c b/mm/mmap.c
index fac6e2554351..cec8ba0b598f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -786,10 +786,6 @@  void vma_mt_store(struct mm_struct *mm, struct vm_area_struct *vma)
 		GFP_KERNEL);
 }
 
-void vma_store(struct mm_struct *mm, struct vm_area_struct *vma) {
-	vma_mt_store(mm, vma);
-}
-
 static void
 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct vm_area_struct *prev, struct rb_node **rb_link,