Message ID | 20190416134522.17540-20-ldufour@linux.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Series | Speculative page faults |
On Tue, Apr 16, 2019 at 03:45:10PM +0200, Laurent Dufour wrote:
> Introducing a per mm_struct seqlock, mm_seq field, to protect the changes
> made in the MM RB tree. This allows to walk the RB tree without grabbing
> the mmap_sem, and on the walk is done to double check that sequence counter
> was stable during the walk.
>
> The mm seqlock is held while inserting and removing entries into the MM RB
> tree. Later in this series, it will be check when looking for a VMA
> without holding the mmap_sem.
>
> This is based on the initial work from Peter Zijlstra:
> https://lore.kernel.org/linux-mm/20100104182813.479668508@chello.nl/
>
> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>

Reviewed-by: Jérôme Glisse <jglisse@redhat.com>

> ---
>  include/linux/mm_types.h |  3 +++
>  kernel/fork.c            |  3 +++
>  mm/init-mm.c             |  3 +++
>  mm/mmap.c                | 48 +++++++++++++++++++++++++++++++---------
>  4 files changed, 46 insertions(+), 11 deletions(-)
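The "/* wmb */" comment in __vma_rb_erase() in the patch below refers to the ordering a seqlock write section already provides: the sequence counter is bumped to an odd value before the tree is touched and back to an even value afterwards, with write barriers on both sides, so a lockless reader either sees an odd counter or sees the counter change and knows to retry. Conceptually it amounts to the following sketch; the names mm_write_seqlock_sketch/mm_write_sequnlock_sketch are illustrative only, and the real implementation (including lockdep/KCSAN instrumentation omitted here) lives in <linux/seqlock.h>:

/*
 * Conceptual sketch of what write_seqlock()/write_sequnlock() provide
 * for mm->mm_seq; illustration only, not the actual seqlock code.
 */
static inline void mm_write_seqlock_sketch(struct mm_struct *mm)
{
	spin_lock(&mm->mm_seq.lock);		/* serialize writers */
	mm->mm_seq.seqcount.sequence++;		/* odd: update in progress */
	smp_wmb();				/* counter bump visible before tree changes */
}

static inline void mm_write_sequnlock_sketch(struct mm_struct *mm)
{
	smp_wmb();				/* tree changes visible before counter bump */
	mm->mm_seq.seqcount.sequence++;		/* even again: update done */
	spin_unlock(&mm->mm_seq.lock);
}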
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e78f72eb2576..24b3f8ce9e42 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -358,6 +358,9 @@ struct mm_struct {
 		struct {
 			struct vm_area_struct *mmap;		/* list of VMAs */
 			struct rb_root mm_rb;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+			seqlock_t mm_seq;
+#endif
 			u64 vmacache_seqnum;			/* per-thread vmacache */
 #ifdef CONFIG_MMU
 			unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index 2992d2c95256..3a1739197ebc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1008,6 +1008,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->mmap = NULL;
 	mm->mm_rb = RB_ROOT;
 	mm->vmacache_seqnum = 0;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	seqlock_init(&mm->mm_seq);
+#endif
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a787a319211e..69346b883a4e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -27,6 +27,9 @@
  */
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	.mm_seq		= __SEQLOCK_UNLOCKED(init_mm.mm_seq),
+#endif
 	.pgd		= swapper_pg_dir,
 	.mm_users	= ATOMIC_INIT(2),
 	.mm_count	= ATOMIC_INIT(1),
diff --git a/mm/mmap.c b/mm/mmap.c
index 13460b38b0fb..f7f6027a7dff 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -170,6 +170,24 @@ void unlink_file_vma(struct vm_area_struct *vma)
 	}
 }
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline void mm_write_seqlock(struct mm_struct *mm)
+{
+	write_seqlock(&mm->mm_seq);
+}
+static inline void mm_write_sequnlock(struct mm_struct *mm)
+{
+	write_sequnlock(&mm->mm_seq);
+}
+#else
+static inline void mm_write_seqlock(struct mm_struct *mm)
+{
+}
+static inline void mm_write_sequnlock(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -445,26 +463,32 @@ static void vma_gap_update(struct vm_area_struct *vma)
 }
 
 static inline void vma_rb_insert(struct vm_area_struct *vma,
-				 struct rb_root *root)
+				 struct mm_struct *mm)
 {
+	struct rb_root *root = &mm->mm_rb;
+
 	/* All rb_subtree_gap values must be consistent prior to insertion */
 	validate_mm_rb(root, NULL);
 
 	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
 {
+	struct rb_root *root = &mm->mm_rb;
+
 	/*
 	 * Note rb_erase_augmented is a fairly large inline function,
 	 * so make sure we instantiate it only once with our desired
 	 * augmented rbtree callbacks.
 	 */
+	mm_write_seqlock(mm);
 	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+	mm_write_sequnlock(mm);	/* wmb */
 }
 
 static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
-						struct rb_root *root,
+						struct mm_struct *mm,
 						struct vm_area_struct *ignore)
 {
 	/*
@@ -472,21 +496,21 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
 	 * with the possible exception of the "next" vma being erased if
 	 * next->vm_start was reduced.
 	 */
-	validate_mm_rb(root, ignore);
+	validate_mm_rb(&mm->mm_rb, ignore);
 
-	__vma_rb_erase(vma, root);
+	__vma_rb_erase(vma, mm);
 }
 
 static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
-					 struct rb_root *root)
+					 struct mm_struct *mm)
 {
 	/*
 	 * All rb_subtree_gap values must be consistent prior to erase,
 	 * with the possible exception of the vma being erased.
 	 */
-	validate_mm_rb(root, vma);
+	validate_mm_rb(&mm->mm_rb, vma);
 
-	__vma_rb_erase(vma, root);
+	__vma_rb_erase(vma, mm);
 }
 
 /*
@@ -601,10 +625,12 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * immediately update the gap to the correct value. Finally we
 	 * rebalance the rbtree after all augmented values have been set.
 	 */
+	mm_write_seqlock(mm);
 	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 	vma->rb_subtree_gap = 0;
 	vma_gap_update(vma);
-	vma_rb_insert(vma, &mm->mm_rb);
+	vma_rb_insert(vma, mm);
+	mm_write_sequnlock(mm);
 }
 
 static void __vma_link_file(struct vm_area_struct *vma)
@@ -680,7 +706,7 @@ static __always_inline void __vma_unlink_common(struct mm_struct *mm,
 {
 	struct vm_area_struct *next;
 
-	vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
+	vma_rb_erase_ignore(vma, mm, ignore);
 	next = vma->vm_next;
 	if (has_prev)
 		prev->vm_next = next;
@@ -2674,7 +2700,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	vma->vm_prev = NULL;
 	do {
-		vma_rb_erase(vma, &mm->mm_rb);
+		vma_rb_erase(vma, mm);
 		mm->map_count--;
 		tail_vma = vma;
 		vma = vma->vm_next;
Introduce a per-mm_struct seqlock, the mm_seq field, to protect the changes made to the MM RB tree. This allows walking the RB tree without grabbing the mmap_sem and, once the walk is done, double checking that the sequence counter was stable during the walk.

The mm seqlock is held while inserting and removing entries in the MM RB tree. Later in this series, it will be checked when looking for a VMA without holding the mmap_sem.

This is based on the initial work from Peter Zijlstra:
https://lore.kernel.org/linux-mm/20100104182813.479668508@chello.nl/

Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
 include/linux/mm_types.h |  3 +++
 kernel/fork.c            |  3 +++
 mm/init-mm.c             |  3 +++
 mm/mmap.c                | 48 +++++++++++++++++++++++++++++++---------
 4 files changed, 46 insertions(+), 11 deletions(-)
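The reader side that the changelog refers to is introduced by later patches in the series and is not part of this mail. Purely as an illustration of how a lookup can be validated against mm_seq, a speculative walk could be structured as below; find_vma_speculative() and find_vma_rb() are hypothetical names, the latter standing in for a raw walk of mm->mm_rb, and neither is added by this patch:

/*
 * Illustration only: validate a lockless VMA lookup against mm->mm_seq.
 * find_vma_rb() is a hypothetical raw RB-tree walk done without the
 * mmap_sem; the actual reader side comes later in the series.
 */
static struct vm_area_struct *find_vma_speculative(struct mm_struct *mm,
						   unsigned long addr)
{
	struct vm_area_struct *vma;
	unsigned int seq;

	do {
		seq = read_seqbegin(&mm->mm_seq);	/* snapshot; spins while a write is in progress */
		vma = find_vma_rb(mm, addr);		/* walk mm->mm_rb without the mmap_sem */
	} while (read_seqretry(&mm->mm_seq, seq));	/* raced with an update: walk again */

	return vma;
}

Note that the seqlock only guarantees the walk saw a stable tree; making sure the returned VMA itself cannot be freed under the reader is something the rest of the series has to address separately.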