Message ID | 20220901173516.702122-7-surenb@google.com
---|---
State | New
Series | per-VMA locks proposal
On 01/09/2022 at 19:34, Suren Baghdasaryan wrote:
> VMA flag modifications should be done under VMA lock to prevent concurrent
> page fault handling in that area.
>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/proc/task_mmu.c | 1 +
>  fs/userfaultfd.c   | 6 ++++++
>  mm/madvise.c       | 1 +
>  mm/mlock.c         | 2 ++
>  mm/mmap.c          | 1 +
>  mm/mprotect.c      | 1 +
>  6 files changed, 12 insertions(+)

There are also a few changes done in driver space, for instance:

*** arch/x86/kernel/cpu/sgx/driver.c:
sgx_mmap[98]          vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
*** arch/x86/kernel/cpu/sgx/virt.c:
sgx_vepc_mmap[108]    vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
*** drivers/dax/device.c:
dax_mmap[311]         vma->vm_flags |= VM_HUGEPAGE;

I guess these changes to vm_flags should be protected as well, or checked one
by one.

> [...]
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 693e6776be39..f89c9b058105 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1818,6 +1818,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  out:
>  	perf_event_mmap(vma);
>
> +	vma_mark_locked(vma);
>  	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
>  	if (vm_flags & VM_LOCKED) {
>  		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||

I guess this doesn't really have an impact, but the call to vma_mark_locked(vma)
may be done only in the case where the vm_flags field is touched.
Something like this:

	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
	if (vm_flags & VM_LOCKED) {
		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
					is_vm_hugetlb_page(vma) ||
-					vma == get_gate_vma(current->mm))
+					vma == get_gate_vma(current->mm)) {
+			vma_mark_locked(vma);
			vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
-		else
+		} else
			mm->locked_vm += (len >> PAGE_SHIFT);
	}

> [...]
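For illustration, here is a hedged sketch of what covering the first of the driver
sites listed above could look like, if it simply follows the convention this patch
applies in mm/ and fs/. The hunk is hand-written rather than generated, and it
assumes the vma_mark_locked() helper introduced earlier in this series is visible
to the driver; whether each site actually needs it is exactly the "checked one by
one" question:

--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
+	/* in the mmap() path, ->mmap() is called with mmap_lock held for write */
+	vma_mark_locked(vma);
 	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;

The sgx_vepc_mmap() and dax_mmap() sites could be handled the same way.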
On Tue, Sep 6, 2022 at 7:27 AM Laurent Dufour <ldufour@linux.ibm.com> wrote:
>
> On 01/09/2022 at 19:34, Suren Baghdasaryan wrote:
> > VMA flag modifications should be done under VMA lock to prevent concurrent
> > page fault handling in that area.
> > [...]
>
> There are also a few changes done in driver space, for instance:
>
> *** arch/x86/kernel/cpu/sgx/driver.c:
> sgx_mmap[98]          vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> *** arch/x86/kernel/cpu/sgx/virt.c:
> sgx_vepc_mmap[108]    vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
> *** drivers/dax/device.c:
> dax_mmap[311]         vma->vm_flags |= VM_HUGEPAGE;
>
> I guess these changes to vm_flags should be protected as well, or checked one
> by one.

Thanks for noting these! I'll add necessary locking here and will look for
other places I might have missed.

> [...]
* Suren Baghdasaryan <surenb@google.com> [220906 15:01]:
> On Tue, Sep 6, 2022 at 7:27 AM Laurent Dufour <ldufour@linux.ibm.com> wrote:
> >
> > There are also a few changes done in driver space, for instance:
> >
> > *** arch/x86/kernel/cpu/sgx/driver.c:
> > sgx_mmap[98]          vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> > *** arch/x86/kernel/cpu/sgx/virt.c:
> > sgx_vepc_mmap[108]    vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
> > *** drivers/dax/device.c:
> > dax_mmap[311]         vma->vm_flags |= VM_HUGEPAGE;
> >
> > I guess these changes to vm_flags should be protected as well, or checked one
> > by one.
>
> Thanks for noting these! I'll add necessary locking here and will look for
> other places I might have missed.

Would an inline set/clear bit function be worthwhile for vm_flags? If it is,
then a name change to vm_flags may get the compiler to catch any missed cases.
There don't seem to be many cases (12 inserts), so maybe not.

> [...]
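As a rough sketch of that idea (the wrapper names and their placement are
assumptions for illustration; no such helpers exist at this point in the series),
the set/clear functions could fold the marking in so a call site cannot forget it:

/* hypothetical wrappers around vm_flags updates */
static inline void vm_flags_set(struct vm_area_struct *vma, unsigned long flags)
{
	vma_mark_locked(vma);	/* mark the VMA locked before every modification */
	vma->vm_flags |= flags;
}

static inline void vm_flags_clear(struct vm_area_struct *vma, unsigned long flags)
{
	vma_mark_locked(vma);
	vma->vm_flags &= ~flags;
}

Renaming the underlying field would then turn any remaining direct
vma->vm_flags assignment into a compile error, which is the "catch any missed
cases" part of the suggestion.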
On Tue, Sep 6, 2022 at 1:00 PM Liam Howlett <liam.howlett@oracle.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [220906 15:01]:
> > Thanks for noting these! I'll add necessary locking here and will look for
> > other places I might have missed.
>
> Would an inline set/clear bit function be worthwhile for vm_flags? If it is,
> then a name change to vm_flags may get the compiler to catch any missed cases.
> There don't seem to be many cases (12 inserts), so maybe not.

That would probably simplify the maintenance of these flags in the future, and
we can add vma_mark_locked() directly in the set/clear functions.

> [...]
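To make the effect concrete, here is a hedged before/after for one call site
from this patch, assuming the hypothetical vm_flags_clear() wrapper sketched
above were adopted:

	/* fs/proc/task_mmu.c, clear_refs_write(), as written in this patch: */
	vma_mark_locked(vma);
	vma->vm_flags &= ~VM_SOFTDIRTY;

	/* with a wrapper that marks the VMA internally, this would collapse to: */
	vm_flags_clear(vma, VM_SOFTDIRTY);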
VMA flag modifications should be done under VMA lock to prevent concurrent
page fault handling in that area.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 fs/proc/task_mmu.c | 1 +
 fs/userfaultfd.c   | 6 ++++++
 mm/madvise.c       | 1 +
 mm/mlock.c         | 2 ++
 mm/mmap.c          | 1 +
 mm/mprotect.c      | 1 +
 6 files changed, 12 insertions(+)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4e0023643f8b..ceffa5c2c650 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1285,6 +1285,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
 			if (!(vma->vm_flags & VM_SOFTDIRTY))
 				continue;
+			vma_mark_locked(vma);
 			vma->vm_flags &= ~VM_SOFTDIRTY;
 			vma_set_page_prot(vma);
 		}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 175de70e3adf..fe557b3d1c07 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -620,6 +620,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 		mmap_write_lock(mm);
 		for (vma = mm->mmap; vma; vma = vma->vm_next)
 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+				vma_mark_locked(vma);
 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 				vma->vm_flags &= ~__VM_UFFD_FLAGS;
 			}
@@ -653,6 +654,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 
 	octx = vma->vm_userfaultfd_ctx.ctx;
 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+		vma_mark_locked(vma);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 		return 0;
@@ -734,6 +736,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 		atomic_inc(&ctx->mmap_changing);
 	} else {
 		/* Drop uffd context if remap feature not enabled */
+		vma_mark_locked(vma);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 	}
@@ -891,6 +894,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 			vma = prev;
 		else
 			prev = vma;
+		vma_mark_locked(vma);
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 	}
@@ -1449,6 +1453,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
+		vma_mark_locked(vma);
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx.ctx = ctx;
 
@@ -1630,6 +1635,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
+		vma_mark_locked(vma);
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 5f0f0948a50e..a173f0025abd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -181,6 +181,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
 	/*
 	 * vm_flags is protected by the mmap_lock held in write mode.
 	 */
+	vma_mark_locked(vma);
 	vma->vm_flags = new_flags;
 	if (!vma->vm_file) {
 		error = replace_anon_vma_name(vma, anon_name);
diff --git a/mm/mlock.c b/mm/mlock.c
index b14e929084cc..f62e1a4d05f2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -380,6 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
 	 */
 	if (newflags & VM_LOCKED)
 		newflags |= VM_IO;
+	vma_mark_locked(vma);
 	WRITE_ONCE(vma->vm_flags, newflags);
 
 	lru_add_drain();
@@ -456,6 +457,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
 		/* No work to do, and mlocking twice would be wrong */
+		vma_mark_locked(vma);
 		vma->vm_flags = newflags;
 	} else {
 		mlock_vma_pages_range(vma, start, end, newflags);
diff --git a/mm/mmap.c b/mm/mmap.c
index 693e6776be39..f89c9b058105 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1818,6 +1818,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 out:
 	perf_event_mmap(vma);
 
+	vma_mark_locked(vma);
 	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bc6bddd156ca..df47fc21b0e4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -621,6 +621,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	 * vm_flags and vm_page_prot are protected by the mmap_lock
 	 * held in write mode.
 	 */
+	vma_mark_locked(vma);
 	vma->vm_flags = newflags;
 	/*
 	 * We want to check manually if we can change individual PTEs writable
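Every call site touched by this patch follows the same writer-side ordering; a
minimal sketch of that rule is below. The ordering is taken from the patch
itself; the explicit lock/unlock calls are illustrative, since the callers
modified here already hold mmap_lock for write, and the page-fault side of the
per-VMA lock is defined elsewhere in this series:

	mmap_write_lock(mm);
	vma_mark_locked(vma);		/* exclude concurrent page faults in this VMA */
	vma->vm_flags = new_flags;	/* only now is it safe to change the flags */
	mmap_write_unlock(mm);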