@@ -2253,18 +2253,32 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
+
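+/*
+ * When keep_locked is true, the VMA is returned still write-locked, i.e.
+ * the vm_raw_write_begin() taken on it is left held, and the caller is
+ * responsible for releasing it with vm_raw_write_end() once done.
+ */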
extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
- struct vm_area_struct *expand);
+ struct vm_area_struct *expand, bool keep_locked);
+
static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
- return __vma_adjust(vma, start, end, pgoff, insert, NULL);
+ return __vma_adjust(vma, start, end, pgoff, insert, NULL, false);
}
-extern struct vm_area_struct *vma_merge(struct mm_struct *,
+
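+/*
+ * __vma_merge() lets a caller such as copy_vma() pass keep_locked=true so
+ * that a successfully merged VMA is returned still write-locked; regular
+ * callers should use the vma_merge() wrapper below, which always unlocks.
+ */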
+extern struct vm_area_struct *__vma_merge(struct mm_struct *mm,
+ struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+ unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+ pgoff_t pgoff, struct mempolicy *mpol,
+ struct vm_userfaultfd_ctx uff, bool keep_locked);
+
+static inline struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
- unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *, struct vm_userfaultfd_ctx);
+ unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+ pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff)
+{
+ return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off,
+ pol, uff, false);
+}
+
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
@@ -689,7 +689,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm,
*/
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
- struct vm_area_struct *expand)
+ struct vm_area_struct *expand, bool keep_locked)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
@@ -805,8 +805,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
- if (error)
+ if (error) {
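+ /*
+ * Drop the vm_raw_write_begin() protection taken on vma and
+ * next earlier in __vma_adjust() before bailing out.
+ */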
+ if (next && next != vma)
+ vm_raw_write_end(next);
+ vm_raw_write_end(vma);
return error;
+ }
}
}
again:
@@ -1001,7 +1005,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
if (next && next != vma)
vm_raw_write_end(next);
- vm_raw_write_end(vma);
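+ /*
+ * The caller may ask for the VMA to be returned still write-locked
+ * (see copy_vma()), in which case it calls vm_raw_write_end()
+ * itself once its work on the VMA is complete.
+ */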
+ if (!keep_locked)
+ vm_raw_write_end(vma);
validate_mm(mm);
@@ -1137,12 +1142,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* parameter) may establish ptes with the wrong permissions of NNNN
* instead of the right permissions of XXXX.
*/
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *__vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ bool keep_locked)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1190,10 +1196,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
/* cases 1, 6 */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
- prev);
+ prev, keep_locked);
} else /* cases 2, 5, 7 */
err = __vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL, prev);
+ end, prev->vm_pgoff, NULL, prev,
+ keep_locked);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1210,10 +1217,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
vm_userfaultfd_ctx)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL, next);
+ addr, prev->vm_pgoff, NULL, next,
+ keep_locked);
else { /* cases 3, 8 */
err = __vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL, next);
+ next->vm_pgoff - pglen, NULL, next,
+ keep_locked);
/*
* In case 3 area is already equal to next and
* this is a noop, but in case 8 "area" has
@@ -3184,9 +3193,20 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
- new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+
+ /* There are 3 cases to manage here:
+ *     AAAA            AAAA              AAAA              AAAA
+ * PPPP....      PPPP......NNNN      PPPP....NNNN      PP........NN
+ * PPPPPPPP(A)   PPPP..NNNNNNNN(B)   PPPPPPPPPPPP(1)       NULL
+ *                                   PPPPPPPPNNNN(2)
+ *                                   PPPPNNNNNNNN(3)
+ *
+ * new_vma == prev in case A,1,2
+ * new_vma == next in case B,3
+ */
+ new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff,
+ vma_policy(vma), vma->vm_userfaultfd_ctx, true);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
@@ -3226,6 +3246,15 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
+ /*
+ * As the VMA is being linked right now, it may be hit by the
+ * speculative page fault handler. But we don't want it to
+ * start mapping pages in this area until the caller has
+ * potentially moved the PTEs from the source VMA. To prevent
+ * that, we protect it right now, and let the caller unprotect
+ * it once the move is done.
+ */
+ vm_raw_write_begin(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
*need_rmap_locks = false;
}
@@ -302,6 +302,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (!new_vma)
return -ENOMEM;
+ /* new_vma is returned protected by copy_vma, to prevent a speculative
+ * page fault from being handled in the destination area before we have
+ * moved the PTEs. Now, we must also protect the source VMA since we
+ * don't want pages to be mapped behind our back while we are copying
+ * the PTEs.
+ */
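+ /*
+ * Note that the source VMA may have been merged into new_vma by
+ * copy_vma() (see the comment there); in that case vma == new_vma
+ * and the destination is already write-locked, hence the check
+ * below to avoid locking the same VMA twice.
+ */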
+ if (vma != new_vma)
+ vm_raw_write_begin(vma);
+
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
need_rmap_locks);
if (moved_len < old_len) {
@@ -318,6 +326,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
*/
move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
true);
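+ /* The PTEs were moved back; stop protecting the source VMA. */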
+ if (vma != new_vma)
+ vm_raw_write_end(vma);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
@@ -326,7 +336,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
mremap_userfaultfd_prep(new_vma, uf);
arch_remap(mm, old_addr, old_addr + old_len,
new_addr, new_addr + new_len);
+ if (vma != new_vma)
+ vm_raw_write_end(vma);
}
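+ /*
+ * Whether the move succeeded or was rolled back, the PTE moves are
+ * done: it is now safe to let the speculative page fault handler
+ * operate on new_vma again.
+ */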
+ vm_raw_write_end(new_vma);
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {