diff mbox series

[RFC,23/37] mm: rcu safe vma->vm_file freeing

Message ID 20210407014502.24091-24-michel@lespinasse.org (mailing list archive)
State New, archived
Headers show
Series [RFC,01/37] mmap locking API: mmap_lock_is_contended returns a bool | expand

Commit Message

Michel Lespinasse April 7, 2021, 1:44 a.m. UTC
Defer freeing of vma->vm_file when freeing vmas.
This is to allow speculative page faults in the mapped file case.

Signed-off-by: Michel Lespinasse <michel@lespinasse.org>
---
 fs/exec.c     |  1 +
 kernel/fork.c | 17 +++++++++++++++--
 mm/mmap.c     | 11 +++++++----
 mm/nommu.c    |  6 ++----
 4 files changed, 25 insertions(+), 10 deletions(-)

Comments

kernel test robot April 8, 2021, 5:12 a.m. UTC | #1
Greeting,

FYI, we noticed the following commit (built with gcc-9):

commit: 87b1c39af431a20ab69bdceb9036360cf58a28e4 ("[RFC PATCH 23/37] mm: rcu safe vma->vm_file freeing")
url: https://github.com/0day-ci/linux/commits/Michel-Lespinasse/Speculative-page-faults/20210407-095517
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git a500fc918f7b8dc3dff2e6c74f3e73e856c18248

in testcase: nvml
version: nvml-x86_64-09f75f696-1_20210402
with following parameters:

	test: pmem
	group: blk
	nr_pmem: 1
	fs: ext4
	mount_option: dax
	bp_memmap: 32G!4G
	ucode: 0x7000019



on test machine: 16 threads Intel(R) Xeon(R) CPU D-1541 @ 2.10GHz with 48G memory

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):




If you fix the issue, kindly add following tag
Reported-by: kernel test robot <oliver.sang@intel.com>


2021-04-07 16:01:30 ./RUNTESTS -f pmem blk_rw_mt
blk_rw_mt/TEST0: SETUP (check/pmem/debug)
[MATCHING FAILED, COMPLETE FILE (out0.log) BELOW]
blk_rw_mt/TEST0: START: blk_rw_mt
 ./blk_rw_mt 4096 /fs/pmem0//test_blk_rw_mt0
diff mbox series

Patch

diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..c9da73eb0f53 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -286,6 +286,7 @@  static int __bprm_mm_init(struct linux_binprm *bprm)
 	mmap_write_unlock(mm);
 err_free:
 	bprm->vma = NULL;
+	VM_BUG_ON(vma->vm_file);
 	vm_area_free(vma);
 	return err;
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index b6078e546114..2f20a5c5fed8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -369,19 +369,31 @@  struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	return new;
 }
 
+static inline void ____vm_area_free(struct vm_area_struct *vma)
+{
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
 static void __vm_area_free(struct rcu_head *head)
 {
 	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
 						  vm_rcu);
-	kmem_cache_free(vm_area_cachep, vma);
+	____vm_area_free(vma);
 }
 
+#endif
+
 void vm_area_free(struct vm_area_struct *vma)
 {
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
 	call_rcu(&vma->vm_rcu, __vm_area_free);
+#else
+	____vm_area_free(vma);
+#endif
 }
-#endif	/* CONFIG_SPECULATIVE_PAGE_FAULT */
 
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
@@ -621,6 +633,7 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 fail_nomem_anon_vma_fork:
 	mpol_put(vma_policy(tmp));
 fail_nomem_policy:
+	tmp->vm_file = NULL;	/* prevents fput within vm_area_free() */
 	vm_area_free(tmp);
 fail_nomem:
 	retval = -ENOMEM;
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f287599a7a3..cc2323e243bb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -178,9 +178,8 @@  static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
-	if (vma->vm_file)
-		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
+	/* fput(vma->vm_file) happens in vm_area_free after an RCU delay. */
 	vm_area_free(vma);
 	return next;
 }
@@ -949,7 +948,8 @@  int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	if (remove_next) {
 		if (file) {
 			uprobe_munmap(next, next->vm_start, next->vm_end);
-			fput(file);
+			/* fput(file) happens whthin vm_area_free(next) */
+			VM_BUG_ON(file != next->vm_file);
 		}
 		if (next->anon_vma)
 			anon_vma_merge(vma, next);
@@ -1828,7 +1828,7 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 				 * fput the vma->vm_file here or we would add an extra fput for file
 				 * and cause general protection fault ultimately.
 				 */
-				fput(vma->vm_file);
+				/* fput happens within vm_area_free */
 				vm_area_free(vma);
 				vma = merge;
 				/* Update vm_flags to pick up the change. */
@@ -1907,6 +1907,7 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 	if (vm_flags & VM_DENYWRITE)
 		allow_write_access(file);
 free_vma:
+	VM_BUG_ON(vma->vm_file);
 	vm_area_free(vma);
 unacct_error:
 	if (charged)
@@ -2779,6 +2780,7 @@  int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
  out_free_mpol:
 	mpol_put(vma_policy(new));
  out_free_vma:
+	new->vm_file = NULL;	/* prevents fput within vm_area_free() */
 	vm_area_free(new);
 	return err;
 }
@@ -3343,6 +3345,7 @@  struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 out_free_mempol:
 	mpol_put(vma_policy(new_vma));
 out_free_vma:
+	new_vma->vm_file = NULL;	/* Prevent fput within vm_area_free */
 	vm_area_free(new_vma);
 out:
 	return NULL;
diff --git a/mm/nommu.c b/mm/nommu.c
index 5c9ab799c0e6..06a0dc0b913b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -664,9 +664,8 @@  static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
-	if (vma->vm_file)
-		fput(vma->vm_file);
 	put_nommu_region(vma->vm_region);
+	/* fput(vma->vm_file) happens within vm_area_free() */
 	vm_area_free(vma);
 }
 
@@ -1267,8 +1266,7 @@  unsigned long do_mmap(struct file *file,
 	if (region->vm_file)
 		fput(region->vm_file);
 	kmem_cache_free(vm_region_jar, region);
-	if (vma->vm_file)
-		fput(vma->vm_file);
+	/* fput(vma->vm_file) happens within vm_area_free() */
 	vm_area_free(vma);
 	return ret;