@@ -3181,6 +3181,49 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
return fpin;
}
+static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = 0;
+ pte_t *ptep;
+
+ /*
+ * We might have COW'ed a pagecache folio and might now have an mlocked
+ * anon folio mapped. The original pagecache folio is not mlocked and
+ * might have been evicted. During a read+clear/modify/write update of
+ * the PTE, such as done in do_numa_page()/change_pte_range(), we
+ * temporarily clear the PTE under PT lock and might detect it here as
+ * "none" when not holding the PT lock.
+ *
+ * Not rechecking the PTE under PT lock could result in an unexpected
+ * major fault in an mlock'ed region. Recheck only for this special
+ * scenario while holding the PT lock, to not degrade non-mlocked
+ * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
+ * the number of times we hold PT lock.
+ */
+ if (!(vma->vm_flags & VM_LOCKED))
+ return 0;
+
+ if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+ return 0;
+
+ ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (unlikely(!ptep))
+ return VM_FAULT_NOPAGE;
+
+ if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
+ ret = VM_FAULT_NOPAGE;
+ } else {
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_none(ptep_get(ptep))))
+ ret = VM_FAULT_NOPAGE;
+ spin_unlock(vmf->ptl);
+ }
+ pte_unmap(ptep);
+ return ret;
+}
+
/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
@@ -3236,6 +3279,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
mapping_locked = true;
}
} else {
+ ret = filemap_fault_recheck_pte_none(vmf);
+ if (unlikely(ret))
+ return ret;
+
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);