--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -509,6 +509,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
vmf->address,
vmf->flags, reason);
+ pte_put_vmf(vmf);
mmap_read_unlock(mm);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1708,6 +1708,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
if (flags & FAULT_FLAG_RETRY_NOWAIT)
return false;
+ pte_put_vmf(vmf);
mmap_read_unlock(mm);
if (flags & FAULT_FLAG_KILLABLE)
folio_wait_locked_killable(folio);
@@ -1720,6 +1721,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
ret = __folio_lock_killable(folio);
if (ret) {
+ pte_put_vmf(vmf);
mmap_read_unlock(mm);
return false;
}
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -488,6 +488,7 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
if (fault_flag_allow_retry_first(flags) &&
!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
fpin = get_file(vmf->vma->vm_file);
+ pte_put_vmf(vmf);
mmap_read_unlock(vmf->vma->vm_mm);
}
return fpin;
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1019,10 +1019,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
.pmd = pmd,
};
- vmf.pte = pte_offset_map(pmd, address);
+ vmf.pte = pte_tryget_map(pmd, address);
+ if (!vmf.pte)
+ continue;
vmf.orig_pte = *vmf.pte;
if (!is_swap_pte(vmf.orig_pte)) {
pte_unmap(vmf.pte);
+ pte_put_vmf(&vmf);
continue;
}
swapped_in++;
@@ -1041,7 +1044,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
+ } else {
+ pte_put_vmf(&vmf);
}
+
if (ret & VM_FAULT_ERROR) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4571,8 +4571,10 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
+ vm_fault_t ret;
- if (unlikely(pmd_none(*vmf->pmd))) {
+retry:
+ if (unlikely(pmd_none(READ_ONCE(*vmf->pmd)))) {
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
@@ -4595,13 +4597,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
*/
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
+
/*
* A regular pmd is established and it can't morph into a huge
* pmd from under us anymore at this point because we hold the
* mmap_lock read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
- vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ vmf->pte = pte_tryget_map(vmf->pmd, vmf->address);
+ if (!vmf->pte)
+ goto retry;
vmf->orig_pte = *vmf->pte;
/*
@@ -4616,6 +4621,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
vmf->pte = NULL;
+ pte_put_vmf(vmf);
}
}
@@ -4626,11 +4632,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
return do_fault(vmf);
}
- if (!pte_present(vmf->orig_pte))
- return do_swap_page(vmf);
+ if (!pte_present(vmf->orig_pte)) {
+ ret = do_swap_page(vmf);
+ goto put;
+ }
- if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
- return do_numa_page(vmf);
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) {
+ ret = do_numa_page(vmf);
+ goto put;
+ }
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
@@ -4640,8 +4650,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
goto unlock;
}
if (vmf->flags & FAULT_FLAG_WRITE) {
- if (!pte_write(entry))
- return do_wp_page(vmf);
+ if (!pte_write(entry)) {
+ ret = do_wp_page(vmf);
+ goto put;
+ }
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
@@ -4663,7 +4675,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
}
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
+ ret = 0;
+put:
+ pte_put_vmf(vmf);
+ return ret;
}
/*
In the page fault path, we need to take a reference on the PTE page
table page if the pmd entry is not none, which ensures that the PTE
page table page will not be released by other threads. However, the
mmap_lock may be unlocked early in some paths of handle_pte_fault(),
and the pmd entry is then no longer stable:

  thread A                      thread B
  page fault                    collapse_huge_page
  ==========                    ==================
  mmap_read_unlock()
                                mmap_write_lock()
                                pgtable_trans_huge_deposit()
                                set_pmd_at()
  /* pmd entry is changed! */
  pte_put()

So we should call pte_put() before dropping the mmap_lock.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 fs/userfaultfd.c |  1 +
 mm/filemap.c     |  2 ++
 mm/internal.h    |  1 +
 mm/khugepaged.c  |  8 +++++++-
 mm/memory.c      | 33 ++++++++++++++++++++++++---------
 5 files changed, 35 insertions(+), 10 deletions(-)
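
For reference, the ordering this patch enforces at every early-unlock
site can be sketched as below. This is only an illustrative sketch:
unlock_mmap_after_pte_put() is a hypothetical helper made up for this
note, and the assumed semantics of pte_put_vmf() (drop the reference
taken on the PTE page table page for this fault, if one was taken) come
from the earlier patches in this series:

  static void unlock_mmap_after_pte_put(struct vm_fault *vmf)
  {
          /*
           * Drop the PTE page table page reference while the pmd
           * entry is still stable, i.e. before releasing mmap_lock
           * and allowing e.g. collapse_huge_page() to repoint the
           * pmd at a huge page.
           */
          pte_put_vmf(vmf);
          mmap_read_unlock(vmf->vma->vm_mm);
  }

The hunks above open-code this ordering rather than adding such a
helper, since the call sites differ slightly (e.g.
maybe_unlock_mmap_for_io() also pins vm_file before unlocking).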