@@ -18,7 +18,8 @@ There are helpers to lock/unlock a table and other accessor functions:
pointer to its PTE table lock, or returns NULL if no PTE table;
- pte_offset_map_nolock()
maps PTE, returns pointer to PTE with pointer to its PTE table
- lock (not taken), or returns NULL if no PTE table;
+ lock (not taken) and the value of its pmd entry, or returns NULL
+ if no PTE table;
- pte_offset_map()
maps PTE, returns pointer to PTE, or returns NULL if no PTE table;
- pte_unmap()
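
(For illustration only, not part of this patch: a hypothetical caller showing the two ways to use the updated helper described above - pass NULL as pmdvalp when the pmd value is not needed, as the converted call sites below do, or pass a pmd_t * to capture the value that was read while mapping the PTE table. Function and variable names here are made up.)

#include <linux/mm.h>

/* Hypothetical usage sketch; not taken from this patch. */
static void example_peek_pte(struct mm_struct *mm, pmd_t *pmd,
			     unsigned long addr)
{
	spinlock_t *ptl;
	pmd_t pmdval;
	pte_t *pte;

	/* Caller not interested in the pmd value: pass NULL. */
	pte = pte_offset_map_nolock(mm, pmd, NULL, addr, &ptl);
	if (pte)
		pte_unmap(pte);

	/* Caller also wants the pmd value that was mapped. */
	pte = pte_offset_map_nolock(mm, pmd, &pmdval, addr, &ptl);
	if (pte)
		pte_unmap(pte);
}
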
@@ -117,7 +117,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
* must use the nested version. This also means we need to
* open-code the spin-locking.
*/
- pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl);
+ pte = pte_offset_map_nolock(vma->vm_mm, pmd, NULL, address, &ptl);
if (!pte)
return 0;
@@ -350,7 +350,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
*/
if (pmd_none(*pmd))
return;
- pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+ pte = pte_offset_map_nolock(mm, pmd, NULL, addr, &ptl);
BUG_ON(!pte);
assert_spin_locked(ptl);
pte_unmap(pte);
@@ -2979,8 +2979,8 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
return pte;
}
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdvalp,
+ unsigned long addr, spinlock_t **ptlp);
#define pte_unmap_unlock(pte, ptl) do { \
spin_unlock(ptl); \
@@ -3231,7 +3231,7 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
return 0;
- ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
+ ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, NULL, vmf->address,
&vmf->ptl);
if (unlikely(!ptep))
return VM_FAULT_NOPAGE;
@@ -989,7 +989,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
};
if (!pte++) {
- pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
+ pte = pte_offset_map_nolock(mm, pmd, NULL, address, &ptl);
if (!pte) {
mmap_read_unlock(mm);
result = SCAN_PMD_NULL;
@@ -1578,7 +1578,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
pml = pmd_lock(mm, pmd);
- start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+ start_pte = pte_offset_map_nolock(mm, pmd, NULL, haddr, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
goto abort;
if (!pml)
@@ -1108,7 +1108,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
ret = -ENOMEM;
goto out;
}
- src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+ src_pte = pte_offset_map_nolock(src_mm, src_pmd, NULL, addr, &src_ptl);
if (!src_pte) {
pte_unmap_unlock(dst_pte, dst_ptl);
/* ret == 0 */
@@ -5507,7 +5507,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* it into a huge pmd: just retry later if so.
*/
vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, &vmf->ptl);
+ NULL, vmf->address, &vmf->ptl);
if (unlikely(!vmf->pte))
return 0;
vmf->orig_pte = ptep_get_lockless(vmf->pte);
@@ -175,7 +175,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
err = -EAGAIN;
goto out;
}
- new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
+ new_pte = pte_offset_map_nolock(mm, new_pmd, NULL, new_addr, &new_ptl);
if (!new_pte) {
pte_unmap_unlock(old_pte, old_ptl);
err = -EAGAIN;
@@ -33,7 +33,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
* Though, in most cases, page lock already protects this.
*/
pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
- pvmw->address, ptlp);
+ NULL, pvmw->address, ptlp);
if (!pvmw->pte)
return false;
@@ -305,7 +305,7 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
return NULL;
}
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdvalp,
unsigned long addr, spinlock_t **ptlp)
{
pmd_t pmdval;
@@ -314,6 +314,8 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
pte = __pte_offset_map(pmd, addr, &pmdval);
if (likely(pte))
*ptlp = pte_lockptr(mm, &pmdval);
+ if (pmdvalp)
+ *pmdvalp = pmdval;
return pte;
}
@@ -347,14 +349,15 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
* and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
* afterwards.
*
- * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
- * but when successful, it also outputs a pointer to the spinlock in ptlp - as
- * pte_offset_map_lock() does, but in this case without locking it. This helps
- * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
- * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
- * pointer for the page table that it returns. In principle, the caller should
- * recheck *pmd once the lock is taken; in practice, no callsite needs that -
- * either the mmap_lock for write, or pte_same() check on contents, is enough.
+ * pte_offset_map_nolock(mm, pmd, pmdvalp, addr, ptlp), above, is like
+ * pte_offset_map(); but when successful, it also outputs a pointer to the
+ * spinlock in ptlp - as pte_offset_map_lock() does, but in this case without
+ * locking it. This helps the caller to avoid a later pte_lockptr(mm, *pmd),
+ * which might by that time act on a changed *pmd: pte_offset_map_nolock()
+ * provides the correct spinlock pointer for the page table that it returns.
+ * In principle, the caller should recheck *pmd once the lock is taken; but in
+ * most cases, either the mmap_lock for write, or pte_same() check on contents,
+ * is enough.
*
* Note that free_pgtables(), used after unmapping detached vmas, or when
* exiting the whole mm, does not take page table lock before freeing a page
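
(To make the recheck mentioned above concrete - a hypothetical helper, not part of this patch - the returned pmdval can be validated against *pmd once the lock is taken, mirroring the pmd_same() check that __pte_offset_map_lock() already performs internally. The function name is made up.)

/* Hypothetical sketch; not taken from this patch. */
static pte_t *example_map_and_lock(struct mm_struct *mm, pmd_t *pmd,
				   unsigned long addr, spinlock_t **ptlp)
{
	spinlock_t *ptl;
	pmd_t pmdval;
	pte_t *pte;

	pte = pte_offset_map_nolock(mm, pmd, &pmdval, addr, &ptl);
	if (!pte)
		return NULL;	/* no PTE table */

	spin_lock(ptl);
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		/* The PTE table was freed or replaced under us: bail out. */
		spin_unlock(ptl);
		pte_unmap(pte);
		return NULL;
	}

	*ptlp = ptl;
	return pte;	/* caller ends with pte_unmap_unlock(pte, *ptlp) */
}
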
@@ -1143,7 +1143,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
src_addr, src_addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
retry:
- dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
+ dst_pte = pte_offset_map_nolock(mm, dst_pmd, NULL, dst_addr, &dst_ptl);
/* Retry if a huge pmd materialized from under us */
if (unlikely(!dst_pte)) {
@@ -1151,7 +1151,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
goto out;
}
- src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
+ src_pte = pte_offset_map_nolock(mm, src_pmd, NULL, src_addr, &src_ptl);
/*
* We held the mmap_lock for reading so MADV_DONTNEED
@@ -3373,7 +3373,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
DEFINE_MAX_SEQ(walk->lruvec);
int old_gen, new_gen = lru_gen_from_seq(max_seq);
- pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
+ pte = pte_offset_map_nolock(args->mm, pmd, NULL, start & PMD_MASK, &ptl);
if (!pte)
return false;
if (!spin_trylock(ptl)) {
Make pte_offset_map_nolock() return pmdval so that we can recheck the *pmd
once the lock is taken. This is a preparation for freeing empty PTE pages;
no functional changes are expected.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 Documentation/mm/split_page_table_lock.rst |  3 ++-
 arch/arm/mm/fault-armv.c                   |  2 +-
 arch/powerpc/mm/pgtable.c                  |  2 +-
 include/linux/mm.h                         |  4 ++--
 mm/filemap.c                               |  2 +-
 mm/khugepaged.c                            |  4 ++--
 mm/memory.c                                |  4 ++--
 mm/mremap.c                                |  2 +-
 mm/page_vma_mapped.c                       |  2 +-
 mm/pgtable-generic.c                       | 21 ++++++++++++---------
 mm/userfaultfd.c                           |  4 ++--
 mm/vmscan.c                                |  2 +-
 12 files changed, 28 insertions(+), 24 deletions(-)