diff mbox series

[RFC,2/3] mm: pgtable: make pte_offset_map_nolock() return pmdval

Message ID b94d5903dbe6fd792d05f790c9a244d384561f81.1718267194.git.zhengqi.arch@bytedance.com (mailing list archive)
State New
Headers show
Series asynchronously scan and free empty user PTE pages | expand

Commit Message

Qi Zheng June 13, 2024, 8:38 a.m. UTC
Make pte_offset_map_nolock() also return pmdval, through a new optional
pmdvalp output parameter, so that callers can recheck *pmd once the lock
is taken. This is preparation for freeing empty PTE pages; no functional
changes are expected.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 Documentation/mm/split_page_table_lock.rst |  3 ++-
 arch/arm/mm/fault-armv.c                   |  2 +-
 arch/powerpc/mm/pgtable.c                  |  2 +-
 include/linux/mm.h                         |  4 ++--
 mm/filemap.c                               |  2 +-
 mm/khugepaged.c                            |  4 ++--
 mm/memory.c                                |  4 ++--
 mm/mremap.c                                |  2 +-
 mm/page_vma_mapped.c                       |  2 +-
 mm/pgtable-generic.c                       | 21 ++++++++++++---------
 mm/userfaultfd.c                           |  4 ++--
 mm/vmscan.c                                |  2 +-
 12 files changed, 28 insertions(+), 24 deletions(-)
diff mbox series

Patch

diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst
index e4f6972eb6c0..e6a47d57531c 100644
--- a/Documentation/mm/split_page_table_lock.rst
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -18,7 +18,8 @@  There are helpers to lock/unlock a table and other accessor functions:
 	pointer to its PTE table lock, or returns NULL if no PTE table;
  - pte_offset_map_nolock()
 	maps PTE, returns pointer to PTE with pointer to its PTE table
-	lock (not taken), or returns NULL if no PTE table;
+	lock (not taken) and the value of its pmd entry, or returns NULL
+	if no PTE table;
  - pte_offset_map()
 	maps PTE, returns pointer to PTE, or returns NULL if no PTE table;
  - pte_unmap()
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 2286c2ea60ec..3e4ed99b9330 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -117,7 +117,7 @@  static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
 	 * must use the nested version.  This also means we need to
 	 * open-code the spin-locking.
 	 */
-	pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl);
+	pte = pte_offset_map_nolock(vma->vm_mm, pmd, NULL, address, &ptl);
 	if (!pte)
 		return 0;
 
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9e7ba9c3851f..ab0250f1b226 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -350,7 +350,7 @@  void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 	 */
 	if (pmd_none(*pmd))
 		return;
-	pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+	pte = pte_offset_map_nolock(mm, pmd, NULL, addr, &ptl);
 	BUG_ON(!pte);
 	assert_spin_locked(ptl);
 	pte_unmap(pte);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 106bb0310352..d5550c3dc550 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2969,8 +2969,8 @@  static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
 	return pte;
 }
 
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
-			unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdvalp,
+			     unsigned long addr, spinlock_t **ptlp);
 
 #define pte_unmap_unlock(pte, ptl)	do {		\
 	spin_unlock(ptl);				\
diff --git a/mm/filemap.c b/mm/filemap.c
index 37061aafd191..7eb2e3599966 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3231,7 +3231,7 @@  static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
 	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
 		return 0;
 
-	ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
+	ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, NULL, vmf->address,
 				     &vmf->ptl);
 	if (unlikely(!ptep))
 		return VM_FAULT_NOPAGE;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 774a97e6e2da..2a8703ee876c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -992,7 +992,7 @@  static int __collapse_huge_page_swapin(struct mm_struct *mm,
 		};
 
 		if (!pte++) {
-			pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
+			pte = pte_offset_map_nolock(mm, pmd, NULL, address, &ptl);
 			if (!pte) {
 				mmap_read_unlock(mm);
 				result = SCAN_PMD_NULL;
@@ -1581,7 +1581,7 @@  int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
 		pml = pmd_lock(mm, pmd);
 
-	start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+	start_pte = pte_offset_map_nolock(mm, pmd, NULL, haddr, &ptl);
 	if (!start_pte)		/* mmap_lock + page lock should prevent this */
 		goto abort;
 	if (!pml)
diff --git a/mm/memory.c b/mm/memory.c
index 1bd2ffb76ec2..694c0989a1d8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1108,7 +1108,7 @@  copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		ret = -ENOMEM;
 		goto out;
 	}
-	src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+	src_pte = pte_offset_map_nolock(src_mm, src_pmd, NULL, addr, &src_ptl);
 	if (!src_pte) {
 		pte_unmap_unlock(dst_pte, dst_ptl);
 		/* ret == 0 */
@@ -5486,7 +5486,7 @@  static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		 * it into a huge pmd: just retry later if so.
 		 */
 		vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
-						 vmf->address, &vmf->ptl);
+						 NULL, vmf->address, &vmf->ptl);
 		if (unlikely(!vmf->pte))
 			return 0;
 		vmf->orig_pte = ptep_get_lockless(vmf->pte);
diff --git a/mm/mremap.c b/mm/mremap.c
index e7ae140fc640..f672d0218a6f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -175,7 +175,7 @@  static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		err = -EAGAIN;
 		goto out;
 	}
-	new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
+	new_pte = pte_offset_map_nolock(mm, new_pmd, NULL, new_addr, &new_ptl);
 	if (!new_pte) {
 		pte_unmap_unlock(old_pte, old_ptl);
 		err = -EAGAIN;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index ae5cc42aa208..507701b7bcc1 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -33,7 +33,7 @@  static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
 	 * Though, in most cases, page lock already protects this.
 	 */
 	pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
-					  pvmw->address, ptlp);
+					  NULL, pvmw->address, ptlp);
 	if (!pvmw->pte)
 		return false;
 
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 197937495a0a..b8b28715cb4f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -305,7 +305,7 @@  pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 	return NULL;
 }
 
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdvalp,
 			     unsigned long addr, spinlock_t **ptlp)
 {
 	pmd_t pmdval;
@@ -314,6 +314,8 @@  pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
 	pte = __pte_offset_map(pmd, addr, &pmdval);
 	if (likely(pte))
 		*ptlp = pte_lockptr(mm, &pmdval);
+	if (pmdvalp)
+		*pmdvalp = pmdval;
 	return pte;
 }
 
@@ -347,14 +349,15 @@  pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
  * and disconnected table.  Until pte_unmap(pte) unmaps and rcu_read_unlock()s
  * afterwards.
  *
- * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
- * but when successful, it also outputs a pointer to the spinlock in ptlp - as
- * pte_offset_map_lock() does, but in this case without locking it.  This helps
- * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
- * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
- * pointer for the page table that it returns.  In principle, the caller should
- * recheck *pmd once the lock is taken; in practice, no callsite needs that -
- * either the mmap_lock for write, or pte_same() check on contents, is enough.
+ * pte_offset_map_nolock(mm, pmd, pmdvalp, addr, ptlp), above, is like
+ * pte_offset_map(); but when successful, it also outputs a pointer to the
+ * spinlock in ptlp - as pte_offset_map_lock() does, but in this case without
+ * locking it.  This helps the caller to avoid a later pte_lockptr(mm, *pmd),
+ * which might by that time act on a changed *pmd: pte_offset_map_nolock()
+ * provides the correct spinlock pointer for the page table that it returns.
+ * In principle, the caller should recheck *pmd once the lock is taken; but in
+ * most cases, either the mmap_lock for write, or pte_same() check on contents,
+ * is enough.
  *
  * Note that free_pgtables(), used after unmapping detached vmas, or when
  * exiting the whole mm, does not take page table lock before freeing a page
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 5e7f2801698a..9c77271d499c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1143,7 +1143,7 @@  static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
 				src_addr, src_addr + PAGE_SIZE);
 	mmu_notifier_invalidate_range_start(&range);
 retry:
-	dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
+	dst_pte = pte_offset_map_nolock(mm, dst_pmd, NULL, dst_addr, &dst_ptl);
 
 	/* Retry if a huge pmd materialized from under us */
 	if (unlikely(!dst_pte)) {
@@ -1151,7 +1151,7 @@  static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
 		goto out;
 	}
 
-	src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
+	src_pte = pte_offset_map_nolock(mm, src_pmd, NULL, src_addr, &src_ptl);
 
 	/*
 	 * We held the mmap_lock for reading so MADV_DONTNEED
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c0429fd6c573..56727caa907b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3374,7 +3374,7 @@  static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 	DEFINE_MAX_SEQ(walk->lruvec);
 	int old_gen, new_gen = lru_gen_from_seq(max_seq);
 
-	pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
+	pte = pte_offset_map_nolock(args->mm, pmd, NULL, start & PMD_MASK, &ptl);
 	if (!pte)
 		return false;
 	if (!spin_trylock(ptl)) {