@@ -207,7 +207,8 @@ struct page_vma_mapped_walk {
unsigned long address;
pmd_t *pmd;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *pte_ptl;
+ spinlock_t *pmd_ptl;
unsigned int flags;
};
@@ -216,8 +217,10 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
if (pvmw->pte && !PageHuge(pvmw->page))
pte_unmap(pvmw->pte);
- if (pvmw->ptl)
- spin_unlock(pvmw->ptl);
+ if (pvmw->pte_ptl)
+ spin_unlock(pvmw->pte_ptl);
+ if (pvmw->pmd_ptl)
+ spin_unlock(pvmw->pmd_ptl);
}
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
@@ -47,8 +47,10 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return false;
}
}
- pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
- spin_lock(pvmw->ptl);
+ if (USE_SPLIT_PTE_PTLOCKS) {
+ pvmw->pte_ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+ spin_lock(pvmw->pte_ptl);
+ }
return true;
}
@@ -162,8 +164,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pvmw->pte)
return false;
- pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
- spin_lock(pvmw->ptl);
+ pvmw->pte_ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+ spin_lock(pvmw->pte_ptl);
if (!check_pte(pvmw))
return not_found(pvmw);
return true;
@@ -179,6 +181,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pud_present(*pud))
return false;
pvmw->pmd = pmd_offset(pud, pvmw->address);
+ pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
/*
* Make sure the pmd value isn't cached in a register by the
* compiler and used as a stale value after we've observed a
@@ -186,7 +189,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
*/
pmde = READ_ONCE(*pvmw->pmd);
if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
- pvmw->ptl = pmd_lock(mm, pvmw->pmd);
if (likely(pmd_trans_huge(*pvmw->pmd))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
@@ -206,14 +208,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
}
}
return not_found(pvmw);
- } else {
- /* THP pmd was split under us: handle on pte level */
- spin_unlock(pvmw->ptl);
- pvmw->ptl = NULL;
}
- } else if (!pmd_present(pmde)) {
- return false;
- }
+ } else if (!pmd_present(pmde))
+ return not_found(pvmw);
+
if (!map_pte(pvmw))
goto next_pte;
while (1) {
@@ -233,19 +231,21 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
/* Did we cross page table boundary? */
if (pvmw->address % PMD_SIZE == 0) {
pte_unmap(pvmw->pte);
- if (pvmw->ptl) {
- spin_unlock(pvmw->ptl);
- pvmw->ptl = NULL;
+ if (pvmw->pte_ptl) {
+ spin_unlock(pvmw->pte_ptl);
+ pvmw->pte_ptl = NULL;
}
+ spin_unlock(pvmw->pmd_ptl);
+ pvmw->pmd_ptl = NULL;
goto restart;
} else {
pvmw->pte++;
}
} while (pte_none(*pvmw->pte));
- if (!pvmw->ptl) {
- pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
- spin_lock(pvmw->ptl);
+ if (USE_SPLIT_PTE_PTLOCKS && !pvmw->pte_ptl) {
+ pvmw->pte_ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->pte_ptl);
}
}
}
CPU 1                           CPU 2                                   CPU 3

mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one

mmap_write_lock_killable()

                                addr = old_addr
                                lock(pte_ptl)
lock(pmd_ptl)
pmd = *old_pmd
pmd_clear(old_pmd)
flush_tlb_range(old_addr)

*new_pmd = pmd
                                                                        *new_addr = 10; and fills
                                                                        TLB with new addr
                                                                        and old pfn

unlock(pmd_ptl)
                                ptep_clear_flush()
                                old pfn is free.
                                                                        Stale TLB entry

Fix this race by also holding the pmd lock on the pageout side. This
still doesn't handle the race between MOVE_PUD and pageout.

Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 include/linux/rmap.h |  9 ++++++---
 mm/page_vma_mapped.c | 36 ++++++++++++++++++------------------
 2 files changed, 24 insertions(+), 21 deletions(-)
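
For reference, below is a minimal user-space sketch of the lock ordering
the patch enforces. It is an analogue only, not kernel code: pthread
mutexes stand in for pmd_ptl/pte_ptl, the fake pmd value models the pte
page being moved, and the names mremap_side()/pageout_side() are made up
for illustration (the real paths are move_normal_pmd() in mm/mremap.c
and page_vma_mapped_walk() in mm/page_vma_mapped.c). Builds with
"cc -pthread".

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pmd_ptl = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t pte_ptl = PTHREAD_MUTEX_INITIALIZER;

static unsigned long old_pmd = 0xabc000UL;	/* stands in for the pte page */
static unsigned long new_pmd;

/* CPU 1: move a whole pmd entry, as move_normal_pmd() does. */
static void *mremap_side(void *arg)
{
	unsigned long pmd;

	(void)arg;
	pthread_mutex_lock(&pmd_ptl);		/* lock(pmd_ptl)        */
	pmd = old_pmd;				/* pmd = *old_pmd       */
	old_pmd = 0;				/* pmd_clear(old_pmd)   */
	/* flush_tlb_range(old_addr) happens here in the kernel    */
	new_pmd = pmd;				/* *new_pmd = pmd       */
	pthread_mutex_unlock(&pmd_ptl);		/* unlock(pmd_ptl)      */
	return NULL;
}

/*
 * CPU 2: pageout. Before the patch this path took only pte_ptl and
 * could therefore run inside CPU 1's critical section; taking
 * pmd_ptl first makes the two sections mutually exclusive.
 */
static void *pageout_side(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&pmd_ptl);		/* the fix: pvmw->pmd_ptl */
	pthread_mutex_lock(&pte_ptl);		/* pvmw->pte_ptl          */
	if (old_pmd)
		printf("pageout ran before the move, pmd=%#lx\n", old_pmd);
	else
		printf("pageout ran after the move, pmd=%#lx\n", new_pmd);
	pthread_mutex_unlock(&pte_ptl);
	pthread_mutex_unlock(&pmd_ptl);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, mremap_side, NULL);
	pthread_create(&t2, NULL, pageout_side, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}

The point is only the mutual exclusion: with pmd_ptl taken on both
sides, ptep_clear_flush() can run only entirely before or entirely
after the pmd move, so the old pfn can no longer be freed while CPU 3
still holds a TLB entry mapping it at new_addr.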