
[RFC,2/2] mm/mremap: Fix race between MOVE_PUD mremap and pageout

Message ID: 20210525083344.162377-2-aneesh.kumar@linux.ibm.com
Series: [RFC,1/2] mm/mremap: Fix race between MOVE_PMD mremap and pageout

Commit Message

Aneesh Kumar K.V May 25, 2021, 8:33 a.m. UTC
CPU 1				CPU 2					CPU 3

mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one

mmap_write_lock_killable()

				addr = old_addr
				lock(pte_ptl)
lock(pud_ptl)
pud = *old_pud
pud_clear(old_pud)
flush_tlb_range(old_addr)

*new_pud = pud
									*new_addr = 10; and fills
									TLB with new addr
									and old pfn

unlock(pud_ptl)
				ptep_clear_flush()
				old pfn is free.
									Stale TLB entry

Fix this race by taking the pud lock in the pageout path as well: with
page_vma_mapped_walk() holding the pud ptl, try_to_unmap_one() serializes
against a concurrent MOVE_PUD move of the same address range.
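
For context, CPU 1's side of the diagram is the MOVE_PUD fast path. A
minimal sketch, simplified from move_normal_pud() in mm/mremap.c of this
era (the new-side ptlock and error handling are elided; the step ordering
follows the diagram above):

static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
			    unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *old_ptl;
	pud_t pud;

	old_ptl = pud_lock(mm, old_pud);	/* lock(pud_ptl) */

	pud = *old_pud;
	pud_clear(old_pud);			/* old_addr translation gone */
	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);

	/* *new_pud = pud: CPU 3 can fault on new_addr from here on */
	pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));

	spin_unlock(old_ptl);			/* unlock(pud_ptl) */
	return true;
}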

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 include/linux/rmap.h |  4 ++++
 mm/page_vma_mapped.c | 13 ++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

Comments

Linus Torvalds May 25, 2021, 5:28 p.m. UTC | #1
On Mon, May 24, 2021 at 10:34 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> @@ -221,6 +222,9 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
>                 spin_unlock(pvmw->pte_ptl);
>         if (pvmw->pmd_ptl)
>                 spin_unlock(pvmw->pmd_ptl);
> +       if (pvmw->pud_ptl)
> +               spin_unlock(pvmw->pud_ptl);
> +
>  }

You have this habit of adding odd whitespace..

But yes, this seems to be the right way to fix the races properly. The
pageout code is special, the pageout code is normally not critical, so
it's the pageout code that should go the extra mile to make up for the
fact that it doesn't hold the mmap_sem like good page table
modification codepaths do.

                Linus
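
Why taking the pud lock in the walk closes the window: pud_lock()
resolves to mm->page_table_lock for every pud (include/linux/mm.h,
~v5.13), so page_vma_mapped_walk() and move_normal_pud() end up
contending on the same spinlock. The helpers involved, in simplified
form:

static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
	/* no split pud ptlocks: one mm-wide lock covers all puds */
	return &mm->page_table_lock;
}

static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
{
	spinlock_t *ptl = pud_lockptr(mm, pud);

	spin_lock(ptl);
	return ptl;
}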

Patch

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 272ab0c2b60b..491c65ce1d46 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -209,6 +209,7 @@ struct page_vma_mapped_walk {
 	pte_t *pte;
 	spinlock_t *pte_ptl;
 	spinlock_t *pmd_ptl;
+	spinlock_t *pud_ptl;
 	unsigned int flags;
 };
 
@@ -221,6 +222,9 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
 		spin_unlock(pvmw->pte_ptl);
 	if (pvmw->pmd_ptl)
 		spin_unlock(pvmw->pmd_ptl);
+	if (pvmw->pud_ptl)
+		spin_unlock(pvmw->pud_ptl);
+
 }
 
 bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 87a2c94c7e27..c913bc34b1d3 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -180,8 +180,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 	pud = pud_offset(p4d, pvmw->address);
 	if (!pud_present(*pud))
 		return false;
+
+	pvmw->pud_ptl = pud_lock(mm, pud);
 	pvmw->pmd = pmd_offset(pud, pvmw->address);
-	pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
+	if (USE_SPLIT_PMD_PTLOCKS)
+		pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
 	/*
 	 * Make sure the pmd value isn't cached in a register by the
 	 * compiler and used as a stale value after we've observed a
@@ -235,8 +238,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 					spin_unlock(pvmw->pte_ptl);
 					pvmw->pte_ptl = NULL;
 				}
-				spin_unlock(pvmw->pmd_ptl);
-				pvmw->pmd_ptl = NULL;
+				if (pvmw->pmd_ptl) {
+					spin_unlock(pvmw->pmd_ptl);
+					pvmw->pmd_ptl = NULL;
+				}
+				spin_unlock(pvmw->pud_ptl);
+				pvmw->pud_ptl = NULL;
 				goto restart;
 			} else {
 				pvmw->pte++;
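
A note on the USE_SPLIT_PMD_PTLOCKS guard in the first hunk: with split
PMD ptlocks disabled, pmd_lock() falls back to mm->page_table_lock, the
very lock pud_lock() has just taken a few lines up, so taking it
unconditionally would self-deadlock. The shape of the fallback
(simplified from include/linux/mm.h):

#if USE_SPLIT_PMD_PTLOCKS
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	/* split case: per-page-table lock, distinct from page_table_lock */
	return ptlock_ptr(pmd_to_page(pmd));
}
#else
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	/* fallback: the same mm-wide lock that pud_lock() takes */
	return &mm->page_table_lock;
}
#endif

The same asymmetry is why the restart path in the second hunk only
unlocks pmd_ptl when it was actually taken, while pud_ptl is dropped
unconditionally.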