@@ -39,6 +39,9 @@ struct folio_batch;
*
* Use the PAGETABLE_MOVE() macro to initialise this struct.
*
+ * The old_addr and new_addr fields are updated as the page table move is
+ * executed.
+ *
* NOTE: The page table move is affected by reading from [old_addr, old_end),
* and old_addr may be updated for better page table alignment, so len_in
* represents the length of the range being copied as specified by the user.
@@ -107,8 +107,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return pmd;
}
-static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr)
+static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -121,13 +120,12 @@ static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
return pud_alloc(mm, p4d, addr);
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
pud_t *pud;
pmd_t *pmd;
- pud = alloc_new_pud(mm, vma, addr);
+ pud = alloc_new_pud(mm, addr);
if (!pud)
return NULL;
@@ -171,17 +169,19 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
-static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
- unsigned long old_addr, unsigned long old_end,
- struct vm_area_struct *new_vma, pmd_t *new_pmd,
- unsigned long new_addr, bool need_rmap_locks)
+static int move_ptes(struct pagetable_move_control *pmc,
+ unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
{
+ struct vm_area_struct *vma = pmc->old;
bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
pmd_t dummy_pmdval;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
+ unsigned long old_addr = pmc->old_addr;
+ unsigned long new_addr = pmc->new_addr;
+ unsigned long old_end = old_addr + extent;
unsigned long len = old_end - old_addr;
int err = 0;
@@ -203,7 +203,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* serialize access to individual ptes, but only rmap traversal
* order guarantees that we won't miss both the old and new ptes).
*/
- if (need_rmap_locks)
+ if (pmc->need_rmap_locks)
take_rmap_locks(vma);
/*
@@ -277,7 +277,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
out:
- if (need_rmap_locks)
+ if (pmc->need_rmap_locks)
drop_rmap_locks(vma);
return err;
}
@@ -292,10 +292,11 @@ static inline bool arch_supports_page_table_move(void)
#endif
#ifdef CONFIG_HAVE_MOVE_PMD
-static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
+static bool move_normal_pmd(struct pagetable_move_control *pmc,
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
bool res = false;
pmd_t pmd;
@@ -341,7 +342,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+ old_ptl = pmd_lock(mm, old_pmd);
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -358,7 +359,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pmd_none(*new_pmd));
pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
- flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE);
out_unlock:
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
@@ -367,19 +368,19 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
return res;
}
#else
-static inline bool move_normal_pmd(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
- pmd_t *new_pmd)
+static inline bool move_normal_pmd(struct pagetable_move_control *pmc,
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
-static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_normal_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
@@ -405,7 +406,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pud_lock(vma->vm_mm, old_pud);
+ old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -417,7 +418,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pud_none(*new_pud));
pud_populate(mm, new_pud, pud_pgtable(pud));
- flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+ flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -425,19 +426,19 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
return true;
}
#else
-static inline bool move_normal_pud(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
- pud_t *new_pud)
+static inline bool move_normal_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
return false;
}
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
-static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_huge_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
@@ -452,7 +453,7 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pud_lock(vma->vm_mm, old_pud);
+ old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -465,8 +466,8 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
/* Set the new pud */
/* mark soft_ditry when we add pud level soft dirty support */
- set_pud_at(mm, new_addr, new_pud, pud);
- flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
+ set_pud_at(mm, pmc->new_addr, new_pud, pud);
+ flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -496,10 +497,12 @@ enum pgt_entry {
* destination pgt_entry.
*/
static __always_inline unsigned long get_extent(enum pgt_entry entry,
- unsigned long old_addr, unsigned long old_end,
- unsigned long new_addr)
+ struct pagetable_move_control *pmc)
{
unsigned long next, extent, mask, size;
+ unsigned long old_addr = pmc->old_addr;
+ unsigned long old_end = pmc->old_end;
+ unsigned long new_addr = pmc->new_addr;
switch (entry) {
case HPAGE_PMD:
@@ -528,38 +531,54 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry,
return extent;
}
+/*
+ * Should move_pgt_entry() acquire the rmap locks? This is either expressed in
+ * the PMC, or overridden in the case of normal, larger page tables.
+ */
+static bool should_take_rmap_locks(struct pagetable_move_control *pmc,
+ enum pgt_entry entry)
+{
+ if (pmc->need_rmap_locks)
+ return true;
+
+ switch (entry) {
+ case NORMAL_PMD:
+ case NORMAL_PUD:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* Attempts to speedup the move by moving entry at the level corresponding to
* pgt_entry. Returns true if the move was successful, else false.
*/
-static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr,
- void *old_entry, void *new_entry, bool need_rmap_locks)
+static bool move_pgt_entry(struct pagetable_move_control *pmc,
+ enum pgt_entry entry, void *old_entry, void *new_entry)
{
bool moved = false;
+ bool need_rmap_locks = should_take_rmap_locks(pmc, entry);
/* See comment in move_ptes() */
if (need_rmap_locks)
- take_rmap_locks(vma);
+ take_rmap_locks(pmc->old);
switch (entry) {
case NORMAL_PMD:
- moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
- new_entry);
+ moved = move_normal_pmd(pmc, old_entry, new_entry);
break;
case NORMAL_PUD:
- moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
- new_entry);
+ moved = move_normal_pud(pmc, old_entry, new_entry);
break;
case HPAGE_PMD:
moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- move_huge_pmd(vma, old_addr, new_addr, old_entry,
+ move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry,
new_entry);
break;
case HPAGE_PUD:
moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- move_huge_pud(vma, old_addr, new_addr, old_entry,
- new_entry);
+ move_huge_pud(pmc, old_entry, new_entry);
break;
default:
@@ -568,7 +587,7 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
}
if (need_rmap_locks)
- drop_rmap_locks(vma);
+ drop_rmap_locks(pmc->old);
return moved;
}
@@ -704,19 +723,48 @@ static void try_realign_addr(struct pagetable_move_control *pmc,
pmc->new_addr &= pagetable_mask;
}
+/* Is the page table move operation done? */
+static bool pmc_done(struct pagetable_move_control *pmc)
+{
+ return pmc->old_addr >= pmc->old_end;
+}
+
+/* Advance to the next page table, offset by extent bytes. */
+static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent)
+{
+ pmc->old_addr += extent;
+ pmc->new_addr += extent;
+}
+
+/*
+ * Determine how many bytes in the specified input range have had their page
+ * tables moved so far.
+ */
+static unsigned long pmc_progress(struct pagetable_move_control *pmc)
+{
+ unsigned long orig_old_addr = pmc->old_end - pmc->len_in;
+ unsigned long old_addr = pmc->old_addr;
+
+ /*
+ * Prevent negative return values when {old,new}_addr was realigned but
+ * we broke out of the loop in move_page_tables() for the first PMD
+ * itself.
+ */
+ return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr;
+}
+
unsigned long move_page_tables(struct pagetable_move_control *pmc)
{
- unsigned long extent, old_end;
+ unsigned long extent;
struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
pud_t *old_pud, *new_pud;
- unsigned long old_addr, new_addr;
- struct vm_area_struct *vma = pmc->old;
+ struct mm_struct *mm = pmc->old->vm_mm;
if (!pmc->len_in)
return 0;
- if (is_vm_hugetlb_page(vma))
+ if (is_vm_hugetlb_page(pmc->old))
return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
pmc->new_addr, pmc->len_in);
@@ -725,87 +773,71 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc)
* Only realign if the mremap copying hits a PMD boundary.
*/
try_realign_addr(pmc, PMD_MASK);
- /* These may have been changed. */
- old_addr = pmc->old_addr;
- new_addr = pmc->new_addr;
- old_end = pmc->old_end;
-
- flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
- old_addr, old_end);
+
+ flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm,
+ pmc->old_addr, pmc->old_end);
mmu_notifier_invalidate_range_start(&range);
- for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
+ for (; !pmc_done(pmc); pmc_next(pmc, extent)) {
cond_resched();
/*
* If extent is PUD-sized try to speed up the move by moving at the
* PUD level if possible.
*/
- extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
+ extent = get_extent(NORMAL_PUD, pmc);
- old_pud = get_old_pud(vma->vm_mm, old_addr);
+ old_pud = get_old_pud(mm, pmc->old_addr);
if (!old_pud)
continue;
- new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+ new_pud = alloc_new_pud(mm, pmc->new_addr);
if (!new_pud)
break;
if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
if (extent == HPAGE_PUD_SIZE) {
- move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, pmc->need_rmap_locks);
+ move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
/* We ignore and continue on error? */
continue;
}
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
- if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, true))
+ if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud))
continue;
}
- extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
- old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+ extent = get_extent(NORMAL_PMD, pmc);
+ old_pmd = get_old_pmd(mm, pmc->old_addr);
if (!old_pmd)
continue;
- new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
+ new_pmd = alloc_new_pmd(mm, pmc->new_addr);
if (!new_pmd)
break;
again:
if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
- move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, pmc->need_rmap_locks))
+ move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
continue;
- split_huge_pmd(vma, old_pmd, old_addr);
+ split_huge_pmd(pmc->old, old_pmd, pmc->old_addr);
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
extent == PMD_SIZE) {
/*
* If the extent is PMD-sized, try to speed the move by
* moving at the PMD level if possible.
*/
- if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, true))
+ if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd))
continue;
}
if (pmd_none(*old_pmd))
continue;
if (pte_alloc(pmc->new->vm_mm, new_pmd))
break;
- if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
- pmc->new, new_pmd, new_addr, pmc->need_rmap_locks) < 0)
+ if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0)
goto again;
}
mmu_notifier_invalidate_range_end(&range);
- /*
- * Prevent negative return values when {old,new}_addr was realigned
- * but we broke out of the above loop for the first PMD itself.
- */
- if (old_addr < old_end - pmc->len_in)
- return 0;
-
- return pmc->len_in + old_addr - old_end; /* how much done */
+ return pmc_progress(pmc);
}
/* Set vrm->delta to the difference in VMA size specified by user. */
Finish refactoring the page table logic by threading the PMC state
throughout the operation, allowing us to control the operation as we go.

Additionally, update the old_addr and new_addr fields in move_page_tables()
as we progress through the process, making use of the fact that we now have
this state object to track them.

With these changes made, not only is the code far more readable, but we can
finally transmit state throughout the entire operation, which lays the
groundwork for sensibly making changes in the future to how the mremap()
operation is performed.

Additionally, take the opportunity to refactor the means of determining the
progress of the operation, abstracting this to pmc_progress() and
simplifying the logic to make it clearer what's going on.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 mm/internal.h |   3 +
 mm/mremap.c   | 196 +++++++++++++++++++++++++++++---------------------
 2 files changed, 117 insertions(+), 82 deletions(-)