@@ -14,6 +14,7 @@
#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swapops.h>
+#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start,
if (!vma_is_anonymous(walk->vma))
return migrate_vma_collect_skip(start, end, walk);
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+
+ /*
+ * Collect the remaining entries as holes, in case we
+ * need to split later
+ */
+ return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ }
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
@@ -54,50 +72,145 @@ static int migrate_vma_collect_hole(unsigned long start,
return 0;
}
-static int migrate_vma_collect_pmd(pmd_t *pmdp,
- unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
+/**
+ * migrate_vma_collect_huge_pmd - collect THP pages without splitting the
+ * folio for device private pages.
+ * @pmdp: pointer to pmd entry
+ * @start: start address of the range for migration
+ * @end: end address of the range for migration
+ * @walk: mm_walk callback structure
+ *
+ * Collect the huge pmd entry at @pmdp for migration and set the
+ * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that
+ * migration will occur at HPAGE_PMD granularity
+ */
+static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
{
+ struct mm_struct *mm = walk->mm;
+ struct folio *folio;
struct migrate_vma *migrate = walk->private;
- struct vm_area_struct *vma = walk->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr = start, unmapped = 0;
+ swp_entry_t entry;
+ int ret;
+ unsigned long write = 0;
+
spinlock_t *ptl;
- pte_t *ptep;
-again:
- if (pmd_none(*pmdp))
+ ptl = pmd_lock(mm, pmdp);
+ if (pmd_none(*pmdp)) {
+ spin_unlock(ptl);
return migrate_vma_collect_hole(start, end, -1, walk);
+ }
if (pmd_trans_huge(*pmdp)) {
- struct folio *folio;
-
- ptl = pmd_lock(mm, pmdp);
- if (unlikely(!pmd_trans_huge(*pmdp))) {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
spin_unlock(ptl);
- goto again;
+ return migrate_vma_collect_skip(start, end, walk);
}
folio = pmd_folio(*pmdp);
if (is_huge_zero_folio(folio)) {
spin_unlock(ptl);
- split_huge_pmd(vma, pmdp, addr);
- } else {
- int ret;
+ return migrate_vma_collect_hole(start, end, -1, walk);
+ }
+ if (pmd_write(*pmdp))
+ write = MIGRATE_PFN_WRITE;
+ } else if (!pmd_present(*pmdp)) {
+ entry = pmd_to_swp_entry(*pmdp);
+ folio = pfn_swap_entry_folio(entry);
+
+ if (!is_device_private_entry(entry) ||
+ !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ (folio->pgmap->owner != migrate->pgmap_owner)) {
+ spin_unlock(ptl);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
- folio_get(folio);
+ if (is_migration_entry(entry)) {
+ migration_entry_wait_on_locked(entry, ptl);
spin_unlock(ptl);
- if (unlikely(!folio_trylock(folio)))
- return migrate_vma_collect_skip(start, end,
- walk);
- ret = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (ret)
- return migrate_vma_collect_skip(start, end,
- walk);
+ return -EAGAIN;
}
+
+ if (is_writable_device_private_entry(entry))
+ write = MIGRATE_PFN_WRITE;
+ } else {
+ spin_unlock(ptl);
+ return -EAGAIN;
+ }
+
+ folio_get(folio);
+ if (unlikely(!folio_trylock(folio))) {
+ spin_unlock(ptl);
+ folio_put(folio);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
+
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+
+ struct page_vma_mapped_walk pvmw = {
+ .ptl = ptl,
+ .address = start,
+ .pmd = pmdp,
+ .vma = walk->vma,
+ };
+
+ unsigned long pfn = page_to_pfn(folio_page(folio, 0));
+
+ migrate->src[migrate->npages] = migrate_pfn(pfn) | write
+ | MIGRATE_PFN_MIGRATE
+ | MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages++] = 0;
+ migrate->cpages++;
+ ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
+ if (ret) {
+ migrate->npages--;
+ migrate->cpages--;
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ goto fallback;
+ }
+ migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ spin_unlock(ptl);
+ return 0;
+ }
+
+fallback:
+ spin_unlock(ptl);
+ ret = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (ret)
+ return migrate_vma_collect_skip(start, end, walk);
+ if (pmd_none(pmdp_get_lockless(pmdp)))
+ return migrate_vma_collect_hole(start, end, -1, walk);
+
+ return -ENOENT;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+again:
+ if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) {
+ int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk);
+
+ if (ret == -EAGAIN)
+ goto again;
+ if (ret == 0)
+ return 0;
}
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
@@ -168,8 +281,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
- /* FIXME support THP */
- if (!page || !page->mapping || PageTransCompound(page)) {
+ if (!page || !page->mapping) {
mpfn = 0;
goto next;
}
@@ -339,14 +451,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
*/
int extra = 1 + (page == fault_page);
- /*
- * FIXME support THP (transparent huge page), it is bit more complex to
- * check them than regular pages, because they can be mapped with a pmd
- * or with a pte (split pte mapping).
- */
- if (folio_test_large(folio))
- return false;
-
/* Page from ZONE_DEVICE have one extra reference */
if (folio_is_zone_device(folio))
extra++;
@@ -375,17 +479,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
lru_add_drain();
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
+ unsigned int nr = 1;
if (!page) {
if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
unmapped++;
- continue;
+ goto next;
}
folio = page_folio(page);
+ nr = folio_nr_pages(folio);
+
+ if (nr > 1)
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+
+
/* ZONE_DEVICE folios are not on LRU */
if (!folio_is_zone_device(folio)) {
if (!folio_test_lru(folio) && allow_drain) {
@@ -397,7 +508,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
if (!folio_isolate_lru(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
/* Drop the reference we took in collect */
@@ -416,10 +527,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
unmapped++;
+next:
+ i += nr;
}
for (i = 0; i < npages && restore; i++) {
@@ -562,6 +675,146 @@ int migrate_vma_setup(struct migrate_vma *args)
}
EXPORT_SYMBOL(migrate_vma_setup);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/**
+ * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
+ * at @addr. folio is already allocated as a part of the migration process with
+ * large page.
+ *
+ * @folio needs to be initialized and setup after it's allocated. The code bits
+ * here follow closely the code in __do_huge_pmd_anonymous_page(). This API does
+ * not support THP zero pages.
+ *
+ * @migrate: migrate_vma arguments
+ * @addr: address where the folio will be inserted
+ * @folio: folio to be inserted at @addr
+ * @src: src pfn which is being migrated
+ * @pmdp: pointer to the pmd
+ */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ gfp_t gfp = vma_thp_gfp_mask(vma);
+ struct folio *folio = page_folio(page);
+ int ret;
+ spinlock_t *ptl;
+ pgtable_t pgtable;
+ pmd_t entry;
+ bool flush = false;
+ unsigned long i;
+
+ VM_WARN_ON_FOLIO(!folio, folio);
+ VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp));
+
+ if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
+ return -EINVAL;
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ return ret;
+
+ folio_set_order(folio, HPAGE_PMD_ORDER);
+ folio_set_large_rmappable(folio);
+
+ if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ ret = -ENOMEM;
+ goto abort;
+ }
+
+ __folio_mark_uptodate(folio);
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable))
+ goto abort;
+
+ if (folio_is_device_private(folio)) {
+ swp_entry_t swp_entry;
+
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
+ entry = swp_entry_to_pmd(swp_entry);
+ } else {
+ if (folio_is_zone_device(folio) &&
+ !folio_is_device_coherent(folio)) {
+ goto abort;
+ }
+ entry = mk_pmd(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pmd_mkwrite(pmd_mkdirty(entry), vma);
+ }
+
+ ptl = pmd_lock(vma->vm_mm, pmdp);
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto abort;
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
+
+ if (!pmd_none(*pmdp)) {
+ if (!is_huge_zero_pmd(*pmdp))
+ goto unlock_abort;
+ flush = true;
+ } else if (!pmd_none(*pmdp))
+ goto unlock_abort;
+
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
+ if (!folio_is_zone_device(folio))
+ folio_add_lru_vma(folio, vma);
+ folio_get(folio);
+
+ if (flush) {
+ pte_free(vma->vm_mm, pgtable);
+ flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE);
+ pmdp_invalidate(vma, addr, pmdp);
+ } else {
+ pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
+ set_pmd_at(vma->vm_mm, addr, pmdp, entry);
+ update_mmu_cache_pmd(vma, addr, pmdp);
+
+ spin_unlock(ptl);
+
+ count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+ return 0;
+
+unlock_abort:
+ spin_unlock(ptl);
+abort:
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ src[i] &= ~MIGRATE_PFN_MIGRATE;
+ return 0;
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ return 0;
+}
+#endif
+
/*
* This code closely matches the code in:
* __handle_mm_fault()
@@ -572,9 +825,10 @@ EXPORT_SYMBOL(migrate_vma_setup);
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
- struct page *page,
+ unsigned long *dst,
unsigned long *src)
{
+ struct page *page = migrate_pfn_to_page(*dst);
struct folio *folio = page_folio(page);
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -602,8 +856,25 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
- if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
- goto abort;
+
+ if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) {
+ int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page,
+ src, pmdp);
+ if (ret)
+ goto abort;
+ return;
+ }
+
+ if (!pmd_none(*pmdp)) {
+ if (pmd_trans_huge(*pmdp)) {
+ if (!is_huge_zero_pmd(*pmdp))
+ goto abort;
+ folio_get(pmd_folio(*pmdp));
+ split_huge_pmd(vma, pmdp, addr);
+ } else if (pmd_leaf(*pmdp))
+ goto abort;
+ }
+
if (pte_alloc(mm, pmdp))
goto abort;
if (unlikely(anon_vma_prepare(vma)))
@@ -694,23 +965,24 @@ static void __migrate_device_pages(unsigned long *src_pfns,
unsigned long i;
bool notified = false;
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
struct folio *newfolio, *folio;
int r, extra_cnt = 0;
+ unsigned long nr = 1;
if (!newpage) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
if (!page) {
unsigned long addr;
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
- continue;
+ goto next;
/*
* The only time there is no vma is when called from
@@ -728,15 +1000,47 @@ static void __migrate_device_pages(unsigned long *src_pfns,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
}
- migrate_vma_insert_page(migrate, addr, newpage,
+
+ if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
+ nr = HPAGE_PMD_NR;
+ src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ goto next;
+ }
+
+ migrate_vma_insert_page(migrate, addr, &dst_pfns[i],
&src_pfns[i]);
- continue;
+ goto next;
}
newfolio = page_folio(newpage);
folio = page_folio(page);
mapping = folio_mapping(folio);
+ /*
+ * If THP migration is enabled, check if both src and dst
+ * can migrate large pages
+ */
+ if (thp_migration_supported()) {
+ if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+
+ if (!migrate) {
+ src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND);
+ goto next;
+ }
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+ }
+
+
if (folio_is_device_private(newfolio) ||
folio_is_device_coherent(newfolio)) {
if (mapping) {
@@ -749,7 +1053,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (!folio_test_anon(folio) ||
!folio_free_swap(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
}
} else if (folio_is_zone_device(newfolio)) {
@@ -757,7 +1061,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
* Other types of ZONE_DEVICE page are not supported.
*/
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
BUG_ON(folio_test_writeback(folio));
@@ -769,6 +1073,8 @@ static void __migrate_device_pages(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
else
folio_migrate_flags(newfolio, folio);
+next:
+ i += nr;
}
if (notified)
@@ -899,24 +1205,40 @@ EXPORT_SYMBOL(migrate_vma_finalize);
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
unsigned long npages)
{
- unsigned long i, pfn;
+ unsigned long i, j, pfn;
- for (pfn = start, i = 0; i < npages; pfn++, i++) {
- struct folio *folio;
+ i = 0;
+ pfn = start;
+ while (i < npages) {
+ struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
+ unsigned int nr = 1;
folio = folio_get_nontail_page(pfn_to_page(pfn));
if (!folio) {
src_pfns[i] = 0;
- continue;
+ goto next;
}
if (!folio_trylock(folio)) {
src_pfns[i] = 0;
folio_put(folio);
- continue;
+ goto next;
}
src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ nr = folio_nr_pages(folio);
+ if (nr > 1) {
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ for (j = 1; j < nr; j++)
+ src_pfns[i+j] = 0;
+ i += j;
+ pfn += j;
+ continue;
+ }
+next:
+ i++;
+ pfn++;
}
migrate_device_unmap(src_pfns, npages, NULL);
migrate_device code paths go through the collect, setup and finalize phases of migration. Support for MIGRATE_PFN_COMPOUND was added earlier in the series to mark THP pages as MIGRATE_PFN_COMPOUND. The entries in src and dst arrays passed to these functions still remain at a PAGE_SIZE granularity. When a compound page is passed, the first entry has the PFN along with MIGRATE_PFN_COMPOUND and other flags set (MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID), the remaining entries (HPAGE_PMD_NR - 1) are filled with 0's. This representation allows for the compound page to be split into smaller page sizes. migrate_vma_collect_hole(), migrate_vma_collect_pmd() are now THP page aware. Two new helper functions migrate_vma_collect_huge_pmd() and migrate_vma_insert_huge_pmd_page() have been added. migrate_vma_collect_huge_pmd() can collect THP pages, but if for some reason this fails, there is fallback support to split the folio and migrate it. migrate_vma_insert_huge_pmd_page() closely follows the logic of migrate_vma_insert_page() Support for splitting pages as needed for migration will follow in later patches in this series. Signed-off-by: Balbir Singh <balbirs@nvidia.com> --- mm/migrate_device.c | 436 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 379 insertions(+), 57 deletions(-)