@@ -2247,10 +2247,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
} else if (thp_migration_supported()) {
swp_entry_t entry;
- VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
folio = pfn_swap_entry_folio(entry);
flush_needed = 0;
+
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd) &&
+ !folio_is_device_private(folio));
+
+ if (folio_is_device_private(folio)) {
+ folio_remove_rmap_pmd(folio, folio_page(folio, 0), vma);
+ WARN_ON_ONCE(folio_mapcount(folio) < 0);
+ }
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -2264,6 +2271,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
-HPAGE_PMD_NR);
}
+ /*
+ * Do a folio put on zone device private pages after
+ * changes to mm_counter, because the folio_put() will
+ * clean folio->mapping and the folio_test_anon() check
+ * will not be usable.
+ */
+ if (folio_is_device_private(folio))
+ folio_put(folio);
+
spin_unlock(ptl);
if (flush_needed)
tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
@@ -2392,7 +2408,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct folio *folio = pfn_swap_entry_folio(entry);
pmd_t newpmd;
- VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd) &&
+ !folio_is_device_private(folio));
if (is_writable_migration_entry(entry)) {
/*
* A protection check is difficult so
@@ -2405,9 +2422,11 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
- } else {
+ } else if (is_writable_device_private_entry(entry)) {
+ newpmd = swp_entry_to_pmd(entry);
+ entry = make_device_exclusive_entry(swp_offset(entry));
+ } else
newpmd = *pmd;
- }
if (uffd_wp)
newpmd = pmd_swp_mkuffd_wp(newpmd);
@@ -2860,11 +2879,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
- bool anon_exclusive = false, dirty = false;
+ bool young, write, soft_dirty, uffd_wp = false;
+ bool anon_exclusive = false, dirty = false, present = false;
unsigned long addr;
pte_t *pte;
int i;
+ swp_entry_t swp_entry;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
@@ -2918,20 +2938,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- pmd_migration = is_pmd_migration_entry(*pmd);
- if (unlikely(pmd_migration)) {
- swp_entry_t entry;
+ present = pmd_present(*pmd);
+ if (unlikely(!present)) {
+ swp_entry = pmd_to_swp_entry(*pmd);
old_pmd = *pmd;
- entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
- write = is_writable_migration_entry(entry);
+
+ folio = pfn_swap_entry_folio(swp_entry);
+ VM_BUG_ON(!is_migration_entry(swp_entry) &&
+ !is_device_private_entry(swp_entry));
+ page = pfn_swap_entry_to_page(swp_entry);
+ write = is_writable_migration_entry(swp_entry);
+
if (PageAnon(page))
- anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = is_migration_entry_young(entry);
- dirty = is_migration_entry_dirty(entry);
+ anon_exclusive =
+ is_readable_exclusive_migration_entry(swp_entry);
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
+ young = is_migration_entry_young(swp_entry);
+ dirty = is_migration_entry_dirty(swp_entry);
} else {
/*
* Up to this point the pmd is present and huge and userland has
@@ -3015,30 +3040,45 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* Note that NUMA hinting access restrictions are not transferred to
* avoid any possibility of altering permissions across VMAs.
*/
- if (freeze || pmd_migration) {
+ if (freeze || !present) {
for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
pte_t entry;
- swp_entry_t swp_entry;
-
- if (write)
- swp_entry = make_writable_migration_entry(
- page_to_pfn(page + i));
- else if (anon_exclusive)
- swp_entry = make_readable_exclusive_migration_entry(
- page_to_pfn(page + i));
- else
- swp_entry = make_readable_migration_entry(
- page_to_pfn(page + i));
- if (young)
- swp_entry = make_migration_entry_young(swp_entry);
- if (dirty)
- swp_entry = make_migration_entry_dirty(swp_entry);
- entry = swp_entry_to_pte(swp_entry);
- if (soft_dirty)
- entry = pte_swp_mksoft_dirty(entry);
- if (uffd_wp)
- entry = pte_swp_mkuffd_wp(entry);
-
+ if (freeze || is_migration_entry(swp_entry)) {
+ if (write)
+ swp_entry = make_writable_migration_entry(
+ page_to_pfn(page + i));
+ else if (anon_exclusive)
+ swp_entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_migration_entry(
+ page_to_pfn(page + i));
+ if (young)
+ swp_entry = make_migration_entry_young(swp_entry);
+ if (dirty)
+ swp_entry = make_migration_entry_dirty(swp_entry);
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ } else {
+ VM_BUG_ON(!is_device_private_entry(swp_entry));
+ if (write)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page + i));
+ else if (anon_exclusive)
+ swp_entry = make_device_exclusive_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page + i));
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ }
VM_WARN_ON(!pte_none(ptep_get(pte + i)));
set_pte_at(mm, addr, pte + i, entry);
}
@@ -3065,7 +3105,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
pte_unmap(pte);
- if (!pmd_migration)
+ if (present)
folio_remove_rmap_pmd(folio, page, vma);
if (freeze)
put_page(page);
@@ -3077,6 +3117,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze, struct folio *folio)
{
+ struct folio *pmd_folio;
VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
@@ -3089,7 +3130,14 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
*/
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
is_pmd_migration_entry(*pmd)) {
- if (folio && folio != pmd_folio(*pmd))
+ if (folio && !pmd_present(*pmd)) {
+ swp_entry_t swp_entry = pmd_to_swp_entry(*pmd);
+
+ pmd_folio = page_folio(pfn_swap_entry_to_page(swp_entry));
+ } else {
+ pmd_folio = pmd_folio(*pmd);
+ }
+ if (folio && folio != pmd_folio)
return;
__split_huge_pmd_locked(vma, pmd, address, freeze);
}
@@ -3581,11 +3629,16 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
folio_test_swapcache(origin_folio)) ?
folio_nr_pages(release) : 0));
+ if (folio_is_device_private(release))
+ percpu_ref_get_many(&release->pgmap->ref,
+ (1 << new_order) - 1);
+
if (release == origin_folio)
continue;
- lru_add_page_tail(origin_folio, &release->page,
- lruvec, list);
+ if (!folio_is_device_private(origin_folio))
+ lru_add_page_tail(origin_folio, &release->page,
+ lruvec, list);
/* Some pages can be beyond EOF: drop them from page cache */
if (release->index >= end) {
@@ -4625,7 +4678,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
return 0;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
- pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ if (!folio_is_device_private(folio))
+ pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ else
+ pmdval = pmdp_huge_clear_flush(vma, address, pvmw->pmd);
/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
@@ -4675,6 +4731,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
folio_get(folio);
pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
+
+ if (unlikely(folio_is_device_private(folio))) {
+ if (pmd_write(pmde))
+ entry = make_writable_device_private_entry(
+ page_to_pfn(new));
+ else
+ entry = make_readable_device_private_entry(
+ page_to_pfn(new));
+ pmde = swp_entry_to_pmd(entry);
+ }
+
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
@@ -200,6 +200,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
if (PageCompound(page))
return false;
+ if (folio_is_device_private(folio))
+ return false;
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
@@ -278,6 +278,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
* cannot return prematurely, while zap_huge_pmd() has
* cleared *pmd but not decremented compound_mapcount().
*/
+ swp_entry_t entry;
+
+ if (!thp_migration_supported())
+ return not_found(pvmw);
+ entry = pmd_to_swp_entry(pmde);
+ if (is_device_private_entry(entry)) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ return true;
+ }
+
if ((pvmw->flags & PVMW_SYNC) &&
thp_vma_suitable_order(vma, pvmw->address,
PMD_ORDER) &&
@@ -2326,8 +2326,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
- subpage = folio_page(folio,
- pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
+ /*
+ * Zone device private folios do not work well with
+ * pmd_pfn() on some architectures due to pte
+ * inversion.
+ */
+ if (folio_is_device_private(folio)) {
+ swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd);
+ unsigned long pfn = swp_offset_pfn(entry);
+
+ subpage = folio_page(folio, pfn
+ - folio_pfn(folio));
+ } else {
+ subpage = folio_page(folio,
+ pmd_pfn(*pvmw.pmd)
+ - folio_pfn(folio));
+ }
+
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
!folio_test_pmd_mappable(folio), folio);
Make THP handling code in the mm subsystem for THP pages aware of zone device pages. Although the code is designed to be generic when it comes to handling splitting of pages, the code is designed to work for THP page sizes corresponding to HPAGE_PMD_NR. Modify page_vma_mapped_walk() to return true when a zone device huge entry is present, enabling try_to_migrate() and other code migration paths to appropriately process the entry pmd_pfn() does not work well with zone device entries, use pfn_pmd_entry_to_swap() for checking and comparison as for zone device entries. try_to_map_to_unused_zeropage() does not apply to zone device entries, zone device entries are ignored in the call. Signed-off-by: Balbir Singh <balbirs@nvidia.com> --- mm/huge_memory.c | 151 +++++++++++++++++++++++++++++++------------ mm/migrate.c | 2 + mm/page_vma_mapped.c | 10 +++ mm/rmap.c | 19 +++++- 4 files changed, 138 insertions(+), 44 deletions(-)