
[RFC,03/11] mm/thp: zone_device awareness in THP handling code

Message ID: 20250306044239.3874247-4-balbirs@nvidia.com
Series: THP support for zone device pages

Commit Message

Balbir Singh March 6, 2025, 4:42 a.m. UTC
Make the THP handling code in the mm subsystem aware of zone
device pages. Although the splitting code is written to be
generic, it currently only supports THP sizes corresponding to
HPAGE_PMD_NR pages.
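As a minimal illustration of that size assumption (this helper is
hypothetical and not part of the patch; folio_is_device_private(),
folio_nr_pages() and HPAGE_PMD_NR are existing kernel symbols):

static bool device_private_thp_supported(struct folio *folio)
{
	/* Non device private folios are not restricted here. */
	if (!folio_is_device_private(folio))
		return true;

	/* The paths touched by this series expect PMD-sized folios. */
	return folio_nr_pages(folio) == HPAGE_PMD_NR;
}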

Modify page_vma_mapped_walk() to return true when a zone device
huge entry is present, so that try_to_migrate() and other
migration paths can process the entry appropriately.
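For illustration, a caller-side sketch (hypothetical helper, mirroring
the try_to_migrate_one() hunk in mm/rmap.c below): when the walk
returns with pvmw.pte == NULL, a non-present PMD-level entry can be
decoded with pmd_to_swp_entry() and tested with
is_device_private_entry().

static bool pvmw_is_device_private_pmd(struct page_vma_mapped_walk *pvmw)
{
	swp_entry_t entry;

	/* A NULL pte with a valid pmd means a PMD-mapped entry. */
	if (pvmw->pte || !pvmw->pmd)
		return false;

	/* A present huge PMD is not a swap-style entry. */
	if (pmd_present(*pvmw->pmd))
		return false;

	entry = pmd_to_swp_entry(*pvmw->pmd);
	return is_device_private_entry(entry);
}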

pmd_pfn() does not work well with zone device entries; use
pfn_pmd_entry_to_swap() instead for checking and comparing zone
device entries.
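A hedged sketch of the pfn lookup this implies (hypothetical helper;
the mm/rmap.c hunk below open-codes the same steps with
pmd_to_swp_entry() and swp_offset_pfn()):

static unsigned long device_private_pmd_pfn(pmd_t pmd)
{
	swp_entry_t entry = pmd_to_swp_entry(pmd);

	/*
	 * pmd_pfn() is only valid for present PMDs, so recover the
	 * pfn from the swap entry instead.
	 */
	return swp_offset_pfn(entry);
}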

try_to_map_unused_to_zeropage() does not apply to zone device
entries; such entries are ignored in that path.
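The guard itself is the two-line check added to
try_to_map_unused_to_zeropage() in the mm/migrate.c hunk below,
restated here for readability:

	if (folio_is_device_private(folio))
		return false;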

Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
 mm/huge_memory.c     | 151 +++++++++++++++++++++++++++++++------------
 mm/migrate.c         |   2 +
 mm/page_vma_mapped.c |  10 +++
 mm/rmap.c            |  19 +++++-
 4 files changed, 138 insertions(+), 44 deletions(-)

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 826bfe907017..d8e018d1bdbd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2247,10 +2247,17 @@  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		} else if (thp_migration_supported()) {
 			swp_entry_t entry;
 
-			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
 			entry = pmd_to_swp_entry(orig_pmd);
 			folio = pfn_swap_entry_folio(entry);
 			flush_needed = 0;
+
+			VM_BUG_ON(!is_pmd_migration_entry(*pmd) &&
+					!folio_is_device_private(folio));
+
+			if (folio_is_device_private(folio)) {
+				folio_remove_rmap_pmd(folio, folio_page(folio, 0), vma);
+				WARN_ON_ONCE(folio_mapcount(folio) < 0);
+			}
 		} else
 			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
 
@@ -2264,6 +2271,15 @@  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 				       -HPAGE_PMD_NR);
 		}
 
+		/*
+		 * Do a folio put on zone device private pages after
+		 * changes to mm_counter, because the folio_put() will
+		 * clean folio->mapping and the folio_test_anon() check
+		 * will not be usable.
+		 */
+		if (folio_is_device_private(folio))
+			folio_put(folio);
+
 		spin_unlock(ptl);
 		if (flush_needed)
 			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
@@ -2392,7 +2408,8 @@  int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct folio *folio = pfn_swap_entry_folio(entry);
 		pmd_t newpmd;
 
-		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+		VM_BUG_ON(!is_pmd_migration_entry(*pmd) &&
+			  !folio_is_device_private(folio));
 		if (is_writable_migration_entry(entry)) {
 			/*
 			 * A protection check is difficult so
@@ -2405,9 +2422,11 @@  int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			newpmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*pmd))
 				newpmd = pmd_swp_mksoft_dirty(newpmd);
-		} else {
+		} else if (is_writable_device_private_entry(entry)) {
+			newpmd = swp_entry_to_pmd(entry);
+			entry = make_device_exclusive_entry(swp_offset(entry));
+		} else
 			newpmd = *pmd;
-		}
 
 		if (uffd_wp)
 			newpmd = pmd_swp_mkuffd_wp(newpmd);
@@ -2860,11 +2879,12 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	struct page *page;
 	pgtable_t pgtable;
 	pmd_t old_pmd, _pmd;
-	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
-	bool anon_exclusive = false, dirty = false;
+	bool young, write, soft_dirty, uffd_wp = false;
+	bool anon_exclusive = false, dirty = false, present = false;
 	unsigned long addr;
 	pte_t *pte;
 	int i;
+	swp_entry_t swp_entry;
 
 	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
@@ -2918,20 +2938,25 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
-	pmd_migration = is_pmd_migration_entry(*pmd);
-	if (unlikely(pmd_migration)) {
-		swp_entry_t entry;
 
+	present = pmd_present(*pmd);
+	if (unlikely(!present)) {
+		swp_entry = pmd_to_swp_entry(*pmd);
 		old_pmd = *pmd;
-		entry = pmd_to_swp_entry(old_pmd);
-		page = pfn_swap_entry_to_page(entry);
-		write = is_writable_migration_entry(entry);
+
+		folio = pfn_swap_entry_folio(swp_entry);
+		VM_BUG_ON(!is_migration_entry(swp_entry) &&
+				!is_device_private_entry(swp_entry));
+		page = pfn_swap_entry_to_page(swp_entry);
+		write = is_writable_migration_entry(swp_entry);
+
 		if (PageAnon(page))
-			anon_exclusive = is_readable_exclusive_migration_entry(entry);
-		young = is_migration_entry_young(entry);
-		dirty = is_migration_entry_dirty(entry);
+			anon_exclusive =
+				is_readable_exclusive_migration_entry(swp_entry);
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
 		uffd_wp = pmd_swp_uffd_wp(old_pmd);
+		young = is_migration_entry_young(swp_entry);
+		dirty = is_migration_entry_dirty(swp_entry);
 	} else {
 		/*
 		 * Up to this point the pmd is present and huge and userland has
@@ -3015,30 +3040,45 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	 * Note that NUMA hinting access restrictions are not transferred to
 	 * avoid any possibility of altering permissions across VMAs.
 	 */
-	if (freeze || pmd_migration) {
+	if (freeze || !present) {
 		for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
 			pte_t entry;
-			swp_entry_t swp_entry;
-
-			if (write)
-				swp_entry = make_writable_migration_entry(
-							page_to_pfn(page + i));
-			else if (anon_exclusive)
-				swp_entry = make_readable_exclusive_migration_entry(
-							page_to_pfn(page + i));
-			else
-				swp_entry = make_readable_migration_entry(
-							page_to_pfn(page + i));
-			if (young)
-				swp_entry = make_migration_entry_young(swp_entry);
-			if (dirty)
-				swp_entry = make_migration_entry_dirty(swp_entry);
-			entry = swp_entry_to_pte(swp_entry);
-			if (soft_dirty)
-				entry = pte_swp_mksoft_dirty(entry);
-			if (uffd_wp)
-				entry = pte_swp_mkuffd_wp(entry);
-
+			if (freeze || is_migration_entry(swp_entry)) {
+				if (write)
+					swp_entry = make_writable_migration_entry(
+								page_to_pfn(page + i));
+				else if (anon_exclusive)
+					swp_entry = make_readable_exclusive_migration_entry(
+								page_to_pfn(page + i));
+				else
+					swp_entry = make_readable_migration_entry(
+								page_to_pfn(page + i));
+				if (young)
+					swp_entry = make_migration_entry_young(swp_entry);
+				if (dirty)
+					swp_entry = make_migration_entry_dirty(swp_entry);
+				entry = swp_entry_to_pte(swp_entry);
+				if (soft_dirty)
+					entry = pte_swp_mksoft_dirty(entry);
+				if (uffd_wp)
+					entry = pte_swp_mkuffd_wp(entry);
+			} else {
+				VM_BUG_ON(!is_device_private_entry(swp_entry));
+				if (write)
+					swp_entry = make_writable_device_private_entry(
+								page_to_pfn(page + i));
+				else if (anon_exclusive)
+					swp_entry = make_device_exclusive_entry(
+								page_to_pfn(page + i));
+				else
+					swp_entry = make_readable_device_private_entry(
+								page_to_pfn(page + i));
+				entry = swp_entry_to_pte(swp_entry);
+				if (soft_dirty)
+					entry = pte_swp_mksoft_dirty(entry);
+				if (uffd_wp)
+					entry = pte_swp_mkuffd_wp(entry);
+			}
 			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
 			set_pte_at(mm, addr, pte + i, entry);
 		}
@@ -3065,7 +3105,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 	pte_unmap(pte);
 
-	if (!pmd_migration)
+	if (present)
 		folio_remove_rmap_pmd(folio, page, vma);
 	if (freeze)
 		put_page(page);
@@ -3077,6 +3117,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
 			   pmd_t *pmd, bool freeze, struct folio *folio)
 {
+	struct folio *pmd_folio;
 	VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
 	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
 	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
@@ -3089,7 +3130,14 @@  void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
 	 */
 	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
 	    is_pmd_migration_entry(*pmd)) {
-		if (folio && folio != pmd_folio(*pmd))
+		if (folio && !pmd_present(*pmd)) {
+			swp_entry_t swp_entry = pmd_to_swp_entry(*pmd);
+
+			pmd_folio = page_folio(pfn_swap_entry_to_page(swp_entry));
+		} else {
+			pmd_folio = pmd_folio(*pmd);
+		}
+		if (folio && folio != pmd_folio)
 			return;
 		__split_huge_pmd_locked(vma, pmd, address, freeze);
 	}
@@ -3581,11 +3629,16 @@  static int __split_unmapped_folio(struct folio *folio, int new_order,
 				     folio_test_swapcache(origin_folio)) ?
 					     folio_nr_pages(release) : 0));
 
+			if (folio_is_device_private(release))
+				percpu_ref_get_many(&release->pgmap->ref,
+							(1 << new_order) - 1);
+
 			if (release == origin_folio)
 				continue;
 
-			lru_add_page_tail(origin_folio, &release->page,
-						lruvec, list);
+			if (!folio_is_device_private(origin_folio))
+				lru_add_page_tail(origin_folio, &release->page,
+							lruvec, list);
 
 			/* Some pages can be beyond EOF: drop them from page cache */
 			if (release->index >= end) {
@@ -4625,7 +4678,10 @@  int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
 		return 0;
 
 	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
-	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+	if (!folio_is_device_private(folio))
+		pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+	else
+		pmdval = pmdp_huge_clear_flush(vma, address, pvmw->pmd);
 
 	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
 	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
@@ -4675,6 +4731,17 @@  void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 	entry = pmd_to_swp_entry(*pvmw->pmd);
 	folio_get(folio);
 	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
+
+	if (unlikely(folio_is_device_private(folio))) {
+		if (pmd_write(pmde))
+			entry = make_writable_device_private_entry(
+							page_to_pfn(new));
+		else
+			entry = make_readable_device_private_entry(
+							page_to_pfn(new));
+		pmde = swp_entry_to_pmd(entry);
+	}
+
 	if (pmd_swp_soft_dirty(*pvmw->pmd))
 		pmde = pmd_mksoft_dirty(pmde);
 	if (is_writable_migration_entry(entry))
diff --git a/mm/migrate.c b/mm/migrate.c
index 59e39aaa74e7..0aa1bdb711c3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -200,6 +200,8 @@  static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
 
 	if (PageCompound(page))
 		return false;
+	if (folio_is_device_private(folio))
+		return false;
 	VM_BUG_ON_PAGE(!PageAnon(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index e463c3be934a..5dd2e51477d3 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -278,6 +278,16 @@  bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 			 * cannot return prematurely, while zap_huge_pmd() has
 			 * cleared *pmd but not decremented compound_mapcount().
 			 */
+			swp_entry_t entry;
+
+			if (!thp_migration_supported())
+				return not_found(pvmw);
+			entry = pmd_to_swp_entry(pmde);
+			if (is_device_private_entry(entry)) {
+				pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+				return true;
+			}
+
 			if ((pvmw->flags & PVMW_SYNC) &&
 			    thp_vma_suitable_order(vma, pvmw->address,
 						   PMD_ORDER) &&
diff --git a/mm/rmap.c b/mm/rmap.c
index 67bb273dfb80..67e99dc5f2ef 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2326,8 +2326,23 @@  static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 		/* PMD-mapped THP migration entry */
 		if (!pvmw.pte) {
-			subpage = folio_page(folio,
-				pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
+			/*
+			 * Zone device private folios do not work well with
+			 * pmd_pfn() on some architectures due to pte
+			 * inversion.
+			 */
+			if (folio_is_device_private(folio)) {
+				swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd);
+				unsigned long pfn = swp_offset_pfn(entry);
+
+				subpage = folio_page(folio, pfn
+							- folio_pfn(folio));
+			} else {
+				subpage = folio_page(folio,
+							pmd_pfn(*pvmw.pmd)
+							- folio_pfn(folio));
+			}
+
 			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
 					!folio_test_pmd_mappable(folio), folio);