diff mbox series

[v3,1/2] mm/khugepaged: map RO non-exclusive pte-mapped anon THPs by pmds

Message ID 1fecc331345653b8a3ab1dc2cfb24b5f946f5569.1702882426.git.xuyu@linux.alibaba.com (mailing list archive)
State New
Headers show
Series attempt to map anonymous pte-mapped THPs by pmds | expand

Commit Message

Xu Yu Dec. 18, 2023, 7:06 a.m. UTC
In the anonymous collapse path, khugepaged always collapses a pte-mapped
hugepage by allocating a new hugepage and copying into it.

In some scenarios, however, we can collapse an anonymous pte-mapped THP
by merely updating the page tables that map it, in the same way as
file/shmem-backed pte-mapped THPs, as shown in commit 58ac9a8993a1
("mm/khugepaged: attempt to map file/shmem-backed pte-mapped THPs by
pmds").

The simplest scenario that satisfies the conditions, as David points out,
is when no subpage is PageAnonExclusive (so all PTEs must be R/O); in
that case we can collapse into a R/O PMD without further action.

Let's start from this simplest scenario.

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 mm/khugepaged.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)
diff mbox series

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..57e261387124 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1237,6 +1237,196 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	return result;
 }
 
+static struct folio *find_lock_pte_mapped_folio(struct vm_area_struct *vma,
+						unsigned long addr, pmd_t *pmd)
+{
+	pte_t *pte, pteval;
+	struct folio *folio = NULL;
+
+	pte = pte_offset_map(pmd, addr);
+	if (!pte)
+		return NULL;
+
+	pteval = ptep_get_lockless(pte);
+	if (!pte_present(pteval))
+		goto out;
+
+	folio = vm_normal_folio(vma, addr, pteval);
+	if (unlikely(!folio) || unlikely(folio_is_zone_device(folio)))
+		goto out;
+
+	/* Take a reference first: locking an unreferenced folio is unsafe */
+	if (!folio_try_get(folio)) {
+		folio = NULL;
+		goto out;
+	}
+
+	if (!folio_trylock(folio)) {
+		folio_put(folio);
+		folio = NULL;
+		goto out;
+	}
+out:
+	pte_unmap(pte);
+	return folio;
+}
+
+static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				unsigned long haddr, bool *mmap_locked,
+				struct collapse_control *cc)
+{
+	struct mmu_notifier_range range;
+	struct folio *folio;
+	pte_t *start_pte, *pte;
+	pmd_t *pmd, pmdval;
+	spinlock_t *pml, *ptl;
+	pgtable_t pgtable;
+	unsigned long addr;
+	int exclusive = 0;	/* # of PageAnonExclusive subpages seen */
+	bool writable = false;	/* any writable pte forbids the R/O collapse */
+	int result, i;
+
+	/* Bail out unless the pmd maps a pte table we can collapse */
+	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+	if (result != SCAN_SUCCEED)
+		return result;
+
+	folio = find_lock_pte_mapped_folio(vma, haddr, pmd);
+	if (!folio)
+		return SCAN_PAGE_NULL;
+	if (!folio_test_large(folio)) {
+		result = SCAN_FAIL;
+		goto drop_folio;
+	}
+	if (folio_order(folio) != HPAGE_PMD_ORDER) {
+		result = SCAN_PAGE_COMPOUND;
+		goto drop_folio;
+	}
+
+	mmap_read_unlock(mm);
+	*mmap_locked = false;
+
+	/* Prevent all access to pagetables */
+	mmap_write_lock(mm);
+	vma_start_write(vma);
+
+	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	result = check_pmd_still_valid(mm, haddr, pmd);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+	/* Re-verify every pte under the pte lock, with mmap write lock held */
+	result = SCAN_PMD_NULL;
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+	if (!start_pte)
+		goto up_write;
+	result = SCAN_SUCCEED;
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *subpage;
+		pte_t pteval = ptep_get(pte);
+
+		if (pte_none(pteval) || !pte_present(pteval)) {
+			result = SCAN_PTE_NON_PRESENT;
+			break;
+		}
+
+		if (pte_uffd_wp(pteval)) {
+			result = SCAN_PTE_UFFD_WP;
+			break;
+		}
+
+		if (pte_write(pteval))
+			writable = true;
+
+		subpage = vm_normal_page(vma, addr, pteval);
+
+		if (unlikely(!subpage) ||
+		    unlikely(is_zone_device_page(subpage))) {
+			result = SCAN_PAGE_NULL;
+			break;
+		}
+		/* Each pte must map the i-th subpage of the locked folio */
+		if (folio_page(folio, i) != subpage) {
+			result = SCAN_FAIL;
+			break;
+		}
+
+		if (PageAnonExclusive(subpage))
+			exclusive++;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	/*
+	 * Case 1: no subpage is PageAnonExclusive (all PTEs R/O), so we
+	 * can collapse into a R/O PMD without further action.
+	 */
+	if (exclusive || writable) {
+		result = SCAN_FAIL;
+		goto up_write;
+	}
+	/* Collapse pmd entry */
+	anon_vma_lock_write(vma->anon_vma);
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	pml = pmd_lock(mm, pmd); /* probably unnecessary */
+	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
+	spin_unlock(pml);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
+
+	anon_vma_unlock_write(vma->anon_vma);
+
+	/*
+	 * Obtain a new pmd rmap before dropping pte rmaps to avoid
+	 * false-negative page_mapped().
+	 */
+	folio_get(folio);
+	page_add_anon_rmap(&folio->page, vma, haddr, RMAP_COMPOUND);
+
+	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *subpage;
+		pte_t pteval = ptep_get(pte);
+
+		ptep_clear(mm, addr, pte);
+		subpage = vm_normal_page(vma, addr, pteval);
+		page_remove_rmap(subpage, vma, false);
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	folio_ref_sub(folio, HPAGE_PMD_NR);
+
+	/* Install pmd entry */
+	pgtable = pmd_pgtable(pmdval);
+	pmdval = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+	spin_lock(pml);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	set_pmd_at(mm, haddr, pmd, pmdval);
+	update_mmu_cache_pmd(vma, haddr, pmd);
+	spin_unlock(pml);
+
+	result = SCAN_SUCCEED;
+
+up_write:
+	mmap_write_unlock(mm);
+
+drop_folio:
+	folio_unlock(folio);
+	folio_put(folio);
+
+	/* TODO: tracepoints */
+	return result;
+}
+
 static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				   struct vm_area_struct *vma,
 				   unsigned long address, bool *mmap_locked,
@@ -1251,6 +1441,8 @@  static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
 	bool writable = false;
+	int exclusive = 0;
+	bool is_hpage = false;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1333,8 +1525,14 @@  static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 			}
 		}
 
+		if (PageAnonExclusive(page))
+			exclusive++;
+
 		page = compound_head(page);
 
+		if (compound_order(page) == HPAGE_PMD_ORDER)
+			is_hpage = true;
+
 		/*
 		 * Record which node the original page is from and save this
 		 * information to cc->node_load[].
@@ -1396,7 +1594,21 @@  static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+
+	if (is_hpage && (exclusive == 0 && !writable)) {
+		int res;
+
+		res = collapse_pte_mapped_anon_thp(mm, vma, address,
+						   mmap_locked, cc);
+		if (res == SCAN_PMD_MAPPED || res == SCAN_SUCCEED) {
+			result = res;
+			goto out;
+		}
+	}
+
 	if (result == SCAN_SUCCEED) {
+		if (!*mmap_locked)
+			mmap_read_lock(mm);
 		result = collapse_huge_page(mm, address, referenced,
 					    unmapped, cc);
 		/* collapse_huge_page will return with the mmap_lock released */