@@ -1237,6 +1237,196 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
return result;
}
+static struct folio *find_lock_pte_mapped_folio(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd)
+{
+ pte_t *pte, pteval;
+ struct folio *folio = NULL;
+
+ pte = pte_offset_map(pmd, addr);
+ if (!pte)
+ return NULL;
+
+ pteval = ptep_get_lockless(pte);
+ if (pte_none(pteval) || !pte_present(pteval))
+ goto out;
+
+ folio = vm_normal_folio(vma, addr, pteval);
+ if (unlikely(!folio) || unlikely(folio_is_zone_device(folio)))
+ goto out;
+
+ if (!folio_try_get(folio)) {
+ folio = NULL;
+ goto out;
+ }
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ folio = NULL;
+ goto out;
+ }
+
+out:
+ pte_unmap(pte);
+ return folio;
+}
+
+static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, bool *mmap_locked,
+ struct collapse_control *cc)
+{
+ struct mmu_notifier_range range;
+ struct folio *folio;
+ pte_t *start_pte, *pte;
+ pmd_t *pmd, pmdval;
+ spinlock_t *pml, *ptl;
+ pgtable_t pgtable;
+ unsigned long addr;
+ int exclusive = 0;
+ bool writable = false;
+ int result, i;
+
+ /* Fast check before locking the folio: bail out if already PMD-mapped or no suitable PMD */
+ result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+ if (result != SCAN_SUCCEED)
+ return result;
+
+ folio = find_lock_pte_mapped_folio(vma, haddr, pmd);
+ if (!folio)
+ return SCAN_PAGE_NULL;
+ if (!folio_test_large(folio)) {
+ result = SCAN_FAIL;
+ goto drop_folio;
+ }
+ if (folio_order(folio) != HPAGE_PMD_ORDER) {
+ result = SCAN_PAGE_COMPOUND;
+ goto drop_folio;
+ }
+
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+
+ /* Prevent all access to pagetables */
+ mmap_write_lock(mm);
+ vma_start_write(vma);
+
+ result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
+ if (result != SCAN_SUCCEED)
+ goto up_write;
+
+ result = check_pmd_still_valid(mm, haddr, pmd);
+ if (result != SCAN_SUCCEED)
+ goto up_write;
+
+ /* Recheck with mmap write lock */
+ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+ result = start_pte ? SCAN_SUCCEED : SCAN_PMD_NULL;
+ if (!start_pte)
+ goto up_write;
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *subpage;
+ pte_t pteval = ptep_get(pte);
+
+ if (pte_none(pteval) || !pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
+ break;
+ }
+
+ if (pte_uffd_wp(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ break;
+ }
+
+ if (pte_write(pteval))
+ writable = true;
+
+ subpage = vm_normal_page(vma, addr, pteval);
+
+ if (unlikely(!subpage) ||
+ unlikely(is_zone_device_page(subpage))) {
+ result = SCAN_PAGE_NULL;
+ break;
+ }
+
+ if (folio_page(folio, i) != subpage) {
+ result = SCAN_FAIL;
+ break;
+ }
+
+ if (PageAnonExclusive(subpage))
+ exclusive++;
+ }
+ pte_unmap_unlock(start_pte, ptl);
+ if (result != SCAN_SUCCEED)
+ goto up_write;
+
+ /*
+ * Case 1:
+ * No subpage is PageAnonExclusive (all PTEs must be R/O); we can
+ * collapse into a R/O PMD without further action.
+ */
+ if (!(exclusive == 0 && !writable))
+ goto up_write;
+
+ /* Collapse pmd entry */
+ anon_vma_lock_write(vma->anon_vma);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ pml = pmd_lock(mm, pmd); /* probably unnecessary */
+ pmdval = pmdp_collapse_flush(vma, haddr, pmd);
+ spin_unlock(pml);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_remove_table_sync_one();
+
+ anon_vma_unlock_write(vma->anon_vma);
+
+ /*
+ * Obtain a new pmd rmap before dropping pte rmaps to avoid
+ * false-negative page_mapped().
+ */
+ folio_get(folio);
+ page_add_anon_rmap(&folio->page, vma, haddr, RMAP_COMPOUND);
+
+ start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *subpage;
+ pte_t pteval = ptep_get(pte);
+
+ ptep_clear(mm, addr, pte);
+ subpage = vm_normal_page(vma, addr, pteval);
+ page_remove_rmap(subpage, vma, false);
+ }
+ pte_unmap_unlock(start_pte, ptl);
+ folio_ref_sub(folio, HPAGE_PMD_NR);
+
+ /* Install pmd entry */
+ pgtable = pmd_pgtable(pmdval);
+ pmdval = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ spin_lock(pml);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, haddr, pmd, pmdval);
+ update_mmu_cache_pmd(vma, haddr, pmd);
+ spin_unlock(pml);
+
+ result = SCAN_SUCCEED;
+
+up_write:
+ mmap_write_unlock(mm);
+
+drop_folio:
+ folio_unlock(folio);
+ folio_put(folio);
+
+ /* TODO: tracepoints */
+ return result;
+}
+
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, bool *mmap_locked,
@@ -1251,6 +1441,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
bool writable = false;
+ int exclusive = 0;
+ bool is_hpage = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -1333,8 +1525,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
}
}
+ if (PageAnonExclusive(page))
+ exclusive++;
+
page = compound_head(page);
+ if (compound_order(page) == HPAGE_PMD_ORDER)
+ is_hpage = true;
+
/*
* Record which node the original page is from and save this
* information to cc->node_load[].
@@ -1396,7 +1594,21 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
}
out_unmap:
pte_unmap_unlock(pte, ptl);
+
+ if (is_hpage && (exclusive == 0 && !writable)) {
+ int res;
+
+ res = collapse_pte_mapped_anon_thp(mm, vma, address,
+ mmap_locked, cc);
+ if (res == SCAN_PMD_MAPPED || res == SCAN_SUCCEED) {
+ result = res;
+ goto out;
+ }
+ }
+
if (result == SCAN_SUCCEED) {
+ if (!*mmap_locked)
+ mmap_read_lock(mm);
result = collapse_huge_page(mm, address, referenced,
unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
In the anonymous collapse path, khugepaged always collapses a pte-mapped
hugepage by allocating a new hugepage and copying into it.  In some
scenarios it is enough to update the page tables that map an anonymous
pte-mapped THP, in the same way as for file/shmem-backed pte-mapped THPs,
as done in commit 58ac9a8993a1 ("mm/khugepaged: attempt to map
file/shmem-backed pte-mapped THPs by pmds").

The simplest scenario that satisfies the conditions, as David points out,
is when no subpage is PageAnonExclusive (the PTEs must be R/O); in that
case we can collapse into a R/O PMD without further action.

Let's start from this simplest scenario.

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 mm/khugepaged.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)
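
Not part of the patch: below is a rough userspace sketch of how I'd expect the
new path to be exercised, for reviewers who want to poke at it.  It assumes
MADV_COLLAPSE is available (kernel >= 6.1), THP is enabled for the mapping,
and the program runs as root (unprivileged pagemap reads return PFN 0); the
helper pagemap_pfn() is mine.  The idea: fault in a 2M anon THP, split its PMD
mapping with a partial mprotect() so the folio stays pte-mapped, fork() so
every subpage loses PageAnonExclusive and the PTEs become R/O, then call
MADV_COLLAPSE in the parent.  Before this patch I'd expect the collapse to
fail with SCAN_PAGE_RO (-EINVAL); with it, the collapse should succeed and the
backing PFN should stay the same, i.e. the existing THP is remapped in place
rather than copied.

/* Hypothetical test, not part of this patch.  Build: gcc -O2 -o collapse collapse.c */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25
#endif

#define PMD_SIZE (2UL << 20)

/* PFN backing @addr, read from /proc/self/pagemap (0 if unreadable). */
static uint64_t pagemap_pfn(void *addr)
{
	uint64_t ent = 0;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return 0;
	if (pread(fd, &ent, sizeof(ent),
		  (uint64_t)((uintptr_t)addr / getpagesize()) * sizeof(ent)) != sizeof(ent))
		ent = 0;
	close(fd);
	return ent & ((1ULL << 55) - 1);	/* bits 0-54 are the PFN */
}

int main(void)
{
	char *buf, *area;
	uint64_t pfn_before, pfn_after;
	pid_t pid;
	int ret;

	/* Over-allocate so a PMD-aligned 2M window can be carved out. */
	buf = mmap(NULL, 3 * PMD_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	area = (char *)(((uintptr_t)buf + PMD_SIZE - 1) & ~(PMD_SIZE - 1));

	/* Fault in a THP (assumes THP is enabled at least in madvise mode). */
	madvise(area, PMD_SIZE, MADV_HUGEPAGE);
	memset(area, 1, PMD_SIZE);

	/*
	 * Split the PMD mapping but not the folio: protect one page and
	 * restore it, leaving a pte-mapped THP behind.
	 */
	mprotect(area, getpagesize(), PROT_READ);
	mprotect(area, getpagesize(), PROT_READ | PROT_WRITE);

	/*
	 * fork() write-protects the PTEs and clears PageAnonExclusive on
	 * every subpage: the "Case 1" handled by this patch.
	 */
	pid = fork();
	if (pid == 0) {
		pause();		/* keep the subpages shared */
		_exit(0);
	}

	pfn_before = pagemap_pfn(area);
	ret = madvise(area, PMD_SIZE, MADV_COLLAPSE);
	pfn_after = pagemap_pfn(area);

	printf("MADV_COLLAPSE: %d, pfn %#llx -> %#llx (%s)\n", ret,
	       (unsigned long long)pfn_before, (unsigned long long)pfn_after,
	       !ret && pfn_before && pfn_before == pfn_after ?
	       "remapped in place" : "not in place");

	kill(pid, SIGKILL);
	return 0;
}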