@@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
 #endif
 }
+/* We support batch unmapping of PTEs for lazyfree large folios */
+static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
+			struct folio *folio, pte_t *ptep)
+{
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	int max_nr = folio_nr_pages(folio);
+	pte_t pte = ptep_get(ptep);
+
+	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+		return false;
+	if (pte_unused(pte))
+		return false;
+	if (pte_pfn(pte) != folio_pfn(folio))
+		return false;
+
+	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+			       NULL, NULL) == max_nr;
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
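For reference, the lazyfree folios this helper accepts are clean anonymous large folios whose owner has called madvise(MADV_FREE): they are anon, no longer swapbacked, and every PTE still points into the folio, which is exactly what the three checks plus folio_pte_batch() confirm. A minimal userspace sketch of how such a folio comes to exist (illustrative only, not part of the patch; it assumes THP is enabled so the region ends up backed by a PMD-sized folio):

#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;	/* one PMD-sized region */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	madvise(buf, len, MADV_HUGEPAGE);	/* hint: back this with a large folio */
	memset(buf, 0x5a, len);			/* fault it in and dirty it */
	madvise(buf, len, MADV_FREE);		/* now anon, clean, lazyfree */
	pause();				/* keep it mapped until reclaim runs */
	return 0;
}

Reclaim can then discard the whole region without any swap I/O; the batching below tears down all of its PTEs in a single rmap-walk step instead of one PTE per iteration.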
@@ -1794,6 +1813,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 	struct page *subpage;
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
+	unsigned long nr_pages = 1, end_addr;
 	unsigned long pfn;
 	unsigned long hsz = 0;
@@ -1933,23 +1953,26 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			if (pte_dirty(pteval))
 				folio_mark_dirty(folio);
 		} else if (likely(pte_present(pteval))) {
-			flush_cache_page(vma, address, pfn);
-			/* Nuke the page table entry. */
-			if (should_defer_flush(mm, flags)) {
-				/*
-				 * We clear the PTE but do not flush so potentially
-				 * a remote CPU could still be writing to the folio.
-				 * If the entry was previously clean then the
-				 * architecture must guarantee that a clear->dirty
-				 * transition on a cached TLB entry is written through
-				 * and traps if the PTE is unmapped.
-				 */
-				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
+			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
+				nr_pages = folio_nr_pages(folio);
+			end_addr = address + nr_pages * PAGE_SIZE;
+			flush_cache_range(vma, address, end_addr);
-				set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
-			} else {
-				pteval = ptep_clear_flush(vma, address, pvmw.pte);
-			}
+			/* Nuke the page table entry. */
+			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+			/*
+			 * We clear the PTE but do not flush so potentially
+			 * a remote CPU could still be writing to the folio.
+			 * If the entry was previously clean then the
+			 * architecture must guarantee that a clear->dirty
+			 * transition on a cached TLB entry is written through
+			 * and traps if the PTE is unmapped.
+			 */
+			if (should_defer_flush(mm, flags))
+				set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+			else
+				flush_tlb_range(vma, address, end_addr);
 			if (pte_dirty(pteval))
 				folio_mark_dirty(folio);
 		} else {
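A detail the batched clear relies on: get_and_clear_full_ptes() folds the dirty and accessed bits of every PTE in the batch into the single pteval it returns, so the pte_dirty(pteval) check just above, and the later dirty test on the lazyfree path, still notice a write made through any of the nr_pages entries. A simplified sketch of the generic fallback (architectures may override it with a faster implementation):

static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
		unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
	pte_t pte, tmp_pte;

	pte = ptep_get_and_clear_full(mm, addr, ptep, full);
	while (--nr) {
		ptep++;
		addr += PAGE_SIZE;
		tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full);
		/* accumulate dirty/young so the caller sees the whole batch */
		if (pte_dirty(tmp_pte))
			pte = pte_mkdirty(pte);
		if (pte_young(tmp_pte))
			pte = pte_mkyoung(pte);
	}
	return pte;
}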
@@ -2027,7 +2050,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				 * redirtied either using the page table or a previously
 				 * obtained GUP reference.
 				 */
-				set_pte_at(mm, address, pvmw.pte, pteval);
+				set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 				folio_set_swapbacked(folio);
 				goto walk_abort;
 			} else if (ref_count != 1 + map_count) {
@@ -2040,10 +2063,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				 * We'll come back here later and detect if the folio was
 				 * dirtied when the additional reference is gone.
 				 */
-				set_pte_at(mm, address, pvmw.pte, pteval);
+				set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 				goto walk_abort;
 			}
-			dec_mm_counter(mm, MM_ANONPAGES);
+			add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
 			goto discard;
 		}
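When the folio turns out to be unreclaimable after all (it was redirtied, or extra references exist), the whole batch has to be remapped rather than just the first page, which is why set_pte_at() becomes set_ptes() in the two abort cases and why MM_ANONPAGES is adjusted by the full nr_pages on the discard path. set_ptes() writes nr consecutive PTEs, starting from the value returned by get_and_clear_full_ptes() and advancing the page frame number for each one. A simplified sketch of the generic helper (lazy-MMU and page-table-check hooks omitted):

static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, pte_t pte, unsigned int nr)
{
	for (;;) {
		set_pte(ptep, pte);
		if (--nr == 0)
			break;
		ptep++;
		pte = pte_next_pfn(pte);	/* same bits, next page frame */
	}
}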
@@ -2108,13 +2131,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			dec_mm_counter(mm, mm_counter_file(folio));
 		}
 discard:
-		if (unlikely(folio_test_hugetlb(folio)))
+		if (unlikely(folio_test_hugetlb(folio))) {
 			hugetlb_remove_rmap(folio);
-		else
-			folio_remove_rmap_pte(folio, subpage, vma);
+		} else {
+			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+			folio_ref_sub(folio, nr_pages - 1);
+		}
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
 		folio_put(folio);
+		/* We have already batched the entire folio */
+		if (nr_pages > 1)
+			goto walk_done;
 		continue;
 walk_abort:
 		ret = false;
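On the discard path, each of the page-table mappings just removed held a folio reference, so the batch drops nr_pages - 1 of them explicitly and lets the final folio_put() drop the last one; the jump to walk_done then skips the remaining rmap-walk iterations for this VMA, since the whole folio is already unmapped. For anyone wanting to observe the lazyfree memory this path consumes, kernels with MADV_FREE accounting report it as the LazyFree line in /proc/meminfo; a small reader, purely illustrative:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "LazyFree:", 9))
			fputs(line, stdout);	/* e.g. "LazyFree:    524288 kB" */
	fclose(f);
	return 0;
}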