@@ -68,7 +68,7 @@ static inline bool zswap_is_enabled(void)
static inline bool zswap_never_enabled(void)
{
- return false;
+ return true;
}
#endif
@@ -3987,6 +3987,141 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
+/*
+ * Check that a range of PTEs consists entirely of swap entries with
+ * contiguous swap offsets and that they all agree on SWAP_HAS_CACHE.
+ * ptep must point to the first PTE in the range.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
+{
+ struct swap_info_struct *si;
+ unsigned long addr;
+ swp_entry_t entry;
+ pgoff_t offset;
+ char has_cache;
+ int idx, i;
+ pte_t pte;
+
+ addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
+ idx = (vmf->address - addr) / PAGE_SIZE;
+ pte = ptep_get(ptep);
+
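+ /*
+ * The first PTE must equal the faulting PTE with its swap offset
+ * shifted back by idx; together with the batch check below this
+ * guarantees the range is one contiguous run of swap entries.
+ */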
+ if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
+ return false;
+ entry = pte_to_swp_entry(pte);
+ offset = swp_offset(entry);
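+ /*
+ * The range must start at an nr_pages-aligned swap offset and all
+ * nr_pages PTEs must form a single batch of contiguous swap entries.
+ */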
+ if (!IS_ALIGNED(offset, nr_pages))
+ return false;
+ if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
+ return false;
+
+ si = swp_swap_info(entry);
+ has_cache = si->swap_map[offset] & SWAP_HAS_CACHE;
+ for (i = 1; i < nr_pages; i++) {
+ /*
+ * This runs while allocating a large folio and doing swap_read_folio()
+ * for the SWP_SYNCHRONOUS_IO path, in which case the PTE being
+ * faulted has no swapcache. We need to ensure none of the other PTEs
+ * has swapcache either; otherwise we might read from the swap device
+ * while the content is actually in the swapcache.
+ */
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE) != has_cache)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Get a list of all the (large) orders below PMD_ORDER that are enabled
+ * for this vma. Then filter out the orders that can't be allocated over
+ * the faulting address and still be fully contained in the vma.
+ */
+static inline unsigned long get_alloc_folio_orders(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long orders;
+
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+ TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+ return orders;
+}
+#else
+static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
+{
+ return false;
+}
+#endif
+
+static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ unsigned long orders;
+ struct folio *folio;
+ unsigned long addr;
+ spinlock_t *ptl;
+ pte_t *pte;
+ gfp_t gfp;
+ int order;
+
+ /*
+ * If uffd is active for the vma we need per-page fault fidelity to
+ * maintain the uffd semantics.
+ */
+ if (unlikely(userfaultfd_armed(vma)))
+ goto fallback;
+
+ /*
+ * A large folio being swapped in could be partially in zswap and
+ * partially on swap devices. As zswap doesn't support large folios
+ * yet, reading all subpages from swap devices could give us
+ * corrupted, zero-filled data for the subpages that are actually
+ * in zswap.
+ */
+ if (!zswap_never_enabled())
+ goto fallback;
+
+ orders = get_alloc_folio_orders(vmf);
+ if (!orders)
+ goto fallback;
+
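+ /*
+ * Map the PTE page and take the PTL for the whole PMD range so we
+ * can inspect every PTE a folio of any candidate order would cover.
+ */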
+ pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address & PMD_MASK, &ptl);
+ if (unlikely(!pte))
+ goto fallback;
+
+ /*
+ * For do_swap_page, find the highest order for which the naturally
+ * aligned range consists entirely of swap entries with contiguous
+ * swap offsets.
+ */
+ order = highest_order(orders);
+ while (orders) {
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
+ break;
+ order = next_order(&orders, order);
+ }
+
+ pte_unmap_unlock(pte, ptl);
+
+ /* Try allocating the highest of the remaining orders. */
+ gfp = vma_thp_gfp_mask(vma);
+ while (orders) {
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ folio = vma_alloc_folio(gfp, order, vma, addr, true);
+ if (folio)
+ return folio;
+ order = next_order(&orders, order);
+ }
+
+fallback:
+#endif
+ return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false);
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4075,35 +4210,38 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
- /*
- * Prevent parallel swapin from proceeding with
- * the cache flag. Otherwise, another thread may
- * finish swapin first, free the entry, and swapout
- * reusing the same entry. It's undetectable as
- * pte_same() returns true due to entry reuse.
- */
- if (swapcache_prepare(entry)) {
- /* Relax a bit to prevent rapid repeated page faults */
- schedule_timeout_uninterruptible(1);
- goto out;
- }
- need_clear_cache = true;
-
/* skip swapcache */
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
- vma, vmf->address, false);
+ folio = alloc_swap_folio(vmf);
page = &folio->page;
if (folio) {
__folio_set_locked(folio);
__folio_set_swapbacked(folio);
+ nr_pages = folio_nr_pages(folio);
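+ /*
+ * For a large folio, point entry at the first swap slot
+ * the folio covers; alloc_swap_folio() has ensured the
+ * covered slots are contiguous and naturally aligned.
+ */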
+ if (folio_test_large(folio))
+ entry.val = ALIGN_DOWN(entry.val, nr_pages);
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare_nr(entry, nr_pages)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out_page;
+ }
+ need_clear_cache = true;
+
if (mem_cgroup_swapin_charge_folio(folio,
vma->vm_mm, GFP_KERNEL,
entry)) {
ret = VM_FAULT_OOM;
goto out_page;
}
- mem_cgroup_swapin_uncharge_swap(entry);
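+ /* Uncharge swap for each entry covered by the folio */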
+ for (swp_entry_t e = entry; e.val < entry.val + nr_pages; e.val++)
+ mem_cgroup_swapin_uncharge_swap(e);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
@@ -4210,6 +4348,22 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_nomap;
}
+ /*
+ * Large folios allocated for SWP_SYNCHRONOUS_IO swapin bypass the
+ * swapcache, so re-check that the whole aligned PTE range still
+ * maps the expected contiguous swap entries before mapping it.
+ */
+ if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
+ unsigned long nr = folio_nr_pages(folio);
+ unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
+ unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
+ pte_t *folio_ptep = vmf->pte - idx;
+
+ if (!can_swapin_thp(vmf, folio_ptep, nr))
+ goto out_nomap;
+
+ page_idx = idx;
+ address = folio_start;
+ ptep = folio_ptep;
+ goto check_folio;
+ }
+
nr_pages = 1;
page_idx = 0;
address = vmf->address;
@@ -4341,11 +4495,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_add_lru_vma(folio, vma);
} else if (!folio_test_anon(folio)) {
/*
- * We currently only expect small !anon folios, which are either
- * fully exclusive or fully shared. If we ever get large folios
- * here, we have to be careful.
+ * We currently only expect small !anon folios, which are either
+ * fully exclusive or fully shared, or newly allocated large folios,
+ * which are fully exclusive. If we ever get large folios within
+ * swapcache here, we have to be careful.
*/
- VM_WARN_ON_ONCE(folio_test_large(folio));
+ VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
} else {
@@ -4388,7 +4543,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
if (need_clear_cache)
- swapcache_clear(si, entry);
+ swapcache_clear_nr(si, entry, nr_pages);
if (si)
put_swap_device(si);
return ret;
@@ -4404,7 +4559,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_put(swapcache);
}
if (need_clear_cache)
- swapcache_clear(si, entry);
+ swapcache_clear_nr(si, entry, nr_pages);
if (si)
put_swap_device(si);
return ret;
@@ -4440,14 +4595,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
if (unlikely(userfaultfd_armed(vma)))
goto fallback;
- /*
- * Get a list of all the (large) orders below PMD_ORDER that are enabled
- * for this vma. Then filter out the orders that can't be allocated over
- * the faulting address and still be fully contained in the vma.
- */
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
- orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+ orders = get_alloc_folio_orders(vmf);
if (!orders)
goto fallback;
@@ -478,7 +478,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* Swap entry may have been freed since our caller observed it.
*/
- err = swapcache_prepare(entry);
+ err = swapcache_prepare_nr(entry, 1);
if (!err)
break;