@@ -1512,9 +1512,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(folio, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
- NULL) == 0) {
+ if (!swap_cache_add_wait(folio, swap,
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
shmem_recalc_inode(inode, 0, 1);
swap_shmem_alloc(swap);
shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
@@ -34,23 +34,20 @@ extern struct address_space *swapper_spaces[];
void show_swap_cache_info(void);
bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
-int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp, void **shadowp);
void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow);
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(swp_entry_t entry);
+int swap_cache_add_wait(struct folio *folio, swp_entry_t entry, gfp_t gfp);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
pgoff_t index);
-
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug);
struct folio *swap_cache_alloc_or_get(swp_entry_t entry, gfp_t gfp_flags,
- struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
- bool skip_if_exists);
+ struct mempolicy *mpol, pgoff_t ilx, bool *folio_allocated);
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct folio *swapin_direct(swp_entry_t entry, gfp_t flag,
@@ -109,6 +106,11 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
+static inline int swap_cache_add_wait(struct folio *folio, swp_entry_t entry, gfp_t gfp)
+{
+ return -1;
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
@@ -132,12 +134,6 @@ static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
return NULL;
}
-static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp_mask, void **shadowp)
-{
- return -1;
-}
-
static inline void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow)
{
@@ -85,8 +85,8 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
* add_to_swap_cache resembles filemap_add_folio on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp, void **shadowp)
+static int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
+ gfp_t gfp, void **shadowp)
{
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
@@ -169,14 +169,16 @@ bool add_to_swap(struct folio *folio)
/*
* Add it to the swap cache.
*/
- err = add_to_swap_cache(folio, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
- if (err)
+ err = swap_cache_add_wait(folio, entry,
+ __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
+ if (err) {
/*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
+ * swap_cache_add_wait() doesn't return -EEXIST, so we can
+ * safely clear SWAP_HAS_CACHE flag.
*/
goto fail;
+ }
+
/*
* Normally the folio will be dirtied in unmap because its
* pte should be dirty. A special case is MADV_FREE page. The
@@ -208,11 +210,12 @@ void delete_from_swap_cache(struct folio *folio)
swp_entry_t entry = folio->swap;
struct address_space *address_space = swap_address_space(entry);
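+ /* Free the entry before removing the folio from the swap cache, see put_swap_folio(). */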
+ put_swap_folio(folio, entry);
+
xa_lock_irq(&address_space->i_pages);
__delete_from_swap_cache(folio, entry, NULL);
xa_unlock_irq(&address_space->i_pages);
- put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
@@ -385,119 +388,123 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
return folio;
}
-struct folio *swap_cache_alloc_or_get(swp_entry_t entry, gfp_t gfp_mask,
- struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
- bool skip_if_exists)
+/*
+ * Try to add a new folio. Return the folio already in the swap cache if
+ * someone else added one first, or NULL if the entry has been freed or
+ * we hit OOM.
+ */
+static struct folio *swap_cache_add_or_get(struct folio *folio,
+ swp_entry_t entry, gfp_t gfp_mask)
{
- struct swap_info_struct *si;
- struct folio *folio;
+ int ret = 0;
void *shadow = NULL;
+ struct address_space *address_space = swap_address_space(entry);
- *new_page_allocated = false;
- si = get_swap_device(entry);
- if (!si)
- return NULL;
-
- for (;;) {
- int err;
- /*
- * First check the swap cache. Since this is normally
- * called after swap_cache_get_folio() failed, re-calling
- * that would confuse statistics.
- */
- folio = filemap_get_folio(swap_address_space(entry),
- swp_offset(entry));
- if (!IS_ERR(folio))
- goto got_folio;
-
- /*
- * Just skip read ahead for unused swap slot.
- * During swap_off when swap_slot_cache is disabled,
- * we have to handle the race between putting
- * swap entry in swap cache and marking swap slot
- * as SWAP_HAS_CACHE. That's done in later part of code or
- * else swap_off will be aborted if we return NULL.
- */
- if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
- goto fail_put_swap;
-
- /*
- * Get a new folio to read into from swap. Allocate it now,
- * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
- * cause any racers to loop around until we add it to cache.
- */
- folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
- mpol, ilx, numa_node_id());
- if (!folio)
- goto fail_put_swap;
-
- /*
- * Swap entry may have been freed since our caller observed it.
- */
- err = swapcache_prepare(entry);
- if (!err)
- break;
-
- folio_put(folio);
- if (err != -EEXIST)
- goto fail_put_swap;
-
- /*
- * Protect against a recursive call to swap_cache_alloc_or_get()
- * on the same entry waiting forever here because SWAP_HAS_CACHE
- * is set but the folio is not the swap cache yet. This can
- * happen today if mem_cgroup_swapin_charge_folio() below
- * triggers reclaim through zswap, which may call
- * swap_cache_alloc_or_get() in the writeback path.
- */
- if (skip_if_exists)
- goto fail_put_swap;
+ /* If folio is NULL, simply go look up the swap cache. */
+ if (folio) {
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
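+ /* May fail with -EEXIST (a folio already occupies the entry) or -ENOMEM. */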
+ ret = add_to_swap_cache(folio, entry, gfp_mask, &shadow);
+ if (ret)
+ __folio_clear_locked(folio);
+ }
- /*
- * We might race against __delete_from_swap_cache(), and
- * stumble across a swap_map entry whose SWAP_HAS_CACHE
- * has not yet been cleared. Or race against another
- * swap_cache_alloc_or_get(), which has set SWAP_HAS_CACHE
- * in swap_map, but not yet added its folio to swap cache.
- */
- schedule_timeout_uninterruptible(1);
+ if (!folio || ret) {
+ /* If someone else already added a folio, return it untouched. */
+ folio = filemap_get_folio(address_space, swp_offset(entry));
+ /* If not, either the entry has been freed or we are OOM. */
+ if (IS_ERR(folio))
+ return NULL;
+ return folio;
}
/*
- * The swap entry is ours to swap in. Prepare the new folio.
+ * The folio is now in the swap cache; try to update the swap map
+ * to ensure the entry is still valid. If we accidentally added
+ * a stale entry, undo the add.
*/
+ ret = swapcache_prepare(entry);
+ if (unlikely(ret))
+ goto fail_delete_cache;
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
+ /* Charge and shadow check */
if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
- goto fail_unlock;
-
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
- goto fail_unlock;
-
+ goto fail_put_flag;
mem_cgroup_swapin_uncharge_swap(entry);
-
if (shadow)
workingset_refault(folio, shadow);
- /* Caller will initiate read into locked folio */
+ /* Return the newly added folio locked. */
folio_add_lru(folio);
- *new_page_allocated = true;
-got_folio:
- put_swap_device(si);
return folio;
-fail_unlock:
+fail_put_flag:
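+ /* Undo swapcache_prepare(): drop the SWAP_HAS_CACHE reference. */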
put_swap_folio(folio, entry);
+fail_delete_cache:
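+ /* Remove the folio we just added from the swap cache and drop the cache reference. */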
+ xa_lock_irq(&address_space->i_pages);
+ __delete_from_swap_cache(folio, entry, shadow);
+ xa_unlock_irq(&address_space->i_pages);
+ folio_ref_sub(folio, folio_nr_pages(folio));
folio_unlock(folio);
- folio_put(folio);
-fail_put_swap:
- put_swap_device(si);
+
return NULL;
}
+/*
+ * Try to add a folio to the swap cache. The caller must ensure the entry
+ * is valid (not freed). May block if swap_cache_alloc_or_get() accidentally
+ * loaded a stale (freed) entry; it will be removed very soon, so just wait
+ * and retry.
+ */
+int swap_cache_add_wait(struct folio *folio, swp_entry_t entry, gfp_t gfp)
+{
+ int ret;
+ struct folio *wait_folio;
+
+ for (;;) {
+ ret = add_to_swap_cache(folio, entry, gfp, NULL);
+ if (ret != -EEXIST)
+ break;
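+ /*
+ * -EEXIST: another folio still occupies the entry but should be
+ * removed soon, wait for it to be unlocked and retry.
+ */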
+ wait_folio = filemap_get_folio(swap_address_space(entry),
+ swp_offset(entry));
+ if (!IS_ERR(wait_folio)) {
+ folio_wait_locked(wait_folio);
+ folio_put(wait_folio);
+ }
+ }
+
+ return ret;
+}
+
+struct folio *swap_cache_alloc_or_get(swp_entry_t entry, gfp_t gfp_mask,
+ struct mempolicy *mpol, pgoff_t ilx, bool *folio_allocated)
+{
+ struct folio *folio, *swapcache = NULL;
+ struct swap_info_struct *si;
+
+ /* Prevent swapoff from happening to us */
+ si = get_swap_device(entry);
+ if (!si)
+ goto out_no_device;
+
+ /* We are very likely the first user, allocate and try to add to the swap cache. */
+ folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, mpol, ilx,
+ numa_node_id());
+ swapcache = swap_cache_add_or_get(folio, entry, gfp_mask);
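+ /*
+ * If we did not end up adding our own folio (or failed to allocate
+ * one), drop it and return whatever the swap cache holds, which may
+ * be NULL.
+ */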
+ if (!folio || swapcache != folio) {
+ if (folio)
+ folio_put(folio);
+ goto out_no_alloc;
+ }
+
+ put_swap_device(si);
+ *folio_allocated = true;
+ return swapcache;
+
+out_no_alloc:
+ put_swap_device(si);
+out_no_device:
+ *folio_allocated = false;
+ return swapcache;
+}
+
/*
* Locate a page of swap in physical memory, reserving swap cache space
* and reading the disk if it is not already cached.
@@ -519,7 +526,7 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
mpol = get_vma_policy(vma, addr, 0, &ilx);
folio = swap_cache_alloc_or_get(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
mpol_cond_put(mpol);
if (page_allocated)
@@ -636,7 +643,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
/* Ok, do the async read-ahead now */
folio = swap_cache_alloc_or_get(
swp_entry(swp_type(entry), offset),
- gfp_mask, mpol, ilx, &page_allocated, false);
+ gfp_mask, mpol, ilx, &page_allocated);
if (!folio)
continue;
if (page_allocated) {
@@ -654,7 +661,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
skip:
/* The page was likely read above, so no need for plugging here */
folio = swap_cache_alloc_or_get(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
if (unlikely(page_allocated)) {
zswap_folio_swapin(folio);
swap_read_folio(folio, false, NULL);
@@ -810,7 +817,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
pte_unmap(pte);
pte = NULL;
folio = swap_cache_alloc_or_get(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
if (!folio)
continue;
if (page_allocated) {
@@ -830,7 +837,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
skip:
/* The folio was likely read above, so no need for plugging here */
folio = swap_cache_alloc_or_get(targ_entry, gfp_mask, mpol, targ_ilx,
- &page_allocated, false);
+ &page_allocated);
if (unlikely(page_allocated)) {
zswap_folio_swapin(folio);
swap_read_folio(folio, false, NULL);
@@ -856,7 +863,7 @@ struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
folio = swap_cache_alloc_or_get(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
mpol_cond_put(mpol);
if (page_allocated)
@@ -149,9 +149,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
* in usual operations.
*/
if (folio_trylock(folio)) {
- if ((flags & TTRS_ANYWAY) ||
+ if (folio_test_swapcache(folio) &&
+ ((flags & TTRS_ANYWAY) ||
((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
- ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))))
ret = folio_free_swap(folio);
folio_unlock(folio);
}
@@ -1344,7 +1345,8 @@ void swap_free(swp_entry_t entry)
}
/*
- * Called after dropping swapcache to decrease refcnt to swap entries.
+ * Called before dropping the swapcache: free the entry and ensure
+ * a new insertion can succeed.
*/
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
@@ -1897,13 +1899,15 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
if (!folio) {
swp_count = READ_ONCE(si->swap_map[offset]);
- if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
+ if (swap_count(swp_count) == 0 || swp_count == SWAP_MAP_BAD)
continue;
return -ENOMEM;
}
folio_lock(folio);
folio_wait_writeback(folio);
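+ /* The folio may no longer be in the swap cache; skip the unuse and just release it. */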
+ if (!folio_test_swapcache(folio))
+ goto free_folio;
ret = unuse_pte(vma, pmd, addr, entry, folio);
if (ret < 0) {
folio_unlock(folio);
@@ -1912,6 +1916,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
folio_free_swap(folio);
+free_folio:
folio_unlock(folio);
folio_put(folio);
} while (addr += PAGE_SIZE, addr != end);
@@ -732,10 +732,10 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
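+ /* Free the entry first, see the comment above put_swap_folio(). */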
+ put_swap_folio(folio, swap);
__delete_from_swap_cache(folio, swap, shadow);
mem_cgroup_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
- put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
@@ -1127,7 +1127,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
/* try to allocate swap cache folio */
mpol = get_task_policy(current);
folio = swap_cache_alloc_or_get(swpentry, GFP_KERNEL, mpol,
- NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ NO_INTERLEAVE_INDEX, &folio_was_allocated);
if (!folio)
return -ENOMEM;