@@ -12,6 +12,8 @@ extern atomic_long_t zswap_stored_pages;
#ifdef CONFIG_ZSWAP
struct swap_in_memory_cache_cb;
+struct zswap_decomp_batch;
+struct zswap_entry;
struct zswap_lruvec_state {
/*
@@ -120,6 +122,19 @@ static inline void zswap_store_batch(struct swap_in_memory_cache_cb *simc)
}
bool zswap_load_batching_enabled(void);
+void zswap_load_batch_init(struct zswap_decomp_batch *zd_batch);
+void zswap_load_batch_reinit(struct zswap_decomp_batch *zd_batch);
+bool __zswap_add_load_batch(struct zswap_decomp_batch *zd_batch,
+ struct folio *folio);
+static inline bool zswap_add_load_batch(
+ struct zswap_decomp_batch *zd_batch,
+ struct folio *folio)
+{
+ if (zswap_load_batching_enabled())
+ return __zswap_add_load_batch(zd_batch, folio);
+
+ return false;
+}
unsigned long zswap_total_pages(void);
bool zswap_store(struct folio *folio);
@@ -138,6 +153,8 @@ struct zswap_lruvec_state {};
struct zswap_store_sub_batch_page {};
struct zswap_store_pipeline_state {};
struct swap_in_memory_cache_cb;
+struct zswap_decomp_batch;
+struct zswap_entry;
static inline bool zswap_store_batching_enabled(void)
{
@@ -153,6 +170,24 @@ static inline bool zswap_load_batching_enabled(void)
return false;
}
+static inline void zswap_load_batch_init(
+ struct zswap_decomp_batch *zd_batch)
+{
+}
+
+static inline void zswap_load_batch_reinit(
+ struct zswap_decomp_batch *zd_batch)
+{
+}
+
+static inline bool zswap_add_load_batch(
+ struct zswap_decomp_batch *zd_batch,
+ struct folio *folio)
+{
+ return false;
+}
+
static inline bool zswap_store(struct folio *folio)
{
return false;
@@ -4322,7 +4322,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* To provide entry to swap_read_folio() */
folio->swap = entry;
- swap_read_folio(folio, NULL);
+ swap_read_folio(folio, NULL, NULL, NULL);
folio->private = NULL;
}
} else {
@@ -744,11 +744,17 @@ static void swap_read_folio_bdev_async(struct folio *folio,
submit_bio(bio);
}
-void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
+/*
+ * Returns true if the folio was read. Returns false if the folio was
+ * instead added to the caller's zswap_decomp_batch (for batched
+ * decompression) or non_zswap_batch folio_batch.
+ */
+bool swap_read_folio(struct folio *folio, struct swap_iocb **plug,
+ struct zswap_decomp_batch *zswap_batch,
+ struct folio_batch *non_zswap_batch)
{
struct swap_info_struct *sis = swp_swap_info(folio->swap);
bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO;
- bool workingset = folio_test_workingset(folio);
+ bool workingset;
unsigned long pflags;
bool in_thrashing;
@@ -756,11 +762,26 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
+ /*
+ * If the entry is found in the zswap xarray and zswap load batching
+ * is enabled, this folio is a candidate for zswap batch decompression.
+ */
+ if (zswap_batch && zswap_add_load_batch(zswap_batch, folio))
+ return false;
+
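+ /*
+ * Not in zswap: if zswap load batching is enabled and the caller
+ * passed a non_zswap_batch, hand the folio back in that batch
+ * instead of reading it here.
+ */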
+ if (zswap_load_batching_enabled() && non_zswap_batch) {
+ BUG_ON(!folio_batch_space(non_zswap_batch));
+ folio_batch_add(non_zswap_batch, folio);
+ return false;
+ }
+
/*
* Count submission time as memory stall and delay. When the device
* is congested, or the submitting cgroup IO-throttled, submission
* can be a significant part of overall IO time.
*/
+ workingset = folio_test_workingset(folio);
+
if (workingset) {
delayacct_thrashing_start(&in_thrashing);
psi_memstall_enter(&pflags);
@@ -792,6 +813,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
psi_memstall_leave(&pflags);
}
delayacct_swapin_end();
+ return true;
}
void __swap_read_unplug(struct swap_iocb *sio)
@@ -84,6 +84,27 @@ void swap_crypto_acomp_decompress_batch(
int nr_pages,
struct crypto_acomp_ctx *acomp_ctx);
+#if defined(CONFIG_ZSWAP_LOAD_BATCHING_ENABLED)
+#define MAX_NR_ZSWAP_LOAD_SUB_BATCHES DIV_ROUND_UP(PAGEVEC_SIZE, \
+ SWAP_CRYPTO_SUB_BATCH_SIZE)
+#else
+#define MAX_NR_ZSWAP_LOAD_SUB_BATCHES 1UL
+#endif /* CONFIG_ZSWAP_LOAD_BATCHING_ENABLED */
+
+/*
+ * Note: if PAGEVEC_SIZE or SWAP_CRYPTO_SUB_BATCH_SIZE is 256 or
+ * larger, change the u8 below to u16.
+ */
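+/*
+ * Folios collected for batched zswap loads: the folio_batch holds the
+ * batched folios; the per-sub-batch arrays track the compressed zswap
+ * entries (and their trees, destination pages and source lengths) to
+ * be decompressed together.
+ */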
+struct zswap_decomp_batch {
+ struct folio_batch fbatch;
+ bool swapcache[PAGEVEC_SIZE];
+ struct xarray *trees[MAX_NR_ZSWAP_LOAD_SUB_BATCHES][SWAP_CRYPTO_SUB_BATCH_SIZE];
+ struct zswap_entry *entries[MAX_NR_ZSWAP_LOAD_SUB_BATCHES][SWAP_CRYPTO_SUB_BATCH_SIZE];
+ struct page *pages[MAX_NR_ZSWAP_LOAD_SUB_BATCHES][SWAP_CRYPTO_SUB_BATCH_SIZE];
+ unsigned int slens[MAX_NR_ZSWAP_LOAD_SUB_BATCHES][SWAP_CRYPTO_SUB_BATCH_SIZE];
+ u8 nr_decomp[MAX_NR_ZSWAP_LOAD_SUB_BATCHES];
+};
+
/* linux/mm/vmscan.c, linux/mm/page_io.c, linux/mm/zswap.c */
/* For batching of compressions in reclaim path. */
struct swap_in_memory_cache_cb {
@@ -101,7 +122,9 @@ struct swap_in_memory_cache_cb {
/* linux/mm/page_io.c */
int sio_pool_init(void);
struct swap_iocb;
-void swap_read_folio(struct folio *folio, struct swap_iocb **plug);
+bool swap_read_folio(struct folio *folio, struct swap_iocb **plug,
+ struct zswap_decomp_batch *zswap_batch,
+ struct folio_batch *non_zswap_batch);
void __swap_read_unplug(struct swap_iocb *plug);
static inline void swap_read_unplug(struct swap_iocb *plug)
{
@@ -238,8 +261,12 @@ static inline void swap_crypto_acomp_decompress_batch(
{
}
-static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
+struct zswap_decomp_batch {};
+static inline bool swap_read_folio(struct folio *folio, struct swap_iocb **plug,
+ struct zswap_decomp_batch *zswap_batch,
+ struct folio_batch *non_zswap_batch)
{
+ return false;
}
static inline void swap_write_unplug(struct swap_iocb *sio)
{
@@ -570,7 +570,7 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
mpol_cond_put(mpol);
if (page_allocated)
- swap_read_folio(folio, plug);
+ swap_read_folio(folio, plug, NULL, NULL);
return folio;
}
@@ -687,7 +687,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!folio)
continue;
if (page_allocated) {
- swap_read_folio(folio, &splug);
+ swap_read_folio(folio, &splug, NULL, NULL);
if (offset != entry_offset) {
folio_set_readahead(folio);
count_vm_event(SWAP_RA);
@@ -703,7 +703,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
if (unlikely(page_allocated))
- swap_read_folio(folio, NULL);
+ swap_read_folio(folio, NULL, NULL, NULL);
return folio;
}
@@ -1057,7 +1057,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
if (!folio)
continue;
if (page_allocated) {
- swap_read_folio(folio, &splug);
+ swap_read_folio(folio, &splug, NULL, NULL);
if (addr != vmf->address) {
folio_set_readahead(folio);
count_vm_event(SWAP_RA);
@@ -1075,7 +1075,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
&page_allocated, false);
if (unlikely(page_allocated))
- swap_read_folio(folio, NULL);
+ swap_read_folio(folio, NULL, NULL, NULL);
return folio;
}
@@ -2312,6 +2312,95 @@ bool zswap_load(struct folio *folio)
return true;
}
+/* Code for zswap load batching with batch decompression. */
+
+__always_inline void zswap_load_batch_init(struct zswap_decomp_batch *zd_batch)
+{
+ unsigned int sb;
+
+ folio_batch_init(&zd_batch->fbatch);
+
+ for (sb = 0; sb < MAX_NR_ZSWAP_LOAD_SUB_BATCHES; ++sb)
+ zd_batch->nr_decomp[sb] = 0;
+}
+
+__always_inline void zswap_load_batch_reinit(struct zswap_decomp_batch *zd_batch)
+{
+ unsigned int sb;
+
+ folio_batch_reinit(&zd_batch->fbatch);
+
+ for (sb = 0; sb < MAX_NR_ZSWAP_LOAD_SUB_BATCHES; ++sb)
+ zd_batch->nr_decomp[sb] = 0;
+}
+
+/*
+ * All folios in zd_batch are allocated into the swapcache
+ * in swapin_readahead(), before being added to the zd_batch
+ * for batch decompression.
+ */
+bool __zswap_add_load_batch(struct zswap_decomp_batch *zd_batch,
+ struct folio *folio)
+{
+ swp_entry_t swp = folio->swap;
+ pgoff_t offset = swp_offset(swp);
+ bool swapcache = folio_test_swapcache(folio);
+ struct xarray *tree = swap_zswap_tree(swp);
+ struct zswap_entry *entry;
+ unsigned int batch_idx, sb;
+
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+
+ if (zswap_never_enabled())
+ return false;
+
+ /*
+ * Zswap does not properly load large folios: a large folio may only be
+ * partially in zswap. Large folios should therefore not be swapped in
+ * while zswap is in use.
+ *
+ * Returning false here will cause the large folio to be added to
+ * the "non_zswap_batch" in swap_read_folio(), which will eventually
+ * call zswap_load() if the folio is not in the zeromap. Finally,
+ * zswap_load() will return true without marking the folio uptodate
+ * so that an IO error is emitted (e.g. do_swap_page() will sigbus).
+ */
+ if (WARN_ON_ONCE(folio_test_large(folio)))
+ return false;
+
+ /*
+ * When reading into the swapcache, invalidate our entry. The
+ * swapcache can be the authoritative owner of the page and
+ * its mappings, and the pressure that results from having two
+ * in-memory copies outweighs any benefits of caching the
+ * compression work.
+ */
+ if (swapcache)
+ entry = xa_erase(tree, offset);
+ else
+ entry = xa_load(tree, offset);
+
+ if (!entry)
+ return false;
+
+ BUG_ON(!folio_batch_space(&zd_batch->fbatch));
+ folio_batch_add(&zd_batch->fbatch, folio);
+
+ batch_idx = folio_batch_count(&zd_batch->fbatch) - 1;
+ zd_batch->swapcache[batch_idx] = swapcache;
+ sb = batch_idx / SWAP_CRYPTO_SUB_BATCH_SIZE;
+
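+ /*
+ * Only compressed entries (entry->length != 0) need decompression:
+ * queue them into the current sub-batch's tree/entry/page/slen
+ * arrays.
+ */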
+ if (entry->length) {
+ zd_batch->trees[sb][zd_batch->nr_decomp[sb]] = tree;
+ zd_batch->entries[sb][zd_batch->nr_decomp[sb]] = entry;
+ zd_batch->pages[sb][zd_batch->nr_decomp[sb]] = &folio->page;
+ zd_batch->slens[sb][zd_batch->nr_decomp[sb]] = entry->length;
+ zd_batch->nr_decomp[sb]++;
+ }
+
+ return true;
+}
+
void zswap_invalidate(swp_entry_t swp)
{
pgoff_t offset = swp_offset(swp);
This patch modifies swap_read_folio() to check whether the swap entry is
present in zswap. If it is, and the caller (e.g. swapin_readahead()) has
passed in a valid "zswap_batch", the folio is added to that folio_batch.
A folio found in zswap is placed at the next available index in a
sub-batch. The sub-batches are part of "struct zswap_decomp_batch", which
progressively builds arrays of up to SWAP_CRYPTO_SUB_BATCH_SIZE zswap
entries/xarrays/pages/source-lengths ready for batch decompression by IAA.
The function that does this, zswap_add_load_batch(), returns true to
swap_read_folio(); if the entry is not found in zswap, it returns false.

If the swap entry was not found in zswap, and zswap_load_batching_enabled()
is true and a valid "non_zswap_batch" folio_batch was passed to
swap_read_folio(), the folio is added to the "non_zswap_batch".

Finally, if neither batch takes the folio, the code falls through to the
usual/existing swap_read_folio() flow.

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
 include/linux/zswap.h | 35 +++++++++++++++++
 mm/memory.c           |  2 +-
 mm/page_io.c          | 26 ++++++++++++-
 mm/swap.h             | 31 ++++++++++++++-
 mm/swap_state.c       | 10 ++---
 mm/zswap.c            | 89 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 183 insertions(+), 10 deletions(-)
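For context, a rough, illustrative sketch (not part of this patch) of how a
caller such as swapin_readahead() might drive the new interface. The helpers
zswap_finish_load_batch() and read_non_zswap_batch() are hypothetical
placeholders for the consumers of the two batches and are not defined by
this patch; zswap_load_batch_init(), folio_batch_init() and the new
swap_read_folio() signature are the only calls shown that exist here.

    /*
     * Illustrative sketch only (assumes nr <= PAGEVEC_SIZE).
     * zswap_finish_load_batch() and read_non_zswap_batch() are
     * hypothetical placeholders.
     */
    static void swapin_readahead_sketch(struct folio **folios, unsigned int nr,
                                        struct swap_iocb **splug)
    {
            struct zswap_decomp_batch zswap_batch;
            struct folio_batch non_zswap_batch;
            unsigned int i;

            zswap_load_batch_init(&zswap_batch);
            folio_batch_init(&non_zswap_batch);

            for (i = 0; i < nr; i++) {
                    /*
                     * Returns false if the folio was queued in zswap_batch
                     * or non_zswap_batch instead of being read immediately.
                     */
                    swap_read_folio(folios[i], splug, &zswap_batch,
                                    &non_zswap_batch);
            }

            /* Batch-decompress the zswap folios (e.g. via IAA). */
            zswap_finish_load_batch(&zswap_batch);

            /* Read the remaining folios through the existing path. */
            read_non_zswap_batch(&non_zswap_batch, splug);
    }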