
[v8,14/14] mm: zswap: Compress batching with request chaining in zswap_store() of large folios.

Message ID 20250303084724.6490-15-kanchana.p.sridhar@intel.com (mailing list archive)
State New
Series zswap IAA compress batching

Commit Message

Sridhar, Kanchana P March 3, 2025, 8:47 a.m. UTC
This patch introduces three new procedures:

 zswap_store_folio()
 zswap_store_pages()
 zswap_batch_compress()

zswap_store_folio() stores a folio in chunks of "batch_size" pages. If the
compressor supports batching, i.e., the acomp_ctx has more than one request
(acomp_ctx->nr_reqs > 1), the batch_size is acomp_ctx->nr_reqs. If the
compressor does not support batching, the batch_size is
ZSWAP_MAX_BATCH_SIZE. Whether the compressor has multiple requests is
passed as a bool "batching" parameter to zswap_store_pages() and
zswap_batch_compress(). This refactoring moves the loop over the folio's
pages from zswap_store() to zswap_store_folio(), and also enables batching.
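
A condensed sketch of this chunking, mirroring the zswap_store_folio()
hunk in the patch below (the function name is illustrative; per-CPU
acomp_ctx locking is elided for brevity):

static bool zswap_store_folio_sketch(struct folio *folio,
				     struct obj_cgroup *objcg,
				     struct zswap_pool *pool,
				     struct crypto_acomp_ctx *acomp_ctx)
{
	long nr_pages = folio_nr_pages(folio);
	/* Batch with request chaining only if the compressor has >1 request. */
	bool batching = (acomp_ctx->nr_reqs > 1) && (nr_pages > 1);
	unsigned int batch_size = batching ? acomp_ctx->nr_reqs
					   : ZSWAP_MAX_BATCH_SIZE;
	long si, ei;

	/* [si, ei] is the inclusive page-index range of one batch. */
	for (si = 0; si < nr_pages; si = ei + 1) {
		ei = min(si + batch_size - 1, nr_pages - 1);
		if (!zswap_store_pages(folio, si, ei, objcg, pool,
				       acomp_ctx, batching))
			return false;
	}
	return true;
}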

zswap_store_pages() performs the operations that zswap_store_page()
previously did for a single page, but for multiple pages of a folio, namely
a "batch". zswap_store_pages() starts by allocating all zswap entries
required to store the batch. Next, it calls zswap_batch_compress() to
compress the batch. Finally, it adds the batch's zswap entries to the
xarray and LRU, charges zswap memory and increments zswap stats.

The error handling and cleanup required for all failure scenarios that can
occur while storing a batch in zswap are consolidated under a single
"store_pages_failed" label in zswap_store_pages().

Finally, this patch introduces zswap_batch_compress(), which does the
following:
  - If the compressor supports batching, it sets up a request chain for
    compressing the batch in one shot. If Intel IAA is the zswap
    compressor, the request chain is compressed in parallel in hardware.
    If all requests in the chain are compressed without errors, the
    compressed buffers are then stored in zpool.
  - If the compressor does not support batching, each page in the batch is
    compressed and stored sequentially.
zswap_batch_compress() replaces zswap_compress(), eliminating code
duplication and keeping the code maintainable as batching is introduced.

The call to the crypto layer is exactly the same in both cases: when batch
compressing a request chain or when sequentially compressing each page in
the batch.
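
A condensed view of zswap_batch_compress() from the hunk below shows both
paths funneling into the same call on the head request (the chaining
helpers acomp_request_chain()/acomp_reqchain_init() are added earlier in
this series):

	if (batching) {
		/* Chain request i onto the head request reqs[0]. */
		if (i)
			acomp_request_chain(acomp_ctx->reqs[i], acomp_ctx->reqs[0]);
		else
			acomp_reqchain_init(acomp_ctx->reqs[0], 0, crypto_req_done,
					    &acomp_ctx->wait);

		/* Submit the whole chain once the last page has been queued. */
		if (i == nr_pages - 1)
			err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]),
					      &acomp_ctx->wait);
	} else {
		/* No batching: compress this page now, with the same call. */
		err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]),
				      &acomp_ctx->wait);
	}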

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
 mm/zswap.c | 396 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 270 insertions(+), 126 deletions(-)

Comments

kernel test robot March 3, 2025, 11:07 a.m. UTC | #1
Hi Kanchana,

kernel test robot noticed the following build errors:

[auto build test ERROR on 5f089a9aa987ccf72df0c6955e168e865f280603]

url:    https://github.com/intel-lab-lkp/linux/commits/Kanchana-P-Sridhar/crypto-acomp-Add-synchronous-asynchronous-acomp-request-chaining/20250303-164927
base:   5f089a9aa987ccf72df0c6955e168e865f280603
patch link:    https://lore.kernel.org/r/20250303084724.6490-15-kanchana.p.sridhar%40intel.com
patch subject: [PATCH v8 14/14] mm: zswap: Compress batching with request chaining in zswap_store() of large folios.
config: s390-randconfig-001-20250303 (https://download.01.org/0day-ci/archive/20250303/202503031847.j1iReOtf-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250303/202503031847.j1iReOtf-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503031847.j1iReOtf-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/zswap.c:1166:4: error: call to undeclared function 'prefetchw'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    1166 |                         prefetchw(entries[j]);
         |                         ^
   1 error generated.


vim +/prefetchw +1166 mm/zswap.c

  1053	
  1054	/*
  1055	 * Unified code paths for compressors that do and do not support
  1056	 * batching. This procedure will compress multiple @nr_pages in @folio,
  1057	 * starting from @index.
  1058	 * If @batching is set to true, it will create a request chain for
  1059	 * compression batching. It is assumed that the caller has verified
  1060	 * that the acomp_ctx->nr_reqs is at least @nr_pages.
  1061	 * If @batching is set to false, it will process each page sequentially.
  1062	 * In both cases, if all compressions were successful, it will proceed
  1063	 * to store the compressed buffers in zpool.
  1064	 */
  1065	static bool zswap_batch_compress(struct folio *folio,
  1066					 long index,
  1067					 unsigned int nr_pages,
  1068					 struct zswap_entry *entries[],
  1069					 struct zswap_pool *pool,
  1070					 struct crypto_acomp_ctx *acomp_ctx,
  1071					 bool batching)
  1072	{
  1073		struct scatterlist inputs[ZSWAP_MAX_BATCH_SIZE];
  1074		struct scatterlist outputs[ZSWAP_MAX_BATCH_SIZE];
  1075		struct zpool *zpool = pool->zpool;
  1076		int acomp_idx = 0, nr_to_store = 1;
  1077		unsigned int i, j;
  1078		int err = 0;
  1079		gfp_t gfp;
  1080	
  1081		lockdep_assert_held(&acomp_ctx->mutex);
  1082	
  1083		gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
  1084		if (zpool_malloc_support_movable(zpool))
  1085			gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
  1086	
  1087		for (i = 0; i < nr_pages; ++i) {
  1088			struct page *page = folio_page(folio, index + i);
  1089	
  1090			sg_init_table(&inputs[acomp_idx], 1);
  1091			sg_set_page(&inputs[acomp_idx], page, PAGE_SIZE, 0);
  1092	
  1093			/*
  1094			 * Each dst buffer should be of size (PAGE_SIZE * 2).
  1095			 * Reflect same in sg_list.
  1096			 */
  1097			sg_init_one(&outputs[acomp_idx], acomp_ctx->buffers[acomp_idx], PAGE_SIZE * 2);
  1098			acomp_request_set_params(acomp_ctx->reqs[acomp_idx], &inputs[acomp_idx],
  1099						 &outputs[acomp_idx], PAGE_SIZE, PAGE_SIZE);
  1100	
  1101			if (batching) {
  1102				/* Add the acomp request to the chain. */
  1103				if (likely(i))
  1104					acomp_request_chain(acomp_ctx->reqs[acomp_idx], acomp_ctx->reqs[0]);
  1105				else
  1106					acomp_reqchain_init(acomp_ctx->reqs[0], 0, crypto_req_done,
  1107							    &acomp_ctx->wait);
  1108	
  1109				if (i == (nr_pages - 1)) {
  1110					/* Process the request chain. */
  1111					err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]), &acomp_ctx->wait);
  1112	
  1113					/*
  1114					 * Get the individual compress errors from request chaining.
  1115					 */
  1116					for (j = 0; j < nr_pages; ++j) {
  1117						if (unlikely(acomp_request_err(acomp_ctx->reqs[j]))) {
  1118							err = -EINVAL;
  1119							if (acomp_request_err(acomp_ctx->reqs[j]) == -ENOSPC)
  1120								zswap_reject_compress_poor++;
  1121							else
  1122								zswap_reject_compress_fail++;
  1123						}
  1124					}
  1125					/*
  1126					 * Request chaining cleanup:
  1127					 *
  1128					 * - Clear the CRYPTO_TFM_REQ_CHAIN bit on acomp_ctx->reqs[0].
  1129					 * - Reset the acomp_ctx->wait to notify acomp_ctx->reqs[0].
  1130					 */
  1131					acomp_reqchain_clear(acomp_ctx->reqs[0], &acomp_ctx->wait);
  1132					if (unlikely(err))
  1133						return false;
  1134					j = 0;
  1135					nr_to_store = nr_pages;
  1136					goto store_zpool;
  1137				}
  1138	
  1139				++acomp_idx;
  1140				continue;
  1141			} else {
  1142				err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]), &acomp_ctx->wait);
  1143	
  1144				if (unlikely(err)) {
  1145					if (err == -ENOSPC)
  1146						zswap_reject_compress_poor++;
  1147					else
  1148						zswap_reject_compress_fail++;
  1149					return false;
  1150				}
  1151				j = i;
  1152				nr_to_store = 1;
  1153			}
  1154	
  1155	store_zpool:
  1156			/*
  1157			 * All batch pages were successfully compressed.
  1158			 * Store the pages in zpool.
  1159			 */
  1160			acomp_idx = -1;
  1161			while (nr_to_store--) {
  1162				unsigned long handle;
  1163				char *buf;
  1164	
  1165				++acomp_idx;
> 1166				prefetchw(entries[j]);
  1167				err = zpool_malloc(zpool, acomp_ctx->reqs[acomp_idx]->dlen, gfp, &handle);
  1168	
  1169				if (unlikely(err)) {
  1170					if (err == -ENOSPC)
  1171						zswap_reject_compress_poor++;
  1172					else
  1173						zswap_reject_alloc_fail++;
  1174	
  1175					return false;
  1176				}
  1177	
  1178				buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
  1179				memcpy(buf, acomp_ctx->buffers[acomp_idx], acomp_ctx->reqs[acomp_idx]->dlen);
  1180				zpool_unmap_handle(zpool, handle);
  1181	
  1182				entries[j]->handle = handle;
  1183				entries[j]->length = acomp_ctx->reqs[acomp_idx]->dlen;
  1184				++j;
  1185			}
  1186		}
  1187	
  1188		return true;
  1189	}
  1190
kernel test robot March 3, 2025, 11:07 a.m. UTC | #2
Hi Kanchana,

kernel test robot noticed the following build errors:

[auto build test ERROR on 5f089a9aa987ccf72df0c6955e168e865f280603]

url:    https://github.com/intel-lab-lkp/linux/commits/Kanchana-P-Sridhar/crypto-acomp-Add-synchronous-asynchronous-acomp-request-chaining/20250303-164927
base:   5f089a9aa987ccf72df0c6955e168e865f280603
patch link:    https://lore.kernel.org/r/20250303084724.6490-15-kanchana.p.sridhar%40intel.com
patch subject: [PATCH v8 14/14] mm: zswap: Compress batching with request chaining in zswap_store() of large folios.
config: csky-randconfig-002-20250303 (https://download.01.org/0day-ci/archive/20250303/202503031843.GyX5ZVrC-lkp@intel.com/config)
compiler: csky-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250303/202503031843.GyX5ZVrC-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503031843.GyX5ZVrC-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/zswap.c: In function 'zswap_batch_compress':
>> mm/zswap.c:1166:25: error: implicit declaration of function 'prefetchw' [-Wimplicit-function-declaration]
    1166 |                         prefetchw(entries[j]);
         |                         ^~~~~~~~~


Nhat Pham March 3, 2025, 6:21 p.m. UTC | #3
On Mon, Mar 3, 2025 at 3:07 AM kernel test robot <lkp@intel.com> wrote:
>
> Hi Kanchana,
>
> kernel test robot noticed the following build errors:
>
> > 1166                          prefetchw(entries[j]);
> --

Why are we doing this anyway? Does it have a notable performance
difference? At the very least, leave a comment explaining why we're
prefetching this (although the build error suggests that we have to
remove it anyway).
Sridhar, Kanchana P March 3, 2025, 9:34 p.m. UTC | #4
> -----Original Message-----
> From: Nhat Pham <nphamcs@gmail.com>
> Sent: Monday, March 3, 2025 10:22 AM
> To: lkp <lkp@intel.com>
> Cc: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>; linux-
> kernel@vger.kernel.org; linux-mm@kvack.org; hannes@cmpxchg.org;
> yosry.ahmed@linux.dev; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; 21cnbao@gmail.com;
> ying.huang@linux.alibaba.com; akpm@linux-foundation.org; linux-
> crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; llvm@lists.linux.dev; oe-kbuild-
> all@lists.linux.dev; Feghali, Wajdi K <wajdi.k.feghali@intel.com>; Gopal,
> Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v8 14/14] mm: zswap: Compress batching with request
> chaining in zswap_store() of large folios.
> 
> On Mon, Mar 3, 2025 at 3:07 AM kernel test robot <lkp@intel.com> wrote:
> >
> > Hi Kanchana,
> >
> > kernel test robot noticed the following build errors:
> >
> > > 1166                          prefetchw(entries[j]);
> > --
> 
> Why are we doing this anyway? Does it have a notable performance
> difference? At the very least, leave a comment explaining why we're
> prefetching this (although the build error suggests that we have to
> remove it anyway).

Hi Nhat,

Yes, it does. The use of prefetchw reduces sys time by ~1.5% because
it minimizes cache-miss latency by moving the zswap entry to the cache
before it is written to. 

Here is data from the kernel compilation test, comparing v8 without prefetchw against v8 as-is:

--------------------------------------------------------------------------------
 Kernel compile       v8 without               v8      v8 without              v8
 allmodconfig          prefetchw                        prefetchw
 2M folios
 --------------------------------------------------------------------------------
 zswap compressor    deflate-iaa      deflate-iaa            zstd            zstd   
 --------------------------------------------------------------------------------
 real_sec                 732.89           735.63          768.53          758.21
 user_sec              15,708.37        15,699.84       15,702.64       15,678.73
 sys_sec                4,632.58         4,563.70        5,735.06        5,635.69
 --------------------------------------------------------------------------------
 Max_Res_Set_Size_KB   1,874,672        1,867,516       1,874,684       1,872,888
 --------------------------------------------------------------------------------
 memcg_high                    0                0               0               0
 memcg_swap_fail               0                0               0               0
 zswpout             114,742,930      112,836,725      92,904,961      89,596,085
 zswpin               41,184,897       39,983,793      31,018,149      29,163,932
 pswpout                     625            1,069             558           1,059
 pswpin                      599            1,056             540           1,051
 thp_swpout                    1                2               1               2
 thp_swpout_fallback      10,967           10,195           6,918           6,141
 pgmajfault           42,588,331       41,349,069      31,931,882      30,006,422
 ZSWPOUT-2048kB            7,661            8,710           6,799           7,480
 SWPOUT-2048kB                 1                2               1               2
 --------------------------------------------------------------------------------


Sure, I will add a comment, and also add "#include <linux/prefetch.h>" to
zswap.c, which will resolve the build error. This is similar to how other
files that use prefetchw handle it, e.g. mm/vmscan.c,
kernel/locking/qspinlock.c and include/asm-generic/xor.h.
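
A sketch of the intended fix (the comment wording is illustrative):

/* mm/zswap.c */
#include <linux/prefetch.h>	/* for prefetchw(); resolves the build error */

		/* ... in the zpool store loop of zswap_batch_compress() ... */

		/*
		 * The zswap entry is written to right after zpool_malloc();
		 * prefetching it for write hides the cache-miss latency
		 * (~1.5% sys time reduction in the kernel compile data above).
		 */
		prefetchw(entries[j]);
		err = zpool_malloc(zpool, acomp_ctx->reqs[acomp_idx]->dlen, gfp, &handle);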

Thanks,
Kanchana

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index fae59d6d5147..135d5792ce50 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1051,74 +1051,141 @@  static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
 	mutex_unlock(&acomp_ctx->mutex);
 }
 
-static bool zswap_compress(struct page *page, struct zswap_entry *entry,
-			   struct zswap_pool *pool)
+/*
+ * Unified code paths for compressors that do and do not support
+ * batching. This procedure will compress multiple @nr_pages in @folio,
+ * starting from @index.
+ * If @batching is set to true, it will create a request chain for
+ * compression batching. It is assumed that the caller has verified
+ * that the acomp_ctx->nr_reqs is at least @nr_pages.
+ * If @batching is set to false, it will process each page sequentially.
+ * In both cases, if all compressions were successful, it will proceed
+ * to store the compressed buffers in zpool.
+ */
+static bool zswap_batch_compress(struct folio *folio,
+				 long index,
+				 unsigned int nr_pages,
+				 struct zswap_entry *entries[],
+				 struct zswap_pool *pool,
+				 struct crypto_acomp_ctx *acomp_ctx,
+				 bool batching)
 {
-	struct crypto_acomp_ctx *acomp_ctx;
-	struct scatterlist input, output;
-	int comp_ret = 0, alloc_ret = 0;
-	unsigned int dlen = PAGE_SIZE;
-	unsigned long handle;
-	struct zpool *zpool;
-	char *buf;
+	struct scatterlist inputs[ZSWAP_MAX_BATCH_SIZE];
+	struct scatterlist outputs[ZSWAP_MAX_BATCH_SIZE];
+	struct zpool *zpool = pool->zpool;
+	int acomp_idx = 0, nr_to_store = 1;
+	unsigned int i, j;
+	int err = 0;
 	gfp_t gfp;
-	u8 *dst;
 
-	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
-	dst = acomp_ctx->buffers[0];
-	sg_init_table(&input, 1);
-	sg_set_page(&input, page, PAGE_SIZE, 0);
+	lockdep_assert_held(&acomp_ctx->mutex);
 
-	/*
-	 * We need PAGE_SIZE * 2 here since there maybe over-compression case,
-	 * and hardware-accelerators may won't check the dst buffer size, so
-	 * giving the dst buffer with enough length to avoid buffer overflow.
-	 */
-	sg_init_one(&output, dst, PAGE_SIZE * 2);
-	acomp_request_set_params(acomp_ctx->reqs[0], &input, &output, PAGE_SIZE, dlen);
-
-	/*
-	 * it maybe looks a little bit silly that we send an asynchronous request,
-	 * then wait for its completion synchronously. This makes the process look
-	 * synchronous in fact.
-	 * Theoretically, acomp supports users send multiple acomp requests in one
-	 * acomp instance, then get those requests done simultaneously. but in this
-	 * case, zswap actually does store and load page by page, there is no
-	 * existing method to send the second page before the first page is done
-	 * in one thread doing zwap.
-	 * but in different threads running on different cpu, we have different
-	 * acomp instance, so multiple threads can do (de)compression in parallel.
-	 */
-	comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]), &acomp_ctx->wait);
-	dlen = acomp_ctx->reqs[0]->dlen;
-	if (comp_ret)
-		goto unlock;
-
-	zpool = pool->zpool;
 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
 	if (zpool_malloc_support_movable(zpool))
 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
-	alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
-	if (alloc_ret)
-		goto unlock;
 
-	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
-	memcpy(buf, dst, dlen);
-	zpool_unmap_handle(zpool, handle);
+	for (i = 0; i < nr_pages; ++i) {
+		struct page *page = folio_page(folio, index + i);
 
-	entry->handle = handle;
-	entry->length = dlen;
+		sg_init_table(&inputs[acomp_idx], 1);
+		sg_set_page(&inputs[acomp_idx], page, PAGE_SIZE, 0);
 
-unlock:
-	if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
-		zswap_reject_compress_poor++;
-	else if (comp_ret)
-		zswap_reject_compress_fail++;
-	else if (alloc_ret)
-		zswap_reject_alloc_fail++;
+		/*
+		 * Each dst buffer should be of size (PAGE_SIZE * 2).
+		 * Reflect same in sg_list.
+		 */
+		sg_init_one(&outputs[acomp_idx], acomp_ctx->buffers[acomp_idx], PAGE_SIZE * 2);
+		acomp_request_set_params(acomp_ctx->reqs[acomp_idx], &inputs[acomp_idx],
+					 &outputs[acomp_idx], PAGE_SIZE, PAGE_SIZE);
+
+		if (batching) {
+			/* Add the acomp request to the chain. */
+			if (likely(i))
+				acomp_request_chain(acomp_ctx->reqs[acomp_idx], acomp_ctx->reqs[0]);
+			else
+				acomp_reqchain_init(acomp_ctx->reqs[0], 0, crypto_req_done,
+						    &acomp_ctx->wait);
+
+			if (i == (nr_pages - 1)) {
+				/* Process the request chain. */
+				err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]), &acomp_ctx->wait);
+
+				/*
+				 * Get the individual compress errors from request chaining.
+				 */
+				for (j = 0; j < nr_pages; ++j) {
+					if (unlikely(acomp_request_err(acomp_ctx->reqs[j]))) {
+						err = -EINVAL;
+						if (acomp_request_err(acomp_ctx->reqs[j]) == -ENOSPC)
+							zswap_reject_compress_poor++;
+						else
+							zswap_reject_compress_fail++;
+					}
+				}
+				/*
+				 * Request chaining cleanup:
+				 *
+				 * - Clear the CRYPTO_TFM_REQ_CHAIN bit on acomp_ctx->reqs[0].
+				 * - Reset the acomp_ctx->wait to notify acomp_ctx->reqs[0].
+				 */
+				acomp_reqchain_clear(acomp_ctx->reqs[0], &acomp_ctx->wait);
+				if (unlikely(err))
+					return false;
+				j = 0;
+				nr_to_store = nr_pages;
+				goto store_zpool;
+			}
+
+			++acomp_idx;
+			continue;
+		} else {
+			err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]), &acomp_ctx->wait);
+
+			if (unlikely(err)) {
+				if (err == -ENOSPC)
+					zswap_reject_compress_poor++;
+				else
+					zswap_reject_compress_fail++;
+				return false;
+			}
+			j = i;
+			nr_to_store = 1;
+		}
 
-	acomp_ctx_put_unlock(acomp_ctx);
-	return comp_ret == 0 && alloc_ret == 0;
+store_zpool:
+		/*
+		 * All batch pages were successfully compressed.
+		 * Store the pages in zpool.
+		 */
+		acomp_idx = -1;
+		while (nr_to_store--) {
+			unsigned long handle;
+			char *buf;
+
+			++acomp_idx;
+			prefetchw(entries[j]);
+			err = zpool_malloc(zpool, acomp_ctx->reqs[acomp_idx]->dlen, gfp, &handle);
+
+			if (unlikely(err)) {
+				if (err == -ENOSPC)
+					zswap_reject_compress_poor++;
+				else
+					zswap_reject_alloc_fail++;
+
+				return false;
+			}
+
+			buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
+			memcpy(buf, acomp_ctx->buffers[acomp_idx], acomp_ctx->reqs[acomp_idx]->dlen);
+			zpool_unmap_handle(zpool, handle);
+
+			entries[j]->handle = handle;
+			entries[j]->length = acomp_ctx->reqs[acomp_idx]->dlen;
+			++j;
+		}
+	}
+
+	return true;
 }
 
 static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
@@ -1581,84 +1648,165 @@  static void shrink_worker(struct work_struct *w)
 * main API
 **********************************/
 
-static bool zswap_store_page(struct page *page,
-			     struct obj_cgroup *objcg,
-			     struct zswap_pool *pool)
+/*
+ * Store multiple pages in @folio, starting from the page at index @si up to
+ * and including the page at index @ei.
+ *
+ * The error handling from all failure points is consolidated to the
+ * "store_pages_failed" label, based on the initialization of the zswap entries'
+ * handles to ERR_PTR(-EINVAL) at allocation time, and the fact that the
+ * entry's handle is subsequently modified only upon a successful zpool_malloc()
+ * after the page is compressed.
+ */
+static bool zswap_store_pages(struct folio *folio,
+			      long si,
+			      long ei,
+			      struct obj_cgroup *objcg,
+			      struct zswap_pool *pool,
+			      struct crypto_acomp_ctx *acomp_ctx,
+			      bool batching)
 {
-	swp_entry_t page_swpentry = page_swap_entry(page);
-	struct zswap_entry *entry, *old;
+	struct zswap_entry *entries[ZSWAP_MAX_BATCH_SIZE];
+	int node_id = folio_nid(folio);
+	u8 i, from_i = 0, nr_pages = ei - si + 1;
 
-	/* allocate entry */
-	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
-	if (!entry) {
-		zswap_reject_kmemcache_fail++;
-		return false;
+	for (i = 0; i < nr_pages; ++i) {
+		entries[i] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
+
+		if (unlikely(!entries[i])) {
+			zswap_reject_kmemcache_fail++;
+			nr_pages = i;
+			goto store_pages_failed;
+		}
+
+		entries[i]->handle = (unsigned long)ERR_PTR(-EINVAL);
 	}
 
-	if (!zswap_compress(page, entry, pool))
-		goto compress_failed;
+	if (!zswap_batch_compress(folio, si, nr_pages, entries, pool, acomp_ctx, batching))
+		goto store_pages_failed;
 
-	old = xa_store(swap_zswap_tree(page_swpentry),
-		       swp_offset(page_swpentry),
-		       entry, GFP_KERNEL);
-	if (xa_is_err(old)) {
-		int err = xa_err(old);
+	for (i = 0; i < nr_pages; ++i) {
+		swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, si + i));
+		struct zswap_entry *old, *entry = entries[i];
 
-		WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
-		zswap_reject_alloc_fail++;
-		goto store_failed;
-	}
+		old = xa_store(swap_zswap_tree(page_swpentry),
+			       swp_offset(page_swpentry),
+			       entry, GFP_KERNEL);
+		if (unlikely(xa_is_err(old))) {
+			int err = xa_err(old);
 
-	/*
-	 * We may have had an existing entry that became stale when
-	 * the folio was redirtied and now the new version is being
-	 * swapped out. Get rid of the old.
-	 */
-	if (old)
-		zswap_entry_free(old);
+			WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
+			zswap_reject_alloc_fail++;
+			from_i = i;
+			goto store_pages_failed;
+		}
 
-	/*
-	 * The entry is successfully compressed and stored in the tree, there is
-	 * no further possibility of failure. Grab refs to the pool and objcg,
-	 * charge zswap memory, and increment zswap_stored_pages.
-	 * The opposite actions will be performed by zswap_entry_free()
-	 * when the entry is removed from the tree.
-	 */
-	zswap_pool_get(pool);
-	if (objcg) {
-		obj_cgroup_get(objcg);
-		obj_cgroup_charge_zswap(objcg, entry->length);
-	}
-	atomic_long_inc(&zswap_stored_pages);
+		/*
+		 * We may have had an existing entry that became stale when
+		 * the folio was redirtied and now the new version is being
+		 * swapped out. Get rid of the old.
+		 */
+		if (unlikely(old))
+			zswap_entry_free(old);
 
-	/*
-	 * We finish initializing the entry while it's already in xarray.
-	 * This is safe because:
-	 *
-	 * 1. Concurrent stores and invalidations are excluded by folio lock.
-	 *
-	 * 2. Writeback is excluded by the entry not being on the LRU yet.
-	 *    The publishing order matters to prevent writeback from seeing
-	 *    an incoherent entry.
-	 */
-	entry->pool = pool;
-	entry->swpentry = page_swpentry;
-	entry->objcg = objcg;
-	entry->referenced = true;
-	if (entry->length) {
-		INIT_LIST_HEAD(&entry->lru);
-		zswap_lru_add(&zswap_list_lru, entry);
+		/*
+		 * The entry is successfully compressed and stored in the tree, there is
+		 * no further possibility of failure. Grab refs to the pool and objcg,
+		 * charge zswap memory, and increment zswap_stored_pages.
+		 * The opposite actions will be performed by zswap_entry_free()
+		 * when the entry is removed from the tree.
+		 */
+		zswap_pool_get(pool);
+		if (objcg) {
+			obj_cgroup_get(objcg);
+			obj_cgroup_charge_zswap(objcg, entry->length);
+		}
+		atomic_long_inc(&zswap_stored_pages);
+
+		/*
+		 * We finish initializing the entry while it's already in xarray.
+		 * This is safe because:
+		 *
+		 * 1. Concurrent stores and invalidations are excluded by folio lock.
+		 *
+		 * 2. Writeback is excluded by the entry not being on the LRU yet.
+		 *    The publishing order matters to prevent writeback from seeing
+		 *    an incoherent entry.
+		 */
+		entry->pool = pool;
+		entry->swpentry = page_swpentry;
+		entry->objcg = objcg;
+		entry->referenced = true;
+		if (likely(entry->length)) {
+			INIT_LIST_HEAD(&entry->lru);
+			zswap_lru_add(&zswap_list_lru, entry);
+		}
 	}
 
 	return true;
 
-store_failed:
-	zpool_free(pool->zpool, entry->handle);
-compress_failed:
-	zswap_entry_cache_free(entry);
+store_pages_failed:
+	for (i = from_i; i < nr_pages; ++i) {
+		if (!IS_ERR_VALUE(entries[i]->handle))
+			zpool_free(pool->zpool, entries[i]->handle);
+
+		zswap_entry_cache_free(entries[i]);
+	}
+
 	return false;
 }
 
+/*
+ * Store all pages in a folio by calling zswap_batch_compress().
+ * If the compressor supports batching, i.e., has multiple acomp requests,
+ * the folio will be compressed in batches of "acomp_ctx->nr_reqs" using
+ * request chaining.
+ * If the compressor has only one acomp request, the folio will be compressed
+ * in batches of ZSWAP_MAX_BATCH_SIZE pages, where each page in the batch is
+ * compressed sequentially.
+ */
+static bool zswap_store_folio(struct folio *folio,
+			      struct obj_cgroup *objcg,
+			      struct zswap_pool *pool)
+{
+	long nr_pages = folio_nr_pages(folio);
+	struct crypto_acomp_ctx *acomp_ctx;
+	unsigned int batch_size;
+	bool ret = true, batching;
+	long si, ei;
+
+	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+
+	batching = ((acomp_ctx->nr_reqs > 1) && (nr_pages > 1));
+
+	batch_size = batching ? acomp_ctx->nr_reqs : ZSWAP_MAX_BATCH_SIZE;
+
+	if (!batching)
+		acomp_ctx_put_unlock(acomp_ctx);
+
+	/* Store the folio in batches of "batch_size" pages. */
+	for (si = 0, ei = min(si + batch_size - 1, nr_pages - 1);
+	     ((si < nr_pages) && (ei < nr_pages));
+	     si = ei + 1, ei = min(si + batch_size - 1, nr_pages - 1)) {
+
+		if (!batching)
+			acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+
+		if (!zswap_store_pages(folio, si, ei, objcg, pool, acomp_ctx, batching)) {
+			ret = false;
+			break;
+		}
+
+		if (!batching)
+			acomp_ctx_put_unlock(acomp_ctx);
+	}
+
+	if (batching || !ret)
+		acomp_ctx_put_unlock(acomp_ctx);
+
+	return ret;
+}
+
 bool zswap_store(struct folio *folio)
 {
 	long nr_pages = folio_nr_pages(folio);
@@ -1667,7 +1815,6 @@  bool zswap_store(struct folio *folio)
 	struct mem_cgroup *memcg = NULL;
 	struct zswap_pool *pool;
 	bool ret = false;
-	long index;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1701,12 +1848,8 @@  bool zswap_store(struct folio *folio)
 		mem_cgroup_put(memcg);
 	}
 
-	for (index = 0; index < nr_pages; ++index) {
-		struct page *page = folio_page(folio, index);
-
-		if (!zswap_store_page(page, objcg, pool))
-			goto put_pool;
-	}
+	if (!zswap_store_folio(folio, objcg, pool))
+		goto put_pool;
 
 	if (objcg)
 		count_objcg_events(objcg, ZSWPOUT, nr_pages);
@@ -1733,6 +1876,7 @@  bool zswap_store(struct folio *folio)
 		pgoff_t offset = swp_offset(swp);
 		struct zswap_entry *entry;
 		struct xarray *tree;
+		long index;
 
 		for (index = 0; index < nr_pages; ++index) {
 			tree = swap_zswap_tree(swp_entry(type, offset + index));