@@ -144,10 +144,12 @@ bool zswap_never_enabled(void)
struct crypto_acomp_ctx {
struct crypto_acomp *acomp;
struct acomp_req *req;
- struct crypto_wait wait;
u8 *buffer;
+ unsigned int nr_reqs;
+ struct crypto_wait wait;
struct mutex mutex;
bool is_sleepable;
+ bool __online;
};
/*
@@ -246,6 +248,122 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);
+static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
+{
+ if (!IS_ERR_OR_NULL(acomp_ctx) && acomp_ctx->nr_reqs) {
+
+ if (!IS_ERR_OR_NULL(acomp_ctx->req))
+ acomp_request_free(acomp_ctx->req);
+ acomp_ctx->req = NULL;
+
+ kfree(acomp_ctx->buffer);
+ acomp_ctx->buffer = NULL;
+
+ if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+ crypto_free_acomp(acomp_ctx->acomp);
+
+ acomp_ctx->nr_reqs = 0;
+ }
+}
+
+static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ int ret = -ENOMEM;
+
+	/*
+	 * Fail-safe against changes in the assumptions and/or implementation
+	 * of the CPU hotplug code: nothing to do if this CPU is already online.
+	 */
+ if (acomp_ctx->__online)
+ return 0;
+
+ if (acomp_ctx->nr_reqs) {
+ acomp_ctx->__online = true;
+ return 0;
+ }
+
+ acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ if (IS_ERR(acomp_ctx->acomp)) {
+ pr_err("could not alloc crypto acomp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
+ ret = PTR_ERR(acomp_ctx->acomp);
+ goto fail;
+ }
+
+ acomp_ctx->nr_reqs = 1;
+
+ acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
+ if (!acomp_ctx->req) {
+ pr_err("could not alloc crypto acomp_request %s\n",
+ pool->tfm_name);
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffer) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ crypto_init_wait(&acomp_ctx->wait);
+
+	/*
+	 * If the acomp backend is an async zip driver, crypto_req_done() will
+	 * wake up crypto_wait_req(); if the backend is scomp, the callback
+	 * won't be called and crypto_wait_req() will return without blocking.
+	 */
+ acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &acomp_ctx->wait);
+
+ acomp_ctx->is_sleepable = acomp_is_async(acomp_ctx->acomp);
+
+ acomp_ctx->__online = true;
+
+ return 0;
+
+fail:
+ acomp_ctx_dealloc(acomp_ctx);
+
+ return ret;
+}
+
+static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+ mutex_lock(&acomp_ctx->mutex);
+ acomp_ctx->__online = false;
+ mutex_unlock(&acomp_ctx->mutex);
+
+ return 0;
+}
+
+static void zswap_cpu_comp_dealloc(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+ /*
+ * The lifetime of acomp_ctx resources is from pool creation to
+ * pool deletion.
+ *
+	 * Reclaim should not be happening, because we get to this routine
+	 * only in two scenarios:
+	 *
+	 * 1) Pool creation failures before/during the pool ref initialization.
+	 * 2) The pool is being released: it is off the zswap_pools list and
+	 *    has no references.
+ *
+ * Hence, there is no need for locks.
+ */
+ acomp_ctx->__online = false;
+ acomp_ctx_dealloc(acomp_ctx);
+}
+
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
struct zswap_pool *pool;
@@ -285,13 +403,21 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
- for_each_possible_cpu(cpu)
- mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
+ for_each_possible_cpu(cpu) {
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+ acomp_ctx->acomp = NULL;
+ acomp_ctx->req = NULL;
+ acomp_ctx->buffer = NULL;
+ acomp_ctx->__online = false;
+ acomp_ctx->nr_reqs = 0;
+ mutex_init(&acomp_ctx->mutex);
+ }
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
if (ret)
- goto error;
+ goto ref_fail;
/* being the current pool takes 1 ref; this func expects the
* caller to always add the new pool as the current pool
@@ -307,6 +433,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
return pool;
ref_fail:
+ for_each_possible_cpu(cpu)
+ zswap_cpu_comp_dealloc(cpu, &pool->node);
+
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
error:
if (pool->acomp_ctx)
@@ -361,8 +490,13 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
static void zswap_pool_destroy(struct zswap_pool *pool)
{
+ int cpu;
+
zswap_pool_debug("destroying", pool);
+ for_each_possible_cpu(cpu)
+ zswap_cpu_comp_dealloc(cpu, &pool->node);
+
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->acomp_ctx);
@@ -816,85 +950,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
/*********************************
* compressed storage functions
**********************************/
-static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp = NULL;
- struct acomp_req *req = NULL;
- u8 *buffer = NULL;
- int ret;
-
- buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!buffer) {
- ret = -ENOMEM;
- goto fail;
- }
-
- acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
- if (IS_ERR(acomp)) {
- pr_err("could not alloc crypto acomp %s : %ld\n",
- pool->tfm_name, PTR_ERR(acomp));
- ret = PTR_ERR(acomp);
- goto fail;
- }
-
- req = acomp_request_alloc(acomp);
- if (!req) {
- pr_err("could not alloc crypto acomp_request %s\n",
- pool->tfm_name);
- ret = -ENOMEM;
- goto fail;
- }
-
- /*
- * Only hold the mutex after completing allocations, otherwise we may
- * recurse into zswap through reclaim and attempt to hold the mutex
- * again resulting in a deadlock.
- */
- mutex_lock(&acomp_ctx->mutex);
- crypto_init_wait(&acomp_ctx->wait);
-
- /*
- * if the backend of acomp is async zip, crypto_req_done() will wakeup
- * crypto_wait_req(); if the backend of acomp is scomp, the callback
- * won't be called, crypto_wait_req() will return without blocking.
- */
- acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &acomp_ctx->wait);
-
- acomp_ctx->buffer = buffer;
- acomp_ctx->acomp = acomp;
- acomp_ctx->is_sleepable = acomp_is_async(acomp);
- acomp_ctx->req = req;
- mutex_unlock(&acomp_ctx->mutex);
- return 0;
-
-fail:
- if (acomp)
- crypto_free_acomp(acomp);
- kfree(buffer);
- return ret;
-}
-
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-
- mutex_lock(&acomp_ctx->mutex);
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- acomp_ctx->req = NULL;
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
- mutex_unlock(&acomp_ctx->mutex);
-
- return 0;
-}
static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
{
@@ -902,16 +957,52 @@ static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
for (;;) {
acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
- if (likely(acomp_ctx->req))
- return acomp_ctx;
/*
- * It is possible that we were migrated to a different CPU after
- * getting the per-CPU ctx but before the mutex was acquired. If
- * the old CPU got offlined, zswap_cpu_comp_dead() could have
- * already freed ctx->req (among other things) and set it to
- * NULL. Just try again on the new CPU that we ended up on.
+ * If the CPU onlining code successfully allocates acomp_ctx resources,
+	 * it sets acomp_ctx->__online to true. Until this happens, we have
+ * two options:
+ *
+ * 1. Return NULL and fail all stores on this CPU.
+ * 2. Retry, until onlining has finished allocating resources.
+ *
+	 * In theory, option 1 could be more appropriate, because it
+	 * allows the caller to decide how to handle reclaim racing with
+	 * CPU hotplug. For instance, it might be OK for compression to
+	 * return an error and let the backing swap device store the
+	 * folio, while decompression could wait for a valid, locked
+	 * mutex once onlining has completed. For now, we go with
+	 * option 2 because adding a do-while loop in
+	 * zswap_decompress() adds latency for software compressors.
+ *
+ * Once initialized, the resources will be de-allocated only
+ * when the pool is destroyed. The acomp_ctx will hold on to the
+ * resources through CPU offlining/onlining at any time until
+ * the pool is destroyed.
+ *
+	 * This prevents races/deadlocks between reclaim and the allocation
+	 * of the acomp_ctx resources that reclaim depends on.
+ * It further simplifies the interaction with CPU onlining and
+ * offlining:
+ *
+ * - CPU onlining does not take the mutex. It only allocates
+ * resources and sets __online to true.
+ * - CPU offlining acquires the mutex before setting
+ * __online to false. If reclaim has acquired the mutex,
+ * offlining will have to wait for reclaim to complete before
+	 *   hotunplug can proceed. Further, offlining merely sets
+	 *   __online to false; it does not delete the acomp_ctx
+	 *   resources.
+ *
+	 * Option 1 would still be better than potentially never exiting
+	 * this for (;;) loop if the system is running low on memory
+	 * and/or CPUs are getting offlined for whatever reason: failing
+	 * zswap_store() at least prevents data loss, since the data is
+	 * then saved in the backing swap device.
*/
+ mutex_lock(&acomp_ctx->mutex);
+ if (likely(acomp_ctx->__online))
+ return acomp_ctx;
+
mutex_unlock(&acomp_ctx->mutex);
}
}
This patch modifies the acomp_ctx resources' lifetime to be from pool
creation to deletion. A "bool __online" and "unsigned int nr_reqs" are
added to "struct crypto_acomp_ctx", which simplify a few things:

1) zswap_pool_create() will initialize all members of each percpu
   acomp_ctx to 0 or NULL and only then initialize the mutex.

2) CPU hotplug will set nr_reqs to 1, allocate resources and set
   __online to true, without locking the mutex.

3) CPU hotunplug will lock the mutex before setting __online to false.
   It will not delete any resources.

4) acomp_ctx_get_cpu_lock() will lock the mutex, then check if __online
   is true, and if so, return the acomp_ctx with the mutex held, for use
   in the zswap compress and decompress ops.

5) CPU onlining after offlining will simply check if either __online or
   nr_reqs is non-0, and return 0 if so, without re-allocating the
   resources.

6) zswap_pool_destroy() will call a newly added zswap_cpu_comp_dealloc()
   to delete the acomp_ctx resources.

7) Common resource deletion code for zswap_cpu_comp_prepare() errors,
   and for use in zswap_cpu_comp_dealloc(), is factored into a new
   acomp_ctx_dealloc().

The CPU hot[un]plug callback functions are moved to "pool functions"
accordingly.

The per-cpu memory cost of not deleting the acomp_ctx resources upon CPU
offlining, and only deleting them when the pool is destroyed, is as
follows:

  IAA with batching:     64.8 KB
  Software compressors:   8.2 KB

I would appreciate code review comments on whether this memory cost is
acceptable, for the latency improvement that it provides due to a faster
reclaim restart after a CPU hotunplug-hotplug sequence: all that the
hotplug code needs to do is check if acomp_ctx->nr_reqs is non-0, and if
so, set __online to true and return, so that reclaim can proceed.

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
 mm/zswap.c | 273 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 182 insertions(+), 91 deletions(-)
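As a quick illustration of the __online handshake described in points 2)
through 5) above, here is a minimal user-space sketch (not part of the
patch): a pthread mutex stands in for acomp_ctx->mutex, struct ctx_sketch
is a stripped-down stand-in for struct crypto_acomp_ctx, and
ctx_get_lock()/ctx_offline() are hypothetical names that mirror
acomp_ctx_get_cpu_lock() and zswap_cpu_comp_dead().

    #include <pthread.h>
    #include <stdbool.h>

    /* Stripped-down stand-in for struct crypto_acomp_ctx. */
    struct ctx_sketch {
            pthread_mutex_t mutex;
            bool online;            /* mirrors acomp_ctx->__online */
            unsigned int nr_reqs;   /* mirrors acomp_ctx->nr_reqs  */
    };

    /*
     * Reclaim side: retry until the ctx is online, then return with the
     * mutex held, as acomp_ctx_get_cpu_lock() does in the diff above.
     */
    static struct ctx_sketch *ctx_get_lock(struct ctx_sketch *ctx)
    {
            for (;;) {
                    pthread_mutex_lock(&ctx->mutex);
                    if (ctx->online)
                            return ctx;     /* caller compresses, then unlocks */
                    pthread_mutex_unlock(&ctx->mutex);
            }
    }

    /*
     * Offline side: clear the flag under the mutex, as zswap_cpu_comp_dead()
     * does above; resources stay allocated (nr_reqs is untouched).
     */
    static void ctx_offline(struct ctx_sketch *ctx)
    {
            pthread_mutex_lock(&ctx->mutex);
            ctx->online = false;
            pthread_mutex_unlock(&ctx->mutex);
    }

Because offlining only clears the flag while holding the mutex, any
reclaim that already holds it finishes first; and because the resources
are not freed until pool destruction, a subsequent onlining only has to
set the flag again.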