Message ID | 20230103184257.118069-3-dima@arista.com (mailing list archive) |
---|---|
State | Superseded |
Series | net/crypto: Introduce crypto_pool |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Guessed tree name to be net-next, async |
netdev/fixes_present | success | Fixes tag not required for -next series |
netdev/subject_prefix | success | Link |
netdev/cover_letter | success | Series has a cover letter |
netdev/patch_count | success | Link |
netdev/header_inline | success | No static functions without inline keyword in header files |
netdev/build_32bit | success | Errors and warnings before: 1 this patch: 0 |
netdev/cc_maintainers | success | CCed 3 of 3 maintainers |
netdev/build_clang | success | Errors and warnings before: 2 this patch: 0 |
netdev/module_param | success | Was 0 now: 0 |
netdev/verify_signedoff | success | Signed-off-by tag matches author and committer |
netdev/check_selftest | success | No net selftest shell script |
netdev/verify_fixes | success | No Fixes tag |
netdev/build_allmodconfig_warn | success | Errors and warnings before: 1 this patch: 0 |
netdev/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 142 lines checked |
netdev/kdoc | success | Errors and warnings before: 0 this patch: 0 |
netdev/source_inline | success | Was 0 now: 0 |
On Tue, 3 Jan 2023 18:42:54 +0000 Dmitry Safonov wrote:
> Instead of having build-time hardcoded constant, reallocate scratch
> area, if needed by user. Different algos, different users may need
> different size of temp per-CPU buffer. Only up-sizing supported for
> simplicity.

> -static int crypto_pool_scratch_alloc(void)
> +/* Slow-path */
> +/**
> + * crypto_pool_reserve_scratch - re-allocates scratch buffer, slow-path
> + * @size: request size for the scratch/temp buffer
> + */
> +int crypto_pool_reserve_scratch(unsigned long size)

Does this have to be a separate call? Can't we make it part of
the pool allocation? AFAICT the scratch gets freed when last
pool is freed, so the user needs to know to allocate the pool
_first_ otherwise there's a potential race:

  CPU 1                 CPU 2

  alloc pool
                        set scratch
  free pool
   [frees scratch]
                        alloc pool

> {
> -	int cpu;
> -
> -	lockdep_assert_held(&cpool_mutex);
> +#define FREE_BATCH_SIZE 64
> +	void *free_batch[FREE_BATCH_SIZE];
> +	int cpu, err = 0;
> +	unsigned int i = 0;
> 
> +	mutex_lock(&cpool_mutex);
> +	if (size == scratch_size) {
> +		for_each_possible_cpu(cpu) {
> +			if (per_cpu(crypto_pool_scratch, cpu))
> +				continue;
> +			goto allocate_scratch;
> +		}
> +		mutex_unlock(&cpool_mutex);
> +		return 0;
> +	}
> +allocate_scratch:
> +	size = max(size, scratch_size);
> +	cpus_read_lock();
> 	for_each_possible_cpu(cpu) {
> -		void *scratch = per_cpu(crypto_pool_scratch, cpu);
> +		void *scratch, *old_scratch;
> 
> -		if (scratch)
> +		scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
> +		if (!scratch) {
> +			err = -ENOMEM;
> +			break;
> +		}
> +
> +		old_scratch = per_cpu(crypto_pool_scratch, cpu);
> +		/* Pairs with crypto_pool_get() */
> +		WRITE_ONCE(*per_cpu_ptr(&crypto_pool_scratch, cpu), scratch);

You're using RCU for protection here, please use rcu accessors.

> +		if (!cpu_online(cpu)) {
> +			kfree(old_scratch);
> 			continue;
> +		}
> +		free_batch[i++] = old_scratch;
> +		if (i == FREE_BATCH_SIZE) {
> +			cpus_read_unlock();
> +			synchronize_rcu();
> +			while (i > 0)
> +				kfree(free_batch[--i]);
> +			cpus_read_lock();
> +		}

This is a memory allocation routine, can we simplify this by
dynamically sizing "free_batch" and using call_rcu()?

struct humf_blah {
	struct rcu_head rcu;
	unsigned int cnt;
	void *data[];
};

cheezit = kmalloc(struct_size(blah, data, num_possible_cpus()));

for_each ..
	cheezit->data[cheezit->cnt++] = old_scratch;

call_rcu(&cheezit->rcu, my_free_them_scratches)

etc.

Feels like that'd be much less locking, unlocking and general
carefully'ing.

> +	}
> +	cpus_read_unlock();
> +	if (!err)
> +		scratch_size = size;
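For reference, a minimal self-contained sketch of the call_rcu() batching outlined above. The struct and callback names (scratch_free_batch, crypto_pool_free_old_scratches) are hypothetical, not part of the posted patch; the "rcu accessors" remark would likewise mean replacing the WRITE_ONCE()/READ_ONCE() pair on the per-CPU scratch pointer with rcu_assign_pointer()/rcu_dereference().

#include <linux/overflow.h>	/* struct_size() */
#include <linux/rcupdate.h>	/* call_rcu(), struct rcu_head */
#include <linux/slab.h>		/* kmalloc(), kfree() */

/*
 * Hypothetical illustration only: collect the old per-CPU scratch
 * pointers into one dynamically sized batch and free them after a
 * grace period, instead of dropping and re-taking the locks around
 * synchronize_rcu().
 */
struct scratch_free_batch {
	struct rcu_head rcu;
	unsigned int cnt;
	void *data[];
};

static void crypto_pool_free_old_scratches(struct rcu_head *head)
{
	struct scratch_free_batch *batch =
		container_of(head, struct scratch_free_batch, rcu);
	unsigned int i;

	for (i = 0; i < batch->cnt; i++)
		kfree(batch->data[i]);
	kfree(batch);
}

/* In the reallocation loop, roughly:
 *
 *	batch = kmalloc(struct_size(batch, data, num_possible_cpus()),
 *			GFP_KERNEL);
 *	...
 *	for_each_possible_cpu(cpu) {
 *		...
 *		batch->data[batch->cnt++] = old_scratch;
 *	}
 *	call_rcu(&batch->rcu, crypto_pool_free_old_scratches);
 */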
On 1/7/23 02:04, Jakub Kicinski wrote:
> On Tue, 3 Jan 2023 18:42:54 +0000 Dmitry Safonov wrote:
>> Instead of having build-time hardcoded constant, reallocate scratch
>> area, if needed by user. Different algos, different users may need
>> different size of temp per-CPU buffer. Only up-sizing supported for
>> simplicity.
> 
>> -static int crypto_pool_scratch_alloc(void)
>> +/* Slow-path */
>> +/**
>> + * crypto_pool_reserve_scratch - re-allocates scratch buffer, slow-path
>> + * @size: request size for the scratch/temp buffer
>> + */
>> +int crypto_pool_reserve_scratch(unsigned long size)
> 
> Does this have to be a separate call? Can't we make it part of
> the pool allocation? AFAICT the scratch gets freed when last
> pool is freed, so the user needs to know to allocate the pool
> _first_ otherwise there's a potential race:
> 
>   CPU 1                 CPU 2
> 
>   alloc pool
>                         set scratch
>   free pool
>    [frees scratch]
>                         alloc pool

Yeah, I think it will be cleaner if it was an argument for
crypto_pool_alloc_*() and would prevent potential misuse as you
describe. Which also means that I don't have to declare
crypto_pool_scratch_alloc() in patch 1, will just add a new parameter
in this patch to alloc function.

> 
>> {
>> -	int cpu;
>> -
>> -	lockdep_assert_held(&cpool_mutex);
>> +#define FREE_BATCH_SIZE 64
>> +	void *free_batch[FREE_BATCH_SIZE];
>> +	int cpu, err = 0;
>> +	unsigned int i = 0;
>> 
>> +	mutex_lock(&cpool_mutex);
>> +	if (size == scratch_size) {
>> +		for_each_possible_cpu(cpu) {
>> +			if (per_cpu(crypto_pool_scratch, cpu))
>> +				continue;
>> +			goto allocate_scratch;
>> +		}
>> +		mutex_unlock(&cpool_mutex);
>> +		return 0;
>> +	}
>> +allocate_scratch:
>> +	size = max(size, scratch_size);
>> +	cpus_read_lock();
>> 	for_each_possible_cpu(cpu) {
>> -		void *scratch = per_cpu(crypto_pool_scratch, cpu);
>> +		void *scratch, *old_scratch;
>> 
>> -		if (scratch)
>> +		scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
>> +		if (!scratch) {
>> +			err = -ENOMEM;
>> +			break;
>> +		}
>> +
>> +		old_scratch = per_cpu(crypto_pool_scratch, cpu);
>> +		/* Pairs with crypto_pool_get() */
>> +		WRITE_ONCE(*per_cpu_ptr(&crypto_pool_scratch, cpu), scratch);
> 
> You're using RCU for protection here, please use rcu accessors.

Will do.

> 
>> +		if (!cpu_online(cpu)) {
>> +			kfree(old_scratch);
>> 			continue;
>> +		}
>> +		free_batch[i++] = old_scratch;
>> +		if (i == FREE_BATCH_SIZE) {
>> +			cpus_read_unlock();
>> +			synchronize_rcu();
>> +			while (i > 0)
>> +				kfree(free_batch[--i]);
>> +			cpus_read_lock();
>> +		}
> 
> This is a memory allocation routine, can we simplify this by
> dynamically sizing "free_batch" and using call_rcu()?
> 
> struct humf_blah {
> 	struct rcu_head rcu;
> 	unsigned int cnt;
> 	void *data[];
> };
> 
> cheezit = kmalloc(struct_size(blah, data, num_possible_cpus()));
> 
> for_each ..
> 	cheezit->data[cheezit->cnt++] = old_scratch;
> 
> call_rcu(&cheezit->rcu, my_free_them_scratches)
> 
> etc.
> 
> Feels like that'd be much less locking, unlocking and general
> carefully'ing.

Will give it a try for v3, thanks for the idea and review,
          Dmitry
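One possible shape of the change discussed above, purely illustrative (the actual v3 signature may differ): the scratch size becomes an argument of the pool allocation call, so the scratch buffer can never be requested before, or outlive, the pools that use it.

/* Hypothetical v3-style API: the caller passes the scratch size
 * together with the algorithm, instead of calling a separate
 * crypto_pool_reserve_scratch() first.
 */
int crypto_pool_alloc_ahash(const char *alg, unsigned long scratch_size);

/* A user would then do something along these lines (the algorithm
 * name and the 168-byte scratch are arbitrary examples, and the
 * return convention - pool id or negative errno - is assumed from
 * the cpool[] lookup in the hunk above):
 */
static int example_setup(void)
{
	int pool_id;

	pool_id = crypto_pool_alloc_ahash("hmac(sha1)", 168);
	if (pool_id < 0)
		return pool_id;

	return 0;
}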
diff --git a/crypto/Kconfig b/crypto/Kconfig
index ba8d4a1f10f9..0614c2acfffa 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1394,6 +1394,12 @@ config CRYPTO_POOL
 	help
 	  Per-CPU pool of crypto requests ready for usage in atomic contexts.
 
+config CRYPTO_POOL_DEFAULT_SCRATCH_SIZE
+	hex "Per-CPU default scratch area size"
+	depends on CRYPTO_POOL
+	default 0x100
+	range 0x100 0x10000
+
 if !KMSAN # avoid false positives from assembly
 if ARM
 source "arch/arm/crypto/Kconfig"
diff --git a/crypto/crypto_pool.c b/crypto/crypto_pool.c
index 37131952c5a7..0cd9eade7b73 100644
--- a/crypto/crypto_pool.c
+++ b/crypto/crypto_pool.c
@@ -1,13 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <crypto/pool.h>
+#include <linux/cpu.h>
 #include <linux/kref.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/workqueue.h>
 
-static unsigned long scratch_size = DEFAULT_CRYPTO_POOL_SCRATCH_SZ;
+static unsigned long scratch_size = CONFIG_CRYPTO_POOL_DEFAULT_SCRATCH_SIZE;
 static DEFINE_PER_CPU(void *, crypto_pool_scratch);
 
 struct crypto_pool_entry {
@@ -22,26 +23,69 @@ static struct crypto_pool_entry cpool[CPOOL_SIZE];
 static unsigned int cpool_populated;
 static DEFINE_MUTEX(cpool_mutex);
 
-static int crypto_pool_scratch_alloc(void)
+/* Slow-path */
+/**
+ * crypto_pool_reserve_scratch - re-allocates scratch buffer, slow-path
+ * @size: request size for the scratch/temp buffer
+ */
+int crypto_pool_reserve_scratch(unsigned long size)
 {
-	int cpu;
-
-	lockdep_assert_held(&cpool_mutex);
+#define FREE_BATCH_SIZE 64
+	void *free_batch[FREE_BATCH_SIZE];
+	int cpu, err = 0;
+	unsigned int i = 0;
 
+	mutex_lock(&cpool_mutex);
+	if (size == scratch_size) {
+		for_each_possible_cpu(cpu) {
+			if (per_cpu(crypto_pool_scratch, cpu))
+				continue;
+			goto allocate_scratch;
+		}
+		mutex_unlock(&cpool_mutex);
+		return 0;
+	}
+allocate_scratch:
+	size = max(size, scratch_size);
+	cpus_read_lock();
 	for_each_possible_cpu(cpu) {
-		void *scratch = per_cpu(crypto_pool_scratch, cpu);
+		void *scratch, *old_scratch;
 
-		if (scratch)
+		scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+		if (!scratch) {
+			err = -ENOMEM;
+			break;
+		}
+
+		old_scratch = per_cpu(crypto_pool_scratch, cpu);
+		/* Pairs with crypto_pool_get() */
+		WRITE_ONCE(*per_cpu_ptr(&crypto_pool_scratch, cpu), scratch);
+		if (!cpu_online(cpu)) {
+			kfree(old_scratch);
 			continue;
+		}
+		free_batch[i++] = old_scratch;
+		if (i == FREE_BATCH_SIZE) {
+			cpus_read_unlock();
+			synchronize_rcu();
+			while (i > 0)
+				kfree(free_batch[--i]);
+			cpus_read_lock();
+		}
+	}
+	cpus_read_unlock();
+	if (!err)
+		scratch_size = size;
+	mutex_unlock(&cpool_mutex);
 
-		scratch = kmalloc_node(scratch_size, GFP_KERNEL,
-				       cpu_to_node(cpu));
-		if (!scratch)
-			return -ENOMEM;
-		per_cpu(crypto_pool_scratch, cpu) = scratch;
+	if (i > 0) {
+		synchronize_rcu();
+		while (i > 0)
+			kfree(free_batch[--i]);
 	}
-	return 0;
+	return err;
 }
+EXPORT_SYMBOL_GPL(crypto_pool_reserve_scratch);
 
 static void crypto_pool_scratch_free(void)
 {
@@ -138,7 +182,6 @@ int crypto_pool_alloc_ahash(const char *alg)
 
 	/* slow-path */
 	mutex_lock(&cpool_mutex);
-
 	for (i = 0; i < cpool_populated; i++) {
 		if (cpool[i].alg && !strcmp(cpool[i].alg, alg)) {
 			if (kref_read(&cpool[i].kref) > 0) {
@@ -263,7 +306,11 @@ int crypto_pool_get(unsigned int id, struct crypto_pool *c)
 		return -EINVAL;
 	}
 	ret->req = *this_cpu_ptr(cpool[id].req);
-	ret->base.scratch = this_cpu_read(crypto_pool_scratch);
+	/*
+	 * Pairs with crypto_pool_reserve_scratch(), scratch area is
+	 * valid (allocated) until crypto_pool_put().
+	 */
+	ret->base.scratch = READ_ONCE(*this_cpu_ptr(&crypto_pool_scratch));
 	return 0;
 }
 EXPORT_SYMBOL_GPL(crypto_pool_get);
diff --git a/include/crypto/pool.h b/include/crypto/pool.h
index 2c61aa45faff..c7d817860cc3 100644
--- a/include/crypto/pool.h
+++ b/include/crypto/pool.h
@@ -4,8 +4,6 @@
 
 #include <crypto/hash.h>
 
-#define DEFAULT_CRYPTO_POOL_SCRATCH_SZ 128
-
 struct crypto_pool {
 	void *scratch;
 };
@@ -20,6 +18,7 @@ struct crypto_pool_ahash {
 	struct ahash_request *req;
 };
 
+int crypto_pool_reserve_scratch(unsigned long size);
 int crypto_pool_alloc_ahash(const char *alg);
 void crypto_pool_add(unsigned int id);
 void crypto_pool_release(unsigned int id);
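The CRYPTO_POOL_DEFAULT_SCRATCH_SIZE option added above replaces the DEFAULT_CRYPTO_POOL_SCRATCH_SZ constant removed from include/crypto/pool.h. In a config fragment it would be set roughly like this (0x400 is an arbitrary value inside the permitted 0x100-0x10000 range; whether CRYPTO_POOL is boolean or tristate is defined in patch 1, not shown here):

CONFIG_CRYPTO_POOL=y
CONFIG_CRYPTO_POOL_DEFAULT_SCRATCH_SIZE=0x400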
Instead of having build-time hardcoded constant, reallocate scratch
area, if needed by user. Different algos, different users may need
different size of temp per-CPU buffer. Only up-sizing supported for
simplicity.

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 crypto/Kconfig        |  6 ++++
 crypto/crypto_pool.c  | 77 ++++++++++++++++++++++++++++++++++---------
 include/crypto/pool.h |  3 +-
 3 files changed, 69 insertions(+), 17 deletions(-)
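As posted, a caller would pair the reserve call with pool allocation roughly as follows. This is a sketch only: the algorithm name and the 512-byte size are arbitrary, and the pool-id return convention of crypto_pool_alloc_ahash() is assumed from the cpool[] lookup in the patch rather than stated explicitly here.

/* Sketch of a user of the posted API. */
static int example_init(void)
{
	int pool_id, err;

	/* Only up-sizing is supported; per the review above, this must
	 * not race with the last pool being freed (which releases the
	 * scratch area). */
	err = crypto_pool_reserve_scratch(512);
	if (err)
		return err;

	/* assumed to return a pool id on success, negative errno on error */
	pool_id = crypto_pool_alloc_ahash("hmac(sha256)");
	return pool_id;
}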