mm: zswap: multiple zpool support

Message ID 20230531022911.1168524-1-yosryahmed@google.com (mailing list archive)
State New
Series mm: zswap: multiple zpool support

Commit Message

Yosry Ahmed May 31, 2023, 2:29 a.m. UTC
Support using multiple zpools of the same type in zswap, for concurrency
purposes. Add CONFIG_ZSWAP_NR_ZPOOLS_ORDER to control the number of
zpools. The order is specified by the config rather than the absolute
number to guarantee a power of 2. This is useful so that we can
deterministically link each entry to a zpool by hashing the zswap_entry
pointer.
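
For illustration only -- the sketch below is not part of the patch; it
just restates the zswap_find_zpool() helper the patch adds: with order 5
we get 1 << 5 = 32 zpools, and hash_ptr() over the entry pointer picks
one of them deterministically.

	/* illustrative sketch, mirroring zswap_find_zpool() from the patch */
	static unsigned int zswap_nr_zpools = 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER;

	static struct zpool *pick_zpool(struct zswap_entry *entry)
	{
		/* hash_ptr(ptr, bits) returns a value in [0, 1 << bits) */
		int i = zswap_nr_zpools == 1 ? 0 :
			hash_ptr(entry, ilog2(zswap_nr_zpools));

		return entry->pool->zpools[i];
	}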

On a setup with zswap and zsmalloc, comparing a single zpool (current
default) to 32 zpools (by setting CONFIG_ZSWAP_NR_ZPOOLS_ORDER=5) shows
improvements in the zsmalloc lock contention, especially on the swap out
path.

The following shows the perf analysis of the swapout path when 10
workloads are simultaneously reclaiming and refaulting tmpfs pages.
There are some improvements on the swapin path as well, but much less
significant.

1 zpool:

 |--28.99%--zswap_frontswap_store
       |     |
       <snip>
       |     |
       |     |--8.98%--zpool_map_handle
       |     |     |
       |     |      --8.98%--zs_zpool_map
       |     |           |
       |     |            --8.95%--zs_map_object
       |     |                 |
       |     |                  --8.38%--_raw_spin_lock
       |     |                       |
       |     |                        --7.39%--queued_spin_lock_slowpath
       |     |
       |     |--8.82%--zpool_malloc
       |     |     |
       |     |      --8.82%--zs_zpool_malloc
       |     |           |
       |     |            --8.80%--zs_malloc
       |     |                 |
       |     |                 |--7.21%--_raw_spin_lock
       |     |                 |     |
       |     |                 |      --6.81%--queued_spin_lock_slowpath
       <snip>

32 zpools:

 |--16.73%--zswap_frontswap_store
       |     |
       <snip>
       |     |
       |     |--1.81%--zpool_malloc
       |     |     |
       |     |      --1.81%--zs_zpool_malloc
       |     |           |
       |     |            --1.79%--zs_malloc
       |     |                 |
       |     |                  --0.73%--obj_malloc
       |     |
       |     |--1.06%--zswap_update_total_size
       |     |
       |     |--0.59%--zpool_map_handle
       |     |     |
       |     |      --0.59%--zs_zpool_map
       |     |           |
       |     |            --0.57%--zs_map_object
       |     |                 |
       |     |                  --0.51%--_raw_spin_lock
       <snip>

Suggested-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
 mm/Kconfig | 12 +++++++
 mm/zswap.c | 95 ++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 76 insertions(+), 31 deletions(-)

Comments

Yosry Ahmed May 31, 2023, 2:32 a.m. UTC | #1
On Tue, May 30, 2023 at 7:29 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> Support using multiple zpools of the same type in zswap, for concurrency
> purposes. Add CONFIG_ZSWAP_NR_ZPOOLS_ORDER to control the number of
> zpools. The order is specified by the config rather than the absolute
> number to guarantee a power of 2. This is useful so that we can
> deterministically link each entry to a zpool by hashing the zswap_entry
> pointer.
>
> On a setup with zswap and zsmalloc, comparing a single zpool (current
> default) to 32 zpools (by setting CONFIG_ZSWAP_NR_ZPOOLS_ORDER=5) shows
> improvements in the zsmalloc lock contention, especially on the swap out
> path.
>
> The following shows the perf analysis of the swapout path when 10
> workloads are simultaneously reclaiming and refaulting tmpfs pages.
> There are some improvements on the swapin path as well, but much less
> significant.
>
> 1 zpool:
>
>  |--28.99%--zswap_frontswap_store
>        |     |
>        <snip>
>        |     |
>        |     |--8.98%--zpool_map_handle
>        |     |     |
>        |     |      --8.98%--zs_zpool_map
>        |     |           |
>        |     |            --8.95%--zs_map_object
>        |     |                 |
>        |     |                  --8.38%--_raw_spin_lock
>        |     |                       |
>        |     |                        --7.39%--queued_spin_lock_slowpath
>        |     |
>        |     |--8.82%--zpool_malloc
>        |     |     |
>        |     |      --8.82%--zs_zpool_malloc
>        |     |           |
>        |     |            --8.80%--zs_malloc
>        |     |                 |
>        |     |                 |--7.21%--_raw_spin_lock
>        |     |                 |     |
>        |     |                 |      --6.81%--queued_spin_lock_slowpath
>        <snip>
>
> 32 zpools:
>
>  |--16.73%--zswap_frontswap_store
>        |     |
>        <snip>
>        |     |
>        |     |--1.81%--zpool_malloc
>        |     |     |
>        |     |      --1.81%--zs_zpool_malloc
>        |     |           |
>        |     |            --1.79%--zs_malloc
>        |     |                 |
>        |     |                  --0.73%--obj_malloc
>        |     |
>        |     |--1.06%--zswap_update_total_size
>        |     |
>        |     |--0.59%--zpool_map_handle
>        |     |     |
>        |     |      --0.59%--zs_zpool_map
>        |     |           |
>        |     |            --0.57%--zs_map_object
>        |     |                 |
>        |     |                  --0.51%--_raw_spin_lock
>        <snip>
>
> Suggested-by: Yu Zhao <yuzhao@google.com>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> ---
>  mm/Kconfig | 12 +++++++
>  mm/zswap.c | 95 ++++++++++++++++++++++++++++++++++++------------------
>  2 files changed, 76 insertions(+), 31 deletions(-)
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 92c30879bf67..de1da56d2c07 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -59,6 +59,18 @@ config ZSWAP_EXCLUSIVE_LOADS
>           The cost is that if the page was never dirtied and needs to be
>           swapped out again, it will be re-compressed.
>
> +config ZSWAP_NR_ZPOOLS_ORDER
> +       int "Number of zpools in zswap, as power of 2"
> +       default 0
> +       depends on ZSWAP
> +       help
> +         This option determines the number of zpools to use for zswap; it
> +         will be 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER.
> +
> +         Having multiple zpools helps with concurrency and lock contention
> +         on the swap in and swap out paths, but uses a little bit of extra
> +         space.
> +
>  choice
>         prompt "Default compressor"
>         depends on ZSWAP
> diff --git a/mm/zswap.c b/mm/zswap.c
> index fba80330afd1..7111036f8aa5 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -137,6 +137,9 @@ static bool zswap_non_same_filled_pages_enabled = true;
>  module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
>                    bool, 0644);
>
> +/* Order of zpools for global pool when memcg is enabled */

This comment is an artifact of an older implementation, please ignore.

> +static unsigned int zswap_nr_zpools = 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER;
> +
>  /*********************************
>  * data structures
>  **********************************/
> @@ -150,7 +153,6 @@ struct crypto_acomp_ctx {
>  };
>
>  struct zswap_pool {
> -       struct zpool *zpool;
>         struct crypto_acomp_ctx __percpu *acomp_ctx;
>         struct kref kref;
>         struct list_head list;
> @@ -158,6 +160,7 @@ struct zswap_pool {
>         struct work_struct shrink_work;
>         struct hlist_node node;
>         char tfm_name[CRYPTO_MAX_ALG_NAME];
> +       struct zpool *zpools[];
>  };
>
>  /*
> @@ -236,7 +239,7 @@ static bool zswap_has_pool;
>
>  #define zswap_pool_debug(msg, p)                               \
>         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> -                zpool_get_type((p)->zpool))
> +                zpool_get_type((p)->zpools[0]))
>
>  static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
>  static int zswap_pool_get(struct zswap_pool *pool);
> @@ -263,11 +266,13 @@ static void zswap_update_total_size(void)
>  {
>         struct zswap_pool *pool;
>         u64 total = 0;
> +       int i;
>
>         rcu_read_lock();
>
>         list_for_each_entry_rcu(pool, &zswap_pools, list)
> -               total += zpool_get_total_size(pool->zpool);
> +               for (i = 0; i < zswap_nr_zpools; i++)
> +                       total += zpool_get_total_size(pool->zpools[i]);
>
>         rcu_read_unlock();
>
> @@ -350,6 +355,16 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
>         }
>  }
>
> +static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> +{
> +       int i;
> +
> +       i = zswap_nr_zpools == 1 ? 0 :
> +           hash_ptr(entry, ilog2(zswap_nr_zpools));
> +
> +       return entry->pool->zpools[i];
> +}
> +
>  /*
>   * Carries out the common pattern of freeing and entry's zpool allocation,
>   * freeing the entry itself, and decrementing the number of stored pages.
> @@ -363,7 +378,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
>         if (!entry->length)
>                 atomic_dec(&zswap_same_filled_pages);
>         else {
> -               zpool_free(entry->pool->zpool, entry->handle);
> +               zpool_free(zswap_find_zpool(entry), entry->handle);
>                 zswap_pool_put(entry->pool);
>         }
>         zswap_entry_cache_free(entry);
> @@ -572,7 +587,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>         list_for_each_entry_rcu(pool, &zswap_pools, list) {
>                 if (strcmp(pool->tfm_name, compressor))
>                         continue;
> -               if (strcmp(zpool_get_type(pool->zpool), type))
> +               /* all zpools share the same type */
> +               if (strcmp(zpool_get_type(pool->zpools[0]), type))
>                         continue;
>                 /* if we can't get it, it's about to be destroyed */
>                 if (!zswap_pool_get(pool))
> @@ -587,14 +603,17 @@ static void shrink_worker(struct work_struct *w)
>  {
>         struct zswap_pool *pool = container_of(w, typeof(*pool),
>                                                 shrink_work);
> +       int i;
>
> -       if (zpool_shrink(pool->zpool, 1, NULL))
> -               zswap_reject_reclaim_fail++;
> +       for (i = 0; i < zswap_nr_zpools; i++)
> +               if (zpool_shrink(pool->zpools[i], 1, NULL))
> +                       zswap_reject_reclaim_fail++;
>         zswap_pool_put(pool);
>  }
>
>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  {
> +       int i;
>         struct zswap_pool *pool;
>         char name[38]; /* 'zswap' + 32 char (max) num + \0 */
>         gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> @@ -611,19 +630,25 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>                         return NULL;
>         }
>
> -       pool = kzalloc(sizeof(*pool), GFP_KERNEL);
> +       pool = kzalloc(sizeof(*pool) +
> +                      sizeof(pool->zpools[0]) * zswap_nr_zpools,
> +                      GFP_KERNEL);
>         if (!pool)
>                 return NULL;
>
> -       /* unique name for each pool specifically required by zsmalloc */
> -       snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
> +       for (i = 0; i < zswap_nr_zpools; i++) {
> +               /* unique name for each pool specifically required by zsmalloc */
> +               snprintf(name, 38, "zswap%x",
> +                        atomic_inc_return(&zswap_pools_count));
>
> -       pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
> -       if (!pool->zpool) {
> -               pr_err("%s zpool not available\n", type);
> -               goto error;
> +               pool->zpools[i] = zpool_create_pool(type, name, gfp,
> +                                                   &zswap_zpool_ops);
> +               if (!pool->zpools[i]) {
> +                       pr_err("%s zpool not available\n", type);
> +                       goto error;
> +               }
>         }
> -       pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
> +       pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
>
>         strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
>
> @@ -653,8 +678,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  error:
>         if (pool->acomp_ctx)
>                 free_percpu(pool->acomp_ctx);
> -       if (pool->zpool)
> -               zpool_destroy_pool(pool->zpool);
> +       while (i--)
> +               zpool_destroy_pool(pool->zpools[i]);
>         kfree(pool);
>         return NULL;
>  }
> @@ -703,11 +728,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
>
>  static void zswap_pool_destroy(struct zswap_pool *pool)
>  {
> +       int i;
> +
>         zswap_pool_debug("destroying", pool);
>
>         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>         free_percpu(pool->acomp_ctx);
> -       zpool_destroy_pool(pool->zpool);
> +       for (i = 0; i < zswap_nr_zpools; i++)
> +               zpool_destroy_pool(pool->zpools[i]);
>         kfree(pool);
>  }
>
> @@ -1160,6 +1188,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         unsigned long handle, value;
>         char *buf;
>         u8 *src, *dst;
> +       struct zpool *zpool;
>         struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
>         gfp_t gfp;
>
> @@ -1259,11 +1288,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         }
>
>         /* store */
> -       hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> +       zpool = zswap_find_zpool(entry);
> +       hlen = zpool_evictable(zpool) ? sizeof(zhdr) : 0;
>         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> -       if (zpool_malloc_support_movable(entry->pool->zpool))
> +       if (zpool_malloc_support_movable(zpool))
>                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> -       ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
> +       ret = zpool_malloc(zpool, hlen + dlen, gfp, &handle);
> +
>         if (ret == -ENOSPC) {
>                 zswap_reject_compress_poor++;
>                 goto put_dstmem;
> @@ -1272,10 +1303,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>                 zswap_reject_alloc_fail++;
>                 goto put_dstmem;
>         }
> -       buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
> +       buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
>         memcpy(buf, &zhdr, hlen);
>         memcpy(buf + hlen, dst, dlen);
> -       zpool_unmap_handle(entry->pool->zpool, handle);
> +       zpool_unmap_handle(zpool, handle);
>         mutex_unlock(acomp_ctx->mutex);
>
>         /* populate entry */
> @@ -1353,6 +1384,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>         u8 *src, *dst, *tmp;
>         unsigned int dlen;
>         int ret;
> +       struct zpool *zpool;
>
>         /* find */
>         spin_lock(&tree->lock);
> @@ -1372,7 +1404,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>                 goto stats;
>         }
>
> -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> +       zpool = zswap_find_zpool(entry);
> +       if (!zpool_can_sleep_mapped(zpool)) {
>                 tmp = kmalloc(entry->length, GFP_KERNEL);
>                 if (!tmp) {
>                         ret = -ENOMEM;
> @@ -1382,14 +1415,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>
>         /* decompress */
>         dlen = PAGE_SIZE;
> -       src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> -       if (zpool_evictable(entry->pool->zpool))
> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> +       if (zpool_evictable(zpool))
>                 src += sizeof(struct zswap_header);
>
> -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> +       if (!zpool_can_sleep_mapped(zpool)) {
>                 memcpy(tmp, src, entry->length);
>                 src = tmp;
> -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> +               zpool_unmap_handle(zpool, entry->handle);
>         }
>
>         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> @@ -1401,8 +1434,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>         ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
>         mutex_unlock(acomp_ctx->mutex);
>
> -       if (zpool_can_sleep_mapped(entry->pool->zpool))
> -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> +       if (zpool_can_sleep_mapped(zpool))
> +               zpool_unmap_handle(zpool, entry->handle);
>         else
>                 kfree(tmp);
>
> @@ -1558,7 +1591,7 @@ static int zswap_setup(void)
>         pool = __zswap_pool_create_fallback();
>         if (pool) {
>                 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
> -                       zpool_get_type(pool->zpool));
> +                       zpool_get_type(pool->zpools[0]));
>                 list_add(&pool->list, &zswap_pools);
>                 zswap_has_pool = true;
>         } else {
> --
> 2.41.0.rc0.172.g3f132b7071-goog
>
Yosry Ahmed June 13, 2023, 8:13 p.m. UTC | #2
On Mon, Jun 5, 2023 at 6:56 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Fri, Jun 2, 2023 at 1:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Jun 02, 2023 at 12:14:28PM -0700, Yosry Ahmed wrote:
> > > On Fri, Jun 2, 2023 at 11:34 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >
> > > > On Fri, Jun 02, 2023 at 09:59:20AM -0700, Yosry Ahmed wrote:
> > > > > On Fri, Jun 2, 2023 at 9:49 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > > Again, what about the zswap_tree.lock and swap_info_struct.lock?
> > > > > > They're the same scope unless you use multiple swap files. Would it
> > > > > > make sense to tie pools to trees, so that using multiple swapfiles for
> > > > > > concurrency purposes also implies this optimization?
> > > > >
> > > > > Yeah, using multiple swapfiles helps with those locks, but it doesn't
> > > > > help with the zpool lock.
> > > > >
> > > > > I am reluctant to take this path because I am trying to get rid of
> > > > > zswap's dependency on swapfiles to begin with, and have it act as its
> > > > > own standalone swapping backend. If I am successful, then having one
> > > > > zpool per zswap_tree is just a temporary fix.
> > > >
> > > > What about making the pools per-cpu?
> > > >
> > > > This would scale nicely with the machine size. And we commonly deal
> > > > with for_each_cpu() loops and per-cpu data structures, so have good
> > > > developer intuition about what's reasonable to squeeze into those.
> > > >
> > > > It would eliminate the lock contention, for everybody, right away, and
> > > > without asking questions.
> > > >
> > > > It would open the door to all kinds of locking optimizations on top.
> > >
> > > The page can get swapped out on one cpu and swapped in on another, no?
> > >
> > > We will need to store which zpool the page is stored in in its zswap
> > > entry, and potentially grab percpu locks from other cpus in the swap
> > > in path. The lock contention would probably be less, but certainly not
> > > eliminated.
> > >
> > > Did I misunderstand?
> >
> > Sorry, I should have been more precise.
> >
> > I'm saying that using NR_CPUS pools, and replacing the hash with
> > smp_processor_id(), would accomplish your goal of pool concurrency.
> > But it would do so with a broadly-used, well-understood scaling
> > factor. We might not need a config option at all.
> >
> > The lock would still be there, but contention would be reduced fairly
> > optimally (barring preemption) for store concurrency at least. Not
> > fully eliminated due to frees and compaction, though, yes.
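
For reference, a minimal sketch of the per-CPU variant described in the
quoted text above -- illustrative only, not from any posted patch, and
assuming the pool allocates one zpool per possible CPU, e.g. with
kzalloc(struct_size(pool, zpools, nr_cpu_ids), GFP_KERNEL):

	static struct zpool *pick_zpool_percpu(struct zswap_pool *pool)
	{
		/*
		 * raw_smp_processor_id(): the task may migrate right after
		 * this, and the matching load/free can run on another CPU
		 * anyway, so this is only a load-spreading hint rather than
		 * a strict affinity rule.
		 */
		return pool->zpools[raw_smp_processor_id()];
	}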

I thought about this again and had some internal discussions, and I am
more unsure about it. Beyond the memory overhead, having too many
zpools might result in higher fragmentation within the zpools. For
zsmalloc, we do not compact across multiple zpools for example.

We have been using a specific number of zpools in our production for
years now, we know it can be tuned to achieve performance gains. OTOH,
percpu zpools (or NR_CPUS pools) seems like too big of a hammer,
probably too many zpools in a lot of cases, and we wouldn't know how
many zpools actually fit our workloads.

I see value in allowing the number of zpools to be directly
configurable (it can always be left as 1), and am worried that with
percpu we would be throwing away years of production testing for an
unknown.

I am obviously biased, but I don't think this adds significant
complexity to the zswap code as-is (or as v2 is to be precise).

WDYT?

>
> Yeah I think we can do that. I looked at the size of the zsmalloc pool
> as an example, and it seems to be less than 1K, so having one per CPU
> seems okay.
>
> There are a few things that we will need to do:
> - Rework zswap_update_total_size(). We don't want to loop through all
> cpus on each load/store. We can be smarter about it and inc/dec the
> total zswap pool size each time we allocate or free a page in the
> driver. This might need some plumbing from the drivers to zswap (or
> passing a callback from zswap to the drivers).
>
> - Update zsmalloc such that all pools share kmem caches, instead of
> creating two kmem caches per zsmalloc pool. This was a follow-up I
> had in mind for multiple zpools support anyway, but I guess it's more
> significant if we have NR_CPUS pools.
>
> I was nervous about increasing the size of struct zswap_entry to store
> the cpu/zpool where the entry resides, but I realized we can replace
> the pointer to zswap_pool in struct zswap_entry with a pointer to
> zpool, and add a zswap_pool pointer in struct zpool. This will
> actually trim down the common "entry->pool->zpool" to just
> "entry->zpool", and then we can replace any "entry->pool" with
> "entry->zpool->pool".
>
> @Yu Zhao, any thoughts on this? The multiple zpools support was
> initially your idea (and did the initial implementation) -- so your
> input is very valuable here.
>
> >
> > I'm not proposing more than that at this point. I only wrote the last
> > line because already having per-cpu data structures might help with
> > fast path optimizations down the line, if contention is still an
> > issue. But unlikely. So it's not so important. Let's forget it.
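
A rough sketch of the entry->zpool rework floated in the quoted
follow-up above -- illustrative only, with member lists trimmed to the
relevant fields; none of this is from a posted patch:

	struct zpool {
		struct zswap_pool *pool;  /* new: back-pointer to the owning zswap_pool */
		/* ... existing zpool internals ... */
	};

	struct zswap_entry {
		struct zpool *zpool;      /* replaces "struct zswap_pool *pool" */
		unsigned long handle;
		/* ... rbnode, length, refcount, ... */
	};

	/*
	 * "entry->pool->zpool" then shrinks to "entry->zpool", and any
	 * remaining "entry->pool" user becomes "entry->zpool->pool".
	 */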
Johannes Weiner June 14, 2023, 2:59 p.m. UTC | #3
On Tue, Jun 13, 2023 at 01:13:59PM -0700, Yosry Ahmed wrote:
> On Mon, Jun 5, 2023 at 6:56 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Fri, Jun 2, 2023 at 1:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > Sorry, I should have been more precise.
> > >
> > > I'm saying that using NR_CPUS pools, and replacing the hash with
> > > smp_processor_id(), would accomplish your goal of pool concurrency.
> > > But it would do so with a broadly-used, well-understood scaling
> > > factor. We might not need a config option at all.
> > >
> > > The lock would still be there, but contention would be reduced fairly
> > > optimally (barring preemption) for store concurrency at least. Not
> > > fully eliminated due to frees and compaction, though, yes.
> 
> I thought about this again and had some internal discussions, and I am
> more unsure about it. Beyond the memory overhead, having too many
> zpools might result in higher fragmentation within the zpools. For
> zsmalloc, we do not compact across multiple zpools for example.
> 
> We have been using a specific number of zpools in our production for
> years now, we know it can be tuned to achieve performance gains. OTOH,
> percpu zpools (or NR_CPUS pools) seems like too big of a hammer,
> probably too many zpools in a lot of cases, and we wouldn't know how
> many zpools actually fit our workloads.

Is it the same number across your entire fleet and all workloads?

How large *is* the number in relation to CPUs?

> I see value in allowing the number of zpools to be directly
> configurable (it can always be left as 1), and am worried that with
> percpu we would be throwing away years of production testing for an
> unknown.
> 
> I am obviously biased, but I don't think this adds significant
> complexity to the zswap code as-is (or as v2 is to be precise).

I had typed out this long list of reasons why I hate this change, and
then deleted it to suggest the per-cpu scaling factor.

But to summarize my POV, I think a user-facing config option for this
is completely inappropriate. There are no limits, no guidance, no sane
default. And it's very selective about micro-optimizing this one lock
when there are several locks and datastructures of the same scope in
the swap path. This isn't a reasonable question to ask people building
kernels. It's writing code through the Kconfig file.

Data structure scalability should be solved in code, not with config
options.

My vote on the patch as proposed is NAK.
Yosry Ahmed June 14, 2023, 8:50 p.m. UTC | #4
On Wed, Jun 14, 2023 at 7:59 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jun 13, 2023 at 01:13:59PM -0700, Yosry Ahmed wrote:
> > On Mon, Jun 5, 2023 at 6:56 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > On Fri, Jun 2, 2023 at 1:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > Sorry, I should have been more precise.
> > > >
> > > > I'm saying that using NR_CPUS pools, and replacing the hash with
> > > > smp_processor_id(), would accomplish your goal of pool concurrency.
> > > > But it would do so with a broadly-used, well-understood scaling
> > > > factor. We might not need a config option at all.
> > > >
> > > > The lock would still be there, but contention would be reduced fairly
> > > > optimally (barring preemption) for store concurrency at least. Not
> > > > fully eliminated due to frees and compaction, though, yes.
> >
> > I thought about this again and had some internal discussions, and I am
> > more unsure about it. Beyond the memory overhead, having too many
> > zpools might result in higher fragmentation within the zpools. For
> > zsmalloc, we do not compact across multiple zpools for example.
> >
> > We have been using a specific number of zpools in our production for
> > years now, we know it can be tuned to achieve performance gains. OTOH,
> > percpu zpools (or NR_CPUS pools) seems like too big of a hammer,
> > probably too many zpools in a lot of cases, and we wouldn't know how
> > many zpools actually fit our workloads.
>
> Is it the same number across your entire fleet and all workloads?

Yes.

>
> How large *is* the number in relation to CPUs?

It differs based on the number of cpus on the machine. We use 32
zpools on all machines.

>
> > I see value in allowing the number of zpools to be directly
> > configurable (it can always be left as 1), and am worried that with
> > percpu we would be throwing away years of production testing for an
> > unknown.
> >
> > I am obviously biased, but I don't think this adds significant
> > complexity to the zswap code as-is (or as v2 is to be precise).
>
> I had typed out this long list of reasons why I hate this change, and
> then deleted it to suggest the per-cpu scaling factor.
>
> But to summarize my POV, I think a user-facing config option for this
> is completely inappropriate. There are no limits, no guidance, no sane
> default. And it's very selective about micro-optimizing this one lock
> when there are several locks and datastructures of the same scope in
> the swap path. This isn't a reasonable question to ask people building
> kernels. It's writing code through the Kconfig file.

It's not just the swap path, it's any contention that happens within the
zpool between its different operations (map, alloc, compaction, etc).
My thought was that if a user observes high contention in any of the
zpool operations, they can increase the number of zpools -- basically
this should be empirically decided. If unsure, the user can just leave
it as a single zpool.
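
Concretely, that tuning is just the Kconfig knob added by the patch; an
illustrative .config fragment, where order 5 gives 1 << 5 = 32 zpools:

	CONFIG_ZSWAP=y
	CONFIG_ZSWAP_NR_ZPOOLS_ORDER=5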

>
> Data structure scalability should be solved in code, not with config
> options.

I agree, but until we have a more fundamental architectural solution,
having multiple zpools to address scalability is a win. We can remove
the config option later if needed.

>
> My vote on the patch as proposed is NAK.

I hear the argument about the config option not being ideal here, but
NR_CPUS is also not ideal.

How about if we introduce it as a constant in the kernel? We have a
lot of other constants around the kernel that do not scale with the
machine size (e.g. SWAP_CLUSTER_MAX). We can start with 32, which is a
value that we have tested in our data centers for many years now and
know to work well. We can revisit later if needed.

WDYT?
Yosry Ahmed June 20, 2023, 7:48 p.m. UTC | #5
On Wed, Jun 14, 2023 at 1:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Wed, Jun 14, 2023 at 7:59 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Tue, Jun 13, 2023 at 01:13:59PM -0700, Yosry Ahmed wrote:
> > > On Mon, Jun 5, 2023 at 6:56 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > >
> > > > On Fri, Jun 2, 2023 at 1:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > Sorry, I should have been more precise.
> > > > >
> > > > > I'm saying that using NR_CPUS pools, and replacing the hash with
> > > > > smp_processor_id(), would accomplish your goal of pool concurrency.
> > > > > But it would do so with a broadly-used, well-understood scaling
> > > > > factor. We might not need a config option at all.
> > > > >
> > > > > The lock would still be there, but contention would be reduced fairly
> > > > > optimally (barring preemption) for store concurrency at least. Not
> > > > > fully eliminated due to frees and compaction, though, yes.
> > >
> > > I thought about this again and had some internal discussions, and I am
> > > more unsure about it. Beyond the memory overhead, having too many
> > > zpools might result in higher fragmentation within the zpools. For
> > > zsmalloc, we do not compact across multiple zpools for example.
> > >
> > > We have been using a specific number of zpools in our production for
> > > years now, we know it can be tuned to achieve performance gains. OTOH,
> > > percpu zpools (or NR_CPUS pools) seems like too big of a hammer,
> > > probably too many zpools in a lot of cases, and we wouldn't know how
> > > many zpools actually fit our workloads.
> >
> > Is it the same number across your entire fleet and all workloads?
>
> Yes.
>
> >
> > How large *is* the number in relation to CPUs?
>
> It differs based on the number of cpus on the machine. We use 32
> zpools on all machines.
>
> >
> > > I see value in allowing the number of zpools to be directly
> > > configurable (it can always be left as 1), and am worried that with
> > > percpu we would be throwing away years of production testing for an
> > > unknown.
> > >
> > > I am obviously biased, but I don't think this adds significant
> > > complexity to the zswap code as-is (or as v2 is to be precise).
> >
> > I had typed out this long list of reasons why I hate this change, and
> > then deleted it to suggest the per-cpu scaling factor.
> >
> > But to summarize my POV, I think a user-facing config option for this
> > is completely inappropriate. There are no limits, no guidance, no sane
> > default. And it's very selective about micro-optimizing this one lock
> > when there are several locks and datastructures of the same scope in
> > the swap path. This isn't a reasonable question to ask people building
> > kernels. It's writing code through the Kconfig file.
>
> It's not just the swap path, it's any contention that happens within the
> zpool between its different operations (map, alloc, compaction, etc).
> My thought was that if a user observes high contention in any of the
> zpool operations, they can increase the number of zpools -- basically
> this should be empirically decided. If unsure, the user can just leave
> it as a single zpool.
>
> >
> > Data structure scalability should be solved in code, not with config
> > options.
>
> I agree, but until we have a more fundamental architectural solution,
> having multiple zpools to address scalability is a win. We can remove
> the config option later if needed.
>
> >
> > My vote on the patch as proposed is NAK.
>
> I hear the argument about the config option not being ideal here, but
> NR_CPUS is also not ideal.
>
> How about if we introduce it as a constant in the kernel? We have a
> lot of other constants around the kernel that do not scale with the
> machine size (e.g. SWAP_CLUSTER_MAX). We can start with 32, which is a
> value that we have tested in our data centers for many years now and
> know to work well. We can revisit later if needed.
>
> WDYT?

I sent v3 [1] with the proposed constant instead of a config option;
hopefully this is more acceptable.

[1] https://lore.kernel.org/lkml/20230620194644.3142384-1-yosryahmed@google.com/
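
The linked v3 replaces the Kconfig knob with a compile-time constant.
Roughly, and with the exact naming best checked against the link above,
the selection becomes:

	/* number of zpools per zswap_pool; kept a power of 2 for hash_ptr() */
	#define ZSWAP_NR_ZPOOLS 32

	static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
	{
		return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))];
	}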

Patch

diff --git a/mm/Kconfig b/mm/Kconfig
index 92c30879bf67..de1da56d2c07 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -59,6 +59,18 @@  config ZSWAP_EXCLUSIVE_LOADS
 	  The cost is that if the page was never dirtied and needs to be
 	  swapped out again, it will be re-compressed.
 
+config ZSWAP_NR_ZPOOLS_ORDER
+	int "Number of zpools in zswap, as power of 2"
+	default 0
+	depends on ZSWAP
+	help
+	  This option determines the number of zpools to use for zswap; it
+	  will be 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER.
+
+	  Having multiple zpools helps with concurrency and lock contention
+	  on the swap in and swap out paths, but uses a little bit of extra
+	  space.
+
 choice
 	prompt "Default compressor"
 	depends on ZSWAP
diff --git a/mm/zswap.c b/mm/zswap.c
index fba80330afd1..7111036f8aa5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -137,6 +137,9 @@  static bool zswap_non_same_filled_pages_enabled = true;
 module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
 		   bool, 0644);
 
+/* Order of zpools for global pool when memcg is enabled */
+static unsigned int zswap_nr_zpools = 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER;
+
 /*********************************
 * data structures
 **********************************/
@@ -150,7 +153,6 @@  struct crypto_acomp_ctx {
 };
 
 struct zswap_pool {
-	struct zpool *zpool;
 	struct crypto_acomp_ctx __percpu *acomp_ctx;
 	struct kref kref;
 	struct list_head list;
@@ -158,6 +160,7 @@  struct zswap_pool {
 	struct work_struct shrink_work;
 	struct hlist_node node;
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
+	struct zpool *zpools[];
 };
 
 /*
@@ -236,7 +239,7 @@  static bool zswap_has_pool;
 
 #define zswap_pool_debug(msg, p)				\
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
-		 zpool_get_type((p)->zpool))
+		 zpool_get_type((p)->zpools[0]))
 
 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
 static int zswap_pool_get(struct zswap_pool *pool);
@@ -263,11 +266,13 @@  static void zswap_update_total_size(void)
 {
 	struct zswap_pool *pool;
 	u64 total = 0;
+	int i;
 
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(pool, &zswap_pools, list)
-		total += zpool_get_total_size(pool->zpool);
+		for (i = 0; i < zswap_nr_zpools; i++)
+			total += zpool_get_total_size(pool->zpools[i]);
 
 	rcu_read_unlock();
 
@@ -350,6 +355,16 @@  static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
 	}
 }
 
+static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
+{
+	int i;
+
+	i = zswap_nr_zpools == 1 ? 0 :
+	    hash_ptr(entry, ilog2(zswap_nr_zpools));
+
+	return entry->pool->zpools[i];
+}
+
 /*
  * Carries out the common pattern of freeing and entry's zpool allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
@@ -363,7 +378,7 @@  static void zswap_free_entry(struct zswap_entry *entry)
 	if (!entry->length)
 		atomic_dec(&zswap_same_filled_pages);
 	else {
-		zpool_free(entry->pool->zpool, entry->handle);
+		zpool_free(zswap_find_zpool(entry), entry->handle);
 		zswap_pool_put(entry->pool);
 	}
 	zswap_entry_cache_free(entry);
@@ -572,7 +587,8 @@  static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 	list_for_each_entry_rcu(pool, &zswap_pools, list) {
 		if (strcmp(pool->tfm_name, compressor))
 			continue;
-		if (strcmp(zpool_get_type(pool->zpool), type))
+		/* all zpools share the same type */
+		if (strcmp(zpool_get_type(pool->zpools[0]), type))
 			continue;
 		/* if we can't get it, it's about to be destroyed */
 		if (!zswap_pool_get(pool))
@@ -587,14 +603,17 @@  static void shrink_worker(struct work_struct *w)
 {
 	struct zswap_pool *pool = container_of(w, typeof(*pool),
 						shrink_work);
+	int i;
 
-	if (zpool_shrink(pool->zpool, 1, NULL))
-		zswap_reject_reclaim_fail++;
+	for (i = 0; i < zswap_nr_zpools; i++)
+		if (zpool_shrink(pool->zpools[i], 1, NULL))
+			zswap_reject_reclaim_fail++;
 	zswap_pool_put(pool);
 }
 
 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 {
+	int i;
 	struct zswap_pool *pool;
 	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
 	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
@@ -611,19 +630,25 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 			return NULL;
 	}
 
-	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	pool = kzalloc(sizeof(*pool) +
+		       sizeof(pool->zpools[0]) * zswap_nr_zpools,
+		       GFP_KERNEL);
 	if (!pool)
 		return NULL;
 
-	/* unique name for each pool specifically required by zsmalloc */
-	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
+	for (i = 0; i < zswap_nr_zpools; i++) {
+		/* unique name for each pool specifically required by zsmalloc */
+		snprintf(name, 38, "zswap%x",
+			 atomic_inc_return(&zswap_pools_count));
 
-	pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
-	if (!pool->zpool) {
-		pr_err("%s zpool not available\n", type);
-		goto error;
+		pool->zpools[i] = zpool_create_pool(type, name, gfp,
+						    &zswap_zpool_ops);
+		if (!pool->zpools[i]) {
+			pr_err("%s zpool not available\n", type);
+			goto error;
+		}
 	}
-	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
 
 	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
 
@@ -653,8 +678,8 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 error:
 	if (pool->acomp_ctx)
 		free_percpu(pool->acomp_ctx);
-	if (pool->zpool)
-		zpool_destroy_pool(pool->zpool);
+	while (i--)
+		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
 	return NULL;
 }
@@ -703,11 +728,14 @@  static struct zswap_pool *__zswap_pool_create_fallback(void)
 
 static void zswap_pool_destroy(struct zswap_pool *pool)
 {
+	int i;
+
 	zswap_pool_debug("destroying", pool);
 
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
-	zpool_destroy_pool(pool->zpool);
+	for (i = 0; i < zswap_nr_zpools; i++)
+		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
 }
 
@@ -1160,6 +1188,7 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	unsigned long handle, value;
 	char *buf;
 	u8 *src, *dst;
+	struct zpool *zpool;
 	struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
 	gfp_t gfp;
 
@@ -1259,11 +1288,13 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	}
 
 	/* store */
-	hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
+	zpool = zswap_find_zpool(entry);
+	hlen = zpool_evictable(zpool) ? sizeof(zhdr) : 0;
 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	if (zpool_malloc_support_movable(entry->pool->zpool))
+	if (zpool_malloc_support_movable(zpool))
 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
-	ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
+	ret = zpool_malloc(zpool, hlen + dlen, gfp, &handle);
+
 	if (ret == -ENOSPC) {
 		zswap_reject_compress_poor++;
 		goto put_dstmem;
@@ -1272,10 +1303,10 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 		zswap_reject_alloc_fail++;
 		goto put_dstmem;
 	}
-	buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
+	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
 	memcpy(buf, &zhdr, hlen);
 	memcpy(buf + hlen, dst, dlen);
-	zpool_unmap_handle(entry->pool->zpool, handle);
+	zpool_unmap_handle(zpool, handle);
 	mutex_unlock(acomp_ctx->mutex);
 
 	/* populate entry */
@@ -1353,6 +1384,7 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	u8 *src, *dst, *tmp;
 	unsigned int dlen;
 	int ret;
+	struct zpool *zpool;
 
 	/* find */
 	spin_lock(&tree->lock);
@@ -1372,7 +1404,8 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 		goto stats;
 	}
 
-	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+	zpool = zswap_find_zpool(entry);
+	if (!zpool_can_sleep_mapped(zpool)) {
 		tmp = kmalloc(entry->length, GFP_KERNEL);
 		if (!tmp) {
 			ret = -ENOMEM;
@@ -1382,14 +1415,14 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
 	/* decompress */
 	dlen = PAGE_SIZE;
-	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
-	if (zpool_evictable(entry->pool->zpool))
+	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+	if (zpool_evictable(zpool))
 		src += sizeof(struct zswap_header);
 
-	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+	if (!zpool_can_sleep_mapped(zpool)) {
 		memcpy(tmp, src, entry->length);
 		src = tmp;
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+		zpool_unmap_handle(zpool, entry->handle);
 	}
 
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1401,8 +1434,8 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
 	mutex_unlock(acomp_ctx->mutex);
 
-	if (zpool_can_sleep_mapped(entry->pool->zpool))
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	if (zpool_can_sleep_mapped(zpool))
+		zpool_unmap_handle(zpool, entry->handle);
 	else
 		kfree(tmp);
 
@@ -1558,7 +1591,7 @@  static int zswap_setup(void)
 	pool = __zswap_pool_create_fallback();
 	if (pool) {
 		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
-			zpool_get_type(pool->zpool));
+			zpool_get_type(pool->zpools[0]));
 		list_add(&pool->list, &zswap_pools);
 		zswap_has_pool = true;
 	} else {