
[2/2] mm/zswap: split zswap rb-tree

Message ID 20240117-b4-zswap-lock-optimize-v1-2-23f6effe5775@bytedance.com (mailing list archive)
State New
Series mm/zswap: optimize the scalability of zswap rb-tree

Commit Message

Chengming Zhou Jan. 17, 2024, 9:23 a.m. UTC
Each swapfile has one rb-tree that maps swp_entry_t to zswap_entry,
protected by a single spinlock, which can cause heavy lock contention
when multiple tasks zswap_store/load concurrently.

Address this scalability problem by splitting the zswap rb-tree into
multiple rb-trees, each covering SWAP_ADDRESS_SPACE_PAGES (64M) of swap
space, just as was done for the swap cache address_space split.

Although this method can't eliminate the spinlock contention completely,
it mitigates much of it. Below are the results of a kernel build in
tmpfs with the zswap shrinker enabled:

     linux-next  zswap-lock-optimize
real 1m9.181s    1m3.820s
user 17m44.036s  17m40.100s
sys  7m37.297s   4m54.622s

So there are clear improvements.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 include/linux/zswap.h |  4 +--
 mm/swapfile.c         |  2 +-
 mm/zswap.c            | 69 ++++++++++++++++++++++++++++++++-------------------
 3 files changed, 47 insertions(+), 28 deletions(-)
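
For reference, a minimal sketch of how a swap entry is mapped to its
rb-tree after this change (identifiers as in the diff below; the actual
lookup is the swap_zswap_tree() helper added by the patch):

	/*
	 * One tree covers SWAP_ADDRESS_SPACE_PAGES swap slots, i.e. 64M
	 * of swap space with 4K pages, matching the swap cache split.
	 */
	struct zswap_tree *tree;

	tree = &zswap_trees[swp_type(entry)][swp_offset(entry) >>
					     SWAP_ADDRESS_SPACE_SHIFT];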

Comments

Johannes Weiner Jan. 18, 2024, 3:11 p.m. UTC | #1
On Wed, Jan 17, 2024 at 09:23:19AM +0000, Chengming Zhou wrote:
> Each swapfile has one rb-tree to search the mapping of swp_entry_t to
> zswap_entry, that use a spinlock to protect, which can cause heavy lock
> contention if multiple tasks zswap_store/load concurrently.
> 
> Optimize the scalability problem by splitting the zswap rb-tree into
> multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M),
> just like we did in the swap cache address_space splitting.
> 
> Although this method can't solve the spinlock contention completely, it
> can mitigate much of that contention. Below is the results of kernel build
> in tmpfs with zswap shrinker enabled:
> 
>      linux-next  zswap-lock-optimize
> real 1m9.181s    1m3.820s
> user 17m44.036s  17m40.100s
> sys  7m37.297s   4m54.622s
> 
> So there are clearly improvements.
> 
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

One minor nit:

> @@ -265,6 +266,10 @@ static bool zswap_has_pool;
>  * helpers and fwd declarations
>  **********************************/
>  
> +#define swap_zswap_tree(entry)					\
> +	(&zswap_trees[swp_type(entry)][swp_offset(entry)	\
> +		>> SWAP_ADDRESS_SPACE_SHIFT])

Make this a static inline function instead?
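
For illustration, the static inline equivalent could look roughly like
this (same identifiers as the macro above; a sketch of the suggestion,
not code from the posted series):

	static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
	{
		return &zswap_trees[swp_type(swp)][swp_offset(swp) >>
						   SWAP_ADDRESS_SPACE_SHIFT];
	}
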
Nhat Pham Jan. 18, 2024, 7:24 p.m. UTC | #2
On Wed, Jan 17, 2024 at 1:23 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> Each swapfile has one rb-tree to search the mapping of swp_entry_t to
> zswap_entry, that use a spinlock to protect, which can cause heavy lock
> contention if multiple tasks zswap_store/load concurrently.
>
> Optimize the scalability problem by splitting the zswap rb-tree into
> multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M),
> just like we did in the swap cache address_space splitting.
>
> Although this method can't solve the spinlock contention completely, it
> can mitigate much of that contention. Below is the results of kernel build
> in tmpfs with zswap shrinker enabled:
>
>      linux-next  zswap-lock-optimize
> real 1m9.181s    1m3.820s
> user 17m44.036s  17m40.100s
> sys  7m37.297s   4m54.622s

That's really impressive, especially the sys time reduction :) Well done.

>
> So there are clearly improvements.
>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>

Code looks solid too. I haven't read the xarray patch series too
closely yet, but this patch series is clearly already an improvement.
It is simple, with existing precedent (from swap cache), and
experiments show that it works quite well to improve zswap's
performance.

If the xarray patch proves to be even better, we can always combine it
with this approach (a per-range xarray?), or replace it with the
xarray. But for now:

Acked-by: Nhat Pham <nphamcs@gmail.com>

> ---
>  include/linux/zswap.h |  4 +--
>  mm/swapfile.c         |  2 +-
>  mm/zswap.c            | 69 ++++++++++++++++++++++++++++++++-------------------
>  3 files changed, 47 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index eca388229d9a..91895ce1fdbc 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -30,7 +30,7 @@ struct zswap_lruvec_state {
>  bool zswap_store(struct folio *folio);
>  bool zswap_load(struct folio *folio);
>  void zswap_invalidate(int type, pgoff_t offset);
> -int zswap_swapon(int type);
> +int zswap_swapon(int type, unsigned long nr_pages);
>  void zswap_swapoff(int type);
>  void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
>  void zswap_lruvec_state_init(struct lruvec *lruvec);
> @@ -51,7 +51,7 @@ static inline bool zswap_load(struct folio *folio)
>  }
>
>  static inline void zswap_invalidate(int type, pgoff_t offset) {}
> -static inline int zswap_swapon(int type)
> +static inline int zswap_swapon(int type, unsigned long nr_pages)
>  {
>         return 0;
>  }
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 6c53ea06626b..35aa17b2a2fa 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -3164,7 +3164,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>         if (error)
>                 goto bad_swap_unlock_inode;
>
> -       error = zswap_swapon(p->type);
> +       error = zswap_swapon(p->type, maxpages);
>         if (error)
>                 goto free_swap_address_space;
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d88faea85978..4a6dbc620c7c 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -239,6 +239,7 @@ struct zswap_tree {
>  };
>
>  static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
> +static unsigned int nr_zswap_trees[MAX_SWAPFILES];
>
>  /* RCU-protected iteration */
>  static LIST_HEAD(zswap_pools);
> @@ -265,6 +266,10 @@ static bool zswap_has_pool;
>  * helpers and fwd declarations
>  **********************************/
>
> +#define swap_zswap_tree(entry)                                 \
> +       (&zswap_trees[swp_type(entry)][swp_offset(entry)        \
> +               >> SWAP_ADDRESS_SPACE_SHIFT])
> +
>  #define zswap_pool_debug(msg, p)                               \
>         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
>                  zpool_get_type((p)->zpools[0]))
> @@ -865,7 +870,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>          * until the entry is verified to still be alive in the tree.
>          */
>         swpoffset = swp_offset(entry->swpentry);
> -       tree = zswap_trees[swp_type(entry->swpentry)];
> +       tree = swap_zswap_tree(entry->swpentry);
>         list_lru_isolate(l, item);
>         /*
>          * It's safe to drop the lock here because we return either
> @@ -1494,10 +1499,9 @@ static void zswap_fill_page(void *ptr, unsigned long value)
>  bool zswap_store(struct folio *folio)
>  {
>         swp_entry_t swp = folio->swap;
> -       int type = swp_type(swp);
>         pgoff_t offset = swp_offset(swp);
>         struct page *page = &folio->page;
> -       struct zswap_tree *tree = zswap_trees[type];
> +       struct zswap_tree *tree = swap_zswap_tree(swp);
>         struct zswap_entry *entry, *dupentry;
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
> @@ -1569,7 +1573,7 @@ bool zswap_store(struct folio *folio)
>                 src = kmap_local_page(page);
>                 if (zswap_is_page_same_filled(src, &value)) {
>                         kunmap_local(src);
> -                       entry->swpentry = swp_entry(type, offset);
> +                       entry->swpentry = swp;
>                         entry->length = 0;
>                         entry->value = value;
>                         atomic_inc(&zswap_same_filled_pages);
> @@ -1651,7 +1655,7 @@ bool zswap_store(struct folio *folio)
>         mutex_unlock(&acomp_ctx->mutex);
>
>         /* populate entry */
> -       entry->swpentry = swp_entry(type, offset);
> +       entry->swpentry = swp;
>         entry->handle = handle;
>         entry->length = dlen;
>
> @@ -1711,10 +1715,9 @@ bool zswap_store(struct folio *folio)
>  bool zswap_load(struct folio *folio)
>  {
>         swp_entry_t swp = folio->swap;
> -       int type = swp_type(swp);
>         pgoff_t offset = swp_offset(swp);
>         struct page *page = &folio->page;
> -       struct zswap_tree *tree = zswap_trees[type];
> +       struct zswap_tree *tree = swap_zswap_tree(swp);
>         struct zswap_entry *entry;
>         u8 *dst;
>
> @@ -1757,7 +1760,7 @@ bool zswap_load(struct folio *folio)
>
>  void zswap_invalidate(int type, pgoff_t offset)
>  {
> -       struct zswap_tree *tree = zswap_trees[type];
> +       struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset));
>         struct zswap_entry *entry;
>
>         /* find */
> @@ -1772,37 +1775,53 @@ void zswap_invalidate(int type, pgoff_t offset)
>         spin_unlock(&tree->lock);
>  }
>
> -int zswap_swapon(int type)
> +int zswap_swapon(int type, unsigned long nr_pages)
>  {
> -       struct zswap_tree *tree;
> +       struct zswap_tree *trees, *tree;
> +       unsigned int nr, i;
>
> -       tree = kzalloc(sizeof(*tree), GFP_KERNEL);
> -       if (!tree) {
> +       nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
> +       trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
> +       if (!trees) {
>                 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
>                 return -ENOMEM;
>         }
>
> -       tree->rbroot = RB_ROOT;
> -       spin_lock_init(&tree->lock);
> -       zswap_trees[type] = tree;
> +       for (i = 0; i < nr; i++) {
> +               tree = trees + i;
> +               tree->rbroot = RB_ROOT;
> +               spin_lock_init(&tree->lock);
> +       }
> +
> +       nr_zswap_trees[type] = nr;
> +       zswap_trees[type] = trees;
>         return 0;
>  }
>
>  void zswap_swapoff(int type)
>  {
> -       struct zswap_tree *tree = zswap_trees[type];
> -       struct zswap_entry *entry, *n;
> +       struct zswap_tree *trees = zswap_trees[type];
> +       unsigned int i;
>
> -       if (!tree)
> +       if (!trees)
>                 return;
>
> -       /* walk the tree and free everything */
> -       spin_lock(&tree->lock);
> -       rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
> -               zswap_free_entry(entry);
> -       tree->rbroot = RB_ROOT;
> -       spin_unlock(&tree->lock);
> -       kfree(tree);
> +       for (i = 0; i < nr_zswap_trees[type]; i++) {
> +               struct zswap_tree *tree = trees + i;
> +               struct zswap_entry *entry, *n;
> +
> +               /* walk the tree and free everything */
> +               spin_lock(&tree->lock);
> +               rbtree_postorder_for_each_entry_safe(entry, n,
> +                                                    &tree->rbroot,
> +                                                    rbnode)
> +                       zswap_free_entry(entry);
> +               tree->rbroot = RB_ROOT;
> +               spin_unlock(&tree->lock);
> +       }
> +
> +       kvfree(trees);
> +       nr_zswap_trees[type] = 0;
>         zswap_trees[type] = NULL;
>  }
>
>
> --
> b4 0.10.1
Chengming Zhou Jan. 19, 2024, 6:20 a.m. UTC | #3
On 2024/1/18 23:11, Johannes Weiner wrote:
> On Wed, Jan 17, 2024 at 09:23:19AM +0000, Chengming Zhou wrote:
>> Each swapfile has one rb-tree to search the mapping of swp_entry_t to
>> zswap_entry, that use a spinlock to protect, which can cause heavy lock
>> contention if multiple tasks zswap_store/load concurrently.
>>
>> Optimize the scalability problem by splitting the zswap rb-tree into
>> multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M),
>> just like we did in the swap cache address_space splitting.
>>
>> Although this method can't solve the spinlock contention completely, it
>> can mitigate much of that contention. Below is the results of kernel build
>> in tmpfs with zswap shrinker enabled:
>>
>>      linux-next  zswap-lock-optimize
>> real 1m9.181s    1m3.820s
>> user 17m44.036s  17m40.100s
>> sys  7m37.297s   4m54.622s
>>
>> So there are clearly improvements.
>>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> 
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> 
> One minor nit:
> 
>> @@ -265,6 +266,10 @@ static bool zswap_has_pool;
>>  * helpers and fwd declarations
>>  **********************************/
>>  
>> +#define swap_zswap_tree(entry)					\
>> +	(&zswap_trees[swp_type(entry)][swp_offset(entry)	\
>> +		>> SWAP_ADDRESS_SPACE_SHIFT])
> 
> Make this a static inline function instead?

Good suggestion, will do.

Thanks.
Chengming Zhou Jan. 19, 2024, 6:24 a.m. UTC | #4
On 2024/1/19 03:24, Nhat Pham wrote:
> On Wed, Jan 17, 2024 at 1:23 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> Each swapfile has one rb-tree to search the mapping of swp_entry_t to
>> zswap_entry, that use a spinlock to protect, which can cause heavy lock
>> contention if multiple tasks zswap_store/load concurrently.
>>
>> Optimize the scalability problem by splitting the zswap rb-tree into
>> multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M),
>> just like we did in the swap cache address_space splitting.
>>
>> Although this method can't solve the spinlock contention completely, it
>> can mitigate much of that contention. Below is the results of kernel build
>> in tmpfs with zswap shrinker enabled:
>>
>>      linux-next  zswap-lock-optimize
>> real 1m9.181s    1m3.820s
>> user 17m44.036s  17m40.100s
>> sys  7m37.297s   4m54.622s
> 
> That's really impressive, especially the sys time reduction :) Well done.
> 

Thanks!

>>
>> So there are clearly improvements.
>>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> 
> Code looks solid too. I haven't read the xarray patch series too
> closely yet, but this patch series is clearly already an improvement.
> It is simple, with existing precedent (from swap cache), and
> experiments show that it works quite well to improve zswap's
> performance.
> 
> If the xarray patch proves to be even better, we can always combine it
> with this approach (a per-range xarray?), or replace it with the
> xarray. But for now:
> 
> Acked-by: Nhat Pham <nphamcs@gmail.com>
> 

Right, I agree. We should combine both approaches.

Patch

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index eca388229d9a..91895ce1fdbc 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -30,7 +30,7 @@  struct zswap_lruvec_state {
 bool zswap_store(struct folio *folio);
 bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
-int zswap_swapon(int type);
+int zswap_swapon(int type, unsigned long nr_pages);
 void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 void zswap_lruvec_state_init(struct lruvec *lruvec);
@@ -51,7 +51,7 @@  static inline bool zswap_load(struct folio *folio)
 }
 
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
-static inline int zswap_swapon(int type)
+static inline int zswap_swapon(int type, unsigned long nr_pages)
 {
 	return 0;
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c53ea06626b..35aa17b2a2fa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3164,7 +3164,7 @@  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (error)
 		goto bad_swap_unlock_inode;
 
-	error = zswap_swapon(p->type);
+	error = zswap_swapon(p->type, maxpages);
 	if (error)
 		goto free_swap_address_space;
 
diff --git a/mm/zswap.c b/mm/zswap.c
index d88faea85978..4a6dbc620c7c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -239,6 +239,7 @@  struct zswap_tree {
 };
 
 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+static unsigned int nr_zswap_trees[MAX_SWAPFILES];
 
 /* RCU-protected iteration */
 static LIST_HEAD(zswap_pools);
@@ -265,6 +266,10 @@  static bool zswap_has_pool;
 * helpers and fwd declarations
 **********************************/
 
+#define swap_zswap_tree(entry)					\
+	(&zswap_trees[swp_type(entry)][swp_offset(entry)	\
+		>> SWAP_ADDRESS_SPACE_SHIFT])
+
 #define zswap_pool_debug(msg, p)				\
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
 		 zpool_get_type((p)->zpools[0]))
@@ -865,7 +870,7 @@  static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 	 * until the entry is verified to still be alive in the tree.
 	 */
 	swpoffset = swp_offset(entry->swpentry);
-	tree = zswap_trees[swp_type(entry->swpentry)];
+	tree = swap_zswap_tree(entry->swpentry);
 	list_lru_isolate(l, item);
 	/*
 	 * It's safe to drop the lock here because we return either
@@ -1494,10 +1499,9 @@  static void zswap_fill_page(void *ptr, unsigned long value)
 bool zswap_store(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
-	int type = swp_type(swp);
 	pgoff_t offset = swp_offset(swp);
 	struct page *page = &folio->page;
-	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_tree *tree = swap_zswap_tree(swp);
 	struct zswap_entry *entry, *dupentry;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
@@ -1569,7 +1573,7 @@  bool zswap_store(struct folio *folio)
 		src = kmap_local_page(page);
 		if (zswap_is_page_same_filled(src, &value)) {
 			kunmap_local(src);
-			entry->swpentry = swp_entry(type, offset);
+			entry->swpentry = swp;
 			entry->length = 0;
 			entry->value = value;
 			atomic_inc(&zswap_same_filled_pages);
@@ -1651,7 +1655,7 @@  bool zswap_store(struct folio *folio)
 	mutex_unlock(&acomp_ctx->mutex);
 
 	/* populate entry */
-	entry->swpentry = swp_entry(type, offset);
+	entry->swpentry = swp;
 	entry->handle = handle;
 	entry->length = dlen;
 
@@ -1711,10 +1715,9 @@  bool zswap_store(struct folio *folio)
 bool zswap_load(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
-	int type = swp_type(swp);
 	pgoff_t offset = swp_offset(swp);
 	struct page *page = &folio->page;
-	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_tree *tree = swap_zswap_tree(swp);
 	struct zswap_entry *entry;
 	u8 *dst;
 
@@ -1757,7 +1760,7 @@  bool zswap_load(struct folio *folio)
 
 void zswap_invalidate(int type, pgoff_t offset)
 {
-	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset));
 	struct zswap_entry *entry;
 
 	/* find */
@@ -1772,37 +1775,53 @@  void zswap_invalidate(int type, pgoff_t offset)
 	spin_unlock(&tree->lock);
 }
 
-int zswap_swapon(int type)
+int zswap_swapon(int type, unsigned long nr_pages)
 {
-	struct zswap_tree *tree;
+	struct zswap_tree *trees, *tree;
+	unsigned int nr, i;
 
-	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
-	if (!tree) {
+	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+	trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
+	if (!trees) {
 		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
 		return -ENOMEM;
 	}
 
-	tree->rbroot = RB_ROOT;
-	spin_lock_init(&tree->lock);
-	zswap_trees[type] = tree;
+	for (i = 0; i < nr; i++) {
+		tree = trees + i;
+		tree->rbroot = RB_ROOT;
+		spin_lock_init(&tree->lock);
+	}
+
+	nr_zswap_trees[type] = nr;
+	zswap_trees[type] = trees;
 	return 0;
 }
 
 void zswap_swapoff(int type)
 {
-	struct zswap_tree *tree = zswap_trees[type];
-	struct zswap_entry *entry, *n;
+	struct zswap_tree *trees = zswap_trees[type];
+	unsigned int i;
 
-	if (!tree)
+	if (!trees)
 		return;
 
-	/* walk the tree and free everything */
-	spin_lock(&tree->lock);
-	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
-		zswap_free_entry(entry);
-	tree->rbroot = RB_ROOT;
-	spin_unlock(&tree->lock);
-	kfree(tree);
+	for (i = 0; i < nr_zswap_trees[type]; i++) {
+		struct zswap_tree *tree = trees + i;
+		struct zswap_entry *entry, *n;
+
+		/* walk the tree and free everything */
+		spin_lock(&tree->lock);
+		rbtree_postorder_for_each_entry_safe(entry, n,
+						     &tree->rbroot,
+						     rbnode)
+			zswap_free_entry(entry);
+		tree->rbroot = RB_ROOT;
+		spin_unlock(&tree->lock);
+	}
+
+	kvfree(trees);
+	nr_zswap_trees[type] = 0;
 	zswap_trees[type] = NULL;
 }