
[2/2] mm/zswap: change zswap_pool kref to percpu_ref

Message ID 20240210-zswap-global-lru-v1-2-853473d7b0da@bytedance.com (mailing list archive)
State New
Series mm/zswap: optimize for dynamic zswap_pools

Commit Message

Chengming Zhou Feb. 11, 2024, 1:57 p.m. UTC
All zswap entries take a reference on the zswap_pool in
zswap_store(), and drop it when freed. Changing it to a
percpu_ref is better for scalability.

Testing kernel build in tmpfs with memory.max=2GB
(zswap shrinker and writeback enabled with one 50GB swapfile).

        mm-unstable  zswap-global-lru
real    63.20        63.12
user    1061.75      1062.95
sys     268.74       264.44

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 mm/zswap.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

Comments

Nhat Pham Feb. 11, 2024, 9:21 p.m. UTC | #1
On Sun, Feb 11, 2024 at 5:58 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> All zswap entries will take a reference of zswap_pool when
> zswap_store(), and drop it when free. Change it to use the
> percpu_ref is better for scalability performance.
>
> Testing kernel build in tmpfs with memory.max=2GB
> (zswap shrinker and writeback enabled with one 50GB swapfile).
>
>         mm-unstable  zswap-global-lru
> real    63.20        63.12
> user    1061.75      1062.95
> sys     268.74       264.44
>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> ---
>  mm/zswap.c | 30 +++++++++++++++++++++---------
>  1 file changed, 21 insertions(+), 9 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 7668db8c10e3..afb31904fb08 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -173,7 +173,7 @@ struct crypto_acomp_ctx {
>  struct zswap_pool {
>         struct zpool *zpools[ZSWAP_NR_ZPOOLS];
>         struct crypto_acomp_ctx __percpu *acomp_ctx;
> -       struct kref kref;
> +       struct percpu_ref ref;
>         struct list_head list;
>         struct work_struct release_work;
>         struct hlist_node node;
> @@ -303,6 +303,7 @@ static void zswap_update_total_size(void)
>  /*********************************
>  * pool functions
>  **********************************/
> +static void __zswap_pool_empty(struct percpu_ref *ref);
>
>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  {
> @@ -356,13 +357,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>         /* being the current pool takes 1 ref; this func expects the
>          * caller to always add the new pool as the current pool
>          */
> -       kref_init(&pool->kref);
> +       ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
> +                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
> +       if (ret)
> +               goto ref_fail;
>         INIT_LIST_HEAD(&pool->list);
>
>         zswap_pool_debug("created", pool);
>
>         return pool;
>
> +ref_fail:
> +       cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>  error:
>         if (pool->acomp_ctx)
>                 free_percpu(pool->acomp_ctx);
> @@ -435,8 +441,8 @@ static void __zswap_pool_release(struct work_struct *work)
>
>         synchronize_rcu();
>
> -       /* nobody should have been able to get a kref... */
> -       WARN_ON(kref_get_unless_zero(&pool->kref));

Do we no longer care about this WARN? IIUC, this is to catch someone
still holding a reference to the pool at release time, which sounds
like a bug. I think we can simulate similar behavior with:

WARN_ON(percpu_ref_tryget(&pool->ref))

no? percpu_ref_tryget() should fail when the refcnt goes back down to
0. Then we can do percpu_ref_exit() as well.
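
For illustration, a minimal sketch of that release path with the check
kept (the exact ordering is an assumption for illustration, not the
final patch):

	synchronize_rcu();

	/* nobody should have been able to get a ref... */
	WARN_ON(percpu_ref_tryget(&pool->ref));
	percpu_ref_exit(&pool->ref);

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);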

> +       /* nobody should have been able to get a ref... */
> +       percpu_ref_exit(&pool->ref);
>
>         /* pool is now off zswap_pools list and has no references. */
>         zswap_pool_destroy(pool);
> @@ -444,11 +450,11 @@ static void __zswap_pool_release(struct work_struct *work)
>
>  static struct zswap_pool *zswap_pool_current(void);
>
> -static void __zswap_pool_empty(struct kref *kref)
> +static void __zswap_pool_empty(struct percpu_ref *ref)
>  {
>         struct zswap_pool *pool;
>
> -       pool = container_of(kref, typeof(*pool), kref);
> +       pool = container_of(ref, typeof(*pool), ref);
>
>         spin_lock(&zswap_pools_lock);
>
> @@ -467,12 +473,12 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
>         if (!pool)
>                 return 0;
>
> -       return kref_get_unless_zero(&pool->kref);
> +       return percpu_ref_tryget(&pool->ref);
>  }
>
>  static void zswap_pool_put(struct zswap_pool *pool)
>  {
> -       kref_put(&pool->kref, __zswap_pool_empty);
> +       percpu_ref_put(&pool->ref);
>  }
>
>  static struct zswap_pool *__zswap_pool_current(void)
> @@ -602,6 +608,12 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
>
>         if (!pool)
>                 pool = zswap_pool_create(type, compressor);
> +       else {
> +               /* Resurrect percpu_ref to percpu mode. */
> +               percpu_ref_resurrect(&pool->ref);
> +               /* Drop the ref from zswap_pool_find_get(). */
> +               zswap_pool_put(pool);
> +       }
>
>         if (pool)
>                 ret = param_set_charp(s, kp);
> @@ -640,7 +652,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
>          * or the new pool we failed to add
>          */
>         if (put_pool)
> -               zswap_pool_put(put_pool);
> +               percpu_ref_kill(&put_pool->ref);
>
>         return ret;
>  }
>
> --
> b4 0.10.1

The rest of the code looks solid to me FWIW. The numbers seem to
indicate this is a good idea as well.
Chengming Zhou Feb. 12, 2024, 1:29 p.m. UTC | #2
On 2024/2/12 05:21, Nhat Pham wrote:
> On Sun, Feb 11, 2024 at 5:58 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> All zswap entries will take a reference of zswap_pool when
>> zswap_store(), and drop it when free. Change it to use the
>> percpu_ref is better for scalability performance.
>>
>> Testing kernel build in tmpfs with memory.max=2GB
>> (zswap shrinker and writeback enabled with one 50GB swapfile).
>>
>>         mm-unstable  zswap-global-lru
>> real    63.20        63.12
>> user    1061.75      1062.95
>> sys     268.74       264.44
>>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>> ---
>>  mm/zswap.c | 30 +++++++++++++++++++++---------
>>  1 file changed, 21 insertions(+), 9 deletions(-)
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 7668db8c10e3..afb31904fb08 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -173,7 +173,7 @@ struct crypto_acomp_ctx {
>>  struct zswap_pool {
>>         struct zpool *zpools[ZSWAP_NR_ZPOOLS];
>>         struct crypto_acomp_ctx __percpu *acomp_ctx;
>> -       struct kref kref;
>> +       struct percpu_ref ref;
>>         struct list_head list;
>>         struct work_struct release_work;
>>         struct hlist_node node;
>> @@ -303,6 +303,7 @@ static void zswap_update_total_size(void)
>>  /*********************************
>>  * pool functions
>>  **********************************/
>> +static void __zswap_pool_empty(struct percpu_ref *ref);
>>
>>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>>  {
>> @@ -356,13 +357,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>>         /* being the current pool takes 1 ref; this func expects the
>>          * caller to always add the new pool as the current pool
>>          */
>> -       kref_init(&pool->kref);
>> +       ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
>> +                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
>> +       if (ret)
>> +               goto ref_fail;
>>         INIT_LIST_HEAD(&pool->list);
>>
>>         zswap_pool_debug("created", pool);
>>
>>         return pool;
>>
>> +ref_fail:
>> +       cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>>  error:
>>         if (pool->acomp_ctx)
>>                 free_percpu(pool->acomp_ctx);
>> @@ -435,8 +441,8 @@ static void __zswap_pool_release(struct work_struct *work)
>>
>>         synchronize_rcu();
>>
>> -       /* nobody should have been able to get a kref... */
>> -       WARN_ON(kref_get_unless_zero(&pool->kref));
> 
> Do we no longer care about this WARN? IIUC, this is to catch someone
> still holding a reference to the pool at release time, which sounds
> like a bug. I think we can simulate the similar behavior with:

Ok, I thought it would already have been put to 0 by the time we get here,
so any tryget would fail. But keeping this WARN_ON() is also fine with me,
I will keep it.

Thanks.

> 
> WARN_ON(percpu_ref_tryget(&pool->ref))
> 
> no? percpu_ref_tryget() should fail when the refcnt goes back down to
> 0. Then we can do percpu_ref_exit() as well.
> 
>> +       /* nobody should have been able to get a ref... */
>> +       percpu_ref_exit(&pool->ref);
>>
>>         /* pool is now off zswap_pools list and has no references. */
>>         zswap_pool_destroy(pool);
>> @@ -444,11 +450,11 @@ static void __zswap_pool_release(struct work_struct *work)
>>
>>  static struct zswap_pool *zswap_pool_current(void);
>>
>> -static void __zswap_pool_empty(struct kref *kref)
>> +static void __zswap_pool_empty(struct percpu_ref *ref)
>>  {
>>         struct zswap_pool *pool;
>>
>> -       pool = container_of(kref, typeof(*pool), kref);
>> +       pool = container_of(ref, typeof(*pool), ref);
>>
>>         spin_lock(&zswap_pools_lock);
>>
>> @@ -467,12 +473,12 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
>>         if (!pool)
>>                 return 0;
>>
>> -       return kref_get_unless_zero(&pool->kref);
>> +       return percpu_ref_tryget(&pool->ref);
>>  }
>>
>>  static void zswap_pool_put(struct zswap_pool *pool)
>>  {
>> -       kref_put(&pool->kref, __zswap_pool_empty);
>> +       percpu_ref_put(&pool->ref);
>>  }
>>
>>  static struct zswap_pool *__zswap_pool_current(void)
>> @@ -602,6 +608,12 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
>>
>>         if (!pool)
>>                 pool = zswap_pool_create(type, compressor);
>> +       else {
>> +               /* Resurrect percpu_ref to percpu mode. */
>> +               percpu_ref_resurrect(&pool->ref);
>> +               /* Drop the ref from zswap_pool_find_get(). */
>> +               zswap_pool_put(pool);
>> +       }
>>
>>         if (pool)
>>                 ret = param_set_charp(s, kp);
>> @@ -640,7 +652,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
>>          * or the new pool we failed to add
>>          */
>>         if (put_pool)
>> -               zswap_pool_put(put_pool);
>> +               percpu_ref_kill(&put_pool->ref);
>>
>>         return ret;
>>  }
>>
>> --
>> b4 0.10.1
> 
> The rest of the code looks solid to me FWIW. Number seems to indicate
> this is a good idea as well.
Nhat Pham Feb. 12, 2024, 6:53 p.m. UTC | #3
On Mon, Feb 12, 2024 at 5:29 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> On 2024/2/12 05:21, Nhat Pham wrote:
> > On Sun, Feb 11, 2024 at 5:58 AM Chengming Zhou
> > <zhouchengming@bytedance.com> wrote:
> >>
> >> All zswap entries will take a reference of zswap_pool when
> >> zswap_store(), and drop it when free. Change it to use the
> >> percpu_ref is better for scalability performance.
> >>
> >> Testing kernel build in tmpfs with memory.max=2GB
> >> (zswap shrinker and writeback enabled with one 50GB swapfile).
> >>
> >>         mm-unstable  zswap-global-lru
> >> real    63.20        63.12
> >> user    1061.75      1062.95
> >> sys     268.74       264.44
> >>
> >> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> >> ---
> >>  mm/zswap.c | 30 +++++++++++++++++++++---------
> >>  1 file changed, 21 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/mm/zswap.c b/mm/zswap.c
> >> index 7668db8c10e3..afb31904fb08 100644
> >> --- a/mm/zswap.c
> >> +++ b/mm/zswap.c
> >> @@ -173,7 +173,7 @@ struct crypto_acomp_ctx {
> >>  struct zswap_pool {
> >>         struct zpool *zpools[ZSWAP_NR_ZPOOLS];
> >>         struct crypto_acomp_ctx __percpu *acomp_ctx;
> >> -       struct kref kref;
> >> +       struct percpu_ref ref;
> >>         struct list_head list;
> >>         struct work_struct release_work;
> >>         struct hlist_node node;
> >> @@ -303,6 +303,7 @@ static void zswap_update_total_size(void)
> >>  /*********************************
> >>  * pool functions
> >>  **********************************/
> >> +static void __zswap_pool_empty(struct percpu_ref *ref);
> >>
> >>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >>  {
> >> @@ -356,13 +357,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >>         /* being the current pool takes 1 ref; this func expects the
> >>          * caller to always add the new pool as the current pool
> >>          */
> >> -       kref_init(&pool->kref);
> >> +       ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
> >> +                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
> >> +       if (ret)
> >> +               goto ref_fail;
> >>         INIT_LIST_HEAD(&pool->list);
> >>
> >>         zswap_pool_debug("created", pool);
> >>
> >>         return pool;
> >>
> >> +ref_fail:
> >> +       cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> >>  error:
> >>         if (pool->acomp_ctx)
> >>                 free_percpu(pool->acomp_ctx);
> >> @@ -435,8 +441,8 @@ static void __zswap_pool_release(struct work_struct *work)
> >>
> >>         synchronize_rcu();
> >>
> >> -       /* nobody should have been able to get a kref... */
> >> -       WARN_ON(kref_get_unless_zero(&pool->kref));
> >
> > Do we no longer care about this WARN? IIUC, this is to catch someone
> > still holding a reference to the pool at release time, which sounds
> > like a bug. I think we can simulate the similar behavior with:
>
> Ok, I thought it has already been put to 0 when we're here, so any tryget
> will fail. But keeping this WARN_ON() is also fine to me, will keep it.

Yup - it should fail, if the code is not buggy. But that's a pretty big if :)

Jokes aside, we can remove it if folks think the benefit is not worth
the cost/overhead. However, I'm a bit hesitant to remove checks in
zswap, especially given how many bugs it has had (some of which were
refcnt bugs as well, IIRC).
Yosry Ahmed Feb. 12, 2024, 10:42 p.m. UTC | #4
On Sun, Feb 11, 2024 at 5:58 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> All zswap entries will take a reference of zswap_pool when
> zswap_store(), and drop it when free. Change it to use the
> percpu_ref is better for scalability performance.
>
> Testing kernel build in tmpfs with memory.max=2GB
> (zswap shrinker and writeback enabled with one 50GB swapfile).
>
>         mm-unstable  zswap-global-lru
> real    63.20        63.12
> user    1061.75      1062.95
> sys     268.74       264.44

Are these numbers from a single run or the average of multiple runs?
It just seems that the improvement is small, and percpu refcnt is
slightly less intuitive (and uses a bit more memory), so let's make
sure there is a real performance gain first.

It would also be useful to mention how many threads/CPUs are being used here.
Chengming Zhou Feb. 13, 2024, 2:22 p.m. UTC | #5
On 2024/2/13 02:53, Nhat Pham wrote:
> On Mon, Feb 12, 2024 at 5:29 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> On 2024/2/12 05:21, Nhat Pham wrote:
>>> On Sun, Feb 11, 2024 at 5:58 AM Chengming Zhou
>>> <zhouchengming@bytedance.com> wrote:
>>>>
>>>> All zswap entries will take a reference of zswap_pool when
>>>> zswap_store(), and drop it when free. Change it to use the
>>>> percpu_ref is better for scalability performance.
>>>>
>>>> Testing kernel build in tmpfs with memory.max=2GB
>>>> (zswap shrinker and writeback enabled with one 50GB swapfile).
>>>>
>>>>         mm-unstable  zswap-global-lru
>>>> real    63.20        63.12
>>>> user    1061.75      1062.95
>>>> sys     268.74       264.44
>>>>
>>>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>>>> ---
>>>>  mm/zswap.c | 30 +++++++++++++++++++++---------
>>>>  1 file changed, 21 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/mm/zswap.c b/mm/zswap.c
>>>> index 7668db8c10e3..afb31904fb08 100644
>>>> --- a/mm/zswap.c
>>>> +++ b/mm/zswap.c
>>>> @@ -173,7 +173,7 @@ struct crypto_acomp_ctx {
>>>>  struct zswap_pool {
>>>>         struct zpool *zpools[ZSWAP_NR_ZPOOLS];
>>>>         struct crypto_acomp_ctx __percpu *acomp_ctx;
>>>> -       struct kref kref;
>>>> +       struct percpu_ref ref;
>>>>         struct list_head list;
>>>>         struct work_struct release_work;
>>>>         struct hlist_node node;
>>>> @@ -303,6 +303,7 @@ static void zswap_update_total_size(void)
>>>>  /*********************************
>>>>  * pool functions
>>>>  **********************************/
>>>> +static void __zswap_pool_empty(struct percpu_ref *ref);
>>>>
>>>>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>>>>  {
>>>> @@ -356,13 +357,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>>>>         /* being the current pool takes 1 ref; this func expects the
>>>>          * caller to always add the new pool as the current pool
>>>>          */
>>>> -       kref_init(&pool->kref);
>>>> +       ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
>>>> +                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
>>>> +       if (ret)
>>>> +               goto ref_fail;
>>>>         INIT_LIST_HEAD(&pool->list);
>>>>
>>>>         zswap_pool_debug("created", pool);
>>>>
>>>>         return pool;
>>>>
>>>> +ref_fail:
>>>> +       cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>>>>  error:
>>>>         if (pool->acomp_ctx)
>>>>                 free_percpu(pool->acomp_ctx);
>>>> @@ -435,8 +441,8 @@ static void __zswap_pool_release(struct work_struct *work)
>>>>
>>>>         synchronize_rcu();
>>>>
>>>> -       /* nobody should have been able to get a kref... */
>>>> -       WARN_ON(kref_get_unless_zero(&pool->kref));
>>>
>>> Do we no longer care about this WARN? IIUC, this is to catch someone
>>> still holding a reference to the pool at release time, which sounds
>>> like a bug. I think we can simulate the similar behavior with:
>>
>> Ok, I thought it has already been put to 0 when we're here, so any tryget
>> will fail. But keeping this WARN_ON() is also fine to me, will keep it.
> 
> Yup - it should fail, if the code is not buggy. But that's a pretty big if :)
> 
> Jokes aside, we can remove it if folks think the benefit is not worth
> the cost/overhead. However, I'm a bit hesitant to remove checks in
> zswap, especially given how buggy it has been (some of which are
> refcnt bugs as well, IIRC).

Yes, agreed. It looks clearer to keep it, and it should cost nothing at all.

Thanks!
Chengming Zhou Feb. 13, 2024, 2:31 p.m. UTC | #6
On 2024/2/13 06:42, Yosry Ahmed wrote:
> On Sun, Feb 11, 2024 at 5:58 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> All zswap entries will take a reference of zswap_pool when
>> zswap_store(), and drop it when free. Change it to use the
>> percpu_ref is better for scalability performance.
>>
>> Testing kernel build in tmpfs with memory.max=2GB
>> (zswap shrinker and writeback enabled with one 50GB swapfile).
>>
>>         mm-unstable  zswap-global-lru
>> real    63.20        63.12
>> user    1061.75      1062.95
>> sys     268.74       264.44
> 
> Are these numbers from a single run or the average of multiple runs?

The average of 5 runs. I just checked/compared each run's result, and
the improvement is stable. So yes, it should be a real performance gain.

> It just seems that the improvement is small, and percpu refcnt is
> slightly less intuitive (and uses a bit more memory), so let's make
> sure there is a real performance gain first.

Right, percpu_ref uses a bit more memory, which should be ok for our use case,
since we almost always have only one zswap_pool in use. The performance gain is
in the zswap_store/load hotpath.
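
For reference, a rough sketch of the hotpath pattern this speeds up
(simplified, error handling omitted; zswap_pool_current_get() and the
entry->pool field are assumptions about the surrounding zswap code):

	/* zswap_store(): each stored entry pins the pool. */
	entry->pool = zswap_pool_current_get();	/* percpu_ref_tryget() now */
	...
	/* when the entry is later freed: */
	zswap_pool_put(entry->pool);		/* percpu_ref_put() now */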

> 
> It would also be useful to mention how many threads/CPUs are being used here.

My bad, the testing uses 32 threads on a 128-CPU x86-64 machine.

Thanks.
Yosry Ahmed Feb. 13, 2024, 5:45 p.m. UTC | #7
On Tue, Feb 13, 2024 at 10:31:16PM +0800, Chengming Zhou wrote:
> On 2024/2/13 06:42, Yosry Ahmed wrote:
> > On Sun, Feb 11, 2024 at 5:58 AM Chengming Zhou
> > <zhouchengming@bytedance.com> wrote:
> >>
> >> All zswap entries will take a reference of zswap_pool when
> >> zswap_store(), and drop it when free. Change it to use the
> >> percpu_ref is better for scalability performance.
> >>
> >> Testing kernel build in tmpfs with memory.max=2GB
> >> (zswap shrinker and writeback enabled with one 50GB swapfile).
> >>
> >>         mm-unstable  zswap-global-lru
> >> real    63.20        63.12
> >> user    1061.75      1062.95
> >> sys     268.74       264.44
> > 
> > Are these numbers from a single run or the average of multiple runs?
> 
> The average of 5 runs. And I just checked/compared each run result,
> the improvement is stable. So yes, it should be a real performance gain.
> 
> > It just seems that the improvement is small, and percpu refcnt is
> > slightly less intuitive (and uses a bit more memory), so let's make
> > sure there is a real performance gain first.
> 
> Right, percpu_ref use a bit more memory which should be ok for our use case,
> since we almost have only one zswap_pool to be using. The performance gain is
> for zswap_store/load hotpath.
> 
> > 
> > It would also be useful to mention how many threads/CPUs are being used here.
> 
> My bad, the testing uses 32 threads on a 128 CPUs x86-64 machine.

Thanks for the clarification. Please include such details in the commit
message.

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index 7668db8c10e3..afb31904fb08 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -173,7 +173,7 @@  struct crypto_acomp_ctx {
 struct zswap_pool {
 	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
 	struct crypto_acomp_ctx __percpu *acomp_ctx;
-	struct kref kref;
+	struct percpu_ref ref;
 	struct list_head list;
 	struct work_struct release_work;
 	struct hlist_node node;
@@ -303,6 +303,7 @@  static void zswap_update_total_size(void)
 /*********************************
 * pool functions
 **********************************/
+static void __zswap_pool_empty(struct percpu_ref *ref);
 
 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 {
@@ -356,13 +357,18 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	/* being the current pool takes 1 ref; this func expects the
 	 * caller to always add the new pool as the current pool
 	 */
-	kref_init(&pool->kref);
+	ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
+			      PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
+	if (ret)
+		goto ref_fail;
 	INIT_LIST_HEAD(&pool->list);
 
 	zswap_pool_debug("created", pool);
 
 	return pool;
 
+ref_fail:
+	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 error:
 	if (pool->acomp_ctx)
 		free_percpu(pool->acomp_ctx);
@@ -435,8 +441,8 @@  static void __zswap_pool_release(struct work_struct *work)
 
 	synchronize_rcu();
 
-	/* nobody should have been able to get a kref... */
-	WARN_ON(kref_get_unless_zero(&pool->kref));
+	/* nobody should have been able to get a ref... */
+	percpu_ref_exit(&pool->ref);
 
 	/* pool is now off zswap_pools list and has no references. */
 	zswap_pool_destroy(pool);
@@ -444,11 +450,11 @@  static void __zswap_pool_release(struct work_struct *work)
 
 static struct zswap_pool *zswap_pool_current(void);
 
-static void __zswap_pool_empty(struct kref *kref)
+static void __zswap_pool_empty(struct percpu_ref *ref)
 {
 	struct zswap_pool *pool;
 
-	pool = container_of(kref, typeof(*pool), kref);
+	pool = container_of(ref, typeof(*pool), ref);
 
 	spin_lock(&zswap_pools_lock);
 
@@ -467,12 +473,12 @@  static int __must_check zswap_pool_get(struct zswap_pool *pool)
 	if (!pool)
 		return 0;
 
-	return kref_get_unless_zero(&pool->kref);
+	return percpu_ref_tryget(&pool->ref);
 }
 
 static void zswap_pool_put(struct zswap_pool *pool)
 {
-	kref_put(&pool->kref, __zswap_pool_empty);
+	percpu_ref_put(&pool->ref);
 }
 
 static struct zswap_pool *__zswap_pool_current(void)
@@ -602,6 +608,12 @@  static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 
 	if (!pool)
 		pool = zswap_pool_create(type, compressor);
+	else {
+		/* Resurrect percpu_ref to percpu mode. */
+		percpu_ref_resurrect(&pool->ref);
+		/* Drop the ref from zswap_pool_find_get(). */
+		zswap_pool_put(pool);
+	}
 
 	if (pool)
 		ret = param_set_charp(s, kp);
@@ -640,7 +652,7 @@  static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 	 * or the new pool we failed to add
 	 */
 	if (put_pool)
-		zswap_pool_put(put_pool);
+		percpu_ref_kill(&put_pool->ref);
 
 	return ret;
 }