
[2/5] btrfs: make the extent map shrinker run asynchronously as a work queue job

Message ID 1a3f817fc3c5a6e4267bcd56f2f0518a9d8e0e4e.1727174151.git.fdmanana@suse.com (mailing list archive)
State New, archived
Series btrfs: make extent map shrinker more efficient and re-enable it

Commit Message

Filipe Manana Sept. 24, 2024, 10:45 a.m. UTC
From: Filipe Manana <fdmanana@suse.com>

Currently the extent map shrinker is run synchronously for kswapd tasks
that end up calling the fs shrinker (fs/super.c:super_cache_scan()).
This has some disadvantages and for some heavy workloads with memory
pressure it can cause some delays and stalls that make a machine
unresponsive for some periods. This happens because:

1) We can have several kswapd tasks on machines with multiple NUMA zones,
   and running the extent map shrinker concurrently can cause high
   contention on some spin locks, namely the spin locks that protect
   the radix tree that tracks roots, the per root xarray that tracks
   open inodes and the list of delayed iputs. This not only delays the
   shrinker but also causes high CPU consumption and makes the task
   running the shrinker monopolize a core, resulting in the symptoms
   of an unresponsive system. This was noted in previous commits such as
   commit ae1e766f623f ("btrfs: only run the extent map shrinker from
   kswapd tasks");

2) The extent map shrinker's iteration over inodes can often be slow, even
   after changing the data structure that tracks open inodes for a root
   from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+).
   The transition to the xarray, while it made things a bit faster, is
   still somewhat slow - for example in a test scenario with 10000 inodes
   that have no extent maps loaded, the extent map shrinker took between
   5ms and 8ms, using a release, non-debug kernel. Iterating over the
   extent maps of an inode can also be slow if we have an inode with many
   thousands of extent maps, since we use a red black tree to track and
   search extent maps. So having the extent map shrinker run synchronously
   adds extra delay for other things a kswapd task does.

So make the extent map shrinker run asynchronously as a job for the
system unbound workqueue, just like what we do for data and metadata
space reclaim jobs.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/disk-io.c    |  2 ++
 fs/btrfs/extent_map.c | 51 ++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/extent_map.h |  3 ++-
 fs/btrfs/fs.h         |  2 ++
 fs/btrfs/super.c      | 13 +++--------
 5 files changed, 52 insertions(+), 19 deletions(-)
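
For orientation before the discussion below, here is a minimal sketch of
the queue-once pattern the patch introduces (the full diff is quoted in
the comments). The struct and function names are illustrative stand-ins
for the new btrfs_fs_info fields and helpers shown in the diff, not the
actual btrfs code:

#include <linux/atomic.h>
#include <linux/workqueue.h>

/* Illustrative stand-in for the new btrfs_fs_info members. */
struct demo_shrinker {
	atomic64_t nr_to_scan;		/* 0 means "no request pending" */
	struct work_struct work;
};

static void demo_shrinker_worker(struct work_struct *work)
{
	struct demo_shrinker *s = container_of(work, struct demo_shrinker, work);

	/*
	 * ... scan and drop up to atomic64_read(&s->nr_to_scan) objects,
	 * with cond_resched() points in between ...
	 */

	/* Clear the latch so a later request can queue the work again. */
	atomic64_set(&s->nr_to_scan, 0);
}

static void demo_shrinker_request(struct demo_shrinker *s, long nr_to_scan)
{
	/* Latch a target only if none is pending; concurrent callers are no-ops. */
	if (atomic64_cmpxchg(&s->nr_to_scan, 0, nr_to_scan) != 0)
		return;

	queue_work(system_unbound_wq, &s->work);
}

static void demo_shrinker_init(struct demo_shrinker *s)
{
	atomic64_set(&s->nr_to_scan, 0);
	INIT_WORK(&s->work, demo_shrinker_worker);
}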

Comments

Qu Wenruo Sept. 25, 2024, 10:08 p.m. UTC | #1
On 2024/9/24 20:15, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
>
> Currently the extent map shrinker is run synchronously for kswapd tasks
> that end up calling the fs shrinker (fs/super.c:super_cache_scan()).
> This has some disadvantages and for some heavy workloads with memory
> pressure it can cause some delays and stalls that make a machine
> unresponsive for some periods. This happens because:
>
> 1) We can have several kswapd tasks on machines with multiple NUMA zones,
>     and running the extent map shrinker concurrently can cause high
>     contention on some spin locks, namely the spin locks that protect
>     the radix tree that tracks roots, the per root xarray that tracks
>     open inodes and the list of delayed iputs. This not only delays the
>     shrinker but also causes high CPU consumption and makes the task
>     running the shrinker monopolize a core, resulting in the symptoms
>     of an unresponsive system. This was noted in previous commits such as
>     commit ae1e766f623f ("btrfs: only run the extent map shrinker from
>     kswapd tasks");
>
> 2) The extent map shrinker's iteration over inodes can often be slow, even
>     after changing the data structure that tracks open inodes for a root
>     from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+).
>     The transition to the xarray while it made things a bit faster, it's
>     still somewhat slow - for example in a test scenario with 10000 inodes
>     that have no extent maps loaded, the extent map shrinker took between
>     5ms to 8ms, using a release, non-debug kernel. Iterating over the
>     extent maps of an inode can also be slow if have an inode with many
>     thousands of extent maps, since we use a red black tree to track and
>     search extent maps. So having the extent map shrinker run synchronously
>     adds extra delay for other things a kswapd task does.
>
> So make the extent map shrinker run asynchronously as a job for the
> system unbounded workqueue, just like what we do for data and metadata
> space reclaim jobs.
>
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
>   fs/btrfs/disk-io.c    |  2 ++
>   fs/btrfs/extent_map.c | 51 ++++++++++++++++++++++++++++++++++++-------
>   fs/btrfs/extent_map.h |  3 ++-
>   fs/btrfs/fs.h         |  2 ++
>   fs/btrfs/super.c      | 13 +++--------
>   5 files changed, 52 insertions(+), 19 deletions(-)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 25d768e67e37..2148147c5257 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2786,6 +2786,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
>   	btrfs_init_scrub(fs_info);
>   	btrfs_init_balance(fs_info);
>   	btrfs_init_async_reclaim_work(fs_info);
> +	btrfs_init_extent_map_shrinker_work(fs_info);
>
>   	rwlock_init(&fs_info->block_group_cache_lock);
>   	fs_info->block_group_cache_tree = RB_ROOT_CACHED;
> @@ -4283,6 +4284,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
>   	cancel_work_sync(&fs_info->async_reclaim_work);
>   	cancel_work_sync(&fs_info->async_data_reclaim_work);
>   	cancel_work_sync(&fs_info->preempt_reclaim_work);
> +	cancel_work_sync(&fs_info->extent_map_shrinker_work);
>
>   	/* Cancel or finish ongoing discard work */
>   	btrfs_discard_cleanup(fs_info);
> diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
> index cb2a6f5dce2b..e2eeb94aa349 100644
> --- a/fs/btrfs/extent_map.c
> +++ b/fs/btrfs/extent_map.c
> @@ -1118,7 +1118,8 @@ struct btrfs_em_shrink_ctx {
>
>   static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
>   {
> -	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> +	const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
>   	struct extent_map_tree *tree = &inode->extent_tree;
>   	long nr_dropped = 0;
>   	struct rb_node *node;
> @@ -1191,7 +1192,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
>   		 * lock. This is to avoid slowing other tasks trying to take the
>   		 * lock.
>   		 */
> -		if (need_resched() || rwlock_needbreak(&tree->lock))
> +		if (need_resched() || rwlock_needbreak(&tree->lock) ||
> +		    btrfs_fs_closing(fs_info))
>   			break;
>   		node = next;
>   	}
> @@ -1215,7 +1217,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
>   		ctx->last_ino = btrfs_ino(inode);
>   		btrfs_add_delayed_iput(inode);
>
> -		if (ctx->scanned >= ctx->nr_to_scan)
> +		if (ctx->scanned >= ctx->nr_to_scan ||
> +		    btrfs_fs_closing(inode->root->fs_info))
>   			break;
>
>   		cond_resched();
> @@ -1244,16 +1247,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
>   	return nr_dropped;
>   }
>
> -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> +static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
>   {
> +	struct btrfs_fs_info *fs_info;
>   	struct btrfs_em_shrink_ctx ctx;
>   	u64 start_root_id;
>   	u64 next_root_id;
>   	bool cycled = false;
>   	long nr_dropped = 0;
>
> +	fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
> +
>   	ctx.scanned = 0;
> -	ctx.nr_to_scan = nr_to_scan;
> +	ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
>
>   	/*
>   	 * In case we have multiple tasks running this shrinker, make the next
> @@ -1271,12 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
>   	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
>   		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
>
> -		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
> +		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
>   							   nr, ctx.last_root,
>   							   ctx.last_ino);
>   	}
>
> -	while (ctx.scanned < ctx.nr_to_scan) {
> +	while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
>   		struct btrfs_root *root;
>   		unsigned long count;
>
> @@ -1334,5 +1340,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
>   							  ctx.last_ino);
>   	}
>
> -	return nr_dropped;
> +	atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
> +}
> +
> +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> +{
> +	/*
> +	 * Do nothing if the shrinker is already running. In case of high memory
> +	 * pressure we can have a lot of tasks calling us and all passing the
> +	 * same nr_to_scan value, but in reality we may need only to free
> +	 * nr_to_scan extent maps (or less). In case we need to free more than
> +	 * that, we will be called again by the fs shrinker, so no worries about
> +	 * not doing enough work to reclaim memory from extent maps.
> +	 * We can also be repeatedly called with the same nr_to_scan value
> +	 * simply because the shrinker runs asynchronously and multiple calls
> +	 * to this function are made before the shrinker does enough progress.
> +	 *
> +	 * That's why we set the atomic counter to nr_to_scan only if its
> +	 * current value is zero, instead of incrementing the counter by
> +	 * nr_to_scan.
> +	 */

Since the shrinker can be called frequently, even if we only keep one
shrink work running, will the shrinker be kept running for a long time?
(one queued work finishes, then is immediately queued again)

XFS queues its work with a delay, although their reason is that the
reclaim needs to run more frequently than the syncd interval (30s).

Do we also need some delay to prevent such overly frequent reclaim work?

Thanks,
Qu
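
For illustration, a hypothetical sketch of the delayed-queueing variant
suggested here, in the spirit of what XFS does - this is not what the
patch under review does, and all names and the delay value are made up:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

#define DEMO_SHRINK_DELAY_MS	100	/* arbitrary illustrative delay */

static struct delayed_work demo_shrink_dwork;

static void demo_shrink_worker(struct work_struct *work)
{
	/* ... scan and drop extent maps ... */
}

static void demo_shrink_request(void)
{
	/*
	 * queue_delayed_work() is a no-op if the work is already pending,
	 * so bursts of shrinker calls coalesce into one delayed run.
	 */
	queue_delayed_work(system_unbound_wq, &demo_shrink_dwork,
			   msecs_to_jiffies(DEMO_SHRINK_DELAY_MS));
}

static void demo_shrink_init(void)
{
	INIT_DELAYED_WORK(&demo_shrink_dwork, demo_shrink_worker);
}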

> +	if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
> +		return;
> +
> +	queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
> +}
> +
> +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
> +{
> +	atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
> +	INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
>   }
> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
> index 5154a8f1d26c..cd123b266b64 100644
> --- a/fs/btrfs/extent_map.h
> +++ b/fs/btrfs/extent_map.h
> @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
>   int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
>   				   struct extent_map *new_em,
>   				   bool modified);
> -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
> +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
> +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
>
>   #endif
> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> index 785ec15c1b84..a246d8dc0b20 100644
> --- a/fs/btrfs/fs.h
> +++ b/fs/btrfs/fs.h
> @@ -638,6 +638,8 @@ struct btrfs_fs_info {
>   	spinlock_t extent_map_shrinker_lock;
>   	u64 extent_map_shrinker_last_root;
>   	u64 extent_map_shrinker_last_ino;
> +	atomic64_t extent_map_shrinker_nr_to_scan;
> +	struct work_struct extent_map_shrinker_work;
>
>   	/* Protected by 'trans_lock'. */
>   	struct list_head dirty_cowonly_roots;
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index e8a5bf4af918..e9e209dd8e05 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -28,7 +28,6 @@
>   #include <linux/btrfs.h>
>   #include <linux/security.h>
>   #include <linux/fs_parser.h>
> -#include <linux/swap.h>
>   #include "messages.h"
>   #include "delayed-inode.h"
>   #include "ctree.h"
> @@ -2416,16 +2415,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
>   	const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
>   	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>
> -	/*
> -	 * We may be called from any task trying to allocate memory and we don't
> -	 * want to slow it down with scanning and dropping extent maps. It would
> -	 * also cause heavy lock contention if many tasks concurrently enter
> -	 * here. Therefore only allow kswapd tasks to scan and drop extent maps.
> -	 */
> -	if (!current_is_kswapd())
> -		return 0;
> +	btrfs_free_extent_maps(fs_info, nr_to_scan);
>
> -	return btrfs_free_extent_maps(fs_info, nr_to_scan);
> +	/* The extent map shrinker runs asynchronously, so always return 0. */
> +	return 0;
>   }
>
>   static const struct super_operations btrfs_super_ops = {
Filipe Manana Sept. 26, 2024, 9:01 a.m. UTC | #2
On Wed, Sep 25, 2024 at 11:08 PM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
>
> On 2024/9/24 20:15, fdmanana@kernel.org wrote:
> > From: Filipe Manana <fdmanana@suse.com>
> >
> > Currently the extent map shrinker is run synchronously for kswapd tasks
> > that end up calling the fs shrinker (fs/super.c:super_cache_scan()).
> > This has some disadvantages and for some heavy workloads with memory
> > pressure it can cause some delays and stalls that make a machine
> > unresponsive for some periods. This happens because:
> >
> > 1) We can have several kswapd tasks on machines with multiple NUMA zones,
> >     and running the extent map shrinker concurrently can cause high
> >     contention on some spin locks, namely the spin locks that protect
> >     the radix tree that tracks roots, the per root xarray that tracks
> >     open inodes and the list of delayed iputs. This not only delays the
> >     shrinker but also causes high CPU consumption and makes the task
> >     running the shrinker monopolize a core, resulting in the symptoms
> >     of an unresponsive system. This was noted in previous commits such as
> >     commit ae1e766f623f ("btrfs: only run the extent map shrinker from
> >     kswapd tasks");
> >
> > 2) The extent map shrinker's iteration over inodes can often be slow, even
> >     after changing the data structure that tracks open inodes for a root
> >     from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+).
> >     The transition to the xarray while it made things a bit faster, it's
> >     still somewhat slow - for example in a test scenario with 10000 inodes
> >     that have no extent maps loaded, the extent map shrinker took between
> >     5ms to 8ms, using a release, non-debug kernel. Iterating over the
> >     extent maps of an inode can also be slow if have an inode with many
> >     thousands of extent maps, since we use a red black tree to track and
> >     search extent maps. So having the extent map shrinker run synchronously
> >     adds extra delay for other things a kswapd task does.
> >
> > So make the extent map shrinker run asynchronously as a job for the
> > system unbounded workqueue, just like what we do for data and metadata
> > space reclaim jobs.
> >
> > Signed-off-by: Filipe Manana <fdmanana@suse.com>
> > ---
> >   fs/btrfs/disk-io.c    |  2 ++
> >   fs/btrfs/extent_map.c | 51 ++++++++++++++++++++++++++++++++++++-------
> >   fs/btrfs/extent_map.h |  3 ++-
> >   fs/btrfs/fs.h         |  2 ++
> >   fs/btrfs/super.c      | 13 +++--------
> >   5 files changed, 52 insertions(+), 19 deletions(-)
> >
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 25d768e67e37..2148147c5257 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -2786,6 +2786,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
> >       btrfs_init_scrub(fs_info);
> >       btrfs_init_balance(fs_info);
> >       btrfs_init_async_reclaim_work(fs_info);
> > +     btrfs_init_extent_map_shrinker_work(fs_info);
> >
> >       rwlock_init(&fs_info->block_group_cache_lock);
> >       fs_info->block_group_cache_tree = RB_ROOT_CACHED;
> > @@ -4283,6 +4284,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
> >       cancel_work_sync(&fs_info->async_reclaim_work);
> >       cancel_work_sync(&fs_info->async_data_reclaim_work);
> >       cancel_work_sync(&fs_info->preempt_reclaim_work);
> > +     cancel_work_sync(&fs_info->extent_map_shrinker_work);
> >
> >       /* Cancel or finish ongoing discard work */
> >       btrfs_discard_cleanup(fs_info);
> > diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
> > index cb2a6f5dce2b..e2eeb94aa349 100644
> > --- a/fs/btrfs/extent_map.c
> > +++ b/fs/btrfs/extent_map.c
> > @@ -1118,7 +1118,8 @@ struct btrfs_em_shrink_ctx {
> >
> >   static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
> >   {
> > -     const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
> > +     struct btrfs_fs_info *fs_info = inode->root->fs_info;
> > +     const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
> >       struct extent_map_tree *tree = &inode->extent_tree;
> >       long nr_dropped = 0;
> >       struct rb_node *node;
> > @@ -1191,7 +1192,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
> >                * lock. This is to avoid slowing other tasks trying to take the
> >                * lock.
> >                */
> > -             if (need_resched() || rwlock_needbreak(&tree->lock))
> > +             if (need_resched() || rwlock_needbreak(&tree->lock) ||
> > +                 btrfs_fs_closing(fs_info))
> >                       break;
> >               node = next;
> >       }
> > @@ -1215,7 +1217,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
> >               ctx->last_ino = btrfs_ino(inode);
> >               btrfs_add_delayed_iput(inode);
> >
> > -             if (ctx->scanned >= ctx->nr_to_scan)
> > +             if (ctx->scanned >= ctx->nr_to_scan ||
> > +                 btrfs_fs_closing(inode->root->fs_info))
> >                       break;
> >
> >               cond_resched();
> > @@ -1244,16 +1247,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
> >       return nr_dropped;
> >   }
> >
> > -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> > +static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
> >   {
> > +     struct btrfs_fs_info *fs_info;
> >       struct btrfs_em_shrink_ctx ctx;
> >       u64 start_root_id;
> >       u64 next_root_id;
> >       bool cycled = false;
> >       long nr_dropped = 0;
> >
> > +     fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
> > +
> >       ctx.scanned = 0;
> > -     ctx.nr_to_scan = nr_to_scan;
> > +     ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
> >
> >       /*
> >        * In case we have multiple tasks running this shrinker, make the next
> > @@ -1271,12 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> >       if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
> >               s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
> >
> > -             trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
> > +             trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
> >                                                          nr, ctx.last_root,
> >                                                          ctx.last_ino);
> >       }
> >
> > -     while (ctx.scanned < ctx.nr_to_scan) {
> > +     while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
> >               struct btrfs_root *root;
> >               unsigned long count;
> >
> > @@ -1334,5 +1340,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> >                                                         ctx.last_ino);
> >       }
> >
> > -     return nr_dropped;
> > +     atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
> > +}
> > +
> > +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> > +{
> > +     /*
> > +      * Do nothing if the shrinker is already running. In case of high memory
> > +      * pressure we can have a lot of tasks calling us and all passing the
> > +      * same nr_to_scan value, but in reality we may need only to free
> > +      * nr_to_scan extent maps (or less). In case we need to free more than
> > +      * that, we will be called again by the fs shrinker, so no worries about
> > +      * not doing enough work to reclaim memory from extent maps.
> > +      * We can also be repeatedly called with the same nr_to_scan value
> > +      * simply because the shrinker runs asynchronously and multiple calls
> > +      * to this function are made before the shrinker does enough progress.
> > +      *
> > +      * That's why we set the atomic counter to nr_to_scan only if its
> > +      * current value is zero, instead of incrementing the counter by
> > +      * nr_to_scan.
> > +      */
>
> Since the shrinker can be called frequently, even if we only keep one
> shrink work running, will the shrinker be kept running for a long time?
> (one queued work done, then immiedately be queued again)

What's the problem?
The use of the atomic and not incrementing it, as said in the comment,
makes sure we don't do more work than necessary.

It's also running in the system unbound queue and has plenty of
explicit reschedule calls to ensure it doesn't monopolize a cpu and
doesn't block other tasks for long.

So how can it cause any problem?
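
To make the "explicit reschedule calls" concrete, this is roughly the
loop shape visible in the quoted diff (names here are illustrative, not
the actual btrfs functions): while walking the tree under the lock, the
scan bails out as soon as rescheduling is needed or another task is
waiting on the lock, and a later run resumes from where it left off.

#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static long demo_scan_tree(struct rb_root *tree, rwlock_t *lock)
{
	struct rb_node *node;
	long nr_dropped = 0;

	write_lock(lock);
	node = rb_first(tree);
	while (node) {
		struct rb_node *next = rb_next(node);

		/* ... maybe erase 'node' from the tree and bump nr_dropped ... */

		/* Don't hog the CPU or the lock; a later run picks up from here. */
		if (need_resched() || rwlock_needbreak(lock))
			break;
		node = next;
	}
	write_unlock(lock);

	cond_resched();
	return nr_dropped;
}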

>
> The XFS is queuing the work with an delay, although their reason is that
> the reclaim needs to be run more frequently than syncd interval (30s).
>
> Do we also need some delay to prevent such too frequent reclaim work?

See the comment above.

Without the increment of the atomic counter, if it keeps getting
scheduled it's because we're under memory pressure and need to free
memory as quickly as possible.

Thanks.

>
> Thanks,
> Qu
>
> > +     if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
> > +             return;
> > +
> > +     queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
> > +}
> > +
> > +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
> > +{
> > +     atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
> > +     INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
> >   }
> > diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
> > index 5154a8f1d26c..cd123b266b64 100644
> > --- a/fs/btrfs/extent_map.h
> > +++ b/fs/btrfs/extent_map.h
> > @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
> >   int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
> >                                  struct extent_map *new_em,
> >                                  bool modified);
> > -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
> > +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
> > +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
> >
> >   #endif
> > diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> > index 785ec15c1b84..a246d8dc0b20 100644
> > --- a/fs/btrfs/fs.h
> > +++ b/fs/btrfs/fs.h
> > @@ -638,6 +638,8 @@ struct btrfs_fs_info {
> >       spinlock_t extent_map_shrinker_lock;
> >       u64 extent_map_shrinker_last_root;
> >       u64 extent_map_shrinker_last_ino;
> > +     atomic64_t extent_map_shrinker_nr_to_scan;
> > +     struct work_struct extent_map_shrinker_work;
> >
> >       /* Protected by 'trans_lock'. */
> >       struct list_head dirty_cowonly_roots;
> > diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> > index e8a5bf4af918..e9e209dd8e05 100644
> > --- a/fs/btrfs/super.c
> > +++ b/fs/btrfs/super.c
> > @@ -28,7 +28,6 @@
> >   #include <linux/btrfs.h>
> >   #include <linux/security.h>
> >   #include <linux/fs_parser.h>
> > -#include <linux/swap.h>
> >   #include "messages.h"
> >   #include "delayed-inode.h"
> >   #include "ctree.h"
> > @@ -2416,16 +2415,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
> >       const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
> >       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> >
> > -     /*
> > -      * We may be called from any task trying to allocate memory and we don't
> > -      * want to slow it down with scanning and dropping extent maps. It would
> > -      * also cause heavy lock contention if many tasks concurrently enter
> > -      * here. Therefore only allow kswapd tasks to scan and drop extent maps.
> > -      */
> > -     if (!current_is_kswapd())
> > -             return 0;
> > +     btrfs_free_extent_maps(fs_info, nr_to_scan);
> >
> > -     return btrfs_free_extent_maps(fs_info, nr_to_scan);
> > +     /* The extent map shrinker runs asynchronously, so always return 0. */
> > +     return 0;
> >   }
> >
> >   static const struct super_operations btrfs_super_ops = {
>
Qu Wenruo Sept. 26, 2024, 9:55 a.m. UTC | #3
On 2024/9/26 18:31, Filipe Manana wrote:
> On Wed, Sep 25, 2024 at 11:08 PM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>>
>>
>>
>> On 2024/9/24 20:15, fdmanana@kernel.org wrote:
>>> From: Filipe Manana <fdmanana@suse.com>
>>>
>>> Currently the extent map shrinker is run synchronously for kswapd tasks
>>> that end up calling the fs shrinker (fs/super.c:super_cache_scan()).
>>> This has some disadvantages and for some heavy workloads with memory
>>> pressure it can cause some delays and stalls that make a machine
>>> unresponsive for some periods. This happens because:
>>>
>>> 1) We can have several kswapd tasks on machines with multiple NUMA zones,
>>>      and running the extent map shrinker concurrently can cause high
>>>      contention on some spin locks, namely the spin locks that protect
>>>      the radix tree that tracks roots, the per root xarray that tracks
>>>      open inodes and the list of delayed iputs. This not only delays the
>>>      shrinker but also causes high CPU consumption and makes the task
>>>      running the shrinker monopolize a core, resulting in the symptoms
>>>      of an unresponsive system. This was noted in previous commits such as
>>>      commit ae1e766f623f ("btrfs: only run the extent map shrinker from
>>>      kswapd tasks");
>>>
>>> 2) The extent map shrinker's iteration over inodes can often be slow, even
>>>      after changing the data structure that tracks open inodes for a root
>>>      from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+).
>>>      The transition to the xarray while it made things a bit faster, it's
>>>      still somewhat slow - for example in a test scenario with 10000 inodes
>>>      that have no extent maps loaded, the extent map shrinker took between
>>>      5ms to 8ms, using a release, non-debug kernel. Iterating over the
>>>      extent maps of an inode can also be slow if have an inode with many
>>>      thousands of extent maps, since we use a red black tree to track and
>>>      search extent maps. So having the extent map shrinker run synchronously
>>>      adds extra delay for other things a kswapd task does.
>>>
>>> So make the extent map shrinker run asynchronously as a job for the
>>> system unbounded workqueue, just like what we do for data and metadata
>>> space reclaim jobs.
>>>
>>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>>> ---
>>>    fs/btrfs/disk-io.c    |  2 ++
>>>    fs/btrfs/extent_map.c | 51 ++++++++++++++++++++++++++++++++++++-------
>>>    fs/btrfs/extent_map.h |  3 ++-
>>>    fs/btrfs/fs.h         |  2 ++
>>>    fs/btrfs/super.c      | 13 +++--------
>>>    5 files changed, 52 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>>> index 25d768e67e37..2148147c5257 100644
>>> --- a/fs/btrfs/disk-io.c
>>> +++ b/fs/btrfs/disk-io.c
>>> @@ -2786,6 +2786,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
>>>        btrfs_init_scrub(fs_info);
>>>        btrfs_init_balance(fs_info);
>>>        btrfs_init_async_reclaim_work(fs_info);
>>> +     btrfs_init_extent_map_shrinker_work(fs_info);
>>>
>>>        rwlock_init(&fs_info->block_group_cache_lock);
>>>        fs_info->block_group_cache_tree = RB_ROOT_CACHED;
>>> @@ -4283,6 +4284,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
>>>        cancel_work_sync(&fs_info->async_reclaim_work);
>>>        cancel_work_sync(&fs_info->async_data_reclaim_work);
>>>        cancel_work_sync(&fs_info->preempt_reclaim_work);
>>> +     cancel_work_sync(&fs_info->extent_map_shrinker_work);
>>>
>>>        /* Cancel or finish ongoing discard work */
>>>        btrfs_discard_cleanup(fs_info);
>>> diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
>>> index cb2a6f5dce2b..e2eeb94aa349 100644
>>> --- a/fs/btrfs/extent_map.c
>>> +++ b/fs/btrfs/extent_map.c
>>> @@ -1118,7 +1118,8 @@ struct btrfs_em_shrink_ctx {
>>>
>>>    static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
>>>    {
>>> -     const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
>>> +     struct btrfs_fs_info *fs_info = inode->root->fs_info;
>>> +     const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
>>>        struct extent_map_tree *tree = &inode->extent_tree;
>>>        long nr_dropped = 0;
>>>        struct rb_node *node;
>>> @@ -1191,7 +1192,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
>>>                 * lock. This is to avoid slowing other tasks trying to take the
>>>                 * lock.
>>>                 */
>>> -             if (need_resched() || rwlock_needbreak(&tree->lock))
>>> +             if (need_resched() || rwlock_needbreak(&tree->lock) ||
>>> +                 btrfs_fs_closing(fs_info))
>>>                        break;
>>>                node = next;
>>>        }
>>> @@ -1215,7 +1217,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
>>>                ctx->last_ino = btrfs_ino(inode);
>>>                btrfs_add_delayed_iput(inode);
>>>
>>> -             if (ctx->scanned >= ctx->nr_to_scan)
>>> +             if (ctx->scanned >= ctx->nr_to_scan ||
>>> +                 btrfs_fs_closing(inode->root->fs_info))
>>>                        break;
>>>
>>>                cond_resched();
>>> @@ -1244,16 +1247,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
>>>        return nr_dropped;
>>>    }
>>>
>>> -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
>>> +static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
>>>    {
>>> +     struct btrfs_fs_info *fs_info;
>>>        struct btrfs_em_shrink_ctx ctx;
>>>        u64 start_root_id;
>>>        u64 next_root_id;
>>>        bool cycled = false;
>>>        long nr_dropped = 0;
>>>
>>> +     fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
>>> +
>>>        ctx.scanned = 0;
>>> -     ctx.nr_to_scan = nr_to_scan;
>>> +     ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
>>>
>>>        /*
>>>         * In case we have multiple tasks running this shrinker, make the next
>>> @@ -1271,12 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
>>>        if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
>>>                s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
>>>
>>> -             trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
>>> +             trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
>>>                                                           nr, ctx.last_root,
>>>                                                           ctx.last_ino);
>>>        }
>>>
>>> -     while (ctx.scanned < ctx.nr_to_scan) {
>>> +     while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
>>>                struct btrfs_root *root;
>>>                unsigned long count;
>>>
>>> @@ -1334,5 +1340,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
>>>                                                          ctx.last_ino);
>>>        }
>>>
>>> -     return nr_dropped;
>>> +     atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
>>> +}
>>> +
>>> +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
>>> +{
>>> +     /*
>>> +      * Do nothing if the shrinker is already running. In case of high memory
>>> +      * pressure we can have a lot of tasks calling us and all passing the
>>> +      * same nr_to_scan value, but in reality we may need only to free
>>> +      * nr_to_scan extent maps (or less). In case we need to free more than
>>> +      * that, we will be called again by the fs shrinker, so no worries about
>>> +      * not doing enough work to reclaim memory from extent maps.
>>> +      * We can also be repeatedly called with the same nr_to_scan value
>>> +      * simply because the shrinker runs asynchronously and multiple calls
>>> +      * to this function are made before the shrinker does enough progress.
>>> +      *
>>> +      * That's why we set the atomic counter to nr_to_scan only if its
>>> +      * current value is zero, instead of incrementing the counter by
>>> +      * nr_to_scan.
>>> +      */
>>
>> Since the shrinker can be called frequently, even if we only keep one
>> shrink work running, will the shrinker be kept running for a long time?
>> (one queued work done, then immiedately be queued again)
>
> What's the problem?
> The use of the atomic and not incrementing it, as said in the comment,
> makes sure we don't do more work than necessary.
>
> It's also running in the system unbound queue and has plenty of
> explicit reschedule calls to ensure it doesn't monopolize a cpu and
> doesn't block other tasks for long.
>
> So how can it cause any problem?

Then it will be a workqueue taking a CPU at 100% (or near that), even if
there is only one running work.

The first call queues X items to scan and the work gets queued; after
freeing X items, the next call triggers, queuing another Y items to reclaim.

Then we get the same near-100% CPU usage; it may be rescheduled, but that
makes little difference.
We will always have one reclaim work item running at any moment.

>
>>
>> The XFS is queuing the work with an delay, although their reason is that
>> the reclaim needs to be run more frequently than syncd interval (30s).
>>
>> Do we also need some delay to prevent such too frequent reclaim work?
>
> See the comment above.
>
> Without the increment of the atomic counter, if it keeps getting
> scheduled it's because we're under memory pressure and need to free
> memory as quickly as possible.

Even if the atomic stays the same until the current work finishes, we
just get a new number of items to reclaim next.

Furthermore, from our existing experience, we haven't really hit a
real-world case where the em cache caused a huge problem, so reclaim for
ems should be a very low priority thing.

Thus I still believe a delayed work will be much safer, just like what
XFS has been doing for decades, and also like our cleaner kthread.

Thanks,
Qu

>
> Thanks.
>
>>
>> Thanks,
>> Qu
>>
>>> +     if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
>>> +             return;
>>> +
>>> +     queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
>>> +}
>>> +
>>> +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
>>> +{
>>> +     atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
>>> +     INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
>>>    }
>>> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
>>> index 5154a8f1d26c..cd123b266b64 100644
>>> --- a/fs/btrfs/extent_map.h
>>> +++ b/fs/btrfs/extent_map.h
>>> @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
>>>    int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
>>>                                   struct extent_map *new_em,
>>>                                   bool modified);
>>> -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
>>> +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
>>> +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
>>>
>>>    #endif
>>> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
>>> index 785ec15c1b84..a246d8dc0b20 100644
>>> --- a/fs/btrfs/fs.h
>>> +++ b/fs/btrfs/fs.h
>>> @@ -638,6 +638,8 @@ struct btrfs_fs_info {
>>>        spinlock_t extent_map_shrinker_lock;
>>>        u64 extent_map_shrinker_last_root;
>>>        u64 extent_map_shrinker_last_ino;
>>> +     atomic64_t extent_map_shrinker_nr_to_scan;
>>> +     struct work_struct extent_map_shrinker_work;
>>>
>>>        /* Protected by 'trans_lock'. */
>>>        struct list_head dirty_cowonly_roots;
>>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>>> index e8a5bf4af918..e9e209dd8e05 100644
>>> --- a/fs/btrfs/super.c
>>> +++ b/fs/btrfs/super.c
>>> @@ -28,7 +28,6 @@
>>>    #include <linux/btrfs.h>
>>>    #include <linux/security.h>
>>>    #include <linux/fs_parser.h>
>>> -#include <linux/swap.h>
>>>    #include "messages.h"
>>>    #include "delayed-inode.h"
>>>    #include "ctree.h"
>>> @@ -2416,16 +2415,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
>>>        const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
>>>        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>>>
>>> -     /*
>>> -      * We may be called from any task trying to allocate memory and we don't
>>> -      * want to slow it down with scanning and dropping extent maps. It would
>>> -      * also cause heavy lock contention if many tasks concurrently enter
>>> -      * here. Therefore only allow kswapd tasks to scan and drop extent maps.
>>> -      */
>>> -     if (!current_is_kswapd())
>>> -             return 0;
>>> +     btrfs_free_extent_maps(fs_info, nr_to_scan);
>>>
>>> -     return btrfs_free_extent_maps(fs_info, nr_to_scan);
>>> +     /* The extent map shrinker runs asynchronously, so always return 0. */
>>> +     return 0;
>>>    }
>>>
>>>    static const struct super_operations btrfs_super_ops = {
>>
Filipe Manana Sept. 26, 2024, 10:41 a.m. UTC | #4
On Thu, Sep 26, 2024 at 10:55 AM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
>
> On 2024/9/26 18:31, Filipe Manana wrote:
> > On Wed, Sep 25, 2024 at 11:08 PM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
> >>
> >>
> >>
> >> On 2024/9/24 20:15, fdmanana@kernel.org wrote:
> >>> From: Filipe Manana <fdmanana@suse.com>
> >>>
> >>> Currently the extent map shrinker is run synchronously for kswapd tasks
> >>> that end up calling the fs shrinker (fs/super.c:super_cache_scan()).
> >>> This has some disadvantages and for some heavy workloads with memory
> >>> pressure it can cause some delays and stalls that make a machine
> >>> unresponsive for some periods. This happens because:
> >>>
> >>> 1) We can have several kswapd tasks on machines with multiple NUMA zones,
> >>>      and running the extent map shrinker concurrently can cause high
> >>>      contention on some spin locks, namely the spin locks that protect
> >>>      the radix tree that tracks roots, the per root xarray that tracks
> >>>      open inodes and the list of delayed iputs. This not only delays the
> >>>      shrinker but also causes high CPU consumption and makes the task
> >>>      running the shrinker monopolize a core, resulting in the symptoms
> >>>      of an unresponsive system. This was noted in previous commits such as
> >>>      commit ae1e766f623f ("btrfs: only run the extent map shrinker from
> >>>      kswapd tasks");
> >>>
> >>> 2) The extent map shrinker's iteration over inodes can often be slow, even
> >>>      after changing the data structure that tracks open inodes for a root
> >>>      from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+).
> >>>      The transition to the xarray while it made things a bit faster, it's
> >>>      still somewhat slow - for example in a test scenario with 10000 inodes
> >>>      that have no extent maps loaded, the extent map shrinker took between
> >>>      5ms to 8ms, using a release, non-debug kernel. Iterating over the
> >>>      extent maps of an inode can also be slow if have an inode with many
> >>>      thousands of extent maps, since we use a red black tree to track and
> >>>      search extent maps. So having the extent map shrinker run synchronously
> >>>      adds extra delay for other things a kswapd task does.
> >>>
> >>> So make the extent map shrinker run asynchronously as a job for the
> >>> system unbounded workqueue, just like what we do for data and metadata
> >>> space reclaim jobs.
> >>>
> >>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> >>> ---
> >>>    fs/btrfs/disk-io.c    |  2 ++
> >>>    fs/btrfs/extent_map.c | 51 ++++++++++++++++++++++++++++++++++++-------
> >>>    fs/btrfs/extent_map.h |  3 ++-
> >>>    fs/btrfs/fs.h         |  2 ++
> >>>    fs/btrfs/super.c      | 13 +++--------
> >>>    5 files changed, 52 insertions(+), 19 deletions(-)
> >>>
> >>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> >>> index 25d768e67e37..2148147c5257 100644
> >>> --- a/fs/btrfs/disk-io.c
> >>> +++ b/fs/btrfs/disk-io.c
> >>> @@ -2786,6 +2786,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
> >>>        btrfs_init_scrub(fs_info);
> >>>        btrfs_init_balance(fs_info);
> >>>        btrfs_init_async_reclaim_work(fs_info);
> >>> +     btrfs_init_extent_map_shrinker_work(fs_info);
> >>>
> >>>        rwlock_init(&fs_info->block_group_cache_lock);
> >>>        fs_info->block_group_cache_tree = RB_ROOT_CACHED;
> >>> @@ -4283,6 +4284,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
> >>>        cancel_work_sync(&fs_info->async_reclaim_work);
> >>>        cancel_work_sync(&fs_info->async_data_reclaim_work);
> >>>        cancel_work_sync(&fs_info->preempt_reclaim_work);
> >>> +     cancel_work_sync(&fs_info->extent_map_shrinker_work);
> >>>
> >>>        /* Cancel or finish ongoing discard work */
> >>>        btrfs_discard_cleanup(fs_info);
> >>> diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
> >>> index cb2a6f5dce2b..e2eeb94aa349 100644
> >>> --- a/fs/btrfs/extent_map.c
> >>> +++ b/fs/btrfs/extent_map.c
> >>> @@ -1118,7 +1118,8 @@ struct btrfs_em_shrink_ctx {
> >>>
> >>>    static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
> >>>    {
> >>> -     const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
> >>> +     struct btrfs_fs_info *fs_info = inode->root->fs_info;
> >>> +     const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
> >>>        struct extent_map_tree *tree = &inode->extent_tree;
> >>>        long nr_dropped = 0;
> >>>        struct rb_node *node;
> >>> @@ -1191,7 +1192,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
> >>>                 * lock. This is to avoid slowing other tasks trying to take the
> >>>                 * lock.
> >>>                 */
> >>> -             if (need_resched() || rwlock_needbreak(&tree->lock))
> >>> +             if (need_resched() || rwlock_needbreak(&tree->lock) ||
> >>> +                 btrfs_fs_closing(fs_info))
> >>>                        break;
> >>>                node = next;
> >>>        }
> >>> @@ -1215,7 +1217,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
> >>>                ctx->last_ino = btrfs_ino(inode);
> >>>                btrfs_add_delayed_iput(inode);
> >>>
> >>> -             if (ctx->scanned >= ctx->nr_to_scan)
> >>> +             if (ctx->scanned >= ctx->nr_to_scan ||
> >>> +                 btrfs_fs_closing(inode->root->fs_info))
> >>>                        break;
> >>>
> >>>                cond_resched();
> >>> @@ -1244,16 +1247,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
> >>>        return nr_dropped;
> >>>    }
> >>>
> >>> -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> >>> +static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
> >>>    {
> >>> +     struct btrfs_fs_info *fs_info;
> >>>        struct btrfs_em_shrink_ctx ctx;
> >>>        u64 start_root_id;
> >>>        u64 next_root_id;
> >>>        bool cycled = false;
> >>>        long nr_dropped = 0;
> >>>
> >>> +     fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
> >>> +
> >>>        ctx.scanned = 0;
> >>> -     ctx.nr_to_scan = nr_to_scan;
> >>> +     ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
> >>>
> >>>        /*
> >>>         * In case we have multiple tasks running this shrinker, make the next
> >>> @@ -1271,12 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> >>>        if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
> >>>                s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
> >>>
> >>> -             trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
> >>> +             trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
> >>>                                                           nr, ctx.last_root,
> >>>                                                           ctx.last_ino);
> >>>        }
> >>>
> >>> -     while (ctx.scanned < ctx.nr_to_scan) {
> >>> +     while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
> >>>                struct btrfs_root *root;
> >>>                unsigned long count;
> >>>
> >>> @@ -1334,5 +1340,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> >>>                                                          ctx.last_ino);
> >>>        }
> >>>
> >>> -     return nr_dropped;
> >>> +     atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
> >>> +}
> >>> +
> >>> +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
> >>> +{
> >>> +     /*
> >>> +      * Do nothing if the shrinker is already running. In case of high memory
> >>> +      * pressure we can have a lot of tasks calling us and all passing the
> >>> +      * same nr_to_scan value, but in reality we may need only to free
> >>> +      * nr_to_scan extent maps (or less). In case we need to free more than
> >>> +      * that, we will be called again by the fs shrinker, so no worries about
> >>> +      * not doing enough work to reclaim memory from extent maps.
> >>> +      * We can also be repeatedly called with the same nr_to_scan value
> >>> +      * simply because the shrinker runs asynchronously and multiple calls
> >>> +      * to this function are made before the shrinker does enough progress.
> >>> +      *
> >>> +      * That's why we set the atomic counter to nr_to_scan only if its
> >>> +      * current value is zero, instead of incrementing the counter by
> >>> +      * nr_to_scan.
> >>> +      */
> >>
> >> Since the shrinker can be called frequently, even if we only keep one
> >> shrink work running, will the shrinker be kept running for a long time?
> >> (one queued work done, then immiedately be queued again)
> >
> > What's the problem?
> > The use of the atomic and not incrementing it, as said in the comment,
> > makes sure we don't do more work than necessary.
> >
> > It's also running in the system unbound queue and has plenty of
> > explicit reschedule calls to ensure it doesn't monopolize a cpu and
> > doesn't block other tasks for long.
> >
> > So how can it cause any problem?
>
> Then it will be a workqueue taking CPU 100% (or near that).
> Even there is only one running work.

No it won't.
Besides the very frequent reschedule points, the shrinker is always
called with a small percentage of the total number of objects to free.

>
> The first one queued the X number to do, and the work got queued, after
> freed X items, the next call triggered, queuing another Y number to reclaim.

And? Work queue jobs are distributed across cores as needed, so the
workqueue won't be monopolized by extent map shrinking.

>
> The we get the same near-100% CPU usage, it may be rescheduled, but not
> much difference.
> We will always have one reclaim work item running at any moment.

And if that happens, it's because it's needed.
It's exactly what we do for space reclaim - we don't add delays there,
and neither do we for delayed iputs.

>
> >
> >>
> >> The XFS is queuing the work with an delay, although their reason is that
> >> the reclaim needs to be run more frequently than syncd interval (30s).
> >>
> >> Do we also need some delay to prevent such too frequent reclaim work?
> >
> > See the comment above.
> >
> > Without the increment of the atomic counter, if it keeps getting
> > scheduled it's because we're under memory pressure and need to free
> > memory as quickly as possible.
>
> Even the atomic stays the same until the current one finished, we just
> get a new number of items to reclaim next.

If we do get it, it's because we still need to free memory, and we have to do it.

>
> Furthermore, from our existing experience, we didn't really hit a
> realworld case where the em cache is causing a huge problem, so the
> relaim for em should be a very low priority thing.

We had 1 person reporting it. And now that the problem is publicly
known, it can be exploited by someone with bad intentions - no root
privileges are required.

>
> Thus I still believe a delayed work will be much safer, just like what
> XFS is doing for decades, and also like our cleaner kthread.

Not a specialist on XFS, and maybe they have their reasons for doing
what they do.

But the case with our cleaner kthread is very different:

1) First for delayed iputs that delay doesn't exist.
When doing a delayed iput we always wake up the cleaner if it's not
currently running.
We want them to run asap to prevent inode eviction from happening and
holding memory and space reservations for too long.

2) Otherwise the delay and sleep is both because it's a kthread we
manually manage and because deletion of dead roots is IO heavy.

Extent map shrinking doesn't do any IO. Any concern about CPU I've
already addressed above.


>
> Thanks,
> Qu
>
> >
> > Thanks.
> >
> >>
> >> Thanks,
> >> Qu
> >>
> >>> +     if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
> >>> +             return;
> >>> +
> >>> +     queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
> >>> +}
> >>> +
> >>> +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
> >>> +{
> >>> +     atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
> >>> +     INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
> >>>    }
> >>> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
> >>> index 5154a8f1d26c..cd123b266b64 100644
> >>> --- a/fs/btrfs/extent_map.h
> >>> +++ b/fs/btrfs/extent_map.h
> >>> @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
> >>>    int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
> >>>                                   struct extent_map *new_em,
> >>>                                   bool modified);
> >>> -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
> >>> +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
> >>> +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
> >>>
> >>>    #endif
> >>> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> >>> index 785ec15c1b84..a246d8dc0b20 100644
> >>> --- a/fs/btrfs/fs.h
> >>> +++ b/fs/btrfs/fs.h
> >>> @@ -638,6 +638,8 @@ struct btrfs_fs_info {
> >>>        spinlock_t extent_map_shrinker_lock;
> >>>        u64 extent_map_shrinker_last_root;
> >>>        u64 extent_map_shrinker_last_ino;
> >>> +     atomic64_t extent_map_shrinker_nr_to_scan;
> >>> +     struct work_struct extent_map_shrinker_work;
> >>>
> >>>        /* Protected by 'trans_lock'. */
> >>>        struct list_head dirty_cowonly_roots;
> >>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> >>> index e8a5bf4af918..e9e209dd8e05 100644
> >>> --- a/fs/btrfs/super.c
> >>> +++ b/fs/btrfs/super.c
> >>> @@ -28,7 +28,6 @@
> >>>    #include <linux/btrfs.h>
> >>>    #include <linux/security.h>
> >>>    #include <linux/fs_parser.h>
> >>> -#include <linux/swap.h>
> >>>    #include "messages.h"
> >>>    #include "delayed-inode.h"
> >>>    #include "ctree.h"
> >>> @@ -2416,16 +2415,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
> >>>        const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
> >>>        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> >>>
> >>> -     /*
> >>> -      * We may be called from any task trying to allocate memory and we don't
> >>> -      * want to slow it down with scanning and dropping extent maps. It would
> >>> -      * also cause heavy lock contention if many tasks concurrently enter
> >>> -      * here. Therefore only allow kswapd tasks to scan and drop extent maps.
> >>> -      */
> >>> -     if (!current_is_kswapd())
> >>> -             return 0;
> >>> +     btrfs_free_extent_maps(fs_info, nr_to_scan);
> >>>
> >>> -     return btrfs_free_extent_maps(fs_info, nr_to_scan);
> >>> +     /* The extent map shrinker runs asynchronously, so always return 0. */
> >>> +     return 0;
> >>>    }
> >>>
> >>>    static const struct super_operations btrfs_super_ops = {
> >>
>
Qu Wenruo Sept. 26, 2024, 10:02 p.m. UTC | #5
On 2024/9/26 20:11, Filipe Manana wrote:
> On Thu, Sep 26, 2024 at 10:55 AM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
[...]
>>> What's the problem?
>>> The use of the atomic and not incrementing it, as said in the comment,
>>> makes sure we don't do more work than necessary.
>>>
>>> It's also running in the system unbound queue and has plenty of
>>> explicit reschedule calls to ensure it doesn't monopolize a cpu and
>>> doesn't block other tasks for long.
>>>
>>> So how can it cause any problem?
>>
>> Then it will be a workqueue taking CPU 100% (or near that).
>> Even there is only one running work.
>
> No it won't.
> Besides the very frequent reschedule points, the shrinker is always
> called with a small percentage of the total number of objects to free.

And there is no guarantee that a small number to reclaim is not heavy.

The latency of a scan is not directly related to the passed-in number,
but to the amount of roots/inodes/ems.

E.g. we have tens of thousands of inodes to go through. Even if most of
them have no extent maps, it may still take milliseconds to go through
them.

And if we always have a small number of ems pinned for IO, then even
after the current reclaim is done, the next reclaim work will still get
some small number queued to reclaim, again taking several milliseconds
to reclaim maybe a few extent maps, and then IO creates new ems, bumping
up the em count again.

>
>>
>> The first one queued the X number to do, and the work got queued, after
>> freed X items, the next call triggered, queuing another Y number to reclaim.
>
> And? Work queue jobs are distributed across cores as needed, so that
> work queue won't be monopolized with extent map shrinking.

Scheduling across different CPUs won't make any difference if the reclaim
workload is queued and run again and again.

Let me be clear: rescheduling and core distribution don't change the
nature of a long-running, CPU-heavy workload.
Just take gcc as an example: it's a user space program, which can always
be preempted, and the kernel can always reschedule it to whatever CPU
core it wants.
But it's still a long-running, CPU-heavy workload.

The same goes for the em shrinker; the difference is that the em shrinker
is just woken up again and again, rather than being one long-running work.

>
>>
>> The we get the same near-100% CPU usage, it may be rescheduled, but not
>> much difference.
>> We will always have one reclaim work item running at any moment.
>
> And if that happens it's because it's needed.
> The same goes for space reclaim, it's exactly what we do for space
> reclaim. We don't add delays there and neither for delayed iputs.

Unlike delayed inodes, the em shrinker needs to go through all roots and
all the inodes if we didn't free enough ems.

Delayed inodes, meanwhile, have a simple list of all the delayed inodes,
so there is not really much scanning.

And just as you mentioned in the changelog, the scanning itself can be
the real problem.

>
>>
>>>
>>>>
>>>> The XFS is queuing the work with an delay, although their reason is that
>>>> the reclaim needs to be run more frequently than syncd interval (30s).
>>>>
>>>> Do we also need some delay to prevent such too frequent reclaim work?
>>>
>>> See the comment above.
>>>
>>> Without the increment of the atomic counter, if it keeps getting
>>> scheduled it's because we're under memory pressure and need to free
>>> memory as quickly as possible.
>>
>> Even the atomic stays the same until the current one finished, we just
>> get a new number of items to reclaim next.
>
> If we do get it's because we still need to free memory, and we have to do it.
>
>>
>> Furthermore, from our existing experience, we didn't really hit a
>> realworld case where the em cache is causing a huge problem, so the
>> relaim for em should be a very low priority thing.
>
> We had 1 person reporting it. And now that the problem is publicly
> known, it can be exploited by someone with bad intentions - no root
> privileges are required.

I do not see how delayed reclaim will cause a new problem in that case.
Yes, it will not reclaim those ems fast enough, but it's still doing
proper reclaim.

In fact we have more end users affected by the too aggressive shrinker
behavior than by not reclaiming those ems.


And it will be a slap in the face if we move the feature into
experimental, then move it out and have to move it back again.

So I'm very cautious about any possibility that this shrinker can be
triggered without any extra wait.
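
For reference, a minimal sketch of the delayed variant being suggested
here (not code from this patch; the struct, helper and interval names
below are made up purely for illustration) could look like this:

/*
 * Hypothetical sketch only: coalesce repeated shrink requests and run
 * the worker at most once per delay window, instead of immediately.
 */
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/atomic.h>

#define EM_SHRINK_DELAY_MS	100	/* arbitrary example interval */

struct em_shrink_state {
	atomic64_t nr_to_scan;
	struct delayed_work dwork;	/* instead of a plain work_struct */
};

static void em_shrink_init(struct em_shrink_state *state, work_func_t fn)
{
	atomic64_set(&state->nr_to_scan, 0);
	INIT_DELAYED_WORK(&state->dwork, fn);
}

static void em_shrink_request(struct em_shrink_state *state, long nr_to_scan)
{
	/* Same "only queue if idle" gate as the patch uses. */
	if (atomic64_cmpxchg(&state->nr_to_scan, 0, nr_to_scan) != 0)
		return;

	queue_delayed_work(system_unbound_wq, &state->dwork,
			   msecs_to_jiffies(EM_SHRINK_DELAY_MS));
}

queue_delayed_work() does nothing if the delayed work is already pending,
so back-to-back requests within the window collapse into a single run.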

>
>>
>> Thus I still believe a delayed work will be much safer, just like what
>> XFS is doing for decades, and also like our cleaner kthread.
>
> Not a specialist on XFS, and maybe they have their reasons for doing
> what they do.
>
> But the case with our cleaner kthread is very different:
>
> 1) First for delayed iputs that delay doesn't exist.
> When doing a delayed iput we always wake up the cleaner if it's not
> currently running.
> We want them to run asap to prevent inode eviction from happening and
> holding memory and space reservations for too long.

But the delayed iput doesn't need to scan through all the roots/inodes.

Thanks,
Qu

>
> 2) Otherwise the delay and sleep is both because it's a kthread we
> manually manage and because deletion of dead roots is IO heavy.
>
> Extent map shrinking doesn't do any IO. Any concern about CPU I've
> already addressed above.
>
>
Filipe Manana Sept. 27, 2024, 8:10 a.m. UTC | #6
On Thu, Sep 26, 2024 at 11:02 PM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
>
> 在 2024/9/26 20:11, Filipe Manana 写道:
> > On Thu, Sep 26, 2024 at 10:55 AM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
> [...]
> >>> What's the problem?
> >>> The use of the atomic and not incrementing it, as said in the comment,
> >>> makes sure we don't do more work than necessary.
> >>>
> >>> It's also running in the system unbound queue and has plenty of
> >>> explicit reschedule calls to ensure it doesn't monopolize a cpu and
> >>> doesn't block other tasks for long.
> >>>
> >>> So how can it cause any problem?
> >>
> >> Then it will be a workqueue taking CPU 100% (or near that).
> >> Even there is only one running work.
> >
> > No it won't.
> > Besides the very frequent reschedule points, the shrinker is always
> > called with a small percentage of the total number of objects to free.
>
> And there is no guarantee that a small number to reclaim is not heavy.
>
> The latency of a scan is not directly related to the passed-in number,
> but the amount of roots/inodes/ems.
>
> E.g. we have tens of thousands of inodes to go through. Even most of
> them have no extent maps, it may still take milliseconds to go through.

You're taking what I've written in the changelog as if I'm not aware of it.
Yes, I talk about several milliseconds for 10000 inodes without any
extent maps, on a very busy system, and the times were measured with
bpftrace, which includes reschedule times, so they account for periods
when the shrinker was not running.

The point of running it asynchronously in the system unbounded queue
is precisely so that in case it takes a long time it doesn't affect
other tasks.

>
> And if we always have a small ems pinned for IO, so even after the

Small ems? How does the size of an extent map matter here?

> current reclaim done, the next reclaim work will still get some small
> number queued to reclaim, again takes several milliseconds to reclaim

And that's precisely why it's run asynchronously in the system
unbounded workqueue, so that no matter how long it takes, it doesn't
affect (delay) other tasks.

> may be a few extent maps, then IO creates new ems bumping up the em
> counts again.

And adding the delay will solve any of that?
Please explain how.

>
> >
> >>
> >> The first one queued the X number to do, and the work got queued, after
> >> freed X items, the next call triggered, queuing another Y number to reclaim.
> >
> > And? Work queue jobs are distributed across cores as needed, so that
> > work queue won't be monopolized with extent map shrinking.
>
> Scheduled across different CPUs won't make any difference if the reclaim
> workload is queued and run again and again.
>
> Let me be clear, reschedule and core distribution are not changing the
> nature of a long running CPU heavy workload.

And how does your delay make that better?

> Just take gcc as an example, it's a user space program, which can always
> be preempted, and kernel can always reschedule the user space program to
> whatever CPU core.
> But it's still a long running CPU heavy workload.
>
> The same is for the em shrinker, the difference is the em shrinker is
> just waked up again and again, other than a long running one.

If needed, yes, that's why the shrinker (at a higher level) calls into
the fs shrinker for a small portion of objects at a time.

>
> >
> >>
> >> The we get the same near-100% CPU usage, it may be rescheduled, but not
> >> much difference.
> >> We will always have one reclaim work item running at any moment.
> >
> > And if that happens it's because it's needed.
> > The same goes for space reclaim, it's exactly what we do for space
> > reclaim. We don't add delays there and neither for delayed iputs.
>
> Unlike delayed inodes, em shrinker needs to go through all roots and all
> the inodes if we didn't free enough ems.

For a portion of the total extent maps.
Every time it's called it visits a portion of the total extent maps, a
few roots, a few inodes, sometimes just 1 inode and 1 root.

>
> While delayed inodes has a simple list of all the delayed inodes, not
> really much scanning.

The time to move from one element to the next is shorter, yes, but if
you have many thousands of elements it's still time consuming.

>
> And just as you mentioned in the changelog, the scanning itself can be
> the real problem.

Yes, and that's why it is being done by a single task in the system
unbounded queue, so that no matter how slow or fast it is, it doesn't
cause delay for other tasks.
The problem was that it ran synchronously, through memory allocation
paths (and kswapd), and that time introduced delays and made any
concurrent calls spend a lot of time busy on spin locks, preventing
other tasks from being scheduled and making the system unresponsive.

>
> >
> >>
> >>>
> >>>>
> >>>> The XFS is queuing the work with an delay, although their reason is that
> >>>> the reclaim needs to be run more frequently than syncd interval (30s).
> >>>>
> >>>> Do we also need some delay to prevent such too frequent reclaim work?
> >>>
> >>> See the comment above.
> >>>
> >>> Without the increment of the atomic counter, if it keeps getting
> >>> scheduled it's because we're under memory pressure and need to free
> >>> memory as quickly as possible.
> >>
> >> Even the atomic stays the same until the current one finished, we just
> >> get a new number of items to reclaim next.
> >
> > If we do get it's because we still need to free memory, and we have to do it.
> >
> >>
> >> Furthermore, from our existing experience, we didn't really hit a
> >> realworld case where the em cache is causing a huge problem, so the
> >> relaim for em should be a very low priority thing.
> >
> > We had 1 person reporting it. And now that the problem is publicly
> > known, it can be exploited by someone with bad intentions - no root
> > privileges are required.
>
> I do not see how delayed reclaim will cause new problem in that case.
> Yes, it will not reclaim those ems fast enough, but it's still doing
> proper reclaim.

If it's not fast enough it can have consequences such as allocations
failing or taking longer, or the OOM killer starting to kill
processes.

>
> In fact we have more end users affected by the too aggressive shrinker
> behavior than not reclaiming those ems.

Yes, when it ran synchronously, before this patchset.

I don't think you understand why it was aggressive.
This was already explained in the changelog and comments, but let me
rephrase it:

1) Multiple tasks, during memory allocations or from kswapd, end up
concurrently triggering the fs shrinker, which enters the extent map
shrinker, let's say 100 tasks;

2) There are, for example, 1 000 000 extent maps, and the shrinker
decides to try to free 1000 extent maps (the nr_to_scan argument);

3) The decision for each calling task was made before any other task
was able to make any progress (or very little) dropping extent maps;

4) So while 1000 extent maps may have been enough to release memory,
we end up releasing (or trying to release) 100 * 1000 extent maps
concurrently, causing the heavy spin lock waits and CPU usage.

Making it run asynchronously in the system unbound queue, ensuring
there's only one queued work, and using the atomic without adding to it
(just setting it if it's 0), solves all that.
In the example above it ensures only 1 task is doing the extent map
shrinking, and for at most 1000 extent maps, not 100 * 1000.
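
Condensed, the gate described above is just this (the real version is
the btrfs_free_extent_maps() hunk further down in this patch; the
wrapper name here is only for illustration):

static void shrink_request(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
	/*
	 * Only the first caller that finds the counter at 0 installs its
	 * nr_to_scan and queues the work; everyone else returns right away.
	 * With 100 tasks each asking for 1000 extent maps, the worker still
	 * scans at most 1000, not 100 * 1000.
	 */
	if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan,
			     0, nr_to_scan) != 0)
		return;

	queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
}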


>
>
> And it will be a slap in the face if we move the feature into
> experimental, then move it out and have to move it back again.

Regressions and problems happen often, yes, but we always move forward
by analysing things and fixing them.
I'm sure you've gone through the same in the past.

>
> So I'm very cautious on any possibility that this shrinker can be
> triggered without any extra wait.

I'm very cautious as well, and have been very responsive helping users
ASAP, often even spending personal time on weekends, evenings,
and holidays.
The last report happened at the only time of the year when I was away
on vacation, with no access to a computer, and right at the last
6.10-rc before the 6.10 final.

So please don't assume or insinuate I don't care enough. I do care, and a lot.

>
> >
> >>
> >> Thus I still believe a delayed work will be much safer, just like what
> >> XFS is doing for decades, and also like our cleaner kthread.
> >
> > Not a specialist on XFS, and maybe they have their reasons for doing
> > what they do.
> >
> > But the case with our cleaner kthread is very different:
> >
> > 1) First for delayed iputs that delay doesn't exist.
> > When doing a delayed iput we always wake up the cleaner if it's not
> > currently running.
> > We want them to run asap to prevent inode eviction from happening and
> > holding memory and space reservations for too long.
>
> But the delayed iput doesn't need to scan through all the roots/inodes.

And neither does the extent map shrinker; it goes through a small
portion at a time, in a dedicated task run by the system unbounded
queue.

And running delayed iputs still takes time going through a list. If
there are always delayed iputs being added, there's constant work.
Yet in this case you're not worried about adding a delay.

>
> Thanks,
> Qu
>
> >
> > 2) Otherwise the delay and sleep is both because it's a kthread we
> > manually manage and because deletion of dead roots is IO heavy.
> >
> > Extent map shrinking doesn't do any IO. Any concern about CPU I've
> > already addressed above.
> >
> >
>
diff mbox series

Patch

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 25d768e67e37..2148147c5257 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2786,6 +2786,7 @@  void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_init_scrub(fs_info);
 	btrfs_init_balance(fs_info);
 	btrfs_init_async_reclaim_work(fs_info);
+	btrfs_init_extent_map_shrinker_work(fs_info);
 
 	rwlock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@@ -4283,6 +4284,7 @@  void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	cancel_work_sync(&fs_info->async_reclaim_work);
 	cancel_work_sync(&fs_info->async_data_reclaim_work);
 	cancel_work_sync(&fs_info->preempt_reclaim_work);
+	cancel_work_sync(&fs_info->extent_map_shrinker_work);
 
 	/* Cancel or finish ongoing discard work */
 	btrfs_discard_cleanup(fs_info);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index cb2a6f5dce2b..e2eeb94aa349 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1118,7 +1118,8 @@  struct btrfs_em_shrink_ctx {
 
 static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
 {
-	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
 	struct extent_map_tree *tree = &inode->extent_tree;
 	long nr_dropped = 0;
 	struct rb_node *node;
@@ -1191,7 +1192,8 @@  static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
 		 * lock. This is to avoid slowing other tasks trying to take the
 		 * lock.
 		 */
-		if (need_resched() || rwlock_needbreak(&tree->lock))
+		if (need_resched() || rwlock_needbreak(&tree->lock) ||
+		    btrfs_fs_closing(fs_info))
 			break;
 		node = next;
 	}
@@ -1215,7 +1217,8 @@  static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
 		ctx->last_ino = btrfs_ino(inode);
 		btrfs_add_delayed_iput(inode);
 
-		if (ctx->scanned >= ctx->nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan ||
+		    btrfs_fs_closing(inode->root->fs_info))
 			break;
 
 		cond_resched();
@@ -1244,16 +1247,19 @@  static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
 	return nr_dropped;
 }
 
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
 {
+	struct btrfs_fs_info *fs_info;
 	struct btrfs_em_shrink_ctx ctx;
 	u64 start_root_id;
 	u64 next_root_id;
 	bool cycled = false;
 	long nr_dropped = 0;
 
+	fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
+
 	ctx.scanned = 0;
-	ctx.nr_to_scan = nr_to_scan;
+	ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
 
 	/*
 	 * In case we have multiple tasks running this shrinker, make the next
@@ -1271,12 +1277,12 @@  long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
 							   nr, ctx.last_root,
 							   ctx.last_ino);
 	}
 
-	while (ctx.scanned < ctx.nr_to_scan) {
+	while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
 		struct btrfs_root *root;
 		unsigned long count;
 
@@ -1334,5 +1340,34 @@  long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 							  ctx.last_ino);
 	}
 
-	return nr_dropped;
+	atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+}
+
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+	/*
+	 * Do nothing if the shrinker is already running. In case of high memory
+	 * pressure we can have a lot of tasks calling us and all passing the
+	 * same nr_to_scan value, but in reality we may need only to free
+	 * nr_to_scan extent maps (or less). In case we need to free more than
+	 * that, we will be called again by the fs shrinker, so no worries about
+	 * not doing enough work to reclaim memory from extent maps.
+	 * We can also be repeatedly called with the same nr_to_scan value
+	 * simply because the shrinker runs asynchronously and multiple calls
+	 * to this function are made before the shrinker does enough progress.
+	 *
+	 * That's why we set the atomic counter to nr_to_scan only if its
+	 * current value is zero, instead of incrementing the counter by
+	 * nr_to_scan.
+	 */
+	if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
+		return;
+
+	queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
+}
+
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
+{
+	atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+	INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 5154a8f1d26c..cd123b266b64 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -189,6 +189,7 @@  void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
 int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
 				   struct extent_map *new_em,
 				   bool modified);
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
 
 #endif
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 785ec15c1b84..a246d8dc0b20 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -638,6 +638,8 @@  struct btrfs_fs_info {
 	spinlock_t extent_map_shrinker_lock;
 	u64 extent_map_shrinker_last_root;
 	u64 extent_map_shrinker_last_ino;
+	atomic64_t extent_map_shrinker_nr_to_scan;
+	struct work_struct extent_map_shrinker_work;
 
 	/* Protected by 'trans_lock'. */
 	struct list_head dirty_cowonly_roots;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e8a5bf4af918..e9e209dd8e05 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -28,7 +28,6 @@ 
 #include <linux/btrfs.h>
 #include <linux/security.h>
 #include <linux/fs_parser.h>
-#include <linux/swap.h>
 #include "messages.h"
 #include "delayed-inode.h"
 #include "ctree.h"
@@ -2416,16 +2415,10 @@  static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
 	const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 
-	/*
-	 * We may be called from any task trying to allocate memory and we don't
-	 * want to slow it down with scanning and dropping extent maps. It would
-	 * also cause heavy lock contention if many tasks concurrently enter
-	 * here. Therefore only allow kswapd tasks to scan and drop extent maps.
-	 */
-	if (!current_is_kswapd())
-		return 0;
+	btrfs_free_extent_maps(fs_info, nr_to_scan);
 
-	return btrfs_free_extent_maps(fs_info, nr_to_scan);
+	/* The extent map shrinker runs asynchronously, so always return 0. */
+	return 0;
 }
 
 static const struct super_operations btrfs_super_ops = {