[v2,7/7] nfsd: filecache: drop the list_lru lock during lock gc scans

Message ID 20250218153937.6125-8-cel@kernel.org (mailing list archive)
State Under Review
Delegated to: Chuck Lever
Series nfsd: filecache: various fixes

Commit Message

Chuck Lever Feb. 18, 2025, 3:39 p.m. UTC
From: NeilBrown <neilb@suse.de>

Under a high NFSv3 load with lots of different files being accessed,
the LRU list of garbage-collectable files can become quite long.

Asking list_lru_walk_node() to scan the whole list can result in a long
period during which the list_lru spinlock is held, blocking the addition
of new LRU items.

So ask list_lru_walk_node() to scan only a few entries at a time, and
repeat until the scan is complete.

If the shrinker runs between two consecutive calls to
list_lru_walk_node(), it could invalidate the "remaining" counter, which
could lead to premature freeing.  So add a spinlock to avoid that.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/filecache.c | 27 ++++++++++++++++++++++++---
 fs/nfsd/filecache.h |  6 ++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

Comments

Jeff Layton Feb. 18, 2025, 8:51 p.m. UTC | #1
On Tue, 2025-02-18 at 10:39 -0500, cel@kernel.org wrote:
> From: NeilBrown <neilb@suse.de>
> 
> Under a high NFSv3 load with lots of different files being accessed,
> the LRU list of garbage-collectable files can become quite long.
> 
> Asking list_lru_walk_node() to scan the whole list can result in a long
> period during which the list_lru spinlock is held, blocking the addition
> of new LRU items.
> 
> So ask list_lru_walk_node() to scan only a few entries at a time, and
> repeat until the scan is complete.
> 
> If the shrinker runs between two consecutive calls to
> list_lru_walk_node(), it could invalidate the "remaining" counter, which
> could lead to premature freeing.  So add a spinlock to avoid that.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  fs/nfsd/filecache.c | 27 ++++++++++++++++++++++++---
>  fs/nfsd/filecache.h |  6 ++++++
>  2 files changed, 30 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 56935349f0e4..9a41ccfc2df6 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -544,6 +544,13 @@ nfsd_file_gc_cb(struct list_head *item, struct list_lru_one *lru,
>  	return nfsd_file_lru_cb(item, lru, arg);
>  }
>  
> +/* If the shrinker runs between calls to list_lru_walk_node() in
> + * nfsd_file_gc(), the "remaining" count will be wrong.  This could
> + * result in premature freeing of some files.  This may not matter much
> + * but is easy to fix with this spinlock which temporarily disables
> + * the shrinker.
> + */
> +static DEFINE_SPINLOCK(nfsd_gc_lock);

Having this as a global lock makes sense since there is just a single
shrinker and laundrette for the whole kernel. I don't think it's
worthwhile to make them per-net or anything either.
 
>  static void
>  nfsd_file_gc(void)
>  {
> @@ -551,12 +558,22 @@ nfsd_file_gc(void)
>  	LIST_HEAD(dispose);
>  	int nid;
>  
> +	spin_lock(&nfsd_gc_lock);
>  	for_each_node_state(nid, N_NORMAL_MEMORY) {
> -		unsigned long nr = list_lru_count_node(&nfsd_file_lru, nid);
> +		unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid);
>  
> -		ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
> -					  &dispose, &nr);
> +		while (remaining > 0) {
> +			unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH);
> +
> +			remaining -= nr;
> +			ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
> +						  &dispose, &nr);
> +			if (nr)
> +				/* walk aborted early */
> +				remaining = 0;
> +		}
>  	}

Now that I look, if we end up walking a long list on a different NUMA
node, this could mean a lot of cross-node calls.

This is probably in the "further work" category, but...

Maybe we should switch the laundrette to have a work struct per-node,
and then schedule all of them on their respective nodes when we start
the laundrette.

If you do that, then the nfsd_gc_lock could also be per-node.
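
Something like this, maybe (untested sketch only -- the struct and
function names below are made up, and the existing delayed scheduling
of the laundrette is ignored):

	struct nfsd_file_gc_node {
		struct work_struct	work;
		spinlock_t		lock;
		int			nid;
	};

	static struct nfsd_file_gc_node nfsd_gc_nodes[MAX_NUMNODES];

	static void
	nfsd_file_gc_node_work(struct work_struct *work)
	{
		struct nfsd_file_gc_node *gcn =
			container_of(work, struct nfsd_file_gc_node, work);
		unsigned long remaining;
		LIST_HEAD(dispose);

		/* Same batched walk as this patch, but for a single node,
		 * serialised by a per-node lock instead of a global one.
		 */
		spin_lock(&gcn->lock);
		remaining = list_lru_count_node(&nfsd_file_lru, gcn->nid);
		while (remaining > 0) {
			unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH);

			remaining -= nr;
			list_lru_walk_node(&nfsd_file_lru, gcn->nid,
					   nfsd_file_gc_cb, &dispose, &nr);
			if (nr)
				/* walk aborted early */
				remaining = 0;
		}
		spin_unlock(&gcn->lock);
		nfsd_file_dispose_list_delayed(&dispose);
	}

	static void
	nfsd_file_gc(void)
	{
		int nid;

		/* Kick one worker per node, queued on that node
		 * (queue_work_node() wants an unbound workqueue).
		 */
		for_each_node_state(nid, N_NORMAL_MEMORY)
			queue_work_node(nid, system_unbound_wq,
					&nfsd_gc_nodes[nid].work);
	}

with INIT_WORK()/spin_lock_init() done for each nfsd_gc_nodes[] entry at
nfsd_file_cache_init() time.  The shrinker would then only need to
trylock nfsd_gc_nodes[sc->nid].lock around list_lru_shrink_walk().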

> +	spin_unlock(&nfsd_gc_lock);
>  	trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
>  	nfsd_file_dispose_list_delayed(&dispose);
>  }
> @@ -581,8 +598,12 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
>  	LIST_HEAD(dispose);
>  	unsigned long ret;
>  
> +	if (!spin_trylock(&nfsd_gc_lock))
> +		return SHRINK_STOP;
> +
>  	ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
>  				   nfsd_file_lru_cb, &dispose);
> +	spin_unlock(&nfsd_gc_lock);
>  	trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
>  	nfsd_file_dispose_list_delayed(&dispose);
>  	return ret;
> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> index de5b8aa7fcb0..5865f9c72712 100644
> --- a/fs/nfsd/filecache.h
> +++ b/fs/nfsd/filecache.h
> @@ -3,6 +3,12 @@
>  
>  #include <linux/fsnotify_backend.h>
>  
> +/*
> + * Limit the time that the list_lru_one lock is held during
> + * an LRU scan.
> + */
> +#define NFSD_FILE_GC_BATCH     (16UL)
> +
>  /*
>   * This is the fsnotify_mark container that nfsd attaches to the files that it
>   * is holding open. Note that we have a separate refcount here aside from the

No objection to this patch as an interim step though.

Reviewed-by: Jeff Layton <jlayton@kernel.org>

Patch

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 56935349f0e4..9a41ccfc2df6 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -544,6 +544,13 @@  nfsd_file_gc_cb(struct list_head *item, struct list_lru_one *lru,
 	return nfsd_file_lru_cb(item, lru, arg);
 }
 
+/* If the shrinker runs between calls to list_lru_walk_node() in
+ * nfsd_file_gc(), the "remaining" count will be wrong.  This could
+ * result in premature freeing of some files.  This may not matter much
+ * but is easy to fix with this spinlock which temporarily disables
+ * the shrinker.
+ */
+static DEFINE_SPINLOCK(nfsd_gc_lock);
 static void
 nfsd_file_gc(void)
 {
@@ -551,12 +558,22 @@  nfsd_file_gc(void)
 	LIST_HEAD(dispose);
 	int nid;
 
+	spin_lock(&nfsd_gc_lock);
 	for_each_node_state(nid, N_NORMAL_MEMORY) {
-		unsigned long nr = list_lru_count_node(&nfsd_file_lru, nid);
+		unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid);
 
-		ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
-					  &dispose, &nr);
+		while (remaining > 0) {
+			unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH);
+
+			remaining -= nr;
+			ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
+						  &dispose, &nr);
+			if (nr)
+				/* walk aborted early */
+				remaining = 0;
+		}
 	}
+	spin_unlock(&nfsd_gc_lock);
 	trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
 	nfsd_file_dispose_list_delayed(&dispose);
 }
@@ -581,8 +598,12 @@  nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
 	LIST_HEAD(dispose);
 	unsigned long ret;
 
+	if (!spin_trylock(&nfsd_gc_lock))
+		return SHRINK_STOP;
+
 	ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
 				   nfsd_file_lru_cb, &dispose);
+	spin_unlock(&nfsd_gc_lock);
 	trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
 	nfsd_file_dispose_list_delayed(&dispose);
 	return ret;
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index de5b8aa7fcb0..5865f9c72712 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -3,6 +3,12 @@ 
 
 #include <linux/fsnotify_backend.h>
 
+/*
+ * Limit the time that the list_lru_one lock is held during
+ * an LRU scan.
+ */
+#define NFSD_FILE_GC_BATCH     (16UL)
+
 /*
  * This is the fsnotify_mark container that nfsd attaches to the files that it
  * is holding open. Note that we have a separate refcount here aside from the