Message ID | 20250218153937.6125-8-cel@kernel.org (mailing list archive) |
---|---|
State | Under Review |
Delegated to: | Chuck Lever |
Series | nfsd: filecache: various fixes |
On Tue, 2025-02-18 at 10:39 -0500, cel@kernel.org wrote:
> From: NeilBrown <neilb@suse.de>
> 
> Under a high NFSv3 load with lots of different files being accessed,
> the LRU list of garbage-collectable files can become quite long.
> 
> Asking list_lru_scan_node() to scan the whole list can result in a long
> period during which a spinlock is held, blocking the addition of new LRU
> items.
> 
> So ask list_lru_scan_node() to scan only a few entries at a time, and
> repeat until the scan is complete.
> 
> If the shrinker runs between two consecutive calls of
> list_lru_scan_node() it could invalidate the "remaining" counter which
> could lead to premature freeing. So add a spinlock to avoid that.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  fs/nfsd/filecache.c | 27 ++++++++++++++++++++++++---
>  fs/nfsd/filecache.h |  6 ++++++
>  2 files changed, 30 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 56935349f0e4..9a41ccfc2df6 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -544,6 +544,13 @@ nfsd_file_gc_cb(struct list_head *item, struct list_lru_one *lru,
>          return nfsd_file_lru_cb(item, lru, arg);
>  }
>  
> +/* If the shrinker runs between calls to list_lru_walk_node() in
> + * nfsd_file_gc(), the "remaining" count will be wrong. This could
> + * result in premature freeing of some files. This may not matter much
> + * but is easy to fix with this spinlock which temporarily disables
> + * the shrinker.
> + */
> +static DEFINE_SPINLOCK(nfsd_gc_lock);
>  static void
>  nfsd_file_gc(void)
>  {

Having this as a global lock makes sense since there is just a single
shrinker and laundrette for the whole kernel. I don't think it's
worthwhile to make them per-net or anything either.

> @@ -551,12 +558,22 @@ nfsd_file_gc(void)
>          LIST_HEAD(dispose);
>          int nid;
>  
> +        spin_lock(&nfsd_gc_lock);
>          for_each_node_state(nid, N_NORMAL_MEMORY) {
> -                unsigned long nr = list_lru_count_node(&nfsd_file_lru, nid);
> +                unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid);
>  
> -                ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
> -                                          &dispose, &nr);
> +                while (remaining > 0) {
> +                        unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH);
> +
> +                        remaining -= nr;
> +                        ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
> +                                                  &dispose, &nr);
> +                        if (nr)
> +                                /* walk aborted early */
> +                                remaining = 0;
> +                }
>          }

Now that I look, if we end up walking a long list on a different NUMA
node, this could mean a lot of cross-node calls. This is probably in
the "further work" category, but...

Maybe we should switch the laundrette to have a work struct per-node,
and then schedule all of them on their respective nodes when we start
the laundrette. If you do that, then the nfsd_gc_lock could also be
per-node.
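A very rough sketch of that per-node idea, not part of this series: one GC
work item per NUMA node, each queued on its own node so the LRU walk stays
node-local, with a per-node lock standing in for the global nfsd_gc_lock.
The nfsd_file_gc_work struct, the nfsd_file_gc_works[] array, the
nfsd_file_gc_node() worker, and the use of system_unbound_wq with
queue_work_node() are all hypothetical here; the existing laundrette is a
single delayed_work, and the tracepoint and per-entry initialization
(INIT_WORK/spin_lock_init at filecache init time) are not shown.

struct nfsd_file_gc_work {
        struct work_struct      work;
        spinlock_t              lock;   /* would replace nfsd_gc_lock */
        int                     nid;
};

static struct nfsd_file_gc_work nfsd_file_gc_works[MAX_NUMNODES];

static void nfsd_file_gc_node(struct work_struct *work)
{
        struct nfsd_file_gc_work *gcw =
                container_of(work, struct nfsd_file_gc_work, work);
        LIST_HEAD(dispose);
        unsigned long remaining;

        spin_lock(&gcw->lock);
        remaining = list_lru_count_node(&nfsd_file_lru, gcw->nid);
        while (remaining > 0) {
                unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH);

                remaining -= nr;
                list_lru_walk_node(&nfsd_file_lru, gcw->nid,
                                   nfsd_file_gc_cb, &dispose, &nr);
                if (nr)         /* walk aborted early */
                        remaining = 0;
        }
        spin_unlock(&gcw->lock);
        nfsd_file_dispose_list_delayed(&dispose);
}

static void nfsd_file_gc(void)
{
        int nid;

        /* Queue each node's walk on that node instead of walking them all here. */
        for_each_node_state(nid, N_NORMAL_MEMORY)
                queue_work_node(nid, system_unbound_wq,
                                &nfsd_file_gc_works[nid].work);
}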
> +        spin_unlock(&nfsd_gc_lock);
>          trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
>          nfsd_file_dispose_list_delayed(&dispose);
>  }
> @@ -581,8 +598,12 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
>          LIST_HEAD(dispose);
>          unsigned long ret;
>  
> +        if (!spin_trylock(&nfsd_gc_lock))
> +                return SHRINK_STOP;
> +
>          ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
>                                     nfsd_file_lru_cb, &dispose);
> +        spin_unlock(&nfsd_gc_lock);
>          trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
>          nfsd_file_dispose_list_delayed(&dispose);
>          return ret;
> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> index de5b8aa7fcb0..5865f9c72712 100644
> --- a/fs/nfsd/filecache.h
> +++ b/fs/nfsd/filecache.h
> @@ -3,6 +3,12 @@
>  
>  #include <linux/fsnotify_backend.h>
>  
> +/*
> + * Limit the time that the list_lru_one lock is held during
> + * an LRU scan.
> + */
> +#define NFSD_FILE_GC_BATCH        (16UL)
> +
>  /*
>   * This is the fsnotify_mark container that nfsd attaches to the files that it
>   * is holding open. Note that we have a separate refcount here aside from the

No objection to this patch as an interim step though.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
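As an aside on the per-node suggestion above: if nfsd_gc_lock did become
per-node, the shrinker would only need to trylock the lock for the node it
was asked to scan (sc->nid). A hypothetical variant of the shrinker hunk,
reusing the made-up nfsd_file_gc_works[] array from the earlier sketch
(tracepoint omitted); not part of this series:

static unsigned long
nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
{
        LIST_HEAD(dispose);
        unsigned long ret;

        /* Back off if this node's GC walk is already in progress. */
        if (!spin_trylock(&nfsd_file_gc_works[sc->nid].lock))
                return SHRINK_STOP;

        ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
                                   nfsd_file_lru_cb, &dispose);
        spin_unlock(&nfsd_file_gc_works[sc->nid].lock);
        nfsd_file_dispose_list_delayed(&dispose);
        return ret;
}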
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 56935349f0e4..9a41ccfc2df6 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -544,6 +544,13 @@ nfsd_file_gc_cb(struct list_head *item, struct list_lru_one *lru,
         return nfsd_file_lru_cb(item, lru, arg);
 }
 
+/* If the shrinker runs between calls to list_lru_walk_node() in
+ * nfsd_file_gc(), the "remaining" count will be wrong. This could
+ * result in premature freeing of some files. This may not matter much
+ * but is easy to fix with this spinlock which temporarily disables
+ * the shrinker.
+ */
+static DEFINE_SPINLOCK(nfsd_gc_lock);
 static void
 nfsd_file_gc(void)
 {
@@ -551,12 +558,22 @@ nfsd_file_gc(void)
         LIST_HEAD(dispose);
         int nid;
 
+        spin_lock(&nfsd_gc_lock);
         for_each_node_state(nid, N_NORMAL_MEMORY) {
-                unsigned long nr = list_lru_count_node(&nfsd_file_lru, nid);
+                unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid);
 
-                ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
-                                          &dispose, &nr);
+                while (remaining > 0) {
+                        unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH);
+
+                        remaining -= nr;
+                        ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
+                                                  &dispose, &nr);
+                        if (nr)
+                                /* walk aborted early */
+                                remaining = 0;
+                }
         }
+        spin_unlock(&nfsd_gc_lock);
         trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
         nfsd_file_dispose_list_delayed(&dispose);
 }
@@ -581,8 +598,12 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
         LIST_HEAD(dispose);
         unsigned long ret;
 
+        if (!spin_trylock(&nfsd_gc_lock))
+                return SHRINK_STOP;
+
         ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
                                    nfsd_file_lru_cb, &dispose);
+        spin_unlock(&nfsd_gc_lock);
         trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
         nfsd_file_dispose_list_delayed(&dispose);
         return ret;
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index de5b8aa7fcb0..5865f9c72712 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -3,6 +3,12 @@
 
 #include <linux/fsnotify_backend.h>
 
+/*
+ * Limit the time that the list_lru_one lock is held during
+ * an LRU scan.
+ */
+#define NFSD_FILE_GC_BATCH        (16UL)
+
 /*
  * This is the fsnotify_mark container that nfsd attaches to the files that it
  * is holding open. Note that we have a separate refcount here aside from the
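For readers skimming the diff, the batching loop in nfsd_file_gc() depends
on the in/out behaviour of list_lru_walk_node()'s final argument. Below is
a condensed restatement of the loop with identifiers as in the patch
(ret, nid, and dispose come from the surrounding function); the comments
are added here for illustration and are not part of the series:

        unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid);

        while (remaining > 0) {
                /* Hold the per-node list_lru lock for at most 16 entries per call. */
                unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH);

                remaining -= nr;
                /*
                 * nr is decremented by list_lru_walk_node() once per entry
                 * it examines; the walk returns when nr reaches zero or the
                 * list is exhausted, so each call is a short, bounded burst.
                 */
                ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
                                          &dispose, &nr);
                /*
                 * A non-zero nr on return means the walk stopped before
                 * using its whole budget (list empty or callback aborted),
                 * so there is nothing left to scan on this node.
                 */
                if (nr)
                        remaining = 0;
        }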