[v7,10/20] nfs/localio: use dedicated workqueues for filesystem read and write

Message ID: 20240624162741.68216-11-snitzer@kernel.org
State: New
Series: nfs/nfsd: add support for localio

Commit Message

Mike Snitzer June 24, 2024, 4:27 p.m. UTC
From: Trond Myklebust <trond.myklebust@hammerspace.com>

For localio access, don't call filesystem read() and write() routines
directly.

Some filesystem writeback routines can end up taking up a lot of stack
space (particularly xfs). Rather than risk overrunning the stack due to
the extra overhead from the NFS stack, call these routines from a
workqueue job.

Using dedicated workqueues improves performance over using the
system_unbound_wq. Localio is motivated by the promise of improved
performance, so it makes little sense to give that performance back.

But further analysis of the current stack depth requirements would be
useful. It'd be nice to root-cause and fix the worst stack hogs, because
using workqueues at all can cost performance due to the added context
switches.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 fs/nfs/inode.c    |  57 +++++++++++++++++---------
 fs/nfs/internal.h |   1 +
 fs/nfs/localio.c  | 102 +++++++++++++++++++++++++++++++++++-----------
 3 files changed, 118 insertions(+), 42 deletions(-)
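
For orientation, here is a minimal standalone sketch (not taken from the patch) of the offload pattern the diff below applies in nfs_do_local_read() and nfs_do_local_write(): package the call in an on-stack work item, queue it on a dedicated workqueue so the filesystem code runs on a fresh worker stack, and block the caller on a completion. Names prefixed "example_" are hypothetical.

#include <linux/workqueue.h>
#include <linux/completion.h>

struct example_io_args {
	struct work_struct work;
	struct completion *done;
	/* ... per-request state, e.g. the kiocb to submit ... */
};

static void example_io_call(struct work_struct *work)
{
	struct example_io_args *args =
		container_of(work, struct example_io_args, work);

	/* The filesystem ->read_iter()/->write_iter() call would run here,
	 * consuming the workqueue worker's stack instead of the caller's. */
	complete(args->done);
}

static int example_io_offload(struct workqueue_struct *wq)
{
	struct example_io_args args;
	DECLARE_COMPLETION_ONSTACK(done);

	args.done = &done;
	INIT_WORK_ONSTACK(&args.work, example_io_call);
	queue_work(wq, &args.work);
	wait_for_completion(&done);	/* block until the worker finishes */
	destroy_work_on_stack(&args.work);
	return 0;
}

The synchronous wait keeps the original call semantics; only the stack the filesystem consumes changes, at the cost of a context switch per I/O, which is the trade-off the commit message and the review below discuss.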

Comments

NeilBrown June 25, 2024, 11:15 p.m. UTC | #1
On Tue, 25 Jun 2024, Mike Snitzer wrote:
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
> 
> For localio access, don't call filesystem read() and write() routines
> directly.
> 
> Some filesystem writeback routines can end up taking up a lot of stack
> space (particularly xfs). Rather than risk overrunning the stack due to
> the extra overhead from the NFS stack, call these routines from a
> workqueue job.
> 
> Using dedicated workqueues improves performance over using the
> system_unbound_wq. Localio is motivated by the promise of improved
> performance, so it makes little sense to give that performance back.
> 
> But further analysis of the current stack depth requirements would be
> useful. It'd be nice to root-cause and fix the worst stack hogs, because
> using workqueues at all can cost performance due to the added context
> switches.
> 

I would rather this patch were left out of the final submission, and
only added if/when we have documented evidence that it helps.

Thanks,
NeilBrown



Patch

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f9923cbf6058..aac8c5302503 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2394,35 +2394,54 @@ static void nfs_destroy_inodecache(void)
 	kmem_cache_destroy(nfs_inode_cachep);
 }
 
+struct workqueue_struct *nfslocaliod_workqueue;
 struct workqueue_struct *nfsiod_workqueue;
 EXPORT_SYMBOL_GPL(nfsiod_workqueue);
 
 /*
- * start up the nfsiod workqueue
- */
-static int nfsiod_start(void)
-{
-	struct workqueue_struct *wq;
-	dprintk("RPC:       creating workqueue nfsiod\n");
-	wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
-	if (wq == NULL)
-		return -ENOMEM;
-	nfsiod_workqueue = wq;
-	return 0;
-}
-
-/*
- * Destroy the nfsiod workqueue
+ * Destroy the nfsiod workqueues
  */
 static void nfsiod_stop(void)
 {
 	struct workqueue_struct *wq;
 
 	wq = nfsiod_workqueue;
-	if (wq == NULL)
-		return;
-	nfsiod_workqueue = NULL;
-	destroy_workqueue(wq);
+	if (wq != NULL) {
+		nfsiod_workqueue = NULL;
+		destroy_workqueue(wq);
+	}
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+	wq = nfslocaliod_workqueue;
+	if (wq != NULL) {
+		nfslocaliod_workqueue = NULL;
+		destroy_workqueue(wq);
+	}
+#endif /* CONFIG_NFS_LOCALIO */
+}
+
+/*
+ * Start the nfsiod workqueues
+ */
+static int nfsiod_start(void)
+{
+	dprintk("RPC:       creating workqueue nfsiod\n");
+	nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+	if (nfsiod_workqueue == NULL)
+		return -ENOMEM;
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+	/*
+	 * localio writes need to use a normal (non-memreclaim) workqueue.
+	 * When we start getting low on space, XFS goes and calls flush_work() on
+	 * a non-memreclaim work queue, which causes a priority inversion problem.
+	 */
+	dprintk("RPC:       creating workqueue nfslocaliod\n");
+	nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0);
+	if (unlikely(nfslocaliod_workqueue == NULL)) {
+		nfsiod_stop();
+		return -ENOMEM;
+	}
+#endif /* CONFIG_NFS_LOCALIO */
+	return 0;
 }
 
 unsigned int nfs_net_id;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d352040e3232..9251a357d097 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -440,6 +440,7 @@ int nfs_check_flags(int);
 
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
+extern struct workqueue_struct *nfslocaliod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_free_inode(struct inode *);
 extern int nfs_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 724e81716b16..418b8d76692b 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -44,6 +44,12 @@ struct nfs_local_fsync_ctx {
 };
 static void nfs_local_fsync_work(struct work_struct *work);
 
+struct nfs_local_io_args {
+	struct nfs_local_kiocb *iocb;
+	struct work_struct work;
+	struct completion *done;
+};
+
 /*
  * We need to translate between nfs status return values and
  * the local errno values which may not be the same.
@@ -307,30 +313,54 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
 			status > 0 ? status : 0, hdr->res.eof);
 }
 
-static int
-nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
-		const struct rpc_call_ops *call_ops)
+static void nfs_local_call_read(struct work_struct *work)
 {
-	struct nfs_local_kiocb *iocb;
+	struct nfs_local_io_args *args =
+		container_of(work, struct nfs_local_io_args, work);
+	struct nfs_local_kiocb *iocb = args->iocb;
+	struct file *filp = iocb->kiocb.ki_filp;
 	struct iov_iter iter;
 	ssize_t status;
 
+	nfs_local_iter_init(&iter, iocb, READ);
+
+	status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+	if (status != -EIOCBQUEUED) {
+		nfs_local_read_done(iocb, status);
+		nfs_local_pgio_release(iocb);
+	}
+	complete(args->done);
+}
+
+static int nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
+			     const struct rpc_call_ops *call_ops)
+{
+	struct nfs_local_io_args args;
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct nfs_local_kiocb *iocb;
+
 	dprintk("%s: vfs_read count=%u pos=%llu\n",
 		__func__, hdr->args.count, hdr->args.offset);
 
 	iocb = nfs_local_iocb_alloc(hdr, filp, GFP_KERNEL);
 	if (iocb == NULL)
 		return -ENOMEM;
-	nfs_local_iter_init(&iter, iocb, READ);
 
 	nfs_local_pgio_init(hdr, call_ops);
 	hdr->res.eof = false;
 
-	status = filp->f_op->read_iter(&iocb->kiocb, &iter);
-	if (status != -EIOCBQUEUED) {
-		nfs_local_read_done(iocb, status);
-		nfs_local_pgio_release(iocb);
-	}
+	/*
+	 * Don't call filesystem read() routines directly.
+	 * In order to avoid issues with stack overflow,
+	 * call the read routines from a workqueue job.
+	 */
+	args.iocb = iocb;
+	args.done = &done;
+	INIT_WORK_ONSTACK(&args.work, nfs_local_call_read);
+	queue_work(nfslocaliod_workqueue, &args.work);
+	wait_for_completion(&done);
+	destroy_work_on_stack(&args.work);
+
 	return 0;
 }
 
@@ -420,14 +450,35 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
 	nfs_local_pgio_done(hdr, status);
 }
 
-static int
-nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
-		const struct rpc_call_ops *call_ops)
+static void nfs_local_call_write(struct work_struct *work)
 {
-	struct nfs_local_kiocb *iocb;
+	struct nfs_local_io_args *args =
+		container_of(work, struct nfs_local_io_args, work);
+	struct nfs_local_kiocb *iocb = args->iocb;
+	struct file *filp = iocb->kiocb.ki_filp;
 	struct iov_iter iter;
 	ssize_t status;
 
+	nfs_local_iter_init(&iter, iocb, WRITE);
+
+	file_start_write(filp);
+	status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+	file_end_write(filp);
+	if (status != -EIOCBQUEUED) {
+		nfs_local_write_done(iocb, status);
+		nfs_get_vfs_attr(filp, iocb->hdr->res.fattr);
+		nfs_local_pgio_release(iocb);
+	}
+	complete(args->done);
+}
+
+static int nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
+			      const struct rpc_call_ops *call_ops)
+{
+	struct nfs_local_io_args args;
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct nfs_local_kiocb *iocb;
+
 	dprintk("%s: vfs_write count=%u pos=%llu %s\n",
 		__func__, hdr->args.count, hdr->args.offset,
 		(hdr->args.stable == NFS_UNSTABLE) ?  "unstable" : "stable");
@@ -435,7 +486,6 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
 	iocb = nfs_local_iocb_alloc(hdr, filp, GFP_NOIO);
 	if (iocb == NULL)
 		return -ENOMEM;
-	nfs_local_iter_init(&iter, iocb, WRITE);
 
 	switch (hdr->args.stable) {
 	default:
@@ -450,14 +500,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
 
 	nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
 
-	file_start_write(filp);
-	status = filp->f_op->write_iter(&iocb->kiocb, &iter);
-	file_end_write(filp);
-	if (status != -EIOCBQUEUED) {
-		nfs_local_write_done(iocb, status);
-		nfs_get_vfs_attr(filp, hdr->res.fattr);
-		nfs_local_pgio_release(iocb);
-	}
+	/*
+	 * Don't call filesystem write() routines directly.
+	 * Some filesystem writeback routines can end up taking up a lot of
+	 * stack (particularly xfs). Instead of risking running over due to
+	 * the extra overhead from the NFS stack, call these write routines
+	 * from a workqueue job.
+	 */
+	args.iocb = iocb;
+	args.done = &done;
+	INIT_WORK_ONSTACK(&args.work, nfs_local_call_write);
+	queue_work(nfslocaliod_workqueue, &args.work);
+	wait_for_completion(&done);
+	destroy_work_on_stack(&args.work);
+
 	return 0;
 }
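
A standalone sketch (hypothetical module code, not part of the patch) of the workqueue lifecycle added in nfsiod_start()/nfsiod_stop() above: the localio queue is WQ_UNBOUND but deliberately not WQ_MEM_RECLAIM because, per the comment in the patch, XFS may call flush_work() on a non-memreclaim workqueue when space runs low, which would be a priority-inversion problem if the localio work were itself running on a memreclaim queue.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_localio_wq;	/* hypothetical name */

static int __init example_wq_init(void)
{
	/* Unbound, intentionally without WQ_MEM_RECLAIM (see patch comment). */
	example_localio_wq = alloc_workqueue("example_localio", WQ_UNBOUND, 0);
	if (!example_localio_wq)
		return -ENOMEM;
	return 0;
}

static void __exit example_wq_exit(void)
{
	if (example_localio_wq) {
		destroy_workqueue(example_localio_wq);
		example_localio_wq = NULL;
	}
}

module_init(example_wq_init);
module_exit(example_wq_exit);
MODULE_LICENSE("GPL");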