diff mbox series

[RFC,3/4] io_uring/splice: support splice from ->splice_read to ->splice_read

Message ID 20221103085004.1029763-4-ming.lei@redhat.com (mailing list archive)
State New
Headers show
Series io_uring/splice: extend splice for supporting ublk zero copy | expand

Commit Message

Ming Lei Nov. 3, 2022, 8:50 a.m. UTC
The 1st ->splice_read produces buffer to the pipe of
current->splice_pipe, and the 2nd ->splice_read consumes the buffer
in this pipe.

This way helps to support zero copy of read request for ublk and fuse.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 fs/splice.c               | 146 ++++++++++++++++++++++++++++++++++++--
 include/linux/fs.h        |   2 +
 include/linux/pipe_fs_i.h |   9 +++
 include/linux/splice.h    |  11 +++
 io_uring/splice.c         |  13 ++--
 5 files changed, 169 insertions(+), 12 deletions(-)

Comments

Christoph Hellwig Nov. 8, 2022, 7:45 a.m. UTC | #1
On Thu, Nov 03, 2022 at 04:50:03PM +0800, Ming Lei wrote:
> The 1st ->splice_read produces buffer to the pipe of
> current->splice_pipe, and the 2nd ->splice_read consumes the buffer
> in this pipe.

This looks really ugly.  I think you want Linus and Al to look over
it at very least.

Also, what is going to happen if your ->splice_read instance does not
support the flag to magically do something entirely different?
Ming Lei Nov. 8, 2022, 8:29 a.m. UTC | #2
On Mon, Nov 07, 2022 at 11:45:06PM -0800, Christoph Hellwig wrote:
> On Thu, Nov 03, 2022 at 04:50:03PM +0800, Ming Lei wrote:
> > The 1st ->splice_read produces buffer to the pipe of
> > current->splice_pipe, and the 2nd ->splice_read consumes the buffer
> > in this pipe.
> 
> This looks really ugly.  I think you want Linus and Al to look over
> it at very least.

OK, I will Cc Linus and Al in V2.

It is just another case of pipe's producer/consumer model, IMO.

> 
> Also, what is going to happen if your ->splice_read instance does not
> support the flag to magically do something entirely different?

If the ->splice_read() instance doesn't support this feature, then the new
added pipe flag won't be set, this API will return -EINVAL.



thanks, 
Ming
diff mbox series

Patch

diff --git a/fs/splice.c b/fs/splice.c
index f8999ffe6215..cd5255f9ff13 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -330,17 +330,70 @@  ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 	struct iov_iter to;
 	struct kiocb kiocb;
 	int ret;
+	struct bio_vec *array = NULL;
+	bool consumer;
+
+	/*
+	 * So far, in->fops->splice_read() has to make sure the following
+	 * simple page use model works.
+	 *
+	 * pipe->consumed_by_read is set by the in end of the pipe
+	 */
+	if ((flags & SPLICE_F_READ_TO_READ) && pipe->consumed_by_read) {
+		unsigned int head, tail, mask;
+		int nbufs = pipe->max_usage;
+		size_t left = len;
+		int n;
+
+		if (WARN_ON_ONCE(!(flags & SPLICE_F_DIRECT)))
+			return -EINVAL;
+
+		head = pipe->head;
+		tail = pipe->tail;
+		mask = pipe->ring_size - 1;
+
+		array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL);
+		if (!array)
+			return -ENOMEM;
+
+		for (n = 0; !pipe_empty(head, tail) && left && n < nbufs;
+				tail++) {
+			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+			size_t this_len = buf->len;
+
+			/* zero-length bvecs are not supported, skip them */
+			if (!this_len)
+				continue;
+			this_len = min(this_len, left);
+
+			array[n].bv_page = buf->page;
+			array[n].bv_len = this_len;
+			array[n].bv_offset = buf->offset;
+			left -= this_len;
+			n++;
+		}
+
+		consumer = true;
+		iov_iter_bvec(&to, READ, array, n, len - left);
+	} else {
+		/* !consumer means one pipe buf producer */
+		consumer = false;
+		iov_iter_pipe(&to, READ, pipe, len);
+	}
 
-	iov_iter_pipe(&to, READ, pipe, len);
 	init_sync_kiocb(&kiocb, in);
 	kiocb.ki_pos = *ppos;
 	ret = call_read_iter(in, &kiocb, &to);
 	if (ret > 0) {
 		*ppos = kiocb.ki_pos;
 		file_accessed(in);
+
+		if (consumer)
+			splice_dismiss_pipe(pipe, ret);
 	} else if (ret < 0) {
 		/* free what was emitted */
-		pipe_discard_from(pipe, to.start_head);
+		if (!consumer)
+			pipe_discard_from(pipe, to.start_head);
 		/*
 		 * callers of ->splice_read() expect -EAGAIN on
 		 * "can't put anything in there", rather than -EFAULT.
@@ -349,6 +402,11 @@  ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 			ret = -EAGAIN;
 	}
 
+	if (consumer) {
+		kfree(array);
+		pipe->consumed_by_read = false;
+	}
+
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_splice_read);
@@ -782,7 +840,7 @@  static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
  */
 static long do_splice_to(struct file *in, loff_t *ppos,
 			 struct pipe_inode_info *pipe, size_t len,
-			 unsigned int flags)
+			 unsigned int flags, bool consumer)
 {
 	unsigned int p_space;
 	int ret;
@@ -790,8 +848,12 @@  static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(!(in->f_mode & FMODE_READ)))
 		return -EBADF;
 
-	/* Don't try to read more the pipe has space for. */
-	p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
+	if (consumer) /* read is consumer */
+		p_space = pipe_occupancy(pipe->head, pipe->tail);
+	else
+		/* Don't try to read more the pipe has space for. */
+		p_space = pipe->max_usage - pipe_occupancy(pipe->head,
+				pipe->tail);
 	len = min_t(size_t, len, p_space << PAGE_SHIFT);
 
 	ret = rw_verify_area(READ, in, ppos, len);
@@ -875,7 +937,7 @@  ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 		size_t read_len;
 		loff_t pos = sd->pos, prev_pos = pos;
 
-		ret = do_splice_to(in, &pos, pipe, len, flags);
+		ret = do_splice_to(in, &pos, pipe, len, flags, false);
 		if (unlikely(ret <= 0))
 			goto out_release;
 
@@ -992,6 +1054,76 @@  long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 }
 EXPORT_SYMBOL(do_splice_direct);
 
+static int direct_splice_read_consumer_actor(struct pipe_inode_info *pipe,
+			       struct splice_desc *sd)
+{
+	struct file *file = sd->u.file;
+
+	/* Pipe in side has to notify us by ->consumed_by_read */
+	if (!pipe->consumed_by_read)
+		return -EINVAL;
+
+	return do_splice_to(file, sd->opos, pipe, sd->total_len,
+			sd->flags, true);
+}
+
+/**
+ * do_splice_direct_read_consumer - splices data directly with producer/
+ *    consumer model
+ * @in:		file to splice from
+ * @ppos:	input file offset
+ * @out:	file to splice to
+ * @opos:	output file offset
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags, SPLICE_F_READ_TO_READ is required
+ *
+ * Description:
+ *    For use by ublk or fuse to implement zero copy for READ request, and
+ *    splice directly over internal pipe from device to file, and device's
+ *    ->splice_read() produces pipe buffers, and file's ->splice_read()
+ *    consumes the buffers.
+ *
+ */
+long do_splice_direct_read_consumer(struct file *in, loff_t *ppos,
+				    struct file *out, loff_t *opos,
+				    size_t len, unsigned int flags)
+{
+	struct splice_desc sd = {
+		.len		= len,
+		.total_len	= len,
+		.flags		= flags,
+		.pos		= *ppos,
+		.u.file		= out,
+		.opos		= opos,
+	};
+	long ret;
+
+	if (!(flags & (SPLICE_F_DIRECT | SPLICE_F_READ_TO_READ)))
+		return -EINVAL;
+
+	if (unlikely(!(out->f_mode & FMODE_READ)))
+		return -EBADF;
+
+	/*
+	 * So far we just support F_READ_TO_READ if it is one plain
+	 * file which ->splice_read points to generic_file_splice_read
+	 */
+	if (out->f_op->splice_read != generic_file_splice_read)
+		return -EINVAL;
+
+	ret = rw_verify_area(READ, out, opos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
+	ret = splice_direct_to_actor(in, &sd,
+			direct_splice_read_consumer_actor);
+	if (ret > 0)
+		*ppos = sd.pos;
+
+	return ret;
+}
+EXPORT_SYMBOL(do_splice_direct_read_consumer);
+
 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
 {
 	for (;;) {
@@ -1023,7 +1155,7 @@  long splice_file_to_pipe(struct file *in,
 	pipe_lock(opipe);
 	ret = wait_for_space(opipe, flags);
 	if (!ret)
-		ret = do_splice_to(in, offset, opipe, len, flags);
+		ret = do_splice_to(in, offset, opipe, len, flags, false);
 	pipe_unlock(opipe);
 	if (ret > 0)
 		wakeup_pipe_readers(opipe);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e654435f1651..e5f84902f149 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3169,6 +3169,8 @@  extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
 		struct file *out, loff_t *, size_t len, unsigned int flags);
 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 		loff_t *opos, size_t len, unsigned int flags);
+extern long do_splice_direct_read_consumer(struct file *in, loff_t *ppos,
+		struct file *out, loff_t *opos, size_t len, unsigned int flags);
 
 
 extern void
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 6cb65df3e3ba..90c6ff8c82ef 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -72,6 +72,15 @@  struct pipe_inode_info {
 	unsigned int r_counter;
 	unsigned int w_counter;
 	bool poll_usage;
+
+	/*
+	 * If SPLICE_F_READ_TO_READ is applied, in->fops->splice_read()
+	 * should set this flag, so that out->fops->splice_read() can
+	 * observe this flag, then consume buffers in the pipe.
+	 *
+	 * Used by do_splice_direct_read_consumer() only.
+	 */
+	bool consumed_by_read;
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 9121624ad198..f48044e5e173 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -26,6 +26,17 @@ 
 /* used for io_uring interface only */
 #define SPLICE_F_DIRECT	(0x10)	/* direct splice and user needn't provide pipe */
 
+/*
+ * The usual splice is file-to-pipe and pipe-to-file, and this flag means the
+ * splice is file-to-pipe and file-to-pipe. Looks this way is stupid, but
+ * please understand from producer & consumer viewpoint, the 1st file-to-pipe
+ * is producer, and the 2nd file-to-pipe is consumer, so here the 2nd
+ * ->slice_read just consumes buffers stored in the pipe.
+ *
+ *  And this flag is only valid in case of SPLICE_F_DIRECT.
+ */
+#define SPLICE_F_READ_TO_READ	(0x20)
+
 /*
  * Passed to the actors
  */
diff --git a/io_uring/splice.c b/io_uring/splice.c
index c11ea4cd1c7e..df66d89f4f17 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -28,7 +28,7 @@  static int __io_splice_prep(struct io_kiocb *req,
 {
 	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL |
-		SPLICE_F_DIRECT;
+		SPLICE_F_DIRECT | SPLICE_F_READ_TO_READ;
 
 	sp->len = READ_ONCE(sqe->len);
 	sp->flags = READ_ONCE(sqe->splice_flags);
@@ -111,12 +111,15 @@  int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
 
 	if (sp->len) {
-		if (flags & SPLICE_F_DIRECT)
-			ret = do_splice_direct(in, poff_in, out, poff_out,
-					sp->len, flags);
-		else
+		if (!(flags & (SPLICE_F_DIRECT | SPLICE_F_READ_TO_READ)))
 			ret = do_splice(in, poff_in, out, poff_out, sp->len,
 					flags);
+		else if (flags & SPLICE_F_READ_TO_READ)
+			ret = do_splice_direct_read_consumer(in, poff_in, out,
+					poff_out, sp->len, flags);
+		else
+			ret = do_splice_direct(in, poff_in, out, poff_out,
+					sp->len, flags);
 	}
 
 	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))