diff mbox series

[RFC] iov_iter: Add an iterator-of-iterators

Message ID 3416400.1679508945@warthog.procyon.org.uk (mailing list archive)
State New
Headers show
Series [RFC] iov_iter: Add an iterator-of-iterators | expand

Commit Message

David Howells March 22, 2023, 6:15 p.m. UTC
Trond Myklebust <trondmy@hammerspace.com> wrote:

> Add an enum iter_type for ITER_ITER ? :-)

Well, you asked for it...  It's actually fairly straightforward once
ITER_PIPE is removed.

---
iov_iter: Add an iterator-of-iterators

Provide an I/O iterator that takes an array of iterators and iterates over
them in turn.  Then make the sunrpc service code (and thus nfsd) use it.

In this particular instance, the svc_tcp_sendmsg() sets up an array of
three iterators: one for the marker+header, one for the body and one
optional one for the tail, then sets msg_iter to be an
iterator-of-iterators across them.

Signed-off-by: David Howells <dhowells@redhat.com>
---    
 include/linux/uio.h  |   19 +++-
 lib/iov_iter.c       |  233 +++++++++++++++++++++++++++++++++++++++++++++++++--
 net/sunrpc/svcsock.c |   29 +++---
 3 files changed, 258 insertions(+), 23 deletions(-)

Comments

Trond Myklebust March 22, 2023, 6:47 p.m. UTC | #1
> On Mar 22, 2023, at 14:15, David Howells <dhowells@redhat.com> wrote:
> 
> Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
>> Add an enum iter_type for ITER_ITER ? :-)
> 
> Well, you asked for it...  It's actually fairly straightforward once
> ITER_PIPE is removed.
> 
> ---
> iov_iter: Add an iterator-of-iterators
> 
> Provide an I/O iterator that takes an array of iterators and iterates over
> them in turn.  Then make the sunrpc service code (and thus nfsd) use it.
> 
> In this particular instance, the svc_tcp_sendmsg() sets up an array of
> three iterators: once for the marker+header, one for the body and one
> optional one for the tail, then sets msg_iter to be an
> iterator-of-iterators across them.

Cool! This is something that can be used on the receive side as well, so very useful. I can imagine it might also open up a few more use cases for ITER_XARRAY.

Thanks!
  Trond
Matthew Wilcox (Oracle) March 22, 2023, 6:49 p.m. UTC | #2
On Wed, Mar 22, 2023 at 06:15:45PM +0000, David Howells wrote:
> @@ -43,17 +44,17 @@ struct iov_iter {
>  	bool nofault;
>  	bool data_source;
>  	bool user_backed;
> -	union {
> -		size_t iov_offset;
> -		int last_offset;
> -	};
> +	bool spliceable;

We're now up to five u8s in a row here (iter_type, nofault, data_source,
user_backed, spliceable).  Is it time to turn some/all of them into:

	bool nofault:1;
	bool data_source:1;
	bool user_backed:1;
	bool spliceable:1;

You can't take the address of them then, but I don't believe we do that
anywhere.
diff mbox series

Patch

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 74598426edb4..321381d3d616 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -27,6 +27,7 @@  enum iter_type {
 	ITER_XARRAY,
 	ITER_DISCARD,
 	ITER_UBUF,
+	ITER_ITERLIST,
 };
 
 #define ITER_SOURCE	1	// == WRITE
@@ -43,17 +44,17 @@  struct iov_iter {
 	bool nofault;
 	bool data_source;
 	bool user_backed;
-	union {
-		size_t iov_offset;
-		int last_offset;
-	};
+	bool spliceable;
+	size_t iov_offset;
 	size_t count;
+	size_t orig_count;
 	union {
 		const struct iovec *iov;
 		const struct kvec *kvec;
 		const struct bio_vec *bvec;
 		struct xarray *xarray;
 		void __user *ubuf;
+		struct iov_iter *iterlist;
 	};
 	union {
 		unsigned long nr_segs;
@@ -104,6 +105,11 @@  static inline bool iov_iter_is_xarray(const struct iov_iter *i)
 	return iov_iter_type(i) == ITER_XARRAY;
 }
 
+/* Test whether iov_iter_type() reports @i as an ITER_ITERLIST iterator. */
+static inline bool iov_iter_is_iterlist(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_ITERLIST;
+}
+
 static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 {
 	return i->data_source ? WRITE : READ;
@@ -238,6 +244,8 @@  void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
 		     loff_t start, size_t count);
+void iov_iter_iterlist(struct iov_iter *i, unsigned int direction, struct iov_iter *iterlist,
+		       unsigned long nr_segs, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
 		size_t maxsize, unsigned maxpages, size_t *start,
 		iov_iter_extraction_t extraction_flags);
@@ -345,7 +353,8 @@  static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 		.user_backed = true,
 		.data_source = direction,
 		.ubuf = buf,
-		.count = count
+		.count = count,
+		.orig_count = count,
 	};
 }
 /* Flags for iov_iter_get/extract_pages*() */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index fad95e4cf372..34ce3b958b6c 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -282,7 +282,8 @@  void iov_iter_init(struct iov_iter *i, unsigned int direction,
 		.iov = iov,
 		.nr_segs = nr_segs,
 		.iov_offset = 0,
-		.count = count
+		.count = count,
+		.orig_count = count,
 	};
 }
 EXPORT_SYMBOL(iov_iter_init);
@@ -364,6 +365,26 @@  size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
+	/*
+	 * An ITER_ITERLIST holds no data of its own: satisfy the request by
+	 * draining each sub-iterator in turn, stepping to the next one only
+	 * once the current one has supplied everything asked of it.
+	 */
+	if (unlikely(iov_iter_is_iterlist(i))) {
+		size_t copied = 0;
+
+		while (bytes && i->count) {
+			/* n must start at 0: an empty sub-iterator copies nothing. */
+			size_t part = min(bytes, i->iterlist->count), n = 0;
+
+			if (part > 0)
+				n = _copy_from_iter(addr, part, i->iterlist);
+			addr += n;
+			copied += n;
+			bytes -= n;
+			i->count -= n;
+			/* Stop on a short copy or once the request is complete. */
+			if (n < part || !bytes)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		return copied;
+	}
+
 	if (user_backed_iter(i))
 		might_fault();
 	iterate_and_advance(i, bytes, base, len, off,
@@ -380,6 +401,27 @@  size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
+	/*
+	 * An ITER_ITERLIST holds no data of its own: satisfy the request by
+	 * draining each sub-iterator in turn, stepping to the next one only
+	 * once the current one has supplied everything asked of it.
+	 */
+	if (unlikely(iov_iter_is_iterlist(i))) {
+		size_t copied = 0;
+
+		while (bytes && i->count) {
+			/* n must start at 0: an empty sub-iterator copies nothing. */
+			size_t part = min(bytes, i->iterlist->count), n = 0;
+
+			if (part > 0)
+				n = _copy_from_iter_nocache(addr, part,
+							    i->iterlist);
+			addr += n;
+			copied += n;
+			bytes -= n;
+			i->count -= n;
+			/* Stop on a short copy or once the request is complete. */
+			if (n < part || !bytes)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		return copied;
+	}
+
 	iterate_and_advance(i, bytes, base, len, off,
 		__copy_from_user_inatomic_nocache(addr + off, base, len),
 		memcpy(addr + off, base, len)
@@ -411,6 +453,27 @@  size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
+	/*
+	 * An ITER_ITERLIST holds no data of its own: satisfy the request by
+	 * draining each sub-iterator in turn, stepping to the next one only
+	 * once the current one has supplied everything asked of it.
+	 */
+	if (unlikely(iov_iter_is_iterlist(i))) {
+		size_t copied = 0;
+
+		while (bytes && i->count) {
+			/* n must start at 0: an empty sub-iterator copies nothing. */
+			size_t part = min(bytes, i->iterlist->count), n = 0;
+
+			if (part > 0)
+				n = _copy_from_iter_flushcache(addr, part,
+							       i->iterlist);
+			addr += n;
+			copied += n;
+			bytes -= n;
+			i->count -= n;
+			/* Stop on a short copy or once the request is complete. */
+			if (n < part || !bytes)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		return copied;
+	}
+
 	iterate_and_advance(i, bytes, base, len, off,
 		__copy_from_user_flushcache(addr + off, base, len),
 		memcpy_flushcache(addr + off, base, len)
@@ -514,7 +577,31 @@  EXPORT_SYMBOL(iov_iter_zero);
 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
 				  struct iov_iter *i)
 {
-	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
+	char *kaddr, *p;
+
+	/*
+	 * An ITER_ITERLIST holds no data of its own: fill the page from each
+	 * sub-iterator in turn, moving @offset along by what was copied and
+	 * stepping to the next sub-iterator once the current one is drained.
+	 */
+	if (unlikely(iov_iter_is_iterlist(i))) {
+		size_t copied = 0;
+
+		while (bytes && i->count) {
+			/* n must start at 0: an empty sub-iterator copies nothing. */
+			size_t part = min(bytes, i->iterlist->count), n = 0;
+
+			if (part > 0)
+				n = copy_page_from_iter_atomic(page, offset, part,
+							       i->iterlist);
+			offset += n;
+			copied += n;
+			bytes -= n;
+			i->count -= n;
+			/* Stop on a short copy or once the request is complete. */
+			if (n < part || !bytes)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		return copied;
+	}
+
+	kaddr = kmap_atomic(page);
+	p = kaddr + offset;
 	if (!page_copy_sane(page, offset, bytes)) {
 		kunmap_atomic(kaddr);
 		return 0;
@@ -585,19 +672,49 @@  void iov_iter_advance(struct iov_iter *i, size_t size)
 		iov_iter_bvec_advance(i, size);
 	} else if (iov_iter_is_discard(i)) {
 		i->count -= size;
+	}else if (iov_iter_is_iterlist(i)) {
+		i->count -= size;
+		for (;;) {
+			size_t part = min(size, i->iterlist->count);
+
+			if (part > 0)
+				iov_iter_advance(i->iterlist, part);
+			size -= part;
+			if (!size)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
 	}
 }
 EXPORT_SYMBOL(iov_iter_advance);
 
+/*
+ * Wind an ITER_ITERLIST back by @unroll bytes.  Each sub-iterator can give
+ * back at most what has been taken from it (orig_count - count); whatever is
+ * still owed after that is recovered from the preceding sub-iterator.
+ */
+static void iov_iter_revert_iterlist(struct iov_iter *i, size_t unroll)
+{
+	do {
+		struct iov_iter *sub = i->iterlist;
+		size_t consumed = sub->orig_count - sub->count;
+		size_t step = min(unroll, consumed);
+
+		if (step)
+			iov_iter_revert(sub, step);
+		unroll -= step;
+		if (unroll) {
+			/* More to undo: step back to the previous sub-iterator. */
+			i->iterlist--;
+			i->nr_segs++;
+		}
+	} while (unroll);
+}
+
 void iov_iter_revert(struct iov_iter *i, size_t unroll)
 {
 	if (!unroll)
 		return;
-	if (WARN_ON(unroll > MAX_RW_COUNT))
+	if (WARN_ON(unroll > i->orig_count - i->count))
 		return;
 	i->count += unroll;
 	if (unlikely(iov_iter_is_discard(i)))
 		return;
+	if (unlikely(iov_iter_is_iterlist(i)))
+		return iov_iter_revert_iterlist(i, unroll);
 	if (unroll <= i->iov_offset) {
 		i->iov_offset -= unroll;
 		return;
@@ -641,6 +758,8 @@  EXPORT_SYMBOL(iov_iter_revert);
  */
 size_t iov_iter_single_seg_count(const struct iov_iter *i)
 {
+	if (iov_iter_is_iterlist(i))
+		i = i->iterlist;
 	if (i->nr_segs > 1) {
 		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
 			return min(i->count, i->iov->iov_len - i->iov_offset);
@@ -662,7 +781,8 @@  void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
 		.kvec = kvec,
 		.nr_segs = nr_segs,
 		.iov_offset = 0,
-		.count = count
+		.count = count,
+		.orig_count = count,
 	};
 }
 EXPORT_SYMBOL(iov_iter_kvec);
@@ -678,7 +798,8 @@  void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
 		.bvec = bvec,
 		.nr_segs = nr_segs,
 		.iov_offset = 0,
-		.count = count
+		.count = count,
+		.orig_count = count,
 	};
 }
 EXPORT_SYMBOL(iov_iter_bvec);
@@ -706,6 +827,7 @@  void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
 		.xarray = xarray,
 		.xarray_start = start,
 		.count = count,
+		.orig_count = count,
 		.iov_offset = 0
 	};
 }
@@ -727,11 +849,47 @@  void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
 		.iter_type = ITER_DISCARD,
 		.data_source = false,
 		.count = count,
+		.orig_count = count,
 		.iov_offset = 0
 	};
 }
 EXPORT_SYMBOL(iov_iter_discard);
 
+/**
+ * iov_iter_iterlist - Initialise an I/O iterator that is a list of iterators
+ * @iter: The iterator to initialise.
+ * @direction: The direction of the transfer; must be WRITE (ITER_SOURCE).
+ * @iterlist: The array of sub-iterators to walk, in order.
+ * @nr_segs: The number of elements in @iterlist.
+ * @count: The total number of bytes spanned by the sub-iterators.
+ *
+ * Set up an I/O iterator that presents the contents of the iterators in
+ * @iterlist, taken in order, as a single source.  It's only available as a
+ * source iterator (for WRITE); every sub-iterator must itself be a source and
+ * none of them may be of ITER_ITERLIST type.
+ *
+ * The sub-iterators are advanced as data is consumed through @iter, so
+ * @iterlist must stay valid for as long as @iter is in use.
+ */
+void iov_iter_iterlist(struct iov_iter *iter, unsigned int direction,
+		       struct iov_iter *iterlist, unsigned long nr_segs,
+		       size_t count)
+{
+	unsigned long i;
+	size_t total = 0;
+
+	BUG_ON(direction != WRITE);
+	for (i = 0; i < nr_segs; i++) {
+		BUG_ON(iterlist[i].iter_type == ITER_ITERLIST);
+		BUG_ON(!iterlist[i].data_source);
+		total += iterlist[i].count;
+	}
+
+	/*
+	 * The code that walks the list trusts @count to find the end of the
+	 * data; if it promised more than the sub-iterators hold, that code
+	 * would step past the end of @iterlist.
+	 */
+	if (WARN_ON_ONCE(count > total))
+		count = total;
+
+	*iter = (struct iov_iter){
+		.iter_type	= ITER_ITERLIST,
+		.data_source	= true,
+		.count		= count,
+		.orig_count	= count,
+		.iterlist	= iterlist,
+		.nr_segs	= nr_segs,
+	};
+}
+EXPORT_SYMBOL(iov_iter_iterlist);
+
 static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
 				   unsigned len_mask)
 {
@@ -879,6 +1037,15 @@  unsigned long iov_iter_alignment(const struct iov_iter *i)
 	if (iov_iter_is_xarray(i))
 		return (i->xarray_start + i->iov_offset) | i->count;
 
+	if (iov_iter_is_iterlist(i)) {
+		unsigned long align = 0;
+		unsigned int j;
+
+		for (j = 0; j < i->nr_segs; j++)
+			align |= iov_iter_alignment(&i->iterlist[j]);
+		return align;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(iov_iter_alignment);
@@ -1078,6 +1245,18 @@  static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 	}
 	if (iov_iter_is_xarray(i))
 		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
+	/*
+	 * For an ITER_ITERLIST, take pages from the first sub-iterator that
+	 * still has data and charge whatever was obtained to the parent too.
+	 */
+	if (iov_iter_is_iterlist(i)) {
+		ssize_t size;
+
+		/* Skip sub-iterators that have nothing left in them. */
+		while (!i->iterlist->count) {
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		size = __iov_iter_get_pages_alloc(i->iterlist, pages, maxsize, maxpages,
+						  start, extraction_flags);
+		/*
+		 * A negative return is an error code, not a byte count, and
+		 * must not be folded into the remaining length.
+		 */
+		if (size > 0)
+			i->count -= size;
+		return size;
+	}
 	return -EFAULT;
 }
 
@@ -1126,6 +1305,31 @@  ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
 
+/*
+ * Copy up to @bytes from an ITER_ITERLIST into @addr, folding the copied data
+ * into *@csum.  Sub-iterators are consumed in order and empty ones are
+ * skipped.  Returns the number of bytes copied, which is short if a
+ * sub-iterator stops supplying data.
+ *
+ * Each sub-iterator is summed on its own and then merged with
+ * csum_block_add() at its byte offset within this call, so that a
+ * sub-iterator of odd length does not corrupt the sum of those after it.
+ */
+static size_t csum_and_copy_from_iterlist(void *addr, size_t bytes, __wsum *csum,
+					  struct iov_iter *i)
+{
+	size_t copied = 0, n;
+
+	while (copied < bytes && i->count && i->nr_segs) {
+		struct iov_iter *j = i->iterlist;
+		__wsum part = 0;
+
+		if (j->count == 0) {
+			i->iterlist++;
+			i->nr_segs--;
+			continue;
+		}
+
+		n = csum_and_copy_from_iter(addr, bytes - copied, &part, j);
+		if (n == 0)
+			break;
+		*csum = csum_block_add(*csum, part, copied);
+		addr += n;
+		copied += n;
+		i->count -= n;
+	}
+
+	return copied;
+}
+
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 			       struct iov_iter *i)
 {
@@ -1133,6 +1337,8 @@  size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 	sum = *csum;
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
+	if (iov_iter_is_iterlist(i))
+		return csum_and_copy_from_iterlist(addr, bytes, csum, i);
 
 	iterate_and_advance(i, bytes, base, len, off, ({
 		next = csum_and_copy_from_user(base, addr + off, len);
@@ -1236,6 +1442,21 @@  static int bvec_npages(const struct iov_iter *i, int maxpages)
 	return npages;
 }
 
+/*
+ * Count the pages spanned by an ITER_ITERLIST by summing iov_iter_npages()
+ * over its sub-iterators, capped at @maxpages.
+ *
+ * The walk stops when i->count bytes have been covered or the list runs out,
+ * whichever comes first.  i->count need not land exactly on a sub-iterator
+ * boundary (e.g. if the caller has shortened it), so the remaining size is
+ * tested with "> 0" and not for equality with zero.
+ */
+static int iterlist_npages(const struct iov_iter *i, int maxpages)
+{
+	ssize_t size = i->count;
+	const struct iov_iter *p = i->iterlist;
+	unsigned long seg;
+	int npages = 0;
+
+	for (seg = 0; seg < i->nr_segs && size > 0; seg++, p++) {
+		size -= p->count;
+		npages += iov_iter_npages(p, maxpages - npages);
+		if (unlikely(npages >= maxpages))
+			return maxpages;
+	}
+	return npages;
+}
+
 int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
 	if (unlikely(!i->count))
@@ -1255,6 +1476,8 @@  int iov_iter_npages(const struct iov_iter *i, int maxpages)
 		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
 		return min(npages, maxpages);
 	}
+	if (iov_iter_is_iterlist(i))
+		return iterlist_npages(i, maxpages);
 	return 0;
 }
 EXPORT_SYMBOL(iov_iter_npages);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 1d0f0f764e16..030a1fa5171b 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1073,11 +1073,13 @@  static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
 {
 	const struct kvec *head = xdr->head;
 	const struct kvec *tail = xdr->tail;
+	struct iov_iter iters[3];
+	struct bio_vec head_bv, tail_bv;
 	struct msghdr msg = {
-		.msg_flags	= MSG_SPLICE_PAGES,
+		.msg_flags	= 0, //MSG_SPLICE_PAGES,
 	};
-	void *m, *h, *t;
-	int ret, n = xdr_buf_pagecount(xdr), size;
+	void *m, *t;
+	int ret, n = 2, size;
 
 	*sentp = 0;
 	ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
@@ -1089,27 +1091,28 @@  static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
 	if (!m)
 		return -ENOMEM;
 
-	h = m + sizeof(marker);
-	t = h + head->iov_len;
+	memcpy(m, &marker, sizeof(marker));
+	if (head->iov_len)
+		memcpy(m + sizeof(marker), head->iov_base, head->iov_len);
+	bvec_set_virt(&head_bv, m, sizeof(marker) + head->iov_len);
+	iov_iter_bvec(&iters[0], ITER_SOURCE, &head_bv, 1,
+		      sizeof(marker) + head->iov_len);
 
-	bvec_set_virt(&xdr->bvec[-1], m, sizeof(marker) + head->iov_len);
-	n++;
+	iov_iter_bvec(&iters[1], ITER_SOURCE, xdr->bvec,
+		      xdr_buf_pagecount(xdr), xdr->page_len);
 
 	if (tail->iov_len) {
 		t = page_frag_alloc(NULL, tail->iov_len, GFP_KERNEL);
 		if (!t)
 			return -ENOMEM;
-		bvec_set_virt(&xdr->bvec[n],  t, tail->iov_len);
 		memcpy(t, tail->iov_base, tail->iov_len);
+		bvec_set_virt(&tail_bv,  t, tail->iov_len);
+		iov_iter_bvec(&iters[2], ITER_SOURCE, &tail_bv, 1, tail->iov_len);
 		n++;
 	}
 
-	memcpy(m, &marker, sizeof(marker));
-	if (head->iov_len)
-		memcpy(h, head->iov_base, head->iov_len);
-
 	size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len;
-	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 1, n, size);
+	iov_iter_iterlist(&msg.msg_iter, ITER_SOURCE, iters, n, size);
 
 	ret = sock_sendmsg(sock, &msg);
 	if (ret < 0)