[net-next,v6,06/18] net: Add a function to splice pages into an skbuff for MSG_SPLICE_PAGES

Message ID 20230411160902.4134381-7-dhowells@redhat.com (mailing list archive)
State New
Series splice, net: Replace sendpage with sendmsg(MSG_SPLICE_PAGES), part 1

Commit Message

David Howells April 11, 2023, 4:08 p.m. UTC
Add a function to handle MSG_SPLICE_PAGES being passed internally to
sendmsg().  Pages are spliced into the given socket buffer if possible and
copied in if not (ie. they're slab pages or have a zero refcount).
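
For illustration, a minimal sketch of how a caller is expected to drive
this path (assuming an already-connected 'sock', and 'page'/'offset'/'len'
describing the data to send; not taken verbatim from this series):

	struct bio_vec bvec;
	struct msghdr msg = {
		.msg_flags = MSG_SPLICE_PAGES,
	};
	int ret;

	/* Attach the page to a bvec-backed iterator and let sendmsg()
	 * splice (or copy) it into the socket.
	 */
	bvec_set_page(&bvec, page, len, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
	ret = sock_sendmsg(sock, &msg);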

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Eric Dumazet <edumazet@google.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: David Ahern <dsahern@kernel.org>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
---
 include/linux/skbuff.h |   3 ++
 net/core/skbuff.c      | 110 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)

Comments

Al Viro April 13, 2023, 4:41 a.m. UTC | #1
On Tue, Apr 11, 2023 at 05:08:50PM +0100, David Howells wrote:
> Add a function to handle MSG_SPLICE_PAGES being passed internally to
> sendmsg().  Pages are spliced into the given socket buffer if possible and
> copied in if not (ie. they're slab pages or have a zero refcount).

That "ie." would better be "e.g." - that condition is *not* enough to
tell the unsafe ones from the rest.

sendpage_ok() would be better off called "might_be_ok_to_sendpage()".
If it's false, we'd better not grab a reference to the page and expect the
sucker to stay safe until the reference is dropped.  However, AFAICS
it might return true on a page that is not safe in that respect.
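
For reference, the current implementation (include/linux/net.h) is just:

	static inline bool sendpage_ok(struct page *page)
	{
		return !PageSlab(page) && page_count(page) >= 1;
	}

i.e. "not a slab page, refcount at least one" - nothing about where the
page came from or who else holds references to it.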

What rules do you propose for sendpage users?  "Pass whatever page reference
you want, it'll do the right thing"?  Anything short of that would better
be documented as explicitly as possible...
David Howells April 13, 2023, 9:26 p.m. UTC | #2
Al Viro <viro@zeniv.linux.org.uk> wrote:

> On Tue, Apr 11, 2023 at 05:08:50PM +0100, David Howells wrote:
> > Add a function to handle MSG_SPLICE_PAGES being passed internally to
> > sendmsg().  Pages are spliced into the given socket buffer if possible and
> > copied in if not (ie. they're slab pages or have a zero refcount).
> 
> That "ie." would better be "e.g." - that condition is *not* enough to
> tell the unsafe ones from the rest.
> 
> sendpage_ok() would be better off called "might_be_ok_to_sendpage()".
> If it's false, we'd better not grab a reference to the page and expect the
> sucker to stay safe until the reference is dropped.  However, AFAICS
> it might return true on a page that is not safe in that respect.
> 
> What rules do you propose for sendpage users?  "Pass whatever page reference
> you want, it'll do the right thing"?  Anything short of that would better
> be documented as explicitly as possible...

Hmmm...  Fair point.  Is everything passed through splice guaranteed to be
safe, I wonder?  Probably not, because of vmsplice().  Does that mean the
existing callers of sendpage_ok() are also making invalid assumptions?

So there are the following 'classes' of memory that I can immediately think
of:

 - Zero page				Splice (no ref?)
 - Kernel core data			Splice
 - Module core data (vmalloc'd)		Splice
 - Supervisor stack			Copy
 - Slab objects				Copy
 - Page frags				Splice
 - Other skbuff frags			Splice
 - Arbitrary pages (eg. sunrpc xdr buf)	Splice (probably)
 - Ordinary pipe buffers		Splice
 - Spliced tmpfs			Splice
 - Spliced pagecache (file/block)	Splice
 - Spliced DIO file/block		Splice
 - Vmspliced mmap'd anon		Splice (with pin?)
 - Vmspliced MAP_SHARED pagecache	Splice (with pin?)
 - Vmspliced MAP_SHARED DAX		Splice?
 - Vmspliced MAP_SHARED MTD		Splice?
 - Vmspliced MAP_SHARED other device	Reject? (e.g. graphics card mem)
 - Vmspliced /dev/{mem,kmem}		Reject?
 
The question is how to tell that we're looking at something that must be
copied or rejected.  sendpage_ok() checks the PG_slab bit and the page
count, for example.
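
As a purely speculative sketch (not in this series), a classifier for the
table above might begin like this - though several of the classes, device
memory especially, have no reliable page-level test, which is rather the
problem:

	/* Speculative: classify a page for splice vs copy.  PageSlab()
	 * and page_count() are what sendpage_ok() already checks; the
	 * zero page is stable without holding a reference.
	 */
	static bool splice_page_ok(struct page *page)
	{
		if (is_zero_pfn(page_to_pfn(page)))
			return true;	/* zero page: splice without a ref */
		if (PageSlab(page))
			return false;	/* slab object: copy */
		if (page_count(page) < 1)
			return false;	/* no refcount to share: copy */
		return true;		/* assume refcountable: splice */
	}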

David
Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6e508274d2a5..add43417b798 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5070,5 +5070,8 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
 }
 #endif
 
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+			     ssize_t maxsize, gfp_t gfp);
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d96175f58ca4..c90fc48a63a5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6838,3 +6838,113 @@ nodefer:	__kfree_skb(skb);
 	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 }
+
+static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
+				 size_t offset, size_t len)
+{
+	const char *kaddr;
+	__wsum csum;
+
+	kaddr = kmap_local_page(page);
+	csum = csum_partial(kaddr + offset, len, 0);
+	kunmap_local(kaddr);
+	skb->csum = csum_block_add(skb->csum, csum, skb->len);
+}
+
+/**
+ * skb_splice_from_iter - Splice (or copy) pages to skbuff
+ * @skb: The buffer to add pages to
+ * @iter: Iterator representing the pages to be added
+ * @maxsize: Maximum amount of data to be added
+ * @gfp: Allocation flags
+ *
+ * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
+ * extracts pages from an iterator and adds them to the socket buffer if
+ * possible, copying them to fragments if not possible (such as if they're slab
+ * pages).
+ *
+ * Returns the amount of data spliced/copied or -EMSGSIZE if there's
+ * insufficient space in the buffer to transfer anything.
+ */
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+			     ssize_t maxsize, gfp_t gfp)
+{
+	struct page *pages[8], **ppages = pages;
+	unsigned int i;
+	ssize_t spliced = 0, ret = 0;
+	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+
+	while (iter->count > 0) {
+		ssize_t space, nr, len;
+		size_t off;
+
+		ret = -EMSGSIZE;
+		space = frag_limit - skb_shinfo(skb)->nr_frags;
+		if (space < 0)
+			break;
+
+		/* We might be able to coalesce without increasing nr_frags */
+		nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
+
+		len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
+		if (len <= 0) {
+			ret = len ?: -EIO;
+			break;
+		}
+
+		if (space == 0 &&
+		    !skb_can_coalesce(skb, skb_shinfo(skb)->nr_frags,
+				      pages[0], off)) {
+			iov_iter_revert(iter, len);
+			break;
+		}
+
+		i = 0;
+		do {
+			struct page *page = pages[i++];
+			size_t part = min_t(size_t, PAGE_SIZE - off, len);
+			bool put = false;
+
+			if (!sendpage_ok(page)) {
+				const void *p = kmap_local_page(page);
+				void *q;
+
+				q = page_frag_memdup(NULL, p + off, part, gfp,
+						     ULONG_MAX);
+				kunmap_local(p);
+				if (!q) {
+					iov_iter_revert(iter, len);
+					ret = -ENOMEM;
+					goto out;
+				}
+				page = virt_to_page(q);
+				off = offset_in_page(q);
+				put = true;
+			}
+
+			ret = skb_append_pagefrags(skb, page, off, part,
+						   frag_limit);
+			if (put)
+				put_page(page);
+			if (ret < 0) {
+				iov_iter_revert(iter, len);
+				goto out;
+			}
+
+			if (skb->ip_summed == CHECKSUM_NONE)
+				skb_splice_csum_page(skb, page, off, part);
+
+			off = 0;
+			spliced += part;
+			maxsize -= part;
+			len -= part;
+		} while (len > 0);
+
+		if (maxsize <= 0)
+			break;
+	}
+
+out:
+	skb_len_add(skb, spliced);
+	return spliced ?: ret;
+}
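
As a usage sketch (illustrative; the real protocol conversions follow in
later patches of this series), a protocol sendmsg() implementation that
has an skb to fill would call the helper roughly like so:

	if (msg->msg_flags & MSG_SPLICE_PAGES) {
		ssize_t copied;

		copied = skb_splice_from_iter(skb, &msg->msg_iter, size,
					      sk->sk_allocation);
		if (copied < 0)
			return copied;	/* e.g. -EMSGSIZE or -ENOMEM */

		/* skb->len has been updated by the helper; the socket's
		 * memory accounting is still the caller's job.
		 */
		sk_wmem_queued_add(sk, copied);
		sk_mem_charge(sk, copied);
	}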