Message ID | 20230329141354.516864-17-dhowells@redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | splice, net: Replace sendpage with sendmsg(MSG_SPLICE_PAGES) | expand |
David Howells wrote: > Make IP/UDP sendmsg() support MSG_SPLICE_PAGES. This causes pages to be > spliced from the source iterator if possible (the iterator must be > ITER_BVEC and the pages must be spliceable). > > This allows ->sendpage() to be replaced by something that can handle > multiple multipage folios in a single transaction. > > Signed-off-by: David Howells <dhowells@redhat.com> > cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com> > cc: "David S. Miller" <davem@davemloft.net> > cc: Eric Dumazet <edumazet@google.com> > cc: Jakub Kicinski <kuba@kernel.org> > cc: Paolo Abeni <pabeni@redhat.com> > cc: Jens Axboe <axboe@kernel.dk> > cc: Matthew Wilcox <willy@infradead.org> > cc: netdev@vger.kernel.org > --- > net/ipv4/ip_output.c | 85 +++++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 81 insertions(+), 4 deletions(-) > > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c A non-RFC version would require the same for ipv6, of course. > index 4e4e308c3230..07736da70eab 100644 > --- a/net/ipv4/ip_output.c > +++ b/net/ipv4/ip_output.c > @@ -973,11 +973,11 @@ static int __ip_append_data(struct sock *sk, > int hh_len; > int exthdrlen; > int mtu; > - int copy; > + ssize_t copy; > int err; > int offset = 0; > bool zc = false; > - unsigned int maxfraglen, fragheaderlen, maxnonfragsize; > + unsigned int maxfraglen, fragheaderlen, maxnonfragsize, xlength; Does x here stand for anything? 
> int csummode = CHECKSUM_NONE; > struct rtable *rt = (struct rtable *)cork->dst; > unsigned int wmem_alloc_delta = 0; > @@ -1017,6 +1017,7 @@ static int __ip_append_data(struct sock *sk, > (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) > csummode = CHECKSUM_PARTIAL; > > + xlength = length; > if ((flags & MSG_ZEROCOPY) && length) { > struct msghdr *msg = from; > > @@ -1047,6 +1048,14 @@ static int __ip_append_data(struct sock *sk, > skb_zcopy_set(skb, uarg, &extra_uref); > } > } > + } else if ((flags & MSG_SPLICE_PAGES) && length) { > + struct msghdr *msg = from; > + > + if (inet->hdrincl) > + return -EPERM; > + if (!(rt->dst.dev->features & NETIF_F_SG)) > + return -EOPNOTSUPP; > + xlength = transhdrlen; /* We need an empty buffer to attach stuff to */ > } > > cork->length += length; > @@ -1074,6 +1083,50 @@ static int __ip_append_data(struct sock *sk, > unsigned int alloclen, alloc_extra; > unsigned int pagedlen; > struct sk_buff *skb_prev; > + > + if (unlikely(flags & MSG_SPLICE_PAGES)) { > + skb_prev = skb; > + fraggap = skb_prev->len - maxfraglen; > + > + alloclen = fragheaderlen + hh_len + fraggap + 15; > + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); > + if (unlikely(!skb)) { > + err = -ENOBUFS; > + goto error; > + } > + > + /* > + * Fill in the control structures > + */ > + skb->ip_summed = CHECKSUM_NONE; > + skb->csum = 0; > + skb_reserve(skb, hh_len); > + > + /* > + * Find where to start putting bytes. > + */ > + skb_put(skb, fragheaderlen + fraggap); > + skb_reset_network_header(skb); > + skb->transport_header = (skb->network_header + > + fragheaderlen); > + if (fraggap) { > + skb->csum = skb_copy_and_csum_bits( > + skb_prev, maxfraglen, > + skb_transport_header(skb), > + fraggap); > + skb_prev->csum = csum_sub(skb_prev->csum, > + skb->csum); > + pskb_trim_unique(skb_prev, maxfraglen); > + } > + > + /* > + * Put the packet on the pending queue. 
> + */ > + __skb_queue_tail(&sk->sk_write_queue, skb); > + continue; > + } > + xlength = length; > + > alloc_new_skb: > skb_prev = skb; > if (skb_prev) > @@ -1085,7 +1138,7 @@ static int __ip_append_data(struct sock *sk, > * If remaining data exceeds the mtu, > * we know we need more fragment(s). > */ > - datalen = length + fraggap; > + datalen = xlength + fraggap; > if (datalen > mtu - fragheaderlen) > datalen = maxfraglen - fragheaderlen; > fraglen = datalen + fragheaderlen; > @@ -1099,7 +1152,7 @@ static int __ip_append_data(struct sock *sk, > * because we have no idea what fragment will be > * the last. > */ > - if (datalen == length + fraggap) > + if (datalen == xlength + fraggap) > alloc_extra += rt->dst.trailer_len; > > if ((flags & MSG_MORE) && > @@ -1206,6 +1259,30 @@ static int __ip_append_data(struct sock *sk, > err = -EFAULT; > goto error; > } > + } else if (flags & MSG_SPLICE_PAGES) { > + struct msghdr *msg = from; > + struct page *page = NULL, **pages = &page; > + size_t off; > + > + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, > + copy, 1, 0, &off); > + if (copy <= 0) { > + err = copy ?: -EIO; > + goto error; > + } > + > + err = skb_append_pagefrags(skb, page, off, copy); > + if (err < 0) > + goto error; > + > + if (skb->ip_summed == CHECKSUM_NONE) { > + __wsum csum; > + csum = csum_page(page, off, copy); > + skb->csum = csum_block_add(skb->csum, csum, skb->len); > + } > + > + skb_len_add(skb, copy); > + refcount_add(copy, &sk->sk_wmem_alloc); > } else if (!zc) { > int i = skb_shinfo(skb)->nr_frags; > > This does add a lot of code to two functions that are already unwieldy. It may be unavoidable, but if it can use helpers, that would be preferable.
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c > > A non-RFC version would require the same for ipv6, of course. I missed the fact that ipv6 had its own version of __ip_append_data() despite sharing tcp_sendmsg(). Could __ip_append_data() and __ip6_append_data() be shared? I guess that the v6_cork, the flowi6 and the ipcm6_cookie might prevent that. David
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > + unsigned int maxfraglen, fragheaderlen, maxnonfragsize, xlength; > > Does x here stand for anything? Yeah... "bad naming". How about if I call it initial_length? I'm trying to avoid allocating bufferage for the data. > This does add a lot of code to two functions that are already > unwieldy. It may be unavoidable, but it if can use helpers, that would > be preferable. Something like the attached? (This is on top of patches 16-17, but I would need to fold it in) David --- diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b38dbb2f9c3f..019ed9bb6745 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -956,6 +956,96 @@ csum_page(struct page *page, int offset, int copy) return csum; } +/* + * Allocate a packet for MSG_SPLICE_PAGES. + */ +static int __ip_splice_alloc(struct sock *sk, struct sk_buff **pskb, + unsigned int fragheaderlen, unsigned int maxfraglen, + unsigned int hh_len) +{ + struct sk_buff *skb_prev = *pskb, *skb; + unsigned int fraggap = skb_prev->len - maxfraglen; + unsigned int alloclen = fragheaderlen + hh_len + fraggap + 15; + + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); + if (unlikely(!skb)) + return -ENOBUFS; + + /* Fill in the control structures */ + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + skb_reserve(skb, hh_len); + + /* Find where to start putting bytes. */ + skb_put(skb, fragheaderlen + fraggap); + skb_reset_network_header(skb); + skb->transport_header = skb->network_header + fragheaderlen; + if (fraggap) { + skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen, + skb_transport_header(skb), + fraggap); + skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); + pskb_trim_unique(skb_prev, maxfraglen); + } + + /* Put the packet on the pending queue. */ + __skb_queue_tail(&sk->sk_write_queue, skb); + *pskb = skb; + return 0; +} + +/* + * Add (or copy) data pages for MSG_SPLICE_PAGES. 
+ */ +static int __ip_splice_pages(struct sock *sk, struct sk_buff *skb, + void *from, size_t *pcopy) +{ + struct msghdr *msg = from; + struct page *page = NULL, **pages = &page; + ssize_t copy = *pcopy; + size_t off; + bool put = false; + int err; + + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, copy, 1, 0, &off); + if (copy <= 0) + return copy ?: -EIO; + + if (!sendpage_ok(page)) { + const void *p = kmap_local_page(page); + void *q; + + q = page_frag_memdup(NULL, p + off, copy, + sk->sk_allocation, ULONG_MAX); + kunmap_local(p); + if (!q) + return -ENOMEM; + page = virt_to_page(q); + off = offset_in_page(q); + put = true; + } + + err = skb_append_pagefrags(skb, page, off, copy); + if (put) + put_page(page); + if (err < 0) { + iov_iter_revert(&msg->msg_iter, copy); + return err; + } + + if (skb->ip_summed == CHECKSUM_NONE) { + __wsum csum; + + csum = csum_page(page, off, copy); + skb->csum = csum_block_add(skb->csum, csum, skb->len); + } + + skb_len_add(skb, copy); + refcount_add(copy, &sk->sk_wmem_alloc); + *pcopy = copy; + return 0; +} + static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, @@ -977,7 +1067,7 @@ static int __ip_append_data(struct sock *sk, int err; int offset = 0; bool zc = false; - unsigned int maxfraglen, fragheaderlen, maxnonfragsize, xlength; + unsigned int maxfraglen, fragheaderlen, maxnonfragsize, initial_length; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; @@ -1017,7 +1107,7 @@ static int __ip_append_data(struct sock *sk, (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; - xlength = length; + initial_length = length; if ((flags & MSG_ZEROCOPY) && length) { struct msghdr *msg = from; @@ -1053,7 +1143,7 @@ static int __ip_append_data(struct sock *sk, return -EPERM; if (!(rt->dst.dev->features & NETIF_F_SG)) return -EOPNOTSUPP; - xlength = transhdrlen; /* We need an empty buffer to 
attach stuff to */ + initial_length = transhdrlen; /* We need an empty buffer to attach stuff to */ } cork->length += length; @@ -1083,47 +1173,13 @@ static int __ip_append_data(struct sock *sk, struct sk_buff *skb_prev; if (unlikely(flags & MSG_SPLICE_PAGES)) { - skb_prev = skb; - fraggap = skb_prev->len - maxfraglen; - - alloclen = fragheaderlen + hh_len + fraggap + 15; - skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); - if (unlikely(!skb)) { - err = -ENOBUFS; + err = __ip_splice_alloc(sk, &skb, fragheaderlen, + maxfraglen, hh_len); + if (err < 0) goto error; - } - - /* - * Fill in the control structures - */ - skb->ip_summed = CHECKSUM_NONE; - skb->csum = 0; - skb_reserve(skb, hh_len); - - /* - * Find where to start putting bytes. - */ - skb_put(skb, fragheaderlen + fraggap); - skb_reset_network_header(skb); - skb->transport_header = (skb->network_header + - fragheaderlen); - if (fraggap) { - skb->csum = skb_copy_and_csum_bits( - skb_prev, maxfraglen, - skb_transport_header(skb), - fraggap); - skb_prev->csum = csum_sub(skb_prev->csum, - skb->csum); - pskb_trim_unique(skb_prev, maxfraglen); - } - - /* - * Put the packet on the pending queue. - */ - __skb_queue_tail(&sk->sk_write_queue, skb); continue; } - xlength = length; + initial_length = length; alloc_new_skb: skb_prev = skb; @@ -1136,7 +1192,7 @@ static int __ip_append_data(struct sock *sk, * If remaining data exceeds the mtu, * we know we need more fragment(s). */ - datalen = xlength + fraggap; + datalen = initial_length + fraggap; if (datalen > mtu - fragheaderlen) datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; @@ -1150,7 +1206,7 @@ static int __ip_append_data(struct sock *sk, * because we have no idea what fragment will be * the last. 
*/ - if (datalen == xlength + fraggap) + if (datalen == initial_length + fraggap) alloc_extra += rt->dst.trailer_len; if ((flags & MSG_MORE) && @@ -1258,48 +1314,9 @@ static int __ip_append_data(struct sock *sk, goto error; } } else if (flags & MSG_SPLICE_PAGES) { - struct msghdr *msg = from; - struct page *page = NULL, **pages = &page; - size_t off; - bool put = false; - - copy = iov_iter_extract_pages(&msg->msg_iter, &pages, - copy, 1, 0, &off); - if (copy <= 0) { - err = copy ?: -EIO; - goto error; - } - - if (!sendpage_ok(page)) { - const void *p = kmap_local_page(page); - void *q; - - q = page_frag_memdup(NULL, p + off, copy, - sk->sk_allocation, ULONG_MAX); - kunmap_local(p); - if (!q) { - err = copy ?: -ENOMEM; - goto error; - } - page = virt_to_page(q); - off = offset_in_page(q); - put = true; - } - - err = skb_append_pagefrags(skb, page, off, copy); - if (put) - put_page(page); + err = __ip_splice_pages(sk, skb, from, &copy); if (err < 0) goto error; - - if (skb->ip_summed == CHECKSUM_NONE) { - __wsum csum; - csum = csum_page(page, off, copy); - skb->csum = csum_block_add(skb->csum, csum, skb->len); - } - - skb_len_add(skb, copy); - refcount_add(copy, &sk->sk_wmem_alloc); } else if (!zc) { int i = skb_shinfo(skb)->nr_frags;
David Howells wrote: > Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > > > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c > > > > A non-RFC version would require the same for ipv6, of course. > > I missed the fact that ipv6 had its own version of __ip_append_data() despite > sharing tcp_sendmsg(). Could __ip_append_data() and __ip6_append_data() be > shared? I guess that the v6_cork, the flowi6 and the ipcm6_cookie might > prevent that. We haven't been able to unify them before. As this series is complex enough as is, I would not attempt to include it. If it grows the code, maybe it'll be an incentive to take another look afterwards.
David Howells wrote: > Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > > > + unsigned int maxfraglen, fragheaderlen, maxnonfragsize, xlength; > > > > Does x here stand for anything? > > Yeah... "bad naming". How about if I call it initial_length? I'm trying to > avoid allocating bufferage for the data. That's more informative, thanks. Let me not bikeshed this further for now. > > This does add a lot of code to two functions that are already > > unwieldy. It may be unavoidable, but it if can use helpers, that would > > be preferable. > > Something like the attached? (This is on top of patches 16-17, but I would > need to fold it in) Yes exactly. I wasn't sure whether the inner loops required access to too many function scope variables to pull this off. But seems like it is doable. Great. > > David > --- > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c > index b38dbb2f9c3f..019ed9bb6745 100644 > --- a/net/ipv4/ip_output.c > +++ b/net/ipv4/ip_output.c > @@ -956,6 +956,96 @@ csum_page(struct page *page, int offset, int copy) > return csum; > } > > +/* > + * Allocate a packet for MSG_SPLICE_PAGES. > + */ > +static int __ip_splice_alloc(struct sock *sk, struct sk_buff **pskb, > + unsigned int fragheaderlen, unsigned int maxfraglen, > + unsigned int hh_len) > +{ > + struct sk_buff *skb_prev = *pskb, *skb; > + unsigned int fraggap = skb_prev->len - maxfraglen; > + unsigned int alloclen = fragheaderlen + hh_len + fraggap + 15; > + > + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); > + if (unlikely(!skb)) > + return -ENOBUFS; > + > + /* Fill in the control structures */ > + skb->ip_summed = CHECKSUM_NONE; > + skb->csum = 0; > + skb_reserve(skb, hh_len); > + > + /* Find where to start putting bytes. 
*/ > + skb_put(skb, fragheaderlen + fraggap); > + skb_reset_network_header(skb); > + skb->transport_header = skb->network_header + fragheaderlen; > + if (fraggap) { > + skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen, > + skb_transport_header(skb), > + fraggap); > + skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); > + pskb_trim_unique(skb_prev, maxfraglen); > + } > + > + /* Put the packet on the pending queue. */ > + __skb_queue_tail(&sk->sk_write_queue, skb); > + *pskb = skb; > + return 0; > +} > + > +/* > + * Add (or copy) data pages for MSG_SPLICE_PAGES. > + */ > +static int __ip_splice_pages(struct sock *sk, struct sk_buff *skb, > + void *from, size_t *pcopy) > +{ > + struct msghdr *msg = from; > + struct page *page = NULL, **pages = &page; > + ssize_t copy = *pcopy; > + size_t off; > + bool put = false; > + int err; > + > + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, copy, 1, 0, &off); > + if (copy <= 0) > + return copy ?: -EIO; > + > + if (!sendpage_ok(page)) { > + const void *p = kmap_local_page(page); > + void *q; > + > + q = page_frag_memdup(NULL, p + off, copy, > + sk->sk_allocation, ULONG_MAX); > + kunmap_local(p); > + if (!q) > + return -ENOMEM; > + page = virt_to_page(q); > + off = offset_in_page(q); > + put = true; > + } > + > + err = skb_append_pagefrags(skb, page, off, copy); > + if (put) > + put_page(page); > + if (err < 0) { > + iov_iter_revert(&msg->msg_iter, copy); > + return err; > + } > + > + if (skb->ip_summed == CHECKSUM_NONE) { > + __wsum csum; > + > + csum = csum_page(page, off, copy); > + skb->csum = csum_block_add(skb->csum, csum, skb->len); > + } > + > + skb_len_add(skb, copy); > + refcount_add(copy, &sk->sk_wmem_alloc); > + *pcopy = copy; > + return 0; > +} > + > static int __ip_append_data(struct sock *sk, > struct flowi4 *fl4, > struct sk_buff_head *queue, > @@ -977,7 +1067,7 @@ static int __ip_append_data(struct sock *sk, > int err; > int offset = 0; > bool zc = false; > - unsigned int maxfraglen, 
fragheaderlen, maxnonfragsize, xlength; > + unsigned int maxfraglen, fragheaderlen, maxnonfragsize, initial_length; > int csummode = CHECKSUM_NONE; > struct rtable *rt = (struct rtable *)cork->dst; > unsigned int wmem_alloc_delta = 0; > @@ -1017,7 +1107,7 @@ static int __ip_append_data(struct sock *sk, > (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) > csummode = CHECKSUM_PARTIAL; > > - xlength = length; > + initial_length = length; > if ((flags & MSG_ZEROCOPY) && length) { > struct msghdr *msg = from; > > @@ -1053,7 +1143,7 @@ static int __ip_append_data(struct sock *sk, > return -EPERM; > if (!(rt->dst.dev->features & NETIF_F_SG)) > return -EOPNOTSUPP; > - xlength = transhdrlen; /* We need an empty buffer to attach stuff to */ > + initial_length = transhdrlen; /* We need an empty buffer to attach stuff to */ > } > > cork->length += length; > @@ -1083,47 +1173,13 @@ static int __ip_append_data(struct sock *sk, > struct sk_buff *skb_prev; > > if (unlikely(flags & MSG_SPLICE_PAGES)) { > - skb_prev = skb; > - fraggap = skb_prev->len - maxfraglen; > - > - alloclen = fragheaderlen + hh_len + fraggap + 15; > - skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); > - if (unlikely(!skb)) { > - err = -ENOBUFS; > + err = __ip_splice_alloc(sk, &skb, fragheaderlen, > + maxfraglen, hh_len); > + if (err < 0) > goto error; > - } > - > - /* > - * Fill in the control structures > - */ > - skb->ip_summed = CHECKSUM_NONE; > - skb->csum = 0; > - skb_reserve(skb, hh_len); > - > - /* > - * Find where to start putting bytes. 
> - */ > - skb_put(skb, fragheaderlen + fraggap); > - skb_reset_network_header(skb); > - skb->transport_header = (skb->network_header + > - fragheaderlen); > - if (fraggap) { > - skb->csum = skb_copy_and_csum_bits( > - skb_prev, maxfraglen, > - skb_transport_header(skb), > - fraggap); > - skb_prev->csum = csum_sub(skb_prev->csum, > - skb->csum); > - pskb_trim_unique(skb_prev, maxfraglen); > - } > - > - /* > - * Put the packet on the pending queue. > - */ > - __skb_queue_tail(&sk->sk_write_queue, skb); > continue; > } > - xlength = length; > + initial_length = length; > > alloc_new_skb: > skb_prev = skb; > @@ -1136,7 +1192,7 @@ static int __ip_append_data(struct sock *sk, > * If remaining data exceeds the mtu, > * we know we need more fragment(s). > */ > - datalen = xlength + fraggap; > + datalen = initial_length + fraggap; > if (datalen > mtu - fragheaderlen) > datalen = maxfraglen - fragheaderlen; > fraglen = datalen + fragheaderlen; > @@ -1150,7 +1206,7 @@ static int __ip_append_data(struct sock *sk, > * because we have no idea what fragment will be > * the last. 
> */ > - if (datalen == xlength + fraggap) > + if (datalen == initial_length + fraggap) > alloc_extra += rt->dst.trailer_len; > > if ((flags & MSG_MORE) && > @@ -1258,48 +1314,9 @@ static int __ip_append_data(struct sock *sk, > goto error; > } > } else if (flags & MSG_SPLICE_PAGES) { > - struct msghdr *msg = from; > - struct page *page = NULL, **pages = &page; > - size_t off; > - bool put = false; > - > - copy = iov_iter_extract_pages(&msg->msg_iter, &pages, > - copy, 1, 0, &off); > - if (copy <= 0) { > - err = copy ?: -EIO; > - goto error; > - } > - > - if (!sendpage_ok(page)) { > - const void *p = kmap_local_page(page); > - void *q; > - > - q = page_frag_memdup(NULL, p + off, copy, > - sk->sk_allocation, ULONG_MAX); > - kunmap_local(p); > - if (!q) { > - err = copy ?: -ENOMEM; > - goto error; > - } > - page = virt_to_page(q); > - off = offset_in_page(q); > - put = true; > - } > - > - err = skb_append_pagefrags(skb, page, off, copy); > - if (put) > - put_page(page); > + err = __ip_splice_pages(sk, skb, from, &copy); > if (err < 0) > goto error; > - > - if (skb->ip_summed == CHECKSUM_NONE) { > - __wsum csum; > - csum = csum_page(page, off, copy); > - skb->csum = csum_block_add(skb->csum, csum, skb->len); > - } > - > - skb_len_add(skb, copy); > - refcount_add(copy, &sk->sk_wmem_alloc); > } else if (!zc) { > int i = skb_shinfo(skb)->nr_frags; >
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > Yes exactly. I wasn't sure whether the inner loops required access to > too many function scope variables to pull this off. But seems like it > is doable. Great. The same helpers can be used for both ipv4 and ipv6 as it turns out. David
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4e4e308c3230..07736da70eab 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -973,11 +973,11 @@ static int __ip_append_data(struct sock *sk, int hh_len; int exthdrlen; int mtu; - int copy; + ssize_t copy; int err; int offset = 0; bool zc = false; - unsigned int maxfraglen, fragheaderlen, maxnonfragsize; + unsigned int maxfraglen, fragheaderlen, maxnonfragsize, xlength; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; @@ -1017,6 +1017,7 @@ static int __ip_append_data(struct sock *sk, (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; + xlength = length; if ((flags & MSG_ZEROCOPY) && length) { struct msghdr *msg = from; @@ -1047,6 +1048,14 @@ static int __ip_append_data(struct sock *sk, skb_zcopy_set(skb, uarg, &extra_uref); } } + } else if ((flags & MSG_SPLICE_PAGES) && length) { + struct msghdr *msg = from; + + if (inet->hdrincl) + return -EPERM; + if (!(rt->dst.dev->features & NETIF_F_SG)) + return -EOPNOTSUPP; + xlength = transhdrlen; /* We need an empty buffer to attach stuff to */ } cork->length += length; @@ -1074,6 +1083,50 @@ static int __ip_append_data(struct sock *sk, unsigned int alloclen, alloc_extra; unsigned int pagedlen; struct sk_buff *skb_prev; + + if (unlikely(flags & MSG_SPLICE_PAGES)) { + skb_prev = skb; + fraggap = skb_prev->len - maxfraglen; + + alloclen = fragheaderlen + hh_len + fraggap + 15; + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); + if (unlikely(!skb)) { + err = -ENOBUFS; + goto error; + } + + /* + * Fill in the control structures + */ + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + skb_reserve(skb, hh_len); + + /* + * Find where to start putting bytes. 
+ */ + skb_put(skb, fragheaderlen + fraggap); + skb_reset_network_header(skb); + skb->transport_header = (skb->network_header + + fragheaderlen); + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + skb_transport_header(skb), + fraggap); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + pskb_trim_unique(skb_prev, maxfraglen); + } + + /* + * Put the packet on the pending queue. + */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + xlength = length; + alloc_new_skb: skb_prev = skb; if (skb_prev) @@ -1085,7 +1138,7 @@ static int __ip_append_data(struct sock *sk, * If remaining data exceeds the mtu, * we know we need more fragment(s). */ - datalen = length + fraggap; + datalen = xlength + fraggap; if (datalen > mtu - fragheaderlen) datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; @@ -1099,7 +1152,7 @@ static int __ip_append_data(struct sock *sk, * because we have no idea what fragment will be * the last. */ - if (datalen == length + fraggap) + if (datalen == xlength + fraggap) alloc_extra += rt->dst.trailer_len; if ((flags & MSG_MORE) && @@ -1206,6 +1259,30 @@ static int __ip_append_data(struct sock *sk, err = -EFAULT; goto error; } + } else if (flags & MSG_SPLICE_PAGES) { + struct msghdr *msg = from; + struct page *page = NULL, **pages = &page; + size_t off; + + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, + copy, 1, 0, &off); + if (copy <= 0) { + err = copy ?: -EIO; + goto error; + } + + err = skb_append_pagefrags(skb, page, off, copy); + if (err < 0) + goto error; + + if (skb->ip_summed == CHECKSUM_NONE) { + __wsum csum; + csum = csum_page(page, off, copy); + skb->csum = csum_block_add(skb->csum, csum, skb->len); + } + + skb_len_add(skb, copy); + refcount_add(copy, &sk->sk_wmem_alloc); } else if (!zc) { int i = skb_shinfo(skb)->nr_frags;
Make IP/UDP sendmsg() support MSG_SPLICE_PAGES. This causes pages to be spliced from the source iterator if possible (the iterator must be ITER_BVEC and the pages must be spliceable). This allows ->sendpage() to be replaced by something that can handle multiple multipage folios in a single transaction. Signed-off-by: David Howells <dhowells@redhat.com> cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com> cc: "David S. Miller" <davem@davemloft.net> cc: Eric Dumazet <edumazet@google.com> cc: Jakub Kicinski <kuba@kernel.org> cc: Paolo Abeni <pabeni@redhat.com> cc: Jens Axboe <axboe@kernel.dk> cc: Matthew Wilcox <willy@infradead.org> cc: netdev@vger.kernel.org --- net/ipv4/ip_output.c | 85 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 4 deletions(-)