Message ID | 20230406094245.3633290-6-dhowells@redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | splice, net: Replace sendpage with sendmsg(MSG_SPLICE_PAGES), part 1 | expand |
On Thu, Apr 6, 2023 at 5:43 AM David Howells <dhowells@redhat.com> wrote: > > Make TCP's sendmsg() support MSG_SPLICE_PAGES. This causes pages to be > spliced from the source iterator. > > This allows ->sendpage() to be replaced by something that can handle > multiple multipage folios in a single transaction. > > Signed-off-by: David Howells <dhowells@redhat.com> > cc: Eric Dumazet <edumazet@google.com> > cc: "David S. Miller" <davem@davemloft.net> > cc: David Ahern <dsahern@kernel.org> > cc: Jakub Kicinski <kuba@kernel.org> > cc: Paolo Abeni <pabeni@redhat.com> > cc: Jens Axboe <axboe@kernel.dk> > cc: Matthew Wilcox <willy@infradead.org> > cc: netdev@vger.kernel.org > --- > net/ipv4/tcp.c | 67 ++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 60 insertions(+), 7 deletions(-) > > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index fd68d49490f2..510bacc7ce7b 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -1221,7 +1221,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > int flags, err, copied = 0; > int mss_now = 0, size_goal, copied_syn = 0; > int process_backlog = 0; > - bool zc = false; > + int zc = 0; > long timeo; > > flags = msg->msg_flags; > @@ -1232,17 +1232,22 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > if (msg->msg_ubuf) { > uarg = msg->msg_ubuf; > net_zcopy_get(uarg); > - zc = sk->sk_route_caps & NETIF_F_SG; > + if (sk->sk_route_caps & NETIF_F_SG) > + zc = 1; > } else if (sock_flag(sk, SOCK_ZEROCOPY)) { > uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); > if (!uarg) { > err = -ENOBUFS; > goto out_err; > } > - zc = sk->sk_route_caps & NETIF_F_SG; > - if (!zc) > + if (sk->sk_route_caps & NETIF_F_SG) > + zc = 1; > + else > uarg_to_msgzc(uarg)->zerocopy = 0; > } > + } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { > + if (sk->sk_route_caps & NETIF_F_SG) > + zc = 2; > } > > if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && > @@ 
-1305,7 +1310,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > goto do_error; > > while (msg_data_left(msg)) { > - int copy = 0; > + ssize_t copy = 0; > > skb = tcp_write_queue_tail(sk); > if (skb) > @@ -1346,7 +1351,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > if (copy > msg_data_left(msg)) > copy = msg_data_left(msg); > > - if (!zc) { > + if (zc == 0) { > bool merge = true; > int i = skb_shinfo(skb)->nr_frags; > struct page_frag *pfrag = sk_page_frag(sk); > @@ -1391,7 +1396,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > page_ref_inc(pfrag->page); > } > pfrag->offset += copy; > - } else { > + } else if (zc == 1) { Instead of 1 and 2, MSG_ZEROCOPY and MSG_SPLICE_PAGES make the code more self-documenting. > /* First append to a fragless skb builds initial > * pure zerocopy skb > */ > @@ -1412,6 +1417,54 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > if (err < 0) > goto do_error; > copy = err; > + } else if (zc == 2) { > + /* Splice in data. 
*/ > + struct page *page = NULL, **pages = &page; > + size_t off = 0, part; > + bool can_coalesce; > + int i = skb_shinfo(skb)->nr_frags; > + > + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, > + copy, 1, 0, &off); > + if (copy <= 0) { > + err = copy ?: -EIO; > + goto do_error; > + } > + > + can_coalesce = skb_can_coalesce(skb, i, page, off); > + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { > + tcp_mark_push(tp, skb); > + iov_iter_revert(&msg->msg_iter, copy); > + goto new_segment; > + } > + if (tcp_downgrade_zcopy_pure(sk, skb)) { > + iov_iter_revert(&msg->msg_iter, copy); > + goto wait_for_space; > + } > + > + part = tcp_wmem_schedule(sk, copy); > + iov_iter_revert(&msg->msg_iter, copy - part); > + if (!part) > + goto wait_for_space; > + copy = part; > + > + if (can_coalesce) { > + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); > + } else { > + get_page(page); > + skb_fill_page_desc_noacc(skb, i, page, off, copy); > + } > + page = NULL; > + > + if (!(flags & MSG_NO_SHARED_FRAGS)) > + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; > + > + skb->len += copy; > + skb->data_len += copy; > + skb->truesize += copy; > + sk_wmem_queued_add(sk, copy); > + sk_mem_charge(sk, copy); > + Similar to udp, perhaps in a helper?
On 4/6/23 7:56 PM, Willem de Bruijn wrote: > On Thu, Apr 6, 2023 at 5:43 AM David Howells <dhowells@redhat.com> wrote: >> >> Make TCP's sendmsg() support MSG_SPLICE_PAGES. This causes pages to be >> spliced from the source iterator. >> >> This allows ->sendpage() to be replaced by something that can handle >> multiple multipage folios in a single transaction. >> >> Signed-off-by: David Howells <dhowells@redhat.com> >> cc: Eric Dumazet <edumazet@google.com> >> cc: "David S. Miller" <davem@davemloft.net> >> cc: David Ahern <dsahern@kernel.org> >> cc: Jakub Kicinski <kuba@kernel.org> >> cc: Paolo Abeni <pabeni@redhat.com> >> cc: Jens Axboe <axboe@kernel.dk> >> cc: Matthew Wilcox <willy@infradead.org> >> cc: netdev@vger.kernel.org >> --- >> net/ipv4/tcp.c | 67 ++++++++++++++++++++++++++++++++++++++++++++------ >> 1 file changed, 60 insertions(+), 7 deletions(-) >> >> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c >> index fd68d49490f2..510bacc7ce7b 100644 >> --- a/net/ipv4/tcp.c >> +++ b/net/ipv4/tcp.c >> @@ -1221,7 +1221,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> int flags, err, copied = 0; >> int mss_now = 0, size_goal, copied_syn = 0; >> int process_backlog = 0; >> - bool zc = false; >> + int zc = 0; >> long timeo; >> >> flags = msg->msg_flags; >> @@ -1232,17 +1232,22 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> if (msg->msg_ubuf) { >> uarg = msg->msg_ubuf; >> net_zcopy_get(uarg); >> - zc = sk->sk_route_caps & NETIF_F_SG; >> + if (sk->sk_route_caps & NETIF_F_SG) >> + zc = 1; >> } else if (sock_flag(sk, SOCK_ZEROCOPY)) { >> uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); >> if (!uarg) { >> err = -ENOBUFS; >> goto out_err; >> } >> - zc = sk->sk_route_caps & NETIF_F_SG; >> - if (!zc) >> + if (sk->sk_route_caps & NETIF_F_SG) >> + zc = 1; >> + else >> uarg_to_msgzc(uarg)->zerocopy = 0; >> } >> + } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { >> + if (sk->sk_route_caps & 
NETIF_F_SG) >> + zc = 2; >> } >> >> if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && >> @@ -1305,7 +1310,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> goto do_error; >> >> while (msg_data_left(msg)) { >> - int copy = 0; >> + ssize_t copy = 0; >> >> skb = tcp_write_queue_tail(sk); >> if (skb) >> @@ -1346,7 +1351,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> if (copy > msg_data_left(msg)) >> copy = msg_data_left(msg); >> >> - if (!zc) { >> + if (zc == 0) { >> bool merge = true; >> int i = skb_shinfo(skb)->nr_frags; >> struct page_frag *pfrag = sk_page_frag(sk); >> @@ -1391,7 +1396,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> page_ref_inc(pfrag->page); >> } >> pfrag->offset += copy; >> - } else { >> + } else if (zc == 1) { > > Instead of 1 and 2, MSG_ZEROCOPY and MSG_SPLICE_PAGES make the code > more self-documenting. > >> /* First append to a fragless skb builds initial >> * pure zerocopy skb >> */ >> @@ -1412,6 +1417,54 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> if (err < 0) >> goto do_error; >> copy = err; >> + } else if (zc == 2) { >> + /* Splice in data. 
*/ >> + struct page *page = NULL, **pages = &page; >> + size_t off = 0, part; >> + bool can_coalesce; >> + int i = skb_shinfo(skb)->nr_frags; >> + >> + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, >> + copy, 1, 0, &off); >> + if (copy <= 0) { >> + err = copy ?: -EIO; >> + goto do_error; >> + } >> + >> + can_coalesce = skb_can_coalesce(skb, i, page, off); >> + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { >> + tcp_mark_push(tp, skb); >> + iov_iter_revert(&msg->msg_iter, copy); >> + goto new_segment; >> + } >> + if (tcp_downgrade_zcopy_pure(sk, skb)) { >> + iov_iter_revert(&msg->msg_iter, copy); >> + goto wait_for_space; >> + } >> + >> + part = tcp_wmem_schedule(sk, copy); >> + iov_iter_revert(&msg->msg_iter, copy - part); >> + if (!part) >> + goto wait_for_space; >> + copy = part; >> + >> + if (can_coalesce) { >> + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); >> + } else { >> + get_page(page); >> + skb_fill_page_desc_noacc(skb, i, page, off, copy); >> + } >> + page = NULL; >> + >> + if (!(flags & MSG_NO_SHARED_FRAGS)) >> + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; >> + >> + skb->len += copy; >> + skb->data_len += copy; >> + skb->truesize += copy; >> + sk_wmem_queued_add(sk, copy); >> + sk_mem_charge(sk, copy); >> + > > Similar to udp, perhaps in a helper? tcp_sendmsg_locked is already more than 250 lines long and these 47 lines are compounding it. I was staring at this code 2 weeks ago wondering if it can be split or refactored to reduce the complexity.
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fd68d49490f2..510bacc7ce7b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1221,7 +1221,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; - bool zc = false; + int zc = 0; long timeo; flags = msg->msg_flags; @@ -1232,17 +1232,22 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (msg->msg_ubuf) { uarg = msg->msg_ubuf; net_zcopy_get(uarg); - zc = sk->sk_route_caps & NETIF_F_SG; + if (sk->sk_route_caps & NETIF_F_SG) + zc = 1; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; } - zc = sk->sk_route_caps & NETIF_F_SG; - if (!zc) + if (sk->sk_route_caps & NETIF_F_SG) + zc = 1; + else uarg_to_msgzc(uarg)->zerocopy = 0; } + } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { + if (sk->sk_route_caps & NETIF_F_SG) + zc = 2; } if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && @@ -1305,7 +1310,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto do_error; while (msg_data_left(msg)) { - int copy = 0; + ssize_t copy = 0; skb = tcp_write_queue_tail(sk); if (skb) @@ -1346,7 +1351,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (copy > msg_data_left(msg)) copy = msg_data_left(msg); - if (!zc) { + if (zc == 0) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1391,7 +1396,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) page_ref_inc(pfrag->page); } pfrag->offset += copy; - } else { + } else if (zc == 1) { /* First append to a fragless skb builds initial * pure zerocopy skb */ @@ -1412,6 +1417,54 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (err < 0) goto do_error; copy = err; + } 
else if (zc == 2) { + /* Splice in data. */ + struct page *page = NULL, **pages = &page; + size_t off = 0, part; + bool can_coalesce; + int i = skb_shinfo(skb)->nr_frags; + + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, + copy, 1, 0, &off); + if (copy <= 0) { + err = copy ?: -EIO; + goto do_error; + } + + can_coalesce = skb_can_coalesce(skb, i, page, off); + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { + tcp_mark_push(tp, skb); + iov_iter_revert(&msg->msg_iter, copy); + goto new_segment; + } + if (tcp_downgrade_zcopy_pure(sk, skb)) { + iov_iter_revert(&msg->msg_iter, copy); + goto wait_for_space; + } + + part = tcp_wmem_schedule(sk, copy); + iov_iter_revert(&msg->msg_iter, copy - part); + if (!part) + goto wait_for_space; + copy = part; + + if (can_coalesce) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + get_page(page); + skb_fill_page_desc_noacc(skb, i, page, off, copy); + } + page = NULL; + + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk_wmem_queued_add(sk, copy); + sk_mem_charge(sk, copy); + } if (!copied)
Make TCP's sendmsg() support MSG_SPLICE_PAGES. This causes pages to be spliced from the source iterator.

This allows ->sendpage() to be replaced by something that can handle multiple multipage folios in a single transaction.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Eric Dumazet <edumazet@google.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: David Ahern <dsahern@kernel.org>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
---
 net/ipv4/tcp.c | 67 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 7 deletions(-)