Message ID | 72608c13553a1372e7f6f7a32eb53d5d4b23a1fc.1638282789.git.asml.silence@gmail.com (mailing list archive)
---|---
State | RFC |
Delegated to: | Netdev Maintainers |
Series | io_uring zerocopy send
Context | Check | Description |
---|---|---
netdev/tree_selection | success | Guessing tree name failed - patch did not apply, async |
On Tue, Nov 30, 2021 at 03:18:53PM +0000, Pavel Begunkov wrote:
> get_page() in __zerocopy_sg_from_bvec() and matching put_page()s are
> expensive. However, we can avoid them if the caller can guarantee that
> pages stay alive until the corresponding ubuf_info is released.
> In particular, it targets io_uring with fixed buffers following the
> described contract.
>
> Assuming that nobody yet uses bvec together with zerocopy, make all
> calls with bvec iterators follow this model.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  include/linux/skbuff.h | 10 +++++++++-
>  net/core/datagram.c    |  9 +++++++--
>  net/core/skbuff.c      | 16 +++++++++++++---
>  net/ipv4/ip_output.c   |  4 ++++
>  4 files changed, 33 insertions(+), 6 deletions(-)
[...]
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index b23db60ea6f9..b7b087815539 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -666,10 +666,14 @@ static void skb_release_data(struct sk_buff *skb)
>  			      &shinfo->dataref))
>  		goto exit;
>  
> -	skb_zcopy_clear(skb, true);
> +	if (!(shinfo->flags & SKBFL_MANAGED_FRAGS)) {
> +		for (i = 0; i < shinfo->nr_frags; i++)
> +			__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
> +	} else {
> +		shinfo->flags &= ~SKBFL_MANAGED_FRAGS;
> +	}
>  
> -	for (i = 0; i < shinfo->nr_frags; i++)
> -		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
> +	skb_zcopy_clear(skb, true);

It would be better if all of this logic was moved into the callback
under zcopy_clear. Note that this path is called for both TX and RX.
On 12/1/21 19:20, Jonathan Lemon wrote:
> On Tue, Nov 30, 2021 at 03:18:53PM +0000, Pavel Begunkov wrote:
>> get_page() in __zerocopy_sg_from_bvec() and matching put_page()s are
>> expensive. However, we can avoid them if the caller can guarantee that
>> pages stay alive until the corresponding ubuf_info is released.
>> In particular, it targets io_uring with fixed buffers following the
>> described contract.
>>
>> Assuming that nobody yet uses bvec together with zerocopy, make all
>> calls with bvec iterators follow this model.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
[...]
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index b23db60ea6f9..b7b087815539 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -666,10 +666,14 @@ static void skb_release_data(struct sk_buff *skb)
>>  			      &shinfo->dataref))
>>  		goto exit;
>>  
>> -	skb_zcopy_clear(skb, true);
>> +	if (!(shinfo->flags & SKBFL_MANAGED_FRAGS)) {
>> +		for (i = 0; i < shinfo->nr_frags; i++)
>> +			__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
>> +	} else {
>> +		shinfo->flags &= ~SKBFL_MANAGED_FRAGS;
>> +	}
>>  
>> -	for (i = 0; i < shinfo->nr_frags; i++)
>> -		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
>> +	skb_zcopy_clear(skb, true);
>
> It would be better if all of this logic was moved into the callback
> under zcopy_clear. Note that this path is called for both TX and RX.

Yeah, can be done and removes the extra if from the non-zc path. Thanks!
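[Editor's note] For illustration, a rough sketch of the direction Jonathan suggests — not the actual follow-up patch. The idea is that skb_zcopy_clear() runs first (as the hunk above already reorders it) and the ubuf_info callback it reaches takes over frag cleanup for managed skbs, so skb_release_data() keeps an unconditional unref loop and the common non-zerocopy path stays branch-free. The callback name and its internals are hypothetical:

/* Hypothetical ubuf_info callback for SKBFL_MANAGED_FRAGS skbs;
 * names are illustrative, not from this series.
 */
static void io_uring_zc_tx_callback(struct sk_buff *skb,
				    struct ubuf_info *uarg, bool success)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* The pages belong to the io_uring fixed buffer, which by
	 * contract outlives this uarg, so nothing is unreferenced
	 * here. Detach the frags so the generic unref loop in
	 * skb_release_data() sees none of them.
	 */
	if (shinfo->flags & SKBFL_MANAGED_FRAGS) {
		shinfo->nr_frags = 0;
		shinfo->flags &= ~SKBFL_MANAGED_FRAGS;
	}

	/* ... post the io_uring completion and drop the uarg ref ... */
}

With that, skb_release_data() only needs skb_zcopy_clear(skb, true) ahead of its existing frag loop and no managed/unmanaged branch of its own.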
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 750b7518d6e2..ebb12a7d386d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -461,6 +461,11 @@ enum {
 	SKBFL_PURE_ZEROCOPY = BIT(2),
 
 	SKBFL_DONT_ORPHAN = BIT(3),
+
+	/* page references are managed by the ubuf_info, so it's safe to
+	 * use frags only up until ubuf_info is released
+	 */
+	SKBFL_MANAGED_FRAGS = BIT(4),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
@@ -3154,7 +3159,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
  */
 static inline void skb_frag_unref(struct sk_buff *skb, int f)
 {
-	__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	if (!(shinfo->flags & SKBFL_MANAGED_FRAGS))
+		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
 }
 
 /**
diff --git a/net/core/datagram.c b/net/core/datagram.c
index e00f7e0a7a0a..5cf0672039d6 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -642,7 +642,6 @@ static int __zerocopy_sg_from_bvec(struct sock *sk, struct sk_buff *skb,
 		v = mp_bvec_iter_bvec(from->bvec, bi);
 		copied += v.bv_len;
 		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
-		get_page(v.bv_page);
 		skb_fill_page_desc(skb, frag++, v.bv_page, v.bv_offset, v.bv_len);
 		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
 	}
@@ -671,9 +670,15 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 			    struct iov_iter *from, size_t length)
 {
 	int frag = skb_shinfo(skb)->nr_frags;
+	bool managed = skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS;
 
-	if (iov_iter_is_bvec(from))
+	if (iov_iter_is_bvec(from) && (managed || frag == 0)) {
+		skb_shinfo(skb)->flags |= SKBFL_MANAGED_FRAGS;
 		return __zerocopy_sg_from_bvec(sk, skb, from, length);
+	}
+
+	if (managed)
+		return -EFAULT;
 
 	while (length && iov_iter_count(from)) {
 		struct page *pages[MAX_SKB_FRAGS];
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b23db60ea6f9..b7b087815539 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -666,10 +666,14 @@ static void skb_release_data(struct sk_buff *skb)
 			      &shinfo->dataref))
 		goto exit;
 
-	skb_zcopy_clear(skb, true);
+	if (!(shinfo->flags & SKBFL_MANAGED_FRAGS)) {
+		for (i = 0; i < shinfo->nr_frags; i++)
+			__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+	} else {
+		shinfo->flags &= ~SKBFL_MANAGED_FRAGS;
+	}
 
-	for (i = 0; i < shinfo->nr_frags; i++)
-		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+	skb_zcopy_clear(skb, true);
 
 	if (shinfo->frag_list)
 		kfree_skb_list(shinfo->frag_list);
@@ -1471,6 +1475,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 	/* skb frags release userspace buffers */
 	for (i = 0; i < num_frags; i++)
 		skb_frag_unref(skb, i);
+	skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAGS;
 
 	/* skb frags point to kernel buffers */
 	for (i = 0; i < new_frags - 1; i++) {
@@ -1597,6 +1602,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
 
 	skb_copy_header(n, skb);
+	skb_shinfo(n)->flags &= ~SKBFL_MANAGED_FRAGS;
 	return n;
 }
 EXPORT_SYMBOL(skb_copy);
@@ -1653,6 +1659,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
 			skb_frag_ref(skb, i);
 		}
 		skb_shinfo(n)->nr_frags = i;
+		skb_shinfo(n)->flags &= ~SKBFL_MANAGED_FRAGS;
 	}
 
 	if (skb_has_frag_list(skb)) {
@@ -1725,6 +1732,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 			refcount_inc(&skb_uarg(skb)->refcnt);
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 			skb_frag_ref(skb, i);
+		skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAGS;
 
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
@@ -3788,6 +3796,8 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
 	if (skb_can_coalesce(skb, i, page, offset)) {
 		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
 	} else if (i < MAX_SKB_FRAGS) {
+		if (skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS)
+			return -EMSGSIZE;
 		get_page(page);
 		skb_fill_page_desc(skb, i, page, offset, size);
 	} else {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f9aab355d283..e6adf96e5530 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1194,6 +1194,10 @@ static int __ip_append_data(struct sock *sk,
 			err = -ENOMEM;
 			if (!sk_page_frag_refill(sk, pfrag))
 				goto error;
+			if (skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS) {
+				err = -EMSGSIZE;
+				goto error;
+			}
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
get_page() in __zerocopy_sg_from_bvec() and matching put_page()s are
expensive. However, we can avoid them if the caller can guarantee that
pages stay alive until the corresponding ubuf_info is released. In
particular, it targets io_uring with fixed buffers following the
described contract.

Assuming that nobody yet uses bvec together with zerocopy, make all
calls with bvec iterators follow this model.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/skbuff.h | 10 +++++++++-
 net/core/datagram.c    |  9 +++++++--
 net/core/skbuff.c      | 16 +++++++++++++---
 net/ipv4/ip_output.c   |  4 ++++
 4 files changed, 33 insertions(+), 6 deletions(-)
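[Editor's note] For context, this is the userspace shape of that contract — a minimal sketch using the liburing zerocopy-send API that eventually shipped (io_uring_register_buffers() plus io_uring_prep_send_zc_fixed()); this RFC predates that interface, so take the exact calls as illustrative rather than what this series exposed:

#include <liburing.h>

/* Send one registered (fixed) buffer with zerocopy. The pages stay
 * pinned from io_uring_register_buffers() until unregistration, which
 * is what lets the kernel skip per-frag get_page()/put_page() on the
 * send path.
 */
static int send_fixed_zc(int sockfd, void *buf, size_t len)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	int i, ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret)
		return ret;

	/* Pin the pages once; all later sends can reference them */
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret)
		goto out;

	sqe = io_uring_get_sqe(&ring);
	/* the last argument selects registered buffer index 0 */
	io_uring_prep_send_zc_fixed(sqe, sockfd, buf, len, 0, 0, 0);
	io_uring_submit(&ring);

	/* A zerocopy send posts two CQEs: the send result (flagged
	 * IORING_CQE_F_MORE) and a notification (IORING_CQE_F_NOTIF)
	 * once the kernel no longer touches the pages; only then may
	 * the buffer be reused or unregistered.
	 */
	for (i = 0; i < 2; i++) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret)
			goto out;
		io_uring_cqe_seen(&ring, cqe);
	}
	ret = io_uring_unregister_buffers(&ring);
out:
	io_uring_queue_exit(&ring);
	return ret;
}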