[RFC,05/12] net: optimise page get/free for bvec zc

Message ID 72608c13553a1372e7f6f7a32eb53d5d4b23a1fc.1638282789.git.asml.silence@gmail.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series io_uring zerocopy send

Checks

Context:     netdev/tree_selection
Check:       success
Description: Guessing tree name failed - patch did not apply, async

Commit Message

Pavel Begunkov Nov. 30, 2021, 3:18 p.m. UTC
get_page() in __zerocopy_sg_from_bvec() and the matching put_page()s are
expensive. However, we can avoid them if the caller can guarantee that the
pages stay alive until the corresponding ubuf_info is released.
In particular, this targets io_uring with fixed buffers, which follows
the described contract.

Assuming that nobody yet uses bvec together with zerocopy, make all
calls with bvec iterators follow this model.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/skbuff.h | 10 +++++++++-
 net/core/datagram.c    |  9 +++++++--
 net/core/skbuff.c      | 16 +++++++++++++---
 net/ipv4/ip_output.c   |  4 ++++
 4 files changed, 33 insertions(+), 6 deletions(-)
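
To illustrate the contract described above (a minimal sketch with hypothetical
names, not code from this series): the caller pins its buffer pages once,
points a ubuf_info at them, and unpins only from the ubuf_info completion
callback, so no per-frag get_page()/put_page() is needed while skbs still
reference the pages. The setup side (pinning the pages and wiring up
uarg.callback) is omitted.

#include <linux/skbuff.h>
#include <linux/bvec.h>
#include <linux/mm.h>
#include <linux/slab.h>

/* Hypothetical owner of a fixed zerocopy buffer. */
struct fixed_zc_buf {
	struct ubuf_info uarg;	/* completion notifier handed to the stack */
	struct bio_vec *bvec;	/* bvec over pages pinned by the caller */
	unsigned int nr_pages;
};

/* Called once the stack has dropped every skb that used the buffer;
 * only at this point may the pinned pages be released.
 */
static void fixed_zc_buf_done(struct sk_buff *skb, struct ubuf_info *uarg,
			      bool success)
{
	struct fixed_zc_buf *buf = container_of(uarg, struct fixed_zc_buf, uarg);
	unsigned int i;

	for (i = 0; i < buf->nr_pages; i++)
		unpin_user_page(buf->bvec[i].bv_page);
	kfree(buf->bvec);
}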

Comments

Jonathan Lemon Dec. 1, 2021, 7:20 p.m. UTC | #1
On Tue, Nov 30, 2021 at 03:18:53PM +0000, Pavel Begunkov wrote:
> get_page() in __zerocopy_sg_from_bvec() and the matching put_page()s are
> expensive. However, we can avoid them if the caller can guarantee that the
> pages stay alive until the corresponding ubuf_info is released.
> In particular, this targets io_uring with fixed buffers, which follows
> the described contract.
> 
> Assuming that nobody yet uses bvec together with zerocopy, make all
> calls with bvec iterators follow this model.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  include/linux/skbuff.h | 10 +++++++++-
>  net/core/datagram.c    |  9 +++++++--
>  net/core/skbuff.c      | 16 +++++++++++++---
>  net/ipv4/ip_output.c   |  4 ++++
>  4 files changed, 33 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 750b7518d6e2..ebb12a7d386d 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -461,6 +461,11 @@ enum {
>  	SKBFL_PURE_ZEROCOPY = BIT(2),
>  
>  	SKBFL_DONT_ORPHAN = BIT(3),
> +
> +	/* page references are managed by the ubuf_info, so it's safe to
> +	 * use frags only up until ubuf_info is released
> +	 */
> +	SKBFL_MANAGED_FRAGS = BIT(4),
>  };
>  
>  #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
> @@ -3154,7 +3159,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
>   */
>  static inline void skb_frag_unref(struct sk_buff *skb, int f)
>  {
> -	__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
> +	struct skb_shared_info *shinfo = skb_shinfo(skb);
> +
> +	if (!(shinfo->flags & SKBFL_MANAGED_FRAGS))
> +		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
>  }
>  
>  /**
> diff --git a/net/core/datagram.c b/net/core/datagram.c
> index e00f7e0a7a0a..5cf0672039d6 100644
> --- a/net/core/datagram.c
> +++ b/net/core/datagram.c
> @@ -642,7 +642,6 @@ static int __zerocopy_sg_from_bvec(struct sock *sk, struct sk_buff *skb,
>  		v = mp_bvec_iter_bvec(from->bvec, bi);
>  		copied += v.bv_len;
>  		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
> -		get_page(v.bv_page);
>  		skb_fill_page_desc(skb, frag++, v.bv_page, v.bv_offset, v.bv_len);
>  		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
>  	}
> @@ -671,9 +670,15 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
>  			    struct iov_iter *from, size_t length)
>  {
>  	int frag = skb_shinfo(skb)->nr_frags;
> +	bool managed = skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS;
>  
> -	if (iov_iter_is_bvec(from))
> +	if (iov_iter_is_bvec(from) && (managed || frag == 0)) {
> +		skb_shinfo(skb)->flags |= SKBFL_MANAGED_FRAGS;
>  		return __zerocopy_sg_from_bvec(sk, skb, from, length);
> +	}
> +
> +	if (managed)
> +		return -EFAULT;
>  
>  	while (length && iov_iter_count(from)) {
>  		struct page *pages[MAX_SKB_FRAGS];
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index b23db60ea6f9..b7b087815539 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -666,10 +666,14 @@ static void skb_release_data(struct sk_buff *skb)
>  			      &shinfo->dataref))
>  		goto exit;
>  
> -	skb_zcopy_clear(skb, true);
> +	if (!(shinfo->flags & SKBFL_MANAGED_FRAGS)) {
> +		for (i = 0; i < shinfo->nr_frags; i++)
> +			__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
> +	} else {
> +		shinfo->flags &= ~SKBFL_MANAGED_FRAGS;
> +	}
>  
> -	for (i = 0; i < shinfo->nr_frags; i++)
> -		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
> +	skb_zcopy_clear(skb, true);

It would be better if all of this logic was moved into the callback
under zcopy_clear.  Note that this path is called for both TX and RX.

Pavel Begunkov Dec. 1, 2021, 8:17 p.m. UTC | #2
On 12/1/21 19:20, Jonathan Lemon wrote:
> On Tue, Nov 30, 2021 at 03:18:53PM +0000, Pavel Begunkov wrote:
>> get_page() in __zerocopy_sg_from_bvec() and the matching put_page()s are
>> expensive. However, we can avoid them if the caller can guarantee that the
>> pages stay alive until the corresponding ubuf_info is released.
>> In particular, this targets io_uring with fixed buffers, which follows
>> the described contract.
>>
>> Assuming that nobody yet uses bvec together with zerocopy, make all
>> calls with bvec iterators follow this model.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
[...]
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index b23db60ea6f9..b7b087815539 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -666,10 +666,14 @@ static void skb_release_data(struct sk_buff *skb)
>>   			      &shinfo->dataref))
>>   		goto exit;
>>   
>> -	skb_zcopy_clear(skb, true);
>> +	if (!(shinfo->flags & SKBFL_MANAGED_FRAGS)) {
>> +		for (i = 0; i < shinfo->nr_frags; i++)
>> +			__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
>> +	} else {
>> +		shinfo->flags &= ~SKBFL_MANAGED_FRAGS;
>> +	}
>>   
>> -	for (i = 0; i < shinfo->nr_frags; i++)
>> -		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
>> +	skb_zcopy_clear(skb, true);
> 
> It would be better if all of this logic was moved into the callback
> under zcopy_clear.  Note that this path is called for both TX and RX.

Yeah, can be done and removes the extra if from non-zc path.
Thanks!
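
One possible shape of that follow-up, sketched purely as an assumption (the
helper name skb_release_frag_refs is made up, and this is not necessarily what
the series ended up doing): skb_release_data() would call a helper like the
one below, so the SKBFL_MANAGED_FRAGS test only runs for skbs that actually
carry a ubuf_info and the non-zerocopy release path stays unchanged.

static void skb_release_frag_refs(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb_zcopy(skb)) {
		bool managed = shinfo->flags & SKBFL_MANAGED_FRAGS;

		shinfo->flags &= ~SKBFL_MANAGED_FRAGS;
		/* Completes the ubuf_info; for managed frags its owner
		 * also owns the page references, so skip the unrefs.
		 */
		skb_zcopy_clear(skb, true);
		if (managed)
			return;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
}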

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 750b7518d6e2..ebb12a7d386d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -461,6 +461,11 @@  enum {
 	SKBFL_PURE_ZEROCOPY = BIT(2),
 
 	SKBFL_DONT_ORPHAN = BIT(3),
+
+	/* page references are managed by the ubuf_info, so it's safe to
+	 * use frags only up until ubuf_info is released
+	 */
+	SKBFL_MANAGED_FRAGS = BIT(4),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
@@ -3154,7 +3159,10 @@  static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
  */
 static inline void skb_frag_unref(struct sk_buff *skb, int f)
 {
-	__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	if (!(shinfo->flags & SKBFL_MANAGED_FRAGS))
+		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
 }
 
 /**
diff --git a/net/core/datagram.c b/net/core/datagram.c
index e00f7e0a7a0a..5cf0672039d6 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -642,7 +642,6 @@  static int __zerocopy_sg_from_bvec(struct sock *sk, struct sk_buff *skb,
 		v = mp_bvec_iter_bvec(from->bvec, bi);
 		copied += v.bv_len;
 		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
-		get_page(v.bv_page);
 		skb_fill_page_desc(skb, frag++, v.bv_page, v.bv_offset, v.bv_len);
 		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
 	}
@@ -671,9 +670,15 @@  int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 			    struct iov_iter *from, size_t length)
 {
 	int frag = skb_shinfo(skb)->nr_frags;
+	bool managed = skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS;
 
-	if (iov_iter_is_bvec(from))
+	if (iov_iter_is_bvec(from) && (managed || frag == 0)) {
+		skb_shinfo(skb)->flags |= SKBFL_MANAGED_FRAGS;
 		return __zerocopy_sg_from_bvec(sk, skb, from, length);
+	}
+
+	if (managed)
+		return -EFAULT;
 
 	while (length && iov_iter_count(from)) {
 		struct page *pages[MAX_SKB_FRAGS];
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b23db60ea6f9..b7b087815539 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -666,10 +666,14 @@  static void skb_release_data(struct sk_buff *skb)
 			      &shinfo->dataref))
 		goto exit;
 
-	skb_zcopy_clear(skb, true);
+	if (!(shinfo->flags & SKBFL_MANAGED_FRAGS)) {
+		for (i = 0; i < shinfo->nr_frags; i++)
+			__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+	} else {
+		shinfo->flags &= ~SKBFL_MANAGED_FRAGS;
+	}
 
-	for (i = 0; i < shinfo->nr_frags; i++)
-		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+	skb_zcopy_clear(skb, true);
 
 	if (shinfo->frag_list)
 		kfree_skb_list(shinfo->frag_list);
@@ -1471,6 +1475,7 @@  int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 	/* skb frags release userspace buffers */
 	for (i = 0; i < num_frags; i++)
 		skb_frag_unref(skb, i);
+	skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAGS;
 
 	/* skb frags point to kernel buffers */
 	for (i = 0; i < new_frags - 1; i++) {
@@ -1597,6 +1602,7 @@  struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
 
 	skb_copy_header(n, skb);
+	skb_shinfo(n)->flags &= ~SKBFL_MANAGED_FRAGS;
 	return n;
 }
 EXPORT_SYMBOL(skb_copy);
@@ -1653,6 +1659,7 @@  struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
 			skb_frag_ref(skb, i);
 		}
 		skb_shinfo(n)->nr_frags = i;
+		skb_shinfo(n)->flags &= ~SKBFL_MANAGED_FRAGS;
 	}
 
 	if (skb_has_frag_list(skb)) {
@@ -1725,6 +1732,7 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 			refcount_inc(&skb_uarg(skb)->refcnt);
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 			skb_frag_ref(skb, i);
+		skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAGS;
 
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
@@ -3788,6 +3796,8 @@  int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
 	if (skb_can_coalesce(skb, i, page, offset)) {
 		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
 	} else if (i < MAX_SKB_FRAGS) {
+		if (skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS)
+			return -EMSGSIZE;
 		get_page(page);
 		skb_fill_page_desc(skb, i, page, offset, size);
 	} else {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f9aab355d283..e6adf96e5530 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1194,6 +1194,10 @@  static int __ip_append_data(struct sock *sk,
 			err = -ENOMEM;
 			if (!sk_page_frag_refill(sk, pfrag))
 				goto error;
+			if (skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS) {
+				err = -EMSGSIZE;
+				goto error;
+			}
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {