
[bpf,1/5] bpf, sockmap: fix partial copy_page_to_iter so progress can still be made

Message ID: 160477787531.608263.10144789972668918015.stgit@john-XPS-13-9370 (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: sockmap fixes

Commit Message

John Fastabend Nov. 7, 2020, 7:37 p.m. UTC
If copy_page_to_iter() fails, or completes only partially with fewer
bytes copied than expected, we currently reset sg.start and return EFAULT.
This is problematic if we already copied data into the user buffer
before returning the error: the copied data is left in the user buffer,
but the scatterlist is not unwound, so the kernel side believes the data
has been copied while the user side believes it has _not_ been received.

Expected behavior should be to return the number of bytes copied, and then
on the next read return the error, assuming it is still there. This can
happen if the copy length spans multiple scatterlist elements and one or
more complete before the error is hit.
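
To illustrate the intended semantics from userspace, here is a minimal
sketch (not part of the patch; fd, buf, consume() and handle_fault() are
made-up placeholders) of what a caller should observe after the fix:

    /* Hypothetical caller of recv() on a sockmap socket. */
    ssize_t n = recv(fd, buf, sizeof(buf), 0);
    if (n > 0) {
            /* Some bytes were copied before the fault; use them. Any
             * error, if still present, is reported on the next call.
             */
            consume(buf, n);
    } else if (n < 0 && errno == EFAULT) {
            /* Nothing was copied at all on this call. */
            handle_fault();
    }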

The error is rare enough that in my normal testing with server side
programs, such as nginx, httpd, envoy, etc., I have never seen it. The
only reliable way to reproduce it that I've found is to stream movies in
my browser for a day or so and wait for it to hang. Not very scientific,
but with a few extra WARN_ON()s in the code the bug was obvious.

When we review the errors from copy_page_to_iter() it seems we are hitting
a page fault from copy_page_to_iter_iovec(), where the code checks
fault_in_pages_writeable(buf, copy) with buf being the user buffer. It
also seems typical server applications don't hit this case.
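
For context, the key point is that copy_page_to_iter() reports the number
of bytes it actually copied, so a fault partway through can yield a short
but non-zero return. Below is a simplified paraphrase of that behavior (my
reading of lib/iov_iter.c, not verbatim kernel code; chunk_len(), user_buf()
and copy_chunk() are invented helpers standing in for the real iovec walk):

    static size_t copy_page_to_iter_sketch(struct page *page, size_t offset,
                                           size_t bytes, struct iov_iter *i)
    {
            size_t wanted = bytes;

            while (bytes) {
                    size_t copy = chunk_len(i, bytes);

                    /* Fault check on the destination; on failure stop
                     * and report only what was copied so far.
                     */
                    if (fault_in_pages_writeable(user_buf(i), copy))
                            break;
                    copy_chunk(page, offset, copy, i);
                    offset += copy;
                    bytes -= copy;
            }
            return wanted - bytes;  /* can be 0 < n < wanted */
    }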

The other way to try to reproduce this is to run the sockmap selftest tool
test_sockmap with data verification enabled, but that doesn't reproduce the
fault. Perhaps we can trigger this case artificially somehow from the
test tools. I haven't sorted out a way to do that yet, though.
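
One possible artificial trigger (an untested sketch; the sockmap socket
setup is assumed to come from the existing test harness, and fd is assumed
to have more than one page of queued ingress data) is to hand recvmsg() a
buffer whose second page is PROT_NONE, so the copy faults partway through:

    #include <sys/mman.h>
    #include <sys/socket.h>
    #include <unistd.h>

    static ssize_t recv_across_prot_none(int fd)
    {
            long page = sysconf(_SC_PAGESIZE);
            char *buf = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            /* Writes into the second page will fault. */
            mprotect(buf + page, page, PROT_NONE);
            /* Before the fix this returns -EFAULT even though up to a
             * page of data was copied out and consumed from the
             * scatterlist; after the fix it returns the short count.
             */
            return recv(fd, buf, 2 * page, 0);
    }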

Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 net/ipv4/tcp_bpf.c |   15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

Comments

Daniel Borkmann Nov. 12, 2020, 12:19 a.m. UTC | #1
On 11/7/20 8:37 PM, John Fastabend wrote:
> If copy_page_to_iter() fails, or completes only partially with fewer
> bytes copied than expected, we currently reset sg.start and return EFAULT.
> This is problematic if we already copied data into the user buffer
> before returning the error: the copied data is left in the user buffer,
> but the scatterlist is not unwound, so the kernel side believes the data
> has been copied while the user side believes it has _not_ been received.
> 
> Expected behavior should be to return the number of bytes copied, and then
> on the next read return the error, assuming it is still there. This can
> happen if the copy length spans multiple scatterlist elements and one or
> more complete before the error is hit.
> 
> The error is rare enough that in my normal testing with server side
> programs, such as nginx, httpd, envoy, etc., I have never seen it. The
> only reliable way to reproduce it that I've found is to stream movies in
> my browser for a day or so and wait for it to hang. Not very scientific,
> but with a few extra WARN_ON()s in the code the bug was obvious.
> 
> When we review the errors from copy_page_to_iter() it seems we are hitting
> a page fault from copy_page_to_iter_iovec(), where the code checks
> fault_in_pages_writeable(buf, copy) with buf being the user buffer. It
> also seems typical server applications don't hit this case.
> 
> The other way to try to reproduce this is to run the sockmap selftest tool
> test_sockmap with data verification enabled, but that doesn't reproduce the
> fault. Perhaps we can trigger this case artificially somehow from the
> test tools. I haven't sorted out a way to do that yet, though.
> 
> Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface")
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>   net/ipv4/tcp_bpf.c |   15 ++++++++++-----
>   1 file changed, 10 insertions(+), 5 deletions(-)
> 
> diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
> index 37f4cb2bba5c..3709d679436e 100644
> --- a/net/ipv4/tcp_bpf.c
> +++ b/net/ipv4/tcp_bpf.c
> @@ -15,8 +15,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
>   {
>   	struct iov_iter *iter = &msg->msg_iter;
>   	int peek = flags & MSG_PEEK;
> -	int i, ret, copied = 0;
>   	struct sk_msg *msg_rx;
> +	int i, copied = 0;
>   
>   	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
>   					  struct sk_msg, list);
> @@ -37,10 +37,9 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
>   			page = sg_page(sge);
>   			if (copied + copy > len)
>   				copy = len - copied;
> -			ret = copy_page_to_iter(page, sge->offset, copy, iter);
> -			if (ret != copy) {
> -				msg_rx->sg.start = i;
> -				return -EFAULT;
> +			copy = copy_page_to_iter(page, sge->offset, copy, iter);
> +			if (!copy) {
> +				return copied ? copied : -EFAULT;
>   			}

nit: no need for {}

>   
>   			copied += copy;
> @@ -56,6 +55,11 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
>   						put_page(page);
>   				}
>   			} else {
> +				/* Let's not optimize the peek case; if copy_page_to_iter
> +				 * didn't copy the entire length, just break out.
> +				 */
> +				if (copy != sge->length)
> +					goto out;

nit: return copied;

Rest lgtm for this one.

>   				sk_msg_iter_var_next(i);
>   			}
>   
> @@ -82,6 +86,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
>   						  struct sk_msg, list);
>   	}
>   
> +out:
>   	return copied;
>   }
>   EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
> 
>
John Fastabend Nov. 12, 2020, 7:56 p.m. UTC | #2
Daniel Borkmann wrote:
> On 11/7/20 8:37 PM, John Fastabend wrote:
> > If copy_page_to_iter() fails, or completes only partially with fewer
> > bytes copied than expected, we currently reset sg.start and return EFAULT.
> > This is problematic if we already copied data into the user buffer
> > before returning the error: the copied data is left in the user buffer,
> > but the scatterlist is not unwound, so the kernel side believes the data
> > has been copied while the user side believes it has _not_ been received.

[...]

> > +			if (!copy) {
> > +				return copied ? copied : -EFAULT;
> >   			}
> 
> nit: no need for {}
> 
> >   
> >   			copied += copy;
> > @@ -56,6 +55,11 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
> >   						put_page(page);
> >   				}
> >   			} else {
> > +				/* Let's not optimize the peek case; if copy_page_to_iter
> > +				 * didn't copy the entire length, just break out.
> > +				 */
> > +				if (copy != sge->length)
> > +					goto out;
> 
> nit: return copied;
> 
> Rest lgtm for this one.

Great, thanks for the review; will fix up both nits in v2.
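
For reference, with both nits folded in, the touched hunks of the
anticipated v2 would presumably read as follows (a sketch, not the posted
v2 patch); the out: label and the goto can then be dropped again:

    			copy = copy_page_to_iter(page, sge->offset, copy, iter);
    			if (!copy)
    				return copied ? copied : -EFAULT;
    ...
    			} else {
    				/* Let's not optimize the peek case; if
    				 * copy_page_to_iter didn't copy the entire
    				 * length, just break out.
    				 */
    				if (copy != sge->length)
    					return copied;
    				sk_msg_iter_var_next(i);
    			}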

Patch

diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 37f4cb2bba5c..3709d679436e 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -15,8 +15,8 @@  int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 {
 	struct iov_iter *iter = &msg->msg_iter;
 	int peek = flags & MSG_PEEK;
-	int i, ret, copied = 0;
 	struct sk_msg *msg_rx;
+	int i, copied = 0;
 
 	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
 					  struct sk_msg, list);
@@ -37,10 +37,9 @@  int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 			page = sg_page(sge);
 			if (copied + copy > len)
 				copy = len - copied;
-			ret = copy_page_to_iter(page, sge->offset, copy, iter);
-			if (ret != copy) {
-				msg_rx->sg.start = i;
-				return -EFAULT;
+			copy = copy_page_to_iter(page, sge->offset, copy, iter);
+			if (!copy) {
+				return copied ? copied : -EFAULT;
 			}
 
 			copied += copy;
@@ -56,6 +55,11 @@  int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 						put_page(page);
 				}
 			} else {
+				/* Let's not optimize the peek case; if copy_page_to_iter
+				 * didn't copy the entire length, just break out.
+				 */
+				if (copy != sge->length)
+					goto out;
 				sk_msg_iter_var_next(i);
 			}
 
@@ -82,6 +86,7 @@  int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 						  struct sk_msg, list);
 	}
 
+out:
 	return copied;
 }
 EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);