diff mbox series

[bpf-next,v7,09/13] udp: implement ->read_sock() for sockmap

Message ID 20210328202013.29223-10-xiyou.wangcong@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series sockmap: introduce BPF_SK_SKB_VERDICT and support UDP | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 10 maintainers not CCed: dsahern@kernel.org yhs@fb.com kpsingh@kernel.org yoshfuji@linux-ipv6.org andrii@kernel.org kafai@fb.com ast@kernel.org songliubraving@fb.com davem@davemloft.net kuba@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 185 this patch: 185
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 63 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 293 this patch: 293
netdev/header_inline success Link

Commit Message

Cong Wang March 28, 2021, 8:20 p.m. UTC
From: Cong Wang <cong.wang@bytedance.com>

This is similar to tcp_read_sock(), except we do not need
to worry about connections, we just need to retrieve skb
from UDP receive queue.

Note, the return value of ->read_sock() is unused in
sk_psock_verdict_data_ready().

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/net/udp.h   |  2 ++
 net/ipv4/af_inet.c  |  1 +
 net/ipv4/udp.c      | 35 +++++++++++++++++++++++++++++++++++
 net/ipv6/af_inet6.c |  1 +
 4 files changed, 39 insertions(+)

Comments

John Fastabend March 29, 2021, 8:54 p.m. UTC | #1
Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
> 
> This is similar to tcp_read_sock(), except we do not need
> to worry about connections, we just need to retrieve skb
> from UDP receive queue.
> 
> Note, the return value of ->read_sock() is unused in
> sk_psock_verdict_data_ready().
> 
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Cc: Lorenz Bauer <lmb@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
>  include/net/udp.h   |  2 ++
>  net/ipv4/af_inet.c  |  1 +
>  net/ipv4/udp.c      | 35 +++++++++++++++++++++++++++++++++++
>  net/ipv6/af_inet6.c |  1 +
>  4 files changed, 39 insertions(+)
> 
> diff --git a/include/net/udp.h b/include/net/udp.h
> index df7cc1edc200..347b62a753c3 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -329,6 +329,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
>  			       struct sk_buff *skb);
>  struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
>  				 __be16 sport, __be16 dport);
> +int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
> +		  sk_read_actor_t recv_actor);
>  
>  /* UDP uses skb->dev_scratch to cache as much information as possible and avoid
>   * possibly multiple cache miss on dequeue()
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 1355e6c0d567..f17870ee558b 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = {
>  	.setsockopt	   = sock_common_setsockopt,
>  	.getsockopt	   = sock_common_getsockopt,
>  	.sendmsg	   = inet_sendmsg,
> +	.read_sock	   = udp_read_sock,
>  	.recvmsg	   = inet_recvmsg,
>  	.mmap		   = sock_no_mmap,
>  	.sendpage	   = inet_sendpage,
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 38952aaee3a1..04620e4d64ab 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1782,6 +1782,41 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
>  }
>  EXPORT_SYMBOL(__skb_recv_udp);
>  
> +int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
> +		  sk_read_actor_t recv_actor)
> +{
> +	int copied = 0;
> +
> +	while (1) {
> +		int offset = 0, err;

Should this be

 int offset = sk_peek_offset()?

MSG_PEEK should work from recv side, at least it does on TCP side. If
its handled in some following patch a comment would be nice. I was
just reading udp_recvmsg() so maybe its not needed.

> +		struct sk_buff *skb;
> +
> +		skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
> +		if (!skb)
> +			return err;
> +		if (offset < skb->len) {
> +			size_t len;
> +			int used;
> +
> +			len = skb->len - offset;
> +			used = recv_actor(desc, skb, offset, len);
> +			if (used <= 0) {
> +				if (!copied)
> +					copied = used;
> +				break;
> +			} else if (used <= len) {
> +				copied += used;
> +				offset += used;

The while loop is going to zero this? What are we trying to do
here with offset?

> +			}
> +		}
> +		if (!desc->count)
> +			break;
> +	}
> +
> +	return copied;
> +}
> +EXPORT_SYMBOL(udp_read_sock);
> +
>  /*
>   * 	This should be easy, if there is something there we
>   * 	return it, otherwise we block.
> diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> index 802f5111805a..71de739b4a9e 100644
> --- a/net/ipv6/af_inet6.c
> +++ b/net/ipv6/af_inet6.c
> @@ -714,6 +714,7 @@ const struct proto_ops inet6_dgram_ops = {
>  	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
>  	.sendmsg	   = inet6_sendmsg,		/* retpoline's sake */
>  	.recvmsg	   = inet6_recvmsg,		/* retpoline's sake */
> +	.read_sock	   = udp_read_sock,
>  	.mmap		   = sock_no_mmap,
>  	.sendpage	   = sock_no_sendpage,
>  	.set_peek_off	   = sk_set_peek_off,
> -- 
> 2.25.1
>
Cong Wang March 30, 2021, 5:39 a.m. UTC | #2
On Mon, Mar 29, 2021 at 1:54 PM John Fastabend <john.fastabend@gmail.com> wrote:
>
> Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > This is similar to tcp_read_sock(), except we do not need
> > to worry about connections, we just need to retrieve skb
> > from UDP receive queue.
> >
> > Note, the return value of ->read_sock() is unused in
> > sk_psock_verdict_data_ready().
> >
> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Cc: Lorenz Bauer <lmb@cloudflare.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
> >  include/net/udp.h   |  2 ++
> >  net/ipv4/af_inet.c  |  1 +
> >  net/ipv4/udp.c      | 35 +++++++++++++++++++++++++++++++++++
> >  net/ipv6/af_inet6.c |  1 +
> >  4 files changed, 39 insertions(+)
> >
> > diff --git a/include/net/udp.h b/include/net/udp.h
> > index df7cc1edc200..347b62a753c3 100644
> > --- a/include/net/udp.h
> > +++ b/include/net/udp.h
> > @@ -329,6 +329,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
> >                              struct sk_buff *skb);
> >  struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
> >                                __be16 sport, __be16 dport);
> > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
> > +               sk_read_actor_t recv_actor);
> >
> >  /* UDP uses skb->dev_scratch to cache as much information as possible and avoid
> >   * possibly multiple cache miss on dequeue()
> > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> > index 1355e6c0d567..f17870ee558b 100644
> > --- a/net/ipv4/af_inet.c
> > +++ b/net/ipv4/af_inet.c
> > @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = {
> >       .setsockopt        = sock_common_setsockopt,
> >       .getsockopt        = sock_common_getsockopt,
> >       .sendmsg           = inet_sendmsg,
> > +     .read_sock         = udp_read_sock,
> >       .recvmsg           = inet_recvmsg,
> >       .mmap              = sock_no_mmap,
> >       .sendpage          = inet_sendpage,
> > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > index 38952aaee3a1..04620e4d64ab 100644
> > --- a/net/ipv4/udp.c
> > +++ b/net/ipv4/udp.c
> > @@ -1782,6 +1782,41 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
> >  }
> >  EXPORT_SYMBOL(__skb_recv_udp);
> >
> > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
> > +               sk_read_actor_t recv_actor)
> > +{
> > +     int copied = 0;
> > +
> > +     while (1) {
> > +             int offset = 0, err;
>
> Should this be
>
>  int offset = sk_peek_offset()?

What are you really suggesting? sk_peek_offset() is just 0 unless
we have MSG_PEEK here and we don't, because we really want to
dequeue the skb rather than peeking it.

Are you suggesting we should do peeking? I am afraid we can't.
Please be specific, guessing your mind is not an effective way to
address your reviews.

>
> MSG_PEEK should work from recv side, at least it does on TCP side. If
> its handled in some following patch a comment would be nice. I was
> just reading udp_recvmsg() so maybe its not needed.

Please explain why do we need peeking in sockmap? At very least
it has nothing to do with my patchset.

I do not know why you want to use TCP as a "standard" here, TCP
also supports splice(), UDP still doesn't even with ->read_sock().
Of course they are very different.

>
> > +             struct sk_buff *skb;
> > +
> > +             skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
> > +             if (!skb)
> > +                     return err;
> > +             if (offset < skb->len) {
> > +                     size_t len;
> > +                     int used;
> > +
> > +                     len = skb->len - offset;
> > +                     used = recv_actor(desc, skb, offset, len);
> > +                     if (used <= 0) {
> > +                             if (!copied)
> > +                                     copied = used;
> > +                             break;
> > +                     } else if (used <= len) {
> > +                             copied += used;
> > +                             offset += used;
>
> The while loop is going to zero this? What are we trying to do
> here with offset?

offset only matters for MSG_PEEK and we do not support peeking
in sockmap case, hence it is unnecessary here. I "use" it here just
to make the code as complete as possible.

To further answer your question, it is set to 0 when we return a
valid skb on line 201 inside __skb_try_recv_from_queue(), as
"_off" is set to 0 and won't change unless we have MSG_PEEK.

173         bool peek_at_off = false;
174         struct sk_buff *skb;
175         int _off = 0;
176
177         if (unlikely(flags & MSG_PEEK && *off >= 0)) {
178                 peek_at_off = true;
179                 _off = *off;
180         }
181
182         *last = queue->prev;
183         skb_queue_walk(queue, skb) {
184                 if (flags & MSG_PEEK) {
185                         if (peek_at_off && _off >= skb->len &&
186                             (_off || skb->peeked)) {
187                                 _off -= skb->len;
188                                 continue;
189                         }
190                         if (!skb->len) {
191                                 skb = skb_set_peeked(skb);
192                                 if (IS_ERR(skb)) {
193                                         *err = PTR_ERR(skb);
194                                         return NULL;
195                                 }
196                         }
197                         refcount_inc(&skb->users);
198                 } else {
199                         __skb_unlink(skb, queue);
200                 }
201                 *off = _off;
202                 return skb;

Of course, when we return NULL, we return immediately without
using offset:

1794                 skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
1795                 if (!skb)
1796                         return err;

This should not be hard to figure out. Hope it is clear now.

Thanks.
John Fastabend March 30, 2021, 6:23 a.m. UTC | #3
Cong Wang wrote:
> On Mon, Mar 29, 2021 at 1:54 PM John Fastabend <john.fastabend@gmail.com> wrote:
> >
> > Cong Wang wrote:
> > > From: Cong Wang <cong.wang@bytedance.com>
> > >
> > > This is similar to tcp_read_sock(), except we do not need
> > > to worry about connections, we just need to retrieve skb
> > > from UDP receive queue.
> > >
> > > Note, the return value of ->read_sock() is unused in
> > > sk_psock_verdict_data_ready().
> > >
> > > Cc: John Fastabend <john.fastabend@gmail.com>
> > > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > > Cc: Lorenz Bauer <lmb@cloudflare.com>
> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > > ---

[...]

> > >  }
> > >  EXPORT_SYMBOL(__skb_recv_udp);
> > >
> > > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
> > > +               sk_read_actor_t recv_actor)
> > > +{
> > > +     int copied = 0;
> > > +
> > > +     while (1) {
> > > +             int offset = 0, err;
> >
> > Should this be
> >
> >  int offset = sk_peek_offset()?
> 
> What are you really suggesting? sk_peek_offset() is just 0 unless
> we have MSG_PEEK here and we don't, because we really want to
> dequeue the skb rather than peeking it.
> 
> Are you suggesting we should do peeking? I am afraid we can't.
> Please be specific, guessing your mind is not an effective way to
> address your reviews.

I was only asking for further details because the offset addition
below struck me as odd.

> 
> >
> > MSG_PEEK should work from recv side, at least it does on TCP side. If
> > its handled in some following patch a comment would be nice. I was
> > just reading udp_recvmsg() so maybe its not needed.
> 
> Please explain why do we need peeking in sockmap? At very least
> it has nothing to do with my patchset.

We need MSG_PEEK to work from application side. From sockmap
side I agree its not needed.

> 
> I do not know why you want to use TCP as a "standard" here, TCP
> also supports splice(), UDP still doesn't even with ->read_sock().
> Of course they are very different.

Not claiming any "standard" here only that user application needs
to work correctly if it passes MSG_PEEK.

> 
> >
> > > +             struct sk_buff *skb;
> > > +
> > > +             skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
> > > +             if (!skb)
> > > +                     return err;
> > > +             if (offset < skb->len) {
> > > +                     size_t len;
> > > +                     int used;
> > > +
> > > +                     len = skb->len - offset;
> > > +                     used = recv_actor(desc, skb, offset, len);
> > > +                     if (used <= 0) {
> > > +                             if (!copied)
> > > +                                     copied = used;
> > > +                             break;
> > > +                     } else if (used <= len) {
> > > +                             copied += used;
> > > +                             offset += used;
> >
> > The while loop is going to zero this? What are we trying to do
> > here with offset?
> 
> offset only matters for MSG_PEEK and we do not support peeking
> in sockmap case, hence it is unnecessary here. I "use" it here just
> to make the code as complete as possible.

huh? If its not used the addition is just confusing. Can we drop it?

> 
> To further answer your question, it is set to 0 when we return a
> valid skb on line 201 inside __skb_try_recv_from_queue(), as
> "_off" is set to 0 and won't change unless we have MSG_PEEK.
> 
> 173         bool peek_at_off = false;
> 174         struct sk_buff *skb;
> 175         int _off = 0;
> 176
> 177         if (unlikely(flags & MSG_PEEK && *off >= 0)) {
> 178                 peek_at_off = true;
> 179                 _off = *off;
> 180         }
> 181
> 182         *last = queue->prev;
> 183         skb_queue_walk(queue, skb) {
> 184                 if (flags & MSG_PEEK) {
> 185                         if (peek_at_off && _off >= skb->len &&
> 186                             (_off || skb->peeked)) {
> 187                                 _off -= skb->len;
> 188                                 continue;
> 189                         }
> 190                         if (!skb->len) {
> 191                                 skb = skb_set_peeked(skb);
> 192                                 if (IS_ERR(skb)) {
> 193                                         *err = PTR_ERR(skb);
> 194                                         return NULL;
> 195                                 }
> 196                         }
> 197                         refcount_inc(&skb->users);
> 198                 } else {
> 199                         __skb_unlink(skb, queue);
> 200                 }
> 201                 *off = _off;
> 202                 return skb;
> 
> Of course, when we return NULL, we return immediately without
> using offset:
> 
> 1794                 skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
> 1795                 if (!skb)
> 1796                         return err;
> 
> This should not be hard to figure out. Hope it is clear now.
> 

Yes, but tracking offset only to clear it a couple lines later
is confusing.

> Thanks.
Cong Wang March 30, 2021, 6:36 a.m. UTC | #4
On Mon, Mar 29, 2021 at 11:23 PM John Fastabend
<john.fastabend@gmail.com> wrote:
>
> Cong Wang wrote:
> > On Mon, Mar 29, 2021 at 1:54 PM John Fastabend <john.fastabend@gmail.com> wrote:
> > >
> > > Cong Wang wrote:
> > > > From: Cong Wang <cong.wang@bytedance.com>
> > > >
> > > > This is similar to tcp_read_sock(), except we do not need
> > > > to worry about connections, we just need to retrieve skb
> > > > from UDP receive queue.
> > > >
> > > > Note, the return value of ->read_sock() is unused in
> > > > sk_psock_verdict_data_ready().
> > > >
> > > > Cc: John Fastabend <john.fastabend@gmail.com>
> > > > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > > > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > > > Cc: Lorenz Bauer <lmb@cloudflare.com>
> > > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > > > ---
>
> [...]
>
> > > >  }
> > > >  EXPORT_SYMBOL(__skb_recv_udp);
> > > >
> > > > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
> > > > +               sk_read_actor_t recv_actor)
> > > > +{
> > > > +     int copied = 0;
> > > > +
> > > > +     while (1) {
> > > > +             int offset = 0, err;
> > >
> > > Should this be
> > >
> > >  int offset = sk_peek_offset()?
> >
> > What are you really suggesting? sk_peek_offset() is just 0 unless
> > we have MSG_PEEK here and we don't, because we really want to
> > dequeue the skb rather than peeking it.
> >
> > Are you suggesting we should do peeking? I am afraid we can't.
> > Please be specific, guessing your mind is not an effective way to
> > address your reviews.
>
> I was only asking for further details because the offset addition
> below struck me as odd.
>
> >
> > >
> > > MSG_PEEK should work from recv side, at least it does on TCP side. If
> > > its handled in some following patch a comment would be nice. I was
> > > just reading udp_recvmsg() so maybe its not needed.
> >
> > Please explain why do we need peeking in sockmap? At very least
> > it has nothing to do with my patchset.
>
> We need MSG_PEEK to work from application side. From sockmap
> side I agree its not needed.

How does the application reach udp_read_sock()? UDP does not support
splice() as I already mentioned, as ->splice_read() is still missing.

>
> >
> > I do not know why you want to use TCP as a "standard" here, TCP
> > also supports splice(), UDP still doesn't even with ->read_sock().
> > Of course they are very different.
>
> Not claiming any "standard" here only that user application needs
> to work correctly if it passes MSG_PEEK.

I do not see how an application could pass any msg flag to
udp_read_sock().

>
> >
> > >
> > > > +             struct sk_buff *skb;
> > > > +
> > > > +             skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
> > > > +             if (!skb)
> > > > +                     return err;
> > > > +             if (offset < skb->len) {
> > > > +                     size_t len;
> > > > +                     int used;
> > > > +
> > > > +                     len = skb->len - offset;
> > > > +                     used = recv_actor(desc, skb, offset, len);
> > > > +                     if (used <= 0) {
> > > > +                             if (!copied)
> > > > +                                     copied = used;
> > > > +                             break;
> > > > +                     } else if (used <= len) {
> > > > +                             copied += used;
> > > > +                             offset += used;
> > >
> > > The while loop is going to zero this? What are we trying to do
> > > here with offset?
> >
> > offset only matters for MSG_PEEK and we do not support peeking
> > in sockmap case, hence it is unnecessary here. I "use" it here just
> > to make the code as complete as possible.
>
> huh? If its not used the addition is just confusing. Can we drop it?

If you mean dropping this single line of code, yes. If you mean
dropping 'offset' completely, no, as both __skb_recv_udp() and
recv_actor() still need it. If you mean I should re-write
__skb_recv_udp() and recv_actor() just to drop 'offset', I am afraid
that is too much with too little gain.

>
> >
> > To further answer your question, it is set to 0 when we return a
> > valid skb on line 201 inside __skb_try_recv_from_queue(), as
> > "_off" is set to 0 and won't change unless we have MSG_PEEK.
> >
> > 173         bool peek_at_off = false;
> > 174         struct sk_buff *skb;
> > 175         int _off = 0;
> > 176
> > 177         if (unlikely(flags & MSG_PEEK && *off >= 0)) {
> > 178                 peek_at_off = true;
> > 179                 _off = *off;
> > 180         }
> > 181
> > 182         *last = queue->prev;
> > 183         skb_queue_walk(queue, skb) {
> > 184                 if (flags & MSG_PEEK) {
> > 185                         if (peek_at_off && _off >= skb->len &&
> > 186                             (_off || skb->peeked)) {
> > 187                                 _off -= skb->len;
> > 188                                 continue;
> > 189                         }
> > 190                         if (!skb->len) {
> > 191                                 skb = skb_set_peeked(skb);
> > 192                                 if (IS_ERR(skb)) {
> > 193                                         *err = PTR_ERR(skb);
> > 194                                         return NULL;
> > 195                                 }
> > 196                         }
> > 197                         refcount_inc(&skb->users);
> > 198                 } else {
> > 199                         __skb_unlink(skb, queue);
> > 200                 }
> > 201                 *off = _off;
> > 202                 return skb;
> >
> > Of course, when we return NULL, we return immediately without
> > using offset:
> >
> > 1794                 skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
> > 1795                 if (!skb)
> > 1796                         return err;
> >
> > This should not be hard to figure out. Hope it is clear now.
> >
>
> Yes, but tracking offset only to clear it a couple lines later
> is confusing.

Yeah, but that's __skb_recv_udp()'s fault, not mine. We can refactor
__skb_recv_udp() a bit for !MSG_PEEK case, but I do not see
much gain here.

Thanks.
John Fastabend March 30, 2021, 6:45 a.m. UTC | #5
Cong Wang wrote:
> On Mon, Mar 29, 2021 at 11:23 PM John Fastabend
> <john.fastabend@gmail.com> wrote:
> >
> > Cong Wang wrote:
> > > On Mon, Mar 29, 2021 at 1:54 PM John Fastabend <john.fastabend@gmail.com> wrote:
> > > >
> > > > Cong Wang wrote:
> > > > > From: Cong Wang <cong.wang@bytedance.com>
> > > > >
> > > > > This is similar to tcp_read_sock(), except we do not need
> > > > > to worry about connections, we just need to retrieve skb
> > > > > from UDP receive queue.
> > > > >
> > > > > Note, the return value of ->read_sock() is unused in
> > > > > sk_psock_verdict_data_ready().
> > > > >
> > > > > Cc: John Fastabend <john.fastabend@gmail.com>
> > > > > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > > > > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > > > > Cc: Lorenz Bauer <lmb@cloudflare.com>
> > > > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > > > > ---
> >
> > [...]
> >
> > > > >  }
> > > > >  EXPORT_SYMBOL(__skb_recv_udp);
> > > > >
> > > > > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
> > > > > +               sk_read_actor_t recv_actor)
> > > > > +{
> > > > > +     int copied = 0;
> > > > > +
> > > > > +     while (1) {
> > > > > +             int offset = 0, err;
> > > >
> > > > Should this be
> > > >
> > > >  int offset = sk_peek_offset()?
> > >
> > > What are you really suggesting? sk_peek_offset() is just 0 unless
> > > we have MSG_PEEK here and we don't, because we really want to
> > > dequeue the skb rather than peeking it.
> > >
> > > Are you suggesting we should do peeking? I am afraid we can't.
> > > Please be specific, guessing your mind is not an effective way to
> > > address your reviews.
> >
> > I was only asking for further details because the offset addition
> > below struck me as odd.
> >
> > >
> > > >
> > > > MSG_PEEK should work from recv side, at least it does on TCP side. If
> > > > its handled in some following patch a comment would be nice. I was
> > > > just reading udp_recvmsg() so maybe its not needed.
> > >
> > > Please explain why do we need peeking in sockmap? At very least
> > > it has nothing to do with my patchset.
> >
> > We need MSG_PEEK to work from application side. From sockmap
> > side I agree its not needed.
> 
> How does the application reach udp_read_sock()? UDP does not support
> splice() as I already mentioned, as ->splice_read() is still missing.

It doesn't. All I was trying to say is if an application calls
recvmsg(..., MSG_PEEK) it should work correctly. It wasn't a
comment about this specific patch.

> 
> >
> > >
> > > I do not know why you want to use TCP as a "standard" here, TCP
> > > also supports splice(), UDP still doesn't even with ->read_sock().
> > > Of course they are very different.
> >
> > Not claiming any "standard" here only that user application needs
> > to work correctly if it passes MSG_PEEK.
> 
> I do not see how an application could pass any msg flag to
> udp_read_sock().

Agree.

> 
> >
> > >
> > > >
> > > > > +             struct sk_buff *skb;
> > > > > +
> > > > > +             skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
> > > > > +             if (!skb)
> > > > > +                     return err;
> > > > > +             if (offset < skb->len) {
> > > > > +                     size_t len;
> > > > > +                     int used;
> > > > > +
> > > > > +                     len = skb->len - offset;
> > > > > +                     used = recv_actor(desc, skb, offset, len);
> > > > > +                     if (used <= 0) {
> > > > > +                             if (!copied)
> > > > > +                                     copied = used;
> > > > > +                             break;
> > > > > +                     } else if (used <= len) {
> > > > > +                             copied += used;
> > > > > +                             offset += used;
> > > >
> > > > The while loop is going to zero this? What are we trying to do
> > > > here with offset?
> > >
> > > offset only matters for MSG_PEEK and we do not support peeking
> > > in sockmap case, hence it is unnecessary here. I "use" it here just
> > > to make the code as complete as possible.
> >
> > huh? If its not used the addition is just confusing. Can we drop it?
> 
> If you mean dropping this single line of code, yes. If you mean
> dropping 'offset' completely, no, as both __skb_recv_udp() and
> recv_actor() still need it. If you mean I should re-write
> __skb_recv_udp() and recv_actor() just to drop 'offset', I am afraid
> that is too much with too little gain.

All I'm saying is drop the single line of code above. This specific
one

 'offset += used'

And add a comment in the commit msg that just says peeking is not
supported. I think we need at least one more respin of the patches
anyways to address a different small comment so should be easy.

> 
> >
> > >
> > > To further answer your question, it is set to 0 when we return a
> > > valid skb on line 201 inside __skb_try_recv_from_queue(), as
> > > "_off" is set to 0 and won't change unless we have MSG_PEEK.
> > >
> > > 173         bool peek_at_off = false;
> > > 174         struct sk_buff *skb;
> > > 175         int _off = 0;
> > > 176
> > > 177         if (unlikely(flags & MSG_PEEK && *off >= 0)) {
> > > 178                 peek_at_off = true;
> > > 179                 _off = *off;
> > > 180         }
> > > 181
> > > 182         *last = queue->prev;
> > > 183         skb_queue_walk(queue, skb) {
> > > 184                 if (flags & MSG_PEEK) {
> > > 185                         if (peek_at_off && _off >= skb->len &&
> > > 186                             (_off || skb->peeked)) {
> > > 187                                 _off -= skb->len;
> > > 188                                 continue;
> > > 189                         }
> > > 190                         if (!skb->len) {
> > > 191                                 skb = skb_set_peeked(skb);
> > > 192                                 if (IS_ERR(skb)) {
> > > 193                                         *err = PTR_ERR(skb);
> > > 194                                         return NULL;
> > > 195                                 }
> > > 196                         }
> > > 197                         refcount_inc(&skb->users);
> > > 198                 } else {
> > > 199                         __skb_unlink(skb, queue);
> > > 200                 }
> > > 201                 *off = _off;
> > > 202                 return skb;
> > >
> > > Of course, when we return NULL, we return immediately without
> > > using offset:
> > >
> > > 1794                 skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
> > > 1795                 if (!skb)
> > > 1796                         return err;
> > >
> > > This should not be hard to figure out. Hope it is clear now.
> > >
> >
> > Yes, but tracking offset only to clear it a couple lines later
> > is confusing.
> 
> Yeah, but that's __skb_recv_udp()'s fault, not mine. We can refactor
> __skb_recv_udp() a bit for !MSG_PEEK case, but I do not see
> much gain here.

No don't bother here. I don't see much gain in doing that either. If
you want do it in another series not this one.

> 
> Thanks.
diff mbox series

Patch

diff --git a/include/net/udp.h b/include/net/udp.h
index df7cc1edc200..347b62a753c3 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -329,6 +329,8 @@  struct sock *__udp6_lib_lookup(struct net *net,
 			       struct sk_buff *skb);
 struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
 				 __be16 sport, __be16 dport);
+int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
+		  sk_read_actor_t recv_actor);
 
 /* UDP uses skb->dev_scratch to cache as much information as possible and avoid
  * possibly multiple cache miss on dequeue()
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1355e6c0d567..f17870ee558b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1070,6 +1070,7 @@  const struct proto_ops inet_dgram_ops = {
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
+	.read_sock	   = udp_read_sock,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 38952aaee3a1..04620e4d64ab 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1782,6 +1782,41 @@  struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
 }
 EXPORT_SYMBOL(__skb_recv_udp);
 
+int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
+		  sk_read_actor_t recv_actor)
+{
+	int copied = 0;
+
+	while (1) {
+		int offset = 0, err;
+		struct sk_buff *skb;
+
+		skb = __skb_recv_udp(sk, 0, 1, &offset, &err);
+		if (!skb)
+			return err;
+		if (offset < skb->len) {
+			size_t len;
+			int used;
+
+			len = skb->len - offset;
+			used = recv_actor(desc, skb, offset, len);
+			if (used <= 0) {
+				if (!copied)
+					copied = used;
+				break;
+			} else if (used <= len) {
+				copied += used;
+				offset += used;
+			}
+		}
+		if (!desc->count)
+			break;
+	}
+
+	return copied;
+}
+EXPORT_SYMBOL(udp_read_sock);
+
 /*
  * 	This should be easy, if there is something there we
  * 	return it, otherwise we block.
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 802f5111805a..71de739b4a9e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -714,6 +714,7 @@  const struct proto_ops inet6_dgram_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet6_sendmsg,		/* retpoline's sake */
 	.recvmsg	   = inet6_recvmsg,		/* retpoline's sake */
+	.read_sock	   = udp_read_sock,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = sock_no_sendpage,
 	.set_peek_off	   = sk_set_peek_off,