Message ID | 20210328202013.29223-10-xiyou.wangcong@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | sockmap: introduce BPF_SK_SKB_VERDICT and support UDP | expand |
Context | Check | Description |
---|---|---|
netdev/cover_letter | success | Link |
netdev/fixes_present | success | Link |
netdev/patch_count | success | Link |
netdev/tree_selection | success | Clearly marked for bpf-next |
netdev/subject_prefix | success | Link |
netdev/cc_maintainers | warning | 10 maintainers not CCed: dsahern@kernel.org yhs@fb.com kpsingh@kernel.org yoshfuji@linux-ipv6.org andrii@kernel.org kafai@fb.com ast@kernel.org songliubraving@fb.com davem@davemloft.net kuba@kernel.org |
netdev/source_inline | success | Was 0 now: 0 |
netdev/verify_signedoff | success | Link |
netdev/module_param | success | Was 0 now: 0 |
netdev/build_32bit | success | Errors and warnings before: 185 this patch: 185 |
netdev/kdoc | success | Errors and warnings before: 0 this patch: 0 |
netdev/verify_fixes | success | Link |
netdev/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 63 lines checked |
netdev/build_allmodconfig_warn | success | Errors and warnings before: 293 this patch: 293 |
netdev/header_inline | success | Link |
Cong Wang wrote: > From: Cong Wang <cong.wang@bytedance.com> > > This is similar to tcp_read_sock(), except we do not need > to worry about connections, we just need to retrieve skb > from UDP receive queue. > > Note, the return value of ->read_sock() is unused in > sk_psock_verdict_data_ready(). > > Cc: John Fastabend <john.fastabend@gmail.com> > Cc: Daniel Borkmann <daniel@iogearbox.net> > Cc: Jakub Sitnicki <jakub@cloudflare.com> > Cc: Lorenz Bauer <lmb@cloudflare.com> > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > --- > include/net/udp.h | 2 ++ > net/ipv4/af_inet.c | 1 + > net/ipv4/udp.c | 35 +++++++++++++++++++++++++++++++++++ > net/ipv6/af_inet6.c | 1 + > 4 files changed, 39 insertions(+) > > diff --git a/include/net/udp.h b/include/net/udp.h > index df7cc1edc200..347b62a753c3 100644 > --- a/include/net/udp.h > +++ b/include/net/udp.h > @@ -329,6 +329,8 @@ struct sock *__udp6_lib_lookup(struct net *net, > struct sk_buff *skb); > struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, > __be16 sport, __be16 dport); > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, > + sk_read_actor_t recv_actor); > > /* UDP uses skb->dev_scratch to cache as much information as possible and avoid > * possibly multiple cache miss on dequeue() > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > index 1355e6c0d567..f17870ee558b 100644 > --- a/net/ipv4/af_inet.c > +++ b/net/ipv4/af_inet.c > @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = { > .setsockopt = sock_common_setsockopt, > .getsockopt = sock_common_getsockopt, > .sendmsg = inet_sendmsg, > + .read_sock = udp_read_sock, > .recvmsg = inet_recvmsg, > .mmap = sock_no_mmap, > .sendpage = inet_sendpage, > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > index 38952aaee3a1..04620e4d64ab 100644 > --- a/net/ipv4/udp.c > +++ b/net/ipv4/udp.c > @@ -1782,6 +1782,41 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, > } > EXPORT_SYMBOL(__skb_recv_udp); > > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, > + sk_read_actor_t recv_actor) > +{ > + int copied = 0; > + > + while (1) { > + int offset = 0, err; Should this be int offset = sk_peek_offset()? MSG_PEEK should work from recv side, at least it does on TCP side. If its handled in some following patch a comment would be nice. I was just reading udp_recvmsg() so maybe its not needed. > + struct sk_buff *skb; > + > + skb = __skb_recv_udp(sk, 0, 1, &offset, &err); > + if (!skb) > + return err; > + if (offset < skb->len) { > + size_t len; > + int used; > + > + len = skb->len - offset; > + used = recv_actor(desc, skb, offset, len); > + if (used <= 0) { > + if (!copied) > + copied = used; > + break; > + } else if (used <= len) { > + copied += used; > + offset += used; The while loop is going to zero this? What are we trying to do here with offset? > + } > + } > + if (!desc->count) > + break; > + } > + > + return copied; > +} > +EXPORT_SYMBOL(udp_read_sock); > + > /* > * This should be easy, if there is something there we > * return it, otherwise we block. > diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c > index 802f5111805a..71de739b4a9e 100644 > --- a/net/ipv6/af_inet6.c > +++ b/net/ipv6/af_inet6.c > @@ -714,6 +714,7 @@ const struct proto_ops inet6_dgram_ops = { > .getsockopt = sock_common_getsockopt, /* ok */ > .sendmsg = inet6_sendmsg, /* retpoline's sake */ > .recvmsg = inet6_recvmsg, /* retpoline's sake */ > + .read_sock = udp_read_sock, > .mmap = sock_no_mmap, > .sendpage = sock_no_sendpage, > .set_peek_off = sk_set_peek_off, > -- > 2.25.1 >
On Mon, Mar 29, 2021 at 1:54 PM John Fastabend <john.fastabend@gmail.com> wrote: > > Cong Wang wrote: > > From: Cong Wang <cong.wang@bytedance.com> > > > > This is similar to tcp_read_sock(), except we do not need > > to worry about connections, we just need to retrieve skb > > from UDP receive queue. > > > > Note, the return value of ->read_sock() is unused in > > sk_psock_verdict_data_ready(). > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > Cc: Lorenz Bauer <lmb@cloudflare.com> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > --- > > include/net/udp.h | 2 ++ > > net/ipv4/af_inet.c | 1 + > > net/ipv4/udp.c | 35 +++++++++++++++++++++++++++++++++++ > > net/ipv6/af_inet6.c | 1 + > > 4 files changed, 39 insertions(+) > > > > diff --git a/include/net/udp.h b/include/net/udp.h > > index df7cc1edc200..347b62a753c3 100644 > > --- a/include/net/udp.h > > +++ b/include/net/udp.h > > @@ -329,6 +329,8 @@ struct sock *__udp6_lib_lookup(struct net *net, > > struct sk_buff *skb); > > struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, > > __be16 sport, __be16 dport); > > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, > > + sk_read_actor_t recv_actor); > > > > /* UDP uses skb->dev_scratch to cache as much information as possible and avoid > > * possibly multiple cache miss on dequeue() > > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > > index 1355e6c0d567..f17870ee558b 100644 > > --- a/net/ipv4/af_inet.c > > +++ b/net/ipv4/af_inet.c > > @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = { > > .setsockopt = sock_common_setsockopt, > > .getsockopt = sock_common_getsockopt, > > .sendmsg = inet_sendmsg, > > + .read_sock = udp_read_sock, > > .recvmsg = inet_recvmsg, > > .mmap = sock_no_mmap, > > .sendpage = inet_sendpage, > > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > > index 38952aaee3a1..04620e4d64ab 100644 > > --- a/net/ipv4/udp.c > > +++ b/net/ipv4/udp.c > > @@ -1782,6 +1782,41 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, > > } > > EXPORT_SYMBOL(__skb_recv_udp); > > > > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, > > + sk_read_actor_t recv_actor) > > +{ > > + int copied = 0; > > + > > + while (1) { > > + int offset = 0, err; > > Should this be > > int offset = sk_peek_offset()? What are you really suggesting? sk_peek_offset() is just 0 unless we have MSG_PEEK here and we don't, because we really want to dequeue the skb rather than peeking it. Are you suggesting we should do peeking? I am afraid we can't. Please be specific, guessing your mind is not an effective way to address your reviews. > > MSG_PEEK should work from recv side, at least it does on TCP side. If > its handled in some following patch a comment would be nice. I was > just reading udp_recvmsg() so maybe its not needed. Please explain why do we need peeking in sockmap? At very least it has nothing to do with my patchset. I do not know why you want to use TCP as a "standard" here, TCP also supports splice(), UDP still doesn't even with ->read_sock(). Of course they are very different. > > > + struct sk_buff *skb; > > + > > + skb = __skb_recv_udp(sk, 0, 1, &offset, &err); > > + if (!skb) > > + return err; > > + if (offset < skb->len) { > > + size_t len; > > + int used; > > + > > + len = skb->len - offset; > > + used = recv_actor(desc, skb, offset, len); > > + if (used <= 0) { > > + if (!copied) > > + copied = used; > > + break; > > + } else if (used <= len) { > > + copied += used; > > + offset += used; > > The while loop is going to zero this? What are we trying to do > here with offset? offset only matters for MSG_PEEK and we do not support peeking in sockmap case, hence it is unnecessary here. I "use" it here just to make the code as complete as possible. To further answer your question, it is set to 0 when we return a valid skb on line 201 inside __skb_try_recv_from_queue(), as "_off" is set to 0 and won't change unless we have MSG_PEEK. 173 bool peek_at_off = false; 174 struct sk_buff *skb; 175 int _off = 0; 176 177 if (unlikely(flags & MSG_PEEK && *off >= 0)) { 178 peek_at_off = true; 179 _off = *off; 180 } 181 182 *last = queue->prev; 183 skb_queue_walk(queue, skb) { 184 if (flags & MSG_PEEK) { 185 if (peek_at_off && _off >= skb->len && 186 (_off || skb->peeked)) { 187 _off -= skb->len; 188 continue; 189 } 190 if (!skb->len) { 191 skb = skb_set_peeked(skb); 192 if (IS_ERR(skb)) { 193 *err = PTR_ERR(skb); 194 return NULL; 195 } 196 } 197 refcount_inc(&skb->users); 198 } else { 199 __skb_unlink(skb, queue); 200 } 201 *off = _off; 202 return skb; Of course, when we return NULL, we return immediately without using offset: 1794 skb = __skb_recv_udp(sk, 0, 1, &offset, &err); 1795 if (!skb) 1796 return err; This should not be hard to figure out. Hope it is clear now. Thanks.
Cong Wang wrote: > On Mon, Mar 29, 2021 at 1:54 PM John Fastabend <john.fastabend@gmail.com> wrote: > > > > Cong Wang wrote: > > > From: Cong Wang <cong.wang@bytedance.com> > > > > > > This is similar to tcp_read_sock(), except we do not need > > > to worry about connections, we just need to retrieve skb > > > from UDP receive queue. > > > > > > Note, the return value of ->read_sock() is unused in > > > sk_psock_verdict_data_ready(). > > > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > > Cc: Lorenz Bauer <lmb@cloudflare.com> > > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > > --- [...] > > > } > > > EXPORT_SYMBOL(__skb_recv_udp); > > > > > > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, > > > + sk_read_actor_t recv_actor) > > > +{ > > > + int copied = 0; > > > + > > > + while (1) { > > > + int offset = 0, err; > > > > Should this be > > > > int offset = sk_peek_offset()? > > What are you really suggesting? sk_peek_offset() is just 0 unless > we have MSG_PEEK here and we don't, because we really want to > dequeue the skb rather than peeking it. > > Are you suggesting we should do peeking? I am afraid we can't. > Please be specific, guessing your mind is not an effective way to > address your reviews. I was only asking for further details because the offset addition below struck me as odd. > > > > > MSG_PEEK should work from recv side, at least it does on TCP side. If > > its handled in some following patch a comment would be nice. I was > > just reading udp_recvmsg() so maybe its not needed. > > Please explain why do we need peeking in sockmap? At very least > it has nothing to do with my patchset. We need MSG_PEEK to work from application side. From sockmap side I agree its not needed. > > I do not know why you want to use TCP as a "standard" here, TCP > also supports splice(), UDP still doesn't even with ->read_sock(). > Of course they are very different. Not claiming any "standard" here only that user application needs to work correctly if it passes MSG_PEEK. > > > > > > + struct sk_buff *skb; > > > + > > > + skb = __skb_recv_udp(sk, 0, 1, &offset, &err); > > > + if (!skb) > > > + return err; > > > + if (offset < skb->len) { > > > + size_t len; > > > + int used; > > > + > > > + len = skb->len - offset; > > > + used = recv_actor(desc, skb, offset, len); > > > + if (used <= 0) { > > > + if (!copied) > > > + copied = used; > > > + break; > > > + } else if (used <= len) { > > > + copied += used; > > > + offset += used; > > > > The while loop is going to zero this? What are we trying to do > > here with offset? > > offset only matters for MSG_PEEK and we do not support peeking > in sockmap case, hence it is unnecessary here. I "use" it here just > to make the code as complete as possible. huh? If its not used the addition is just confusing. Can we drop it? > > To further answer your question, it is set to 0 when we return a > valid skb on line 201 inside __skb_try_recv_from_queue(), as > "_off" is set to 0 and won't change unless we have MSG_PEEK. > > 173 bool peek_at_off = false; > 174 struct sk_buff *skb; > 175 int _off = 0; > 176 > 177 if (unlikely(flags & MSG_PEEK && *off >= 0)) { > 178 peek_at_off = true; > 179 _off = *off; > 180 } > 181 > 182 *last = queue->prev; > 183 skb_queue_walk(queue, skb) { > 184 if (flags & MSG_PEEK) { > 185 if (peek_at_off && _off >= skb->len && > 186 (_off || skb->peeked)) { > 187 _off -= skb->len; > 188 continue; > 189 } > 190 if (!skb->len) { > 191 skb = skb_set_peeked(skb); > 192 if (IS_ERR(skb)) { > 193 *err = PTR_ERR(skb); > 194 return NULL; > 195 } > 196 } > 197 refcount_inc(&skb->users); > 198 } else { > 199 __skb_unlink(skb, queue); > 200 } > 201 *off = _off; > 202 return skb; > > Of course, when we return NULL, we return immediately without > using offset: > > 1794 skb = __skb_recv_udp(sk, 0, 1, &offset, &err); > 1795 if (!skb) > 1796 return err; > > This should not be hard to figure out. Hope it is clear now. > Yes, but tracking offset only to clear it a couple lines later is confusing. > Thanks.
On Mon, Mar 29, 2021 at 11:23 PM John Fastabend <john.fastabend@gmail.com> wrote: > > Cong Wang wrote: > > On Mon, Mar 29, 2021 at 1:54 PM John Fastabend <john.fastabend@gmail.com> wrote: > > > > > > Cong Wang wrote: > > > > From: Cong Wang <cong.wang@bytedance.com> > > > > > > > > This is similar to tcp_read_sock(), except we do not need > > > > to worry about connections, we just need to retrieve skb > > > > from UDP receive queue. > > > > > > > > Note, the return value of ->read_sock() is unused in > > > > sk_psock_verdict_data_ready(). > > > > > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > > > Cc: Lorenz Bauer <lmb@cloudflare.com> > > > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > > > --- > > [...] > > > > > } > > > > EXPORT_SYMBOL(__skb_recv_udp); > > > > > > > > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, > > > > + sk_read_actor_t recv_actor) > > > > +{ > > > > + int copied = 0; > > > > + > > > > + while (1) { > > > > + int offset = 0, err; > > > > > > Should this be > > > > > > int offset = sk_peek_offset()? > > > > What are you really suggesting? sk_peek_offset() is just 0 unless > > we have MSG_PEEK here and we don't, because we really want to > > dequeue the skb rather than peeking it. > > > > Are you suggesting we should do peeking? I am afraid we can't. > > Please be specific, guessing your mind is not an effective way to > > address your reviews. > > I was only asking for further details because the offset addition > below struck me as odd. > > > > > > > > > MSG_PEEK should work from recv side, at least it does on TCP side. If > > > its handled in some following patch a comment would be nice. I was > > > just reading udp_recvmsg() so maybe its not needed. > > > > Please explain why do we need peeking in sockmap? At very least > > it has nothing to do with my patchset. > > We need MSG_PEEK to work from application side. From sockmap > side I agree its not needed. How does the application reach udp_read_sock()? UDP does not support splice() as I already mentioned, as ->splice_read() is still missing. > > > > > I do not know why you want to use TCP as a "standard" here, TCP > > also supports splice(), UDP still doesn't even with ->read_sock(). > > Of course they are very different. > > Not claiming any "standard" here only that user application needs > to work correctly if it passes MSG_PEEK. I do not see how an application could pass any msg flag to udp_read_sock(). > > > > > > > > > > + struct sk_buff *skb; > > > > + > > > > + skb = __skb_recv_udp(sk, 0, 1, &offset, &err); > > > > + if (!skb) > > > > + return err; > > > > + if (offset < skb->len) { > > > > + size_t len; > > > > + int used; > > > > + > > > > + len = skb->len - offset; > > > > + used = recv_actor(desc, skb, offset, len); > > > > + if (used <= 0) { > > > > + if (!copied) > > > > + copied = used; > > > > + break; > > > > + } else if (used <= len) { > > > > + copied += used; > > > > + offset += used; > > > > > > The while loop is going to zero this? What are we trying to do > > > here with offset? > > > > offset only matters for MSG_PEEK and we do not support peeking > > in sockmap case, hence it is unnecessary here. I "use" it here just > > to make the code as complete as possible. > > huh? If its not used the addition is just confusing. Can we drop it? If you mean dropping this single line of code, yes. If you mean dropping 'offset' completely, no, as both __skb_recv_udp() and recv_actor() still need it. If you mean I should re-write __skb_recv_udp() and recv_actor() just to drop 'offset', I am afraid that is too much with too little gain. > > > > > To further answer your question, it is set to 0 when we return a > > valid skb on line 201 inside __skb_try_recv_from_queue(), as > > "_off" is set to 0 and won't change unless we have MSG_PEEK. > > > > 173 bool peek_at_off = false; > > 174 struct sk_buff *skb; > > 175 int _off = 0; > > 176 > > 177 if (unlikely(flags & MSG_PEEK && *off >= 0)) { > > 178 peek_at_off = true; > > 179 _off = *off; > > 180 } > > 181 > > 182 *last = queue->prev; > > 183 skb_queue_walk(queue, skb) { > > 184 if (flags & MSG_PEEK) { > > 185 if (peek_at_off && _off >= skb->len && > > 186 (_off || skb->peeked)) { > > 187 _off -= skb->len; > > 188 continue; > > 189 } > > 190 if (!skb->len) { > > 191 skb = skb_set_peeked(skb); > > 192 if (IS_ERR(skb)) { > > 193 *err = PTR_ERR(skb); > > 194 return NULL; > > 195 } > > 196 } > > 197 refcount_inc(&skb->users); > > 198 } else { > > 199 __skb_unlink(skb, queue); > > 200 } > > 201 *off = _off; > > 202 return skb; > > > > Of course, when we return NULL, we return immediately without > > using offset: > > > > 1794 skb = __skb_recv_udp(sk, 0, 1, &offset, &err); > > 1795 if (!skb) > > 1796 return err; > > > > This should not be hard to figure out. Hope it is clear now. > > > > Yes, but tracking offset only to clear it a couple lines later > is confusing. Yeah, but that's __skb_recv_udp()'s fault, not mine. We can refactor __skb_recv_udp() a bit for !MSG_PEEK case, but I do not see much gain here. Thanks.
Cong Wang wrote: > On Mon, Mar 29, 2021 at 11:23 PM John Fastabend > <john.fastabend@gmail.com> wrote: > > > > Cong Wang wrote: > > > On Mon, Mar 29, 2021 at 1:54 PM John Fastabend <john.fastabend@gmail.com> wrote: > > > > > > > > Cong Wang wrote: > > > > > From: Cong Wang <cong.wang@bytedance.com> > > > > > > > > > > This is similar to tcp_read_sock(), except we do not need > > > > > to worry about connections, we just need to retrieve skb > > > > > from UDP receive queue. > > > > > > > > > > Note, the return value of ->read_sock() is unused in > > > > > sk_psock_verdict_data_ready(). > > > > > > > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > > > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > > > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > > > > Cc: Lorenz Bauer <lmb@cloudflare.com> > > > > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > > > > --- > > > > [...] > > > > > > > } > > > > > EXPORT_SYMBOL(__skb_recv_udp); > > > > > > > > > > +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, > > > > > + sk_read_actor_t recv_actor) > > > > > +{ > > > > > + int copied = 0; > > > > > + > > > > > + while (1) { > > > > > + int offset = 0, err; > > > > > > > > Should this be > > > > > > > > int offset = sk_peek_offset()? > > > > > > What are you really suggesting? sk_peek_offset() is just 0 unless > > > we have MSG_PEEK here and we don't, because we really want to > > > dequeue the skb rather than peeking it. > > > > > > Are you suggesting we should do peeking? I am afraid we can't. > > > Please be specific, guessing your mind is not an effective way to > > > address your reviews. > > > > I was only asking for further details because the offset addition > > below struck me as odd. > > > > > > > > > > > > > MSG_PEEK should work from recv side, at least it does on TCP side. If > > > > its handled in some following patch a comment would be nice. I was > > > > just reading udp_recvmsg() so maybe its not needed. > > > > > > Please explain why do we need peeking in sockmap? At very least > > > it has nothing to do with my patchset. > > > > We need MSG_PEEK to work from application side. From sockmap > > side I agree its not needed. > > How does the application reach udp_read_sock()? UDP does not support > splice() as I already mentioned, as ->splice_read() is still missing. It doesn't. All I was trying to say is if an application calls recvmsg(..., MSG_PEEK) it should work correctly. It wasn't a comment about this specific patch. > > > > > > > > > I do not know why you want to use TCP as a "standard" here, TCP > > > also supports splice(), UDP still doesn't even with ->read_sock(). > > > Of course they are very different. > > > > Not claiming any "standard" here only that user application needs > > to work correctly if it passes MSG_PEEK. > > I do not see how an application could pass any msg flag to > udp_read_sock(). Agree. > > > > > > > > > > > > > > > + struct sk_buff *skb; > > > > > + > > > > > + skb = __skb_recv_udp(sk, 0, 1, &offset, &err); > > > > > + if (!skb) > > > > > + return err; > > > > > + if (offset < skb->len) { > > > > > + size_t len; > > > > > + int used; > > > > > + > > > > > + len = skb->len - offset; > > > > > + used = recv_actor(desc, skb, offset, len); > > > > > + if (used <= 0) { > > > > > + if (!copied) > > > > > + copied = used; > > > > > + break; > > > > > + } else if (used <= len) { > > > > > + copied += used; > > > > > + offset += used; > > > > > > > > The while loop is going to zero this? What are we trying to do > > > > here with offset? > > > > > > offset only matters for MSG_PEEK and we do not support peeking > > > in sockmap case, hence it is unnecessary here. I "use" it here just > > > to make the code as complete as possible. > > > > huh? If its not used the addition is just confusing. Can we drop it? > > If you mean dropping this single line of code, yes. If you mean > dropping 'offset' completely, no, as both __skb_recv_udp() and > recv_actor() still need it. If you mean I should re-write > __skb_recv_udp() and recv_actor() just to drop 'offset', I am afraid > that is too much with too little gain. All I'm saying is drop the single line of code above. This specific one 'offset += used' And add a comment in the commit msg that just says peeking is not supported. I think we need at least one more respin of the patches anyways to address a different small comment so should be easy. > > > > > > > > > To further answer your question, it is set to 0 when we return a > > > valid skb on line 201 inside __skb_try_recv_from_queue(), as > > > "_off" is set to 0 and won't change unless we have MSG_PEEK. > > > > > > 173 bool peek_at_off = false; > > > 174 struct sk_buff *skb; > > > 175 int _off = 0; > > > 176 > > > 177 if (unlikely(flags & MSG_PEEK && *off >= 0)) { > > > 178 peek_at_off = true; > > > 179 _off = *off; > > > 180 } > > > 181 > > > 182 *last = queue->prev; > > > 183 skb_queue_walk(queue, skb) { > > > 184 if (flags & MSG_PEEK) { > > > 185 if (peek_at_off && _off >= skb->len && > > > 186 (_off || skb->peeked)) { > > > 187 _off -= skb->len; > > > 188 continue; > > > 189 } > > > 190 if (!skb->len) { > > > 191 skb = skb_set_peeked(skb); > > > 192 if (IS_ERR(skb)) { > > > 193 *err = PTR_ERR(skb); > > > 194 return NULL; > > > 195 } > > > 196 } > > > 197 refcount_inc(&skb->users); > > > 198 } else { > > > 199 __skb_unlink(skb, queue); > > > 200 } > > > 201 *off = _off; > > > 202 return skb; > > > > > > Of course, when we return NULL, we return immediately without > > > using offset: > > > > > > 1794 skb = __skb_recv_udp(sk, 0, 1, &offset, &err); > > > 1795 if (!skb) > > > 1796 return err; > > > > > > This should not be hard to figure out. Hope it is clear now. > > > > > > > Yes, but tracking offset only to clear it a couple lines later > > is confusing. > > Yeah, but that's __skb_recv_udp()'s fault, not mine. We can refactor > __skb_recv_udp() a bit for !MSG_PEEK case, but I do not see > much gain here. No don't bother here. I don't see much gain in doing that either. If you want do it in another series not this one. > > Thanks.
diff --git a/include/net/udp.h b/include/net/udp.h index df7cc1edc200..347b62a753c3 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -329,6 +329,8 @@ struct sock *__udp6_lib_lookup(struct net *net, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1355e6c0d567..f17870ee558b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, + .read_sock = udp_read_sock, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 38952aaee3a1..04620e4d64ab 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1782,6 +1782,41 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, } EXPORT_SYMBOL(__skb_recv_udp); +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + int copied = 0; + + while (1) { + int offset = 0, err; + struct sk_buff *skb; + + skb = __skb_recv_udp(sk, 0, 1, &offset, &err); + if (!skb) + return err; + if (offset < skb->len) { + size_t len; + int used; + + len = skb->len - offset; + used = recv_actor(desc, skb, offset, len); + if (used <= 0) { + if (!copied) + copied = used; + break; + } else if (used <= len) { + copied += used; + offset += used; + } + } + if (!desc->count) + break; + } + + return copied; +} +EXPORT_SYMBOL(udp_read_sock); + /* * This should be easy, if there is something there we * return it, otherwise we block. diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 802f5111805a..71de739b4a9e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -714,6 +714,7 @@ const struct proto_ops inet6_dgram_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ + .read_sock = udp_read_sock, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off,