Message ID | 35d722e246b7c4afb6afb03760df6f664db4ef05.1642519257.git.pabeni@redhat.com (mailing list archive) |
---|---|
State | RFC |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | gro: some minor optimizations | expand |
From: Paolo Abeni <pabeni@redhat.com> Date: Tue, 18 Jan 2022 16:24:19 +0100 > While inspecting some perf report, I noticed that the compiler > emits suboptimal code for the napi CB initialization, fetching > and storing multiple times the memory for flags bitfield. > This is with gcc 10.3.1, but I observed the same with older compiler > versions. > > We can help the compiler to do a nicer work e.g. initially setting > all the bitfield to 0 using an u16 alias. The generated code is quite > smaller, with the same number of conditional > > Before: > objdump -t net/core/gro.o | grep " F .text" > 0000000000000bb0 l F .text 0000000000000357 dev_gro_receive > > After: > 0000000000000bb0 l F .text 000000000000033c dev_gro_receive > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > --- > include/net/gro.h | 13 +++++++++---- > net/core/gro.c | 16 +++++----------- > 2 files changed, 14 insertions(+), 15 deletions(-) > > diff --git a/include/net/gro.h b/include/net/gro.h > index 8f75802d50fd..a068b27d341f 100644 > --- a/include/net/gro.h > +++ b/include/net/gro.h > @@ -29,14 +29,17 @@ struct napi_gro_cb { > /* Number of segments aggregated. */ > u16 count; > > - /* Start offset for remote checksum offload */ > - u16 gro_remcsum_start; > + /* Used in ipv6_gro_receive() and foo-over-udp */ > + u16 proto; > > /* jiffies when first packet was created/queued */ > unsigned long age; > > - /* Used in ipv6_gro_receive() and foo-over-udp */ > - u16 proto; > + /* portion of the cb set to zero at every gro iteration */ > + u32 zeroed_start[0]; > + > + /* Start offset for remote checksum offload */ > + u16 gro_remcsum_start; > > /* This is non-zero if the packet may be of the same flow. */ > u8 same_flow:1; > @@ -70,6 +73,8 @@ struct napi_gro_cb { > /* GRO is done by frag_list pointer chaining. */ > u8 is_flist:1; > > + u32 zeroed_end[0]; This should be wrapped in struct_group() I believe, or compilers will start complaining soon. See [0] for the details. Adding Kees to the CCs. > + > /* used to support CHECKSUM_COMPLETE for tunneling protocols */ > __wsum csum; > > diff --git a/net/core/gro.c b/net/core/gro.c > index d43d42215bdb..b9ebe9298731 100644 > --- a/net/core/gro.c > +++ b/net/core/gro.c > @@ -435,6 +435,9 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) > napi_gro_complete(napi, oldest); > } > > +#define zeroed_len (offsetof(struct napi_gro_cb, zeroed_end) - \ > + offsetof(struct napi_gro_cb, zeroed_start)) > + > static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) > { > u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); > @@ -459,29 +462,20 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > skb_set_network_header(skb, skb_gro_offset(skb)); > skb_reset_mac_len(skb); > - NAPI_GRO_CB(skb)->same_flow = 0; > + BUILD_BUG_ON(zeroed_len != sizeof(NAPI_GRO_CB(skb)->zeroed_start[0])); > + NAPI_GRO_CB(skb)->zeroed_start[0] = 0; > NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); > - NAPI_GRO_CB(skb)->free = 0; > - NAPI_GRO_CB(skb)->encap_mark = 0; > - NAPI_GRO_CB(skb)->recursion_counter = 0; > - NAPI_GRO_CB(skb)->is_fou = 0; > NAPI_GRO_CB(skb)->is_atomic = 1; > - NAPI_GRO_CB(skb)->gro_remcsum_start = 0; > > /* Setup for GRO checksum validation */ > switch (skb->ip_summed) { > case CHECKSUM_COMPLETE: > NAPI_GRO_CB(skb)->csum = skb->csum; > NAPI_GRO_CB(skb)->csum_valid = 1; > - NAPI_GRO_CB(skb)->csum_cnt = 0; > break; > case CHECKSUM_UNNECESSARY: > NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; > - NAPI_GRO_CB(skb)->csum_valid = 0; > break; > - default: > - NAPI_GRO_CB(skb)->csum_cnt = 0; > - NAPI_GRO_CB(skb)->csum_valid = 0; > } > > pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, > -- > 2.34.1 [0] https://lore.kernel.org/linux-hardening/20220112220652.3952944-1-keescook@chromium.org Thanks, Al
On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote: > From: Paolo Abeni <pabeni@redhat.com> > Date: Tue, 18 Jan 2022 16:24:19 +0100 > > > While inspecting some perf report, I noticed that the compiler > > emits suboptimal code for the napi CB initialization, fetching > > and storing multiple times the memory for flags bitfield. > > This is with gcc 10.3.1, but I observed the same with older compiler > > versions. > > > > We can help the compiler to do a nicer work e.g. initially setting > > all the bitfield to 0 using an u16 alias. The generated code is quite > > smaller, with the same number of conditional > > > > Before: > > objdump -t net/core/gro.o | grep " F .text" > > 0000000000000bb0 l F .text 0000000000000357 dev_gro_receive > > > > After: > > 0000000000000bb0 l F .text 000000000000033c dev_gro_receive > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > > --- > > include/net/gro.h | 13 +++++++++---- > > net/core/gro.c | 16 +++++----------- > > 2 files changed, 14 insertions(+), 15 deletions(-) > > > > diff --git a/include/net/gro.h b/include/net/gro.h > > index 8f75802d50fd..a068b27d341f 100644 > > --- a/include/net/gro.h > > +++ b/include/net/gro.h > > @@ -29,14 +29,17 @@ struct napi_gro_cb { > > /* Number of segments aggregated. */ > > u16 count; > > > > - /* Start offset for remote checksum offload */ > > - u16 gro_remcsum_start; > > + /* Used in ipv6_gro_receive() and foo-over-udp */ > > + u16 proto; > > > > /* jiffies when first packet was created/queued */ > > unsigned long age; > > > > - /* Used in ipv6_gro_receive() and foo-over-udp */ > > - u16 proto; > > + /* portion of the cb set to zero at every gro iteration */ > > + u32 zeroed_start[0]; > > + > > + /* Start offset for remote checksum offload */ > > + u16 gro_remcsum_start; > > > > /* This is non-zero if the packet may be of the same flow. */ > > u8 same_flow:1; > > @@ -70,6 +73,8 @@ struct napi_gro_cb { > > /* GRO is done by frag_list pointer chaining. */ > > u8 is_flist:1; > > > > + u32 zeroed_end[0]; > > This should be wrapped in struct_group() I believe, or compilers > will start complaining soon. See [0] for the details. > Adding Kees to the CCs. Thank you for the reference. That really slipped-off my mind. This patch does not use memcpy() or similar, just a single direct assignement. Would that still require struct_group()? Thanks! Paolo
From: Paolo Abeni <pabeni@redhat.com> Date: Tue, 18 Jan 2022 17:31:00 +0100 > On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote: > > From: Paolo Abeni <pabeni@redhat.com> > > Date: Tue, 18 Jan 2022 16:24:19 +0100 > > > > > While inspecting some perf report, I noticed that the compiler > > > emits suboptimal code for the napi CB initialization, fetching > > > and storing multiple times the memory for flags bitfield. > > > This is with gcc 10.3.1, but I observed the same with older compiler > > > versions. > > > > > > We can help the compiler to do a nicer work e.g. initially setting > > > all the bitfield to 0 using an u16 alias. The generated code is quite > > > smaller, with the same number of conditional > > > > > > Before: > > > objdump -t net/core/gro.o | grep " F .text" > > > 0000000000000bb0 l F .text 0000000000000357 dev_gro_receive > > > > > > After: > > > 0000000000000bb0 l F .text 000000000000033c dev_gro_receive > > > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > > > --- > > > include/net/gro.h | 13 +++++++++---- > > > net/core/gro.c | 16 +++++----------- > > > 2 files changed, 14 insertions(+), 15 deletions(-) > > > > > > diff --git a/include/net/gro.h b/include/net/gro.h > > > index 8f75802d50fd..a068b27d341f 100644 > > > --- a/include/net/gro.h > > > +++ b/include/net/gro.h > > > @@ -29,14 +29,17 @@ struct napi_gro_cb { > > > /* Number of segments aggregated. */ > > > u16 count; > > > > > > - /* Start offset for remote checksum offload */ > > > - u16 gro_remcsum_start; > > > + /* Used in ipv6_gro_receive() and foo-over-udp */ > > > + u16 proto; > > > > > > /* jiffies when first packet was created/queued */ > > > unsigned long age; > > > > > > - /* Used in ipv6_gro_receive() and foo-over-udp */ > > > - u16 proto; > > > + /* portion of the cb set to zero at every gro iteration */ > > > + u32 zeroed_start[0]; > > > + > > > + /* Start offset for remote checksum offload */ > > > + u16 gro_remcsum_start; > > > > > > /* This is non-zero if the packet may be of the same flow. */ > > > u8 same_flow:1; > > > @@ -70,6 +73,8 @@ struct napi_gro_cb { > > > /* GRO is done by frag_list pointer chaining. */ > > > u8 is_flist:1; > > > > > > + u32 zeroed_end[0]; > > > > This should be wrapped in struct_group() I believe, or compilers > > will start complaining soon. See [0] for the details. > > Adding Kees to the CCs. > > Thank you for the reference. That really slipped-off my mind. > > This patch does not use memcpy() or similar, just a single direct > assignement. Would that still require struct_group()? Oof, sorry, I saw start/end and overlooked that it's only for a single assignment. Then it shouldn't cause warnings, but maybe use an anonymous union instead? union { u32 zeroed; struct { u16 gro_remcsum_start; ... }; }; __wsum csum; Use can still use a BUILD_BUG_ON() in this case, like sizeof(zeroed) != offsetof(csum) - offsetof(zeroed). > > Thanks! > > Paolo Thanks, Al
Hello, On Tue, 2022-01-18 at 18:39 +0100, Alexander Lobakin wrote: > From: Paolo Abeni <pabeni@redhat.com> > Date: Tue, 18 Jan 2022 17:31:00 +0100 > > > On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote: > > > From: Paolo Abeni <pabeni@redhat.com> > > > Date: Tue, 18 Jan 2022 16:24:19 +0100 > > > > > > > While inspecting some perf report, I noticed that the compiler > > > > emits suboptimal code for the napi CB initialization, fetching > > > > and storing multiple times the memory for flags bitfield. > > > > This is with gcc 10.3.1, but I observed the same with older compiler > > > > versions. > > > > > > > > We can help the compiler to do a nicer work e.g. initially setting > > > > all the bitfield to 0 using an u16 alias. The generated code is quite > > > > smaller, with the same number of conditional > > > > > > > > Before: > > > > objdump -t net/core/gro.o | grep " F .text" > > > > 0000000000000bb0 l F .text 0000000000000357 dev_gro_receive > > > > > > > > After: > > > > 0000000000000bb0 l F .text 000000000000033c dev_gro_receive > > > > > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > > > > --- > > > > include/net/gro.h | 13 +++++++++---- > > > > net/core/gro.c | 16 +++++----------- > > > > 2 files changed, 14 insertions(+), 15 deletions(-) > > > > > > > > diff --git a/include/net/gro.h b/include/net/gro.h > > > > index 8f75802d50fd..a068b27d341f 100644 > > > > --- a/include/net/gro.h > > > > +++ b/include/net/gro.h > > > > @@ -29,14 +29,17 @@ struct napi_gro_cb { > > > > /* Number of segments aggregated. */ > > > > u16 count; > > > > > > > > - /* Start offset for remote checksum offload */ > > > > - u16 gro_remcsum_start; > > > > + /* Used in ipv6_gro_receive() and foo-over-udp */ > > > > + u16 proto; > > > > > > > > /* jiffies when first packet was created/queued */ > > > > unsigned long age; > > > > > > > > - /* Used in ipv6_gro_receive() and foo-over-udp */ > > > > - u16 proto; > > > > + /* portion of the cb set to zero at every gro iteration */ > > > > + u32 zeroed_start[0]; > > > > + > > > > + /* Start offset for remote checksum offload */ > > > > + u16 gro_remcsum_start; > > > > > > > > /* This is non-zero if the packet may be of the same flow. */ > > > > u8 same_flow:1; > > > > @@ -70,6 +73,8 @@ struct napi_gro_cb { > > > > /* GRO is done by frag_list pointer chaining. */ > > > > u8 is_flist:1; > > > > > > > > + u32 zeroed_end[0]; > > > > > > This should be wrapped in struct_group() I believe, or compilers > > > will start complaining soon. See [0] for the details. > > > Adding Kees to the CCs. > > > > Thank you for the reference. That really slipped-off my mind. > > > > This patch does not use memcpy() or similar, just a single direct > > assignement. Would that still require struct_group()? > > Oof, sorry, I saw start/end and overlooked that it's only for > a single assignment. > Then it shouldn't cause warnings, but maybe use an anonymous > union instead? > > union { > u32 zeroed; > struct { > u16 gro_remcsum_start; > ... > }; > }; > __wsum csum; > > Use can still use a BUILD_BUG_ON() in this case, like > sizeof(zeroed) != offsetof(csum) - offsetof(zeroed). Please forgive me for the very long delay. I'm looking again at this stuff for formal non-rfc submission. I like the anonymous union less, because it will move around much more code - making the patch less readable - and will be more fragile e.g. some comment alike "please don't move around 'csum'" would be needed. No strong opinion anyway, so if you really prefer that way I can adapt. Please let me know. Thanks! Paolo
From: Paolo Abeni <pabeni@redhat.com> Date: Wed, 02 Feb 2022 11:09:41 +0100 > Hello, Hi! > > On Tue, 2022-01-18 at 18:39 +0100, Alexander Lobakin wrote: > > From: Paolo Abeni <pabeni@redhat.com> > > Date: Tue, 18 Jan 2022 17:31:00 +0100 > > > > > On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote: > > > > From: Paolo Abeni <pabeni@redhat.com> > > > > Date: Tue, 18 Jan 2022 16:24:19 +0100 > > > > > > > > > While inspecting some perf report, I noticed that the compiler > > > > > emits suboptimal code for the napi CB initialization, fetching > > > > > and storing multiple times the memory for flags bitfield. > > > > > This is with gcc 10.3.1, but I observed the same with older compiler > > > > > versions. > > > > > > > > > > We can help the compiler to do a nicer work e.g. initially setting > > > > > all the bitfield to 0 using an u16 alias. The generated code is quite > > > > > smaller, with the same number of conditional > > > > > > > > > > Before: > > > > > objdump -t net/core/gro.o | grep " F .text" > > > > > 0000000000000bb0 l F .text 0000000000000357 dev_gro_receive > > > > > > > > > > After: > > > > > 0000000000000bb0 l F .text 000000000000033c dev_gro_receive > > > > > > > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > > > > > --- > > > > > include/net/gro.h | 13 +++++++++---- > > > > > net/core/gro.c | 16 +++++----------- > > > > > 2 files changed, 14 insertions(+), 15 deletions(-) > > > > > > > > > > diff --git a/include/net/gro.h b/include/net/gro.h > > > > > index 8f75802d50fd..a068b27d341f 100644 > > > > > --- a/include/net/gro.h > > > > > +++ b/include/net/gro.h > > > > > @@ -29,14 +29,17 @@ struct napi_gro_cb { > > > > > /* Number of segments aggregated. */ > > > > > u16 count; > > > > > > > > > > - /* Start offset for remote checksum offload */ > > > > > - u16 gro_remcsum_start; > > > > > + /* Used in ipv6_gro_receive() and foo-over-udp */ > > > > > + u16 proto; > > > > > > > > > > /* jiffies when first packet was created/queued */ > > > > > unsigned long age; > > > > > > > > > > - /* Used in ipv6_gro_receive() and foo-over-udp */ > > > > > - u16 proto; > > > > > + /* portion of the cb set to zero at every gro iteration */ > > > > > + u32 zeroed_start[0]; > > > > > + > > > > > + /* Start offset for remote checksum offload */ > > > > > + u16 gro_remcsum_start; > > > > > > > > > > /* This is non-zero if the packet may be of the same flow. */ > > > > > u8 same_flow:1; > > > > > @@ -70,6 +73,8 @@ struct napi_gro_cb { > > > > > /* GRO is done by frag_list pointer chaining. */ > > > > > u8 is_flist:1; > > > > > > > > > > + u32 zeroed_end[0]; > > > > > > > > This should be wrapped in struct_group() I believe, or compilers > > > > will start complaining soon. See [0] for the details. > > > > Adding Kees to the CCs. > > > > > > Thank you for the reference. That really slipped-off my mind. > > > > > > This patch does not use memcpy() or similar, just a single direct > > > assignement. Would that still require struct_group()? > > > > Oof, sorry, I saw start/end and overlooked that it's only for > > a single assignment. > > Then it shouldn't cause warnings, but maybe use an anonymous > > union instead? > > > > union { > > u32 zeroed; > > struct { > > u16 gro_remcsum_start; > > ... > > }; > > }; > > __wsum csum; > > > > Use can still use a BUILD_BUG_ON() in this case, like > > sizeof(zeroed) != offsetof(csum) - offsetof(zeroed). > > Please forgive me for the very long delay. I'm looking again at this > stuff for formal non-rfc submission. Sure, not a problem at all (: > > I like the anonymous union less, because it will move around much more > code - making the patch less readable - and will be more fragile e.g. > some comment alike "please don't move around 'csum'" would be needed. We still need comments around zeroed_{start,end}[0] for now. I used offsetof(csum) as offsetofend(is_flist) which I'd prefer here unfortunately expands to offsetof(is_flist) + sizeof(is_flist), and the latter causes an error of using sizeof() against a bitfield. > > No strong opinion anyway, so if you really prefer that way I can adapt. > Please let me know. I don't really have a strong preference here, I just suspect that zero-length array will produce or already produce -Warray-bounds warnings, and empty-struct constructs like struct { } zeroed_start; u16 gro_remcsum_start; ... struct { } zeroed_end; memset(NAPI_GRO_CB(skb)->zeroed_start, 0, offsetofend(zeroed_end) - offsetsof(zeroed_start)); will trigger Fortify compile-time errors from Kees' KSPP tree. I think we could use __struct_group(/* no tag */, zeroed, /* no attrs */, u16 gro_remcsum_start; ... u8 is_flist:1; ); __wsum csum; BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; This doesn't depend on `csum`, doesn't need `struct { }` or `struct zero[0]` markers and still uses a direct assignment. Also adding Gustavo, maybe he'd like to leave a comment here. > > Thanks! > > Paolo Thanks, Al
On Wed, Feb 02, 2022 at 01:08:27PM +0100, Alexander Lobakin wrote: > From: Paolo Abeni <pabeni@redhat.com> > Date: Wed, 02 Feb 2022 11:09:41 +0100 > > > Hello, > > Hi! > > > > > On Tue, 2022-01-18 at 18:39 +0100, Alexander Lobakin wrote: > > > From: Paolo Abeni <pabeni@redhat.com> > > > Date: Tue, 18 Jan 2022 17:31:00 +0100 > > > > > > > On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote: > > > > > From: Paolo Abeni <pabeni@redhat.com> > > > > > Date: Tue, 18 Jan 2022 16:24:19 +0100 > > > > > > > > > > > While inspecting some perf report, I noticed that the compiler > > > > > > emits suboptimal code for the napi CB initialization, fetching > > > > > > and storing multiple times the memory for flags bitfield. > > > > > > This is with gcc 10.3.1, but I observed the same with older compiler > > > > > > versions. > > > > > > > > > > > > We can help the compiler to do a nicer work e.g. initially setting > > > > > > all the bitfield to 0 using an u16 alias. The generated code is quite > > > > > > smaller, with the same number of conditional > > > > > > > > > > > > Before: > > > > > > objdump -t net/core/gro.o | grep " F .text" > > > > > > 0000000000000bb0 l F .text 0000000000000357 dev_gro_receive > > > > > > > > > > > > After: > > > > > > 0000000000000bb0 l F .text 000000000000033c dev_gro_receive > > > > > > > > > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > > > > > > --- > > > > > > include/net/gro.h | 13 +++++++++---- > > > > > > net/core/gro.c | 16 +++++----------- > > > > > > 2 files changed, 14 insertions(+), 15 deletions(-) > > > > > > > > > > > > diff --git a/include/net/gro.h b/include/net/gro.h > > > > > > index 8f75802d50fd..a068b27d341f 100644 > > > > > > --- a/include/net/gro.h > > > > > > +++ b/include/net/gro.h > > > > > > @@ -29,14 +29,17 @@ struct napi_gro_cb { > > > > > > /* Number of segments aggregated. */ > > > > > > u16 count; > > > > > > > > > > > > - /* Start offset for remote checksum offload */ > > > > > > - u16 gro_remcsum_start; > > > > > > + /* Used in ipv6_gro_receive() and foo-over-udp */ > > > > > > + u16 proto; > > > > > > > > > > > > /* jiffies when first packet was created/queued */ > > > > > > unsigned long age; > > > > > > > > > > > > - /* Used in ipv6_gro_receive() and foo-over-udp */ > > > > > > - u16 proto; > > > > > > + /* portion of the cb set to zero at every gro iteration */ > > > > > > + u32 zeroed_start[0]; > > > > > > + > > > > > > + /* Start offset for remote checksum offload */ > > > > > > + u16 gro_remcsum_start; > > > > > > > > > > > > /* This is non-zero if the packet may be of the same flow. */ > > > > > > u8 same_flow:1; > > > > > > @@ -70,6 +73,8 @@ struct napi_gro_cb { > > > > > > /* GRO is done by frag_list pointer chaining. */ > > > > > > u8 is_flist:1; > > > > > > > > > > > > + u32 zeroed_end[0]; > > > > > > > > > > This should be wrapped in struct_group() I believe, or compilers > > > > > will start complaining soon. See [0] for the details. > > > > > Adding Kees to the CCs. Hi! Sorry I missed the original email sent my way. :) > > > > > > > > Thank you for the reference. That really slipped-off my mind. > > > > > > > > This patch does not use memcpy() or similar, just a single direct > > > > assignement. Would that still require struct_group()? > > > > > > Oof, sorry, I saw start/end and overlooked that it's only for > > > a single assignment. > > > Then it shouldn't cause warnings, but maybe use an anonymous > > > union instead? > > > > > > union { > > > u32 zeroed; > > > struct { > > > u16 gro_remcsum_start; > > > ... > > > }; > > > }; > > > __wsum csum; > > > > > > Use can still use a BUILD_BUG_ON() in this case, like > > > sizeof(zeroed) != offsetof(csum) - offsetof(zeroed). > > > > Please forgive me for the very long delay. I'm looking again at this > > stuff for formal non-rfc submission. > > Sure, not a problem at all (: > > > > > I like the anonymous union less, because it will move around much more > > code - making the patch less readable - and will be more fragile e.g. > > some comment alike "please don't move around 'csum'" would be needed. > > We still need comments around zeroed_{start,end}[0] for now. > I used offsetof(csum) as offsetofend(is_flist) which I'd prefer here > unfortunately expands to offsetof(is_flist) + sizeof(is_flist), and > the latter causes an error of using sizeof() against a bitfield. > > > > > No strong opinion anyway, so if you really prefer that way I can adapt. > > Please let me know. > > I don't really have a strong preference here, I just suspect that > zero-length array will produce or already produce -Warray-bounds > warnings, and empty-struct constructs like Yes, -Warray-bounds would yell very loudly about an 0-element array having its first element assigned. :) > > struct { } zeroed_start; > u16 gro_remcsum_start; > ... > struct { } zeroed_end; > > memset(NAPI_GRO_CB(skb)->zeroed_start, 0, > offsetofend(zeroed_end) - offsetsof(zeroed_start)); > > will trigger Fortify compile-time errors from Kees' KSPP tree. > > I think we could use > > __struct_group(/* no tag */, zeroed, /* no attrs */, Since this isn't UAPI, you can just do: struct_group(zeroed, > u16 gro_remcsum_start; > ... > u8 is_flist:1; > ); > __wsum csum; > > BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); > BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), > sizeof(u32))); /* Avoid slow unaligned acc */ > > *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; > > This doesn't depend on `csum`, doesn't need `struct { }` or > `struct zero[0]` markers and still uses a direct assignment. Given the kernel's -O2 use, a constant-sized u32 memcpy() and a direct assignment will resolve to the same machine code: https://godbolt.org/z/b9qKexx6q void zero_var(struct object *p) { p->zero_var = 0; } void zero_group(struct object *p) { memset(&p->zero_group, 0, sizeof(p->zero_group)); } zero_var: movl $0, (%rdi) ret zero_group: movl $0, (%rdi) ret And assigning them individually results in careful "and"ing to avoid the padding: zero_each: andl $-134217728, (%rdi) ret I would have recommended struct_group() + memset() here, just because you don't need the casts. But since you're doing BUILD_BUG_ON() size verification, it's fine either way. If you want to avoid the cast and avoid the memset(), you could use the earlier suggestion of anonymous union and anonymous struct. It's basically an open-coded struct_group, but you get to pick the type of the overlapping variable. :) -Kees > Also adding Gustavo, maybe he'd like to leave a comment here. > > > > > Thanks! > > > > Paolo > > Thanks, > Al
diff --git a/include/net/gro.h b/include/net/gro.h index 8f75802d50fd..a068b27d341f 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -29,14 +29,17 @@ struct napi_gro_cb { /* Number of segments aggregated. */ u16 count; - /* Start offset for remote checksum offload */ - u16 gro_remcsum_start; + /* Used in ipv6_gro_receive() and foo-over-udp */ + u16 proto; /* jiffies when first packet was created/queued */ unsigned long age; - /* Used in ipv6_gro_receive() and foo-over-udp */ - u16 proto; + /* portion of the cb set to zero at every gro iteration */ + u32 zeroed_start[0]; + + /* Start offset for remote checksum offload */ + u16 gro_remcsum_start; /* This is non-zero if the packet may be of the same flow. */ u8 same_flow:1; @@ -70,6 +73,8 @@ struct napi_gro_cb { /* GRO is done by frag_list pointer chaining. */ u8 is_flist:1; + u32 zeroed_end[0]; + /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; diff --git a/net/core/gro.c b/net/core/gro.c index d43d42215bdb..b9ebe9298731 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -435,6 +435,9 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) napi_gro_complete(napi, oldest); } +#define zeroed_len (offsetof(struct napi_gro_cb, zeroed_end) - \ + offsetof(struct napi_gro_cb, zeroed_start)) + static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); @@ -459,29 +462,20 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff skb_set_network_header(skb, skb_gro_offset(skb)); skb_reset_mac_len(skb); - NAPI_GRO_CB(skb)->same_flow = 0; + BUILD_BUG_ON(zeroed_len != sizeof(NAPI_GRO_CB(skb)->zeroed_start[0])); + NAPI_GRO_CB(skb)->zeroed_start[0] = 0; NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->free = 0; - NAPI_GRO_CB(skb)->encap_mark = 0; - NAPI_GRO_CB(skb)->recursion_counter = 0; - NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->is_atomic = 1; - NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ switch (skb->ip_summed) { case CHECKSUM_COMPLETE: NAPI_GRO_CB(skb)->csum = skb->csum; NAPI_GRO_CB(skb)->csum_valid = 1; - NAPI_GRO_CB(skb)->csum_cnt = 0; break; case CHECKSUM_UNNECESSARY: NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; - NAPI_GRO_CB(skb)->csum_valid = 0; break; - default: - NAPI_GRO_CB(skb)->csum_cnt = 0; - NAPI_GRO_CB(skb)->csum_valid = 0; } pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
While inspecting some perf report, I noticed that the compiler emits suboptimal code for the napi CB initialization, fetching and storing multiple times the memory for flags bitfield. This is with gcc 10.3.1, but I observed the same with older compiler versions. We can help the compiler to do a nicer work e.g. initially setting all the bitfield to 0 using an u16 alias. The generated code is quite smaller, with the same number of conditional Before: objdump -t net/core/gro.o | grep " F .text" 0000000000000bb0 l F .text 0000000000000357 dev_gro_receive After: 0000000000000bb0 l F .text 000000000000033c dev_gro_receive Signed-off-by: Paolo Abeni <pabeni@redhat.com> --- include/net/gro.h | 13 +++++++++---- net/core/gro.c | 16 +++++----------- 2 files changed, 14 insertions(+), 15 deletions(-)