diff mbox series

[RFC,2/3] net: gro: minor optimization for dev_gro_receive()

Message ID 35d722e246b7c4afb6afb03760df6f664db4ef05.1642519257.git.pabeni@redhat.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series gro: some minor optimizations | expand

Checks

Context Check Description
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 20 this patch: 20
netdev/cc_maintainers warning 2 maintainers not CCed: davem@davemloft.net kuba@kernel.org
netdev/build_clang success Errors and warnings before: 22 this patch: 22
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 25 this patch: 25
netdev/checkpatch warning WARNING: line length of 86 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/tree_selection success Guessing tree name failed - patch did not apply, async

Commit Message

Paolo Abeni Jan. 18, 2022, 3:24 p.m. UTC
While inspecting some perf reports, I noticed that the compiler
emits suboptimal code for the napi CB initialization, fetching
and storing the memory for the flags bitfield multiple times.
This is with gcc 10.3.1, but I observed the same with older compiler
versions.

We can help the compiler do a nicer job, e.g. by initially setting
all the bitfields to 0 via a u32 alias. The generated code is a bit
smaller, with the same number of conditionals.

Before:
objdump -t net/core/gro.o | grep " F .text"
0000000000000bb0 l     F .text	0000000000000357 dev_gro_receive

After:
0000000000000bb0 l     F .text	000000000000033c dev_gro_receive

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/gro.h | 13 +++++++++----
 net/core/gro.c    | 16 +++++-----------
 2 files changed, 14 insertions(+), 15 deletions(-)
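
To make the pattern concrete: the snippet below is a small userspace
sketch of the idea, not the kernel code. struct demo_cb and its fields
are illustrative stand-ins for napi_gro_cb, and the constant-size
memset() plays the role of the patch's u32 zeroed_start[0] alias (both
typically compile down to a single 32-bit store at -O2).

	#include <stdint.h>
	#include <string.h>

	struct demo_cb {
		uint16_t gro_remcsum_start;
		uint8_t  same_flow:1;
		uint8_t  encap_mark:1;
		uint8_t  is_fou:1;
		uint8_t  csum_valid:1;
		uint8_t  is_atomic:1;
	};

	/* Field-by-field reset: depending on the compiler, each bit-field
	 * store can become a read-modify-write of the containing byte, and
	 * non-zero assignments mixed in (is_atomic here, flush in
	 * dev_gro_receive()) make it harder to merge everything into one
	 * store.
	 */
	void init_fields(struct demo_cb *cb)
	{
		cb->gro_remcsum_start = 0;
		cb->same_flow = 0;
		cb->encap_mark = 0;
		cb->is_fou = 0;
		cb->csum_valid = 0;
		cb->is_atomic = 1;
	}

	/* Aliased reset: clear the whole 4-byte region in one store, then
	 * set the few fields that need a non-zero value.
	 */
	void init_alias(struct demo_cb *cb)
	{
		memset(cb, 0, sizeof(uint32_t));
		cb->is_atomic = 1;
	}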

Comments

Alexander Lobakin Jan. 18, 2022, 3:56 p.m. UTC | #1
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 18 Jan 2022 16:24:19 +0100

> While inspecting some perf report, I noticed that the compiler
> emits suboptimal code for the napi CB initialization, fetching
> and storing multiple times the memory for flags bitfield.
> This is with gcc 10.3.1, but I observed the same with older compiler
> versions.
> 
> We can help the compiler to do a nicer work e.g. initially setting
> all the bitfield to 0 using an u16 alias. The generated code is quite
> smaller, with the same number of conditional
> 
> Before:
> objdump -t net/core/gro.o | grep " F .text"
> 0000000000000bb0 l     F .text	0000000000000357 dev_gro_receive
> 
> After:
> 0000000000000bb0 l     F .text	000000000000033c dev_gro_receive
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  include/net/gro.h | 13 +++++++++----
>  net/core/gro.c    | 16 +++++-----------
>  2 files changed, 14 insertions(+), 15 deletions(-)
> 
> diff --git a/include/net/gro.h b/include/net/gro.h
> index 8f75802d50fd..a068b27d341f 100644
> --- a/include/net/gro.h
> +++ b/include/net/gro.h
> @@ -29,14 +29,17 @@ struct napi_gro_cb {
>  	/* Number of segments aggregated. */
>  	u16	count;
>  
> -	/* Start offset for remote checksum offload */
> -	u16	gro_remcsum_start;
> +	/* Used in ipv6_gro_receive() and foo-over-udp */
> +	u16	proto;
>  
>  	/* jiffies when first packet was created/queued */
>  	unsigned long age;
>  
> -	/* Used in ipv6_gro_receive() and foo-over-udp */
> -	u16	proto;
> +	/* portion of the cb set to zero at every gro iteration */
> +	u32	zeroed_start[0];
> +
> +	/* Start offset for remote checksum offload */
> +	u16	gro_remcsum_start;
>  
>  	/* This is non-zero if the packet may be of the same flow. */
>  	u8	same_flow:1;
> @@ -70,6 +73,8 @@ struct napi_gro_cb {
>  	/* GRO is done by frag_list pointer chaining. */
>  	u8	is_flist:1;
>  
> +	u32	zeroed_end[0];

This should be wrapped in struct_group() I believe, or compilers
will start complaining soon. See [0] for the details.
Adding Kees to the CCs.

> +
>  	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
>  	__wsum	csum;
>  
> diff --git a/net/core/gro.c b/net/core/gro.c
> index d43d42215bdb..b9ebe9298731 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -435,6 +435,9 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
>  	napi_gro_complete(napi, oldest);
>  }
>  
> +#define zeroed_len	(offsetof(struct napi_gro_cb, zeroed_end) - \
> +			 offsetof(struct napi_gro_cb, zeroed_start))
> +
>  static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
>  {
>  	u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
> @@ -459,29 +462,20 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
>  
>  		skb_set_network_header(skb, skb_gro_offset(skb));
>  		skb_reset_mac_len(skb);
> -		NAPI_GRO_CB(skb)->same_flow = 0;
> +		BUILD_BUG_ON(zeroed_len != sizeof(NAPI_GRO_CB(skb)->zeroed_start[0]));
> +		NAPI_GRO_CB(skb)->zeroed_start[0] = 0;
>  		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
> -		NAPI_GRO_CB(skb)->free = 0;
> -		NAPI_GRO_CB(skb)->encap_mark = 0;
> -		NAPI_GRO_CB(skb)->recursion_counter = 0;
> -		NAPI_GRO_CB(skb)->is_fou = 0;
>  		NAPI_GRO_CB(skb)->is_atomic = 1;
> -		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
>  
>  		/* Setup for GRO checksum validation */
>  		switch (skb->ip_summed) {
>  		case CHECKSUM_COMPLETE:
>  			NAPI_GRO_CB(skb)->csum = skb->csum;
>  			NAPI_GRO_CB(skb)->csum_valid = 1;
> -			NAPI_GRO_CB(skb)->csum_cnt = 0;
>  			break;
>  		case CHECKSUM_UNNECESSARY:
>  			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
> -			NAPI_GRO_CB(skb)->csum_valid = 0;
>  			break;
> -		default:
> -			NAPI_GRO_CB(skb)->csum_cnt = 0;
> -			NAPI_GRO_CB(skb)->csum_valid = 0;
>  		}
>  
>  		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
> -- 
> 2.34.1

[0] https://lore.kernel.org/linux-hardening/20220112220652.3952944-1-keescook@chromium.org

Thanks,
Al
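
For reference, struct_group() from include/linux/stddef.h mirrors a run
of members with a named union member so that a bounded memset()/memcpy()
over the whole run passes FORTIFY checks. Below is a rough userspace
sketch of the shape being suggested; the macro is a simplified copy of
the kernel one and the fields are placeholders, not the real
napi_gro_cb layout.

	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>

	/* simplified version of the kernel's struct_group() */
	#define struct_group(NAME, MEMBERS...)		\
		union {					\
			struct { MEMBERS };		\
			struct { MEMBERS } NAME;	\
		}

	struct demo_cb {
		uint16_t count;

		/* everything the GRO hot path wants to clear at once */
		struct_group(zeroed,
			uint16_t gro_remcsum_start;
			uint8_t  same_flow:1;
			uint8_t  encap_mark:1;
			uint8_t  is_flist:1;
		);

		uint32_t csum;
	};

	int main(void)
	{
		struct demo_cb cb;

		cb.count = 1;
		cb.same_flow = 1;

		/* bounded reset over the named group, no magic offsets */
		memset(&cb.zeroed, 0, sizeof(cb.zeroed));

		printf("count=%u same_flow=%u\n", cb.count, cb.same_flow);
		return 0;
	}
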
Paolo Abeni Jan. 18, 2022, 4:31 p.m. UTC | #2
On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote:
> From: Paolo Abeni <pabeni@redhat.com>
> Date: Tue, 18 Jan 2022 16:24:19 +0100
> 
> > While inspecting some perf report, I noticed that the compiler
> > emits suboptimal code for the napi CB initialization, fetching
> > and storing multiple times the memory for flags bitfield.
> > This is with gcc 10.3.1, but I observed the same with older compiler
> > versions.
> > 
> > We can help the compiler to do a nicer work e.g. initially setting
> > all the bitfield to 0 using an u16 alias. The generated code is quite
> > smaller, with the same number of conditional
> > 
> > Before:
> > objdump -t net/core/gro.o | grep " F .text"
> > 0000000000000bb0 l     F .text	0000000000000357 dev_gro_receive
> > 
> > After:
> > 0000000000000bb0 l     F .text	000000000000033c dev_gro_receive
> > 
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> >  include/net/gro.h | 13 +++++++++----
> >  net/core/gro.c    | 16 +++++-----------
> >  2 files changed, 14 insertions(+), 15 deletions(-)
> > 
> > diff --git a/include/net/gro.h b/include/net/gro.h
> > index 8f75802d50fd..a068b27d341f 100644
> > --- a/include/net/gro.h
> > +++ b/include/net/gro.h
> > @@ -29,14 +29,17 @@ struct napi_gro_cb {
> >  	/* Number of segments aggregated. */
> >  	u16	count;
> >  
> > -	/* Start offset for remote checksum offload */
> > -	u16	gro_remcsum_start;
> > +	/* Used in ipv6_gro_receive() and foo-over-udp */
> > +	u16	proto;
> >  
> >  	/* jiffies when first packet was created/queued */
> >  	unsigned long age;
> >  
> > -	/* Used in ipv6_gro_receive() and foo-over-udp */
> > -	u16	proto;
> > +	/* portion of the cb set to zero at every gro iteration */
> > +	u32	zeroed_start[0];
> > +
> > +	/* Start offset for remote checksum offload */
> > +	u16	gro_remcsum_start;
> >  
> >  	/* This is non-zero if the packet may be of the same flow. */
> >  	u8	same_flow:1;
> > @@ -70,6 +73,8 @@ struct napi_gro_cb {
> >  	/* GRO is done by frag_list pointer chaining. */
> >  	u8	is_flist:1;
> >  
> > +	u32	zeroed_end[0];
> 
> This should be wrapped in struct_group() I believe, or compilers
> will start complaining soon. See [0] for the details.
> Adding Kees to the CCs.

Thank you for the reference. That had really slipped my mind.

This patch does not use memcpy() or similar, just a single direct
assignment. Would that still require struct_group()?

Thanks!

Paolo
Alexander Lobakin Jan. 18, 2022, 5:39 p.m. UTC | #3
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 18 Jan 2022 17:31:00 +0100

> On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote:
> > From: Paolo Abeni <pabeni@redhat.com>
> > Date: Tue, 18 Jan 2022 16:24:19 +0100
> > 
> > > While inspecting some perf report, I noticed that the compiler
> > > emits suboptimal code for the napi CB initialization, fetching
> > > and storing multiple times the memory for flags bitfield.
> > > This is with gcc 10.3.1, but I observed the same with older compiler
> > > versions.
> > > 
> > > We can help the compiler to do a nicer work e.g. initially setting
> > > all the bitfield to 0 using an u16 alias. The generated code is quite
> > > smaller, with the same number of conditional
> > > 
> > > Before:
> > > objdump -t net/core/gro.o | grep " F .text"
> > > 0000000000000bb0 l     F .text	0000000000000357 dev_gro_receive
> > > 
> > > After:
> > > 0000000000000bb0 l     F .text	000000000000033c dev_gro_receive
> > > 
> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > ---
> > >  include/net/gro.h | 13 +++++++++----
> > >  net/core/gro.c    | 16 +++++-----------
> > >  2 files changed, 14 insertions(+), 15 deletions(-)
> > > 
> > > diff --git a/include/net/gro.h b/include/net/gro.h
> > > index 8f75802d50fd..a068b27d341f 100644
> > > --- a/include/net/gro.h
> > > +++ b/include/net/gro.h
> > > @@ -29,14 +29,17 @@ struct napi_gro_cb {
> > >  	/* Number of segments aggregated. */
> > >  	u16	count;
> > >  
> > > -	/* Start offset for remote checksum offload */
> > > -	u16	gro_remcsum_start;
> > > +	/* Used in ipv6_gro_receive() and foo-over-udp */
> > > +	u16	proto;
> > >  
> > >  	/* jiffies when first packet was created/queued */
> > >  	unsigned long age;
> > >  
> > > -	/* Used in ipv6_gro_receive() and foo-over-udp */
> > > -	u16	proto;
> > > +	/* portion of the cb set to zero at every gro iteration */
> > > +	u32	zeroed_start[0];
> > > +
> > > +	/* Start offset for remote checksum offload */
> > > +	u16	gro_remcsum_start;
> > >  
> > >  	/* This is non-zero if the packet may be of the same flow. */
> > >  	u8	same_flow:1;
> > > @@ -70,6 +73,8 @@ struct napi_gro_cb {
> > >  	/* GRO is done by frag_list pointer chaining. */
> > >  	u8	is_flist:1;
> > >  
> > > +	u32	zeroed_end[0];
> > 
> > This should be wrapped in struct_group() I believe, or compilers
> > will start complaining soon. See [0] for the details.
> > Adding Kees to the CCs.
> 
> Thank you for the reference. That really slipped-off my mind.
> 
> This patch does not use memcpy() or similar, just a single direct
> assignement. Would that still require struct_group()?

Oof, sorry, I saw start/end and overlooked that it's only for
a single assignment.
Then it shouldn't cause warnings, but maybe use an anonymous
union instead?

	union {
		u32 zeroed;
		struct {
			u16 gro_remcsum_start;
			...
		};
	};
	__wsum csum;

You can still use a BUILD_BUG_ON() in this case, like
sizeof(zeroed) != offsetof(csum) - offsetof(zeroed).

> 
> Thanks!
> 
> Paolo

Thanks,
Al
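
Spelled out as a self-contained userspace toy, the suggested
anonymous-union layout plus the size check could look roughly like the
following. The field names are placeholders and the real napi_gro_cb
has more fields between gro_remcsum_start and is_flist; _Static_assert
stands in for the kernel's BUILD_BUG_ON().

	#include <stdint.h>
	#include <stddef.h>
	#include <stdio.h>

	struct demo_cb {
		uint16_t count;

		union {
			uint32_t zeroed;	/* overlays the fields below */
			struct {
				uint16_t gro_remcsum_start;
				uint8_t  same_flow:1;
				uint8_t  encap_mark:1;
				uint8_t  is_flist:1;
			};
		};

		uint32_t csum;		/* must stay right after the union */
	};

	/* userspace stand-in for the suggested BUILD_BUG_ON() */
	_Static_assert(sizeof(((struct demo_cb *)0)->zeroed) ==
		       offsetof(struct demo_cb, csum) -
		       offsetof(struct demo_cb, zeroed),
		       "zeroed must cover everything up to csum");

	int main(void)
	{
		struct demo_cb cb;

		cb.same_flow = 1;
		cb.zeroed = 0;		/* one 32-bit store clears the group */
		printf("same_flow=%u\n", cb.same_flow);
		return 0;
	}
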
Paolo Abeni Feb. 2, 2022, 10:09 a.m. UTC | #4
Hello,

On Tue, 2022-01-18 at 18:39 +0100, Alexander Lobakin wrote:
> From: Paolo Abeni <pabeni@redhat.com>
> Date: Tue, 18 Jan 2022 17:31:00 +0100
> 
> > On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote:
> > > From: Paolo Abeni <pabeni@redhat.com>
> > > Date: Tue, 18 Jan 2022 16:24:19 +0100
> > > 
> > > > While inspecting some perf report, I noticed that the compiler
> > > > emits suboptimal code for the napi CB initialization, fetching
> > > > and storing multiple times the memory for flags bitfield.
> > > > This is with gcc 10.3.1, but I observed the same with older compiler
> > > > versions.
> > > > 
> > > > We can help the compiler to do a nicer work e.g. initially setting
> > > > all the bitfield to 0 using an u16 alias. The generated code is quite
> > > > smaller, with the same number of conditional
> > > > 
> > > > Before:
> > > > objdump -t net/core/gro.o | grep " F .text"
> > > > 0000000000000bb0 l     F .text	0000000000000357 dev_gro_receive
> > > > 
> > > > After:
> > > > 0000000000000bb0 l     F .text	000000000000033c dev_gro_receive
> > > > 
> > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > > ---
> > > >  include/net/gro.h | 13 +++++++++----
> > > >  net/core/gro.c    | 16 +++++-----------
> > > >  2 files changed, 14 insertions(+), 15 deletions(-)
> > > > 
> > > > diff --git a/include/net/gro.h b/include/net/gro.h
> > > > index 8f75802d50fd..a068b27d341f 100644
> > > > --- a/include/net/gro.h
> > > > +++ b/include/net/gro.h
> > > > @@ -29,14 +29,17 @@ struct napi_gro_cb {
> > > >  	/* Number of segments aggregated. */
> > > >  	u16	count;
> > > >  
> > > > -	/* Start offset for remote checksum offload */
> > > > -	u16	gro_remcsum_start;
> > > > +	/* Used in ipv6_gro_receive() and foo-over-udp */
> > > > +	u16	proto;
> > > >  
> > > >  	/* jiffies when first packet was created/queued */
> > > >  	unsigned long age;
> > > >  
> > > > -	/* Used in ipv6_gro_receive() and foo-over-udp */
> > > > -	u16	proto;
> > > > +	/* portion of the cb set to zero at every gro iteration */
> > > > +	u32	zeroed_start[0];
> > > > +
> > > > +	/* Start offset for remote checksum offload */
> > > > +	u16	gro_remcsum_start;
> > > >  
> > > >  	/* This is non-zero if the packet may be of the same flow. */
> > > >  	u8	same_flow:1;
> > > > @@ -70,6 +73,8 @@ struct napi_gro_cb {
> > > >  	/* GRO is done by frag_list pointer chaining. */
> > > >  	u8	is_flist:1;
> > > >  
> > > > +	u32	zeroed_end[0];
> > > 
> > > This should be wrapped in struct_group() I believe, or compilers
> > > will start complaining soon. See [0] for the details.
> > > Adding Kees to the CCs.
> > 
> > Thank you for the reference. That really slipped-off my mind.
> > 
> > This patch does not use memcpy() or similar, just a single direct
> > assignement. Would that still require struct_group()?
> 
> Oof, sorry, I saw start/end and overlooked that it's only for
> a single assignment.
> Then it shouldn't cause warnings, but maybe use an anonymous
> union instead?
> 
> 	union {
> 		u32 zeroed;
> 		struct {
> 			u16 gro_remcsum_start;
> 			...
> 		};
> 	};
> 	__wsum csum;
> 
> Use can still use a BUILD_BUG_ON() in this case, like
> sizeof(zeroed) != offsetof(csum) - offsetof(zeroed).

Please forgive me for the very long delay. I'm looking at this
stuff again for a formal non-RFC submission.

I like the anonymous union less, because it will move much more code
around - making the patch less readable - and will be more fragile, e.g.
some comment like "please don't move 'csum' around" would be needed.

No strong opinion anyway, so if you really prefer it that way I can adapt.
Please let me know.

Thanks!

Paolo
Alexander Lobakin Feb. 2, 2022, 12:08 p.m. UTC | #5
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 02 Feb 2022 11:09:41 +0100

> Hello,

Hi!

> 
> On Tue, 2022-01-18 at 18:39 +0100, Alexander Lobakin wrote:
> > From: Paolo Abeni <pabeni@redhat.com>
> > Date: Tue, 18 Jan 2022 17:31:00 +0100
> > 
> > > On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote:
> > > > From: Paolo Abeni <pabeni@redhat.com>
> > > > Date: Tue, 18 Jan 2022 16:24:19 +0100
> > > > 
> > > > > While inspecting some perf report, I noticed that the compiler
> > > > > emits suboptimal code for the napi CB initialization, fetching
> > > > > and storing multiple times the memory for flags bitfield.
> > > > > This is with gcc 10.3.1, but I observed the same with older compiler
> > > > > versions.
> > > > > 
> > > > > We can help the compiler to do a nicer work e.g. initially setting
> > > > > all the bitfield to 0 using an u16 alias. The generated code is quite
> > > > > smaller, with the same number of conditional
> > > > > 
> > > > > Before:
> > > > > objdump -t net/core/gro.o | grep " F .text"
> > > > > 0000000000000bb0 l     F .text	0000000000000357 dev_gro_receive
> > > > > 
> > > > > After:
> > > > > 0000000000000bb0 l     F .text	000000000000033c dev_gro_receive
> > > > > 
> > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > > > ---
> > > > >  include/net/gro.h | 13 +++++++++----
> > > > >  net/core/gro.c    | 16 +++++-----------
> > > > >  2 files changed, 14 insertions(+), 15 deletions(-)
> > > > > 
> > > > > diff --git a/include/net/gro.h b/include/net/gro.h
> > > > > index 8f75802d50fd..a068b27d341f 100644
> > > > > --- a/include/net/gro.h
> > > > > +++ b/include/net/gro.h
> > > > > @@ -29,14 +29,17 @@ struct napi_gro_cb {
> > > > >  	/* Number of segments aggregated. */
> > > > >  	u16	count;
> > > > >  
> > > > > -	/* Start offset for remote checksum offload */
> > > > > -	u16	gro_remcsum_start;
> > > > > +	/* Used in ipv6_gro_receive() and foo-over-udp */
> > > > > +	u16	proto;
> > > > >  
> > > > >  	/* jiffies when first packet was created/queued */
> > > > >  	unsigned long age;
> > > > >  
> > > > > -	/* Used in ipv6_gro_receive() and foo-over-udp */
> > > > > -	u16	proto;
> > > > > +	/* portion of the cb set to zero at every gro iteration */
> > > > > +	u32	zeroed_start[0];
> > > > > +
> > > > > +	/* Start offset for remote checksum offload */
> > > > > +	u16	gro_remcsum_start;
> > > > >  
> > > > >  	/* This is non-zero if the packet may be of the same flow. */
> > > > >  	u8	same_flow:1;
> > > > > @@ -70,6 +73,8 @@ struct napi_gro_cb {
> > > > >  	/* GRO is done by frag_list pointer chaining. */
> > > > >  	u8	is_flist:1;
> > > > >  
> > > > > +	u32	zeroed_end[0];
> > > > 
> > > > This should be wrapped in struct_group() I believe, or compilers
> > > > will start complaining soon. See [0] for the details.
> > > > Adding Kees to the CCs.
> > > 
> > > Thank you for the reference. That really slipped-off my mind.
> > > 
> > > This patch does not use memcpy() or similar, just a single direct
> > > assignement. Would that still require struct_group()?
> > 
> > Oof, sorry, I saw start/end and overlooked that it's only for
> > a single assignment.
> > Then it shouldn't cause warnings, but maybe use an anonymous
> > union instead?
> > 
> > 	union {
> > 		u32 zeroed;
> > 		struct {
> > 			u16 gro_remcsum_start;
> > 			...
> > 		};
> > 	};
> > 	__wsum csum;
> > 
> > Use can still use a BUILD_BUG_ON() in this case, like
> > sizeof(zeroed) != offsetof(csum) - offsetof(zeroed).
> 
> Please forgive me for the very long delay. I'm looking again at this
> stuff for formal non-rfc submission.

Sure, not a problem at all (:

> 
> I like the anonymous union less, because it will move around much more
> code - making the patch less readable - and will be more fragile e.g.
> some comment alike "please don't move around 'csum'" would be needed.

We still need comments around zeroed_{start,end}[0] for now.
I used offsetof(csum) because offsetofend(is_flist), which I'd prefer
here, unfortunately expands to offsetof(is_flist) + sizeof(is_flist),
and the latter is an error since sizeof() can't be applied to a bitfield.
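
To spell that out, a minimal sketch, assuming the RFC's zeroed_start[0]
marker: the kernel's offsetofend() roughly expands to offsetof() plus
the member's size, and sizeof() is not allowed on a bit-field, so the
end of the range has to be taken from the next full-width member
instead (csum here, or the zeroed_end[0] marker in the RFC).

	/* roughly what offsetofend() expands to: */
	#define offsetofend(TYPE, MEMBER) \
		(offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))

	/* not valid, sizeof() of the is_flist:1 bit-field:
	 *	offsetofend(struct napi_gro_cb, is_flist)
	 * so express the length via the next non-bitfield member:
	 */
	#define zeroed_len	(offsetof(struct napi_gro_cb, csum) - \
				 offsetof(struct napi_gro_cb, zeroed_start))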

> 
> No strong opinion anyway, so if you really prefer that way I can adapt.
> Please let me know.

I don't really have a strong preference here, I just suspect that the
zero-length arrays already produce, or will soon produce, -Warray-bounds
warnings, and that empty-struct constructs like

	struct { } zeroed_start;
	u16 gro_remcsum_start;
	...
	struct { } zeroed_end;

	memset(&NAPI_GRO_CB(skb)->zeroed_start, 0,
	       offsetofend(zeroed_end) - offsetof(zeroed_start));

will trigger Fortify compile-time errors from Kees' KSPP tree.

I think we could use

	__struct_group(/* no tag */, zeroed, /* no attrs */,
		u16 gro_remcsum_start;
		...
		u8 is_flist:1;
	);
	__wsum csum;

	BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
	BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
				 sizeof(u32))); /* Avoid slow unaligned acc */

	*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;

This doesn't depend on `csum`, doesn't need `struct { }` or
zero-length array markers, and still uses a direct assignment.

Also adding Gustavo, maybe he'd like to leave a comment here.

> 
> Thanks!
> 
> Paolo

Thanks,
Al
Kees Cook Feb. 7, 2022, 2:53 a.m. UTC | #6
On Wed, Feb 02, 2022 at 01:08:27PM +0100, Alexander Lobakin wrote:
> From: Paolo Abeni <pabeni@redhat.com>
> Date: Wed, 02 Feb 2022 11:09:41 +0100
> 
> > Hello,
> 
> Hi!
> 
> > 
> > On Tue, 2022-01-18 at 18:39 +0100, Alexander Lobakin wrote:
> > > From: Paolo Abeni <pabeni@redhat.com>
> > > Date: Tue, 18 Jan 2022 17:31:00 +0100
> > > 
> > > > On Tue, 2022-01-18 at 16:56 +0100, Alexander Lobakin wrote:
> > > > > From: Paolo Abeni <pabeni@redhat.com>
> > > > > Date: Tue, 18 Jan 2022 16:24:19 +0100
> > > > > 
> > > > > > While inspecting some perf report, I noticed that the compiler
> > > > > > emits suboptimal code for the napi CB initialization, fetching
> > > > > > and storing multiple times the memory for flags bitfield.
> > > > > > This is with gcc 10.3.1, but I observed the same with older compiler
> > > > > > versions.
> > > > > > 
> > > > > > We can help the compiler to do a nicer work e.g. initially setting
> > > > > > all the bitfield to 0 using an u16 alias. The generated code is quite
> > > > > > smaller, with the same number of conditional
> > > > > > 
> > > > > > Before:
> > > > > > objdump -t net/core/gro.o | grep " F .text"
> > > > > > 0000000000000bb0 l     F .text	0000000000000357 dev_gro_receive
> > > > > > 
> > > > > > After:
> > > > > > 0000000000000bb0 l     F .text	000000000000033c dev_gro_receive
> > > > > > 
> > > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > > > > ---
> > > > > >  include/net/gro.h | 13 +++++++++----
> > > > > >  net/core/gro.c    | 16 +++++-----------
> > > > > >  2 files changed, 14 insertions(+), 15 deletions(-)
> > > > > > 
> > > > > > diff --git a/include/net/gro.h b/include/net/gro.h
> > > > > > index 8f75802d50fd..a068b27d341f 100644
> > > > > > --- a/include/net/gro.h
> > > > > > +++ b/include/net/gro.h
> > > > > > @@ -29,14 +29,17 @@ struct napi_gro_cb {
> > > > > >  	/* Number of segments aggregated. */
> > > > > >  	u16	count;
> > > > > >  
> > > > > > -	/* Start offset for remote checksum offload */
> > > > > > -	u16	gro_remcsum_start;
> > > > > > +	/* Used in ipv6_gro_receive() and foo-over-udp */
> > > > > > +	u16	proto;
> > > > > >  
> > > > > >  	/* jiffies when first packet was created/queued */
> > > > > >  	unsigned long age;
> > > > > >  
> > > > > > -	/* Used in ipv6_gro_receive() and foo-over-udp */
> > > > > > -	u16	proto;
> > > > > > +	/* portion of the cb set to zero at every gro iteration */
> > > > > > +	u32	zeroed_start[0];
> > > > > > +
> > > > > > +	/* Start offset for remote checksum offload */
> > > > > > +	u16	gro_remcsum_start;
> > > > > >  
> > > > > >  	/* This is non-zero if the packet may be of the same flow. */
> > > > > >  	u8	same_flow:1;
> > > > > > @@ -70,6 +73,8 @@ struct napi_gro_cb {
> > > > > >  	/* GRO is done by frag_list pointer chaining. */
> > > > > >  	u8	is_flist:1;
> > > > > >  
> > > > > > +	u32	zeroed_end[0];
> > > > > 
> > > > > This should be wrapped in struct_group() I believe, or compilers
> > > > > will start complaining soon. See [0] for the details.
> > > > > Adding Kees to the CCs.

Hi! Sorry I missed the original email sent my way. :)

> > > > 
> > > > Thank you for the reference. That really slipped-off my mind.
> > > > 
> > > > This patch does not use memcpy() or similar, just a single direct
> > > > assignement. Would that still require struct_group()?
> > > 
> > > Oof, sorry, I saw start/end and overlooked that it's only for
> > > a single assignment.
> > > Then it shouldn't cause warnings, but maybe use an anonymous
> > > union instead?
> > > 
> > > 	union {
> > > 		u32 zeroed;
> > > 		struct {
> > > 			u16 gro_remcsum_start;
> > > 			...
> > > 		};
> > > 	};
> > > 	__wsum csum;
> > > 
> > > Use can still use a BUILD_BUG_ON() in this case, like
> > > sizeof(zeroed) != offsetof(csum) - offsetof(zeroed).
> > 
> > Please forgive me for the very long delay. I'm looking again at this
> > stuff for formal non-rfc submission.
> 
> Sure, not a problem at all (:
> 
> > 
> > I like the anonymous union less, because it will move around much more
> > code - making the patch less readable - and will be more fragile e.g.
> > some comment alike "please don't move around 'csum'" would be needed.
> 
> We still need comments around zeroed_{start,end}[0] for now.
> I used offsetof(csum) as offsetofend(is_flist) which I'd prefer here
> unfortunately expands to offsetof(is_flist) + sizeof(is_flist), and
> the latter causes an error of using sizeof() against a bitfield.
> 
> > 
> > No strong opinion anyway, so if you really prefer that way I can adapt.
> > Please let me know.
> 
> I don't really have a strong preference here, I just suspect that
> zero-length array will produce or already produce -Warray-bounds
> warnings, and empty-struct constructs like

Yes, -Warray-bounds would yell very loudly about a 0-element array
having its first element assigned. :)

> 
> 	struct { } zeroed_start;
> 	u16 gro_remcsum_start;
> 	...
> 	struct { } zeroed_end;
> 
> 	memset(NAPI_GRO_CB(skb)->zeroed_start, 0,
> 	       offsetofend(zeroed_end) - offsetsof(zeroed_start));
> 
> will trigger Fortify compile-time errors from Kees' KSPP tree.
> 
> I think we could use
> 
> 	__struct_group(/* no tag */, zeroed, /* no attrs */,

Since this isn't UAPI, you can just do:

	struct_group(zeroed,

> 		u16 gro_remcsum_start;
> 		...
> 		u8 is_flist:1;
> 	);
> 	__wsum csum;
> 
> 	BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
> 	BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
> 				 sizeof(u32))); /* Avoid slow unaligned acc */
> 
> 	*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
> 
> This doesn't depend on `csum`, doesn't need `struct { }` or
> `struct zero[0]` markers and still uses a direct assignment.

Given the kernel's -O2 use, a constant-sized u32 memset() and a direct
assignment will resolve to the same machine code:

https://godbolt.org/z/b9qKexx6q

void zero_var(struct object *p)
{
    p->zero_var = 0;
}

void zero_group(struct object *p)
{
    memset(&p->zero_group, 0, sizeof(p->zero_group));
}

zero_var:
  movl $0, (%rdi)
  ret
zero_group:
  movl $0, (%rdi)
  ret

And assigning them individually results in careful "and"ing to avoid the
padding:

zero_each:
  andl $-134217728, (%rdi)
  ret

I would have recommended struct_group() + memset() here, just because
you don't need the casts. But since you're doing BUILD_BUG_ON() size
verification, it's fine either way.
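
As a purely hypothetical shape of that variant, assuming the cleared
fields were wrapped in struct_group(zeroed, ...) as discussed above,
the call site could look like:

	BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
	memset(&NAPI_GRO_CB(skb)->zeroed, 0,
	       sizeof_field(struct napi_gro_cb, zeroed));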

If you want to avoid the cast and avoid the memset(), you could use the
earlier suggestion of anonymous union and anonymous struct. It's
basically an open-coded struct_group, but you get to pick the type of
the overlapping variable. :)

-Kees

> Also adding Gustavo, maybe he'd like to leave a comment here.
> 
> > 
> > Thanks!
> > 
> > Paolo
> 
> Thanks,
> Al
diff mbox series

Patch

diff --git a/include/net/gro.h b/include/net/gro.h
index 8f75802d50fd..a068b27d341f 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -29,14 +29,17 @@  struct napi_gro_cb {
 	/* Number of segments aggregated. */
 	u16	count;
 
-	/* Start offset for remote checksum offload */
-	u16	gro_remcsum_start;
+	/* Used in ipv6_gro_receive() and foo-over-udp */
+	u16	proto;
 
 	/* jiffies when first packet was created/queued */
 	unsigned long age;
 
-	/* Used in ipv6_gro_receive() and foo-over-udp */
-	u16	proto;
+	/* portion of the cb set to zero at every gro iteration */
+	u32	zeroed_start[0];
+
+	/* Start offset for remote checksum offload */
+	u16	gro_remcsum_start;
 
 	/* This is non-zero if the packet may be of the same flow. */
 	u8	same_flow:1;
@@ -70,6 +73,8 @@  struct napi_gro_cb {
 	/* GRO is done by frag_list pointer chaining. */
 	u8	is_flist:1;
 
+	u32	zeroed_end[0];
+
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
 
diff --git a/net/core/gro.c b/net/core/gro.c
index d43d42215bdb..b9ebe9298731 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -435,6 +435,9 @@  static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
 	napi_gro_complete(napi, oldest);
 }
 
+#define zeroed_len	(offsetof(struct napi_gro_cb, zeroed_end) - \
+			 offsetof(struct napi_gro_cb, zeroed_start))
+
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
@@ -459,29 +462,20 @@  static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 
 		skb_set_network_header(skb, skb_gro_offset(skb));
 		skb_reset_mac_len(skb);
-		NAPI_GRO_CB(skb)->same_flow = 0;
+		BUILD_BUG_ON(zeroed_len != sizeof(NAPI_GRO_CB(skb)->zeroed_start[0]));
+		NAPI_GRO_CB(skb)->zeroed_start[0] = 0;
 		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
-		NAPI_GRO_CB(skb)->free = 0;
-		NAPI_GRO_CB(skb)->encap_mark = 0;
-		NAPI_GRO_CB(skb)->recursion_counter = 0;
-		NAPI_GRO_CB(skb)->is_fou = 0;
 		NAPI_GRO_CB(skb)->is_atomic = 1;
-		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 
 		/* Setup for GRO checksum validation */
 		switch (skb->ip_summed) {
 		case CHECKSUM_COMPLETE:
 			NAPI_GRO_CB(skb)->csum = skb->csum;
 			NAPI_GRO_CB(skb)->csum_valid = 1;
-			NAPI_GRO_CB(skb)->csum_cnt = 0;
 			break;
 		case CHECKSUM_UNNECESSARY:
 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
-			NAPI_GRO_CB(skb)->csum_valid = 0;
 			break;
-		default:
-			NAPI_GRO_CB(skb)->csum_cnt = 0;
-			NAPI_GRO_CB(skb)->csum_valid = 0;
 		}
 
 		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,