diff mbox series

MIPS: crypto: Clean up useless assignment operations

Message ID 1D248893502B75F5+20240628084117.84264-1-wangyuli@uniontech.com (mailing list archive)
State Changes Requested
Delegated to: Herbert Xu
Headers show
Series MIPS: crypto: Clean up useless assignment operations | expand

Commit Message

WangYuli June 28, 2024, 8:41 a.m. UTC
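On 64-bit CPUs, the "len & sizeof(u32)" branch is only reached after the
preceding "while (len >= sizeof(u64))" loop has finished, so len must be
less than 8 when the branch is entered.
After the branch has consumed one 32-bit word, fewer than 4 bytes remain,
and the following checks only test the sizeof(u16) and sizeof(u8) bits of len.
Therefore "len -= sizeof(u32)" is not necessary for 64-bit CPUs.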

A similar issue has already been fixed for LoongArch.

Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.10-rc5&id=fea1c949f6ca5059e12de00d0483645debc5b206
Signed-off-by: Guan Wentao <guanwentao@uniontech.com>
Signed-off-by: WangYuli <wangyuli@uniontech.com>
---
 arch/mips/crypto/crc32-mips.c | 4 ++++
 1 file changed, 4 insertions(+)

Comments

Herbert Xu June 28, 2024, 10:12 a.m. UTC | #1
On Fri, Jun 28, 2024 at 04:41:17PM +0800, WangYuli wrote:
> When entering the "len & sizeof(u32)" branch, len must be less than 8.
> So after one operation, len must be less than 4.
> At this time, "len -= sizeof(u32)" is not necessary for 64-bit CPUs.
> 
> A similar issue has been solved at Loongarch.
> 
> Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.10-rc5&id=fea1c949f6ca5059e12de00d0483645debc5b206
> Signed-off-by: Guan Wentao <guanwentao@uniontech.com>
> Signed-off-by: WangYuli <wangyuli@uniontech.com>
> ---
>  arch/mips/crypto/crc32-mips.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c
> index ec6d58008f8e..505d2d897849 100644
> --- a/arch/mips/crypto/crc32-mips.c
> +++ b/arch/mips/crypto/crc32-mips.c
> @@ -94,7 +94,9 @@ static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len)
>  
>  		CRC32(crc, value, w);
>  		p += sizeof(u32);
> +#ifndef CONFIG_64BIT
>  		len -= sizeof(u32);
> +#endif

First of all, did you verify that this actually makes a difference?
Please post the actual assembly output with and without this patch.

If it does make a difference, you should avoid doing ifdefs as they
are more likely to cause build failures.  Instead do something like

		if (!IS_ENABLED(CONFIG_64BIT))
			len -= sizeof(u32);

Cheers,
WangYuli June 28, 2024, 3:51 p.m. UTC | #2
On 2024/6/28 18:12, Herbert Xu wrote:
 > On Fri, Jun 28, 2024 at 04:41:17PM +0800, WangYuli wrote:
 >> When entering the "len & sizeof(u32)" branch, len must be less than 8.
 >> So after one operation, len must be less than 4.
 >> At this time, "len -= sizeof(u32)" is not necessary for 64-bit CPUs.
 >>
 >> A similar issue has been solved at Loongarch.
 >>
 >> Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.10-rc5&id=fea1c949f6ca5059e12de00d0483645debc5b206
 >> Signed-off-by: Guan Wentao <guanwentao@uniontech.com>
 >> Signed-off-by: WangYuli <wangyuli@uniontech.com>
 >> ---
 >>  arch/mips/crypto/crc32-mips.c | 4 ++++
 >>  1 file changed, 4 insertions(+)
 >>
 >> diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c
 >> index ec6d58008f8e..505d2d897849 100644
 >> --- a/arch/mips/crypto/crc32-mips.c
 >> +++ b/arch/mips/crypto/crc32-mips.c
 >> @@ -94,7 +94,9 @@ static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len)
 >>
 >>          CRC32(crc, value, w);
 >>          p += sizeof(u32);
 >> +#ifndef CONFIG_64BIT
 >>          len -= sizeof(u32);
 >> +#endif
 >
 > First of all, did you verify that this actually makes a difference?
 > Please post the actual assembly output with and without this patch.

Sure.

The left column shows the assembly after applying this patch, while the
right column shows the original code. (Generated by Clang 17.0.6.)

0000000000000018 <chksum_update>: 0000000000000018 <chksum_update>:
;     ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); ;     
ctx->crc = crc32_mips_le_hw(ctx->crc, data, length);
       18: 08 00 82 8c      lw    $2, 0x8($4) 18: 08 00 82 8c      lw    
$2, 0x8($4)
;     while (len >= sizeof(u64)) { ;     while (len >= sizeof(u64)) {
       1c: 08 00 c1 2c      sltiu    $1, $6, 0x8 <chksum_init+0x8> 1c: 
08 00 c1 2c      sltiu    $1, $6, 0x8 <chksum_init+0x8>
       20: 06 00 20 f8      bnezc    $1, 0x3c <chksum_update+0x24> 20: 
06 00 20 f8      bnezc    $1, 0x3c <chksum_update+0x24>
;     return le64_to_cpu(__get_unaligned_t(__le64, p)); ;     return 
le64_to_cpu(__get_unaligned_t(__le64, p));
       24: 00 00 a3 dc      ld    $3, 0x0($5) 24: 00 00 a3 dc      ld    
$3, 0x0($5)
;         CRC32(crc, value, d); ;         CRC32(crc, value, d);
       28: cf 00 62 7c      <unknown> crc32d v0,v1,v0 28: cf 00 62 
7c      <unknown> crc32d    v0,v1,v0
;         len -= sizeof(u64); ;         len -= sizeof(u64);
       2c: f8 ff c6 24      addiu    $6, $6, -0x8 
<chksumc_digest+0xfffffffffffffce0> 2c: f8 ff c6 24      addiu    $6, 
$6, -0x8 <chksumc_digest+0xfffffffffffffd48>
;     while (len >= sizeof(u64)) { ;     while (len >= sizeof(u64)) {
       30: 08 00 c1 2c      sltiu    $1, $6, 0x8 <chksum_init+0x8> 30: 
08 00 c1 2c      sltiu    $1, $6, 0x8 <chksum_init+0x8>
       34: fb ff 20 10      beqz    $1, 0x24 <chksum_update+0xc> 34: fb 
ff 20 10      beqz    $1, 0x24 <chksum_update+0xc>
       38: 08 00 a5 64      daddiu    $5, $5, 0x8 <chksum_init+0x8> 38: 
08 00 a5 64      daddiu    $5, $5, 0x8 <chksum_init+0x8>
;     if (len & sizeof(u32)) { ;     if (len & sizeof(u32)) {
       3c: 04 00 c1 2c      sltiu    $1, $6, 0x4 <chksum_init+0x4> 3c: 
04 00 c1 2c      sltiu    $1, $6, 0x4 <chksum_init+0x4>
       40: 0a 00 20 10      beqz    $1, 0x6c <chksum_update+0x54> 40: 04 
00 20 f8      bnezc    $1, 0x54 <chksum_update+0x3c>
       44: 03 f8 c3 7c      dext    $3, $6, 0x0, 0x20 
<chksum_update+0x8> ;     return le32_to_cpu(__get_unaligned_t(__le32, p));
;     if (len & sizeof(u16)) { 44: 00 00 a3 8c      lw    $3, 0x0($5)
       48: 02 00 61 30      andi    $1, $3, 0x2 <chksum_init+0x2> 
;         CRC32(crc, value, w);
       4c: 0c 00 20 f8      bnezc    $1, 0x80 <chksum_update+0x68>
;     if (len & sizeof(u8)) { 48: 8f 00 62 7c      <unknown> crc32w    
v0,v1,v0
       50: 01 00 61 30      andi    $1, $3, 0x1 <chksum_init+0x1>
       54: 02 00 20 d8      beqzc    $1, 0x60 <chksum_update+0x48> 
;         len -= sizeof(u32);
;         CRC32(crc, value, b); 4c: fc ff c6 24      addiu    $6, $6, 
-0x4 <chksumc_digest+0xfffffffffffffd4c>
       58: 00 00 a3 90      lbu    $3, 0x0($5) ;         p += sizeof(u32);
50: 04 00 a5 64      daddiu    $5, $5, 0x4 <chksum_init+0x4>
       5c: 0f 00 62 7c      <unknown> crc32b v0,v1,v0 ;     if (len & 
sizeof(u16)) {
54: 03 f8 c3 7c      dext    $3, $6, 0x0, 0x20 <chksum_update+0x8>
;     ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); 58: 02 00 61 
30      andi    $1, $3, 0x2 <chksum_init+0x2>
       60: 08 00 82 ac      sw    $2, 0x8($4) 5c: 03 00 20 d8      
beqzc    $1, 0x6c <chksum_update+0x54>
;     return 0; ;         CRC32(crc, value, h);
       64: 09 00 e0 03      jr $ra 60: 00 00 a6 94      lhu    $6, 0x0($5)
       68: 00 00 02 64      daddiu    $2, $zero, 0x0 <chksum_init>
;     return le32_to_cpu(__get_unaligned_t(__le32, p)); 64: 4f 00 c2 
7c      <unknown> crc32h    v0,a2,v0
       6c: 00 00 a6 8c      lw    $6, 0x0($5)
;         CRC32(crc, value, w); ;         p += sizeof(u16);
       70: 8f 00 c2 7c      <unknown> crc32w v0,a2,v0 68: 02 00 a5 
64      daddiu    $5, $5, 0x2 <chksum_init+0x2>
;     if (len & sizeof(u16)) { ;     if (len & sizeof(u8)) {
       74: 02 00 61 30      andi    $1, $3, 0x2 <chksum_init+0x2> 6c: 01 
00 61 30      andi    $1, $3, 0x1 <chksum_init+0x1>
       78: f5 ff 20 10      beqz    $1, 0x50 <chksum_update+0x38> 70: 02 
00 20 d8      beqzc    $1, 0x7c <chksum_update+0x64>
       7c: 04 00 a5 64      daddiu    $5, $5, 0x4 <chksum_init+0x4>
;         CRC32(crc, value, h); ;         CRC32(crc, value, b);
       80: 00 00 a6 94      lhu    $6, 0x0($5) 74: 00 00 a3 90      
lbu    $3, 0x0($5)

       84: 4f 00 c2 7c      <unknown> crc32h v0,a2,v0 78: 0f 00 62 
7c      <unknown> crc32b    v0,v1,v0

;     if (len & sizeof(u8)) { ;     ctx->crc = 
crc32_mips_le_hw(ctx->crc, data, length);
       88: 01 00 61 30      andi    $1, $3, 0x1 <chksum_init+0x1> 7c: 08 
00 82 ac      sw    $2, 0x8($4)
       8c: f4 ff 20 10      beqz    $1, 0x60 <chksum_update+0x48> ;     
return 0;
       90: 02 00 a5 64      daddiu    $5, $5, 0x2 <chksum_init+0x2> 80: 
09 00 e0 03      jr    $ra
       94: 00 00 00 08      j    0x0 <chksum_init> 84: 00 00 02 64      
daddiu    $2, $zero, 0x0 <chksum_init>


In our testing, this assignment operation affects Clang's code expansion
and instruction reordering.

The redundant assignment confuses Clang and prevents it from generating
optimized assembly code.


I extracted the 'crc32_mips_le_hw()' function into a user-mode demo to
analyze the assembly code generated for it on MIPS64.

Link: https://godbolt.org/z/r4dGbhTGf



As you can see, regardless of the Clang or GCC version, this redundant
operation affects the generated assembly code.
 >
 > If it does make a difference, you should avoid doing ifdefs as they
 > are more likely to cause build failures.  Instead do something like
 >
 >         if (!IS_ENABLED(CONFIG_64BIT))
 >             len -= sizeof(u32);
Okay, I'll send a v2 patch that uses IS_ENABLED() as suggested and updates
the commit message based on the above.
 >
 >
 > Cheers,
Wentao Guan June 28, 2024, 3:59 p.m. UTC | #3
The line wrapping mangled the side-by-side listing in the previous message; the original, unwrapped diff follows:

0000000000000018 <chksum_update>:                                                                                                0000000000000018 <chksum_update>:
;     ctx->crc = crc32_mips_le_hw(ctx->crc, data, length);                                                               ;     ctx->crc = crc32_mips_le_hw(ctx->crc, data, length);
      18: 08 00 82 8c      lw    $2, 0x8($4)                                                                                                             18: 08 00 82 8c      lw    $2, 0x8($4)
;     while (len >= sizeof(u64)) {                                                                                                                    ;     while (len >= sizeof(u64)) {
      1c: 08 00 c1 2c      sltiu    $1, $6, 0x8 <chksum_init+0x8>                                                                 1c: 08 00 c1 2c      sltiu    $1, $6, 0x8 <chksum_init+0x8>
      20: 06 00 20 f8      bnezc    $1, 0x3c <chksum_update+0x24>                                                         20: 06 00 20 f8      bnezc    $1, 0x3c <chksum_update+0x24>
;     return le64_to_cpu(__get_unaligned_t(__le64, p));                                                                 ;     return le64_to_cpu(__get_unaligned_t(__le64, p));
      24: 00 00 a3 dc      ld    $3, 0x0($5)                                                                                                              24: 00 00 a3 dc      ld    $3, 0x0($5)
;         CRC32(crc, value, d);                                                                                                                           ;         CRC32(crc, value, d);
      28: cf 00 62 7c      <unknown> crc32d    v0,v1,v0                                                                                 28: cf 00 62 7c      <unknown> crc32d    v0,v1,v0
;         len -= sizeof(u64);                                                                                                                                 ;         len -= sizeof(u64);
      2c: f8 ff c6 24      addiu    $6, $6, -0x8 <chksumc_digest+0xfffffffffffffce0>                                2c: f8 ff c6 24      addiu    $6, $6, -0x8 <chksumc_digest+0xfffffffffffffd48>
;     while (len >= sizeof(u64)) {                                                                                                                    ;     while (len >= sizeof(u64)) {
      30: 08 00 c1 2c      sltiu    $1, $6, 0x8 <chksum_init+0x8>                                                                30: 08 00 c1 2c      sltiu    $1, $6, 0x8 <chksum_init+0x8>
      34: fb ff 20 10      beqz    $1, 0x24 <chksum_update+0xc>                                                              34: fb ff 20 10      beqz    $1, 0x24 <chksum_update+0xc>
      38: 08 00 a5 64      daddiu    $5, $5, 0x8 <chksum_init+0x8>                                                          38: 08 00 a5 64      daddiu    $5, $5, 0x8 <chksum_init+0x8>
;     if (len & sizeof(u32)) {                                                                                                                              ;     if (len & sizeof(u32)) {
      3c: 04 00 c1 2c      sltiu    $1, $6, 0x4 <chksum_init+0x4>                                                                3c: 04 00 c1 2c      sltiu    $1, $6, 0x4 <chksum_init+0x4>
      40: 0a 00 20 10      beqz    $1, 0x6c <chksum_update+0x54>                                                         40: 04 00 20 f8      bnezc    $1, 0x54 <chksum_update+0x3c>
      44: 03 f8 c3 7c      dext    $3, $6, 0x0, 0x20 <chksum_update+0x8>                                       ;     return le32_to_cpu(__get_unaligned_t(__le32, p));
;     if (len & sizeof(u16)) {                                                                                                                                    44: 00 00 a3 8c      lw    $3, 0x0($5)
      48: 02 00 61 30      andi    $1, $3, 0x2 <chksum_init+0x2>                                                         ;         CRC32(crc, value, w);
      4c: 0c 00 20 f8      bnezc    $1, 0x80 <chksum_update+0x68>
;     if (len & sizeof(u8)) {                                                                                                                                      48: 8f 00 62 7c      <unknown> crc32w    v0,v1,v0
      50: 01 00 61 30      andi    $1, $3, 0x1 <chksum_init+0x1>
      54: 02 00 20 d8      beqzc    $1, 0x60 <chksum_update+0x48>                                                 ;         len -= sizeof(u32);
;         CRC32(crc, value, b);                                                                                                                                 4c: fc ff c6 24      addiu    $6, $6, -0x4 <chksumc_digest+0xfffffffffffffd4c>
      58: 00 00 a3 90      lbu    $3, 0x0($5)                                                                                                     ;         p += sizeof(u32);
                                                                                                                                                                                     50: 04 00 a5 64      daddiu    $5, $5, 0x4 <chksum_init+0x4>
      5c: 0f 00 62 7c      <unknown> crc32b    v0,v1,v0                                                                           ;     if (len & sizeof(u16)) {
                                                                                                                                                                                     54: 03 f8 c3 7c      dext    $3, $6, 0x0, 0x20 <chksum_update+0x8>
;     ctx->crc = crc32_mips_le_hw(ctx->crc, data, length);                                                                   58: 02 00 61 30      andi    $1, $3, 0x2 <chksum_init+0x2>
      60: 08 00 82 ac      sw    $2, 0x8($4)                                                                                                           5c: 03 00 20 d8      beqzc    $1, 0x6c <chksum_update+0x54>
;     return 0;                                                                                                                                                        ;         CRC32(crc, value, h);
      64: 09 00 e0 03      jr    $ra                                                                                                                              60: 00 00 a6 94      lhu    $6, 0x0($5)
      68: 00 00 02 64      daddiu    $2, $zero, 0x0 <chksum_init>
;     return le32_to_cpu(__get_unaligned_t(__le32, p));                                                                      64: 4f 00 c2 7c      <unknown> crc32h    v0,a2,v0
      6c: 00 00 a6 8c      lw    $6, 0x0($5)
;         CRC32(crc, value, w);                                                                                                                          ;         p += sizeof(u16);
      70: 8f 00 c2 7c      <unknown> crc32w    v0,a2,v0                                                                               68: 02 00 a5 64      daddiu    $5, $5, 0x2 <chksum_init+0x2>
;     if (len & sizeof(u16)) {                                                                                                                              ;     if (len & sizeof(u8)) {
      74: 02 00 61 30      andi    $1, $3, 0x2 <chksum_init+0x2>                                                               6c: 01 00 61 30      andi    $1, $3, 0x1 <chksum_init+0x1>
      78: f5 ff 20 10      beqz    $1, 0x50 <chksum_update+0x38>                                                           70: 02 00 20 d8      beqzc    $1, 0x7c <chksum_update+0x64>
      7c: 04 00 a5 64      daddiu    $5, $5, 0x4 <chksum_init+0x4>
;         CRC32(crc, value, h);                                                                                                                           ;         CRC32(crc, value, b);
      80: 00 00 a6 94      lhu    $6, 0x0($5)                                                                                                          74: 00 00 a3 90      lbu    $3, 0x0($5)

      84: 4f 00 c2 7c      <unknown> crc32h    v0,a2,v0                                                                                78: 0f 00 62 7c      <unknown> crc32b    v0,v1,v0

;     if (len & sizeof(u8)) {                                                                                                                                ;     ctx->crc = crc32_mips_le_hw(ctx->crc, data, length);
      88: 01 00 61 30      andi    $1, $3, 0x1 <chksum_init+0x1>                                                               7c: 08 00 82 ac      sw    $2, 0x8($4)
      8c: f4 ff 20 10      beqz    $1, 0x60 <chksum_update+0x48>                                                      ;     return 0;
      90: 02 00 a5 64      daddiu    $5, $5, 0x2 <chksum_init+0x2>                                                         80: 09 00 e0 03      jr    $ra
      94: 00 00 00 08      j    0x0 <chksum_init>                                                                                               84: 00 00 02 64      daddiu    $2, $zero, 0x0 <chksum_init>

Guan Wentao
Maciej W. Rozycki June 28, 2024, 4:42 p.m. UTC | #4
On Fri, 28 Jun 2024, WangYuli wrote:

> As you can see, regardless of the Clang or GCC version, this redundant
> operation affects the generated
> 
> assembly code.

 Boy, this code is horribly structured!

 Anyway, rather than making it yet worse with another #ifdef I'd suggest 
replacing both `while' loops with equivalent `for' ones with `len' being 
the worker variable, which will then make the code structure a little bit 
better and as a side effect address the missed optimisation automagically.

 You might also consider Herbert's suggestion to use IS_ENABLED, however 
in the current shape of code I find it kind of pointless anyway.  Instead 
I think that it would make more sense to factor out the block bodies to 
small static inline helpers and then restructure the call sites so that 
IS_ENABLED controls the loops, the conditionals, and the choice between 
them as applicable.

  Maciej
diff mbox series

Patch

diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c
index ec6d58008f8e..505d2d897849 100644
--- a/arch/mips/crypto/crc32-mips.c
+++ b/arch/mips/crypto/crc32-mips.c
@@ -94,7 +94,9 @@  static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len)
 
 		CRC32(crc, value, w);
 		p += sizeof(u32);
+#ifndef CONFIG_64BIT
 		len -= sizeof(u32);
+#endif
 	}
 
 	if (len & sizeof(u16)) {
@@ -134,7 +136,9 @@  static u32 crc32c_mips_le_hw(u32 crc_, const u8 *p, unsigned int len)
 
 		CRC32C(crc, value, w);
 		p += sizeof(u32);
+#ifndef CONFIG_64BIT
 		len -= sizeof(u32);
+#endif
 	}
 
 	if (len & sizeof(u16)) {