Message ID | 1D248893502B75F5+20240628084117.84264-1-wangyuli@uniontech.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | MIPS: crypto: Clean up useless assignment operations | expand |
On Fri, Jun 28, 2024 at 04:41:17PM +0800, WangYuli wrote: > When entering the "len & sizeof(u32)" branch, len must be less than 8. > So after one operation, len must be less than 4. > At this time, "len -= sizeof(u32)" is not necessary for 64-bit CPUs. > > A similar issue has been solved at Loongarch. > > Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.10-rc5&id=fea1c949f6ca5059e12de00d0483645debc5b206 > Signed-off-by: Guan Wentao <guanwentao@uniontech.com> > Signed-off-by: WangYuli <wangyuli@uniontech.com> > --- > arch/mips/crypto/crc32-mips.c | 4 ++++ > 1 file changed, 4 insertions(+) > > diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c > index ec6d58008f8e..505d2d897849 100644 > --- a/arch/mips/crypto/crc32-mips.c > +++ b/arch/mips/crypto/crc32-mips.c > @@ -94,7 +94,9 @@ static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len) > > CRC32(crc, value, w); > p += sizeof(u32); > +#ifndef CONFIG_64BIT > len -= sizeof(u32); > +#endif First of all, did you verify that this actually makes a difference? Please post the actual assembly output with and without this patch. If it does make a difference, you should avoid doing ifdefs as they are more likely to cause build failures. Instead do something like if (!IS_ENABLED(CONFIG_64BIT)) len -= sizeof(u32); Cheers,
On 2024/6/28 18:12, Herbert Xu wrote: > On Fri, Jun 28, 2024 at 04:41:17PM +0800, WangYuli wrote: >> When entering the "len & sizeof(u32)" branch, len must be less than 8. >> So after one operation, len must be less than 4. >> At this time, "len -= sizeof(u32)" is not necessary for 64-bit CPUs. >> >> A similar issue has been solved at Loongarch. >> >> Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.10-rc5&id=fea1c949f6ca5059e12de00d0483645debc5b206 >> Signed-off-by: Guan Wentao <guanwentao@uniontech.com> >> Signed-off-by: WangYuli <wangyuli@uniontech.com> >> --- >> arch/mips/crypto/crc32-mips.c | 4 ++++ >> 1 file changed, 4 insertions(+) >> >> diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c >> index ec6d58008f8e..505d2d897849 100644 >> --- a/arch/mips/crypto/crc32-mips.c >> +++ b/arch/mips/crypto/crc32-mips.c >> @@ -94,7 +94,9 @@ static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len) >> >> CRC32(crc, value, w); >> p += sizeof(u32); >> +#ifndef CONFIG_64BIT >> len -= sizeof(u32); >> +#endif > > First of all, did you verify that this actually makes a difference? > Please post the actual assembly output with and without this patch. Sure. The left shows the assembly after applying this patch, while the right shows the original code. 
( Generated by Clang 17.0.6 ) 0000000000000018 <chksum_update>: 0000000000000018 <chksum_update>: ; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); ; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); 18: 08 00 82 8c lw $2, 0x8($4) 18: 08 00 82 8c lw $2, 0x8($4) ; while (len >= sizeof(u64)) { ; while (len >= sizeof(u64)) { 1c: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 1c: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 20: 06 00 20 f8 bnezc $1, 0x3c <chksum_update+0x24> 20: 06 00 20 f8 bnezc $1, 0x3c <chksum_update+0x24> ; return le64_to_cpu(__get_unaligned_t(__le64, p)); ; return le64_to_cpu(__get_unaligned_t(__le64, p)); 24: 00 00 a3 dc ld $3, 0x0($5) 24: 00 00 a3 dc ld $3, 0x0($5) ; CRC32(crc, value, d); ; CRC32(crc, value, d); 28: cf 00 62 7c <unknown> crc32d v0,v1,v0 28: cf 00 62 7c <unknown> crc32d v0,v1,v0 ; len -= sizeof(u64); ; len -= sizeof(u64); 2c: f8 ff c6 24 addiu $6, $6, -0x8 <chksumc_digest+0xfffffffffffffce0> 2c: f8 ff c6 24 addiu $6, $6, -0x8 <chksumc_digest+0xfffffffffffffd48> ; while (len >= sizeof(u64)) { ; while (len >= sizeof(u64)) { 30: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 30: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 34: fb ff 20 10 beqz $1, 0x24 <chksum_update+0xc> 34: fb ff 20 10 beqz $1, 0x24 <chksum_update+0xc> 38: 08 00 a5 64 daddiu $5, $5, 0x8 <chksum_init+0x8> 38: 08 00 a5 64 daddiu $5, $5, 0x8 <chksum_init+0x8> ; if (len & sizeof(u32)) { ; if (len & sizeof(u32)) { 3c: 04 00 c1 2c sltiu $1, $6, 0x4 <chksum_init+0x4> 3c: 04 00 c1 2c sltiu $1, $6, 0x4 <chksum_init+0x4> 40: 0a 00 20 10 beqz $1, 0x6c <chksum_update+0x54> 40: 04 00 20 f8 bnezc $1, 0x54 <chksum_update+0x3c> 44: 03 f8 c3 7c dext $3, $6, 0x0, 0x20 <chksum_update+0x8> ; return le32_to_cpu(__get_unaligned_t(__le32, p)); ; if (len & sizeof(u16)) { 44: 00 00 a3 8c lw $3, 0x0($5) 48: 02 00 61 30 andi $1, $3, 0x2 <chksum_init+0x2> ; CRC32(crc, value, w); 4c: 0c 00 20 f8 bnezc $1, 0x80 <chksum_update+0x68> ; if (len & sizeof(u8)) { 48: 8f 00 62 7c 
<unknown> crc32w v0,v1,v0 50: 01 00 61 30 andi $1, $3, 0x1 <chksum_init+0x1> 54: 02 00 20 d8 beqzc $1, 0x60 <chksum_update+0x48> ; len -= sizeof(u32); ; CRC32(crc, value, b); 4c: fc ff c6 24 addiu $6, $6, -0x4 <chksumc_digest+0xfffffffffffffd4c> 58: 00 00 a3 90 lbu $3, 0x0($5) ; p += sizeof(u32); 50: 04 00 a5 64 daddiu $5, $5, 0x4 <chksum_init+0x4> 5c: 0f 00 62 7c <unknown> crc32b v0,v1,v0 ; if (len & sizeof(u16)) { 54: 03 f8 c3 7c dext $3, $6, 0x0, 0x20 <chksum_update+0x8> ; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); 58: 02 00 61 30 andi $1, $3, 0x2 <chksum_init+0x2> 60: 08 00 82 ac sw $2, 0x8($4) 5c: 03 00 20 d8 beqzc $1, 0x6c <chksum_update+0x54> ; return 0; ; CRC32(crc, value, h); 64: 09 00 e0 03 jr $ra 60: 00 00 a6 94 lhu $6, 0x0($5) 68: 00 00 02 64 daddiu $2, $zero, 0x0 <chksum_init> ; return le32_to_cpu(__get_unaligned_t(__le32, p)); 64: 4f 00 c2 7c <unknown> crc32h v0,a2,v0 6c: 00 00 a6 8c lw $6, 0x0($5) ; CRC32(crc, value, w); ; p += sizeof(u16); 70: 8f 00 c2 7c <unknown> crc32w v0,a2,v0 68: 02 00 a5 64 daddiu $5, $5, 0x2 <chksum_init+0x2> ; if (len & sizeof(u16)) { ; if (len & sizeof(u8)) { 74: 02 00 61 30 andi $1, $3, 0x2 <chksum_init+0x2> 6c: 01 00 61 30 andi $1, $3, 0x1 <chksum_init+0x1> 78: f5 ff 20 10 beqz $1, 0x50 <chksum_update+0x38> 70: 02 00 20 d8 beqzc $1, 0x7c <chksum_update+0x64> 7c: 04 00 a5 64 daddiu $5, $5, 0x4 <chksum_init+0x4> ; CRC32(crc, value, h); ; CRC32(crc, value, b); 80: 00 00 a6 94 lhu $6, 0x0($5) 74: 00 00 a3 90 lbu $3, 0x0($5) 84: 4f 00 c2 7c <unknown> crc32h v0,a2,v0 78: 0f 00 62 7c <unknown> crc32b v0,v1,v0 ; if (len & sizeof(u8)) { ; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); 88: 01 00 61 30 andi $1, $3, 0x1 <chksum_init+0x1> 7c: 08 00 82 ac sw $2, 0x8($4) 8c: f4 ff 20 10 beqz $1, 0x60 <chksum_update+0x48> ; return 0; 90: 02 00 a5 64 daddiu $5, $5, 0x2 <chksum_init+0x2> 80: 09 00 e0 03 jr $ra 94: 00 00 00 08 j 0x0 <chksum_init> 84: 00 00 02 64 daddiu $2, $zero, 0x0 <chksum_init> In our testing, this 
assignment operation affects Clang's code expansion and instruction reordering. This redundant assignment operation confuses Clang and prevents us from obtaining optimized assembly code. I extracted the 'crc32_mips_le_hw()' function as a user-mode demo to analyze the assembly code generated for it on MIPS64. Link: https://godbolt.org/z/r4dGbhTGf As you can see, regardless of the Clang or GCC version, this redundant operation affects the generated assembly code. > > If it does make a difference, you should avoid doing ifdefs as they > are more likely to cause build failures. Instead do something like > > if (!IS_ENABLED(CONFIG_64BIT)) > len -= sizeof(u32); Okay, I'll send a Patch V2 to fix this and update the commit message based on the above. > > > Cheers,
The line folding in the previous mail looks garbled; the original diff is as follows: 0000000000000018 <chksum_update>: 0000000000000018 <chksum_update>: ; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); ; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); 18: 08 00 82 8c lw $2, 0x8($4) 18: 08 00 82 8c lw $2, 0x8($4) ; while (len >= sizeof(u64)) { ; while (len >= sizeof(u64)) { 1c: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 1c: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 20: 06 00 20 f8 bnezc $1, 0x3c <chksum_update+0x24> 20: 06 00 20 f8 bnezc $1, 0x3c <chksum_update+0x24> ; return le64_to_cpu(__get_unaligned_t(__le64, p)); ; return le64_to_cpu(__get_unaligned_t(__le64, p)); 24: 00 00 a3 dc ld $3, 0x0($5) 24: 00 00 a3 dc ld $3, 0x0($5) ; CRC32(crc, value, d); ; CRC32(crc, value, d); 28: cf 00 62 7c <unknown> crc32d v0,v1,v0 28: cf 00 62 7c <unknown> crc32d v0,v1,v0 ; len -= sizeof(u64); ; len -= sizeof(u64); 2c: f8 ff c6 24 addiu $6, $6, -0x8 <chksumc_digest+0xfffffffffffffce0> 2c: f8 ff c6 24 addiu $6, $6, -0x8 <chksumc_digest+0xfffffffffffffd48> ; while (len >= sizeof(u64)) { ; while (len >= sizeof(u64)) { 30: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 30: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 34: fb ff 20 10 beqz $1, 0x24 <chksum_update+0xc> 34: fb ff 20 10 beqz $1, 0x24 <chksum_update+0xc> 38: 08 00 a5 64 daddiu $5, $5, 0x8 <chksum_init+0x8> 38: 08 00 a5 64 daddiu $5, $5, 0x8 <chksum_init+0x8> ; if (len & sizeof(u32)) { ; if (len & sizeof(u32)) { 3c: 04 00 c1 2c sltiu $1, $6, 0x4 <chksum_init+0x4> 3c: 04 00 c1 2c sltiu $1, $6, 0x4 <chksum_init+0x4> 40: 0a 00 20 10 beqz $1, 0x6c <chksum_update+0x54> 40: 04 00 20 f8 bnezc $1, 0x54 <chksum_update+0x3c> 44: 03 f8 c3 7c dext $3, $6, 0x0, 0x20 <chksum_update+0x8> ; return le32_to_cpu(__get_unaligned_t(__le32, p)); ; if (len & sizeof(u16)) { 44: 00 00 a3 8c lw $3, 0x0($5) 48: 02 00 61 30 andi $1, $3, 0x2 <chksum_init+0x2> ; CRC32(crc, value, w); 4c: 0c 00 20 f8 bnezc $1, 0x80 <chksum_update+0x68> ; if (len 
& sizeof(u8)) { 48: 8f 00 62 7c <unknown> crc32w v0,v1,v0 50: 01 00 61 30 andi $1, $3, 0x1 <chksum_init+0x1> 54: 02 00 20 d8 beqzc $1, 0x60 <chksum_update+0x48> ; len -= sizeof(u32); ; CRC32(crc, value, b); 4c: fc ff c6 24 addiu $6, $6, -0x4 <chksumc_digest+0xfffffffffffffd4c> 58: 00 00 a3 90 lbu $3, 0x0($5) ; p += sizeof(u32); 50: 04 00 a5 64 daddiu $5, $5, 0x4 <chksum_init+0x4> 5c: 0f 00 62 7c <unknown> crc32b v0,v1,v0 ; if (len & sizeof(u16)) { 54: 03 f8 c3 7c dext $3, $6, 0x0, 0x20 <chksum_update+0x8> ; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); 58: 02 00 61 30 andi $1, $3, 0x2 <chksum_init+0x2> 60: 08 00 82 ac sw $2, 0x8($4) 5c: 03 00 20 d8 beqzc $1, 0x6c <chksum_update+0x54> ; return 0; ; CRC32(crc, value, h); 64: 09 00 e0 03 jr $ra 60: 00 00 a6 94 lhu $6, 0x0($5) 68: 00 00 02 64 daddiu $2, $zero, 0x0 <chksum_init> ; return le32_to_cpu(__get_unaligned_t(__le32, p)); 64: 4f 00 c2 7c <unknown> crc32h v0,a2,v0 6c: 00 00 a6 8c lw $6, 0x0($5) ; CRC32(crc, value, w); ; p += sizeof(u16); 70: 8f 00 c2 7c <unknown> crc32w v0,a2,v0 68: 02 00 a5 64 daddiu $5, $5, 0x2 <chksum_init+0x2> ; if (len & sizeof(u16)) { ; if (len & sizeof(u8)) { 74: 02 00 61 30 andi $1, $3, 0x2 <chksum_init+0x2> 6c: 01 00 61 30 andi $1, $3, 0x1 <chksum_init+0x1> 78: f5 ff 20 10 beqz $1, 0x50 <chksum_update+0x38> 70: 02 00 20 d8 beqzc $1, 0x7c <chksum_update+0x64> 7c: 04 00 a5 64 daddiu $5, $5, 0x4 <chksum_init+0x4> ; CRC32(crc, value, h); ; CRC32(crc, value, b); 80: 00 00 a6 94 lhu $6, 0x0($5) 74: 00 00 a3 90 lbu $3, 0x0($5) 84: 4f 00 c2 7c <unknown> crc32h v0,a2,v0 78: 0f 00 62 7c <unknown> crc32b v0,v1,v0 ; if (len & sizeof(u8)) { ; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); 88: 01 00 61 30 andi $1, $3, 0x1 <chksum_init+0x1> 7c: 08 00 82 ac sw $2, 0x8($4) 8c: f4 ff 20 10 beqz $1, 0x60 <chksum_update+0x48> ; return 0; 90: 02 00 a5 64 daddiu $5, $5, 0x2 <chksum_init+0x2> 80: 09 00 e0 03 jr $ra 94: 00 00 00 08 j 0x0 <chksum_init> 84: 00 00 02 64 daddiu $2, $zero, 0x0 
<chksum_init> Guan Wentao
On Fri, 28 Jun 2024, WangYuli wrote: > As you can see, regardless of the Clang or GCC version, this redundant > operation affects the generated > > assembly code. Boy, this code is horribly structured! Anyway, rather than making it yet worse with another #ifdef I'd suggest replacing both `while' loops with equivalent `for' ones with `len' being the worker variable, which will then make the code structure a little bit better and as a side effect address the missed optimisation automagically. You might also consider Herbert's suggestion to use IS_ENABLED, however in the current shape of code I find it kind of pointless anyway. Instead I think that it would make more sense to factor out the block bodies to small static inline helpers and then restructure the call sites so that IS_ENABLED controls the loops, the conditionals, and the choice between them as applicable. Maciej
diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c index ec6d58008f8e..505d2d897849 100644 --- a/arch/mips/crypto/crc32-mips.c +++ b/arch/mips/crypto/crc32-mips.c @@ -94,7 +94,9 @@ static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len) CRC32(crc, value, w); p += sizeof(u32); +#ifndef CONFIG_64BIT len -= sizeof(u32); +#endif } if (len & sizeof(u16)) { @@ -134,7 +136,9 @@ static u32 crc32c_mips_le_hw(u32 crc_, const u8 *p, unsigned int len) CRC32C(crc, value, w); p += sizeof(u32); +#ifndef CONFIG_64BIT len -= sizeof(u32); +#endif } if (len & sizeof(u16)) {