diff mbox series

[2/2] crypto: x86/sha256-ni - optimize code size

Message ID 20240409124216.9261-3-ebiggers@kernel.org (mailing list archive)
State Superseded
Delegated to: Herbert Xu
Headers show
Series crypto: x86/sha256-ni - cleanup and optimization | expand

Commit Message

Eric Biggers April 9, 2024, 12:42 p.m. UTC
From: Eric Biggers <ebiggers@google.com>

- Load the SHA-256 round constants relative to a pointer that points
  into the middle of the constants rather than to the beginning.  Since
  x86 instructions use signed offsets, this decreases the instruction
  length required to access some of the later round constants.

- Use punpcklqdq or punpckhqdq instead of longer instructions such as
  pshufd, pblendw, and palignr.  This doesn't harm performance.

The end result is that sha256_ni_transform shrinks from 839 bytes to 791
bytes, with no loss in performance.

Suggested-by: Stefan Kanthak <stefan.kanthak@nexgo.de>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/sha256_ni_asm.S | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)
diff mbox series

Patch

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index e485520e3b49..4d373069448d 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -82,19 +82,19 @@ 
 	pshufb		SHUF_MASK, MSG
 	movdqa		MSG, \m0
 .else
 	movdqa		\m0, MSG
 .endif
-	paddd		\i*4(SHA256CONSTANTS), MSG
+	paddd		(\i-32)*4(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
 .if \i >= 12 && \i < 60
 	movdqa		\m0, MSGTMP4
 	palignr		$4, \m3, MSGTMP4
 	paddd		MSGTMP4, \m1
 	sha256msg2	\m0, \m1
 .endif
-	pshufd 		$0x0E, MSG, MSG
+	punpckhqdq	MSG, MSG
 	sha256rnds2	STATE1, STATE0
 .if \i >= 4 && \i < 52
 	sha256msg1	\m0, \m3
 .endif
 .endm
@@ -133,21 +133,21 @@  SYM_TYPED_FUNC_START(sha256_ni_transform)
 	/*
 	 * load initial hash values
 	 * Need to reorder these appropriately
 	 * DCBA, HGFE -> ABEF, CDGH
 	 */
-	movdqu		0*16(DIGEST_PTR), STATE0
-	movdqu		1*16(DIGEST_PTR), STATE1
+	movdqu		0*16(DIGEST_PTR), STATE0	/* DCBA */
+	movdqu		1*16(DIGEST_PTR), STATE1	/* HGFE */
 
-	pshufd		$0xB1, STATE0,  STATE0		/* CDAB */
-	pshufd		$0x1B, STATE1,  STATE1		/* EFGH */
 	movdqa		STATE0, MSGTMP4
-	palignr		$8, STATE1,  STATE0		/* ABEF */
-	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */
+	punpcklqdq	STATE1, STATE0			/* FEBA */
+	punpckhqdq	MSGTMP4, STATE1			/* DCHG */
+	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
+	pshufd		$0xB1, STATE1, STATE1		/* CDGH */
 
 	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-	lea		K256(%rip), SHA256CONSTANTS
+	lea		K256+32*4(%rip), SHA256CONSTANTS
 
 .Lloop0:
 	/* Save hash values for addition after rounds */
 	movdqa		STATE0, ABEF_SAVE
 	movdqa		STATE1, CDGH_SAVE
@@ -165,18 +165,18 @@  SYM_TYPED_FUNC_START(sha256_ni_transform)
 	add		$64, DATA_PTR
 	cmp		NUM_BLKS, DATA_PTR
 	jne		.Lloop0
 
 	/* Write hash values back in the correct order */
-	pshufd		$0x1B, STATE0,  STATE0		/* FEBA */
-	pshufd		$0xB1, STATE1,  STATE1		/* DCHG */
 	movdqa		STATE0, MSGTMP4
-	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
-	palignr		$8, MSGTMP4, STATE1		/* HGFE */
+	punpcklqdq	STATE1, STATE0			/* GHEF */
+	punpckhqdq	MSGTMP4, STATE1			/* ABCD */
+	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
+	pshufd		$0x1B, STATE1, STATE1		/* DCBA */
 
-	movdqu		STATE0, 0*16(DIGEST_PTR)
-	movdqu		STATE1, 1*16(DIGEST_PTR)
+	movdqu		STATE1, 0*16(DIGEST_PTR)
+	movdqu		STATE0, 1*16(DIGEST_PTR)
 
 .Ldone_hash:
 
 	RET
 SYM_FUNC_END(sha256_ni_transform)