| Message ID | 20150316154835.GA31336@google.com (mailing list archive) |
|---|---|
| State | New, archived |
Hello Sami,

On 16 March 2015 at 16:48, Sami Tolvanen <samitolvanen@google.com> wrote:
> Add Andy Polyakov's NEON optimized SHA-256 implementation.
>
> On Nexus 6, this implementation is ~2x faster than sha256-generic.
>
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
>

Have you tested this code with the tcrypt.ko module?

Some more comments below

> ---
>  arch/arm/crypto/Makefile            |    2
>  arch/arm/crypto/sha256-armv7-neon.S |  819 ++++++++++++++++++++++++++++++++++++
>  arch/arm/crypto/sha256_neon_glue.c  |  201 ++++++++
>  crypto/Kconfig                      |   12
>  4 files changed, 1034 insertions(+)
>
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index b48fa34..316dba2 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -6,12 +6,14 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
>  obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
> +obj-$(CONFIG_CRYPTO_SHA256_ARM_NEON) += sha256-arm-neon.o
>  obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
>
>  aes-arm-y := aes-armv4.o aes_glue.o
>  aes-arm-bs-y := aesbs-core.o aesbs-glue.o
>  sha1-arm-y := sha1-armv4-large.o sha1_glue.o
>  sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
> +sha256-arm-neon-y := sha256-armv7-neon.o sha256_neon_glue.o
>  sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
>
>  quiet_cmd_perl = PERL $@
> diff --git a/arch/arm/crypto/sha256-armv7-neon.S b/arch/arm/crypto/sha256-armv7-neon.S
> new file mode 100644
> index 0000000..5ce04c2
> --- /dev/null
> +++ b/arch/arm/crypto/sha256-armv7-neon.S
> @@ -0,0 +1,819 @@
> +@ sha256-armv7-neon.S - ARM/NEON assembly implementation of SHA-256 transform
> +@
> +@ ====================================================================
> +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
> +@ project. The module is, however, dual licensed under OpenSSL and
> +@ CRYPTOGAMS licenses depending on where you obtain it. For further
> +@ details see http://www.openssl.org/~appro/cryptogams/.
> +@ ====================================================================
> +

Did you talk to Andy about the license? I don't think this is
permissible for the kernel as-is.
> +#include <linux/linkage.h> > + > +.text > +.code 32 > +.fpu neon > + > +.type K256,%object > +.align 5 > +K256: > +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 > +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 > +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 > +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 > +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc > +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da > +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 > +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 > +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 > +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 > +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 > +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 > +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 > +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 > +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 > +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 > +.size K256,.-K256 > +.word 0 @ terminator > +.word 0 > +.align 5 > + > +.align 5 > +ENTRY(sha256_transform_neon) > + /* Input: > + * %r0: SHA256_CONTEXT > + * %r1: data > + * %r2: nblks > + */ > + sub r3,pc,#8 @ sha256_transform_neon This is broken on thumb-2, use adr instead > + add r2,r1,r2,lsl#6 @ len to point at the end of inp > + > + stmdb sp!,{r4-r12,lr} > + > + mov r12,sp > + sub sp,sp,#16*4+16 @ alloca > + sub r14,r3,#256+32 @ K256 > + bic sp,sp,#15 @ align for 128-bit stores > + > + vld1.8 {q0},[r1]! > + vld1.8 {q1},[r1]! > + vld1.8 {q2},[r1]! > + vld1.8 {q3},[r1]! > + vld1.32 {q8},[r14,:128]! > + vld1.32 {q9},[r14,:128]! > + vld1.32 {q10},[r14,:128]! > + vld1.32 {q11},[r14,:128]! > + vrev32.8 q0,q0 @ yes, even on > + str r0,[sp,#64] > + vrev32.8 q1,q1 @ big-endian > + str r1,[sp,#68] > + mov r1,sp > + vrev32.8 q2,q2 > + str r2,[sp,#72] > + vrev32.8 q3,q3 > + str r12,[sp,#76] @ save original sp > + vadd.i32 q8,q8,q0 > + vadd.i32 q9,q9,q1 > + vst1.32 {q8},[r1,:128]! > + vadd.i32 q10,q10,q2 > + vst1.32 {q9},[r1,:128]! > + vadd.i32 q11,q11,q3 > + vst1.32 {q10},[r1,:128]! > + vst1.32 {q11},[r1,:128]! 
> + > + ldmia r0,{r4-r11} > + sub r1,r1,#64 > + ldr r2,[sp,#0] > + eor r12,r12,r12 > + eor r3,r5,r6 > + b .L_00_48 > + > +.align 4 > +.L_00_48: > + vext.8 q8,q0,q1,#4 > + add r11,r11,r2 > + eor r2,r9,r10 > + eor r0,r8,r8,ror#5 > + vext.8 q9,q2,q3,#4 > + add r4,r4,r12 > + and r2,r2,r8 > + eor r12,r0,r8,ror#19 > + vshr.u32 q10,q8,#7 > + eor r0,r4,r4,ror#11 > + eor r2,r2,r10 > + vadd.i32 q0,q0,q9 > + add r11,r11,r12,ror#6 > + eor r12,r4,r5 > + vshr.u32 q9,q8,#3 > + eor r0,r0,r4,ror#20 > + add r11,r11,r2 > + vsli.32 q10,q8,#25 > + ldr r2,[sp,#4] > + and r3,r3,r12 > + vshr.u32 q11,q8,#18 > + add r7,r7,r11 > + add r11,r11,r0,ror#2 > + eor r3,r3,r5 > + veor q9,q9,q10 > + add r10,r10,r2 > + vsli.32 q11,q8,#14 > + eor r2,r8,r9 > + eor r0,r7,r7,ror#5 > + vshr.u32 d24,d7,#17 > + add r11,r11,r3 > + and r2,r2,r7 > + veor q9,q9,q11 > + eor r3,r0,r7,ror#19 > + eor r0,r11,r11,ror#11 > + vsli.32 d24,d7,#15 > + eor r2,r2,r9 > + add r10,r10,r3,ror#6 > + vshr.u32 d25,d7,#10 > + eor r3,r11,r4 > + eor r0,r0,r11,ror#20 > + vadd.i32 q0,q0,q9 > + add r10,r10,r2 > + ldr r2,[sp,#8] > + veor d25,d25,d24 > + and r12,r12,r3 > + add r6,r6,r10 > + vshr.u32 d24,d7,#19 > + add r10,r10,r0,ror#2 > + eor r12,r12,r4 > + vsli.32 d24,d7,#13 > + add r9,r9,r2 > + eor r2,r7,r8 > + veor d25,d25,d24 > + eor r0,r6,r6,ror#5 > + add r10,r10,r12 > + vadd.i32 d0,d0,d25 > + and r2,r2,r6 > + eor r12,r0,r6,ror#19 > + vshr.u32 d24,d0,#17 > + eor r0,r10,r10,ror#11 > + eor r2,r2,r8 > + vsli.32 d24,d0,#15 > + add r9,r9,r12,ror#6 > + eor r12,r10,r11 > + vshr.u32 d25,d0,#10 > + eor r0,r0,r10,ror#20 > + add r9,r9,r2 > + veor d25,d25,d24 > + ldr r2,[sp,#12] > + and r3,r3,r12 > + vshr.u32 d24,d0,#19 > + add r5,r5,r9 > + add r9,r9,r0,ror#2 > + eor r3,r3,r11 > + vld1.32 {q8},[r14,:128]! > + add r8,r8,r2 > + vsli.32 d24,d0,#13 > + eor r2,r6,r7 > + eor r0,r5,r5,ror#5 > + veor d25,d25,d24 > + add r9,r9,r3 > + and r2,r2,r5 > + vadd.i32 d1,d1,d25 > + eor r3,r0,r5,ror#19 > + eor r0,r9,r9,ror#11 > + vadd.i32 q8,q8,q0 > + eor r2,r2,r7 > + add r8,r8,r3,ror#6 > + eor r3,r9,r10 > + eor r0,r0,r9,ror#20 > + add r8,r8,r2 > + ldr r2,[sp,#16] > + and r12,r12,r3 > + add r4,r4,r8 > + vst1.32 {q8},[r1,:128]! 
> + add r8,r8,r0,ror#2 > + eor r12,r12,r10 > + vext.8 q8,q1,q2,#4 > + add r7,r7,r2 > + eor r2,r5,r6 > + eor r0,r4,r4,ror#5 > + vext.8 q9,q3,q0,#4 > + add r8,r8,r12 > + and r2,r2,r4 > + eor r12,r0,r4,ror#19 > + vshr.u32 q10,q8,#7 > + eor r0,r8,r8,ror#11 > + eor r2,r2,r6 > + vadd.i32 q1,q1,q9 > + add r7,r7,r12,ror#6 > + eor r12,r8,r9 > + vshr.u32 q9,q8,#3 > + eor r0,r0,r8,ror#20 > + add r7,r7,r2 > + vsli.32 q10,q8,#25 > + ldr r2,[sp,#20] > + and r3,r3,r12 > + vshr.u32 q11,q8,#18 > + add r11,r11,r7 > + add r7,r7,r0,ror#2 > + eor r3,r3,r9 > + veor q9,q9,q10 > + add r6,r6,r2 > + vsli.32 q11,q8,#14 > + eor r2,r4,r5 > + eor r0,r11,r11,ror#5 > + vshr.u32 d24,d1,#17 > + add r7,r7,r3 > + and r2,r2,r11 > + veor q9,q9,q11 > + eor r3,r0,r11,ror#19 > + eor r0,r7,r7,ror#11 > + vsli.32 d24,d1,#15 > + eor r2,r2,r5 > + add r6,r6,r3,ror#6 > + vshr.u32 d25,d1,#10 > + eor r3,r7,r8 > + eor r0,r0,r7,ror#20 > + vadd.i32 q1,q1,q9 > + add r6,r6,r2 > + ldr r2,[sp,#24] > + veor d25,d25,d24 > + and r12,r12,r3 > + add r10,r10,r6 > + vshr.u32 d24,d1,#19 > + add r6,r6,r0,ror#2 > + eor r12,r12,r8 > + vsli.32 d24,d1,#13 > + add r5,r5,r2 > + eor r2,r11,r4 > + veor d25,d25,d24 > + eor r0,r10,r10,ror#5 > + add r6,r6,r12 > + vadd.i32 d2,d2,d25 > + and r2,r2,r10 > + eor r12,r0,r10,ror#19 > + vshr.u32 d24,d2,#17 > + eor r0,r6,r6,ror#11 > + eor r2,r2,r4 > + vsli.32 d24,d2,#15 > + add r5,r5,r12,ror#6 > + eor r12,r6,r7 > + vshr.u32 d25,d2,#10 > + eor r0,r0,r6,ror#20 > + add r5,r5,r2 > + veor d25,d25,d24 > + ldr r2,[sp,#28] > + and r3,r3,r12 > + vshr.u32 d24,d2,#19 > + add r9,r9,r5 > + add r5,r5,r0,ror#2 > + eor r3,r3,r7 > + vld1.32 {q8},[r14,:128]! > + add r4,r4,r2 > + vsli.32 d24,d2,#13 > + eor r2,r10,r11 > + eor r0,r9,r9,ror#5 > + veor d25,d25,d24 > + add r5,r5,r3 > + and r2,r2,r9 > + vadd.i32 d3,d3,d25 > + eor r3,r0,r9,ror#19 > + eor r0,r5,r5,ror#11 > + vadd.i32 q8,q8,q1 > + eor r2,r2,r11 > + add r4,r4,r3,ror#6 > + eor r3,r5,r6 > + eor r0,r0,r5,ror#20 > + add r4,r4,r2 > + ldr r2,[sp,#32] > + and r12,r12,r3 > + add r8,r8,r4 > + vst1.32 {q8},[r1,:128]! 
> + add r4,r4,r0,ror#2 > + eor r12,r12,r6 > + vext.8 q8,q2,q3,#4 > + add r11,r11,r2 > + eor r2,r9,r10 > + eor r0,r8,r8,ror#5 > + vext.8 q9,q0,q1,#4 > + add r4,r4,r12 > + and r2,r2,r8 > + eor r12,r0,r8,ror#19 > + vshr.u32 q10,q8,#7 > + eor r0,r4,r4,ror#11 > + eor r2,r2,r10 > + vadd.i32 q2,q2,q9 > + add r11,r11,r12,ror#6 > + eor r12,r4,r5 > + vshr.u32 q9,q8,#3 > + eor r0,r0,r4,ror#20 > + add r11,r11,r2 > + vsli.32 q10,q8,#25 > + ldr r2,[sp,#36] > + and r3,r3,r12 > + vshr.u32 q11,q8,#18 > + add r7,r7,r11 > + add r11,r11,r0,ror#2 > + eor r3,r3,r5 > + veor q9,q9,q10 > + add r10,r10,r2 > + vsli.32 q11,q8,#14 > + eor r2,r8,r9 > + eor r0,r7,r7,ror#5 > + vshr.u32 d24,d3,#17 > + add r11,r11,r3 > + and r2,r2,r7 > + veor q9,q9,q11 > + eor r3,r0,r7,ror#19 > + eor r0,r11,r11,ror#11 > + vsli.32 d24,d3,#15 > + eor r2,r2,r9 > + add r10,r10,r3,ror#6 > + vshr.u32 d25,d3,#10 > + eor r3,r11,r4 > + eor r0,r0,r11,ror#20 > + vadd.i32 q2,q2,q9 > + add r10,r10,r2 > + ldr r2,[sp,#40] > + veor d25,d25,d24 > + and r12,r12,r3 > + add r6,r6,r10 > + vshr.u32 d24,d3,#19 > + add r10,r10,r0,ror#2 > + eor r12,r12,r4 > + vsli.32 d24,d3,#13 > + add r9,r9,r2 > + eor r2,r7,r8 > + veor d25,d25,d24 > + eor r0,r6,r6,ror#5 > + add r10,r10,r12 > + vadd.i32 d4,d4,d25 > + and r2,r2,r6 > + eor r12,r0,r6,ror#19 > + vshr.u32 d24,d4,#17 > + eor r0,r10,r10,ror#11 > + eor r2,r2,r8 > + vsli.32 d24,d4,#15 > + add r9,r9,r12,ror#6 > + eor r12,r10,r11 > + vshr.u32 d25,d4,#10 > + eor r0,r0,r10,ror#20 > + add r9,r9,r2 > + veor d25,d25,d24 > + ldr r2,[sp,#44] > + and r3,r3,r12 > + vshr.u32 d24,d4,#19 > + add r5,r5,r9 > + add r9,r9,r0,ror#2 > + eor r3,r3,r11 > + vld1.32 {q8},[r14,:128]! > + add r8,r8,r2 > + vsli.32 d24,d4,#13 > + eor r2,r6,r7 > + eor r0,r5,r5,ror#5 > + veor d25,d25,d24 > + add r9,r9,r3 > + and r2,r2,r5 > + vadd.i32 d5,d5,d25 > + eor r3,r0,r5,ror#19 > + eor r0,r9,r9,ror#11 > + vadd.i32 q8,q8,q2 > + eor r2,r2,r7 > + add r8,r8,r3,ror#6 > + eor r3,r9,r10 > + eor r0,r0,r9,ror#20 > + add r8,r8,r2 > + ldr r2,[sp,#48] > + and r12,r12,r3 > + add r4,r4,r8 > + vst1.32 {q8},[r1,:128]! 
> + add r8,r8,r0,ror#2 > + eor r12,r12,r10 > + vext.8 q8,q3,q0,#4 > + add r7,r7,r2 > + eor r2,r5,r6 > + eor r0,r4,r4,ror#5 > + vext.8 q9,q1,q2,#4 > + add r8,r8,r12 > + and r2,r2,r4 > + eor r12,r0,r4,ror#19 > + vshr.u32 q10,q8,#7 > + eor r0,r8,r8,ror#11 > + eor r2,r2,r6 > + vadd.i32 q3,q3,q9 > + add r7,r7,r12,ror#6 > + eor r12,r8,r9 > + vshr.u32 q9,q8,#3 > + eor r0,r0,r8,ror#20 > + add r7,r7,r2 > + vsli.32 q10,q8,#25 > + ldr r2,[sp,#52] > + and r3,r3,r12 > + vshr.u32 q11,q8,#18 > + add r11,r11,r7 > + add r7,r7,r0,ror#2 > + eor r3,r3,r9 > + veor q9,q9,q10 > + add r6,r6,r2 > + vsli.32 q11,q8,#14 > + eor r2,r4,r5 > + eor r0,r11,r11,ror#5 > + vshr.u32 d24,d5,#17 > + add r7,r7,r3 > + and r2,r2,r11 > + veor q9,q9,q11 > + eor r3,r0,r11,ror#19 > + eor r0,r7,r7,ror#11 > + vsli.32 d24,d5,#15 > + eor r2,r2,r5 > + add r6,r6,r3,ror#6 > + vshr.u32 d25,d5,#10 > + eor r3,r7,r8 > + eor r0,r0,r7,ror#20 > + vadd.i32 q3,q3,q9 > + add r6,r6,r2 > + ldr r2,[sp,#56] > + veor d25,d25,d24 > + and r12,r12,r3 > + add r10,r10,r6 > + vshr.u32 d24,d5,#19 > + add r6,r6,r0,ror#2 > + eor r12,r12,r8 > + vsli.32 d24,d5,#13 > + add r5,r5,r2 > + eor r2,r11,r4 > + veor d25,d25,d24 > + eor r0,r10,r10,ror#5 > + add r6,r6,r12 > + vadd.i32 d6,d6,d25 > + and r2,r2,r10 > + eor r12,r0,r10,ror#19 > + vshr.u32 d24,d6,#17 > + eor r0,r6,r6,ror#11 > + eor r2,r2,r4 > + vsli.32 d24,d6,#15 > + add r5,r5,r12,ror#6 > + eor r12,r6,r7 > + vshr.u32 d25,d6,#10 > + eor r0,r0,r6,ror#20 > + add r5,r5,r2 > + veor d25,d25,d24 > + ldr r2,[sp,#60] > + and r3,r3,r12 > + vshr.u32 d24,d6,#19 > + add r9,r9,r5 > + add r5,r5,r0,ror#2 > + eor r3,r3,r7 > + vld1.32 {q8},[r14,:128]! > + add r4,r4,r2 > + vsli.32 d24,d6,#13 > + eor r2,r10,r11 > + eor r0,r9,r9,ror#5 > + veor d25,d25,d24 > + add r5,r5,r3 > + and r2,r2,r9 > + vadd.i32 d7,d7,d25 > + eor r3,r0,r9,ror#19 > + eor r0,r5,r5,ror#11 > + vadd.i32 q8,q8,q3 > + eor r2,r2,r11 > + add r4,r4,r3,ror#6 > + eor r3,r5,r6 > + eor r0,r0,r5,ror#20 > + add r4,r4,r2 > + ldr r2,[r14] > + and r12,r12,r3 > + add r8,r8,r4 > + vst1.32 {q8},[r1,:128]! > + add r4,r4,r0,ror#2 > + eor r12,r12,r6 > + teq r2,#0 @ check for K256 terminator > + ldr r2,[sp,#0] > + sub r1,r1,#64 > + bne .L_00_48 > + > + ldr r1,[sp,#68] > + ldr r0,[sp,#72] > + sub r14,r14,#256 @ rewind r14 > + teq r1,r0 > + subeq r1,r1,#64 @ avoid SEGV > + vld1.8 {q0},[r1]! @ load next input block > + vld1.8 {q1},[r1]! > + vld1.8 {q2},[r1]! > + vld1.8 {q3},[r1]! > + strne r1,[sp,#68] > + mov r1,sp > + add r11,r11,r2 > + eor r2,r9,r10 > + eor r0,r8,r8,ror#5 > + add r4,r4,r12 > + vld1.32 {q8},[r14,:128]! 
> + and r2,r2,r8 > + eor r12,r0,r8,ror#19 > + eor r0,r4,r4,ror#11 > + eor r2,r2,r10 > + vrev32.8 q0,q0 > + add r11,r11,r12,ror#6 > + eor r12,r4,r5 > + eor r0,r0,r4,ror#20 > + add r11,r11,r2 > + vadd.i32 q8,q8,q0 > + ldr r2,[sp,#4] > + and r3,r3,r12 > + add r7,r7,r11 > + add r11,r11,r0,ror#2 > + eor r3,r3,r5 > + add r10,r10,r2 > + eor r2,r8,r9 > + eor r0,r7,r7,ror#5 > + add r11,r11,r3 > + and r2,r2,r7 > + eor r3,r0,r7,ror#19 > + eor r0,r11,r11,ror#11 > + eor r2,r2,r9 > + add r10,r10,r3,ror#6 > + eor r3,r11,r4 > + eor r0,r0,r11,ror#20 > + add r10,r10,r2 > + ldr r2,[sp,#8] > + and r12,r12,r3 > + add r6,r6,r10 > + add r10,r10,r0,ror#2 > + eor r12,r12,r4 > + add r9,r9,r2 > + eor r2,r7,r8 > + eor r0,r6,r6,ror#5 > + add r10,r10,r12 > + and r2,r2,r6 > + eor r12,r0,r6,ror#19 > + eor r0,r10,r10,ror#11 > + eor r2,r2,r8 > + add r9,r9,r12,ror#6 > + eor r12,r10,r11 > + eor r0,r0,r10,ror#20 > + add r9,r9,r2 > + ldr r2,[sp,#12] > + and r3,r3,r12 > + add r5,r5,r9 > + add r9,r9,r0,ror#2 > + eor r3,r3,r11 > + add r8,r8,r2 > + eor r2,r6,r7 > + eor r0,r5,r5,ror#5 > + add r9,r9,r3 > + and r2,r2,r5 > + eor r3,r0,r5,ror#19 > + eor r0,r9,r9,ror#11 > + eor r2,r2,r7 > + add r8,r8,r3,ror#6 > + eor r3,r9,r10 > + eor r0,r0,r9,ror#20 > + add r8,r8,r2 > + ldr r2,[sp,#16] > + and r12,r12,r3 > + add r4,r4,r8 > + add r8,r8,r0,ror#2 > + eor r12,r12,r10 > + vst1.32 {q8},[r1,:128]! > + add r7,r7,r2 > + eor r2,r5,r6 > + eor r0,r4,r4,ror#5 > + add r8,r8,r12 > + vld1.32 {q8},[r14,:128]! > + and r2,r2,r4 > + eor r12,r0,r4,ror#19 > + eor r0,r8,r8,ror#11 > + eor r2,r2,r6 > + vrev32.8 q1,q1 > + add r7,r7,r12,ror#6 > + eor r12,r8,r9 > + eor r0,r0,r8,ror#20 > + add r7,r7,r2 > + vadd.i32 q8,q8,q1 > + ldr r2,[sp,#20] > + and r3,r3,r12 > + add r11,r11,r7 > + add r7,r7,r0,ror#2 > + eor r3,r3,r9 > + add r6,r6,r2 > + eor r2,r4,r5 > + eor r0,r11,r11,ror#5 > + add r7,r7,r3 > + and r2,r2,r11 > + eor r3,r0,r11,ror#19 > + eor r0,r7,r7,ror#11 > + eor r2,r2,r5 > + add r6,r6,r3,ror#6 > + eor r3,r7,r8 > + eor r0,r0,r7,ror#20 > + add r6,r6,r2 > + ldr r2,[sp,#24] > + and r12,r12,r3 > + add r10,r10,r6 > + add r6,r6,r0,ror#2 > + eor r12,r12,r8 > + add r5,r5,r2 > + eor r2,r11,r4 > + eor r0,r10,r10,ror#5 > + add r6,r6,r12 > + and r2,r2,r10 > + eor r12,r0,r10,ror#19 > + eor r0,r6,r6,ror#11 > + eor r2,r2,r4 > + add r5,r5,r12,ror#6 > + eor r12,r6,r7 > + eor r0,r0,r6,ror#20 > + add r5,r5,r2 > + ldr r2,[sp,#28] > + and r3,r3,r12 > + add r9,r9,r5 > + add r5,r5,r0,ror#2 > + eor r3,r3,r7 > + add r4,r4,r2 > + eor r2,r10,r11 > + eor r0,r9,r9,ror#5 > + add r5,r5,r3 > + and r2,r2,r9 > + eor r3,r0,r9,ror#19 > + eor r0,r5,r5,ror#11 > + eor r2,r2,r11 > + add r4,r4,r3,ror#6 > + eor r3,r5,r6 > + eor r0,r0,r5,ror#20 > + add r4,r4,r2 > + ldr r2,[sp,#32] > + and r12,r12,r3 > + add r8,r8,r4 > + add r4,r4,r0,ror#2 > + eor r12,r12,r6 > + vst1.32 {q8},[r1,:128]! > + add r11,r11,r2 > + eor r2,r9,r10 > + eor r0,r8,r8,ror#5 > + add r4,r4,r12 > + vld1.32 {q8},[r14,:128]! 
> + and r2,r2,r8 > + eor r12,r0,r8,ror#19 > + eor r0,r4,r4,ror#11 > + eor r2,r2,r10 > + vrev32.8 q2,q2 > + add r11,r11,r12,ror#6 > + eor r12,r4,r5 > + eor r0,r0,r4,ror#20 > + add r11,r11,r2 > + vadd.i32 q8,q8,q2 > + ldr r2,[sp,#36] > + and r3,r3,r12 > + add r7,r7,r11 > + add r11,r11,r0,ror#2 > + eor r3,r3,r5 > + add r10,r10,r2 > + eor r2,r8,r9 > + eor r0,r7,r7,ror#5 > + add r11,r11,r3 > + and r2,r2,r7 > + eor r3,r0,r7,ror#19 > + eor r0,r11,r11,ror#11 > + eor r2,r2,r9 > + add r10,r10,r3,ror#6 > + eor r3,r11,r4 > + eor r0,r0,r11,ror#20 > + add r10,r10,r2 > + ldr r2,[sp,#40] > + and r12,r12,r3 > + add r6,r6,r10 > + add r10,r10,r0,ror#2 > + eor r12,r12,r4 > + add r9,r9,r2 > + eor r2,r7,r8 > + eor r0,r6,r6,ror#5 > + add r10,r10,r12 > + and r2,r2,r6 > + eor r12,r0,r6,ror#19 > + eor r0,r10,r10,ror#11 > + eor r2,r2,r8 > + add r9,r9,r12,ror#6 > + eor r12,r10,r11 > + eor r0,r0,r10,ror#20 > + add r9,r9,r2 > + ldr r2,[sp,#44] > + and r3,r3,r12 > + add r5,r5,r9 > + add r9,r9,r0,ror#2 > + eor r3,r3,r11 > + add r8,r8,r2 > + eor r2,r6,r7 > + eor r0,r5,r5,ror#5 > + add r9,r9,r3 > + and r2,r2,r5 > + eor r3,r0,r5,ror#19 > + eor r0,r9,r9,ror#11 > + eor r2,r2,r7 > + add r8,r8,r3,ror#6 > + eor r3,r9,r10 > + eor r0,r0,r9,ror#20 > + add r8,r8,r2 > + ldr r2,[sp,#48] > + and r12,r12,r3 > + add r4,r4,r8 > + add r8,r8,r0,ror#2 > + eor r12,r12,r10 > + vst1.32 {q8},[r1,:128]! > + add r7,r7,r2 > + eor r2,r5,r6 > + eor r0,r4,r4,ror#5 > + add r8,r8,r12 > + vld1.32 {q8},[r14,:128]! > + and r2,r2,r4 > + eor r12,r0,r4,ror#19 > + eor r0,r8,r8,ror#11 > + eor r2,r2,r6 > + vrev32.8 q3,q3 > + add r7,r7,r12,ror#6 > + eor r12,r8,r9 > + eor r0,r0,r8,ror#20 > + add r7,r7,r2 > + vadd.i32 q8,q8,q3 > + ldr r2,[sp,#52] > + and r3,r3,r12 > + add r11,r11,r7 > + add r7,r7,r0,ror#2 > + eor r3,r3,r9 > + add r6,r6,r2 > + eor r2,r4,r5 > + eor r0,r11,r11,ror#5 > + add r7,r7,r3 > + and r2,r2,r11 > + eor r3,r0,r11,ror#19 > + eor r0,r7,r7,ror#11 > + eor r2,r2,r5 > + add r6,r6,r3,ror#6 > + eor r3,r7,r8 > + eor r0,r0,r7,ror#20 > + add r6,r6,r2 > + ldr r2,[sp,#56] > + and r12,r12,r3 > + add r10,r10,r6 > + add r6,r6,r0,ror#2 > + eor r12,r12,r8 > + add r5,r5,r2 > + eor r2,r11,r4 > + eor r0,r10,r10,ror#5 > + add r6,r6,r12 > + and r2,r2,r10 > + eor r12,r0,r10,ror#19 > + eor r0,r6,r6,ror#11 > + eor r2,r2,r4 > + add r5,r5,r12,ror#6 > + eor r12,r6,r7 > + eor r0,r0,r6,ror#20 > + add r5,r5,r2 > + ldr r2,[sp,#60] > + and r3,r3,r12 > + add r9,r9,r5 > + add r5,r5,r0,ror#2 > + eor r3,r3,r7 > + add r4,r4,r2 > + eor r2,r10,r11 > + eor r0,r9,r9,ror#5 > + add r5,r5,r3 > + and r2,r2,r9 > + eor r3,r0,r9,ror#19 > + eor r0,r5,r5,ror#11 > + eor r2,r2,r11 > + add r4,r4,r3,ror#6 > + eor r3,r5,r6 > + eor r0,r0,r5,ror#20 > + add r4,r4,r2 > + ldr r2,[sp,#64] > + and r12,r12,r3 > + add r8,r8,r4 > + add r4,r4,r0,ror#2 > + eor r12,r12,r6 > + vst1.32 {q8},[r1,:128]! 
> + ldr r0,[r2,#0] > + add r4,r4,r12 @ h+=Maj(a,b,c) from the past > + ldr r12,[r2,#4] > + ldr r3,[r2,#8] > + ldr r1,[r2,#12] > + add r4,r4,r0 @ accumulate > + ldr r0,[r2,#16] > + add r5,r5,r12 > + ldr r12,[r2,#20] > + add r6,r6,r3 > + ldr r3,[r2,#24] > + add r7,r7,r1 > + ldr r1,[r2,#28] > + add r8,r8,r0 > + str r4,[r2],#4 > + add r9,r9,r12 > + str r5,[r2],#4 > + add r10,r10,r3 > + str r6,[r2],#4 > + add r11,r11,r1 > + str r7,[r2],#4 > + stmia r2,{r8-r11} > + > + movne r1,sp > + ldrne r2,[sp,#0] > + eorne r12,r12,r12 > + ldreq sp,[sp,#76] @ restore original sp > + eorne r3,r5,r6 > + bne .L_00_48 > + > + ldmia sp!,{r4-r12,pc} > +ENDPROC(sha256_transform_neon) > diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c > new file mode 100644 > index 0000000..698a498 > --- /dev/null > +++ b/arch/arm/crypto/sha256_neon_glue.c > @@ -0,0 +1,201 @@ > +/* > + * Glue code for the SHA256 Secure Hash Algorithm assembly implementation > + * using NEON instructions. > + * > + * Copyright © 2015 Google Inc. > + * > + * This file is based on sha512_neon_glue.c: > + * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License as published by the Free > + * Software Foundation; either version 2 of the License, or (at your option) > + * any later version. > + * > + */ > + > +#include <crypto/internal/hash.h> > +#include <linux/init.h> > +#include <linux/module.h> > +#include <linux/mm.h> > +#include <linux/cryptohash.h> > +#include <linux/types.h> > +#include <linux/string.h> > +#include <crypto/sha.h> > +#include <asm/byteorder.h> > +#include <asm/simd.h> > +#include <asm/neon.h> > + > +asmlinkage void sha256_transform_neon(u32 *digest, const void *data, > + unsigned int num_blks); > + > + > +static int sha256_neon_init(struct shash_desc *desc) > +{ > + struct sha256_state *sctx = shash_desc_ctx(desc); > + > + sctx->state[0] = SHA256_H0; > + sctx->state[1] = SHA256_H1; > + sctx->state[2] = SHA256_H2; > + sctx->state[3] = SHA256_H3; > + sctx->state[4] = SHA256_H4; > + sctx->state[5] = SHA256_H5; > + sctx->state[6] = SHA256_H6; > + sctx->state[7] = SHA256_H7; > + sctx->count = 0; > + > + return 0; > +} > + > +static int __sha256_neon_update(struct shash_desc *desc, const u8 *data, > + unsigned int len, unsigned int partial) > +{ > + struct sha256_state *sctx = shash_desc_ctx(desc); > + unsigned int done = 0; > + > + sctx->count += len; > + > + if (partial) { > + done = SHA256_BLOCK_SIZE - partial; > + memcpy(sctx->buf + partial, data, done); > + sha256_transform_neon(sctx->state, sctx->buf, 1); > + } > + > + if (len - done >= SHA256_BLOCK_SIZE) { > + const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; > + > + sha256_transform_neon(sctx->state, data + done, rounds); > + done += rounds * SHA256_BLOCK_SIZE; > + } > + > + memcpy(sctx->buf, data + done, len - done); > + > + return 0; > +} > + > +static int sha256_neon_update(struct shash_desc *desc, const u8 *data, > + unsigned int len) > +{ > + struct sha256_state *sctx = shash_desc_ctx(desc); > + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; > + int res; > + > + /* Handle the fast case right here */ > + if (partial + len < SHA256_BLOCK_SIZE) { > + sctx->count += len; > + memcpy(sctx->buf + partial, data, len); > + > + return 0; > + } > + > + if (!may_use_simd()) { > + res = crypto_sha256_update(desc, data, len); > + } else { > + kernel_neon_begin(); > + res = 
__sha256_neon_update(desc, data, len, partial); > + kernel_neon_end(); > + } > + > + return res; > +} > + > +/* Add padding and return the message digest. */ > +static int sha256_neon_final(struct shash_desc *desc, u8 *out) > +{ > + struct sha256_state *sctx = shash_desc_ctx(desc); > + unsigned int i, index, padlen; > + __be32 *dst = (__be32 *)out; > + __be64 bits; > + static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; > + > + /* save number of bits */ > + bits = cpu_to_be64(sctx->count << 3); > + > + /* Pad out to 56 mod 64 and append length */ > + index = sctx->count % SHA256_BLOCK_SIZE; > + padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index); > + > + if (!may_use_simd()) { > + crypto_sha256_update(desc, padding, padlen); > + crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); > + } else { > + kernel_neon_begin(); > + /* We need to fill a whole block for __sha256_neon_update() */ > + if (padlen <= 56) { > + sctx->count += padlen; > + memcpy(sctx->buf + index, padding, padlen); > + } else { > + __sha256_neon_update(desc, padding, padlen, index); > + } > + __sha256_neon_update(desc, (const u8 *)&bits, > + sizeof(bits), 56); > + kernel_neon_end(); > + } > + > + /* Store state in digest */ > + for (i = 0; i < 8; i++) > + dst[i] = cpu_to_be32(sctx->state[i]); > + > + /* Wipe context */ > + memset(sctx, 0, sizeof(*sctx)); > + > + return 0; > +} > + > +static int sha256_neon_export(struct shash_desc *desc, void *out) > +{ > + struct sha256_state *sctx = shash_desc_ctx(desc); > + > + memcpy(out, sctx, sizeof(*sctx)); > + > + return 0; > +} > + > +static int sha256_neon_import(struct shash_desc *desc, const void *in) > +{ > + struct sha256_state *sctx = shash_desc_ctx(desc); > + > + memcpy(sctx, in, sizeof(*sctx)); > + > + return 0; > +} > + > +static struct shash_alg alg = { > + .digestsize = SHA256_DIGEST_SIZE, > + .init = sha256_neon_init, > + .update = sha256_neon_update, > + .final = sha256_neon_final, > + .export = sha256_neon_export, > + .import = sha256_neon_import, > + .descsize = sizeof(struct sha256_state), > + .statesize = sizeof(struct sha256_state), > + .base = { > + .cra_name = "sha256", > + .cra_driver_name = "sha256-neon", > + .cra_priority = 350, > + .cra_flags = CRYPTO_ALG_TYPE_SHASH, > + .cra_blocksize = SHA256_BLOCK_SIZE, > + .cra_module = THIS_MODULE, > + } > +}; > + You can also implement SHA-224 using the same core transform, it's just some trivial glue code. > +static int __init sha256_neon_mod_init(void) > +{ > + if (!cpu_has_neon()) > + return -ENODEV; > + > + return crypto_register_shash(&alg); > +} > + > +static void __exit sha256_neon_mod_fini(void) > +{ > + crypto_unregister_shash(&alg); > +} > + > +module_init(sha256_neon_mod_init); > +module_exit(sha256_neon_mod_fini); > + > +MODULE_LICENSE("GPL"); > +MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, NEON accelerated"); > + > +MODULE_ALIAS("sha256"); > diff --git a/crypto/Kconfig b/crypto/Kconfig > index 50f4da4..0505523 100644 > --- a/crypto/Kconfig > +++ b/crypto/Kconfig > @@ -610,6 +610,18 @@ config CRYPTO_SHA256 > This code also includes SHA-224, a 224 bit hash with 112 bits > of security against collision attacks. > > +config CRYPTO_SHA256_ARM_NEON > + tristate "SHA256 digest algorithm (ARM NEON)" > + depends on ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN > + select CRYPTO_SHA256 > + select CRYPTO_HASH > + help > + SHA-256 secure hash standard (DFIPS 180-2) implemented > + using ARM NEON instructions, when available. 
> +
> +	  This version of SHA implements a 256 bit hash with 128 bits of
> +	  security against collision attacks.
> +

Could you please rebase this onto Herbert's cryptodev tree and move
this to arch/arm/crypto/Kconfig?

>  config CRYPTO_SHA256_SPARC64
>  	tristate "SHA224 and SHA256 digest algorithm (SPARC64)"
>  	depends on SPARC64

Regards,
Ard.
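On the SHA-224 suggestion above: the extra glue really is small, because SHA-224 shares the block size, state layout, and core transform with SHA-256. The following is only a hedged sketch, not part of this patch — the sha224_neon_* names are made up — and it reuses the existing update/export/import callbacks unchanged:

/*
 * Hypothetical SHA-224 glue: only the initial state and the truncated
 * digest length differ from the SHA-256 case.
 */
static int sha224_neon_init(struct shash_desc *desc)
{
	struct sha256_state *sctx = shash_desc_ctx(desc);

	sctx->state[0] = SHA224_H0;
	sctx->state[1] = SHA224_H1;
	sctx->state[2] = SHA224_H2;
	sctx->state[3] = SHA224_H3;
	sctx->state[4] = SHA224_H4;
	sctx->state[5] = SHA224_H5;
	sctx->state[6] = SHA224_H6;
	sctx->state[7] = SHA224_H7;
	sctx->count = 0;

	return 0;
}

static int sha224_neon_final(struct shash_desc *desc, u8 *out)
{
	u8 digest[SHA256_DIGEST_SIZE];

	/* Run the full SHA-256 finalization, then truncate to 224 bits. */
	sha256_neon_final(desc, digest);
	memcpy(out, digest, SHA224_DIGEST_SIZE);
	memset(digest, 0, sizeof(digest));

	return 0;
}

static struct shash_alg sha224_alg = {
	.digestsize	= SHA224_DIGEST_SIZE,
	.init		= sha224_neon_init,
	.update		= sha256_neon_update,
	.final		= sha224_neon_final,
	.export		= sha256_neon_export,
	.import		= sha256_neon_import,
	.descsize	= sizeof(struct sha256_state),
	.statesize	= sizeof(struct sha256_state),
	.base		= {
		.cra_name	= "sha224",
		.cra_driver_name = "sha224-neon",
		.cra_priority	= 350,
		.cra_flags	= CRYPTO_ALG_TYPE_SHASH,
		.cra_blocksize	= SHA224_BLOCK_SIZE,
		.cra_module	= THIS_MODULE,
	}
};

Registration would then switch to crypto_register_shashes() over an array holding both algorithms.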
On Mon, Mar 16, 2015 at 05:08:03PM +0100, Ard Biesheuvel wrote:
> Have you tested this code with the tcrypt.ko module?

I have not, but I can look into it.

> Did you talk to Andy about the license? I don't think this is
> permissible for the kernel as-is.

Unless I have misunderstood something, the license at the Cryptogams
website includes an option to license the code under the GNU GPL.
However, I can certainly contact Andy to clarify his intentions.

> This is broken on thumb-2, use adr instead

> You can also implement SHA-224 using the same core transform, it's
> just some trivial glue code.

> Could you please rebase this onto Herbert's cryptodev tree and move
> this to arch/arm/crypto/Kconfig?

Thanks for the comments, I will submit a second version once we have a
clarification on the license.

Sami
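For reference, the Thumb-2-safe address load Ard asks for would look roughly like the sketch below. This is not the actual follow-up change; the .Lsha256_body local label is hypothetical and the rest of the prologue is unchanged:

	.align	5
ENTRY(sha256_transform_neon)
.Lsha256_body:				@ hypothetical local label on the first instruction
	adr	r3, .Lsha256_body	@ PC-relative, valid in both ARM and Thumb-2
	add	r2, r1, r2, lsl#6	@ len to point at the end of inp
	@ ... remainder of the prologue as in the patch; K256 is still reached via
	sub	r14, r3, #256+32	@ K256

Because the local label marks the same address that "sub r3,pc,#8" computes in ARM mode (the first instruction of the function), the existing #256+32 offset back to the K256 table is unaffected.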
On 15:48 Mon 16 Mar, Sami Tolvanen wrote:
> Add Andy Polyakov's NEON optimized SHA-256 implementation.
>
> On Nexus 6, this implementation is ~2x faster than sha256-generic.

Do you plan to add the sha512 from OpenSSL too? It would be nice so
that armv4 can get a faster implementation too.

Best Regards,
J.
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index b48fa34..316dba2 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -6,12 +6,14 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o +obj-$(CONFIG_CRYPTO_SHA256_ARM_NEON) += sha256-arm-neon.o obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o aes-arm-y := aes-armv4.o aes_glue.o aes-arm-bs-y := aesbs-core.o aesbs-glue.o sha1-arm-y := sha1-armv4-large.o sha1_glue.o sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o +sha256-arm-neon-y := sha256-armv7-neon.o sha256_neon_glue.o sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o quiet_cmd_perl = PERL $@ diff --git a/arch/arm/crypto/sha256-armv7-neon.S b/arch/arm/crypto/sha256-armv7-neon.S new file mode 100644 index 0000000..5ce04c2 --- /dev/null +++ b/arch/arm/crypto/sha256-armv7-neon.S @@ -0,0 +1,819 @@ +@ sha256-armv7-neon.S - ARM/NEON assembly implementation of SHA-256 transform +@ +@ ==================================================================== +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +#include <linux/linkage.h> + +.text +.code 32 +.fpu neon + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +.word 0 +.align 5 + +.align 5 +ENTRY(sha256_transform_neon) + /* Input: + * %r0: SHA256_CONTEXT + * %r1: data + * %r2: nblks + */ + sub r3,pc,#8 @ sha256_transform_neon + add r2,r1,r2,lsl#6 @ len to point at the end of inp + + stmdb sp!,{r4-r12,lr} + + mov r12,sp + sub sp,sp,#16*4+16 @ alloca + sub r14,r3,#256+32 @ K256 + bic sp,sp,#15 @ align for 128-bit stores + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! + vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! 
+ + ldmia r0,{r4-r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b .L_00_48 + +.align 4 +.L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! 
+ add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne .L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! 
+ add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8-r11} + + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + eorne r3,r5,r6 + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +ENDPROC(sha256_transform_neon) diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c new file mode 100644 index 0000000..698a498 --- /dev/null +++ b/arch/arm/crypto/sha256_neon_glue.c @@ -0,0 +1,201 @@ +/* + * Glue code for the SHA256 Secure Hash Algorithm assembly implementation + * using NEON instructions. + * + * Copyright © 2015 Google Inc. + * + * This file is based on sha512_neon_glue.c: + * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/string.h> +#include <crypto/sha.h> +#include <asm/byteorder.h> +#include <asm/simd.h> +#include <asm/neon.h> + +asmlinkage void sha256_transform_neon(u32 *digest, const void *data, + unsigned int num_blks); + + +static int sha256_neon_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +static int __sha256_neon_update(struct shash_desc *desc, const u8 *data, + unsigned int len, unsigned int partial) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + unsigned int done = 0; + + sctx->count += len; + + if (partial) { + done = SHA256_BLOCK_SIZE - partial; + memcpy(sctx->buf + partial, data, done); + sha256_transform_neon(sctx->state, sctx->buf, 1); + } + + if (len - done >= SHA256_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; + + sha256_transform_neon(sctx->state, data + done, rounds); + done += rounds * SHA256_BLOCK_SIZE; + } + + memcpy(sctx->buf, data + done, len - done); + + return 0; +} + +static int sha256_neon_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; + int res; + + /* Handle the fast case right here */ + if (partial + len < SHA256_BLOCK_SIZE) { + sctx->count += len; + memcpy(sctx->buf + partial, data, len); + + return 0; + } + + if (!may_use_simd()) { + res = crypto_sha256_update(desc, data, len); + } else { + kernel_neon_begin(); + res = __sha256_neon_update(desc, data, len, partial); + kernel_neon_end(); + } + + return res; +} + +/* Add padding and return the message digest. */ +static int sha256_neon_final(struct shash_desc *desc, u8 *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be32 *dst = (__be32 *)out; + __be64 bits; + static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; + + /* save number of bits */ + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64 and append length */ + index = sctx->count % SHA256_BLOCK_SIZE; + padlen = (index < 56) ? 
(56 - index) : ((SHA256_BLOCK_SIZE+56)-index); + + if (!may_use_simd()) { + crypto_sha256_update(desc, padding, padlen); + crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); + } else { + kernel_neon_begin(); + /* We need to fill a whole block for __sha256_neon_update() */ + if (padlen <= 56) { + sctx->count += padlen; + memcpy(sctx->buf + index, padding, padlen); + } else { + __sha256_neon_update(desc, padding, padlen, index); + } + __sha256_neon_update(desc, (const u8 *)&bits, + sizeof(bits), 56); + kernel_neon_end(); + } + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int sha256_neon_export(struct shash_desc *desc, void *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha256_neon_import(struct shash_desc *desc, const void *in) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static struct shash_alg alg = { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_neon_init, + .update = sha256_neon_update, + .final = sha256_neon_final, + .export = sha256_neon_export, + .import = sha256_neon_import, + .descsize = sizeof(struct sha256_state), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-neon", + .cra_priority = 350, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init sha256_neon_mod_init(void) +{ + if (!cpu_has_neon()) + return -ENODEV; + + return crypto_register_shash(&alg); +} + +static void __exit sha256_neon_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(sha256_neon_mod_init); +module_exit(sha256_neon_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, NEON accelerated"); + +MODULE_ALIAS("sha256"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 50f4da4..0505523 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -610,6 +610,18 @@ config CRYPTO_SHA256 This code also includes SHA-224, a 224 bit hash with 112 bits of security against collision attacks. +config CRYPTO_SHA256_ARM_NEON + tristate "SHA256 digest algorithm (ARM NEON)" + depends on ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN + select CRYPTO_SHA256 + select CRYPTO_HASH + help + SHA-256 secure hash standard (DFIPS 180-2) implemented + using ARM NEON instructions, when available. + + This version of SHA implements a 256 bit hash with 128 bits of + security against collision attacks. + config CRYPTO_SHA256_SPARC64 tristate "SHA224 and SHA256 digest algorithm (SPARC64)" depends on SPARC64
Add Andy Polyakov's NEON optimized SHA-256 implementation.

On Nexus 6, this implementation is ~2x faster than sha256-generic.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
---
 arch/arm/crypto/Makefile            |    2
 arch/arm/crypto/sha256-armv7-neon.S |  819 ++++++++++++++++++++++++++++++++++++
 arch/arm/crypto/sha256_neon_glue.c  |  201 ++++++++
 crypto/Kconfig                      |   12
 4 files changed, 1034 insertions(+)
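As a quick sanity check beyond the tcrypt question raised in the review, the registered transform can also be driven from a throwaway test module. The sketch below is illustrative only: sha256_neon_smoke_test is a made-up function, the module boilerplate (module_init/MODULE_LICENSE) is omitted, and the input is arbitrary. It requests the implementation by driver name so the NEON code path is the one exercised:

#include <crypto/hash.h>
#include <crypto/sha.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>

static int __init sha256_neon_smoke_test(void)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	static const u8 msg[] = "abc";		/* arbitrary test input */
	u8 digest[SHA256_DIGEST_SIZE];
	int ret;

	/* Ask for the specific driver rather than the generic "sha256". */
	tfm = crypto_alloc_shash("sha256-neon", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;

	ret = crypto_shash_digest(desc, msg, sizeof(msg) - 1, digest);
	if (!ret)
		print_hex_dump(KERN_INFO, "sha256-neon: ", DUMP_PREFIX_NONE,
			       16, 1, digest, sizeof(digest), false);

	kfree(desc);
	crypto_free_shash(tfm);
	return ret;
}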