@@ -415,68 +415,56 @@ _zero_cipher_left\@:
vmovdqu %xmm14, AadHash(arg2)
vmovdqu %xmm9, CurCount(arg2)
- cmp $16, arg5
- jl _only_less_than_16\@
-
+ # check for 0 length
mov arg5, %r13
and $15, %r13 # r13 = (arg5 mod 16)
je _multiple_of_16_bytes\@
- # handle the last <16 Byte block seperately
+ # handle the last <16 Byte block separately
mov %r13, PBlockLen(arg2)
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
vmovdqu %xmm9, CurCount(arg2)
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
vmovdqu %xmm9, PBlockEncKey(arg2)
- sub $16, %r11
- add %r13, %r11
- vmovdqu (arg4, %r11), %xmm1 # receive the last <16 Byte block
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer to be
- # able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
- vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
- jmp _final_ghash_mul\@
-
-_only_less_than_16\@:
- # check for 0 length
- mov arg5, %r13
- and $15, %r13 # r13 = (arg5 mod 16)
+ cmp $16, arg5
+ jge _large_enough_update\@
- je _multiple_of_16_bytes\@
+ lea (arg4,%r11,1), %r10
+ mov %r13, %r12
- # handle the last <16 Byte block separately
-
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
-
- vmovdqu %xmm9, PBlockEncKey(arg2)
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm1
lea SHIFT_MASK+16(%rip), %r12
sub %r13, %r12 # adjust the shuffle mask pointer to be
# able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
+ # number of bytes in plaintext mod 16)
-_get_last_16_byte_loop\@:
- movb (arg4, %r11), %al
- movb %al, TMP1 (%rsp , %r11)
- add $1, %r11
- cmp %r13, %r11
- jne _get_last_16_byte_loop\@
+ jmp _final_ghash_mul\@
+
+_large_enough_update\@:
+ sub $16, %r11
+ add %r13, %r11
+
+ # receive the last <16 Byte block
+ vmovdqu (arg4, %r11, 1), %xmm1
- vmovdqu TMP1(%rsp), %xmm1
+ sub %r13, %r11
+ add $16, %r11
- sub $16, %r11
+ lea SHIFT_MASK+16(%rip), %r12
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ # (r13 is the number of bytes in plaintext mod 16)
+ sub %r13, %r12
+ # get the appropriate shuffle mask
+ vmovdqu (%r12), %xmm2
+ # shift right 16-r13 bytes
+ vpshufb %xmm2, %xmm1, %xmm1
_final_ghash_mul\@:
.if \ENC_DEC == DEC
@@ -490,8 +478,6 @@ _final_ghash_mul\@:
vpxor %xmm2, %xmm14, %xmm14
vmovdqu %xmm14, AadHash(arg2)
- sub %r13, %r11
- add $16, %r11
.else
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
@@ -501,8 +487,6 @@ _final_ghash_mul\@:
vpxor %xmm9, %xmm14, %xmm14
vmovdqu %xmm14, AadHash(arg2)
- sub %r13, %r11
- add $16, %r11
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
.endif
@@ -721,6 +705,38 @@ _get_AAD_done\@:
\PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm
+
+# READ_PARTIAL_BLOCK: reads DLEN bytes starting at DPTR into the low DLEN
+# bytes of XMMDst (memory order, remaining bytes zeroed), where 0 < DLEN < 16
+# Clobbers %rax, DLEN (counted down to zero) and the flags
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
+ vpxor \XMMDst, \XMMDst, \XMMDst # start from an all-zero destination
+
+ cmp $8, \DLEN
+ jl _read_lt8_\@ # fewer than 8 bytes: gather them one at a time
+ mov (\DPTR), %rax # first 8 bytes with a single load
+ vpinsrq $0, %rax, \XMMDst, \XMMDst # ... placed in the low qword
+ sub $8, \DLEN # DLEN = bytes still to read (0..7)
+ jz _done_read_partial_block_\@ # exactly 8 bytes: high qword stays zero
+ xor %eax, %eax # rax = 0 (32-bit write clears the upper half)
+_read_next_byte_\@:
+ shl $8, %rax # make room for the next byte
+ mov 7(\DPTR, \DLEN, 1), %al # byte at offset 8 + (DLEN - 1): last byte first
+ dec \DLEN
+ jnz _read_next_byte_\@ # loop ends after the byte at offset 8 is read
+ vpinsrq $1, %rax, \XMMDst, \XMMDst # gathered tail goes into the high qword
+ jmp _done_read_partial_block_\@
+_read_lt8_\@:
+ xor %eax, %eax # rax = 0 (32-bit write clears the upper half)
+_read_next_byte_lt8_\@:
+ shl $8, %rax # make room for the next byte
+ mov -1(\DPTR, \DLEN, 1), %al # byte at offset DLEN - 1: last byte first
+ dec \DLEN
+ jnz _read_next_byte_lt8_\@ # loop ends after the byte at offset 0 is read
+ vpinsrq $0, %rax, \XMMDst, \XMMDst # gathered bytes go into the low qword
+_done_read_partial_block_\@:
+.endm
+
#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
Introduce READ_PARTIAL_BLOCK macro, and use it in the two existing partial block cases: AAD and the end of ENC_DEC. In particular, the ENC_DEC case should be faster, since the first 8 bytes are now read with a single load when at least 8 bytes are available, instead of copying every byte individually. This macro will also be used to read partial blocks between enc_update and dec_update calls. Signed-off-by: Dave Watson <davejwatson@fb.com> --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 102 +++++++++++++---------- 1 file changed, 59 insertions(+), 43 deletions(-)