diff mbox series

[10/12] x86/crypto: aesni: Introduce READ_PARTIAL_BLOCK macro

Message ID 1b813c4617813c08bea79ff57f3497ea2d32df24.1544471415.git.davejwatson@fb.com (mailing list archive)
State Accepted
Delegated to: Herbert Xu
Headers show
Series x86/crypto: gcmaes AVX scatter/gather support | expand

Commit Message

Dave Watson Dec. 10, 2018, 7:59 p.m. UTC
Introduce READ_PARTIAL_BLOCK macro, and use it in the two existing
partial block cases: AAD and the end of ENC_DEC. In particular,
the ENC_DEC case should be faster, since we read 8 bytes at a time
when possible instead of one byte at a time.

This macro will also be used to read partial blocks between
enc_update and dec_update calls.

Signed-off-by: Dave Watson <davejwatson@fb.com>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 102 +++++++++++++----------
 1 file changed, 59 insertions(+), 43 deletions(-)
diff mbox series

Patch

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 44a4a8b43ca4..ff00ad19064d 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -415,68 +415,56 @@  _zero_cipher_left\@:
         vmovdqu %xmm14, AadHash(arg2)
         vmovdqu %xmm9, CurCount(arg2)
 
-        cmp     $16, arg5
-        jl      _only_less_than_16\@
-
+        # check for 0 length
         mov     arg5, %r13
         and     $15, %r13                            # r13 = (arg5 mod 16)
 
         je      _multiple_of_16_bytes\@
 
-        # handle the last <16 Byte block seperately
+        # handle the last <16 Byte block separately
 
         mov %r13, PBlockLen(arg2)
 
-        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
+        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
         vmovdqu %xmm9, CurCount(arg2)
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 
         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
         vmovdqu %xmm9, PBlockEncKey(arg2)
 
-        sub     $16, %r11
-        add     %r13, %r11
-        vmovdqu (arg4, %r11), %xmm1                  # receive the last <16 Byte block
-
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
-						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
-        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
-        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
-        jmp     _final_ghash_mul\@
-
-_only_less_than_16\@:
-        # check for 0 length
-        mov     arg5, %r13
-        and     $15, %r13                            # r13 = (arg5 mod 16)
+        cmp $16, arg5
+        jge _large_enough_update\@
 
-        je      _multiple_of_16_bytes\@
+        lea (arg4,%r11,1), %r10
+        mov %r13, %r12
 
-        # handle the last <16 Byte block separately
-
-
-        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
-
-        vmovdqu %xmm9, PBlockEncKey(arg2)
+        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
 
         lea     SHIFT_MASK+16(%rip), %r12
         sub     %r13, %r12                           # adjust the shuffle mask pointer to be
 						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
+	# number of bytes in plaintext mod 16)
 
-_get_last_16_byte_loop\@:
-        movb    (arg4, %r11),  %al
-        movb    %al,  TMP1 (%rsp , %r11)
-        add     $1, %r11
-        cmp     %r13,  %r11
-        jne     _get_last_16_byte_loop\@
+        jmp _final_ghash_mul\@
+
+_large_enough_update\@:
+        sub $16, %r11
+        add %r13, %r11
+
+        # receive the last <16 Byte block
+        vmovdqu	(arg4, %r11, 1), %xmm1
 
-        vmovdqu  TMP1(%rsp), %xmm1
+        sub	%r13, %r11
+        add	$16, %r11
 
-        sub     $16, %r11
+        lea	SHIFT_MASK+16(%rip), %r12
+        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+        # (r13 is the number of bytes in plaintext mod 16)
+        sub	%r13, %r12
+        # get the appropriate shuffle mask
+        vmovdqu	(%r12), %xmm2
+        # shift right 16-r13 bytes
+        vpshufb  %xmm2, %xmm1, %xmm1
 
 _final_ghash_mul\@:
         .if  \ENC_DEC ==  DEC
@@ -490,8 +478,6 @@  _final_ghash_mul\@:
         vpxor   %xmm2, %xmm14, %xmm14
 
         vmovdqu %xmm14, AadHash(arg2)
-        sub     %r13, %r11
-        add     $16, %r11
         .else
         vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
@@ -501,8 +487,6 @@  _final_ghash_mul\@:
         vpxor   %xmm9, %xmm14, %xmm14
 
         vmovdqu %xmm14, AadHash(arg2)
-        sub     %r13, %r11
-        add     $16, %r11
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
         .endif
 
@@ -721,6 +705,38 @@  _get_AAD_done\@:
         \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
 .endm
 
+
+# Reads DLEN bytes (0 < DLEN < 16) starting at DPTR into the low bytes of
+# XMMDst; the remaining high bytes of XMMDst are zero. Only those DLEN bytes
+# of memory are loaded. Clobbers %rax, DLEN (ends at 0) and the flags.
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
+        vpxor \XMMDst, \XMMDst, \XMMDst        # start from an all-zero destination
+
+        cmp $8, \DLEN                          # signed compare; fine for 0 < DLEN < 16
+        jl _read_lt8_\@                        # fewer than 8 bytes: single-byte loads only
+        mov (\DPTR), %rax                      # at least 8 bytes: load the first 8 at once
+        vpinsrq $0, %rax, \XMMDst, \XMMDst     # ... and place them in the low qword
+        sub $8, \DLEN                          # DLEN = tail bytes still to read (0..7)
+        jz _done_read_partial_block_\@         # exactly 8 bytes: nothing more to read
+        xor %eax, %eax                         # zero the whole of rax to collect the tail
+_read_next_byte_\@:                            # loop: gather tail bytes, highest address first
+        shl $8, %rax                           # free up the low byte for the next load
+        mov 7(\DPTR, \DLEN, 1), %al            # byte at DPTR + 8 + (DLEN - 1): last unread tail byte
+        dec \DLEN
+        jnz _read_next_byte_\@                 # lowest address is loaded last, so it lands in the low byte
+        vpinsrq $1, %rax, \XMMDst, \XMMDst     # tail bytes go into the high qword
+        jmp _done_read_partial_block_\@
+_read_lt8_\@:                                  # 1..7 bytes in total
+        xor %eax, %eax                         # zero the whole of rax to collect the bytes
+_read_next_byte_lt8_\@:                        # same backwards byte loop, starting at DPTR
+        shl $8, %rax                           # free up the low byte for the next load
+        mov -1(\DPTR, \DLEN, 1), %al           # byte at DPTR + (DLEN - 1): last unread byte
+        dec \DLEN
+        jnz _read_next_byte_lt8_\@
+        vpinsrq $0, %rax, \XMMDst, \XMMDst     # bytes go into the low qword; high qword stays zero
+_done_read_partial_block_\@:
+.endm
+
 #ifdef CONFIG_AS_AVX
 ###############################################################################
 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)