[1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes

Message ID	20241014042447.50197-2-ebiggers@kernel.org (mailing list archive)
State	Accepted
Delegated to:	Herbert Xu
Headers	show Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9143C132132; Mon, 14 Oct 2024 04:25:32 +0000 (UTC) From: Eric Biggers <ebiggers@kernel.org> To: linux-crypto@vger.kernel.org Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>, Josh Poimboeuf <jpoimboe@kernel.org>, Peter Zijlstra <peterz@infradead.org> Subject: [PATCH 1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes Date: Sun, 13 Oct 2024 21:24:45 -0700 Message-ID: <20241014042447.50197-2-ebiggers@kernel.org> In-Reply-To: <20241014042447.50197-1-ebiggers@kernel.org> References: <20241014042447.50197-1-ebiggers@kernel.org> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	crypto: x86/crc32c - jump table elimination and other cleanups \| expand [0/3] crypto: x86/crc32c - jump table elimination and other cleanups [1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes [2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit [3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling

diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index bbcff1fb78cb2..466cea4943963 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -54,24 +54,14 @@ .macro JMPTBL_ENTRY i .quad .Lcrc_\i .endm -.macro JNC_LESS_THAN j - jnc .Lless_than_\j -.endm - -# Define threshold where buffers are considered "small" and routed to more -# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so -# SMALL_SIZE can be no larger than 255. - +# Define threshold below which buffers are considered "small" and routed to +# regular CRC code that does not interleave the CRC instructions. #define SMALL_SIZE 200 -.if (SMALL_SIZE > 255) -.error "SMALL_ SIZE must be < 256" -.endif - # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); .text SYM_FUNC_START(crc_pcl) #define bufp rdi @@ -98,29 +88,22 @@ SYM_FUNC_START(crc_pcl) pushq %rsi ## Move crc_init for Linux to a different mov crc_init_arg, crc_init + mov %bufp, bufptmp # rdi = *buf + cmp $SMALL_SIZE, len + jb .Lsmall + ################################################################ ## 1) ALIGN: ################################################################ - - mov %bufp, bufptmp # rdi = *buf neg %bufp and $7, %bufp # calculate the unalignment amount of # the address je .Lproc_block # Skip if aligned - ## If len is less than 8 and we're unaligned, we need to jump - ## to special code to avoid reading beyond the end of the buffer - cmp $8, len - jae .Ldo_align - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $32-3+1, len_dw - jmp .Lless_than_8_post_shl1 - .Ldo_align: #### Calculate CRC of unaligned bytes of the buffer (if any) movq (bufptmp), tmp # load a quadward from the buffer add %bufp, bufptmp # align buffer pointer for quadword # processing @@ -142,13 +125,10 @@ SYM_FUNC_START(crc_pcl) cmpq $128*24, len jae .Lfull_block .Lcontinue_block: - cmpq $SMALL_SIZE, len - jb .Lsmall - ## len < 128*24 movq $2731, %rax # 2731 = ceil(2^16 / 24) mul len_dw shrq $16, %rax @@ -241,72 +221,42 @@ LABEL crc_ %i LABEL crc_ 0 ENDBR mov tmp, len cmp $128*24, tmp jae .Lfull_block - cmp $24, tmp + cmp $SMALL_SIZE, tmp jae .Lcontinue_block -.Lless_than_24: - shl $32-4, len_dw # less_than_16 expects length - # in upper 4 bits of len_dw - jnc .Lless_than_16 - crc32q (bufptmp), crc_init - crc32q 8(bufptmp), crc_init - jz .Ldo_return - add $16, bufptmp - # len is less than 8 if we got here - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $2, len_dw - jmp .Lless_than_8_post_shl1 - ####################################################################### - ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) + ## 6) Process any remainder without interleaving: ####################################################################### .Lsmall: - shl $32-8, len_dw # Prepare len_dw for less_than_256 - j=256 -.rept 5 # j = {256, 128, 64, 32, 16} -.altmacro -LABEL less_than_ %j # less_than_j: Length should be in - # upper lg(j) bits of len_dw - j=(j/2) - shl $1, len_dw # Get next MSB - JNC_LESS_THAN %j -.noaltmacro - i=0 -.rept (j/8) - crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data - i=i+8 -.endr - jz .Ldo_return # Return if remaining length is zero - add $j, bufptmp # Advance buf -.endr - -.Lless_than_8: # Length should be stored in - # upper 3 bits of len_dw - shl $1, len_dw -.Lless_than_8_post_shl1: - jnc .Lless_than_4 - crc32l (bufptmp), crc_init_dw # CRC of 4 bytes - jz .Ldo_return # return if remaining data is zero - add $4, bufptmp -.Lless_than_4: # Length should be stored in - # upper 2 bits of len_dw - shl $1, len_dw - jnc .Lless_than_2 - crc32w (bufptmp), crc_init_dw # CRC of 2 bytes - jz .Ldo_return # return if remaining data is zero - add $2, bufptmp -.Lless_than_2: # Length should be stored in the MSB - # of len_dw - shl $1, len_dw - jnc .Lless_than_1 - crc32b (bufptmp), crc_init_dw # CRC of 1 byte -.Lless_than_1: # Length should be zero -.Ldo_return: + test len, len + jz .Ldone + mov len_dw, %eax + shr $3, %eax + jz .Ldo_dword +.Ldo_qwords: + crc32q (bufptmp), crc_init + add $8, bufptmp + dec %eax + jnz .Ldo_qwords +.Ldo_dword: + test $4, len_dw + jz .Ldo_word + crc32l (bufptmp), crc_init_dw + add $4, bufptmp +.Ldo_word: + test $2, len_dw + jz .Ldo_byte + crc32w (bufptmp), crc_init_dw + add $2, bufptmp +.Ldo_byte: + test $1, len_dw + jz .Ldone + crc32b (bufptmp), crc_init_dw +.Ldone: movq crc_init, %rax popq %rsi popq %rdi popq %rbx RET

[1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes

Commit Message

Patch