@@ -68,10 +68,10 @@
.text
.arch armv8-a+crypto
- init_crc .req w19
- buf .req x20
- len .req x21
- fold_consts_ptr .req x22
+ init_crc .req w0
+ buf .req x1
+ len .req x2
+ fold_consts_ptr .req x3
fold_consts .req v10
@@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
.endm
.macro crc_t10dif_pmull, p
- frame_push 4, 128
-
- mov init_crc, w0
- mov buf, x1
- mov len, x2
-
__pmull_init_\p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
@@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
fold_32_bytes \p, v6, v7
subs len, len, #128
- b.lt .Lfold_128_bytes_loop_done_\@
-
- if_will_cond_yield_neon
- stp q0, q1, [sp, #.Lframe_local_offset]
- stp q2, q3, [sp, #.Lframe_local_offset + 32]
- stp q4, q5, [sp, #.Lframe_local_offset + 64]
- stp q6, q7, [sp, #.Lframe_local_offset + 96]
- do_cond_yield_neon
- ldp q0, q1, [sp, #.Lframe_local_offset]
- ldp q2, q3, [sp, #.Lframe_local_offset + 32]
- ldp q4, q5, [sp, #.Lframe_local_offset + 64]
- ldp q6, q7, [sp, #.Lframe_local_offset + 96]
- ld1 {fold_consts.2d}, [fold_consts_ptr]
- __pmull_init_\p
- __pmull_pre_\p fold_consts
- endif_yield_neon
-
- b .Lfold_128_bytes_loop_\@
-
-.Lfold_128_bytes_loop_done_\@:
+ b.ge .Lfold_128_bytes_loop_\@
// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
@@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
umov w0, v0.h[0]
- frame_pop
+ .ifc \p, p8
+ ldp x29, x30, [sp], #16
+ .endif
ret
.Lless_than_256_bytes_\@:
@@ -489,7 +466,9 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
- crc_t10dif_pmull p8
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)
.align 5
@@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p8(*crc, data, length);
- kernel_neon_end();
+ do {
+ unsigned int chunk = length;
+
+ if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+ chunk = SZ_4K;
+
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p8(*crc, data, chunk);
+ kernel_neon_end();
+ data += chunk;
+ length -= chunk;
+ } while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
@@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p64(*crc, data, length);
- kernel_neon_end();
+ do {
+ unsigned int chunk = length;
+
+ if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+ chunk = SZ_4K;
+
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p64(*crc, data, chunk);
+ kernel_neon_end();
+ data += chunk;
+ length -= chunk;
+ } while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
Instead of yielding from the bowels of the asm routine if a reschedule is needed, divide up the input into 4 KB chunks in the C glue. This simplifies the code substantially, and avoids scheduling out the task with the asm routine on the call stack, which is undesirable from a CFI/instrumentation point of view. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- arch/arm64/crypto/crct10dif-ce-core.S | 43 +++++--------------- arch/arm64/crypto/crct10dif-ce-glue.c | 30 +++++++++++--- 2 files changed, 35 insertions(+), 38 deletions(-)