Message ID | 20220504001823.2483834-7-nhuck@google.com (mailing list archive)
---|---
State | Superseded
Delegated to: | Herbert Xu
Series | crypto: HCTR2 support
Hi Nathan, Thanks for cleaning this up. On Wed, 4 May 2022 at 02:18, Nathan Huckleberry <nhuck@google.com> wrote: > > Added some clarifying comments, changed the register allocations to make > the code clearer, and added register aliases. > > Signed-off-by: Nathan Huckleberry <nhuck@google.com> With one comment below addressed: Reviewed-by: Ard Biesheuvel <ardb@kernel.org> > --- > arch/arm64/crypto/aes-modes.S | 193 ++++++++++++++++++++++------------ > 1 file changed, 128 insertions(+), 65 deletions(-) > > diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S > index 55df157fce3a..da7c9f3380f8 100644 > --- a/arch/arm64/crypto/aes-modes.S > +++ b/arch/arm64/crypto/aes-modes.S > @@ -322,32 +322,60 @@ AES_FUNC_END(aes_cbc_cts_decrypt) > * This macro generates the code for CTR and XCTR mode. > */ > .macro ctr_encrypt xctr > + // Arguments > + OUT .req x0 > + IN .req x1 > + KEY .req x2 > + ROUNDS_W .req w3 > + BYTES_W .req w4 > + IV .req x5 > + BYTE_CTR_W .req w6 // XCTR only > + // Intermediate values > + CTR_W .req w11 // XCTR only > + CTR .req x11 // XCTR only > + IV_PART .req x12 > + BLOCKS .req x13 > + BLOCKS_W .req w13 > + > stp x29, x30, [sp, #-16]! > mov x29, sp > > - enc_prepare w3, x2, x12 > - ld1 {vctr.16b}, [x5] > + enc_prepare ROUNDS_W, KEY, IV_PART > + ld1 {vctr.16b}, [IV] > > + /* > + * Keep 64 bits of the IV in a register. For CTR mode this lets us > + * easily increment the IV. For XCTR mode this lets us efficiently XOR > + * the 64-bit counter with the IV. > + */ > .if \xctr > - umov x12, vctr.d[0] > - lsr w11, w6, #4 > + umov IV_PART, vctr.d[0] > + lsr CTR_W, BYTE_CTR_W, #4 > .else > - umov x12, vctr.d[1] /* keep swabbed ctr in reg */ > - rev x12, x12 > + umov IV_PART, vctr.d[1] > + rev IV_PART, IV_PART > .endif > > .LctrloopNx\xctr: > - add w7, w4, #15 > - sub w4, w4, #MAX_STRIDE << 4 > - lsr w7, w7, #4 > + add BLOCKS_W, BYTES_W, #15 > + sub BYTES_W, BYTES_W, #MAX_STRIDE << 4 > + lsr BLOCKS_W, BLOCKS_W, #4 > mov w8, #MAX_STRIDE > - cmp w7, w8 > - csel w7, w7, w8, lt > + cmp BLOCKS_W, w8 > + csel BLOCKS_W, BLOCKS_W, w8, lt > > + /* > + * Set up the counter values in v0-v4. > + * > + * If we are encrypting less than MAX_STRIDE blocks, the tail block > + * handling code expects the last keystream block to be in v4. For > + * example: if encrypting two blocks with MAX_STRIDE=5, then v3 and v4 > + * should have the next two counter blocks. > + */ > .if \xctr > - add x11, x11, x7 > + add CTR, CTR, BLOCKS > .else > - adds x12, x12, x7 > + adds IV_PART, IV_PART, BLOCKS > .endif > mov v0.16b, vctr.16b > mov v1.16b, vctr.16b > @@ -355,16 +383,16 @@ AES_FUNC_END(aes_cbc_cts_decrypt) > mov v3.16b, vctr.16b > ST5( mov v4.16b, vctr.16b ) > .if \xctr > - sub x6, x11, #MAX_STRIDE - 1 > - sub x7, x11, #MAX_STRIDE - 2 > - sub x8, x11, #MAX_STRIDE - 3 > - sub x9, x11, #MAX_STRIDE - 4 > -ST5( sub x10, x11, #MAX_STRIDE - 5 ) > - eor x6, x6, x12 > - eor x7, x7, x12 > - eor x8, x8, x12 > - eor x9, x9, x12 > - eor x10, x10, x12 > + sub x6, CTR, #MAX_STRIDE - 1 > + sub x7, CTR, #MAX_STRIDE - 2 > + sub x8, CTR, #MAX_STRIDE - 3 > + sub x9, CTR, #MAX_STRIDE - 4 > +ST5( sub x10, CTR, #MAX_STRIDE - 5 ) > + eor x6, x6, IV_PART > + eor x7, x7, IV_PART > + eor x8, x8, IV_PART > + eor x9, x9, IV_PART > + eor x10, x10, IV_PART > mov v0.d[0], x6 > mov v1.d[0], x7 > mov v2.d[0], x8 > @@ -381,9 +409,9 @@ ST5( mov v4.d[0], x10 ) > ins vctr.d[0], x8 > > /* apply carry to N counter blocks for N := x12 */ Please update this comment as well. 
And while at it, it might make sense to clarify that doing a conditional branch here is fine wrt time invariance, given that the IV is not part of the key or the plaintext, and this code rarely triggers in practice anyway. > - cbz x12, 2f > + cbz IV_PART, 2f > adr x16, 1f > - sub x16, x16, x12, lsl #3 > + sub x16, x16, IV_PART, lsl #3 > br x16 > bti c > mov v0.d[0], vctr.d[0] > @@ -398,71 +426,88 @@ ST5( mov v4.d[0], vctr.d[0] ) > 1: b 2f > .previous > > -2: rev x7, x12 > +2: rev x7, IV_PART > ins vctr.d[1], x7 > - sub x7, x12, #MAX_STRIDE - 1 > - sub x8, x12, #MAX_STRIDE - 2 > - sub x9, x12, #MAX_STRIDE - 3 > + sub x7, IV_PART, #MAX_STRIDE - 1 > + sub x8, IV_PART, #MAX_STRIDE - 2 > + sub x9, IV_PART, #MAX_STRIDE - 3 > rev x7, x7 > rev x8, x8 > mov v1.d[1], x7 > rev x9, x9 > -ST5( sub x10, x12, #MAX_STRIDE - 4 ) > +ST5( sub x10, IV_PART, #MAX_STRIDE - 4 ) > mov v2.d[1], x8 > ST5( rev x10, x10 ) > mov v3.d[1], x9 > ST5( mov v4.d[1], x10 ) > .endif > - tbnz w4, #31, .Lctrtail\xctr > - ld1 {v5.16b-v7.16b}, [x1], #48 > + > + /* > + * If there are at least MAX_STRIDE blocks left, XOR the plaintext with > + * keystream and store. Otherwise jump to tail handling. > + */ > + tbnz BYTES_W, #31, .Lctrtail\xctr > + ld1 {v5.16b-v7.16b}, [IN], #48 > ST4( bl aes_encrypt_block4x ) > ST5( bl aes_encrypt_block5x ) > eor v0.16b, v5.16b, v0.16b > -ST4( ld1 {v5.16b}, [x1], #16 ) > +ST4( ld1 {v5.16b}, [IN], #16 ) > eor v1.16b, v6.16b, v1.16b > -ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) > +ST5( ld1 {v5.16b-v6.16b}, [IN], #32 ) > eor v2.16b, v7.16b, v2.16b > eor v3.16b, v5.16b, v3.16b > ST5( eor v4.16b, v6.16b, v4.16b ) > - st1 {v0.16b-v3.16b}, [x0], #64 > -ST5( st1 {v4.16b}, [x0], #16 ) > - cbz w4, .Lctrout\xctr > + st1 {v0.16b-v3.16b}, [OUT], #64 > +ST5( st1 {v4.16b}, [OUT], #16 ) > + cbz BYTES_W, .Lctrout\xctr > b .LctrloopNx\xctr > > .Lctrout\xctr: > .if !\xctr > - st1 {vctr.16b}, [x5] /* return next CTR value */ > + st1 {vctr.16b}, [IV] /* return next CTR value */ > .endif > ldp x29, x30, [sp], #16 > ret > > .Lctrtail\xctr: > + /* > + * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext > + * > + * This code expects the last keystream block to be in v4. For example: > + * if encrypting two blocks with MAX_STRIDE=5, then v3 and v4 should > + * have the next two counter blocks. > + * > + * This allows us to store the ciphertext by writing to overlapping > + * regions of memory. Any invalid ciphertext blocks get overwritten by > + * correctly computed blocks. This approach avoids extra conditional > + * branches. > + */ Nit: Without overlapping stores, we'd have to load and store smaller quantities and use a loop here, or bounce it via a temp buffer and memcpy() it from the C code. So it's not just some extra branches. 
> mov x16, #16 > - ands x6, x4, #0xf > - csel x13, x6, x16, ne > + ands w7, BYTES_W, #0xf > + csel x13, x7, x16, ne > > -ST5( cmp w4, #64 - (MAX_STRIDE << 4) ) > +ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4)) > ST5( csel x14, x16, xzr, gt ) > - cmp w4, #48 - (MAX_STRIDE << 4) > + cmp BYTES_W, #48 - (MAX_STRIDE << 4) > csel x15, x16, xzr, gt > - cmp w4, #32 - (MAX_STRIDE << 4) > + cmp BYTES_W, #32 - (MAX_STRIDE << 4) > csel x16, x16, xzr, gt > - cmp w4, #16 - (MAX_STRIDE << 4) > + cmp BYTES_W, #16 - (MAX_STRIDE << 4) > > - adr_l x12, .Lcts_permute_table > - add x12, x12, x13 > + adr_l x9, .Lcts_permute_table > + add x9, x9, x13 > ble .Lctrtail1x\xctr > > -ST5( ld1 {v5.16b}, [x1], x14 ) > - ld1 {v6.16b}, [x1], x15 > - ld1 {v7.16b}, [x1], x16 > +ST5( ld1 {v5.16b}, [IN], x14 ) > + ld1 {v6.16b}, [IN], x15 > + ld1 {v7.16b}, [IN], x16 > > ST4( bl aes_encrypt_block4x ) > ST5( bl aes_encrypt_block5x ) > > - ld1 {v8.16b}, [x1], x13 > - ld1 {v9.16b}, [x1] > - ld1 {v10.16b}, [x12] > + ld1 {v8.16b}, [IN], x13 > + ld1 {v9.16b}, [IN] > + ld1 {v10.16b}, [x9] > > ST4( eor v6.16b, v6.16b, v0.16b ) > ST4( eor v7.16b, v7.16b, v1.16b ) > @@ -477,30 +522,48 @@ ST5( eor v7.16b, v7.16b, v2.16b ) > ST5( eor v8.16b, v8.16b, v3.16b ) > ST5( eor v9.16b, v9.16b, v4.16b ) > > -ST5( st1 {v5.16b}, [x0], x14 ) > - st1 {v6.16b}, [x0], x15 > - st1 {v7.16b}, [x0], x16 > - add x13, x13, x0 > +ST5( st1 {v5.16b}, [OUT], x14 ) > + st1 {v6.16b}, [OUT], x15 > + st1 {v7.16b}, [OUT], x16 > + add x13, x13, OUT > st1 {v9.16b}, [x13] // overlapping stores > - st1 {v8.16b}, [x0] > + st1 {v8.16b}, [OUT] > b .Lctrout\xctr > > .Lctrtail1x\xctr: > - sub x7, x6, #16 > - csel x6, x6, x7, eq > - add x1, x1, x6 > - add x0, x0, x6 > - ld1 {v5.16b}, [x1] > - ld1 {v6.16b}, [x0] > + /* > + * Handle <= 16 bytes of plaintext > + */ > + sub x8, x7, #16 > + csel x7, x7, x8, eq > + add IN, IN, x7 > + add OUT, OUT, x7 > + ld1 {v5.16b}, [IN] > + ld1 {v6.16b}, [OUT] > ST5( mov v3.16b, v4.16b ) > encrypt_block v3, w3, x2, x8, w7 > - ld1 {v10.16b-v11.16b}, [x12] > + ld1 {v10.16b-v11.16b}, [x9] > tbl v3.16b, {v3.16b}, v10.16b > sshr v11.16b, v11.16b, #7 > eor v5.16b, v5.16b, v3.16b > bif v5.16b, v6.16b, v11.16b > - st1 {v5.16b}, [x0] > + st1 {v5.16b}, [OUT] > b .Lctrout\xctr > + > + // Arguments > + .unreq OUT > + .unreq IN > + .unreq KEY > + .unreq ROUNDS_W > + .unreq BYTES_W > + .unreq IV > + .unreq BYTE_CTR_W // XCTR only > + // Intermediate values > + .unreq CTR_W // XCTR only > + .unreq CTR // XCTR only > + .unreq IV_PART > + .unreq BLOCKS > + .unreq BLOCKS_W > .endm > > /* > -- > 2.36.0.464.gb9c8b46e94-goog >
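To make the nit above about overlapping stores concrete, the following is a rough C-level sketch of the alternative Ard describes: without overlapping 16-byte stores, the tail data would have to be combined with the keystream using narrower accesses in a loop (or bounced through a temporary buffer from the C glue). The helper and its name are purely illustrative and are not taken from the kernel sources.

#include <stddef.h>
#include <stdint.h>

#define AES_BLOCK_SIZE 16

/*
 * Illustrative only: XOR up to MAX_STRIDE blocks' worth of data with an
 * already-computed keystream, using whole blocks where possible and a
 * byte-at-a-time loop for the partial final block.
 */
static void ctr_xor_tail_no_overlap(uint8_t *dst, const uint8_t *src,
				    const uint8_t *keystream,
				    size_t tail_bytes)
{
	size_t i = 0;

	/* Full 16-byte blocks. */
	for (; i + AES_BLOCK_SIZE <= tail_bytes; i += AES_BLOCK_SIZE)
		for (size_t j = 0; j < AES_BLOCK_SIZE; j++)
			dst[i + j] = src[i + j] ^ keystream[i + j];

	/* Partial final block: byte-granular loads and stores. */
	for (; i < tail_bytes; i++)
		dst[i] = src[i] ^ keystream[i];
}

The overlapping-store approach in the assembly instead keeps every load and store 16 bytes wide, at the cost of writing a few bytes twice; the correctly computed block is simply stored last.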
On Wed, May 04, 2022 at 12:18:20AM +0000, Nathan Huckleberry wrote:
> Added some clarifying comments, changed the register allocations to make
> the code clearer, and added register aliases.
>
> Signed-off-by: Nathan Huckleberry <nhuck@google.com>

I was a bit surprised to see this after the xctr support patch rather than
before. Doing the cleanup first would make adding and reviewing the xctr
support easier. But it's not a big deal; if you already tested it this way
you can just leave it as-is if you want.

A few minor comments below.

> +	/*
> +	 * Set up the counter values in v0-v4.
> +	 *
> +	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
> +	 * handling code expects the last keystream block to be in v4. For
> +	 * example: if encrypting two blocks with MAX_STRIDE=5, then v3 and v4
> +	 * should have the next two counter blocks.
> +	 */

The first two mentions of v4 should actually be v{MAX_STRIDE-1}, as it is
actually v4 for MAX_STRIDE==5 and v3 for MAX_STRIDE==4.

> @@ -355,16 +383,16 @@ AES_FUNC_END(aes_cbc_cts_decrypt)
>  	mov	v3.16b, vctr.16b
> ST5(	mov	v4.16b, vctr.16b	)
>  	.if \xctr
> -	sub	x6, x11, #MAX_STRIDE - 1
> -	sub	x7, x11, #MAX_STRIDE - 2
> -	sub	x8, x11, #MAX_STRIDE - 3
> -	sub	x9, x11, #MAX_STRIDE - 4
> -ST5(	sub	x10, x11, #MAX_STRIDE - 5	)
> -	eor	x6, x6, x12
> -	eor	x7, x7, x12
> -	eor	x8, x8, x12
> -	eor	x9, x9, x12
> -	eor	x10, x10, x12
> +	sub	x6, CTR, #MAX_STRIDE - 1
> +	sub	x7, CTR, #MAX_STRIDE - 2
> +	sub	x8, CTR, #MAX_STRIDE - 3
> +	sub	x9, CTR, #MAX_STRIDE - 4
> +ST5(	sub	x10, CTR, #MAX_STRIDE - 5	)
> +	eor	x6, x6, IV_PART
> +	eor	x7, x7, IV_PART
> +	eor	x8, x8, IV_PART
> +	eor	x9, x9, IV_PART
> +	eor	x10, x10, IV_PART

The eor into x10 should be enclosed by ST5(), since it's dead code otherwise.

> +	/*
> +	 * If there are at least MAX_STRIDE blocks left, XOR the plaintext with
> +	 * keystream and store. Otherwise jump to tail handling.
> +	 */

Technically this could be XOR-ing with either the plaintext or the ciphertext.
Maybe write "data" instead.

> .Lctrtail1x\xctr:
> -	sub	x7, x6, #16
> -	csel	x6, x6, x7, eq
> -	add	x1, x1, x6
> -	add	x0, x0, x6
> -	ld1	{v5.16b}, [x1]
> -	ld1	{v6.16b}, [x0]
> +	/*
> +	 * Handle <= 16 bytes of plaintext
> +	 */
> +	sub	x8, x7, #16
> +	csel	x7, x7, x8, eq
> +	add	IN, IN, x7
> +	add	OUT, OUT, x7
> +	ld1	{v5.16b}, [IN]
> +	ld1	{v6.16b}, [OUT]
> ST5(	mov	v3.16b, v4.16b	)
> 	encrypt_block	v3, w3, x2, x8, w7

w3 and x2 should be ROUNDS_W and KEY, respectively.

This code also has the very unusual property that it reads and writes before
the buffers given. Specifically, for bytes < 16, it accesses the 16 bytes
beginning at &in[bytes - 16] and &dst[bytes - 16]. Mentioning this explicitly
would be very helpful, particularly in the function comments for
aes_ctr_encrypt() and aes_xctr_encrypt(), and maybe in the C code, so that
anyone calling these functions has this in mind.

Anyway, with the above addressed feel free to add:

Reviewed-by: Eric Biggers <ebiggers@google.com>

- Eric
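For readers less familiar with XCTR, the register comments being discussed boil down to the difference in how each mode forms its counter blocks. The sketch below is a portable reference-style model, not the kernel implementation: aes_encrypt_block() is an assumed single-block primitive, the counter origin (what value the first block uses) is left to the caller, and carry into the upper 64 bits of the CTR counter is ignored here since the assembly handles that case on a separate path.

#include <stdint.h>
#include <string.h>

/* Assumed one-block AES primitive; not part of the patch. */
void aes_encrypt_block(uint8_t out[16], const uint8_t in[16], const void *key);

static uint64_t get_be64(const uint8_t *p)
{
	uint64_t v = 0;

	for (int i = 0; i < 8; i++)
		v = (v << 8) | p[i];
	return v;
}

static void put_be64(uint8_t *p, uint64_t v)
{
	for (int i = 7; i >= 0; i--) {
		p[i] = (uint8_t)v;
		v >>= 8;
	}
}

/*
 * CTR: the IV carries a big-endian counter in its upper bytes; keystream
 * block 'blk' adds blk to it.  (64-bit overflow is ignored in this model.)
 */
static void ctr_keystream_block(uint8_t out[16], const uint8_t iv[16],
				uint64_t blk, const void *key)
{
	uint8_t ctrblk[16];

	memcpy(ctrblk, iv, 16);
	put_be64(ctrblk + 8, get_be64(ctrblk + 8) + blk);
	aes_encrypt_block(out, ctrblk, key);
}

/*
 * XCTR: a little-endian block counter is XORed into the low 64 bits of
 * the IV, so no per-block carries or byte swaps are needed.
 */
static void xctr_keystream_block(uint8_t out[16], const uint8_t iv[16],
				 uint64_t blk, const void *key)
{
	uint8_t ctrblk[16];

	memcpy(ctrblk, iv, 16);
	for (int i = 0; i < 8; i++)
		ctrblk[i] ^= (uint8_t)(blk >> (8 * i));
	aes_encrypt_block(out, ctrblk, key);
}

Because XCTR's per-block work is a plain XOR of a little-endian value, the assembly can keep the relevant 64 bits of the IV in a general-purpose register and never needs a byte swap in the main loop, which is what the IV_PART/CTR register split in the patch reflects.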
On Fri, May 6, 2022 at 12:41 AM Eric Biggers <ebiggers@kernel.org> wrote: > > On Wed, May 04, 2022 at 12:18:20AM +0000, Nathan Huckleberry wrote: > > Added some clarifying comments, changed the register allocations to make > > the code clearer, and added register aliases. > > > > Signed-off-by: Nathan Huckleberry <nhuck@google.com> > > I was a bit surprised to see this after the xctr support patch rather than > before. Doing the cleanup first would make adding and reviewing the xctr > support easier. But it's not a big deal; if you already tested it this way you > can just leave it as-is if you want. > > A few minor comments below. > > > + /* > > + * Set up the counter values in v0-v4. > > + * > > + * If we are encrypting less than MAX_STRIDE blocks, the tail block > > + * handling code expects the last keystream block to be in v4. For > > + * example: if encrypting two blocks with MAX_STRIDE=5, then v3 and v4 > > + * should have the next two counter blocks. > > + */ > > The first two mentions of v4 should actually be v{MAX_STRIDE-1}, as it is > actually v4 for MAX_STRIDE==5 and v3 for MAX_STRIDE==4. > > > @@ -355,16 +383,16 @@ AES_FUNC_END(aes_cbc_cts_decrypt) > > mov v3.16b, vctr.16b > > ST5( mov v4.16b, vctr.16b ) > > .if \xctr > > - sub x6, x11, #MAX_STRIDE - 1 > > - sub x7, x11, #MAX_STRIDE - 2 > > - sub x8, x11, #MAX_STRIDE - 3 > > - sub x9, x11, #MAX_STRIDE - 4 > > -ST5( sub x10, x11, #MAX_STRIDE - 5 ) > > - eor x6, x6, x12 > > - eor x7, x7, x12 > > - eor x8, x8, x12 > > - eor x9, x9, x12 > > - eor x10, x10, x12 > > + sub x6, CTR, #MAX_STRIDE - 1 > > + sub x7, CTR, #MAX_STRIDE - 2 > > + sub x8, CTR, #MAX_STRIDE - 3 > > + sub x9, CTR, #MAX_STRIDE - 4 > > +ST5( sub x10, CTR, #MAX_STRIDE - 5 ) > > + eor x6, x6, IV_PART > > + eor x7, x7, IV_PART > > + eor x8, x8, IV_PART > > + eor x9, x9, IV_PART > > + eor x10, x10, IV_PART > > The eor into x10 should be enclosed by ST5(), since it's dead code otherwise. > > > + /* > > + * If there are at least MAX_STRIDE blocks left, XOR the plaintext with > > + * keystream and store. Otherwise jump to tail handling. > > + */ > > Technically this could be XOR-ing with either the plaintext or the ciphertext. > Maybe write "data" instead. > > > .Lctrtail1x\xctr: > > - sub x7, x6, #16 > > - csel x6, x6, x7, eq > > - add x1, x1, x6 > > - add x0, x0, x6 > > - ld1 {v5.16b}, [x1] > > - ld1 {v6.16b}, [x0] > > + /* > > + * Handle <= 16 bytes of plaintext > > + */ > > + sub x8, x7, #16 > > + csel x7, x7, x8, eq > > + add IN, IN, x7 > > + add OUT, OUT, x7 > > + ld1 {v5.16b}, [IN] > > + ld1 {v6.16b}, [OUT] > > ST5( mov v3.16b, v4.16b ) > > encrypt_block v3, w3, x2, x8, w7 > > w3 and x2 should be ROUNDS_W and KEY, respectively. > > This code also has the very unusual property that it reads and writes before the > buffers given. Specifically, for bytes < 16, it access the 16 bytes beginning > at &in[bytes - 16] and &dst[bytes - 16]. Mentioning this explicitly would be > very helpful, particularly in the function comments for aes_ctr_encrypt() and > aes_xctr_encrypt(), and maybe in the C code, so that anyone calling these > functions has this in mind. If bytes < 16, then the C code uses a buffer of 16 bytes to avoid this. I'll add some comments explaining that because its not entirely clear what is happening in the C unless you've taken a deep dive into the asm. > > Anyway, with the above addressed feel free to add: > > Reviewed-by: Eric Biggers <ebiggers@google.com> > > - Eric
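The buffering Nathan mentions can be pictured roughly as follows. This is a simplified sketch with an assumed prototype for the assembly routine, not the actual aes-glue.c code: for nbytes < 16, the short input is copied to the end of a 16-byte stack buffer so that the assembly's 16-byte accesses at &in[nbytes - 16] and &out[nbytes - 16] fall entirely inside the buffer.

#include <stdint.h>
#include <string.h>

#define AES_BLOCK_SIZE 16

/* Assumed prototype for the assembly routine in aes-modes.S. */
void aes_ctr_encrypt(uint8_t out[], const uint8_t in[], const void *rk,
		     int rounds, int bytes, uint8_t ctr[]);

/* nbytes is assumed to be 1..15 here. */
static void ctr_encrypt_partial_block(uint8_t *out, const uint8_t *in,
				      int nbytes, const void *rk, int rounds,
				      uint8_t ctr[AES_BLOCK_SIZE])
{
	uint8_t buf[AES_BLOCK_SIZE];
	uint8_t *p = buf + AES_BLOCK_SIZE - nbytes;

	/*
	 * With in == out == p and bytes == nbytes, the 16 bytes the asm
	 * reads and writes are exactly buf[0..15], no matter how short
	 * the caller's buffers are.
	 */
	memcpy(p, in, nbytes);
	aes_ctr_encrypt(p, p, rk, rounds, nbytes, ctr);
	memcpy(out, p, nbytes);
}

Copying to the end of the buffer rather than the start is what keeps the accesses, which reach backwards from in + nbytes and out + nbytes, inside valid memory.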
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index 55df157fce3a..da7c9f3380f8 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -322,32 +322,60 @@ AES_FUNC_END(aes_cbc_cts_decrypt) * This macro generates the code for CTR and XCTR mode. */ .macro ctr_encrypt xctr + // Arguments + OUT .req x0 + IN .req x1 + KEY .req x2 + ROUNDS_W .req w3 + BYTES_W .req w4 + IV .req x5 + BYTE_CTR_W .req w6 // XCTR only + // Intermediate values + CTR_W .req w11 // XCTR only + CTR .req x11 // XCTR only + IV_PART .req x12 + BLOCKS .req x13 + BLOCKS_W .req w13 + stp x29, x30, [sp, #-16]! mov x29, sp - enc_prepare w3, x2, x12 - ld1 {vctr.16b}, [x5] + enc_prepare ROUNDS_W, KEY, IV_PART + ld1 {vctr.16b}, [IV] + /* + * Keep 64 bits of the IV in a register. For CTR mode this lets us + * easily increment the IV. For XCTR mode this lets us efficiently XOR + * the 64-bit counter with the IV. + */ .if \xctr - umov x12, vctr.d[0] - lsr w11, w6, #4 + umov IV_PART, vctr.d[0] + lsr CTR_W, BYTE_CTR_W, #4 .else - umov x12, vctr.d[1] /* keep swabbed ctr in reg */ - rev x12, x12 + umov IV_PART, vctr.d[1] + rev IV_PART, IV_PART .endif .LctrloopNx\xctr: - add w7, w4, #15 - sub w4, w4, #MAX_STRIDE << 4 - lsr w7, w7, #4 + add BLOCKS_W, BYTES_W, #15 + sub BYTES_W, BYTES_W, #MAX_STRIDE << 4 + lsr BLOCKS_W, BLOCKS_W, #4 mov w8, #MAX_STRIDE - cmp w7, w8 - csel w7, w7, w8, lt + cmp BLOCKS_W, w8 + csel BLOCKS_W, BLOCKS_W, w8, lt + /* + * Set up the counter values in v0-v4. + * + * If we are encrypting less than MAX_STRIDE blocks, the tail block + * handling code expects the last keystream block to be in v4. For + * example: if encrypting two blocks with MAX_STRIDE=5, then v3 and v4 + * should have the next two counter blocks. + */ .if \xctr - add x11, x11, x7 + add CTR, CTR, BLOCKS .else - adds x12, x12, x7 + adds IV_PART, IV_PART, BLOCKS .endif mov v0.16b, vctr.16b mov v1.16b, vctr.16b @@ -355,16 +383,16 @@ AES_FUNC_END(aes_cbc_cts_decrypt) mov v3.16b, vctr.16b ST5( mov v4.16b, vctr.16b ) .if \xctr - sub x6, x11, #MAX_STRIDE - 1 - sub x7, x11, #MAX_STRIDE - 2 - sub x8, x11, #MAX_STRIDE - 3 - sub x9, x11, #MAX_STRIDE - 4 -ST5( sub x10, x11, #MAX_STRIDE - 5 ) - eor x6, x6, x12 - eor x7, x7, x12 - eor x8, x8, x12 - eor x9, x9, x12 - eor x10, x10, x12 + sub x6, CTR, #MAX_STRIDE - 1 + sub x7, CTR, #MAX_STRIDE - 2 + sub x8, CTR, #MAX_STRIDE - 3 + sub x9, CTR, #MAX_STRIDE - 4 +ST5( sub x10, CTR, #MAX_STRIDE - 5 ) + eor x6, x6, IV_PART + eor x7, x7, IV_PART + eor x8, x8, IV_PART + eor x9, x9, IV_PART + eor x10, x10, IV_PART mov v0.d[0], x6 mov v1.d[0], x7 mov v2.d[0], x8 @@ -381,9 +409,9 @@ ST5( mov v4.d[0], x10 ) ins vctr.d[0], x8 /* apply carry to N counter blocks for N := x12 */ - cbz x12, 2f + cbz IV_PART, 2f adr x16, 1f - sub x16, x16, x12, lsl #3 + sub x16, x16, IV_PART, lsl #3 br x16 bti c mov v0.d[0], vctr.d[0] @@ -398,71 +426,88 @@ ST5( mov v4.d[0], vctr.d[0] ) 1: b 2f .previous -2: rev x7, x12 +2: rev x7, IV_PART ins vctr.d[1], x7 - sub x7, x12, #MAX_STRIDE - 1 - sub x8, x12, #MAX_STRIDE - 2 - sub x9, x12, #MAX_STRIDE - 3 + sub x7, IV_PART, #MAX_STRIDE - 1 + sub x8, IV_PART, #MAX_STRIDE - 2 + sub x9, IV_PART, #MAX_STRIDE - 3 rev x7, x7 rev x8, x8 mov v1.d[1], x7 rev x9, x9 -ST5( sub x10, x12, #MAX_STRIDE - 4 ) +ST5( sub x10, IV_PART, #MAX_STRIDE - 4 ) mov v2.d[1], x8 ST5( rev x10, x10 ) mov v3.d[1], x9 ST5( mov v4.d[1], x10 ) .endif - tbnz w4, #31, .Lctrtail\xctr - ld1 {v5.16b-v7.16b}, [x1], #48 + + /* + * If there are at least MAX_STRIDE blocks left, XOR the plaintext with + 
* keystream and store. Otherwise jump to tail handling. + */ + tbnz BYTES_W, #31, .Lctrtail\xctr + ld1 {v5.16b-v7.16b}, [IN], #48 ST4( bl aes_encrypt_block4x ) ST5( bl aes_encrypt_block5x ) eor v0.16b, v5.16b, v0.16b -ST4( ld1 {v5.16b}, [x1], #16 ) +ST4( ld1 {v5.16b}, [IN], #16 ) eor v1.16b, v6.16b, v1.16b -ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) +ST5( ld1 {v5.16b-v6.16b}, [IN], #32 ) eor v2.16b, v7.16b, v2.16b eor v3.16b, v5.16b, v3.16b ST5( eor v4.16b, v6.16b, v4.16b ) - st1 {v0.16b-v3.16b}, [x0], #64 -ST5( st1 {v4.16b}, [x0], #16 ) - cbz w4, .Lctrout\xctr + st1 {v0.16b-v3.16b}, [OUT], #64 +ST5( st1 {v4.16b}, [OUT], #16 ) + cbz BYTES_W, .Lctrout\xctr b .LctrloopNx\xctr .Lctrout\xctr: .if !\xctr - st1 {vctr.16b}, [x5] /* return next CTR value */ + st1 {vctr.16b}, [IV] /* return next CTR value */ .endif ldp x29, x30, [sp], #16 ret .Lctrtail\xctr: + /* + * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext + * + * This code expects the last keystream block to be in v4. For example: + * if encrypting two blocks with MAX_STRIDE=5, then v3 and v4 should + * have the next two counter blocks. + * + * This allows us to store the ciphertext by writing to overlapping + * regions of memory. Any invalid ciphertext blocks get overwritten by + * correctly computed blocks. This approach avoids extra conditional + * branches. + */ mov x16, #16 - ands x6, x4, #0xf - csel x13, x6, x16, ne + ands w7, BYTES_W, #0xf + csel x13, x7, x16, ne -ST5( cmp w4, #64 - (MAX_STRIDE << 4) ) +ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4)) ST5( csel x14, x16, xzr, gt ) - cmp w4, #48 - (MAX_STRIDE << 4) + cmp BYTES_W, #48 - (MAX_STRIDE << 4) csel x15, x16, xzr, gt - cmp w4, #32 - (MAX_STRIDE << 4) + cmp BYTES_W, #32 - (MAX_STRIDE << 4) csel x16, x16, xzr, gt - cmp w4, #16 - (MAX_STRIDE << 4) + cmp BYTES_W, #16 - (MAX_STRIDE << 4) - adr_l x12, .Lcts_permute_table - add x12, x12, x13 + adr_l x9, .Lcts_permute_table + add x9, x9, x13 ble .Lctrtail1x\xctr -ST5( ld1 {v5.16b}, [x1], x14 ) - ld1 {v6.16b}, [x1], x15 - ld1 {v7.16b}, [x1], x16 +ST5( ld1 {v5.16b}, [IN], x14 ) + ld1 {v6.16b}, [IN], x15 + ld1 {v7.16b}, [IN], x16 ST4( bl aes_encrypt_block4x ) ST5( bl aes_encrypt_block5x ) - ld1 {v8.16b}, [x1], x13 - ld1 {v9.16b}, [x1] - ld1 {v10.16b}, [x12] + ld1 {v8.16b}, [IN], x13 + ld1 {v9.16b}, [IN] + ld1 {v10.16b}, [x9] ST4( eor v6.16b, v6.16b, v0.16b ) ST4( eor v7.16b, v7.16b, v1.16b ) @@ -477,30 +522,48 @@ ST5( eor v7.16b, v7.16b, v2.16b ) ST5( eor v8.16b, v8.16b, v3.16b ) ST5( eor v9.16b, v9.16b, v4.16b ) -ST5( st1 {v5.16b}, [x0], x14 ) - st1 {v6.16b}, [x0], x15 - st1 {v7.16b}, [x0], x16 - add x13, x13, x0 +ST5( st1 {v5.16b}, [OUT], x14 ) + st1 {v6.16b}, [OUT], x15 + st1 {v7.16b}, [OUT], x16 + add x13, x13, OUT st1 {v9.16b}, [x13] // overlapping stores - st1 {v8.16b}, [x0] + st1 {v8.16b}, [OUT] b .Lctrout\xctr .Lctrtail1x\xctr: - sub x7, x6, #16 - csel x6, x6, x7, eq - add x1, x1, x6 - add x0, x0, x6 - ld1 {v5.16b}, [x1] - ld1 {v6.16b}, [x0] + /* + * Handle <= 16 bytes of plaintext + */ + sub x8, x7, #16 + csel x7, x7, x8, eq + add IN, IN, x7 + add OUT, OUT, x7 + ld1 {v5.16b}, [IN] + ld1 {v6.16b}, [OUT] ST5( mov v3.16b, v4.16b ) encrypt_block v3, w3, x2, x8, w7 - ld1 {v10.16b-v11.16b}, [x12] + ld1 {v10.16b-v11.16b}, [x9] tbl v3.16b, {v3.16b}, v10.16b sshr v11.16b, v11.16b, #7 eor v5.16b, v5.16b, v3.16b bif v5.16b, v6.16b, v11.16b - st1 {v5.16b}, [x0] + st1 {v5.16b}, [OUT] b .Lctrout\xctr + + // Arguments + .unreq OUT + .unreq IN + .unreq KEY + .unreq ROUNDS_W + .unreq BYTES_W + .unreq IV + .unreq BYTE_CTR_W // XCTR only + // 
Intermediate values + .unreq CTR_W // XCTR only + .unreq CTR // XCTR only + .unreq IV_PART + .unreq BLOCKS + .unreq BLOCKS_W .endm /*
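The loop bookkeeping visible in the diff above (the add/lsr/csel sequence at the top of .LctrloopNx and the tbnz into .Lctrtail) can be summarized by the following C model. It is illustrative only, not a translation of the assembly: ctr stands in for the 64-bit counter register (CTR for XCTR, IV_PART for CTR mode), and the per-block encryption and stores are elided.

#include <stdint.h>

#define MAX_STRIDE 5	/* 4 when the five-way interleave is not used */

/* Illustrative model of the per-iteration accounting in .LctrloopNx. */
static void ctr_loop_model(int bytes)
{
	uint64_t ctr = 0;	/* CTR for XCTR mode, IV_PART for CTR mode */

	while (bytes > 0) {
		/* add #15; lsr #4; csel: blocks = min(ceil(bytes / 16), MAX_STRIDE) */
		int blocks = (bytes + 15) >> 4;

		if (blocks > MAX_STRIDE)
			blocks = MAX_STRIDE;

		ctr += blocks;		/* advance by the clamped block count */
		bytes -= MAX_STRIDE * 16;

		if (bytes < 0) {
			/*
			 * tbnz BYTES_W, #31: fewer than MAX_STRIDE whole
			 * blocks were left, so the remaining
			 * bytes + MAX_STRIDE * 16 bytes are finished in
			 * the .Lctrtail path.
			 */
			break;
		}

		/* Main path: encrypt MAX_STRIDE counter blocks, XOR, store. */
	}
}

Advancing the counter by the clamped block count is what leaves the last keystream block in v{MAX_STRIDE-1} when fewer than MAX_STRIDE blocks remain, which is the property the tail-handling comments (and Eric's v{MAX_STRIDE-1} correction) describe.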
Added some clarifying comments, changed the register allocations to make
the code clearer, and added register aliases.

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
---
 arch/arm64/crypto/aes-modes.S | 193 ++++++++++++++++++++++------------
 1 file changed, 128 insertions(+), 65 deletions(-)