Message ID | 20180912030510.27025-1-ebiggers@kernel.org (mailing list archive) |
---|---|
State | Accepted |
Delegated to: | Herbert Xu |
Headers | show |
Series | crypto: chacha20 - Fix chacha20_block() keystream alignment (again) | expand |
Hi, Le mardi 11 septembre 2018 à 20:05 -0700, Eric Biggers a écrit : > From: Eric Biggers <ebiggers@google.com> > > In commit 9f480faec58c ("crypto: chacha20 - Fix keystream alignment for > chacha20_block()"), I had missed that chacha20_block() can be called > directly on the buffer passed to get_random_bytes(), which can have any > alignment. So, while my commit didn't break anything, it didn't fully > solve the alignment problems. > > Revert my solution and just update chacha20_block() to use > put_unaligned_le32(), so the output buffer need not be aligned. > This is simpler, and on many CPUs it's the same speed. > > But, I kept the 'tmp' buffers in extract_crng_user() and > _get_random_bytes() 4-byte aligned, since that alignment is actually > needed for _crng_backtrack_protect() too. > > Reported-by: Stephan Müller <smueller@chronox.de> > Cc: Theodore Ts'o <tytso@mit.edu> > Signed-off-by: Eric Biggers <ebiggers@google.com> > --- > crypto/chacha20_generic.c | 7 ++++--- > drivers/char/random.c | 24 ++++++++++++------------ > include/crypto/chacha20.h | 3 +-- > lib/chacha20.c | 6 +++--- > 4 files changed, 20 insertions(+), 20 deletions(-) > > diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c > index e451c3cb6a56..3ae96587caf9 100644 > --- a/crypto/chacha20_generic.c > +++ b/crypto/chacha20_generic.c > @@ -18,20 +18,21 @@ > static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src, > unsigned int bytes) > { > - u32 stream[CHACHA20_BLOCK_WORDS]; > + /* aligned to potentially speed up crypto_xor() */ > + u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(long)); > > if (dst != src) > memcpy(dst, src, bytes); > > while (bytes >= CHACHA20_BLOCK_SIZE) { > chacha20_block(state, stream); > - crypto_xor(dst, (const u8 *)stream, CHACHA20_BLOCK_SIZE); > + crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE); > bytes -= CHACHA20_BLOCK_SIZE; > dst += CHACHA20_BLOCK_SIZE; > } > if (bytes) { > chacha20_block(state, stream); > - crypto_xor(dst, (const u8 *)stream, 
bytes); > + crypto_xor(dst, stream, bytes); > } > } > > diff --git a/drivers/char/random.c b/drivers/char/random.c > index bf5f99fc36f1..d22d967c50f0 100644 > --- a/drivers/char/random.c > +++ b/drivers/char/random.c > @@ -1003,7 +1003,7 @@ static void extract_crng(__u32 out[CHACHA20_BLOCK_WORDS]) > * enough) to mutate the CRNG key to provide backtracking protection. > */ > static void _crng_backtrack_protect(struct crng_state *crng, > - __u32 tmp[CHACHA20_BLOCK_WORDS], int used) > + __u8 tmp[CHACHA20_BLOCK_SIZE], int used) > { > unsigned long flags; > __u32 *s, *d; > @@ -1015,14 +1015,14 @@ static void _crng_backtrack_protect(struct crng_state *crng, > used = 0; > } > spin_lock_irqsave(&crng->lock, flags); > - s = &tmp[used / sizeof(__u32)]; > + s = (__u32 *) &tmp[used]; This introduces an alignment issue: tmp is not aligned for __u32, but is dereferenced as such later. > d = &crng->state[4]; > for (i=0; i < 8; i++) > *d++ ^= *s++; > spin_unlock_irqrestore(&crng->lock, flags); > } > Regards.
Hi Yann, On Wed, Sep 12, 2018 at 11:50:00AM +0200, Yann Droneaud wrote: > Hi, > > Le mardi 11 septembre 2018 à 20:05 -0700, Eric Biggers a écrit : > > From: Eric Biggers <ebiggers@google.com> > > > > In commit 9f480faec58c ("crypto: chacha20 - Fix keystream alignment for > > chacha20_block()"), I had missed that chacha20_block() can be called > > directly on the buffer passed to get_random_bytes(), which can have any > > alignment. So, while my commit didn't break anything, it didn't fully > > solve the alignment problems. > > > > Revert my solution and just update chacha20_block() to use > > put_unaligned_le32(), so the output buffer need not be aligned. > > This is simpler, and on many CPUs it's the same speed. > > > > But, I kept the 'tmp' buffers in extract_crng_user() and > > _get_random_bytes() 4-byte aligned, since that alignment is actually > > needed for _crng_backtrack_protect() too. > > > > Reported-by: Stephan Müller <smueller@chronox.de> > > Cc: Theodore Ts'o <tytso@mit.edu> > > Signed-off-by: Eric Biggers <ebiggers@google.com> > > --- > > crypto/chacha20_generic.c | 7 ++++--- > > drivers/char/random.c | 24 ++++++++++++------------ > > include/crypto/chacha20.h | 3 +-- > > lib/chacha20.c | 6 +++--- > > 4 files changed, 20 insertions(+), 20 deletions(-) > > > > diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c > > index e451c3cb6a56..3ae96587caf9 100644 > > --- a/crypto/chacha20_generic.c > > +++ b/crypto/chacha20_generic.c > > @@ -18,20 +18,21 @@ > > static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src, > > unsigned int bytes) > > { > > - u32 stream[CHACHA20_BLOCK_WORDS]; > > + /* aligned to potentially speed up crypto_xor() */ > > + u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(long)); > > > > if (dst != src) > > memcpy(dst, src, bytes); > > > > while (bytes >= CHACHA20_BLOCK_SIZE) { > > chacha20_block(state, stream); > > - crypto_xor(dst, (const u8 *)stream, CHACHA20_BLOCK_SIZE); > > + crypto_xor(dst, stream, 
CHACHA20_BLOCK_SIZE); > > bytes -= CHACHA20_BLOCK_SIZE; > > dst += CHACHA20_BLOCK_SIZE; > > } > > if (bytes) { > > chacha20_block(state, stream); > > - crypto_xor(dst, (const u8 *)stream, bytes); > > + crypto_xor(dst, stream, bytes); > > } > > } > > > > diff --git a/drivers/char/random.c b/drivers/char/random.c > > index bf5f99fc36f1..d22d967c50f0 100644 > > --- a/drivers/char/random.c > > +++ b/drivers/char/random.c > > @@ -1003,7 +1003,7 @@ static void extract_crng(__u32 out[CHACHA20_BLOCK_WORDS]) > > * enough) to mutate the CRNG key to provide backtracking protection. > > */ > > static void _crng_backtrack_protect(struct crng_state *crng, > > - __u32 tmp[CHACHA20_BLOCK_WORDS], int used) > > + __u8 tmp[CHACHA20_BLOCK_SIZE], int used) > > { > > unsigned long flags; > > __u32 *s, *d; > > @@ -1015,14 +1015,14 @@ static void _crng_backtrack_protect(struct crng_state *crng, > > used = 0; > > } > > spin_lock_irqsave(&crng->lock, flags); > > - s = &tmp[used / sizeof(__u32)]; > > + s = (__u32 *) &tmp[used]; > > This introduces a alignment issue: tmp is not aligned for __u32, but is > dereferenced as such later. > > > d = &crng->state[4]; > > for (i=0; i < 8; i++) > > *d++ ^= *s++; > > spin_unlock_irqrestore(&crng->lock, flags); > > } > > > I explained this in the patch; the callers ensure the buffer is aligned. - Eric
On Tue, Sep 11, 2018 at 08:05:10PM -0700, Eric Biggers wrote: > From: Eric Biggers <ebiggers@google.com> > > In commit 9f480faec58c ("crypto: chacha20 - Fix keystream alignment for > chacha20_block()"), I had missed that chacha20_block() can be called > directly on the buffer passed to get_random_bytes(), which can have any > alignment. So, while my commit didn't break anything, it didn't fully > solve the alignment problems. > > Revert my solution and just update chacha20_block() to use > put_unaligned_le32(), so the output buffer need not be aligned. > This is simpler, and on many CPUs it's the same speed. > > But, I kept the 'tmp' buffers in extract_crng_user() and > _get_random_bytes() 4-byte aligned, since that alignment is actually > needed for _crng_backtrack_protect() too. > > Reported-by: Stephan Müller <smueller@chronox.de> > Cc: Theodore Ts'o <tytso@mit.edu> > Signed-off-by: Eric Biggers <ebiggers@google.com> > --- > crypto/chacha20_generic.c | 7 ++++--- > drivers/char/random.c | 24 ++++++++++++------------ > include/crypto/chacha20.h | 3 +-- > lib/chacha20.c | 6 +++--- > 4 files changed, 20 insertions(+), 20 deletions(-) Patch applied. Thanks.
diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c index e451c3cb6a56..3ae96587caf9 100644 --- a/crypto/chacha20_generic.c +++ b/crypto/chacha20_generic.c @@ -18,20 +18,21 @@ static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { - u32 stream[CHACHA20_BLOCK_WORDS]; + /* aligned to potentially speed up crypto_xor() */ + u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(long)); if (dst != src) memcpy(dst, src, bytes); while (bytes >= CHACHA20_BLOCK_SIZE) { chacha20_block(state, stream); - crypto_xor(dst, (const u8 *)stream, CHACHA20_BLOCK_SIZE); + crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE); bytes -= CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; } if (bytes) { chacha20_block(state, stream); - crypto_xor(dst, (const u8 *)stream, bytes); + crypto_xor(dst, stream, bytes); } } diff --git a/drivers/char/random.c b/drivers/char/random.c index bf5f99fc36f1..d22d967c50f0 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -433,9 +433,9 @@ static int crng_init_cnt = 0; static unsigned long crng_global_init_time = 0; #define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE) static void _extract_crng(struct crng_state *crng, - __u32 out[CHACHA20_BLOCK_WORDS]); + __u8 out[CHACHA20_BLOCK_SIZE]); static void _crng_backtrack_protect(struct crng_state *crng, - __u32 tmp[CHACHA20_BLOCK_WORDS], int used); + __u8 tmp[CHACHA20_BLOCK_SIZE], int used); static void process_random_ready_list(void); static void _get_random_bytes(void *buf, int nbytes); @@ -921,7 +921,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) unsigned long flags; int i, num; union { - __u32 block[CHACHA20_BLOCK_WORDS]; + __u8 block[CHACHA20_BLOCK_SIZE]; __u32 key[8]; } buf; @@ -968,7 +968,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) } static void _extract_crng(struct crng_state *crng, - __u32 out[CHACHA20_BLOCK_WORDS]) + __u8 out[CHACHA20_BLOCK_SIZE]) { unsigned long v, flags; @@ -985,7 +985,7 @@ 
static void _extract_crng(struct crng_state *crng, spin_unlock_irqrestore(&crng->lock, flags); } -static void extract_crng(__u32 out[CHACHA20_BLOCK_WORDS]) +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) { struct crng_state *crng = NULL; @@ -1003,7 +1003,7 @@ static void extract_crng(__u32 out[CHACHA20_BLOCK_WORDS]) * enough) to mutate the CRNG key to provide backtracking protection. */ static void _crng_backtrack_protect(struct crng_state *crng, - __u32 tmp[CHACHA20_BLOCK_WORDS], int used) + __u8 tmp[CHACHA20_BLOCK_SIZE], int used) { unsigned long flags; __u32 *s, *d; @@ -1015,14 +1015,14 @@ static void _crng_backtrack_protect(struct crng_state *crng, used = 0; } spin_lock_irqsave(&crng->lock, flags); - s = &tmp[used / sizeof(__u32)]; + s = (__u32 *) &tmp[used]; d = &crng->state[4]; for (i=0; i < 8; i++) *d++ ^= *s++; spin_unlock_irqrestore(&crng->lock, flags); } -static void crng_backtrack_protect(__u32 tmp[CHACHA20_BLOCK_WORDS], int used) +static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used) { struct crng_state *crng = NULL; @@ -1038,7 +1038,7 @@ static void crng_backtrack_protect(__u32 tmp[CHACHA20_BLOCK_WORDS], int used) static ssize_t extract_crng_user(void __user *buf, size_t nbytes) { ssize_t ret = 0, i = CHACHA20_BLOCK_SIZE; - __u32 tmp[CHACHA20_BLOCK_WORDS]; + __u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4); int large_request = (nbytes > 256); while (nbytes) { @@ -1617,7 +1617,7 @@ static void _warn_unseeded_randomness(const char *func_name, void *caller, */ static void _get_random_bytes(void *buf, int nbytes) { - __u32 tmp[CHACHA20_BLOCK_WORDS]; + __u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4); trace_get_random_bytes(nbytes, _RET_IP_); @@ -2243,7 +2243,7 @@ u64 get_random_u64(void) if (use_lock) read_lock_irqsave(&batched_entropy_reset_lock, flags); if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) { - extract_crng((__u32 *)batch->entropy_u64); + extract_crng((u8 *)batch->entropy_u64); batch->position = 0; } ret = 
batch->entropy_u64[batch->position++]; @@ -2273,7 +2273,7 @@ u32 get_random_u32(void) if (use_lock) read_lock_irqsave(&batched_entropy_reset_lock, flags); if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) { - extract_crng(batch->entropy_u32); + extract_crng((u8 *)batch->entropy_u32); batch->position = 0; } ret = batch->entropy_u32[batch->position++]; diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h index b83d66073db0..f76302d99e2b 100644 --- a/include/crypto/chacha20.h +++ b/include/crypto/chacha20.h @@ -13,13 +13,12 @@ #define CHACHA20_IV_SIZE 16 #define CHACHA20_KEY_SIZE 32 #define CHACHA20_BLOCK_SIZE 64 -#define CHACHA20_BLOCK_WORDS (CHACHA20_BLOCK_SIZE / sizeof(u32)) struct chacha20_ctx { u32 key[8]; }; -void chacha20_block(u32 *state, u32 *stream); +void chacha20_block(u32 *state, u8 *stream); void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv); int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize); diff --git a/lib/chacha20.c b/lib/chacha20.c index c1cc50fb68c9..d907fec6a9ed 100644 --- a/lib/chacha20.c +++ b/lib/chacha20.c @@ -16,9 +16,9 @@ #include <asm/unaligned.h> #include <crypto/chacha20.h> -void chacha20_block(u32 *state, u32 *stream) +void chacha20_block(u32 *state, u8 *stream) { - u32 x[16], *out = stream; + u32 x[16]; int i; for (i = 0; i < ARRAY_SIZE(x); i++) @@ -67,7 +67,7 @@ void chacha20_block(u32 *state, u32 *stream) } for (i = 0; i < ARRAY_SIZE(x); i++) - out[i] = cpu_to_le32(x[i] + state[i]); + put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); state[12]++; }