diff mbox series

[v4,6/8] fsverity: improve performance by using multibuffer hashing

Message ID 20240603183731.108986-7-ebiggers@kernel.org (mailing list archive)
State Superseded
Headers show
Series Optimize dm-verity and fsverity using multibuffer hashing | expand

Commit Message

Eric Biggers June 3, 2024, 6:37 p.m. UTC
From: Eric Biggers <ebiggers@google.com>

When supported by the hash algorithm, use crypto_shash_finup_mb() to
interleave the hashing of pairs of data blocks.  On some CPUs this
nearly doubles hashing performance.  The increase in overall throughput
of cold-cache fsverity reads that I'm seeing on arm64 and x86_64 is
roughly 35% (though this metric is hard to measure as it jumps around a
lot).

For now this is only done on the verification path, and only for data
blocks, not Merkle tree blocks.  We could use finup_mb on Merkle tree
blocks too, but that is less important as there aren't as many Merkle
tree blocks as data blocks, and that would require some additional code
restructuring.  We could also use finup_mb to accelerate building the
Merkle tree, but verification performance is more important.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/fsverity_private.h |   7 ++
 fs/verity/hash_algs.c        |   8 +-
 fs/verity/verify.c           | 173 +++++++++++++++++++++++++++++------
 3 files changed, 156 insertions(+), 32 deletions(-)

Comments

Herbert Xu June 4, 2024, 9:37 a.m. UTC | #1
On Mon, Jun 03, 2024 at 11:37:29AM -0700, Eric Biggers wrote:
>
> +	for (i = 0; i < ctx->num_pending; i++) {
> +		data[i] = ctx->pending_blocks[i].data;
> +		outs[i] = ctx->pending_blocks[i].hash;
> +	}
> +
> +	desc->tfm = params->hash_alg->tfm;
> +	if (params->hashstate)
> +		err = crypto_shash_import(desc, params->hashstate);
> +	else
> +		err = crypto_shash_init(desc);
> +	if (err) {
> +		fsverity_err(inode, "Error %d importing hash state", err);
> +		return false;
> +	}
> +	err = crypto_shash_finup_mb(desc, data, params->block_size, outs,
> +				    ctx->num_pending);
> +	if (err) {
> +		fsverity_err(inode, "Error %d computing block hashes", err);
> +		return false;
> +	}

So with ahash operating in synchronous mode (callback == NULL), this
would look like:

	struct ahash_request *reqs[FS_VERITY_MAX_PENDING_DATA_BLOCKS];

	for (i = 0; i < ctx->num_pending; i++) {
		reqs[i] = fsverity_alloc_hash_request();
		if (!req) {
			free all reqs;
			return false;
		}

		if (params->hashstate)
			err = crypto_ahash_import(&reqs[i], params->hashstate);
		else
			err = crypto_ahash_init(&reqs[i]);

		if (err) {
			fsverity_err(inode, "Error %d importing hash state", err);
			free all reqs;
			return false;
		}
	}

	for (i = 0; i < ctx->num_pending; i++) {
		unsigned more;

		if (params->hashstate)
			err = crypto_ahash_import(req, params->hashstate);
		else
			err = crypto_ahash_init(req);

		if (err) {
			fsverity_err(inode, "Error %d importing hash state", err);
			free all requests;
			return false;
		}

		more = 0;
		if (i + 1 < ctx->num_pending)
			more = CRYPTO_TFM_REQ_MORE;
		ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | more,
					   NULL, NULL);
		ahash_request_set_crypt(req, ctx->pending_blocks[i].sg,
					ctx->pending_blocks[i].hash,
					params->block_size);

		err = crypto_ahash_finup(req);
		if (err) {
			fsverity_err(inode, "Error %d computing block hashes", err);
			free all requests;
			return false;
		}
	}

You're hiding some of the complexity by not allocating memory
explicitly for each hash state.  This might fit on the stack
for two requests, but eventually you will have to allocate memory.

With the ahash API, the allocation is explicit.

Cheers,
Eric Biggers June 4, 2024, 6:42 p.m. UTC | #2
On Tue, Jun 04, 2024 at 05:37:36PM +0800, Herbert Xu wrote:
> On Mon, Jun 03, 2024 at 11:37:29AM -0700, Eric Biggers wrote:
> >
> > +	for (i = 0; i < ctx->num_pending; i++) {
> > +		data[i] = ctx->pending_blocks[i].data;
> > +		outs[i] = ctx->pending_blocks[i].hash;
> > +	}
> > +
> > +	desc->tfm = params->hash_alg->tfm;
> > +	if (params->hashstate)
> > +		err = crypto_shash_import(desc, params->hashstate);
> > +	else
> > +		err = crypto_shash_init(desc);
> > +	if (err) {
> > +		fsverity_err(inode, "Error %d importing hash state", err);
> > +		return false;
> > +	}
> > +	err = crypto_shash_finup_mb(desc, data, params->block_size, outs,
> > +				    ctx->num_pending);
> > +	if (err) {
> > +		fsverity_err(inode, "Error %d computing block hashes", err);
> > +		return false;
> > +	}
> 
> So with ahash operating in synchronous mode (callback == NULL), this
> would look like:
> 
> 	struct ahash_request *reqs[FS_VERITY_MAX_PENDING_DATA_BLOCKS];
> 
> 	for (i = 0; i < ctx->num_pending; i++) {
> 		reqs[i] = fsverity_alloc_hash_request();
> 		if (!req) {
> 			free all reqs;
> 			return false;
> 		}
> 
> 		if (params->hashstate)
> 			err = crypto_ahash_import(&reqs[i], params->hashstate);
> 		else
> 			err = crypto_ahash_init(&reqs[i]);
> 
> 		if (err) {
> 			fsverity_err(inode, "Error %d importing hash state", err);
> 			free all reqs;
> 			return false;
> 		}
> 	}
> 
> 	for (i = 0; i < ctx->num_pending; i++) {
> 		unsigned more;
> 
> 		if (params->hashstate)
> 			err = crypto_ahash_import(req, params->hashstate);
> 		else
> 			err = crypto_ahash_init(req);
> 
> 		if (err) {
> 			fsverity_err(inode, "Error %d importing hash state", err);
> 			free all requests;
> 			return false;
> 		}
> 
> 		more = 0;
> 		if (i + 1 < ctx->num_pending)
> 			more = CRYPTO_TFM_REQ_MORE;
> 		ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | more,
> 					   NULL, NULL);
> 		ahash_request_set_crypt(req, ctx->pending_blocks[i].sg,
> 					ctx->pending_blocks[i].hash,
> 					params->block_size);
> 
> 		err = crypto_ahash_finup(req);
> 		if (err) {
> 			fsverity_err(inode, "Error %d computing block hashes", err);
> 			free all requests;
> 			return false;
> 		}
> 	}
> 
> You're hiding some of the complexity by not allocating memory
> explicitly for each hash state.  This might fit on the stack
> for two requests, but eventually you will have to allocate memory.
> 
> With the ahash API, the allocation is explicit.
> 

This doesn't make any sense, though.  First, the requests need to be enqueued
for the task, but crypto_ahash_finup() would only have the ability to enqueue it
in a queue associated with the tfm, which is shared by many tasks.  So it can't
actually work unless the tfm maintained a separate queue for each task, which
would be really complex.  Second, it adds a memory allocation per block which is
very undesirable.  You claim that it's needed anyway, but actually it's not;
with my API there is only one initial hash state regardless of how high the
interleaving factor is.  In fact, if multiple initial states were allowed,
multibuffer hashing would become much more complex because the underlying
algorithm would need to validate that these different states are synced up.  My
proposal is much simpler and avoids all this unnecessary overhead.

Really the only reason to even consider ahash at all would be try to support
software hashing and off-CPU hardware accelerators using the "same" code.
However, your proposal would not achieve that either, as it would not use the
async callback.  Note, as far as I know no one actually cares about off-CPU
hardware accelerator support in fsverity anyway...

- Eric
Herbert Xu June 5, 2024, 9:19 a.m. UTC | #3
On Tue, Jun 04, 2024 at 11:42:20AM -0700, Eric Biggers wrote:
>
> This doesn't make any sense, though.  First, the requests need to be enqueued
> for the task, but crypto_ahash_finup() would only have the ability to enqueue it
> in a queue associated with the tfm, which is shared by many tasks.  So it can't

OK I screwed up that one.  But that's not hard to fix.  We could
simply add request chaining:

	ahash_request_chain(req1, req2);
	ahash_request_chain(req2, req3);
	...
	ahash_request_chain(reqn1, reqn);

	err = crypto_ahash_finup(req1);

> actually work unless the tfm maintained a separate queue for each task, which
> would be really complex.  Second, it adds a memory allocation per block which is
> very undesirable.  You claim that it's needed anyway, but actually it's not;
> with my API there is only one initial hash state regardless of how high the
> interleaving factor is.  In fact, if multiple initial states were allowed,

Sure you don't need it for two interleaved requests.  But as you
scale up to 16 and beyond, surely at some point you're going to
want to move the hash states off the stack.

> multibuffer hashing would become much more complex because the underlying
> algorithm would need to validate that these different states are synced up.  My
> proposal is much simpler and avoids all this unnecessary overhead.

We could simply state that these chained requests must be on block
boundaries, similar to how we handle block ciphers.  This is not a
big deal.

> Really the only reason to even consider ahash at all would be try to support
> software hashing and off-CPU hardware accelerators using the "same" code.
> However, your proposal would not achieve that either, as it would not use the
> async callback.  Note, as far as I know no one actually cares about off-CPU
> hardware accelerator support in fsverity anyway...

The other thing is that verity doesn't benefit from shash at all.
It appears to be doing kmap's on every single request.

Cheers,
Herbert Xu June 5, 2024, 9:22 a.m. UTC | #4
Hi Eric:

I really appreciate your suggestion of having the user driving
multi-request hashing as opposed to the bodge that we had before.
I think that is the perfect way of doing this, similar to how we
handled TCP segmentation with TSO.

However, I really dislike the idea of shoehorning this into shash.
I know you really like shash, but I think there are some clear
benefits to be had by coupling this with ahash.

Cheers,
Herbert Xu June 5, 2024, 9:46 a.m. UTC | #5
On Wed, Jun 05, 2024 at 05:22:21PM +0800, Herbert Xu wrote:
>
> However, I really dislike the idea of shoehorning this into shash.
> I know you really like shash, but I think there are some clear
> benefits to be had by coupling this with ahash.

If we do this properly, we should be able to immediately use the
mb code with IPsec.  In the network stack, we already aggregate
the data prior to IPsec with GSO.  So at the boundary between
IPsec and the Crypto API, it's dividing chunks of data up to 64K
into 1500-byte packets and feeding them to crypto one at a time.

It really should be sending the whole chain of packets to us as
a unit.

Once we have a proper mb interface, we can fix that and immediately
get the benefit of mb hashing.

Cheers,
Eric Biggers June 5, 2024, 6:58 p.m. UTC | #6
On Wed, Jun 05, 2024 at 05:19:01PM +0800, Herbert Xu wrote:
> On Tue, Jun 04, 2024 at 11:42:20AM -0700, Eric Biggers wrote:
> >
> > This doesn't make any sense, though.  First, the requests need to be enqueued
> > for the task, but crypto_ahash_finup() would only have the ability to enqueue it
> > in a queue associated with the tfm, which is shared by many tasks.  So it can't
> 
> OK I screwed up that one.  But that's not hard to fix.  We could
> simply add request chaining:
> 
> 	ahash_request_chain(req1, req2);
> 	ahash_request_chain(req2, req3);
> 	...
> 	ahash_request_chain(reqn1, reqn);
> 
> 	err = crypto_ahash_finup(req1);

So after completely changing several times your proposal is getting a bit closer
to mine, but it still uses a very clumsy API based around ahash that would be
much harder to use and implement correctly.  It also says nothing about what the
API would look like on the shash side, which would be needed anyway because
ahash is almost always just a pointless wrapper for shash.  Is there a different
API that you are asking for on the shash side?  If so, what?

> > actually work unless the tfm maintained a separate queue for each task, which
> > would be really complex.  Second, it adds a memory allocation per block which is
> > very undesirable.  You claim that it's needed anyway, but actually it's not;
> > with my API there is only one initial hash state regardless of how high the
> > interleaving factor is.  In fact, if multiple initial states were allowed,
> 
> Sure you don't need it for two interleaved requests.  But as you
> scale up to 16 and beyond, surely at some point you're going to
> want to move the hash states off the stack.

To reiterate, with my proposal there is only one state in memory.  It's a simple
API that can't be misused by providing inconsistent properties in the requests.
Yes, separate states would be needed if we were to support arbitrary updates,
but why add all that complexity before it's actually needed?

Also, "16 and beyond" is highly unlikely to be useful for kernel use cases.

> > multibuffer hashing would become much more complex because the underlying
> > algorithm would need to validate that these different states are synced up.  My
> > proposal is much simpler and avoids all this unnecessary overhead.
> 
> We could simply state that these chained requests must be on block
> boundaries, similar to how we handle block ciphers.  This is not a
> big deal.

... which would make it useless for most dm-verity users, as dm-verity uses a
32-byte salt with SHA-256 (which has a 64-byte block size) by default.

> 
> > Really the only reason to even consider ahash at all would be try to support
> > software hashing and off-CPU hardware accelerators using the "same" code.
> > However, your proposal would not achieve that either, as it would not use the
> > async callback.  Note, as far as I know no one actually cares about off-CPU
> > hardware accelerator support in fsverity anyway...
> 
> The other thing is that verity doesn't benefit from shash at all.
> It appears to be doing kmap's on every single request.

The diff from switching fsverity from ahash to shash clearly demonstrates
otherwise.  Yes, fsverity has to map the pages to pass into shash, but that's a
very minor thing compared to all the complexity of ahash that was saved.  And
fsverity already had to map most of the pages anyway to access them.

- Eric
Eric Biggers June 5, 2024, 7:14 p.m. UTC | #7
On Wed, Jun 05, 2024 at 05:46:27PM +0800, Herbert Xu wrote:
> On Wed, Jun 05, 2024 at 05:22:21PM +0800, Herbert Xu wrote:
> >
> > However, I really dislike the idea of shoehorning this into shash.
> > I know you really like shash, but I think there are some clear
> > benefits to be had by coupling this with ahash.
> 
> If we do this properly, we should be able to immediately use the
> mb code with IPsec.  In the network stack, we already aggregate
> the data prior to IPsec with GSO.  So at the boundary between
> IPsec and the Crypto API, it's dividing chunks of data up to 64K
> into 1500-byte packets and feeding them to crypto one at a time.
> 
> It really should be sending the whole chain of packets to us as
> a unit.
> 
> Once we have a proper mb interface, we can fix that and immediately
> get the benefit of mb hashing.
> 

This would at most apply to AH, not to ESP.  Is AH commonly used these days?
Also even for AH, the IPsec code would need to be significantly restructured to
make use of multibuffer hashing.  See how the segmentation happens in
xfrm_output_gso(), but the ahash calls happen much lower in the stack.

I'm guessing that you've had the AH use case in mind since your earlier
comments.  Given you were originally pushing for this to be supported using the
existing async support in the ahash API (which would have required fewer code
changes on the AH side), but we now agree that is not feasible, maybe it is time
to reconsider whether it would still be worthwhile to make all the changes to
the AH code needed to support this?

Also, even if it would be worthwhile and would use ahash, ahash is almost always
just a wrapper for shash.  So the shash support would be needed anyway.

- Eric
Herbert Xu June 6, 2024, 2 a.m. UTC | #8
On Wed, Jun 05, 2024 at 12:14:10PM -0700, Eric Biggers wrote:
> 
> This would at most apply to AH, not to ESP.  Is AH commonly used these days?

No AH is completely useless.  However, this applies perfectly to
ESP, in conjunction with authenc.  Obviously we would need to add
request linking to authenc (AEAD) as well so that it can pass it
along to sha.

BTW, does any of this interleaving apply to AES? If so we should
explore adding request linking to skcipher as well.

Cheers,
Eric Biggers June 6, 2024, 5:28 a.m. UTC | #9
On Thu, Jun 06, 2024 at 10:00:05AM +0800, Herbert Xu wrote:
> On Wed, Jun 05, 2024 at 12:14:10PM -0700, Eric Biggers wrote:
> > 
> > This would at most apply to AH, not to ESP.  Is AH commonly used these days?
> 
> No AH is completely useless.  However, this applies perfectly to
> ESP, in conjunction with authenc.  Obviously we would need to add
> request linking to authenc (AEAD) as well so that it can pass it
> along to sha.
> 
> BTW, does any of this interleaving apply to AES? If so we should
> explore adding request linking to skcipher as well.
> 

With AES, interleaving would only help with non-parallelizable modes such as CBC
encryption.  Anyone who cares about IPsec performance should of course be using
AES-GCM, which is parallelizable.  Especially since my other patch
https://lore.kernel.org/linux-crypto/20240602222221.176625-2-ebiggers@kernel.org/
is making AES-GCM twice as fast...

With hashing we unfortunately don't have the luxury of there being widely used
and accepted parallelizable algorithms.  In particular, all the SHAs are
serialized.  So that's why interleaving makes sense there.

In any case, it seems that what you're asking for at this point is far beyond
the scope of this patchset.

- Eric
Herbert Xu June 6, 2024, 5:41 a.m. UTC | #10
On Wed, Jun 05, 2024 at 10:28:01PM -0700, Eric Biggers wrote:
>
> With AES, interleaving would only help with non-parallelizable modes such as CBC
> encryption.  Anyone who cares about IPsec performance should of course be using
> AES-GCM, which is parallelizable.  Especially since my other patch
> https://lore.kernel.org/linux-crypto/20240602222221.176625-2-ebiggers@kernel.org/
> is making AES-GCM twice as fast...

Algorithm selection may be limited by peer capability.  For IPsec,
if SHA is being used, then most likely CBC is also being used.

> In any case, it seems that what you're asking for at this point is far beyond
> the scope of this patchset.

I'm more than happy to take this over if you don't wish to extend
it beyond the storage usage cases.  According to the original Intel
sha2-mb submission, this should result in at least a two-fold
speed-up.

Cheers,
Ard Biesheuvel June 6, 2024, 6:58 a.m. UTC | #11
On Thu, 6 Jun 2024 at 07:41, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Wed, Jun 05, 2024 at 10:28:01PM -0700, Eric Biggers wrote:
> >
> > With AES, interleaving would only help with non-parallelizable modes such as CBC
> > encryption.  Anyone who cares about IPsec performance should of course be using
> > AES-GCM, which is parallelizable.  Especially since my other patch
> > https://lore.kernel.org/linux-crypto/20240602222221.176625-2-ebiggers@kernel.org/
> > is making AES-GCM twice as fast...
>
> Algorithm selection may be limited by peer capability.  For IPsec,
> if SHA is being used, then most likely CBC is also being used.
>

IPSec users relying on software crypto and authenc() and caring about
performance seems like a rather niche use case to me.

> > In any case, it seems that what you're asking for at this point is far beyond
> > the scope of this patchset.
>
> I'm more than happy to take this over if you don't wish to extend
> it beyond the storage usage cases.  According to the original Intel
> sha2-mb submission, this should result in at least a two-fold
> speed-up.
>

I'm struggling to follow this debate. Surely, if this functionality
needs to live in ahash, the shash fallbacks need to implement this
parallel scheme too, or ahash would end up just feeding the requests
into shash sequentially, defeating the purpose. It is then up to the
API client to choose between ahash or shash, just as it can today.

So Eric has a pretty strong case for his shash implementation;
kmap_local() is essentially a NOP on architectures that anyone still
cares about (unlike kmap_atomic() which still disables preemption), so
I don't have a problem with the caller relying on that in order to be
able to use shash directly. The whole scatterlist / request alloc
dance is just too tedious and pointless, given that in practice, it
all gets relegated to shash anyway.

But my point is that even if we go with Herbert's proposal for the
ahash, we'll still need something like Eric's code on the shash side.

For the true async accelerator use case, none of this should make any
difference, right? If the caller already tolerates async (but
in-order) completion, implementing this request chaining doesn't buy
it anything. So only when the caller is sync and the implementation is
async, we might be able to do something smart where the caller waits
on a single completion that signals the completion of a set of inputs.
But this is also rather niche, so not worth holding up this effort.

So Herbert, what would the ahash_to_shash plumbing look like for the
ahash API that you are proposing? What changes would it require to
shash, and how much to they deviate from what Eric is suggesting?
Herbert Xu June 6, 2024, 7:34 a.m. UTC | #12
On Thu, Jun 06, 2024 at 08:58:47AM +0200, Ard Biesheuvel wrote:
>
> IPSec users relying on software crypto and authenc() and caring about
> performance seems like a rather niche use case to me.

It's no more niche than fs/verity and dm-integrity.  In fact,
this could potentially be used for all algorithms.  Just the
reduction in the number of function calls may produce enough
of a benefit (this is something I observed when adding GSO,
even without any actual hardware offload, simply aggregating
packets into larger units produced a visible benefit).

> I'm struggling to follow this debate. Surely, if this functionality
> needs to live in ahash, the shash fallbacks need to implement this
> parallel scheme too, or ahash would end up just feeding the requests
> into shash sequentially, defeating the purpose. It is then up to the
> API client to choose between ahash or shash, just as it can today.

I've never suggested it adding it to shash at all.  In fact
that's my primary problem with this interface.  I think it
should be ahash only.  Just like skcipher and aead.

My reasoning is that this should cater mostly to bulk data, i.e.,
multiple pages (e.g., for IPsec we're talking about 64K chunks,
actually that (the 64K limit) is something that we should really
extend, it's not 2014 anymore).  These typically will be more
easily accessed as a number of distinct pages rather than as a
contiguous mapping.

Cheers,
Ard Biesheuvel June 6, 2024, 7:55 a.m. UTC | #13
On Thu, 6 Jun 2024 at 09:34, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, Jun 06, 2024 at 08:58:47AM +0200, Ard Biesheuvel wrote:
> >
> > IPSec users relying on software crypto and authenc() and caring about
> > performance seems like a rather niche use case to me.
>
> It's no more niche than fs/verity and dm-integrity.

fsverity is used by Android, which is the diametrical opposite of
niche when it comes to Linux distros.

> this could potentially be used for all algorithms.  Just the
> reduction in the number of function calls may produce enough
> of a benefit (this is something I observed when adding GSO,
> even without any actual hardware offload, simply aggregating
> packets into larger units produced a visible benefit).
>

Sure. But my point is that anyone who cares enough about IPsec
performance would have stopped using authenc(cbc(aes),hmac(sha2)) long
ago and moved to GCM or even ChaChaPoly. This is not just a matter of
efficiency in the implementation - using a general purpose hash
function such as SHA-256 [twice] rather than GHASH or Poly1305 is just
overkill.

So complicating the performance optimization of something that runs on
every (non-Apple) phone for the benefit of something that is rarely
used in a context where the performance matters seems unreasonable to
me.

> > I'm struggling to follow this debate. Surely, if this functionality
> > needs to live in ahash, the shash fallbacks need to implement this
> > parallel scheme too, or ahash would end up just feeding the requests
> > into shash sequentially, defeating the purpose. It is then up to the
> > API client to choose between ahash or shash, just as it can today.
>
> I've never suggested it adding it to shash at all.  In fact
> that's my primary problem with this interface.  I think it
> should be ahash only.  Just like skcipher and aead.
>

So again, how would that work for ahash falling back to shash. Are you
saying every existing shash implementation should be duplicated into
an ahash so that the multibuffer optimization can be added? shash is a
public interface so we cannot just remove the existing ones and we'll
end up carrying both forever.

> My reasoning is that this should cater mostly to bulk data, i.e.,
> multiple pages (e.g., for IPsec we're talking about 64K chunks,
> actually that (the 64K limit) is something that we should really
> extend, it's not 2014 anymore).  These typically will be more
> easily accessed as a number of distinct pages rather than as a
> contiguous mapping.
>

Sure, but the block I/O world is very different. Forcing it to use an
API modeled after how IPsec might use it seems, again, unreasonable.

So these 64k chunks are made up of 1500 byte frames that can be hashed
in parallel?
Herbert Xu June 6, 2024, 8:08 a.m. UTC | #14
On Thu, Jun 06, 2024 at 09:55:56AM +0200, Ard Biesheuvel wrote:
>
> So again, how would that work for ahash falling back to shash. Are you
> saying every existing shash implementation should be duplicated into
> an ahash so that the multibuffer optimization can be added? shash is a
> public interface so we cannot just remove the existing ones and we'll
> end up carrying both forever.

It should do the same thing for ahash algorithms that do not support
multiple requests.  IOW it should process the requests one by one.

> Sure, but the block I/O world is very different. Forcing it to use an
> API modeled after how IPsec might use it seems, again, unreasonable.

It's not different at all.  You can see that by the proliferation
of kmap calls in fs/verity.  It's a fundamental issue.  You can't
consistently get a large contiguous allocation beyond one page due
to fragmentation.  So large data is always going to be scattered.

BTW, I'm all for elminating the overhead when you already have a
linear address for scattered memory, e.g., through vmalloc.  We
should definitely improve our interface for ahash/skcipher/aead so
that vmalloc addresses (as well as kmalloc virtual addresses by
extension) are supported as first class citizens, and we don't turn
them into SG lists unless it's necessary for DMA.

Cheers,
Ard Biesheuvel June 6, 2024, 8:33 a.m. UTC | #15
On Thu, 6 Jun 2024 at 10:08, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, Jun 06, 2024 at 09:55:56AM +0200, Ard Biesheuvel wrote:
> >
> > So again, how would that work for ahash falling back to shash. Are you
> > saying every existing shash implementation should be duplicated into
> > an ahash so that the multibuffer optimization can be added? shash is a
> > public interface so we cannot just remove the existing ones and we'll
> > end up carrying both forever.
>
> It should do the same thing for ahash algorithms that do not support
> multiple requests.  IOW it should process the requests one by one.
>

That is not what I am asking.

Are you suggesting that, e.g., the arm64 sha2 shash implementation
that is modified by this series should instead expose both an shash as
before, and an ahash built around the same asm code that exposes the
multibuffer capability?

> > Sure, but the block I/O world is very different. Forcing it to use an
> > API modeled after how IPsec might use it seems, again, unreasonable.
>
> It's not different at all.  You can see that by the proliferation
> of kmap calls in fs/verity.  It's a fundamental issue.  You can't
> consistently get a large contiguous allocation beyond one page due
> to fragmentation.  So large data is always going to be scattered.
>

I don't think this is true for many uses of the block layer.

> BTW, I'm all for elminating the overhead when you already have a
> linear address for scattered memory, e.g., through vmalloc.  We
> should definitely improve our interface for ahash/skcipher/aead so
> that vmalloc addresses (as well as kmalloc virtual addresses by
> extension) are supported as first class citizens, and we don't turn
> them into SG lists unless it's necessary for DMA.
>

Yes, this is something I've been pondering for a while. An
ahash/skcipher/aead with CRYPTO_ALG_ASYNC cleared (which would
guarantee that any provided VA would not be referenced after the algo
invocation returns) should be able to consume a request that carries
virtual addresses rather than SG lists. Given that it is up to the
caller to choose between sync and async, it would be in a good
position also to judge whether it wants to use stack/vmalloc
addresses.

I'll have a stab at this.
Herbert Xu June 6, 2024, 9:15 a.m. UTC | #16
On Thu, Jun 06, 2024 at 10:33:15AM +0200, Ard Biesheuvel wrote:
>
> Are you suggesting that, e.g., the arm64 sha2 shash implementation
> that is modified by this series should instead expose both an shash as
> before, and an ahash built around the same asm code that exposes the
> multibuffer capability?

Yes the multi-request handling should be implemented as ahash
only.

Cheers,
Eric Biggers June 10, 2024, 4:42 p.m. UTC | #17
Hi Herbert,

On Thu, Jun 06, 2024 at 05:15:16PM +0800, Herbert Xu wrote:
> On Thu, Jun 06, 2024 at 10:33:15AM +0200, Ard Biesheuvel wrote:
> >
> > Are you suggesting that, e.g., the arm64 sha2 shash implementation
> > that is modified by this series should instead expose both an shash as
> > before, and an ahash built around the same asm code that exposes the
> > multibuffer capability?
> 
> Yes the multi-request handling should be implemented as ahash
> only.
> 

That is not a reasonable way to do it.  It would be much more complex, more
error-prone, and slower than my proposal.  Also your proposal wouldn't actually
bring you much closer to being able to optimize your authenc use case.

First, each algorithm implementation that wants to support your scheme would
need to implement the ahash API alongside shash.  This would be a lot of
duplicated code, and it would need to support all quirks of the ahash API
including scatterlists.

Second, allowing an ahash_request to actually be a list of requests creates an
ambiguity where it will often be unclear when code is supposed to operate on an
ahash_request individually vs. on the whole list.  This is error-prone, as it
invites bugs where a crypto operation is done on only one request of the list.
This is a bad design, especially for a cryptographic API.  We would have
crypto_ahash_*() do some checks to prevent multi-requests from reaching
algorithms that don't support the particular kind of request being made.  But
there will be a lot of cases to consider.

Third, it would be hard to actually implement multibuffer hashing given an
arbitrary list of requests.  For multibuffer hashing to work, the lengths of all
the buffers must be synced up, including the internal buffers in the hash states
as well as every buffer that is produced while walking the scatterlists.  We can
place constraints on what is supported, but those constraints will need to be
clearly specified, and each algorithm will actually need to check for them and
do something sane (error or fallback) when they are not met.  Note that it would
be possible for the messages to get out of sync in the middle of walking the
scatterlists, which would be difficult to handle.  All this would add a lot of
complexity and overhead compared to my proposal, which naturally expresses the
same-length constraints in the API.

Fourth, having the API be ahash-only would also force fsverity to switch back to
the ahash API, which would add complexity and overhead.  The shash API is easier
to use even when working with pages, as the diffstat of commit 8fcd94add6c5
clearly shows.  The ahash API also has more overhead, including in doing the
needed memory allocation, setting up a scatterlist, initializing required but
irrelevant or redundant fields in the ahash_request, and shash_ahash_finup()
running the fully generalized scatterlist walker on the scatterlist just to
finally end up with a pointer which the caller could have provided directly.
All this overhead adds up, even when hashing 4K blocks.  Back when CPU-based
crypto was slow this didn't really matter, but now it's fast and these small
overheads are significant when trying to keep up with storage device speeds
(which are also fast now).  E.g., even disregarding the memory allocation,
hashing a 4K block is about 5% faster with shash than with ahash on x86_64.

Of course, I'd need to support ahash in fsverity anyway if someone were to
actually need support for non-inline hardware offload in fsverity (and if I
decide to accept that despite many hardware drivers being buggy).  But really
the best way to do this would be to support ahash alongside shash, like what I'm
proposing in dm-verity -- perhaps with ahash support limited to the data blocks
only, as that's the most performance critical part.  The ahash code path would
use the async callback to actually properly support offload, which would mean
the code would be substantially different anyway due to the fundamentally
different computational model, especially vs. multibuffer hashing.  A "single"
API, that works for both hardware offload and for the vast majority of users
that simply need very low overhead software crypto, would be nice in theory.
But I'm afraid it's been shown repeatedly that just doesn't work...  The
existence of lib/crypto/, shash, lskcipher, and scomp all reflect this.

I understand that you think the ahash based API would make it easier to add
multibuffer support to "authenc(hmac(sha256),cbc(aes))" for IPsec, which seems
to be a very important use case for you (though it isn't relevant to nearly as
many systems as dm-verity and fsverity are).  Regardless, the reality is that it
would be much more difficult to take advantage of multibuffer crypto in the
IPsec authenc use case than in dm-verity and fsverity.  authenc uses multiple
underlying algorithms, AES-CBC and HMAC-SHA256, that would both have to use
multibuffer crypto in order to see a significant benefit, seeing as even if the
SHA-256 support could be wired up through HMAC-SHA256, encryption would be
bottlenecked on AES-CBC, especially on Intel CPUs.  It also looks like the IPsec
code would need a lot of updates to support multibuffer crypto.

At the same time, an easier way to optimize "authenc(hmac(sha256),cbc(aes))"
would be to add an implementation of it to arch/x86/crypto/ that interleaves the
AES-CBC and SHA-256 and also avoids the overhead associated with the template
based approach (such as all the extra indirect calls).  Of course, that would
require writing assembly, but so would multibuffer crypto.  It seems unlikely
that someone would step in to do all the work to implement a multibuffer
optimization for this algorithm and its use in IPsec, when no one has ever
bothered to optimize the single-buffer case in the first place, which has been
possible all along and would require no API or IPsec changes...

In any case, any potential multi-request support in ahash, skcipher, or aead
should be separate considerations from the simple function in shash that I'm
proposing.  It makes sense to have the shash support regardless.

Ultimately, I need to have dm-verity and fsverity be properly optimized in the
downstreams that are most relevant to me.  If you're not going to allow the
upstream crypto API to provide the needed functionality in a reasonable way,
then I'll need to shift my focus to getting this patchset into downstream
kernels such as Android and Chrome OS instead.

- Eric
Herbert Xu June 11, 2024, 3:21 p.m. UTC | #18
On Mon, Jun 10, 2024 at 09:42:58AM -0700, Eric Biggers wrote:
>
> I understand that you think the ahash based API would make it easier to add
> multibuffer support to "authenc(hmac(sha256),cbc(aes))" for IPsec, which seems
> to be a very important use case for you (though it isn't relevant to nearly as
> many systems as dm-verity and fsverity are).  Regardless, the reality is that it
> would be much more difficult to take advantage of multibuffer crypto in the
> IPsec authenc use case than in dm-verity and fsverity.  authenc uses multiple
> underlying algorithms, AES-CBC and HMAC-SHA256, that would both have to use
> multibuffer crypto in order to see a significant benefit, seeing as even if the
> SHA-256 support could be wired up through HMAC-SHA256, encryption would be
> bottlenecked on AES-CBC, especially on Intel CPUs.  It also looks like the IPsec
> code would need a lot of updates to support multibuffer crypto.

The linked-request thing feeds nicely into networking.  In fact
that's where I got the idea of linking them from.  In networking
a large GSO (currently limited to 64K but theoretically we could
make it unlimited) packet is automatically split up into a linked
list of MTU-sized skb's.

Therefore if we switched to a linked-list API networking could
give us the buffers with minimal changes.

BTW, I found an old Intel paper that claims through their multi-
buffer strategy they were able to make AES-CBC-XCBC beat AES-GCM.
I wonder if we could still replicate this today:

https://github.com/intel/intel-ipsec-mb/wiki/doc/fast-multi-buffer-ipsec-implementations-ia-processors-paper.pdf
 
> Ultimately, I need to have dm-verity and fsverity be properly optimized in the
> downstreams that are most relevant to me.  If you're not going to allow the
> upstream crypto API to provide the needed functionality in a reasonable way,
> then I'll need to shift my focus to getting this patchset into downstream
> kernels such as Android and Chrome OS instead.

I totally understand that this is your priority.  But please give
me some time to see if we can devise something that works for both
scenarios.

Thanks,
Herbert Xu June 11, 2024, 3:39 p.m. UTC | #19
On Tue, Jun 11, 2024 at 11:21:43PM +0800, Herbert Xu wrote:
>
> Therefore if we switched to a linked-list API networking could
> give us the buffers with minimal changes.

BTW, this is not just about parallelising hashing.  Just as one of
the most significant benefits of GSO does not come from hardware
offload, but rather the amortisation of (network) stack overhead.
IOW you're traversing a very deep stack once instead of 40 times
(this is the factor for 64K vs MTU, if we extend beyond 64K (which
we absolute should do) the benefit would increase as well).

The same should apply to the Crypto API.  So even if this was a
purely software solution with no assembly code at all, it may well
improve GCM performance (at least for users able to feed us bulk
data, like networking).

Cheers,
Ard Biesheuvel June 11, 2024, 3:46 p.m. UTC | #20
On Tue, 11 Jun 2024 at 17:21, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Mon, Jun 10, 2024 at 09:42:58AM -0700, Eric Biggers wrote:
> >
> > I understand that you think the ahash based API would make it easier to add
> > multibuffer support to "authenc(hmac(sha256),cbc(aes))" for IPsec, which seems
> > to be a very important use case for you (though it isn't relevant to nearly as
> > many systems as dm-verity and fsverity are).  Regardless, the reality is that it
> > would be much more difficult to take advantage of multibuffer crypto in the
> > IPsec authenc use case than in dm-verity and fsverity.  authenc uses multiple
> > underlying algorithms, AES-CBC and HMAC-SHA256, that would both have to use
> > multibuffer crypto in order to see a significant benefit, seeing as even if the
> > SHA-256 support could be wired up through HMAC-SHA256, encryption would be
> > bottlenecked on AES-CBC, especially on Intel CPUs.  It also looks like the IPsec
> > code would need a lot of updates to support multibuffer crypto.
>
> The linked-request thing feeds nicely into networking.  In fact
> that's where I got the idea of linking them from.  In networking
> a large GSO (currently limited to 64K but theoretically we could
> make it unlimited) packet is automatically split up into a linked
> list of MTU-sized skb's.
>
> Therefore if we switched to a linked-list API networking could
> give us the buffers with minimal changes.
>
> BTW, I found an old Intel paper that claims through their multi-
> buffer strategy they were able to make AES-CBC-XCBC beat AES-GCM.
> I wonder if we could still replicate this today:
>
> https://github.com/intel/intel-ipsec-mb/wiki/doc/fast-multi-buffer-ipsec-implementations-ia-processors-paper.pdf
>

This looks like the whitepaper that describes the buggy multibuffer
code that we ripped out.

> > Ultimately, I need to have dm-verity and fsverity be properly optimized in the
> > downstreams that are most relevant to me.  If you're not going to allow the
> > upstream crypto API to provide the needed functionality in a reasonable way,
> > then I'll need to shift my focus to getting this patchset into downstream
> > kernels such as Android and Chrome OS instead.
>
> I totally understand that this is your priority.  But please give
> me some time to see if we can devise something that works for both
> scenarios.
>

The issue here is that the CPU based multibuffer approach has rather
tight constraints in terms of input length and the shared prefix, and
so designing a more generic API based on ahash doesn't help at all.
The intel multibuffer code went off into the weeds entirely attempting
to apply this parallel scheme to arbitrary combinations of inputs, so
this is something we know we should avoid.
Herbert Xu June 11, 2024, 3:51 p.m. UTC | #21
On Tue, Jun 11, 2024 at 05:46:01PM +0200, Ard Biesheuvel wrote:
>
> The issue here is that the CPU based multibuffer approach has rather
> tight constraints in terms of input length and the shared prefix, and
> so designing a more generic API based on ahash doesn't help at all.
> The intel multibuffer code went off into the weeds entirely attempting
> to apply this parallel scheme to arbitrary combinations of inputs, so
> this is something we know we should avoid.

The sha-mb approach failed because it failed to aggregate the data
properly.  By driving this from the data sink, it was doomed to fail.

The correct way to aggregate data is to do it at the source.  The
user (of the Crypto API) knows exactlty how much data they want to
hash and how it's structured.  They should be supplying that info
to the API so it can use multi-buffer where applicable.  Even where
multi-buffer isn't available, they would at least benefit from making
a single indirect call into the Crypto stack instead of N calls.
When N is large (which is almost always the case for TCP) this
produces a non-trivial saving.

Sure I understand that you guys are more than happy with N=2 but
please let me at least try this out and see if we could make this
work for a large value of N.

Cheers,
Eric Biggers June 11, 2024, 8:18 p.m. UTC | #22
On Tue, Jun 11, 2024 at 11:21:43PM +0800, Herbert Xu wrote:
> 
> BTW, I found an old Intel paper that claims through their multi-
> buffer strategy they were able to make AES-CBC-XCBC beat AES-GCM.
> I wonder if we could still replicate this today:
> 
> https://github.com/intel/intel-ipsec-mb/wiki/doc/fast-multi-buffer-ipsec-implementations-ia-processors-paper.pdf

No, not even close.  Even assuming that the lack of parallelizability in AES-CBC
and AES-XCBC can be entirely compensated for via multibuffer crypto (which
really it can't -- consider single packets, for example), doing AES twice is
much more expensive than doing AES and GHASH.  GHASH is a universal hash
function, and computing a universal hash function is inherently cheaper than
computing a cryptographic hash function.  But also modern Intel CPUs have very
fast carryless multiplication, and it uses a different execution port from what
AES uses.  So the overhead of AES + GHASH over AES alone is very small.  By
doing AES twice, you'd be entirely bottlenecked by the ports that can execute
the AES instructions, while the other ports go nearly unused.  So it would
probably be approaching twice as slow as AES-GCM.

Westmere (2010) through Ivy Bridge (2012) are the only Intel CPUs where
multibuffer AES-CBC-XCBC could plausibly be faster than AES-GCM (given a
sufficiently large number of messages at once), due to the very slow pclmulqdq
instruction on those CPUs.  This is long since fixed, as pclmulqdq became much
faster in Haswell (2013), and faster still in Broadwell.  This is exactly what
that Intel paper shows; they show AES-GCM becoming fastest in "Gen 4", i.e.
Haswell.  The paper is from 2012, so of course they don't show anything after
that.  But AES-GCM has only pulled ahead even more since then.

In theory something like AES-CBC + SHA-256 could be slightly more competitive
than AES-CBC + AES-XCBC.  But it would still be worse than simply doing AES-GCM
-- which again, doesn't need multibuffer, and my recent patches have already
fully optimized for recent x86_64 CPUs.

- Eric
Eric Biggers June 11, 2024, 8:32 p.m. UTC | #23
On Tue, Jun 11, 2024 at 11:39:08PM +0800, Herbert Xu wrote:
> On Tue, Jun 11, 2024 at 11:21:43PM +0800, Herbert Xu wrote:
> >
> > Therefore if we switched to a linked-list API networking could
> > give us the buffers with minimal changes.
> 
> BTW, this is not just about parallelising hashing.  Just as one of
> the most significant benefits of GSO does not come from hardware
> offload, but rather the amortisation of (network) stack overhead.
> IOW you're traversing a very deep stack once instead of 40 times
> (this is the factor for 64K vs MTU, if we extend beyond 64K (which
> we absolute should do) the benefit would increase as well).
> 
> The same should apply to the Crypto API.  So even if this was a
> purely software solution with no assembly code at all, it may well
> improve GCM performance (at least for users able to feed us bulk
> data, like networking).
> 

At best this would save an indirect call per message, if the underlying
algorithm explicitly added support for it and the user of the API migrated to
the multi-request model.  This alone doesn't seem worth the effort of migrating
to multi-request, especially considering the many other already-possible
optimizations that would not require API changes or migrating users to
multi-request.  The x86_64 AES-GCM is pretty well optimized now after my recent
patches, but there's still an indirect call associated with the use of the SIMD
helper which could be eliminated, saving one per message (already as much as we
could hope to get from multi-request).  authenc on the other hand is almost
totally unoptimized, as I mentioned before; it makes little sense to talk about
any sort of multi-request optimization for it at this point.

- Eric
diff mbox series

Patch

diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index b3506f56e180..7535c9d9b516 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -27,10 +27,15 @@  struct fsverity_hash_alg {
 	/*
 	 * The HASH_ALGO_* constant for this algorithm.  This is different from
 	 * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme.
 	 */
 	enum hash_algo algo_id;
+	/*
+	 * The maximum supported interleaving factor for multibuffer hashing, or
+	 * 1 if the algorithm doesn't support multibuffer hashing
+	 */
+	int mb_max_msgs;
 };
 
 /* Merkle tree parameters: hash algorithm, initial hash state, and topology */
 struct merkle_tree_params {
 	const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
@@ -150,8 +155,10 @@  static inline void fsverity_init_signature(void)
 }
 #endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
 
 /* verify.c */
 
+#define FS_VERITY_MAX_PENDING_DATA_BLOCKS	2
+
 void __init fsverity_init_workqueue(void);
 
 #endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 6b08b1d9a7d7..f24d7c295455 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -82,12 +82,16 @@  const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
 	if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm)))
 		goto err_free_tfm;
 	if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm)))
 		goto err_free_tfm;
 
-	pr_info("%s using implementation \"%s\"\n",
-		alg->name, crypto_shash_driver_name(tfm));
+	alg->mb_max_msgs = min(crypto_shash_mb_max_msgs(tfm),
+			       FS_VERITY_MAX_PENDING_DATA_BLOCKS);
+
+	pr_info("%s using implementation \"%s\"%s\n",
+		alg->name, crypto_shash_driver_name(tfm),
+		alg->mb_max_msgs > 1 ? " (multibuffer)" : "");
 
 	/* pairs with smp_load_acquire() above */
 	smp_store_release(&alg->tfm, tfm);
 	goto out_unlock;
 
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 4fcad0825a12..e9fb299ffa77 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -8,10 +8,32 @@ 
 #include "fsverity_private.h"
 
 #include <crypto/hash.h>
 #include <linux/bio.h>
 
+struct fsverity_pending_block {
+	const void *data;
+	u64 pos;
+	u8 hash[FS_VERITY_MAX_DIGEST_SIZE];
+};
+
+struct fsverity_verification_context {
+	struct inode *inode;
+	struct fsverity_info *vi;
+	unsigned long max_ra_pages;
+
+	/*
+	 * This is the queue of data blocks that are pending verification.  We
+	 * allow multiple blocks to be queued up in order to support hash
+	 * algorithm implementations that provide support for multibuffer
+	 * hashing, i.e. interleaving the hashing of multiple messages.  On many
+	 * CPUs this improves performance significantly.
+	 */
+	int num_pending;
+	struct fsverity_pending_block pending_blocks[FS_VERITY_MAX_PENDING_DATA_BLOCKS];
+};
+
 static struct workqueue_struct *fsverity_read_workqueue;
 
 /*
  * Returns true if the hash block with index @hblock_idx in the tree, located in
  * @hpage, has already been verified.
@@ -77,23 +99,25 @@  static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
 	SetPageChecked(hpage);
 	return false;
 }
 
 /*
- * Verify a single data block against the file's Merkle tree.
+ * Verify the hash of a single data block against the file's Merkle tree.
  *
  * In principle, we need to verify the entire path to the root node.  However,
  * for efficiency the filesystem may cache the hash blocks.  Therefore we need
  * only ascend the tree until an already-verified hash block is seen, and then
  * verify the path to that block.
  *
  * Return: %true if the data block is valid, else %false.
  */
 static bool
 verify_data_block(struct inode *inode, struct fsverity_info *vi,
-		  const void *data, u64 data_pos, unsigned long max_ra_pages)
+		  const struct fsverity_pending_block *dblock,
+		  unsigned long max_ra_pages)
 {
+	const u64 data_pos = dblock->pos;
 	const struct merkle_tree_params *params = &vi->tree_params;
 	const unsigned int hsize = params->digest_size;
 	int level;
 	u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE];
 	const u8 *want_hash;
@@ -113,23 +137,27 @@  verify_data_block(struct inode *inode, struct fsverity_info *vi,
 	 * The index of the previous level's block within that level; also the
 	 * index of that block's hash within the current level.
 	 */
 	u64 hidx = data_pos >> params->log_blocksize;
 
-	/* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */
-	BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX);
+	/*
+	 * Up to FS_VERITY_MAX_PENDING_DATA_BLOCKS + FS_VERITY_MAX_LEVELS pages
+	 * may be mapped at once.
+	 */
+	BUILD_BUG_ON(FS_VERITY_MAX_PENDING_DATA_BLOCKS +
+		     FS_VERITY_MAX_LEVELS > KM_MAX_IDX);
 
 	if (unlikely(data_pos >= inode->i_size)) {
 		/*
 		 * This can happen in the data page spanning EOF when the Merkle
 		 * tree block size is less than the page size.  The Merkle tree
 		 * doesn't cover data blocks fully past EOF.  But the entire
 		 * page spanning EOF can be visible to userspace via a mmap, and
 		 * any part past EOF should be all zeroes.  Therefore, we need
 		 * to verify that any data blocks fully past EOF are all zeroes.
 		 */
-		if (memchr_inv(data, 0, params->block_size)) {
+		if (memchr_inv(dblock->data, 0, params->block_size)) {
 			fsverity_err(inode,
 				     "FILE CORRUPTED!  Data past EOF is not zeroed");
 			return false;
 		}
 		return true;
@@ -219,55 +247,122 @@  verify_data_block(struct inode *inode, struct fsverity_info *vi,
 		want_hash = _want_hash;
 		kunmap_local(haddr);
 		put_page(hpage);
 	}
 
-	/* Finally, verify the data block. */
-	if (fsverity_hash_block(params, inode, data, real_hash) != 0)
-		goto error;
-	if (memcmp(want_hash, real_hash, hsize) != 0)
+	/* Finally, verify the hash of the data block. */
+	if (memcmp(want_hash, dblock->hash, hsize) != 0)
 		goto corrupted;
 	return true;
 
 corrupted:
 	fsverity_err(inode,
 		     "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
 		     data_pos, level - 1,
 		     params->hash_alg->name, hsize, want_hash,
-		     params->hash_alg->name, hsize, real_hash);
+		     params->hash_alg->name, hsize,
+		     level == 0 ? dblock->hash : real_hash);
 error:
 	for (; level > 0; level--) {
 		kunmap_local(hblocks[level - 1].addr);
 		put_page(hblocks[level - 1].page);
 	}
 	return false;
 }
 
+static inline void
+fsverity_init_verification_context(struct fsverity_verification_context *ctx,
+				   struct inode *inode,
+				   unsigned long max_ra_pages)
+{
+	ctx->inode = inode;
+	ctx->vi = inode->i_verity_info;
+	ctx->max_ra_pages = max_ra_pages;
+	ctx->num_pending = 0;
+}
+
+static inline void
+fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx)
+{
+	int i;
+
+	for (i = ctx->num_pending - 1; i >= 0; i--) {
+		kunmap_local(ctx->pending_blocks[i].data);
+		ctx->pending_blocks[i].data = NULL;
+	}
+	ctx->num_pending = 0;
+}
+
 static bool
-verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
-		   unsigned long max_ra_pages)
+fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx)
 {
-	struct inode *inode = data_folio->mapping->host;
-	struct fsverity_info *vi = inode->i_verity_info;
-	const unsigned int block_size = vi->tree_params.block_size;
-	u64 pos = (u64)data_folio->index << PAGE_SHIFT;
+	struct inode *inode = ctx->inode;
+	struct fsverity_info *vi = ctx->vi;
+	const struct merkle_tree_params *params = &vi->tree_params;
+	SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm);
+	const u8 *data[FS_VERITY_MAX_PENDING_DATA_BLOCKS];
+	u8 *outs[FS_VERITY_MAX_PENDING_DATA_BLOCKS];
+	int i;
+	int err;
+
+	if (ctx->num_pending == 0)
+		return true;
+
+	for (i = 0; i < ctx->num_pending; i++) {
+		data[i] = ctx->pending_blocks[i].data;
+		outs[i] = ctx->pending_blocks[i].hash;
+	}
+
+	desc->tfm = params->hash_alg->tfm;
+	if (params->hashstate)
+		err = crypto_shash_import(desc, params->hashstate);
+	else
+		err = crypto_shash_init(desc);
+	if (err) {
+		fsverity_err(inode, "Error %d importing hash state", err);
+		return false;
+	}
+	err = crypto_shash_finup_mb(desc, data, params->block_size, outs,
+				    ctx->num_pending);
+	if (err) {
+		fsverity_err(inode, "Error %d computing block hashes", err);
+		return false;
+	}
+
+	for (i = 0; i < ctx->num_pending; i++) {
+		if (!verify_data_block(inode, vi, &ctx->pending_blocks[i],
+				       ctx->max_ra_pages))
+			return false;
+	}
+
+	fsverity_clear_pending_blocks(ctx);
+	return true;
+}
+
+static bool
+fsverity_add_data_blocks(struct fsverity_verification_context *ctx,
+			 struct folio *data_folio, size_t len, size_t offset)
+{
+	struct fsverity_info *vi = ctx->vi;
+	const struct merkle_tree_params *params = &vi->tree_params;
+	const unsigned int block_size = params->block_size;
+	const int mb_max_msgs = params->hash_alg->mb_max_msgs;
+	u64 pos = ((u64)data_folio->index << PAGE_SHIFT) + offset;
 
 	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size)))
 		return false;
 	if (WARN_ON_ONCE(!folio_test_locked(data_folio) ||
 			 folio_test_uptodate(data_folio)))
 		return false;
 	do {
-		void *data;
-		bool valid;
-
-		data = kmap_local_folio(data_folio, offset);
-		valid = verify_data_block(inode, vi, data, pos + offset,
-					  max_ra_pages);
-		kunmap_local(data);
-		if (!valid)
+		ctx->pending_blocks[ctx->num_pending].data =
+			kmap_local_folio(data_folio, offset);
+		ctx->pending_blocks[ctx->num_pending].pos = pos;
+		if (++ctx->num_pending == mb_max_msgs &&
+		    !fsverity_verify_pending_blocks(ctx))
 			return false;
+		pos += block_size;
 		offset += block_size;
 		len -= block_size;
 	} while (len);
 	return true;
 }
@@ -284,11 +379,19 @@  verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
  *
  * Return: %true if the data is valid, else %false.
  */
 bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
 {
-	return verify_data_blocks(folio, len, offset, 0);
+	struct fsverity_verification_context ctx;
+
+	fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
+
+	if (fsverity_add_data_blocks(&ctx, folio, len, offset) &&
+	    fsverity_verify_pending_blocks(&ctx))
+		return true;
+	fsverity_clear_pending_blocks(&ctx);
+	return false;
 }
 EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
 
 #ifdef CONFIG_BLOCK
 /**
@@ -305,10 +408,12 @@  EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
  * filesystems) must instead call fsverity_verify_page() directly on each page.
  * All filesystems must also call fsverity_verify_page() on holes.
  */
 void fsverity_verify_bio(struct bio *bio)
 {
+	struct inode *inode = bio_first_folio_all(bio)->mapping->host;
+	struct fsverity_verification_context ctx;
 	struct folio_iter fi;
 	unsigned long max_ra_pages = 0;
 
 	if (bio->bi_opf & REQ_RAHEAD) {
 		/*
@@ -321,17 +426,25 @@  void fsverity_verify_bio(struct bio *bio)
 		 * reduces the number of I/O requests made to the Merkle tree.
 		 */
 		max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
 	}
 
+	fsverity_init_verification_context(&ctx, inode, max_ra_pages);
+
 	bio_for_each_folio_all(fi, bio) {
-		if (!verify_data_blocks(fi.folio, fi.length, fi.offset,
-					max_ra_pages)) {
-			bio->bi_status = BLK_STS_IOERR;
-			break;
-		}
+		if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
+					      fi.offset))
+			goto ioerr;
 	}
+
+	if (!fsverity_verify_pending_blocks(&ctx))
+		goto ioerr;
+	return;
+
+ioerr:
+	fsverity_clear_pending_blocks(&ctx);
+	bio->bi_status = BLK_STS_IOERR;
 }
 EXPORT_SYMBOL_GPL(fsverity_verify_bio);
 #endif /* CONFIG_BLOCK */
 
 /**