Message ID | 20240603183731.108986-7-ebiggers@kernel.org (mailing list archive) |
---|---|
State | Superseded, archived |
Delegated to: | Mike Snitzer |
Headers | show |
Series | Optimize dm-verity and fsverity using multibuffer hashing | expand |
On Mon, Jun 03, 2024 at 11:37:29AM -0700, Eric Biggers wrote: > > + for (i = 0; i < ctx->num_pending; i++) { > + data[i] = ctx->pending_blocks[i].data; > + outs[i] = ctx->pending_blocks[i].hash; > + } > + > + desc->tfm = params->hash_alg->tfm; > + if (params->hashstate) > + err = crypto_shash_import(desc, params->hashstate); > + else > + err = crypto_shash_init(desc); > + if (err) { > + fsverity_err(inode, "Error %d importing hash state", err); > + return false; > + } > + err = crypto_shash_finup_mb(desc, data, params->block_size, outs, > + ctx->num_pending); > + if (err) { > + fsverity_err(inode, "Error %d computing block hashes", err); > + return false; > + } So with ahash operating in synchronous mode (callback == NULL), this would look like: struct ahash_request *reqs[FS_VERITY_MAX_PENDING_DATA_BLOCKS]; for (i = 0; i < ctx->num_pending; i++) { reqs[i] = fsverity_alloc_hash_request(); if (!req) { free all reqs; return false; } if (params->hashstate) err = crypto_ahash_import(&reqs[i], params->hashstate); else err = crypto_ahash_init(&reqs[i]); if (err) { fsverity_err(inode, "Error %d importing hash state", err); free all reqs; return false; } } for (i = 0; i < ctx->num_pending; i++) { unsigned more; if (params->hashstate) err = crypto_ahash_import(req, params->hashstate); else err = crypto_ahash_init(req); if (err) { fsverity_err(inode, "Error %d importing hash state", err); free all requests; return false; } more = 0; if (i + 1 < ctx->num_pending) more = CRYPTO_TFM_REQ_MORE; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | more, NULL, NULL); ahash_request_set_crypt(req, ctx->pending_blocks[i].sg, ctx->pending_blocks[i].hash, params->block_size); err = crypto_ahash_finup(req); if (err) { fsverity_err(inode, "Error %d computing block hashes", err); free all requests; return false; } } You're hiding some of the complexity by not allocating memory explicitly for each hash state. 
This might fit on the stack for two requests, but eventually you will have to allocate memory. With the ahash API, the allocation is explicit. Cheers,
On Tue, Jun 04, 2024 at 05:37:36PM +0800, Herbert Xu wrote: > On Mon, Jun 03, 2024 at 11:37:29AM -0700, Eric Biggers wrote: > > > > + for (i = 0; i < ctx->num_pending; i++) { > > + data[i] = ctx->pending_blocks[i].data; > > + outs[i] = ctx->pending_blocks[i].hash; > > + } > > + > > + desc->tfm = params->hash_alg->tfm; > > + if (params->hashstate) > > + err = crypto_shash_import(desc, params->hashstate); > > + else > > + err = crypto_shash_init(desc); > > + if (err) { > > + fsverity_err(inode, "Error %d importing hash state", err); > > + return false; > > + } > > + err = crypto_shash_finup_mb(desc, data, params->block_size, outs, > > + ctx->num_pending); > > + if (err) { > > + fsverity_err(inode, "Error %d computing block hashes", err); > > + return false; > > + } > > So with ahash operating in synchronous mode (callback == NULL), this > would look like: > > struct ahash_request *reqs[FS_VERITY_MAX_PENDING_DATA_BLOCKS]; > > for (i = 0; i < ctx->num_pending; i++) { > reqs[i] = fsverity_alloc_hash_request(); > if (!req) { > free all reqs; > return false; > } > > if (params->hashstate) > err = crypto_ahash_import(&reqs[i], params->hashstate); > else > err = crypto_ahash_init(&reqs[i]); > > if (err) { > fsverity_err(inode, "Error %d importing hash state", err); > free all reqs; > return false; > } > } > > for (i = 0; i < ctx->num_pending; i++) { > unsigned more; > > if (params->hashstate) > err = crypto_ahash_import(req, params->hashstate); > else > err = crypto_ahash_init(req); > > if (err) { > fsverity_err(inode, "Error %d importing hash state", err); > free all requests; > return false; > } > > more = 0; > if (i + 1 < ctx->num_pending) > more = CRYPTO_TFM_REQ_MORE; > ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | more, > NULL, NULL); > ahash_request_set_crypt(req, ctx->pending_blocks[i].sg, > ctx->pending_blocks[i].hash, > params->block_size); > > err = crypto_ahash_finup(req); > if (err) { > fsverity_err(inode, "Error %d computing block hashes", err); > 
free all requests; > return false; > } > } > > You're hiding some of the complexity by not allocating memory > explicitly for each hash state. This might fit on the stack > for two requests, but eventually you will have to allocate memory. > > With the ahash API, the allocation is explicit. > This doesn't make any sense, though. First, the requests need to be enqueued for the task, but crypto_ahash_finup() would only have the ability to enqueue it in a queue associated with the tfm, which is shared by many tasks. So it can't actually work unless the tfm maintained a separate queue for each task, which would be really complex. Second, it adds a memory allocation per block which is very undesirable. You claim that it's needed anyway, but actually it's not; with my API there is only one initial hash state regardless of how high the interleaving factor is. In fact, if multiple initial states were allowed, multibuffer hashing would become much more complex because the underlying algorithm would need to validate that these different states are synced up. My proposal is much simpler and avoids all this unnecessary overhead. Really the only reason to even consider ahash at all would be to try to support software hashing and off-CPU hardware accelerators using the "same" code. However, your proposal would not achieve that either, as it would not use the async callback. Note, as far as I know no one actually cares about off-CPU hardware accelerator support in fsverity anyway... - Eric
On Tue, Jun 04, 2024 at 11:42:20AM -0700, Eric Biggers wrote: > > This doesn't make any sense, though. First, the requests need to be enqueued > for the task, but crypto_ahash_finup() would only have the ability to enqueue it > in a queue associated with the tfm, which is shared by many tasks. So it can't OK I screwed up that one. But that's not hard to fix. We could simply add request chaining: ahash_request_chain(req1, req2); ahash_request_chain(req2, req3); ... ahash_request_chain(reqn1, reqn); err = crypto_ahash_finup(req1); > actually work unless the tfm maintained a separate queue for each task, which > would be really complex. Second, it adds a memory allocation per block which is > very undesirable. You claim that it's needed anyway, but actually it's not; > with my API there is only one initial hash state regardless of how high the > interleaving factor is. In fact, if multiple initial states were allowed, Sure you don't need it for two interleaved requests. But as you scale up to 16 and beyond, surely at some point you're going to want to move the hash states off the stack. > multibuffer hashing would become much more complex because the underlying > algorithm would need to validate that these different states are synced up. My > proposal is much simpler and avoids all this unnecessary overhead. We could simply state that these chained requests must be on block boundaries, similar to how we handle block ciphers. This is not a big deal. > Really the only reason to even consider ahash at all would be try to support > software hashing and off-CPU hardware accelerators using the "same" code. > However, your proposal would not achieve that either, as it would not use the > async callback. Note, as far as I know no one actually cares about off-CPU > hardware accelerator support in fsverity anyway... The other thing is that verity doesn't benefit from shash at all. It appears to be doing kmap's on every single request. Cheers,
Hi Eric: I really appreciate your suggestion of having the user driving multi-request hashing as opposed to the bodge that we had before. I think that is the perfect way of doing this, similar to how we handled TCP segmentation with TSO. However, I really dislike the idea of shoehorning this into shash. I know you really like shash, but I think there are some clear benefits to be had by coupling this with ahash. Cheers,
On Wed, Jun 05, 2024 at 05:22:21PM +0800, Herbert Xu wrote: > > However, I really dislike the idea of shoehorning this into shash. > I know you really like shash, but I think there are some clear > benefits to be had by coupling this with ahash. If we do this properly, we should be able to immediately use the mb code with IPsec. In the network stack, we already aggregate the data prior to IPsec with GSO. So at the boundary between IPsec and the Crypto API, it's dividing chunks of data up to 64K into 1500-byte packets and feeding them to crypto one at a time. It really should be sending the whole chain of packets to us as a unit. Once we have a proper mb interface, we can fix that and immediately get the benefit of mb hashing. Cheers,
On Wed, Jun 05, 2024 at 05:19:01PM +0800, Herbert Xu wrote: > On Tue, Jun 04, 2024 at 11:42:20AM -0700, Eric Biggers wrote: > > > > This doesn't make any sense, though. First, the requests need to be enqueued > > for the task, but crypto_ahash_finup() would only have the ability to enqueue it > > in a queue associated with the tfm, which is shared by many tasks. So it can't > > OK I screwed up that one. But that's not hard to fix. We could > simply add request chaining: > > ahash_request_chain(req1, req2); > ahash_request_chain(req2, req3); > ... > ahash_request_chain(reqn1, reqn); > > err = crypto_ahash_finup(req1); So after completely changing several times your proposal is getting a bit closer to mine, but it still uses a very clumsy API based around ahash that would be much harder to use and implement correctly. It also says nothing about what the API would look like on the shash side, which would be needed anyway because ahash is almost always just a pointless wrapper for shash. Is there a different API that you are asking for on the shash side? If so, what? > > actually work unless the tfm maintained a separate queue for each task, which > > would be really complex. Second, it adds a memory allocation per block which is > > very undesirable. You claim that it's needed anyway, but actually it's not; > > with my API there is only one initial hash state regardless of how high the > > interleaving factor is. In fact, if multiple initial states were allowed, > > Sure you don't need it for two interleaved requests. But as you > scale up to 16 and beyond, surely at some point you're going to > want to move the hash states off the stack. To reiterate, with my proposal there is only one state in memory. It's a simple API that can't be misused by providing inconsistent properties in the requests. Yes, separate states would be needed if we were to support arbitrary updates, but why add all that complexity before it's actually needed? 
Also, "16 and beyond" is highly unlikely to be useful for kernel use cases. > > multibuffer hashing would become much more complex because the underlying > > algorithm would need to validate that these different states are synced up. My > > proposal is much simpler and avoids all this unnecessary overhead. > > We could simply state that these chained requests must be on block > boundaries, similar to how we handle block ciphers. This is not a > big deal. ... which would make it useless for most dm-verity users, as dm-verity uses a 32-byte salt with SHA-256 (which has a 64-byte block size) by default. > > > Really the only reason to even consider ahash at all would be try to support > > software hashing and off-CPU hardware accelerators using the "same" code. > > However, your proposal would not achieve that either, as it would not use the > > async callback. Note, as far as I know no one actually cares about off-CPU > > hardware accelerator support in fsverity anyway... > > The other thing is that verity doesn't benefit from shash at all. > It appears to be doing kmap's on every single request. The diff from switching fsverity from ahash to shash clearly demonstrates otherwise. Yes, fsverity has to map the pages to pass into shash, but that's a very minor thing compared to all the complexity of ahash that was saved. And fsverity already had to map most of the pages anyway to access them. - Eric
On Wed, Jun 05, 2024 at 05:46:27PM +0800, Herbert Xu wrote: > On Wed, Jun 05, 2024 at 05:22:21PM +0800, Herbert Xu wrote: > > > > However, I really dislike the idea of shoehorning this into shash. > > I know you really like shash, but I think there are some clear > > benefits to be had by coupling this with ahash. > > If we do this properly, we should be able to immediately use the > mb code with IPsec. In the network stack, we already aggregate > the data prior to IPsec with GSO. So at the boundary between > IPsec and the Crypto API, it's dividing chunks of data up to 64K > into 1500-byte packets and feeding them to crypto one at a time. > > It really should be sending the whole chain of packets to us as > a unit. > > Once we have a proper mb interface, we can fix that and immediately > get the benefit of mb hashing. > This would at most apply to AH, not to ESP. Is AH commonly used these days? Also even for AH, the IPsec code would need to be significantly restructured to make use of multibuffer hashing. See how the segmentation happens in xfrm_output_gso(), but the ahash calls happen much lower in the stack. I'm guessing that you've had the AH use case in mind since your earlier comments. Given you were originally pushing for this to be supported using the existing async support in the ahash API (which would have required fewer code changes on the AH side), but we now agree that is not feasible, maybe it is time to reconsider whether it would still be worthwhile to make all the changes to the AH code needed to support this? Also, even if it would be worthwhile and would use ahash, ahash is almost always just a wrapper for shash. So the shash support would be needed anyway. - Eric
On Wed, Jun 05, 2024 at 12:14:10PM -0700, Eric Biggers wrote: > > This would at most apply to AH, not to ESP. Is AH commonly used these days? No AH is completely useless. However, this applies perfectly to ESP, in conjunction with authenc. Obviously we would need to add request linking to authenc (AEAD) as well so that it can pass it along to sha. BTW, does any of this interleaving apply to AES? If so we should explore adding request linking to skcipher as well. Cheers,
On Thu, Jun 06, 2024 at 10:00:05AM +0800, Herbert Xu wrote: > On Wed, Jun 05, 2024 at 12:14:10PM -0700, Eric Biggers wrote: > > > > This would at most apply to AH, not to ESP. Is AH commonly used these days? > > No AH is completely useless. However, this applies perfectly to > ESP, in conjunction with authenc. Obviously we would need to add > request linking to authenc (AEAD) as well so that it can pass it > along to sha. > > BTW, does any of this interleaving apply to AES? If so we should > explore adding request linking to skcipher as well. > With AES, interleaving would only help with non-parallelizable modes such as CBC encryption. Anyone who cares about IPsec performance should of course be using AES-GCM, which is parallelizable. Especially since my other patch https://lore.kernel.org/linux-crypto/20240602222221.176625-2-ebiggers@kernel.org/ is making AES-GCM twice as fast... With hashing we unfortunately don't have the luxury of there being widely used and accepted parallelizable algorithms. In particular, all the SHAs are serialized. So that's why interleaving makes sense there. In any case, it seems that what you're asking for at this point is far beyond the scope of this patchset. - Eric
On Wed, Jun 05, 2024 at 10:28:01PM -0700, Eric Biggers wrote: > > With AES, interleaving would only help with non-parallelizable modes such as CBC > encryption. Anyone who cares about IPsec performance should of course be using > AES-GCM, which is parallelizable. Especially since my other patch > https://lore.kernel.org/linux-crypto/20240602222221.176625-2-ebiggers@kernel.org/ > is making AES-GCM twice as fast... Algorithm selection may be limited by peer capability. For IPsec, if SHA is being used, then most likely CBC is also being used. > In any case, it seems that what you're asking for at this point is far beyond > the scope of this patchset. I'm more than happy to take this over if you don't wish to extend it beyond the storage use cases. According to the original Intel sha2-mb submission, this should result in at least a two-fold speed-up. Cheers,
On Thu, 6 Jun 2024 at 07:41, Herbert Xu <herbert@gondor.apana.org.au> wrote: > > On Wed, Jun 05, 2024 at 10:28:01PM -0700, Eric Biggers wrote: > > > > With AES, interleaving would only help with non-parallelizable modes such as CBC > > encryption. Anyone who cares about IPsec performance should of course be using > > AES-GCM, which is parallelizable. Especially since my other patch > > https://lore.kernel.org/linux-crypto/20240602222221.176625-2-ebiggers@kernel.org/ > > is making AES-GCM twice as fast... > > Algorithm selection may be limited by peer capability. For IPsec, > if SHA is being used, then most likely CBC is also being used. > IPSec users relying on software crypto and authenc() and caring about performance seems like a rather niche use case to me. > > In any case, it seems that what you're asking for at this point is far beyond > > the scope of this patchset. > > I'm more than happy to take this over if you don't wish to extend > it beyond the storage usage cases. According to the original Intel > sha2-mb submission, this should result in at least a two-fold > speed-up. > I'm struggling to follow this debate. Surely, if this functionality needs to live in ahash, the shash fallbacks need to implement this parallel scheme too, or ahash would end up just feeding the requests into shash sequentially, defeating the purpose. It is then up to the API client to choose between ahash or shash, just as it can today. So Eric has a pretty strong case for his shash implementation; kmap_local() is essentially a NOP on architectures that anyone still cares about (unlike kmap_atomic() which still disables preemption), so I don't have a problem with the caller relying on that in order to be able to use shash directly. The whole scatterlist / request alloc dance is just too tedious and pointless, given that in practice, it all gets relegated to shash anyway. 
But my point is that even if we go with Herbert's proposal for the ahash, we'll still need something like Eric's code on the shash side. For the true async accelerator use case, none of this should make any difference, right? If the caller already tolerates async (but in-order) completion, implementing this request chaining doesn't buy it anything. So only when the caller is sync and the implementation is async, we might be able to do something smart where the caller waits on a single completion that signals the completion of a set of inputs. But this is also rather niche, so not worth holding up this effort. So Herbert, what would the ahash_to_shash plumbing look like for the ahash API that you are proposing? What changes would it require to shash, and how much to they deviate from what Eric is suggesting?
On Thu, Jun 06, 2024 at 08:58:47AM +0200, Ard Biesheuvel wrote: > > IPSec users relying on software crypto and authenc() and caring about > performance seems like a rather niche use case to me. It's no more niche than fs/verity and dm-integrity. In fact, this could potentially be used for all algorithms. Just the reduction in the number of function calls may produce enough of a benefit (this is something I observed when adding GSO, even without any actual hardware offload, simply aggregating packets into larger units produced a visible benefit). > I'm struggling to follow this debate. Surely, if this functionality > needs to live in ahash, the shash fallbacks need to implement this > parallel scheme too, or ahash would end up just feeding the requests > into shash sequentially, defeating the purpose. It is then up to the > API client to choose between ahash or shash, just as it can today. I've never suggested it adding it to shash at all. In fact that's my primary problem with this interface. I think it should be ahash only. Just like skcipher and aead. My reasoning is that this should cater mostly to bulk data, i.e., multiple pages (e.g., for IPsec we're talking about 64K chunks, actually that (the 64K limit) is something that we should really extend, it's not 2014 anymore). These typically will be more easily accessed as a number of distinct pages rather than as a contiguous mapping. Cheers,
On Thu, 6 Jun 2024 at 09:34, Herbert Xu <herbert@gondor.apana.org.au> wrote: > > On Thu, Jun 06, 2024 at 08:58:47AM +0200, Ard Biesheuvel wrote: > > > > IPSec users relying on software crypto and authenc() and caring about > > performance seems like a rather niche use case to me. > > It's no more niche than fs/verity and dm-integrity. fsverity is used by Android, which is the diametrical opposite of niche when it comes to Linux distros. > this could potentially be used for all algorithms. Just the > reduction in the number of function calls may produce enough > of a benefit (this is something I observed when adding GSO, > even without any actual hardware offload, simply aggregating > packets into larger units produced a visible benefit). > Sure. But my point is that anyone who cares enough about IPsec performance would have stopped using authenc(cbc(aes),hmac(sha2)) long ago and moved to GCM or even ChaChaPoly. This is not just a matter of efficiency in the implementation - using a general purpose hash function such as SHA-256 [twice] rather than GHASH or Poly1305 is just overkill. So complicating the performance optimization of something that runs on every (non-Apple) phone for the benefit of something that is rarely used in a context where the performance matters seems unreasonable to me. > > I'm struggling to follow this debate. Surely, if this functionality > > needs to live in ahash, the shash fallbacks need to implement this > > parallel scheme too, or ahash would end up just feeding the requests > > into shash sequentially, defeating the purpose. It is then up to the > > API client to choose between ahash or shash, just as it can today. > > I've never suggested it adding it to shash at all. In fact > that's my primary problem with this interface. I think it > should be ahash only. Just like skcipher and aead. > So again, how would that work for ahash falling back to shash. 
Are you saying every existing shash implementation should be duplicated into an ahash so that the multibuffer optimization can be added? shash is a public interface so we cannot just remove the existing ones and we'll end up carrying both forever. > My reasoning is that this should cater mostly to bulk data, i.e., > multiple pages (e.g., for IPsec we're talking about 64K chunks, > actually that (the 64K limit) is something that we should really > extend, it's not 2014 anymore). These typically will be more > easily accessed as a number of distinct pages rather than as a > contiguous mapping. > Sure, but the block I/O world is very different. Forcing it to use an API modeled after how IPsec might use it seems, again, unreasonable. So these 64k chunks are made up of 1500 byte frames that can be hashed in parallel?
On Thu, Jun 06, 2024 at 09:55:56AM +0200, Ard Biesheuvel wrote: > > So again, how would that work for ahash falling back to shash. Are you > saying every existing shash implementation should be duplicated into > an ahash so that the multibuffer optimization can be added? shash is a > public interface so we cannot just remove the existing ones and we'll > end up carrying both forever. It should do the same thing for ahash algorithms that do not support multiple requests. IOW it should process the requests one by one. > Sure, but the block I/O world is very different. Forcing it to use an > API modeled after how IPsec might use it seems, again, unreasonable. It's not different at all. You can see that by the proliferation of kmap calls in fs/verity. It's a fundamental issue. You can't consistently get a large contiguous allocation beyond one page due to fragmentation. So large data is always going to be scattered. BTW, I'm all for eliminating the overhead when you already have a linear address for scattered memory, e.g., through vmalloc. We should definitely improve our interface for ahash/skcipher/aead so that vmalloc addresses (as well as kmalloc virtual addresses by extension) are supported as first class citizens, and we don't turn them into SG lists unless it's necessary for DMA. Cheers,
On Thu, 6 Jun 2024 at 10:08, Herbert Xu <herbert@gondor.apana.org.au> wrote: > > On Thu, Jun 06, 2024 at 09:55:56AM +0200, Ard Biesheuvel wrote: > > > > So again, how would that work for ahash falling back to shash. Are you > > saying every existing shash implementation should be duplicated into > > an ahash so that the multibuffer optimization can be added? shash is a > > public interface so we cannot just remove the existing ones and we'll > > end up carrying both forever. > > It should do the same thing for ahash algorithms that do not support > multiple requests. IOW it should process the requests one by one. > That is not what I am asking. Are you suggesting that, e.g., the arm64 sha2 shash implementation that is modified by this series should instead expose both an shash as before, and an ahash built around the same asm code that exposes the multibuffer capability? > > Sure, but the block I/O world is very different. Forcing it to use an > > API modeled after how IPsec might use it seems, again, unreasonable. > > It's not different at all. You can see that by the proliferation > of kmap calls in fs/verity. It's a fundamental issue. You can't > consistently get a large contiguous allocation beyond one page due > to fragmentation. So large data is always going to be scattered. > I don't think this is true for many uses of the block layer. > BTW, I'm all for elminating the overhead when you already have a > linear address for scattered memory, e.g., through vmalloc. We > should definitely improve our interface for ahash/skcipher/aead so > that vmalloc addresses (as well as kmalloc virtual addresses by > extension) are supported as first class citizens, and we don't turn > them into SG lists unless it's necessary for DMA. > Yes, this is something I've been pondering for a while. 
An ahash/skcipher/aead with CRYPTO_ALG_ASYNC cleared (which would guarantee that any provided VA would not be referenced after the algo invocation returns) should be able to consume a request that carries virtual addresses rather than SG lists. Given that it is up to the caller to choose between sync and async, it would be in a good position also to judge whether it wants to use stack/vmalloc addresses. I'll have a stab at this.
On Thu, Jun 06, 2024 at 10:33:15AM +0200, Ard Biesheuvel wrote: > > Are you suggesting that, e.g., the arm64 sha2 shash implementation > that is modified by this series should instead expose both an shash as > before, and an ahash built around the same asm code that exposes the > multibuffer capability? Yes the multi-request handling should be implemented as ahash only. Cheers,
Hi Herbert, On Thu, Jun 06, 2024 at 05:15:16PM +0800, Herbert Xu wrote: > On Thu, Jun 06, 2024 at 10:33:15AM +0200, Ard Biesheuvel wrote: > > > > Are you suggesting that, e.g., the arm64 sha2 shash implementation > > that is modified by this series should instead expose both an shash as > > before, and an ahash built around the same asm code that exposes the > > multibuffer capability? > > Yes the multi-request handling should be implemented as ahash > only. > That is not a reasonable way to do it. It would be much more complex, more error-prone, and slower than my proposal. Also your proposal wouldn't actually bring you much closer to being able to optimize your authenc use case. First, each algorithm implementation that wants to support your scheme would need to implement the ahash API alongside shash. This would be a lot of duplicated code, and it would need to support all quirks of the ahash API including scatterlists. Second, allowing an ahash_request to actually be a list of requests creates an ambiguity where it will often be unclear when code is supposed to operate on an ahash_request individually vs. on the whole list. This is error-prone, as it invites bugs where a crypto operation is done on only one request of the list. This is a bad design, especially for a cryptographic API. We would have crypto_ahash_*() do some checks to prevent multi-requests from reaching algorithms that don't support the particular kind of request being made. But there will be a lot of cases to consider. Third, it would be hard to actually implement multibuffer hashing given an arbitrary list of requests. For multibuffer hashing to work, the lengths of all the buffers must be synced up, including the internal buffers in the hash states as well as every buffer that is produced while walking the scatterlists. 
We can place constraints on what is supported, but those constraints will need to be clearly specified, and each algorithm will actually need to check for them and do something sane (error or fallback) when they are not met. Note that it would be possible for the messages to get out of sync in the middle of walking the scatterlists, which would be difficult to handle. All this would add a lot of complexity and overhead compared to my proposal, which naturally expresses the same-length constraints in the API. Fourth, having the API be ahash-only would also force fsverity to switch back to the ahash API, which would add complexity and overhead. The shash API is easier to use even when working with pages, as the diffstat of commit 8fcd94add6c5 clearly shows. The ahash API also has more overhead, including in doing the needed memory allocation, setting up a scatterlist, initializing required but irrelevant or redundant fields in the ahash_request, and shash_ahash_finup() running the fully generalized scatterlist walker on the scatterlist just to finally end up with a pointer which the caller could have provided directly. All this overhead adds up, even when hashing 4K blocks. Back when CPU-based crypto was slow this didn't really matter, but now it's fast and these small overheads are significant when trying to keep up with storage device speeds (which are also fast now). E.g., even disregarding the memory allocation, hashing a 4K block is about 5% faster with shash than with ahash on x86_64. Of course, I'd need to support ahash in fsverity anyway if someone were to actually need support for non-inline hardware offload in fsverity (and if I decide to accept that despite many hardware drivers being buggy). But really the best way to do this would be to support ahash alongside shash, like what I'm proposing in dm-verity -- perhaps with ahash support limited to the data blocks only, as that's the most performance critical part. 
The ahash code path would use the async callback to actually properly support offload, which would mean the code would be substantially different anyway due to the fundamentally different computational model, especially vs. multibuffer hashing. A "single" API, that works for both hardware offload and for the vast majority of users that simply need very low overhead software crypto, would be nice in theory. But I'm afraid it's been shown repeatedly that just doesn't work... The existence of lib/crypto/, shash, lskcipher, and scomp all reflect this. I understand that you think the ahash based API would make it easier to add multibuffer support to "authenc(hmac(sha256),cbc(aes))" for IPsec, which seems to be a very important use case for you (though it isn't relevant to nearly as many systems as dm-verity and fsverity are). Regardless, the reality is that it would be much more difficult to take advantage of multibuffer crypto in the IPsec authenc use case than in dm-verity and fsverity. authenc uses multiple underlying algorithms, AES-CBC and HMAC-SHA256, that would both have to use multibuffer crypto in order to see a significant benefit, seeing as even if the SHA-256 support could be wired up through HMAC-SHA256, encryption would be bottlenecked on AES-CBC, especially on Intel CPUs. It also looks like the IPsec code would need a lot of updates to support multibuffer crypto. At the same time, an easier way to optimize "authenc(hmac(sha256),cbc(aes))" would be to add an implementation of it to arch/x86/crypto/ that interleaves the AES-CBC and SHA-256 and also avoids the overhead associated with the template based approach (such as all the extra indirect calls). Of course, that would require writing assembly, but so would multibuffer crypto. 
It seems unlikely that someone would step in to do all the work to implement a multibuffer optimization for this algorithm and its use in IPsec, when no one has ever bothered to optimize the single-buffer case in the first place, which has been possible all along and would require no API or IPsec changes... In any case, any potential multi-request support in ahash, skcipher, or aead should be separate considerations from the simple function in shash that I'm proposing. It makes sense to have the shash support regardless. Ultimately, I need to have dm-verity and fsverity be properly optimized in the downstreams that are most relevant to me. If you're not going to allow the upstream crypto API to provide the needed functionality in a reasonable way, then I'll need to shift my focus to getting this patchset into downstream kernels such as Android and Chrome OS instead. - Eric
On Mon, Jun 10, 2024 at 09:42:58AM -0700, Eric Biggers wrote: > > I understand that you think the ahash based API would make it easier to add > multibuffer support to "authenc(hmac(sha256),cbc(aes))" for IPsec, which seems > to be a very important use case for you (though it isn't relevant to nearly as > many systems as dm-verity and fsverity are). Regardless, the reality is that it > would be much more difficult to take advantage of multibuffer crypto in the > IPsec authenc use case than in dm-verity and fsverity. authenc uses multiple > underlying algorithms, AES-CBC and HMAC-SHA256, that would both have to use > multibuffer crypto in order to see a significant benefit, seeing as even if the > SHA-256 support could be wired up through HMAC-SHA256, encryption would be > bottlenecked on AES-CBC, especially on Intel CPUs. It also looks like the IPsec > code would need a lot of updates to support multibuffer crypto. The linked-request thing feeds nicely into networking. In fact that's where I got the idea of linking them from. In networking a large GSO (currently limited to 64K but theoretically we could make it unlimited) packet is automatically split up into a linked list of MTU-sized skb's. Therefore if we switched to a linked-list API networking could give us the buffers with minimal changes. BTW, I found an old Intel paper that claims through their multi- buffer strategy they were able to make AES-CBC-XCBC beat AES-GCM. I wonder if we could still replicate this today: https://github.com/intel/intel-ipsec-mb/wiki/doc/fast-multi-buffer-ipsec-implementations-ia-processors-paper.pdf > Ultimately, I need to have dm-verity and fsverity be properly optimized in the > downstreams that are most relevant to me. If you're not going to allow the > upstream crypto API to provide the needed functionality in a reasonable way, > then I'll need to shift my focus to getting this patchset into downstream > kernels such as Android and Chrome OS instead. 
I totally understand that this is your priority. But please give me some time to see if we can devise something that works for both scenarios. Thanks,
On Tue, Jun 11, 2024 at 11:21:43PM +0800, Herbert Xu wrote: > > Therefore if we switched to a linked-list API networking could > give us the buffers with minimal changes. BTW, this is not just about parallelising hashing. Just as one of the most significant benefits of GSO does not come from hardware offload, but rather the amortisation of (network) stack overhead. IOW you're traversing a very deep stack once instead of 40 times (this is the factor for 64K vs MTU, if we extend beyond 64K (which we absolutely should do) the benefit would increase as well). The same should apply to the Crypto API. So even if this was a purely software solution with no assembly code at all, it may well improve GCM performance (at least for users able to feed us bulk data, like networking). Cheers,
On Tue, 11 Jun 2024 at 17:21, Herbert Xu <herbert@gondor.apana.org.au> wrote: > > On Mon, Jun 10, 2024 at 09:42:58AM -0700, Eric Biggers wrote: > > > > I understand that you think the ahash based API would make it easier to add > > multibuffer support to "authenc(hmac(sha256),cbc(aes))" for IPsec, which seems > > to be a very important use case for you (though it isn't relevant to nearly as > > many systems as dm-verity and fsverity are). Regardless, the reality is that it > > would be much more difficult to take advantage of multibuffer crypto in the > > IPsec authenc use case than in dm-verity and fsverity. authenc uses multiple > > underlying algorithms, AES-CBC and HMAC-SHA256, that would both have to use > > multibuffer crypto in order to see a significant benefit, seeing as even if the > > SHA-256 support could be wired up through HMAC-SHA256, encryption would be > > bottlenecked on AES-CBC, especially on Intel CPUs. It also looks like the IPsec > > code would need a lot of updates to support multibuffer crypto. > > The linked-request thing feeds nicely into networking. In fact > that's where I got the idea of linking them from. In networking > a large GSO (currently limited to 64K but theoretically we could > make it unlimited) packet is automatically split up into a linked > list of MTU-sized skb's. > > Therefore if we switched to a linked-list API networking could > give us the buffers with minimal changes. > > BTW, I found an old Intel paper that claims through their multi- > buffer strategy they were able to make AES-CBC-XCBC beat AES-GCM. > I wonder if we could still replicate this today: > > https://github.com/intel/intel-ipsec-mb/wiki/doc/fast-multi-buffer-ipsec-implementations-ia-processors-paper.pdf > This looks like the whitepaper that describes the buggy multibuffer code that we ripped out. > > Ultimately, I need to have dm-verity and fsverity be properly optimized in the > > downstreams that are most relevant to me. 
If you're not going to allow the > > upstream crypto API to provide the needed functionality in a reasonable way, > > then I'll need to shift my focus to getting this patchset into downstream > > kernels such as Android and Chrome OS instead. > > I totally understand that this is your priority. But please give > me some time to see if we can devise something that works for both > scenarios. > The issue here is that the CPU based multibuffer approach has rather tight constraints in terms of input length and the shared prefix, and so designing a more generic API based on ahash doesn't help at all. The intel multibuffer code went off into the weeds entirely attempting to apply this parallel scheme to arbitrary combinations of inputs, so this is something we know we should avoid.
On Tue, Jun 11, 2024 at 05:46:01PM +0200, Ard Biesheuvel wrote: > > The issue here is that the CPU based multibuffer approach has rather > tight constraints in terms of input length and the shared prefix, and > so designing a more generic API based on ahash doesn't help at all. > The intel multibuffer code went off into the weeds entirely attempting > to apply this parallel scheme to arbitrary combinations of inputs, so > this is something we know we should avoid. The sha-mb approach failed because it failed to aggregate the data properly. By driving this from the data sink, it was doomed to fail. The correct way to aggregate data is to do it at the source. The user (of the Crypto API) knows exactly how much data they want to hash and how it's structured. They should be supplying that info to the API so it can use multi-buffer where applicable. Even where multi-buffer isn't available, they would at least benefit from making a single indirect call into the Crypto stack instead of N calls. When N is large (which is almost always the case for TCP) this produces a non-trivial saving. Sure I understand that you guys are more than happy with N=2 but please let me at least try this out and see if we could make this work for a large value of N. Cheers,
On Tue, Jun 11, 2024 at 11:21:43PM +0800, Herbert Xu wrote: > > BTW, I found an old Intel paper that claims through their multi- > buffer strategy they were able to make AES-CBC-XCBC beat AES-GCM. > I wonder if we could still replicate this today: > > https://github.com/intel/intel-ipsec-mb/wiki/doc/fast-multi-buffer-ipsec-implementations-ia-processors-paper.pdf No, not even close. Even assuming that the lack of parallelizability in AES-CBC and AES-XCBC can be entirely compensated for via multibuffer crypto (which really it can't -- consider single packets, for example), doing AES twice is much more expensive than doing AES and GHASH. GHASH is a universal hash function, and computing a universal hash function is inherently cheaper than computing a cryptographic hash function. But also modern Intel CPUs have very fast carryless multiplication, and it uses a different execution port from what AES uses. So the overhead of AES + GHASH over AES alone is very small. By doing AES twice, you'd be entirely bottlenecked by the ports that can execute the AES instructions, while the other ports go nearly unused. So it would probably be approaching twice as slow as AES-GCM. Westmere (2010) through Ivy Bridge (2012) are the only Intel CPUs where multibuffer AES-CBC-XCBC could plausibly be faster than AES-GCM (given a sufficiently large number of messages at once), due to the very slow pclmulqdq instruction on those CPUs. This is long since fixed, as pclmulqdq became much faster in Haswell (2013), and faster still in Broadwell. This is exactly what that Intel paper shows; they show AES-GCM becoming fastest in "Gen 4", i.e. Haswell. The paper is from 2012, so of course they don't show anything after that. But AES-GCM has only pulled ahead even more since then. In theory something like AES-CBC + SHA-256 could be slightly more competitive than AES-CBC + AES-XCBC. 
But it would still be worse than simply doing AES-GCM -- which again, doesn't need multibuffer, and my recent patches have already fully optimized for recent x86_64 CPUs. - Eric
On Tue, Jun 11, 2024 at 11:39:08PM +0800, Herbert Xu wrote: > On Tue, Jun 11, 2024 at 11:21:43PM +0800, Herbert Xu wrote: > > > > Therefore if we switched to a linked-list API networking could > > give us the buffers with minimal changes. > > BTW, this is not just about parallelising hashing. Just as one of > the most significant benefits of GSO does not come from hardware > offload, but rather the amortisation of (network) stack overhead. > IOW you're traversing a very deep stack once instead of 40 times > (this is the factor for 64K vs MTU, if we extend beyond 64K (which > we absolute should do) the benefit would increase as well). > > The same should apply to the Crypto API. So even if this was a > purely software solution with no assembly code at all, it may well > improve GCM performance (at least for users able to feed us bulk > data, like networking). > At best this would save an indirect call per message, if the underlying algorithm explicitly added support for it and the user of the API migrated to the multi-request model. This alone doesn't seem worth the effort of migrating to multi-request, especially considering the many other already-possible optimizations that would not require API changes or migrating users to multi-request. The x86_64 AES-GCM is pretty well optimized now after my recent patches, but there's still an indirect call associated with the use of the SIMD helper which could be eliminated, saving one per message (already as much as we could hope to get from multi-request). authenc on the other hand is almost totally unoptimized, as I mentioned before; it makes little sense to talk about any sort of multi-request optimization for it at this point. - Eric
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index b3506f56e180..7535c9d9b516 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -27,10 +27,15 @@ struct fsverity_hash_alg { /* * The HASH_ALGO_* constant for this algorithm. This is different from * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. */ enum hash_algo algo_id; + /* + * The maximum supported interleaving factor for multibuffer hashing, or + * 1 if the algorithm doesn't support multibuffer hashing + */ + int mb_max_msgs; }; /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ struct merkle_tree_params { const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ @@ -150,8 +155,10 @@ static inline void fsverity_init_signature(void) } #endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ /* verify.c */ +#define FS_VERITY_MAX_PENDING_DATA_BLOCKS 2 + void __init fsverity_init_workqueue(void); #endif /* _FSVERITY_PRIVATE_H */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 6b08b1d9a7d7..f24d7c295455 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -82,12 +82,16 @@ const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm))) goto err_free_tfm; if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm))) goto err_free_tfm; - pr_info("%s using implementation \"%s\"\n", - alg->name, crypto_shash_driver_name(tfm)); + alg->mb_max_msgs = min(crypto_shash_mb_max_msgs(tfm), + FS_VERITY_MAX_PENDING_DATA_BLOCKS); + + pr_info("%s using implementation \"%s\"%s\n", + alg->name, crypto_shash_driver_name(tfm), + alg->mb_max_msgs > 1 ? 
" (multibuffer)" : ""); /* pairs with smp_load_acquire() above */ smp_store_release(&alg->tfm, tfm); goto out_unlock; diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 4fcad0825a12..e9fb299ffa77 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -8,10 +8,32 @@ #include "fsverity_private.h" #include <crypto/hash.h> #include <linux/bio.h> +struct fsverity_pending_block { + const void *data; + u64 pos; + u8 hash[FS_VERITY_MAX_DIGEST_SIZE]; +}; + +struct fsverity_verification_context { + struct inode *inode; + struct fsverity_info *vi; + unsigned long max_ra_pages; + + /* + * This is the queue of data blocks that are pending verification. We + * allow multiple blocks to be queued up in order to support hash + * algorithm implementations that provide support for multibuffer + * hashing, i.e. interleaving the hashing of multiple messages. On many + * CPUs this improves performance significantly. + */ + int num_pending; + struct fsverity_pending_block pending_blocks[FS_VERITY_MAX_PENDING_DATA_BLOCKS]; +}; + static struct workqueue_struct *fsverity_read_workqueue; /* * Returns true if the hash block with index @hblock_idx in the tree, located in * @hpage, has already been verified. @@ -77,23 +99,25 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, SetPageChecked(hpage); return false; } /* - * Verify a single data block against the file's Merkle tree. + * Verify the hash of a single data block against the file's Merkle tree. * * In principle, we need to verify the entire path to the root node. However, * for efficiency the filesystem may cache the hash blocks. Therefore we need * only ascend the tree until an already-verified hash block is seen, and then * verify the path to that block. * * Return: %true if the data block is valid, else %false. 
*/ static bool verify_data_block(struct inode *inode, struct fsverity_info *vi, - const void *data, u64 data_pos, unsigned long max_ra_pages) + const struct fsverity_pending_block *dblock, + unsigned long max_ra_pages) { + const u64 data_pos = dblock->pos; const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; int level; u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE]; const u8 *want_hash; @@ -113,23 +137,27 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * The index of the previous level's block within that level; also the * index of that block's hash within the current level. */ u64 hidx = data_pos >> params->log_blocksize; - /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */ - BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX); + /* + * Up to FS_VERITY_MAX_PENDING_DATA_BLOCKS + FS_VERITY_MAX_LEVELS pages + * may be mapped at once. + */ + BUILD_BUG_ON(FS_VERITY_MAX_PENDING_DATA_BLOCKS + + FS_VERITY_MAX_LEVELS > KM_MAX_IDX); if (unlikely(data_pos >= inode->i_size)) { /* * This can happen in the data page spanning EOF when the Merkle * tree block size is less than the page size. The Merkle tree * doesn't cover data blocks fully past EOF. But the entire * page spanning EOF can be visible to userspace via a mmap, and * any part past EOF should be all zeroes. Therefore, we need * to verify that any data blocks fully past EOF are all zeroes. */ - if (memchr_inv(data, 0, params->block_size)) { + if (memchr_inv(dblock->data, 0, params->block_size)) { fsverity_err(inode, "FILE CORRUPTED! Data past EOF is not zeroed"); return false; } return true; @@ -219,55 +247,122 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, want_hash = _want_hash; kunmap_local(haddr); put_page(hpage); } - /* Finally, verify the data block. 
*/ - if (fsverity_hash_block(params, inode, data, real_hash) != 0) - goto error; - if (memcmp(want_hash, real_hash, hsize) != 0) + /* Finally, verify the hash of the data block. */ + if (memcmp(want_hash, dblock->hash, hsize) != 0) goto corrupted; return true; corrupted: fsverity_err(inode, "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", data_pos, level - 1, params->hash_alg->name, hsize, want_hash, - params->hash_alg->name, hsize, real_hash); + params->hash_alg->name, hsize, + level == 0 ? dblock->hash : real_hash); error: for (; level > 0; level--) { kunmap_local(hblocks[level - 1].addr); put_page(hblocks[level - 1].page); } return false; } +static inline void +fsverity_init_verification_context(struct fsverity_verification_context *ctx, + struct inode *inode, + unsigned long max_ra_pages) +{ + ctx->inode = inode; + ctx->vi = inode->i_verity_info; + ctx->max_ra_pages = max_ra_pages; + ctx->num_pending = 0; +} + +static inline void +fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx) +{ + int i; + + for (i = ctx->num_pending - 1; i >= 0; i--) { + kunmap_local(ctx->pending_blocks[i].data); + ctx->pending_blocks[i].data = NULL; + } + ctx->num_pending = 0; +} + static bool -verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, - unsigned long max_ra_pages) +fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx) { - struct inode *inode = data_folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; - const unsigned int block_size = vi->tree_params.block_size; - u64 pos = (u64)data_folio->index << PAGE_SHIFT; + struct inode *inode = ctx->inode; + struct fsverity_info *vi = ctx->vi; + const struct merkle_tree_params *params = &vi->tree_params; + SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm); + const u8 *data[FS_VERITY_MAX_PENDING_DATA_BLOCKS]; + u8 *outs[FS_VERITY_MAX_PENDING_DATA_BLOCKS]; + int i; + int err; + + if (ctx->num_pending == 0) + return true; + + for (i 
= 0; i < ctx->num_pending; i++) { + data[i] = ctx->pending_blocks[i].data; + outs[i] = ctx->pending_blocks[i].hash; + } + + desc->tfm = params->hash_alg->tfm; + if (params->hashstate) + err = crypto_shash_import(desc, params->hashstate); + else + err = crypto_shash_init(desc); + if (err) { + fsverity_err(inode, "Error %d importing hash state", err); + return false; + } + err = crypto_shash_finup_mb(desc, data, params->block_size, outs, + ctx->num_pending); + if (err) { + fsverity_err(inode, "Error %d computing block hashes", err); + return false; + } + + for (i = 0; i < ctx->num_pending; i++) { + if (!verify_data_block(inode, vi, &ctx->pending_blocks[i], + ctx->max_ra_pages)) + return false; + } + + fsverity_clear_pending_blocks(ctx); + return true; +} + +static bool +fsverity_add_data_blocks(struct fsverity_verification_context *ctx, + struct folio *data_folio, size_t len, size_t offset) +{ + struct fsverity_info *vi = ctx->vi; + const struct merkle_tree_params *params = &vi->tree_params; + const unsigned int block_size = params->block_size; + const int mb_max_msgs = params->hash_alg->mb_max_msgs; + u64 pos = ((u64)data_folio->index << PAGE_SHIFT) + offset; if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size))) return false; if (WARN_ON_ONCE(!folio_test_locked(data_folio) || folio_test_uptodate(data_folio))) return false; do { - void *data; - bool valid; - - data = kmap_local_folio(data_folio, offset); - valid = verify_data_block(inode, vi, data, pos + offset, - max_ra_pages); - kunmap_local(data); - if (!valid) + ctx->pending_blocks[ctx->num_pending].data = + kmap_local_folio(data_folio, offset); + ctx->pending_blocks[ctx->num_pending].pos = pos; + if (++ctx->num_pending == mb_max_msgs && + !fsverity_verify_pending_blocks(ctx)) return false; + pos += block_size; offset += block_size; len -= block_size; } while (len); return true; } @@ -284,11 +379,19 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, * * Return: %true if the 
data is valid, else %false. */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - return verify_data_blocks(folio, len, offset, 0); + struct fsverity_verification_context ctx; + + fsverity_init_verification_context(&ctx, folio->mapping->host, 0); + + if (fsverity_add_data_blocks(&ctx, folio, len, offset) && + fsverity_verify_pending_blocks(&ctx)) + return true; + fsverity_clear_pending_blocks(&ctx); + return false; } EXPORT_SYMBOL_GPL(fsverity_verify_blocks); #ifdef CONFIG_BLOCK /** @@ -305,10 +408,12 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks); * filesystems) must instead call fsverity_verify_page() directly on each page. * All filesystems must also call fsverity_verify_page() on holes. */ void fsverity_verify_bio(struct bio *bio) { + struct inode *inode = bio_first_folio_all(bio)->mapping->host; + struct fsverity_verification_context ctx; struct folio_iter fi; unsigned long max_ra_pages = 0; if (bio->bi_opf & REQ_RAHEAD) { /* @@ -321,17 +426,25 @@ void fsverity_verify_bio(struct bio *bio) * reduces the number of I/O requests made to the Merkle tree. */ max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2); } + fsverity_init_verification_context(&ctx, inode, max_ra_pages); + bio_for_each_folio_all(fi, bio) { - if (!verify_data_blocks(fi.folio, fi.length, fi.offset, - max_ra_pages)) { - bio->bi_status = BLK_STS_IOERR; - break; - } + if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length, + fi.offset)) + goto ioerr; } + + if (!fsverity_verify_pending_blocks(&ctx)) + goto ioerr; + return; + +ioerr: + fsverity_clear_pending_blocks(&ctx); + bio->bi_status = BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ /**