diff mbox series

[v4,8/8] dm-verity: improve performance by using multibuffer hashing

Message ID 20240603183731.108986-9-ebiggers@kernel.org (mailing list archive)
State Superseded
Headers show
Series Optimize dm-verity and fsverity using multibuffer hashing | expand

Commit Message

Eric Biggers June 3, 2024, 6:37 p.m. UTC
From: Eric Biggers <ebiggers@google.com>

When supported by the hash algorithm, use crypto_shash_finup_mb() to
interleave the hashing of pairs of data blocks.  On some CPUs this
nearly doubles hashing performance.  The increase in overall throughput
of cold-cache dm-verity reads that I'm seeing on arm64 and x86_64 is
roughly 35% (though this metric is hard to measure as it jumps around a
lot).

For now this is only done on data blocks, not Merkle tree blocks.  We
could use finup_mb on Merkle tree blocks too, but that is less important
as there aren't as many Merkle tree blocks as data blocks, and that
would require some additional code restructuring.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 drivers/md/dm-verity-fec.c    |  24 +--
 drivers/md/dm-verity-fec.h    |   7 +-
 drivers/md/dm-verity-target.c | 345 ++++++++++++++++++++++------------
 drivers/md/dm-verity.h        |  28 +--
 4 files changed, 247 insertions(+), 157 deletions(-)
diff mbox series

Patch

diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index b436b8e4d750..c1677137a682 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -184,18 +184,18 @@  static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
  * Locate data block erasures using verity hashes.
  */
 static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
 			  u8 *want_digest, u8 *data)
 {
+	u8 real_digest[HASH_MAX_DIGESTSIZE];
+
 	if (unlikely(verity_compute_hash_virt(v, io, data,
 					      1 << v->data_dev_block_bits,
-					      verity_io_real_digest(v, io),
-					      true)))
+					      real_digest, true)))
 		return 0;
 
-	return memcmp(verity_io_real_digest(v, io), want_digest,
-		      v->digest_size) != 0;
+	return memcmp(real_digest, want_digest, v->digest_size) != 0;
 }
 
 /*
  * Read data blocks that are part of the RS block and deinterleave as much as
  * fits into buffers. Check for erasure locations if @neras is non-NULL.
@@ -362,14 +362,15 @@  static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
  * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses
  * hashes to locate erasures.
  */
 static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
 			  struct dm_verity_fec_io *fio, u64 rsb, u64 offset,
-			  bool use_erasures)
+			  const u8 *want_digest, bool use_erasures)
 {
 	int r, neras = 0;
 	unsigned int pos;
+	u8 real_digest[HASH_MAX_DIGESTSIZE];
 
 	r = fec_alloc_bufs(v, fio);
 	if (unlikely(r < 0))
 		return r;
 
@@ -389,16 +390,15 @@  static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
 	}
 
 	/* Always re-validate the corrected block against the expected hash */
 	r = verity_compute_hash_virt(v, io, fio->output,
 				     1 << v->data_dev_block_bits,
-				     verity_io_real_digest(v, io), true);
+				     real_digest, true);
 	if (unlikely(r < 0))
 		return r;
 
-	if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io),
-		   v->digest_size)) {
+	if (memcmp(real_digest, want_digest, v->digest_size)) {
 		DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)",
 			    v->data_dev->name, (unsigned long long)rsb, neras);
 		return -EILSEQ;
 	}
 
@@ -419,12 +419,12 @@  static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data,
 /*
  * Correct errors in a block. Copies corrected block to dest if non-NULL,
  * otherwise to a bio_vec starting from iter.
  */
 int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
-		      enum verity_block_type type, sector_t block, u8 *dest,
-		      struct bvec_iter *iter)
+		      enum verity_block_type type, sector_t block,
+		      const u8 *want_digest, u8 *dest, struct bvec_iter *iter)
 {
 	int r;
 	struct dm_verity_fec_io *fio = fec_io(io);
 	u64 offset, res, rsb;
 
@@ -463,13 +463,13 @@  int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
 	/*
 	 * Locating erasures is slow, so attempt to recover the block without
 	 * them first. Do a second attempt with erasures if the corruption is
 	 * bad enough.
 	 */
-	r = fec_decode_rsb(v, io, fio, rsb, offset, false);
+	r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, false);
 	if (r < 0) {
-		r = fec_decode_rsb(v, io, fio, rsb, offset, true);
+		r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, true);
 		if (r < 0)
 			goto done;
 	}
 
 	if (dest)
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index 8454070d2824..57c3f674cae9 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -68,11 +68,12 @@  struct dm_verity_fec_io {
 
 extern bool verity_fec_is_enabled(struct dm_verity *v);
 
 extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
 			     enum verity_block_type type, sector_t block,
-			     u8 *dest, struct bvec_iter *iter);
+			     const u8 *want_digest, u8 *dest,
+			     struct bvec_iter *iter);
 
 extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz,
 					char *result, unsigned int maxlen);
 
 extern void verity_fec_finish_io(struct dm_verity_io *io);
@@ -97,12 +98,12 @@  static inline bool verity_fec_is_enabled(struct dm_verity *v)
 	return false;
 }
 
 static inline int verity_fec_decode(struct dm_verity *v,
 				    struct dm_verity_io *io,
-				    enum verity_block_type type,
-				    sector_t block, u8 *dest,
+				    enum verity_block_type type, sector_t block,
+				    const u8 *want_digest, u8 *dest,
 				    struct bvec_iter *iter)
 {
 	return -EOPNOTSUPP;
 }
 
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 2dd15f5e91b7..ec0a8c51d99e 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -40,10 +40,12 @@ 
 #define DM_VERITY_OPT_TASKLET_VERIFY	"try_verify_in_tasklet"
 
 #define DM_VERITY_OPTS_MAX		(4 + DM_VERITY_OPTS_FEC + \
 					 DM_VERITY_ROOT_HASH_VERIFICATION_OPTS)
 
+#define DM_VERITY_MAX_PENDING_DATA_BLOCKS	2
+
 static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
 
 module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644);
 
 /* Is at least one dm-verity instance using the bh workqueue? */
@@ -300,16 +302,16 @@  static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
 
 /*
  * Verify hash of a metadata block pertaining to the specified data block
  * ("block" argument) at a specified level ("level" argument).
  *
- * On successful return, verity_io_want_digest(v, io) contains the hash value
- * for a lower tree level or for the data block (if we're at the lowest level).
+ * On successful return, want_digest contains the hash value for a lower tree
+ * level or for the data block (if we're at the lowest level).
  *
  * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned.
  * If "skip_unverified" is false, unverified buffer is hashed and verified
- * against current value of verity_io_want_digest(v, io).
+ * against current value of want_digest.
  */
 static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
 			       sector_t block, int level, bool skip_unverified,
 			       u8 *want_digest)
 {
@@ -318,10 +320,11 @@  static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
 	u8 *data;
 	int r;
 	sector_t hash_block;
 	unsigned int offset;
 	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
+	u8 real_digest[HASH_MAX_DIGESTSIZE];
 
 	verity_hash_at_level(v, block, level, &hash_block, &offset);
 
 	if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
 		data = dm_bufio_get(v->bufio, hash_block, &buf);
@@ -349,27 +352,26 @@  static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
 			goto release_ret_r;
 		}
 
 		r = verity_compute_hash_virt(v, io, data,
 					     1 << v->hash_dev_block_bits,
-					     verity_io_real_digest(v, io),
-					     !io->in_bh);
+					     real_digest, !io->in_bh);
 		if (unlikely(r < 0))
 			goto release_ret_r;
 
-		if (likely(memcmp(verity_io_real_digest(v, io), want_digest,
-				  v->digest_size) == 0))
+		if (likely(!memcmp(real_digest, want_digest, v->digest_size)))
 			aux->hash_verified = 1;
 		else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
 			/*
 			 * Error handling code (FEC included) cannot be run in a
 			 * tasklet since it may sleep, so fallback to work-queue.
 			 */
 			r = -EAGAIN;
 			goto release_ret_r;
 		} else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA,
-					     hash_block, data, NULL) == 0)
+					     hash_block, want_digest,
+					     data, NULL) == 0)
 			aux->hash_verified = 1;
 		else if (verity_handle_err(v,
 					   DM_VERITY_BLOCK_TYPE_METADATA,
 					   hash_block)) {
 			struct bio *bio =
@@ -473,71 +475,10 @@  static int verity_ahash_update_block(struct dm_verity *v,
 	} while (todo);
 
 	return 0;
 }
 
-static int verity_compute_hash(struct dm_verity *v, struct dm_verity_io *io,
-			       struct bvec_iter *iter, u8 *digest,
-			       bool may_sleep)
-{
-	int r;
-
-	if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) {
-		struct ahash_request *req = verity_io_hash_req(v, io);
-		struct crypto_wait wait;
-
-		r = verity_ahash_init(v, req, &wait, may_sleep);
-		if (unlikely(r))
-			goto error;
-
-		r = verity_ahash_update_block(v, io, iter, &wait);
-		if (unlikely(r))
-			goto error;
-
-		r = verity_ahash_final(v, req, digest, &wait);
-		if (unlikely(r))
-			goto error;
-	} else {
-		struct shash_desc *desc = verity_io_hash_req(v, io);
-		struct bio *bio =
-			dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
-		struct bio_vec bv = bio_iter_iovec(bio, *iter);
-		const unsigned int len = 1 << v->data_dev_block_bits;
-		const void *virt;
-
-		if (unlikely(len > bv.bv_len)) {
-			/*
-			 * Data block spans pages.  This should not happen,
-			 * since this code path is not used if the data block
-			 * size is greater than the page size, and all I/O
-			 * should be data block aligned because dm-verity sets
-			 * logical_block_size to the data block size.
-			 */
-			DMERR_LIMIT("unaligned io (data block spans pages)");
-			return -EIO;
-		}
-
-		desc->tfm = v->shash_tfm;
-		r = crypto_shash_import(desc, v->initial_hashstate);
-		if (unlikely(r))
-			goto error;
-
-		virt = bvec_kmap_local(&bv);
-		r = crypto_shash_finup(desc, virt, len, digest);
-		kunmap_local(virt);
-		if (unlikely(r))
-			goto error;
-
-		bio_advance_iter(bio, iter, len);
-	}
-	return 0;
-
-error:
-	DMERR("Error hashing block from bio iter: %d", r);
-	return r;
-}
-
 /*
  * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec
  * starting from iter.
  */
 int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
@@ -581,41 +522,42 @@  static int verity_recheck_copy(struct dm_verity *v, struct dm_verity_io *io,
 	io->recheck_buffer += len;
 
 	return 0;
 }
 
-static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io,
-				   struct bvec_iter start, sector_t cur_block)
+static int verity_recheck(struct dm_verity *v, struct dm_verity_io *io,
+			  struct bvec_iter start, sector_t blkno,
+			  const u8 *want_digest)
 {
 	struct page *page;
 	void *buffer;
 	int r;
 	struct dm_io_request io_req;
 	struct dm_io_region io_loc;
+	u8 real_digest[HASH_MAX_DIGESTSIZE];
 
 	page = mempool_alloc(&v->recheck_pool, GFP_NOIO);
 	buffer = page_to_virt(page);
 
 	io_req.bi_opf = REQ_OP_READ;
 	io_req.mem.type = DM_IO_KMEM;
 	io_req.mem.ptr.addr = buffer;
 	io_req.notify.fn = NULL;
 	io_req.client = v->io;
 	io_loc.bdev = v->data_dev->bdev;
-	io_loc.sector = cur_block << (v->data_dev_block_bits - SECTOR_SHIFT);
+	io_loc.sector = blkno << (v->data_dev_block_bits - SECTOR_SHIFT);
 	io_loc.count = 1 << (v->data_dev_block_bits - SECTOR_SHIFT);
 	r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
 	if (unlikely(r))
 		goto free_ret;
 
 	r = verity_compute_hash_virt(v, io, buffer, 1 << v->data_dev_block_bits,
-				     verity_io_real_digest(v, io), true);
+				     real_digest, true);
 	if (unlikely(r))
 		goto free_ret;
 
-	if (memcmp(verity_io_real_digest(v, io),
-		   verity_io_want_digest(v, io), v->digest_size)) {
+	if (memcmp(real_digest, want_digest, v->digest_size)) {
 		r = -EIO;
 		goto free_ret;
 	}
 
 	io->recheck_buffer = buffer;
@@ -647,22 +589,144 @@  static inline void verity_bv_skip_block(struct dm_verity *v,
 	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
 
 	bio_advance_iter(bio, iter, 1 << v->data_dev_block_bits);
 }
 
+static noinline int
+verity_handle_data_hash_mismatch(struct dm_verity *v, struct dm_verity_io *io,
+				 struct bio *bio, struct bvec_iter *start,
+				 sector_t blkno, const u8 *want_digest)
+{
+	if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
+		/*
+		 * Error handling code (FEC included) cannot be run in the
+		 * BH workqueue, so fall back to a standard workqueue.
+		 */
+		return -EAGAIN;
+	}
+	if (verity_recheck(v, io, *start, blkno, want_digest) == 0) {
+		if (v->validated_blocks)
+			set_bit(blkno, v->validated_blocks);
+		return 0;
+	}
+#if defined(CONFIG_DM_VERITY_FEC)
+	if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, blkno,
+			      want_digest, NULL, start) == 0)
+		return 0;
+#endif
+	if (bio->bi_status)
+		return -EIO; /* Error correction failed; Just return error */
+
+	if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, blkno)) {
+		dm_audit_log_bio(DM_MSG_PREFIX, "verify-data", bio, blkno, 0);
+		return -EIO;
+	}
+	return 0;
+}
+
+struct pending_block {
+	const void *data;
+	sector_t blkno;
+	struct bvec_iter start;
+	u8 want_digest[HASH_MAX_DIGESTSIZE];
+	u8 real_digest[HASH_MAX_DIGESTSIZE];
+};
+
+struct verification_context {
+	struct dm_verity *v;
+	struct dm_verity_io *io;
+	struct bio *bio;
+	struct pending_block pending_blocks[DM_VERITY_MAX_PENDING_DATA_BLOCKS];
+	int num_pending;
+};
+
+static void verity_clear_pending_blocks(struct verification_context *ctx)
+{
+	int i;
+
+	for (i = ctx->num_pending - 1; i >= 0; i--) {
+		kunmap_local(ctx->pending_blocks[i].data);
+		ctx->pending_blocks[i].data = NULL;
+	}
+	ctx->num_pending = 0;
+}
+
+static __always_inline int
+verity_check_data_block_hash(struct dm_verity *v, struct dm_verity_io *io,
+			     struct bio *bio, struct pending_block *block)
+{
+	if (likely(memcmp(block->real_digest, block->want_digest,
+			  v->digest_size) == 0)) {
+		if (v->validated_blocks)
+			set_bit(block->blkno, v->validated_blocks);
+		return 0;
+	}
+	return verity_handle_data_hash_mismatch(v, io, bio, &block->start,
+						block->blkno,
+						block->want_digest);
+}
+
+static int verity_verify_pending_blocks(struct verification_context *ctx)
+{
+	struct dm_verity *v = ctx->v;
+	struct dm_verity_io *io = ctx->io;
+	struct bio *bio = ctx->bio;
+	const u8 *data[DM_VERITY_MAX_PENDING_DATA_BLOCKS];
+	u8 *outs[DM_VERITY_MAX_PENDING_DATA_BLOCKS];
+	struct shash_desc *desc = verity_io_hash_req(v, io);
+	int i;
+	int r;
+
+	if (ctx->num_pending == 0)
+		return 0;
+
+	for (i = 0; i < ctx->num_pending; i++) {
+		data[i] = ctx->pending_blocks[i].data;
+		outs[i] = ctx->pending_blocks[i].real_digest;
+	}
+
+	desc->tfm = v->shash_tfm;
+	r = crypto_shash_import(desc, v->initial_hashstate);
+	if (unlikely(r)) {
+		DMERR("Error importing hash state: %d", r);
+		return r;
+	}
+	r = crypto_shash_finup_mb(desc, data, 1 << v->data_dev_block_bits, outs,
+				  ctx->num_pending);
+	if (unlikely(r)) {
+		DMERR("Error hashing data blocks: %d", r);
+		return r;
+	}
+
+	for (i = 0; i < ctx->num_pending; i++) {
+		r = verity_check_data_block_hash(v, io, bio,
+						 &ctx->pending_blocks[i]);
+		if (unlikely(r))
+			return r;
+	}
+	verity_clear_pending_blocks(ctx);
+	return 0;
+}
+
 /*
  * Verify one "dm_verity_io" structure.
  */
 static int verity_verify_io(struct dm_verity_io *io)
 {
-	bool is_zero;
 	struct dm_verity *v = io->v;
-	struct bvec_iter start;
+	const unsigned int block_size = 1 << v->data_dev_block_bits;
 	struct bvec_iter iter_copy;
 	struct bvec_iter *iter;
 	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
+	struct verification_context ctx;
 	unsigned int b;
+	int r;
+
+	ctx.v = v;
+	ctx.io = io;
+	ctx.bio = bio;
+	ctx.num_pending = 0;
 
 	if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
 		/*
 		 * Copy the iterator in case we need to restart
 		 * verification in a work-queue.
@@ -671,82 +735,98 @@  static int verity_verify_io(struct dm_verity_io *io)
 		iter = &iter_copy;
 	} else
 		iter = &io->iter;
 
 	for (b = 0; b < io->n_blocks; b++) {
-		int r;
-		sector_t cur_block = io->block + b;
+		sector_t blkno = io->block + b;
+		struct pending_block *block;
+		bool is_zero;
 
 		if (v->validated_blocks && bio->bi_status == BLK_STS_OK &&
-		    likely(test_bit(cur_block, v->validated_blocks))) {
+		    likely(test_bit(blkno, v->validated_blocks))) {
 			verity_bv_skip_block(v, io, iter);
 			continue;
 		}
 
-		r = verity_hash_for_block(v, io, cur_block,
-					  verity_io_want_digest(v, io),
+		block = &ctx.pending_blocks[ctx.num_pending];
+		block->blkno = blkno;
+		block->start = *iter;
+
+		r = verity_hash_for_block(v, io, blkno, block->want_digest,
 					  &is_zero);
 		if (unlikely(r < 0))
-			return r;
+			goto error;
 
 		if (is_zero) {
 			/*
 			 * If we expect a zero block, don't validate, just
 			 * return zeros.
 			 */
 			r = verity_for_bv_block(v, io, iter,
 						verity_bv_zero);
 			if (unlikely(r < 0))
-				return r;
+				goto error;
 
 			continue;
 		}
 
-		start = *iter;
-		r = verity_compute_hash(v, io, iter,
-					verity_io_real_digest(v, io),
-					!io->in_bh);
-		if (unlikely(r < 0))
-			return r;
+		if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) {
+			/* Hash and verify one data block using ahash. */
+			struct ahash_request *req = verity_io_hash_req(v, io);
+			struct crypto_wait wait;
+
+			r = verity_ahash_init(v, req, &wait, !io->in_bh) ?:
+			    verity_ahash_update_block(v, io, iter, &wait) ?:
+			    verity_ahash_final(v, req, block->real_digest,
+					       &wait);
+			if (unlikely(r)) {
+				DMERR("Error hashing data block: %d", r);
+				goto error;
+			}
 
-		if (likely(memcmp(verity_io_real_digest(v, io),
-				  verity_io_want_digest(v, io), v->digest_size) == 0)) {
-			if (v->validated_blocks)
-				set_bit(cur_block, v->validated_blocks);
-			continue;
-		} else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
-			/*
-			 * Error handling code (FEC included) cannot be run in a
-			 * tasklet since it may sleep, so fallback to work-queue.
-			 */
-			return -EAGAIN;
-		} else if (verity_recheck(v, io, start, cur_block) == 0) {
-			if (v->validated_blocks)
-				set_bit(cur_block, v->validated_blocks);
-			continue;
-#if defined(CONFIG_DM_VERITY_FEC)
-		} else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
-					     cur_block, NULL, &start) == 0) {
-			continue;
-#endif
+			r = verity_check_data_block_hash(v, io, bio, block);
+			if (unlikely(r))
+				goto error;
 		} else {
-			if (bio->bi_status) {
+			/* Queue up one block to be hashed with shash. */
+			struct bio_vec bv = bio_iter_iovec(bio, *iter);
+
+			if (unlikely(bv.bv_len < block_size)) {
 				/*
-				 * Error correction failed; Just return error
+				 * Data block spans pages.  This should not
+				 * happen, since this code path is not used if
+				 * the data block size is greater than the page
+				 * size, and all I/O should be data block
+				 * aligned because dm-verity sets
+				 * logical_block_size to the data block size.
 				 */
-				return -EIO;
+				DMERR_LIMIT("unaligned io (data block spans pages)");
+				r = -EIO;
+				goto error;
 			}
-			if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
-					      cur_block)) {
-				dm_audit_log_bio(DM_MSG_PREFIX, "verify-data",
-						 bio, cur_block, 0);
-				return -EIO;
+
+			block->data = bvec_kmap_local(&bv);
+			if (++ctx.num_pending == v->mb_max_msgs) {
+				/* Queue is full.  Verify the blocks. */
+				r = verity_verify_pending_blocks(&ctx);
+				if (r)
+					goto error;
+
 			}
+			bio_advance_iter(bio, iter, block_size);
 		}
 	}
 
+	r = verity_verify_pending_blocks(&ctx);
+	if (r)
+		goto error;
+
 	return 0;
+
+error:
+	verity_clear_pending_blocks(&ctx);
+	return r;
 }
 
 /*
  * Skip verity work in response to I/O error when system is shutting down.
  */
@@ -1321,10 +1401,34 @@  static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name)
 	if (!v->alg_name) {
 		ti->error = "Cannot allocate algorithm name";
 		return -ENOMEM;
 	}
 
+	/*
+	 * Allocate the hash transformation object that this dm-verity instance
+	 * will use.  We have a choice of two APIs: shash and ahash.  Most
+	 * dm-verity users use CPU-based hashing, and for this shash is optimal
+	 * since it matches the underlying algorithm implementations and also
+	 * allows the use of fast multibuffer hashing (crypto_shash_finup_mb()).
+	 * ahash adds support for off-CPU hash offloading.  It also provides
+	 * access to shash algorithms, but does so less efficiently.
+	 *
+	 * Meanwhile, hashing a block in dm-verity in general requires an
+	 * init+update+final sequence with multiple updates.  However, usually
+	 * the salt is prepended to the block rather than appended, and the data
+	 * block size is not greater than the page size.  In this very common
+	 * case, the sequence can be optimized to import+finup, where the first
+	 * step imports the pre-computed state after init+update(salt).  This
+	 * can reduce the crypto API overhead significantly.
+	 *
+	 * To provide optimal performance for the vast majority of dm-verity
+	 * users while still supporting off-CPU hash offloading and the rarer
+	 * dm-verity settings, we therefore have two code paths: one using shash
+	 * where we use import+finup_mb, and one using ahash where we use
+	 * init+update(s)+final.  We use the shash code path whenever it is
+	 * usable and shash gives the same algorithm as ahash.
+	 */
 	ahash = crypto_alloc_ahash(alg_name, 0,
 				   v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0);
 	if (IS_ERR(ahash)) {
 		ti->error = "Cannot initialize hash function";
 		return PTR_ERR(ahash);
@@ -1345,14 +1449,17 @@  static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name)
 	}
 	if (!IS_ERR_OR_NULL(shash)) {
 		crypto_free_ahash(ahash);
 		ahash = NULL;
 		v->shash_tfm = shash;
+		v->mb_max_msgs = min(crypto_shash_mb_max_msgs(shash),
+				     DM_VERITY_MAX_PENDING_DATA_BLOCKS);
 		v->digest_size = crypto_shash_digestsize(shash);
 		v->hash_reqsize = sizeof(struct shash_desc) +
 				  crypto_shash_descsize(shash);
-		DMINFO("%s using shash \"%s\"", alg_name, driver_name);
+		DMINFO("%s using shash \"%s\"%s", alg_name, driver_name,
+		       v->mb_max_msgs > 1 ? " (multibuffer)" : "");
 	} else {
 		v->ahash_tfm = ahash;
 		static_branch_inc(&ahash_enabled);
 		v->digest_size = crypto_ahash_digestsize(ahash);
 		v->hash_reqsize = sizeof(struct ahash_request) +
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 15ffb0881cc9..932b0c437d21 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -55,10 +55,11 @@  struct dm_verity {
 	unsigned char hash_per_block_bits;	/* log2(hashes in hash block) */
 	unsigned char levels;	/* the number of tree levels */
 	unsigned char version;
 	bool hash_failed:1;	/* set if hash of any block failed */
 	bool use_bh_wq:1;	/* try to verify in BH wq before normal work-queue */
+	unsigned char mb_max_msgs; /* max multibuffer hashing interleaving factor */
 	unsigned int digest_size;	/* digest size for the current hash algorithm */
 	unsigned int hash_reqsize; /* the size of temporary space for crypto */
 	enum verity_mode mode;	/* mode for handling verification errors */
 	unsigned int corrupted_errs;/* Number of errors for corrupted blocks */
 
@@ -92,42 +93,23 @@  struct dm_verity_io {
 	struct work_struct bh_work;
 
 	char *recheck_buffer;
 
 	/*
-	 * Three variably-size fields follow this struct:
-	 *
-	 * u8 hash_req[v->hash_reqsize];
-	 * u8 real_digest[v->digest_size];
-	 * u8 want_digest[v->digest_size];
-	 *
-	 * To access them use: verity_io_hash_req(), verity_io_real_digest()
-	 * and verity_io_want_digest().
-	 *
-	 * hash_req is either a struct ahash_request or a struct shash_desc,
-	 * depending on whether ahash_tfm or shash_tfm is being used.
+	 * This struct is followed by a variable-sized hash request of size
+	 * v->hash_reqsize, either a struct ahash_request or a struct shash_desc
+	 * (depending on whether ahash_tfm or shash_tfm is being used).  To
+	 * access it, use verity_io_hash_req().
 	 */
 };
 
 static inline void *verity_io_hash_req(struct dm_verity *v,
 				       struct dm_verity_io *io)
 {
 	return io + 1;
 }
 
-static inline u8 *verity_io_real_digest(struct dm_verity *v,
-					struct dm_verity_io *io)
-{
-	return (u8 *)(io + 1) + v->hash_reqsize;
-}
-
-static inline u8 *verity_io_want_digest(struct dm_verity *v,
-					struct dm_verity_io *io)
-{
-	return (u8 *)(io + 1) + v->hash_reqsize + v->digest_size;
-}
-
 extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
 			       struct bvec_iter *iter,
 			       int (*process)(struct dm_verity *v,
 					      struct dm_verity_io *io,
 					      u8 *data, size_t len));