Message ID: 20241023211519.4177873-1-ushankar@purestorage.com (mailing list archive)
State: New
Series: block: fix sanity checks in blk_rq_map_user_bvec
On 10/23/24 3:15 PM, Uday Shankar wrote:
> From: Xinyu Zhang <xizhang@purestorage.com>
>
> blk_rq_map_user_bvec contains a check bytes + bv->bv_len > nr_iter which
> causes unnecessary failures in NVMe passthrough I/O, reproducible as
> follows:
>
> - register a 2 page, page-aligned buffer against a ring
> - use that buffer to do a 1 page io_uring NVMe passthrough read
>
> The second (i = 1) iteration of the loop in blk_rq_map_user_bvec will
> then have nr_iter == 1 page, bytes == 1 page, bv->bv_len == 1 page, so
> the check bytes + bv->bv_len > nr_iter will succeed, causing the I/O to
> fail. This failure is unnecessary, as when the check succeeds, it means
> we've checked the entire buffer that will be used by the request - i.e.
> blk_rq_map_user_bvec should complete successfully. Therefore, terminate
> the loop early and return successfully when the check
> bytes + bv->bv_len > nr_iter succeeds.
>
> While we're at it, also remove the check that all segments in the bvec
> are single-page. While this seems to be true for all users of the
> function, it doesn't appear to be required anywhere downstream.

Yep that looks like an issue. Since this would be nice to backport,
please add a Fixes tag as well.
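To make the failing arithmetic concrete, here is a minimal userspace model of the sanity-check loop; the sizes follow the report above, and this is an illustration only, not the kernel code itself:

```c
/*
 * Userspace model of the pre-fix check in blk_rq_map_user_bvec.
 * A 2-page registered buffer contributes two 4 KiB bvec segments,
 * while the passthrough read (nr_iter) only spans one page.
 */
#include <stdio.h>

#define PAGE 4096u

int main(void)
{
	unsigned int bv_len[2] = { PAGE, PAGE };	/* 2-page registered buffer */
	unsigned int nr_iter = 1 * PAGE;		/* 1-page read */
	unsigned int bytes = 0;

	for (int i = 0; i < 2; i++) {
		if (bytes + bv_len[i] > nr_iter) {
			/*
			 * Old code: goto put_bio, failing with -EINVAL even
			 * though the first page already covers the request.
			 * Fixed code: break here and complete successfully.
			 */
			printf("i=%d: check fires (bytes=%u, nr_iter=%u)\n",
			       i, bytes, nr_iter);
			return 1;
		}
		bytes += bv_len[i];
	}
	return 0;
}
```

On the first iteration the check passes (0 + 4096 is not greater than 4096); on the second it fires, which is exactly the i = 1 failure the commit message describes.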
On Wed, Oct 23, 2024 at 04:31:53PM -0600, Jens Axboe wrote:
> Yep that looks like an issue. Since this would be nice to backport,
> please add a Fixes tag as well.

Fixes: 37987547932c ("block: extend functionality to map bvec iterator")
On Wed, Oct 23, 2024 at 03:15:19PM -0600, Uday Shankar wrote:
> From: Xinyu Zhang <xizhang@purestorage.com>
>
> blk_rq_map_user_bvec contains a check bytes + bv->bv_len > nr_iter which
> causes unnecessary failures in NVMe passthrough I/O, reproducible as
> follows:
>
> - register a 2 page, page-aligned buffer against a ring
> - use that buffer to do a 1 page io_uring NVMe passthrough read
>
> The second (i = 1) iteration of the loop in blk_rq_map_user_bvec will
> then have nr_iter == 1 page, bytes == 1 page, bv->bv_len == 1 page, so
> the check bytes + bv->bv_len > nr_iter will succeed, causing the I/O to
> fail. This failure is unnecessary, as when the check succeeds, it means
> we've checked the entire buffer that will be used by the request - i.e.
> blk_rq_map_user_bvec should complete successfully. Therefore, terminate
> the loop early and return successfully when the check
> bytes + bv->bv_len > nr_iter succeeds.

For anyone interested, here are the details on how to reproduce the
issue described above:

# cat test.c
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <liburing.h>
#include <stdlib.h>
#include <assert.h>
#include <linux/nvme_ioctl.h>

int main(int argc, char *argv[]) {
        struct io_uring ring;

        assert(io_uring_queue_init(1, &ring, IORING_SETUP_SQE128 | IORING_SETUP_CQE32) == 0);

        void *buf = memalign(4096, 2 * 4096);
        printf("buf %p\n", buf);
        struct iovec iov = {
                .iov_base = buf,
                .iov_len = 2 * 4096,
        };

        assert(io_uring_register_buffers(&ring, &iov, 1) == 0);

        struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
        assert(sqe != NULL);

        int fd = open("/dev/ng0n1", O_RDONLY);
        assert(fd > 0);
        sqe->fd = fd;
        sqe->opcode = IORING_OP_URING_CMD;
        sqe->cmd_op = NVME_URING_CMD_IO;
        sqe->buf_index = 0;
        sqe->flags = 0;
        sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;

        struct nvme_passthru_cmd *cmd = &sqe->cmd;
        cmd->opcode = 2; // read
        cmd->nsid = 1;
        cmd->data_len = 1 * 4096;
        cmd->addr = buf;

        struct io_uring_cqe *cqe;
        assert(io_uring_submit(&ring) == 1);
        assert(io_uring_wait_cqe(&ring, &cqe) == 0);

        printf("res %d\n", cqe->res);

        return 0;
}
# gcc -o test -luring test.c
test.c: In function ‘main’:
test.c:15:17: warning: implicit declaration of function ‘memalign’ [-Wimplicit-function-declaration]
   15 |         void *buf = memalign(4096, 2 * 4096);
      |                     ^~~~~~~~
test.c:15:17: warning: initialization of ‘void *’ from ‘int’ makes pointer from integer without a cast [-Wint-conversion]
test.c:36:37: warning: initialization of ‘struct nvme_passthru_cmd *’ from incompatible pointer type ‘__u8 (*)[0]’ {aka ‘unsigned char (*)[]’} [-Wincompatible-pointer-types]
   36 |         struct nvme_passthru_cmd *cmd = &sqe->cmd;
      |                                         ^
test.c:40:15: warning: assignment to ‘__u64’ {aka ‘long long unsigned int’} from ‘void *’ makes integer from pointer without a cast [-Wint-conversion]
   40 |         cmd->addr = buf;
      |
# ./test
buf 0x406000
res -22
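The warnings above do not affect the reproduction, but for anyone adapting the program, a patch along these lines (an untested sketch, not part of the original posting) should silence them:

--- test.c
+++ test.c
@@
 #include <fcntl.h>
 #include <stdio.h>
 #include <string.h>
+#include <malloc.h>	/* declares memalign() */
+#include <stdint.h>	/* uintptr_t for the pointer-to-integer cast */
 #include <sys/ioctl.h>
@@
-        struct nvme_passthru_cmd *cmd = &sqe->cmd;
+        /* sqe->cmd is a flexible __u8 array; cast through void * */
+        struct nvme_passthru_cmd *cmd = (struct nvme_passthru_cmd *)(void *)sqe->cmd;
@@
-        cmd->addr = buf;
+        cmd->addr = (__u64)(uintptr_t)buf;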
On 10/23/24 3:50 PM, Uday Shankar wrote:
> For anyone interested, here are the details on how to reproduce the
> issue described above:
>
> [test program and gcc/run output snipped; see above]

Please convert the above code into a blktests pull request. See also
https://github.com/osandov/blktests.

Thanks,

Bart.
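For reference, a blktests conversion would follow the project's usual test skeleton, roughly as sketched below. The test number, the helper-binary name, and the exact requires() checks are assumptions modeled on existing nvme tests, not a finished submission:

```bash
#!/bin/bash
# SPDX-License-Identifier: GPL-3.0+
#
# Hypothetical tests/nvme/NNN: regression test for the over-eager length
# check in blk_rq_map_user_bvec (fixed-buffer passthrough read smaller
# than the registered buffer).

. tests/nvme/rc

DESCRIPTION="fixed-buffer NVMe passthrough read smaller than registered buffer"
QUICK=1

requires() {
	_nvme_requires
}

test_device() {
	echo "Running ${TEST_NAME}"

	# src/passthru_fixed_buf would be the reproducer above, built as a
	# blktests C helper (hypothetical name) and taking the passthrough
	# character device path instead of hardcoding /dev/ng0n1.
	if src/passthru_fixed_buf "${TEST_DEV}" >> "${FULL}" 2>&1; then
		echo "passthrough read succeeded"
	else
		echo "passthrough read failed"
	fi

	echo "Test complete"
}
```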
On Wed, 23 Oct 2024 15:15:19 -0600, Uday Shankar wrote:
> blk_rq_map_user_bvec contains a check bytes + bv->bv_len > nr_iter which
> causes unnecessary failures in NVMe passthrough I/O, reproducible as
> follows:
>
> - register a 2 page, page-aligned buffer against a ring
> - use that buffer to do a 1 page io_uring NVMe passthrough read
>
> [...]

Applied, thanks!

[1/1] block: fix sanity checks in blk_rq_map_user_bvec
      commit: 2ff949441802a8d076d9013c7761f63e8ae5a9bd

Best regards,
> Please convert the above code into a blktests pull request. See also
> https://github.com/osandov/blktests.
>
> Thanks,
>
> Bart.

+1, it will be really useful to get this tested regularly ...

-ck
On Wed, Oct 23, 2024 at 03:15:19PM -0600, Uday Shankar wrote:
> @@ -600,9 +600,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
>  		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
>  			goto put_bio;
>  		if (bytes + bv->bv_len > nr_iter)
> -			goto put_bio;
> -		if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
> -			goto put_bio;
> +			break;

So while this fixes NVMe, it actually breaks just about every SCSI
driver as the code will easily exceed max_segment_size now, which the
old code obeyed (although more by accident). The right thing here is
probably to remove blk_rq_map_user_bvec entirely and rely on the
ITER_BVEC extraction in iov_iter_extract_pages plus the
bio_add_hw_page in bio_map_user_iov.
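For illustration, here is a small userspace model of that concern, with invented limit values (not kernel code): with the per-page check gone, nothing left in the loop bounds the length of an individual bvec segment.

```c
#include <stdio.h>

int main(void)
{
	unsigned int page_size = 4096;
	unsigned int max_segment_size = 64 * 1024; /* e.g. a SCSI HBA limit */
	unsigned int bv_offset = 0;
	unsigned int bv_len = 256 * 1024; /* one physically contiguous bvec segment */

	/* The removed check capped every segment at a single page: */
	if (bv_offset + bv_len > page_size)
		printf("old code: segment rejected\n");

	/* No remaining check compares bv_len against max_segment_size: */
	if (bv_len > max_segment_size)
		printf("new code: oversized segment reaches the driver\n");
	return 0;
}
```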
On Thu, Oct 24, 2024 at 06:56:22AM +0200, Christoph Hellwig wrote:
> On Wed, Oct 23, 2024 at 03:15:19PM -0600, Uday Shankar wrote:
> > @@ -600,9 +600,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
> >  		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
> >  			goto put_bio;
> >  		if (bytes + bv->bv_len > nr_iter)
> > -			goto put_bio;
> > -		if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
> > -			goto put_bio;
> > +			break;
>
> So while this fixes NVMe, it actually breaks just about every SCSI
> driver as the code will easily exceed max_segment_size now, which the
> old code obeyed (although more by accident).

Looking at the existing code a bit more, it seems really confused, e.g.
by iterating over all segments in the iov_iter instead of using the
proper iterators that limit to the actual size of the I/O, which I
think is the root cause of your problem.

Can you try the (untested) patch below? That uses the proper block
layer helper to check the I/O layout using the bio iterator. It will
handle all block layer queue limits, and it does so on the actual
iterator instead of the potentially larger registration. One change in
behavior is that it now returns -EREMOTEIO for all limits mismatches
instead of a random mix of -EINVAL and -EREMOTEIO.

diff --git a/block/blk-map.c b/block/blk-map.c
index 0e1167b23934..ca2f2ff853da 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -561,57 +561,27 @@ EXPORT_SYMBOL(blk_rq_append_bio);
 /* Prepare bio for passthrough IO given ITER_BVEC iter */
 static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
 {
-	struct request_queue *q = rq->q;
-	size_t nr_iter = iov_iter_count(iter);
-	size_t nr_segs = iter->nr_segs;
-	struct bio_vec *bvecs, *bvprvp = NULL;
-	const struct queue_limits *lim = &q->limits;
-	unsigned int nsegs = 0, bytes = 0;
+	const struct queue_limits *lim = &rq->q->limits;
+	unsigned int nsegs;
 	struct bio *bio;
-	size_t i;
 
-	if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
-		return -EINVAL;
-	if (nr_segs > queue_max_segments(q))
+	if (!iov_iter_count(iter))
 		return -EINVAL;
 
-	/* no iovecs to alloc, as we already have a BVEC iterator */
+	/* reuse the bvecs from the iterator instead of allocating new ones */
 	bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL);
-	if (bio == NULL)
+	if (!bio)
 		return -ENOMEM;
-
 	bio_iov_bvec_set(bio, (struct iov_iter *)iter);
-	blk_rq_bio_prep(rq, bio, nr_segs);
-
-	/* loop to perform a bunch of sanity checks */
-	bvecs = (struct bio_vec *)iter->bvec;
-	for (i = 0; i < nr_segs; i++) {
-		struct bio_vec *bv = &bvecs[i];
-
-		/*
-		 * If the queue doesn't support SG gaps and adding this
-		 * offset would create a gap, fallback to copy.
-		 */
-		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
-			blk_mq_map_bio_put(bio);
-			return -EREMOTEIO;
-		}
-
-		/* check full condition */
-		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
-			goto put_bio;
-		if (bytes + bv->bv_len > nr_iter)
-			goto put_bio;
-		if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
-			goto put_bio;
-
-		nsegs++;
-		bytes += bv->bv_len;
-		bvprvp = bv;
+
+	/* check that the data layout matches the hardware restrictions */
+	if (bio_split_rw_at(bio, lim, &nsegs, lim->max_hw_sectors)) {
+		blk_mq_map_bio_put(bio);
+		return -EREMOTEIO;
 	}
+
+	blk_rq_bio_prep(rq, bio, nsegs);
 	return 0;
-put_bio:
-	blk_mq_map_bio_put(bio);
-	return -EINVAL;
 }
 
 /**
diff --git a/block/blk-map.c b/block/blk-map.c
index 0e1167b23934..6ef2ec1f7d78 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -600,9 +600,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
 		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
 			goto put_bio;
 		if (bytes + bv->bv_len > nr_iter)
-			goto put_bio;
-		if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
-			goto put_bio;
+			break;
 
 		nsegs++;
 		bytes += bv->bv_len;