diff mbox series

[3/5] block: avoid extra iter advance with async iocb

Message ID aee615ac9cd6804c10c14938d011e0913f751960.1635006010.git.asml.silence@gmail.com (mailing list archive)
State New, archived
Headers show
Series block optimisations | expand

Commit Message

Pavel Begunkov Oct. 23, 2021, 4:21 p.m. UTC
Nobody cares about the iov iterator's state if we return -EIOCBQUEUED, so
now that we have __blkdev_direct_IO_async(), which gets pages only once,
we can skip the expensive iov_iter_advance(). It accounts for around 1-2%
of all CPU time spent.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/bio.c         |  2 +-
 block/fops.c        | 20 +++++++++++++++-----
 include/linux/bio.h |  1 +
 3 files changed, 17 insertions(+), 6 deletions(-)

Comments

Christoph Hellwig Oct. 25, 2021, 7:33 a.m. UTC | #1
On Sat, Oct 23, 2021 at 05:21:34PM +0100, Pavel Begunkov wrote:
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -352,11 +352,21 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
>  	bio->bi_end_io = blkdev_bio_end_io_async;
>  	bio->bi_ioprio = iocb->ki_ioprio;
>  
> -	ret = bio_iov_iter_get_pages(bio, iter);
> -	if (unlikely(ret)) {
> -		bio->bi_status = BLK_STS_IOERR;
> -		bio_endio(bio);
> -		return ret;
> +	if (!iov_iter_is_bvec(iter)) {
> +		ret = bio_iov_iter_get_pages(bio, iter);
> +		if (unlikely(ret)) {
> +			bio->bi_status = BLK_STS_IOERR;
> +			bio_endio(bio);
> +			return ret;
> +		}

Nit: I generally find it much nicer to read if simple if statements
don't use pointless negations.

> +	} else {
> +		/*
> +		 * Users don't rely on the iterator being in any particular
> +		 * state for async I/O returning -EIOCBQUEUED, hence we can
> +		 * avoid expensive iov_iter_advance(). Bypass
> +		 * bio_iov_iter_get_pages() and set the bvec directly.
> +		 */
> +		bio_iov_bvec_set(bio, iter);

So if this optimization is so useful, please also do it for
non-bvec iov_iters, which is what 99% of the applications actually
use.
Pavel Begunkov Oct. 25, 2021, 11:10 a.m. UTC | #2
On 10/25/21 08:33, Christoph Hellwig wrote:
> On Sat, Oct 23, 2021 at 05:21:34PM +0100, Pavel Begunkov wrote:
>> --- a/block/fops.c
>> +++ b/block/fops.c
>> @@ -352,11 +352,21 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
>>   	bio->bi_end_io = blkdev_bio_end_io_async;
>>   	bio->bi_ioprio = iocb->ki_ioprio;
>>   
>> -	ret = bio_iov_iter_get_pages(bio, iter);
>> -	if (unlikely(ret)) {
>> -		bio->bi_status = BLK_STS_IOERR;
>> -		bio_endio(bio);
>> -		return ret;
>> +	if (!iov_iter_is_bvec(iter)) {
>> +		ret = bio_iov_iter_get_pages(bio, iter);
>> +		if (unlikely(ret)) {
>> +			bio->bi_status = BLK_STS_IOERR;
>> +			bio_endio(bio);
>> +			return ret;
>> +		}
> 
> Nit: I generally find it much nicer to read if simple if statements
> don't use pointless negations.
> 
>> +	} else {
>> +		/*
>> +		 * Users don't rely on the iterator being in any particular
>> +		 * state for async I/O returning -EIOCBQUEUED, hence we can
>> +		 * avoid expensive iov_iter_advance(). Bypass
>> +		 * bio_iov_iter_get_pages() and set the bvec directly.
>> +		 */
>> +		bio_iov_bvec_set(bio, iter);
> 
> So if this optimization is so useful, please also do it for
> non-bvec iov_iters, which is what 99% of the applications actually
> use.

It's an async path, so mainly io_uring or aio, I don't think there
is much profit in doing that for iov, especially behind iov -> bvec
translation with page referencing.

It could be done nonetheless, but what I have in mind looks too ugly
because of the loop inside bio_iov_iter_get_pages(). I don't think the
sketch below is viable — any better ideas?


ssize_t __bio_iov_iter_get_pages() {
	...
	/* not advancing */
	return size;
}

do {
	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
		ret = __bio_iov_append_get_pages(bio, iter);
	else
		ret = __bio_iov_iter_get_pages(bio, iter);
	if (ret < 0)
		break;
	iov_iter_advance(iter, ret);
} while (iov_iter_count(iter) && !bio_full(bio, 0));

and copy paste into fops.c

do {
	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
		ret = __bio_iov_append_get_pages(bio, iter);
	else
		ret = __bio_iov_iter_get_pages(bio, iter);

	if (ret < 0 || iov_iter_count(iter) == ret || bio_full(bio, 0))
		break;
	iov_iter_advance(iter, ret);
} while (1);
diff mbox series

Patch

diff --git a/block/bio.c b/block/bio.c
index ead1f8a9ff5e..15ab0d6d1c06 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1046,7 +1046,7 @@  void __bio_release_pages(struct bio *bio, bool mark_dirty)
 }
 EXPORT_SYMBOL_GPL(__bio_release_pages);
 
-static void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 {
 	size_t size = iov_iter_count(iter);
 
diff --git a/block/fops.c b/block/fops.c
index a7b328296912..8800b0ad5c29 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -352,11 +352,21 @@  static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	bio->bi_end_io = blkdev_bio_end_io_async;
 	bio->bi_ioprio = iocb->ki_ioprio;
 
-	ret = bio_iov_iter_get_pages(bio, iter);
-	if (unlikely(ret)) {
-		bio->bi_status = BLK_STS_IOERR;
-		bio_endio(bio);
-		return ret;
+	if (!iov_iter_is_bvec(iter)) {
+		ret = bio_iov_iter_get_pages(bio, iter);
+		if (unlikely(ret)) {
+			bio->bi_status = BLK_STS_IOERR;
+			bio_endio(bio);
+			return ret;
+		}
+	} else {
+		/*
+		 * Users don't rely on the iterator being in any particular
+		 * state for async I/O returning -EIOCBQUEUED, hence we can
+		 * avoid expensive iov_iter_advance(). Bypass
+		 * bio_iov_iter_get_pages() and set the bvec directly.
+		 */
+		bio_iov_bvec_set(bio, iter);
 	}
 	dio->size = bio->bi_iter.bi_size;
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c88700d1bdc3..fe6bdfbbef66 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -417,6 +417,7 @@  int bio_add_zone_append_page(struct bio *bio, struct page *page,
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);