
[3/6] fs: Convert block_read_full_page to be synchronous

Message ID 20201022212228.15703-4-willy@infradead.org (mailing list archive)
State New, archived
Series	Make block_read_full_page synchronous

Commit Message

Matthew Wilcox Oct. 22, 2020, 9:22 p.m. UTC
Use the new blk_completion infrastructure to wait for multiple I/Os.
Also coalesce adjacent buffer heads into a single BIO instead of
submitting one BIO per buffer head.  This doesn't work for fscrypt yet,
so keep the old code around for now.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/buffer.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
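
The blk_completion API referenced above is introduced in patch 1/6 and is
not visible in this diff.  The pattern it provides looks roughly like this
(a sketch; submit_one_read() is a hypothetical stand-in for the bio
submission done in the patch below):

	struct blk_completion *cmpl = kmalloc(sizeof(*cmpl), GFP_NOIO);
	int i, err;

	if (!cmpl)
		return -ENOMEM;
	blk_completion_init(cmpl, nr);		/* expect nr completions */
	for (i = 0; i < nr; i++)
		submit_one_read(cmpl, i);	/* end_io calls blk_completion_sub() */
	err = blk_completion_wait_killable(cmpl); /* sleep until count hits 0 */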

Comments

Eric Biggers Oct. 22, 2020, 11:35 p.m. UTC | #1
On Thu, Oct 22, 2020 at 10:22:25PM +0100, Matthew Wilcox (Oracle) wrote:
> Use the new blk_completion infrastructure to wait for multiple I/Os.
> Also coalesce adjacent buffer heads into a single BIO instead of
> submitting one BIO per buffer head.  This doesn't work for fscrypt yet,
> so keep the old code around for now.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  fs/buffer.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 90 insertions(+)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 1b0ba1d59966..ccb90081117c 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2249,6 +2249,87 @@ int block_is_partially_uptodate(struct page *page, unsigned long from,
>  }
>  EXPORT_SYMBOL(block_is_partially_uptodate);
>  
> +static void readpage_end_bio(struct bio *bio)
> +{
> +	struct bio_vec *bvec;
> +	struct page *page;
> +	struct buffer_head *bh;
> +	int i, nr = 0;
> +
> +	bio_for_each_bvec_all(bvec, bio, i) {

Shouldn't this technically be bio_for_each_segment_all()?  This wants to iterate
over the pages, not the bvecs -- and in general, each bvec might contain
multiple pages.

Now, in this case, each bio has only 1 page and 1 bvec, so it doesn't really
matter.  But if we're going to use an iterator, it seems we should use the right
kind.
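
For concreteness, the segment-wise loop would look something like this
(a sketch; struct bvec_iter_all is the cursor that
bio_for_each_segment_all() takes):

	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page; /* exactly one page here */
		/* ... per-page buffer_head walk as in readpage_end_bio() ... */
	}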

Likewise in decrypt_bio() in patch 6.

- Eric
Eric Biggers Oct. 22, 2020, 11:40 p.m. UTC | #2
On Thu, Oct 22, 2020 at 10:22:25PM +0100, Matthew Wilcox (Oracle) wrote:
> +static int readpage_submit_bhs(struct page *page, struct blk_completion *cmpl,
> +		unsigned int nr, struct buffer_head **bhs)
> +{
> +	struct bio *bio = NULL;
> +	unsigned int i;
> +	int err;
> +
> +	blk_completion_init(cmpl, nr);
> +
> +	for (i = 0; i < nr; i++) {
> +		struct buffer_head *bh = bhs[i];
> +		sector_t sector = bh->b_blocknr * (bh->b_size >> 9);
> +		bool same_page;
> +
> +		if (buffer_uptodate(bh)) {
> +			end_buffer_async_read(bh, 1);
> +			blk_completion_sub(cmpl, BLK_STS_OK, 1);
> +			continue;
> +		}
> +		if (bio) {
> +			if (bio_end_sector(bio) == sector &&
> +			    __bio_try_merge_page(bio, bh->b_page, bh->b_size,
> +					bh_offset(bh), &same_page))
> +				continue;
> +			submit_bio(bio);
> +		}
> +		bio = bio_alloc(GFP_NOIO, 1);
> +		bio_set_dev(bio, bh->b_bdev);
> +		bio->bi_iter.bi_sector = sector;
> +		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
> +		bio->bi_end_io = readpage_end_bio;
> +		bio->bi_private = cmpl;
> +		/* Take care of bh's that straddle the end of the device */
> +		guard_bio_eod(bio);
> +	}

The following is needed to set the bio encryption context for the
'-o inlinecrypt' case on ext4:

diff --git a/fs/buffer.c b/fs/buffer.c
index 95c338e2b99c..546a08c5003b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2237,6 +2237,7 @@ static int readpage_submit_bhs(struct page *page, struct blk_completion *cmpl,
 			submit_bio(bio);
 		}
 		bio = bio_alloc(GFP_NOIO, 1);
+		fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
 		bio_set_dev(bio, bh->b_bdev);
 		bio->bi_iter.bi_sector = sector;
 		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
Matthew Wilcox Oct. 23, 2020, 1:21 p.m. UTC | #3
On Thu, Oct 22, 2020 at 04:40:11PM -0700, Eric Biggers wrote:
> On Thu, Oct 22, 2020 at 10:22:25PM +0100, Matthew Wilcox (Oracle) wrote:
> > +static int readpage_submit_bhs(struct page *page, struct blk_completion *cmpl,
> > +		unsigned int nr, struct buffer_head **bhs)
> > +{
> > +	struct bio *bio = NULL;
> > +	unsigned int i;
> > +	int err;
> > +
> > +	blk_completion_init(cmpl, nr);
> > +
> > +	for (i = 0; i < nr; i++) {
> > +		struct buffer_head *bh = bhs[i];
> > +		sector_t sector = bh->b_blocknr * (bh->b_size >> 9);
> > +		bool same_page;
> > +
> > +		if (buffer_uptodate(bh)) {
> > +			end_buffer_async_read(bh, 1);
> > +			blk_completion_sub(cmpl, BLK_STS_OK, 1);
> > +			continue;
> > +		}
> > +		if (bio) {
> > +			if (bio_end_sector(bio) == sector &&
> > +			    __bio_try_merge_page(bio, bh->b_page, bh->b_size,
> > +					bh_offset(bh), &same_page))
> > +				continue;
> > +			submit_bio(bio);
> > +		}
> > +		bio = bio_alloc(GFP_NOIO, 1);
> > +		bio_set_dev(bio, bh->b_bdev);
> > +		bio->bi_iter.bi_sector = sector;
> > +		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
> > +		bio->bi_end_io = readpage_end_bio;
> > +		bio->bi_private = cmpl;
> > +		/* Take care of bh's that straddle the end of the device */
> > +		guard_bio_eod(bio);
> > +	}
> 
> The following is needed to set the bio encryption context for the
> '-o inlinecrypt' case on ext4:
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 95c338e2b99c..546a08c5003b 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2237,6 +2237,7 @@ static int readpage_submit_bhs(struct page *page, struct blk_completion *cmpl,
>  			submit_bio(bio);
>  		}
>  		bio = bio_alloc(GFP_NOIO, 1);
> +		fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
>  		bio_set_dev(bio, bh->b_bdev);
>  		bio->bi_iter.bi_sector = sector;
>  		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));

Thanks!  I saw that and had every intention of copying it across.
And then I forgot.  I'll add that.  I'm also going to do:

-                           __bio_try_merge_page(bio, bh->b_page, bh->b_size,
-                                       bh_offset(bh), &same_page))
+                           bio_add_page(bio, bh->b_page, bh->b_size,
+                                       bh_offset(bh)))

I wonder about allocating bios that can accommodate more bvecs.  Not sure
how often filesystems have adjacent blocks which go into non-adjacent
sub-page blocks.  It's certainly possible that a filesystem might have
a page consisting of DDhhDDDD ('D' for Data, 'h' for hole), but how
likely is it to have written the two data chunks next to each other?
Maybe with O_SYNC?

Anyway, this patchset needs some more thought: I've just spotted a path
from mpage_readahead() to block_read_full_page(), and that path should
definitely not be synchronous.
Eric Biggers Oct. 23, 2020, 4:13 p.m. UTC | #4
On Fri, Oct 23, 2020 at 02:21:38PM +0100, Matthew Wilcox wrote:
> > 
> > The following is needed to set the bio encryption context for the
> > '-o inlinecrypt' case on ext4:
> > 
> > diff --git a/fs/buffer.c b/fs/buffer.c
> > index 95c338e2b99c..546a08c5003b 100644
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -2237,6 +2237,7 @@ static int readpage_submit_bhs(struct page *page, struct blk_completion *cmpl,
> >  			submit_bio(bio);
> >  		}
> >  		bio = bio_alloc(GFP_NOIO, 1);
> > +		fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
> >  		bio_set_dev(bio, bh->b_bdev);
> >  		bio->bi_iter.bi_sector = sector;
> >  		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
> 
> Thanks!  I saw that and had every intention of copying it across.
> And then I forgot.  I'll add that.  I'm also going to do:
> 
> -                           __bio_try_merge_page(bio, bh->b_page, bh->b_size,
> -                                       bh_offset(bh), &same_page))
> +                           bio_add_page(bio, bh->b_page, bh->b_size,
> +                                       bh_offset(bh)))
> 
> I wonder about allocating bios that can accommodate more bvecs.  Not sure
> how often filesystems have adjacent blocks which go into non-adjacent
> sub-page blocks.  It's certainly possible that a filesystem might have
> a page consisting of DDhhDDDD ('D' for Data, 'h' for hole), but how
> likely is it to have written the two data chunks next to each other?
> Maybe with O_SYNC?
> 

I think that's a rare case that's not very important to optimize.  And there's
already a lot of code where filesystems *could* submit a single bio in that case
but don't.  For example, both fs/direct-io.c and fs/iomap/direct-io.c only
submit bios that contain logically contiguous data.

If you do implement this optimization, note that it wouldn't work when a
bio_crypt_ctx is set, since the data must be logically contiguous in that case.
To handle that you'd need to call fscrypt_mergeable_bio_bh() when adding each
block, and submit the bio if it returns false.  (In contrast, with your current
proposal, calling fscrypt_mergeable_bio_bh() isn't necessary because each bio
only contains logically contiguous data within one page.)
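
Concretely, the merge path would need something like this (a sketch, using
the bio_add_page() form proposed above):

		if (bio) {
			if (bio_end_sector(bio) == sector &&
			    fscrypt_mergeable_bio_bh(bio, bh) &&
			    bio_add_page(bio, bh->b_page, bh->b_size,
					bh_offset(bh)))
				continue;
			submit_bio(bio);
		}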

- Eric
Matthew Wilcox Oct. 23, 2020, 8:42 p.m. UTC | #5
On Fri, Oct 23, 2020 at 09:13:35AM -0700, Eric Biggers wrote:
> On Fri, Oct 23, 2020 at 02:21:38PM +0100, Matthew Wilcox wrote:
> > I wonder about allocating bios that can accommodate more bvecs.  Not sure
> > how often filesystems have adjacent blocks which go into non-adjacent
> > sub-page blocks.  It's certainly possible that a filesystem might have
> > a page consisting of DDhhDDDD ('D' for Data, 'h' for hole), but how
> > likely is it to have written the two data chunks next to each other?
> > Maybe with O_SYNC?
> 
> I think that's a rare case that's not very important to optimize.  And there's
> already a lot of code where filesystems *could* submit a single bio in that case
> but don't.  For example, both fs/direct-io.c and fs/iomap/direct-io.c only
> submit bios that contain logically contiguous data.

True.  iomap/buffered-io.c will do it though.

> If you do implement this optimization, note that it wouldn't work when a
> bio_crypt_ctx is set, since the data must be logically contiguous in that case.
> To handle that you'd need to call fscrypt_mergeable_bio_bh() when adding each
> block, and submit the bio if it returns false.  (In contrast, with your current
> proposal, calling fscrypt_mergeable_bio_bh() isn't necessary because each bio
> only contains logically contiguous data within one page.)

Oh, that's disappointing.  I had assumed that you'd set up the dun for
the logical block corresponding to the start of the page and then you'd
be able to decrypt any range in the page.

Patch

diff --git a/fs/buffer.c b/fs/buffer.c
index 1b0ba1d59966..ccb90081117c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2249,6 +2249,87 @@ int block_is_partially_uptodate(struct page *page, unsigned long from,
 }
 EXPORT_SYMBOL(block_is_partially_uptodate);
 
+static void readpage_end_bio(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	struct page *page;
+	struct buffer_head *bh;
+	int i, nr = 0;
+
+	bio_for_each_bvec_all(bvec, bio, i) {
+		size_t offset = 0;
+		size_t max = bvec->bv_offset + bvec->bv_len;
+
+		page = bvec->bv_page;
+		bh = page_buffers(page);
+
+		for (offset = 0; offset < max; offset += bh->b_size,
+				bh = bh->b_this_page) {
+			if (offset < bvec->bv_offset)
+				continue;
+			BUG_ON(bh_offset(bh) != offset);
+			nr++;
+			if (unlikely(bio_flagged(bio, BIO_QUIET)))
+				set_bit(BH_Quiet, &bh->b_state);
+			if (bio->bi_status == BLK_STS_OK)
+				set_buffer_uptodate(bh);
+			else
+				buffer_io_error(bh, ", async page read");
+			unlock_buffer(bh);
+		}
+	}
+
+	if (blk_completion_sub(bio->bi_private, bio->bi_status, nr) < 0)
+		unlock_page(page);
+	bio_put(bio);
+}
+
+static int readpage_submit_bhs(struct page *page, struct blk_completion *cmpl,
+		unsigned int nr, struct buffer_head **bhs)
+{
+	struct bio *bio = NULL;
+	unsigned int i;
+	int err;
+
+	blk_completion_init(cmpl, nr);
+
+	for (i = 0; i < nr; i++) {
+		struct buffer_head *bh = bhs[i];
+		sector_t sector = bh->b_blocknr * (bh->b_size >> 9);
+		bool same_page;
+
+		if (buffer_uptodate(bh)) {
+			end_buffer_async_read(bh, 1);
+			blk_completion_sub(cmpl, BLK_STS_OK, 1);
+			continue;
+		}
+		if (bio) {
+			if (bio_end_sector(bio) == sector &&
+			    __bio_try_merge_page(bio, bh->b_page, bh->b_size,
+					bh_offset(bh), &same_page))
+				continue;
+			submit_bio(bio);
+		}
+		bio = bio_alloc(GFP_NOIO, 1);
+		bio_set_dev(bio, bh->b_bdev);
+		bio->bi_iter.bi_sector = sector;
+		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+		bio->bi_end_io = readpage_end_bio;
+		bio->bi_private = cmpl;
+		/* Take care of bh's that straddle the end of the device */
+		guard_bio_eod(bio);
+	}
+
+	if (bio)
+		submit_bio(bio);
+
+	err = blk_completion_wait_killable(cmpl);
+	if (!err)
+		return AOP_UPDATED_PAGE;
+	unlock_page(page);
+	return err;
+}
+
 /*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
@@ -2258,6 +2339,7 @@ EXPORT_SYMBOL(block_is_partially_uptodate);
  */
 int block_read_full_page(struct page *page, get_block_t *get_block)
 {
+	struct blk_completion *cmpl = kmalloc(sizeof(*cmpl), GFP_NOIO);
 	struct inode *inode = page->mapping->host;
 	sector_t iblock, lblock;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
@@ -2265,6 +2347,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 	int nr, i, err = 0;
 	int fully_mapped = 1;
 
+	if (!cmpl)
+		return -ENOMEM;
+
 	head = create_page_buffers(page, inode, 0);
 	blocksize = head->b_size;
 	bbits = block_size_bits(blocksize);
@@ -2303,6 +2388,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
 
 	if (err) {
+		kfree(cmpl);
 		unlock_page(page);
 		return err;
 	}
@@ -2322,6 +2408,10 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 		mark_buffer_async_read(bh);
 	}
 
+	if (!fscrypt_inode_uses_fs_layer_crypto(inode))
+		return readpage_submit_bhs(page, cmpl, nr, arr);
+	kfree(cmpl);
+
 	/*
 	 * Stage 3: start the IO.  Check for uptodateness
 	 * inside the buffer lock in case another process reading