diff mbox series

[16/17] block: use iomap for writes to block devices

Message ID 20230424054926.26927-17-hch@lst.de (mailing list archive)
State New
Headers show
Series [01/17] fs: unexport buffer_check_dirty_writeback | expand

Commit Message

Christoph Hellwig April 24, 2023, 5:49 a.m. UTC
Use iomap in buffer_head compat mode to write to block devices.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/Kconfig |  1 +
 block/fops.c  | 33 +++++++++++++++++++++++++++++----
 2 files changed, 30 insertions(+), 4 deletions(-)

Comments

Pankaj Raghav April 26, 2023, 1 p.m. UTC | #1
On Mon, Apr 24, 2023 at 07:49:25AM +0200, Christoph Hellwig wrote:
> Use iomap in buffer_head compat mode to write to block devices.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/Kconfig |  1 +
>  block/fops.c  | 33 +++++++++++++++++++++++++++++----
>  2 files changed, 30 insertions(+), 4 deletions(-)
> 
> diff --git a/block/Kconfig b/block/Kconfig
> index 941b2dca70db73..672b08f0096ab4 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -5,6 +5,7 @@
>  menuconfig BLOCK
>         bool "Enable the block layer" if EXPERT
>         default y
> +       select IOMAP

This needs to be FS_IOMAP.

>         select SBITMAP
>         help
>  	 Provide block layer support for the kernel.
Hannes Reinecke May 19, 2023, 2:22 p.m. UTC | #2
On 4/24/23 07:49, Christoph Hellwig wrote:
> Use iomap in buffer_head compat mode to write to block devices.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   block/Kconfig |  1 +
>   block/fops.c  | 33 +++++++++++++++++++++++++++++----
>   2 files changed, 30 insertions(+), 4 deletions(-)
> 
> diff --git a/block/Kconfig b/block/Kconfig
> index 941b2dca70db73..672b08f0096ab4 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -5,6 +5,7 @@
>   menuconfig BLOCK
>          bool "Enable the block layer" if EXPERT
>          default y
> +       select IOMAP
>          select SBITMAP
>          help
>   	 Provide block layer support for the kernel.
> diff --git a/block/fops.c b/block/fops.c
> index 318247832a7bcf..7910636f8df33b 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -15,6 +15,7 @@
>   #include <linux/falloc.h>
>   #include <linux/suspend.h>
>   #include <linux/fs.h>
> +#include <linux/iomap.h>
>   #include <linux/module.h>
>   #include "blk.h"
>   
> @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>   	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
>   }
>   
> +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> +		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> +{
> +	struct block_device *bdev = I_BDEV(inode);
> +	loff_t isize = i_size_read(inode);
> +
> +	iomap->bdev = bdev;
> +	iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> +	if (WARN_ON_ONCE(iomap->offset >= isize))
> +		return -EIO;

I'm hitting this during booting:
[    5.016324]  <TASK>
[    5.030256]  iomap_iter+0x11a/0x350
[    5.030264]  iomap_readahead+0x1eb/0x2c0
[    5.030272]  read_pages+0x5d/0x220
[    5.030279]  page_cache_ra_unbounded+0x131/0x180
[    5.030284]  filemap_get_pages+0xff/0x5a0
[    5.030292]  filemap_read+0xca/0x320
[    5.030296]  ? aa_file_perm+0x126/0x500
[    5.040216]  ? touch_atime+0xc8/0x150
[    5.040224]  blkdev_read_iter+0xb0/0x150
[    5.040228]  vfs_read+0x226/0x2d0
[    5.040234]  ksys_read+0xa5/0xe0
[    5.040238]  do_syscall_64+0x5b/0x80

Maybe we should consider this patch:

diff --git a/block/fops.c b/block/fops.c
index 524b8a828aad..d202fb663f25 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -386,10 +386,13 @@ static int blkdev_iomap_begin(struct inode *inode, 
loff_t offset, loff_t length,

         iomap->bdev = bdev;
         iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
-       if (WARN_ON_ONCE(iomap->offset >= isize))
-               return -EIO;
-       iomap->type = IOMAP_MAPPED;
-       iomap->addr = iomap->offset;
+       if (WARN_ON_ONCE(iomap->offset >= isize)) {
+               iomap->type = IOMAP_HOLE;
+               iomap->addr = IOMAP_NULL_ADDR;
+       } else {
+               iomap->type = IOMAP_MAPPED;
+               iomap->addr = iomap->offset;
+       }
         iomap->length = isize - iomap->offset;
         if (IS_ENABLED(CONFIG_BUFFER_HEAD))
                 iomap->flags |= IOMAP_F_BUFFER_HEAD;


Other that the the system seems fine.

Cheers,

Hannes
Dave Chinner May 23, 2023, 10:27 p.m. UTC | #3
On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote:
> On 4/24/23 07:49, Christoph Hellwig wrote:
> > Use iomap in buffer_head compat mode to write to block devices.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >   block/Kconfig |  1 +
> >   block/fops.c  | 33 +++++++++++++++++++++++++++++----
> >   2 files changed, 30 insertions(+), 4 deletions(-)
> > 
> > diff --git a/block/Kconfig b/block/Kconfig
> > index 941b2dca70db73..672b08f0096ab4 100644
> > --- a/block/Kconfig
> > +++ b/block/Kconfig
> > @@ -5,6 +5,7 @@
> >   menuconfig BLOCK
> >          bool "Enable the block layer" if EXPERT
> >          default y
> > +       select IOMAP
> >          select SBITMAP
> >          help
> >   	 Provide block layer support for the kernel.
> > diff --git a/block/fops.c b/block/fops.c
> > index 318247832a7bcf..7910636f8df33b 100644
> > --- a/block/fops.c
> > +++ b/block/fops.c
> > @@ -15,6 +15,7 @@
> >   #include <linux/falloc.h>
> >   #include <linux/suspend.h>
> >   #include <linux/fs.h>
> > +#include <linux/iomap.h>
> >   #include <linux/module.h>
> >   #include "blk.h"
> > @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> >   	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
> >   }
> > +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > +		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> > +{
> > +	struct block_device *bdev = I_BDEV(inode);
> > +	loff_t isize = i_size_read(inode);
> > +
> > +	iomap->bdev = bdev;
> > +	iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> > +	if (WARN_ON_ONCE(iomap->offset >= isize))
> > +		return -EIO;
> 
> I'm hitting this during booting:
> [    5.016324]  <TASK>
> [    5.030256]  iomap_iter+0x11a/0x350
> [    5.030264]  iomap_readahead+0x1eb/0x2c0
> [    5.030272]  read_pages+0x5d/0x220
> [    5.030279]  page_cache_ra_unbounded+0x131/0x180
> [    5.030284]  filemap_get_pages+0xff/0x5a0

Why is filemap_get_pages() using unbounded readahead? Surely
readahead should be limited to reading within EOF....

> [    5.030292]  filemap_read+0xca/0x320
> [    5.030296]  ? aa_file_perm+0x126/0x500
> [    5.040216]  ? touch_atime+0xc8/0x150
> [    5.040224]  blkdev_read_iter+0xb0/0x150
> [    5.040228]  vfs_read+0x226/0x2d0
> [    5.040234]  ksys_read+0xa5/0xe0
> [    5.040238]  do_syscall_64+0x5b/0x80
> 
> Maybe we should consider this patch:
> 
> diff --git a/block/fops.c b/block/fops.c
> index 524b8a828aad..d202fb663f25 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -386,10 +386,13 @@ static int blkdev_iomap_begin(struct inode *inode,
> loff_t offset, loff_t length,
> 
>         iomap->bdev = bdev;
>         iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> -       if (WARN_ON_ONCE(iomap->offset >= isize))
> -               return -EIO;
> -       iomap->type = IOMAP_MAPPED;
> -       iomap->addr = iomap->offset;
> +       if (WARN_ON_ONCE(iomap->offset >= isize)) {
> +               iomap->type = IOMAP_HOLE;
> +               iomap->addr = IOMAP_NULL_ADDR;
> +       } else {
> +               iomap->type = IOMAP_MAPPED;
> +               iomap->addr = iomap->offset;
> +       }

I think Christoph's code is correct. IMO, any attempt to read beyond
the end of the device should throw out a warning and return an
error, not silently return zeros.

If readahead is trying to read beyond the end of the device, then it
really seems to me like the problem here is readahead, not the iomap
code detecting the OOB read request....

Cheers,

Dave.
Matthew Wilcox May 24, 2023, 1:33 p.m. UTC | #4
On Wed, May 24, 2023 at 08:27:13AM +1000, Dave Chinner wrote:
> On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote:
> > I'm hitting this during booting:
> > [    5.016324]  <TASK>
> > [    5.030256]  iomap_iter+0x11a/0x350
> > [    5.030264]  iomap_readahead+0x1eb/0x2c0
> > [    5.030272]  read_pages+0x5d/0x220
> > [    5.030279]  page_cache_ra_unbounded+0x131/0x180
> > [    5.030284]  filemap_get_pages+0xff/0x5a0
> 
> Why is filemap_get_pages() using unbounded readahead? Surely
> readahead should be limited to reading within EOF....

It isn't using unbounded readahead; that's an artifact of this
incomplete stack trace.  Actual call stack:

page_cache_ra_unbounded
do_page_cache_ra
ondemand_readahead
page_cache_sync_ra
page_cache_sync_readahead
filemap_get_pages

As you can see, do_page_cache_ra() does limit readahead to i_size.
Is ractl->mapping->host the correct way to find the inode?  I always
get confused.

> I think Christoph's code is correct. IMO, any attempt to read beyond
> the end of the device should throw out a warning and return an
> error, not silently return zeros.
> 
> If readahead is trying to read beyond the end of the device, then it
> really seems to me like the problem here is readahead, not the iomap
> code detecting the OOB read request....
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Christoph Hellwig July 20, 2023, 12:06 p.m. UTC | #5
On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote:
> I'm hitting this during booting:
> [    5.016324]  <TASK>
> [    5.030256]  iomap_iter+0x11a/0x350
> [    5.030264]  iomap_readahead+0x1eb/0x2c0
> [    5.030272]  read_pages+0x5d/0x220
> [    5.030279]  page_cache_ra_unbounded+0x131/0x180
> [    5.030284]  filemap_get_pages+0xff/0x5a0
> [    5.030292]  filemap_read+0xca/0x320
> [    5.030296]  ? aa_file_perm+0x126/0x500
> [    5.040216]  ? touch_atime+0xc8/0x150
> [    5.040224]  blkdev_read_iter+0xb0/0x150
> [    5.040228]  vfs_read+0x226/0x2d0
> [    5.040234]  ksys_read+0xa5/0xe0
> [    5.040238]  do_syscall_64+0x5b/0x80
>
> Maybe we should consider this patch:

As willy said this should be taken care of by the i_size check.
Did you run with just this patch set or some of the large block
size experiments on top which might change the variables?

I'll repost the series today without any chances in the area, and
if you can reproduce it with just that series we need to root
cause it, so please send your kernel and VM config along for the
next report.
Christoph Hellwig July 20, 2023, 12:09 p.m. UTC | #6
On Wed, May 24, 2023 at 02:33:13PM +0100, Matthew Wilcox wrote:
> As you can see, do_page_cache_ra() does limit readahead to i_size.
> Is ractl->mapping->host the correct way to find the inode?  I always
> get confused.

As far as I can tell it is the right inode, the indirection through
file->f_mapping ensures it actually points to the backing inode.
Hannes Reinecke July 20, 2023, 12:16 p.m. UTC | #7
On 7/20/23 14:06, Christoph Hellwig wrote:
> On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote:
>> I'm hitting this during booting:
>> [    5.016324]  <TASK>
>> [    5.030256]  iomap_iter+0x11a/0x350
>> [    5.030264]  iomap_readahead+0x1eb/0x2c0
>> [    5.030272]  read_pages+0x5d/0x220
>> [    5.030279]  page_cache_ra_unbounded+0x131/0x180
>> [    5.030284]  filemap_get_pages+0xff/0x5a0
>> [    5.030292]  filemap_read+0xca/0x320
>> [    5.030296]  ? aa_file_perm+0x126/0x500
>> [    5.040216]  ? touch_atime+0xc8/0x150
>> [    5.040224]  blkdev_read_iter+0xb0/0x150
>> [    5.040228]  vfs_read+0x226/0x2d0
>> [    5.040234]  ksys_read+0xa5/0xe0
>> [    5.040238]  do_syscall_64+0x5b/0x80
>>
>> Maybe we should consider this patch:
> 
> As willy said this should be taken care of by the i_size check.
> Did you run with just this patch set or some of the large block
> size experiments on top which might change the variables?
> 
> I'll repost the series today without any chances in the area, and
> if you can reproduce it with just that series we need to root
> cause it, so please send your kernel and VM config along for the
> next report.

I _think_ it's been resolve now; I've rewritten my patchset (and the 
patches where it's based upon) several times now, so it might be a stale 
issue now.

Eagerly awaiting your patchset.

Cheers,

Hannes
diff mbox series

Patch

diff --git a/block/Kconfig b/block/Kconfig
index 941b2dca70db73..672b08f0096ab4 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,6 +5,7 @@ 
 menuconfig BLOCK
        bool "Enable the block layer" if EXPERT
        default y
+       select IOMAP
        select SBITMAP
        help
 	 Provide block layer support for the kernel.
diff --git a/block/fops.c b/block/fops.c
index 318247832a7bcf..7910636f8df33b 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -15,6 +15,7 @@ 
 #include <linux/falloc.h>
 #include <linux/suspend.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/module.h>
 #include "blk.h"
 
@@ -386,6 +387,27 @@  static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
 }
 
+static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+	struct block_device *bdev = I_BDEV(inode);
+	loff_t isize = i_size_read(inode);
+
+	iomap->bdev = bdev;
+	iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
+	if (WARN_ON_ONCE(iomap->offset >= isize))
+		return -EIO;
+	iomap->type = IOMAP_MAPPED;
+	iomap->addr = iomap->offset;
+	iomap->length = isize - iomap->offset;
+	iomap->flags |= IOMAP_F_BUFFER_HEAD;
+	return 0;
+}
+
+static const struct iomap_ops blkdev_iomap_ops = {
+	.iomap_begin		= blkdev_iomap_begin,
+};
+
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, blkdev_get_block, wbc);
@@ -530,6 +552,11 @@  blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	return written;
 }
 
+static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
+}
+
 /*
  * Write data to the block device.  Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
@@ -575,16 +602,14 @@  static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		iov_iter_truncate(from, size);
 	}
 
-	current->backing_dev_info = bdev->bd_disk->bdi;
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		ret = blkdev_direct_write(iocb, from);
 		if (ret >= 0 && iov_iter_count(from))
 			ret = direct_write_fallback(iocb, from, ret,
-					generic_perform_write(iocb, from));
+					blkdev_buffered_write(iocb, from));
 	} else {
-		ret = generic_perform_write(iocb, from);
+		ret = blkdev_buffered_write(iocb, from);
 	}
-	current->backing_dev_info = NULL;
 
 	if (ret > 0)
 		ret = generic_write_sync(iocb, ret);