Message ID | 20230424054926.26927-17-hch@lst.de (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [01/17] fs: unexport buffer_check_dirty_writeback | expand |
On Mon, Apr 24, 2023 at 07:49:25AM +0200, Christoph Hellwig wrote: > Use iomap in buffer_head compat mode to write to block devices. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > block/Kconfig | 1 + > block/fops.c | 33 +++++++++++++++++++++++++++++---- > 2 files changed, 30 insertions(+), 4 deletions(-) > > diff --git a/block/Kconfig b/block/Kconfig > index 941b2dca70db73..672b08f0096ab4 100644 > --- a/block/Kconfig > +++ b/block/Kconfig > @@ -5,6 +5,7 @@ > menuconfig BLOCK > bool "Enable the block layer" if EXPERT > default y > + select IOMAP This needs to be FS_IOMAP. > select SBITMAP > help > Provide block layer support for the kernel.
On 4/24/23 07:49, Christoph Hellwig wrote: > Use iomap in buffer_head compat mode to write to block devices. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > block/Kconfig | 1 + > block/fops.c | 33 +++++++++++++++++++++++++++++---- > 2 files changed, 30 insertions(+), 4 deletions(-) > > diff --git a/block/Kconfig b/block/Kconfig > index 941b2dca70db73..672b08f0096ab4 100644 > --- a/block/Kconfig > +++ b/block/Kconfig > @@ -5,6 +5,7 @@ > menuconfig BLOCK > bool "Enable the block layer" if EXPERT > default y > + select IOMAP > select SBITMAP > help > Provide block layer support for the kernel. > diff --git a/block/fops.c b/block/fops.c > index 318247832a7bcf..7910636f8df33b 100644 > --- a/block/fops.c > +++ b/block/fops.c > @@ -15,6 +15,7 @@ > #include <linux/falloc.h> > #include <linux/suspend.h> > #include <linux/fs.h> > +#include <linux/iomap.h> > #include <linux/module.h> > #include "blk.h" > > @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) > return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); > } > > +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) > +{ > + struct block_device *bdev = I_BDEV(inode); > + loff_t isize = i_size_read(inode); > + > + iomap->bdev = bdev; > + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); > + if (WARN_ON_ONCE(iomap->offset >= isize)) > + return -EIO; I'm hitting this during booting: [ 5.016324] <TASK> [ 5.030256] iomap_iter+0x11a/0x350 [ 5.030264] iomap_readahead+0x1eb/0x2c0 [ 5.030272] read_pages+0x5d/0x220 [ 5.030279] page_cache_ra_unbounded+0x131/0x180 [ 5.030284] filemap_get_pages+0xff/0x5a0 [ 5.030292] filemap_read+0xca/0x320 [ 5.030296] ? aa_file_perm+0x126/0x500 [ 5.040216] ? touch_atime+0xc8/0x150 [ 5.040224] blkdev_read_iter+0xb0/0x150 [ 5.040228] vfs_read+0x226/0x2d0 [ 5.040234] ksys_read+0xa5/0xe0 [ 5.040238] do_syscall_64+0x5b/0x80 Maybe we should consider this patch: diff --git a/block/fops.c b/block/fops.c index 524b8a828aad..d202fb663f25 100644 --- a/block/fops.c +++ b/block/fops.c @@ -386,10 +386,13 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); - if (WARN_ON_ONCE(iomap->offset >= isize)) - return -EIO; - iomap->type = IOMAP_MAPPED; - iomap->addr = iomap->offset; + if (WARN_ON_ONCE(iomap->offset >= isize)) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = iomap->offset; + } iomap->length = isize - iomap->offset; if (IS_ENABLED(CONFIG_BUFFER_HEAD)) iomap->flags |= IOMAP_F_BUFFER_HEAD; Other that the the system seems fine. Cheers, Hannes
On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote: > On 4/24/23 07:49, Christoph Hellwig wrote: > > Use iomap in buffer_head compat mode to write to block devices. > > > > Signed-off-by: Christoph Hellwig <hch@lst.de> > > --- > > block/Kconfig | 1 + > > block/fops.c | 33 +++++++++++++++++++++++++++++---- > > 2 files changed, 30 insertions(+), 4 deletions(-) > > > > diff --git a/block/Kconfig b/block/Kconfig > > index 941b2dca70db73..672b08f0096ab4 100644 > > --- a/block/Kconfig > > +++ b/block/Kconfig > > @@ -5,6 +5,7 @@ > > menuconfig BLOCK > > bool "Enable the block layer" if EXPERT > > default y > > + select IOMAP > > select SBITMAP > > help > > Provide block layer support for the kernel. > > diff --git a/block/fops.c b/block/fops.c > > index 318247832a7bcf..7910636f8df33b 100644 > > --- a/block/fops.c > > +++ b/block/fops.c > > @@ -15,6 +15,7 @@ > > #include <linux/falloc.h> > > #include <linux/suspend.h> > > #include <linux/fs.h> > > +#include <linux/iomap.h> > > #include <linux/module.h> > > #include "blk.h" > > @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) > > return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); > > } > > +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > > + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) > > +{ > > + struct block_device *bdev = I_BDEV(inode); > > + loff_t isize = i_size_read(inode); > > + > > + iomap->bdev = bdev; > > + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); > > + if (WARN_ON_ONCE(iomap->offset >= isize)) > > + return -EIO; > > I'm hitting this during booting: > [ 5.016324] <TASK> > [ 5.030256] iomap_iter+0x11a/0x350 > [ 5.030264] iomap_readahead+0x1eb/0x2c0 > [ 5.030272] read_pages+0x5d/0x220 > [ 5.030279] page_cache_ra_unbounded+0x131/0x180 > [ 5.030284] filemap_get_pages+0xff/0x5a0 Why is filemap_get_pages() using unbounded readahead? Surely readahead should be limited to reading within EOF.... > [ 5.030292] filemap_read+0xca/0x320 > [ 5.030296] ? aa_file_perm+0x126/0x500 > [ 5.040216] ? touch_atime+0xc8/0x150 > [ 5.040224] blkdev_read_iter+0xb0/0x150 > [ 5.040228] vfs_read+0x226/0x2d0 > [ 5.040234] ksys_read+0xa5/0xe0 > [ 5.040238] do_syscall_64+0x5b/0x80 > > Maybe we should consider this patch: > > diff --git a/block/fops.c b/block/fops.c > index 524b8a828aad..d202fb663f25 100644 > --- a/block/fops.c > +++ b/block/fops.c > @@ -386,10 +386,13 @@ static int blkdev_iomap_begin(struct inode *inode, > loff_t offset, loff_t length, > > iomap->bdev = bdev; > iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); > - if (WARN_ON_ONCE(iomap->offset >= isize)) > - return -EIO; > - iomap->type = IOMAP_MAPPED; > - iomap->addr = iomap->offset; > + if (WARN_ON_ONCE(iomap->offset >= isize)) { > + iomap->type = IOMAP_HOLE; > + iomap->addr = IOMAP_NULL_ADDR; > + } else { > + iomap->type = IOMAP_MAPPED; > + iomap->addr = iomap->offset; > + } I think Christoph's code is correct. IMO, any attempt to read beyond the end of the device should throw out a warning and return an error, not silently return zeros. If readahead is trying to read beyond the end of the device, then it really seems to me like the problem here is readahead, not the iomap code detecting the OOB read request.... Cheers, Dave.
On Wed, May 24, 2023 at 08:27:13AM +1000, Dave Chinner wrote: > On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote: > > I'm hitting this during booting: > > [ 5.016324] <TASK> > > [ 5.030256] iomap_iter+0x11a/0x350 > > [ 5.030264] iomap_readahead+0x1eb/0x2c0 > > [ 5.030272] read_pages+0x5d/0x220 > > [ 5.030279] page_cache_ra_unbounded+0x131/0x180 > > [ 5.030284] filemap_get_pages+0xff/0x5a0 > > Why is filemap_get_pages() using unbounded readahead? Surely > readahead should be limited to reading within EOF.... It isn't using unbounded readahead; that's an artifact of this incomplete stack trace. Actual call stack: page_cache_ra_unbounded do_page_cache_ra ondemand_readahead page_cache_sync_ra page_cache_sync_readahead filemap_get_pages As you can see, do_page_cache_ra() does limit readahead to i_size. Is ractl->mapping->host the correct way to find the inode? I always get confused. > I think Christoph's code is correct. IMO, any attempt to read beyond > the end of the device should throw out a warning and return an > error, not silently return zeros. > > If readahead is trying to read beyond the end of the device, then it > really seems to me like the problem here is readahead, not the iomap > code detecting the OOB read request.... > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com
On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote: > I'm hitting this during booting: > [ 5.016324] <TASK> > [ 5.030256] iomap_iter+0x11a/0x350 > [ 5.030264] iomap_readahead+0x1eb/0x2c0 > [ 5.030272] read_pages+0x5d/0x220 > [ 5.030279] page_cache_ra_unbounded+0x131/0x180 > [ 5.030284] filemap_get_pages+0xff/0x5a0 > [ 5.030292] filemap_read+0xca/0x320 > [ 5.030296] ? aa_file_perm+0x126/0x500 > [ 5.040216] ? touch_atime+0xc8/0x150 > [ 5.040224] blkdev_read_iter+0xb0/0x150 > [ 5.040228] vfs_read+0x226/0x2d0 > [ 5.040234] ksys_read+0xa5/0xe0 > [ 5.040238] do_syscall_64+0x5b/0x80 > > Maybe we should consider this patch: As willy said this should be taken care of by the i_size check. Did you run with just this patch set or some of the large block size experiments on top which might change the variables? I'll repost the series today without any chances in the area, and if you can reproduce it with just that series we need to root cause it, so please send your kernel and VM config along for the next report.
On Wed, May 24, 2023 at 02:33:13PM +0100, Matthew Wilcox wrote: > As you can see, do_page_cache_ra() does limit readahead to i_size. > Is ractl->mapping->host the correct way to find the inode? I always > get confused. As far as I can tell it is the right inode, the indirection through file->f_mapping ensures it actually points to the backing inode.
On 7/20/23 14:06, Christoph Hellwig wrote: > On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote: >> I'm hitting this during booting: >> [ 5.016324] <TASK> >> [ 5.030256] iomap_iter+0x11a/0x350 >> [ 5.030264] iomap_readahead+0x1eb/0x2c0 >> [ 5.030272] read_pages+0x5d/0x220 >> [ 5.030279] page_cache_ra_unbounded+0x131/0x180 >> [ 5.030284] filemap_get_pages+0xff/0x5a0 >> [ 5.030292] filemap_read+0xca/0x320 >> [ 5.030296] ? aa_file_perm+0x126/0x500 >> [ 5.040216] ? touch_atime+0xc8/0x150 >> [ 5.040224] blkdev_read_iter+0xb0/0x150 >> [ 5.040228] vfs_read+0x226/0x2d0 >> [ 5.040234] ksys_read+0xa5/0xe0 >> [ 5.040238] do_syscall_64+0x5b/0x80 >> >> Maybe we should consider this patch: > > As willy said this should be taken care of by the i_size check. > Did you run with just this patch set or some of the large block > size experiments on top which might change the variables? > > I'll repost the series today without any chances in the area, and > if you can reproduce it with just that series we need to root > cause it, so please send your kernel and VM config along for the > next report. I _think_ it's been resolve now; I've rewritten my patchset (and the patches where it's based upon) several times now, so it might be a stale issue now. Eagerly awaiting your patchset. Cheers, Hannes
diff --git a/block/Kconfig b/block/Kconfig index 941b2dca70db73..672b08f0096ab4 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,6 +5,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select IOMAP select SBITMAP help Provide block layer support for the kernel. diff --git a/block/fops.c b/block/fops.c index 318247832a7bcf..7910636f8df33b 100644 --- a/block/fops.c +++ b/block/fops.c @@ -15,6 +15,7 @@ #include <linux/falloc.h> #include <linux/suspend.h> #include <linux/fs.h> +#include <linux/iomap.h> #include <linux/module.h> #include "blk.h" @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct block_device *bdev = I_BDEV(inode); + loff_t isize = i_size_read(inode); + + iomap->bdev = bdev; + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); + if (WARN_ON_ONCE(iomap->offset >= isize)) + return -EIO; + iomap->type = IOMAP_MAPPED; + iomap->addr = iomap->offset; + iomap->length = isize - iomap->offset; + iomap->flags |= IOMAP_F_BUFFER_HEAD; + return 0; +} + +static const struct iomap_ops blkdev_iomap_ops = { + .iomap_begin = blkdev_iomap_begin, +}; + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -530,6 +552,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) return written; } +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) +{ + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -575,16 +602,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, size); } - current->backing_dev_info = bdev->bd_disk->bdi; if (iocb->ki_flags & IOCB_DIRECT) { ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, - generic_perform_write(iocb, from)); + blkdev_buffered_write(iocb, from)); } else { - ret = generic_perform_write(iocb, from); + ret = blkdev_buffered_write(iocb, from); } - current->backing_dev_info = NULL; if (ret > 0) ret = generic_write_sync(iocb, ret);
Use iomap in buffer_head compat mode to write to block devices. Signed-off-by: Christoph Hellwig <hch@lst.de> --- block/Kconfig | 1 + block/fops.c | 33 +++++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-)