Message ID | 20240625114420.719014-7-kernel@pankajraghav.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | enable bs > ps in XFS | expand |
On Tue, Jun 25, 2024 at 11:44:16AM +0000, Pankaj Raghav (Samsung) wrote: > From: Pankaj Raghav <p.raghav@samsung.com> > > iomap_dio_zero() will pad a fs block with zeroes if the direct IO size > < fs block size. iomap_dio_zero() has an implicit assumption that fs block > size < page_size. This is true for most filesystems at the moment. > > If the block size > page size, this will send the contents of the page > next to zero page(as len > PAGE_SIZE) to the underlying block device, > causing FS corruption. > > iomap is a generic infrastructure and it should not make any assumptions > about the fs block size and the page size of the system. > > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> > Reviewed-by: Hannes Reinecke <hare@suse.de> Looks fine, so: Reviewed-by: Dave Chinner <dchinner@redhat.com> but.... > +/* > + * Used for sub block zeroing in iomap_dio_zero() > + */ > +#define ZERO_PAGE_64K_SIZE (65536) > +#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE)) > +static struct page *zero_page_64k; ..... > @@ -753,3 +765,17 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > return iomap_dio_complete(dio); > } > EXPORT_SYMBOL_GPL(iomap_dio_rw); > + > +static int __init iomap_dio_init(void) > +{ > + zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO, > + ZERO_PAGE_64K_ORDER); > + > + if (!zero_page_64k) > + return -ENOMEM; > + > + set_memory_ro((unsigned long)page_address(zero_page_64k), > + 1U << ZERO_PAGE_64K_ORDER); ^^^^^^^^^^^^^^^^^^^^^^^^^ isn't that just ZERO_PAGE_64K_SIZE? -Dave.
On Mon, Jul 01, 2024 at 12:37:10PM +1000, Dave Chinner wrote: > On Tue, Jun 25, 2024 at 11:44:16AM +0000, Pankaj Raghav (Samsung) wrote: > > From: Pankaj Raghav <p.raghav@samsung.com> > > > > iomap_dio_zero() will pad a fs block with zeroes if the direct IO size > > < fs block size. iomap_dio_zero() has an implicit assumption that fs block > > size < page_size. This is true for most filesystems at the moment. > > > > If the block size > page size, this will send the contents of the page > > next to zero page(as len > PAGE_SIZE) to the underlying block device, > > causing FS corruption. > > > > iomap is a generic infrastructure and it should not make any assumptions > > about the fs block size and the page size of the system. > > > > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> > > Reviewed-by: Hannes Reinecke <hare@suse.de> > > Looks fine, so: > > Reviewed-by: Dave Chinner <dchinner@redhat.com> Thanks! > > but.... > > > + > > + if (!zero_page_64k) > > + return -ENOMEM; > > + > > + set_memory_ro((unsigned long)page_address(zero_page_64k), > > + 1U << ZERO_PAGE_64K_ORDER); > ^^^^^^^^^^^^^^^^^^^^^^^^^ > isn't that just ZERO_PAGE_64K_SIZE? Nope, set_memory_ro takes numbers of pages and not size in bytes :) > > -Dave. > -- > Dave Chinner > david@fromorbit.com
On Tue, Jun 25, 2024 at 11:44:16AM +0000, Pankaj Raghav (Samsung) wrote: > From: Pankaj Raghav <p.raghav@samsung.com> > > iomap_dio_zero() will pad a fs block with zeroes if the direct IO size > < fs block size. iomap_dio_zero() has an implicit assumption that fs block > size < page_size. This is true for most filesystems at the moment. > > If the block size > page size, this will send the contents of the page > next to zero page(as len > PAGE_SIZE) to the underlying block device, > causing FS corruption. > > iomap is a generic infrastructure and it should not make any assumptions > about the fs block size and the page size of the system. > > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> > Reviewed-by: Hannes Reinecke <hare@suse.de> Looks good to me, Reviewed-by: Darrick J. Wong <djwong@kernel.org> --D > --- > fs/iomap/buffered-io.c | 4 ++-- > fs/iomap/direct-io.c | 30 ++++++++++++++++++++++++++++-- > 2 files changed, 30 insertions(+), 4 deletions(-) > > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c > index f420c53d86ac..9a9e94c7ed1d 100644 > --- a/fs/iomap/buffered-io.c > +++ b/fs/iomap/buffered-io.c > @@ -2007,10 +2007,10 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, > } > EXPORT_SYMBOL_GPL(iomap_writepages); > > -static int __init iomap_init(void) > +static int __init iomap_pagecache_init(void) > { > return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), > offsetof(struct iomap_ioend, io_bio), > BIOSET_NEED_BVECS); > } > -fs_initcall(iomap_init); > +fs_initcall(iomap_pagecache_init); > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c > index f3b43d223a46..61d09d2364f7 100644 > --- a/fs/iomap/direct-io.c > +++ b/fs/iomap/direct-io.c > @@ -11,6 +11,7 @@ > #include <linux/iomap.h> > #include <linux/backing-dev.h> > #include <linux/uio.h> > +#include <linux/set_memory.h> > #include <linux/task_io_accounting_ops.h> > #include "trace.h" > > @@ -27,6 +28,13 @@ > #define IOMAP_DIO_WRITE (1U << 30) > #define IOMAP_DIO_DIRTY (1U << 31) > > +/* > + * Used for sub block zeroing in iomap_dio_zero() > + */ > +#define ZERO_PAGE_64K_SIZE (65536) > +#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE)) > +static struct page *zero_page_64k; > + > struct iomap_dio { > struct kiocb *iocb; > const struct iomap_dio_ops *dops; > @@ -236,9 +244,13 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, > loff_t pos, unsigned len) > { > struct inode *inode = file_inode(dio->iocb->ki_filp); > - struct page *page = ZERO_PAGE(0); > struct bio *bio; > > + /* > + * Max block size supported is 64k > + */ > + WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); > + > bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); > fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, > GFP_KERNEL); > @@ -246,7 +258,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, > bio->bi_private = dio; > bio->bi_end_io = iomap_dio_bio_end_io; > > - __bio_add_page(bio, page, len, 0); > + __bio_add_page(bio, zero_page_64k, len, 0); > iomap_dio_submit_bio(iter, dio, bio, pos); > } > > @@ -753,3 +765,17 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > return iomap_dio_complete(dio); > } > EXPORT_SYMBOL_GPL(iomap_dio_rw); > + > +static int __init iomap_dio_init(void) > +{ > + zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO, > + ZERO_PAGE_64K_ORDER); > + > + if (!zero_page_64k) > + return -ENOMEM; > + > + set_memory_ro((unsigned long)page_address(zero_page_64k), > + 1U << ZERO_PAGE_64K_ORDER); > + return 0; > +} > +fs_initcall(iomap_dio_init); > -- > 2.44.1 > >
On Tue, Jun 25, 2024 at 11:44:16AM +0000, Pankaj Raghav (Samsung) wrote: > -static int __init iomap_init(void) > +static int __init iomap_pagecache_init(void) > { > return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), > offsetof(struct iomap_ioend, io_bio), > BIOSET_NEED_BVECS); > } > -fs_initcall(iomap_init); > +fs_initcall(iomap_pagecache_init); s/iomap_pagecache_init/iomap_buffered_init/ We don't use pagecache naming anywhere else in the file. > +/* > + * Used for sub block zeroing in iomap_dio_zero() > + */ > +#define ZERO_PAGE_64K_SIZE (65536) just use SZ_64K > +#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE)) No really point in having this. > +static struct page *zero_page_64k; This should be a folio. Encoding the size in the name is also really weird and just creates churn when we have to increase it. > + /* > + * Max block size supported is 64k > + */ > + WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); A WARN_ON without actually erroring out here is highly dangerous. > + > bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); Overly long line here. > + > +static int __init iomap_dio_init(void) > +{ > + zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO, > + ZERO_PAGE_64K_ORDER); > + > + if (!zero_page_64k) > + return -ENOMEM; > + > + set_memory_ro((unsigned long)page_address(zero_page_64k), > + 1U << ZERO_PAGE_64K_ORDER); What's the point of the set_memory_ro here? Yes, we won't write to it, but it's hardly an attack vector and fragments the direct map.
> > +fs_initcall(iomap_pagecache_init); > > s/iomap_pagecache_init/iomap_buffered_init/ > > We don't use pagecache naming anywhere else in the file. Got it. > > > +/* > > + * Used for sub block zeroing in iomap_dio_zero() > > + */ > > +#define ZERO_PAGE_64K_SIZE (65536) > > just use SZ_64K > > > +#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE)) > > No really point in having this. Hmm, I used it twice, hence the define. But if we decide to get rid of set_memory_ro(), then this does not make sense. > > > +static struct page *zero_page_64k; > > This should be a folio. Encoding the size in the name is also really > weird and just creates churn when we have to increase it. Willy suggested we could use raw pages as we don't need the metadata from using a folio. [0] > > > > + /* > > + * Max block size supported is 64k > > + */ > > + WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); > > > A WARN_ON without actually erroring out here is highly dangerous. I agree but I think we decided that we are safe with 64k for now as fs that uses iomap will not have a block size > 64k. But this function needs some changes when we decide to go beyond 64k by returning error instead of not returning anything. Until then WARN_ON_ONCE would be a good stop gap for people developing the feature to go beyond 64k block size[1]. > > > + > > bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); > > Overly long line here. > Not a part of my change, so I didn't bother reformatting it. :) > > + > > +static int __init iomap_dio_init(void) > > +{ > > + zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO, > > + ZERO_PAGE_64K_ORDER); > > > + > > + if (!zero_page_64k) > > + return -ENOMEM; > > + > > + set_memory_ro((unsigned long)page_address(zero_page_64k), > > + 1U << ZERO_PAGE_64K_ORDER); > > What's the point of the set_memory_ro here? Yes, we won't write to > it, but it's hardly an attack vector and fragments the direct map. That is a good point. Darrick suggested why not add a ro tag as we don't write to it but I did not know the consequence of direct map fragmentation when this is added. So probably there is no value calling set_memory_ro here. -- Pankaj [0] https://lore.kernel.org/linux-fsdevel/ZkT46AsZ3WghOArL@casper.infradead.org/ [1] I spent a lot of time banging my head why I was getting FS corruption when I was doing direct io in XFS while adding LBS support before I found the PAGE_SIZE assumption here.
On Tue, Jul 02, 2024 at 10:15:56AM +0000, Pankaj Raghav (Samsung) wrote: > Willy suggested we could use raw pages as we don't need the metadata > from using a folio. [0] Ok, that feels weird but I'll defer to his opinion in that case. > > > + /* > > > + * Max block size supported is 64k > > > + */ > > > + WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); > > > > > > A WARN_ON without actually erroring out here is highly dangerous. > > I agree but I think we decided that we are safe with 64k for now as fs > that uses iomap will not have a block size > 64k. > > But this function needs some changes when we decide to go beyond 64k > by returning error instead of not returning anything. > Until then WARN_ON_ONCE would be a good stop gap for people developing > the feature to go beyond 64k block size[1]. Sure, but please make it return an error and return that instead of just warning and going beyond the allocated page.
On Tue, Jul 02, 2024 at 10:15:56AM +0000, Pankaj Raghav (Samsung) wrote: > > > + set_memory_ro((unsigned long)page_address(zero_page_64k), > > > + 1U << ZERO_PAGE_64K_ORDER); > > > > What's the point of the set_memory_ro here? Yes, we won't write to > > it, but it's hardly an attack vector and fragments the direct map. > > That is a good point. Darrick suggested why not add a ro tag as we don't > write to it but I did not know the consequence of direct map > fragmentation when this is added. So probably there is no value calling > set_memory_ro here. Mike Rapoport already did the thankless hard work to evaluate if direct map fragmentation is something which causes a performance issue and it is not [0]. Either way, this is a *one* time thing, not something that happens as often as other things which aggrevate direct map fragmentation like eBPF, and so from my perspective there is no issue to using, if we want set_memory_ro(). [0] https://lwn.net/Articles/931406/ Luis
> > > A WARN_ON without actually erroring out here is highly dangerous. > > > > I agree but I think we decided that we are safe with 64k for now as fs > > that uses iomap will not have a block size > 64k. > > > > But this function needs some changes when we decide to go beyond 64k > > by returning error instead of not returning anything. > > Until then WARN_ON_ONCE would be a good stop gap for people developing > > the feature to go beyond 64k block size[1]. > > Sure, but please make it return an error and return that instead of > just warning and going beyond the allocated page. Does this make sense? diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 61d09d2364f7..14be34703588 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -240,16 +240,19 @@ void iomap_dio_bio_end_io(struct bio *bio) } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); -static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, +static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { struct inode *inode = file_inode(dio->iocb->ki_filp); struct bio *bio; + if (!len) + return 0; /* * Max block size supported is 64k */ - WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); + if (len > ZERO_PAGE_64K_SIZE) + return -EINVAL; bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, @@ -260,6 +263,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, __bio_add_page(bio, zero_page_64k, len, 0); iomap_dio_submit_bio(iter, dio, bio, pos); + return 0; } /* @@ -368,8 +372,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, if (need_zeroout) { /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); - if (pad) - iomap_dio_zero(iter, dio, pos - pad, pad); + + ret = iomap_dio_zero(iter, dio, pos - pad, pad); + if (ret) + goto out; } /* @@ -443,7 +449,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); if (pad) - iomap_dio_zero(iter, dio, pos, fs_block_size - pad); + ret = iomap_dio_zero(iter, dio, pos, fs_block_size - pad); } out: /* Undo iter limitation to current extent */ -- Pankaj
On Tue, Jul 02, 2024 at 02:01:23PM +0000, Pankaj Raghav (Samsung) wrote: +static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, > loff_t pos, unsigned len) > { > struct inode *inode = file_inode(dio->iocb->ki_filp); > struct bio *bio; > > + if (!len) > + return 0; > /* > * Max block size supported is 64k > */ > - WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); > + if (len > ZERO_PAGE_64K_SIZE) > + return -EINVAL; The should probably be both WARN_ON_ONCE in addition to the error return (and ZERO_PAGE_64K_SIZE really needs to go away..) > + ret = iomap_dio_zero(iter, dio, pos, fs_block_size - pad); Overly lone line here. Otherwise this looks good.
On Tue, Jul 02, 2024 at 05:42:16PM +0200, Christoph Hellwig wrote: > On Tue, Jul 02, 2024 at 02:01:23PM +0000, Pankaj Raghav (Samsung) wrote: > +static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, > > loff_t pos, unsigned len) > > { > > struct inode *inode = file_inode(dio->iocb->ki_filp); > > struct bio *bio; > > > > + if (!len) > > + return 0; > > /* > > * Max block size supported is 64k > > */ > > - WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); > > + if (len > ZERO_PAGE_64K_SIZE) > > + return -EINVAL; > > The should probably be both WARN_ON_ONCE in addition to the error > return (and ZERO_PAGE_64K_SIZE really needs to go away..) Yes, I will rename it to ZERO_PAGE_SZ_64K as you suggested. > > > + ret = iomap_dio_zero(iter, dio, pos, fs_block_size - pad); > > Overly lone line here. > > Otherwise this looks good. >
On Tue, Jul 02, 2024 at 02:02:50PM +0200, Christoph Hellwig wrote: > On Tue, Jul 02, 2024 at 10:15:56AM +0000, Pankaj Raghav (Samsung) wrote: > > Willy suggested we could use raw pages as we don't need the metadata > > from using a folio. [0] > > Ok, that feels weird but I'll defer to his opinion in that case. Let me see if I can make you feel less weird about it, since I think this is something that people should have a clear feeling about. In the Glorious Future, when we've separated pages and folios from each other, folios are conceptually memory that gets mapped to userspace. They have refcounts, mapcounts, a pointer to a file's mapping or an anon vma's anon_vma, an index within that object, an LRU list, a dirty flag, a lock bit, and so on. We don't need any of that here. We might choose to use a special memdesc for accounting purposes, but there's no need to allocate a folio for it. For now, leaving it as a plain allocation of pages seems like the smartest option, and we can revisit in the future.
On Tue, Jul 02, 2024 at 04:13:29PM +0000, Pankaj Raghav (Samsung) wrote: > On Tue, Jul 02, 2024 at 05:42:16PM +0200, Christoph Hellwig wrote: > > On Tue, Jul 02, 2024 at 02:01:23PM +0000, Pankaj Raghav (Samsung) wrote: > > +static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, > > > loff_t pos, unsigned len) > > > { > > > struct inode *inode = file_inode(dio->iocb->ki_filp); > > > struct bio *bio; > > > > > > + if (!len) > > > + return 0; > > > /* > > > * Max block size supported is 64k > > > */ > > > - WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); > > > + if (len > ZERO_PAGE_64K_SIZE) > > > + return -EINVAL; > > > > The should probably be both WARN_ON_ONCE in addition to the error > > return (and ZERO_PAGE_64K_SIZE really needs to go away..) > > Yes, I will rename it to ZERO_PAGE_SZ_64K as you suggested. No. It needs a symbolic name that doesn't include the actual size. Maybe ZERO_PAGE_IO_MAX. Christoph suggested using SZ_64K to define it, not to include it in the name.
On Tue, Jul 02, 2024 at 05:51:54PM +0100, Matthew Wilcox wrote: > On Tue, Jul 02, 2024 at 04:13:29PM +0000, Pankaj Raghav (Samsung) wrote: > > On Tue, Jul 02, 2024 at 05:42:16PM +0200, Christoph Hellwig wrote: > > > On Tue, Jul 02, 2024 at 02:01:23PM +0000, Pankaj Raghav (Samsung) wrote: > > > +static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, > > > > loff_t pos, unsigned len) > > > > { > > > > struct inode *inode = file_inode(dio->iocb->ki_filp); > > > > struct bio *bio; > > > > > > > > + if (!len) > > > > + return 0; > > > > /* > > > > * Max block size supported is 64k > > > > */ > > > > - WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); > > > > + if (len > ZERO_PAGE_64K_SIZE) > > > > + return -EINVAL; > > > > > > The should probably be both WARN_ON_ONCE in addition to the error > > > return (and ZERO_PAGE_64K_SIZE really needs to go away..) > > > > Yes, I will rename it to ZERO_PAGE_SZ_64K as you suggested. > > No. It needs a symbolic name that doesn't include the actual size. > Maybe ZERO_PAGE_IO_MAX. Christoph suggested using SZ_64K to define > it, not to include it in the name. Initially I kept the name as ZERO_FSB_PAGE as it is used to do sub-block zeroing. But I know John from Oracle is already working on using it for rt extent zeroing. So I will just go with ZERO_PAGE_IO_MAX for now. Understood about the SZ_64K part. Thanks for the clarification.
On Tue, Jul 02, 2024 at 05:10:14PM +0000, Pankaj Raghav (Samsung) wrote: > > > Yes, I will rename it to ZERO_PAGE_SZ_64K as you suggested. > > > > No. It needs a symbolic name that doesn't include the actual size. > > Maybe ZERO_PAGE_IO_MAX. Christoph suggested using SZ_64K to define > > it, not to include it in the name. > > Initially I kept the name as ZERO_FSB_PAGE as it is used to do sub-block > zeroing. But I know John from Oracle is already working on using it for > rt extent zeroing. So I will just go with ZERO_PAGE_IO_MAX for now. > Understood about the SZ_64K part. Thanks for the clarification. IOMAP_ZERO_PAGE_SIZE ? (I kind regret not going for just adding zero page as originally suggested, but then we'd keep arguing about the nr_vecs sizing and naming :))
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f420c53d86ac..9a9e94c7ed1d 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -2007,10 +2007,10 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, } EXPORT_SYMBOL_GPL(iomap_writepages); -static int __init iomap_init(void) +static int __init iomap_pagecache_init(void) { return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), offsetof(struct iomap_ioend, io_bio), BIOSET_NEED_BVECS); } -fs_initcall(iomap_init); +fs_initcall(iomap_pagecache_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f3b43d223a46..61d09d2364f7 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -11,6 +11,7 @@ #include <linux/iomap.h> #include <linux/backing-dev.h> #include <linux/uio.h> +#include <linux/set_memory.h> #include <linux/task_io_accounting_ops.h> #include "trace.h" @@ -27,6 +28,13 @@ #define IOMAP_DIO_WRITE (1U << 30) #define IOMAP_DIO_DIRTY (1U << 31) +/* + * Used for sub block zeroing in iomap_dio_zero() + */ +#define ZERO_PAGE_64K_SIZE (65536) +#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE)) +static struct page *zero_page_64k; + struct iomap_dio { struct kiocb *iocb; const struct iomap_dio_ops *dops; @@ -236,9 +244,13 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { struct inode *inode = file_inode(dio->iocb->ki_filp); - struct page *page = ZERO_PAGE(0); struct bio *bio; + /* + * Max block size supported is 64k + */ + WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE); + bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); @@ -246,7 +258,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - __bio_add_page(bio, page, len, 0); + __bio_add_page(bio, zero_page_64k, len, 0); iomap_dio_submit_bio(iter, dio, bio, pos); } @@ -753,3 +765,17 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); + +static int __init iomap_dio_init(void) +{ + zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO, + ZERO_PAGE_64K_ORDER); + + if (!zero_page_64k) + return -ENOMEM; + + set_memory_ro((unsigned long)page_address(zero_page_64k), + 1U << ZERO_PAGE_64K_ORDER); + return 0; +} +fs_initcall(iomap_dio_init);