diff mbox series

[v8,06/10] iomap: fix iomap_dio_zero() for fs bs > system page size

Message ID 20240625114420.719014-7-kernel@pankajraghav.com (mailing list archive)
State Superseded, archived
Headers show
Series enable bs > ps in XFS | expand

Commit Message

Pankaj Raghav (Samsung) June 25, 2024, 11:44 a.m. UTC
From: Pankaj Raghav <p.raghav@samsung.com>

iomap_dio_zero() will pad a fs block with zeroes if the direct IO size
< fs block size. iomap_dio_zero() has an implicit assumption that fs block
size < page_size. This is true for most filesystems at the moment.

If the block size > page size, this will send the contents of the page
next to zero page(as len > PAGE_SIZE) to the underlying block device,
causing FS corruption.

iomap is a generic infrastructure and it should not make any assumptions
about the fs block size and the page size of the system.

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
---
 fs/iomap/buffered-io.c |  4 ++--
 fs/iomap/direct-io.c   | 30 ++++++++++++++++++++++++++++--
 2 files changed, 30 insertions(+), 4 deletions(-)

Comments

Dave Chinner July 1, 2024, 2:37 a.m. UTC | #1
On Tue, Jun 25, 2024 at 11:44:16AM +0000, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
> 
> iomap_dio_zero() will pad a fs block with zeroes if the direct IO size
> < fs block size. iomap_dio_zero() has an implicit assumption that fs block
> size < page_size. This is true for most filesystems at the moment.
> 
> If the block size > page size, this will send the contents of the page
> next to zero page(as len > PAGE_SIZE) to the underlying block device,
> causing FS corruption.
> 
> iomap is a generic infrastructure and it should not make any assumptions
> about the fs block size and the page size of the system.
> 
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>

Looks fine, so:

Reviewed-by: Dave Chinner <dchinner@redhat.com>

but....

> +/*
> + * Used for sub block zeroing in iomap_dio_zero()
> + */
> +#define ZERO_PAGE_64K_SIZE (65536)
> +#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE))
> +static struct page *zero_page_64k;

.....

> @@ -753,3 +765,17 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	return iomap_dio_complete(dio);
>  }
>  EXPORT_SYMBOL_GPL(iomap_dio_rw);
> +
> +static int __init iomap_dio_init(void)
> +{
> +	zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO,
> +				    ZERO_PAGE_64K_ORDER);
> +
> +	if (!zero_page_64k)
> +		return -ENOMEM;
> +
> +	set_memory_ro((unsigned long)page_address(zero_page_64k),
> +		      1U << ZERO_PAGE_64K_ORDER);
                      ^^^^^^^^^^^^^^^^^^^^^^^^^
isn't that just ZERO_PAGE_64K_SIZE?

-Dave.
Pankaj Raghav (Samsung) July 1, 2024, 11:22 a.m. UTC | #2
On Mon, Jul 01, 2024 at 12:37:10PM +1000, Dave Chinner wrote:
> On Tue, Jun 25, 2024 at 11:44:16AM +0000, Pankaj Raghav (Samsung) wrote:
> > From: Pankaj Raghav <p.raghav@samsung.com>
> > 
> > iomap_dio_zero() will pad a fs block with zeroes if the direct IO size
> > < fs block size. iomap_dio_zero() has an implicit assumption that fs block
> > size < page_size. This is true for most filesystems at the moment.
> > 
> > If the block size > page size, this will send the contents of the page
> > next to zero page(as len > PAGE_SIZE) to the underlying block device,
> > causing FS corruption.
> > 
> > iomap is a generic infrastructure and it should not make any assumptions
> > about the fs block size and the page size of the system.
> > 
> > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> > Reviewed-by: Hannes Reinecke <hare@suse.de>
> 
> Looks fine, so:
> 
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
Thanks!
> 
> but....
> 
> > +
> > +	if (!zero_page_64k)
> > +		return -ENOMEM;
> > +
> > +	set_memory_ro((unsigned long)page_address(zero_page_64k),
> > +		      1U << ZERO_PAGE_64K_ORDER);
>                       ^^^^^^^^^^^^^^^^^^^^^^^^^
> isn't that just ZERO_PAGE_64K_SIZE?

Nope, set_memory_ro takes numbers of pages and not size in bytes :)

> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Darrick J. Wong July 1, 2024, 11:40 p.m. UTC | #3
On Tue, Jun 25, 2024 at 11:44:16AM +0000, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
> 
> iomap_dio_zero() will pad a fs block with zeroes if the direct IO size
> < fs block size. iomap_dio_zero() has an implicit assumption that fs block
> size < page_size. This is true for most filesystems at the moment.
> 
> If the block size > page size, this will send the contents of the page
> next to zero page(as len > PAGE_SIZE) to the underlying block device,
> causing FS corruption.
> 
> iomap is a generic infrastructure and it should not make any assumptions
> about the fs block size and the page size of the system.
> 
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>

Looks good to me,
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/iomap/buffered-io.c |  4 ++--
>  fs/iomap/direct-io.c   | 30 ++++++++++++++++++++++++++++--
>  2 files changed, 30 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index f420c53d86ac..9a9e94c7ed1d 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -2007,10 +2007,10 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
>  }
>  EXPORT_SYMBOL_GPL(iomap_writepages);
>  
> -static int __init iomap_init(void)
> +static int __init iomap_pagecache_init(void)
>  {
>  	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
>  			   offsetof(struct iomap_ioend, io_bio),
>  			   BIOSET_NEED_BVECS);
>  }
> -fs_initcall(iomap_init);
> +fs_initcall(iomap_pagecache_init);
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index f3b43d223a46..61d09d2364f7 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -11,6 +11,7 @@
>  #include <linux/iomap.h>
>  #include <linux/backing-dev.h>
>  #include <linux/uio.h>
> +#include <linux/set_memory.h>
>  #include <linux/task_io_accounting_ops.h>
>  #include "trace.h"
>  
> @@ -27,6 +28,13 @@
>  #define IOMAP_DIO_WRITE		(1U << 30)
>  #define IOMAP_DIO_DIRTY		(1U << 31)
>  
> +/*
> + * Used for sub block zeroing in iomap_dio_zero()
> + */
> +#define ZERO_PAGE_64K_SIZE (65536)
> +#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE))
> +static struct page *zero_page_64k;
> +
>  struct iomap_dio {
>  	struct kiocb		*iocb;
>  	const struct iomap_dio_ops *dops;
> @@ -236,9 +244,13 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
>  		loff_t pos, unsigned len)
>  {
>  	struct inode *inode = file_inode(dio->iocb->ki_filp);
> -	struct page *page = ZERO_PAGE(0);
>  	struct bio *bio;
>  
> +	/*
> +	 * Max block size supported is 64k
> +	 */
> +	WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);
> +
>  	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
>  	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
>  				  GFP_KERNEL);
> @@ -246,7 +258,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
>  	bio->bi_private = dio;
>  	bio->bi_end_io = iomap_dio_bio_end_io;
>  
> -	__bio_add_page(bio, page, len, 0);
> +	__bio_add_page(bio, zero_page_64k, len, 0);
>  	iomap_dio_submit_bio(iter, dio, bio, pos);
>  }
>  
> @@ -753,3 +765,17 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	return iomap_dio_complete(dio);
>  }
>  EXPORT_SYMBOL_GPL(iomap_dio_rw);
> +
> +static int __init iomap_dio_init(void)
> +{
> +	zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO,
> +				    ZERO_PAGE_64K_ORDER);
> +
> +	if (!zero_page_64k)
> +		return -ENOMEM;
> +
> +	set_memory_ro((unsigned long)page_address(zero_page_64k),
> +		      1U << ZERO_PAGE_64K_ORDER);
> +	return 0;
> +}
> +fs_initcall(iomap_dio_init);
> -- 
> 2.44.1
> 
>
Christoph Hellwig July 2, 2024, 7:42 a.m. UTC | #4
On Tue, Jun 25, 2024 at 11:44:16AM +0000, Pankaj Raghav (Samsung) wrote:
> -static int __init iomap_init(void)
> +static int __init iomap_pagecache_init(void)
>  {
>  	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
>  			   offsetof(struct iomap_ioend, io_bio),
>  			   BIOSET_NEED_BVECS);
>  }
> -fs_initcall(iomap_init);
> +fs_initcall(iomap_pagecache_init);

s/iomap_pagecache_init/iomap_buffered_init/

We don't use pagecache naming anywhere else in the file.

> +/*
> + * Used for sub block zeroing in iomap_dio_zero()
> + */
> +#define ZERO_PAGE_64K_SIZE (65536)

just use SZ_64K

> +#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE))

No really point in having this.

> +static struct page *zero_page_64k;

This should be a folio.  Encoding the size in the name is also really
weird and just creates churn when we have to increase it.


> +	/*
> +	 * Max block size supported is 64k
> +	 */
> +	WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);


A WARN_ON without actually erroring out here is highly dangerous. 

> +
>  	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);

Overly long line here.

> +
> +static int __init iomap_dio_init(void)
> +{
> +	zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO,
> +				    ZERO_PAGE_64K_ORDER);

> +
> +	if (!zero_page_64k)
> +		return -ENOMEM;
> +
> +	set_memory_ro((unsigned long)page_address(zero_page_64k),
> +		      1U << ZERO_PAGE_64K_ORDER);

What's the point of the set_memory_ro here?  Yes, we won't write to
it, but it's hardly an attack vector and fragments the direct map.
Pankaj Raghav (Samsung) July 2, 2024, 10:15 a.m. UTC | #5
> > +fs_initcall(iomap_pagecache_init);
> 
> s/iomap_pagecache_init/iomap_buffered_init/
> 
> We don't use pagecache naming anywhere else in the file.

Got it.
> 
> > +/*
> > + * Used for sub block zeroing in iomap_dio_zero()
> > + */
> > +#define ZERO_PAGE_64K_SIZE (65536)
> 
> just use SZ_64K
> 
> > +#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE))
> 
> No really point in having this.

Hmm, I used it twice, hence the define. But if we decide to get rid of
set_memory_ro(), then this does not make sense.

> 
> > +static struct page *zero_page_64k;
> 
> This should be a folio.  Encoding the size in the name is also really
> weird and just creates churn when we have to increase it.

Willy suggested we could use raw pages as we don't need the metadata
from using a folio. [0]

> 
> 
> > +	/*
> > +	 * Max block size supported is 64k
> > +	 */
> > +	WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);
> 
> 
> A WARN_ON without actually erroring out here is highly dangerous. 

I agree but I think we decided that we are safe with 64k for now as fs 
that uses iomap will not have a block size > 64k. 

But this function needs some changes when we decide to go beyond 64k
by returning error instead of not returning anything. 
Until then WARN_ON_ONCE would be a good stop gap for people developing
the feature to go beyond 64k block size[1]. 

> 
> > +
> >  	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
> 
> Overly long line here.
> 

Not a part of my change, so I didn't bother reformatting it. :)

> > +
> > +static int __init iomap_dio_init(void)
> > +{
> > +	zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO,
> > +				    ZERO_PAGE_64K_ORDER);
> 
> > +
> > +	if (!zero_page_64k)
> > +		return -ENOMEM;
> > +
> > +	set_memory_ro((unsigned long)page_address(zero_page_64k),
> > +		      1U << ZERO_PAGE_64K_ORDER);
> 
> What's the point of the set_memory_ro here?  Yes, we won't write to
> it, but it's hardly an attack vector and fragments the direct map.

That is a good point. Darrick suggested why not add a ro tag as we don't
write to it but I did not know the consequence of direct map
fragmentation when this is added. So probably there is no value calling
set_memory_ro here.


--
Pankaj

[0] https://lore.kernel.org/linux-fsdevel/ZkT46AsZ3WghOArL@casper.infradead.org/
[1] I spent a lot of time banging my head why I was getting FS corruption
when I was doing direct io in XFS while adding LBS support before I found
the PAGE_SIZE assumption here.
Christoph Hellwig July 2, 2024, 12:02 p.m. UTC | #6
On Tue, Jul 02, 2024 at 10:15:56AM +0000, Pankaj Raghav (Samsung) wrote:
> Willy suggested we could use raw pages as we don't need the metadata
> from using a folio. [0]

Ok, that feels weird but I'll defer to his opinion in that case.

> > > +	/*
> > > +	 * Max block size supported is 64k
> > > +	 */
> > > +	WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);
> > 
> > 
> > A WARN_ON without actually erroring out here is highly dangerous. 
> 
> I agree but I think we decided that we are safe with 64k for now as fs 
> that uses iomap will not have a block size > 64k. 
> 
> But this function needs some changes when we decide to go beyond 64k
> by returning error instead of not returning anything. 
> Until then WARN_ON_ONCE would be a good stop gap for people developing
> the feature to go beyond 64k block size[1]. 

Sure, but please make it return an error and return that instead of
just warning and going beyond the allocated page.
Luis Chamberlain July 2, 2024, 1:49 p.m. UTC | #7
On Tue, Jul 02, 2024 at 10:15:56AM +0000, Pankaj Raghav (Samsung) wrote:
> > > +	set_memory_ro((unsigned long)page_address(zero_page_64k),
> > > +		      1U << ZERO_PAGE_64K_ORDER);
> > 
> > What's the point of the set_memory_ro here?  Yes, we won't write to
> > it, but it's hardly an attack vector and fragments the direct map.
> 
> That is a good point. Darrick suggested why not add a ro tag as we don't
> write to it but I did not know the consequence of direct map
> fragmentation when this is added. So probably there is no value calling
> set_memory_ro here.

Mike Rapoport already did the thankless hard work to evaluate if direct
map fragmentation is something which causes a performance issue and it
is not [0]. Either way, this is a *one* time thing, not something that
happens as often as other things which aggrevate direct map fragmentation
like eBPF, and so from my perspective there is no issue to using, if
we want set_memory_ro().

[0] https://lwn.net/Articles/931406/

  Luis
Pankaj Raghav (Samsung) July 2, 2024, 2:01 p.m. UTC | #8
> > > A WARN_ON without actually erroring out here is highly dangerous. 
> > 
> > I agree but I think we decided that we are safe with 64k for now as fs 
> > that uses iomap will not have a block size > 64k. 
> > 
> > But this function needs some changes when we decide to go beyond 64k
> > by returning error instead of not returning anything. 
> > Until then WARN_ON_ONCE would be a good stop gap for people developing
> > the feature to go beyond 64k block size[1]. 
> 
> Sure, but please make it return an error and return that instead of
> just warning and going beyond the allocated page.

Does this make sense?

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 61d09d2364f7..14be34703588 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -240,16 +240,19 @@ void iomap_dio_bio_end_io(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
 
-static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
+static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
                loff_t pos, unsigned len)
 {
        struct inode *inode = file_inode(dio->iocb->ki_filp);
        struct bio *bio;
 
+       if (!len)
+               return 0;
        /*
         * Max block size supported is 64k
         */
-       WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);
+       if (len > ZERO_PAGE_64K_SIZE)
+               return -EINVAL;
 
        bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
        fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
@@ -260,6 +263,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 
        __bio_add_page(bio, zero_page_64k, len, 0);
        iomap_dio_submit_bio(iter, dio, bio, pos);
+       return 0;
 }
 
 /*
@@ -368,8 +372,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
        if (need_zeroout) {
                /* zero out from the start of the block to the write offset */
                pad = pos & (fs_block_size - 1);
-               if (pad)
-                       iomap_dio_zero(iter, dio, pos - pad, pad);
+
+               ret = iomap_dio_zero(iter, dio, pos - pad, pad);
+               if (ret)
+                       goto out;
        }
 
        /*
@@ -443,7 +449,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
                /* zero out from the end of the write to the end of the block */
                pad = pos & (fs_block_size - 1);
                if (pad)
-                       iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
+                       ret = iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
        }
 out:
        /* Undo iter limitation to current extent */

--
Pankaj
Christoph Hellwig July 2, 2024, 3:42 p.m. UTC | #9
On Tue, Jul 02, 2024 at 02:01:23PM +0000, Pankaj Raghav (Samsung) wrote:
+static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
>                 loff_t pos, unsigned len)
>  {
>         struct inode *inode = file_inode(dio->iocb->ki_filp);
>         struct bio *bio;
>  
> +       if (!len)
> +               return 0;
>         /*
>          * Max block size supported is 64k
>          */
> -       WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);
> +       if (len > ZERO_PAGE_64K_SIZE)
> +               return -EINVAL;

The should probably be both WARN_ON_ONCE in addition to the error
return (and ZERO_PAGE_64K_SIZE really needs to go away..)

> +                       ret = iomap_dio_zero(iter, dio, pos, fs_block_size - pad);

Overly lone line here.

Otherwise this looks good.
Pankaj Raghav (Samsung) July 2, 2024, 4:13 p.m. UTC | #10
On Tue, Jul 02, 2024 at 05:42:16PM +0200, Christoph Hellwig wrote:
> On Tue, Jul 02, 2024 at 02:01:23PM +0000, Pankaj Raghav (Samsung) wrote:
> +static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
> >                 loff_t pos, unsigned len)
> >  {
> >         struct inode *inode = file_inode(dio->iocb->ki_filp);
> >         struct bio *bio;
> >  
> > +       if (!len)
> > +               return 0;
> >         /*
> >          * Max block size supported is 64k
> >          */
> > -       WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);
> > +       if (len > ZERO_PAGE_64K_SIZE)
> > +               return -EINVAL;
> 
> The should probably be both WARN_ON_ONCE in addition to the error
> return (and ZERO_PAGE_64K_SIZE really needs to go away..)

Yes, I will rename it to ZERO_PAGE_SZ_64K as you suggested.
> 
> > +                       ret = iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
> 
> Overly lone line here.
> 
> Otherwise this looks good.
>
Matthew Wilcox July 2, 2024, 4:50 p.m. UTC | #11
On Tue, Jul 02, 2024 at 02:02:50PM +0200, Christoph Hellwig wrote:
> On Tue, Jul 02, 2024 at 10:15:56AM +0000, Pankaj Raghav (Samsung) wrote:
> > Willy suggested we could use raw pages as we don't need the metadata
> > from using a folio. [0]
> 
> Ok, that feels weird but I'll defer to his opinion in that case.

Let me see if I can make you feel less weird about it, since I think
this is something that people should have a clear feeling about.

In the Glorious Future, when we've separated pages and folios from each
other, folios are conceptually memory that gets mapped to userspace.
They have refcounts, mapcounts, a pointer to a file's mapping or an anon
vma's anon_vma, an index within that object, an LRU list, a dirty flag,
a lock bit, and so on.

We don't need any of that here.  We might choose to use a special memdesc
for accounting purposes, but there's no need to allocate a folio for it.
For now, leaving it as a plain allocation of pages seems like the smartest
option, and we can revisit in the future.
Matthew Wilcox July 2, 2024, 4:51 p.m. UTC | #12
On Tue, Jul 02, 2024 at 04:13:29PM +0000, Pankaj Raghav (Samsung) wrote:
> On Tue, Jul 02, 2024 at 05:42:16PM +0200, Christoph Hellwig wrote:
> > On Tue, Jul 02, 2024 at 02:01:23PM +0000, Pankaj Raghav (Samsung) wrote:
> > +static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
> > >                 loff_t pos, unsigned len)
> > >  {
> > >         struct inode *inode = file_inode(dio->iocb->ki_filp);
> > >         struct bio *bio;
> > >  
> > > +       if (!len)
> > > +               return 0;
> > >         /*
> > >          * Max block size supported is 64k
> > >          */
> > > -       WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);
> > > +       if (len > ZERO_PAGE_64K_SIZE)
> > > +               return -EINVAL;
> > 
> > The should probably be both WARN_ON_ONCE in addition to the error
> > return (and ZERO_PAGE_64K_SIZE really needs to go away..)
> 
> Yes, I will rename it to ZERO_PAGE_SZ_64K as you suggested.

No.  It needs a symbolic name that doesn't include the actual size.
Maybe ZERO_PAGE_IO_MAX.  Christoph suggested using SZ_64K to define
it, not to include it in the name.
Pankaj Raghav (Samsung) July 2, 2024, 5:10 p.m. UTC | #13
On Tue, Jul 02, 2024 at 05:51:54PM +0100, Matthew Wilcox wrote:
> On Tue, Jul 02, 2024 at 04:13:29PM +0000, Pankaj Raghav (Samsung) wrote:
> > On Tue, Jul 02, 2024 at 05:42:16PM +0200, Christoph Hellwig wrote:
> > > On Tue, Jul 02, 2024 at 02:01:23PM +0000, Pankaj Raghav (Samsung) wrote:
> > > +static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
> > > >                 loff_t pos, unsigned len)
> > > >  {
> > > >         struct inode *inode = file_inode(dio->iocb->ki_filp);
> > > >         struct bio *bio;
> > > >  
> > > > +       if (!len)
> > > > +               return 0;
> > > >         /*
> > > >          * Max block size supported is 64k
> > > >          */
> > > > -       WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);
> > > > +       if (len > ZERO_PAGE_64K_SIZE)
> > > > +               return -EINVAL;
> > > 
> > > The should probably be both WARN_ON_ONCE in addition to the error
> > > return (and ZERO_PAGE_64K_SIZE really needs to go away..)
> > 
> > Yes, I will rename it to ZERO_PAGE_SZ_64K as you suggested.
> 
> No.  It needs a symbolic name that doesn't include the actual size.
> Maybe ZERO_PAGE_IO_MAX.  Christoph suggested using SZ_64K to define
> it, not to include it in the name.

Initially I kept the name as ZERO_FSB_PAGE as it is used to do sub-block
zeroing. But I know John from Oracle is already working on using it for
rt extent zeroing. So I will just go with ZERO_PAGE_IO_MAX for now.
Understood about the SZ_64K part. Thanks for the clarification.
Christoph Hellwig July 3, 2024, 5:16 a.m. UTC | #14
On Tue, Jul 02, 2024 at 05:10:14PM +0000, Pankaj Raghav (Samsung) wrote:
> > > Yes, I will rename it to ZERO_PAGE_SZ_64K as you suggested.
> > 
> > No.  It needs a symbolic name that doesn't include the actual size.
> > Maybe ZERO_PAGE_IO_MAX.  Christoph suggested using SZ_64K to define
> > it, not to include it in the name.
> 
> Initially I kept the name as ZERO_FSB_PAGE as it is used to do sub-block
> zeroing. But I know John from Oracle is already working on using it for
> rt extent zeroing. So I will just go with ZERO_PAGE_IO_MAX for now.
> Understood about the SZ_64K part. Thanks for the clarification.

IOMAP_ZERO_PAGE_SIZE ?

(I kind regret not going for just adding zero page as originally
suggested, but then we'd keep arguing about the nr_vecs sizing and
naming :))
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f420c53d86ac..9a9e94c7ed1d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -2007,10 +2007,10 @@  iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 }
 EXPORT_SYMBOL_GPL(iomap_writepages);
 
-static int __init iomap_init(void)
+static int __init iomap_pagecache_init(void)
 {
 	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
 			   offsetof(struct iomap_ioend, io_bio),
 			   BIOSET_NEED_BVECS);
 }
-fs_initcall(iomap_init);
+fs_initcall(iomap_pagecache_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f3b43d223a46..61d09d2364f7 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -11,6 +11,7 @@ 
 #include <linux/iomap.h>
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
+#include <linux/set_memory.h>
 #include <linux/task_io_accounting_ops.h>
 #include "trace.h"
 
@@ -27,6 +28,13 @@ 
 #define IOMAP_DIO_WRITE		(1U << 30)
 #define IOMAP_DIO_DIRTY		(1U << 31)
 
+/*
+ * Used for sub block zeroing in iomap_dio_zero()
+ */
+#define ZERO_PAGE_64K_SIZE (65536)
+#define ZERO_PAGE_64K_ORDER (get_order(ZERO_PAGE_64K_SIZE))
+static struct page *zero_page_64k;
+
 struct iomap_dio {
 	struct kiocb		*iocb;
 	const struct iomap_dio_ops *dops;
@@ -236,9 +244,13 @@  static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 		loff_t pos, unsigned len)
 {
 	struct inode *inode = file_inode(dio->iocb->ki_filp);
-	struct page *page = ZERO_PAGE(0);
 	struct bio *bio;
 
+	/*
+	 * Max block size supported is 64k
+	 */
+	WARN_ON_ONCE(len > ZERO_PAGE_64K_SIZE);
+
 	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
 	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
 				  GFP_KERNEL);
@@ -246,7 +258,7 @@  static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 	bio->bi_private = dio;
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
-	__bio_add_page(bio, page, len, 0);
+	__bio_add_page(bio, zero_page_64k, len, 0);
 	iomap_dio_submit_bio(iter, dio, bio, pos);
 }
 
@@ -753,3 +765,17 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	return iomap_dio_complete(dio);
 }
 EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+static int __init iomap_dio_init(void)
+{
+	zero_page_64k = alloc_pages(GFP_KERNEL | __GFP_ZERO,
+				    ZERO_PAGE_64K_ORDER);
+
+	if (!zero_page_64k)
+		return -ENOMEM;
+
+	set_memory_ro((unsigned long)page_address(zero_page_64k),
+		      1U << ZERO_PAGE_64K_ORDER);
+	return 0;
+}
+fs_initcall(iomap_dio_init);