diff mbox series

[RFC,4/8] fs: Introduce FALLOC_FL_PROVISION

Message ID 20220915164826.1396245-5-sarthakkukreti@google.com (mailing list archive)
State New, archived
Headers show
Series Introduce provisioning primitives for thinly provisioned storage | expand

Commit Message

Sarthak Kukreti Sept. 15, 2022, 4:48 p.m. UTC
From: Sarthak Kukreti <sarthakkukreti@chromium.org>

FALLOC_FL_PROVISION is a new fallocate() allocation mode that
sends a hint to (supported) thinly provisioned block devices to
allocate space for the given range of sectors via REQ_OP_PROVISION.

Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
---
 block/fops.c                | 7 ++++++-
 include/linux/falloc.h      | 3 ++-
 include/uapi/linux/falloc.h | 8 ++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

Comments

Brian Foster Sept. 16, 2022, 11:56 a.m. UTC | #1
On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> 
> FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> sends a hint to (supported) thinly provisioned block devices to
> allocate space for the given range of sectors via REQ_OP_PROVISION.
> 
> Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
> ---
>  block/fops.c                | 7 ++++++-
>  include/linux/falloc.h      | 3 ++-
>  include/uapi/linux/falloc.h | 8 ++++++++
>  3 files changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/block/fops.c b/block/fops.c
> index b90742595317..a436a7596508 100644
> --- a/block/fops.c
> +++ b/block/fops.c
...
> @@ -661,6 +662,10 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
>  		error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
>  					     len >> SECTOR_SHIFT, GFP_KERNEL);
>  		break;
> +	case FALLOC_FL_PROVISION:
> +		error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> +					       len >> SECTOR_SHIFT, GFP_KERNEL);
> +		break;
>  	default:
>  		error = -EOPNOTSUPP;
>  	}

Hi Sarthak,

Neat mechanism.. I played with something very similar in the past (that
was much more crudely hacked up to target dm-thin) to allow filesystems
to request a thinly provisioned device to allocate blocks and try to do
a better job of avoiding inactivation when overprovisioned.

One thing I'm a little curious about here.. what's the need for a new
fallocate mode? On a cursory glance, the provision mode looks fairly
analogous to normal (mode == 0) allocation mode with the exception of
sending the request down to the bdev. blkdev_fallocate() already maps
some of the logical falloc modes (i.e. punch hole, zero range) to
sending write sames or discards, etc., and it doesn't currently look
like it supports allocation mode, so could it not map such requests to
the underlying REQ_OP_PROVISION op?

I guess the difference would be at the filesystem level where we'd
probably need to rely on a mount option or some such to control whether
traditional fallocate issues provision ops (like you've implemented for
ext4) vs. the specific falloc command, but that seems fairly consistent
with historical punch hole/discard behavior too. Hm? You might want to
cc linux-fsdevel in future posts in any event to get some more feedback
on how other filesystems might want to interact with such a thing.

BTW another thing that might be useful wrt to dm-thin is to support
FALLOC_FL_UNSHARE. I.e., it looks like the previous dm-thin patch only
checks that blocks are allocated, but not whether those blocks are
shared (re: lookup_result.shared). It might be useful to do the COW in
such cases if the caller passes down a REQ_UNSHARE or some such flag.

Brian

> diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> index f3f0b97b1675..a0e506255b20 100644
> --- a/include/linux/falloc.h
> +++ b/include/linux/falloc.h
> @@ -30,7 +30,8 @@ struct space_resv {
>  					 FALLOC_FL_COLLAPSE_RANGE |	\
>  					 FALLOC_FL_ZERO_RANGE |		\
>  					 FALLOC_FL_INSERT_RANGE |	\
> -					 FALLOC_FL_UNSHARE_RANGE)
> +					 FALLOC_FL_UNSHARE_RANGE |                          \
> +					 FALLOC_FL_PROVISION)
>  
>  /* on ia32 l_start is on a 32-bit boundary */
>  #if defined(CONFIG_X86_64)
> diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
> index 51398fa57f6c..2d323d113eed 100644
> --- a/include/uapi/linux/falloc.h
> +++ b/include/uapi/linux/falloc.h
> @@ -77,4 +77,12 @@
>   */
>  #define FALLOC_FL_UNSHARE_RANGE		0x40
>  
> +/*
> + * FALLOC_FL_PROVISION acts as a hint for thinly provisioned devices to allocate
> + * blocks for the range/EOF.
> + *
> + * FALLOC_FL_PROVISION can only be used with allocate-mode fallocate.
> + */
> +#define FALLOC_FL_PROVISION		0x80
> +
>  #endif /* _UAPI_FALLOC_H_ */
> -- 
> 2.31.0
>
Sarthak Kukreti Sept. 16, 2022, 9:02 p.m. UTC | #2
On Fri, Sep 16, 2022 at 4:56 AM Brian Foster <bfoster@redhat.com> wrote:
>
> On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> > From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> >
> > FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> > sends a hint to (supported) thinly provisioned block devices to
> > allocate space for the given range of sectors via REQ_OP_PROVISION.
> >
> > Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > ---
> >  block/fops.c                | 7 ++++++-
> >  include/linux/falloc.h      | 3 ++-
> >  include/uapi/linux/falloc.h | 8 ++++++++
> >  3 files changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/block/fops.c b/block/fops.c
> > index b90742595317..a436a7596508 100644
> > --- a/block/fops.c
> > +++ b/block/fops.c
> ...
> > @@ -661,6 +662,10 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> >               error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
> >                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> >               break;
> > +     case FALLOC_FL_PROVISION:
> > +             error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> > +                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> > +             break;
> >       default:
> >               error = -EOPNOTSUPP;
> >       }
>
> Hi Sarthak,
>
> Neat mechanism.. I played with something very similar in the past (that
> was much more crudely hacked up to target dm-thin) to allow filesystems
> to request a thinly provisioned device to allocate blocks and try to do
> a better job of avoiding inactivation when overprovisioned.
>
> One thing I'm a little curious about here.. what's the need for a new
> fallocate mode? On a cursory glance, the provision mode looks fairly
> analogous to normal (mode == 0) allocation mode with the exception of
> sending the request down to the bdev. blkdev_fallocate() already maps
> some of the logical falloc modes (i.e. punch hole, zero range) to
> sending write sames or discards, etc., and it doesn't currently look
> like it supports allocation mode, so could it not map such requests to
> the underlying REQ_OP_PROVISION op?
>
> I guess the difference would be at the filesystem level where we'd
> probably need to rely on a mount option or some such to control whether
> traditional fallocate issues provision ops (like you've implemented for
> ext4) vs. the specific falloc command, but that seems fairly consistent
> with historical punch hole/discard behavior too. Hm? You might want to
> cc linux-fsdevel in future posts in any event to get some more feedback
> on how other filesystems might want to interact with such a thing.
>
Thanks for the feedback!
Argh, I completely forgot that I should add linux-fsdevel. Let me
re-send this with linux-fsdevel cc'd

There's a slight distinction is that the current filesystem-level
controls are usually for default handling, but userspace can still
call the relevant functions manually if they need to. For example, for
ext4, the 'discard' mount option dictates whether free blocks are
discarded, but it doesn't set the policy to allow/disallow userspace
from manually punching holes into files even if the mount opt is
'nodiscard'. FALLOC_FL_PROVISION is similar in that regard; it adds a
manual mechanism for users to provision the files' extents, that is
separate from the filesystems' default handling of provisioning files.

> BTW another thing that might be useful wrt to dm-thin is to support
> FALLOC_FL_UNSHARE. I.e., it looks like the previous dm-thin patch only
> checks that blocks are allocated, but not whether those blocks are
> shared (re: lookup_result.shared). It might be useful to do the COW in
> such cases if the caller passes down a REQ_UNSHARE or some such flag.
>
That's an interesting idea! There's a few more things on the TODO list
for this patch series but I think we can follow up with a patch to
handle that as well.

Sarthak

> Brian
>
> > diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> > index f3f0b97b1675..a0e506255b20 100644
> > --- a/include/linux/falloc.h
> > +++ b/include/linux/falloc.h
> > @@ -30,7 +30,8 @@ struct space_resv {
> >                                        FALLOC_FL_COLLAPSE_RANGE |     \
> >                                        FALLOC_FL_ZERO_RANGE |         \
> >                                        FALLOC_FL_INSERT_RANGE |       \
> > -                                      FALLOC_FL_UNSHARE_RANGE)
> > +                                      FALLOC_FL_UNSHARE_RANGE |                          \
> > +                                      FALLOC_FL_PROVISION)
> >
> >  /* on ia32 l_start is on a 32-bit boundary */
> >  #if defined(CONFIG_X86_64)
> > diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
> > index 51398fa57f6c..2d323d113eed 100644
> > --- a/include/uapi/linux/falloc.h
> > +++ b/include/uapi/linux/falloc.h
> > @@ -77,4 +77,12 @@
> >   */
> >  #define FALLOC_FL_UNSHARE_RANGE              0x40
> >
> > +/*
> > + * FALLOC_FL_PROVISION acts as a hint for thinly provisioned devices to allocate
> > + * blocks for the range/EOF.
> > + *
> > + * FALLOC_FL_PROVISION can only be used with allocate-mode fallocate.
> > + */
> > +#define FALLOC_FL_PROVISION          0x80
> > +
> >  #endif /* _UAPI_FALLOC_H_ */
> > --
> > 2.31.0
> >
>
Christoph Hellwig Sept. 20, 2022, 7:49 a.m. UTC | #3
On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> 
> FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> sends a hint to (supported) thinly provisioned block devices to
> allocate space for the given range of sectors via REQ_OP_PROVISION.

So, how does that "provisioning" actually work in todays world where
storage is usually doing out of place writes in one or more layers,
including the flash storage everyone is using.  Does it give you one
write?  And unlimited number?  Some undecided number inbetween?  How
is it affected by write zeroes to that range or a discard?
Sarthak Kukreti Sept. 21, 2022, 5:54 a.m. UTC | #4
On Tue, Sep 20, 2022 at 12:49 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> > From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> >
> > FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> > sends a hint to (supported) thinly provisioned block devices to
> > allocate space for the given range of sectors via REQ_OP_PROVISION.
>
> So, how does that "provisioning" actually work in todays world where
> storage is usually doing out of place writes in one or more layers,
> including the flash storage everyone is using.  Does it give you one
> write?  And unlimited number?  Some undecided number inbetween?

Apologies, the patchset was a bit short on describing the semantics so
I'll expand more in the next revision; I'd say that it's the minimum
of regular mode fallocate() guarantees at each allocation layer. For
example, the guarantees from a contrived storage stack like (left to
right is bottom to top):

[ mmc0blkp1 | ext4(1) | sparse file | loop | dm-thinp | dm-thin | ext4(2) ]

would be predicated on the guarantees of fallocate() per allocation
layer; if ext4(1) was replaced by a filesystem that did not support
fallocate(), then there would be no guarantee that a write to a file
on ext4(2) succeeds.

For dm-thinp, in the current implementation, the provision request
allocates blocks for the range specified and adds the mapping to the
thinpool metadata. All subsequent writes are to the same block, so
you'll be able to write to the same block inifinitely. Brian mentioned
this above, one case it doesn't cover is if provision is called on a
shared block, but the natural extension would be to allocate and
assign a new block and copy the contents of the shared block (kind of
like copy-on-provision).

[reflowed]
> How is it affected by write zeroes to that range or a discard?

The current semantics of discards for dm-thinp/ext4/sparse files will
apply as they do today; discards will unmap the dm-thin block/free the
file extent. Write zeroes is more interesting; dm-thinp will treat the
command as usual. ext4_zero_range will mark the extents as unwritten,
so essentially if a user did provision + write to a block, write zeros
to the block would essentially leave it in the original provisioned
state, but ext4 would now show the contents of the block as zero on
the next read. I think, similar to above, the semantics of a request
will depend on each layer that it passes through.

Best
Sarthak
Mike Snitzer Sept. 21, 2022, 3:21 p.m. UTC | #5
On Wed, Sep 21 2022 at  1:54P -0400,
Sarthak Kukreti <sarthakkukreti@chromium.org> wrote:

> On Tue, Sep 20, 2022 at 12:49 AM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> > > From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > >
> > > FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> > > sends a hint to (supported) thinly provisioned block devices to
> > > allocate space for the given range of sectors via REQ_OP_PROVISION.
> >
> > So, how does that "provisioning" actually work in todays world where
> > storage is usually doing out of place writes in one or more layers,
> > including the flash storage everyone is using.  Does it give you one
> > write?  And unlimited number?  Some undecided number inbetween?
> 
> Apologies, the patchset was a bit short on describing the semantics so
> I'll expand more in the next revision; I'd say that it's the minimum
> of regular mode fallocate() guarantees at each allocation layer. For
> example, the guarantees from a contrived storage stack like (left to
> right is bottom to top):
> 
> [ mmc0blkp1 | ext4(1) | sparse file | loop | dm-thinp | dm-thin | ext4(2) ]
> 
> would be predicated on the guarantees of fallocate() per allocation
> layer; if ext4(1) was replaced by a filesystem that did not support
> fallocate(), then there would be no guarantee that a write to a file
> on ext4(2) succeeds.
> 
> For dm-thinp, in the current implementation, the provision request
> allocates blocks for the range specified and adds the mapping to the
> thinpool metadata. All subsequent writes are to the same block, so
> you'll be able to write to the same block inifinitely. Brian mentioned
> this above, one case it doesn't cover is if provision is called on a
> shared block, but the natural extension would be to allocate and
> assign a new block and copy the contents of the shared block (kind of
> like copy-on-provision).

It follows that ChromiumOS isn't using dm-thinp's snapshot support?

But please do fold in incremental dm-thinp support to properly handle
shared blocks (dm-thinp already handles breaking sharing, etc.. so
I'll need to see where you're hooking into that you don't get this
"for free").

Mike
Brian Foster Sept. 21, 2022, 3:39 p.m. UTC | #6
On Fri, Sep 16, 2022 at 02:02:31PM -0700, Sarthak Kukreti wrote:
> On Fri, Sep 16, 2022 at 4:56 AM Brian Foster <bfoster@redhat.com> wrote:
> >
> > On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> > > From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > >
> > > FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> > > sends a hint to (supported) thinly provisioned block devices to
> > > allocate space for the given range of sectors via REQ_OP_PROVISION.
> > >
> > > Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > > ---
> > >  block/fops.c                | 7 ++++++-
> > >  include/linux/falloc.h      | 3 ++-
> > >  include/uapi/linux/falloc.h | 8 ++++++++
> > >  3 files changed, 16 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/block/fops.c b/block/fops.c
> > > index b90742595317..a436a7596508 100644
> > > --- a/block/fops.c
> > > +++ b/block/fops.c
> > ...
> > > @@ -661,6 +662,10 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> > >               error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
> > >                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> > >               break;
> > > +     case FALLOC_FL_PROVISION:
> > > +             error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> > > +                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> > > +             break;
> > >       default:
> > >               error = -EOPNOTSUPP;
> > >       }
> >
> > Hi Sarthak,
> >
> > Neat mechanism.. I played with something very similar in the past (that
> > was much more crudely hacked up to target dm-thin) to allow filesystems
> > to request a thinly provisioned device to allocate blocks and try to do
> > a better job of avoiding inactivation when overprovisioned.
> >
> > One thing I'm a little curious about here.. what's the need for a new
> > fallocate mode? On a cursory glance, the provision mode looks fairly
> > analogous to normal (mode == 0) allocation mode with the exception of
> > sending the request down to the bdev. blkdev_fallocate() already maps
> > some of the logical falloc modes (i.e. punch hole, zero range) to
> > sending write sames or discards, etc., and it doesn't currently look
> > like it supports allocation mode, so could it not map such requests to
> > the underlying REQ_OP_PROVISION op?
> >
> > I guess the difference would be at the filesystem level where we'd
> > probably need to rely on a mount option or some such to control whether
> > traditional fallocate issues provision ops (like you've implemented for
> > ext4) vs. the specific falloc command, but that seems fairly consistent
> > with historical punch hole/discard behavior too. Hm? You might want to
> > cc linux-fsdevel in future posts in any event to get some more feedback
> > on how other filesystems might want to interact with such a thing.
> >
> Thanks for the feedback!
> Argh, I completely forgot that I should add linux-fsdevel. Let me
> re-send this with linux-fsdevel cc'd
> 
> There's a slight distinction is that the current filesystem-level
> controls are usually for default handling, but userspace can still
> call the relevant functions manually if they need to. For example, for
> ext4, the 'discard' mount option dictates whether free blocks are
> discarded, but it doesn't set the policy to allow/disallow userspace
> from manually punching holes into files even if the mount opt is
> 'nodiscard'. FALLOC_FL_PROVISION is similar in that regard; it adds a
> manual mechanism for users to provision the files' extents, that is
> separate from the filesystems' default handling of provisioning files.
> 

What I'm trying to understand is why not let blkdev_fallocate() issue a
provision based on the default mode (i.e. mode == 0) of fallocate(),
which is already defined to mean "perform allocation?" It currently
issues discards or write zeroes based on variants of
FALLOC_FL_PUNCH_HOLE without the need for a separate FALLOC_FL_DISCARD
mode, for example.

Brian

> > BTW another thing that might be useful wrt to dm-thin is to support
> > FALLOC_FL_UNSHARE. I.e., it looks like the previous dm-thin patch only
> > checks that blocks are allocated, but not whether those blocks are
> > shared (re: lookup_result.shared). It might be useful to do the COW in
> > such cases if the caller passes down a REQ_UNSHARE or some such flag.
> >
> That's an interesting idea! There's a few more things on the TODO list
> for this patch series but I think we can follow up with a patch to
> handle that as well.
> 
> Sarthak
> 
> > Brian
> >
> > > diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> > > index f3f0b97b1675..a0e506255b20 100644
> > > --- a/include/linux/falloc.h
> > > +++ b/include/linux/falloc.h
> > > @@ -30,7 +30,8 @@ struct space_resv {
> > >                                        FALLOC_FL_COLLAPSE_RANGE |     \
> > >                                        FALLOC_FL_ZERO_RANGE |         \
> > >                                        FALLOC_FL_INSERT_RANGE |       \
> > > -                                      FALLOC_FL_UNSHARE_RANGE)
> > > +                                      FALLOC_FL_UNSHARE_RANGE |                          \
> > > +                                      FALLOC_FL_PROVISION)
> > >
> > >  /* on ia32 l_start is on a 32-bit boundary */
> > >  #if defined(CONFIG_X86_64)
> > > diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
> > > index 51398fa57f6c..2d323d113eed 100644
> > > --- a/include/uapi/linux/falloc.h
> > > +++ b/include/uapi/linux/falloc.h
> > > @@ -77,4 +77,12 @@
> > >   */
> > >  #define FALLOC_FL_UNSHARE_RANGE              0x40
> > >
> > > +/*
> > > + * FALLOC_FL_PROVISION acts as a hint for thinly provisioned devices to allocate
> > > + * blocks for the range/EOF.
> > > + *
> > > + * FALLOC_FL_PROVISION can only be used with allocate-mode fallocate.
> > > + */
> > > +#define FALLOC_FL_PROVISION          0x80
> > > +
> > >  #endif /* _UAPI_FALLOC_H_ */
> > > --
> > > 2.31.0
> > >
> >
>
Sarthak Kukreti Sept. 22, 2022, 8:04 a.m. UTC | #7
On Wed, Sep 21, 2022 at 8:39 AM Brian Foster <bfoster@redhat.com> wrote:
>
> On Fri, Sep 16, 2022 at 02:02:31PM -0700, Sarthak Kukreti wrote:
> > On Fri, Sep 16, 2022 at 4:56 AM Brian Foster <bfoster@redhat.com> wrote:
> > >
> > > On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> > > > From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > > >
> > > > FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> > > > sends a hint to (supported) thinly provisioned block devices to
> > > > allocate space for the given range of sectors via REQ_OP_PROVISION.
> > > >
> > > > Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > > > ---
> > > >  block/fops.c                | 7 ++++++-
> > > >  include/linux/falloc.h      | 3 ++-
> > > >  include/uapi/linux/falloc.h | 8 ++++++++
> > > >  3 files changed, 16 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/block/fops.c b/block/fops.c
> > > > index b90742595317..a436a7596508 100644
> > > > --- a/block/fops.c
> > > > +++ b/block/fops.c
> > > ...
> > > > @@ -661,6 +662,10 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> > > >               error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
> > > >                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> > > >               break;
> > > > +     case FALLOC_FL_PROVISION:
> > > > +             error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> > > > +                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> > > > +             break;
> > > >       default:
> > > >               error = -EOPNOTSUPP;
> > > >       }
> > >
> > > Hi Sarthak,
> > >
> > > Neat mechanism.. I played with something very similar in the past (that
> > > was much more crudely hacked up to target dm-thin) to allow filesystems
> > > to request a thinly provisioned device to allocate blocks and try to do
> > > a better job of avoiding inactivation when overprovisioned.
> > >
> > > One thing I'm a little curious about here.. what's the need for a new
> > > fallocate mode? On a cursory glance, the provision mode looks fairly
> > > analogous to normal (mode == 0) allocation mode with the exception of
> > > sending the request down to the bdev. blkdev_fallocate() already maps
> > > some of the logical falloc modes (i.e. punch hole, zero range) to
> > > sending write sames or discards, etc., and it doesn't currently look
> > > like it supports allocation mode, so could it not map such requests to
> > > the underlying REQ_OP_PROVISION op?
> > >
> > > I guess the difference would be at the filesystem level where we'd
> > > probably need to rely on a mount option or some such to control whether
> > > traditional fallocate issues provision ops (like you've implemented for
> > > ext4) vs. the specific falloc command, but that seems fairly consistent
> > > with historical punch hole/discard behavior too. Hm? You might want to
> > > cc linux-fsdevel in future posts in any event to get some more feedback
> > > on how other filesystems might want to interact with such a thing.
> > >
> > Thanks for the feedback!
> > Argh, I completely forgot that I should add linux-fsdevel. Let me
> > re-send this with linux-fsdevel cc'd
> >
> > There's a slight distinction is that the current filesystem-level
> > controls are usually for default handling, but userspace can still
> > call the relevant functions manually if they need to. For example, for
> > ext4, the 'discard' mount option dictates whether free blocks are
> > discarded, but it doesn't set the policy to allow/disallow userspace
> > from manually punching holes into files even if the mount opt is
> > 'nodiscard'. FALLOC_FL_PROVISION is similar in that regard; it adds a
> > manual mechanism for users to provision the files' extents, that is
> > separate from the filesystems' default handling of provisioning files.
> >
>
> What I'm trying to understand is why not let blkdev_fallocate() issue a
> provision based on the default mode (i.e. mode == 0) of fallocate(),
> which is already defined to mean "perform allocation?" It currently
> issues discards or write zeroes based on variants of
> FALLOC_FL_PUNCH_HOLE without the need for a separate FALLOC_FL_DISCARD
> mode, for example.
>
It's mostly to keep the block device fallocate() semantics in-line and
consistent with the file-specific modes: I added the separate
filesystem fallocate() mode under the assumption that we'd want to
keep the traditional handling for filesystems intact with (mode == 0).
And for block devices, I didn't map the requests to mode == 0 so that
it's less confusing to describe (eg. mode == 0 on block devices will
issue provision; mode == 0 on files will not). It would complicate
loopback devices, for instance; if the loop device is backed by a
file, it would need to use (mode == FALLOC_FL_PROVISION) but if the
loop device is backed by another block device, then the fallocate()
call would need to switch to (mode == 0).

With the separate mode, we can describe the semantics of falllcate()
modes a bit more cleanly, and it is common for both files and block
devices:

1. mode == 0: allocation at the same layer, will not provision on the
underlying device/filesystem (unsupported for block devices).
2. mode == FALLOC_FL_PROVISION, allocation at the layer, will
provision on the underlying device/filesystem.

Block devices don't technically need to use a separate mode, but it
makes it much less confusing if filesystems are already using a
separate mode for provision.

Best
Sarthak

> Brian
>
> > > BTW another thing that might be useful wrt to dm-thin is to support
> > > FALLOC_FL_UNSHARE. I.e., it looks like the previous dm-thin patch only
> > > checks that blocks are allocated, but not whether those blocks are
> > > shared (re: lookup_result.shared). It might be useful to do the COW in
> > > such cases if the caller passes down a REQ_UNSHARE or some such flag.
> > >
> > That's an interesting idea! There's a few more things on the TODO list
> > for this patch series but I think we can follow up with a patch to
> > handle that as well.
> >
> > Sarthak
> >
> > > Brian
> > >
> > > > diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> > > > index f3f0b97b1675..a0e506255b20 100644
> > > > --- a/include/linux/falloc.h
> > > > +++ b/include/linux/falloc.h
> > > > @@ -30,7 +30,8 @@ struct space_resv {
> > > >                                        FALLOC_FL_COLLAPSE_RANGE |     \
> > > >                                        FALLOC_FL_ZERO_RANGE |         \
> > > >                                        FALLOC_FL_INSERT_RANGE |       \
> > > > -                                      FALLOC_FL_UNSHARE_RANGE)
> > > > +                                      FALLOC_FL_UNSHARE_RANGE |                          \
> > > > +                                      FALLOC_FL_PROVISION)
> > > >
> > > >  /* on ia32 l_start is on a 32-bit boundary */
> > > >  #if defined(CONFIG_X86_64)
> > > > diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
> > > > index 51398fa57f6c..2d323d113eed 100644
> > > > --- a/include/uapi/linux/falloc.h
> > > > +++ b/include/uapi/linux/falloc.h
> > > > @@ -77,4 +77,12 @@
> > > >   */
> > > >  #define FALLOC_FL_UNSHARE_RANGE              0x40
> > > >
> > > > +/*
> > > > + * FALLOC_FL_PROVISION acts as a hint for thinly provisioned devices to allocate
> > > > + * blocks for the range/EOF.
> > > > + *
> > > > + * FALLOC_FL_PROVISION can only be used with allocate-mode fallocate.
> > > > + */
> > > > +#define FALLOC_FL_PROVISION          0x80
> > > > +
> > > >  #endif /* _UAPI_FALLOC_H_ */
> > > > --
> > > > 2.31.0
> > > >
> > >
> >
>
Sarthak Kukreti Sept. 22, 2022, 8:08 a.m. UTC | #8
On Wed, Sep 21, 2022 at 8:21 AM Mike Snitzer <snitzer@redhat.com> wrote:
>
> On Wed, Sep 21 2022 at  1:54P -0400,
> Sarthak Kukreti <sarthakkukreti@chromium.org> wrote:
>
> > On Tue, Sep 20, 2022 at 12:49 AM Christoph Hellwig <hch@infradead.org> wrote:
> > >
> > > On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> > > > From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > > >
> > > > FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> > > > sends a hint to (supported) thinly provisioned block devices to
> > > > allocate space for the given range of sectors via REQ_OP_PROVISION.
> > >
> > > So, how does that "provisioning" actually work in todays world where
> > > storage is usually doing out of place writes in one or more layers,
> > > including the flash storage everyone is using.  Does it give you one
> > > write?  And unlimited number?  Some undecided number inbetween?
> >
> > Apologies, the patchset was a bit short on describing the semantics so
> > I'll expand more in the next revision; I'd say that it's the minimum
> > of regular mode fallocate() guarantees at each allocation layer. For
> > example, the guarantees from a contrived storage stack like (left to
> > right is bottom to top):
> >
> > [ mmc0blkp1 | ext4(1) | sparse file | loop | dm-thinp | dm-thin | ext4(2) ]
> >
> > would be predicated on the guarantees of fallocate() per allocation
> > layer; if ext4(1) was replaced by a filesystem that did not support
> > fallocate(), then there would be no guarantee that a write to a file
> > on ext4(2) succeeds.
> >
> > For dm-thinp, in the current implementation, the provision request
> > allocates blocks for the range specified and adds the mapping to the
> > thinpool metadata. All subsequent writes are to the same block, so
> > you'll be able to write to the same block inifinitely. Brian mentioned
> > this above, one case it doesn't cover is if provision is called on a
> > shared block, but the natural extension would be to allocate and
> > assign a new block and copy the contents of the shared block (kind of
> > like copy-on-provision).
>
> It follows that ChromiumOS isn't using dm-thinp's snapshot support?
>
Not at the moment, but we definitely have ideas to explore re:snapshot
and dm-thinp (like A-B updates with thin volume snapshots), where this
would definitely be useful!

> But please do fold in incremental dm-thinp support to properly handle
> shared blocks (dm-thinp already handles breaking sharing, etc.. so
> I'll need to see where you're hooking into that you don't get this
> "for free").
>
Will do in v2. Thanks for the feedback.

Best
Sarthak

> Mike
>
Brian Foster Sept. 22, 2022, 6:29 p.m. UTC | #9
On Thu, Sep 22, 2022 at 01:04:33AM -0700, Sarthak Kukreti wrote:
> On Wed, Sep 21, 2022 at 8:39 AM Brian Foster <bfoster@redhat.com> wrote:
> >
> > On Fri, Sep 16, 2022 at 02:02:31PM -0700, Sarthak Kukreti wrote:
> > > On Fri, Sep 16, 2022 at 4:56 AM Brian Foster <bfoster@redhat.com> wrote:
> > > >
> > > > On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> > > > > From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > > > >
> > > > > FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> > > > > sends a hint to (supported) thinly provisioned block devices to
> > > > > allocate space for the given range of sectors via REQ_OP_PROVISION.
> > > > >
> > > > > Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > > > > ---
> > > > >  block/fops.c                | 7 ++++++-
> > > > >  include/linux/falloc.h      | 3 ++-
> > > > >  include/uapi/linux/falloc.h | 8 ++++++++
> > > > >  3 files changed, 16 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/block/fops.c b/block/fops.c
> > > > > index b90742595317..a436a7596508 100644
> > > > > --- a/block/fops.c
> > > > > +++ b/block/fops.c
> > > > ...
> > > > > @@ -661,6 +662,10 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> > > > >               error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
> > > > >                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> > > > >               break;
> > > > > +     case FALLOC_FL_PROVISION:
> > > > > +             error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> > > > > +                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> > > > > +             break;
> > > > >       default:
> > > > >               error = -EOPNOTSUPP;
> > > > >       }
> > > >
> > > > Hi Sarthak,
> > > >
> > > > Neat mechanism.. I played with something very similar in the past (that
> > > > was much more crudely hacked up to target dm-thin) to allow filesystems
> > > > to request a thinly provisioned device to allocate blocks and try to do
> > > > a better job of avoiding inactivation when overprovisioned.
> > > >
> > > > One thing I'm a little curious about here.. what's the need for a new
> > > > fallocate mode? On a cursory glance, the provision mode looks fairly
> > > > analogous to normal (mode == 0) allocation mode with the exception of
> > > > sending the request down to the bdev. blkdev_fallocate() already maps
> > > > some of the logical falloc modes (i.e. punch hole, zero range) to
> > > > sending write sames or discards, etc., and it doesn't currently look
> > > > like it supports allocation mode, so could it not map such requests to
> > > > the underlying REQ_OP_PROVISION op?
> > > >
> > > > I guess the difference would be at the filesystem level where we'd
> > > > probably need to rely on a mount option or some such to control whether
> > > > traditional fallocate issues provision ops (like you've implemented for
> > > > ext4) vs. the specific falloc command, but that seems fairly consistent
> > > > with historical punch hole/discard behavior too. Hm? You might want to
> > > > cc linux-fsdevel in future posts in any event to get some more feedback
> > > > on how other filesystems might want to interact with such a thing.
> > > >
> > > Thanks for the feedback!
> > > Argh, I completely forgot that I should add linux-fsdevel. Let me
> > > re-send this with linux-fsdevel cc'd
> > >
> > > There's a slight distinction is that the current filesystem-level
> > > controls are usually for default handling, but userspace can still
> > > call the relevant functions manually if they need to. For example, for
> > > ext4, the 'discard' mount option dictates whether free blocks are
> > > discarded, but it doesn't set the policy to allow/disallow userspace
> > > from manually punching holes into files even if the mount opt is
> > > 'nodiscard'. FALLOC_FL_PROVISION is similar in that regard; it adds a
> > > manual mechanism for users to provision the files' extents, that is
> > > separate from the filesystems' default handling of provisioning files.
> > >
> >
> > What I'm trying to understand is why not let blkdev_fallocate() issue a
> > provision based on the default mode (i.e. mode == 0) of fallocate(),
> > which is already defined to mean "perform allocation?" It currently
> > issues discards or write zeroes based on variants of
> > FALLOC_FL_PUNCH_HOLE without the need for a separate FALLOC_FL_DISCARD
> > mode, for example.
> >
> It's mostly to keep the block device fallocate() semantics in-line and
> consistent with the file-specific modes: I added the separate
> filesystem fallocate() mode under the assumption that we'd want to
> keep the traditional handling for filesystems intact with (mode == 0).
> And for block devices, I didn't map the requests to mode == 0 so that
> it's less confusing to describe (eg. mode == 0 on block devices will
> issue provision; mode == 0 on files will not). It would complicate
> loopback devices, for instance; if the loop device is backed by a
> file, it would need to use (mode == FALLOC_FL_PROVISION) but if the
> loop device is backed by another block device, then the fallocate()
> call would need to switch to (mode == 0).
> 

I would expect the loopback scenario for provision to behave similar to
how discards are handled. I.e., loopback receives a provision request
and translates that to fallocate(mode = 0). If the backing device is
block, blkdev_fallocate(mode = 0) translates that to another provision
request.  If the backing device is a file, the associated fallocate
handler allocs/maps, if necessary, and then issues a provision on
allocation, if enabled by the fs.

AFAICT there's no need for FL_PROVISION at all in that scenario. Is
there a functional purpose to FL_PROVISION? Is the intent to try and
guarantee that a provision request propagates down the I/O stack? If so,
what happens if blocks were already preallocated in the backing file (in
the loopback file example)?

BTW, an unrelated thing I noticed is that blkdev_fallocate()
unconditionally calls truncate_bdev_range(), which probably doesn't make
sense for any sort of alloc mode.

> With the separate mode, we can describe the semantics of falllcate()
> modes a bit more cleanly, and it is common for both files and block
> devices:
> 
> 1. mode == 0: allocation at the same layer, will not provision on the
> underlying device/filesystem (unsupported for block devices).
> 2. mode == FALLOC_FL_PROVISION, allocation at the layer, will
> provision on the underlying device/filesystem.
> 

I think I see why you make the distinction, since the block layer
doesn't have a "this layer only" mode, but IMO it's also quite confusing
to say that mode == FL_PROVISION can allocate at the current and
underlying layer(s) but mode == 0 to that underlying layer cannot.

Either way, if you want to propose a new falloc mode/modifier, it
probably warrants a more detailed commit log with more explanation of
the purpose, examples of behavior, perhaps some details on how the mode
might be documented in man pages, etc.

Brian

> Block devices don't technically need to use a separate mode, but it
> makes it much less confusing if filesystems are already using a
> separate mode for provision.
> 
> Best
> Sarthak
> 
> > Brian
> >
> > > > BTW another thing that might be useful wrt to dm-thin is to support
> > > > FALLOC_FL_UNSHARE. I.e., it looks like the previous dm-thin patch only
> > > > checks that blocks are allocated, but not whether those blocks are
> > > > shared (re: lookup_result.shared). It might be useful to do the COW in
> > > > such cases if the caller passes down a REQ_UNSHARE or some such flag.
> > > >
> > > That's an interesting idea! There's a few more things on the TODO list
> > > for this patch series but I think we can follow up with a patch to
> > > handle that as well.
> > >
> > > Sarthak
> > >
> > > > Brian
> > > >
> > > > > diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> > > > > index f3f0b97b1675..a0e506255b20 100644
> > > > > --- a/include/linux/falloc.h
> > > > > +++ b/include/linux/falloc.h
> > > > > @@ -30,7 +30,8 @@ struct space_resv {
> > > > >                                        FALLOC_FL_COLLAPSE_RANGE |     \
> > > > >                                        FALLOC_FL_ZERO_RANGE |         \
> > > > >                                        FALLOC_FL_INSERT_RANGE |       \
> > > > > -                                      FALLOC_FL_UNSHARE_RANGE)
> > > > > +                                      FALLOC_FL_UNSHARE_RANGE |                          \
> > > > > +                                      FALLOC_FL_PROVISION)
> > > > >
> > > > >  /* on ia32 l_start is on a 32-bit boundary */
> > > > >  #if defined(CONFIG_X86_64)
> > > > > diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
> > > > > index 51398fa57f6c..2d323d113eed 100644
> > > > > --- a/include/uapi/linux/falloc.h
> > > > > +++ b/include/uapi/linux/falloc.h
> > > > > @@ -77,4 +77,12 @@
> > > > >   */
> > > > >  #define FALLOC_FL_UNSHARE_RANGE              0x40
> > > > >
> > > > > +/*
> > > > > + * FALLOC_FL_PROVISION acts as a hint for thinly provisioned devices to allocate
> > > > > + * blocks for the range/EOF.
> > > > > + *
> > > > > + * FALLOC_FL_PROVISION can only be used with allocate-mode fallocate.
> > > > > + */
> > > > > +#define FALLOC_FL_PROVISION          0x80
> > > > > +
> > > > >  #endif /* _UAPI_FALLOC_H_ */
> > > > > --
> > > > > 2.31.0
> > > > >
> > > >
> > >
> >
>
Christoph Hellwig Sept. 23, 2022, 8:45 a.m. UTC | #10
On Tue, Sep 20, 2022 at 10:54:32PM -0700, Sarthak Kukreti wrote:
> [ mmc0blkp1 | ext4(1) | sparse file | loop | dm-thinp | dm-thin | ext4(2) ]
> 
> would be predicated on the guarantees of fallocate() per allocation
> layer; if ext4(1) was replaced by a filesystem that did not support
> fallocate(), then there would be no guarantee that a write to a file
> on ext4(2) succeeds.

a write or any unlimited number of writes?
Sarthak Kukreti Dec. 29, 2022, 8:13 a.m. UTC | #11
On Thu, Sep 22, 2022 at 11:29 AM Brian Foster <bfoster@redhat.com> wrote:
>
> On Thu, Sep 22, 2022 at 01:04:33AM -0700, Sarthak Kukreti wrote:
> > On Wed, Sep 21, 2022 at 8:39 AM Brian Foster <bfoster@redhat.com> wrote:
> > >
> > > On Fri, Sep 16, 2022 at 02:02:31PM -0700, Sarthak Kukreti wrote:
> > > > On Fri, Sep 16, 2022 at 4:56 AM Brian Foster <bfoster@redhat.com> wrote:
> > > > >
> > > > > On Thu, Sep 15, 2022 at 09:48:22AM -0700, Sarthak Kukreti wrote:
> > > > > > From: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > > > > >
> > > > > > FALLOC_FL_PROVISION is a new fallocate() allocation mode that
> > > > > > sends a hint to (supported) thinly provisioned block devices to
> > > > > > allocate space for the given range of sectors via REQ_OP_PROVISION.
> > > > > >
> > > > > > Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > > > > > ---
> > > > > >  block/fops.c                | 7 ++++++-
> > > > > >  include/linux/falloc.h      | 3 ++-
> > > > > >  include/uapi/linux/falloc.h | 8 ++++++++
> > > > > >  3 files changed, 16 insertions(+), 2 deletions(-)
> > > > > >
> > > > > > diff --git a/block/fops.c b/block/fops.c
> > > > > > index b90742595317..a436a7596508 100644
> > > > > > --- a/block/fops.c
> > > > > > +++ b/block/fops.c
> > > > > ...
> > > > > > @@ -661,6 +662,10 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> > > > > >               error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
> > > > > >                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> > > > > >               break;
> > > > > > +     case FALLOC_FL_PROVISION:
> > > > > > +             error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> > > > > > +                                            len >> SECTOR_SHIFT, GFP_KERNEL);
> > > > > > +             break;
> > > > > >       default:
> > > > > >               error = -EOPNOTSUPP;
> > > > > >       }
> > > > >
> > > > > Hi Sarthak,
> > > > >
> > > > > Neat mechanism.. I played with something very similar in the past (that
> > > > > was much more crudely hacked up to target dm-thin) to allow filesystems
> > > > > to request a thinly provisioned device to allocate blocks and try to do
> > > > > a better job of avoiding inactivation when overprovisioned.
> > > > >
> > > > > One thing I'm a little curious about here.. what's the need for a new
> > > > > fallocate mode? On a cursory glance, the provision mode looks fairly
> > > > > analogous to normal (mode == 0) allocation mode with the exception of
> > > > > sending the request down to the bdev. blkdev_fallocate() already maps
> > > > > some of the logical falloc modes (i.e. punch hole, zero range) to
> > > > > sending write sames or discards, etc., and it doesn't currently look
> > > > > like it supports allocation mode, so could it not map such requests to
> > > > > the underlying REQ_OP_PROVISION op?
> > > > >
> > > > > I guess the difference would be at the filesystem level where we'd
> > > > > probably need to rely on a mount option or some such to control whether
> > > > > traditional fallocate issues provision ops (like you've implemented for
> > > > > ext4) vs. the specific falloc command, but that seems fairly consistent
> > > > > with historical punch hole/discard behavior too. Hm? You might want to
> > > > > cc linux-fsdevel in future posts in any event to get some more feedback
> > > > > on how other filesystems might want to interact with such a thing.
> > > > >
> > > > Thanks for the feedback!
> > > > Argh, I completely forgot that I should add linux-fsdevel. Let me
> > > > re-send this with linux-fsdevel cc'd
> > > >
> > > > There's a slight distinction is that the current filesystem-level
> > > > controls are usually for default handling, but userspace can still
> > > > call the relevant functions manually if they need to. For example, for
> > > > ext4, the 'discard' mount option dictates whether free blocks are
> > > > discarded, but it doesn't set the policy to allow/disallow userspace
> > > > from manually punching holes into files even if the mount opt is
> > > > 'nodiscard'. FALLOC_FL_PROVISION is similar in that regard; it adds a
> > > > manual mechanism for users to provision the files' extents, that is
> > > > separate from the filesystems' default handling of provisioning files.
> > > >
> > >
> > > What I'm trying to understand is why not let blkdev_fallocate() issue a
> > > provision based on the default mode (i.e. mode == 0) of fallocate(),
> > > which is already defined to mean "perform allocation?" It currently
> > > issues discards or write zeroes based on variants of
> > > FALLOC_FL_PUNCH_HOLE without the need for a separate FALLOC_FL_DISCARD
> > > mode, for example.
> > >
> > It's mostly to keep the block device fallocate() semantics in-line and
> > consistent with the file-specific modes: I added the separate
> > filesystem fallocate() mode under the assumption that we'd want to
> > keep the traditional handling for filesystems intact with (mode == 0).
> > And for block devices, I didn't map the requests to mode == 0 so that
> > it's less confusing to describe (eg. mode == 0 on block devices will
> > issue provision; mode == 0 on files will not). It would complicate
> > loopback devices, for instance; if the loop device is backed by a
> > file, it would need to use (mode == FALLOC_FL_PROVISION) but if the
> > loop device is backed by another block device, then the fallocate()
> > call would need to switch to (mode == 0).
> >
>
> I would expect the loopback scenario for provision to behave similar to
> how discards are handled. I.e., loopback receives a provision request
> and translates that to fallocate(mode = 0). If the backing device is
> block, blkdev_fallocate(mode = 0) translates that to another provision
> request.  If the backing device is a file, the associated fallocate
> handler allocs/maps, if necessary, and then issues a provision on
> allocation, if enabled by the fs.
>
> AFAICT there's no need for FL_PROVISION at all in that scenario. Is
> there a functional purpose to FL_PROVISION? Is the intent to try and
> guarantee that a provision request propagates down the I/O stack? If so,
> what happens if blocks were already preallocated in the backing file (in
> the loopback file example)?
>
> BTW, an unrelated thing I noticed is that blkdev_fallocate()
> unconditionally calls truncate_bdev_range(), which probably doesn't make
> sense for any sort of alloc mode.
>
Thanks for pointing that out, fixed in v2.

> > With the separate mode, we can describe the semantics of falllcate()
> > modes a bit more cleanly, and it is common for both files and block
> > devices:
> >
> > 1. mode == 0: allocation at the same layer, will not provision on the
> > underlying device/filesystem (unsupported for block devices).
> > 2. mode == FALLOC_FL_PROVISION, allocation at the layer, will
> > provision on the underlying device/filesystem.
> >
>
> I think I see why you make the distinction, since the block layer
> doesn't have a "this layer only" mode, but IMO it's also quite confusing
> to say that mode == FL_PROVISION can allocate at the current and
> underlying layer(s) but mode == 0 to that underlying layer cannot.
>
> Either way, if you want to propose a new falloc mode/modifier, it
> probably warrants a more detailed commit log with more explanation of
> the purpose, examples of behavior, perhaps some details on how the mode
> might be documented in man pages, etc.
>
That's fair. Added more details to the patch commit log in v2.

Thanks
Sarthak

> Brian
>
> > Block devices don't technically need to use a separate mode, but it
> > makes it much less confusing if filesystems are already using a
> > separate mode for provision.
> >
> > Best
> > Sarthak
> >
> > > Brian
> > >
> > > > > BTW another thing that might be useful wrt to dm-thin is to support
> > > > > FALLOC_FL_UNSHARE. I.e., it looks like the previous dm-thin patch only
> > > > > checks that blocks are allocated, but not whether those blocks are
> > > > > shared (re: lookup_result.shared). It might be useful to do the COW in
> > > > > such cases if the caller passes down a REQ_UNSHARE or some such flag.
> > > > >
> > > > That's an interesting idea! There's a few more things on the TODO list
> > > > for this patch series but I think we can follow up with a patch to
> > > > handle that as well.
> > > >
> > > > Sarthak
> > > >
> > > > > Brian
> > > > >
> > > > > > diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> > > > > > index f3f0b97b1675..a0e506255b20 100644
> > > > > > --- a/include/linux/falloc.h
> > > > > > +++ b/include/linux/falloc.h
> > > > > > @@ -30,7 +30,8 @@ struct space_resv {
> > > > > >                                        FALLOC_FL_COLLAPSE_RANGE |     \
> > > > > >                                        FALLOC_FL_ZERO_RANGE |         \
> > > > > >                                        FALLOC_FL_INSERT_RANGE |       \
> > > > > > -                                      FALLOC_FL_UNSHARE_RANGE)
> > > > > > +                                      FALLOC_FL_UNSHARE_RANGE |                          \
> > > > > > +                                      FALLOC_FL_PROVISION)
> > > > > >
> > > > > >  /* on ia32 l_start is on a 32-bit boundary */
> > > > > >  #if defined(CONFIG_X86_64)
> > > > > > diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
> > > > > > index 51398fa57f6c..2d323d113eed 100644
> > > > > > --- a/include/uapi/linux/falloc.h
> > > > > > +++ b/include/uapi/linux/falloc.h
> > > > > > @@ -77,4 +77,12 @@
> > > > > >   */
> > > > > >  #define FALLOC_FL_UNSHARE_RANGE              0x40
> > > > > >
> > > > > > +/*
> > > > > > + * FALLOC_FL_PROVISION acts as a hint for thinly provisioned devices to allocate
> > > > > > + * blocks for the range/EOF.
> > > > > > + *
> > > > > > + * FALLOC_FL_PROVISION can only be used with allocate-mode fallocate.
> > > > > > + */
> > > > > > +#define FALLOC_FL_PROVISION          0x80
> > > > > > +
> > > > > >  #endif /* _UAPI_FALLOC_H_ */
> > > > > > --
> > > > > > 2.31.0
> > > > > >
> > > > >
> > > >
> > >
> >
>
Sarthak Kukreti Dec. 29, 2022, 8:14 a.m. UTC | #12
On Fri, Sep 23, 2022 at 1:45 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Tue, Sep 20, 2022 at 10:54:32PM -0700, Sarthak Kukreti wrote:
> > [ mmc0blkp1 | ext4(1) | sparse file | loop | dm-thinp | dm-thin | ext4(2) ]
> >
> > would be predicated on the guarantees of fallocate() per allocation
> > layer; if ext4(1) was replaced by a filesystem that did not support
> > fallocate(), then there would be no guarantee that a write to a file
> > on ext4(2) succeeds.
>
> a write or any unlimited number of writes?

(Apologies for the super late reply!) In this case, even a write won't
be guaranteed if we run out of space on the lower filesystem. Looking
at the fallocate() man page, I think the key part lies in the
following phrase (emphasis mine):

```
After a successful call, subsequent writes into the range
specified by offset and len are guaranteed not to fail _because of
lack of disk space_
```

So, it's not a blanket guarantee that all writes will always succeed,
but that any writes into that range will not fail due to lack of disk
space. As you mentioned, writes may happen out-of-place in one or more
layer. But the fallocate(FALLOC_FL_PROVISION) ensures that each layer
will preserve space for writes into that range to not fail with ENOSPC
(so eg. ext4 and dm-thinp will set aside enough extents to fulfil that
promise later on for all writes).

Best

Sarthak
diff mbox series

Patch

diff --git a/block/fops.c b/block/fops.c
index b90742595317..a436a7596508 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -605,7 +605,8 @@  static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 #define	BLKDEV_FALLOC_FL_SUPPORTED					\
 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
-		 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
+		 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE |	\
+		 FALLOC_FL_PROVISION)
 
 static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 			     loff_t len)
@@ -661,6 +662,10 @@  static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 		error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
 					     len >> SECTOR_SHIFT, GFP_KERNEL);
 		break;
+	case FALLOC_FL_PROVISION:
+		error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
+					       len >> SECTOR_SHIFT, GFP_KERNEL);
+		break;
 	default:
 		error = -EOPNOTSUPP;
 	}
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index f3f0b97b1675..a0e506255b20 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -30,7 +30,8 @@  struct space_resv {
 					 FALLOC_FL_COLLAPSE_RANGE |	\
 					 FALLOC_FL_ZERO_RANGE |		\
 					 FALLOC_FL_INSERT_RANGE |	\
-					 FALLOC_FL_UNSHARE_RANGE)
+					 FALLOC_FL_UNSHARE_RANGE |                          \
+					 FALLOC_FL_PROVISION)
 
 /* on ia32 l_start is on a 32-bit boundary */
 #if defined(CONFIG_X86_64)
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index 51398fa57f6c..2d323d113eed 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -77,4 +77,12 @@ 
  */
 #define FALLOC_FL_UNSHARE_RANGE		0x40
 
+/*
+ * FALLOC_FL_PROVISION acts as a hint for thinly provisioned devices to allocate
+ * blocks for the range/EOF.
+ *
+ * FALLOC_FL_PROVISION can only be used with allocate-mode fallocate.
+ */
+#define FALLOC_FL_PROVISION		0x80
+
 #endif /* _UAPI_FALLOC_H_ */