Message ID | 163192865577.417973.11122330974455662098.stgit@magnolia (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vfs: enable userspace to reset damaged file storage | expand |
On 21/09/17 06:30PM, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Create a function that ensures that the storage backing part of a file > contains zeroes and will not trip over old media errors if the contents > are re-read. > > Signed-off-by: Darrick J. Wong <djwong@kernel.org> > --- > fs/iomap/direct-io.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/iomap.h | 3 ++ > 2 files changed, 78 insertions(+) > > > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c > index 4ecd255e0511..48826a49f976 100644 > --- a/fs/iomap/direct-io.c > +++ b/fs/iomap/direct-io.c > @@ -652,3 +652,78 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > return iomap_dio_complete(dio); > } > EXPORT_SYMBOL_GPL(iomap_dio_rw); > + > +static loff_t > +iomap_zeroinit_iter(struct iomap_iter *iter) > +{ > + struct iomap *iomap = &iter->iomap; > + const struct iomap *srcmap = iomap_iter_srcmap(iter); > + const u64 start = iomap->addr + iter->pos - iomap->offset; > + const u64 nr_bytes = iomap_length(iter); > + sector_t sector = start >> SECTOR_SHIFT; > + sector_t nr_sectors = nr_bytes >> SECTOR_SHIFT; > + int ret; > + > + if (!iomap->bdev) > + return -ECANCELED; > + > + /* The physical extent must be sector-aligned for block layer APIs. */ > + if ((start | nr_bytes) & (SECTOR_SIZE - 1)) > + return -EINVAL; > + > + /* Must be able to zero storage directly without fs intervention. */ > + if (iomap->flags & IOMAP_F_SHARED) > + return -ECANCELED; > + if (srcmap != iomap) > + return -ECANCELED; > + > + switch (iomap->type) { > + case IOMAP_MAPPED: > + ret = blkdev_issue_zeroout(iomap->bdev, sector, nr_sectors, > + GFP_KERNEL, 0); > + if (ret) > + return ret; > + fallthrough; > + case IOMAP_UNWRITTEN: > + return nr_bytes; > + } > + > + /* Reject holes, inline data, or delalloc extents. */ > + return -ECANCELED; Same comment here as in patch-1 which implements dax_zeroinit_iter(). 
-ritesh > +} > + > +/* > + * Use a storage device's accelerated zero-writing command to ensure the media > + * are ready to accept read and write commands. FSDAX is not supported. > + * > + * The range arguments must be aligned to sector size. The file must be backed > + * by a block device. The extents returned must not require copy on write (or > + * any other mapping interventions from the filesystem) and must be contiguous. > + * @done will be set to true if the reset succeeded. > + * > + * Returns 0 if the zero initialization succeeded, -ECANCELED if the storage > + * mappings do not support zero initialization, -EOPNOTSUPP if the device does > + * not support it, or the usual negative errno. > + */ > +int > +iomap_zeroout_range(struct inode *inode, loff_t pos, u64 len, > + const struct iomap_ops *ops) > +{ > + struct iomap_iter iter = { > + .inode = inode, > + .pos = pos, > + .len = len, > + .flags = IOMAP_REPORT, > + }; > + int ret; > + > + if (IS_DAX(inode)) > + return -EINVAL; > + if (pos + len > i_size_read(inode)) > + return -EINVAL; > + > + while ((ret = iomap_iter(&iter, ops)) > 0) > + iter.processed = iomap_zeroinit_iter(&iter); > + return ret; > +} > +EXPORT_SYMBOL_GPL(iomap_zeroout_range); > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > index 24f8489583ca..f4b9c6698388 100644 > --- a/include/linux/iomap.h > +++ b/include/linux/iomap.h > @@ -339,6 +339,9 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > ssize_t iomap_dio_complete(struct iomap_dio *dio); > int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); > > +int iomap_zeroout_range(struct inode *inode, loff_t pos, u64 len, > + const struct iomap_ops *ops); > + > #ifdef CONFIG_SWAP > struct file; > struct swap_info_struct; >
On Fri, Sep 17, 2021 at 06:30:55PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Create a function that ensures that the storage backing part of a file > contains zeroes and will not trip over old media errors if the contents > are re-read. I don't think this has anything to do with direct I/O, so I'd rather not have it clutter direct-io.c. Also do we really want to wait synchronously for every bio instead of batching them up? Especially as a simple bio_chain is probably all that is needed.
On Fri, Sep 17, 2021 at 06:30:55PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Create a function that ensures that the storage backing part of a file > contains zeroes and will not trip over old media errors if the contents > are re-read. > > Signed-off-by: Darrick J. Wong <djwong@kernel.org> > --- > fs/iomap/direct-io.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/iomap.h | 3 ++ > 2 files changed, 78 insertions(+) > > > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c > index 4ecd255e0511..48826a49f976 100644 > --- a/fs/iomap/direct-io.c > +++ b/fs/iomap/direct-io.c > @@ -652,3 +652,78 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > return iomap_dio_complete(dio); > } > EXPORT_SYMBOL_GPL(iomap_dio_rw); > + > +static loff_t > +iomap_zeroinit_iter(struct iomap_iter *iter) > +{ > + struct iomap *iomap = &iter->iomap; > + const struct iomap *srcmap = iomap_iter_srcmap(iter); > + const u64 start = iomap->addr + iter->pos - iomap->offset; > + const u64 nr_bytes = iomap_length(iter); > + sector_t sector = start >> SECTOR_SHIFT; > + sector_t nr_sectors = nr_bytes >> SECTOR_SHIFT; > + int ret; > + > + if (!iomap->bdev) > + return -ECANCELED; > + > + /* The physical extent must be sector-aligned for block layer APIs. */ > + if ((start | nr_bytes) & (SECTOR_SIZE - 1)) > + return -EINVAL; > + > + /* Must be able to zero storage directly without fs intervention. */ > + if (iomap->flags & IOMAP_F_SHARED) > + return -ECANCELED; > + if (srcmap != iomap) > + return -ECANCELED; > + > + switch (iomap->type) { > + case IOMAP_MAPPED: > + ret = blkdev_issue_zeroout(iomap->bdev, sector, nr_sectors, > + GFP_KERNEL, 0); Pretty sure this needs to use BLKDEV_ZERO_NOUNMAP. 
The whole point of this is having zeroed space allocated ready for write on return, so having the hardware optimise away the physical storage zeroing by punching a hole in its backing store and then potentially getting ENOSPC on the next write to this range would be .... suboptimal. Cheers, Dave.
On Tue, Sep 21, 2021 at 09:29:25AM +0100, Christoph Hellwig wrote: > On Fri, Sep 17, 2021 at 06:30:55PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Create a function that ensures that the storage backing part of a file > > contains zeroes and will not trip over old media errors if the contents > > are re-read. > > I don't think this has anything to do with direct I/O, so I'd rather > not have it clutter direct-io.c. Also do we really want to wait > synchronously for every bio instead of batching them up? Especially > as a simple bio_chain is probably all that is needed. __blkdev_issue_zeroout looks appropriate for chaining. I'll move the zeroout routine into a new lowlevel.c file, since this isn't buffered io either. --D
On Wed, Sep 22, 2021 at 08:33:43AM +1000, Dave Chinner wrote: > On Fri, Sep 17, 2021 at 06:30:55PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Create a function that ensures that the storage backing part of a file > > contains zeroes and will not trip over old media errors if the contents > > are re-read. > > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org> > > --- > > fs/iomap/direct-io.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ > > include/linux/iomap.h | 3 ++ > > 2 files changed, 78 insertions(+) > > > > > > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c > > index 4ecd255e0511..48826a49f976 100644 > > --- a/fs/iomap/direct-io.c > > +++ b/fs/iomap/direct-io.c > > @@ -652,3 +652,78 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > > return iomap_dio_complete(dio); > > } > > EXPORT_SYMBOL_GPL(iomap_dio_rw); > > + > > +static loff_t > > +iomap_zeroinit_iter(struct iomap_iter *iter) > > +{ > > + struct iomap *iomap = &iter->iomap; > > + const struct iomap *srcmap = iomap_iter_srcmap(iter); > > + const u64 start = iomap->addr + iter->pos - iomap->offset; > > + const u64 nr_bytes = iomap_length(iter); > > + sector_t sector = start >> SECTOR_SHIFT; > > + sector_t nr_sectors = nr_bytes >> SECTOR_SHIFT; > > + int ret; > > + > > + if (!iomap->bdev) > > + return -ECANCELED; > > + > > + /* The physical extent must be sector-aligned for block layer APIs. */ > > + if ((start | nr_bytes) & (SECTOR_SIZE - 1)) > > + return -EINVAL; > > + > > + /* Must be able to zero storage directly without fs intervention. */ > > + if (iomap->flags & IOMAP_F_SHARED) > > + return -ECANCELED; > > + if (srcmap != iomap) > > + return -ECANCELED; > > + > > + switch (iomap->type) { > > + case IOMAP_MAPPED: > > + ret = blkdev_issue_zeroout(iomap->bdev, sector, nr_sectors, > > + GFP_KERNEL, 0); > > Pretty sure this needs to use BLKDEV_ZERO_NOUNMAP. Oops, good catch! 
--D > The whole point of this is having zeroed space allocated ready for > write on return, so having the hardware optimise away the physical > storage zeroing by punching a hole in its backing store and then > potentially getting ENOSPC on the next write to this range would be > .... suboptimal. > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4ecd255e0511..48826a49f976 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -652,3 +652,78 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); + +static loff_t +iomap_zeroinit_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + const u64 start = iomap->addr + iter->pos - iomap->offset; + const u64 nr_bytes = iomap_length(iter); + sector_t sector = start >> SECTOR_SHIFT; + sector_t nr_sectors = nr_bytes >> SECTOR_SHIFT; + int ret; + + if (!iomap->bdev) + return -ECANCELED; + + /* The physical extent must be sector-aligned for block layer APIs. */ + if ((start | nr_bytes) & (SECTOR_SIZE - 1)) + return -EINVAL; + + /* Must be able to zero storage directly without fs intervention. */ + if (iomap->flags & IOMAP_F_SHARED) + return -ECANCELED; + if (srcmap != iomap) + return -ECANCELED; + + switch (iomap->type) { + case IOMAP_MAPPED: + ret = blkdev_issue_zeroout(iomap->bdev, sector, nr_sectors, + GFP_KERNEL, 0); + if (ret) + return ret; + fallthrough; + case IOMAP_UNWRITTEN: + return nr_bytes; + } + + /* Reject holes, inline data, or delalloc extents. */ + return -ECANCELED; +} + +/* + * Use a storage device's accelerated zero-writing command to ensure the media + * are ready to accept read and write commands. FSDAX is not supported. + * + * The range arguments must be aligned to sector size. The file must be backed + * by a block device. The extents returned must not require copy on write (or + * any other mapping interventions from the filesystem) and must be contiguous. + * @done will be set to true if the reset succeeded. + * + * Returns 0 if the zero initialization succeeded, -ECANCELED if the storage + * mappings do not support zero initialization, -EOPNOTSUPP if the device does + * not support it, or the usual negative errno. 
+ */ +int +iomap_zeroout_range(struct inode *inode, loff_t pos, u64 len, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_REPORT, + }; + int ret; + + if (IS_DAX(inode)) + return -EINVAL; + if (pos + len > i_size_read(inode)) + return -EINVAL; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_zeroinit_iter(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_zeroout_range); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 24f8489583ca..f4b9c6698388 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -339,6 +339,9 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, ssize_t iomap_dio_complete(struct iomap_dio *dio); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); +int iomap_zeroout_range(struct inode *inode, loff_t pos, u64 len, + const struct iomap_ops *ops); + #ifdef CONFIG_SWAP struct file; struct swap_info_struct;