Message ID | 1485200596-14057-1-git-send-email-amir73il@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon 23-01-17 21:43:16, Amir Goldstein wrote: > Before calling write f_ops, call file_start_write() instead > of sb_start_write(). > > This ensures freeze protection for both overlay and upper fs > when file is open from an overlayfs mount. > > Replace {sb,file}_start_write() for {copy,clone}_file_range() and > for fallocate(). > > For dedup_file_range() there is no need for mnt_want_write_file(). > File is already open for write, so we already have mnt_want_write() > and we only need file_start_write(). > > Signed-off-by: Amir Goldstein <amir73il@gmail.com> The patch looks like a good cleanup regardless of your overlayfs work. It seems we just missed these places when converting freeze protection to file_start_write(). You can add: Reviewed-by: Jan Kara <jack@suse.cz> Honza > --- > fs/open.c | 4 ++-- > fs/read_write.c | 12 ++++-------- > include/linux/fs.h | 26 +++++++++++++------------- > 3 files changed, 19 insertions(+), 23 deletions(-) > > diff --git a/fs/open.c b/fs/open.c > index 9921f70..d3fe1a1 100644 > --- a/fs/open.c > +++ b/fs/open.c > @@ -316,7 +316,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) > if (!file->f_op->fallocate) > return -EOPNOTSUPP; > > - sb_start_write(inode->i_sb); > + file_start_write(file); > ret = file->f_op->fallocate(file, mode, offset, len); > > /* > @@ -329,7 +329,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) > if (ret == 0) > fsnotify_modify(file); > > - sb_end_write(inode->i_sb); > + file_end_write(file); > return ret; > } > EXPORT_SYMBOL_GPL(vfs_fallocate); > diff --git a/fs/read_write.c b/fs/read_write.c > index 5816d4c..1e42487 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -1538,7 +1538,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > if (len == 0) > return 0; > > - sb_start_write(inode_out->i_sb); > + file_start_write(file_out); > > /* > * Try cloning first, this is supported by more file systems, and > @@ -1574,7 +1574,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > inc_syscr(current); > inc_syscw(current); > > - sb_end_write(inode_out->i_sb); > + file_end_write(file_out); > > return ret; > } > @@ -1976,11 +1976,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) > } > dst = file_inode(dst_file); > > - ret = mnt_want_write_file(dst_file); > - if (ret) { > - info->status = ret; > - goto next_loop; > - } > + file_start_write(dst_file); > > dst_off = info->dest_offset; > ret = clone_verify_area(dst_file, dst_off, len, true); > @@ -2013,7 +2009,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) > } > > next_file: > - mnt_drop_write_file(dst_file); > + file_end_write(dst_file); > next_loop: > fdput(dst_fd); > > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 71811be..8a4dbc6 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1741,19 +1741,6 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > extern int vfs_dedupe_file_range(struct file *file, > struct file_dedupe_range *same); > > -static inline int do_clone_file_range(struct file *file_in, loff_t pos_in, > - struct file *file_out, loff_t pos_out, > - u64 len) > -{ > - int ret; > - > - sb_start_write(file_inode(file_out)->i_sb); > - ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len); > - sb_end_write(file_inode(file_out)->i_sb); > - > - return ret; > -} > - > struct super_operations { > struct inode *(*alloc_inode)(struct super_block *sb); > void (*destroy_inode)(struct inode *); > @@ -2579,6 +2566,19 @@ static inline void file_end_write(struct file *file) > __sb_end_write(locks_inode(file)->i_sb, SB_FREEZE_WRITE); > } > > +static inline int do_clone_file_range(struct file *file_in, loff_t pos_in, > + struct file *file_out, loff_t pos_out, > + u64 len) > +{ > + int ret; > + > + file_start_write(file_out); > + ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len); > + file_end_write(file_out); > + > + return ret; > +} > + > /* > * get_write_access() gets write permission for a file. > * put_write_access() releases this write permission. > -- > 2.7.4 > >
On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein <amir73il@gmail.com> wrote: > Before calling write f_ops, call file_start_write() instead > of sb_start_write(). > > This ensures freeze protection for both overlay and upper fs > when file is open from an overlayfs mount. > > Replace {sb,file}_start_write() for {copy,clone}_file_range() and > for fallocate(). > > For dedup_file_range() there is no need for mnt_want_write_file(). > File is already open for write, so we already have mnt_want_write() > and we only need file_start_write(). Being opened for write is not verified if capable(CAP_SYS_ADMIN). Ugly special case, don't ask me why it's done... Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 27, 2017 at 1:09 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: > On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein <amir73il@gmail.com> wrote: >> Before calling write f_ops, call file_start_write() instead >> of sb_start_write(). >> >> This ensures freeze protection for both overlay and upper fs >> when file is open from an overlayfs mount. >> >> Replace {sb,file}_start_write() for {copy,clone}_file_range() and >> for fallocate(). >> >> For dedup_file_range() there is no need for mnt_want_write_file(). >> File is already open for write, so we already have mnt_want_write() >> and we only need file_start_write(). > > Being opened for write is not verified if capable(CAP_SYS_ADMIN). > Ugly special case, don't ask me why it's done... > Christoph, Darrick, is that by design? -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 27, 2017 at 1:50 PM, Amir Goldstein <amir73il@gmail.com> wrote: > On Fri, Jan 27, 2017 at 1:09 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: >> On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein <amir73il@gmail.com> wrote: >>> Before calling write f_ops, call file_start_write() instead >>> of sb_start_write(). >>> >>> This ensures freeze protection for both overlay and upper fs >>> when file is open from an overlayfs mount. >>> >>> Replace {sb,file}_start_write() for {copy,clone}_file_range() and >>> for fallocate(). >>> >>> For dedup_file_range() there is no need for mnt_want_write_file(). >>> File is already open for write, so we already have mnt_want_write() >>> and we only need file_start_write(). >> >> Being opened for write is not verified if capable(CAP_SYS_ADMIN). >> Ugly special case, don't ask me why it's done... >> > > Christoph, Darrick, is that by design? Anyway, whether is makes sense or not, that's a legacy from BTRFS_IOC_FILE_EXTENT_SAME, we probably have to live with. Michael, I recon man page needs updating. I'll remove this hunk from the patch. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[adding mfasheh & btrfs list to cc] On Fri, Jan 27, 2017 at 06:20:12PM +0200, Amir Goldstein wrote: > On Fri, Jan 27, 2017 at 1:50 PM, Amir Goldstein <amir73il@gmail.com> wrote: > > On Fri, Jan 27, 2017 at 1:09 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: > >> On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein <amir73il@gmail.com> wrote: > >>> Before calling write f_ops, call file_start_write() instead > >>> of sb_start_write(). > >>> > >>> This ensures freeze protection for both overlay and upper fs > >>> when file is open from an overlayfs mount. > >>> > >>> Replace {sb,file}_start_write() for {copy,clone}_file_range() and > >>> for fallocate(). > >>> > >>> For dedup_file_range() there is no need for mnt_want_write_file(). > >>> File is already open for write, so we already have mnt_want_write() > >>> and we only need file_start_write(). > >> > >> Being opened for write is not verified if capable(CAP_SYS_ADMIN). > >> Ugly special case, don't ask me why it's done... > >> > > > > Christoph, Darrick, is that by design? > > Anyway, whether is makes sense or not, that's a legacy from > BTRFS_IOC_FILE_EXTENT_SAME, we probably have to live with. > > Michael, I recon man page needs updating. > > I'll remove this hunk from the patch. I /think/ that behavior (CAP_SYS_ADMIN not requiring destfd to be open for writes in order to dedupe) was intentional; it seems to date back to the original ioctl in 2013. My guess of the justification is that we're not really writing to dest, so if the admin comes along with an O_RDONLY destfd it's ok? <shrug> Let's see if we get any bites from the btrfs developers. :) --D -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 27, 2017 at 6:20 PM, Amir Goldstein <amir73il@gmail.com> wrote: > On Fri, Jan 27, 2017 at 1:50 PM, Amir Goldstein <amir73il@gmail.com> wrote: >> On Fri, Jan 27, 2017 at 1:09 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: >>> On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein <amir73il@gmail.com> wrote: >>>> Before calling write f_ops, call file_start_write() instead >>>> of sb_start_write(). >>>> >>>> This ensures freeze protection for both overlay and upper fs >>>> when file is open from an overlayfs mount. >>>> >>>> Replace {sb,file}_start_write() for {copy,clone}_file_range() and >>>> for fallocate(). >>>> Oh boy! there is more. IIUC, file_start_write() is in order were write to regular file should get freeze protection and write to special file should not. So after eliminating dedup_range from this patch, from the 3 remaining ops: fallocate(), clone_file_range() and copy_file_range() all have nice stories. fallocate() can operate or regular file directory and blockdev. blockdev does not call for freeze protection, if we use file_start_write() fs that supports fallocate on directory (anybody?) will not get freeze protection. clone_file_range() operates only on regular file, so file_start_write() seems in order. especially if clone_range() is ever extended to operate on blockdev, which does not sound such a far fetched idea. copy_file_range() looks like it was meant to operate only on regular files, but neither syscall nor vfs helper actually check that. Jan, Would it make sense to add directory to file types for which file_start_write() gets freeze protection to cover the fallocate() case correctly? Christoph, To your understanding, is it correct to add the IS_REG check in vfs_copy_file_range()? Michael, man page for copy_file_range(2) does not explicitly mention regular files but it seems implied and also EINVAL does not mention the case of fd is not a regular file, which is how xfs (and probably other fs too) respond. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 27, 2017 at 07:29:00PM +0200, Amir Goldstein wrote: > Christoph, > To your understanding, is it correct to add the IS_REG check in > vfs_copy_file_range()? Yes. The only thing that could maybe make sense would be block devices. But I'd prefer to only add then on an as-needed basis. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 01/28/2017 05:20 AM, Amir Goldstein wrote: > On Fri, Jan 27, 2017 at 1:50 PM, Amir Goldstein <amir73il@gmail.com> wrote: >> On Fri, Jan 27, 2017 at 1:09 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: >>> On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein <amir73il@gmail.com> wrote: >>>> Before calling write f_ops, call file_start_write() instead >>>> of sb_start_write(). >>>> >>>> This ensures freeze protection for both overlay and upper fs >>>> when file is open from an overlayfs mount. >>>> >>>> Replace {sb,file}_start_write() for {copy,clone}_file_range() and >>>> for fallocate(). >>>> >>>> For dedup_file_range() there is no need for mnt_want_write_file(). >>>> File is already open for write, so we already have mnt_want_write() >>>> and we only need file_start_write(). >>> >>> Being opened for write is not verified if capable(CAP_SYS_ADMIN). >>> Ugly special case, don't ask me why it's done... >>> >> >> Christoph, Darrick, is that by design? > > Anyway, whether is makes sense or not, that's a legacy from > BTRFS_IOC_FILE_EXTENT_SAME, we probably have to live with. > > Michael, I recon man page needs updating. Sorry--can you elaborate on what changes are required? Thanks, Michael
On Fri, Jan 27, 2017 at 9:30 PM, Michael Kerrisk (man-pages) <mtk.manpages@gmail.com> wrote: > On 01/28/2017 05:20 AM, Amir Goldstein wrote: >> On Fri, Jan 27, 2017 at 1:50 PM, Amir Goldstein <amir73il@gmail.com> wrote: >>> On Fri, Jan 27, 2017 at 1:09 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: >>>> On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein <amir73il@gmail.com> wrote: >>>>> Before calling write f_ops, call file_start_write() instead >>>>> of sb_start_write(). >>>>> >>>>> This ensures freeze protection for both overlay and upper fs >>>>> when file is open from an overlayfs mount. >>>>> >>>>> Replace {sb,file}_start_write() for {copy,clone}_file_range() and >>>>> for fallocate(). >>>>> >>>>> For dedup_file_range() there is no need for mnt_want_write_file(). >>>>> File is already open for write, so we already have mnt_want_write() >>>>> and we only need file_start_write(). >>>> >>>> Being opened for write is not verified if capable(CAP_SYS_ADMIN). >>>> Ugly special case, don't ask me why it's done... >>>> >>> >>> Christoph, Darrick, is that by design? >> >> Anyway, whether is makes sense or not, that's a legacy from >> BTRFS_IOC_FILE_EXTENT_SAME, we probably have to live with. >> >> Michael, I recon man page needs updating. > > Sorry--can you elaborate on what changes are required? > 1. IOCTL-FIDEDUPERANGE(2) "During the call, src_fd must be open for reading and dest_fd must be open for writing" Apparently, with CAP_SYS_ADMIN dest_fd is allowed to be open for read-only <eyebrows raised>. It makes some sense, because the data of dest_fd is not being modified. Its not really clear to me why admin would need this capability in the first place, but that's the way it is. 2. copy_file_range(2) It doesn't say anywhere that fd_in and fd_out are supposed to be regular files and that EINVAL/EISDIR can be returns if they are not, but that is probably the behavior of all file systems. Per Christoph's suggestion, I am going to enforce IS_REG in the vfs helper, which is consistent with behavior of clone_file_range and dedupe_file_range 3. fallocate(2) The entire man page says nothing about what to expect from fallocate of directory, but the error codes document: "ENODEV fd does not refer to a regular file or a directory" In reality, file systems return EBADF for directory fd and the vfs helper code says: /* * Let individual file system decide if it supports preallocation * for directories or not. */ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; IS_BLK was added quite recently so man page is out of date is that regard. And I really wonder if there is any file system that "decides to support preallocation for directories"? Because it would make life easier for me to fix correctness for freeze protection of fallocate if vfs helper would deny falloacte a directory. fallocate() seems to be the only operation, theoretically allowed on a directory file descriptor, but only if open for write. Is there any other beast of this sort? If not, and no fs ever implemented this bizar functionality, then perhaps it is best to get rid of that corner case. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 27, 2017 at 1:09 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: > On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein <amir73il@gmail.com> wrote: >> Before calling write f_ops, call file_start_write() instead >> of sb_start_write(). >> >> This ensures freeze protection for both overlay and upper fs >> when file is open from an overlayfs mount. >> >> Replace {sb,file}_start_write() for {copy,clone}_file_range() and >> for fallocate(). >> >> For dedup_file_range() there is no need for mnt_want_write_file(). >> File is already open for write, so we already have mnt_want_write() >> and we only need file_start_write(). > > Being opened for write is not verified if capable(CAP_SYS_ADMIN). > Ugly special case, don't ask me why it's done... > Miklos, Your comment was correct, but you applied the patch as is with the dedup_file_range() change to overlayfs-next regardless. mistake?? I was preparing to re-send without the dedup_file_range() bits, but then I realized that it is possible to get to fallocate() and copy_file_range() with non regular file, so I did not re-send. I'll re-post with another patch to limit fallocate() and copy_file_range() to regular file in vfs helper. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 31, 2017 at 9:11 AM, Amir Goldstein <amir73il@gmail.com> wrote: > On Fri, Jan 27, 2017 at 1:09 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: >> On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein <amir73il@gmail.com> wrote: >>> Before calling write f_ops, call file_start_write() instead >>> of sb_start_write(). >>> >>> This ensures freeze protection for both overlay and upper fs >>> when file is open from an overlayfs mount. >>> >>> Replace {sb,file}_start_write() for {copy,clone}_file_range() and >>> for fallocate(). >>> >>> For dedup_file_range() there is no need for mnt_want_write_file(). >>> File is already open for write, so we already have mnt_want_write() >>> and we only need file_start_write(). >> >> Being opened for write is not verified if capable(CAP_SYS_ADMIN). >> Ugly special case, don't ask me why it's done... >> > > > Miklos, > > Your comment was correct, but you applied the patch as is with the > dedup_file_range() > change to overlayfs-next regardless. mistake?? > Ping. overlayfs-next still hold a wrong patch. Already posted v3 with some more vfs fixes. > I was preparing to re-send without the dedup_file_range() bits, but then > I realized that it is possible to get to fallocate() and copy_file_range() with > non regular file, so I did not re-send. > > I'll re-post with another patch to limit fallocate() and copy_file_range() to > regular file in vfs helper.
On Mon, Feb 6, 2017 at 3:49 PM, Amir Goldstein <amir73il@gmail.com> wrote: > Ping. > overlayfs-next still hold a wrong patch. > Already posted v3 with some more vfs fixes. Pushed. Thanks, Miklos
diff --git a/fs/open.c b/fs/open.c index 9921f70..d3fe1a1 100644 --- a/fs/open.c +++ b/fs/open.c @@ -316,7 +316,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!file->f_op->fallocate) return -EOPNOTSUPP; - sb_start_write(inode->i_sb); + file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* @@ -329,7 +329,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret == 0) fsnotify_modify(file); - sb_end_write(inode->i_sb); + file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); diff --git a/fs/read_write.c b/fs/read_write.c index 5816d4c..1e42487 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1538,7 +1538,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (len == 0) return 0; - sb_start_write(inode_out->i_sb); + file_start_write(file_out); /* * Try cloning first, this is supported by more file systems, and @@ -1574,7 +1574,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, inc_syscr(current); inc_syscw(current); - sb_end_write(inode_out->i_sb); + file_end_write(file_out); return ret; } @@ -1976,11 +1976,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } dst = file_inode(dst_file); - ret = mnt_want_write_file(dst_file); - if (ret) { - info->status = ret; - goto next_loop; - } + file_start_write(dst_file); dst_off = info->dest_offset; ret = clone_verify_area(dst_file, dst_off, len, true); @@ -2013,7 +2009,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } next_file: - mnt_drop_write_file(dst_file); + file_end_write(dst_file); next_loop: fdput(dst_fd); diff --git a/include/linux/fs.h b/include/linux/fs.h index 71811be..8a4dbc6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1741,19 +1741,6 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); -static inline int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 len) -{ - int ret; - - sb_start_write(file_inode(file_out)->i_sb); - ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len); - sb_end_write(file_inode(file_out)->i_sb); - - return ret; -} - struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); @@ -2579,6 +2566,19 @@ static inline void file_end_write(struct file *file) __sb_end_write(locks_inode(file)->i_sb, SB_FREEZE_WRITE); } +static inline int do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len) +{ + int ret; + + file_start_write(file_out); + ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len); + file_end_write(file_out); + + return ret; +} + /* * get_write_access() gets write permission for a file. * put_write_access() releases this write permission.
Before calling write f_ops, call file_start_write() instead of sb_start_write(). This ensures freeze protection for both overlay and upper fs when file is open from an overlayfs mount. Replace {sb,file}_start_write() for {copy,clone}_file_range() and for fallocate(). For dedup_file_range() there is no need for mnt_want_write_file(). File is already open for write, so we already have mnt_want_write() and we only need file_start_write(). Signed-off-by: Amir Goldstein <amir73il@gmail.com> --- fs/open.c | 4 ++-- fs/read_write.c | 12 ++++-------- include/linux/fs.h | 26 +++++++++++++------------- 3 files changed, 19 insertions(+), 23 deletions(-)