Message ID | 1436778368-65447-6-git-send-email-tao.peng@primarydata.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Tao, One question inline (below): On 07/13/2015 05:06 AM, Peng Tao wrote: > It can be called by user space to CLONE two files. > Follow btrfs lead and define NFS_IOC_CLONE same as BTRFS_IOC_CLONE. > Thus we don't mess up userspace with too many ioctls. > > Signed-off-by: Peng Tao <tao.peng@primarydata.com> > --- > fs/nfs/nfs4file.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/nfs.h | 4 ++ > 2 files changed, 106 insertions(+) > > diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c > index dcd39d4..dfa6620 100644 > --- a/fs/nfs/nfs4file.c > +++ b/fs/nfs/nfs4file.c > @@ -4,6 +4,7 @@ > * Copyright (C) 1992 Rick Sladkey > */ > #include <linux/fs.h> > +#include <linux/file.h> > #include <linux/falloc.h> > #include <linux/nfs_fs.h> > #include "internal.h" > @@ -166,8 +167,104 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t > return nfs42_proc_deallocate(filep, offset, len); > return nfs42_proc_allocate(filep, offset, len); > } > + > +static noinline long > +nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd, > + u64 src_off, u64 dst_off, u64 count) > +{ > + struct inode *dst_inode = file_inode(dst_file); > + struct fd src_file; > + struct inode *src_inode; > + int ret; > + > + /* dst file must be opened for writing */ > + if (!(dst_file->f_mode & FMODE_WRITE)) > + return -EINVAL; > + > + ret = mnt_want_write_file(dst_file); > + if (ret) > + return ret; > + > + src_file = fdget(srcfd); > + if (!src_file.file) { > + ret = -EBADF; > + goto out_drop_write; > + } > + > + src_inode = file_inode(src_file.file); > + > + /* src and dst must be different files */ > + ret = -EINVAL; > + if (src_inode == dst_inode) > + goto out_fput; > + > + /* src file must be opened for reading */ > + if (!(src_file.file->f_mode & FMODE_READ)) > + goto out_fput; > + > + /* src and dst must be regular files */ > + ret = -EISDIR; > + if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode)) > + goto out_fput; > + > + ret = -EXDEV; > + if (src_file.file->f_path.mnt != dst_file->f_path.mnt || > + src_inode->i_sb != dst_inode->i_sb) > + goto out_fput; > + > + /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ > + if (dst_inode < src_inode) { Why is the order of inode numbers important? Thanks, Anna > + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); > + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); > + } else { > + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); > + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); > + } > + > + /* flush all pending writes on both src and dst so that server > + * has the latest data */ > + ret = nfs_sync_inode(src_inode); > + if (ret) > + goto out_unlock; > + ret = nfs_sync_inode(dst_inode); > + if (ret) > + goto out_unlock; > + > + ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count); > + > + /* truncate inode page cache of the dst range so that future reads can fetch > + * new data from server */ > + if (!ret) > + truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1); > + > +out_unlock: > + if (dst_inode < src_inode) { > + mutex_unlock(&src_inode->i_mutex); > + mutex_unlock(&dst_inode->i_mutex); > + } else { > + mutex_unlock(&dst_inode->i_mutex); > + mutex_unlock(&src_inode->i_mutex); > + } > +out_fput: > + fdput(src_file); > +out_drop_write: > + mnt_drop_write_file(dst_file); > + return ret; > +} > #endif /* CONFIG_NFS_V4_2 */ > > +long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg) > +{ > + switch (cmd) { > +#ifdef CONFIG_NFS_V4_2 > + case NFS_IOC_CLONE: > + return nfs42_ioctl_clone(file, arg, 0, 0, 0); > +#endif > + } > + > + return -ENOTTY; > +} > + > const struct file_operations nfs4_file_operations = { > #ifdef CONFIG_NFS_V4_2 > .llseek = nfs4_file_llseek, > @@ -190,4 +287,9 @@ const struct file_operations nfs4_file_operations = { > #endif /* CONFIG_NFS_V4_2 */ > .check_flags = nfs_check_flags, > .setlease = simple_nosetlease, > +#ifdef CONFIG_COMPAT > + .unlocked_ioctl = nfs4_ioctl, > +#else > + .compat_ioctl = nfs4_ioctl, > +#endif /* CONFIG_COMPAT */ > }; > diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h > index 5199a36..d85748d 100644 > --- a/include/uapi/linux/nfs.h > +++ b/include/uapi/linux/nfs.h > @@ -31,6 +31,10 @@ > > #define NFS_PIPE_DIRNAME "nfs" > > +/* NFS ioctls */ > +/* Let's follow btrfs lead on CLONE to avoid messing userspace */ > +#define NFS_IOC_CLONE _IOW(0x94, 9, int) > + > /* > * NFS stats. The good thing with these values is that NFSv3 errors are > * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jul 13, 2015 at 9:53 PM, Anna Schumaker <Anna.Schumaker@netapp.com> wrote: > Hi Tao, > > One question inline (below): > > On 07/13/2015 05:06 AM, Peng Tao wrote: >> It can be called by user space to CLONE two files. >> Follow btrfs lead and define NFS_IOC_CLONE same as BTRFS_IOC_CLONE. >> Thus we don't mess up userspace with too many ioctls. >> >> Signed-off-by: Peng Tao <tao.peng@primarydata.com> >> --- >> fs/nfs/nfs4file.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++ >> include/uapi/linux/nfs.h | 4 ++ >> 2 files changed, 106 insertions(+) >> >> diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c >> index dcd39d4..dfa6620 100644 >> --- a/fs/nfs/nfs4file.c >> +++ b/fs/nfs/nfs4file.c >> @@ -4,6 +4,7 @@ >> * Copyright (C) 1992 Rick Sladkey >> */ >> #include <linux/fs.h> >> +#include <linux/file.h> >> #include <linux/falloc.h> >> #include <linux/nfs_fs.h> >> #include "internal.h" >> @@ -166,8 +167,104 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t >> return nfs42_proc_deallocate(filep, offset, len); >> return nfs42_proc_allocate(filep, offset, len); >> } >> + >> +static noinline long >> +nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd, >> + u64 src_off, u64 dst_off, u64 count) >> +{ >> + struct inode *dst_inode = file_inode(dst_file); >> + struct fd src_file; >> + struct inode *src_inode; >> + int ret; >> + >> + /* dst file must be opened for writing */ >> + if (!(dst_file->f_mode & FMODE_WRITE)) >> + return -EINVAL; >> + >> + ret = mnt_want_write_file(dst_file); >> + if (ret) >> + return ret; >> + >> + src_file = fdget(srcfd); >> + if (!src_file.file) { >> + ret = -EBADF; >> + goto out_drop_write; >> + } >> + >> + src_inode = file_inode(src_file.file); >> + >> + /* src and dst must be different files */ >> + ret = -EINVAL; >> + if (src_inode == dst_inode) >> + goto out_fput; >> + >> + /* src file must be opened for reading */ >> + if (!(src_file.file->f_mode & FMODE_READ)) >> + goto out_fput; >> + >> + /* src and dst must be regular files */ >> + ret = -EISDIR; >> + if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode)) >> + goto out_fput; >> + >> + ret = -EXDEV; >> + if (src_file.file->f_path.mnt != dst_file->f_path.mnt || >> + src_inode->i_sb != dst_inode->i_sb) >> + goto out_fput; >> + >> + /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ >> + if (dst_inode < src_inode) { > > Why is the order of inode numbers important? If we get two concurrent clone ioctls CLONE(foo,bar) and CLONE(bar,foo), the lock ordering avoids deadlock since we always lock smaller inode first. Cheers, Tao > > Thanks, > Anna > >> + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); >> + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); >> + } else { >> + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); >> + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); >> + } >> + >> + /* flush all pending writes on both src and dst so that server >> + * has the latest data */ >> + ret = nfs_sync_inode(src_inode); >> + if (ret) >> + goto out_unlock; >> + ret = nfs_sync_inode(dst_inode); >> + if (ret) >> + goto out_unlock; >> + >> + ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count); >> + >> + /* truncate inode page cache of the dst range so that future reads can fetch >> + * new data from server */ >> + if (!ret) >> + truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1); >> + >> +out_unlock: >> + if (dst_inode < src_inode) { >> + mutex_unlock(&src_inode->i_mutex); >> + mutex_unlock(&dst_inode->i_mutex); >> + } else { >> + mutex_unlock(&dst_inode->i_mutex); >> + mutex_unlock(&src_inode->i_mutex); >> + } >> +out_fput: >> + fdput(src_file); >> +out_drop_write: >> + mnt_drop_write_file(dst_file); >> + return ret; >> +} >> #endif /* CONFIG_NFS_V4_2 */ >> >> +long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg) >> +{ >> + switch (cmd) { >> +#ifdef CONFIG_NFS_V4_2 >> + case NFS_IOC_CLONE: >> + return nfs42_ioctl_clone(file, arg, 0, 0, 0); >> +#endif >> + } >> + >> + return -ENOTTY; >> +} >> + >> const struct file_operations nfs4_file_operations = { >> #ifdef CONFIG_NFS_V4_2 >> .llseek = nfs4_file_llseek, >> @@ -190,4 +287,9 @@ const struct file_operations nfs4_file_operations = { >> #endif /* CONFIG_NFS_V4_2 */ >> .check_flags = nfs_check_flags, >> .setlease = simple_nosetlease, >> +#ifdef CONFIG_COMPAT >> + .unlocked_ioctl = nfs4_ioctl, >> +#else >> + .compat_ioctl = nfs4_ioctl, >> +#endif /* CONFIG_COMPAT */ >> }; >> diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h >> index 5199a36..d85748d 100644 >> --- a/include/uapi/linux/nfs.h >> +++ b/include/uapi/linux/nfs.h >> @@ -31,6 +31,10 @@ >> >> #define NFS_PIPE_DIRNAME "nfs" >> >> +/* NFS ioctls */ >> +/* Let's follow btrfs lead on CLONE to avoid messing userspace */ >> +#define NFS_IOC_CLONE _IOW(0x94, 9, int) >> + >> /* >> * NFS stats. The good thing with these values is that NFSv3 errors are >> * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which >> > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index dcd39d4..dfa6620 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -4,6 +4,7 @@ * Copyright (C) 1992 Rick Sladkey */ #include <linux/fs.h> +#include <linux/file.h> #include <linux/falloc.h> #include <linux/nfs_fs.h> #include "internal.h" @@ -166,8 +167,104 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t return nfs42_proc_deallocate(filep, offset, len); return nfs42_proc_allocate(filep, offset, len); } + +static noinline long +nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd, + u64 src_off, u64 dst_off, u64 count) +{ + struct inode *dst_inode = file_inode(dst_file); + struct fd src_file; + struct inode *src_inode; + int ret; + + /* dst file must be opened for writing */ + if (!(dst_file->f_mode & FMODE_WRITE)) + return -EINVAL; + + ret = mnt_want_write_file(dst_file); + if (ret) + return ret; + + src_file = fdget(srcfd); + if (!src_file.file) { + ret = -EBADF; + goto out_drop_write; + } + + src_inode = file_inode(src_file.file); + + /* src and dst must be different files */ + ret = -EINVAL; + if (src_inode == dst_inode) + goto out_fput; + + /* src file must be opened for reading */ + if (!(src_file.file->f_mode & FMODE_READ)) + goto out_fput; + + /* src and dst must be regular files */ + ret = -EISDIR; + if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src_file.file->f_path.mnt != dst_file->f_path.mnt || + src_inode->i_sb != dst_inode->i_sb) + goto out_fput; + + /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ + if (dst_inode < src_inode) { + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); + } + + /* flush all pending writes on both src and dst so that server + * has the latest data */ + ret = nfs_sync_inode(src_inode); + if (ret) + goto out_unlock; + ret = nfs_sync_inode(dst_inode); + if (ret) + goto out_unlock; + + ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count); + + /* truncate inode page cache of the dst range so that future reads can fetch + * new data from server */ + if (!ret) + truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1); + +out_unlock: + if (dst_inode < src_inode) { + mutex_unlock(&src_inode->i_mutex); + mutex_unlock(&dst_inode->i_mutex); + } else { + mutex_unlock(&dst_inode->i_mutex); + mutex_unlock(&src_inode->i_mutex); + } +out_fput: + fdput(src_file); +out_drop_write: + mnt_drop_write_file(dst_file); + return ret; +} #endif /* CONFIG_NFS_V4_2 */ +long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { +#ifdef CONFIG_NFS_V4_2 + case NFS_IOC_CLONE: + return nfs42_ioctl_clone(file, arg, 0, 0, 0); +#endif + } + + return -ENOTTY; +} + const struct file_operations nfs4_file_operations = { #ifdef CONFIG_NFS_V4_2 .llseek = nfs4_file_llseek, @@ -190,4 +287,9 @@ const struct file_operations nfs4_file_operations = { #endif /* CONFIG_NFS_V4_2 */ .check_flags = nfs_check_flags, .setlease = simple_nosetlease, +#ifdef CONFIG_COMPAT + .unlocked_ioctl = nfs4_ioctl, +#else + .compat_ioctl = nfs4_ioctl, +#endif /* CONFIG_COMPAT */ }; diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h index 5199a36..d85748d 100644 --- a/include/uapi/linux/nfs.h +++ b/include/uapi/linux/nfs.h @@ -31,6 +31,10 @@ #define NFS_PIPE_DIRNAME "nfs" +/* NFS ioctls */ +/* Let's follow btrfs lead on CLONE to avoid messing userspace */ +#define NFS_IOC_CLONE _IOW(0x94, 9, int) + /* * NFS stats. The good thing with these values is that NFSv3 errors are * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which
It can be called by user space to CLONE two files. Follow btrfs lead and define NFS_IOC_CLONE same as BTRFS_IOC_CLONE. Thus we don't mess up userspace with too many ioctls. Signed-off-by: Peng Tao <tao.peng@primarydata.com> --- fs/nfs/nfs4file.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nfs.h | 4 ++ 2 files changed, 106 insertions(+)