Message ID | 20240520164624.665269-4-aalbersh@redhat.com (mailing list archive) |
---|---|
State | Accepted, archived |
Headers | show |
Series | Introduce FS_IOC_FSSETXATTRAT/FS_IOC_FSGETXATTRAT ioctls | expand |
On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote: > XFS has project quotas which could be attached to a directory. All > new inodes in these directories inherit project ID set on parent > directory. > > The project is created from userspace by opening and calling > FS_IOC_FSSETXATTR on each inode. This is not possible for special > files such as FIFO, SOCK, BLK etc. as opening them returns a special > inode from VFS. Therefore, some inodes are left with empty project > ID. Those inodes then are not shown in the quota accounting but > still exist in the directory. > > This patch adds two new ioctls which allows userspace, such as > xfs_quota, to set project ID on special files by using parent > directory to open FS inode. This will let xfs_quota set ID on all > inodes and also reset it when project is removed. Also, as > vfs_fileattr_set() is now will called on special files too, let's > forbid any other attributes except projid and nextents (symlink can > have one). > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > --- > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/fs.h | 11 +++++ > 2 files changed, 104 insertions(+) > > diff --git a/fs/ioctl.c b/fs/ioctl.c > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > --- a/fs/ioctl.c > +++ b/fs/ioctl.c > @@ -22,6 +22,7 @@ > #include <linux/mount.h> > #include <linux/fscrypt.h> > #include <linux/fileattr.h> > +#include <linux/namei.h> > > #include "internal.h" > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > if (fa->fsx_cowextsize == 0) > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > + /* > + * The only use case for special files is to set project ID, forbid any > + * other attributes > + */ > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) When would PROJINHERIT be set on a non-reg/non-dir file? > + return -EINVAL; > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any other type of file, does it? > + return -EINVAL; > + if (fa->fsx_extsize || fa->fsx_cowextsize) > + return -EINVAL; > + } > + > return 0; > } > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > return err; > } > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > +{ > + struct path filepath; > + struct fsxattrat fsxat; > + struct fileattr fa; > + int error; > + > + if (!S_ISDIR(file_inode(file)->i_mode)) > + return -EBADF; > + > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > + return -EFAULT; > + > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > + if (error) > + return error; > + > + error = vfs_fileattr_get(filepath.dentry, &fa); > + if (error) { > + path_put(&filepath); > + return error; > + } > + > + fsxat.fsx.fsx_xflags = fa.fsx_xflags; > + fsxat.fsx.fsx_extsize = fa.fsx_extsize; > + fsxat.fsx.fsx_nextents = fa.fsx_nextents; > + fsxat.fsx.fsx_projid = fa.fsx_projid; > + fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize; > + > + if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat))) > + error = -EFAULT; > + > + path_put(&filepath); > + return error; > +} > + > +static int ioctl_fssetxattrat(struct file *file, void __user *argp) > +{ > + struct mnt_idmap *idmap = file_mnt_idmap(file); > + struct fsxattrat fsxat; > + struct path filepath; > + struct fileattr fa; > + int error; > + > + if (!S_ISDIR(file_inode(file)->i_mode)) > + return -EBADF; > + > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > + return -EFAULT; > + > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > + if (error) > + return error; > + > + error = mnt_want_write(filepath.mnt); > + if (error) { > + path_put(&filepath); > + return error; This could be a goto to the path_put below. > + } > + > + fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags); > + fa.fsx_extsize = fsxat.fsx.fsx_extsize; > + fa.fsx_nextents = fsxat.fsx.fsx_nextents; > + fa.fsx_projid = fsxat.fsx.fsx_projid; > + fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize; > + fa.fsx_valid = true; > + > + error = vfs_fileattr_set(idmap, filepath.dentry, &fa); Why not pass &fsxat.fsx directly to vfs_fileattr_set? > + mnt_drop_write(filepath.mnt); > + path_put(&filepath); > + return error; > +} > + > static int ioctl_getfsuuid(struct file *file, void __user *argp) > { > struct super_block *sb = file_inode(file)->i_sb; > @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > case FS_IOC_FSSETXATTR: > return ioctl_fssetxattr(filp, argp); > > + case FS_IOC_FSGETXATTRAT: > + return ioctl_fsgetxattrat(filp, argp); > + > + case FS_IOC_FSSETXATTRAT: > + return ioctl_fssetxattrat(filp, argp); > + > case FS_IOC_GETFSUUID: > return ioctl_getfsuuid(filp, argp); > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > index 45e4e64fd664..f8cd8d7bf35d 100644 > --- a/include/uapi/linux/fs.h > +++ b/include/uapi/linux/fs.h > @@ -139,6 +139,15 @@ struct fsxattr { > unsigned char fsx_pad[8]; > }; > > +/* > + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT > + */ > +struct fsxattrat { > + struct fsxattr fsx; /* XATTR to get/set */ > + __u32 dfd; /* parent dir */ > + const char __user *path; As I mentioned last time[1], embedding a pointer in an ioctl structure creates porting problems because pointer sizes vary between process personalities, so the size of struct fsxattrat will vary and lead to copy_to/from_user overflows. [1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/ --D > +}; > + > /* > * Flags for the fsx_xflags field > */ > @@ -231,6 +240,8 @@ struct fsxattr { > #define FS_IOC32_SETVERSION _IOW('v', 2, int) > #define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) > #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) > +#define FS_IOC_FSGETXATTRAT _IOR('X', 33, struct fsxattrat) > +#define FS_IOC_FSSETXATTRAT _IOW('X', 34, struct fsxattrat) > #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) > #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) > /* Returns the external filesystem UUID, the same one blkid returns */ > -- > 2.42.0 > >
On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > XFS has project quotas which could be attached to a directory. All > new inodes in these directories inherit project ID set on parent > directory. > > The project is created from userspace by opening and calling > FS_IOC_FSSETXATTR on each inode. This is not possible for special > files such as FIFO, SOCK, BLK etc. as opening them returns a special > inode from VFS. Therefore, some inodes are left with empty project > ID. Those inodes then are not shown in the quota accounting but > still exist in the directory. > > This patch adds two new ioctls which allows userspace, such as > xfs_quota, to set project ID on special files by using parent > directory to open FS inode. This will let xfs_quota set ID on all > inodes and also reset it when project is removed. Also, as > vfs_fileattr_set() is now will called on special files too, let's > forbid any other attributes except projid and nextents (symlink can > have one). > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > --- > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/fs.h | 11 +++++ > 2 files changed, 104 insertions(+) > > diff --git a/fs/ioctl.c b/fs/ioctl.c > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > --- a/fs/ioctl.c > +++ b/fs/ioctl.c > @@ -22,6 +22,7 @@ > #include <linux/mount.h> > #include <linux/fscrypt.h> > #include <linux/fileattr.h> > +#include <linux/namei.h> > > #include "internal.h" > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > if (fa->fsx_cowextsize == 0) > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > + /* > + * The only use case for special files is to set project ID, forbid any > + * other attributes > + */ > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > + return -EINVAL; > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > + return -EINVAL; > + if (fa->fsx_extsize || fa->fsx_cowextsize) > + return -EINVAL; > + } > + > return 0; > } > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > return err; > } > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > +{ > + struct path filepath; > + struct fsxattrat fsxat; > + struct fileattr fa; > + int error; > + > + if (!S_ISDIR(file_inode(file)->i_mode)) > + return -EBADF; So the *only* thing that is done with the fd of the ioctl is to verify that it is a directory fd - there is no verification that this fd is on the same sb as the path to act on. Was this the intention? It does not make a lot of sense to me and AFAIK there is no precedent to an API like this. There are ioctls that operate on the filesystem using any fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH} and maybe the closest example to what you are trying to add XFS_IOC_BULKSTAT. Trying to think of a saner API for this - perhaps pass an O_PATH fd without any filename in struct fsxattrat, saving you also the headache of passing a variable length string in an ioctl. Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path and file->f_path are on the same sb before proceeding to operate on atfile->f_path.dentry. The (maybe more sane) alternative is to add syscalls instead of ioctls, but I won't force you to go there... What do everyone think? Thanks, Amir.
[fix fsdevel address] On Mon, May 20, 2024 at 10:03 PM Amir Goldstein <amir73il@gmail.com> wrote: > > On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > XFS has project quotas which could be attached to a directory. All > > new inodes in these directories inherit project ID set on parent > > directory. > > > > The project is created from userspace by opening and calling > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > inode from VFS. Therefore, some inodes are left with empty project > > ID. Those inodes then are not shown in the quota accounting but > > still exist in the directory. > > > > This patch adds two new ioctls which allows userspace, such as > > xfs_quota, to set project ID on special files by using parent > > directory to open FS inode. This will let xfs_quota set ID on all > > inodes and also reset it when project is removed. Also, as > > vfs_fileattr_set() is now will called on special files too, let's > > forbid any other attributes except projid and nextents (symlink can > > have one). > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > --- > > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/fs.h | 11 +++++ > > 2 files changed, 104 insertions(+) > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > > --- a/fs/ioctl.c > > +++ b/fs/ioctl.c > > @@ -22,6 +22,7 @@ > > #include <linux/mount.h> > > #include <linux/fscrypt.h> > > #include <linux/fileattr.h> > > +#include <linux/namei.h> > > > > #include "internal.h" > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > > if (fa->fsx_cowextsize == 0) > > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > > > + /* > > + * The only use case for special files is to set project ID, forbid any > > + * other attributes > > + */ > > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > > + return -EINVAL; > > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > > + return -EINVAL; > > + if (fa->fsx_extsize || fa->fsx_cowextsize) > > + return -EINVAL; > > + } > > + > > return 0; > > } > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > > return err; > > } > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > > +{ > > + struct path filepath; > > + struct fsxattrat fsxat; > > + struct fileattr fa; > > + int error; > > + > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > + return -EBADF; > > So the *only* thing that is done with the fd of the ioctl is to verify > that it is a directory fd - there is no verification that this fd is on the > same sb as the path to act on. > > Was this the intention? It does not make a lot of sense to me > and AFAIK there is no precedent to an API like this. > > There are ioctls that operate on the filesystem using any > fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH} > and maybe the closest example to what you are trying to add > XFS_IOC_BULKSTAT. > > Trying to think of a saner API for this - perhaps pass an O_PATH > fd without any filename in struct fsxattrat, saving you also the > headache of passing a variable length string in an ioctl. > > Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path > and file->f_path are on the same sb before proceeding to operate > on atfile->f_path.dentry. > > The (maybe more sane) alternative is to add syscalls instead of > ioctls, but I won't force you to go there... > > What do everyone think? > > Thanks, > Amir.
On 2024-05-20 10:51:59, Darrick J. Wong wrote: > On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote: > > XFS has project quotas which could be attached to a directory. All > > new inodes in these directories inherit project ID set on parent > > directory. > > > > The project is created from userspace by opening and calling > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > inode from VFS. Therefore, some inodes are left with empty project > > ID. Those inodes then are not shown in the quota accounting but > > still exist in the directory. > > > > This patch adds two new ioctls which allows userspace, such as > > xfs_quota, to set project ID on special files by using parent > > directory to open FS inode. This will let xfs_quota set ID on all > > inodes and also reset it when project is removed. Also, as > > vfs_fileattr_set() is now will called on special files too, let's > > forbid any other attributes except projid and nextents (symlink can > > have one). > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > --- > > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/fs.h | 11 +++++ > > 2 files changed, 104 insertions(+) > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > > --- a/fs/ioctl.c > > +++ b/fs/ioctl.c > > @@ -22,6 +22,7 @@ > > #include <linux/mount.h> > > #include <linux/fscrypt.h> > > #include <linux/fileattr.h> > > +#include <linux/namei.h> > > > > #include "internal.h" > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > > if (fa->fsx_cowextsize == 0) > > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > > > + /* > > + * The only use case for special files is to set project ID, forbid any > > + * other attributes > > + */ > > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > > When would PROJINHERIT be set on a non-reg/non-dir file? Oh I thought it's set on all inodes in projects anyway, but looks like it's dropped for files in xfs_flags2diflags(). The xfs_quota just set it for each inode so I just haven't checked it. I will drop this check and change xfs_quota to not set PROJINHERIT for special files. > > + return -EINVAL; > > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > > FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any > other type of file, does it? yes, it doesn't enforce anything, as I described in the comment I tried to reject any other uses. But symlink has fsx_nextents == 1 so get + set call will fail. > > > + return -EINVAL; > > + if (fa->fsx_extsize || fa->fsx_cowextsize) > > + return -EINVAL; > > + } > > + > > return 0; > > } > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > > return err; > > } > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > > +{ > > + struct path filepath; > > + struct fsxattrat fsxat; > > + struct fileattr fa; > > + int error; > > + > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > + return -EBADF; > > + > > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > > + return -EFAULT; > > + > > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > > + if (error) > > + return error; > > + > > + error = vfs_fileattr_get(filepath.dentry, &fa); > > + if (error) { > > + path_put(&filepath); > > + return error; > > + } > > + > > + fsxat.fsx.fsx_xflags = fa.fsx_xflags; > > + fsxat.fsx.fsx_extsize = fa.fsx_extsize; > > + fsxat.fsx.fsx_nextents = fa.fsx_nextents; > > + fsxat.fsx.fsx_projid = fa.fsx_projid; > > + fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize; > > + > > + if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat))) > > + error = -EFAULT; > > + > > + path_put(&filepath); > > + return error; > > +} > > + > > +static int ioctl_fssetxattrat(struct file *file, void __user *argp) > > +{ > > + struct mnt_idmap *idmap = file_mnt_idmap(file); > > + struct fsxattrat fsxat; > > + struct path filepath; > > + struct fileattr fa; > > + int error; > > + > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > + return -EBADF; > > + > > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > > + return -EFAULT; > > + > > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > > + if (error) > > + return error; > > + > > + error = mnt_want_write(filepath.mnt); > > + if (error) { > > + path_put(&filepath); > > + return error; > > This could be a goto to the path_put below. > > > + } > > + > > + fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags); > > + fa.fsx_extsize = fsxat.fsx.fsx_extsize; > > + fa.fsx_nextents = fsxat.fsx.fsx_nextents; > > + fa.fsx_projid = fsxat.fsx.fsx_projid; > > + fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize; > > + fa.fsx_valid = true; > > + > > + error = vfs_fileattr_set(idmap, filepath.dentry, &fa); > > Why not pass &fsxat.fsx directly to vfs_fileattr_set? vfs_fileattr_set() takes fileattr and there won't be fsx_valid, no? I suppose they aren't aligned > > > + mnt_drop_write(filepath.mnt); > > + path_put(&filepath); > > + return error; > > +} > > + > > static int ioctl_getfsuuid(struct file *file, void __user *argp) > > { > > struct super_block *sb = file_inode(file)->i_sb; > > @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > > case FS_IOC_FSSETXATTR: > > return ioctl_fssetxattr(filp, argp); > > > > + case FS_IOC_FSGETXATTRAT: > > + return ioctl_fsgetxattrat(filp, argp); > > + > > + case FS_IOC_FSSETXATTRAT: > > + return ioctl_fssetxattrat(filp, argp); > > + > > case FS_IOC_GETFSUUID: > > return ioctl_getfsuuid(filp, argp); > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > > index 45e4e64fd664..f8cd8d7bf35d 100644 > > --- a/include/uapi/linux/fs.h > > +++ b/include/uapi/linux/fs.h > > @@ -139,6 +139,15 @@ struct fsxattr { > > unsigned char fsx_pad[8]; > > }; > > > > +/* > > + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT > > + */ > > +struct fsxattrat { > > + struct fsxattr fsx; /* XATTR to get/set */ > > + __u32 dfd; /* parent dir */ > > + const char __user *path; > > As I mentioned last time[1], embedding a pointer in an ioctl structure > creates porting problems because pointer sizes vary between process > personalities, so the size of struct fsxattrat will vary and lead to > copy_to/from_user overflows. Oh right, sorry, totally forgot about that > > > [1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/ > > --D > > > +}; > > + > > /* > > * Flags for the fsx_xflags field > > */ > > @@ -231,6 +240,8 @@ struct fsxattr { > > #define FS_IOC32_SETVERSION _IOW('v', 2, int) > > #define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) > > #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) > > +#define FS_IOC_FSGETXATTRAT _IOR('X', 33, struct fsxattrat) > > +#define FS_IOC_FSSETXATTRAT _IOW('X', 34, struct fsxattrat) > > #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) > > #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) > > /* Returns the external filesystem UUID, the same one blkid returns */ > > -- > > 2.42.0 > > > > >
On Mon, May 20, 2024 at 10:51:59AM -0700, Darrick J. Wong wrote: > On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote: > > XFS has project quotas which could be attached to a directory. All > > new inodes in these directories inherit project ID set on parent > > directory. > > > > The project is created from userspace by opening and calling > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > inode from VFS. Therefore, some inodes are left with empty project > > ID. Those inodes then are not shown in the quota accounting but > > still exist in the directory. > > > > This patch adds two new ioctls which allows userspace, such as > > xfs_quota, to set project ID on special files by using parent > > directory to open FS inode. This will let xfs_quota set ID on all > > inodes and also reset it when project is removed. Also, as > > vfs_fileattr_set() is now will called on special files too, let's > > forbid any other attributes except projid and nextents (symlink can > > have one). > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > --- > > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/fs.h | 11 +++++ > > 2 files changed, 104 insertions(+) > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > > --- a/fs/ioctl.c > > +++ b/fs/ioctl.c > > @@ -22,6 +22,7 @@ > > #include <linux/mount.h> > > #include <linux/fscrypt.h> > > #include <linux/fileattr.h> > > +#include <linux/namei.h> > > > > #include "internal.h" > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > > if (fa->fsx_cowextsize == 0) > > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > > > + /* > > + * The only use case for special files is to set project ID, forbid any > > + * other attributes > > + */ > > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > > When would PROJINHERIT be set on a non-reg/non-dir file? > > > + return -EINVAL; > > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > > FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any > other type of file, does it? > > > + return -EINVAL; > > + if (fa->fsx_extsize || fa->fsx_cowextsize) > > + return -EINVAL; > > + } > > + > > return 0; > > } > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > > return err; > > } > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > > +{ > > + struct path filepath; > > + struct fsxattrat fsxat; > > + struct fileattr fa; > > + int error; > > + > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > + return -EBADF; > > + > > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > > + return -EFAULT; > > + > > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > > + if (error) > > + return error; > > + > > + error = vfs_fileattr_get(filepath.dentry, &fa); > > + if (error) { > > + path_put(&filepath); > > + return error; > > + } > > + > > + fsxat.fsx.fsx_xflags = fa.fsx_xflags; > > + fsxat.fsx.fsx_extsize = fa.fsx_extsize; > > + fsxat.fsx.fsx_nextents = fa.fsx_nextents; > > + fsxat.fsx.fsx_projid = fa.fsx_projid; > > + fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize; > > + > > + if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat))) > > + error = -EFAULT; > > + > > + path_put(&filepath); > > + return error; > > +} > > + > > +static int ioctl_fssetxattrat(struct file *file, void __user *argp) > > +{ > > + struct mnt_idmap *idmap = file_mnt_idmap(file); > > + struct fsxattrat fsxat; > > + struct path filepath; > > + struct fileattr fa; > > + int error; > > + > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > + return -EBADF; > > + > > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > > + return -EFAULT; > > + > > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > > + if (error) > > + return error; > > + > > + error = mnt_want_write(filepath.mnt); > > + if (error) { > > + path_put(&filepath); > > + return error; > > This could be a goto to the path_put below. (Unrelated to content but we should probably grow cleanup guards for path_get()/path_put() and mnt_want_write()/mnt_drop_write() then stuff like this becomes a no-brainer.) > > > + } > > + > > + fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags); > > + fa.fsx_extsize = fsxat.fsx.fsx_extsize; > > + fa.fsx_nextents = fsxat.fsx.fsx_nextents; > > + fa.fsx_projid = fsxat.fsx.fsx_projid; > > + fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize; > > + fa.fsx_valid = true; > > + > > + error = vfs_fileattr_set(idmap, filepath.dentry, &fa); > > Why not pass &fsxat.fsx directly to vfs_fileattr_set? > > > + mnt_drop_write(filepath.mnt); > > + path_put(&filepath); > > + return error; > > +} > > + > > static int ioctl_getfsuuid(struct file *file, void __user *argp) > > { > > struct super_block *sb = file_inode(file)->i_sb; > > @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > > case FS_IOC_FSSETXATTR: > > return ioctl_fssetxattr(filp, argp); > > > > + case FS_IOC_FSGETXATTRAT: > > + return ioctl_fsgetxattrat(filp, argp); > > + > > + case FS_IOC_FSSETXATTRAT: > > + return ioctl_fssetxattrat(filp, argp); > > + > > case FS_IOC_GETFSUUID: > > return ioctl_getfsuuid(filp, argp); > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > > index 45e4e64fd664..f8cd8d7bf35d 100644 > > --- a/include/uapi/linux/fs.h > > +++ b/include/uapi/linux/fs.h > > @@ -139,6 +139,15 @@ struct fsxattr { > > unsigned char fsx_pad[8]; > > }; > > > > +/* > > + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT > > + */ > > +struct fsxattrat { > > + struct fsxattr fsx; /* XATTR to get/set */ > > + __u32 dfd; /* parent dir */ > > + const char __user *path; > > As I mentioned last time[1], embedding a pointer in an ioctl structure > creates porting problems because pointer sizes vary between process > personalities, so the size of struct fsxattrat will vary and lead to > copy_to/from_user overflows. > > > [1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/ So as you've mentioned in that thread using a u64 or here an aligned_u64 makes the most sense. The kernel has all the necessary magic to deal with this already (u64_to_user_ptr() helper etc.). If you want to be extra-sure it's possible to slap a size for that path as well. The ioctl structure can be versioned by size if it's 64bit aligned. The ioctl code encodes the size of the struct and since the current structure is all nicely 64bit aligned it should just work (tm). kernel/seccomp.c does that size verisioning by ioctl as an example. Then one can do: #define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) switch (EA_IOCTL(cmd)) { case EA_IOCTL(FS_IOC_FSGETXATTRAT): ret = copy_struct_from_user(&kstruct, sizeof(kstruct), usstruct, _IOC_SIZE(cmd)); Which will do all the heavy lifting for you. To me that seems workable but I might miss other problems you're thinking of.
On Tue, May 21, 2024 at 04:06:15PM +0200, Christian Brauner wrote: > On Mon, May 20, 2024 at 10:51:59AM -0700, Darrick J. Wong wrote: > > On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote: > > > XFS has project quotas which could be attached to a directory. All > > > new inodes in these directories inherit project ID set on parent > > > directory. > > > > > > The project is created from userspace by opening and calling > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > inode from VFS. Therefore, some inodes are left with empty project > > > ID. Those inodes then are not shown in the quota accounting but > > > still exist in the directory. > > > > > > This patch adds two new ioctls which allows userspace, such as > > > xfs_quota, to set project ID on special files by using parent > > > directory to open FS inode. This will let xfs_quota set ID on all > > > inodes and also reset it when project is removed. Also, as > > > vfs_fileattr_set() is now will called on special files too, let's > > > forbid any other attributes except projid and nextents (symlink can > > > have one). > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > --- > > > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > > > include/uapi/linux/fs.h | 11 +++++ > > > 2 files changed, 104 insertions(+) > > > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > > > --- a/fs/ioctl.c > > > +++ b/fs/ioctl.c > > > @@ -22,6 +22,7 @@ > > > #include <linux/mount.h> > > > #include <linux/fscrypt.h> > > > #include <linux/fileattr.h> > > > +#include <linux/namei.h> > > > > > > #include "internal.h" > > > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > > > if (fa->fsx_cowextsize == 0) > > > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > > > > > + /* > > > + * The only use case for special files is to set project ID, forbid any > > > + * other attributes > > > + */ > > > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > > > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > > > > When would PROJINHERIT be set on a non-reg/non-dir file? > > > > > + return -EINVAL; > > > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > > > > FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any > > other type of file, does it? > > > > > + return -EINVAL; > > > + if (fa->fsx_extsize || fa->fsx_cowextsize) > > > + return -EINVAL; > > > + } > > > + > > > return 0; > > > } > > > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > > > return err; > > > } > > > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > > > +{ > > > + struct path filepath; > > > + struct fsxattrat fsxat; > > > + struct fileattr fa; > > > + int error; > > > + > > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > > + return -EBADF; > > > + > > > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > > > + return -EFAULT; > > > + > > > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > > > + if (error) > > > + return error; > > > + > > > + error = vfs_fileattr_get(filepath.dentry, &fa); > > > + if (error) { > > > + path_put(&filepath); > > > + return error; > > > + } > > > + > > > + fsxat.fsx.fsx_xflags = fa.fsx_xflags; > > > + fsxat.fsx.fsx_extsize = fa.fsx_extsize; > > > + fsxat.fsx.fsx_nextents = fa.fsx_nextents; > > > + fsxat.fsx.fsx_projid = fa.fsx_projid; > > > + fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize; > > > + > > > + if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat))) > > > + error = -EFAULT; > > > + > > > + path_put(&filepath); > > > + return error; > > > +} > > > + > > > +static int ioctl_fssetxattrat(struct file *file, void __user *argp) > > > +{ > > > + struct mnt_idmap *idmap = file_mnt_idmap(file); > > > + struct fsxattrat fsxat; > > > + struct path filepath; > > > + struct fileattr fa; > > > + int error; > > > + > > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > > + return -EBADF; > > > + > > > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > > > + return -EFAULT; > > > + > > > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > > > + if (error) > > > + return error; > > > + > > > + error = mnt_want_write(filepath.mnt); > > > + if (error) { > > > + path_put(&filepath); > > > + return error; > > > > This could be a goto to the path_put below. > > (Unrelated to content but we should probably grow cleanup guards for > path_get()/path_put() and mnt_want_write()/mnt_drop_write() then stuff > like this becomes a no-brainer.) > > > > > > + } > > > + > > > + fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags); > > > + fa.fsx_extsize = fsxat.fsx.fsx_extsize; > > > + fa.fsx_nextents = fsxat.fsx.fsx_nextents; > > > + fa.fsx_projid = fsxat.fsx.fsx_projid; > > > + fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize; > > > + fa.fsx_valid = true; > > > + > > > + error = vfs_fileattr_set(idmap, filepath.dentry, &fa); > > > > Why not pass &fsxat.fsx directly to vfs_fileattr_set? > > > > > + mnt_drop_write(filepath.mnt); > > > + path_put(&filepath); > > > + return error; > > > +} > > > + > > > static int ioctl_getfsuuid(struct file *file, void __user *argp) > > > { > > > struct super_block *sb = file_inode(file)->i_sb; > > > @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > > > case FS_IOC_FSSETXATTR: > > > return ioctl_fssetxattr(filp, argp); > > > > > > + case FS_IOC_FSGETXATTRAT: > > > + return ioctl_fsgetxattrat(filp, argp); > > > + > > > + case FS_IOC_FSSETXATTRAT: > > > + return ioctl_fssetxattrat(filp, argp); > > > + > > > case FS_IOC_GETFSUUID: > > > return ioctl_getfsuuid(filp, argp); > > > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > > > index 45e4e64fd664..f8cd8d7bf35d 100644 > > > --- a/include/uapi/linux/fs.h > > > +++ b/include/uapi/linux/fs.h > > > @@ -139,6 +139,15 @@ struct fsxattr { > > > unsigned char fsx_pad[8]; > > > }; > > > > > > +/* > > > + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT > > > + */ > > > +struct fsxattrat { > > > + struct fsxattr fsx; /* XATTR to get/set */ > > > + __u32 dfd; /* parent dir */ > > > + const char __user *path; > > > > As I mentioned last time[1], embedding a pointer in an ioctl structure > > creates porting problems because pointer sizes vary between process > > personalities, so the size of struct fsxattrat will vary and lead to > > copy_to/from_user overflows. > > > > > > [1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/ > > So as you've mentioned in that thread using a u64 or here an aligned_u64 > makes the most sense. The kernel has all the necessary magic to deal > with this already (u64_to_user_ptr() helper etc.). If you want to be > extra-sure it's possible to slap a size for that path as well. > > The ioctl structure can be versioned by size if it's 64bit aligned. The > ioctl code encodes the size of the struct and since the current > structure is all nicely 64bit aligned it should just work (tm). > > kernel/seccomp.c does that size verisioning by ioctl as an example. Then > one can do: > > #define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) > > switch (EA_IOCTL(cmd)) { > case EA_IOCTL(FS_IOC_FSGETXATTRAT): > ret = copy_struct_from_user(&kstruct, sizeof(kstruct), usstruct, _IOC_SIZE(cmd)); > > Which will do all the heavy lifting for you. To me that seems > workable but I might miss other problems you're thinking of. Resending with fixed fsdevel address...
On Tue, May 21, 2024 at 04:19:40PM +0200, Christian Brauner wrote: > On Tue, May 21, 2024 at 04:06:15PM +0200, Christian Brauner wrote: > > On Mon, May 20, 2024 at 10:51:59AM -0700, Darrick J. Wong wrote: > > > On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote: > > > > XFS has project quotas which could be attached to a directory. All > > > > new inodes in these directories inherit project ID set on parent > > > > directory. > > > > > > > > The project is created from userspace by opening and calling > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > ID. Those inodes then are not shown in the quota accounting but > > > > still exist in the directory. > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > xfs_quota, to set project ID on special files by using parent > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > inodes and also reset it when project is removed. Also, as > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > forbid any other attributes except projid and nextents (symlink can > > > > have one). > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > --- > > > > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > > > > include/uapi/linux/fs.h | 11 +++++ > > > > 2 files changed, 104 insertions(+) > > > > > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > > > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > > > > --- a/fs/ioctl.c > > > > +++ b/fs/ioctl.c > > > > @@ -22,6 +22,7 @@ > > > > #include <linux/mount.h> > > > > #include <linux/fscrypt.h> > > > > #include <linux/fileattr.h> > > > > +#include <linux/namei.h> > > > > > > > > #include "internal.h" > > > > > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > > > > if (fa->fsx_cowextsize == 0) > > > > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > > > > > > > + /* > > > > + * The only use case for special files is to set project ID, forbid any > > > > + * other attributes > > > > + */ > > > > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > > > > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > > > > > > When would PROJINHERIT be set on a non-reg/non-dir file? > > > > > > > + return -EINVAL; > > > > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > > > > > > FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any > > > other type of file, does it? > > > > > > > + return -EINVAL; > > > > + if (fa->fsx_extsize || fa->fsx_cowextsize) > > > > + return -EINVAL; > > > > + } > > > > + > > > > return 0; > > > > } > > > > > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > > > > return err; > > > > } > > > > > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > > > > +{ > > > > + struct path filepath; > > > > + struct fsxattrat fsxat; > > > > + struct fileattr fa; > > > > + int error; > > > > + > > > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > > > + return -EBADF; > > > > + > > > > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > > > > + return -EFAULT; > > > > + > > > > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > > > > + if (error) > > > > + return error; > > > > + > > > > + error = vfs_fileattr_get(filepath.dentry, &fa); > > > > + if (error) { > > > > + path_put(&filepath); > > > > + return error; > > > > + } > > > > + > > > > + fsxat.fsx.fsx_xflags = fa.fsx_xflags; > > > > + fsxat.fsx.fsx_extsize = fa.fsx_extsize; > > > > + fsxat.fsx.fsx_nextents = fa.fsx_nextents; > > > > + fsxat.fsx.fsx_projid = fa.fsx_projid; > > > > + fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize; > > > > + > > > > + if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat))) > > > > + error = -EFAULT; > > > > + > > > > + path_put(&filepath); > > > > + return error; > > > > +} > > > > + > > > > +static int ioctl_fssetxattrat(struct file *file, void __user *argp) > > > > +{ > > > > + struct mnt_idmap *idmap = file_mnt_idmap(file); > > > > + struct fsxattrat fsxat; > > > > + struct path filepath; > > > > + struct fileattr fa; > > > > + int error; > > > > + > > > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > > > + return -EBADF; > > > > + > > > > + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) > > > > + return -EFAULT; > > > > + > > > > + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); > > > > + if (error) > > > > + return error; > > > > + > > > > + error = mnt_want_write(filepath.mnt); > > > > + if (error) { > > > > + path_put(&filepath); > > > > + return error; > > > > > > This could be a goto to the path_put below. > > > > (Unrelated to content but we should probably grow cleanup guards for > > path_get()/path_put() and mnt_want_write()/mnt_drop_write() then stuff > > like this becomes a no-brainer.) > > > > > > > > > + } > > > > + > > > > + fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags); > > > > + fa.fsx_extsize = fsxat.fsx.fsx_extsize; > > > > + fa.fsx_nextents = fsxat.fsx.fsx_nextents; > > > > + fa.fsx_projid = fsxat.fsx.fsx_projid; > > > > + fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize; > > > > + fa.fsx_valid = true; > > > > + > > > > + error = vfs_fileattr_set(idmap, filepath.dentry, &fa); > > > > > > Why not pass &fsxat.fsx directly to vfs_fileattr_set? > > > > > > > + mnt_drop_write(filepath.mnt); > > > > + path_put(&filepath); > > > > + return error; > > > > +} > > > > + > > > > static int ioctl_getfsuuid(struct file *file, void __user *argp) > > > > { > > > > struct super_block *sb = file_inode(file)->i_sb; > > > > @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > > > > case FS_IOC_FSSETXATTR: > > > > return ioctl_fssetxattr(filp, argp); > > > > > > > > + case FS_IOC_FSGETXATTRAT: > > > > + return ioctl_fsgetxattrat(filp, argp); > > > > + > > > > + case FS_IOC_FSSETXATTRAT: > > > > + return ioctl_fssetxattrat(filp, argp); > > > > + > > > > case FS_IOC_GETFSUUID: > > > > return ioctl_getfsuuid(filp, argp); > > > > > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > > > > index 45e4e64fd664..f8cd8d7bf35d 100644 > > > > --- a/include/uapi/linux/fs.h > > > > +++ b/include/uapi/linux/fs.h > > > > @@ -139,6 +139,15 @@ struct fsxattr { > > > > unsigned char fsx_pad[8]; > > > > }; > > > > > > > > +/* > > > > + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT > > > > + */ > > > > +struct fsxattrat { > > > > + struct fsxattr fsx; /* XATTR to get/set */ > > > > + __u32 dfd; /* parent dir */ > > > > + const char __user *path; > > > > > > As I mentioned last time[1], embedding a pointer in an ioctl structure > > > creates porting problems because pointer sizes vary between process > > > personalities, so the size of struct fsxattrat will vary and lead to > > > copy_to/from_user overflows. > > > > > > > > > [1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/ > > > > So as you've mentioned in that thread using a u64 or here an aligned_u64 > > makes the most sense. The kernel has all the necessary magic to deal > > with this already (u64_to_user_ptr() helper etc.). If you want to be > > extra-sure it's possible to slap a size for that path as well. Ooh I didn't know that existed, cleanup patch for xfs on its way! > > The ioctl structure can be versioned by size if it's 64bit aligned. The > > ioctl code encodes the size of the struct and since the current > > structure is all nicely 64bit aligned it should just work (tm). I've tried to merge variations-on-a-theme ioctls with the same @nr and different @size, but every time I've been asked not to do that. I've never understood why, since _IO[RW]* has worked that way on Linux forever, and BUILD_BUG_ON can be employed here to guard against collisions. The point is, I don't think xfs developers have adopted this particular habit and I'd love to know why. > > kernel/seccomp.c does that size verisioning by ioctl as an example. Then > > one can do: > > > > #define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) > > > > switch (EA_IOCTL(cmd)) { > > case EA_IOCTL(FS_IOC_FSGETXATTRAT): > > ret = copy_struct_from_user(&kstruct, sizeof(kstruct), usstruct, _IOC_SIZE(cmd)); > > > > Which will do all the heavy lifting for you. To me that seems > > workable but I might miss other problems you're thinking of. > > Resending with fixed fsdevel address... Yes, please fix that for v3 as well. --D
On 2024-05-20 22:03:43, Amir Goldstein wrote: > On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > XFS has project quotas which could be attached to a directory. All > > new inodes in these directories inherit project ID set on parent > > directory. > > > > The project is created from userspace by opening and calling > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > inode from VFS. Therefore, some inodes are left with empty project > > ID. Those inodes then are not shown in the quota accounting but > > still exist in the directory. > > > > This patch adds two new ioctls which allows userspace, such as > > xfs_quota, to set project ID on special files by using parent > > directory to open FS inode. This will let xfs_quota set ID on all > > inodes and also reset it when project is removed. Also, as > > vfs_fileattr_set() is now will called on special files too, let's > > forbid any other attributes except projid and nextents (symlink can > > have one). > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > --- > > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/fs.h | 11 +++++ > > 2 files changed, 104 insertions(+) > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > > --- a/fs/ioctl.c > > +++ b/fs/ioctl.c > > @@ -22,6 +22,7 @@ > > #include <linux/mount.h> > > #include <linux/fscrypt.h> > > #include <linux/fileattr.h> > > +#include <linux/namei.h> > > > > #include "internal.h" > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > > if (fa->fsx_cowextsize == 0) > > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > > > + /* > > + * The only use case for special files is to set project ID, forbid any > > + * other attributes > > + */ > > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > > + return -EINVAL; > > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > > + return -EINVAL; > > + if (fa->fsx_extsize || fa->fsx_cowextsize) > > + return -EINVAL; > > + } > > + > > return 0; > > } > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > > return err; > > } > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > > +{ > > + struct path filepath; > > + struct fsxattrat fsxat; > > + struct fileattr fa; > > + int error; > > + > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > + return -EBADF; > > So the *only* thing that is done with the fd of the ioctl is to verify > that it is a directory fd - there is no verification that this fd is on the > same sb as the path to act on. > > Was this the intention? It does not make a lot of sense to me > and AFAIK there is no precedent to an API like this. yeah, as we want to set xattrs on that inode the path is pointing to, so, VFS will call to the FS under that sb. > > There are ioctls that operate on the filesystem using any > fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH} > and maybe the closest example to what you are trying to add > XFS_IOC_BULKSTAT. Not sure that I get what you mean here, the *at() part is to get around VFS special inodes and call vfs_fileattr_set/get on FS inodes. > > Trying to think of a saner API for this - perhaps pass an O_PATH > fd without any filename in struct fsxattrat, saving you also the > headache of passing a variable length string in an ioctl. > > Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path > and file->f_path are on the same sb before proceeding to operate > on atfile->f_path.dentry. Thanks! Didn't know about O_PATH that seems to be a way to get rid of the path passing. > > The (maybe more sane) alternative is to add syscalls instead of > ioctls, but I won't force you to go there... > > What do everyone think? > > Thanks, > Amir. >
On Tue, May 21, 2024 at 7:34 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > On 2024-05-20 22:03:43, Amir Goldstein wrote: > > On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > > > XFS has project quotas which could be attached to a directory. All > > > new inodes in these directories inherit project ID set on parent > > > directory. > > > > > > The project is created from userspace by opening and calling > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > inode from VFS. Therefore, some inodes are left with empty project > > > ID. Those inodes then are not shown in the quota accounting but > > > still exist in the directory. > > > > > > This patch adds two new ioctls which allows userspace, such as > > > xfs_quota, to set project ID on special files by using parent > > > directory to open FS inode. This will let xfs_quota set ID on all > > > inodes and also reset it when project is removed. Also, as > > > vfs_fileattr_set() is now will called on special files too, let's > > > forbid any other attributes except projid and nextents (symlink can > > > have one). > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > --- > > > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > > > include/uapi/linux/fs.h | 11 +++++ > > > 2 files changed, 104 insertions(+) > > > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > > > --- a/fs/ioctl.c > > > +++ b/fs/ioctl.c > > > @@ -22,6 +22,7 @@ > > > #include <linux/mount.h> > > > #include <linux/fscrypt.h> > > > #include <linux/fileattr.h> > > > +#include <linux/namei.h> > > > > > > #include "internal.h" > > > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > > > if (fa->fsx_cowextsize == 0) > > > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > > > > > + /* > > > + * The only use case for special files is to set project ID, forbid any > > > + * other attributes > > > + */ > > > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > > > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > > > + return -EINVAL; > > > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > > > + return -EINVAL; > > > + if (fa->fsx_extsize || fa->fsx_cowextsize) > > > + return -EINVAL; > > > + } > > > + > > > return 0; > > > } > > > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > > > return err; > > > } > > > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > > > +{ > > > + struct path filepath; > > > + struct fsxattrat fsxat; > > > + struct fileattr fa; > > > + int error; > > > + > > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > > + return -EBADF; > > > > So the *only* thing that is done with the fd of the ioctl is to verify > > that it is a directory fd - there is no verification that this fd is on the > > same sb as the path to act on. > > > > Was this the intention? It does not make a lot of sense to me > > and AFAIK there is no precedent to an API like this. > > yeah, as we want to set xattrs on that inode the path is pointing > to, so, VFS will call to the FS under that sb. > > > > > There are ioctls that operate on the filesystem using any > > fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH} > > and maybe the closest example to what you are trying to add > > XFS_IOC_BULKSTAT. > > Not sure that I get what you mean here, the *at() part is to get > around VFS special inodes and call vfs_fileattr_set/get on FS inodes. > My point was that with your proposed API the fd argument to ioctl() can be a directory from a completely arbitrary filesystem with nothing to do with the filesystem where fsxat.dfd is from and that makes very little sense from API POV. > > > > Trying to think of a saner API for this - perhaps pass an O_PATH > > fd without any filename in struct fsxattrat, saving you also the > > headache of passing a variable length string in an ioctl. > > > > Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path > > and file->f_path are on the same sb before proceeding to operate > > on atfile->f_path.dentry. > > Thanks! Didn't know about O_PATH that seems to be a way to get rid > of the path passing. > I found one precedent of this pattern with XFS_IOC_FD_TO_HANDLE, but keep in mind that this is quite an old legacy XFS API. This ioctl is performed on an "fshandle", which is an open fd to any object in the filesystem, but typically the mount root dir. The ioctl gets a structure xfs_fsop_handlereq_t which contains an fd member pointing to another object within an XFS filesystem. Actually, AFAICS, this code does not verify that the object and fshandle are on the same XFS filesystem, but only that both are on XFS filesystems: /* * We can only generate handles for inodes residing on a XFS filesystem, * and only for regular files, directories or symbolic links. */ error = -EINVAL; if (inode->i_sb->s_magic != XFS_SB_MAGIC) goto out_put; I don't know what's the best thing to do is, but I think that verifying that the ioctl fd and the O_PATH fd are on the same sb is the least controversial option for the first version - if needed that could be relaxed later on. Another alternative which is simpler from API POV would be to allow selective ioctl() commands on an O_PATH fd, but I think that is going to be more controversial. Something along those lines (completely untested): diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 45e4e64fd664..562f8bff91d2 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -241,6 +241,13 @@ struct fsxattr { */ #define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) +#define _IOC_AT (0x100) +#define FS_IOC_AT(nr) (_IOC_TYPE(nr) == _IOC_AT) + +/* The following ioctls can be operated on an O_PATH fd */ +#define FS_IOC_FSGETXATTRAT _IOR(_IOC_AT, 31, struct fsxattr) +#define FS_IOC_FSSETXATTRAT _IOW(_IOC_AT, 32, struct fsxattr) + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) * diff --git a/fs/ioctl.c b/fs/ioctl.c index 1d5abfdf0f22..f720500c705b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -867,9 +867,11 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, return ioctl_setflags(filp, argp); case FS_IOC_FSGETXATTR: + case FS_IOC_FSGETXATTRAT: return ioctl_fsgetxattr(filp, argp); case FS_IOC_FSSETXATTR: + case FS_IOC_FSSETXATTRAT: return ioctl_fssetxattr(filp, argp); case FS_IOC_GETFSUUID: @@ -879,7 +881,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, return ioctl_get_fs_sysfs_path(filp, argp); default: - if (S_ISREG(inode->i_mode)) + if (!FS_IOC_AT(cmd) && S_ISREG(inode->i_mode)) return file_ioctl(filp, cmd, argp); break; } @@ -889,7 +891,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct fd f = fdget(fd); + bool ioc_at = FS_IOC_AT(cmd); + struct fd f = ioc_at ? fdget_raw(fd) : fdget(fd); int error; if (!f.file) @@ -900,7 +903,7 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) goto out; error = do_vfs_ioctl(f.file, fd, cmd, arg); - if (error == -ENOIOCTLCMD) + if (!ioc_at && error == -ENOIOCTLCMD) error = vfs_ioctl(f.file, cmd, arg); --- Thanks, Amir.
Hello! On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > XFS has project quotas which could be attached to a directory. All > new inodes in these directories inherit project ID set on parent > directory. > > The project is created from userspace by opening and calling > FS_IOC_FSSETXATTR on each inode. This is not possible for special > files such as FIFO, SOCK, BLK etc. as opening them returns a special > inode from VFS. Therefore, some inodes are left with empty project > ID. Those inodes then are not shown in the quota accounting but > still exist in the directory. > > This patch adds two new ioctls which allows userspace, such as > xfs_quota, to set project ID on special files by using parent > directory to open FS inode. This will let xfs_quota set ID on all > inodes and also reset it when project is removed. Also, as > vfs_fileattr_set() is now will called on special files too, let's > forbid any other attributes except projid and nextents (symlink can > have one). > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> I'd like to understand one thing. Is it practically useful to set project IDs for special inodes? There is no significant disk space usage associated with them so wrt quotas we are speaking only about the inode itself. So is the concern that user could escape inode project quota accounting and perform some DoS? Or why do we bother with two new somewhat hairy ioctls for something that seems as a small corner case to me? Honza
Hi, On 2024-05-22 12:00:07, Jan Kara wrote: > Hello! > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > XFS has project quotas which could be attached to a directory. All > > new inodes in these directories inherit project ID set on parent > > directory. > > > > The project is created from userspace by opening and calling > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > inode from VFS. Therefore, some inodes are left with empty project > > ID. Those inodes then are not shown in the quota accounting but > > still exist in the directory. > > > > This patch adds two new ioctls which allows userspace, such as > > xfs_quota, to set project ID on special files by using parent > > directory to open FS inode. This will let xfs_quota set ID on all > > inodes and also reset it when project is removed. Also, as > > vfs_fileattr_set() is now will called on special files too, let's > > forbid any other attributes except projid and nextents (symlink can > > have one). > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > I'd like to understand one thing. Is it practically useful to set project > IDs for special inodes? There is no significant disk space usage associated > with them so wrt quotas we are speaking only about the inode itself. So is > the concern that user could escape inode project quota accounting and > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > for something that seems as a small corner case to me? So there's few things: - Quota accounting is missing only some special files. Special files created after quota project is setup inherit ID from the project directory. - For special files created after the project is setup there's no way to make them project-less. Therefore, creating a new project over those will fail due to project ID miss match. - It wasn't possible to hardlink/rename project-less special files inside a project due to ID miss match. The linking is fixed, and renaming is worked around in first patch. The initial report I got was about second and last point, an application was failing to create a new project after "restart" and wasn't able to link special files created beforehand.
On 2024-05-21 21:22:41, Amir Goldstein wrote: > On Tue, May 21, 2024 at 7:34 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > On 2024-05-20 22:03:43, Amir Goldstein wrote: > > > On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > > > > > XFS has project quotas which could be attached to a directory. All > > > > new inodes in these directories inherit project ID set on parent > > > > directory. > > > > > > > > The project is created from userspace by opening and calling > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > ID. Those inodes then are not shown in the quota accounting but > > > > still exist in the directory. > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > xfs_quota, to set project ID on special files by using parent > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > inodes and also reset it when project is removed. Also, as > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > forbid any other attributes except projid and nextents (symlink can > > > > have one). > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > --- > > > > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > > > > include/uapi/linux/fs.h | 11 +++++ > > > > 2 files changed, 104 insertions(+) > > > > > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > > > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > > > > --- a/fs/ioctl.c > > > > +++ b/fs/ioctl.c > > > > @@ -22,6 +22,7 @@ > > > > #include <linux/mount.h> > > > > #include <linux/fscrypt.h> > > > > #include <linux/fileattr.h> > > > > +#include <linux/namei.h> > > > > > > > > #include "internal.h" > > > > > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > > > > if (fa->fsx_cowextsize == 0) > > > > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > > > > > > > + /* > > > > + * The only use case for special files is to set project ID, forbid any > > > > + * other attributes > > > > + */ > > > > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > > > > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > > > > + return -EINVAL; > > > > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > > > > + return -EINVAL; > > > > + if (fa->fsx_extsize || fa->fsx_cowextsize) > > > > + return -EINVAL; > > > > + } > > > > + > > > > return 0; > > > > } > > > > > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > > > > return err; > > > > } > > > > > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > > > > +{ > > > > + struct path filepath; > > > > + struct fsxattrat fsxat; > > > > + struct fileattr fa; > > > > + int error; > > > > + > > > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > > > + return -EBADF; > > > > > > So the *only* thing that is done with the fd of the ioctl is to verify > > > that it is a directory fd - there is no verification that this fd is on the > > > same sb as the path to act on. > > > > > > Was this the intention? It does not make a lot of sense to me > > > and AFAIK there is no precedent to an API like this. > > > > yeah, as we want to set xattrs on that inode the path is pointing > > to, so, VFS will call to the FS under that sb. > > > > > > > > There are ioctls that operate on the filesystem using any > > > fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH} > > > and maybe the closest example to what you are trying to add > > > XFS_IOC_BULKSTAT. > > > > Not sure that I get what you mean here, the *at() part is to get > > around VFS special inodes and call vfs_fileattr_set/get on FS inodes. > > > > My point was that with your proposed API the fd argument to > ioctl() can be a directory from a completely arbitrary filesystem > with nothing to do with the filesystem where fsxat.dfd is from > and that makes very little sense from API POV. I see, yeah, I can add a check for this, for xfs_quota usecase it's fine as it doesn't follow cross mounts. > > > > > > > Trying to think of a saner API for this - perhaps pass an O_PATH > > > fd without any filename in struct fsxattrat, saving you also the > > > headache of passing a variable length string in an ioctl. > > > > > > Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path > > > and file->f_path are on the same sb before proceeding to operate > > > on atfile->f_path.dentry. > > > > Thanks! Didn't know about O_PATH that seems to be a way to get rid > > of the path passing. > > > > I found one precedent of this pattern with XFS_IOC_FD_TO_HANDLE, > but keep in mind that this is quite an old legacy XFS API. > > This ioctl is performed on an "fshandle", which is an open fd to any > object in the filesystem, but typically the mount root dir. > The ioctl gets a structure xfs_fsop_handlereq_t which contains an > fd member pointing to another object within an XFS filesystem. > > Actually, AFAICS, this code does not verify that the object and fshandle > are on the same XFS filesystem, but only that both are on XFS filesystems: > > /* > * We can only generate handles for inodes residing on a XFS filesystem, > * and only for regular files, directories or symbolic links. > */ > error = -EINVAL; > if (inode->i_sb->s_magic != XFS_SB_MAGIC) > goto out_put; > > I don't know what's the best thing to do is, but I think that verifying > that the ioctl fd and the O_PATH fd are on the same sb is the least > controversial option for the first version - if needed that could be > relaxed later on. > > Another alternative which is simpler from API POV would be to allow > selective ioctl() commands on an O_PATH fd, but I think that is going > to be more controversial. > > Something along those lines (completely untested): > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > index 45e4e64fd664..562f8bff91d2 100644 > --- a/include/uapi/linux/fs.h > +++ b/include/uapi/linux/fs.h > @@ -241,6 +241,13 @@ struct fsxattr { > */ > #define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) > > +#define _IOC_AT (0x100) > +#define FS_IOC_AT(nr) (_IOC_TYPE(nr) == _IOC_AT) > + > +/* The following ioctls can be operated on an O_PATH fd */ > +#define FS_IOC_FSGETXATTRAT _IOR(_IOC_AT, 31, struct fsxattr) > +#define FS_IOC_FSSETXATTRAT _IOW(_IOC_AT, 32, struct fsxattr) > + > /* > * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) > * > diff --git a/fs/ioctl.c b/fs/ioctl.c > index 1d5abfdf0f22..f720500c705b 100644 > --- a/fs/ioctl.c > +++ b/fs/ioctl.c > @@ -867,9 +867,11 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > return ioctl_setflags(filp, argp); > > case FS_IOC_FSGETXATTR: > + case FS_IOC_FSGETXATTRAT: > return ioctl_fsgetxattr(filp, argp); > > case FS_IOC_FSSETXATTR: > + case FS_IOC_FSSETXATTRAT: > return ioctl_fssetxattr(filp, argp); > > case FS_IOC_GETFSUUID: > @@ -879,7 +881,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > return ioctl_get_fs_sysfs_path(filp, argp); > > default: > - if (S_ISREG(inode->i_mode)) > + if (!FS_IOC_AT(cmd) && S_ISREG(inode->i_mode)) > return file_ioctl(filp, cmd, argp); > break; > } > @@ -889,7 +891,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > > SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) > { > - struct fd f = fdget(fd); > + bool ioc_at = FS_IOC_AT(cmd); > + struct fd f = ioc_at ? fdget_raw(fd) : fdget(fd); > int error; > > if (!f.file) > @@ -900,7 +903,7 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned > int, cmd, unsigned long, arg) > goto out; > > error = do_vfs_ioctl(f.file, fd, cmd, arg); > - if (error == -ENOIOCTLCMD) > + if (!ioc_at && error == -ENOIOCTLCMD) > error = vfs_ioctl(f.file, cmd, arg); > > --- > > Thanks, > Amir. >
On Wed, May 22, 2024 at 04:58:31PM +0200, Andrey Albershteyn wrote: > On 2024-05-21 21:22:41, Amir Goldstein wrote: > > On Tue, May 21, 2024 at 7:34 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > > > On 2024-05-20 22:03:43, Amir Goldstein wrote: > > > > On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > > > > > > > XFS has project quotas which could be attached to a directory. All > > > > > new inodes in these directories inherit project ID set on parent > > > > > directory. > > > > > > > > > > The project is created from userspace by opening and calling > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > > ID. Those inodes then are not shown in the quota accounting but > > > > > still exist in the directory. > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > > xfs_quota, to set project ID on special files by using parent > > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > > inodes and also reset it when project is removed. Also, as > > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > > forbid any other attributes except projid and nextents (symlink can > > > > > have one). > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > --- > > > > > fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ > > > > > include/uapi/linux/fs.h | 11 +++++ > > > > > 2 files changed, 104 insertions(+) > > > > > > > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > > > > index 1d5abfdf0f22..3e3aacb6ea6e 100644 > > > > > --- a/fs/ioctl.c > > > > > +++ b/fs/ioctl.c > > > > > @@ -22,6 +22,7 @@ > > > > > #include <linux/mount.h> > > > > > #include <linux/fscrypt.h> > > > > > #include <linux/fileattr.h> > > > > > +#include <linux/namei.h> > > > > > > > > > > #include "internal.h" > > > > > > > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, > > > > > if (fa->fsx_cowextsize == 0) > > > > > fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; > > > > > > > > > > + /* > > > > > + * The only use case for special files is to set project ID, forbid any > > > > > + * other attributes > > > > > + */ > > > > > + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { > > > > > + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) > > > > > + return -EINVAL; > > > > > + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) > > > > > + return -EINVAL; > > > > > + if (fa->fsx_extsize || fa->fsx_cowextsize) > > > > > + return -EINVAL; > > > > > + } > > > > > + > > > > > return 0; > > > > > } > > > > > > > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) > > > > > return err; > > > > > } > > > > > > > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) > > > > > +{ > > > > > + struct path filepath; > > > > > + struct fsxattrat fsxat; > > > > > + struct fileattr fa; > > > > > + int error; > > > > > + > > > > > + if (!S_ISDIR(file_inode(file)->i_mode)) > > > > > + return -EBADF; > > > > > > > > So the *only* thing that is done with the fd of the ioctl is to verify > > > > that it is a directory fd - there is no verification that this fd is on the > > > > same sb as the path to act on. > > > > > > > > Was this the intention? It does not make a lot of sense to me > > > > and AFAIK there is no precedent to an API like this. > > > > > > yeah, as we want to set xattrs on that inode the path is pointing > > > to, so, VFS will call to the FS under that sb. > > > > > > > > > > > There are ioctls that operate on the filesystem using any > > > > fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH} > > > > and maybe the closest example to what you are trying to add > > > > XFS_IOC_BULKSTAT. > > > > > > Not sure that I get what you mean here, the *at() part is to get > > > around VFS special inodes and call vfs_fileattr_set/get on FS inodes. > > > > > > > My point was that with your proposed API the fd argument to > > ioctl() can be a directory from a completely arbitrary filesystem > > with nothing to do with the filesystem where fsxat.dfd is from > > and that makes very little sense from API POV. > > I see, yeah, I can add a check for this, for xfs_quota usecase it's > fine as it doesn't follow cross mounts. Do the other *at() syscalls prohibit dfd + path pointing to a different filesystem? It seems odd to have this restriction that the rest don't, but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok. --D > > > > > > > > > > Trying to think of a saner API for this - perhaps pass an O_PATH > > > > fd without any filename in struct fsxattrat, saving you also the > > > > headache of passing a variable length string in an ioctl. > > > > > > > > Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path > > > > and file->f_path are on the same sb before proceeding to operate > > > > on atfile->f_path.dentry. > > > > > > Thanks! Didn't know about O_PATH that seems to be a way to get rid > > > of the path passing. > > > > > > > I found one precedent of this pattern with XFS_IOC_FD_TO_HANDLE, > > but keep in mind that this is quite an old legacy XFS API. > > > > This ioctl is performed on an "fshandle", which is an open fd to any > > object in the filesystem, but typically the mount root dir. > > The ioctl gets a structure xfs_fsop_handlereq_t which contains an > > fd member pointing to another object within an XFS filesystem. > > > > Actually, AFAICS, this code does not verify that the object and fshandle > > are on the same XFS filesystem, but only that both are on XFS filesystems: > > > > /* > > * We can only generate handles for inodes residing on a XFS filesystem, > > * and only for regular files, directories or symbolic links. > > */ > > error = -EINVAL; > > if (inode->i_sb->s_magic != XFS_SB_MAGIC) > > goto out_put; > > > > I don't know what's the best thing to do is, but I think that verifying > > that the ioctl fd and the O_PATH fd are on the same sb is the least > > controversial option for the first version - if needed that could be > > relaxed later on. > > > > Another alternative which is simpler from API POV would be to allow > > selective ioctl() commands on an O_PATH fd, but I think that is going > > to be more controversial. > > > > Something along those lines (completely untested): > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > > index 45e4e64fd664..562f8bff91d2 100644 > > --- a/include/uapi/linux/fs.h > > +++ b/include/uapi/linux/fs.h > > @@ -241,6 +241,13 @@ struct fsxattr { > > */ > > #define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) > > > > +#define _IOC_AT (0x100) > > +#define FS_IOC_AT(nr) (_IOC_TYPE(nr) == _IOC_AT) > > + > > +/* The following ioctls can be operated on an O_PATH fd */ > > +#define FS_IOC_FSGETXATTRAT _IOR(_IOC_AT, 31, struct fsxattr) > > +#define FS_IOC_FSSETXATTRAT _IOW(_IOC_AT, 32, struct fsxattr) > > + > > /* > > * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) > > * > > diff --git a/fs/ioctl.c b/fs/ioctl.c > > index 1d5abfdf0f22..f720500c705b 100644 > > --- a/fs/ioctl.c > > +++ b/fs/ioctl.c > > @@ -867,9 +867,11 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > > return ioctl_setflags(filp, argp); > > > > case FS_IOC_FSGETXATTR: > > + case FS_IOC_FSGETXATTRAT: > > return ioctl_fsgetxattr(filp, argp); > > > > case FS_IOC_FSSETXATTR: > > + case FS_IOC_FSSETXATTRAT: > > return ioctl_fssetxattr(filp, argp); > > > > case FS_IOC_GETFSUUID: > > @@ -879,7 +881,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > > return ioctl_get_fs_sysfs_path(filp, argp); > > > > default: > > - if (S_ISREG(inode->i_mode)) > > + if (!FS_IOC_AT(cmd) && S_ISREG(inode->i_mode)) > > return file_ioctl(filp, cmd, argp); > > break; > > } > > @@ -889,7 +891,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, > > > > SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) > > { > > - struct fd f = fdget(fd); > > + bool ioc_at = FS_IOC_AT(cmd); > > + struct fd f = ioc_at ? fdget_raw(fd) : fdget(fd); > > int error; > > > > if (!f.file) > > @@ -900,7 +903,7 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned > > int, cmd, unsigned long, arg) > > goto out; > > > > error = do_vfs_ioctl(f.file, fd, cmd, arg); > > - if (error == -ENOIOCTLCMD) > > + if (!ioc_at && error == -ENOIOCTLCMD) > > error = vfs_ioctl(f.file, cmd, arg); > > > > --- > > > > Thanks, > > Amir. > > > > -- > - Andrey > >
On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote: > > Do the other *at() syscalls prohibit dfd + path pointing to a different > filesystem? It seems odd to have this restriction that the rest don't, > but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok. No, but they are arbitrary syscalls so they can do that. ioctls traditionally operate on the specific filesystem of the fd. It feels like these should be syscalls, not ioctls. - Eric
On 2024-05-22 09:38:56, Eric Biggers wrote: > On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote: > > > > Do the other *at() syscalls prohibit dfd + path pointing to a different > > filesystem? It seems odd to have this restriction that the rest don't, > > but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok. > > No, but they are arbitrary syscalls so they can do that. ioctls traditionally > operate on the specific filesystem of the fd. > > It feels like these should be syscalls, not ioctls. Won't it also be a bit weird to have FS_IOC_FS[S|G]ETXATTR as ioctls for normal files and a syscall for special files? When both are doing the same thing > > - Eric >
On Wed, May 22, 2024 at 07:23:15PM +0200, Andrey Albershteyn wrote: > On 2024-05-22 09:38:56, Eric Biggers wrote: > > On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote: > > > > > > Do the other *at() syscalls prohibit dfd + path pointing to a different > > > filesystem? It seems odd to have this restriction that the rest don't, > > > but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok. > > > > No, but they are arbitrary syscalls so they can do that. ioctls traditionally > > operate on the specific filesystem of the fd. > > > > It feels like these should be syscalls, not ioctls. > > Won't it also be a bit weird to have FS_IOC_FS[S|G]ETXATTR as > ioctls for normal files and a syscall for special files? When both > are doing the same thing > Why would the syscalls be restricted to operating on special files? - Eric
On Wed, May 22, 2024 at 7:38 PM Eric Biggers <ebiggers@kernel.org> wrote: > > On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote: > > > > Do the other *at() syscalls prohibit dfd + path pointing to a different > > filesystem? It seems odd to have this restriction that the rest don't, > > but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok. > > No, but they are arbitrary syscalls so they can do that. ioctls traditionally > operate on the specific filesystem of the fd. To emphasize the absurdity think opening /dev/random and doing ioctl to set projid on some xfs file. It is ridiculous. > > It feels like these should be syscalls, not ioctls. > I bet whatever name you choose for syscalls it is going to be too close lexicographically to [gs]etxattrat(2) [1]. It is really crowded in the area of getattr/getfattr/fgetxattr/getxattr/getfileattr/getfsxattr... I think I would vote for [gs]etfsxattrat(2) following the uapi struct fsxattr. I guess we have officially spiralled. Thanks, Amir. [1] https://lore.kernel.org/linux-fsdevel/20240426162042.191916-1-cgoettsche@seltendoof.de/
Hi! On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > On 2024-05-22 12:00:07, Jan Kara wrote: > > Hello! > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > XFS has project quotas which could be attached to a directory. All > > > new inodes in these directories inherit project ID set on parent > > > directory. > > > > > > The project is created from userspace by opening and calling > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > inode from VFS. Therefore, some inodes are left with empty project > > > ID. Those inodes then are not shown in the quota accounting but > > > still exist in the directory. > > > > > > This patch adds two new ioctls which allows userspace, such as > > > xfs_quota, to set project ID on special files by using parent > > > directory to open FS inode. This will let xfs_quota set ID on all > > > inodes and also reset it when project is removed. Also, as > > > vfs_fileattr_set() is now will called on special files too, let's > > > forbid any other attributes except projid and nextents (symlink can > > > have one). > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > I'd like to understand one thing. Is it practically useful to set project > > IDs for special inodes? There is no significant disk space usage associated > > with them so wrt quotas we are speaking only about the inode itself. So is > > the concern that user could escape inode project quota accounting and > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > for something that seems as a small corner case to me? > > So there's few things: > - Quota accounting is missing only some special files. Special files > created after quota project is setup inherit ID from the project > directory. > - For special files created after the project is setup there's no > way to make them project-less. Therefore, creating a new project > over those will fail due to project ID miss match. > - It wasn't possible to hardlink/rename project-less special files > inside a project due to ID miss match. The linking is fixed, and > renaming is worked around in first patch. > > The initial report I got was about second and last point, an > application was failing to create a new project after "restart" and > wasn't able to link special files created beforehand. I see. OK, but wouldn't it then be an easier fix to make sure we *never* inherit project id for special inodes? And make sure inodes with unset project ID don't fail to be linked, renamed, etc... Honza
On 2024-05-23 09:48:28, Jan Kara wrote: > Hi! > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > > On 2024-05-22 12:00:07, Jan Kara wrote: > > > Hello! > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > > XFS has project quotas which could be attached to a directory. All > > > > new inodes in these directories inherit project ID set on parent > > > > directory. > > > > > > > > The project is created from userspace by opening and calling > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > ID. Those inodes then are not shown in the quota accounting but > > > > still exist in the directory. > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > xfs_quota, to set project ID on special files by using parent > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > inodes and also reset it when project is removed. Also, as > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > forbid any other attributes except projid and nextents (symlink can > > > > have one). > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > > I'd like to understand one thing. Is it practically useful to set project > > > IDs for special inodes? There is no significant disk space usage associated > > > with them so wrt quotas we are speaking only about the inode itself. So is > > > the concern that user could escape inode project quota accounting and > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > > for something that seems as a small corner case to me? > > > > So there's few things: > > - Quota accounting is missing only some special files. Special files > > created after quota project is setup inherit ID from the project > > directory. > > - For special files created after the project is setup there's no > > way to make them project-less. Therefore, creating a new project > > over those will fail due to project ID miss match. > > - It wasn't possible to hardlink/rename project-less special files > > inside a project due to ID miss match. The linking is fixed, and > > renaming is worked around in first patch. > > > > The initial report I got was about second and last point, an > > application was failing to create a new project after "restart" and > > wasn't able to link special files created beforehand. > > I see. OK, but wouldn't it then be an easier fix to make sure we *never* > inherit project id for special inodes? And make sure inodes with unset > project ID don't fail to be linked, renamed, etc... But then, in set up project, you can cross-link between projects and escape quota this way. During linking/renaming if source inode has ID but target one doesn't, we won't be able to tell that this link is within the project.
On 2024-05-22 22:03:55, Amir Goldstein wrote: > On Wed, May 22, 2024 at 7:38 PM Eric Biggers <ebiggers@kernel.org> wrote: > > > > On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote: > > > > > > Do the other *at() syscalls prohibit dfd + path pointing to a different > > > filesystem? It seems odd to have this restriction that the rest don't, > > > but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok. > > > > No, but they are arbitrary syscalls so they can do that. ioctls traditionally > > operate on the specific filesystem of the fd. > > To emphasize the absurdity > think opening /dev/random and doing ioctl to set projid on some xfs file. > It is ridiculous. > > > > > It feels like these should be syscalls, not ioctls. > > > > I bet whatever name you choose for syscalls it is going to be too > close lexicographically to [gs]etxattrat(2) [1]. It is really crowded > in the area of getattr/getfattr/fgetxattr/getxattr/getfileattr/getfsxattr... > I think I would vote for [gs]etfsxattrat(2) following the uapi struct fsxattr. > I guess we have officially spiralled. > > Thanks, > Amir. > > [1] https://lore.kernel.org/linux-fsdevel/20240426162042.191916-1-cgoettsche@seltendoof.de/ > Thanks for the link, will convert to syscalls in next version.
On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote: > On 2024-05-23 09:48:28, Jan Kara wrote: > > Hi! > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > > > On 2024-05-22 12:00:07, Jan Kara wrote: > > > > Hello! > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > > > XFS has project quotas which could be attached to a directory. All > > > > > new inodes in these directories inherit project ID set on parent > > > > > directory. > > > > > > > > > > The project is created from userspace by opening and calling > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > > ID. Those inodes then are not shown in the quota accounting but > > > > > still exist in the directory. > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > > xfs_quota, to set project ID on special files by using parent > > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > > inodes and also reset it when project is removed. Also, as > > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > > forbid any other attributes except projid and nextents (symlink can > > > > > have one). > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > > > > I'd like to understand one thing. Is it practically useful to set project > > > > IDs for special inodes? There is no significant disk space usage associated > > > > with them so wrt quotas we are speaking only about the inode itself. So is > > > > the concern that user could escape inode project quota accounting and > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > > > for something that seems as a small corner case to me? > > > > > > So there's few things: > > > - Quota accounting is missing only some special files. Special files > > > created after quota project is setup inherit ID from the project > > > directory. > > > - For special files created after the project is setup there's no > > > way to make them project-less. Therefore, creating a new project > > > over those will fail due to project ID miss match. > > > - It wasn't possible to hardlink/rename project-less special files > > > inside a project due to ID miss match. The linking is fixed, and > > > renaming is worked around in first patch. > > > > > > The initial report I got was about second and last point, an > > > application was failing to create a new project after "restart" and > > > wasn't able to link special files created beforehand. > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never* > > inherit project id for special inodes? And make sure inodes with unset > > project ID don't fail to be linked, renamed, etc... > > But then, in set up project, you can cross-link between projects and > escape quota this way. During linking/renaming if source inode has > ID but target one doesn't, we won't be able to tell that this link > is within the project. Well, I didn't want to charge these special inodes to project quota at all so "escaping quota" was pretty much what I suggested to do. But my point was that since the only thing that's really charged for these inodes is the inodes itself then does this small inaccuracy really matter in practice? Are we afraid the user is going to fill the filesystem with symlinks? Honza
On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote: > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote: > > On 2024-05-23 09:48:28, Jan Kara wrote: > > > Hi! > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > > > > On 2024-05-22 12:00:07, Jan Kara wrote: > > > > > Hello! > > > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > > > > XFS has project quotas which could be attached to a directory. All > > > > > > new inodes in these directories inherit project ID set on parent > > > > > > directory. > > > > > > > > > > > > The project is created from userspace by opening and calling > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > > > ID. Those inodes then are not shown in the quota accounting but > > > > > > still exist in the directory. > > > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > > > xfs_quota, to set project ID on special files by using parent > > > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > > > inodes and also reset it when project is removed. Also, as > > > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > > > forbid any other attributes except projid and nextents (symlink can > > > > > > have one). > > > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > > > > > > I'd like to understand one thing. Is it practically useful to set project > > > > > IDs for special inodes? There is no significant disk space usage associated > > > > > with them so wrt quotas we are speaking only about the inode itself. So is > > > > > the concern that user could escape inode project quota accounting and > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > > > > for something that seems as a small corner case to me? > > > > > > > > So there's few things: > > > > - Quota accounting is missing only some special files. Special files > > > > created after quota project is setup inherit ID from the project > > > > directory. > > > > - For special files created after the project is setup there's no > > > > way to make them project-less. Therefore, creating a new project > > > > over those will fail due to project ID miss match. > > > > - It wasn't possible to hardlink/rename project-less special files > > > > inside a project due to ID miss match. The linking is fixed, and > > > > renaming is worked around in first patch. > > > > > > > > The initial report I got was about second and last point, an > > > > application was failing to create a new project after "restart" and > > > > wasn't able to link special files created beforehand. > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never* > > > inherit project id for special inodes? And make sure inodes with unset > > > project ID don't fail to be linked, renamed, etc... > > > > But then, in set up project, you can cross-link between projects and > > escape quota this way. During linking/renaming if source inode has > > ID but target one doesn't, we won't be able to tell that this link > > is within the project. > > Well, I didn't want to charge these special inodes to project quota at all > so "escaping quota" was pretty much what I suggested to do. But my point > was that since the only thing that's really charged for these inodes is the > inodes itself then does this small inaccuracy really matter in practice? > Are we afraid the user is going to fill the filesystem with symlinks? I thought the worry here is that you can't fully reassign the project id for a directory tree unless you have an *at() version of the ioctl to handle the special files that you can't open directly? So you start with a directory tree that's (say) 2% symlinks and project id 5. Later you want to set project id 7 on that subtree, but after the incomplete change, projid 7 is charged for 98% of the tree, and 2% are still stuck on projid 5. This is a mess, and if enforcement is enabled you've just broken it in a way that can't be fixed aside from recreating those files. --D > Honza > -- > Jan Kara <jack@suse.com> > SUSE Labs, CR >
On Fri 31-05-24 07:52:04, Darrick J. Wong wrote: > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote: > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote: > > > On 2024-05-23 09:48:28, Jan Kara wrote: > > > > Hi! > > > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > > > > > On 2024-05-22 12:00:07, Jan Kara wrote: > > > > > > Hello! > > > > > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > > > > > XFS has project quotas which could be attached to a directory. All > > > > > > > new inodes in these directories inherit project ID set on parent > > > > > > > directory. > > > > > > > > > > > > > > The project is created from userspace by opening and calling > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > > > > ID. Those inodes then are not shown in the quota accounting but > > > > > > > still exist in the directory. > > > > > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > > > > xfs_quota, to set project ID on special files by using parent > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > > > > inodes and also reset it when project is removed. Also, as > > > > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > > > > forbid any other attributes except projid and nextents (symlink can > > > > > > > have one). > > > > > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > > > > > > > > I'd like to understand one thing. Is it practically useful to set project > > > > > > IDs for special inodes? There is no significant disk space usage associated > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is > > > > > > the concern that user could escape inode project quota accounting and > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > > > > > for something that seems as a small corner case to me? > > > > > > > > > > So there's few things: > > > > > - Quota accounting is missing only some special files. Special files > > > > > created after quota project is setup inherit ID from the project > > > > > directory. > > > > > - For special files created after the project is setup there's no > > > > > way to make them project-less. Therefore, creating a new project > > > > > over those will fail due to project ID miss match. > > > > > - It wasn't possible to hardlink/rename project-less special files > > > > > inside a project due to ID miss match. The linking is fixed, and > > > > > renaming is worked around in first patch. > > > > > > > > > > The initial report I got was about second and last point, an > > > > > application was failing to create a new project after "restart" and > > > > > wasn't able to link special files created beforehand. > > > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never* > > > > inherit project id for special inodes? And make sure inodes with unset > > > > project ID don't fail to be linked, renamed, etc... > > > > > > But then, in set up project, you can cross-link between projects and > > > escape quota this way. During linking/renaming if source inode has > > > ID but target one doesn't, we won't be able to tell that this link > > > is within the project. > > > > Well, I didn't want to charge these special inodes to project quota at all > > so "escaping quota" was pretty much what I suggested to do. But my point > > was that since the only thing that's really charged for these inodes is the > > inodes itself then does this small inaccuracy really matter in practice? > > Are we afraid the user is going to fill the filesystem with symlinks? > > I thought the worry here is that you can't fully reassign the project > id for a directory tree unless you have an *at() version of the ioctl > to handle the special files that you can't open directly? > > So you start with a directory tree that's (say) 2% symlinks and project > id 5. Later you want to set project id 7 on that subtree, but after the > incomplete change, projid 7 is charged for 98% of the tree, and 2% are > still stuck on projid 5. This is a mess, and if enforcement is enabled > you've just broken it in a way that can't be fixed aside from recreating > those files. So the idea I'm trying to propose (and apparently I'm failing to explain it properly) is: When creating special inode, set i_projid = 0 regardless of directory settings. When creating hardlink or doing rename, if i_projid of dentry is 0, we allow the operation. Teach fsck to set i_projid to 0 when inode is special. As a result, AFAICT no problem with hardlinks, renames or similar. No need for special new ioctl or syscall. The downside is special inodes escape project quota accounting. Do we care? Honza
On 2024-06-03 12:42:59, Jan Kara wrote: > On Fri 31-05-24 07:52:04, Darrick J. Wong wrote: > > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote: > > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote: > > > > On 2024-05-23 09:48:28, Jan Kara wrote: > > > > > Hi! > > > > > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > > > > > > On 2024-05-22 12:00:07, Jan Kara wrote: > > > > > > > Hello! > > > > > > > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > > > > > > XFS has project quotas which could be attached to a directory. All > > > > > > > > new inodes in these directories inherit project ID set on parent > > > > > > > > directory. > > > > > > > > > > > > > > > > The project is created from userspace by opening and calling > > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > > > > > ID. Those inodes then are not shown in the quota accounting but > > > > > > > > still exist in the directory. > > > > > > > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > > > > > xfs_quota, to set project ID on special files by using parent > > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > > > > > inodes and also reset it when project is removed. Also, as > > > > > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > > > > > forbid any other attributes except projid and nextents (symlink can > > > > > > > > have one). > > > > > > > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > > > > > > > > > > I'd like to understand one thing. Is it practically useful to set project > > > > > > > IDs for special inodes? There is no significant disk space usage associated > > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is > > > > > > > the concern that user could escape inode project quota accounting and > > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > > > > > > for something that seems as a small corner case to me? > > > > > > > > > > > > So there's few things: > > > > > > - Quota accounting is missing only some special files. Special files > > > > > > created after quota project is setup inherit ID from the project > > > > > > directory. > > > > > > - For special files created after the project is setup there's no > > > > > > way to make them project-less. Therefore, creating a new project > > > > > > over those will fail due to project ID miss match. > > > > > > - It wasn't possible to hardlink/rename project-less special files > > > > > > inside a project due to ID miss match. The linking is fixed, and > > > > > > renaming is worked around in first patch. > > > > > > > > > > > > The initial report I got was about second and last point, an > > > > > > application was failing to create a new project after "restart" and > > > > > > wasn't able to link special files created beforehand. > > > > > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never* > > > > > inherit project id for special inodes? And make sure inodes with unset > > > > > project ID don't fail to be linked, renamed, etc... > > > > > > > > But then, in set up project, you can cross-link between projects and > > > > escape quota this way. During linking/renaming if source inode has > > > > ID but target one doesn't, we won't be able to tell that this link > > > > is within the project. > > > > > > Well, I didn't want to charge these special inodes to project quota at all > > > so "escaping quota" was pretty much what I suggested to do. But my point > > > was that since the only thing that's really charged for these inodes is the > > > inodes itself then does this small inaccuracy really matter in practice? > > > Are we afraid the user is going to fill the filesystem with symlinks? > > > > I thought the worry here is that you can't fully reassign the project > > id for a directory tree unless you have an *at() version of the ioctl > > to handle the special files that you can't open directly? > > > > So you start with a directory tree that's (say) 2% symlinks and project > > id 5. Later you want to set project id 7 on that subtree, but after the > > incomplete change, projid 7 is charged for 98% of the tree, and 2% are > > still stuck on projid 5. This is a mess, and if enforcement is enabled > > you've just broken it in a way that can't be fixed aside from recreating > > those files. > > So the idea I'm trying to propose (and apparently I'm failing to explain it > properly) is: > > When creating special inode, set i_projid = 0 regardless of directory > settings. > > When creating hardlink or doing rename, if i_projid of dentry is 0, we > allow the operation. > > Teach fsck to set i_projid to 0 when inode is special. > > As a result, AFAICT no problem with hardlinks, renames or similar. No need > for special new ioctl or syscall. The downside is special inodes escape > project quota accounting. Do we care? I see. But is it fine to allow fill filesystem with special inodes? Don't know if it can be used somehow but this is exception from isoft/ihard limits then. I don't see issues with this approach also, if others don't have other points or other uses for those new syscalls, I can go with this approach.
On Mon, Jun 03, 2024 at 06:28:47PM +0200, Andrey Albershteyn wrote: > On 2024-06-03 12:42:59, Jan Kara wrote: > > On Fri 31-05-24 07:52:04, Darrick J. Wong wrote: > > > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote: > > > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote: > > > > > On 2024-05-23 09:48:28, Jan Kara wrote: > > > > > > Hi! > > > > > > > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > > > > > > > On 2024-05-22 12:00:07, Jan Kara wrote: > > > > > > > > Hello! > > > > > > > > > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > > > > > > > XFS has project quotas which could be attached to a directory. All > > > > > > > > > new inodes in these directories inherit project ID set on parent > > > > > > > > > directory. > > > > > > > > > > > > > > > > > > The project is created from userspace by opening and calling > > > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > > > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > > > > > > ID. Those inodes then are not shown in the quota accounting but > > > > > > > > > still exist in the directory. > > > > > > > > > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > > > > > > xfs_quota, to set project ID on special files by using parent > > > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > > > > > > inodes and also reset it when project is removed. Also, as > > > > > > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > > > > > > forbid any other attributes except projid and nextents (symlink can > > > > > > > > > have one). > > > > > > > > > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > > > > > > > > > > > > I'd like to understand one thing. Is it practically useful to set project > > > > > > > > IDs for special inodes? There is no significant disk space usage associated > > > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is > > > > > > > > the concern that user could escape inode project quota accounting and > > > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > > > > > > > for something that seems as a small corner case to me? > > > > > > > > > > > > > > So there's few things: > > > > > > > - Quota accounting is missing only some special files. Special files > > > > > > > created after quota project is setup inherit ID from the project > > > > > > > directory. > > > > > > > - For special files created after the project is setup there's no > > > > > > > way to make them project-less. Therefore, creating a new project > > > > > > > over those will fail due to project ID miss match. > > > > > > > - It wasn't possible to hardlink/rename project-less special files > > > > > > > inside a project due to ID miss match. The linking is fixed, and > > > > > > > renaming is worked around in first patch. > > > > > > > > > > > > > > The initial report I got was about second and last point, an > > > > > > > application was failing to create a new project after "restart" and > > > > > > > wasn't able to link special files created beforehand. > > > > > > > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never* > > > > > > inherit project id for special inodes? And make sure inodes with unset > > > > > > project ID don't fail to be linked, renamed, etc... > > > > > > > > > > But then, in set up project, you can cross-link between projects and > > > > > escape quota this way. During linking/renaming if source inode has > > > > > ID but target one doesn't, we won't be able to tell that this link > > > > > is within the project. > > > > > > > > Well, I didn't want to charge these special inodes to project quota at all > > > > so "escaping quota" was pretty much what I suggested to do. But my point > > > > was that since the only thing that's really charged for these inodes is the > > > > inodes itself then does this small inaccuracy really matter in practice? > > > > Are we afraid the user is going to fill the filesystem with symlinks? > > > > > > I thought the worry here is that you can't fully reassign the project > > > id for a directory tree unless you have an *at() version of the ioctl > > > to handle the special files that you can't open directly? > > > > > > So you start with a directory tree that's (say) 2% symlinks and project > > > id 5. Later you want to set project id 7 on that subtree, but after the > > > incomplete change, projid 7 is charged for 98% of the tree, and 2% are > > > still stuck on projid 5. This is a mess, and if enforcement is enabled > > > you've just broken it in a way that can't be fixed aside from recreating > > > those files. > > > > So the idea I'm trying to propose (and apparently I'm failing to explain it > > properly) is: > > > > When creating special inode, set i_projid = 0 regardless of directory > > settings. > > > > When creating hardlink or doing rename, if i_projid of dentry is 0, we > > allow the operation. > > > > Teach fsck to set i_projid to 0 when inode is special. > > > > As a result, AFAICT no problem with hardlinks, renames or similar. No need > > for special new ioctl or syscall. The downside is special inodes escape > > project quota accounting. Do we care? > > I see. But is it fine to allow fill filesystem with special inodes? > Don't know if it can be used somehow but this is exception from > isoft/ihard limits then. > > I don't see issues with this approach also, if others don't have > other points or other uses for those new syscalls, I can go with > this approach. I do -- allowing unpriviledged users to create symlinks that consume icount (and possibly bcount) in the root project breaks the entire enforcement mechanism. That's not the way that project quota has worked on xfs and it would be quite rude to nullify the PROJINHERIT flag bit only for these special cases. --D > -- > - Andrey > >
On Mon 03-06-24 10:42:59, Darrick J. Wong wrote: > On Mon, Jun 03, 2024 at 06:28:47PM +0200, Andrey Albershteyn wrote: > > On 2024-06-03 12:42:59, Jan Kara wrote: > > > On Fri 31-05-24 07:52:04, Darrick J. Wong wrote: > > > > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote: > > > > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote: > > > > > > On 2024-05-23 09:48:28, Jan Kara wrote: > > > > > > > Hi! > > > > > > > > > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > > > > > > > > On 2024-05-22 12:00:07, Jan Kara wrote: > > > > > > > > > Hello! > > > > > > > > > > > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > > > > > > > > XFS has project quotas which could be attached to a directory. All > > > > > > > > > > new inodes in these directories inherit project ID set on parent > > > > > > > > > > directory. > > > > > > > > > > > > > > > > > > > > The project is created from userspace by opening and calling > > > > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > > > > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > > > > > > > ID. Those inodes then are not shown in the quota accounting but > > > > > > > > > > still exist in the directory. > > > > > > > > > > > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > > > > > > > xfs_quota, to set project ID on special files by using parent > > > > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > > > > > > > inodes and also reset it when project is removed. Also, as > > > > > > > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > > > > > > > forbid any other attributes except projid and nextents (symlink can > > > > > > > > > > have one). > > > > > > > > > > > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > > > > > > > > > > > > > > I'd like to understand one thing. Is it practically useful to set project > > > > > > > > > IDs for special inodes? There is no significant disk space usage associated > > > > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is > > > > > > > > > the concern that user could escape inode project quota accounting and > > > > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > > > > > > > > for something that seems as a small corner case to me? > > > > > > > > > > > > > > > > So there's few things: > > > > > > > > - Quota accounting is missing only some special files. Special files > > > > > > > > created after quota project is setup inherit ID from the project > > > > > > > > directory. > > > > > > > > - For special files created after the project is setup there's no > > > > > > > > way to make them project-less. Therefore, creating a new project > > > > > > > > over those will fail due to project ID miss match. > > > > > > > > - It wasn't possible to hardlink/rename project-less special files > > > > > > > > inside a project due to ID miss match. The linking is fixed, and > > > > > > > > renaming is worked around in first patch. > > > > > > > > > > > > > > > > The initial report I got was about second and last point, an > > > > > > > > application was failing to create a new project after "restart" and > > > > > > > > wasn't able to link special files created beforehand. > > > > > > > > > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never* > > > > > > > inherit project id for special inodes? And make sure inodes with unset > > > > > > > project ID don't fail to be linked, renamed, etc... > > > > > > > > > > > > But then, in set up project, you can cross-link between projects and > > > > > > escape quota this way. During linking/renaming if source inode has > > > > > > ID but target one doesn't, we won't be able to tell that this link > > > > > > is within the project. > > > > > > > > > > Well, I didn't want to charge these special inodes to project quota at all > > > > > so "escaping quota" was pretty much what I suggested to do. But my point > > > > > was that since the only thing that's really charged for these inodes is the > > > > > inodes itself then does this small inaccuracy really matter in practice? > > > > > Are we afraid the user is going to fill the filesystem with symlinks? > > > > > > > > I thought the worry here is that you can't fully reassign the project > > > > id for a directory tree unless you have an *at() version of the ioctl > > > > to handle the special files that you can't open directly? > > > > > > > > So you start with a directory tree that's (say) 2% symlinks and project > > > > id 5. Later you want to set project id 7 on that subtree, but after the > > > > incomplete change, projid 7 is charged for 98% of the tree, and 2% are > > > > still stuck on projid 5. This is a mess, and if enforcement is enabled > > > > you've just broken it in a way that can't be fixed aside from recreating > > > > those files. > > > > > > So the idea I'm trying to propose (and apparently I'm failing to explain it > > > properly) is: > > > > > > When creating special inode, set i_projid = 0 regardless of directory > > > settings. > > > > > > When creating hardlink or doing rename, if i_projid of dentry is 0, we > > > allow the operation. > > > > > > Teach fsck to set i_projid to 0 when inode is special. > > > > > > As a result, AFAICT no problem with hardlinks, renames or similar. No need > > > for special new ioctl or syscall. The downside is special inodes escape > > > project quota accounting. Do we care? > > > > I see. But is it fine to allow fill filesystem with special inodes? > > Don't know if it can be used somehow but this is exception from > > isoft/ihard limits then. > > > > I don't see issues with this approach also, if others don't have > > other points or other uses for those new syscalls, I can go with > > this approach. > > I do -- allowing unpriviledged users to create symlinks that consume > icount (and possibly bcount) in the root project breaks the entire > enforcement mechanism. That's not the way that project quota has worked > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit > only for these special cases. OK, fair enough. I though someone will hate this. I'd just like to understand one thing: Owner of the inode can change the project ID to 0 anyway so project quotas are more like a cooperative space tracking scheme anyway. If you want to escape it, you can. So what are you exactly worried about? Is it the container usecase where from within the user namespace you cannot change project IDs? Anyway I just wanted to have an explicit decision that the simple solution is not good enough before we go the more complex route ;). Honza
On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote: > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote: > > On Mon, Jun 03, 2024 at 06:28:47PM +0200, Andrey Albershteyn wrote: > > > On 2024-06-03 12:42:59, Jan Kara wrote: > > > > On Fri 31-05-24 07:52:04, Darrick J. Wong wrote: > > > > > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote: > > > > > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote: > > > > > > > On 2024-05-23 09:48:28, Jan Kara wrote: > > > > > > > > Hi! > > > > > > > > > > > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > > > > > > > > > On 2024-05-22 12:00:07, Jan Kara wrote: > > > > > > > > > > Hello! > > > > > > > > > > > > > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > > > > > > > > > XFS has project quotas which could be attached to a directory. All > > > > > > > > > > > new inodes in these directories inherit project ID set on parent > > > > > > > > > > > directory. > > > > > > > > > > > > > > > > > > > > > > The project is created from userspace by opening and calling > > > > > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > > > > > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > > > > > > > > ID. Those inodes then are not shown in the quota accounting but > > > > > > > > > > > still exist in the directory. > > > > > > > > > > > > > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > > > > > > > > xfs_quota, to set project ID on special files by using parent > > > > > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > > > > > > > > inodes and also reset it when project is removed. Also, as > > > > > > > > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > > > > > > > > forbid any other attributes except projid and nextents (symlink can > > > > > > > > > > > have one). > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > > > > > > > > > > > > > > > > I'd like to understand one thing. Is it practically useful to set project > > > > > > > > > > IDs for special inodes? There is no significant disk space usage associated > > > > > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is > > > > > > > > > > the concern that user could escape inode project quota accounting and > > > > > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > > > > > > > > > for something that seems as a small corner case to me? > > > > > > > > > > > > > > > > > > So there's few things: > > > > > > > > > - Quota accounting is missing only some special files. Special files > > > > > > > > > created after quota project is setup inherit ID from the project > > > > > > > > > directory. > > > > > > > > > - For special files created after the project is setup there's no > > > > > > > > > way to make them project-less. Therefore, creating a new project > > > > > > > > > over those will fail due to project ID miss match. > > > > > > > > > - It wasn't possible to hardlink/rename project-less special files > > > > > > > > > inside a project due to ID miss match. The linking is fixed, and > > > > > > > > > renaming is worked around in first patch. > > > > > > > > > > > > > > > > > > The initial report I got was about second and last point, an > > > > > > > > > application was failing to create a new project after "restart" and > > > > > > > > > wasn't able to link special files created beforehand. > > > > > > > > > > > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never* > > > > > > > > inherit project id for special inodes? And make sure inodes with unset > > > > > > > > project ID don't fail to be linked, renamed, etc... > > > > > > > > > > > > > > But then, in set up project, you can cross-link between projects and > > > > > > > escape quota this way. During linking/renaming if source inode has > > > > > > > ID but target one doesn't, we won't be able to tell that this link > > > > > > > is within the project. > > > > > > > > > > > > Well, I didn't want to charge these special inodes to project quota at all > > > > > > so "escaping quota" was pretty much what I suggested to do. But my point > > > > > > was that since the only thing that's really charged for these inodes is the > > > > > > inodes itself then does this small inaccuracy really matter in practice? > > > > > > Are we afraid the user is going to fill the filesystem with symlinks? > > > > > > > > > > I thought the worry here is that you can't fully reassign the project > > > > > id for a directory tree unless you have an *at() version of the ioctl > > > > > to handle the special files that you can't open directly? > > > > > > > > > > So you start with a directory tree that's (say) 2% symlinks and project > > > > > id 5. Later you want to set project id 7 on that subtree, but after the > > > > > incomplete change, projid 7 is charged for 98% of the tree, and 2% are > > > > > still stuck on projid 5. This is a mess, and if enforcement is enabled > > > > > you've just broken it in a way that can't be fixed aside from recreating > > > > > those files. > > > > > > > > So the idea I'm trying to propose (and apparently I'm failing to explain it > > > > properly) is: > > > > > > > > When creating special inode, set i_projid = 0 regardless of directory > > > > settings. > > > > > > > > When creating hardlink or doing rename, if i_projid of dentry is 0, we > > > > allow the operation. > > > > > > > > Teach fsck to set i_projid to 0 when inode is special. > > > > > > > > As a result, AFAICT no problem with hardlinks, renames or similar. No need > > > > for special new ioctl or syscall. The downside is special inodes escape > > > > project quota accounting. Do we care? > > > > > > I see. But is it fine to allow fill filesystem with special inodes? > > > Don't know if it can be used somehow but this is exception from > > > isoft/ihard limits then. > > > > > > I don't see issues with this approach also, if others don't have > > > other points or other uses for those new syscalls, I can go with > > > this approach. > > > > I do -- allowing unpriviledged users to create symlinks that consume > > icount (and possibly bcount) in the root project breaks the entire > > enforcement mechanism. That's not the way that project quota has worked > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit > > only for these special cases. > > OK, fair enough. I though someone will hate this. I'd just like to > understand one thing: Owner of the inode can change the project ID to 0 > anyway so project quotas are more like a cooperative space tracking scheme > anyway. If you want to escape it, you can. So what are you exactly worried > about? Is it the container usecase where from within the user namespace you > cannot change project IDs? Yep. > Anyway I just wanted to have an explicit decision that the simple solution > is not good enough before we go the more complex route ;). Also, every now and then someone comes along and half-proposes making it so that non-root cannot change project ids anymore. Maybe some day that will succeed. --D > Honza > -- > Jan Kara <jack@suse.com> > SUSE Labs, CR >
On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote: > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote: > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote: > > > On Mon, Jun 03, 2024 at 06:28:47PM +0200, Andrey Albershteyn wrote: > > > > On 2024-06-03 12:42:59, Jan Kara wrote: > > > > > On Fri 31-05-24 07:52:04, Darrick J. Wong wrote: > > > > > > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote: > > > > > > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote: > > > > > > > > On 2024-05-23 09:48:28, Jan Kara wrote: > > > > > > > > > Hi! > > > > > > > > > > > > > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote: > > > > > > > > > > On 2024-05-22 12:00:07, Jan Kara wrote: > > > > > > > > > > > Hello! > > > > > > > > > > > > > > > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote: > > > > > > > > > > > > XFS has project quotas which could be attached to a directory. All > > > > > > > > > > > > new inodes in these directories inherit project ID set on parent > > > > > > > > > > > > directory. > > > > > > > > > > > > > > > > > > > > > > > > The project is created from userspace by opening and calling > > > > > > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special > > > > > > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special > > > > > > > > > > > > inode from VFS. Therefore, some inodes are left with empty project > > > > > > > > > > > > ID. Those inodes then are not shown in the quota accounting but > > > > > > > > > > > > still exist in the directory. > > > > > > > > > > > > > > > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as > > > > > > > > > > > > xfs_quota, to set project ID on special files by using parent > > > > > > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all > > > > > > > > > > > > inodes and also reset it when project is removed. Also, as > > > > > > > > > > > > vfs_fileattr_set() is now will called on special files too, let's > > > > > > > > > > > > forbid any other attributes except projid and nextents (symlink can > > > > > > > > > > > > have one). > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> > > > > > > > > > > > > > > > > > > > > > > I'd like to understand one thing. Is it practically useful to set project > > > > > > > > > > > IDs for special inodes? There is no significant disk space usage associated > > > > > > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is > > > > > > > > > > > the concern that user could escape inode project quota accounting and > > > > > > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls > > > > > > > > > > > for something that seems as a small corner case to me? > > > > > > > > > > > > > > > > > > > > So there's few things: > > > > > > > > > > - Quota accounting is missing only some special files. Special files > > > > > > > > > > created after quota project is setup inherit ID from the project > > > > > > > > > > directory. > > > > > > > > > > - For special files created after the project is setup there's no > > > > > > > > > > way to make them project-less. Therefore, creating a new project > > > > > > > > > > over those will fail due to project ID miss match. > > > > > > > > > > - It wasn't possible to hardlink/rename project-less special files > > > > > > > > > > inside a project due to ID miss match. The linking is fixed, and > > > > > > > > > > renaming is worked around in first patch. > > > > > > > > > > > > > > > > > > > > The initial report I got was about second and last point, an > > > > > > > > > > application was failing to create a new project after "restart" and > > > > > > > > > > wasn't able to link special files created beforehand. > > > > > > > > > > > > > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never* > > > > > > > > > inherit project id for special inodes? And make sure inodes with unset > > > > > > > > > project ID don't fail to be linked, renamed, etc... > > > > > > > > > > > > > > > > But then, in set up project, you can cross-link between projects and > > > > > > > > escape quota this way. During linking/renaming if source inode has > > > > > > > > ID but target one doesn't, we won't be able to tell that this link > > > > > > > > is within the project. > > > > > > > > > > > > > > Well, I didn't want to charge these special inodes to project quota at all > > > > > > > so "escaping quota" was pretty much what I suggested to do. But my point > > > > > > > was that since the only thing that's really charged for these inodes is the > > > > > > > inodes itself then does this small inaccuracy really matter in practice? > > > > > > > Are we afraid the user is going to fill the filesystem with symlinks? > > > > > > > > > > > > I thought the worry here is that you can't fully reassign the project > > > > > > id for a directory tree unless you have an *at() version of the ioctl > > > > > > to handle the special files that you can't open directly? > > > > > > > > > > > > So you start with a directory tree that's (say) 2% symlinks and project > > > > > > id 5. Later you want to set project id 7 on that subtree, but after the > > > > > > incomplete change, projid 7 is charged for 98% of the tree, and 2% are > > > > > > still stuck on projid 5. This is a mess, and if enforcement is enabled > > > > > > you've just broken it in a way that can't be fixed aside from recreating > > > > > > those files. > > > > > > > > > > So the idea I'm trying to propose (and apparently I'm failing to explain it > > > > > properly) is: > > > > > > > > > > When creating special inode, set i_projid = 0 regardless of directory > > > > > settings. > > > > > > > > > > When creating hardlink or doing rename, if i_projid of dentry is 0, we > > > > > allow the operation. > > > > > > > > > > Teach fsck to set i_projid to 0 when inode is special. > > > > > > > > > > As a result, AFAICT no problem with hardlinks, renames or similar. No need > > > > > for special new ioctl or syscall. The downside is special inodes escape > > > > > project quota accounting. Do we care? > > > > > > > > I see. But is it fine to allow fill filesystem with special inodes? > > > > Don't know if it can be used somehow but this is exception from > > > > isoft/ihard limits then. > > > > > > > > I don't see issues with this approach also, if others don't have > > > > other points or other uses for those new syscalls, I can go with > > > > this approach. > > > > > > I do -- allowing unpriviledged users to create symlinks that consume > > > icount (and possibly bcount) in the root project breaks the entire > > > enforcement mechanism. That's not the way that project quota has worked > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit > > > only for these special cases. > > > > OK, fair enough. I though someone will hate this. I'd just like to > > understand one thing: Owner of the inode can change the project ID to 0 > > anyway so project quotas are more like a cooperative space tracking scheme > > anyway. If you want to escape it, you can. So what are you exactly worried > > about? Is it the container usecase where from within the user namespace you > > cannot change project IDs? > > Yep. > > > Anyway I just wanted to have an explicit decision that the simple solution > > is not good enough before we go the more complex route ;). > > Also, every now and then someone comes along and half-proposes making it > so that non-root cannot change project ids anymore. Maybe some day that > will succeed. > I'd just like to point out that the purpose of the project quotas feature as I understand it, is to apply quotas to subtrees, where container storage is a very common private case of project subtree. The purpose is NOT to create a "project" of random files in random paths. My point is that changing the project id of a non-dir child to be different from the project id of its parent is a pretty rare use case (I think?). If changing the projid of non-dir is needed for moving it to a different subtree, we could allow renameat2(2) of non-dir with no hardlinks to implicitly change its inherited project id or explicitly with a flag for a hardlink, e.g.: renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID). Which leaves us only with the use cases of: 1. Share some inode (as hardlinks) among projects 2. Recursively changing a subtree projid #1 could be allowed with a flag to linkat(2) or hardlink within a shared project subtree and rename the hardlink out of the shared project subtree with renameat2(2) and explicit flag, e.g.: renameat2(olddirfd, name, newdirfd, name, RENAME_OLD_PROJID). #2 *could technically* be done by the same rename flag that will allow rename to the same path, e.g: renameat2(dirfd, name, dirfd, name, RENAME_NEW_PROJID). It's not pretty and I personally prefer the syscall solution. Just wanted to put it out there. Thanks, Amir.
On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote: > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote: > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote: > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote: > > > > I do -- allowing unpriviledged users to create symlinks that consume > > > > icount (and possibly bcount) in the root project breaks the entire > > > > enforcement mechanism. That's not the way that project quota has worked > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit > > > > only for these special cases. > > > > > > OK, fair enough. I though someone will hate this. I'd just like to > > > understand one thing: Owner of the inode can change the project ID to 0 > > > anyway so project quotas are more like a cooperative space tracking scheme > > > anyway. If you want to escape it, you can. So what are you exactly worried > > > about? Is it the container usecase where from within the user namespace you > > > cannot change project IDs? > > > > Yep. > > > > > Anyway I just wanted to have an explicit decision that the simple solution > > > is not good enough before we go the more complex route ;). > > > > Also, every now and then someone comes along and half-proposes making it > > so that non-root cannot change project ids anymore. Maybe some day that > > will succeed. > > > > I'd just like to point out that the purpose of the project quotas feature > as I understand it, is to apply quotas to subtrees, where container storage > is a very common private case of project subtree. That is the most modern use case, yes. [ And for a walk down history lane.... ] > The purpose is NOT to create a "project" of random files in random > paths. This is *exactly* the original use case that project quotas were designed for back on Irix in the early 1990s and is the original behaviour project quotas brought to Linux. Project quota inheritance didn't come along until 2005: commit 65f1866a3a8e512d43795c116bfef262e703b789 Author: Nathan Scott <nathans@sgi.com> Date: Fri Jun 3 06:04:22 2005 +0000 Add support for project quota inheritance, a merge of Glens changes. Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd. And full support for directory tree quotas using project IDs wasn't fully introduced until a year later in 2006: commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6 Author: Nathan Scott <nathans@sgi.com> Date: Tue May 30 15:54:53 2006 +0000 statvfs component of directory/project quota support, code originally by Glen. Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd. These changes were largely done for an SGI NAS product that allowed us to create one great big XFS filesystem and then create arbitrarily sized, thin provisoned "NFS volumes" as directory quota controlled subdirs instantenously. The directory tree quota defined the size of the volume, and so we could also grow and shrink them instantenously, too. And we could remove them instantenously via background garbage collection after the export was removed and the user had been told it had been destroyed. So that was the original use case for directory tree quotas on XFS - providing scalable, fast management of "thin" storage for a NAS product. Projects quotas had been used for accounting random colections of files for over a decade before this directory quota construct was created, and the "modern" container use cases for directory quotas didn't come along until almost a decade after this capability was added. > My point is that changing the project id of a non-dir child to be different > from the project id of its parent is a pretty rare use case (I think?). Not if you are using project quotas as they were originally intended to be used. > If changing the projid of non-dir is needed for moving it to a > different subtree, > we could allow renameat2(2) of non-dir with no hardlinks to implicitly > change its > inherited project id or explicitly with a flag for a hardlink, e.g.: > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID). Why? The only reason XFS returns -EXDEV to rename across project IDs is because nobody wanted to spend the time to work out how to do the quota accounting of the metadata changed in the rename operation accurately. So for that rare case (not something that would happen on the NAS product) we returned -EXDEV to trigger the mv command to copy the file to the destination and then unlink the source instead, thereby handling all the quota accounting correctly. IOWs, this whole "-EXDEV on rename across parent project quota boundaries" is an implementation detail and nothing more. Filesystems that implement project quotas and the directory tree sub-variant don't need to behave like this if they can accurately account for the quota ID changes during an atomic rename operation. If that's too hard, then the fallback is to return -EXDEV and let userspace do it the slow way which will always acocunt the resource usage correctly to the individual projects. Hence I think we should just fix the XFS kernel behaviour to do the right thing in this special file case rather than return -EXDEV and then forget about the rest of it. Sure, update xfs_repair to fix the special file project id issue if it trips over it, but other than that I don't think we need anything more. If fixing it requires new syscalls and tools, then that's much harder to backport to old kernels and distros than just backporting a couple of small XFS kernel patches... -Dave.
On Thu, Jun 06, 2024 at 12:27:38PM +1000, Dave Chinner wrote: > On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote: > > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote: > > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote: > > > > > I do -- allowing unpriviledged users to create symlinks that consume > > > > > icount (and possibly bcount) in the root project breaks the entire > > > > > enforcement mechanism. That's not the way that project quota has worked > > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit > > > > > only for these special cases. > > > > > > > > OK, fair enough. I though someone will hate this. I'd just like to > > > > understand one thing: Owner of the inode can change the project ID to 0 > > > > anyway so project quotas are more like a cooperative space tracking scheme > > > > anyway. If you want to escape it, you can. So what are you exactly worried > > > > about? Is it the container usecase where from within the user namespace you > > > > cannot change project IDs? > > > > > > Yep. > > > > > > > Anyway I just wanted to have an explicit decision that the simple solution > > > > is not good enough before we go the more complex route ;). > > > > > > Also, every now and then someone comes along and half-proposes making it > > > so that non-root cannot change project ids anymore. Maybe some day that > > > will succeed. > > > > > > > I'd just like to point out that the purpose of the project quotas feature > > as I understand it, is to apply quotas to subtrees, where container storage > > is a very common private case of project subtree. > > That is the most modern use case, yes. > > [ And for a walk down history lane.... ] Could you take all your institutional knowledge and paste it into a Documentation/ file that we can use as a starting point for what exactly does the project quota design do, please? I'm betting the other two filesystems that implement it (ext4/f2fs) don't quite implement the same behaviors. --D > > The purpose is NOT to create a "project" of random files in random > > paths. > > This is *exactly* the original use case that project quotas were > designed for back on Irix in the early 1990s and is the original > behaviour project quotas brought to Linux. > > Project quota inheritance didn't come along until 2005: > > commit 65f1866a3a8e512d43795c116bfef262e703b789 > Author: Nathan Scott <nathans@sgi.com> > Date: Fri Jun 3 06:04:22 2005 +0000 > > Add support for project quota inheritance, a merge of Glens changes. > Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd. > > And full support for directory tree quotas using project IDs wasn't > fully introduced until a year later in 2006: > > commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6 > Author: Nathan Scott <nathans@sgi.com> > Date: Tue May 30 15:54:53 2006 +0000 > > statvfs component of directory/project quota support, code originally by Glen. > Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd. > > These changes were largely done for an SGI NAS product that allowed > us to create one great big XFS filesystem and then create > arbitrarily sized, thin provisoned "NFS volumes" as directory > quota controlled subdirs instantenously. The directory tree quota > defined the size of the volume, and so we could also grow and shrink > them instantenously, too. And we could remove them instantenously > via background garbage collection after the export was removed and > the user had been told it had been destroyed. > > So that was the original use case for directory tree quotas on XFS - > providing scalable, fast management of "thin" storage for a NAS > product. Projects quotas had been used for accounting random > colections of files for over a decade before this directory quota > construct was created, and the "modern" container use cases for > directory quotas didn't come along until almost a decade after this > capability was added. > > > My point is that changing the project id of a non-dir child to be different > > from the project id of its parent is a pretty rare use case (I think?). > > Not if you are using project quotas as they were originally intended > to be used. > > > If changing the projid of non-dir is needed for moving it to a > > different subtree, > > we could allow renameat2(2) of non-dir with no hardlinks to implicitly > > change its > > inherited project id or explicitly with a flag for a hardlink, e.g.: > > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID). > > Why? > > The only reason XFS returns -EXDEV to rename across project IDs is > because nobody wanted to spend the time to work out how to do the > quota accounting of the metadata changed in the rename operation > accurately. So for that rare case (not something that would happen > on the NAS product) we returned -EXDEV to trigger the mv command to > copy the file to the destination and then unlink the source instead, > thereby handling all the quota accounting correctly. > > IOWs, this whole "-EXDEV on rename across parent project quota > boundaries" is an implementation detail and nothing more. > Filesystems that implement project quotas and the directory tree > sub-variant don't need to behave like this if they can accurately > account for the quota ID changes during an atomic rename operation. > If that's too hard, then the fallback is to return -EXDEV and let > userspace do it the slow way which will always acocunt the resource > usage correctly to the individual projects. > > Hence I think we should just fix the XFS kernel behaviour to do the > right thing in this special file case rather than return -EXDEV and > then forget about the rest of it. Sure, update xfs_repair to fix the > special file project id issue if it trips over it, but other than > that I don't think we need anything more. If fixing it requires new > syscalls and tools, then that's much harder to backport to old > kernels and distros than just backporting a couple of small XFS > kernel patches... > > -Dave. > -- > Dave Chinner > david@fromorbit.com >
On Thu, Jun 6, 2024 at 5:27 AM Dave Chinner <david@fromorbit.com> wrote: > > On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote: > > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote: > > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote: > > > > > I do -- allowing unpriviledged users to create symlinks that consume > > > > > icount (and possibly bcount) in the root project breaks the entire > > > > > enforcement mechanism. That's not the way that project quota has worked > > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit > > > > > only for these special cases. > > > > > > > > OK, fair enough. I though someone will hate this. I'd just like to > > > > understand one thing: Owner of the inode can change the project ID to 0 > > > > anyway so project quotas are more like a cooperative space tracking scheme > > > > anyway. If you want to escape it, you can. So what are you exactly worried > > > > about? Is it the container usecase where from within the user namespace you > > > > cannot change project IDs? > > > > > > Yep. > > > > > > > Anyway I just wanted to have an explicit decision that the simple solution > > > > is not good enough before we go the more complex route ;). > > > > > > Also, every now and then someone comes along and half-proposes making it > > > so that non-root cannot change project ids anymore. Maybe some day that > > > will succeed. > > > > > > > I'd just like to point out that the purpose of the project quotas feature > > as I understand it, is to apply quotas to subtrees, where container storage > > is a very common private case of project subtree. > > That is the most modern use case, yes. > > [ And for a walk down history lane.... ] > > > The purpose is NOT to create a "project" of random files in random > > paths. > > This is *exactly* the original use case that project quotas were > designed for back on Irix in the early 1990s and is the original > behaviour project quotas brought to Linux. > > Project quota inheritance didn't come along until 2005: > > commit 65f1866a3a8e512d43795c116bfef262e703b789 > Author: Nathan Scott <nathans@sgi.com> > Date: Fri Jun 3 06:04:22 2005 +0000 > > Add support for project quota inheritance, a merge of Glens changes. > Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd. > > And full support for directory tree quotas using project IDs wasn't > fully introduced until a year later in 2006: > > commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6 > Author: Nathan Scott <nathans@sgi.com> > Date: Tue May 30 15:54:53 2006 +0000 > > statvfs component of directory/project quota support, code originally by Glen. > Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd. > > These changes were largely done for an SGI NAS product that allowed > us to create one great big XFS filesystem and then create > arbitrarily sized, thin provisoned "NFS volumes" as directory > quota controlled subdirs instantenously. The directory tree quota > defined the size of the volume, and so we could also grow and shrink > them instantenously, too. And we could remove them instantenously > via background garbage collection after the export was removed and > the user had been told it had been destroyed. > > So that was the original use case for directory tree quotas on XFS - > providing scalable, fast management of "thin" storage for a NAS > product. Projects quotas had been used for accounting random > colections of files for over a decade before this directory quota > construct was created, and the "modern" container use cases for > directory quotas didn't come along until almost a decade after this > capability was added. > Cool. Didn't know all of this. Lucky for us, those historic use cases are well distinguished from the modern subtree use case by the opt-in PROJINHERIT bit. So as long as PROJINHERIT is set, my assumptions mostly hold(?) > > My point is that changing the project id of a non-dir child to be different > > from the project id of its parent is a pretty rare use case (I think?). > > Not if you are using project quotas as they were originally intended > to be used. > Rephrase then: Changing the projid of a non-dir child to be different from the projid of its parent, which has PROJINHERIT bit set, is a pretty rare use case(?) > > If changing the projid of non-dir is needed for moving it to a > > different subtree, > > we could allow renameat2(2) of non-dir with no hardlinks to implicitly > > change its > > inherited project id or explicitly with a flag for a hardlink, e.g.: > > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID). > > Why? > > The only reason XFS returns -EXDEV to rename across project IDs is > because nobody wanted to spend the time to work out how to do the > quota accounting of the metadata changed in the rename operation > accurately. So for that rare case (not something that would happen > on the NAS product) we returned -EXDEV to trigger the mv command to > copy the file to the destination and then unlink the source instead, > thereby handling all the quota accounting correctly. > > IOWs, this whole "-EXDEV on rename across parent project quota > boundaries" is an implementation detail and nothing more. > Filesystems that implement project quotas and the directory tree > sub-variant don't need to behave like this if they can accurately > account for the quota ID changes during an atomic rename operation. > If that's too hard, then the fallback is to return -EXDEV and let > userspace do it the slow way which will always acocunt the resource > usage correctly to the individual projects. > > Hence I think we should just fix the XFS kernel behaviour to do the > right thing in this special file case rather than return -EXDEV and > then forget about the rest of it. Sure, update xfs_repair to fix the > special file project id issue if it trips over it, but other than > that I don't think we need anything more. If fixing it requires new > syscalls and tools, then that's much harder to backport to old > kernels and distros than just backporting a couple of small XFS > kernel patches... > I assume that by "fix the XFS behavior" you mean "we could allow renameat2(2) of non-dir with no hardlinks to implicitly change its inherited project id"? (in case the new parent has the PROJINHERIT bit) so that the RENAME_NEW_PROJID behavior would be implicit. Unlike rename() from one parent to the other, link()+unlink() is less obvious. The "modern" use cases that I listed where implicit change of projid does not suffice are: 1. Share some inodes (as hardlinks) among projects 2. Recursively changing a subtree projid They could be implemented by explicit flags to renameat2()/linkat() and they could be implemented by [gs]etfsxattrat(2) syscalls. Thanks, Amir.
On 2024-06-06 12:27:38, Dave Chinner wrote: > On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote: > > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote: > > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote: > > > > > I do -- allowing unpriviledged users to create symlinks that consume > > > > > icount (and possibly bcount) in the root project breaks the entire > > > > > enforcement mechanism. That's not the way that project quota has worked > > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit > > > > > only for these special cases. > > > > > > > > OK, fair enough. I though someone will hate this. I'd just like to > > > > understand one thing: Owner of the inode can change the project ID to 0 > > > > anyway so project quotas are more like a cooperative space tracking scheme > > > > anyway. If you want to escape it, you can. So what are you exactly worried > > > > about? Is it the container usecase where from within the user namespace you > > > > cannot change project IDs? > > > > > > Yep. > > > > > > > Anyway I just wanted to have an explicit decision that the simple solution > > > > is not good enough before we go the more complex route ;). > > > > > > Also, every now and then someone comes along and half-proposes making it > > > so that non-root cannot change project ids anymore. Maybe some day that > > > will succeed. > > > > > > > I'd just like to point out that the purpose of the project quotas feature > > as I understand it, is to apply quotas to subtrees, where container storage > > is a very common private case of project subtree. > > That is the most modern use case, yes. > > [ And for a walk down history lane.... ] > > > The purpose is NOT to create a "project" of random files in random > > paths. > > This is *exactly* the original use case that project quotas were > designed for back on Irix in the early 1990s and is the original > behaviour project quotas brought to Linux. > > Project quota inheritance didn't come along until 2005: > > commit 65f1866a3a8e512d43795c116bfef262e703b789 > Author: Nathan Scott <nathans@sgi.com> > Date: Fri Jun 3 06:04:22 2005 +0000 > > Add support for project quota inheritance, a merge of Glens changes. > Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd. > > And full support for directory tree quotas using project IDs wasn't > fully introduced until a year later in 2006: > > commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6 > Author: Nathan Scott <nathans@sgi.com> > Date: Tue May 30 15:54:53 2006 +0000 > > statvfs component of directory/project quota support, code originally by Glen. > Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd. > > These changes were largely done for an SGI NAS product that allowed > us to create one great big XFS filesystem and then create > arbitrarily sized, thin provisoned "NFS volumes" as directory > quota controlled subdirs instantenously. The directory tree quota > defined the size of the volume, and so we could also grow and shrink > them instantenously, too. And we could remove them instantenously > via background garbage collection after the export was removed and > the user had been told it had been destroyed. > > So that was the original use case for directory tree quotas on XFS - > providing scalable, fast management of "thin" storage for a NAS > product. Projects quotas had been used for accounting random > colections of files for over a decade before this directory quota > construct was created, and the "modern" container use cases for > directory quotas didn't come along until almost a decade after this > capability was added. Thanks! > > > My point is that changing the project id of a non-dir child to be different > > from the project id of its parent is a pretty rare use case (I think?). > > Not if you are using project quotas as they were originally intended > to be used. > > > If changing the projid of non-dir is needed for moving it to a > > different subtree, > > we could allow renameat2(2) of non-dir with no hardlinks to implicitly > > change its > > inherited project id or explicitly with a flag for a hardlink, e.g.: > > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID). > > Why? > > The only reason XFS returns -EXDEV to rename across project IDs is > because nobody wanted to spend the time to work out how to do the > quota accounting of the metadata changed in the rename operation > accurately. So for that rare case (not something that would happen > on the NAS product) we returned -EXDEV to trigger the mv command to > copy the file to the destination and then unlink the source instead, > thereby handling all the quota accounting correctly. > > IOWs, this whole "-EXDEV on rename across parent project quota > boundaries" is an implementation detail and nothing more. > Filesystems that implement project quotas and the directory tree > sub-variant don't need to behave like this if they can accurately > account for the quota ID changes during an atomic rename operation. > If that's too hard, then the fallback is to return -EXDEV and let > userspace do it the slow way which will always acocunt the resource > usage correctly to the individual projects. > > Hence I think we should just fix the XFS kernel behaviour to do the > right thing in this special file case rather than return -EXDEV and > then forget about the rest of it. I see, I will look into that, this should solve the original issue. But those special file's inodes still will not be accounted by the quota during initial project setup (xfs_quota will skip them), would it worth it adding new syscalls anyway?
On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > On 2024-06-06 12:27:38, Dave Chinner wrote: ... > > > > The only reason XFS returns -EXDEV to rename across project IDs is > > because nobody wanted to spend the time to work out how to do the > > quota accounting of the metadata changed in the rename operation > > accurately. So for that rare case (not something that would happen > > on the NAS product) we returned -EXDEV to trigger the mv command to > > copy the file to the destination and then unlink the source instead, > > thereby handling all the quota accounting correctly. > > > > IOWs, this whole "-EXDEV on rename across parent project quota > > boundaries" is an implementation detail and nothing more. > > Filesystems that implement project quotas and the directory tree > > sub-variant don't need to behave like this if they can accurately > > account for the quota ID changes during an atomic rename operation. > > If that's too hard, then the fallback is to return -EXDEV and let > > userspace do it the slow way which will always acocunt the resource > > usage correctly to the individual projects. > > > > Hence I think we should just fix the XFS kernel behaviour to do the > > right thing in this special file case rather than return -EXDEV and > > then forget about the rest of it. > > I see, I will look into that, this should solve the original issue. I see that you already got Darrick's RVB on the original patch: https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/ What is missing then? A similar patch for rename() that allows rename of zero projid special file as long as (target_dp->i_projid == src_dp->i_projid)? In theory, it would have been nice to fix the zero projid during the above link() and rename() operations, but it would be more challenging and I see no reason to do that if all the other files remain with zero projid after initial project setup (i.e. if not implementing the syscalls). > > But those special file's inodes still will not be accounted by the > quota during initial project setup (xfs_quota will skip them), would > it worth it adding new syscalls anyway? > Is it worth it to you? Adding those new syscalls means adding tests and documentation and handle all the bugs later. If nobody cared about accounting of special files inodes so far, there is no proof that anyone will care that you put in all this work. Thanks, Amir.
On 2024-06-10 12:19:50, Amir Goldstein wrote: > On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > On 2024-06-06 12:27:38, Dave Chinner wrote: > ... > > > > > > The only reason XFS returns -EXDEV to rename across project IDs is > > > because nobody wanted to spend the time to work out how to do the > > > quota accounting of the metadata changed in the rename operation > > > accurately. So for that rare case (not something that would happen > > > on the NAS product) we returned -EXDEV to trigger the mv command to > > > copy the file to the destination and then unlink the source instead, > > > thereby handling all the quota accounting correctly. > > > > > > IOWs, this whole "-EXDEV on rename across parent project quota > > > boundaries" is an implementation detail and nothing more. > > > Filesystems that implement project quotas and the directory tree > > > sub-variant don't need to behave like this if they can accurately > > > account for the quota ID changes during an atomic rename operation. > > > If that's too hard, then the fallback is to return -EXDEV and let > > > userspace do it the slow way which will always acocunt the resource > > > usage correctly to the individual projects. > > > > > > Hence I think we should just fix the XFS kernel behaviour to do the > > > right thing in this special file case rather than return -EXDEV and > > > then forget about the rest of it. > > > > I see, I will look into that, this should solve the original issue. > > I see that you already got Darrick's RVB on the original patch: > https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/ > > What is missing then? > A similar patch for rename() that allows rename of zero projid special > file as long as (target_dp->i_projid == src_dp->i_projid)? > > In theory, it would have been nice to fix the zero projid during the > above link() and rename() operations, but it would be more challenging > and I see no reason to do that if all the other files remain with zero > projid after initial project setup (i.e. if not implementing the syscalls). I think Dave suggests to get rid of this if-guard and allow link()/rename() for special files but with correct quota calculation. > > > > > But those special file's inodes still will not be accounted by the > > quota during initial project setup (xfs_quota will skip them), would > > it worth it adding new syscalls anyway? > > > > Is it worth it to you? > > Adding those new syscalls means adding tests and documentation > and handle all the bugs later. > > If nobody cared about accounting of special files inodes so far, > there is no proof that anyone will care that you put in all this work. I already have patch and some simple man-pages prepared, I'm wondering if this would be useful for any other usecases which would require setting extended attributes on spec indodes. > > Thanks, > Amir. >
On Mon, Jun 10, 2024 at 2:50 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > On 2024-06-10 12:19:50, Amir Goldstein wrote: > > On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > > > On 2024-06-06 12:27:38, Dave Chinner wrote: > > ... > > > > > > > > The only reason XFS returns -EXDEV to rename across project IDs is > > > > because nobody wanted to spend the time to work out how to do the > > > > quota accounting of the metadata changed in the rename operation > > > > accurately. So for that rare case (not something that would happen > > > > on the NAS product) we returned -EXDEV to trigger the mv command to > > > > copy the file to the destination and then unlink the source instead, > > > > thereby handling all the quota accounting correctly. > > > > > > > > IOWs, this whole "-EXDEV on rename across parent project quota > > > > boundaries" is an implementation detail and nothing more. > > > > Filesystems that implement project quotas and the directory tree > > > > sub-variant don't need to behave like this if they can accurately > > > > account for the quota ID changes during an atomic rename operation. > > > > If that's too hard, then the fallback is to return -EXDEV and let > > > > userspace do it the slow way which will always acocunt the resource > > > > usage correctly to the individual projects. > > > > > > > > Hence I think we should just fix the XFS kernel behaviour to do the > > > > right thing in this special file case rather than return -EXDEV and > > > > then forget about the rest of it. > > > > > > I see, I will look into that, this should solve the original issue. > > > > I see that you already got Darrick's RVB on the original patch: > > https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/ > > > > What is missing then? > > A similar patch for rename() that allows rename of zero projid special > > file as long as (target_dp->i_projid == src_dp->i_projid)? > > > > In theory, it would have been nice to fix the zero projid during the > > above link() and rename() operations, but it would be more challenging > > and I see no reason to do that if all the other files remain with zero > > projid after initial project setup (i.e. if not implementing the syscalls). > > I think Dave suggests to get rid of this if-guard and allow > link()/rename() for special files but with correct quota calculation. > > > > > > > > > But those special file's inodes still will not be accounted by the > > > quota during initial project setup (xfs_quota will skip them), would > > > it worth it adding new syscalls anyway? > > > > > > > Is it worth it to you? > > > > Adding those new syscalls means adding tests and documentation > > and handle all the bugs later. > > > > If nobody cared about accounting of special files inodes so far, > > there is no proof that anyone will care that you put in all this work. > > I already have patch and some simple man-pages prepared, I'm > wondering if this would be useful for any other usecases Yes, I personally find it useful. I have applications that query the fsx_xflags and would rather be able to use O_PATH to query/set those flags, since internally in vfs, fileattr_[gs]et() do not really need an open file. > which would > require setting extended attributes on spec indodes. Please do not use the terminology "extended attributes" in the man page to describe struct fsxattr. Better follow the "additional attributes" terminology of xfs ioctl man page [1], even though it is already confusing enough w.r.t "extended attributes" IMO. Thanks, Amir. [1] https://man7.org/linux/man-pages/man2/ioctl_xfs_fsgetxattr.2.html
On Mon 10-06-24 16:21:39, Amir Goldstein wrote: > On Mon, Jun 10, 2024 at 2:50 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > On 2024-06-10 12:19:50, Amir Goldstein wrote: > > > On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > > > > > But those special file's inodes still will not be accounted by the > > > > quota during initial project setup (xfs_quota will skip them), would > > > > it worth it adding new syscalls anyway? > > > > > > > > > > Is it worth it to you? > > > > > > Adding those new syscalls means adding tests and documentation > > > and handle all the bugs later. > > > > > > If nobody cared about accounting of special files inodes so far, > > > there is no proof that anyone will care that you put in all this work. > > > > I already have patch and some simple man-pages prepared, I'm > > wondering if this would be useful for any other usecases > > Yes, I personally find it useful. > I have applications that query the fsx_xflags and would rather > be able to use O_PATH to query/set those flags, since > internally in vfs, fileattr_[gs]et() do not really need an open file. Well, Christian doesn't like [1] how much functionality is actually available through O_PATH fds because it kind of makes them more and more similar to normal fds and that's apparently undesirable for some usecases (for security reasons). So I don't think he'd like to have *another* functionality accepted for O_PATH fds :) But he can speak for himself... Honza [1] https://lore.kernel.org/all/20240412-labeln-filmabend-42422ec453d7@brauner [2] https://lore.kernel.org/all/20240430-machbar-jogginganzug-7fd3cff2c3ed@brauner
On Mon, Jun 10, 2024 at 04:21:39PM +0300, Amir Goldstein wrote: > On Mon, Jun 10, 2024 at 2:50 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > On 2024-06-10 12:19:50, Amir Goldstein wrote: > > > On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > > > > > On 2024-06-06 12:27:38, Dave Chinner wrote: > > > ... > > > > > > > > > > The only reason XFS returns -EXDEV to rename across project IDs is > > > > > because nobody wanted to spend the time to work out how to do the > > > > > quota accounting of the metadata changed in the rename operation > > > > > accurately. So for that rare case (not something that would happen > > > > > on the NAS product) we returned -EXDEV to trigger the mv command to > > > > > copy the file to the destination and then unlink the source instead, > > > > > thereby handling all the quota accounting correctly. > > > > > > > > > > IOWs, this whole "-EXDEV on rename across parent project quota > > > > > boundaries" is an implementation detail and nothing more. > > > > > Filesystems that implement project quotas and the directory tree > > > > > sub-variant don't need to behave like this if they can accurately > > > > > account for the quota ID changes during an atomic rename operation. > > > > > If that's too hard, then the fallback is to return -EXDEV and let > > > > > userspace do it the slow way which will always acocunt the resource > > > > > usage correctly to the individual projects. > > > > > > > > > > Hence I think we should just fix the XFS kernel behaviour to do the > > > > > right thing in this special file case rather than return -EXDEV and > > > > > then forget about the rest of it. > > > > > > > > I see, I will look into that, this should solve the original issue. > > > > > > I see that you already got Darrick's RVB on the original patch: > > > https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/ > > > > > > What is missing then? > > > A similar patch for rename() that allows rename of zero projid special > > > file as long as (target_dp->i_projid == src_dp->i_projid)? > > > > > > In theory, it would have been nice to fix the zero projid during the > > > above link() and rename() operations, but it would be more challenging > > > and I see no reason to do that if all the other files remain with zero > > > projid after initial project setup (i.e. if not implementing the syscalls). > > > > I think Dave suggests to get rid of this if-guard and allow > > link()/rename() for special files but with correct quota calculation. > > > > > > > > > > > > > But those special file's inodes still will not be accounted by the > > > > quota during initial project setup (xfs_quota will skip them), would > > > > it worth it adding new syscalls anyway? > > > > > > > > > > Is it worth it to you? > > > > > > Adding those new syscalls means adding tests and documentation > > > and handle all the bugs later. > > > > > > If nobody cared about accounting of special files inodes so far, > > > there is no proof that anyone will care that you put in all this work. > > > > I already have patch and some simple man-pages prepared, I'm > > wondering if this would be useful for any other usecases > > Yes, I personally find it useful. > I have applications that query the fsx_xflags and would rather > be able to use O_PATH to query/set those flags, since > internally in vfs, fileattr_[gs]et() do not really need an open file. > > > which would > > require setting extended attributes on spec indodes. > > Please do not use the terminology "extended attributes" in the man page > to describe struct fsxattr. "XFS file attributes" perhaps? Though that's anachronistic since ext4 supports /some/ of them now. --D > Better follow the "additional attributes" terminology of xfs ioctl man page [1], > even though it is already confusing enough w.r.t "extended attributes" IMO. > > Thanks, > Amir. > > [1] https://man7.org/linux/man-pages/man2/ioctl_xfs_fsgetxattr.2.html >
On Mon, Jun 10, 2024 at 11:26 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Mon, Jun 10, 2024 at 04:21:39PM +0300, Amir Goldstein wrote: > > On Mon, Jun 10, 2024 at 2:50 PM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > > > On 2024-06-10 12:19:50, Amir Goldstein wrote: > > > > On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote: > > > > > > > > > > On 2024-06-06 12:27:38, Dave Chinner wrote: > > > > ... > > > > > > > > > > > > The only reason XFS returns -EXDEV to rename across project IDs is > > > > > > because nobody wanted to spend the time to work out how to do the > > > > > > quota accounting of the metadata changed in the rename operation > > > > > > accurately. So for that rare case (not something that would happen > > > > > > on the NAS product) we returned -EXDEV to trigger the mv command to > > > > > > copy the file to the destination and then unlink the source instead, > > > > > > thereby handling all the quota accounting correctly. > > > > > > > > > > > > IOWs, this whole "-EXDEV on rename across parent project quota > > > > > > boundaries" is an implementation detail and nothing more. > > > > > > Filesystems that implement project quotas and the directory tree > > > > > > sub-variant don't need to behave like this if they can accurately > > > > > > account for the quota ID changes during an atomic rename operation. > > > > > > If that's too hard, then the fallback is to return -EXDEV and let > > > > > > userspace do it the slow way which will always acocunt the resource > > > > > > usage correctly to the individual projects. > > > > > > > > > > > > Hence I think we should just fix the XFS kernel behaviour to do the > > > > > > right thing in this special file case rather than return -EXDEV and > > > > > > then forget about the rest of it. > > > > > > > > > > I see, I will look into that, this should solve the original issue. > > > > > > > > I see that you already got Darrick's RVB on the original patch: > > > > https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/ > > > > > > > > What is missing then? > > > > A similar patch for rename() that allows rename of zero projid special > > > > file as long as (target_dp->i_projid == src_dp->i_projid)? > > > > > > > > In theory, it would have been nice to fix the zero projid during the > > > > above link() and rename() operations, but it would be more challenging > > > > and I see no reason to do that if all the other files remain with zero > > > > projid after initial project setup (i.e. if not implementing the syscalls). > > > > > > I think Dave suggests to get rid of this if-guard and allow > > > link()/rename() for special files but with correct quota calculation. > > > > > > > > > > > > > > > > > But those special file's inodes still will not be accounted by the > > > > > quota during initial project setup (xfs_quota will skip them), would > > > > > it worth it adding new syscalls anyway? > > > > > > > > > > > > > Is it worth it to you? > > > > > > > > Adding those new syscalls means adding tests and documentation > > > > and handle all the bugs later. > > > > > > > > If nobody cared about accounting of special files inodes so far, > > > > there is no proof that anyone will care that you put in all this work. > > > > > > I already have patch and some simple man-pages prepared, I'm > > > wondering if this would be useful for any other usecases > > > > Yes, I personally find it useful. > > I have applications that query the fsx_xflags and would rather > > be able to use O_PATH to query/set those flags, since > > internally in vfs, fileattr_[gs]et() do not really need an open file. > > > > > which would > > > require setting extended attributes on spec indodes. > > > > Please do not use the terminology "extended attributes" in the man page > > to describe struct fsxattr. > > "XFS file attributes" perhaps? > > Though that's anachronistic since ext4 supports /some/ of them now. > Technically, it's all the filesystems that support ->fileattr_[gs]et(), which is a lot. Since the feature has matured into a vfs feature with planned new syscalls, the XFS ioctl man page could be referenced for historic perspective and for explaining the reason for the X in fsxattr, but I don't think that the official name of those attributes should include "XFS". Maybe TAFKAXFA (The attributes formerly known as XFS file attributes) :-p Also, adding a syscall to perform FS_IOC_FS[GS]ETXATTR, without being able to perform FS_IOC_[GS]ETFLAGS would be a bit odd IMO. It is possible to multiplex both in the same syscall. For example, let the syscall return the size of the attributes, like getxattr(). Not sure whether it is smart to do it. Then again, not sure if this was already mentioned as an option, besides the more generic proposal by Miklos [1], but we could bind generic xattr handlers for "system.fs.attr" and "system.fs.flags". API-wise, ->fileattr_[gs]et() would become very similar to ->[gs]et_acl(), which are also get/set via "system.*" xattr. The added benefit is that we avoid the confusing distinctions between "additional attributes" and "extended attributes" - "additional file attributes/flags" are a private case of "extended attributes". Implementation wise it should be pretty simple, but I agree with Jan that the question of credentials needed for these syscalls needs to be addressed first and get Christian's blessing. Thanks, Amir. [1] https://lore.kernel.org/linux-fsdevel/YnEeuw6fd1A8usjj@miu.piliscsaba.redhat.com/
On Fri, Jun 07, 2024 at 09:17:34AM +0300, Amir Goldstein wrote: > On Thu, Jun 6, 2024 at 5:27 AM Dave Chinner <david@fromorbit.com> wrote: > > > > On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote: > > > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote: > > > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote: > > > > > > I do -- allowing unpriviledged users to create symlinks that consume > > > > > > icount (and possibly bcount) in the root project breaks the entire > > > > > > enforcement mechanism. That's not the way that project quota has worked > > > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit > > > > > > only for these special cases. > > > > > > > > > > OK, fair enough. I though someone will hate this. I'd just like to > > > > > understand one thing: Owner of the inode can change the project ID to 0 > > > > > anyway so project quotas are more like a cooperative space tracking scheme > > > > > anyway. If you want to escape it, you can. So what are you exactly worried > > > > > about? Is it the container usecase where from within the user namespace you > > > > > cannot change project IDs? > > > > > > > > Yep. > > > > > > > > > Anyway I just wanted to have an explicit decision that the simple solution > > > > > is not good enough before we go the more complex route ;). > > > > > > > > Also, every now and then someone comes along and half-proposes making it > > > > so that non-root cannot change project ids anymore. Maybe some day that > > > > will succeed. > > > > > > > > > > I'd just like to point out that the purpose of the project quotas feature > > > as I understand it, is to apply quotas to subtrees, where container storage > > > is a very common private case of project subtree. > > > > That is the most modern use case, yes. > > > > [ And for a walk down history lane.... ] > > > > > The purpose is NOT to create a "project" of random files in random > > > paths. > > > > This is *exactly* the original use case that project quotas were > > designed for back on Irix in the early 1990s and is the original > > behaviour project quotas brought to Linux. > > > > Project quota inheritance didn't come along until 2005: > > > > commit 65f1866a3a8e512d43795c116bfef262e703b789 > > Author: Nathan Scott <nathans@sgi.com> > > Date: Fri Jun 3 06:04:22 2005 +0000 > > > > Add support for project quota inheritance, a merge of Glens changes. > > Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd. > > > > And full support for directory tree quotas using project IDs wasn't > > fully introduced until a year later in 2006: > > > > commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6 > > Author: Nathan Scott <nathans@sgi.com> > > Date: Tue May 30 15:54:53 2006 +0000 > > > > statvfs component of directory/project quota support, code originally by Glen. > > Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd. > > > > These changes were largely done for an SGI NAS product that allowed > > us to create one great big XFS filesystem and then create > > arbitrarily sized, thin provisoned "NFS volumes" as directory > > quota controlled subdirs instantenously. The directory tree quota > > defined the size of the volume, and so we could also grow and shrink > > them instantenously, too. And we could remove them instantenously > > via background garbage collection after the export was removed and > > the user had been told it had been destroyed. > > > > So that was the original use case for directory tree quotas on XFS - > > providing scalable, fast management of "thin" storage for a NAS > > product. Projects quotas had been used for accounting random > > colections of files for over a decade before this directory quota > > construct was created, and the "modern" container use cases for > > directory quotas didn't come along until almost a decade after this > > capability was added. > > > > Cool. Didn't know all of this. > Lucky for us, those historic use cases are well distinguished from > the modern subtree use case by the opt-in PROJINHERIT bit. > So as long as PROJINHERIT is set, my assumptions mostly hold(?) I'm not sure what assumptions you are making, so I can't really make any binding statement on this except to say that the existance of PROJINHERIT on directories does not mean strict directory quotas are being used. i.e. PROJINHERIT is just a flag to automatically tag inodes with aproject ID on creation. It -can- be used as a directory tree quota if specific restrictions in the use of the projinherit flag are enforced, but it can also be used for the original project ID use case so users don't have to manually tag files they create or move to a projet directory with the right project ID after the fact. IOWs, directories that have the same projid and PROJINHERIT set do not need to be in a single hierarchy, and hence do not fit the definition of a directory tree quota. e.g: /projects/docs/project1 /projects/docs/project2 /projects/docs/project3 /projects/src/project1 /projects/src/project2 /projects/src/project3 /projects/build/project1 /projects/build/project2 /projects/build/project3 ..... /home/user1/project1 /home/user2/project3 /home/user6/project2 Now we have multiple different disjoint directory heirarchies with the same project ID because they contain files belonging to a specific project. This is still a random collection of files in random paths that are accounted to a project ID, but it's also using PROJINHERIT to assign the default project ID to files created in the respective project directories. The kernel has no idea that a project ID is associated with a single directory tree. The kernel quota accounting itself simple sees project IDs on inodes and accounts to that project ID. The create and rename code simply see the parent PROJINHERIT bit and so need to set up the new directory entry to be accounted to that different project ID. It's all just mechanism in the kernel - applying project quotas to enforce directory tree quotas is entirely a userspace administration constraint.... The only thing we do to help userspace administration is restrict changing project IDs to the init namespace, thereby preventing containers from being able to manipulate project IDs. This allows the init namespace to set the policy for project quota use and hence allow it to be used for directory tree quotas for container space management. This is essentially an extension of the original NAS use case, in that NFS exports were the isolation barrier that hid the underlying filesystem structure and project ID quota usage from the end users.... > > > My point is that changing the project id of a non-dir child to be different > > > from the project id of its parent is a pretty rare use case (I think?). > > > > Not if you are using project quotas as they were originally intended > > to be used. > > > > Rephrase then: > > Changing the projid of a non-dir child to be different from the projid > of its parent, which has PROJINHERIT bit set, is a pretty rare use case(?) I have no data to indicate how rare it might be - the kernel has never enforced a policy that disallows changing project IDs when PROJINHERIT is set and any user with write permission can change project IDs even when PROJINHERIT is set. Hence we have to assume that there are people out there that rely on this behaviour, regardless of how rare it is... > > > If changing the projid of non-dir is needed for moving it to a > > > different subtree, > > > we could allow renameat2(2) of non-dir with no hardlinks to implicitly > > > change its > > > inherited project id or explicitly with a flag for a hardlink, e.g.: > > > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID). > > > > Why? > > > > The only reason XFS returns -EXDEV to rename across project IDs is > > because nobody wanted to spend the time to work out how to do the > > quota accounting of the metadata changed in the rename operation > > accurately. So for that rare case (not something that would happen > > on the NAS product) we returned -EXDEV to trigger the mv command to > > copy the file to the destination and then unlink the source instead, > > thereby handling all the quota accounting correctly. > > > > IOWs, this whole "-EXDEV on rename across parent project quota > > boundaries" is an implementation detail and nothing more. > > Filesystems that implement project quotas and the directory tree > > sub-variant don't need to behave like this if they can accurately > > account for the quota ID changes during an atomic rename operation. > > If that's too hard, then the fallback is to return -EXDEV and let > > userspace do it the slow way which will always acocunt the resource > > usage correctly to the individual projects. > > > > Hence I think we should just fix the XFS kernel behaviour to do the > > right thing in this special file case rather than return -EXDEV and > > then forget about the rest of it. Sure, update xfs_repair to fix the > > special file project id issue if it trips over it, but other than > > that I don't think we need anything more. If fixing it requires new > > syscalls and tools, then that's much harder to backport to old > > kernels and distros than just backporting a couple of small XFS > > kernel patches... > > > > I assume that by "fix the XFS behavior" you mean > "we could allow renameat2(2) of non-dir with no hardlinks to implicitly > change its inherited project id"? > (in case the new parent has the PROJINHERIT bit) > so that the RENAME_NEW_PROJID behavior would be implicit. No, I meant "fix the original hardlink of an inode with no project ID within a PROJINHERIT directory always gets -EXDEV" by allowing hard links when the source file has no project ID specified. Actually, looking at 6.10-rc3, this has been merged already. So IMO there's nothing more we need to do here. > Unlike rename() from one parent to the other, link()+unlink() > is less obvious. > > The "modern" use cases that I listed where implicit change of projid > does not suffice are: > > 1. Share some inodes (as hardlinks) among projects > 2. Recursively changing a subtree projid > > They could be implemented by explicit flags to renameat2()/linkat() and > they could be implemented by [gs]etfsxattrat(2) syscalls. What are you expecting to happen when you hardlink an inode across multiple PROJINHERIT directories? I mean, an inode can only have one project ID, so you can't link it into mulitple projects at once if you are using PROJINHERIT (xfs_link() expressly forbids it). So I'm really not sure what problem you are trying to describe or solve here. Can you elaborate more? -Dave.
On Wed, Jun 12, 2024 at 2:40 AM Dave Chinner <david@fromorbit.com> wrote: > > On Fri, Jun 07, 2024 at 09:17:34AM +0300, Amir Goldstein wrote: > > On Thu, Jun 6, 2024 at 5:27 AM Dave Chinner <david@fromorbit.com> wrote: > > > > > > On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote: > > > > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote: > > > > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote: > > > > > > > I do -- allowing unpriviledged users to create symlinks that consume > > > > > > > icount (and possibly bcount) in the root project breaks the entire > > > > > > > enforcement mechanism. That's not the way that project quota has worked > > > > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit > > > > > > > only for these special cases. > > > > > > > > > > > > OK, fair enough. I though someone will hate this. I'd just like to > > > > > > understand one thing: Owner of the inode can change the project ID to 0 > > > > > > anyway so project quotas are more like a cooperative space tracking scheme > > > > > > anyway. If you want to escape it, you can. So what are you exactly worried > > > > > > about? Is it the container usecase where from within the user namespace you > > > > > > cannot change project IDs? > > > > > > > > > > Yep. > > > > > > > > > > > Anyway I just wanted to have an explicit decision that the simple solution > > > > > > is not good enough before we go the more complex route ;). > > > > > > > > > > Also, every now and then someone comes along and half-proposes making it > > > > > so that non-root cannot change project ids anymore. Maybe some day that > > > > > will succeed. > > > > > > > > > > > > > I'd just like to point out that the purpose of the project quotas feature > > > > as I understand it, is to apply quotas to subtrees, where container storage > > > > is a very common private case of project subtree. > > > > > > That is the most modern use case, yes. > > > > > > [ And for a walk down history lane.... ] > > > > > > > The purpose is NOT to create a "project" of random files in random > > > > paths. > > > > > > This is *exactly* the original use case that project quotas were > > > designed for back on Irix in the early 1990s and is the original > > > behaviour project quotas brought to Linux. > > > > > > Project quota inheritance didn't come along until 2005: > > > > > > commit 65f1866a3a8e512d43795c116bfef262e703b789 > > > Author: Nathan Scott <nathans@sgi.com> > > > Date: Fri Jun 3 06:04:22 2005 +0000 > > > > > > Add support for project quota inheritance, a merge of Glens changes. > > > Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd. > > > > > > And full support for directory tree quotas using project IDs wasn't > > > fully introduced until a year later in 2006: > > > > > > commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6 > > > Author: Nathan Scott <nathans@sgi.com> > > > Date: Tue May 30 15:54:53 2006 +0000 > > > > > > statvfs component of directory/project quota support, code originally by Glen. > > > Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd. > > > > > > These changes were largely done for an SGI NAS product that allowed > > > us to create one great big XFS filesystem and then create > > > arbitrarily sized, thin provisoned "NFS volumes" as directory > > > quota controlled subdirs instantenously. The directory tree quota > > > defined the size of the volume, and so we could also grow and shrink > > > them instantenously, too. And we could remove them instantenously > > > via background garbage collection after the export was removed and > > > the user had been told it had been destroyed. > > > > > > So that was the original use case for directory tree quotas on XFS - > > > providing scalable, fast management of "thin" storage for a NAS > > > product. Projects quotas had been used for accounting random > > > colections of files for over a decade before this directory quota > > > construct was created, and the "modern" container use cases for > > > directory quotas didn't come along until almost a decade after this > > > capability was added. > > > > > > > Cool. Didn't know all of this. > > Lucky for us, those historic use cases are well distinguished from > > the modern subtree use case by the opt-in PROJINHERIT bit. > > So as long as PROJINHERIT is set, my assumptions mostly hold(?) > > I'm not sure what assumptions you are making, so I can't really > make any binding statement on this except to say that the existance > of PROJINHERIT on directories does not mean strict directory quotas > are being used. > > i.e. PROJINHERIT is just a flag to automatically tag inodes with > aproject ID on creation. It -can- be used as a directory tree quota > if specific restrictions in the use of the projinherit flag are > enforced, but it can also be used for the original project ID use > case so users don't have to manually tag files they create or move > to a projet directory with the right project ID after the fact. > > IOWs, directories that have the same projid and PROJINHERIT set > do not need to be in a single hierarchy, and hence do not fit the > definition of a directory tree quota. e.g: > > /projects/docs/project1 > /projects/docs/project2 > /projects/docs/project3 > /projects/src/project1 > /projects/src/project2 > /projects/src/project3 > /projects/build/project1 > /projects/build/project2 > /projects/build/project3 > ..... > /home/user1/project1 > /home/user2/project3 > /home/user6/project2 > > Now we have multiple different disjoint directory heirarchies with > the same project ID because they contain files belonging to a > specific project. This is still a random collection of files in > random paths that are accounted to a project ID, but it's also using > PROJINHERIT to assign the default project ID to files created in the > respective project directories. > > The kernel has no idea that a project ID is associated with a single > directory tree. The kernel quota accounting itself simple sees > project IDs on inodes and accounts to that project ID. The create > and rename code simply see the parent PROJINHERIT bit and so need to > set up the new directory entry to be accounted to that different > project ID. It's all just mechanism in the kernel - applying project > quotas to enforce directory tree quotas is entirely a userspace > administration constraint.... > > The only thing we do to help userspace administration is restrict > changing project IDs to the init namespace, thereby preventing > containers from being able to manipulate project IDs. This allows > the init namespace to set the policy for project quota use and hence > allow it to be used for directory tree quotas for container space > management. This is essentially an extension of the original NAS > use case, in that NFS exports were the isolation barrier that hid > the underlying filesystem structure and project ID quota usage > from the end users.... > > > > > My point is that changing the project id of a non-dir child to be different > > > > from the project id of its parent is a pretty rare use case (I think?). > > > > > > Not if you are using project quotas as they were originally intended > > > to be used. > > > > > > > Rephrase then: > > > > Changing the projid of a non-dir child to be different from the projid > > of its parent, which has PROJINHERIT bit set, is a pretty rare use case(?) > > I have no data to indicate how rare it might be - the kernel has > never enforced a policy that disallows changing project IDs when > PROJINHERIT is set and any user with write permission can change > project IDs even when PROJINHERIT is set. Hence we have to assume > that there are people out there that rely on this behaviour, > regardless of how rare it is... > > > > > If changing the projid of non-dir is needed for moving it to a > > > > different subtree, > > > > we could allow renameat2(2) of non-dir with no hardlinks to implicitly > > > > change its > > > > inherited project id or explicitly with a flag for a hardlink, e.g.: > > > > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID). > > > > > > Why? > > > > > > The only reason XFS returns -EXDEV to rename across project IDs is > > > because nobody wanted to spend the time to work out how to do the > > > quota accounting of the metadata changed in the rename operation > > > accurately. So for that rare case (not something that would happen > > > on the NAS product) we returned -EXDEV to trigger the mv command to > > > copy the file to the destination and then unlink the source instead, > > > thereby handling all the quota accounting correctly. > > > > > > IOWs, this whole "-EXDEV on rename across parent project quota > > > boundaries" is an implementation detail and nothing more. > > > Filesystems that implement project quotas and the directory tree > > > sub-variant don't need to behave like this if they can accurately > > > account for the quota ID changes during an atomic rename operation. > > > If that's too hard, then the fallback is to return -EXDEV and let > > > userspace do it the slow way which will always acocunt the resource > > > usage correctly to the individual projects. > > > > > > Hence I think we should just fix the XFS kernel behaviour to do the > > > right thing in this special file case rather than return -EXDEV and > > > then forget about the rest of it. Sure, update xfs_repair to fix the > > > special file project id issue if it trips over it, but other than > > > that I don't think we need anything more. If fixing it requires new > > > syscalls and tools, then that's much harder to backport to old > > > kernels and distros than just backporting a couple of small XFS > > > kernel patches... > > > > > > > I assume that by "fix the XFS behavior" you mean > > "we could allow renameat2(2) of non-dir with no hardlinks to implicitly > > change its inherited project id"? > > (in case the new parent has the PROJINHERIT bit) > > so that the RENAME_NEW_PROJID behavior would be implicit. > > No, I meant "fix the original hardlink of an inode with no project ID > within a PROJINHERIT directory always gets -EXDEV" by allowing > hard links when the source file has no project ID specified. > > Actually, looking at 6.10-rc3, this has been merged already. So IMO > there's nothing more we need to do here. > I don't see it. Which commit is that? I agree that would be enough to address the original report. > > Unlike rename() from one parent to the other, link()+unlink() > > is less obvious. > > > > The "modern" use cases that I listed where implicit change of projid > > does not suffice are: > > > > 1. Share some inodes (as hardlinks) among projects > > 2. Recursively changing a subtree projid > > > > They could be implemented by explicit flags to renameat2()/linkat() and > > they could be implemented by [gs]etfsxattrat(2) syscalls. > > What are you expecting to happen when you hardlink an inode across > multiple PROJINHERIT directories? I mean, an inode can only have > one project ID, so you can't link it into mulitple projects at once > if you are using PROJINHERIT (xfs_link() expressly forbids it). > > So I'm really not sure what problem you are trying to describe or > solve here. Can you elaborate more? The problem that Andrey described started with a subtree that has no projid and then projid was applied to all non-special files. The suggested change to relax hardlink of projid 0 solves this case. But a subtree could also be changed by user, say from projid P1 to projid P2 and in that case inherited projid P1 will remain on the special files, causing a similar failure to link() without the projid 0 relax rule. Anyway, this discussion has wandered off too far from the original problem. If the simple fix is enough for Andrey, it's fine by me. If other people are interested in syscall or other means to change fsxattr on special files, it's fine by me as well. Thanks, Amir.
diff --git a/fs/ioctl.c b/fs/ioctl.c index 1d5abfdf0f22..3e3aacb6ea6e 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -22,6 +22,7 @@ #include <linux/mount.h> #include <linux/fscrypt.h> #include <linux/fileattr.h> +#include <linux/namei.h> #include "internal.h" @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode, if (fa->fsx_cowextsize == 0) fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + /* + * The only use case for special files is to set project ID, forbid any + * other attributes + */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT) + return -EINVAL; + if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents) + return -EINVAL; + if (fa->fsx_extsize || fa->fsx_cowextsize) + return -EINVAL; + } + return 0; } @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) return err; } +static int ioctl_fsgetxattrat(struct file *file, void __user *argp) +{ + struct path filepath; + struct fsxattrat fsxat; + struct fileattr fa; + int error; + + if (!S_ISDIR(file_inode(file)->i_mode)) + return -EBADF; + + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) + return -EFAULT; + + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); + if (error) + return error; + + error = vfs_fileattr_get(filepath.dentry, &fa); + if (error) { + path_put(&filepath); + return error; + } + + fsxat.fsx.fsx_xflags = fa.fsx_xflags; + fsxat.fsx.fsx_extsize = fa.fsx_extsize; + fsxat.fsx.fsx_nextents = fa.fsx_nextents; + fsxat.fsx.fsx_projid = fa.fsx_projid; + fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize; + + if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat))) + error = -EFAULT; + + path_put(&filepath); + return error; +} + +static int ioctl_fssetxattrat(struct file *file, void __user *argp) +{ + struct mnt_idmap *idmap = file_mnt_idmap(file); + struct fsxattrat fsxat; + struct path filepath; + struct fileattr fa; + int error; + + if (!S_ISDIR(file_inode(file)->i_mode)) + return -EBADF; + + if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat))) + return -EFAULT; + + error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath); + if (error) + return error; + + error = mnt_want_write(filepath.mnt); + if (error) { + path_put(&filepath); + return error; + } + + fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags); + fa.fsx_extsize = fsxat.fsx.fsx_extsize; + fa.fsx_nextents = fsxat.fsx.fsx_nextents; + fa.fsx_projid = fsxat.fsx.fsx_projid; + fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize; + fa.fsx_valid = true; + + error = vfs_fileattr_set(idmap, filepath.dentry, &fa); + mnt_drop_write(filepath.mnt); + path_put(&filepath); + return error; +} + static int ioctl_getfsuuid(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, case FS_IOC_FSSETXATTR: return ioctl_fssetxattr(filp, argp); + case FS_IOC_FSGETXATTRAT: + return ioctl_fsgetxattrat(filp, argp); + + case FS_IOC_FSSETXATTRAT: + return ioctl_fssetxattrat(filp, argp); + case FS_IOC_GETFSUUID: return ioctl_getfsuuid(filp, argp); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 45e4e64fd664..f8cd8d7bf35d 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -139,6 +139,15 @@ struct fsxattr { unsigned char fsx_pad[8]; }; +/* + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT + */ +struct fsxattrat { + struct fsxattr fsx; /* XATTR to get/set */ + __u32 dfd; /* parent dir */ + const char __user *path; +}; + /* * Flags for the fsx_xflags field */ @@ -231,6 +240,8 @@ struct fsxattr { #define FS_IOC32_SETVERSION _IOW('v', 2, int) #define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#define FS_IOC_FSGETXATTRAT _IOR('X', 33, struct fsxattrat) +#define FS_IOC_FSSETXATTRAT _IOW('X', 34, struct fsxattrat) #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) /* Returns the external filesystem UUID, the same one blkid returns */
XFS has project quotas which could be attached to a directory. All new inodes in these directories inherit project ID set on parent directory. The project is created from userspace by opening and calling FS_IOC_FSSETXATTR on each inode. This is not possible for special files such as FIFO, SOCK, BLK etc. as opening them returns a special inode from VFS. Therefore, some inodes are left with empty project ID. Those inodes then are not shown in the quota accounting but still exist in the directory. This patch adds two new ioctls which allows userspace, such as xfs_quota, to set project ID on special files by using parent directory to open FS inode. This will let xfs_quota set ID on all inodes and also reset it when project is removed. Also, as vfs_fileattr_set() is now will called on special files too, let's forbid any other attributes except projid and nextents (symlink can have one). Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com> --- fs/ioctl.c | 93 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/fs.h | 11 +++++ 2 files changed, 104 insertions(+)