diff mbox series

[v2,2/4] fs: add FS_IOC_FSSETXATTRAT and FS_IOC_FSGETXATTRAT

Message ID 20240520164624.665269-4-aalbersh@redhat.com (mailing list archive)
State Accepted, archived
Headers show
Series Introduce FS_IOC_FSSETXATTRAT/FS_IOC_FSGETXATTRAT ioctls | expand

Commit Message

Andrey Albershteyn May 20, 2024, 4:46 p.m. UTC
XFS has project quotas which could be attached to a directory. All
new inodes in these directories inherit project ID set on parent
directory.

The project is created from userspace by opening and calling
FS_IOC_FSSETXATTR on each inode. This is not possible for special
files such as FIFO, SOCK, BLK etc. as opening them returns a special
inode from VFS. Therefore, some inodes are left with empty project
ID. Those inodes then are not shown in the quota accounting but
still exist in the directory.

This patch adds two new ioctls which allows userspace, such as
xfs_quota, to set project ID on special files by using parent
directory to open FS inode. This will let xfs_quota set ID on all
inodes and also reset it when project is removed. Also, as
vfs_fileattr_set() is now will called on special files too, let's
forbid any other attributes except projid and nextents (symlink can
have one).

Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
 fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/fs.h | 11 +++++
 2 files changed, 104 insertions(+)

Comments

Darrick J. Wong May 20, 2024, 5:51 p.m. UTC | #1
On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote:
> XFS has project quotas which could be attached to a directory. All
> new inodes in these directories inherit project ID set on parent
> directory.
> 
> The project is created from userspace by opening and calling
> FS_IOC_FSSETXATTR on each inode. This is not possible for special
> files such as FIFO, SOCK, BLK etc. as opening them returns a special
> inode from VFS. Therefore, some inodes are left with empty project
> ID. Those inodes then are not shown in the quota accounting but
> still exist in the directory.
> 
> This patch adds two new ioctls which allows userspace, such as
> xfs_quota, to set project ID on special files by using parent
> directory to open FS inode. This will let xfs_quota set ID on all
> inodes and also reset it when project is removed. Also, as
> vfs_fileattr_set() is now will called on special files too, let's
> forbid any other attributes except projid and nextents (symlink can
> have one).
> 
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
>  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/fs.h | 11 +++++
>  2 files changed, 104 insertions(+)
> 
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 1d5abfdf0f22..3e3aacb6ea6e 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -22,6 +22,7 @@
>  #include <linux/mount.h>
>  #include <linux/fscrypt.h>
>  #include <linux/fileattr.h>
> +#include <linux/namei.h>
>  
>  #include "internal.h"
>  
> @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
>  	if (fa->fsx_cowextsize == 0)
>  		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
>  
> +	/*
> +	 * The only use case for special files is to set project ID, forbid any
> +	 * other attributes
> +	 */
> +	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> +		if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)

When would PROJINHERIT be set on a non-reg/non-dir file?

> +			return -EINVAL;
> +		if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)

FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any
other type of file, does it?

> +			return -EINVAL;
> +		if (fa->fsx_extsize || fa->fsx_cowextsize)
> +			return -EINVAL;
> +	}
> +
>  	return 0;
>  }
>  
> @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
>  	return err;
>  }
>  
> +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> +{
> +	struct path filepath;
> +	struct fsxattrat fsxat;
> +	struct fileattr fa;
> +	int error;
> +
> +	if (!S_ISDIR(file_inode(file)->i_mode))
> +		return -EBADF;
> +
> +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> +		return -EFAULT;
> +
> +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> +	if (error)
> +		return error;
> +
> +	error = vfs_fileattr_get(filepath.dentry, &fa);
> +	if (error) {
> +		path_put(&filepath);
> +		return error;
> +	}
> +
> +	fsxat.fsx.fsx_xflags = fa.fsx_xflags;
> +	fsxat.fsx.fsx_extsize = fa.fsx_extsize;
> +	fsxat.fsx.fsx_nextents = fa.fsx_nextents;
> +	fsxat.fsx.fsx_projid = fa.fsx_projid;
> +	fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize;
> +
> +	if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat)))
> +		error = -EFAULT;
> +
> +	path_put(&filepath);
> +	return error;
> +}
> +
> +static int ioctl_fssetxattrat(struct file *file, void __user *argp)
> +{
> +	struct mnt_idmap *idmap = file_mnt_idmap(file);
> +	struct fsxattrat fsxat;
> +	struct path filepath;
> +	struct fileattr fa;
> +	int error;
> +
> +	if (!S_ISDIR(file_inode(file)->i_mode))
> +		return -EBADF;
> +
> +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> +		return -EFAULT;
> +
> +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> +	if (error)
> +		return error;
> +
> +	error = mnt_want_write(filepath.mnt);
> +	if (error) {
> +		path_put(&filepath);
> +		return error;

This could be a goto to the path_put below.

> +	}
> +
> +	fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags);
> +	fa.fsx_extsize = fsxat.fsx.fsx_extsize;
> +	fa.fsx_nextents = fsxat.fsx.fsx_nextents;
> +	fa.fsx_projid = fsxat.fsx.fsx_projid;
> +	fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize;
> +	fa.fsx_valid = true;
> +
> +	error = vfs_fileattr_set(idmap, filepath.dentry, &fa);

Why not pass &fsxat.fsx directly to vfs_fileattr_set?

> +	mnt_drop_write(filepath.mnt);
> +	path_put(&filepath);
> +	return error;
> +}
> +
>  static int ioctl_getfsuuid(struct file *file, void __user *argp)
>  {
>  	struct super_block *sb = file_inode(file)->i_sb;
> @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
>  	case FS_IOC_FSSETXATTR:
>  		return ioctl_fssetxattr(filp, argp);
>  
> +	case FS_IOC_FSGETXATTRAT:
> +		return ioctl_fsgetxattrat(filp, argp);
> +
> +	case FS_IOC_FSSETXATTRAT:
> +		return ioctl_fssetxattrat(filp, argp);
> +
>  	case FS_IOC_GETFSUUID:
>  		return ioctl_getfsuuid(filp, argp);
>  
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 45e4e64fd664..f8cd8d7bf35d 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -139,6 +139,15 @@ struct fsxattr {
>  	unsigned char	fsx_pad[8];
>  };
>  
> +/*
> + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT
> + */
> +struct fsxattrat {
> +	struct fsxattr	fsx;		/* XATTR to get/set */
> +	__u32		dfd;		/* parent dir */
> +	const char	__user *path;

As I mentioned last time[1], embedding a pointer in an ioctl structure
creates porting problems because pointer sizes vary between process
personalities, so the size of struct fsxattrat will vary and lead to
copy_to/from_user overflows.


[1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/

--D

> +};
> +
>  /*
>   * Flags for the fsx_xflags field
>   */
> @@ -231,6 +240,8 @@ struct fsxattr {
>  #define FS_IOC32_SETVERSION		_IOW('v', 2, int)
>  #define FS_IOC_FSGETXATTR		_IOR('X', 31, struct fsxattr)
>  #define FS_IOC_FSSETXATTR		_IOW('X', 32, struct fsxattr)
> +#define FS_IOC_FSGETXATTRAT		_IOR('X', 33, struct fsxattrat)
> +#define FS_IOC_FSSETXATTRAT		_IOW('X', 34, struct fsxattrat)
>  #define FS_IOC_GETFSLABEL		_IOR(0x94, 49, char[FSLABEL_MAX])
>  #define FS_IOC_SETFSLABEL		_IOW(0x94, 50, char[FSLABEL_MAX])
>  /* Returns the external filesystem UUID, the same one blkid returns */
> -- 
> 2.42.0
> 
>
Amir Goldstein May 20, 2024, 7:03 p.m. UTC | #2
On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
>
> XFS has project quotas which could be attached to a directory. All
> new inodes in these directories inherit project ID set on parent
> directory.
>
> The project is created from userspace by opening and calling
> FS_IOC_FSSETXATTR on each inode. This is not possible for special
> files such as FIFO, SOCK, BLK etc. as opening them returns a special
> inode from VFS. Therefore, some inodes are left with empty project
> ID. Those inodes then are not shown in the quota accounting but
> still exist in the directory.
>
> This patch adds two new ioctls which allows userspace, such as
> xfs_quota, to set project ID on special files by using parent
> directory to open FS inode. This will let xfs_quota set ID on all
> inodes and also reset it when project is removed. Also, as
> vfs_fileattr_set() is now will called on special files too, let's
> forbid any other attributes except projid and nextents (symlink can
> have one).
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
>  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/fs.h | 11 +++++
>  2 files changed, 104 insertions(+)
>
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 1d5abfdf0f22..3e3aacb6ea6e 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -22,6 +22,7 @@
>  #include <linux/mount.h>
>  #include <linux/fscrypt.h>
>  #include <linux/fileattr.h>
> +#include <linux/namei.h>
>
>  #include "internal.h"
>
> @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
>         if (fa->fsx_cowextsize == 0)
>                 fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
>
> +       /*
> +        * The only use case for special files is to set project ID, forbid any
> +        * other attributes
> +        */
> +       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> +               if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> +                       return -EINVAL;
> +               if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> +                       return -EINVAL;
> +               if (fa->fsx_extsize || fa->fsx_cowextsize)
> +                       return -EINVAL;
> +       }
> +
>         return 0;
>  }
>
> @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
>         return err;
>  }
>
> +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> +{
> +       struct path filepath;
> +       struct fsxattrat fsxat;
> +       struct fileattr fa;
> +       int error;
> +
> +       if (!S_ISDIR(file_inode(file)->i_mode))
> +               return -EBADF;

So the *only* thing that is done with the fd of the ioctl is to verify
that it is a directory fd - there is no verification that this fd is on the
same sb as the path to act on.

Was this the intention? It does not make a lot of sense to me
and AFAIK there is no precedent to an API like this.

There are ioctls that operate on the filesystem using any
fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH}
and maybe the closest example to what you are trying to add
XFS_IOC_BULKSTAT.

Trying to think of a saner API for this - perhaps pass an O_PATH
fd without any filename in struct fsxattrat, saving you also the
headache of passing a variable length string in an ioctl.

Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path
and file->f_path are on the same sb before proceeding to operate
on atfile->f_path.dentry.

The (maybe more sane) alternative is to add syscalls instead of
ioctls, but I won't force you to go there...

What do everyone think?

Thanks,
Amir.
Amir Goldstein May 20, 2024, 7:05 p.m. UTC | #3
[fix fsdevel address]

On Mon, May 20, 2024 at 10:03 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> >
> > XFS has project quotas which could be attached to a directory. All
> > new inodes in these directories inherit project ID set on parent
> > directory.
> >
> > The project is created from userspace by opening and calling
> > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > inode from VFS. Therefore, some inodes are left with empty project
> > ID. Those inodes then are not shown in the quota accounting but
> > still exist in the directory.
> >
> > This patch adds two new ioctls which allows userspace, such as
> > xfs_quota, to set project ID on special files by using parent
> > directory to open FS inode. This will let xfs_quota set ID on all
> > inodes and also reset it when project is removed. Also, as
> > vfs_fileattr_set() is now will called on special files too, let's
> > forbid any other attributes except projid and nextents (symlink can
> > have one).
> >
> > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > ---
> >  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/fs.h | 11 +++++
> >  2 files changed, 104 insertions(+)
> >
> > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > index 1d5abfdf0f22..3e3aacb6ea6e 100644
> > --- a/fs/ioctl.c
> > +++ b/fs/ioctl.c
> > @@ -22,6 +22,7 @@
> >  #include <linux/mount.h>
> >  #include <linux/fscrypt.h>
> >  #include <linux/fileattr.h>
> > +#include <linux/namei.h>
> >
> >  #include "internal.h"
> >
> > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
> >         if (fa->fsx_cowextsize == 0)
> >                 fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
> >
> > +       /*
> > +        * The only use case for special files is to set project ID, forbid any
> > +        * other attributes
> > +        */
> > +       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> > +               if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> > +                       return -EINVAL;
> > +               if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> > +                       return -EINVAL;
> > +               if (fa->fsx_extsize || fa->fsx_cowextsize)
> > +                       return -EINVAL;
> > +       }
> > +
> >         return 0;
> >  }
> >
> > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
> >         return err;
> >  }
> >
> > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> > +{
> > +       struct path filepath;
> > +       struct fsxattrat fsxat;
> > +       struct fileattr fa;
> > +       int error;
> > +
> > +       if (!S_ISDIR(file_inode(file)->i_mode))
> > +               return -EBADF;
>
> So the *only* thing that is done with the fd of the ioctl is to verify
> that it is a directory fd - there is no verification that this fd is on the
> same sb as the path to act on.
>
> Was this the intention? It does not make a lot of sense to me
> and AFAIK there is no precedent to an API like this.
>
> There are ioctls that operate on the filesystem using any
> fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH}
> and maybe the closest example to what you are trying to add
> XFS_IOC_BULKSTAT.
>
> Trying to think of a saner API for this - perhaps pass an O_PATH
> fd without any filename in struct fsxattrat, saving you also the
> headache of passing a variable length string in an ioctl.
>
> Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path
> and file->f_path are on the same sb before proceeding to operate
> on atfile->f_path.dentry.
>
> The (maybe more sane) alternative is to add syscalls instead of
> ioctls, but I won't force you to go there...
>
> What do everyone think?
>
> Thanks,
> Amir.
Andrey Albershteyn May 21, 2024, 10:52 a.m. UTC | #4
On 2024-05-20 10:51:59, Darrick J. Wong wrote:
> On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote:
> > XFS has project quotas which could be attached to a directory. All
> > new inodes in these directories inherit project ID set on parent
> > directory.
> > 
> > The project is created from userspace by opening and calling
> > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > inode from VFS. Therefore, some inodes are left with empty project
> > ID. Those inodes then are not shown in the quota accounting but
> > still exist in the directory.
> > 
> > This patch adds two new ioctls which allows userspace, such as
> > xfs_quota, to set project ID on special files by using parent
> > directory to open FS inode. This will let xfs_quota set ID on all
> > inodes and also reset it when project is removed. Also, as
> > vfs_fileattr_set() is now will called on special files too, let's
> > forbid any other attributes except projid and nextents (symlink can
> > have one).
> > 
> > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > ---
> >  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/fs.h | 11 +++++
> >  2 files changed, 104 insertions(+)
> > 
> > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > index 1d5abfdf0f22..3e3aacb6ea6e 100644
> > --- a/fs/ioctl.c
> > +++ b/fs/ioctl.c
> > @@ -22,6 +22,7 @@
> >  #include <linux/mount.h>
> >  #include <linux/fscrypt.h>
> >  #include <linux/fileattr.h>
> > +#include <linux/namei.h>
> >  
> >  #include "internal.h"
> >  
> > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
> >  	if (fa->fsx_cowextsize == 0)
> >  		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
> >  
> > +	/*
> > +	 * The only use case for special files is to set project ID, forbid any
> > +	 * other attributes
> > +	 */
> > +	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> > +		if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> 
> When would PROJINHERIT be set on a non-reg/non-dir file?

Oh I thought it's set on all inodes in projects anyway, but looks
like it's dropped for files in xfs_flags2diflags(). The xfs_quota
just set it for each inode so I just haven't checked it. I will drop
this check and change xfs_quota to not set PROJINHERIT for special
files.

> > +			return -EINVAL;
> > +		if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> 
> FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any
> other type of file, does it?

yes, it doesn't enforce anything, as I described in the comment I
tried to reject any other uses. But symlink has fsx_nextents == 1 so
get + set call will fail.

> 
> > +			return -EINVAL;
> > +		if (fa->fsx_extsize || fa->fsx_cowextsize)
> > +			return -EINVAL;
> > +	}
> > +
> >  	return 0;
> >  }
> >  
> > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
> >  	return err;
> >  }
> >  
> > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> > +{
> > +	struct path filepath;
> > +	struct fsxattrat fsxat;
> > +	struct fileattr fa;
> > +	int error;
> > +
> > +	if (!S_ISDIR(file_inode(file)->i_mode))
> > +		return -EBADF;
> > +
> > +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> > +		return -EFAULT;
> > +
> > +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> > +	if (error)
> > +		return error;
> > +
> > +	error = vfs_fileattr_get(filepath.dentry, &fa);
> > +	if (error) {
> > +		path_put(&filepath);
> > +		return error;
> > +	}
> > +
> > +	fsxat.fsx.fsx_xflags = fa.fsx_xflags;
> > +	fsxat.fsx.fsx_extsize = fa.fsx_extsize;
> > +	fsxat.fsx.fsx_nextents = fa.fsx_nextents;
> > +	fsxat.fsx.fsx_projid = fa.fsx_projid;
> > +	fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize;
> > +
> > +	if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat)))
> > +		error = -EFAULT;
> > +
> > +	path_put(&filepath);
> > +	return error;
> > +}
> > +
> > +static int ioctl_fssetxattrat(struct file *file, void __user *argp)
> > +{
> > +	struct mnt_idmap *idmap = file_mnt_idmap(file);
> > +	struct fsxattrat fsxat;
> > +	struct path filepath;
> > +	struct fileattr fa;
> > +	int error;
> > +
> > +	if (!S_ISDIR(file_inode(file)->i_mode))
> > +		return -EBADF;
> > +
> > +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> > +		return -EFAULT;
> > +
> > +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> > +	if (error)
> > +		return error;
> > +
> > +	error = mnt_want_write(filepath.mnt);
> > +	if (error) {
> > +		path_put(&filepath);
> > +		return error;
> 
> This could be a goto to the path_put below.
> 
> > +	}
> > +
> > +	fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags);
> > +	fa.fsx_extsize = fsxat.fsx.fsx_extsize;
> > +	fa.fsx_nextents = fsxat.fsx.fsx_nextents;
> > +	fa.fsx_projid = fsxat.fsx.fsx_projid;
> > +	fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize;
> > +	fa.fsx_valid = true;
> > +
> > +	error = vfs_fileattr_set(idmap, filepath.dentry, &fa);
> 
> Why not pass &fsxat.fsx directly to vfs_fileattr_set?

vfs_fileattr_set() takes fileattr and there won't be fsx_valid, no? I
suppose they aren't aligned

> 
> > +	mnt_drop_write(filepath.mnt);
> > +	path_put(&filepath);
> > +	return error;
> > +}
> > +
> >  static int ioctl_getfsuuid(struct file *file, void __user *argp)
> >  {
> >  	struct super_block *sb = file_inode(file)->i_sb;
> > @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
> >  	case FS_IOC_FSSETXATTR:
> >  		return ioctl_fssetxattr(filp, argp);
> >  
> > +	case FS_IOC_FSGETXATTRAT:
> > +		return ioctl_fsgetxattrat(filp, argp);
> > +
> > +	case FS_IOC_FSSETXATTRAT:
> > +		return ioctl_fssetxattrat(filp, argp);
> > +
> >  	case FS_IOC_GETFSUUID:
> >  		return ioctl_getfsuuid(filp, argp);
> >  
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index 45e4e64fd664..f8cd8d7bf35d 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -139,6 +139,15 @@ struct fsxattr {
> >  	unsigned char	fsx_pad[8];
> >  };
> >  
> > +/*
> > + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT
> > + */
> > +struct fsxattrat {
> > +	struct fsxattr	fsx;		/* XATTR to get/set */
> > +	__u32		dfd;		/* parent dir */
> > +	const char	__user *path;
> 
> As I mentioned last time[1], embedding a pointer in an ioctl structure
> creates porting problems because pointer sizes vary between process
> personalities, so the size of struct fsxattrat will vary and lead to
> copy_to/from_user overflows.

Oh right, sorry, totally forgot about that

> 
> 
> [1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/
> 
> --D
> 
> > +};
> > +
> >  /*
> >   * Flags for the fsx_xflags field
> >   */
> > @@ -231,6 +240,8 @@ struct fsxattr {
> >  #define FS_IOC32_SETVERSION		_IOW('v', 2, int)
> >  #define FS_IOC_FSGETXATTR		_IOR('X', 31, struct fsxattr)
> >  #define FS_IOC_FSSETXATTR		_IOW('X', 32, struct fsxattr)
> > +#define FS_IOC_FSGETXATTRAT		_IOR('X', 33, struct fsxattrat)
> > +#define FS_IOC_FSSETXATTRAT		_IOW('X', 34, struct fsxattrat)
> >  #define FS_IOC_GETFSLABEL		_IOR(0x94, 49, char[FSLABEL_MAX])
> >  #define FS_IOC_SETFSLABEL		_IOW(0x94, 50, char[FSLABEL_MAX])
> >  /* Returns the external filesystem UUID, the same one blkid returns */
> > -- 
> > 2.42.0
> > 
> > 
>
Christian Brauner May 21, 2024, 2:06 p.m. UTC | #5
On Mon, May 20, 2024 at 10:51:59AM -0700, Darrick J. Wong wrote:
> On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote:
> > XFS has project quotas which could be attached to a directory. All
> > new inodes in these directories inherit project ID set on parent
> > directory.
> > 
> > The project is created from userspace by opening and calling
> > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > inode from VFS. Therefore, some inodes are left with empty project
> > ID. Those inodes then are not shown in the quota accounting but
> > still exist in the directory.
> > 
> > This patch adds two new ioctls which allows userspace, such as
> > xfs_quota, to set project ID on special files by using parent
> > directory to open FS inode. This will let xfs_quota set ID on all
> > inodes and also reset it when project is removed. Also, as
> > vfs_fileattr_set() is now will called on special files too, let's
> > forbid any other attributes except projid and nextents (symlink can
> > have one).
> > 
> > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > ---
> >  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/fs.h | 11 +++++
> >  2 files changed, 104 insertions(+)
> > 
> > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > index 1d5abfdf0f22..3e3aacb6ea6e 100644
> > --- a/fs/ioctl.c
> > +++ b/fs/ioctl.c
> > @@ -22,6 +22,7 @@
> >  #include <linux/mount.h>
> >  #include <linux/fscrypt.h>
> >  #include <linux/fileattr.h>
> > +#include <linux/namei.h>
> >  
> >  #include "internal.h"
> >  
> > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
> >  	if (fa->fsx_cowextsize == 0)
> >  		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
> >  
> > +	/*
> > +	 * The only use case for special files is to set project ID, forbid any
> > +	 * other attributes
> > +	 */
> > +	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> > +		if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> 
> When would PROJINHERIT be set on a non-reg/non-dir file?
> 
> > +			return -EINVAL;
> > +		if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> 
> FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any
> other type of file, does it?
> 
> > +			return -EINVAL;
> > +		if (fa->fsx_extsize || fa->fsx_cowextsize)
> > +			return -EINVAL;
> > +	}
> > +
> >  	return 0;
> >  }
> >  
> > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
> >  	return err;
> >  }
> >  
> > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> > +{
> > +	struct path filepath;
> > +	struct fsxattrat fsxat;
> > +	struct fileattr fa;
> > +	int error;
> > +
> > +	if (!S_ISDIR(file_inode(file)->i_mode))
> > +		return -EBADF;
> > +
> > +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> > +		return -EFAULT;
> > +
> > +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> > +	if (error)
> > +		return error;
> > +
> > +	error = vfs_fileattr_get(filepath.dentry, &fa);
> > +	if (error) {
> > +		path_put(&filepath);
> > +		return error;
> > +	}
> > +
> > +	fsxat.fsx.fsx_xflags = fa.fsx_xflags;
> > +	fsxat.fsx.fsx_extsize = fa.fsx_extsize;
> > +	fsxat.fsx.fsx_nextents = fa.fsx_nextents;
> > +	fsxat.fsx.fsx_projid = fa.fsx_projid;
> > +	fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize;
> > +
> > +	if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat)))
> > +		error = -EFAULT;
> > +
> > +	path_put(&filepath);
> > +	return error;
> > +}
> > +
> > +static int ioctl_fssetxattrat(struct file *file, void __user *argp)
> > +{
> > +	struct mnt_idmap *idmap = file_mnt_idmap(file);
> > +	struct fsxattrat fsxat;
> > +	struct path filepath;
> > +	struct fileattr fa;
> > +	int error;
> > +
> > +	if (!S_ISDIR(file_inode(file)->i_mode))
> > +		return -EBADF;
> > +
> > +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> > +		return -EFAULT;
> > +
> > +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> > +	if (error)
> > +		return error;
> > +
> > +	error = mnt_want_write(filepath.mnt);
> > +	if (error) {
> > +		path_put(&filepath);
> > +		return error;
> 
> This could be a goto to the path_put below.

(Unrelated to content but we should probably grow cleanup guards for
path_get()/path_put() and mnt_want_write()/mnt_drop_write() then stuff
like this becomes a no-brainer.)

> 
> > +	}
> > +
> > +	fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags);
> > +	fa.fsx_extsize = fsxat.fsx.fsx_extsize;
> > +	fa.fsx_nextents = fsxat.fsx.fsx_nextents;
> > +	fa.fsx_projid = fsxat.fsx.fsx_projid;
> > +	fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize;
> > +	fa.fsx_valid = true;
> > +
> > +	error = vfs_fileattr_set(idmap, filepath.dentry, &fa);
> 
> Why not pass &fsxat.fsx directly to vfs_fileattr_set?
> 
> > +	mnt_drop_write(filepath.mnt);
> > +	path_put(&filepath);
> > +	return error;
> > +}
> > +
> >  static int ioctl_getfsuuid(struct file *file, void __user *argp)
> >  {
> >  	struct super_block *sb = file_inode(file)->i_sb;
> > @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
> >  	case FS_IOC_FSSETXATTR:
> >  		return ioctl_fssetxattr(filp, argp);
> >  
> > +	case FS_IOC_FSGETXATTRAT:
> > +		return ioctl_fsgetxattrat(filp, argp);
> > +
> > +	case FS_IOC_FSSETXATTRAT:
> > +		return ioctl_fssetxattrat(filp, argp);
> > +
> >  	case FS_IOC_GETFSUUID:
> >  		return ioctl_getfsuuid(filp, argp);
> >  
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index 45e4e64fd664..f8cd8d7bf35d 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -139,6 +139,15 @@ struct fsxattr {
> >  	unsigned char	fsx_pad[8];
> >  };
> >  
> > +/*
> > + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT
> > + */
> > +struct fsxattrat {
> > +	struct fsxattr	fsx;		/* XATTR to get/set */
> > +	__u32		dfd;		/* parent dir */
> > +	const char	__user *path;
> 
> As I mentioned last time[1], embedding a pointer in an ioctl structure
> creates porting problems because pointer sizes vary between process
> personalities, so the size of struct fsxattrat will vary and lead to
> copy_to/from_user overflows.
> 
> 
> [1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/

So as you've mentioned in that thread using a u64 or here an aligned_u64
makes the most sense. The kernel has all the necessary magic to deal
with this already (u64_to_user_ptr() helper etc.). If you want to be
extra-sure it's possible to slap a size for that path as well.

The ioctl structure can be versioned by size if it's 64bit aligned. The
ioctl code encodes the size of the struct and since the current
structure is all nicely 64bit aligned it should just work (tm).

kernel/seccomp.c does that size verisioning by ioctl as an example. Then
one can do:

#define EA_IOCTL(cmd)   ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))

        switch (EA_IOCTL(cmd)) {
        case EA_IOCTL(FS_IOC_FSGETXATTRAT):
	        ret = copy_struct_from_user(&kstruct, sizeof(kstruct), usstruct, _IOC_SIZE(cmd));

Which will do all the heavy lifting for you. To me that seems
workable but I might miss other problems you're thinking of.
Christian Brauner May 21, 2024, 2:19 p.m. UTC | #6
On Tue, May 21, 2024 at 04:06:15PM +0200, Christian Brauner wrote:
> On Mon, May 20, 2024 at 10:51:59AM -0700, Darrick J. Wong wrote:
> > On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote:
> > > XFS has project quotas which could be attached to a directory. All
> > > new inodes in these directories inherit project ID set on parent
> > > directory.
> > > 
> > > The project is created from userspace by opening and calling
> > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > inode from VFS. Therefore, some inodes are left with empty project
> > > ID. Those inodes then are not shown in the quota accounting but
> > > still exist in the directory.
> > > 
> > > This patch adds two new ioctls which allows userspace, such as
> > > xfs_quota, to set project ID on special files by using parent
> > > directory to open FS inode. This will let xfs_quota set ID on all
> > > inodes and also reset it when project is removed. Also, as
> > > vfs_fileattr_set() is now will called on special files too, let's
> > > forbid any other attributes except projid and nextents (symlink can
> > > have one).
> > > 
> > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > ---
> > >  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
> > >  include/uapi/linux/fs.h | 11 +++++
> > >  2 files changed, 104 insertions(+)
> > > 
> > > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > > index 1d5abfdf0f22..3e3aacb6ea6e 100644
> > > --- a/fs/ioctl.c
> > > +++ b/fs/ioctl.c
> > > @@ -22,6 +22,7 @@
> > >  #include <linux/mount.h>
> > >  #include <linux/fscrypt.h>
> > >  #include <linux/fileattr.h>
> > > +#include <linux/namei.h>
> > >  
> > >  #include "internal.h"
> > >  
> > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
> > >  	if (fa->fsx_cowextsize == 0)
> > >  		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
> > >  
> > > +	/*
> > > +	 * The only use case for special files is to set project ID, forbid any
> > > +	 * other attributes
> > > +	 */
> > > +	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> > > +		if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> > 
> > When would PROJINHERIT be set on a non-reg/non-dir file?
> > 
> > > +			return -EINVAL;
> > > +		if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> > 
> > FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any
> > other type of file, does it?
> > 
> > > +			return -EINVAL;
> > > +		if (fa->fsx_extsize || fa->fsx_cowextsize)
> > > +			return -EINVAL;
> > > +	}
> > > +
> > >  	return 0;
> > >  }
> > >  
> > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
> > >  	return err;
> > >  }
> > >  
> > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> > > +{
> > > +	struct path filepath;
> > > +	struct fsxattrat fsxat;
> > > +	struct fileattr fa;
> > > +	int error;
> > > +
> > > +	if (!S_ISDIR(file_inode(file)->i_mode))
> > > +		return -EBADF;
> > > +
> > > +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> > > +		return -EFAULT;
> > > +
> > > +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	error = vfs_fileattr_get(filepath.dentry, &fa);
> > > +	if (error) {
> > > +		path_put(&filepath);
> > > +		return error;
> > > +	}
> > > +
> > > +	fsxat.fsx.fsx_xflags = fa.fsx_xflags;
> > > +	fsxat.fsx.fsx_extsize = fa.fsx_extsize;
> > > +	fsxat.fsx.fsx_nextents = fa.fsx_nextents;
> > > +	fsxat.fsx.fsx_projid = fa.fsx_projid;
> > > +	fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize;
> > > +
> > > +	if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat)))
> > > +		error = -EFAULT;
> > > +
> > > +	path_put(&filepath);
> > > +	return error;
> > > +}
> > > +
> > > +static int ioctl_fssetxattrat(struct file *file, void __user *argp)
> > > +{
> > > +	struct mnt_idmap *idmap = file_mnt_idmap(file);
> > > +	struct fsxattrat fsxat;
> > > +	struct path filepath;
> > > +	struct fileattr fa;
> > > +	int error;
> > > +
> > > +	if (!S_ISDIR(file_inode(file)->i_mode))
> > > +		return -EBADF;
> > > +
> > > +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> > > +		return -EFAULT;
> > > +
> > > +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	error = mnt_want_write(filepath.mnt);
> > > +	if (error) {
> > > +		path_put(&filepath);
> > > +		return error;
> > 
> > This could be a goto to the path_put below.
> 
> (Unrelated to content but we should probably grow cleanup guards for
> path_get()/path_put() and mnt_want_write()/mnt_drop_write() then stuff
> like this becomes a no-brainer.)
> 
> > 
> > > +	}
> > > +
> > > +	fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags);
> > > +	fa.fsx_extsize = fsxat.fsx.fsx_extsize;
> > > +	fa.fsx_nextents = fsxat.fsx.fsx_nextents;
> > > +	fa.fsx_projid = fsxat.fsx.fsx_projid;
> > > +	fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize;
> > > +	fa.fsx_valid = true;
> > > +
> > > +	error = vfs_fileattr_set(idmap, filepath.dentry, &fa);
> > 
> > Why not pass &fsxat.fsx directly to vfs_fileattr_set?
> > 
> > > +	mnt_drop_write(filepath.mnt);
> > > +	path_put(&filepath);
> > > +	return error;
> > > +}
> > > +
> > >  static int ioctl_getfsuuid(struct file *file, void __user *argp)
> > >  {
> > >  	struct super_block *sb = file_inode(file)->i_sb;
> > > @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
> > >  	case FS_IOC_FSSETXATTR:
> > >  		return ioctl_fssetxattr(filp, argp);
> > >  
> > > +	case FS_IOC_FSGETXATTRAT:
> > > +		return ioctl_fsgetxattrat(filp, argp);
> > > +
> > > +	case FS_IOC_FSSETXATTRAT:
> > > +		return ioctl_fssetxattrat(filp, argp);
> > > +
> > >  	case FS_IOC_GETFSUUID:
> > >  		return ioctl_getfsuuid(filp, argp);
> > >  
> > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > index 45e4e64fd664..f8cd8d7bf35d 100644
> > > --- a/include/uapi/linux/fs.h
> > > +++ b/include/uapi/linux/fs.h
> > > @@ -139,6 +139,15 @@ struct fsxattr {
> > >  	unsigned char	fsx_pad[8];
> > >  };
> > >  
> > > +/*
> > > + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT
> > > + */
> > > +struct fsxattrat {
> > > +	struct fsxattr	fsx;		/* XATTR to get/set */
> > > +	__u32		dfd;		/* parent dir */
> > > +	const char	__user *path;
> > 
> > As I mentioned last time[1], embedding a pointer in an ioctl structure
> > creates porting problems because pointer sizes vary between process
> > personalities, so the size of struct fsxattrat will vary and lead to
> > copy_to/from_user overflows.
> > 
> > 
> > [1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/
> 
> So as you've mentioned in that thread using a u64 or here an aligned_u64
> makes the most sense. The kernel has all the necessary magic to deal
> with this already (u64_to_user_ptr() helper etc.). If you want to be
> extra-sure it's possible to slap a size for that path as well.
> 
> The ioctl structure can be versioned by size if it's 64bit aligned. The
> ioctl code encodes the size of the struct and since the current
> structure is all nicely 64bit aligned it should just work (tm).
> 
> kernel/seccomp.c does that size verisioning by ioctl as an example. Then
> one can do:
> 
> #define EA_IOCTL(cmd)   ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
> 
>         switch (EA_IOCTL(cmd)) {
>         case EA_IOCTL(FS_IOC_FSGETXATTRAT):
> 	        ret = copy_struct_from_user(&kstruct, sizeof(kstruct), usstruct, _IOC_SIZE(cmd));
> 
> Which will do all the heavy lifting for you. To me that seems
> workable but I might miss other problems you're thinking of.

Resending with fixed fsdevel address...
Darrick J. Wong May 21, 2024, 3:36 p.m. UTC | #7
On Tue, May 21, 2024 at 04:19:40PM +0200, Christian Brauner wrote:
> On Tue, May 21, 2024 at 04:06:15PM +0200, Christian Brauner wrote:
> > On Mon, May 20, 2024 at 10:51:59AM -0700, Darrick J. Wong wrote:
> > > On Mon, May 20, 2024 at 06:46:21PM +0200, Andrey Albershteyn wrote:
> > > > XFS has project quotas which could be attached to a directory. All
> > > > new inodes in these directories inherit project ID set on parent
> > > > directory.
> > > > 
> > > > The project is created from userspace by opening and calling
> > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > ID. Those inodes then are not shown in the quota accounting but
> > > > still exist in the directory.
> > > > 
> > > > This patch adds two new ioctls which allows userspace, such as
> > > > xfs_quota, to set project ID on special files by using parent
> > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > inodes and also reset it when project is removed. Also, as
> > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > forbid any other attributes except projid and nextents (symlink can
> > > > have one).
> > > > 
> > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > ---
> > > >  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
> > > >  include/uapi/linux/fs.h | 11 +++++
> > > >  2 files changed, 104 insertions(+)
> > > > 
> > > > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > > > index 1d5abfdf0f22..3e3aacb6ea6e 100644
> > > > --- a/fs/ioctl.c
> > > > +++ b/fs/ioctl.c
> > > > @@ -22,6 +22,7 @@
> > > >  #include <linux/mount.h>
> > > >  #include <linux/fscrypt.h>
> > > >  #include <linux/fileattr.h>
> > > > +#include <linux/namei.h>
> > > >  
> > > >  #include "internal.h"
> > > >  
> > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
> > > >  	if (fa->fsx_cowextsize == 0)
> > > >  		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
> > > >  
> > > > +	/*
> > > > +	 * The only use case for special files is to set project ID, forbid any
> > > > +	 * other attributes
> > > > +	 */
> > > > +	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> > > > +		if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> > > 
> > > When would PROJINHERIT be set on a non-reg/non-dir file?
> > > 
> > > > +			return -EINVAL;
> > > > +		if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> > > 
> > > FS_IOC_FSSETXATTR doesn't enforce anything for fsx_nextents for any
> > > other type of file, does it?
> > > 
> > > > +			return -EINVAL;
> > > > +		if (fa->fsx_extsize || fa->fsx_cowextsize)
> > > > +			return -EINVAL;
> > > > +	}
> > > > +
> > > >  	return 0;
> > > >  }
> > > >  
> > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
> > > >  	return err;
> > > >  }
> > > >  
> > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> > > > +{
> > > > +	struct path filepath;
> > > > +	struct fsxattrat fsxat;
> > > > +	struct fileattr fa;
> > > > +	int error;
> > > > +
> > > > +	if (!S_ISDIR(file_inode(file)->i_mode))
> > > > +		return -EBADF;
> > > > +
> > > > +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> > > > +		return -EFAULT;
> > > > +
> > > > +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	error = vfs_fileattr_get(filepath.dentry, &fa);
> > > > +	if (error) {
> > > > +		path_put(&filepath);
> > > > +		return error;
> > > > +	}
> > > > +
> > > > +	fsxat.fsx.fsx_xflags = fa.fsx_xflags;
> > > > +	fsxat.fsx.fsx_extsize = fa.fsx_extsize;
> > > > +	fsxat.fsx.fsx_nextents = fa.fsx_nextents;
> > > > +	fsxat.fsx.fsx_projid = fa.fsx_projid;
> > > > +	fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize;
> > > > +
> > > > +	if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat)))
> > > > +		error = -EFAULT;
> > > > +
> > > > +	path_put(&filepath);
> > > > +	return error;
> > > > +}
> > > > +
> > > > +static int ioctl_fssetxattrat(struct file *file, void __user *argp)
> > > > +{
> > > > +	struct mnt_idmap *idmap = file_mnt_idmap(file);
> > > > +	struct fsxattrat fsxat;
> > > > +	struct path filepath;
> > > > +	struct fileattr fa;
> > > > +	int error;
> > > > +
> > > > +	if (!S_ISDIR(file_inode(file)->i_mode))
> > > > +		return -EBADF;
> > > > +
> > > > +	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
> > > > +		return -EFAULT;
> > > > +
> > > > +	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	error = mnt_want_write(filepath.mnt);
> > > > +	if (error) {
> > > > +		path_put(&filepath);
> > > > +		return error;
> > > 
> > > This could be a goto to the path_put below.
> > 
> > (Unrelated to content but we should probably grow cleanup guards for
> > path_get()/path_put() and mnt_want_write()/mnt_drop_write() then stuff
> > like this becomes a no-brainer.)
> > 
> > > 
> > > > +	}
> > > > +
> > > > +	fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags);
> > > > +	fa.fsx_extsize = fsxat.fsx.fsx_extsize;
> > > > +	fa.fsx_nextents = fsxat.fsx.fsx_nextents;
> > > > +	fa.fsx_projid = fsxat.fsx.fsx_projid;
> > > > +	fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize;
> > > > +	fa.fsx_valid = true;
> > > > +
> > > > +	error = vfs_fileattr_set(idmap, filepath.dentry, &fa);
> > > 
> > > Why not pass &fsxat.fsx directly to vfs_fileattr_set?
> > > 
> > > > +	mnt_drop_write(filepath.mnt);
> > > > +	path_put(&filepath);
> > > > +	return error;
> > > > +}
> > > > +
> > > >  static int ioctl_getfsuuid(struct file *file, void __user *argp)
> > > >  {
> > > >  	struct super_block *sb = file_inode(file)->i_sb;
> > > > @@ -872,6 +959,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
> > > >  	case FS_IOC_FSSETXATTR:
> > > >  		return ioctl_fssetxattr(filp, argp);
> > > >  
> > > > +	case FS_IOC_FSGETXATTRAT:
> > > > +		return ioctl_fsgetxattrat(filp, argp);
> > > > +
> > > > +	case FS_IOC_FSSETXATTRAT:
> > > > +		return ioctl_fssetxattrat(filp, argp);
> > > > +
> > > >  	case FS_IOC_GETFSUUID:
> > > >  		return ioctl_getfsuuid(filp, argp);
> > > >  
> > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > > index 45e4e64fd664..f8cd8d7bf35d 100644
> > > > --- a/include/uapi/linux/fs.h
> > > > +++ b/include/uapi/linux/fs.h
> > > > @@ -139,6 +139,15 @@ struct fsxattr {
> > > >  	unsigned char	fsx_pad[8];
> > > >  };
> > > >  
> > > > +/*
> > > > + * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT
> > > > + */
> > > > +struct fsxattrat {
> > > > +	struct fsxattr	fsx;		/* XATTR to get/set */
> > > > +	__u32		dfd;		/* parent dir */
> > > > +	const char	__user *path;
> > > 
> > > As I mentioned last time[1], embedding a pointer in an ioctl structure
> > > creates porting problems because pointer sizes vary between process
> > > personalities, so the size of struct fsxattrat will vary and lead to
> > > copy_to/from_user overflows.
> > > 
> > > 
> > > [1] https://lore.kernel.org/linux-xfs/20240509232517.GR360919@frogsfrogsfrogs/
> > 
> > So as you've mentioned in that thread using a u64 or here an aligned_u64
> > makes the most sense. The kernel has all the necessary magic to deal
> > with this already (u64_to_user_ptr() helper etc.). If you want to be
> > extra-sure it's possible to slap a size for that path as well.

Ooh I didn't know that existed, cleanup patch for xfs on its way!

> > The ioctl structure can be versioned by size if it's 64bit aligned. The
> > ioctl code encodes the size of the struct and since the current
> > structure is all nicely 64bit aligned it should just work (tm).

I've tried to merge variations-on-a-theme ioctls with the same @nr and
different @size, but every time I've been asked not to do that.  I've
never understood why, since _IO[RW]* has worked that way on Linux
forever, and BUILD_BUG_ON can be employed here to guard against
collisions.

The point is, I don't think xfs developers have adopted this particular
habit and I'd love to know why.

> > kernel/seccomp.c does that size verisioning by ioctl as an example. Then
> > one can do:
> > 
> > #define EA_IOCTL(cmd)   ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
> > 
> >         switch (EA_IOCTL(cmd)) {
> >         case EA_IOCTL(FS_IOC_FSGETXATTRAT):
> > 	        ret = copy_struct_from_user(&kstruct, sizeof(kstruct), usstruct, _IOC_SIZE(cmd));
> > 
> > Which will do all the heavy lifting for you. To me that seems
> > workable but I might miss other problems you're thinking of.
> 
> Resending with fixed fsdevel address...

Yes, please fix that for v3 as well.

--D
Andrey Albershteyn May 21, 2024, 4:34 p.m. UTC | #8
On 2024-05-20 22:03:43, Amir Goldstein wrote:
> On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> >
> > XFS has project quotas which could be attached to a directory. All
> > new inodes in these directories inherit project ID set on parent
> > directory.
> >
> > The project is created from userspace by opening and calling
> > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > inode from VFS. Therefore, some inodes are left with empty project
> > ID. Those inodes then are not shown in the quota accounting but
> > still exist in the directory.
> >
> > This patch adds two new ioctls which allows userspace, such as
> > xfs_quota, to set project ID on special files by using parent
> > directory to open FS inode. This will let xfs_quota set ID on all
> > inodes and also reset it when project is removed. Also, as
> > vfs_fileattr_set() is now will called on special files too, let's
> > forbid any other attributes except projid and nextents (symlink can
> > have one).
> >
> > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > ---
> >  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/fs.h | 11 +++++
> >  2 files changed, 104 insertions(+)
> >
> > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > index 1d5abfdf0f22..3e3aacb6ea6e 100644
> > --- a/fs/ioctl.c
> > +++ b/fs/ioctl.c
> > @@ -22,6 +22,7 @@
> >  #include <linux/mount.h>
> >  #include <linux/fscrypt.h>
> >  #include <linux/fileattr.h>
> > +#include <linux/namei.h>
> >
> >  #include "internal.h"
> >
> > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
> >         if (fa->fsx_cowextsize == 0)
> >                 fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
> >
> > +       /*
> > +        * The only use case for special files is to set project ID, forbid any
> > +        * other attributes
> > +        */
> > +       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> > +               if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> > +                       return -EINVAL;
> > +               if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> > +                       return -EINVAL;
> > +               if (fa->fsx_extsize || fa->fsx_cowextsize)
> > +                       return -EINVAL;
> > +       }
> > +
> >         return 0;
> >  }
> >
> > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
> >         return err;
> >  }
> >
> > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> > +{
> > +       struct path filepath;
> > +       struct fsxattrat fsxat;
> > +       struct fileattr fa;
> > +       int error;
> > +
> > +       if (!S_ISDIR(file_inode(file)->i_mode))
> > +               return -EBADF;
> 
> So the *only* thing that is done with the fd of the ioctl is to verify
> that it is a directory fd - there is no verification that this fd is on the
> same sb as the path to act on.
> 
> Was this the intention? It does not make a lot of sense to me
> and AFAIK there is no precedent to an API like this.

yeah, as we want to set xattrs on that inode the path is pointing
to, so, VFS will call to the FS under that sb.

> 
> There are ioctls that operate on the filesystem using any
> fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH}
> and maybe the closest example to what you are trying to add
> XFS_IOC_BULKSTAT.

Not sure that I get what you mean here, the *at() part is to get
around VFS special inodes and call vfs_fileattr_set/get on FS inodes.

> 
> Trying to think of a saner API for this - perhaps pass an O_PATH
> fd without any filename in struct fsxattrat, saving you also the
> headache of passing a variable length string in an ioctl.
> 
> Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path
> and file->f_path are on the same sb before proceeding to operate
> on atfile->f_path.dentry.

Thanks! Didn't know about O_PATH that seems to be a way to get rid
of the path passing.

> 
> The (maybe more sane) alternative is to add syscalls instead of
> ioctls, but I won't force you to go there...
> 
> What do everyone think?
> 
> Thanks,
> Amir.
>
Amir Goldstein May 21, 2024, 6:22 p.m. UTC | #9
On Tue, May 21, 2024 at 7:34 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
>
> On 2024-05-20 22:03:43, Amir Goldstein wrote:
> > On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> > >
> > > XFS has project quotas which could be attached to a directory. All
> > > new inodes in these directories inherit project ID set on parent
> > > directory.
> > >
> > > The project is created from userspace by opening and calling
> > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > inode from VFS. Therefore, some inodes are left with empty project
> > > ID. Those inodes then are not shown in the quota accounting but
> > > still exist in the directory.
> > >
> > > This patch adds two new ioctls which allows userspace, such as
> > > xfs_quota, to set project ID on special files by using parent
> > > directory to open FS inode. This will let xfs_quota set ID on all
> > > inodes and also reset it when project is removed. Also, as
> > > vfs_fileattr_set() is now will called on special files too, let's
> > > forbid any other attributes except projid and nextents (symlink can
> > > have one).
> > >
> > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > ---
> > >  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
> > >  include/uapi/linux/fs.h | 11 +++++
> > >  2 files changed, 104 insertions(+)
> > >
> > > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > > index 1d5abfdf0f22..3e3aacb6ea6e 100644
> > > --- a/fs/ioctl.c
> > > +++ b/fs/ioctl.c
> > > @@ -22,6 +22,7 @@
> > >  #include <linux/mount.h>
> > >  #include <linux/fscrypt.h>
> > >  #include <linux/fileattr.h>
> > > +#include <linux/namei.h>
> > >
> > >  #include "internal.h"
> > >
> > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
> > >         if (fa->fsx_cowextsize == 0)
> > >                 fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
> > >
> > > +       /*
> > > +        * The only use case for special files is to set project ID, forbid any
> > > +        * other attributes
> > > +        */
> > > +       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> > > +               if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> > > +                       return -EINVAL;
> > > +               if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> > > +                       return -EINVAL;
> > > +               if (fa->fsx_extsize || fa->fsx_cowextsize)
> > > +                       return -EINVAL;
> > > +       }
> > > +
> > >         return 0;
> > >  }
> > >
> > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
> > >         return err;
> > >  }
> > >
> > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> > > +{
> > > +       struct path filepath;
> > > +       struct fsxattrat fsxat;
> > > +       struct fileattr fa;
> > > +       int error;
> > > +
> > > +       if (!S_ISDIR(file_inode(file)->i_mode))
> > > +               return -EBADF;
> >
> > So the *only* thing that is done with the fd of the ioctl is to verify
> > that it is a directory fd - there is no verification that this fd is on the
> > same sb as the path to act on.
> >
> > Was this the intention? It does not make a lot of sense to me
> > and AFAIK there is no precedent to an API like this.
>
> yeah, as we want to set xattrs on that inode the path is pointing
> to, so, VFS will call to the FS under that sb.
>
> >
> > There are ioctls that operate on the filesystem using any
> > fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH}
> > and maybe the closest example to what you are trying to add
> > XFS_IOC_BULKSTAT.
>
> Not sure that I get what you mean here, the *at() part is to get
> around VFS special inodes and call vfs_fileattr_set/get on FS inodes.
>

My point was that with your proposed API the fd argument to
ioctl() can be a directory from a completely arbitrary filesystem
with nothing to do with the filesystem where fsxat.dfd is from
and that makes very little sense from API POV.

> >
> > Trying to think of a saner API for this - perhaps pass an O_PATH
> > fd without any filename in struct fsxattrat, saving you also the
> > headache of passing a variable length string in an ioctl.
> >
> > Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path
> > and file->f_path are on the same sb before proceeding to operate
> > on atfile->f_path.dentry.
>
> Thanks! Didn't know about O_PATH that seems to be a way to get rid
> of the path passing.
>

I found one precedent of this pattern with XFS_IOC_FD_TO_HANDLE,
but keep in mind that this is quite an old legacy XFS API.

This ioctl is performed on an "fshandle", which is an open fd to any
object in the filesystem, but typically the mount root dir.
The ioctl gets a structure xfs_fsop_handlereq_t which contains an
fd member pointing to another object within an XFS filesystem.

Actually, AFAICS, this code does not verify that the object and fshandle
are on the same XFS filesystem, but only that both are on XFS filesystems:

        /*
         * We can only generate handles for inodes residing on a XFS filesystem,
         * and only for regular files, directories or symbolic links.
         */
        error = -EINVAL;
        if (inode->i_sb->s_magic != XFS_SB_MAGIC)
                goto out_put;

I don't know what's the best thing to do is, but I think that verifying
that the ioctl fd and the O_PATH fd are on the same sb is the least
controversial option for the first version - if needed that could be
relaxed later on.

Another alternative which is simpler from API POV would be to allow
selective ioctl() commands on an O_PATH fd, but I think that is going
to be more controversial.

Something along those lines (completely untested):

diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 45e4e64fd664..562f8bff91d2 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -241,6 +241,13 @@ struct fsxattr {
  */
 #define FS_IOC_GETFSSYSFSPATH          _IOR(0x15, 1, struct fs_sysfs_path)

+#define _IOC_AT                                (0x100)
+#define FS_IOC_AT(nr)                  (_IOC_TYPE(nr) == _IOC_AT)
+
+/* The following ioctls can be operated on an O_PATH fd */
+#define FS_IOC_FSGETXATTRAT            _IOR(_IOC_AT, 31, struct fsxattr)
+#define FS_IOC_FSSETXATTRAT            _IOW(_IOC_AT, 32, struct fsxattr)
+
 /*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
  *
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1d5abfdf0f22..f720500c705b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -867,9 +867,11 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
                return ioctl_setflags(filp, argp);

        case FS_IOC_FSGETXATTR:
+       case FS_IOC_FSGETXATTRAT:
                return ioctl_fsgetxattr(filp, argp);

        case FS_IOC_FSSETXATTR:
+       case FS_IOC_FSSETXATTRAT:
                return ioctl_fssetxattr(filp, argp);

        case FS_IOC_GETFSUUID:
@@ -879,7 +881,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
                return ioctl_get_fs_sysfs_path(filp, argp);

        default:
-               if (S_ISREG(inode->i_mode))
+               if (!FS_IOC_AT(cmd) && S_ISREG(inode->i_mode))
                        return file_ioctl(filp, cmd, argp);
                break;
        }
@@ -889,7 +891,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,

 SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {
-       struct fd f = fdget(fd);
+       bool ioc_at = FS_IOC_AT(cmd);
+       struct fd f = ioc_at ? fdget_raw(fd) : fdget(fd);
        int error;

        if (!f.file)
@@ -900,7 +903,7 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned
int, cmd, unsigned long, arg)
                goto out;

        error = do_vfs_ioctl(f.file, fd, cmd, arg);
-       if (error == -ENOIOCTLCMD)
+       if (!ioc_at && error == -ENOIOCTLCMD)
                error = vfs_ioctl(f.file, cmd, arg);

---

Thanks,
Amir.
Jan Kara May 22, 2024, 10 a.m. UTC | #10
Hello!

On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> XFS has project quotas which could be attached to a directory. All
> new inodes in these directories inherit project ID set on parent
> directory.
> 
> The project is created from userspace by opening and calling
> FS_IOC_FSSETXATTR on each inode. This is not possible for special
> files such as FIFO, SOCK, BLK etc. as opening them returns a special
> inode from VFS. Therefore, some inodes are left with empty project
> ID. Those inodes then are not shown in the quota accounting but
> still exist in the directory.
> 
> This patch adds two new ioctls which allows userspace, such as
> xfs_quota, to set project ID on special files by using parent
> directory to open FS inode. This will let xfs_quota set ID on all
> inodes and also reset it when project is removed. Also, as
> vfs_fileattr_set() is now will called on special files too, let's
> forbid any other attributes except projid and nextents (symlink can
> have one).
> 
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>

I'd like to understand one thing. Is it practically useful to set project
IDs for special inodes? There is no significant disk space usage associated
with them so wrt quotas we are speaking only about the inode itself. So is
the concern that user could escape inode project quota accounting and
perform some DoS? Or why do we bother with two new somewhat hairy ioctls
for something that seems as a small corner case to me?

								Honza
Andrey Albershteyn May 22, 2024, 10:45 a.m. UTC | #11
Hi,

On 2024-05-22 12:00:07, Jan Kara wrote:
> Hello!
> 
> On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > XFS has project quotas which could be attached to a directory. All
> > new inodes in these directories inherit project ID set on parent
> > directory.
> > 
> > The project is created from userspace by opening and calling
> > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > inode from VFS. Therefore, some inodes are left with empty project
> > ID. Those inodes then are not shown in the quota accounting but
> > still exist in the directory.
> > 
> > This patch adds two new ioctls which allows userspace, such as
> > xfs_quota, to set project ID on special files by using parent
> > directory to open FS inode. This will let xfs_quota set ID on all
> > inodes and also reset it when project is removed. Also, as
> > vfs_fileattr_set() is now will called on special files too, let's
> > forbid any other attributes except projid and nextents (symlink can
> > have one).
> > 
> > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> 
> I'd like to understand one thing. Is it practically useful to set project
> IDs for special inodes? There is no significant disk space usage associated
> with them so wrt quotas we are speaking only about the inode itself. So is
> the concern that user could escape inode project quota accounting and
> perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> for something that seems as a small corner case to me?

So there's few things:
- Quota accounting is missing only some special files. Special files
  created after quota project is setup inherit ID from the project
  directory.
- For special files created after the project is setup there's no
  way to make them project-less. Therefore, creating a new project
  over those will fail due to project ID miss match.
- It wasn't possible to hardlink/rename project-less special files
  inside a project due to ID miss match. The linking is fixed, and
  renaming is worked around in first patch.

The initial report I got was about second and last point, an
application was failing to create a new project after "restart" and
wasn't able to link special files created beforehand.
Andrey Albershteyn May 22, 2024, 2:58 p.m. UTC | #12
On 2024-05-21 21:22:41, Amir Goldstein wrote:
> On Tue, May 21, 2024 at 7:34 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> >
> > On 2024-05-20 22:03:43, Amir Goldstein wrote:
> > > On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> > > >
> > > > XFS has project quotas which could be attached to a directory. All
> > > > new inodes in these directories inherit project ID set on parent
> > > > directory.
> > > >
> > > > The project is created from userspace by opening and calling
> > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > ID. Those inodes then are not shown in the quota accounting but
> > > > still exist in the directory.
> > > >
> > > > This patch adds two new ioctls which allows userspace, such as
> > > > xfs_quota, to set project ID on special files by using parent
> > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > inodes and also reset it when project is removed. Also, as
> > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > forbid any other attributes except projid and nextents (symlink can
> > > > have one).
> > > >
> > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > ---
> > > >  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
> > > >  include/uapi/linux/fs.h | 11 +++++
> > > >  2 files changed, 104 insertions(+)
> > > >
> > > > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > > > index 1d5abfdf0f22..3e3aacb6ea6e 100644
> > > > --- a/fs/ioctl.c
> > > > +++ b/fs/ioctl.c
> > > > @@ -22,6 +22,7 @@
> > > >  #include <linux/mount.h>
> > > >  #include <linux/fscrypt.h>
> > > >  #include <linux/fileattr.h>
> > > > +#include <linux/namei.h>
> > > >
> > > >  #include "internal.h"
> > > >
> > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
> > > >         if (fa->fsx_cowextsize == 0)
> > > >                 fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
> > > >
> > > > +       /*
> > > > +        * The only use case for special files is to set project ID, forbid any
> > > > +        * other attributes
> > > > +        */
> > > > +       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> > > > +               if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> > > > +                       return -EINVAL;
> > > > +               if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> > > > +                       return -EINVAL;
> > > > +               if (fa->fsx_extsize || fa->fsx_cowextsize)
> > > > +                       return -EINVAL;
> > > > +       }
> > > > +
> > > >         return 0;
> > > >  }
> > > >
> > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
> > > >         return err;
> > > >  }
> > > >
> > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> > > > +{
> > > > +       struct path filepath;
> > > > +       struct fsxattrat fsxat;
> > > > +       struct fileattr fa;
> > > > +       int error;
> > > > +
> > > > +       if (!S_ISDIR(file_inode(file)->i_mode))
> > > > +               return -EBADF;
> > >
> > > So the *only* thing that is done with the fd of the ioctl is to verify
> > > that it is a directory fd - there is no verification that this fd is on the
> > > same sb as the path to act on.
> > >
> > > Was this the intention? It does not make a lot of sense to me
> > > and AFAIK there is no precedent to an API like this.
> >
> > yeah, as we want to set xattrs on that inode the path is pointing
> > to, so, VFS will call to the FS under that sb.
> >
> > >
> > > There are ioctls that operate on the filesystem using any
> > > fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH}
> > > and maybe the closest example to what you are trying to add
> > > XFS_IOC_BULKSTAT.
> >
> > Not sure that I get what you mean here, the *at() part is to get
> > around VFS special inodes and call vfs_fileattr_set/get on FS inodes.
> >
> 
> My point was that with your proposed API the fd argument to
> ioctl() can be a directory from a completely arbitrary filesystem
> with nothing to do with the filesystem where fsxat.dfd is from
> and that makes very little sense from API POV.

I see, yeah, I can add a check for this, for xfs_quota usecase it's
fine as it doesn't follow cross mounts.

> 
> > >
> > > Trying to think of a saner API for this - perhaps pass an O_PATH
> > > fd without any filename in struct fsxattrat, saving you also the
> > > headache of passing a variable length string in an ioctl.
> > >
> > > Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path
> > > and file->f_path are on the same sb before proceeding to operate
> > > on atfile->f_path.dentry.
> >
> > Thanks! Didn't know about O_PATH that seems to be a way to get rid
> > of the path passing.
> >
> 
> I found one precedent of this pattern with XFS_IOC_FD_TO_HANDLE,
> but keep in mind that this is quite an old legacy XFS API.
> 
> This ioctl is performed on an "fshandle", which is an open fd to any
> object in the filesystem, but typically the mount root dir.
> The ioctl gets a structure xfs_fsop_handlereq_t which contains an
> fd member pointing to another object within an XFS filesystem.
> 
> Actually, AFAICS, this code does not verify that the object and fshandle
> are on the same XFS filesystem, but only that both are on XFS filesystems:
> 
>         /*
>          * We can only generate handles for inodes residing on a XFS filesystem,
>          * and only for regular files, directories or symbolic links.
>          */
>         error = -EINVAL;
>         if (inode->i_sb->s_magic != XFS_SB_MAGIC)
>                 goto out_put;
> 
> I don't know what's the best thing to do is, but I think that verifying
> that the ioctl fd and the O_PATH fd are on the same sb is the least
> controversial option for the first version - if needed that could be
> relaxed later on.
> 
> Another alternative which is simpler from API POV would be to allow
> selective ioctl() commands on an O_PATH fd, but I think that is going
> to be more controversial.
> 
> Something along those lines (completely untested):
> 
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 45e4e64fd664..562f8bff91d2 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -241,6 +241,13 @@ struct fsxattr {
>   */
>  #define FS_IOC_GETFSSYSFSPATH          _IOR(0x15, 1, struct fs_sysfs_path)
> 
> +#define _IOC_AT                                (0x100)
> +#define FS_IOC_AT(nr)                  (_IOC_TYPE(nr) == _IOC_AT)
> +
> +/* The following ioctls can be operated on an O_PATH fd */
> +#define FS_IOC_FSGETXATTRAT            _IOR(_IOC_AT, 31, struct fsxattr)
> +#define FS_IOC_FSSETXATTRAT            _IOW(_IOC_AT, 32, struct fsxattr)
> +
>  /*
>   * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
>   *
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 1d5abfdf0f22..f720500c705b 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -867,9 +867,11 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
>                 return ioctl_setflags(filp, argp);
> 
>         case FS_IOC_FSGETXATTR:
> +       case FS_IOC_FSGETXATTRAT:
>                 return ioctl_fsgetxattr(filp, argp);
> 
>         case FS_IOC_FSSETXATTR:
> +       case FS_IOC_FSSETXATTRAT:
>                 return ioctl_fssetxattr(filp, argp);
> 
>         case FS_IOC_GETFSUUID:
> @@ -879,7 +881,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
>                 return ioctl_get_fs_sysfs_path(filp, argp);
> 
>         default:
> -               if (S_ISREG(inode->i_mode))
> +               if (!FS_IOC_AT(cmd) && S_ISREG(inode->i_mode))
>                         return file_ioctl(filp, cmd, argp);
>                 break;
>         }
> @@ -889,7 +891,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
> 
>  SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
>  {
> -       struct fd f = fdget(fd);
> +       bool ioc_at = FS_IOC_AT(cmd);
> +       struct fd f = ioc_at ? fdget_raw(fd) : fdget(fd);
>         int error;
> 
>         if (!f.file)
> @@ -900,7 +903,7 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned
> int, cmd, unsigned long, arg)
>                 goto out;
> 
>         error = do_vfs_ioctl(f.file, fd, cmd, arg);
> -       if (error == -ENOIOCTLCMD)
> +       if (!ioc_at && error == -ENOIOCTLCMD)
>                 error = vfs_ioctl(f.file, cmd, arg);
> 
> ---
> 
> Thanks,
> Amir.
>
Darrick J. Wong May 22, 2024, 4:28 p.m. UTC | #13
On Wed, May 22, 2024 at 04:58:31PM +0200, Andrey Albershteyn wrote:
> On 2024-05-21 21:22:41, Amir Goldstein wrote:
> > On Tue, May 21, 2024 at 7:34 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> > >
> > > On 2024-05-20 22:03:43, Amir Goldstein wrote:
> > > > On Mon, May 20, 2024 at 7:46 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> > > > >
> > > > > XFS has project quotas which could be attached to a directory. All
> > > > > new inodes in these directories inherit project ID set on parent
> > > > > directory.
> > > > >
> > > > > The project is created from userspace by opening and calling
> > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > > ID. Those inodes then are not shown in the quota accounting but
> > > > > still exist in the directory.
> > > > >
> > > > > This patch adds two new ioctls which allows userspace, such as
> > > > > xfs_quota, to set project ID on special files by using parent
> > > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > > inodes and also reset it when project is removed. Also, as
> > > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > > forbid any other attributes except projid and nextents (symlink can
> > > > > have one).
> > > > >
> > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > > ---
> > > > >  fs/ioctl.c              | 93 +++++++++++++++++++++++++++++++++++++++++
> > > > >  include/uapi/linux/fs.h | 11 +++++
> > > > >  2 files changed, 104 insertions(+)
> > > > >
> > > > > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > > > > index 1d5abfdf0f22..3e3aacb6ea6e 100644
> > > > > --- a/fs/ioctl.c
> > > > > +++ b/fs/ioctl.c
> > > > > @@ -22,6 +22,7 @@
> > > > >  #include <linux/mount.h>
> > > > >  #include <linux/fscrypt.h>
> > > > >  #include <linux/fileattr.h>
> > > > > +#include <linux/namei.h>
> > > > >
> > > > >  #include "internal.h"
> > > > >
> > > > > @@ -647,6 +648,19 @@ static int fileattr_set_prepare(struct inode *inode,
> > > > >         if (fa->fsx_cowextsize == 0)
> > > > >                 fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
> > > > >
> > > > > +       /*
> > > > > +        * The only use case for special files is to set project ID, forbid any
> > > > > +        * other attributes
> > > > > +        */
> > > > > +       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
> > > > > +               if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
> > > > > +                       return -EINVAL;
> > > > > +               if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
> > > > > +                       return -EINVAL;
> > > > > +               if (fa->fsx_extsize || fa->fsx_cowextsize)
> > > > > +                       return -EINVAL;
> > > > > +       }
> > > > > +
> > > > >         return 0;
> > > > >  }
> > > > >
> > > > > @@ -763,6 +777,79 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
> > > > >         return err;
> > > > >  }
> > > > >
> > > > > +static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
> > > > > +{
> > > > > +       struct path filepath;
> > > > > +       struct fsxattrat fsxat;
> > > > > +       struct fileattr fa;
> > > > > +       int error;
> > > > > +
> > > > > +       if (!S_ISDIR(file_inode(file)->i_mode))
> > > > > +               return -EBADF;
> > > >
> > > > So the *only* thing that is done with the fd of the ioctl is to verify
> > > > that it is a directory fd - there is no verification that this fd is on the
> > > > same sb as the path to act on.
> > > >
> > > > Was this the intention? It does not make a lot of sense to me
> > > > and AFAIK there is no precedent to an API like this.
> > >
> > > yeah, as we want to set xattrs on that inode the path is pointing
> > > to, so, VFS will call to the FS under that sb.
> > >
> > > >
> > > > There are ioctls that operate on the filesystem using any
> > > > fd on that fs, such as FS_IOC_GETFS{UUID,SYSFSPATH}
> > > > and maybe the closest example to what you are trying to add
> > > > XFS_IOC_BULKSTAT.
> > >
> > > Not sure that I get what you mean here, the *at() part is to get
> > > around VFS special inodes and call vfs_fileattr_set/get on FS inodes.
> > >
> > 
> > My point was that with your proposed API the fd argument to
> > ioctl() can be a directory from a completely arbitrary filesystem
> > with nothing to do with the filesystem where fsxat.dfd is from
> > and that makes very little sense from API POV.
> 
> I see, yeah, I can add a check for this, for xfs_quota usecase it's
> fine as it doesn't follow cross mounts.

Do the other *at() syscalls prohibit dfd + path pointing to a different
filesystem?  It seems odd to have this restriction that the rest don't,
but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok.

--D

> > 
> > > >
> > > > Trying to think of a saner API for this - perhaps pass an O_PATH
> > > > fd without any filename in struct fsxattrat, saving you also the
> > > > headache of passing a variable length string in an ioctl.
> > > >
> > > > Then atfile = fdget_raw(fsxat.atfd) and verify that atfile->f_path
> > > > and file->f_path are on the same sb before proceeding to operate
> > > > on atfile->f_path.dentry.
> > >
> > > Thanks! Didn't know about O_PATH that seems to be a way to get rid
> > > of the path passing.
> > >
> > 
> > I found one precedent of this pattern with XFS_IOC_FD_TO_HANDLE,
> > but keep in mind that this is quite an old legacy XFS API.
> > 
> > This ioctl is performed on an "fshandle", which is an open fd to any
> > object in the filesystem, but typically the mount root dir.
> > The ioctl gets a structure xfs_fsop_handlereq_t which contains an
> > fd member pointing to another object within an XFS filesystem.
> > 
> > Actually, AFAICS, this code does not verify that the object and fshandle
> > are on the same XFS filesystem, but only that both are on XFS filesystems:
> > 
> >         /*
> >          * We can only generate handles for inodes residing on a XFS filesystem,
> >          * and only for regular files, directories or symbolic links.
> >          */
> >         error = -EINVAL;
> >         if (inode->i_sb->s_magic != XFS_SB_MAGIC)
> >                 goto out_put;
> > 
> > I don't know what's the best thing to do is, but I think that verifying
> > that the ioctl fd and the O_PATH fd are on the same sb is the least
> > controversial option for the first version - if needed that could be
> > relaxed later on.
> > 
> > Another alternative which is simpler from API POV would be to allow
> > selective ioctl() commands on an O_PATH fd, but I think that is going
> > to be more controversial.
> > 
> > Something along those lines (completely untested):
> > 
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index 45e4e64fd664..562f8bff91d2 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -241,6 +241,13 @@ struct fsxattr {
> >   */
> >  #define FS_IOC_GETFSSYSFSPATH          _IOR(0x15, 1, struct fs_sysfs_path)
> > 
> > +#define _IOC_AT                                (0x100)
> > +#define FS_IOC_AT(nr)                  (_IOC_TYPE(nr) == _IOC_AT)
> > +
> > +/* The following ioctls can be operated on an O_PATH fd */
> > +#define FS_IOC_FSGETXATTRAT            _IOR(_IOC_AT, 31, struct fsxattr)
> > +#define FS_IOC_FSSETXATTRAT            _IOW(_IOC_AT, 32, struct fsxattr)
> > +
> >  /*
> >   * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
> >   *
> > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > index 1d5abfdf0f22..f720500c705b 100644
> > --- a/fs/ioctl.c
> > +++ b/fs/ioctl.c
> > @@ -867,9 +867,11 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
> >                 return ioctl_setflags(filp, argp);
> > 
> >         case FS_IOC_FSGETXATTR:
> > +       case FS_IOC_FSGETXATTRAT:
> >                 return ioctl_fsgetxattr(filp, argp);
> > 
> >         case FS_IOC_FSSETXATTR:
> > +       case FS_IOC_FSSETXATTRAT:
> >                 return ioctl_fssetxattr(filp, argp);
> > 
> >         case FS_IOC_GETFSUUID:
> > @@ -879,7 +881,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
> >                 return ioctl_get_fs_sysfs_path(filp, argp);
> > 
> >         default:
> > -               if (S_ISREG(inode->i_mode))
> > +               if (!FS_IOC_AT(cmd) && S_ISREG(inode->i_mode))
> >                         return file_ioctl(filp, cmd, argp);
> >                 break;
> >         }
> > @@ -889,7 +891,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
> > 
> >  SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
> >  {
> > -       struct fd f = fdget(fd);
> > +       bool ioc_at = FS_IOC_AT(cmd);
> > +       struct fd f = ioc_at ? fdget_raw(fd) : fdget(fd);
> >         int error;
> > 
> >         if (!f.file)
> > @@ -900,7 +903,7 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned
> > int, cmd, unsigned long, arg)
> >                 goto out;
> > 
> >         error = do_vfs_ioctl(f.file, fd, cmd, arg);
> > -       if (error == -ENOIOCTLCMD)
> > +       if (!ioc_at && error == -ENOIOCTLCMD)
> >                 error = vfs_ioctl(f.file, cmd, arg);
> > 
> > ---
> > 
> > Thanks,
> > Amir.
> > 
> 
> -- 
> - Andrey
> 
>
Eric Biggers May 22, 2024, 4:38 p.m. UTC | #14
On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote:
> 
> Do the other *at() syscalls prohibit dfd + path pointing to a different
> filesystem?  It seems odd to have this restriction that the rest don't,
> but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok.

No, but they are arbitrary syscalls so they can do that.  ioctls traditionally
operate on the specific filesystem of the fd.

It feels like these should be syscalls, not ioctls.

- Eric
Andrey Albershteyn May 22, 2024, 5:23 p.m. UTC | #15
On 2024-05-22 09:38:56, Eric Biggers wrote:
> On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote:
> > 
> > Do the other *at() syscalls prohibit dfd + path pointing to a different
> > filesystem?  It seems odd to have this restriction that the rest don't,
> > but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok.
> 
> No, but they are arbitrary syscalls so they can do that.  ioctls traditionally
> operate on the specific filesystem of the fd.
> 
> It feels like these should be syscalls, not ioctls.

Won't it also be a bit weird to have FS_IOC_FS[S|G]ETXATTR as
ioctls for normal files and a syscall for special files? When both
are doing the same thing

> 
> - Eric
>
Eric Biggers May 22, 2024, 6:33 p.m. UTC | #16
On Wed, May 22, 2024 at 07:23:15PM +0200, Andrey Albershteyn wrote:
> On 2024-05-22 09:38:56, Eric Biggers wrote:
> > On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote:
> > > 
> > > Do the other *at() syscalls prohibit dfd + path pointing to a different
> > > filesystem?  It seems odd to have this restriction that the rest don't,
> > > but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok.
> > 
> > No, but they are arbitrary syscalls so they can do that.  ioctls traditionally
> > operate on the specific filesystem of the fd.
> > 
> > It feels like these should be syscalls, not ioctls.
> 
> Won't it also be a bit weird to have FS_IOC_FS[S|G]ETXATTR as
> ioctls for normal files and a syscall for special files? When both
> are doing the same thing
> 

Why would the syscalls be restricted to operating on special files?

- Eric
Amir Goldstein May 22, 2024, 7:03 p.m. UTC | #17
On Wed, May 22, 2024 at 7:38 PM Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote:
> >
> > Do the other *at() syscalls prohibit dfd + path pointing to a different
> > filesystem?  It seems odd to have this restriction that the rest don't,
> > but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok.
>
> No, but they are arbitrary syscalls so they can do that.  ioctls traditionally
> operate on the specific filesystem of the fd.

To emphasize the absurdity
think opening /dev/random and doing ioctl to set projid on some xfs file.
It is ridiculous.

>
> It feels like these should be syscalls, not ioctls.
>

I bet whatever name you choose for syscalls it is going to be too
close lexicographically to [gs]etxattrat(2) [1]. It is really crowded
in the area of getattr/getfattr/fgetxattr/getxattr/getfileattr/getfsxattr...
I think I would vote for [gs]etfsxattrat(2) following the uapi struct fsxattr.
I guess we have officially spiralled.

Thanks,
Amir.

[1] https://lore.kernel.org/linux-fsdevel/20240426162042.191916-1-cgoettsche@seltendoof.de/
Jan Kara May 23, 2024, 7:48 a.m. UTC | #18
Hi!

On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> On 2024-05-22 12:00:07, Jan Kara wrote:
> > Hello!
> > 
> > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > XFS has project quotas which could be attached to a directory. All
> > > new inodes in these directories inherit project ID set on parent
> > > directory.
> > > 
> > > The project is created from userspace by opening and calling
> > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > inode from VFS. Therefore, some inodes are left with empty project
> > > ID. Those inodes then are not shown in the quota accounting but
> > > still exist in the directory.
> > > 
> > > This patch adds two new ioctls which allows userspace, such as
> > > xfs_quota, to set project ID on special files by using parent
> > > directory to open FS inode. This will let xfs_quota set ID on all
> > > inodes and also reset it when project is removed. Also, as
> > > vfs_fileattr_set() is now will called on special files too, let's
> > > forbid any other attributes except projid and nextents (symlink can
> > > have one).
> > > 
> > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > 
> > I'd like to understand one thing. Is it practically useful to set project
> > IDs for special inodes? There is no significant disk space usage associated
> > with them so wrt quotas we are speaking only about the inode itself. So is
> > the concern that user could escape inode project quota accounting and
> > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > for something that seems as a small corner case to me?
> 
> So there's few things:
> - Quota accounting is missing only some special files. Special files
>   created after quota project is setup inherit ID from the project
>   directory.
> - For special files created after the project is setup there's no
>   way to make them project-less. Therefore, creating a new project
>   over those will fail due to project ID miss match.
> - It wasn't possible to hardlink/rename project-less special files
>   inside a project due to ID miss match. The linking is fixed, and
>   renaming is worked around in first patch.
> 
> The initial report I got was about second and last point, an
> application was failing to create a new project after "restart" and
> wasn't able to link special files created beforehand.

I see. OK, but wouldn't it then be an easier fix to make sure we *never*
inherit project id for special inodes? And make sure inodes with unset
project ID don't fail to be linked, renamed, etc...

								Honza
Andrey Albershteyn May 23, 2024, 11:16 a.m. UTC | #19
On 2024-05-23 09:48:28, Jan Kara wrote:
> Hi!
> 
> On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> > On 2024-05-22 12:00:07, Jan Kara wrote:
> > > Hello!
> > > 
> > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > > XFS has project quotas which could be attached to a directory. All
> > > > new inodes in these directories inherit project ID set on parent
> > > > directory.
> > > > 
> > > > The project is created from userspace by opening and calling
> > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > ID. Those inodes then are not shown in the quota accounting but
> > > > still exist in the directory.
> > > > 
> > > > This patch adds two new ioctls which allows userspace, such as
> > > > xfs_quota, to set project ID on special files by using parent
> > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > inodes and also reset it when project is removed. Also, as
> > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > forbid any other attributes except projid and nextents (symlink can
> > > > have one).
> > > > 
> > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > 
> > > I'd like to understand one thing. Is it practically useful to set project
> > > IDs for special inodes? There is no significant disk space usage associated
> > > with them so wrt quotas we are speaking only about the inode itself. So is
> > > the concern that user could escape inode project quota accounting and
> > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > > for something that seems as a small corner case to me?
> > 
> > So there's few things:
> > - Quota accounting is missing only some special files. Special files
> >   created after quota project is setup inherit ID from the project
> >   directory.
> > - For special files created after the project is setup there's no
> >   way to make them project-less. Therefore, creating a new project
> >   over those will fail due to project ID miss match.
> > - It wasn't possible to hardlink/rename project-less special files
> >   inside a project due to ID miss match. The linking is fixed, and
> >   renaming is worked around in first patch.
> > 
> > The initial report I got was about second and last point, an
> > application was failing to create a new project after "restart" and
> > wasn't able to link special files created beforehand.
> 
> I see. OK, but wouldn't it then be an easier fix to make sure we *never*
> inherit project id for special inodes? And make sure inodes with unset
> project ID don't fail to be linked, renamed, etc...

But then, in set up project, you can cross-link between projects and
escape quota this way. During linking/renaming if source inode has
ID but target one doesn't, we won't be able to tell that this link
is within the project.
Andrey Albershteyn May 23, 2024, 11:25 a.m. UTC | #20
On 2024-05-22 22:03:55, Amir Goldstein wrote:
> On Wed, May 22, 2024 at 7:38 PM Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > On Wed, May 22, 2024 at 09:28:53AM -0700, Darrick J. Wong wrote:
> > >
> > > Do the other *at() syscalls prohibit dfd + path pointing to a different
> > > filesystem?  It seems odd to have this restriction that the rest don't,
> > > but perhaps documenting this in the ioctl_xfs_fsgetxattrat manpage is ok.
> >
> > No, but they are arbitrary syscalls so they can do that.  ioctls traditionally
> > operate on the specific filesystem of the fd.
> 
> To emphasize the absurdity
> think opening /dev/random and doing ioctl to set projid on some xfs file.
> It is ridiculous.
> 
> >
> > It feels like these should be syscalls, not ioctls.
> >
> 
> I bet whatever name you choose for syscalls it is going to be too
> close lexicographically to [gs]etxattrat(2) [1]. It is really crowded
> in the area of getattr/getfattr/fgetxattr/getxattr/getfileattr/getfsxattr...
> I think I would vote for [gs]etfsxattrat(2) following the uapi struct fsxattr.
> I guess we have officially spiralled.
> 
> Thanks,
> Amir.
> 
> [1] https://lore.kernel.org/linux-fsdevel/20240426162042.191916-1-cgoettsche@seltendoof.de/
> 

Thanks for the link, will convert to syscalls in next version.
Jan Kara May 24, 2024, 4:11 p.m. UTC | #21
On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote:
> On 2024-05-23 09:48:28, Jan Kara wrote:
> > Hi!
> > 
> > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> > > On 2024-05-22 12:00:07, Jan Kara wrote:
> > > > Hello!
> > > > 
> > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > > > XFS has project quotas which could be attached to a directory. All
> > > > > new inodes in these directories inherit project ID set on parent
> > > > > directory.
> > > > > 
> > > > > The project is created from userspace by opening and calling
> > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > > ID. Those inodes then are not shown in the quota accounting but
> > > > > still exist in the directory.
> > > > > 
> > > > > This patch adds two new ioctls which allows userspace, such as
> > > > > xfs_quota, to set project ID on special files by using parent
> > > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > > inodes and also reset it when project is removed. Also, as
> > > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > > forbid any other attributes except projid and nextents (symlink can
> > > > > have one).
> > > > > 
> > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > 
> > > > I'd like to understand one thing. Is it practically useful to set project
> > > > IDs for special inodes? There is no significant disk space usage associated
> > > > with them so wrt quotas we are speaking only about the inode itself. So is
> > > > the concern that user could escape inode project quota accounting and
> > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > > > for something that seems as a small corner case to me?
> > > 
> > > So there's few things:
> > > - Quota accounting is missing only some special files. Special files
> > >   created after quota project is setup inherit ID from the project
> > >   directory.
> > > - For special files created after the project is setup there's no
> > >   way to make them project-less. Therefore, creating a new project
> > >   over those will fail due to project ID miss match.
> > > - It wasn't possible to hardlink/rename project-less special files
> > >   inside a project due to ID miss match. The linking is fixed, and
> > >   renaming is worked around in first patch.
> > > 
> > > The initial report I got was about second and last point, an
> > > application was failing to create a new project after "restart" and
> > > wasn't able to link special files created beforehand.
> > 
> > I see. OK, but wouldn't it then be an easier fix to make sure we *never*
> > inherit project id for special inodes? And make sure inodes with unset
> > project ID don't fail to be linked, renamed, etc...
> 
> But then, in set up project, you can cross-link between projects and
> escape quota this way. During linking/renaming if source inode has
> ID but target one doesn't, we won't be able to tell that this link
> is within the project.

Well, I didn't want to charge these special inodes to project quota at all
so "escaping quota" was pretty much what I suggested to do. But my point
was that since the only thing that's really charged for these inodes is the
inodes itself then does this small inaccuracy really matter in practice?
Are we afraid the user is going to fill the filesystem with symlinks?

								Honza
Darrick J. Wong May 31, 2024, 2:52 p.m. UTC | #22
On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote:
> On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote:
> > On 2024-05-23 09:48:28, Jan Kara wrote:
> > > Hi!
> > > 
> > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> > > > On 2024-05-22 12:00:07, Jan Kara wrote:
> > > > > Hello!
> > > > > 
> > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > > > > XFS has project quotas which could be attached to a directory. All
> > > > > > new inodes in these directories inherit project ID set on parent
> > > > > > directory.
> > > > > > 
> > > > > > The project is created from userspace by opening and calling
> > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > > > ID. Those inodes then are not shown in the quota accounting but
> > > > > > still exist in the directory.
> > > > > > 
> > > > > > This patch adds two new ioctls which allows userspace, such as
> > > > > > xfs_quota, to set project ID on special files by using parent
> > > > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > > > inodes and also reset it when project is removed. Also, as
> > > > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > > > forbid any other attributes except projid and nextents (symlink can
> > > > > > have one).
> > > > > > 
> > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > > 
> > > > > I'd like to understand one thing. Is it practically useful to set project
> > > > > IDs for special inodes? There is no significant disk space usage associated
> > > > > with them so wrt quotas we are speaking only about the inode itself. So is
> > > > > the concern that user could escape inode project quota accounting and
> > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > > > > for something that seems as a small corner case to me?
> > > > 
> > > > So there's few things:
> > > > - Quota accounting is missing only some special files. Special files
> > > >   created after quota project is setup inherit ID from the project
> > > >   directory.
> > > > - For special files created after the project is setup there's no
> > > >   way to make them project-less. Therefore, creating a new project
> > > >   over those will fail due to project ID miss match.
> > > > - It wasn't possible to hardlink/rename project-less special files
> > > >   inside a project due to ID miss match. The linking is fixed, and
> > > >   renaming is worked around in first patch.
> > > > 
> > > > The initial report I got was about second and last point, an
> > > > application was failing to create a new project after "restart" and
> > > > wasn't able to link special files created beforehand.
> > > 
> > > I see. OK, but wouldn't it then be an easier fix to make sure we *never*
> > > inherit project id for special inodes? And make sure inodes with unset
> > > project ID don't fail to be linked, renamed, etc...
> > 
> > But then, in set up project, you can cross-link between projects and
> > escape quota this way. During linking/renaming if source inode has
> > ID but target one doesn't, we won't be able to tell that this link
> > is within the project.
> 
> Well, I didn't want to charge these special inodes to project quota at all
> so "escaping quota" was pretty much what I suggested to do. But my point
> was that since the only thing that's really charged for these inodes is the
> inodes itself then does this small inaccuracy really matter in practice?
> Are we afraid the user is going to fill the filesystem with symlinks?

I thought the worry here is that you can't fully reassign the project
id for a directory tree unless you have an *at() version of the ioctl
to handle the special files that you can't open directly?

So you start with a directory tree that's (say) 2% symlinks and project
id 5.  Later you want to set project id 7 on that subtree, but after the
incomplete change, projid 7 is charged for 98% of the tree, and 2% are
still stuck on projid 5.  This is a mess, and if enforcement is enabled
you've just broken it in a way that can't be fixed aside from recreating
those files.

--D

> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
>
Jan Kara June 3, 2024, 10:42 a.m. UTC | #23
On Fri 31-05-24 07:52:04, Darrick J. Wong wrote:
> On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote:
> > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote:
> > > On 2024-05-23 09:48:28, Jan Kara wrote:
> > > > Hi!
> > > > 
> > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> > > > > On 2024-05-22 12:00:07, Jan Kara wrote:
> > > > > > Hello!
> > > > > > 
> > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > > > > > XFS has project quotas which could be attached to a directory. All
> > > > > > > new inodes in these directories inherit project ID set on parent
> > > > > > > directory.
> > > > > > > 
> > > > > > > The project is created from userspace by opening and calling
> > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > > > > ID. Those inodes then are not shown in the quota accounting but
> > > > > > > still exist in the directory.
> > > > > > > 
> > > > > > > This patch adds two new ioctls which allows userspace, such as
> > > > > > > xfs_quota, to set project ID on special files by using parent
> > > > > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > > > > inodes and also reset it when project is removed. Also, as
> > > > > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > > > > forbid any other attributes except projid and nextents (symlink can
> > > > > > > have one).
> > > > > > > 
> > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > > > 
> > > > > > I'd like to understand one thing. Is it practically useful to set project
> > > > > > IDs for special inodes? There is no significant disk space usage associated
> > > > > > with them so wrt quotas we are speaking only about the inode itself. So is
> > > > > > the concern that user could escape inode project quota accounting and
> > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > > > > > for something that seems as a small corner case to me?
> > > > > 
> > > > > So there's few things:
> > > > > - Quota accounting is missing only some special files. Special files
> > > > >   created after quota project is setup inherit ID from the project
> > > > >   directory.
> > > > > - For special files created after the project is setup there's no
> > > > >   way to make them project-less. Therefore, creating a new project
> > > > >   over those will fail due to project ID miss match.
> > > > > - It wasn't possible to hardlink/rename project-less special files
> > > > >   inside a project due to ID miss match. The linking is fixed, and
> > > > >   renaming is worked around in first patch.
> > > > > 
> > > > > The initial report I got was about second and last point, an
> > > > > application was failing to create a new project after "restart" and
> > > > > wasn't able to link special files created beforehand.
> > > > 
> > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never*
> > > > inherit project id for special inodes? And make sure inodes with unset
> > > > project ID don't fail to be linked, renamed, etc...
> > > 
> > > But then, in set up project, you can cross-link between projects and
> > > escape quota this way. During linking/renaming if source inode has
> > > ID but target one doesn't, we won't be able to tell that this link
> > > is within the project.
> > 
> > Well, I didn't want to charge these special inodes to project quota at all
> > so "escaping quota" was pretty much what I suggested to do. But my point
> > was that since the only thing that's really charged for these inodes is the
> > inodes itself then does this small inaccuracy really matter in practice?
> > Are we afraid the user is going to fill the filesystem with symlinks?
> 
> I thought the worry here is that you can't fully reassign the project
> id for a directory tree unless you have an *at() version of the ioctl
> to handle the special files that you can't open directly?
> 
> So you start with a directory tree that's (say) 2% symlinks and project
> id 5.  Later you want to set project id 7 on that subtree, but after the
> incomplete change, projid 7 is charged for 98% of the tree, and 2% are
> still stuck on projid 5.  This is a mess, and if enforcement is enabled
> you've just broken it in a way that can't be fixed aside from recreating
> those files.

So the idea I'm trying to propose (and apparently I'm failing to explain it
properly) is:

When creating special inode, set i_projid = 0 regardless of directory
settings.

When creating hardlink or doing rename, if i_projid of dentry is 0, we
allow the operation.

Teach fsck to set i_projid to 0 when inode is special.

As a result, AFAICT no problem with hardlinks, renames or similar. No need
for special new ioctl or syscall. The downside is special inodes escape
project quota accounting. Do we care?

								Honza
Andrey Albershteyn June 3, 2024, 4:28 p.m. UTC | #24
On 2024-06-03 12:42:59, Jan Kara wrote:
> On Fri 31-05-24 07:52:04, Darrick J. Wong wrote:
> > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote:
> > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote:
> > > > On 2024-05-23 09:48:28, Jan Kara wrote:
> > > > > Hi!
> > > > > 
> > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> > > > > > On 2024-05-22 12:00:07, Jan Kara wrote:
> > > > > > > Hello!
> > > > > > > 
> > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > > > > > > XFS has project quotas which could be attached to a directory. All
> > > > > > > > new inodes in these directories inherit project ID set on parent
> > > > > > > > directory.
> > > > > > > > 
> > > > > > > > The project is created from userspace by opening and calling
> > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > > > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > > > > > ID. Those inodes then are not shown in the quota accounting but
> > > > > > > > still exist in the directory.
> > > > > > > > 
> > > > > > > > This patch adds two new ioctls which allows userspace, such as
> > > > > > > > xfs_quota, to set project ID on special files by using parent
> > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > > > > > inodes and also reset it when project is removed. Also, as
> > > > > > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > > > > > forbid any other attributes except projid and nextents (symlink can
> > > > > > > > have one).
> > > > > > > > 
> > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > > > > 
> > > > > > > I'd like to understand one thing. Is it practically useful to set project
> > > > > > > IDs for special inodes? There is no significant disk space usage associated
> > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is
> > > > > > > the concern that user could escape inode project quota accounting and
> > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > > > > > > for something that seems as a small corner case to me?
> > > > > > 
> > > > > > So there's few things:
> > > > > > - Quota accounting is missing only some special files. Special files
> > > > > >   created after quota project is setup inherit ID from the project
> > > > > >   directory.
> > > > > > - For special files created after the project is setup there's no
> > > > > >   way to make them project-less. Therefore, creating a new project
> > > > > >   over those will fail due to project ID miss match.
> > > > > > - It wasn't possible to hardlink/rename project-less special files
> > > > > >   inside a project due to ID miss match. The linking is fixed, and
> > > > > >   renaming is worked around in first patch.
> > > > > > 
> > > > > > The initial report I got was about second and last point, an
> > > > > > application was failing to create a new project after "restart" and
> > > > > > wasn't able to link special files created beforehand.
> > > > > 
> > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never*
> > > > > inherit project id for special inodes? And make sure inodes with unset
> > > > > project ID don't fail to be linked, renamed, etc...
> > > > 
> > > > But then, in set up project, you can cross-link between projects and
> > > > escape quota this way. During linking/renaming if source inode has
> > > > ID but target one doesn't, we won't be able to tell that this link
> > > > is within the project.
> > > 
> > > Well, I didn't want to charge these special inodes to project quota at all
> > > so "escaping quota" was pretty much what I suggested to do. But my point
> > > was that since the only thing that's really charged for these inodes is the
> > > inodes itself then does this small inaccuracy really matter in practice?
> > > Are we afraid the user is going to fill the filesystem with symlinks?
> > 
> > I thought the worry here is that you can't fully reassign the project
> > id for a directory tree unless you have an *at() version of the ioctl
> > to handle the special files that you can't open directly?
> > 
> > So you start with a directory tree that's (say) 2% symlinks and project
> > id 5.  Later you want to set project id 7 on that subtree, but after the
> > incomplete change, projid 7 is charged for 98% of the tree, and 2% are
> > still stuck on projid 5.  This is a mess, and if enforcement is enabled
> > you've just broken it in a way that can't be fixed aside from recreating
> > those files.
> 
> So the idea I'm trying to propose (and apparently I'm failing to explain it
> properly) is:
> 
> When creating special inode, set i_projid = 0 regardless of directory
> settings.
> 
> When creating hardlink or doing rename, if i_projid of dentry is 0, we
> allow the operation.
> 
> Teach fsck to set i_projid to 0 when inode is special.
> 
> As a result, AFAICT no problem with hardlinks, renames or similar. No need
> for special new ioctl or syscall. The downside is special inodes escape
> project quota accounting. Do we care?

I see. But is it fine to allow fill filesystem with special inodes?
Don't know if it can be used somehow but this is exception from
isoft/ihard limits then.

I don't see issues with this approach also, if others don't have
other points or other uses for those new syscalls, I can go with
this approach.
Darrick J. Wong June 3, 2024, 5:42 p.m. UTC | #25
On Mon, Jun 03, 2024 at 06:28:47PM +0200, Andrey Albershteyn wrote:
> On 2024-06-03 12:42:59, Jan Kara wrote:
> > On Fri 31-05-24 07:52:04, Darrick J. Wong wrote:
> > > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote:
> > > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote:
> > > > > On 2024-05-23 09:48:28, Jan Kara wrote:
> > > > > > Hi!
> > > > > > 
> > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> > > > > > > On 2024-05-22 12:00:07, Jan Kara wrote:
> > > > > > > > Hello!
> > > > > > > > 
> > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > > > > > > > XFS has project quotas which could be attached to a directory. All
> > > > > > > > > new inodes in these directories inherit project ID set on parent
> > > > > > > > > directory.
> > > > > > > > > 
> > > > > > > > > The project is created from userspace by opening and calling
> > > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > > > > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > > > > > > ID. Those inodes then are not shown in the quota accounting but
> > > > > > > > > still exist in the directory.
> > > > > > > > > 
> > > > > > > > > This patch adds two new ioctls which allows userspace, such as
> > > > > > > > > xfs_quota, to set project ID on special files by using parent
> > > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > > > > > > inodes and also reset it when project is removed. Also, as
> > > > > > > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > > > > > > forbid any other attributes except projid and nextents (symlink can
> > > > > > > > > have one).
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > > > > > 
> > > > > > > > I'd like to understand one thing. Is it practically useful to set project
> > > > > > > > IDs for special inodes? There is no significant disk space usage associated
> > > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is
> > > > > > > > the concern that user could escape inode project quota accounting and
> > > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > > > > > > > for something that seems as a small corner case to me?
> > > > > > > 
> > > > > > > So there's few things:
> > > > > > > - Quota accounting is missing only some special files. Special files
> > > > > > >   created after quota project is setup inherit ID from the project
> > > > > > >   directory.
> > > > > > > - For special files created after the project is setup there's no
> > > > > > >   way to make them project-less. Therefore, creating a new project
> > > > > > >   over those will fail due to project ID miss match.
> > > > > > > - It wasn't possible to hardlink/rename project-less special files
> > > > > > >   inside a project due to ID miss match. The linking is fixed, and
> > > > > > >   renaming is worked around in first patch.
> > > > > > > 
> > > > > > > The initial report I got was about second and last point, an
> > > > > > > application was failing to create a new project after "restart" and
> > > > > > > wasn't able to link special files created beforehand.
> > > > > > 
> > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never*
> > > > > > inherit project id for special inodes? And make sure inodes with unset
> > > > > > project ID don't fail to be linked, renamed, etc...
> > > > > 
> > > > > But then, in set up project, you can cross-link between projects and
> > > > > escape quota this way. During linking/renaming if source inode has
> > > > > ID but target one doesn't, we won't be able to tell that this link
> > > > > is within the project.
> > > > 
> > > > Well, I didn't want to charge these special inodes to project quota at all
> > > > so "escaping quota" was pretty much what I suggested to do. But my point
> > > > was that since the only thing that's really charged for these inodes is the
> > > > inodes itself then does this small inaccuracy really matter in practice?
> > > > Are we afraid the user is going to fill the filesystem with symlinks?
> > > 
> > > I thought the worry here is that you can't fully reassign the project
> > > id for a directory tree unless you have an *at() version of the ioctl
> > > to handle the special files that you can't open directly?
> > > 
> > > So you start with a directory tree that's (say) 2% symlinks and project
> > > id 5.  Later you want to set project id 7 on that subtree, but after the
> > > incomplete change, projid 7 is charged for 98% of the tree, and 2% are
> > > still stuck on projid 5.  This is a mess, and if enforcement is enabled
> > > you've just broken it in a way that can't be fixed aside from recreating
> > > those files.
> > 
> > So the idea I'm trying to propose (and apparently I'm failing to explain it
> > properly) is:
> > 
> > When creating special inode, set i_projid = 0 regardless of directory
> > settings.
> > 
> > When creating hardlink or doing rename, if i_projid of dentry is 0, we
> > allow the operation.
> > 
> > Teach fsck to set i_projid to 0 when inode is special.
> > 
> > As a result, AFAICT no problem with hardlinks, renames or similar. No need
> > for special new ioctl or syscall. The downside is special inodes escape
> > project quota accounting. Do we care?
> 
> I see. But is it fine to allow fill filesystem with special inodes?
> Don't know if it can be used somehow but this is exception from
> isoft/ihard limits then.
> 
> I don't see issues with this approach also, if others don't have
> other points or other uses for those new syscalls, I can go with
> this approach.

I do -- allowing unpriviledged users to create symlinks that consume
icount (and possibly bcount) in the root project breaks the entire
enforcement mechanism.  That's not the way that project quota has worked
on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
only for these special cases.

--D

> -- 
> - Andrey
> 
>
Jan Kara June 4, 2024, 8:58 a.m. UTC | #26
On Mon 03-06-24 10:42:59, Darrick J. Wong wrote:
> On Mon, Jun 03, 2024 at 06:28:47PM +0200, Andrey Albershteyn wrote:
> > On 2024-06-03 12:42:59, Jan Kara wrote:
> > > On Fri 31-05-24 07:52:04, Darrick J. Wong wrote:
> > > > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote:
> > > > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote:
> > > > > > On 2024-05-23 09:48:28, Jan Kara wrote:
> > > > > > > Hi!
> > > > > > > 
> > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> > > > > > > > On 2024-05-22 12:00:07, Jan Kara wrote:
> > > > > > > > > Hello!
> > > > > > > > > 
> > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > > > > > > > > XFS has project quotas which could be attached to a directory. All
> > > > > > > > > > new inodes in these directories inherit project ID set on parent
> > > > > > > > > > directory.
> > > > > > > > > > 
> > > > > > > > > > The project is created from userspace by opening and calling
> > > > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > > > > > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > > > > > > > ID. Those inodes then are not shown in the quota accounting but
> > > > > > > > > > still exist in the directory.
> > > > > > > > > > 
> > > > > > > > > > This patch adds two new ioctls which allows userspace, such as
> > > > > > > > > > xfs_quota, to set project ID on special files by using parent
> > > > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > > > > > > > inodes and also reset it when project is removed. Also, as
> > > > > > > > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > > > > > > > forbid any other attributes except projid and nextents (symlink can
> > > > > > > > > > have one).
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > > > > > > 
> > > > > > > > > I'd like to understand one thing. Is it practically useful to set project
> > > > > > > > > IDs for special inodes? There is no significant disk space usage associated
> > > > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is
> > > > > > > > > the concern that user could escape inode project quota accounting and
> > > > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > > > > > > > > for something that seems as a small corner case to me?
> > > > > > > > 
> > > > > > > > So there's few things:
> > > > > > > > - Quota accounting is missing only some special files. Special files
> > > > > > > >   created after quota project is setup inherit ID from the project
> > > > > > > >   directory.
> > > > > > > > - For special files created after the project is setup there's no
> > > > > > > >   way to make them project-less. Therefore, creating a new project
> > > > > > > >   over those will fail due to project ID miss match.
> > > > > > > > - It wasn't possible to hardlink/rename project-less special files
> > > > > > > >   inside a project due to ID miss match. The linking is fixed, and
> > > > > > > >   renaming is worked around in first patch.
> > > > > > > > 
> > > > > > > > The initial report I got was about second and last point, an
> > > > > > > > application was failing to create a new project after "restart" and
> > > > > > > > wasn't able to link special files created beforehand.
> > > > > > > 
> > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never*
> > > > > > > inherit project id for special inodes? And make sure inodes with unset
> > > > > > > project ID don't fail to be linked, renamed, etc...
> > > > > > 
> > > > > > But then, in set up project, you can cross-link between projects and
> > > > > > escape quota this way. During linking/renaming if source inode has
> > > > > > ID but target one doesn't, we won't be able to tell that this link
> > > > > > is within the project.
> > > > > 
> > > > > Well, I didn't want to charge these special inodes to project quota at all
> > > > > so "escaping quota" was pretty much what I suggested to do. But my point
> > > > > was that since the only thing that's really charged for these inodes is the
> > > > > inodes itself then does this small inaccuracy really matter in practice?
> > > > > Are we afraid the user is going to fill the filesystem with symlinks?
> > > > 
> > > > I thought the worry here is that you can't fully reassign the project
> > > > id for a directory tree unless you have an *at() version of the ioctl
> > > > to handle the special files that you can't open directly?
> > > > 
> > > > So you start with a directory tree that's (say) 2% symlinks and project
> > > > id 5.  Later you want to set project id 7 on that subtree, but after the
> > > > incomplete change, projid 7 is charged for 98% of the tree, and 2% are
> > > > still stuck on projid 5.  This is a mess, and if enforcement is enabled
> > > > you've just broken it in a way that can't be fixed aside from recreating
> > > > those files.
> > > 
> > > So the idea I'm trying to propose (and apparently I'm failing to explain it
> > > properly) is:
> > > 
> > > When creating special inode, set i_projid = 0 regardless of directory
> > > settings.
> > > 
> > > When creating hardlink or doing rename, if i_projid of dentry is 0, we
> > > allow the operation.
> > > 
> > > Teach fsck to set i_projid to 0 when inode is special.
> > > 
> > > As a result, AFAICT no problem with hardlinks, renames or similar. No need
> > > for special new ioctl or syscall. The downside is special inodes escape
> > > project quota accounting. Do we care?
> > 
> > I see. But is it fine to allow fill filesystem with special inodes?
> > Don't know if it can be used somehow but this is exception from
> > isoft/ihard limits then.
> > 
> > I don't see issues with this approach also, if others don't have
> > other points or other uses for those new syscalls, I can go with
> > this approach.
> 
> I do -- allowing unpriviledged users to create symlinks that consume
> icount (and possibly bcount) in the root project breaks the entire
> enforcement mechanism.  That's not the way that project quota has worked
> on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
> only for these special cases.

OK, fair enough. I though someone will hate this. I'd just like to
understand one thing: Owner of the inode can change the project ID to 0
anyway so project quotas are more like a cooperative space tracking scheme
anyway. If you want to escape it, you can. So what are you exactly worried
about? Is it the container usecase where from within the user namespace you
cannot change project IDs?

Anyway I just wanted to have an explicit decision that the simple solution
is not good enough before we go the more complex route ;).

								Honza
Darrick J. Wong June 5, 2024, 12:37 a.m. UTC | #27
On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote:
> On Mon 03-06-24 10:42:59, Darrick J. Wong wrote:
> > On Mon, Jun 03, 2024 at 06:28:47PM +0200, Andrey Albershteyn wrote:
> > > On 2024-06-03 12:42:59, Jan Kara wrote:
> > > > On Fri 31-05-24 07:52:04, Darrick J. Wong wrote:
> > > > > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote:
> > > > > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote:
> > > > > > > On 2024-05-23 09:48:28, Jan Kara wrote:
> > > > > > > > Hi!
> > > > > > > > 
> > > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> > > > > > > > > On 2024-05-22 12:00:07, Jan Kara wrote:
> > > > > > > > > > Hello!
> > > > > > > > > > 
> > > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > > > > > > > > > XFS has project quotas which could be attached to a directory. All
> > > > > > > > > > > new inodes in these directories inherit project ID set on parent
> > > > > > > > > > > directory.
> > > > > > > > > > > 
> > > > > > > > > > > The project is created from userspace by opening and calling
> > > > > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > > > > > > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > > > > > > > > ID. Those inodes then are not shown in the quota accounting but
> > > > > > > > > > > still exist in the directory.
> > > > > > > > > > > 
> > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as
> > > > > > > > > > > xfs_quota, to set project ID on special files by using parent
> > > > > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > > > > > > > > inodes and also reset it when project is removed. Also, as
> > > > > > > > > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > > > > > > > > forbid any other attributes except projid and nextents (symlink can
> > > > > > > > > > > have one).
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > > > > > > > 
> > > > > > > > > > I'd like to understand one thing. Is it practically useful to set project
> > > > > > > > > > IDs for special inodes? There is no significant disk space usage associated
> > > > > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is
> > > > > > > > > > the concern that user could escape inode project quota accounting and
> > > > > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > > > > > > > > > for something that seems as a small corner case to me?
> > > > > > > > > 
> > > > > > > > > So there's few things:
> > > > > > > > > - Quota accounting is missing only some special files. Special files
> > > > > > > > >   created after quota project is setup inherit ID from the project
> > > > > > > > >   directory.
> > > > > > > > > - For special files created after the project is setup there's no
> > > > > > > > >   way to make them project-less. Therefore, creating a new project
> > > > > > > > >   over those will fail due to project ID miss match.
> > > > > > > > > - It wasn't possible to hardlink/rename project-less special files
> > > > > > > > >   inside a project due to ID miss match. The linking is fixed, and
> > > > > > > > >   renaming is worked around in first patch.
> > > > > > > > > 
> > > > > > > > > The initial report I got was about second and last point, an
> > > > > > > > > application was failing to create a new project after "restart" and
> > > > > > > > > wasn't able to link special files created beforehand.
> > > > > > > > 
> > > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never*
> > > > > > > > inherit project id for special inodes? And make sure inodes with unset
> > > > > > > > project ID don't fail to be linked, renamed, etc...
> > > > > > > 
> > > > > > > But then, in set up project, you can cross-link between projects and
> > > > > > > escape quota this way. During linking/renaming if source inode has
> > > > > > > ID but target one doesn't, we won't be able to tell that this link
> > > > > > > is within the project.
> > > > > > 
> > > > > > Well, I didn't want to charge these special inodes to project quota at all
> > > > > > so "escaping quota" was pretty much what I suggested to do. But my point
> > > > > > was that since the only thing that's really charged for these inodes is the
> > > > > > inodes itself then does this small inaccuracy really matter in practice?
> > > > > > Are we afraid the user is going to fill the filesystem with symlinks?
> > > > > 
> > > > > I thought the worry here is that you can't fully reassign the project
> > > > > id for a directory tree unless you have an *at() version of the ioctl
> > > > > to handle the special files that you can't open directly?
> > > > > 
> > > > > So you start with a directory tree that's (say) 2% symlinks and project
> > > > > id 5.  Later you want to set project id 7 on that subtree, but after the
> > > > > incomplete change, projid 7 is charged for 98% of the tree, and 2% are
> > > > > still stuck on projid 5.  This is a mess, and if enforcement is enabled
> > > > > you've just broken it in a way that can't be fixed aside from recreating
> > > > > those files.
> > > > 
> > > > So the idea I'm trying to propose (and apparently I'm failing to explain it
> > > > properly) is:
> > > > 
> > > > When creating special inode, set i_projid = 0 regardless of directory
> > > > settings.
> > > > 
> > > > When creating hardlink or doing rename, if i_projid of dentry is 0, we
> > > > allow the operation.
> > > > 
> > > > Teach fsck to set i_projid to 0 when inode is special.
> > > > 
> > > > As a result, AFAICT no problem with hardlinks, renames or similar. No need
> > > > for special new ioctl or syscall. The downside is special inodes escape
> > > > project quota accounting. Do we care?
> > > 
> > > I see. But is it fine to allow fill filesystem with special inodes?
> > > Don't know if it can be used somehow but this is exception from
> > > isoft/ihard limits then.
> > > 
> > > I don't see issues with this approach also, if others don't have
> > > other points or other uses for those new syscalls, I can go with
> > > this approach.
> > 
> > I do -- allowing unpriviledged users to create symlinks that consume
> > icount (and possibly bcount) in the root project breaks the entire
> > enforcement mechanism.  That's not the way that project quota has worked
> > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
> > only for these special cases.
> 
> OK, fair enough. I though someone will hate this. I'd just like to
> understand one thing: Owner of the inode can change the project ID to 0
> anyway so project quotas are more like a cooperative space tracking scheme
> anyway. If you want to escape it, you can. So what are you exactly worried
> about? Is it the container usecase where from within the user namespace you
> cannot change project IDs?

Yep.

> Anyway I just wanted to have an explicit decision that the simple solution
> is not good enough before we go the more complex route ;).

Also, every now and then someone comes along and half-proposes making it
so that non-root cannot change project ids anymore.  Maybe some day that
will succeed.

--D

> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
>
Amir Goldstein June 5, 2024, 5:13 a.m. UTC | #28
On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote:
> > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote:
> > > On Mon, Jun 03, 2024 at 06:28:47PM +0200, Andrey Albershteyn wrote:
> > > > On 2024-06-03 12:42:59, Jan Kara wrote:
> > > > > On Fri 31-05-24 07:52:04, Darrick J. Wong wrote:
> > > > > > On Fri, May 24, 2024 at 06:11:01PM +0200, Jan Kara wrote:
> > > > > > > On Thu 23-05-24 13:16:48, Andrey Albershteyn wrote:
> > > > > > > > On 2024-05-23 09:48:28, Jan Kara wrote:
> > > > > > > > > Hi!
> > > > > > > > >
> > > > > > > > > On Wed 22-05-24 12:45:09, Andrey Albershteyn wrote:
> > > > > > > > > > On 2024-05-22 12:00:07, Jan Kara wrote:
> > > > > > > > > > > Hello!
> > > > > > > > > > >
> > > > > > > > > > > On Mon 20-05-24 18:46:21, Andrey Albershteyn wrote:
> > > > > > > > > > > > XFS has project quotas which could be attached to a directory. All
> > > > > > > > > > > > new inodes in these directories inherit project ID set on parent
> > > > > > > > > > > > directory.
> > > > > > > > > > > >
> > > > > > > > > > > > The project is created from userspace by opening and calling
> > > > > > > > > > > > FS_IOC_FSSETXATTR on each inode. This is not possible for special
> > > > > > > > > > > > files such as FIFO, SOCK, BLK etc. as opening them returns a special
> > > > > > > > > > > > inode from VFS. Therefore, some inodes are left with empty project
> > > > > > > > > > > > ID. Those inodes then are not shown in the quota accounting but
> > > > > > > > > > > > still exist in the directory.
> > > > > > > > > > > >
> > > > > > > > > > > > This patch adds two new ioctls which allows userspace, such as
> > > > > > > > > > > > xfs_quota, to set project ID on special files by using parent
> > > > > > > > > > > > directory to open FS inode. This will let xfs_quota set ID on all
> > > > > > > > > > > > inodes and also reset it when project is removed. Also, as
> > > > > > > > > > > > vfs_fileattr_set() is now will called on special files too, let's
> > > > > > > > > > > > forbid any other attributes except projid and nextents (symlink can
> > > > > > > > > > > > have one).
> > > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > > > > > > > > >
> > > > > > > > > > > I'd like to understand one thing. Is it practically useful to set project
> > > > > > > > > > > IDs for special inodes? There is no significant disk space usage associated
> > > > > > > > > > > with them so wrt quotas we are speaking only about the inode itself. So is
> > > > > > > > > > > the concern that user could escape inode project quota accounting and
> > > > > > > > > > > perform some DoS? Or why do we bother with two new somewhat hairy ioctls
> > > > > > > > > > > for something that seems as a small corner case to me?
> > > > > > > > > >
> > > > > > > > > > So there's few things:
> > > > > > > > > > - Quota accounting is missing only some special files. Special files
> > > > > > > > > >   created after quota project is setup inherit ID from the project
> > > > > > > > > >   directory.
> > > > > > > > > > - For special files created after the project is setup there's no
> > > > > > > > > >   way to make them project-less. Therefore, creating a new project
> > > > > > > > > >   over those will fail due to project ID miss match.
> > > > > > > > > > - It wasn't possible to hardlink/rename project-less special files
> > > > > > > > > >   inside a project due to ID miss match. The linking is fixed, and
> > > > > > > > > >   renaming is worked around in first patch.
> > > > > > > > > >
> > > > > > > > > > The initial report I got was about second and last point, an
> > > > > > > > > > application was failing to create a new project after "restart" and
> > > > > > > > > > wasn't able to link special files created beforehand.
> > > > > > > > >
> > > > > > > > > I see. OK, but wouldn't it then be an easier fix to make sure we *never*
> > > > > > > > > inherit project id for special inodes? And make sure inodes with unset
> > > > > > > > > project ID don't fail to be linked, renamed, etc...
> > > > > > > >
> > > > > > > > But then, in set up project, you can cross-link between projects and
> > > > > > > > escape quota this way. During linking/renaming if source inode has
> > > > > > > > ID but target one doesn't, we won't be able to tell that this link
> > > > > > > > is within the project.
> > > > > > >
> > > > > > > Well, I didn't want to charge these special inodes to project quota at all
> > > > > > > so "escaping quota" was pretty much what I suggested to do. But my point
> > > > > > > was that since the only thing that's really charged for these inodes is the
> > > > > > > inodes itself then does this small inaccuracy really matter in practice?
> > > > > > > Are we afraid the user is going to fill the filesystem with symlinks?
> > > > > >
> > > > > > I thought the worry here is that you can't fully reassign the project
> > > > > > id for a directory tree unless you have an *at() version of the ioctl
> > > > > > to handle the special files that you can't open directly?
> > > > > >
> > > > > > So you start with a directory tree that's (say) 2% symlinks and project
> > > > > > id 5.  Later you want to set project id 7 on that subtree, but after the
> > > > > > incomplete change, projid 7 is charged for 98% of the tree, and 2% are
> > > > > > still stuck on projid 5.  This is a mess, and if enforcement is enabled
> > > > > > you've just broken it in a way that can't be fixed aside from recreating
> > > > > > those files.
> > > > >
> > > > > So the idea I'm trying to propose (and apparently I'm failing to explain it
> > > > > properly) is:
> > > > >
> > > > > When creating special inode, set i_projid = 0 regardless of directory
> > > > > settings.
> > > > >
> > > > > When creating hardlink or doing rename, if i_projid of dentry is 0, we
> > > > > allow the operation.
> > > > >
> > > > > Teach fsck to set i_projid to 0 when inode is special.
> > > > >
> > > > > As a result, AFAICT no problem with hardlinks, renames or similar. No need
> > > > > for special new ioctl or syscall. The downside is special inodes escape
> > > > > project quota accounting. Do we care?
> > > >
> > > > I see. But is it fine to allow fill filesystem with special inodes?
> > > > Don't know if it can be used somehow but this is exception from
> > > > isoft/ihard limits then.
> > > >
> > > > I don't see issues with this approach also, if others don't have
> > > > other points or other uses for those new syscalls, I can go with
> > > > this approach.
> > >
> > > I do -- allowing unpriviledged users to create symlinks that consume
> > > icount (and possibly bcount) in the root project breaks the entire
> > > enforcement mechanism.  That's not the way that project quota has worked
> > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
> > > only for these special cases.
> >
> > OK, fair enough. I though someone will hate this. I'd just like to
> > understand one thing: Owner of the inode can change the project ID to 0
> > anyway so project quotas are more like a cooperative space tracking scheme
> > anyway. If you want to escape it, you can. So what are you exactly worried
> > about? Is it the container usecase where from within the user namespace you
> > cannot change project IDs?
>
> Yep.
>
> > Anyway I just wanted to have an explicit decision that the simple solution
> > is not good enough before we go the more complex route ;).
>
> Also, every now and then someone comes along and half-proposes making it
> so that non-root cannot change project ids anymore.  Maybe some day that
> will succeed.
>

I'd just like to point out that the purpose of the project quotas feature
as I understand it, is to apply quotas to subtrees, where container storage
is a very common private case of project subtree.

The purpose is NOT to create a "project" of random files in random
paths.

My point is that changing the project id of a non-dir child to be different
from the project id of its parent is a pretty rare use case (I think?).

If changing the projid of non-dir is needed for moving it to a
different subtree,
we could allow renameat2(2) of non-dir with no hardlinks to implicitly
change its
inherited project id or explicitly with a flag for a hardlink, e.g.:
renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID).

Which leaves us only with the use cases of:
1. Share some inode (as hardlinks) among projects
2. Recursively changing a subtree projid

#1 could be allowed with a flag to linkat(2) or hardlink within a shared
project subtree and rename the hardlink out of the shared project
subtree with renameat2(2) and explicit flag, e.g.:
renameat2(olddirfd, name, newdirfd, name, RENAME_OLD_PROJID).

#2 *could technically* be done by the same rename flag that
will allow rename to the same path, e.g:
renameat2(dirfd, name, dirfd, name, RENAME_NEW_PROJID).

It's not pretty and I personally prefer the syscall solution.
Just wanted to put it out there.

Thanks,
Amir.
Dave Chinner June 6, 2024, 2:27 a.m. UTC | #29
On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote:
> On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote:
> > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote:
> > > > I do -- allowing unpriviledged users to create symlinks that consume
> > > > icount (and possibly bcount) in the root project breaks the entire
> > > > enforcement mechanism.  That's not the way that project quota has worked
> > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
> > > > only for these special cases.
> > >
> > > OK, fair enough. I though someone will hate this. I'd just like to
> > > understand one thing: Owner of the inode can change the project ID to 0
> > > anyway so project quotas are more like a cooperative space tracking scheme
> > > anyway. If you want to escape it, you can. So what are you exactly worried
> > > about? Is it the container usecase where from within the user namespace you
> > > cannot change project IDs?
> >
> > Yep.
> >
> > > Anyway I just wanted to have an explicit decision that the simple solution
> > > is not good enough before we go the more complex route ;).
> >
> > Also, every now and then someone comes along and half-proposes making it
> > so that non-root cannot change project ids anymore.  Maybe some day that
> > will succeed.
> >
> 
> I'd just like to point out that the purpose of the project quotas feature
> as I understand it, is to apply quotas to subtrees, where container storage
> is a very common private case of project subtree.

That is the most modern use case, yes.

[ And for a walk down history lane.... ]

> The purpose is NOT to create a "project" of random files in random
> paths.

This is *exactly* the original use case that project quotas were
designed for back on Irix in the early 1990s and is the original
behaviour project quotas brought to Linux.

Project quota inheritance didn't come along until 2005:

commit 65f1866a3a8e512d43795c116bfef262e703b789
Author: Nathan Scott <nathans@sgi.com>
Date:   Fri Jun 3 06:04:22 2005 +0000

    Add support for project quota inheritance, a merge of Glens changes.
    Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd.

And full support for directory tree quotas using project IDs wasn't
fully introduced until a year later in 2006:

commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6
Author: Nathan Scott <nathans@sgi.com>
Date:   Tue May 30 15:54:53 2006 +0000

    statvfs component of directory/project quota support, code originally by Glen.
    Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd.

These changes were largely done for an SGI NAS product that allowed
us to create one great big XFS filesystem and then create
arbitrarily sized, thin provisoned  "NFS volumes"  as directory
quota controlled subdirs instantenously. The directory tree quota
defined the size of the volume, and so we could also grow and shrink
them instantenously, too. And we could remove them instantenously
via background garbage collection after the export was removed and
the user had been told it had been destroyed.

So that was the original use case for directory tree quotas on XFS -
providing scalable, fast management of "thin" storage for a NAS
product. Projects quotas had been used for accounting random
colections of files for over a decade before this directory quota
construct was created, and the "modern" container use cases for
directory quotas didn't come along until almost a decade after this
capability was added.

> My point is that changing the project id of a non-dir child to be different
> from the project id of its parent is a pretty rare use case (I think?).

Not if you are using project quotas as they were originally intended
to be used.

> If changing the projid of non-dir is needed for moving it to a
> different subtree,
> we could allow renameat2(2) of non-dir with no hardlinks to implicitly
> change its
> inherited project id or explicitly with a flag for a hardlink, e.g.:
> renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID).

Why?

The only reason XFS returns -EXDEV to rename across project IDs is
because nobody wanted to spend the time to work out how to do the
quota accounting of the metadata changed in the rename operation
accurately. So for that rare case (not something that would happen
on the NAS product) we returned -EXDEV to trigger the mv command to
copy the file to the destination and then unlink the source instead,
thereby handling all the quota accounting correctly.

IOWs, this whole "-EXDEV on rename across parent project quota
boundaries" is an implementation detail and nothing more.
Filesystems that implement project quotas and the directory tree
sub-variant don't need to behave like this if they can accurately
account for the quota ID changes during an atomic rename operation.
If that's too hard, then the fallback is to return -EXDEV and let
userspace do it the slow way which will always acocunt the resource
usage correctly to the individual projects.

Hence I think we should just fix the XFS kernel behaviour to do the
right thing in this special file case rather than return -EXDEV and
then forget about the rest of it. Sure, update xfs_repair to fix the
special file project id issue if it trips over it, but other than
that I don't think we need anything more. If fixing it requires new
syscalls and tools, then that's much harder to backport to old
kernels and distros than just backporting a couple of small XFS
kernel patches...

-Dave.
Darrick J. Wong June 6, 2024, 10:54 p.m. UTC | #30
On Thu, Jun 06, 2024 at 12:27:38PM +1000, Dave Chinner wrote:
> On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote:
> > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote:
> > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote:
> > > > > I do -- allowing unpriviledged users to create symlinks that consume
> > > > > icount (and possibly bcount) in the root project breaks the entire
> > > > > enforcement mechanism.  That's not the way that project quota has worked
> > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
> > > > > only for these special cases.
> > > >
> > > > OK, fair enough. I though someone will hate this. I'd just like to
> > > > understand one thing: Owner of the inode can change the project ID to 0
> > > > anyway so project quotas are more like a cooperative space tracking scheme
> > > > anyway. If you want to escape it, you can. So what are you exactly worried
> > > > about? Is it the container usecase where from within the user namespace you
> > > > cannot change project IDs?
> > >
> > > Yep.
> > >
> > > > Anyway I just wanted to have an explicit decision that the simple solution
> > > > is not good enough before we go the more complex route ;).
> > >
> > > Also, every now and then someone comes along and half-proposes making it
> > > so that non-root cannot change project ids anymore.  Maybe some day that
> > > will succeed.
> > >
> > 
> > I'd just like to point out that the purpose of the project quotas feature
> > as I understand it, is to apply quotas to subtrees, where container storage
> > is a very common private case of project subtree.
> 
> That is the most modern use case, yes.
> 
> [ And for a walk down history lane.... ]

Could you take all your institutional knowledge and paste it into a
Documentation/ file that we can use as a starting point for what exactly
does the project quota design do, please?  I'm betting the other two
filesystems that implement it (ext4/f2fs) don't quite implement the same
behaviors.

--D

> > The purpose is NOT to create a "project" of random files in random
> > paths.
> 
> This is *exactly* the original use case that project quotas were
> designed for back on Irix in the early 1990s and is the original
> behaviour project quotas brought to Linux.
> 
> Project quota inheritance didn't come along until 2005:
> 
> commit 65f1866a3a8e512d43795c116bfef262e703b789
> Author: Nathan Scott <nathans@sgi.com>
> Date:   Fri Jun 3 06:04:22 2005 +0000
> 
>     Add support for project quota inheritance, a merge of Glens changes.
>     Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd.
> 
> And full support for directory tree quotas using project IDs wasn't
> fully introduced until a year later in 2006:
> 
> commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6
> Author: Nathan Scott <nathans@sgi.com>
> Date:   Tue May 30 15:54:53 2006 +0000
> 
>     statvfs component of directory/project quota support, code originally by Glen.
>     Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd.
> 
> These changes were largely done for an SGI NAS product that allowed
> us to create one great big XFS filesystem and then create
> arbitrarily sized, thin provisoned  "NFS volumes"  as directory
> quota controlled subdirs instantenously. The directory tree quota
> defined the size of the volume, and so we could also grow and shrink
> them instantenously, too. And we could remove them instantenously
> via background garbage collection after the export was removed and
> the user had been told it had been destroyed.
> 
> So that was the original use case for directory tree quotas on XFS -
> providing scalable, fast management of "thin" storage for a NAS
> product. Projects quotas had been used for accounting random
> colections of files for over a decade before this directory quota
> construct was created, and the "modern" container use cases for
> directory quotas didn't come along until almost a decade after this
> capability was added.
> 
> > My point is that changing the project id of a non-dir child to be different
> > from the project id of its parent is a pretty rare use case (I think?).
> 
> Not if you are using project quotas as they were originally intended
> to be used.
> 
> > If changing the projid of non-dir is needed for moving it to a
> > different subtree,
> > we could allow renameat2(2) of non-dir with no hardlinks to implicitly
> > change its
> > inherited project id or explicitly with a flag for a hardlink, e.g.:
> > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID).
> 
> Why?
> 
> The only reason XFS returns -EXDEV to rename across project IDs is
> because nobody wanted to spend the time to work out how to do the
> quota accounting of the metadata changed in the rename operation
> accurately. So for that rare case (not something that would happen
> on the NAS product) we returned -EXDEV to trigger the mv command to
> copy the file to the destination and then unlink the source instead,
> thereby handling all the quota accounting correctly.
> 
> IOWs, this whole "-EXDEV on rename across parent project quota
> boundaries" is an implementation detail and nothing more.
> Filesystems that implement project quotas and the directory tree
> sub-variant don't need to behave like this if they can accurately
> account for the quota ID changes during an atomic rename operation.
> If that's too hard, then the fallback is to return -EXDEV and let
> userspace do it the slow way which will always acocunt the resource
> usage correctly to the individual projects.
> 
> Hence I think we should just fix the XFS kernel behaviour to do the
> right thing in this special file case rather than return -EXDEV and
> then forget about the rest of it. Sure, update xfs_repair to fix the
> special file project id issue if it trips over it, but other than
> that I don't think we need anything more. If fixing it requires new
> syscalls and tools, then that's much harder to backport to old
> kernels and distros than just backporting a couple of small XFS
> kernel patches...
> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Amir Goldstein June 7, 2024, 6:17 a.m. UTC | #31
On Thu, Jun 6, 2024 at 5:27 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote:
> > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote:
> > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote:
> > > > > I do -- allowing unpriviledged users to create symlinks that consume
> > > > > icount (and possibly bcount) in the root project breaks the entire
> > > > > enforcement mechanism.  That's not the way that project quota has worked
> > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
> > > > > only for these special cases.
> > > >
> > > > OK, fair enough. I though someone will hate this. I'd just like to
> > > > understand one thing: Owner of the inode can change the project ID to 0
> > > > anyway so project quotas are more like a cooperative space tracking scheme
> > > > anyway. If you want to escape it, you can. So what are you exactly worried
> > > > about? Is it the container usecase where from within the user namespace you
> > > > cannot change project IDs?
> > >
> > > Yep.
> > >
> > > > Anyway I just wanted to have an explicit decision that the simple solution
> > > > is not good enough before we go the more complex route ;).
> > >
> > > Also, every now and then someone comes along and half-proposes making it
> > > so that non-root cannot change project ids anymore.  Maybe some day that
> > > will succeed.
> > >
> >
> > I'd just like to point out that the purpose of the project quotas feature
> > as I understand it, is to apply quotas to subtrees, where container storage
> > is a very common private case of project subtree.
>
> That is the most modern use case, yes.
>
> [ And for a walk down history lane.... ]
>
> > The purpose is NOT to create a "project" of random files in random
> > paths.
>
> This is *exactly* the original use case that project quotas were
> designed for back on Irix in the early 1990s and is the original
> behaviour project quotas brought to Linux.
>
> Project quota inheritance didn't come along until 2005:
>
> commit 65f1866a3a8e512d43795c116bfef262e703b789
> Author: Nathan Scott <nathans@sgi.com>
> Date:   Fri Jun 3 06:04:22 2005 +0000
>
>     Add support for project quota inheritance, a merge of Glens changes.
>     Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd.
>
> And full support for directory tree quotas using project IDs wasn't
> fully introduced until a year later in 2006:
>
> commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6
> Author: Nathan Scott <nathans@sgi.com>
> Date:   Tue May 30 15:54:53 2006 +0000
>
>     statvfs component of directory/project quota support, code originally by Glen.
>     Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd.
>
> These changes were largely done for an SGI NAS product that allowed
> us to create one great big XFS filesystem and then create
> arbitrarily sized, thin provisoned  "NFS volumes"  as directory
> quota controlled subdirs instantenously. The directory tree quota
> defined the size of the volume, and so we could also grow and shrink
> them instantenously, too. And we could remove them instantenously
> via background garbage collection after the export was removed and
> the user had been told it had been destroyed.
>
> So that was the original use case for directory tree quotas on XFS -
> providing scalable, fast management of "thin" storage for a NAS
> product. Projects quotas had been used for accounting random
> colections of files for over a decade before this directory quota
> construct was created, and the "modern" container use cases for
> directory quotas didn't come along until almost a decade after this
> capability was added.
>

Cool. Didn't know all of this.
Lucky for us, those historic use cases are well distinguished from
the modern subtree use case by the opt-in PROJINHERIT bit.
So as long as PROJINHERIT is set, my assumptions mostly hold(?)

> > My point is that changing the project id of a non-dir child to be different
> > from the project id of its parent is a pretty rare use case (I think?).
>
> Not if you are using project quotas as they were originally intended
> to be used.
>

Rephrase then:

Changing the projid of a non-dir child to be different from the projid
of its parent, which has PROJINHERIT bit set, is a pretty rare use case(?)

> > If changing the projid of non-dir is needed for moving it to a
> > different subtree,
> > we could allow renameat2(2) of non-dir with no hardlinks to implicitly
> > change its
> > inherited project id or explicitly with a flag for a hardlink, e.g.:
> > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID).
>
> Why?
>
> The only reason XFS returns -EXDEV to rename across project IDs is
> because nobody wanted to spend the time to work out how to do the
> quota accounting of the metadata changed in the rename operation
> accurately. So for that rare case (not something that would happen
> on the NAS product) we returned -EXDEV to trigger the mv command to
> copy the file to the destination and then unlink the source instead,
> thereby handling all the quota accounting correctly.
>
> IOWs, this whole "-EXDEV on rename across parent project quota
> boundaries" is an implementation detail and nothing more.
> Filesystems that implement project quotas and the directory tree
> sub-variant don't need to behave like this if they can accurately
> account for the quota ID changes during an atomic rename operation.
> If that's too hard, then the fallback is to return -EXDEV and let
> userspace do it the slow way which will always acocunt the resource
> usage correctly to the individual projects.
>
> Hence I think we should just fix the XFS kernel behaviour to do the
> right thing in this special file case rather than return -EXDEV and
> then forget about the rest of it. Sure, update xfs_repair to fix the
> special file project id issue if it trips over it, but other than
> that I don't think we need anything more. If fixing it requires new
> syscalls and tools, then that's much harder to backport to old
> kernels and distros than just backporting a couple of small XFS
> kernel patches...
>

I assume that by "fix the XFS behavior" you mean
"we could allow renameat2(2) of non-dir with no hardlinks to implicitly
 change its inherited project id"?
(in case the new parent has the PROJINHERIT bit)
so that the RENAME_NEW_PROJID behavior would be implicit.

Unlike rename() from one parent to the other, link()+unlink()
is less obvious.

The "modern" use cases that I listed where implicit change of projid
does not suffice are:

1. Share some inodes (as hardlinks) among projects
2. Recursively changing a subtree projid

They could be implemented by explicit flags to renameat2()/linkat() and
they could be implemented by [gs]etfsxattrat(2) syscalls.

Thanks,
Amir.
Andrey Albershteyn June 10, 2024, 8:17 a.m. UTC | #32
On 2024-06-06 12:27:38, Dave Chinner wrote:
> On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote:
> > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote:
> > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote:
> > > > > I do -- allowing unpriviledged users to create symlinks that consume
> > > > > icount (and possibly bcount) in the root project breaks the entire
> > > > > enforcement mechanism.  That's not the way that project quota has worked
> > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
> > > > > only for these special cases.
> > > >
> > > > OK, fair enough. I though someone will hate this. I'd just like to
> > > > understand one thing: Owner of the inode can change the project ID to 0
> > > > anyway so project quotas are more like a cooperative space tracking scheme
> > > > anyway. If you want to escape it, you can. So what are you exactly worried
> > > > about? Is it the container usecase where from within the user namespace you
> > > > cannot change project IDs?
> > >
> > > Yep.
> > >
> > > > Anyway I just wanted to have an explicit decision that the simple solution
> > > > is not good enough before we go the more complex route ;).
> > >
> > > Also, every now and then someone comes along and half-proposes making it
> > > so that non-root cannot change project ids anymore.  Maybe some day that
> > > will succeed.
> > >
> > 
> > I'd just like to point out that the purpose of the project quotas feature
> > as I understand it, is to apply quotas to subtrees, where container storage
> > is a very common private case of project subtree.
> 
> That is the most modern use case, yes.
> 
> [ And for a walk down history lane.... ]
> 
> > The purpose is NOT to create a "project" of random files in random
> > paths.
> 
> This is *exactly* the original use case that project quotas were
> designed for back on Irix in the early 1990s and is the original
> behaviour project quotas brought to Linux.
> 
> Project quota inheritance didn't come along until 2005:
> 
> commit 65f1866a3a8e512d43795c116bfef262e703b789
> Author: Nathan Scott <nathans@sgi.com>
> Date:   Fri Jun 3 06:04:22 2005 +0000
> 
>     Add support for project quota inheritance, a merge of Glens changes.
>     Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd.
> 
> And full support for directory tree quotas using project IDs wasn't
> fully introduced until a year later in 2006:
> 
> commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6
> Author: Nathan Scott <nathans@sgi.com>
> Date:   Tue May 30 15:54:53 2006 +0000
> 
>     statvfs component of directory/project quota support, code originally by Glen.
>     Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd.
> 
> These changes were largely done for an SGI NAS product that allowed
> us to create one great big XFS filesystem and then create
> arbitrarily sized, thin provisoned  "NFS volumes"  as directory
> quota controlled subdirs instantenously. The directory tree quota
> defined the size of the volume, and so we could also grow and shrink
> them instantenously, too. And we could remove them instantenously
> via background garbage collection after the export was removed and
> the user had been told it had been destroyed.
> 
> So that was the original use case for directory tree quotas on XFS -
> providing scalable, fast management of "thin" storage for a NAS
> product. Projects quotas had been used for accounting random
> colections of files for over a decade before this directory quota
> construct was created, and the "modern" container use cases for
> directory quotas didn't come along until almost a decade after this
> capability was added.

Thanks!

> 
> > My point is that changing the project id of a non-dir child to be different
> > from the project id of its parent is a pretty rare use case (I think?).
> 
> Not if you are using project quotas as they were originally intended
> to be used.
> 
> > If changing the projid of non-dir is needed for moving it to a
> > different subtree,
> > we could allow renameat2(2) of non-dir with no hardlinks to implicitly
> > change its
> > inherited project id or explicitly with a flag for a hardlink, e.g.:
> > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID).
> 
> Why?
> 
> The only reason XFS returns -EXDEV to rename across project IDs is
> because nobody wanted to spend the time to work out how to do the
> quota accounting of the metadata changed in the rename operation
> accurately. So for that rare case (not something that would happen
> on the NAS product) we returned -EXDEV to trigger the mv command to
> copy the file to the destination and then unlink the source instead,
> thereby handling all the quota accounting correctly.
> 
> IOWs, this whole "-EXDEV on rename across parent project quota
> boundaries" is an implementation detail and nothing more.
> Filesystems that implement project quotas and the directory tree
> sub-variant don't need to behave like this if they can accurately
> account for the quota ID changes during an atomic rename operation.
> If that's too hard, then the fallback is to return -EXDEV and let
> userspace do it the slow way which will always acocunt the resource
> usage correctly to the individual projects.
> 
> Hence I think we should just fix the XFS kernel behaviour to do the
> right thing in this special file case rather than return -EXDEV and
> then forget about the rest of it.

I see, I will look into that, this should solve the original issue.

But those special file's inodes still will not be accounted by the
quota during initial project setup (xfs_quota will skip them), would
it worth it adding new syscalls anyway?
Amir Goldstein June 10, 2024, 9:19 a.m. UTC | #33
On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote:
>
> On 2024-06-06 12:27:38, Dave Chinner wrote:
...
> >
> > The only reason XFS returns -EXDEV to rename across project IDs is
> > because nobody wanted to spend the time to work out how to do the
> > quota accounting of the metadata changed in the rename operation
> > accurately. So for that rare case (not something that would happen
> > on the NAS product) we returned -EXDEV to trigger the mv command to
> > copy the file to the destination and then unlink the source instead,
> > thereby handling all the quota accounting correctly.
> >
> > IOWs, this whole "-EXDEV on rename across parent project quota
> > boundaries" is an implementation detail and nothing more.
> > Filesystems that implement project quotas and the directory tree
> > sub-variant don't need to behave like this if they can accurately
> > account for the quota ID changes during an atomic rename operation.
> > If that's too hard, then the fallback is to return -EXDEV and let
> > userspace do it the slow way which will always acocunt the resource
> > usage correctly to the individual projects.
> >
> > Hence I think we should just fix the XFS kernel behaviour to do the
> > right thing in this special file case rather than return -EXDEV and
> > then forget about the rest of it.
>
> I see, I will look into that, this should solve the original issue.

I see that you already got Darrick's RVB on the original patch:
https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/

What is missing then?
A similar patch for rename() that allows rename of zero projid special
file as long as (target_dp->i_projid == src_dp->i_projid)?

In theory, it would have been nice to fix the zero projid during the
above link() and rename() operations, but it would be more challenging
and I see no reason to do that if all the other files remain with zero
projid after initial project setup (i.e. if not implementing the syscalls).

>
> But those special file's inodes still will not be accounted by the
> quota during initial project setup (xfs_quota will skip them), would
> it worth it adding new syscalls anyway?
>

Is it worth it to you?

Adding those new syscalls means adding tests and documentation
and handle all the bugs later.

If nobody cared about accounting of special files inodes so far,
there is no proof that anyone will care that you put in all this work.

Thanks,
Amir.
Andrey Albershteyn June 10, 2024, 11:50 a.m. UTC | #34
On 2024-06-10 12:19:50, Amir Goldstein wrote:
> On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> >
> > On 2024-06-06 12:27:38, Dave Chinner wrote:
> ...
> > >
> > > The only reason XFS returns -EXDEV to rename across project IDs is
> > > because nobody wanted to spend the time to work out how to do the
> > > quota accounting of the metadata changed in the rename operation
> > > accurately. So for that rare case (not something that would happen
> > > on the NAS product) we returned -EXDEV to trigger the mv command to
> > > copy the file to the destination and then unlink the source instead,
> > > thereby handling all the quota accounting correctly.
> > >
> > > IOWs, this whole "-EXDEV on rename across parent project quota
> > > boundaries" is an implementation detail and nothing more.
> > > Filesystems that implement project quotas and the directory tree
> > > sub-variant don't need to behave like this if they can accurately
> > > account for the quota ID changes during an atomic rename operation.
> > > If that's too hard, then the fallback is to return -EXDEV and let
> > > userspace do it the slow way which will always acocunt the resource
> > > usage correctly to the individual projects.
> > >
> > > Hence I think we should just fix the XFS kernel behaviour to do the
> > > right thing in this special file case rather than return -EXDEV and
> > > then forget about the rest of it.
> >
> > I see, I will look into that, this should solve the original issue.
> 
> I see that you already got Darrick's RVB on the original patch:
> https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/
> 
> What is missing then?
> A similar patch for rename() that allows rename of zero projid special
> file as long as (target_dp->i_projid == src_dp->i_projid)?
> 
> In theory, it would have been nice to fix the zero projid during the
> above link() and rename() operations, but it would be more challenging
> and I see no reason to do that if all the other files remain with zero
> projid after initial project setup (i.e. if not implementing the syscalls).

I think Dave suggests to get rid of this if-guard and allow
link()/rename() for special files but with correct quota calculation.

> 
> >
> > But those special file's inodes still will not be accounted by the
> > quota during initial project setup (xfs_quota will skip them), would
> > it worth it adding new syscalls anyway?
> >
> 
> Is it worth it to you?
> 
> Adding those new syscalls means adding tests and documentation
> and handle all the bugs later.
> 
> If nobody cared about accounting of special files inodes so far,
> there is no proof that anyone will care that you put in all this work.

I already have patch and some simple man-pages prepared, I'm
wondering if this would be useful for any other usecases which would
require setting extended attributes on spec indodes.

> 
> Thanks,
> Amir.
>
Amir Goldstein June 10, 2024, 1:21 p.m. UTC | #35
On Mon, Jun 10, 2024 at 2:50 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
>
> On 2024-06-10 12:19:50, Amir Goldstein wrote:
> > On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> > >
> > > On 2024-06-06 12:27:38, Dave Chinner wrote:
> > ...
> > > >
> > > > The only reason XFS returns -EXDEV to rename across project IDs is
> > > > because nobody wanted to spend the time to work out how to do the
> > > > quota accounting of the metadata changed in the rename operation
> > > > accurately. So for that rare case (not something that would happen
> > > > on the NAS product) we returned -EXDEV to trigger the mv command to
> > > > copy the file to the destination and then unlink the source instead,
> > > > thereby handling all the quota accounting correctly.
> > > >
> > > > IOWs, this whole "-EXDEV on rename across parent project quota
> > > > boundaries" is an implementation detail and nothing more.
> > > > Filesystems that implement project quotas and the directory tree
> > > > sub-variant don't need to behave like this if they can accurately
> > > > account for the quota ID changes during an atomic rename operation.
> > > > If that's too hard, then the fallback is to return -EXDEV and let
> > > > userspace do it the slow way which will always acocunt the resource
> > > > usage correctly to the individual projects.
> > > >
> > > > Hence I think we should just fix the XFS kernel behaviour to do the
> > > > right thing in this special file case rather than return -EXDEV and
> > > > then forget about the rest of it.
> > >
> > > I see, I will look into that, this should solve the original issue.
> >
> > I see that you already got Darrick's RVB on the original patch:
> > https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/
> >
> > What is missing then?
> > A similar patch for rename() that allows rename of zero projid special
> > file as long as (target_dp->i_projid == src_dp->i_projid)?
> >
> > In theory, it would have been nice to fix the zero projid during the
> > above link() and rename() operations, but it would be more challenging
> > and I see no reason to do that if all the other files remain with zero
> > projid after initial project setup (i.e. if not implementing the syscalls).
>
> I think Dave suggests to get rid of this if-guard and allow
> link()/rename() for special files but with correct quota calculation.
>
> >
> > >
> > > But those special file's inodes still will not be accounted by the
> > > quota during initial project setup (xfs_quota will skip them), would
> > > it worth it adding new syscalls anyway?
> > >
> >
> > Is it worth it to you?
> >
> > Adding those new syscalls means adding tests and documentation
> > and handle all the bugs later.
> >
> > If nobody cared about accounting of special files inodes so far,
> > there is no proof that anyone will care that you put in all this work.
>
> I already have patch and some simple man-pages prepared, I'm
> wondering if this would be useful for any other usecases

Yes, I personally find it useful.
I have applications that query the fsx_xflags and would rather
be able to use O_PATH to query/set those flags, since
internally in vfs, fileattr_[gs]et() do not really need an open file.

> which would
> require setting extended attributes on spec indodes.

Please do not use the terminology "extended attributes" in the man page
to describe struct fsxattr.
Better follow the "additional attributes" terminology of xfs ioctl man page [1],
even though it is already confusing enough w.r.t "extended attributes" IMO.

Thanks,
Amir.

[1] https://man7.org/linux/man-pages/man2/ioctl_xfs_fsgetxattr.2.html
Jan Kara June 10, 2024, 2:44 p.m. UTC | #36
On Mon 10-06-24 16:21:39, Amir Goldstein wrote:
> On Mon, Jun 10, 2024 at 2:50 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> >
> > On 2024-06-10 12:19:50, Amir Goldstein wrote:
> > > On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> > > >
> > > > But those special file's inodes still will not be accounted by the
> > > > quota during initial project setup (xfs_quota will skip them), would
> > > > it worth it adding new syscalls anyway?
> > > >
> > >
> > > Is it worth it to you?
> > >
> > > Adding those new syscalls means adding tests and documentation
> > > and handle all the bugs later.
> > >
> > > If nobody cared about accounting of special files inodes so far,
> > > there is no proof that anyone will care that you put in all this work.
> >
> > I already have patch and some simple man-pages prepared, I'm
> > wondering if this would be useful for any other usecases
> 
> Yes, I personally find it useful.
> I have applications that query the fsx_xflags and would rather
> be able to use O_PATH to query/set those flags, since
> internally in vfs, fileattr_[gs]et() do not really need an open file.

Well, Christian doesn't like [1] how much functionality is actually
available through O_PATH fds because it kind of makes them more and more
similar to normal fds and that's apparently undesirable for some usecases
(for security reasons). So I don't think he'd like to have *another*
functionality accepted for O_PATH fds :) But he can speak for himself...

								Honza

[1] https://lore.kernel.org/all/20240412-labeln-filmabend-42422ec453d7@brauner
[2] https://lore.kernel.org/all/20240430-machbar-jogginganzug-7fd3cff2c3ed@brauner
Darrick J. Wong June 10, 2024, 8:26 p.m. UTC | #37
On Mon, Jun 10, 2024 at 04:21:39PM +0300, Amir Goldstein wrote:
> On Mon, Jun 10, 2024 at 2:50 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> >
> > On 2024-06-10 12:19:50, Amir Goldstein wrote:
> > > On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> > > >
> > > > On 2024-06-06 12:27:38, Dave Chinner wrote:
> > > ...
> > > > >
> > > > > The only reason XFS returns -EXDEV to rename across project IDs is
> > > > > because nobody wanted to spend the time to work out how to do the
> > > > > quota accounting of the metadata changed in the rename operation
> > > > > accurately. So for that rare case (not something that would happen
> > > > > on the NAS product) we returned -EXDEV to trigger the mv command to
> > > > > copy the file to the destination and then unlink the source instead,
> > > > > thereby handling all the quota accounting correctly.
> > > > >
> > > > > IOWs, this whole "-EXDEV on rename across parent project quota
> > > > > boundaries" is an implementation detail and nothing more.
> > > > > Filesystems that implement project quotas and the directory tree
> > > > > sub-variant don't need to behave like this if they can accurately
> > > > > account for the quota ID changes during an atomic rename operation.
> > > > > If that's too hard, then the fallback is to return -EXDEV and let
> > > > > userspace do it the slow way which will always acocunt the resource
> > > > > usage correctly to the individual projects.
> > > > >
> > > > > Hence I think we should just fix the XFS kernel behaviour to do the
> > > > > right thing in this special file case rather than return -EXDEV and
> > > > > then forget about the rest of it.
> > > >
> > > > I see, I will look into that, this should solve the original issue.
> > >
> > > I see that you already got Darrick's RVB on the original patch:
> > > https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/
> > >
> > > What is missing then?
> > > A similar patch for rename() that allows rename of zero projid special
> > > file as long as (target_dp->i_projid == src_dp->i_projid)?
> > >
> > > In theory, it would have been nice to fix the zero projid during the
> > > above link() and rename() operations, but it would be more challenging
> > > and I see no reason to do that if all the other files remain with zero
> > > projid after initial project setup (i.e. if not implementing the syscalls).
> >
> > I think Dave suggests to get rid of this if-guard and allow
> > link()/rename() for special files but with correct quota calculation.
> >
> > >
> > > >
> > > > But those special file's inodes still will not be accounted by the
> > > > quota during initial project setup (xfs_quota will skip them), would
> > > > it worth it adding new syscalls anyway?
> > > >
> > >
> > > Is it worth it to you?
> > >
> > > Adding those new syscalls means adding tests and documentation
> > > and handle all the bugs later.
> > >
> > > If nobody cared about accounting of special files inodes so far,
> > > there is no proof that anyone will care that you put in all this work.
> >
> > I already have patch and some simple man-pages prepared, I'm
> > wondering if this would be useful for any other usecases
> 
> Yes, I personally find it useful.
> I have applications that query the fsx_xflags and would rather
> be able to use O_PATH to query/set those flags, since
> internally in vfs, fileattr_[gs]et() do not really need an open file.
> 
> > which would
> > require setting extended attributes on spec indodes.
> 
> Please do not use the terminology "extended attributes" in the man page
> to describe struct fsxattr.

"XFS file attributes" perhaps?

Though that's anachronistic since ext4 supports /some/ of them now.

--D

> Better follow the "additional attributes" terminology of xfs ioctl man page [1],
> even though it is already confusing enough w.r.t "extended attributes" IMO.
> 
> Thanks,
> Amir.
> 
> [1] https://man7.org/linux/man-pages/man2/ioctl_xfs_fsgetxattr.2.html
>
Amir Goldstein June 11, 2024, 7:57 a.m. UTC | #38
On Mon, Jun 10, 2024 at 11:26 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Mon, Jun 10, 2024 at 04:21:39PM +0300, Amir Goldstein wrote:
> > On Mon, Jun 10, 2024 at 2:50 PM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> > >
> > > On 2024-06-10 12:19:50, Amir Goldstein wrote:
> > > > On Mon, Jun 10, 2024 at 11:17 AM Andrey Albershteyn <aalbersh@redhat.com> wrote:
> > > > >
> > > > > On 2024-06-06 12:27:38, Dave Chinner wrote:
> > > > ...
> > > > > >
> > > > > > The only reason XFS returns -EXDEV to rename across project IDs is
> > > > > > because nobody wanted to spend the time to work out how to do the
> > > > > > quota accounting of the metadata changed in the rename operation
> > > > > > accurately. So for that rare case (not something that would happen
> > > > > > on the NAS product) we returned -EXDEV to trigger the mv command to
> > > > > > copy the file to the destination and then unlink the source instead,
> > > > > > thereby handling all the quota accounting correctly.
> > > > > >
> > > > > > IOWs, this whole "-EXDEV on rename across parent project quota
> > > > > > boundaries" is an implementation detail and nothing more.
> > > > > > Filesystems that implement project quotas and the directory tree
> > > > > > sub-variant don't need to behave like this if they can accurately
> > > > > > account for the quota ID changes during an atomic rename operation.
> > > > > > If that's too hard, then the fallback is to return -EXDEV and let
> > > > > > userspace do it the slow way which will always acocunt the resource
> > > > > > usage correctly to the individual projects.
> > > > > >
> > > > > > Hence I think we should just fix the XFS kernel behaviour to do the
> > > > > > right thing in this special file case rather than return -EXDEV and
> > > > > > then forget about the rest of it.
> > > > >
> > > > > I see, I will look into that, this should solve the original issue.
> > > >
> > > > I see that you already got Darrick's RVB on the original patch:
> > > > https://lore.kernel.org/linux-xfs/20240315024826.GA1927156@frogsfrogsfrogs/
> > > >
> > > > What is missing then?
> > > > A similar patch for rename() that allows rename of zero projid special
> > > > file as long as (target_dp->i_projid == src_dp->i_projid)?
> > > >
> > > > In theory, it would have been nice to fix the zero projid during the
> > > > above link() and rename() operations, but it would be more challenging
> > > > and I see no reason to do that if all the other files remain with zero
> > > > projid after initial project setup (i.e. if not implementing the syscalls).
> > >
> > > I think Dave suggests to get rid of this if-guard and allow
> > > link()/rename() for special files but with correct quota calculation.
> > >
> > > >
> > > > >
> > > > > But those special file's inodes still will not be accounted by the
> > > > > quota during initial project setup (xfs_quota will skip them), would
> > > > > it worth it adding new syscalls anyway?
> > > > >
> > > >
> > > > Is it worth it to you?
> > > >
> > > > Adding those new syscalls means adding tests and documentation
> > > > and handle all the bugs later.
> > > >
> > > > If nobody cared about accounting of special files inodes so far,
> > > > there is no proof that anyone will care that you put in all this work.
> > >
> > > I already have patch and some simple man-pages prepared, I'm
> > > wondering if this would be useful for any other usecases
> >
> > Yes, I personally find it useful.
> > I have applications that query the fsx_xflags and would rather
> > be able to use O_PATH to query/set those flags, since
> > internally in vfs, fileattr_[gs]et() do not really need an open file.
> >
> > > which would
> > > require setting extended attributes on spec indodes.
> >
> > Please do not use the terminology "extended attributes" in the man page
> > to describe struct fsxattr.
>
> "XFS file attributes" perhaps?
>
> Though that's anachronistic since ext4 supports /some/ of them now.
>

Technically, it's all the filesystems that support ->fileattr_[gs]et(),
which is a lot.

Since the feature has matured into a vfs feature with planned new syscalls,
the XFS ioctl man page could be referenced for historic perspective and
for explaining the reason for the X in fsxattr, but I don't think that
the official
name of those attributes should include "XFS".

Maybe TAFKAXFA (The attributes formerly known as XFS file attributes) :-p

Also, adding a syscall to perform FS_IOC_FS[GS]ETXATTR, without
being able to perform FS_IOC_[GS]ETFLAGS would be a bit odd IMO.
It is possible to multiplex both in the same syscall.
For example, let the syscall return the size of the attributes, like getxattr().
Not sure whether it is smart to do it.

Then again, not sure if this was already mentioned as an option, besides
the more generic proposal by Miklos [1], but we could bind generic xattr
handlers for "system.fs.attr" and "system.fs.flags".
API-wise, ->fileattr_[gs]et() would become very similar to ->[gs]et_acl(),
which are also get/set via "system.*" xattr.

The added benefit is that we avoid the confusing distinctions between
"additional attributes" and "extended attributes" -
"additional file attributes/flags" are a private case of "extended attributes".

Implementation wise it should be pretty simple, but I agree with Jan that
the question of credentials needed for these syscalls needs to be
addressed first and get Christian's blessing.

Thanks,
Amir.

[1] https://lore.kernel.org/linux-fsdevel/YnEeuw6fd1A8usjj@miu.piliscsaba.redhat.com/
Dave Chinner June 11, 2024, 11:40 p.m. UTC | #39
On Fri, Jun 07, 2024 at 09:17:34AM +0300, Amir Goldstein wrote:
> On Thu, Jun 6, 2024 at 5:27 AM Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote:
> > > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote:
> > > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote:
> > > > > > I do -- allowing unpriviledged users to create symlinks that consume
> > > > > > icount (and possibly bcount) in the root project breaks the entire
> > > > > > enforcement mechanism.  That's not the way that project quota has worked
> > > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
> > > > > > only for these special cases.
> > > > >
> > > > > OK, fair enough. I though someone will hate this. I'd just like to
> > > > > understand one thing: Owner of the inode can change the project ID to 0
> > > > > anyway so project quotas are more like a cooperative space tracking scheme
> > > > > anyway. If you want to escape it, you can. So what are you exactly worried
> > > > > about? Is it the container usecase where from within the user namespace you
> > > > > cannot change project IDs?
> > > >
> > > > Yep.
> > > >
> > > > > Anyway I just wanted to have an explicit decision that the simple solution
> > > > > is not good enough before we go the more complex route ;).
> > > >
> > > > Also, every now and then someone comes along and half-proposes making it
> > > > so that non-root cannot change project ids anymore.  Maybe some day that
> > > > will succeed.
> > > >
> > >
> > > I'd just like to point out that the purpose of the project quotas feature
> > > as I understand it, is to apply quotas to subtrees, where container storage
> > > is a very common private case of project subtree.
> >
> > That is the most modern use case, yes.
> >
> > [ And for a walk down history lane.... ]
> >
> > > The purpose is NOT to create a "project" of random files in random
> > > paths.
> >
> > This is *exactly* the original use case that project quotas were
> > designed for back on Irix in the early 1990s and is the original
> > behaviour project quotas brought to Linux.
> >
> > Project quota inheritance didn't come along until 2005:
> >
> > commit 65f1866a3a8e512d43795c116bfef262e703b789
> > Author: Nathan Scott <nathans@sgi.com>
> > Date:   Fri Jun 3 06:04:22 2005 +0000
> >
> >     Add support for project quota inheritance, a merge of Glens changes.
> >     Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd.
> >
> > And full support for directory tree quotas using project IDs wasn't
> > fully introduced until a year later in 2006:
> >
> > commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6
> > Author: Nathan Scott <nathans@sgi.com>
> > Date:   Tue May 30 15:54:53 2006 +0000
> >
> >     statvfs component of directory/project quota support, code originally by Glen.
> >     Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd.
> >
> > These changes were largely done for an SGI NAS product that allowed
> > us to create one great big XFS filesystem and then create
> > arbitrarily sized, thin provisoned  "NFS volumes"  as directory
> > quota controlled subdirs instantenously. The directory tree quota
> > defined the size of the volume, and so we could also grow and shrink
> > them instantenously, too. And we could remove them instantenously
> > via background garbage collection after the export was removed and
> > the user had been told it had been destroyed.
> >
> > So that was the original use case for directory tree quotas on XFS -
> > providing scalable, fast management of "thin" storage for a NAS
> > product. Projects quotas had been used for accounting random
> > colections of files for over a decade before this directory quota
> > construct was created, and the "modern" container use cases for
> > directory quotas didn't come along until almost a decade after this
> > capability was added.
> >
> 
> Cool. Didn't know all of this.
> Lucky for us, those historic use cases are well distinguished from
> the modern subtree use case by the opt-in PROJINHERIT bit.
> So as long as PROJINHERIT is set, my assumptions mostly hold(?)

I'm not sure what assumptions you are making, so I can't really
make any binding statement on this except to say that the existance
of PROJINHERIT on directories does not mean strict directory quotas
are being used.

i.e.  PROJINHERIT is just a flag to automatically tag inodes with
aproject ID on creation. It -can- be used as a directory tree quota
if specific restrictions in the use of the projinherit flag are
enforced, but it can also be used for the original project ID use
case so users don't have to manually tag files they create or move
to a projet directory with the right project ID after the fact.

IOWs, directories that have the same projid and PROJINHERIT set
do not need to be in a single hierarchy, and hence  do not fit the
definition of a directory tree quota. e.g:

/projects/docs/project1
/projects/docs/project2
/projects/docs/project3
/projects/src/project1
/projects/src/project2
/projects/src/project3
/projects/build/project1
/projects/build/project2
/projects/build/project3
.....
/home/user1/project1
/home/user2/project3
/home/user6/project2

Now we have multiple different disjoint directory heirarchies with
the same project ID because they contain files belonging to a
specific project. This is still a random collection of files in
random paths that are accounted to a project ID, but it's also using
PROJINHERIT to assign the default project ID to files created in the
respective project directories.

The kernel has no idea that a project ID is associated with a single
directory tree. The kernel quota accounting itself simple sees
project IDs on inodes and accounts to that project ID. The create
and rename code simply see the parent PROJINHERIT bit and so need to
set up the new directory entry to be accounted to that different
project ID. It's all just mechanism in the kernel - applying project
quotas to enforce directory tree quotas is entirely a userspace
administration constraint....

The only thing we do to help userspace administration is restrict
changing project IDs to the init namespace, thereby preventing
containers from being able to manipulate project IDs. This allows
the init namespace to set the policy for project quota use and hence
allow it to be used for directory tree quotas for container space
management. This is essentially an extension of the original NAS
use case, in that NFS exports were the isolation barrier that hid
the underlying filesystem structure and project ID quota usage
from the end users....

> > > My point is that changing the project id of a non-dir child to be different
> > > from the project id of its parent is a pretty rare use case (I think?).
> >
> > Not if you are using project quotas as they were originally intended
> > to be used.
> >
> 
> Rephrase then:
> 
> Changing the projid of a non-dir child to be different from the projid
> of its parent, which has PROJINHERIT bit set, is a pretty rare use case(?)

I have no data to indicate how rare it might be - the kernel has
never enforced a policy that disallows changing project IDs when
PROJINHERIT is set and any user with write permission can change
project IDs even when PROJINHERIT is set. Hence we have to assume
that there are people out there that rely on this behaviour,
regardless of how rare it is...

> > > If changing the projid of non-dir is needed for moving it to a
> > > different subtree,
> > > we could allow renameat2(2) of non-dir with no hardlinks to implicitly
> > > change its
> > > inherited project id or explicitly with a flag for a hardlink, e.g.:
> > > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID).
> >
> > Why?
> >
> > The only reason XFS returns -EXDEV to rename across project IDs is
> > because nobody wanted to spend the time to work out how to do the
> > quota accounting of the metadata changed in the rename operation
> > accurately. So for that rare case (not something that would happen
> > on the NAS product) we returned -EXDEV to trigger the mv command to
> > copy the file to the destination and then unlink the source instead,
> > thereby handling all the quota accounting correctly.
> >
> > IOWs, this whole "-EXDEV on rename across parent project quota
> > boundaries" is an implementation detail and nothing more.
> > Filesystems that implement project quotas and the directory tree
> > sub-variant don't need to behave like this if they can accurately
> > account for the quota ID changes during an atomic rename operation.
> > If that's too hard, then the fallback is to return -EXDEV and let
> > userspace do it the slow way which will always acocunt the resource
> > usage correctly to the individual projects.
> >
> > Hence I think we should just fix the XFS kernel behaviour to do the
> > right thing in this special file case rather than return -EXDEV and
> > then forget about the rest of it. Sure, update xfs_repair to fix the
> > special file project id issue if it trips over it, but other than
> > that I don't think we need anything more. If fixing it requires new
> > syscalls and tools, then that's much harder to backport to old
> > kernels and distros than just backporting a couple of small XFS
> > kernel patches...
> >
> 
> I assume that by "fix the XFS behavior" you mean
> "we could allow renameat2(2) of non-dir with no hardlinks to implicitly
>  change its inherited project id"?
> (in case the new parent has the PROJINHERIT bit)
> so that the RENAME_NEW_PROJID behavior would be implicit.

No, I meant "fix the original hardlink of an inode with no project ID
within a PROJINHERIT directory always gets -EXDEV" by allowing
hard links when the source file has no project ID specified.

Actually, looking at 6.10-rc3, this has been merged already. So IMO
there's nothing more we need to do here.

> Unlike rename() from one parent to the other, link()+unlink()
> is less obvious.
>
> The "modern" use cases that I listed where implicit change of projid
> does not suffice are:
> 
> 1. Share some inodes (as hardlinks) among projects
> 2. Recursively changing a subtree projid
>
> They could be implemented by explicit flags to renameat2()/linkat() and
> they could be implemented by [gs]etfsxattrat(2) syscalls.

What are you expecting to happen when you hardlink an inode across
multiple PROJINHERIT directories?  I mean, an inode can only have
one project ID, so you can't link it into mulitple projects at once
if you are using PROJINHERIT (xfs_link() expressly forbids it).

So I'm really not sure what problem you are trying to describe or
solve here. Can you elaborate more?

-Dave.
Amir Goldstein June 12, 2024, 11:24 a.m. UTC | #40
On Wed, Jun 12, 2024 at 2:40 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Fri, Jun 07, 2024 at 09:17:34AM +0300, Amir Goldstein wrote:
> > On Thu, Jun 6, 2024 at 5:27 AM Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > On Wed, Jun 05, 2024 at 08:13:15AM +0300, Amir Goldstein wrote:
> > > > On Wed, Jun 5, 2024 at 3:38 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > On Tue, Jun 04, 2024 at 10:58:43AM +0200, Jan Kara wrote:
> > > > > > On Mon 03-06-24 10:42:59, Darrick J. Wong wrote:
> > > > > > > I do -- allowing unpriviledged users to create symlinks that consume
> > > > > > > icount (and possibly bcount) in the root project breaks the entire
> > > > > > > enforcement mechanism.  That's not the way that project quota has worked
> > > > > > > on xfs and it would be quite rude to nullify the PROJINHERIT flag bit
> > > > > > > only for these special cases.
> > > > > >
> > > > > > OK, fair enough. I though someone will hate this. I'd just like to
> > > > > > understand one thing: Owner of the inode can change the project ID to 0
> > > > > > anyway so project quotas are more like a cooperative space tracking scheme
> > > > > > anyway. If you want to escape it, you can. So what are you exactly worried
> > > > > > about? Is it the container usecase where from within the user namespace you
> > > > > > cannot change project IDs?
> > > > >
> > > > > Yep.
> > > > >
> > > > > > Anyway I just wanted to have an explicit decision that the simple solution
> > > > > > is not good enough before we go the more complex route ;).
> > > > >
> > > > > Also, every now and then someone comes along and half-proposes making it
> > > > > so that non-root cannot change project ids anymore.  Maybe some day that
> > > > > will succeed.
> > > > >
> > > >
> > > > I'd just like to point out that the purpose of the project quotas feature
> > > > as I understand it, is to apply quotas to subtrees, where container storage
> > > > is a very common private case of project subtree.
> > >
> > > That is the most modern use case, yes.
> > >
> > > [ And for a walk down history lane.... ]
> > >
> > > > The purpose is NOT to create a "project" of random files in random
> > > > paths.
> > >
> > > This is *exactly* the original use case that project quotas were
> > > designed for back on Irix in the early 1990s and is the original
> > > behaviour project quotas brought to Linux.
> > >
> > > Project quota inheritance didn't come along until 2005:
> > >
> > > commit 65f1866a3a8e512d43795c116bfef262e703b789
> > > Author: Nathan Scott <nathans@sgi.com>
> > > Date:   Fri Jun 3 06:04:22 2005 +0000
> > >
> > >     Add support for project quota inheritance, a merge of Glens changes.
> > >     Merge of xfs-linux-melb:xfs-kern:22806a by kenmcd.
> > >
> > > And full support for directory tree quotas using project IDs wasn't
> > > fully introduced until a year later in 2006:
> > >
> > > commit 4aef4de4d04bcc36a1461c100eb940c162fd5ee6
> > > Author: Nathan Scott <nathans@sgi.com>
> > > Date:   Tue May 30 15:54:53 2006 +0000
> > >
> > >     statvfs component of directory/project quota support, code originally by Glen.
> > >     Merge of xfs-linux-melb:xfs-kern:26105a by kenmcd.
> > >
> > > These changes were largely done for an SGI NAS product that allowed
> > > us to create one great big XFS filesystem and then create
> > > arbitrarily sized, thin provisoned  "NFS volumes"  as directory
> > > quota controlled subdirs instantenously. The directory tree quota
> > > defined the size of the volume, and so we could also grow and shrink
> > > them instantenously, too. And we could remove them instantenously
> > > via background garbage collection after the export was removed and
> > > the user had been told it had been destroyed.
> > >
> > > So that was the original use case for directory tree quotas on XFS -
> > > providing scalable, fast management of "thin" storage for a NAS
> > > product. Projects quotas had been used for accounting random
> > > colections of files for over a decade before this directory quota
> > > construct was created, and the "modern" container use cases for
> > > directory quotas didn't come along until almost a decade after this
> > > capability was added.
> > >
> >
> > Cool. Didn't know all of this.
> > Lucky for us, those historic use cases are well distinguished from
> > the modern subtree use case by the opt-in PROJINHERIT bit.
> > So as long as PROJINHERIT is set, my assumptions mostly hold(?)
>
> I'm not sure what assumptions you are making, so I can't really
> make any binding statement on this except to say that the existance
> of PROJINHERIT on directories does not mean strict directory quotas
> are being used.
>
> i.e.  PROJINHERIT is just a flag to automatically tag inodes with
> aproject ID on creation. It -can- be used as a directory tree quota
> if specific restrictions in the use of the projinherit flag are
> enforced, but it can also be used for the original project ID use
> case so users don't have to manually tag files they create or move
> to a projet directory with the right project ID after the fact.
>
> IOWs, directories that have the same projid and PROJINHERIT set
> do not need to be in a single hierarchy, and hence  do not fit the
> definition of a directory tree quota. e.g:
>
> /projects/docs/project1
> /projects/docs/project2
> /projects/docs/project3
> /projects/src/project1
> /projects/src/project2
> /projects/src/project3
> /projects/build/project1
> /projects/build/project2
> /projects/build/project3
> .....
> /home/user1/project1
> /home/user2/project3
> /home/user6/project2
>
> Now we have multiple different disjoint directory heirarchies with
> the same project ID because they contain files belonging to a
> specific project. This is still a random collection of files in
> random paths that are accounted to a project ID, but it's also using
> PROJINHERIT to assign the default project ID to files created in the
> respective project directories.
>
> The kernel has no idea that a project ID is associated with a single
> directory tree. The kernel quota accounting itself simple sees
> project IDs on inodes and accounts to that project ID. The create
> and rename code simply see the parent PROJINHERIT bit and so need to
> set up the new directory entry to be accounted to that different
> project ID. It's all just mechanism in the kernel - applying project
> quotas to enforce directory tree quotas is entirely a userspace
> administration constraint....
>
> The only thing we do to help userspace administration is restrict
> changing project IDs to the init namespace, thereby preventing
> containers from being able to manipulate project IDs. This allows
> the init namespace to set the policy for project quota use and hence
> allow it to be used for directory tree quotas for container space
> management. This is essentially an extension of the original NAS
> use case, in that NFS exports were the isolation barrier that hid
> the underlying filesystem structure and project ID quota usage
> from the end users....
>
> > > > My point is that changing the project id of a non-dir child to be different
> > > > from the project id of its parent is a pretty rare use case (I think?).
> > >
> > > Not if you are using project quotas as they were originally intended
> > > to be used.
> > >
> >
> > Rephrase then:
> >
> > Changing the projid of a non-dir child to be different from the projid
> > of its parent, which has PROJINHERIT bit set, is a pretty rare use case(?)
>
> I have no data to indicate how rare it might be - the kernel has
> never enforced a policy that disallows changing project IDs when
> PROJINHERIT is set and any user with write permission can change
> project IDs even when PROJINHERIT is set. Hence we have to assume
> that there are people out there that rely on this behaviour,
> regardless of how rare it is...
>
> > > > If changing the projid of non-dir is needed for moving it to a
> > > > different subtree,
> > > > we could allow renameat2(2) of non-dir with no hardlinks to implicitly
> > > > change its
> > > > inherited project id or explicitly with a flag for a hardlink, e.g.:
> > > > renameat2(olddirfd, name, newdirfd, name, RENAME_NEW_PROJID).
> > >
> > > Why?
> > >
> > > The only reason XFS returns -EXDEV to rename across project IDs is
> > > because nobody wanted to spend the time to work out how to do the
> > > quota accounting of the metadata changed in the rename operation
> > > accurately. So for that rare case (not something that would happen
> > > on the NAS product) we returned -EXDEV to trigger the mv command to
> > > copy the file to the destination and then unlink the source instead,
> > > thereby handling all the quota accounting correctly.
> > >
> > > IOWs, this whole "-EXDEV on rename across parent project quota
> > > boundaries" is an implementation detail and nothing more.
> > > Filesystems that implement project quotas and the directory tree
> > > sub-variant don't need to behave like this if they can accurately
> > > account for the quota ID changes during an atomic rename operation.
> > > If that's too hard, then the fallback is to return -EXDEV and let
> > > userspace do it the slow way which will always acocunt the resource
> > > usage correctly to the individual projects.
> > >
> > > Hence I think we should just fix the XFS kernel behaviour to do the
> > > right thing in this special file case rather than return -EXDEV and
> > > then forget about the rest of it. Sure, update xfs_repair to fix the
> > > special file project id issue if it trips over it, but other than
> > > that I don't think we need anything more. If fixing it requires new
> > > syscalls and tools, then that's much harder to backport to old
> > > kernels and distros than just backporting a couple of small XFS
> > > kernel patches...
> > >
> >
> > I assume that by "fix the XFS behavior" you mean
> > "we could allow renameat2(2) of non-dir with no hardlinks to implicitly
> >  change its inherited project id"?
> > (in case the new parent has the PROJINHERIT bit)
> > so that the RENAME_NEW_PROJID behavior would be implicit.
>
> No, I meant "fix the original hardlink of an inode with no project ID
> within a PROJINHERIT directory always gets -EXDEV" by allowing
> hard links when the source file has no project ID specified.
>
> Actually, looking at 6.10-rc3, this has been merged already. So IMO
> there's nothing more we need to do here.
>

I don't see it.
Which commit is that?
I agree that would be enough to address the original report.

> > Unlike rename() from one parent to the other, link()+unlink()
> > is less obvious.
> >
> > The "modern" use cases that I listed where implicit change of projid
> > does not suffice are:
> >
> > 1. Share some inodes (as hardlinks) among projects
> > 2. Recursively changing a subtree projid
> >
> > They could be implemented by explicit flags to renameat2()/linkat() and
> > they could be implemented by [gs]etfsxattrat(2) syscalls.
>
> What are you expecting to happen when you hardlink an inode across
> multiple PROJINHERIT directories?  I mean, an inode can only have
> one project ID, so you can't link it into mulitple projects at once
> if you are using PROJINHERIT (xfs_link() expressly forbids it).
>
> So I'm really not sure what problem you are trying to describe or
> solve here. Can you elaborate more?

The problem that Andrey described started with a subtree
that has no projid and then projid was applied to all non-special files.
The suggested change to relax hardlink of projid 0 solves this case.

But a subtree could also be changed by user, say from projid P1
to projid P2 and in that case inherited projid P1 will remain on
the special files, causing a similar failure to link() without the
projid 0 relax rule.

Anyway, this discussion has wandered off too far from the original problem.

If the simple fix is enough for Andrey, it's fine by me.

If other people are interested in syscall or other means to change
fsxattr on special files, it's fine by me as well.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1d5abfdf0f22..3e3aacb6ea6e 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -22,6 +22,7 @@ 
 #include <linux/mount.h>
 #include <linux/fscrypt.h>
 #include <linux/fileattr.h>
+#include <linux/namei.h>
 
 #include "internal.h"
 
@@ -647,6 +648,19 @@  static int fileattr_set_prepare(struct inode *inode,
 	if (fa->fsx_cowextsize == 0)
 		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
 
+	/*
+	 * The only use case for special files is to set project ID, forbid any
+	 * other attributes
+	 */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+		if (fa->fsx_xflags & ~FS_XFLAG_PROJINHERIT)
+			return -EINVAL;
+		if (!S_ISLNK(inode->i_mode) && fa->fsx_nextents)
+			return -EINVAL;
+		if (fa->fsx_extsize || fa->fsx_cowextsize)
+			return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -763,6 +777,79 @@  static int ioctl_fssetxattr(struct file *file, void __user *argp)
 	return err;
 }
 
+static int ioctl_fsgetxattrat(struct file *file, void __user *argp)
+{
+	struct path filepath;
+	struct fsxattrat fsxat;
+	struct fileattr fa;
+	int error;
+
+	if (!S_ISDIR(file_inode(file)->i_mode))
+		return -EBADF;
+
+	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
+		return -EFAULT;
+
+	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
+	if (error)
+		return error;
+
+	error = vfs_fileattr_get(filepath.dentry, &fa);
+	if (error) {
+		path_put(&filepath);
+		return error;
+	}
+
+	fsxat.fsx.fsx_xflags = fa.fsx_xflags;
+	fsxat.fsx.fsx_extsize = fa.fsx_extsize;
+	fsxat.fsx.fsx_nextents = fa.fsx_nextents;
+	fsxat.fsx.fsx_projid = fa.fsx_projid;
+	fsxat.fsx.fsx_cowextsize = fa.fsx_cowextsize;
+
+	if (copy_to_user(argp, &fsxat, sizeof(struct fsxattrat)))
+		error = -EFAULT;
+
+	path_put(&filepath);
+	return error;
+}
+
+static int ioctl_fssetxattrat(struct file *file, void __user *argp)
+{
+	struct mnt_idmap *idmap = file_mnt_idmap(file);
+	struct fsxattrat fsxat;
+	struct path filepath;
+	struct fileattr fa;
+	int error;
+
+	if (!S_ISDIR(file_inode(file)->i_mode))
+		return -EBADF;
+
+	if (copy_from_user(&fsxat, argp, sizeof(struct fsxattrat)))
+		return -EFAULT;
+
+	error = user_path_at(fsxat.dfd, fsxat.path, 0, &filepath);
+	if (error)
+		return error;
+
+	error = mnt_want_write(filepath.mnt);
+	if (error) {
+		path_put(&filepath);
+		return error;
+	}
+
+	fileattr_fill_xflags(&fa, fsxat.fsx.fsx_xflags);
+	fa.fsx_extsize = fsxat.fsx.fsx_extsize;
+	fa.fsx_nextents = fsxat.fsx.fsx_nextents;
+	fa.fsx_projid = fsxat.fsx.fsx_projid;
+	fa.fsx_cowextsize = fsxat.fsx.fsx_cowextsize;
+	fa.fsx_valid = true;
+
+	error = vfs_fileattr_set(idmap, filepath.dentry, &fa);
+	mnt_drop_write(filepath.mnt);
+	path_put(&filepath);
+	return error;
+}
+
 static int ioctl_getfsuuid(struct file *file, void __user *argp)
 {
 	struct super_block *sb = file_inode(file)->i_sb;
@@ -872,6 +959,12 @@  static int do_vfs_ioctl(struct file *filp, unsigned int fd,
 	case FS_IOC_FSSETXATTR:
 		return ioctl_fssetxattr(filp, argp);
 
+	case FS_IOC_FSGETXATTRAT:
+		return ioctl_fsgetxattrat(filp, argp);
+
+	case FS_IOC_FSSETXATTRAT:
+		return ioctl_fssetxattrat(filp, argp);
+
 	case FS_IOC_GETFSUUID:
 		return ioctl_getfsuuid(filp, argp);
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 45e4e64fd664..f8cd8d7bf35d 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -139,6 +139,15 @@  struct fsxattr {
 	unsigned char	fsx_pad[8];
 };
 
+/*
+ * Structure passed to FS_IOC_FSGETXATTRAT/FS_IOC_FSSETXATTRAT
+ */
+struct fsxattrat {
+	struct fsxattr	fsx;		/* XATTR to get/set */
+	__u32		dfd;		/* parent dir */
+	const char	__user *path;
+};
+
 /*
  * Flags for the fsx_xflags field
  */
@@ -231,6 +240,8 @@  struct fsxattr {
 #define FS_IOC32_SETVERSION		_IOW('v', 2, int)
 #define FS_IOC_FSGETXATTR		_IOR('X', 31, struct fsxattr)
 #define FS_IOC_FSSETXATTR		_IOW('X', 32, struct fsxattr)
+#define FS_IOC_FSGETXATTRAT		_IOR('X', 33, struct fsxattrat)
+#define FS_IOC_FSSETXATTRAT		_IOW('X', 34, struct fsxattrat)
 #define FS_IOC_GETFSLABEL		_IOR(0x94, 49, char[FSLABEL_MAX])
 #define FS_IOC_SETFSLABEL		_IOW(0x94, 50, char[FSLABEL_MAX])
 /* Returns the external filesystem UUID, the same one blkid returns */