diff mbox series

[09/16] fs: add vfs_set_fscaps()

Message ID 20231129-idmap-fscap-refactor-v1-9-da5a26058a5b@kernel.org (mailing list archive)
State New, archived
Headers show
Series fs: use type-safe uid representation for filesystem capabilities | expand

Commit Message

Seth Forshee (DigitalOcean) Nov. 29, 2023, 9:50 p.m. UTC
Provide a type-safe interface for setting filesystem capabilities and a
generic implementation suitable for most filesystems.

Signed-off-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
---
 fs/xattr.c         | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 2 files changed, 89 insertions(+)

Comments

Amir Goldstein Nov. 30, 2023, 8:01 a.m. UTC | #1
On Wed, Nov 29, 2023 at 11:50 PM Seth Forshee (DigitalOcean)
<sforshee@kernel.org> wrote:
>
> Provide a type-safe interface for setting filesystem capabilities and a
> generic implementation suitable for most filesystems.
>
> Signed-off-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
> ---
>  fs/xattr.c         | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/fs.h |  2 ++
>  2 files changed, 89 insertions(+)
>
> diff --git a/fs/xattr.c b/fs/xattr.c
> index 3abaf9bef0a5..03cc824e4f87 100644
> --- a/fs/xattr.c
> +++ b/fs/xattr.c
> @@ -247,6 +247,93 @@ int vfs_get_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
>  }
>  EXPORT_SYMBOL(vfs_get_fscaps);
>
> +static int generic_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> +                             const struct vfs_caps *caps, int flags)
> +{
> +       struct inode *inode = d_inode(dentry);
> +       struct vfs_ns_cap_data nscaps;
> +       int size;
> +
> +       size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps,
> +                                &nscaps, sizeof(nscaps));
> +       if (size < 0)
> +               return size;
> +
> +       return __vfs_setxattr_noperm(idmap, dentry, XATTR_NAME_CAPS,
> +                                    &nscaps, size, flags);
> +}
> +
> +/**
> + * vfs_set_fscaps - set filesystem capabilities
> + * @idmap: idmap of the mount the inode was found from
> + * @dentry: the dentry on which to set filesystem capabilities
> + * @caps: the filesystem capabilities to be written
> + * @flags: setxattr flags to use when writing the capabilities xattr
> + *
> + * This function writes the supplied filesystem capabilities to the dentry.
> + *
> + * Return: 0 on success, a negative errno on error.
> + */
> +int vfs_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> +                  const struct vfs_caps *caps, int flags)
> +{
> +       struct inode *inode = d_inode(dentry);
> +       struct inode *delegated_inode = NULL;
> +       struct vfs_ns_cap_data nscaps;
> +       int size, error;
> +
> +       /*
> +        * Unfortunately EVM wants to have the raw xattr value to compare to
> +        * the on-disk version, so we need to pass the raw xattr to the
> +        * security hooks. But we also want to do security checks before
> +        * breaking leases, so that means a conversion to the raw xattr here
> +        * which will usually be reduntant with the conversion we do for
> +        * writing the xattr to disk.
> +        */
> +       size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps, &nscaps,
> +                                sizeof(nscaps));
> +       if (size < 0)
> +               return size;
> +
> +retry_deleg:
> +       inode_lock(inode);
> +
> +       error = xattr_permission(idmap, inode, XATTR_NAME_CAPS, MAY_WRITE);
> +       if (error)
> +               goto out_inode_unlock;
> +       error = security_inode_setxattr(idmap, dentry, XATTR_NAME_CAPS, &nscaps,
> +                                       size, flags);
> +       if (error)
> +               goto out_inode_unlock;
> +
> +       error = try_break_deleg(inode, &delegated_inode);
> +       if (error)
> +               goto out_inode_unlock;
> +
> +       if (inode->i_opflags & IOP_XATTR) {
> +               if (inode->i_op->set_fscaps)
> +                       error = inode->i_op->set_fscaps(idmap, dentry, caps, flags);
> +               else
> +                       error = generic_set_fscaps(idmap, dentry, caps, flags);

I think the non-generic case is missing fsnotify_xattr().

See vfs_set_acl() for comparison.

Thanks,
Amir.
Seth Forshee (DigitalOcean) Nov. 30, 2023, 3:38 p.m. UTC | #2
On Thu, Nov 30, 2023 at 10:01:55AM +0200, Amir Goldstein wrote:
> On Wed, Nov 29, 2023 at 11:50 PM Seth Forshee (DigitalOcean)
> <sforshee@kernel.org> wrote:
> >
> > Provide a type-safe interface for setting filesystem capabilities and a
> > generic implementation suitable for most filesystems.
> >
> > Signed-off-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
> > ---
> >  fs/xattr.c         | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/linux/fs.h |  2 ++
> >  2 files changed, 89 insertions(+)
> >
> > diff --git a/fs/xattr.c b/fs/xattr.c
> > index 3abaf9bef0a5..03cc824e4f87 100644
> > --- a/fs/xattr.c
> > +++ b/fs/xattr.c
> > @@ -247,6 +247,93 @@ int vfs_get_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> >  }
> >  EXPORT_SYMBOL(vfs_get_fscaps);
> >
> > +static int generic_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> > +                             const struct vfs_caps *caps, int flags)
> > +{
> > +       struct inode *inode = d_inode(dentry);
> > +       struct vfs_ns_cap_data nscaps;
> > +       int size;
> > +
> > +       size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps,
> > +                                &nscaps, sizeof(nscaps));
> > +       if (size < 0)
> > +               return size;
> > +
> > +       return __vfs_setxattr_noperm(idmap, dentry, XATTR_NAME_CAPS,
> > +                                    &nscaps, size, flags);
> > +}
> > +
> > +/**
> > + * vfs_set_fscaps - set filesystem capabilities
> > + * @idmap: idmap of the mount the inode was found from
> > + * @dentry: the dentry on which to set filesystem capabilities
> > + * @caps: the filesystem capabilities to be written
> > + * @flags: setxattr flags to use when writing the capabilities xattr
> > + *
> > + * This function writes the supplied filesystem capabilities to the dentry.
> > + *
> > + * Return: 0 on success, a negative errno on error.
> > + */
> > +int vfs_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> > +                  const struct vfs_caps *caps, int flags)
> > +{
> > +       struct inode *inode = d_inode(dentry);
> > +       struct inode *delegated_inode = NULL;
> > +       struct vfs_ns_cap_data nscaps;
> > +       int size, error;
> > +
> > +       /*
> > +        * Unfortunately EVM wants to have the raw xattr value to compare to
> > +        * the on-disk version, so we need to pass the raw xattr to the
> > +        * security hooks. But we also want to do security checks before
> > +        * breaking leases, so that means a conversion to the raw xattr here
> > +        * which will usually be reduntant with the conversion we do for
> > +        * writing the xattr to disk.
> > +        */
> > +       size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps, &nscaps,
> > +                                sizeof(nscaps));
> > +       if (size < 0)
> > +               return size;
> > +
> > +retry_deleg:
> > +       inode_lock(inode);
> > +
> > +       error = xattr_permission(idmap, inode, XATTR_NAME_CAPS, MAY_WRITE);
> > +       if (error)
> > +               goto out_inode_unlock;
> > +       error = security_inode_setxattr(idmap, dentry, XATTR_NAME_CAPS, &nscaps,
> > +                                       size, flags);
> > +       if (error)
> > +               goto out_inode_unlock;
> > +
> > +       error = try_break_deleg(inode, &delegated_inode);
> > +       if (error)
> > +               goto out_inode_unlock;
> > +
> > +       if (inode->i_opflags & IOP_XATTR) {
> > +               if (inode->i_op->set_fscaps)
> > +                       error = inode->i_op->set_fscaps(idmap, dentry, caps, flags);
> > +               else
> > +                       error = generic_set_fscaps(idmap, dentry, caps, flags);
> 
> I think the non-generic case is missing fsnotify_xattr().
> 
> See vfs_set_acl() for comparison.

Good catch. I'm going to have another look at some of this in light of
some of your other feedback, but I'll get it fixed one way or another in
v2.
Christian Brauner Dec. 1, 2023, 5:39 p.m. UTC | #3
On Wed, Nov 29, 2023 at 03:50:27PM -0600, Seth Forshee (DigitalOcean) wrote:
> Provide a type-safe interface for setting filesystem capabilities and a
> generic implementation suitable for most filesystems.
> 
> Signed-off-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
> ---
>  fs/xattr.c         | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/fs.h |  2 ++
>  2 files changed, 89 insertions(+)
> 
> diff --git a/fs/xattr.c b/fs/xattr.c
> index 3abaf9bef0a5..03cc824e4f87 100644
> --- a/fs/xattr.c
> +++ b/fs/xattr.c
> @@ -247,6 +247,93 @@ int vfs_get_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
>  }
>  EXPORT_SYMBOL(vfs_get_fscaps);
>  
> +static int generic_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> +			      const struct vfs_caps *caps, int flags)
> +{
> +	struct inode *inode = d_inode(dentry);
> +	struct vfs_ns_cap_data nscaps;
> +	int size;
> +
> +	size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps,
> +				 &nscaps, sizeof(nscaps));
> +	if (size < 0)
> +		return size;

I see, here the size is relevant. Ok, but then please make the
lower-level helper use ssize_t.

> +
> +	return __vfs_setxattr_noperm(idmap, dentry, XATTR_NAME_CAPS,
> +				     &nscaps, size, flags);
> +}
> +
> +/**
> + * vfs_set_fscaps - set filesystem capabilities
> + * @idmap: idmap of the mount the inode was found from
> + * @dentry: the dentry on which to set filesystem capabilities
> + * @caps: the filesystem capabilities to be written
> + * @flags: setxattr flags to use when writing the capabilities xattr
> + *
> + * This function writes the supplied filesystem capabilities to the dentry.
> + *
> + * Return: 0 on success, a negative errno on error.
> + */
> +int vfs_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> +		   const struct vfs_caps *caps, int flags)
> +{
> +	struct inode *inode = d_inode(dentry);
> +	struct inode *delegated_inode = NULL;
> +	struct vfs_ns_cap_data nscaps;
> +	int size, error;
> +
> +	/*
> +	 * Unfortunately EVM wants to have the raw xattr value to compare to
> +	 * the on-disk version, so we need to pass the raw xattr to the
> +	 * security hooks. But we also want to do security checks before
> +	 * breaking leases, so that means a conversion to the raw xattr here
> +	 * which will usually be reduntant with the conversion we do for
> +	 * writing the xattr to disk.
> +	 */
> +	size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps, &nscaps,
> +				 sizeof(nscaps));
> +	if (size < 0)
> +		return size;

Oh right, I remember that. Slight eyeroll. See below though...

> +
> +retry_deleg:
> +	inode_lock(inode);
> +
> +	error = xattr_permission(idmap, inode, XATTR_NAME_CAPS, MAY_WRITE);
> +	if (error)
> +		goto out_inode_unlock;
> +	error = security_inode_setxattr(idmap, dentry, XATTR_NAME_CAPS, &nscaps,
> +					size, flags);
> +	if (error)
> +		goto out_inode_unlock;

For posix acls I added dedicated security hooks that take the struct
posix_acl stuff and then plumb that down into the security modules. You
could do the same thing here and then just force EVM and others to do
their own conversion from in-kernel to xattr format, instead of forcing
the VFS to do this.

Because right now we make everyone pay the price all the time when
really EVM should pay that price and this whole unpleasantness.

> +
> +	error = try_break_deleg(inode, &delegated_inode);
> +	if (error)
> +		goto out_inode_unlock;
> +
> +	if (inode->i_opflags & IOP_XATTR) {

So I'm trying to remember the details how I did this for POSIX ACLs in
commit e499214ce3ef ("acl: don't depend on IOP_XATTR"). I think what you
did here is correct because you need to have an xattr handler for
fscaps currently. IOW, it isn't purely based on inode operations.

And here starts the hate mail in so far as you'll hate me for asking
this:

I think I asked this before when we talked about this but how feasible
would it be to move fscaps completely off of xattr handlers and purely
on inode operations for all filesystems?

Yes, that's a fairly large patchset but it would also be a pretty good
win because we avoid munging this from inode operations through xattr
handlers again which seems a bit ugly and what we really wanted to
avoid desperately with POSIX ACLs.

If this is feasible and you'd be up for it I wouldn't even mind doing
that in two steps. IOW, merge something like this first and them move
everyone off of their individual xattr handlers.

Could you quickly remind me whether there would be any issues with this?
Seth Forshee (DigitalOcean) Dec. 1, 2023, 6:18 p.m. UTC | #4
On Fri, Dec 01, 2023 at 06:39:18PM +0100, Christian Brauner wrote:
> > +/**
> > + * vfs_set_fscaps - set filesystem capabilities
> > + * @idmap: idmap of the mount the inode was found from
> > + * @dentry: the dentry on which to set filesystem capabilities
> > + * @caps: the filesystem capabilities to be written
> > + * @flags: setxattr flags to use when writing the capabilities xattr
> > + *
> > + * This function writes the supplied filesystem capabilities to the dentry.
> > + *
> > + * Return: 0 on success, a negative errno on error.
> > + */
> > +int vfs_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> > +		   const struct vfs_caps *caps, int flags)
> > +{
> > +	struct inode *inode = d_inode(dentry);
> > +	struct inode *delegated_inode = NULL;
> > +	struct vfs_ns_cap_data nscaps;
> > +	int size, error;
> > +
> > +	/*
> > +	 * Unfortunately EVM wants to have the raw xattr value to compare to
> > +	 * the on-disk version, so we need to pass the raw xattr to the
> > +	 * security hooks. But we also want to do security checks before
> > +	 * breaking leases, so that means a conversion to the raw xattr here
> > +	 * which will usually be reduntant with the conversion we do for
> > +	 * writing the xattr to disk.
> > +	 */
> > +	size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps, &nscaps,
> > +				 sizeof(nscaps));
> > +	if (size < 0)
> > +		return size;
> 
> Oh right, I remember that. Slight eyeroll. See below though...
> 
> > +
> > +retry_deleg:
> > +	inode_lock(inode);
> > +
> > +	error = xattr_permission(idmap, inode, XATTR_NAME_CAPS, MAY_WRITE);
> > +	if (error)
> > +		goto out_inode_unlock;
> > +	error = security_inode_setxattr(idmap, dentry, XATTR_NAME_CAPS, &nscaps,
> > +					size, flags);
> > +	if (error)
> > +		goto out_inode_unlock;
> 
> For posix acls I added dedicated security hooks that take the struct
> posix_acl stuff and then plumb that down into the security modules. You
> could do the same thing here and then just force EVM and others to do
> their own conversion from in-kernel to xattr format, instead of forcing
> the VFS to do this.
> 
> Because right now we make everyone pay the price all the time when
> really EVM should pay that price and this whole unpleasantness.

Good point, I'll do that.

> 
> > +
> > +	error = try_break_deleg(inode, &delegated_inode);
> > +	if (error)
> > +		goto out_inode_unlock;
> > +
> > +	if (inode->i_opflags & IOP_XATTR) {
> 
> So I'm trying to remember the details how I did this for POSIX ACLs in
> commit e499214ce3ef ("acl: don't depend on IOP_XATTR"). I think what you
> did here is correct because you need to have an xattr handler for
> fscaps currently. IOW, it isn't purely based on inode operations.
> 
> And here starts the hate mail in so far as you'll hate me for asking
> this:
> 
> I think I asked this before when we talked about this but how feasible
> would it be to move fscaps completely off of xattr handlers and purely
> on inode operations for all filesystems?
> 
> Yes, that's a fairly large patchset but it would also be a pretty good
> win because we avoid munging this from inode operations through xattr
> handlers again which seems a bit ugly and what we really wanted to
> avoid desperately with POSIX ACLs.
> 
> If this is feasible and you'd be up for it I wouldn't even mind doing
> that in two steps. IOW, merge something like this first and them move
> everyone off of their individual xattr handlers.
> 
> Could you quickly remind me whether there would be any issues with this?

It's certainly possible to do this. There wouldn't be any issues per se,
but there are some tradoffs to consider.

First, it's really only overlayfs that needs special handling. It seems
pretty unfortunate to make every filesystem provide its own
implementations which are virtually identical, which is what we'd need
to do if we want to completely avoid the xattr handlers. But we could
still provide a generic implementation that uses only
__vfs_{get,set}xattr(), and most filesystems could use those in their
inode ops. How does that sound?

The other drawback I see is needing to duplicate logic from the
{get,set}xattr codepaths into the fscaps codepaths and maintain them in
parallel. I was trying to avoid that as much as possible, but in the end
I had to duplicate some of the logic anyway. And as Amir pointed out I
did miss some things I needed to duplicate from the setxattr logic, so I
already need to revisit that code and probably pull in more of the
setxattr logic, so there may not be as much benefit here as I'd
originally hoped.
Seth Forshee (DigitalOcean) Dec. 7, 2023, 2:42 p.m. UTC | #5
[Adding Mimi for insights on EVM questions]

On Fri, Dec 01, 2023 at 12:18:00PM -0600, Seth Forshee (DigitalOcean) wrote:
> On Fri, Dec 01, 2023 at 06:39:18PM +0100, Christian Brauner wrote:
> > > +/**
> > > + * vfs_set_fscaps - set filesystem capabilities
> > > + * @idmap: idmap of the mount the inode was found from
> > > + * @dentry: the dentry on which to set filesystem capabilities
> > > + * @caps: the filesystem capabilities to be written
> > > + * @flags: setxattr flags to use when writing the capabilities xattr
> > > + *
> > > + * This function writes the supplied filesystem capabilities to the dentry.
> > > + *
> > > + * Return: 0 on success, a negative errno on error.
> > > + */
> > > +int vfs_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> > > +		   const struct vfs_caps *caps, int flags)
> > > +{
> > > +	struct inode *inode = d_inode(dentry);
> > > +	struct inode *delegated_inode = NULL;
> > > +	struct vfs_ns_cap_data nscaps;
> > > +	int size, error;
> > > +
> > > +	/*
> > > +	 * Unfortunately EVM wants to have the raw xattr value to compare to
> > > +	 * the on-disk version, so we need to pass the raw xattr to the
> > > +	 * security hooks. But we also want to do security checks before
> > > +	 * breaking leases, so that means a conversion to the raw xattr here
> > > +	 * which will usually be reduntant with the conversion we do for
> > > +	 * writing the xattr to disk.
> > > +	 */
> > > +	size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps, &nscaps,
> > > +				 sizeof(nscaps));
> > > +	if (size < 0)
> > > +		return size;
> > 
> > Oh right, I remember that. Slight eyeroll. See below though...
> > 
> > > +
> > > +retry_deleg:
> > > +	inode_lock(inode);
> > > +
> > > +	error = xattr_permission(idmap, inode, XATTR_NAME_CAPS, MAY_WRITE);
> > > +	if (error)
> > > +		goto out_inode_unlock;
> > > +	error = security_inode_setxattr(idmap, dentry, XATTR_NAME_CAPS, &nscaps,
> > > +					size, flags);
> > > +	if (error)
> > > +		goto out_inode_unlock;
> > 
> > For posix acls I added dedicated security hooks that take the struct
> > posix_acl stuff and then plumb that down into the security modules. You
> > could do the same thing here and then just force EVM and others to do
> > their own conversion from in-kernel to xattr format, instead of forcing
> > the VFS to do this.
> > 
> > Because right now we make everyone pay the price all the time when
> > really EVM should pay that price and this whole unpleasantness.
> 
> Good point, I'll do that.

I've been reconsidering various approaches here. One thing I noticed is
that for the non-generic case (iow overlayfs) I missed calling
security_inode_post_setxattr(), where EVM also wants the raw xattr, so
that would require another conversion. That got me wondering whether the
setxattr security hooks really matter when writing fscaps to overlayfs.
And it seems like they might not: the LSMs only look for their own
xattrs, and IMA doesn't do anything with fscaps xattrs. EVM does, but
what it does for a xattr write to an overlayfs indoe seems at least
partially if not completely redundant with what it will do when the
xattr is written to the upper filesystem.

So could we push these security calls down to the generic fscaps
implementations just before/after writing the raw xattr data and just
skip them for overlayfs? If so we can get away with doing the vfs_caps
to xattr conversion only once.

The trade offs are that filesystems which implement fscaps inode
operations become responsible for calling the security hooks if needed,
and if something changes such that we need to call those security hooks
for fscaps on overlayfs this solution would no longer work.
Amir Goldstein Dec. 10, 2023, 4:41 p.m. UTC | #6
On Thu, Dec 7, 2023 at 4:43 PM Seth Forshee (DigitalOcean)
<sforshee@kernel.org> wrote:
>
> [Adding Mimi for insights on EVM questions]
>
> On Fri, Dec 01, 2023 at 12:18:00PM -0600, Seth Forshee (DigitalOcean) wrote:
> > On Fri, Dec 01, 2023 at 06:39:18PM +0100, Christian Brauner wrote:
> > > > +/**
> > > > + * vfs_set_fscaps - set filesystem capabilities
> > > > + * @idmap: idmap of the mount the inode was found from
> > > > + * @dentry: the dentry on which to set filesystem capabilities
> > > > + * @caps: the filesystem capabilities to be written
> > > > + * @flags: setxattr flags to use when writing the capabilities xattr
> > > > + *
> > > > + * This function writes the supplied filesystem capabilities to the dentry.
> > > > + *
> > > > + * Return: 0 on success, a negative errno on error.
> > > > + */
> > > > +int vfs_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
> > > > +            const struct vfs_caps *caps, int flags)
> > > > +{
> > > > + struct inode *inode = d_inode(dentry);
> > > > + struct inode *delegated_inode = NULL;
> > > > + struct vfs_ns_cap_data nscaps;
> > > > + int size, error;
> > > > +
> > > > + /*
> > > > +  * Unfortunately EVM wants to have the raw xattr value to compare to
> > > > +  * the on-disk version, so we need to pass the raw xattr to the
> > > > +  * security hooks. But we also want to do security checks before
> > > > +  * breaking leases, so that means a conversion to the raw xattr here
> > > > +  * which will usually be reduntant with the conversion we do for
> > > > +  * writing the xattr to disk.
> > > > +  */
> > > > + size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps, &nscaps,
> > > > +                          sizeof(nscaps));
> > > > + if (size < 0)
> > > > +         return size;
> > >
> > > Oh right, I remember that. Slight eyeroll. See below though...
> > >
> > > > +
> > > > +retry_deleg:
> > > > + inode_lock(inode);
> > > > +
> > > > + error = xattr_permission(idmap, inode, XATTR_NAME_CAPS, MAY_WRITE);
> > > > + if (error)
> > > > +         goto out_inode_unlock;
> > > > + error = security_inode_setxattr(idmap, dentry, XATTR_NAME_CAPS, &nscaps,
> > > > +                                 size, flags);
> > > > + if (error)
> > > > +         goto out_inode_unlock;
> > >
> > > For posix acls I added dedicated security hooks that take the struct
> > > posix_acl stuff and then plumb that down into the security modules. You
> > > could do the same thing here and then just force EVM and others to do
> > > their own conversion from in-kernel to xattr format, instead of forcing
> > > the VFS to do this.
> > >
> > > Because right now we make everyone pay the price all the time when
> > > really EVM should pay that price and this whole unpleasantness.
> >
> > Good point, I'll do that.
>
> I've been reconsidering various approaches here. One thing I noticed is
> that for the non-generic case (iow overlayfs) I missed calling
> security_inode_post_setxattr(), where EVM also wants the raw xattr, so
> that would require another conversion. That got me wondering whether the
> setxattr security hooks really matter when writing fscaps to overlayfs.
> And it seems like they might not: the LSMs only look for their own
> xattrs, and IMA doesn't do anything with fscaps xattrs. EVM does, but
> what it does for a xattr write to an overlayfs indoe seems at least
> partially if not completely redundant with what it will do when the
> xattr is written to the upper filesystem.
>
> So could we push these security calls down to the generic fscaps
> implementations just before/after writing the raw xattr data and just
> skip them for overlayfs? If so we can get away with doing the vfs_caps
> to xattr conversion only once.
>
> The trade offs are that filesystems which implement fscaps inode
> operations become responsible for calling the security hooks if needed,
> and if something changes such that we need to call those security hooks
> for fscaps on overlayfs this solution would no longer work.

Hi Seth,

I was trying to understand the alternative proposals, but TBH,
I cannot wrap my head about overlayfs+IMA/EVM and I do not
fully understand the use case.

Specifically, I do not understand why the IMA/EVM attestation on
the upper and lower fs isn't enough to make overlayfs tamper proof.
I never got an explanation of the threat model for overlayfs+IMA/EVM.

I know that for SELinux and overlayfs a lot of work was done by Vivek.
I was not involved in this work, but AKAIF, it did not involve any conversion
of selinux xattrs.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/xattr.c b/fs/xattr.c
index 3abaf9bef0a5..03cc824e4f87 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -247,6 +247,93 @@  int vfs_get_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
 }
 EXPORT_SYMBOL(vfs_get_fscaps);
 
+static int generic_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
+			      const struct vfs_caps *caps, int flags)
+{
+	struct inode *inode = d_inode(dentry);
+	struct vfs_ns_cap_data nscaps;
+	int size;
+
+	size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps,
+				 &nscaps, sizeof(nscaps));
+	if (size < 0)
+		return size;
+
+	return __vfs_setxattr_noperm(idmap, dentry, XATTR_NAME_CAPS,
+				     &nscaps, size, flags);
+}
+
+/**
+ * vfs_set_fscaps - set filesystem capabilities
+ * @idmap: idmap of the mount the inode was found from
+ * @dentry: the dentry on which to set filesystem capabilities
+ * @caps: the filesystem capabilities to be written
+ * @flags: setxattr flags to use when writing the capabilities xattr
+ *
+ * This function writes the supplied filesystem capabilities to the dentry.
+ *
+ * Return: 0 on success, a negative errno on error.
+ */
+int vfs_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
+		   const struct vfs_caps *caps, int flags)
+{
+	struct inode *inode = d_inode(dentry);
+	struct inode *delegated_inode = NULL;
+	struct vfs_ns_cap_data nscaps;
+	int size, error;
+
+	/*
+	 * Unfortunately EVM wants to have the raw xattr value to compare to
+	 * the on-disk version, so we need to pass the raw xattr to the
+	 * security hooks. But we also want to do security checks before
+	 * breaking leases, so that means a conversion to the raw xattr here
+	 * which will usually be reduntant with the conversion we do for
+	 * writing the xattr to disk.
+	 */
+	size = vfs_caps_to_xattr(idmap, i_user_ns(inode), caps, &nscaps,
+				 sizeof(nscaps));
+	if (size < 0)
+		return size;
+
+retry_deleg:
+	inode_lock(inode);
+
+	error = xattr_permission(idmap, inode, XATTR_NAME_CAPS, MAY_WRITE);
+	if (error)
+		goto out_inode_unlock;
+	error = security_inode_setxattr(idmap, dentry, XATTR_NAME_CAPS, &nscaps,
+					size, flags);
+	if (error)
+		goto out_inode_unlock;
+
+	error = try_break_deleg(inode, &delegated_inode);
+	if (error)
+		goto out_inode_unlock;
+
+	if (inode->i_opflags & IOP_XATTR) {
+		if (inode->i_op->set_fscaps)
+			error = inode->i_op->set_fscaps(idmap, dentry, caps, flags);
+		else
+			error = generic_set_fscaps(idmap, dentry, caps, flags);
+	} else if (unlikely(is_bad_inode(inode))) {
+		error = -EIO;
+	} else {
+		error = -EOPNOTSUPP;
+	}
+
+out_inode_unlock:
+	inode_unlock(inode);
+
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+
+	return error;
+}
+EXPORT_SYMBOL(vfs_set_fscaps);
+
 int
 __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	       struct inode *inode, const char *name, const void *value,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e25b39e4017a..80992e210b83 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2059,6 +2059,8 @@  extern int __vfs_get_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
 			    struct vfs_caps *caps);
 extern int vfs_get_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
 			  struct vfs_caps *caps);
+extern int vfs_set_fscaps(struct mnt_idmap *idmap, struct dentry *dentry,
+			  const struct vfs_caps *caps, int flags);
 
 enum freeze_holder {
 	FREEZE_HOLDER_KERNEL	= (1U << 0),