diff mbox series

[v2,06/25] capability: provide helpers for converting between xattrs and vfs_caps

Message ID 20240221-idmap-fscap-refactor-v2-6-3039364623bd@kernel.org (mailing list archive)
State Changes Requested
Delegated to: Paul Moore
Headers show
Series fs: use type-safe uid representation for filesystem capabilities | expand

Commit Message

Seth Forshee (DigitalOcean) Feb. 21, 2024, 9:24 p.m. UTC
To pass around vfs_caps instead of raw xattr data we will need to
convert between the two representations near userspace and disk
boundaries. We already convert xattrs from disks to vfs_caps, so move
that code into a helper, and change get_vfs_caps_from_disk() to use the
helper.

When converting vfs_caps to xattrs we have different considerations
depending on the destination of the xattr data. For xattrs which will be
written to disk we need to reject the xattr if the rootid does not map
into the filesystem's user namespace, whereas xattrs read by userspace
may need to undergo a conversion from v3 to v2 format when the rootid
does not map. So this helper is split into an internal and an external
interface. The internal interface does not return an error if the rootid
has no mapping in the target user namespace and will be used for
conversions targeting userspace. The external interface returns
EOVERFLOW if the rootid has no mapping and will be used for all other
conversions.

Signed-off-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
---
 include/linux/capability.h |  10 ++
 security/commoncap.c       | 228 +++++++++++++++++++++++++++++++++++----------
 2 files changed, 187 insertions(+), 51 deletions(-)

Comments

Christian Brauner Feb. 22, 2024, 3:20 p.m. UTC | #1
On Wed, Feb 21, 2024 at 03:24:37PM -0600, Seth Forshee (DigitalOcean) wrote:
> To pass around vfs_caps instead of raw xattr data we will need to
> convert between the two representations near userspace and disk
> boundaries. We already convert xattrs from disks to vfs_caps, so move
> that code into a helper, and change get_vfs_caps_from_disk() to use the
> helper.
> 
> When converting vfs_caps to xattrs we have different considerations
> depending on the destination of the xattr data. For xattrs which will be
> written to disk we need to reject the xattr if the rootid does not map
> into the filesystem's user namespace, whereas xattrs read by userspace
> may need to undergo a conversion from v3 to v2 format when the rootid
> does not map. So this helper is split into an internal and an external
> interface. The internal interface does not return an error if the rootid
> has no mapping in the target user namespace and will be used for
> conversions targeting userspace. The external interface returns
> EOVERFLOW if the rootid has no mapping and will be used for all other
> conversions.
> 
> Signed-off-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
> ---
>  include/linux/capability.h |  10 ++
>  security/commoncap.c       | 228 +++++++++++++++++++++++++++++++++++----------
>  2 files changed, 187 insertions(+), 51 deletions(-)
> 
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index eb46d346bbbc..a0893ac4664b 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -209,6 +209,16 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
>  		ns_capable(ns, CAP_SYS_ADMIN);
>  }
>  
> +/* helpers to convert between xattr and in-kernel representations */
> +int vfs_caps_from_xattr(struct mnt_idmap *idmap,
> +			struct user_namespace *src_userns,
> +			struct vfs_caps *vfs_caps,
> +			const void *data, size_t size);
> +ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap,
> +			  struct user_namespace *dest_userns,
> +			  const struct vfs_caps *vfs_caps,
> +			  void *data, size_t size);
> +
>  /* audit system wants to get cap info from files as well */
>  int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
>  			   const struct dentry *dentry,
> diff --git a/security/commoncap.c b/security/commoncap.c
> index a0b5c9740759..7531c9634997 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -619,54 +619,41 @@ static inline int bprm_caps_from_vfs_caps(struct vfs_caps *caps,
>  }
>  
>  /**
> - * get_vfs_caps_from_disk - retrieve vfs caps from disk
> + * vfs_caps_from_xattr - convert raw caps xattr data to vfs_caps
>   *
> - * @idmap:	idmap of the mount the inode was found from
> - * @dentry:	dentry from which @inode is retrieved
> - * @cpu_caps:	vfs capabilities
> + * @idmap:      idmap of the mount the inode was found from
> + * @src_userns: user namespace for ids in xattr data
> + * @vfs_caps:   destination buffer for vfs_caps data
> + * @data:       rax xattr caps data
> + * @size:       size of xattr data
>   *
> - * Extract the on-exec-apply capability sets for an executable file.
> + * Converts a raw security.capability xattr into the kernel-internal
> + * capabilities format.
>   *
> - * If the inode has been found through an idmapped mount the idmap of
> - * the vfsmount must be passed through @idmap. This function will then
> - * take care to map the inode according to @idmap before checking
> - * permissions. On non-idmapped mounts or if permission checking is to be
> - * performed on the raw inode simply pass @nop_mnt_idmap.
> + * If the xattr is being read or written through an idmapped mount the
> + * idmap of the vfsmount must be passed through @idmap. This function
> + * will then take care to map the rootid according to @idmap.
> + *
> + * Return: On success, return 0; on error, return < 0.
>   */
> -int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
> -			   const struct dentry *dentry,
> -			   struct vfs_caps *cpu_caps)
> +int vfs_caps_from_xattr(struct mnt_idmap *idmap,
> +			struct user_namespace *src_userns,
> +			struct vfs_caps *vfs_caps,
> +			const void *data, size_t size)
>  {
> -	struct inode *inode = d_backing_inode(dentry);
>  	__u32 magic_etc;
> -	int size;
> -	struct vfs_ns_cap_data data, *nscaps = &data;
> -	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
> +	const struct vfs_ns_cap_data *ns_caps = data;
> +	struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
>  	kuid_t rootkuid;
> -	vfsuid_t rootvfsuid;
> -	struct user_namespace *fs_ns;
> -
> -	memset(cpu_caps, 0, sizeof(struct vfs_caps));
> -
> -	if (!inode)
> -		return -ENODATA;
>  
> -	fs_ns = inode->i_sb->s_user_ns;
> -	size = __vfs_getxattr((struct dentry *)dentry, inode,
> -			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
> -	if (size == -ENODATA || size == -EOPNOTSUPP)
> -		/* no data, that's ok */
> -		return -ENODATA;
> -
> -	if (size < 0)
> -		return size;
> +	memset(vfs_caps, 0, sizeof(*vfs_caps));
>  
>  	if (size < sizeof(magic_etc))
>  		return -EINVAL;
>  
> -	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
> +	vfs_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
>  
> -	rootkuid = make_kuid(fs_ns, 0);
> +	rootkuid = make_kuid(src_userns, 0);
>  	switch (magic_etc & VFS_CAP_REVISION_MASK) {
>  	case VFS_CAP_REVISION_1:
>  		if (size != XATTR_CAPS_SZ_1)
> @@ -679,39 +666,178 @@ int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
>  	case VFS_CAP_REVISION_3:
>  		if (size != XATTR_CAPS_SZ_3)
>  			return -EINVAL;
> -		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
> +		rootkuid = make_kuid(src_userns, le32_to_cpu(ns_caps->rootid));
>  		break;
>  
>  	default:
>  		return -EINVAL;
>  	}
>  
> -	rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
> -	if (!vfsuid_valid(rootvfsuid))
> -		return -ENODATA;
> +	vfs_caps->rootid = make_vfsuid(idmap, src_userns, rootkuid);
> +	if (!vfsuid_valid(vfs_caps->rootid))
> +		return -EOVERFLOW;
>  
> -	/* Limit the caps to the mounter of the filesystem
> -	 * or the more limited uid specified in the xattr.
> +	vfs_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
> +	vfs_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
> +
> +	/*
> +	 * Rev1 had just a single 32-bit word, later expanded
> +	 * to a second one for the high bits
>  	 */
> -	if (!rootid_owns_currentns(rootvfsuid))
> -		return -ENODATA;
> +	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> +		vfs_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
> +		vfs_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;

That + makes this even more difficult to read. This should be rewritten.

> +	}
> +
> +	vfs_caps->permitted.val &= CAP_VALID_MASK;
> +	vfs_caps->inheritable.val &= CAP_VALID_MASK;
> +
> +	return 0;
> +}
> +
> +/*
> + * Inner implementation of vfs_caps_to_xattr() which does not return an
> + * error if the rootid does not map into @dest_userns.
> + */
> +static ssize_t __vfs_caps_to_xattr(struct mnt_idmap *idmap,
> +				   struct user_namespace *dest_userns,
> +				   const struct vfs_caps *vfs_caps,
> +				   void *data, size_t size)
> +{
> +	struct vfs_ns_cap_data *ns_caps = data;
> +	struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
> +	kuid_t rootkuid;
> +	uid_t rootid;
> +
> +	memset(ns_caps, 0, size);
> +
> +	rootid = 0;
> +	switch (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) {
> +	case VFS_CAP_REVISION_1:
> +		if (size < XATTR_CAPS_SZ_1)
> +			return -EINVAL;
> +		size = XATTR_CAPS_SZ_1;
> +		break;
> +	case VFS_CAP_REVISION_2:
> +		if (size < XATTR_CAPS_SZ_2)
> +			return -EINVAL;
> +		size = XATTR_CAPS_SZ_2;
> +		break;
> +	case VFS_CAP_REVISION_3:
> +		if (size < XATTR_CAPS_SZ_3)
> +			return -EINVAL;
> +		size = XATTR_CAPS_SZ_3;
> +		rootkuid = from_vfsuid(idmap, dest_userns, vfs_caps->rootid);
> +		rootid = from_kuid(dest_userns, rootkuid);
> +		ns_caps->rootid = cpu_to_le32(rootid);
> +		break;
>  
> -	cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
> -	cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	caps->magic_etc = cpu_to_le32(vfs_caps->magic_etc);
> +
> +	caps->data[0].permitted = cpu_to_le32(lower_32_bits(vfs_caps->permitted.val));
> +	caps->data[0].inheritable = cpu_to_le32(lower_32_bits(vfs_caps->inheritable.val));
>  
>  	/*
>  	 * Rev1 had just a single 32-bit word, later expanded
>  	 * to a second one for the high bits
>  	 */
> -	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> -		cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
> -		cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
> +	if ((vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> +		caps->data[1].permitted =
> +			cpu_to_le32(upper_32_bits(vfs_caps->permitted.val));
> +		caps->data[1].inheritable =
> +			cpu_to_le32(upper_32_bits(vfs_caps->inheritable.val));
>  	}
>  
> -	cpu_caps->permitted.val &= CAP_VALID_MASK;
> -	cpu_caps->inheritable.val &= CAP_VALID_MASK;
> +	return size;
> +}
> +
> +
> +/**
> + * vfs_caps_to_xattr - convert vfs_caps to raw caps xattr data
> + *
> + * @idmap:       idmap of the mount the inode was found from
> + * @dest_userns: user namespace for ids in xattr data
> + * @vfs_caps:    source vfs_caps data
> + * @data:        destination buffer for rax xattr caps data
> + * @size:        size of the @data buffer
> + *
> + * Converts a kernel-internal capability into the raw security.capability
> + * xattr format.
> + *
> + * If the xattr is being read or written through an idmapped mount the
> + * idmap of the vfsmount must be passed through @idmap. This function
> + * will then take care to map the rootid according to @idmap.
> + *
> + * Return: On success, return the size of the xattr data. On error,
> + * return < 0.
> + */
> +ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap,
> +			  struct user_namespace *dest_userns,
> +			  const struct vfs_caps *vfs_caps,
> +			  void *data, size_t size)
> +{
> +	struct vfs_ns_cap_data *caps = data;
> +	int ret;

This should very likely be ssize_t ret.

> +
> +	ret = __vfs_caps_to_xattr(idmap, dest_userns, vfs_caps, data, size);
> +	if (ret > 0 &&
> +	    (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) == VFS_CAP_REVISION_3 &&
> +	     le32_to_cpu(caps->rootid) == (uid_t)-1)
> +		return -EOVERFLOW;
> +	return ret;
> +}
> +
> +/**
> + * get_vfs_caps_from_disk - retrieve vfs caps from disk
> + *
> + * @idmap:	idmap of the mount the inode was found from
> + * @dentry:	dentry from which @inode is retrieved
> + * @cpu_caps:	vfs capabilities
> + *
> + * Extract the on-exec-apply capability sets for an executable file.
> + *
> + * If the inode has been found through an idmapped mount the idmap of
> + * the vfsmount must be passed through @idmap. This function will then
> + * take care to map the inode according to @idmap before checking
> + * permissions. On non-idmapped mounts or if permission checking is to be
> + * performed on the raw inode simply pass @nop_mnt_idmap.
> + */
> +int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
> +			   const struct dentry *dentry,
> +			   struct vfs_caps *cpu_caps)
> +{
> +	struct inode *inode = d_backing_inode(dentry);
> +	int size, ret;
> +	struct vfs_ns_cap_data data, *nscaps = &data;
> +
> +	if (!inode)
> +		return -ENODATA;
>  
> -	cpu_caps->rootid = rootvfsuid;
> +	size = __vfs_getxattr((struct dentry *)dentry, inode,
> +			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
> +	if (size == -ENODATA || size == -EOPNOTSUPP)
> +		/* no data, that's ok */
> +		return -ENODATA;
> +
> +	if (size < 0)
> +		return size;
> +
> +	ret = vfs_caps_from_xattr(idmap, inode->i_sb->s_user_ns,
> +				  cpu_caps, nscaps, size);
> +	if (ret == -EOVERFLOW)
> +		return -ENODATA;
> +	if (ret)
> +		return ret;
> +
> +	/* Limit the caps to the mounter of the filesystem
> +	 * or the more limited uid specified in the xattr.
> +	 */
> +	if (!rootid_owns_currentns(cpu_caps->rootid))
> +		return -ENODATA;
>  
>  	return 0;
>  }
> 
> -- 
> 2.43.0
>
Seth Forshee (DigitalOcean) Feb. 22, 2024, 3:38 p.m. UTC | #2
On Thu, Feb 22, 2024 at 04:20:08PM +0100, Christian Brauner wrote:
> > +	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> > +		vfs_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
> > +		vfs_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
> 
> That + makes this even more difficult to read. This should be rewritten.

Do you meant that you would prefer |= to +=, or do you have something
else in mind?

Note though that this is code that I didn't change, just moved.
Generally I tried to avoid changing code if it wasn't necessary for the
aims of this series.

> > +ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap,
> > +			  struct user_namespace *dest_userns,
> > +			  const struct vfs_caps *vfs_caps,
> > +			  void *data, size_t size)
> > +{
> > +	struct vfs_ns_cap_data *caps = data;
> > +	int ret;
> 
> This should very likely be ssize_t ret.

Indeed, I'll fix that.
Christian Brauner Feb. 23, 2024, 8:08 a.m. UTC | #3
On Thu, Feb 22, 2024 at 09:38:04AM -0600, Seth Forshee (DigitalOcean) wrote:
> On Thu, Feb 22, 2024 at 04:20:08PM +0100, Christian Brauner wrote:
> > > +	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> > > +		vfs_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
> > > +		vfs_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
> > 
> > That + makes this even more difficult to read. This should be rewritten.
> 
> Do you meant that you would prefer |= to +=, or do you have something

Yes.
Roberto Sassu March 1, 2024, 4:30 p.m. UTC | #4
On Wed, 2024-02-21 at 15:24 -0600, Seth Forshee (DigitalOcean) wrote:
> To pass around vfs_caps instead of raw xattr data we will need to
> convert between the two representations near userspace and disk
> boundaries. We already convert xattrs from disks to vfs_caps, so move
> that code into a helper, and change get_vfs_caps_from_disk() to use the
> helper.
> 
> When converting vfs_caps to xattrs we have different considerations
> depending on the destination of the xattr data. For xattrs which will be
> written to disk we need to reject the xattr if the rootid does not map
> into the filesystem's user namespace, whereas xattrs read by userspace
> may need to undergo a conversion from v3 to v2 format when the rootid
> does not map. So this helper is split into an internal and an external
> interface. The internal interface does not return an error if the rootid
> has no mapping in the target user namespace and will be used for
> conversions targeting userspace. The external interface returns
> EOVERFLOW if the rootid has no mapping and will be used for all other
> conversions.
> 
> Signed-off-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
> ---
>  include/linux/capability.h |  10 ++
>  security/commoncap.c       | 228 +++++++++++++++++++++++++++++++++++----------
>  2 files changed, 187 insertions(+), 51 deletions(-)
> 
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index eb46d346bbbc..a0893ac4664b 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -209,6 +209,16 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
>  		ns_capable(ns, CAP_SYS_ADMIN);
>  }
>  
> +/* helpers to convert between xattr and in-kernel representations */
> +int vfs_caps_from_xattr(struct mnt_idmap *idmap,
> +			struct user_namespace *src_userns,
> +			struct vfs_caps *vfs_caps,
> +			const void *data, size_t size);
> +ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap,
> +			  struct user_namespace *dest_userns,
> +			  const struct vfs_caps *vfs_caps,
> +			  void *data, size_t size);
> +
>  /* audit system wants to get cap info from files as well */
>  int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
>  			   const struct dentry *dentry,
> diff --git a/security/commoncap.c b/security/commoncap.c
> index a0b5c9740759..7531c9634997 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -619,54 +619,41 @@ static inline int bprm_caps_from_vfs_caps(struct vfs_caps *caps,
>  }
>  
>  /**
> - * get_vfs_caps_from_disk - retrieve vfs caps from disk
> + * vfs_caps_from_xattr - convert raw caps xattr data to vfs_caps
>   *
> - * @idmap:	idmap of the mount the inode was found from
> - * @dentry:	dentry from which @inode is retrieved
> - * @cpu_caps:	vfs capabilities
> + * @idmap:      idmap of the mount the inode was found from
> + * @src_userns: user namespace for ids in xattr data
> + * @vfs_caps:   destination buffer for vfs_caps data
> + * @data:       rax xattr caps data
> + * @size:       size of xattr data
>   *
> - * Extract the on-exec-apply capability sets for an executable file.
> + * Converts a raw security.capability xattr into the kernel-internal
> + * capabilities format.
>   *
> - * If the inode has been found through an idmapped mount the idmap of
> - * the vfsmount must be passed through @idmap. This function will then
> - * take care to map the inode according to @idmap before checking
> - * permissions. On non-idmapped mounts or if permission checking is to be
> - * performed on the raw inode simply pass @nop_mnt_idmap.
> + * If the xattr is being read or written through an idmapped mount the
> + * idmap of the vfsmount must be passed through @idmap. This function
> + * will then take care to map the rootid according to @idmap.
> + *
> + * Return: On success, return 0; on error, return < 0.
>   */
> -int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
> -			   const struct dentry *dentry,
> -			   struct vfs_caps *cpu_caps)
> +int vfs_caps_from_xattr(struct mnt_idmap *idmap,
> +			struct user_namespace *src_userns,
> +			struct vfs_caps *vfs_caps,
> +			const void *data, size_t size)
>  {
> -	struct inode *inode = d_backing_inode(dentry);
>  	__u32 magic_etc;
> -	int size;
> -	struct vfs_ns_cap_data data, *nscaps = &data;
> -	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
> +	const struct vfs_ns_cap_data *ns_caps = data;
> +	struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
>  	kuid_t rootkuid;
> -	vfsuid_t rootvfsuid;
> -	struct user_namespace *fs_ns;
> -
> -	memset(cpu_caps, 0, sizeof(struct vfs_caps));
> -
> -	if (!inode)
> -		return -ENODATA;
>  
> -	fs_ns = inode->i_sb->s_user_ns;
> -	size = __vfs_getxattr((struct dentry *)dentry, inode,
> -			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
> -	if (size == -ENODATA || size == -EOPNOTSUPP)
> -		/* no data, that's ok */
> -		return -ENODATA;
> -
> -	if (size < 0)
> -		return size;
> +	memset(vfs_caps, 0, sizeof(*vfs_caps));
>  
>  	if (size < sizeof(magic_etc))
>  		return -EINVAL;
>  
> -	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
> +	vfs_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
>  
> -	rootkuid = make_kuid(fs_ns, 0);
> +	rootkuid = make_kuid(src_userns, 0);
>  	switch (magic_etc & VFS_CAP_REVISION_MASK) {
>  	case VFS_CAP_REVISION_1:
>  		if (size != XATTR_CAPS_SZ_1)
> @@ -679,39 +666,178 @@ int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
>  	case VFS_CAP_REVISION_3:
>  		if (size != XATTR_CAPS_SZ_3)
>  			return -EINVAL;
> -		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
> +		rootkuid = make_kuid(src_userns, le32_to_cpu(ns_caps->rootid));
>  		break;
>  
>  	default:
>  		return -EINVAL;
>  	}
>  
> -	rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
> -	if (!vfsuid_valid(rootvfsuid))
> -		return -ENODATA;
> +	vfs_caps->rootid = make_vfsuid(idmap, src_userns, rootkuid);
> +	if (!vfsuid_valid(vfs_caps->rootid))
> +		return -EOVERFLOW;
>  
> -	/* Limit the caps to the mounter of the filesystem
> -	 * or the more limited uid specified in the xattr.
> +	vfs_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
> +	vfs_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
> +
> +	/*
> +	 * Rev1 had just a single 32-bit word, later expanded
> +	 * to a second one for the high bits
>  	 */
> -	if (!rootid_owns_currentns(rootvfsuid))
> -		return -ENODATA;
> +	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> +		vfs_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
> +		vfs_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
> +	}
> +
> +	vfs_caps->permitted.val &= CAP_VALID_MASK;
> +	vfs_caps->inheritable.val &= CAP_VALID_MASK;
> +
> +	return 0;
> +}
> +
> +/*
> + * Inner implementation of vfs_caps_to_xattr() which does not return an
> + * error if the rootid does not map into @dest_userns.
> + */
> +static ssize_t __vfs_caps_to_xattr(struct mnt_idmap *idmap,
> +				   struct user_namespace *dest_userns,
> +				   const struct vfs_caps *vfs_caps,
> +				   void *data, size_t size)
> +{
> +	struct vfs_ns_cap_data *ns_caps = data;
> +	struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
> +	kuid_t rootkuid;
> +	uid_t rootid;
> +
> +	memset(ns_caps, 0, size);

size -> sizeof(*ns_caps) (or an equivalent change)

I was zeroing more (the size of the buffer passed to vfs_getxattr()).

Roberto

> +
> +	rootid = 0;
> +	switch (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) {
> +	case VFS_CAP_REVISION_1:
> +		if (size < XATTR_CAPS_SZ_1)
> +			return -EINVAL;
> +		size = XATTR_CAPS_SZ_1;
> +		break;
> +	case VFS_CAP_REVISION_2:
> +		if (size < XATTR_CAPS_SZ_2)
> +			return -EINVAL;
> +		size = XATTR_CAPS_SZ_2;
> +		break;
> +	case VFS_CAP_REVISION_3:
> +		if (size < XATTR_CAPS_SZ_3)
> +			return -EINVAL;
> +		size = XATTR_CAPS_SZ_3;
> +		rootkuid = from_vfsuid(idmap, dest_userns, vfs_caps->rootid);
> +		rootid = from_kuid(dest_userns, rootkuid);
> +		ns_caps->rootid = cpu_to_le32(rootid);
> +		break;
>  
> -	cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
> -	cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	caps->magic_etc = cpu_to_le32(vfs_caps->magic_etc);
> +
> +	caps->data[0].permitted = cpu_to_le32(lower_32_bits(vfs_caps->permitted.val));
> +	caps->data[0].inheritable = cpu_to_le32(lower_32_bits(vfs_caps->inheritable.val));
>  
>  	/*
>  	 * Rev1 had just a single 32-bit word, later expanded
>  	 * to a second one for the high bits
>  	 */
> -	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> -		cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
> -		cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
> +	if ((vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> +		caps->data[1].permitted =
> +			cpu_to_le32(upper_32_bits(vfs_caps->permitted.val));
> +		caps->data[1].inheritable =
> +			cpu_to_le32(upper_32_bits(vfs_caps->inheritable.val));
>  	}
>  
> -	cpu_caps->permitted.val &= CAP_VALID_MASK;
> -	cpu_caps->inheritable.val &= CAP_VALID_MASK;
> +	return size;
> +}
> +
> +
> +/**
> + * vfs_caps_to_xattr - convert vfs_caps to raw caps xattr data
> + *
> + * @idmap:       idmap of the mount the inode was found from
> + * @dest_userns: user namespace for ids in xattr data
> + * @vfs_caps:    source vfs_caps data
> + * @data:        destination buffer for rax xattr caps data
> + * @size:        size of the @data buffer
> + *
> + * Converts a kernel-internal capability into the raw security.capability
> + * xattr format.
> + *
> + * If the xattr is being read or written through an idmapped mount the
> + * idmap of the vfsmount must be passed through @idmap. This function
> + * will then take care to map the rootid according to @idmap.
> + *
> + * Return: On success, return the size of the xattr data. On error,
> + * return < 0.
> + */
> +ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap,
> +			  struct user_namespace *dest_userns,
> +			  const struct vfs_caps *vfs_caps,
> +			  void *data, size_t size)
> +{
> +	struct vfs_ns_cap_data *caps = data;
> +	int ret;
> +
> +	ret = __vfs_caps_to_xattr(idmap, dest_userns, vfs_caps, data, size);
> +	if (ret > 0 &&
> +	    (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) == VFS_CAP_REVISION_3 &&
> +	     le32_to_cpu(caps->rootid) == (uid_t)-1)
> +		return -EOVERFLOW;
> +	return ret;
> +}
> +
> +/**
> + * get_vfs_caps_from_disk - retrieve vfs caps from disk
> + *
> + * @idmap:	idmap of the mount the inode was found from
> + * @dentry:	dentry from which @inode is retrieved
> + * @cpu_caps:	vfs capabilities
> + *
> + * Extract the on-exec-apply capability sets for an executable file.
> + *
> + * If the inode has been found through an idmapped mount the idmap of
> + * the vfsmount must be passed through @idmap. This function will then
> + * take care to map the inode according to @idmap before checking
> + * permissions. On non-idmapped mounts or if permission checking is to be
> + * performed on the raw inode simply pass @nop_mnt_idmap.
> + */
> +int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
> +			   const struct dentry *dentry,
> +			   struct vfs_caps *cpu_caps)
> +{
> +	struct inode *inode = d_backing_inode(dentry);
> +	int size, ret;
> +	struct vfs_ns_cap_data data, *nscaps = &data;
> +
> +	if (!inode)
> +		return -ENODATA;
>  
> -	cpu_caps->rootid = rootvfsuid;
> +	size = __vfs_getxattr((struct dentry *)dentry, inode,
> +			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
> +	if (size == -ENODATA || size == -EOPNOTSUPP)
> +		/* no data, that's ok */
> +		return -ENODATA;
> +
> +	if (size < 0)
> +		return size;
> +
> +	ret = vfs_caps_from_xattr(idmap, inode->i_sb->s_user_ns,
> +				  cpu_caps, nscaps, size);
> +	if (ret == -EOVERFLOW)
> +		return -ENODATA;
> +	if (ret)
> +		return ret;
> +
> +	/* Limit the caps to the mounter of the filesystem
> +	 * or the more limited uid specified in the xattr.
> +	 */
> +	if (!rootid_owns_currentns(cpu_caps->rootid))
> +		return -ENODATA;
>  
>  	return 0;
>  }
>
Seth Forshee (DigitalOcean) March 1, 2024, 7 p.m. UTC | #5
On Fri, Mar 01, 2024 at 05:30:55PM +0100, Roberto Sassu wrote:
> > +/*
> > + * Inner implementation of vfs_caps_to_xattr() which does not return an
> > + * error if the rootid does not map into @dest_userns.
> > + */
> > +static ssize_t __vfs_caps_to_xattr(struct mnt_idmap *idmap,
> > +				   struct user_namespace *dest_userns,
> > +				   const struct vfs_caps *vfs_caps,
> > +				   void *data, size_t size)
> > +{
> > +	struct vfs_ns_cap_data *ns_caps = data;
> > +	struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
> > +	kuid_t rootkuid;
> > +	uid_t rootid;
> > +
> > +	memset(ns_caps, 0, size);
> 
> size -> sizeof(*ns_caps) (or an equivalent change)

This is zeroing out the passed buffer, so it should use the size passed
for the buffer. sizeof(*ns_caps) could potentially be more than the size
of the buffer.

Maybe it would be clearer if it was memset(data, 0, size)?

> I was zeroing more (the size of the buffer passed to vfs_getxattr()).
> 
> Roberto
Roberto Sassu March 4, 2024, 8:33 a.m. UTC | #6
On Fri, 2024-03-01 at 13:00 -0600, Seth Forshee (DigitalOcean) wrote:
> On Fri, Mar 01, 2024 at 05:30:55PM +0100, Roberto Sassu wrote:
> > > +/*
> > > + * Inner implementation of vfs_caps_to_xattr() which does not return an
> > > + * error if the rootid does not map into @dest_userns.
> > > + */
> > > +static ssize_t __vfs_caps_to_xattr(struct mnt_idmap *idmap,
> > > +				   struct user_namespace *dest_userns,
> > > +				   const struct vfs_caps *vfs_caps,
> > > +				   void *data, size_t size)
> > > +{
> > > +	struct vfs_ns_cap_data *ns_caps = data;
> > > +	struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
> > > +	kuid_t rootkuid;
> > > +	uid_t rootid;
> > > +
> > > +	memset(ns_caps, 0, size);
> > 
> > size -> sizeof(*ns_caps) (or an equivalent change)
> 
> This is zeroing out the passed buffer, so it should use the size passed
> for the buffer. sizeof(*ns_caps) could potentially be more than the size
> of the buffer.

Uhm, then maybe the problem is that you are passing the wrong argument?

ssize_t
do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
	struct xattr_ctx *ctx)
{
	ssize_t error;
	char *kname = ctx->kname->name;

	if (is_fscaps_xattr(kname)) {
		struct vfs_caps caps;
		struct vfs_ns_cap_data data;
		int ret;

		ret = vfs_get_fscaps(idmap, d, &caps);
		if (ret)
			return ret;
		/*
		 * rootid is already in the mount idmap, so pass nop_mnt_idmap
		 * so that it won't be mapped.
		 */
		ret = vfs_caps_to_user_xattr(&nop_mnt_idmap, current_user_ns(),
					     &caps, &data, ctx->size);


ctx->size in my case is 1024 bytes.

Roberto

> Maybe it would be clearer if it was memset(data, 0, size)?
> 
> > I was zeroing more (the size of the buffer passed to vfs_getxattr()).
> > 
> > Roberto
Seth Forshee (DigitalOcean) March 4, 2024, 2:24 p.m. UTC | #7
On Mon, Mar 04, 2024 at 09:33:06AM +0100, Roberto Sassu wrote:
> On Fri, 2024-03-01 at 13:00 -0600, Seth Forshee (DigitalOcean) wrote:
> > On Fri, Mar 01, 2024 at 05:30:55PM +0100, Roberto Sassu wrote:
> > > > +/*
> > > > + * Inner implementation of vfs_caps_to_xattr() which does not return an
> > > > + * error if the rootid does not map into @dest_userns.
> > > > + */
> > > > +static ssize_t __vfs_caps_to_xattr(struct mnt_idmap *idmap,
> > > > +				   struct user_namespace *dest_userns,
> > > > +				   const struct vfs_caps *vfs_caps,
> > > > +				   void *data, size_t size)
> > > > +{
> > > > +	struct vfs_ns_cap_data *ns_caps = data;
> > > > +	struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
> > > > +	kuid_t rootkuid;
> > > > +	uid_t rootid;
> > > > +
> > > > +	memset(ns_caps, 0, size);
> > > 
> > > size -> sizeof(*ns_caps) (or an equivalent change)
> > 
> > This is zeroing out the passed buffer, so it should use the size passed
> > for the buffer. sizeof(*ns_caps) could potentially be more than the size
> > of the buffer.
> 
> Uhm, then maybe the problem is that you are passing the wrong argument?
> 
> ssize_t
> do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
> 	struct xattr_ctx *ctx)
> {
> 	ssize_t error;
> 	char *kname = ctx->kname->name;
> 
> 	if (is_fscaps_xattr(kname)) {
> 		struct vfs_caps caps;
> 		struct vfs_ns_cap_data data;
> 		int ret;
> 
> 		ret = vfs_get_fscaps(idmap, d, &caps);
> 		if (ret)
> 			return ret;
> 		/*
> 		 * rootid is already in the mount idmap, so pass nop_mnt_idmap
> 		 * so that it won't be mapped.
> 		 */
> 		ret = vfs_caps_to_user_xattr(&nop_mnt_idmap, current_user_ns(),
> 					     &caps, &data, ctx->size);
> 
> 
> ctx->size in my case is 1024 bytes.

Ah, yes that definitely isn't correct. I will fix it, thanks for finding
it.
diff mbox series

Patch

diff --git a/include/linux/capability.h b/include/linux/capability.h
index eb46d346bbbc..a0893ac4664b 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -209,6 +209,16 @@  static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
 		ns_capable(ns, CAP_SYS_ADMIN);
 }
 
+/* helpers to convert between xattr and in-kernel representations */
+int vfs_caps_from_xattr(struct mnt_idmap *idmap,
+			struct user_namespace *src_userns,
+			struct vfs_caps *vfs_caps,
+			const void *data, size_t size);
+ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap,
+			  struct user_namespace *dest_userns,
+			  const struct vfs_caps *vfs_caps,
+			  void *data, size_t size);
+
 /* audit system wants to get cap info from files as well */
 int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
 			   const struct dentry *dentry,
diff --git a/security/commoncap.c b/security/commoncap.c
index a0b5c9740759..7531c9634997 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -619,54 +619,41 @@  static inline int bprm_caps_from_vfs_caps(struct vfs_caps *caps,
 }
 
 /**
- * get_vfs_caps_from_disk - retrieve vfs caps from disk
+ * vfs_caps_from_xattr - convert raw caps xattr data to vfs_caps
  *
- * @idmap:	idmap of the mount the inode was found from
- * @dentry:	dentry from which @inode is retrieved
- * @cpu_caps:	vfs capabilities
+ * @idmap:      idmap of the mount the inode was found from
+ * @src_userns: user namespace for ids in xattr data
+ * @vfs_caps:   destination buffer for vfs_caps data
+ * @data:       rax xattr caps data
+ * @size:       size of xattr data
  *
- * Extract the on-exec-apply capability sets for an executable file.
+ * Converts a raw security.capability xattr into the kernel-internal
+ * capabilities format.
  *
- * If the inode has been found through an idmapped mount the idmap of
- * the vfsmount must be passed through @idmap. This function will then
- * take care to map the inode according to @idmap before checking
- * permissions. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply pass @nop_mnt_idmap.
+ * If the xattr is being read or written through an idmapped mount the
+ * idmap of the vfsmount must be passed through @idmap. This function
+ * will then take care to map the rootid according to @idmap.
+ *
+ * Return: On success, return 0; on error, return < 0.
  */
-int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
-			   const struct dentry *dentry,
-			   struct vfs_caps *cpu_caps)
+int vfs_caps_from_xattr(struct mnt_idmap *idmap,
+			struct user_namespace *src_userns,
+			struct vfs_caps *vfs_caps,
+			const void *data, size_t size)
 {
-	struct inode *inode = d_backing_inode(dentry);
 	__u32 magic_etc;
-	int size;
-	struct vfs_ns_cap_data data, *nscaps = &data;
-	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
+	const struct vfs_ns_cap_data *ns_caps = data;
+	struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
 	kuid_t rootkuid;
-	vfsuid_t rootvfsuid;
-	struct user_namespace *fs_ns;
-
-	memset(cpu_caps, 0, sizeof(struct vfs_caps));
-
-	if (!inode)
-		return -ENODATA;
 
-	fs_ns = inode->i_sb->s_user_ns;
-	size = __vfs_getxattr((struct dentry *)dentry, inode,
-			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
-	if (size == -ENODATA || size == -EOPNOTSUPP)
-		/* no data, that's ok */
-		return -ENODATA;
-
-	if (size < 0)
-		return size;
+	memset(vfs_caps, 0, sizeof(*vfs_caps));
 
 	if (size < sizeof(magic_etc))
 		return -EINVAL;
 
-	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
+	vfs_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
 
-	rootkuid = make_kuid(fs_ns, 0);
+	rootkuid = make_kuid(src_userns, 0);
 	switch (magic_etc & VFS_CAP_REVISION_MASK) {
 	case VFS_CAP_REVISION_1:
 		if (size != XATTR_CAPS_SZ_1)
@@ -679,39 +666,178 @@  int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
 	case VFS_CAP_REVISION_3:
 		if (size != XATTR_CAPS_SZ_3)
 			return -EINVAL;
-		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
+		rootkuid = make_kuid(src_userns, le32_to_cpu(ns_caps->rootid));
 		break;
 
 	default:
 		return -EINVAL;
 	}
 
-	rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
-	if (!vfsuid_valid(rootvfsuid))
-		return -ENODATA;
+	vfs_caps->rootid = make_vfsuid(idmap, src_userns, rootkuid);
+	if (!vfsuid_valid(vfs_caps->rootid))
+		return -EOVERFLOW;
 
-	/* Limit the caps to the mounter of the filesystem
-	 * or the more limited uid specified in the xattr.
+	vfs_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
+	vfs_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
+
+	/*
+	 * Rev1 had just a single 32-bit word, later expanded
+	 * to a second one for the high bits
 	 */
-	if (!rootid_owns_currentns(rootvfsuid))
-		return -ENODATA;
+	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
+		vfs_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
+		vfs_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
+	}
+
+	vfs_caps->permitted.val &= CAP_VALID_MASK;
+	vfs_caps->inheritable.val &= CAP_VALID_MASK;
+
+	return 0;
+}
+
+/*
+ * Inner implementation of vfs_caps_to_xattr() which does not return an
+ * error if the rootid does not map into @dest_userns.
+ */
+static ssize_t __vfs_caps_to_xattr(struct mnt_idmap *idmap,
+				   struct user_namespace *dest_userns,
+				   const struct vfs_caps *vfs_caps,
+				   void *data, size_t size)
+{
+	struct vfs_ns_cap_data *ns_caps = data;
+	struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
+	kuid_t rootkuid;
+	uid_t rootid;
+
+	memset(ns_caps, 0, size);
+
+	rootid = 0;
+	switch (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) {
+	case VFS_CAP_REVISION_1:
+		if (size < XATTR_CAPS_SZ_1)
+			return -EINVAL;
+		size = XATTR_CAPS_SZ_1;
+		break;
+	case VFS_CAP_REVISION_2:
+		if (size < XATTR_CAPS_SZ_2)
+			return -EINVAL;
+		size = XATTR_CAPS_SZ_2;
+		break;
+	case VFS_CAP_REVISION_3:
+		if (size < XATTR_CAPS_SZ_3)
+			return -EINVAL;
+		size = XATTR_CAPS_SZ_3;
+		rootkuid = from_vfsuid(idmap, dest_userns, vfs_caps->rootid);
+		rootid = from_kuid(dest_userns, rootkuid);
+		ns_caps->rootid = cpu_to_le32(rootid);
+		break;
 
-	cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
-	cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
+	default:
+		return -EINVAL;
+	}
+
+	caps->magic_etc = cpu_to_le32(vfs_caps->magic_etc);
+
+	caps->data[0].permitted = cpu_to_le32(lower_32_bits(vfs_caps->permitted.val));
+	caps->data[0].inheritable = cpu_to_le32(lower_32_bits(vfs_caps->inheritable.val));
 
 	/*
 	 * Rev1 had just a single 32-bit word, later expanded
 	 * to a second one for the high bits
 	 */
-	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
-		cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
-		cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
+	if ((vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
+		caps->data[1].permitted =
+			cpu_to_le32(upper_32_bits(vfs_caps->permitted.val));
+		caps->data[1].inheritable =
+			cpu_to_le32(upper_32_bits(vfs_caps->inheritable.val));
 	}
 
-	cpu_caps->permitted.val &= CAP_VALID_MASK;
-	cpu_caps->inheritable.val &= CAP_VALID_MASK;
+	return size;
+}
+
+
+/**
+ * vfs_caps_to_xattr - convert vfs_caps to raw caps xattr data
+ *
+ * @idmap:       idmap of the mount the inode was found from
+ * @dest_userns: user namespace for ids in xattr data
+ * @vfs_caps:    source vfs_caps data
+ * @data:        destination buffer for rax xattr caps data
+ * @size:        size of the @data buffer
+ *
+ * Converts a kernel-internal capability into the raw security.capability
+ * xattr format.
+ *
+ * If the xattr is being read or written through an idmapped mount the
+ * idmap of the vfsmount must be passed through @idmap. This function
+ * will then take care to map the rootid according to @idmap.
+ *
+ * Return: On success, return the size of the xattr data. On error,
+ * return < 0.
+ */
+ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap,
+			  struct user_namespace *dest_userns,
+			  const struct vfs_caps *vfs_caps,
+			  void *data, size_t size)
+{
+	struct vfs_ns_cap_data *caps = data;
+	int ret;
+
+	ret = __vfs_caps_to_xattr(idmap, dest_userns, vfs_caps, data, size);
+	if (ret > 0 &&
+	    (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) == VFS_CAP_REVISION_3 &&
+	     le32_to_cpu(caps->rootid) == (uid_t)-1)
+		return -EOVERFLOW;
+	return ret;
+}
+
+/**
+ * get_vfs_caps_from_disk - retrieve vfs caps from disk
+ *
+ * @idmap:	idmap of the mount the inode was found from
+ * @dentry:	dentry from which @inode is retrieved
+ * @cpu_caps:	vfs capabilities
+ *
+ * Extract the on-exec-apply capability sets for an executable file.
+ *
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then
+ * take care to map the inode according to @idmap before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply pass @nop_mnt_idmap.
+ */
+int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
+			   const struct dentry *dentry,
+			   struct vfs_caps *cpu_caps)
+{
+	struct inode *inode = d_backing_inode(dentry);
+	int size, ret;
+	struct vfs_ns_cap_data data, *nscaps = &data;
+
+	if (!inode)
+		return -ENODATA;
 
-	cpu_caps->rootid = rootvfsuid;
+	size = __vfs_getxattr((struct dentry *)dentry, inode,
+			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
+	if (size == -ENODATA || size == -EOPNOTSUPP)
+		/* no data, that's ok */
+		return -ENODATA;
+
+	if (size < 0)
+		return size;
+
+	ret = vfs_caps_from_xattr(idmap, inode->i_sb->s_user_ns,
+				  cpu_caps, nscaps, size);
+	if (ret == -EOVERFLOW)
+		return -ENODATA;
+	if (ret)
+		return ret;
+
+	/* Limit the caps to the mounter of the filesystem
+	 * or the more limited uid specified in the xattr.
+	 */
+	if (!rootid_owns_currentns(cpu_caps->rootid))
+		return -ENODATA;
 
 	return 0;
 }