diff mbox series

[v2,01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

Message ID 20201207163255.564116-2-mszeredi@redhat.com (mailing list archive)
State New, archived
Headers show
Series allow unprivileged overlay mounts | expand

Commit Message

Miklos Szeredi Dec. 7, 2020, 4:32 p.m. UTC
cap_convert_nscap() does permission checking as well as conversion of the
xattr value conditionally based on fs's user-ns.

This is needed by overlayfs and probably other layered fs (ecryptfs) and is
what vfs_foo() is supposed to do anyway.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/xattr.c                 | 17 +++++++++++------
 include/linux/capability.h |  2 +-
 security/commoncap.c       |  3 +--
 3 files changed, 13 insertions(+), 9 deletions(-)

Comments

James Morris Dec. 9, 2020, 1:53 a.m. UTC | #1
On Mon, 7 Dec 2020, Miklos Szeredi wrote:

> cap_convert_nscap() does permission checking as well as conversion of the
> xattr value conditionally based on fs's user-ns.
> 
> This is needed by overlayfs and probably other layered fs (ecryptfs) and is
> what vfs_foo() is supposed to do anyway.
> 
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>


Acked-by: James Morris <jamorris@linux.microsoft.com>
Eric W. Biederman Jan. 1, 2021, 5:35 p.m. UTC | #2
Miklos Szeredi <mszeredi@redhat.com> writes:

> cap_convert_nscap() does permission checking as well as conversion of the
> xattr value conditionally based on fs's user-ns.
>
> This is needed by overlayfs and probably other layered fs (ecryptfs) and is
> what vfs_foo() is supposed to do anyway.

Well crap.

I just noticed this and it turns out this change is wrong.

The problem is that it reads the rootid from the v3 fscap, using
current_user_ns() and then writes it using the sb->s_user_ns.

So any time the stacked filesystems sb->s_user_ns do not match or
current_user_ns does not match sb->s_user_ns this could be a problem.

In a stacked filesystem a second pass through vfs_setxattr will result
in the rootid being translated a second time (with potentially the wrong
namespaces).  I think because of the security checks a we won't write
something we shouldn't be able to write to the filesystem.  Still we
will be writing the wrong v3 fscap which can go quite badly.

This doesn't look terribly difficult to fix.

Probably convert this into a fs independent form using uids in
init_user_ns at input and have cap_convert_nscap convert the v3 fscap
into the filesystem dependent form.  With some way for stackable
filesystems to just skip converting it from the filesystem independent
format.

Uids in xattrs that are expected to go directly to disk, but aren't
always suitable for going directly to disk are tricky.

Eric

> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  fs/xattr.c                 | 17 +++++++++++------
>  include/linux/capability.h |  2 +-
>  security/commoncap.c       |  3 +--
>  3 files changed, 13 insertions(+), 9 deletions(-)
>
> diff --git a/fs/xattr.c b/fs/xattr.c
> index cd7a563e8bcd..fd57153b1f61 100644
> --- a/fs/xattr.c
> +++ b/fs/xattr.c
> @@ -276,8 +276,16 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
>  {
>  	struct inode *inode = dentry->d_inode;
>  	struct inode *delegated_inode = NULL;
> +	const void  *orig_value = value;
>  	int error;
>  
> +	if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
> +		error = cap_convert_nscap(dentry, &value, size);
> +		if (error < 0)
> +			return error;
> +		size = error;
> +	}
> +
>  retry_deleg:
>  	inode_lock(inode);
>  	error = __vfs_setxattr_locked(dentry, name, value, size, flags,
> @@ -289,6 +297,9 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
>  		if (!error)
>  			goto retry_deleg;
>  	}
> +	if (value != orig_value)
> +		kfree(value);
> +
>  	return error;
>  }
>  EXPORT_SYMBOL_GPL(vfs_setxattr);
> @@ -537,12 +548,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
>  		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
>  		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
>  			posix_acl_fix_xattr_from_user(kvalue, size);
> -		else if (strcmp(kname, XATTR_NAME_CAPS) == 0) {
> -			error = cap_convert_nscap(d, &kvalue, size);
> -			if (error < 0)
> -				goto out;
> -			size = error;
> -		}
>  	}
>  
>  	error = vfs_setxattr(d, kname, kvalue, size, flags);
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index 1e7fe311cabe..b2f698915c0f 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -270,6 +270,6 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
>  /* audit system wants to get cap info from files as well */
>  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
>  
> -extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size);
> +extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size);
>  
>  #endif /* !_LINUX_CAPABILITY_H */
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 59bf3c1674c8..bacc1111d871 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -473,7 +473,7 @@ static bool validheader(size_t size, const struct vfs_cap_data *cap)
>   *
>   * If all is ok, we return the new size, on error return < 0.
>   */
> -int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
> +int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
>  {
>  	struct vfs_ns_cap_data *nscap;
>  	uid_t nsrootid;
> @@ -516,7 +516,6 @@ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
>  	nscap->magic_etc = cpu_to_le32(nsmagic);
>  	memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
>  
> -	kvfree(*ivalue);
>  	*ivalue = nscap;
>  	return newsize;
>  }
Miklos Szeredi Jan. 11, 2021, 1:49 p.m. UTC | #3
On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:
> Miklos Szeredi <mszeredi@redhat.com> writes:
> 
> > cap_convert_nscap() does permission checking as well as conversion of the
> > xattr value conditionally based on fs's user-ns.
> >
> > This is needed by overlayfs and probably other layered fs (ecryptfs) and is
> > what vfs_foo() is supposed to do anyway.
> 
> Well crap.
> 
> I just noticed this and it turns out this change is wrong.
> 
> The problem is that it reads the rootid from the v3 fscap, using
> current_user_ns() and then writes it using the sb->s_user_ns.
> 
> So any time the stacked filesystems sb->s_user_ns do not match or
> current_user_ns does not match sb->s_user_ns this could be a problem.
> 
> In a stacked filesystem a second pass through vfs_setxattr will result
> in the rootid being translated a second time (with potentially the wrong
> namespaces).  I think because of the security checks a we won't write
> something we shouldn't be able to write to the filesystem.  Still we
> will be writing the wrong v3 fscap which can go quite badly.
> 
> This doesn't look terribly difficult to fix.
> 
> Probably convert this into a fs independent form using uids in
> init_user_ns at input and have cap_convert_nscap convert the v3 fscap
> into the filesystem dependent form.  With some way for stackable
> filesystems to just skip converting it from the filesystem independent
> format.
> 
> Uids in xattrs that are expected to go directly to disk, but aren't
> always suitable for going directly to disk are tricky.

I've been looking at this for a couple of days and can't say I clearly
understand everything yet.

For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a rootid of
zero, right?

If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
succeeding unconditionally while v3 one being either converted to v2, rejected
or left as v3 depending on current_user_ns())?

Anyway, here's a patch that I think fixes getxattr() layering for
security.capability.  Does basically what you suggested.  Slight change of
semantics vs. v1 caps, not sure if that is still needed, getxattr()/setxattr()
hasn't worked for these since the introduction of v3 in 4.14.  Untested.

I still need to wrap my head around the permission requirements for the
setxattr() case...

Thanks,
Miklos

---
 fs/overlayfs/super.c       |   15 +++
 include/linux/capability.h |    2 
 include/linux/fs.h         |    1 
 security/commoncap.c       |  210 ++++++++++++++++++++++++---------------------
 4 files changed, 132 insertions(+), 96 deletions(-)

--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -395,6 +395,20 @@ static int ovl_remount(struct super_bloc
 	return ret;
 }
 
+static int ovl_cap_get(struct dentry *dentry,
+		       struct vfs_ns_cap_data *nscap)
+{
+	int res;
+	const struct cred *old_cred;
+	struct dentry *realdentry = ovl_dentry_real(dentry);
+
+	old_cred = ovl_override_creds(dentry->d_sb);
+	res = vfs_cap_get(realdentry, nscap);
+	revert_creds(old_cred);
+
+	return res;
+}
+
 static const struct super_operations ovl_super_operations = {
 	.alloc_inode	= ovl_alloc_inode,
 	.free_inode	= ovl_free_inode,
@@ -405,6 +419,7 @@ static const struct super_operations ovl
 	.statfs		= ovl_statfs,
 	.show_options	= ovl_show_options,
 	.remount_fs	= ovl_remount,
+	.cap_get	= ovl_cap_get,
 };
 
 enum {
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -272,4 +272,6 @@ extern int get_vfs_caps_from_disk(const
 
 extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size);
 
+int vfs_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap);
+
 #endif /* !_LINUX_CAPABILITY_H */
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1963,6 +1963,7 @@ struct super_operations {
 				  struct shrink_control *);
 	long (*free_cached_objects)(struct super_block *,
 				    struct shrink_control *);
+	int (*cap_get)(struct dentry *dentry, struct vfs_ns_cap_data *nscap);
 };
 
 /*
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -341,6 +341,13 @@ static __u32 sansflags(__u32 m)
 	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
 }
 
+static bool is_v1header(size_t size, const struct vfs_cap_data *cap)
+{
+	if (size != XATTR_CAPS_SZ_1)
+		return false;
+	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_1;
+}
+
 static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
 {
 	if (size != XATTR_CAPS_SZ_2)
@@ -355,6 +362,72 @@ static bool is_v3header(size_t size, con
 	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
 }
 
+static bool validheader(size_t size, const struct vfs_cap_data *cap)
+{
+	return is_v1header(size, cap) || is_v2header(size, cap) || is_v3header(size, cap);
+}
+
+static void cap_to_v3(const struct vfs_cap_data *cap,
+			 struct vfs_ns_cap_data *nscap)
+{
+	u32 magic, nsmagic;
+
+	nsmagic = VFS_CAP_REVISION_3;
+	magic = le32_to_cpu(cap->magic_etc);
+	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
+		nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
+	nscap->magic_etc = cpu_to_le32(nsmagic);
+	nscap->rootid = cpu_to_le32(0);
+}
+
+static int default_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap)
+{
+	int err;
+	ssize_t size;
+	kuid_t kroot;
+	uid_t root, mappedroot;
+	char *tmpbuf = NULL;
+	struct vfs_cap_data *cap;
+	struct user_namespace *fs_ns = dentry->d_sb->s_user_ns;
+
+	size = vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, &tmpbuf,
+				  sizeof(struct vfs_ns_cap_data), GFP_NOFS);
+	if (size < 0)
+		return size;
+
+	cap = (struct vfs_cap_data *) tmpbuf;
+	err = -EINVAL;
+	if (!validheader(size, cap))
+		goto out;
+
+	memset(nscap, 0, sizeof(*nscap));
+	memcpy(nscap, tmpbuf, size);
+	if (!is_v3header(size, cap))
+		cap_to_v3(cap, nscap);
+
+	/* Convert rootid from fs user namespace to init user namespace */
+	root = le32_to_cpu(nscap->rootid);
+	kroot = make_kuid(fs_ns, root);
+	mappedroot = from_kuid(&init_user_ns, kroot);
+	nscap->rootid = cpu_to_le32(mappedroot);
+
+	err = 0;
+out:
+	kfree(tmpbuf);
+	return err;
+}
+
+int vfs_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap)
+{
+	struct super_block *sb = dentry->d_sb;
+
+	if (sb->s_op->cap_get)
+		return sb->s_op->cap_get(dentry, nscap);
+	else
+		return default_cap_get(dentry, nscap);
+}
+EXPORT_SYMBOL(vfs_cap_get);
+
 /*
  * getsecurity: We are called for security.* before any attempt to read the
  * xattr from the inode itself.
@@ -369,14 +442,15 @@ static bool is_v3header(size_t size, con
 int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 			  bool alloc)
 {
-	int size, ret;
+	int ret;
+	ssize_t size;
 	kuid_t kroot;
+	__le32 nsmagic, magic;
 	uid_t root, mappedroot;
-	char *tmpbuf = NULL;
+	void *tmpbuf = NULL;
 	struct vfs_cap_data *cap;
-	struct vfs_ns_cap_data *nscap;
+	struct vfs_ns_cap_data nscap;
 	struct dentry *dentry;
-	struct user_namespace *fs_ns;
 
 	if (strcmp(name, "capability") != 0)
 		return -EOPNOTSUPP;
@@ -385,67 +459,50 @@ int cap_inode_getsecurity(struct inode *
 	if (!dentry)
 		return -EINVAL;
 
-	size = sizeof(struct vfs_ns_cap_data);
-	ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
-				 &tmpbuf, size, GFP_NOFS);
+	ret = vfs_cap_get(dentry, &nscap);
 	dput(dentry);
 
 	if (ret < 0)
 		return ret;
 
-	fs_ns = inode->i_sb->s_user_ns;
-	cap = (struct vfs_cap_data *) tmpbuf;
-	if (is_v2header((size_t) ret, cap)) {
-		/* If this is sizeof(vfs_cap_data) then we're ok with the
-		 * on-disk value, so return that.  */
-		if (alloc)
-			*buffer = tmpbuf;
-		else
-			kfree(tmpbuf);
-		return ret;
-	} else if (!is_v3header((size_t) ret, cap)) {
-		kfree(tmpbuf);
-		return -EINVAL;
-	}
+	tmpbuf = kmalloc(sizeof(struct vfs_ns_cap_data), GFP_NOFS);
+	if (!tmpbuf)
+		return -ENOMEM;
 
-	nscap = (struct vfs_ns_cap_data *) tmpbuf;
-	root = le32_to_cpu(nscap->rootid);
-	kroot = make_kuid(fs_ns, root);
+	root = le32_to_cpu(nscap.rootid);
+	kroot = make_kuid(&init_user_ns, root);
 
 	/* If the root kuid maps to a valid uid in current ns, then return
 	 * this as a nscap. */
 	mappedroot = from_kuid(current_user_ns(), kroot);
 	if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
+		size = sizeof(struct vfs_cap_data);
 		if (alloc) {
 			*buffer = tmpbuf;
-			nscap->rootid = cpu_to_le32(mappedroot);
-		} else
-			kfree(tmpbuf);
-		return size;
+			tmpbuf = NULL;
+			nscap.rootid = cpu_to_le32(mappedroot);
+			memcpy(*buffer, &nscap, size);
+		}
+		goto out;
 	}
 
-	if (!rootid_owns_currentns(kroot)) {
-		kfree(tmpbuf);
-		return -EOPNOTSUPP;
-	}
+	size = -EOPNOTSUPP;
+	if (!rootid_owns_currentns(kroot))
+		goto out;
 
 	/* This comes from a parent namespace.  Return as a v2 capability */
 	size = sizeof(struct vfs_cap_data);
 	if (alloc) {
-		*buffer = kmalloc(size, GFP_ATOMIC);
-		if (*buffer) {
-			struct vfs_cap_data *cap = *buffer;
-			__le32 nsmagic, magic;
-			magic = VFS_CAP_REVISION_2;
-			nsmagic = le32_to_cpu(nscap->magic_etc);
-			if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
-				magic |= VFS_CAP_FLAGS_EFFECTIVE;
-			memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
-			cap->magic_etc = cpu_to_le32(magic);
-		} else {
-			size = -ENOMEM;
-		}
+		cap = *buffer = tmpbuf;
+		tmpbuf = NULL;
+		magic = VFS_CAP_REVISION_2;
+		nsmagic = le32_to_cpu(nscap.magic_etc);
+		if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
+			magic |= VFS_CAP_FLAGS_EFFECTIVE;
+		memcpy(&cap->data, &nscap.data, sizeof(__le32) * 2 * VFS_CAP_U32);
+		cap->magic_etc = cpu_to_le32(magic);
 	}
+out:
 	kfree(tmpbuf);
 	return size;
 }
@@ -462,11 +519,6 @@ static kuid_t rootid_from_xattr(const vo
 	return make_kuid(task_ns, rootid);
 }
 
-static bool validheader(size_t size, const struct vfs_cap_data *cap)
-{
-	return is_v2header(size, cap) || is_v3header(size, cap);
-}
-
 /*
  * User requested a write of security.capability.  If needed, update the
  * xattr to change from v2 to v3, or to fixup the v3 rootid.
@@ -570,74 +622,40 @@ static inline int bprm_caps_from_vfs_cap
 int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
 {
 	struct inode *inode = d_backing_inode(dentry);
-	__u32 magic_etc;
-	unsigned tocopy, i;
-	int size;
-	struct vfs_ns_cap_data data, *nscaps = &data;
-	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
-	kuid_t rootkuid;
-	struct user_namespace *fs_ns;
+	unsigned int i;
+	int ret;
+	struct vfs_ns_cap_data nscaps;
 
 	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
 
 	if (!inode)
 		return -ENODATA;
 
-	fs_ns = inode->i_sb->s_user_ns;
-	size = __vfs_getxattr((struct dentry *)dentry, inode,
-			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
-	if (size == -ENODATA || size == -EOPNOTSUPP)
+	ret = vfs_cap_get((struct dentry *) dentry, &nscaps);
+	if (ret == -ENODATA || ret == -EOPNOTSUPP)
 		/* no data, that's ok */
 		return -ENODATA;
 
-	if (size < 0)
-		return size;
-
-	if (size < sizeof(magic_etc))
-		return -EINVAL;
-
-	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
+	if (ret < 0)
+		return ret;
 
-	rootkuid = make_kuid(fs_ns, 0);
-	switch (magic_etc & VFS_CAP_REVISION_MASK) {
-	case VFS_CAP_REVISION_1:
-		if (size != XATTR_CAPS_SZ_1)
-			return -EINVAL;
-		tocopy = VFS_CAP_U32_1;
-		break;
-	case VFS_CAP_REVISION_2:
-		if (size != XATTR_CAPS_SZ_2)
-			return -EINVAL;
-		tocopy = VFS_CAP_U32_2;
-		break;
-	case VFS_CAP_REVISION_3:
-		if (size != XATTR_CAPS_SZ_3)
-			return -EINVAL;
-		tocopy = VFS_CAP_U32_3;
-		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
-		break;
+	cpu_caps->magic_etc = le32_to_cpu(nscaps.magic_etc);
+	cpu_caps->rootid = make_kuid(&init_user_ns, le32_to_cpu(nscaps.rootid));
 
-	default:
-		return -EINVAL;
-	}
 	/* Limit the caps to the mounter of the filesystem
 	 * or the more limited uid specified in the xattr.
 	 */
-	if (!rootid_owns_currentns(rootkuid))
+	if (!rootid_owns_currentns(cpu_caps->rootid))
 		return -ENODATA;
 
 	CAP_FOR_EACH_U32(i) {
-		if (i >= tocopy)
-			break;
-		cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
-		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
+		cpu_caps->permitted.cap[i] = le32_to_cpu(nscaps.data[i].permitted);
+		cpu_caps->inheritable.cap[i] = le32_to_cpu(nscaps.data[i].inheritable);
 	}
 
 	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
 	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
 
-	cpu_caps->rootid = rootkuid;
-
 	return 0;
 }
Eric W. Biederman Jan. 12, 2021, 12:14 a.m. UTC | #4
Miklos Szeredi <miklos@szeredi.hu> writes:

> On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:
>> Miklos Szeredi <mszeredi@redhat.com> writes:
>> 
>> > cap_convert_nscap() does permission checking as well as conversion of the
>> > xattr value conditionally based on fs's user-ns.
>> >
>> > This is needed by overlayfs and probably other layered fs (ecryptfs) and is
>> > what vfs_foo() is supposed to do anyway.
>> 
>> Well crap.
>> 
>> I just noticed this and it turns out this change is wrong.
>> 
>> The problem is that it reads the rootid from the v3 fscap, using
>> current_user_ns() and then writes it using the sb->s_user_ns.
>> 
>> So any time the stacked filesystems sb->s_user_ns do not match or
>> current_user_ns does not match sb->s_user_ns this could be a problem.
>> 
>> In a stacked filesystem a second pass through vfs_setxattr will result
>> in the rootid being translated a second time (with potentially the wrong
>> namespaces).  I think because of the security checks a we won't write
>> something we shouldn't be able to write to the filesystem.  Still we
>> will be writing the wrong v3 fscap which can go quite badly.
>> 
>> This doesn't look terribly difficult to fix.
>> 
>> Probably convert this into a fs independent form using uids in
>> init_user_ns at input and have cap_convert_nscap convert the v3 fscap
>> into the filesystem dependent form.  With some way for stackable
>> filesystems to just skip converting it from the filesystem independent
>> format.
>> 
>> Uids in xattrs that are expected to go directly to disk, but aren't
>> always suitable for going directly to disk are tricky.
>
> I've been looking at this for a couple of days and can't say I clearly
> understand everything yet.
>
> For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a rootid of
> zero, right?

Yes.  This assumes that everything is translated into the uids of the
target filesystem.

> If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
> succeeding unconditionally while v3 one being either converted to v2, rejected
> or left as v3 depending on current_user_ns())?

As I understand it v2 fscaps have always succeeded unconditionally.  The
only case I can see for a v2 fscap might not succeed when read is if the
filesystem is outside of the initial user namespace.


> Anyway, here's a patch that I think fixes getxattr() layering for
> security.capability.  Does basically what you suggested.  Slight change of
> semantics vs. v1 caps, not sure if that is still needed, getxattr()/setxattr()
> hasn't worked for these since the introduction of v3 in 4.14.
> Untested.

Taking a look.  The goal of change how these operate is to make it so
that layered filesystems can just pass through the data if they don't
want to change anything (even with the user namespaces of the
filesystems in question are different).

Feedback on the code below:
- cap_get should be in inode_operations like get_acl and set_acl.

- cap_get should return a cpu_vfs_cap_data.

  Which means that only make_kuid is needed when reading the cap from
  disk.

  Which means that except for the rootid_owns_currentns check (which
  needs to happen elsewhere) default_cap_get should be today's
  get_vfs_cap_from_disk.

- With the introduction of cap_get I believe commoncap should stop
  implementing the security_inode_getsecurity hook, and rather have
  getxattr observe is the file capability xatter and call the new
  vfs_cap_get then translate to a v2 or v3 cap as appropriate when
  returning the cap to userspace.

I think that would put the code on a solid comprehensible foundation.

Eric


> I still need to wrap my head around the permission requirements for the
> setxattr() case...
>
> Thanks,
> Miklos
>
> ---
>  fs/overlayfs/super.c       |   15 +++
>  include/linux/capability.h |    2 
>  include/linux/fs.h         |    1 
>  security/commoncap.c       |  210 ++++++++++++++++++++++++---------------------
>  4 files changed, 132 insertions(+), 96 deletions(-)
>
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -395,6 +395,20 @@ static int ovl_remount(struct super_bloc
>  	return ret;
>  }
>  
> +static int ovl_cap_get(struct dentry *dentry,
> +		       struct vfs_ns_cap_data *nscap)
> +{
> +	int res;
> +	const struct cred *old_cred;
> +	struct dentry *realdentry = ovl_dentry_real(dentry);
> +
> +	old_cred = ovl_override_creds(dentry->d_sb);
> +	res = vfs_cap_get(realdentry, nscap);
> +	revert_creds(old_cred);
> +
> +	return res;
> +}
> +
>  static const struct super_operations ovl_super_operations = {
>  	.alloc_inode	= ovl_alloc_inode,
>  	.free_inode	= ovl_free_inode,
> @@ -405,6 +419,7 @@ static const struct super_operations ovl
>  	.statfs		= ovl_statfs,
>  	.show_options	= ovl_show_options,
>  	.remount_fs	= ovl_remount,
> +	.cap_get	= ovl_cap_get,
>  };
>  
>  enum {
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -272,4 +272,6 @@ extern int get_vfs_caps_from_disk(const
>  
>  extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size);
>  
> +int vfs_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap);
> +
>  #endif /* !_LINUX_CAPABILITY_H */
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1963,6 +1963,7 @@ struct super_operations {
>  				  struct shrink_control *);
>  	long (*free_cached_objects)(struct super_block *,
>  				    struct shrink_control *);
> +	int (*cap_get)(struct dentry *dentry, struct vfs_ns_cap_data *nscap);
>  };
>  
>  /*
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -341,6 +341,13 @@ static __u32 sansflags(__u32 m)
>  	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
>  }
>  
> +static bool is_v1header(size_t size, const struct vfs_cap_data *cap)
> +{
> +	if (size != XATTR_CAPS_SZ_1)
> +		return false;
> +	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_1;
> +}
> +
>  static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
>  {
>  	if (size != XATTR_CAPS_SZ_2)
> @@ -355,6 +362,72 @@ static bool is_v3header(size_t size, con
>  	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
>  }
>  
> +static bool validheader(size_t size, const struct vfs_cap_data *cap)
> +{
> +	return is_v1header(size, cap) || is_v2header(size, cap) || is_v3header(size, cap);
> +}
> +
> +static void cap_to_v3(const struct vfs_cap_data *cap,
> +			 struct vfs_ns_cap_data *nscap)
> +{
> +	u32 magic, nsmagic;
> +
> +	nsmagic = VFS_CAP_REVISION_3;
> +	magic = le32_to_cpu(cap->magic_etc);
> +	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
> +		nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
> +	nscap->magic_etc = cpu_to_le32(nsmagic);
> +	nscap->rootid = cpu_to_le32(0);
> +}
> +
> +static int default_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap)
> +{
> +	int err;
> +	ssize_t size;
> +	kuid_t kroot;
> +	uid_t root, mappedroot;
> +	char *tmpbuf = NULL;
> +	struct vfs_cap_data *cap;
> +	struct user_namespace *fs_ns = dentry->d_sb->s_user_ns;
> +
> +	size = vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, &tmpbuf,
> +				  sizeof(struct vfs_ns_cap_data), GFP_NOFS);
> +	if (size < 0)
> +		return size;
> +
> +	cap = (struct vfs_cap_data *) tmpbuf;
> +	err = -EINVAL;
> +	if (!validheader(size, cap))
> +		goto out;
> +
> +	memset(nscap, 0, sizeof(*nscap));
> +	memcpy(nscap, tmpbuf, size);
> +	if (!is_v3header(size, cap))
> +		cap_to_v3(cap, nscap);
> +
> +	/* Convert rootid from fs user namespace to init user namespace */
> +	root = le32_to_cpu(nscap->rootid);
> +	kroot = make_kuid(fs_ns, root);
> +	mappedroot = from_kuid(&init_user_ns, kroot);
> +	nscap->rootid = cpu_to_le32(mappedroot);
> +
> +	err = 0;
> +out:
> +	kfree(tmpbuf);
> +	return err;
> +}
> +
> +int vfs_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap)
> +{
> +	struct super_block *sb = dentry->d_sb;
> +
> +	if (sb->s_op->cap_get)
> +		return sb->s_op->cap_get(dentry, nscap);
> +	else
> +		return default_cap_get(dentry, nscap);
> +}
> +EXPORT_SYMBOL(vfs_cap_get);
> +
>  /*
>   * getsecurity: We are called for security.* before any attempt to read the
>   * xattr from the inode itself.
> @@ -369,14 +442,15 @@ static bool is_v3header(size_t size, con
>  int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
>  			  bool alloc)
>  {
> -	int size, ret;
> +	int ret;
> +	ssize_t size;
>  	kuid_t kroot;
> +	__le32 nsmagic, magic;
>  	uid_t root, mappedroot;
> -	char *tmpbuf = NULL;
> +	void *tmpbuf = NULL;
>  	struct vfs_cap_data *cap;
> -	struct vfs_ns_cap_data *nscap;
> +	struct vfs_ns_cap_data nscap;
>  	struct dentry *dentry;
> -	struct user_namespace *fs_ns;
>  
>  	if (strcmp(name, "capability") != 0)
>  		return -EOPNOTSUPP;
> @@ -385,67 +459,50 @@ int cap_inode_getsecurity(struct inode *
>  	if (!dentry)
>  		return -EINVAL;
>  
> -	size = sizeof(struct vfs_ns_cap_data);
> -	ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
> -				 &tmpbuf, size, GFP_NOFS);
> +	ret = vfs_cap_get(dentry, &nscap);
>  	dput(dentry);
>  
>  	if (ret < 0)
>  		return ret;
>  
> -	fs_ns = inode->i_sb->s_user_ns;
> -	cap = (struct vfs_cap_data *) tmpbuf;
> -	if (is_v2header((size_t) ret, cap)) {
> -		/* If this is sizeof(vfs_cap_data) then we're ok with the
> -		 * on-disk value, so return that.  */
> -		if (alloc)
> -			*buffer = tmpbuf;
> -		else
> -			kfree(tmpbuf);
> -		return ret;
> -	} else if (!is_v3header((size_t) ret, cap)) {
> -		kfree(tmpbuf);
> -		return -EINVAL;
> -	}
> +	tmpbuf = kmalloc(sizeof(struct vfs_ns_cap_data), GFP_NOFS);
> +	if (!tmpbuf)
> +		return -ENOMEM;
>  
> -	nscap = (struct vfs_ns_cap_data *) tmpbuf;
> -	root = le32_to_cpu(nscap->rootid);
> -	kroot = make_kuid(fs_ns, root);
> +	root = le32_to_cpu(nscap.rootid);
> +	kroot = make_kuid(&init_user_ns, root);
>  
>  	/* If the root kuid maps to a valid uid in current ns, then return
>  	 * this as a nscap. */
>  	mappedroot = from_kuid(current_user_ns(), kroot);
>  	if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
> +		size = sizeof(struct vfs_cap_data);
>  		if (alloc) {
>  			*buffer = tmpbuf;
> -			nscap->rootid = cpu_to_le32(mappedroot);
> -		} else
> -			kfree(tmpbuf);
> -		return size;
> +			tmpbuf = NULL;
> +			nscap.rootid = cpu_to_le32(mappedroot);
> +			memcpy(*buffer, &nscap, size);
> +		}
> +		goto out;
>  	}
>  
> -	if (!rootid_owns_currentns(kroot)) {
> -		kfree(tmpbuf);
> -		return -EOPNOTSUPP;
> -	}
> +	size = -EOPNOTSUPP;
> +	if (!rootid_owns_currentns(kroot))
> +		goto out;
>  
>  	/* This comes from a parent namespace.  Return as a v2 capability */
>  	size = sizeof(struct vfs_cap_data);
>  	if (alloc) {
> -		*buffer = kmalloc(size, GFP_ATOMIC);
> -		if (*buffer) {
> -			struct vfs_cap_data *cap = *buffer;
> -			__le32 nsmagic, magic;
> -			magic = VFS_CAP_REVISION_2;
> -			nsmagic = le32_to_cpu(nscap->magic_etc);
> -			if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
> -				magic |= VFS_CAP_FLAGS_EFFECTIVE;
> -			memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
> -			cap->magic_etc = cpu_to_le32(magic);
> -		} else {
> -			size = -ENOMEM;
> -		}
> +		cap = *buffer = tmpbuf;
> +		tmpbuf = NULL;
> +		magic = VFS_CAP_REVISION_2;
> +		nsmagic = le32_to_cpu(nscap.magic_etc);
> +		if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
> +			magic |= VFS_CAP_FLAGS_EFFECTIVE;
> +		memcpy(&cap->data, &nscap.data, sizeof(__le32) * 2 * VFS_CAP_U32);
> +		cap->magic_etc = cpu_to_le32(magic);
>  	}
> +out:
>  	kfree(tmpbuf);
>  	return size;
>  }
> @@ -462,11 +519,6 @@ static kuid_t rootid_from_xattr(const vo
>  	return make_kuid(task_ns, rootid);
>  }
>  
> -static bool validheader(size_t size, const struct vfs_cap_data *cap)
> -{
> -	return is_v2header(size, cap) || is_v3header(size, cap);
> -}
> -
>  /*
>   * User requested a write of security.capability.  If needed, update the
>   * xattr to change from v2 to v3, or to fixup the v3 rootid.
> @@ -570,74 +622,40 @@ static inline int bprm_caps_from_vfs_cap
>  int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
>  {
>  	struct inode *inode = d_backing_inode(dentry);
> -	__u32 magic_etc;
> -	unsigned tocopy, i;
> -	int size;
> -	struct vfs_ns_cap_data data, *nscaps = &data;
> -	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
> -	kuid_t rootkuid;
> -	struct user_namespace *fs_ns;
> +	unsigned int i;
> +	int ret;
> +	struct vfs_ns_cap_data nscaps;
>  
>  	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
>  
>  	if (!inode)
>  		return -ENODATA;
>  
> -	fs_ns = inode->i_sb->s_user_ns;
> -	size = __vfs_getxattr((struct dentry *)dentry, inode,
> -			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
> -	if (size == -ENODATA || size == -EOPNOTSUPP)
> +	ret = vfs_cap_get((struct dentry *) dentry, &nscaps);
> +	if (ret == -ENODATA || ret == -EOPNOTSUPP)
>  		/* no data, that's ok */
>  		return -ENODATA;
>  
> -	if (size < 0)
> -		return size;
> -
> -	if (size < sizeof(magic_etc))
> -		return -EINVAL;
> -
> -	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
> +	if (ret < 0)
> +		return ret;
>  
> -	rootkuid = make_kuid(fs_ns, 0);
> -	switch (magic_etc & VFS_CAP_REVISION_MASK) {
> -	case VFS_CAP_REVISION_1:
> -		if (size != XATTR_CAPS_SZ_1)
> -			return -EINVAL;
> -		tocopy = VFS_CAP_U32_1;
> -		break;
> -	case VFS_CAP_REVISION_2:
> -		if (size != XATTR_CAPS_SZ_2)
> -			return -EINVAL;
> -		tocopy = VFS_CAP_U32_2;
> -		break;
> -	case VFS_CAP_REVISION_3:
> -		if (size != XATTR_CAPS_SZ_3)
> -			return -EINVAL;
> -		tocopy = VFS_CAP_U32_3;
> -		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
> -		break;
> +	cpu_caps->magic_etc = le32_to_cpu(nscaps.magic_etc);
> +	cpu_caps->rootid = make_kuid(&init_user_ns, le32_to_cpu(nscaps.rootid));
>  
> -	default:
> -		return -EINVAL;
> -	}
>  	/* Limit the caps to the mounter of the filesystem
>  	 * or the more limited uid specified in the xattr.
>  	 */
> -	if (!rootid_owns_currentns(rootkuid))
> +	if (!rootid_owns_currentns(cpu_caps->rootid))
>  		return -ENODATA;
>  
>  	CAP_FOR_EACH_U32(i) {
> -		if (i >= tocopy)
> -			break;
> -		cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
> -		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
> +		cpu_caps->permitted.cap[i] = le32_to_cpu(nscaps.data[i].permitted);
> +		cpu_caps->inheritable.cap[i] = le32_to_cpu(nscaps.data[i].inheritable);
>  	}
>  
>  	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
>  	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
>  
> -	cpu_caps->rootid = rootkuid;
> -
>  	return 0;
>  }
>
Miklos Szeredi Jan. 12, 2021, 9:43 a.m. UTC | #5
On Tue, Jan 12, 2021 at 1:15 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Miklos Szeredi <miklos@szeredi.hu> writes:
>
> > On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:

> > For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a rootid of
> > zero, right?
>
> Yes.  This assumes that everything is translated into the uids of the
> target filesystem.
>
> > If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
> > succeeding unconditionally while v3 one being either converted to v2, rejected
> > or left as v3 depending on current_user_ns())?
>
> As I understand it v2 fscaps have always succeeded unconditionally.  The
> only case I can see for a v2 fscap might not succeed when read is if the
> filesystem is outside of the initial user namespace.

Looking again, it's rather confusing.  cap_inode_getsecurity()
currently handles the following cases:

v1: -> fails with -EINVAL

v2: -> returns unconverted xattr

v3:
 a) rootid is mapped in the current namespace to non-zero:
     -> convert rootid

 b) rootid owns the current or ancerstor namespace:
     -> convert to v2

 c) rootid is not mapped and is not owner:
     -> return -EOPNOTSUPP -> falls back to unconverted v3

So lets take the example, where a tmpfs is created in a private user
namespace and one file has a v2 cap and the other an equivalent v3 cap
with a zero rootid.  This is the result when looking at it from

1) the namespace of the fs:
---------------------------------------
t = cap_dac_override+eip
tt = cap_dac_override+eip

2) the initial namespace:
---------------------------------------
t = cap_dac_override+eip
tt = cap_dac_override+eip [rootid=1000]

3) an unrelated namespace:
---------------------------------------
t = cap_dac_override+eip
tt = cap_dac_override+eip

Note: in this last case getxattr will actually return a v3 cap with
zero rootid for "tt" which getcap does not display due to being zero.
I could do a setup with a nested namespaces that better demonstrate
the confusing nature of this, but I think this also proves the point.

At this point userspace simply cannot determine whether the returned
cap is in any way valid or not.

The following semantics would make a ton more sense, since getting a
v2 would indicate that rootid is unknown:

- if cap is v2 convert to v3 with zero rootid
- after this, check if rootid needs to be translated, if not return v3
- if yes, try to translate to current ns, if succeeds return translated v3
- if not mappable, return v2

Hmm?

> > Anyway, here's a patch that I think fixes getxattr() layering for
> > security.capability.  Does basically what you suggested.  Slight change of
> > semantics vs. v1 caps, not sure if that is still needed, getxattr()/setxattr()
> > hasn't worked for these since the introduction of v3 in 4.14.
> > Untested.
>
> Taking a look.  The goal of change how these operate is to make it so
> that layered filesystems can just pass through the data if they don't
> want to change anything (even with the user namespaces of the
> filesystems in question are different).
>
> Feedback on the code below:
> - cap_get should be in inode_operations like get_acl and set_acl.

So it's not clear to me why xattr ops are per-sb and acl ops are per-inode.

> - cap_get should return a cpu_vfs_cap_data.
>
>   Which means that only make_kuid is needed when reading the cap from
>   disk.

It also means translating the cap bits back and forth between disk and
cpu endian.  Not a big deal, but...

>   Which means that except for the rootid_owns_currentns check (which
>   needs to happen elsewhere) default_cap_get should be today's
>   get_vfs_cap_from_disk.

That's true.   So what's the deal with v1 caps?  Support was silently
dropped for getxattr/setxattr but remained in get_vfs_caps_from_disk()
(I guess to not break legacy disk images), but maybe it's time to
deprecate v1 caps completely?

> - With the introduction of cap_get I believe commoncap should stop
>   implementing the security_inode_getsecurity hook, and rather have
>   getxattr observe is the file capability xatter and call the new
>   vfs_cap_get then translate to a v2 or v3 cap as appropriate when
>   returning the cap to userspace.

Confused.  vfs_cap_get() is the one the layered filesystem will
recurse with, so it must not translate the cap.   The one to do that
would be __vfs_getxattr(), right?

Thanks,
Miklos
Miklos Szeredi Jan. 12, 2021, 10:04 a.m. UTC | #6
On Tue, Jan 12, 2021 at 10:43 AM Miklos Szeredi <miklos@szeredi.hu> wrote:

> The following semantics would make a ton more sense, since getting a
> v2 would indicate that rootid is unknown:
>
> - if cap is v2 convert to v3 with zero rootid
> - after this, check if rootid needs to be translated, if not return v3
> - if yes, try to translate to current ns, if succeeds return translated v3
> - if not mappable, return v2
>
> Hmm?

Actually, it would make even more sense to simply skip unconvertible
caps and return -ENODATA.  In fact that's what I thought would happen
until I looked at the -EOPNOTSUPP fallback code in vfs_getxattr().

Serge, do you remember what was the reason for the unconverted fallback?

Thanks,
Miklos
Eric W. Biederman Jan. 12, 2021, 6:36 p.m. UTC | #7
Miklos Szeredi <miklos@szeredi.hu> writes:

> On Tue, Jan 12, 2021 at 1:15 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Miklos Szeredi <miklos@szeredi.hu> writes:
>>
>> > On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:
>
>> > For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a rootid of
>> > zero, right?
>>
>> Yes.  This assumes that everything is translated into the uids of the
>> target filesystem.
>>
>> > If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
>> > succeeding unconditionally while v3 one being either converted to v2, rejected
>> > or left as v3 depending on current_user_ns())?
>>
>> As I understand it v2 fscaps have always succeeded unconditionally.  The
>> only case I can see for a v2 fscap might not succeed when read is if the
>> filesystem is outside of the initial user namespace.
>
> Looking again, it's rather confusing.  cap_inode_getsecurity()
> currently handles the following cases:
>
> v1: -> fails with -EINVAL
>
> v2: -> returns unconverted xattr
>
> v3:
>  a) rootid is mapped in the current namespace to non-zero:
>      -> convert rootid
>
>  b) rootid owns the current or ancerstor namespace:
>      -> convert to v2
>
>  c) rootid is not mapped and is not owner:
>      -> return -EOPNOTSUPP -> falls back to unconverted v3
>
> So lets take the example, where a tmpfs is created in a private user
> namespace and one file has a v2 cap and the other an equivalent v3 cap
> with a zero rootid.  This is the result when looking at it from
>
> 1) the namespace of the fs:
> ---------------------------------------
> t = cap_dac_override+eip
> tt = cap_dac_override+eip
>
> 2) the initial namespace:
> ---------------------------------------
> t = cap_dac_override+eip
> tt = cap_dac_override+eip [rootid=1000]
>
> 3) an unrelated namespace:
> ---------------------------------------
> t = cap_dac_override+eip
> tt = cap_dac_override+eip
>
> Note: in this last case getxattr will actually return a v3 cap with
> zero rootid for "tt" which getcap does not display due to being zero.
> I could do a setup with a nested namespaces that better demonstrate
> the confusing nature of this, but I think this also proves the point.

Yes.  There is real confusion on the reading case when the namespaces
do not match.

> At this point userspace simply cannot determine whether the returned
> cap is in any way valid or not.
>
> The following semantics would make a ton more sense, since getting a
> v2 would indicate that rootid is unknown:

> - if cap is v2 convert to v3 with zero rootid
> - after this, check if rootid needs to be translated, if not return v3
> - if yes, try to translate to current ns, if succeeds return translated v3
> - if not mappable, return v2
>
> Hmm?

So there is the basic question do we want to read the raw bytes on disk
or do we want to return something meaningful to the reader.  As the
existing tools use the xattr interface to set/clear fscaps returning
data to user space rather than raw bytes seems the perfered interface.

My ideal semantics would be:

- If current_user_ns() == sb->s_user_ns return the raw data.

  I don't know how to implement this first scenario while permitting
  stacked filesystems.
  
- Calculate the cpu_vfs_cap_data as get_vfs_caps_from_disk does.
  That gives the meaning of the xattr.

- If "from_kuid(current_userns(), krootid) == 0" return a v2 cap.

- If "rootid_owns_currentns()" return a v2 cap.

- Else return an error.  Probably a permission error.

  The fscap simply can not make sense to the user if the rootid does not
  map.  Return a v2 cap would imply that the caps are present on the
  executable (in the current context) which they are not.


>> > Anyway, here's a patch that I think fixes getxattr() layering for
>> > security.capability.  Does basically what you suggested.  Slight change of
>> > semantics vs. v1 caps, not sure if that is still needed, getxattr()/setxattr()
>> > hasn't worked for these since the introduction of v3 in 4.14.
>> > Untested.
>>
>> Taking a look.  The goal of change how these operate is to make it so
>> that layered filesystems can just pass through the data if they don't
>> want to change anything (even with the user namespaces of the
>> filesystems in question are different).
>>
>> Feedback on the code below:
>> - cap_get should be in inode_operations like get_acl and set_acl.
>
> So it's not clear to me why xattr ops are per-sb and acl ops are per-inode.

I don't know why either.  What I do see is everything except
inode->i_sb->s_xattr (the list of xattr handlers) is in
inode_operations.

Especially permission.  So just for consistency I would keep everything
in the inode_operations.

>> - cap_get should return a cpu_vfs_cap_data.
>>
>>   Which means that only make_kuid is needed when reading the cap from
>>   disk.
>
> It also means translating the cap bits back and forth between disk and
> cpu endian.  Not a big deal, but...

For the very rare case of userspace reading and writing them.

For the common case of actually using them it should be optimal, and
it allows for non-standard implementations.  Anything where someone gets
clever we want the bits to be in cpu format to make mistakes harder.

>>   Which means that except for the rootid_owns_currentns check (which
>>   needs to happen elsewhere) default_cap_get should be today's
>>   get_vfs_cap_from_disk.
>
> That's true.   So what's the deal with v1 caps?  Support was silently
> dropped for getxattr/setxattr but remained in get_vfs_caps_from_disk()
> (I guess to not break legacy disk images), but maybe it's time to
> deprecate v1 caps completely?

I really don't remember.  When I look I see they appear to be a subset
of v2 caps.  I would have to look to see if they bits might have a
different meaning.

>> - With the introduction of cap_get I believe commoncap should stop
>>   implementing the security_inode_getsecurity hook, and rather have
>>   getxattr observe is the file capability xatter and call the new
>>   vfs_cap_get then translate to a v2 or v3 cap as appropriate when
>>   returning the cap to userspace.
>
> Confused.  vfs_cap_get() is the one the layered filesystem will
> recurse with, so it must not translate the cap.   The one to do that
> would be __vfs_getxattr(), right?

So there are two layers that I am worrying about.

The layer dealing with fscaps knowing they are fscaps.  That is
vfs_cap_get and friends.

Then there is the layer dealing with xattrs which should also handle
fscaps.  So that the xattr layer stacks properly I believe we need to
completely bypass vfs_getxattr and friends and at the edge of userspace
before there is any possibility of interception call vfs_cap_get and
translate the result to a form userspace can consume.

With xattrs and caps we are in part of the kernel that hasn't seen a lot
of attention and so is not the best factored.  So I am proposing we fix
up the abstractions to make it easier to implement stacked fscap support.

Eric
Eric W. Biederman Jan. 12, 2021, 6:49 p.m. UTC | #8
ebiederm@xmission.com (Eric W. Biederman) writes:

> So there is the basic question do we want to read the raw bytes on disk
> or do we want to return something meaningful to the reader.  As the
> existing tools use the xattr interface to set/clear fscaps returning
> data to user space rather than raw bytes seems the perfered interface.
>
> My ideal semantics would be:
>
> - If current_user_ns() == sb->s_user_ns return the raw data.
>
>   I don't know how to implement this first scenario while permitting
>   stacked filesystems.

    After a little more thought I do.

    In getxattr if the get_cap method is not implemented by the
    filesystem if current_user_ns() == sb->s_user_ns simply treat it as
    an ordinary xattr read/write.

    Otherwise call vfs_get_cap and translate the result as described
    below.
    
    The key point of this is it allows for seeing what is actually on
    disk (when it is not confusing).

> - Calculate the cpu_vfs_cap_data as get_vfs_caps_from_disk does.
>   That gives the meaning of the xattr.
>
> - If "from_kuid(current_userns(), krootid) == 0" return a v2 cap.
>
> - If "rootid_owns_currentns()" return a v2 cap.
>
> - Else return an error.  Probably a permission error.
>
>   The fscap simply can not make sense to the user if the rootid does not
>   map.  Return a v2 cap would imply that the caps are present on the
>   executable (in the current context) which they are not.


Eric
diff mbox series

Patch

diff --git a/fs/xattr.c b/fs/xattr.c
index cd7a563e8bcd..fd57153b1f61 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -276,8 +276,16 @@  vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 {
 	struct inode *inode = dentry->d_inode;
 	struct inode *delegated_inode = NULL;
+	const void  *orig_value = value;
 	int error;
 
+	if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
+		error = cap_convert_nscap(dentry, &value, size);
+		if (error < 0)
+			return error;
+		size = error;
+	}
+
 retry_deleg:
 	inode_lock(inode);
 	error = __vfs_setxattr_locked(dentry, name, value, size, flags,
@@ -289,6 +297,9 @@  vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		if (!error)
 			goto retry_deleg;
 	}
+	if (value != orig_value)
+		kfree(value);
+
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_setxattr);
@@ -537,12 +548,6 @@  setxattr(struct dentry *d, const char __user *name, const void __user *value,
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
 			posix_acl_fix_xattr_from_user(kvalue, size);
-		else if (strcmp(kname, XATTR_NAME_CAPS) == 0) {
-			error = cap_convert_nscap(d, &kvalue, size);
-			if (error < 0)
-				goto out;
-			size = error;
-		}
 	}
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 1e7fe311cabe..b2f698915c0f 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -270,6 +270,6 @@  static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
 
-extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size);
+extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size);
 
 #endif /* !_LINUX_CAPABILITY_H */
diff --git a/security/commoncap.c b/security/commoncap.c
index 59bf3c1674c8..bacc1111d871 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -473,7 +473,7 @@  static bool validheader(size_t size, const struct vfs_cap_data *cap)
  *
  * If all is ok, we return the new size, on error return < 0.
  */
-int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
+int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
 {
 	struct vfs_ns_cap_data *nscap;
 	uid_t nsrootid;
@@ -516,7 +516,6 @@  int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
 	nscap->magic_etc = cpu_to_le32(nsmagic);
 	memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
 
-	kvfree(*ivalue);
 	*ivalue = nscap;
 	return newsize;
 }