diff mbox series

[v2,3/3] pidfs: implement file handle support

Message ID 20241113-pidfs_fh-v2-3-9a4d28155a37@e43.eu (mailing list archive)
State New
Headers show
Series [v2,1/3] pseudofs: add support for export_ops | expand

Commit Message

Erin Shepherd Nov. 13, 2024, 5:55 p.m. UTC
On 64-bit platforms, userspace can read the pidfd's inode in order to
get a never-repeated PID identifier. On 32-bit platforms this identifier
is not exposed, as inodes are limited to 32 bits. Instead expose the
identifier via export_fh, which makes it available to userspace via
name_to_handle_at

In addition we implement fh_to_dentry, which allows userspace to
recover a pidfd from a PID file handle.

We stash the process' PID in the root pid namespace inside the handle,
and use that to recover the pid (validating that pid->ino matches the
value in the handle, i.e. that the pid has not been reused).

We use the root namespace in order to ensure that file handles can be
moved across namespaces; however, we validate that the PID exists in
the current namespace before returning the inode.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
---
 fs/pidfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

Comments

Amir Goldstein Nov. 14, 2024, 7:07 a.m. UTC | #1
On Wed, Nov 13, 2024 at 7:11 PM Erin Shepherd <erin.shepherd@e43.eu> wrote:
>
> On 64-bit platforms, userspace can read the pidfd's inode in order to
> get a never-repeated PID identifier. On 32-bit platforms this identifier
> is not exposed, as inodes are limited to 32 bits. Instead expose the
> identifier via export_fh, which makes it available to userspace via
> name_to_handle_at
>
> In addition we implement fh_to_dentry, which allows userspace to
> recover a pidfd from a PID file handle.

"In addition" is a good indication that a separate patch was a good idea..

>
> We stash the process' PID in the root pid namespace inside the handle,
> and use that to recover the pid (validating that pid->ino matches the
> value in the handle, i.e. that the pid has not been reused).
>
> We use the root namespace in order to ensure that file handles can be
> moved across namespaces; however, we validate that the PID exists in
> the current namespace before returning the inode.
>
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>

This patch has changed enough that you should not have kept my RVB.

BTW, please refrain from using the same subject for the cover letter and
a single patch. They are not the same thing, so if they get the same
name, one of them has an inaccurate description.

> Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
> ---
>  fs/pidfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 61 insertions(+), 1 deletion(-)
>
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 80675b6bf88459c22787edaa68db360bdc0d0782..0684a9b8fe71c5205fb153b2714bc9c672045fd5 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -1,5 +1,6 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include <linux/anon_inodes.h>
> +#include <linux/exportfs.h>
>  #include <linux/file.h>
>  #include <linux/fs.h>
>  #include <linux/magic.h>
> @@ -347,11 +348,69 @@ static const struct dentry_operations pidfs_dentry_operations = {
>         .d_prune        = stashed_dentry_prune,
>  };
>
> +#define PIDFD_FID_LEN 3
> +
> +struct pidfd_fid {
> +       u64 ino;
> +       s32 pid;
> +} __packed;
> +
> +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
> +                          struct inode *parent)
> +{
> +       struct pid *pid = inode->i_private;
> +       struct pidfd_fid *fid = (struct pidfd_fid *)fh;
> +
> +       if (*max_len < PIDFD_FID_LEN) {
> +               *max_len = PIDFD_FID_LEN;
> +               return FILEID_INVALID;
> +       }
> +
> +       fid->ino = pid->ino;
> +       fid->pid = pid_nr(pid);
> +       *max_len = PIDFD_FID_LEN;
> +       return FILEID_INO64_GEN;
> +}
> +
> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
> +                                        struct fid *gen_fid,
> +                                        int fh_len, int fh_type)
> +{
> +       int ret;
> +       struct path path;
> +       struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
> +       struct pid *pid;
> +
> +       if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
> +               return NULL;
> +
> +       scoped_guard(rcu) {
> +               pid = find_pid_ns(fid->pid, &init_pid_ns);
> +               if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0)
> +                       return NULL;
> +
> +               pid = get_pid(pid);
> +       }
> +
> +       ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
> +       if (ret < 0)
> +               return ERR_PTR(ret);

How come no need to put_pid() in case of error?

> +
> +       mntput(path.mnt);
> +       return path.dentry;
> +}
> +
> +static const struct export_operations pidfs_export_operations = {
> +       .encode_fh = pidfs_encode_fh,
> +       .fh_to_dentry = pidfs_fh_to_dentry,
> +       .flags = EXPORT_OP_UNRESTRICTED_OPEN,
> +};
> +
>  static int pidfs_init_inode(struct inode *inode, void *data)
>  {
>         inode->i_private = data;
>         inode->i_flags |= S_PRIVATE;
> -       inode->i_mode |= S_IRWXU;
> +       inode->i_mode |= S_IRWXU | S_IRWXG | S_IRWXO;

This change is not explained.
Why is it here?

Thanks,
Amir.
Erin Shepherd Nov. 14, 2024, 12:42 p.m. UTC | #2
On 14/11/2024 08:07, Amir Goldstein wrote:
> On Wed, Nov 13, 2024 at 7:11 PM Erin Shepherd <erin.shepherd@e43.eu> wrote:
>> On 64-bit platforms, userspace can read the pidfd's inode in order to
>> get a never-repeated PID identifier. On 32-bit platforms this identifier
>> is not exposed, as inodes are limited to 32 bits. Instead expose the
>> identifier via export_fh, which makes it available to userspace via
>> name_to_handle_at
>>
>> In addition we implement fh_to_dentry, which allows userspace to
>> recover a pidfd from a PID file handle.
> "In addition" is a good indication that a separate patch was a good idea..
>
>> We stash the process' PID in the root pid namespace inside the handle,
>> and use that to recover the pid (validating that pid->ino matches the
>> value in the handle, i.e. that the pid has not been reused).
>>
>> We use the root namespace in order to ensure that file handles can be
>> moved across namespaces; however, we validate that the PID exists in
>> the current namespace before returning the inode.
>>
>> Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> This patch has changed enough that you should not have kept my RVB.
>
> BTW, please refrain from using the same subject for the cover letter and
> a single patch. They are not the same thing, so if they get the same
> name, one of them has an inaccurate description.
>
ACK to all three.


>> Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
>> ---
>>  fs/pidfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 61 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/pidfs.c b/fs/pidfs.c
>> index 80675b6bf88459c22787edaa68db360bdc0d0782..0684a9b8fe71c5205fb153b2714bc9c672045fd5 100644
>> --- a/fs/pidfs.c
>> +++ b/fs/pidfs.c
>> @@ -1,5 +1,6 @@
>>  // SPDX-License-Identifier: GPL-2.0
>>  #include <linux/anon_inodes.h>
>> +#include <linux/exportfs.h>
>>  #include <linux/file.h>
>>  #include <linux/fs.h>
>>  #include <linux/magic.h>
>> @@ -347,11 +348,69 @@ static const struct dentry_operations pidfs_dentry_operations = {
>>         .d_prune        = stashed_dentry_prune,
>>  };
>>
>> +#define PIDFD_FID_LEN 3
>> +
>> +struct pidfd_fid {
>> +       u64 ino;
>> +       s32 pid;
>> +} __packed;
>> +
>> +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
>> +                          struct inode *parent)
>> +{
>> +       struct pid *pid = inode->i_private;
>> +       struct pidfd_fid *fid = (struct pidfd_fid *)fh;
>> +
>> +       if (*max_len < PIDFD_FID_LEN) {
>> +               *max_len = PIDFD_FID_LEN;
>> +               return FILEID_INVALID;
>> +       }
>> +
>> +       fid->ino = pid->ino;
>> +       fid->pid = pid_nr(pid);
>> +       *max_len = PIDFD_FID_LEN;
>> +       return FILEID_INO64_GEN;
>> +}
>> +
>> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
>> +                                        struct fid *gen_fid,
>> +                                        int fh_len, int fh_type)
>> +{
>> +       int ret;
>> +       struct path path;
>> +       struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
>> +       struct pid *pid;
>> +
>> +       if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
>> +               return NULL;
>> +
>> +       scoped_guard(rcu) {
>> +               pid = find_pid_ns(fid->pid, &init_pid_ns);
>> +               if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0)
>> +                       return NULL;
>> +
>> +               pid = get_pid(pid);
>> +       }
>> +
>> +       ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
>> +       if (ret < 0)
>> +               return ERR_PTR(ret);
> How come no need to put_pid() in case of error?

This one confused me at first too, but path_from_stashed frees it (via
stashed_ops.put_data) on error. You can see the same pattern in
pidfs_alloc_file.

(It already needs to know how to free it for the case where a stashed
dentry already exists)

>> +
>> +       mntput(path.mnt);
>> +       return path.dentry;
>> +}
>> +
>> +static const struct export_operations pidfs_export_operations = {
>> +       .encode_fh = pidfs_encode_fh,
>> +       .fh_to_dentry = pidfs_fh_to_dentry,
>> +       .flags = EXPORT_OP_UNRESTRICTED_OPEN,
>> +};
>> +
>>  static int pidfs_init_inode(struct inode *inode, void *data)
>>  {
>>         inode->i_private = data;
>>         inode->i_flags |= S_PRIVATE;
>> -       inode->i_mode |= S_IRWXU;
>> +       inode->i_mode |= S_IRWXU | S_IRWXG | S_IRWXO;
> This change is not explained.
> Why is it here?

open_by_handle_at eventually passes through the may_open permission check.
The existing permissions only permits root to open them (since the owning
uid & gid is 0). So, it was necessary to widen them to align with how
pidfd_open works.

If I stick with this approach (see [1]) I'll ensure to document this change
better in the commit message.

[1] https://lore.kernel.org/all/6a3ed633-311d-47ff-8a7e-5121d6186139@e43.eu/
Christian Brauner Nov. 14, 2024, 12:52 p.m. UTC | #3
On Wed, Nov 13, 2024 at 05:55:25PM +0000, Erin Shepherd wrote:
> On 64-bit platforms, userspace can read the pidfd's inode in order to
> get a never-repeated PID identifier. On 32-bit platforms this identifier
> is not exposed, as inodes are limited to 32 bits. Instead expose the
> identifier via export_fh, which makes it available to userspace via
> name_to_handle_at
> 
> In addition we implement fh_to_dentry, which allows userspace to
> recover a pidfd from a PID file handle.
> 
> We stash the process' PID in the root pid namespace inside the handle,
> and use that to recover the pid (validating that pid->ino matches the
> value in the handle, i.e. that the pid has not been reused).
> 
> We use the root namespace in order to ensure that file handles can be
> moved across namespaces; however, we validate that the PID exists in
> the current namespace before returning the inode.
> 
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
> ---

I think you need at least something like the following completely
untested draft on top:

- the pidfs_finish_open_by_handle_at() is somewhat of a clutch to handle
  thread vs thread-group pidfds but it works.

- In contrast to pidfd_open() that uses dentry_open() to create a pidfd
  open_by_handle_at() uses file_open_root(). That's overall fine but
  makes pidfds subject to security hooks which they aren't via
  pidfd_open(). It also necessitats pidfs_finish_open_by_handle_at().
  There's probably other solutions I'm not currently seeing.

- The exportfs_decode_fh_raw() call that's used to decode the pidfd is
  passed vfs_dentry_acceptable() as acceptability callback. For pidfds
  we don't need any of that functionality and we don't need any of the
  disconnected dentry handling logic. So the easiest way to fix that is
  to rely on EXPORT_OP_UNRESTRICTED_OPEN to skip everything. That in
  turns means the only acceptability we have is the nop->fh_to_dentry()
  callback for pidfs.

- This all really needs rigorous selftests before we can even think of
  merging any of this.

Anway, here's the _completely untested, quickly drafted_ diff on top:

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4f2dd4ab4486..65c93f7132d4 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -450,6 +450,13 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
 		goto err_result;
 	}
 
+	/*
+	 * The filesystem has no acceptance criteria other than those in
+	 * nop->fh_to_dentry().
+	 */
+	if (nop->flags & EXPORT_OP_UNRESTRICTED_OPEN)
+		return result;
+
 	/*
 	 * If no acceptance criteria was specified by caller, a disconnected
 	 * dentry is also accepatable. Callers may use this mode to query if
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 056116e58f43..89c2efacc0c3 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -11,6 +11,7 @@
 #include <linux/personality.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
+#include <linux/pidfs.h>
 #include "internal.h"
 #include "mount.h"
 
@@ -218,20 +219,21 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
 {
 	int handle_dwords;
 	struct vfsmount *mnt = ctx->root.mnt;
+	struct dentry *dentry;
 
 	/* change the handle size to multiple of sizeof(u32) */
 	handle_dwords = handle->handle_bytes >> 2;
-	path->dentry = exportfs_decode_fh_raw(mnt,
-					  (struct fid *)handle->f_handle,
-					  handle_dwords, handle->handle_type,
-					  ctx->fh_flags,
-					  vfs_dentry_acceptable, ctx);
-	if (IS_ERR_OR_NULL(path->dentry)) {
-		if (path->dentry == ERR_PTR(-ENOMEM))
+	dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle,
+					handle_dwords, handle->handle_type,
+					ctx->fh_flags, vfs_dentry_acceptable,
+					ctx);
+	if (IS_ERR_OR_NULL(dentry)) {
+		if (dentry == ERR_PTR(-ENOMEM))
 			return -ENOMEM;
 		return -ESTALE;
 	}
 	path->mnt = mntget(mnt);
+	path->dentry = dentry;
 	return 0;
 }
 
@@ -239,7 +241,7 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
 				 unsigned int o_flags)
 {
 	struct path *root = &ctx->root;
-	struct export_operations *nop = root->mnt->mnt_sb->s_export_op;
+	const struct export_operations *nop = root->mnt->mnt_sb->s_export_op;
 
 	if (nop && nop->flags & EXPORT_OP_UNRESTRICTED_OPEN)
 		return true;
@@ -342,7 +344,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
 			   int open_flag)
 {
 	long retval = 0;
-	struct path path;
+	struct path path __free(path_put) = {};
 	struct file *file;
 	int fd;
 
@@ -351,19 +353,24 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
 		return retval;
 
 	fd = get_unused_fd_flags(open_flag);
-	if (fd < 0) {
-		path_put(&path);
+	if (fd < 0)
 		return fd;
-	}
+
 	file = file_open_root(&path, "", open_flag, 0);
 	if (IS_ERR(file)) {
 		put_unused_fd(fd);
-		retval =  PTR_ERR(file);
-	} else {
-		retval = fd;
-		fd_install(fd, file);
+		return PTR_ERR(file);
 	}
-	path_put(&path);
+
+	retval = pidfs_finish_open_by_handle_at(file, open_flag);
+	if (retval) {
+		put_unused_fd(fd);
+		fput(file);
+		return retval;
+	}
+
+	retval = fd;
+	fd_install(fd, file);
 	return retval;
 }
 
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 0684a9b8fe71..19948002f395 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -237,6 +237,24 @@ struct pid *pidfd_pid(const struct file *file)
 	return file_inode(file)->i_private;
 }
 
+int pidfs_finish_open_by_handle_at(struct file *file, unsigned int oflags)
+{
+	struct pid *pid;
+	bool thread = oflags & PIDFD_THREAD;
+
+	pid = pidfd_pid(file);
+	if (IS_ERR(pid))
+		return 0;
+
+	if (!pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
+		return -EINVAL;
+
+	if (thread)
+		file->f_flags |= PIDFD_THREAD;
+
+	return 0;
+}
+
 static struct vfsmount *pidfs_mnt __ro_after_init;
 
 #if BITS_PER_LONG == 32
@@ -377,7 +395,7 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
 					 int fh_len, int fh_type)
 {
 	int ret;
-	struct path path;
+	struct path path __free(path_put) = {};
 	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
 	struct pid *pid;
 
@@ -393,11 +411,12 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
 	}
 
 	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
-	if (ret < 0)
+	if (ret < 0) {
+		put_pid(pid);
 		return ERR_PTR(ret);
+	}
 
-	mntput(path.mnt);
-	return path.dentry;
+	return dget(path.dentry);
 }
 
 static const struct export_operations pidfs_export_operations = {
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 75bdf9807802..9a4130056e7d 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -3,6 +3,7 @@
 #define _LINUX_PID_FS_H
 
 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
+int pidfs_finish_open_by_handle_at(struct file *file, unsigned int oflags);
 void __init pidfs_init(void);
 
 #endif /* _LINUX_PID_FS_H */


>  fs/pidfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 61 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 80675b6bf88459c22787edaa68db360bdc0d0782..0684a9b8fe71c5205fb153b2714bc9c672045fd5 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -1,5 +1,6 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include <linux/anon_inodes.h>
> +#include <linux/exportfs.h>
>  #include <linux/file.h>
>  #include <linux/fs.h>
>  #include <linux/magic.h>
> @@ -347,11 +348,69 @@ static const struct dentry_operations pidfs_dentry_operations = {
>  	.d_prune	= stashed_dentry_prune,
>  };
>  
> +#define PIDFD_FID_LEN 3
> +
> +struct pidfd_fid {
> +	u64 ino;
> +	s32 pid;
> +} __packed;
> +
> +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
> +			   struct inode *parent)
> +{
> +	struct pid *pid = inode->i_private;
> +	struct pidfd_fid *fid = (struct pidfd_fid *)fh;
> +
> +	if (*max_len < PIDFD_FID_LEN) {
> +		*max_len = PIDFD_FID_LEN;
> +		return FILEID_INVALID;
> +	}
> +
> +	fid->ino = pid->ino;
> +	fid->pid = pid_nr(pid);
> +	*max_len = PIDFD_FID_LEN;
> +	return FILEID_INO64_GEN;
> +}
> +
> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
> +					 struct fid *gen_fid,
> +					 int fh_len, int fh_type)
> +{
> +	int ret;
> +	struct path path;
> +	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
> +	struct pid *pid;
> +
> +	if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
> +		return NULL;
> +
> +	scoped_guard(rcu) {
> +		pid = find_pid_ns(fid->pid, &init_pid_ns);
> +		if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0)
> +			return NULL;
> +
> +		pid = get_pid(pid);
> +	}
> +
> +	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +
> +	mntput(path.mnt);
> +	return path.dentry;
> +}
> +
> +static const struct export_operations pidfs_export_operations = {
> +	.encode_fh = pidfs_encode_fh,
> +	.fh_to_dentry = pidfs_fh_to_dentry,
> +	.flags = EXPORT_OP_UNRESTRICTED_OPEN,
> +};
> +
>  static int pidfs_init_inode(struct inode *inode, void *data)
>  {
>  	inode->i_private = data;
>  	inode->i_flags |= S_PRIVATE;
> -	inode->i_mode |= S_IRWXU;
> +	inode->i_mode |= S_IRWXU | S_IRWXG | S_IRWXO;
>  	inode->i_op = &pidfs_inode_operations;
>  	inode->i_fop = &pidfs_file_operations;
>  	/*
> @@ -382,6 +441,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
>  		return -ENOMEM;
>  
>  	ctx->ops = &pidfs_sops;
> +	ctx->eops = &pidfs_export_operations;
>  	ctx->dops = &pidfs_dentry_operations;
>  	fc->s_fs_info = (void *)&pidfs_stashed_ops;
>  	return 0;
> 
> -- 
> 2.46.1
>
Erin Shepherd Nov. 14, 2024, 1:13 p.m. UTC | #4
On 14/11/2024 13:52, Christian Brauner wrote:

> I think you need at least something like the following completely
> untested draft on top:
>
> - the pidfs_finish_open_by_handle_at() is somewhat of a clutch to handle
>   thread vs thread-group pidfds but it works.
>
> - In contrast to pidfd_open() that uses dentry_open() to create a pidfd
>   open_by_handle_at() uses file_open_root(). That's overall fine but
>   makes pidfds subject to security hooks which they aren't via
>   pidfd_open(). It also necessitats pidfs_finish_open_by_handle_at().
>   There's probably other solutions I'm not currently seeing.

These two concerns combined with the special flag make me wonder if pidfs
is so much of a special snowflake we should just special case it up front
and skip all of the shared handle decode logic?

> - The exportfs_decode_fh_raw() call that's used to decode the pidfd is
>   passed vfs_dentry_acceptable() as acceptability callback. For pidfds
>   we don't need any of that functionality and we don't need any of the
>   disconnected dentry handling logic. So the easiest way to fix that is
>   to rely on EXPORT_OP_UNRESTRICTED_OPEN to skip everything. That in
>   turns means the only acceptability we have is the nop->fh_to_dentry()
>   callback for pidfs.

With the current logic we go exportfs_decode_fh_raw(...) ->
find_acceptable_alias(result) -> vfs_dentry_acceptable(context, result).


vfs_dentry_acceptable immediately returns 1 if ctx->flags is 0, which will
always be the case if EXPORT_OP_UNRESTRICTED_OPEN was set, so we immediately
fall back out of the call tree and return result.

So I'm not 100% sure we actually need this special case but I'm not opposed.

> - This all really needs rigorous selftests before we can even think of
>   merging any of this.
Christian Brauner Nov. 14, 2024, 2:13 p.m. UTC | #5
On Thu, Nov 14, 2024 at 02:13:06PM +0100, Erin Shepherd wrote:
> On 14/11/2024 13:52, Christian Brauner wrote:
> 
> > I think you need at least something like the following completely
> > untested draft on top:
> >
> > - the pidfs_finish_open_by_handle_at() is somewhat of a clutch to handle
> >   thread vs thread-group pidfds but it works.
> >
> > - In contrast to pidfd_open() that uses dentry_open() to create a pidfd
> >   open_by_handle_at() uses file_open_root(). That's overall fine but
> >   makes pidfds subject to security hooks which they aren't via
> >   pidfd_open(). It also necessitats pidfs_finish_open_by_handle_at().
> >   There's probably other solutions I'm not currently seeing.
> 
> These two concerns combined with the special flag make me wonder if pidfs
> is so much of a special snowflake we should just special case it up front
> and skip all of the shared handle decode logic?

Care to try a patch and see what it looks like?

> 
> > - The exportfs_decode_fh_raw() call that's used to decode the pidfd is
> >   passed vfs_dentry_acceptable() as acceptability callback. For pidfds
> >   we don't need any of that functionality and we don't need any of the
> >   disconnected dentry handling logic. So the easiest way to fix that is
> >   to rely on EXPORT_OP_UNRESTRICTED_OPEN to skip everything. That in
> >   turns means the only acceptability we have is the nop->fh_to_dentry()
> >   callback for pidfs.
> 
> With the current logic we go exportfs_decode_fh_raw(...) ->
> find_acceptable_alias(result) -> vfs_dentry_acceptable(context, result).
> vfs_dentry_acceptable immediately returns 1 if ctx->flags is 0, which will
> always be the case if EXPORT_OP_UNRESTRICTED_OPEN was set, so we immediately
> fall back out of the call tree and return result.
> 
> So I'm not 100% sure we actually need this special case but I'm not opposed.

Oh right, I completely forgot that find_acceptable_alias() is a no-op
for pidfs.
Erin Shepherd Nov. 14, 2024, 9:52 p.m. UTC | #6
On 14/11/2024 15:13, Christian Brauner wrote:

> On Thu, Nov 14, 2024 at 02:13:06PM +0100, Erin Shepherd wrote:
>> These two concerns combined with the special flag make me wonder if pidfs
>> is so much of a special snowflake we should just special case it up front
>> and skip all of the shared handle decode logic?
> Care to try a patch and see what it looks like?

The following is a completely untested sketch on top of the existing patch series.
Some notes:

- I made heavy use of the cleanup macros. I'm happy to convert things back to
  goto out_xx style if preferred - writing things this way just made bashing out
  the code without dropping resources on the floor easier
- If you don't implement fh_to_dentry then name_to_handle_at will just return an error
  unless called with AT_HANDLE_FID. We need to decide what to do about that
- The GET_PATH_FD_IS_NORMAL/etc constants don't match (what I see as) usual kernel style
  but I'm not sure how to conventionally express something like that

Otherwise, I'm fairly happy with how it came out in the end. Maybe handle_to_path and
do_handle_to_path could be collapsed into each other at this point.

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 056116e58f43..697246085b69 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -11,6 +11,7 @@
 #include <linux/personality.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
+#include <linux/pidfs.h>
 #include "internal.h"
 #include "mount.h"

@@ -130,6 +131,11 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
     return err;
 }

+enum {
+    GET_PATH_FD_IS_NORMAL = 0,
+    GET_PATH_FD_IS_PIDFD  = 1,
+};
+
 static int get_path_from_fd(int fd, struct path *root)
 {
     if (fd == AT_FDCWD) {
@@ -139,15 +145,16 @@ static int get_path_from_fd(int fd, struct path *root)
         path_get(root);
         spin_unlock(&fs->lock);
     } else {
-        struct fd f = fdget(fd);
+        CLASS(fd, f)(fd);
         if (!fd_file(f))
             return -EBADF;
+        if (pidfd_pid(fd_file(f)))
+            return GET_PATH_FD_IS_PIDFD;
         *root = fd_file(f)->f_path;
         path_get(root);
-        fdput(f);
     }

-    return 0;
+    return GET_PATH_FD_IS_NORMAL;
 }

 enum handle_to_path_flags {
@@ -287,84 +294,94 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
     return true;
 }

-static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
-           struct path *path, unsigned int o_flags)
+static int copy_handle_from_user(struct file_handle __user *ufh, struct file_handle **ret)
 {
-    int retval = 0;
-    struct file_handle f_handle;
-    struct file_handle *handle = NULL;
-    struct handle_to_path_ctx ctx = {};
-
-    retval = get_path_from_fd(mountdirfd, &ctx.root);
-    if (retval)
-        goto out_err;
-
-    if (!may_decode_fh(&ctx, o_flags)) {
-        retval = -EPERM;
-        goto out_path;
-    }
-
+    struct file_handle f_handle, *handle;
     if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
-        retval = -EFAULT;
-        goto out_path;
+        return -EFAULT;
     }
     if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
         (f_handle.handle_bytes == 0)) {
-        retval = -EINVAL;
-        goto out_path;
+        return -EINVAL;
     }
     handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
              GFP_KERNEL);
     if (!handle) {
-        retval = -ENOMEM;
-        goto out_path;
+        return -ENOMEM;
     }
+
     /* copy the full handle */
     *handle = f_handle;
     if (copy_from_user(&handle->f_handle,
                &ufh->f_handle,
                f_handle.handle_bytes)) {
-        retval = -EFAULT;
-        goto out_handle;
+        kfree(handle);
+        return -EFAULT;
     }

-    retval = do_handle_to_path(handle, path, &ctx);
+    *ret = handle;
+    return 0;
+}

-out_handle:
-    kfree(handle);
-out_path:
-    path_put(&ctx.root);
-out_err:
-    return retval;
+static int handle_to_path(struct path root, struct file_handle __user *ufh,
+           struct path *path, unsigned int o_flags)
+{
+    struct file_handle *handle = NULL;
+    struct handle_to_path_ctx ctx = {
+        .root = root,
+    };
+
+    if (!may_decode_fh(&ctx, o_flags)) {
+        return -EPERM;
+    }
+
+    return do_handle_to_path(handle, path, &ctx);
 }

 static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
                int open_flag)
 {
     long retval = 0;
-    struct path path;
-    struct file *file;
-    int fd;
+    struct file_handle *handle __free(kfree) = NULL;
+    struct path __free(path_put) root = {};
+    struct path __free(path_put) path = {};
+    struct file *file = NULL;
+    int root_type;

-    retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
+    root_type = get_path_from_fd(mountdirfd, &root);
+    if (root_type < 0)
+        return root_type;
+
+    retval = copy_handle_from_user(ufh, &handle);
     if (retval)
         return retval;

-    fd = get_unused_fd_flags(open_flag);
-    if (fd < 0) {
-        path_put(&path);
+    CLASS(get_unused_fd, fd)(open_flag);
+    if (fd < 0)
         return fd;
+
+    switch (root_type) {
+    case GET_PATH_FD_IS_NORMAL:
+        retval = handle_to_path(root, handle, &path, open_flag);
+        if (retval)
+            return retval;
+        file = file_open_root(&path, "", open_flag, 0);
+        if (IS_ERR(file)) {
+            return PTR_ERR(file);
+        }
+        break;
+
+    case GET_PATH_FD_IS_PIDFD:
+        file = pidfd_open_by_handle(
+            handle->f_handle, handle->handle_bytes >> 2, handle->handle_type,
+            open_flag);
+        if (IS_ERR(file))
+            return retval;
+        break;
     }
-    file = file_open_root(&path, "", open_flag, 0);
-    if (IS_ERR(file)) {
-        put_unused_fd(fd);
-        retval =  PTR_ERR(file);
-    } else {
-        retval = fd;
-        fd_install(fd, file);
-    }
-    path_put(&path);
-    return retval;
+
+    fd_install(fd, file);
+    return take_fd(fd);
 }

 /**
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 0684a9b8fe71..65b72dc05380 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -453,22 +453,53 @@ static struct file_system_type pidfs_type = {
     .kill_sb        = kill_anon_super,
 };

-struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+static struct file *__pidfs_alloc_file(struct pid *pid, unsigned int flags)
 {
-
     struct file *pidfd_file;
     struct path path;
     int ret;

-    ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
+    ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
     if (ret < 0)
         return ERR_PTR(ret);

     pidfd_file = dentry_open(&path, flags, current_cred());
+    if (!IS_ERR(pidfd_file))
+        pidfd_file->f_flags |= (flags & PIDFD_THREAD);
     path_put(&path);
     return pidfd_file;
 }

+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+    return __pidfs_alloc_file(get_pid(pid), flags);
+}
+
+struct file *pidfd_open_by_handle(void *gen_fid, int fh_len, int fh_type,
+                  unsigned int flags)
+{
+    bool thread = flags & PIDFD_THREAD;
+    struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
+    struct pid *pid;
+
+    if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
+        return ERR_PTR(-ESTALE);
+
+    scoped_guard(rcu) {
+        pid = find_pid_ns(fid->pid, &init_pid_ns);
+        if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0)
+            return ERR_PTR(-ESTALE);
+        if(!pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
+            return ERR_PTR(-ESTALE);
+            /* Something better? EINVAL like pidfd_open wouldn't be
+               very obvious... */
+
+        pid = get_pid(pid);
+    }
+
+    return __pidfs_alloc_file(pid, flags);
+}
+
 void __init pidfs_init(void)
 {
     pidfs_mnt = kern_mount(&pidfs_type);
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 75bdf9807802..fba2654ae60f 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -3,6 +3,8 @@
 #define _LINUX_PID_FS_H

 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
+struct file *pidfd_open_by_handle(void *gen_fid, int fh_len, int fh_type,
+                  unsigned int flags);
 void __init pidfs_init(void);

 #endif /* _LINUX_PID_FS_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 22f43721d031..fc47c76e4ff4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2015,11 +2015,6 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
         put_unused_fd(pidfd);
         return PTR_ERR(pidfd_file);
     }
-    /*
-     * anon_inode_getfile() ignores everything outside of the
-     * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
-     */
-    pidfd_file->f_flags |= (flags & PIDFD_THREAD);
     *ret = pidfd_file;
     return pidfd;
 }
Amir Goldstein Nov. 15, 2024, 7:50 a.m. UTC | #7
On Thu, Nov 14, 2024 at 11:51 PM Erin Shepherd <erin.shepherd@e43.eu> wrote:
>
> On 14/11/2024 15:13, Christian Brauner wrote:
>
> > On Thu, Nov 14, 2024 at 02:13:06PM +0100, Erin Shepherd wrote:
> >> These two concerns combined with the special flag make me wonder if pidfs
> >> is so much of a special snowflake we should just special case it up front
> >> and skip all of the shared handle decode logic?
> > Care to try a patch and see what it looks like?
>
> The following is a completely untested sketch on top of the existing patch series.
> Some notes:
>
> - I made heavy use of the cleanup macros. I'm happy to convert things back to
>   goto out_xx style if preferred - writing things this way just made bashing out
>   the code without dropping resources on the floor easier

Your cleanup is very welcome, just please! not in the same patch as refactoring
and logic changes. Please do these 3 different things in different commits.
This patch is unreviewable as far as I am concerned.

> - If you don't implement fh_to_dentry then name_to_handle_at will just return an error
>   unless called with AT_HANDLE_FID. We need to decide what to do about that

What's to decide? I did not understand the problem.

> - The GET_PATH_FD_IS_NORMAL/etc constants don't match (what I see as) usual kernel style
>   but I'm not sure how to conventionally express something like that

I believe the conventional way to express a custom operation is an
optional method.

For example:

static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir,
                char *name, struct dentry *child)
{
        const struct export_operations *nop = dir->d_sb->s_export_op;
        struct path path = {.mnt = mnt, .dentry = dir};

        if (nop->get_name)
                return nop->get_name(dir, name, child);
        else
                return get_name(&path, name, child);
}

There are plenty of optional custom inode, file, sb, dentry
operations with default fallback. some examples:

        if (dir_inode->i_op->atomic_open) {
                dentry = atomic_open(nd, dentry, file, open_flag, mode);

        if (!splice && file_out->f_op->copy_file_range) {
                ret = file_out->f_op->copy_file_range(file_in, pos_in,
                                                      file_out, pos_out,
                                                      len, flags);
        } else if (!splice && file_in->f_op->remap_file_range && samesb) {
                ret = file_in->f_op->remap_file_range(file_in, pos_in,

So I think the right model for you to follow is a custom optional
s_export_op->open_by_handle() operation.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 80675b6bf88459c22787edaa68db360bdc0d0782..0684a9b8fe71c5205fb153b2714bc9c672045fd5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -1,5 +1,6 @@ 
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/anon_inodes.h>
+#include <linux/exportfs.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/magic.h>
@@ -347,11 +348,69 @@  static const struct dentry_operations pidfs_dentry_operations = {
 	.d_prune	= stashed_dentry_prune,
 };
 
+#define PIDFD_FID_LEN 3
+
+struct pidfd_fid {
+	u64 ino;
+	s32 pid;
+} __packed;
+
+static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+			   struct inode *parent)
+{
+	struct pid *pid = inode->i_private;
+	struct pidfd_fid *fid = (struct pidfd_fid *)fh;
+
+	if (*max_len < PIDFD_FID_LEN) {
+		*max_len = PIDFD_FID_LEN;
+		return FILEID_INVALID;
+	}
+
+	fid->ino = pid->ino;
+	fid->pid = pid_nr(pid);
+	*max_len = PIDFD_FID_LEN;
+	return FILEID_INO64_GEN;
+}
+
+static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
+					 struct fid *gen_fid,
+					 int fh_len, int fh_type)
+{
+	int ret;
+	struct path path;
+	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
+	struct pid *pid;
+
+	if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
+		return NULL;
+
+	scoped_guard(rcu) {
+		pid = find_pid_ns(fid->pid, &init_pid_ns);
+		if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0)
+			return NULL;
+
+		pid = get_pid(pid);
+	}
+
+	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	mntput(path.mnt);
+	return path.dentry;
+}
+
+static const struct export_operations pidfs_export_operations = {
+	.encode_fh = pidfs_encode_fh,
+	.fh_to_dentry = pidfs_fh_to_dentry,
+	.flags = EXPORT_OP_UNRESTRICTED_OPEN,
+};
+
 static int pidfs_init_inode(struct inode *inode, void *data)
 {
 	inode->i_private = data;
 	inode->i_flags |= S_PRIVATE;
-	inode->i_mode |= S_IRWXU;
+	inode->i_mode |= S_IRWXU | S_IRWXG | S_IRWXO;
 	inode->i_op = &pidfs_inode_operations;
 	inode->i_fop = &pidfs_file_operations;
 	/*
@@ -382,6 +441,7 @@  static int pidfs_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 
 	ctx->ops = &pidfs_sops;
+	ctx->eops = &pidfs_export_operations;
 	ctx->dops = &pidfs_dentry_operations;
 	fc->s_fs_info = (void *)&pidfs_stashed_ops;
 	return 0;