diff mbox series

[4/4] pidfs: implement fh_to_dentry

Message ID 20241101135452.19359-5-erin.shepherd@e43.eu (mailing list archive)
State New
Headers show
Series pidfs: implement file handle support | expand

Commit Message

Erin Shepherd Nov. 1, 2024, 1:54 p.m. UTC
This enables userspace to use name_to_handle_at to recover a pidfd
to a process.

We stash the process' PID in the root pid namespace inside the handle,
and use that to recover the pid (validating that pid->ino matches the
value in the handle, i.e. that the pid has not been reused).

We use the root namespace in order to ensure that file handles can be
moved across namespaces; however, we validate that the PID exists in
the current namespace before returning the inode.

Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
---
 fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 7 deletions(-)

Comments

Amir Goldstein Nov. 12, 2024, 4:33 p.m. UTC | #1
On Fri, Nov 1, 2024 at 3:05 PM Erin Shepherd <erin.shepherd@e43.eu> wrote:
>
> This enables userspace to use name_to_handle_at to recover a pidfd
> to a process.
>
> We stash the process' PID in the root pid namespace inside the handle,
> and use that to recover the pid (validating that pid->ino matches the
> value in the handle, i.e. that the pid has not been reused).
>
> We use the root namespace in order to ensure that file handles can be
> moved across namespaces; however, we validate that the PID exists in
> the current namespace before returning the inode.
>
> Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>

Functionally, this looks correct to me, so you may add:

Reviewed-by: Amir Goldstein <amir73il@gmail.com>

But I can't say that I am a good person to judge if this new functionality
can expose new information to unpriv users or allow them to get access
to processes that they could not get access to before.

Thanks,
Amir.

> ---
>  fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 7 deletions(-)
>
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index c8e7e9011550..2d66610ef385 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -348,23 +348,59 @@ static const struct dentry_operations pidfs_dentry_operations = {
>         .d_prune        = stashed_dentry_prune,
>  };
>
> -static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
> +#define PIDFD_FID_LEN 3
> +
> +struct pidfd_fid {
> +       u64 ino;
> +       s32 pid;
> +} __packed;
> +
> +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
>                            struct inode *parent)
>  {
>         struct pid *pid = inode->i_private;
> -
> -       if (*max_len < 2) {
> -               *max_len = 2;
> +       struct pidfd_fid *fid = (struct pidfd_fid *)fh;
> +
> +       if (*max_len < PIDFD_FID_LEN) {
> +               *max_len = PIDFD_FID_LEN;
>                 return FILEID_INVALID;
>         }
>
> -       *max_len = 2;
> -       *(u64 *)fh = pid->ino;
> -       return FILEID_KERNFS;
> +       fid->ino = pid->ino;
> +       fid->pid = pid_nr(pid);
> +       *max_len = PIDFD_FID_LEN;
> +       return FILEID_INO64_GEN;
> +}
> +
> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
> +                                        struct fid *gen_fid,
> +                                        int fh_len, int fh_type)
> +{
> +       int ret;
> +       struct path path;
> +       struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
> +       struct pid *pid;
> +
> +       if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
> +               return NULL;
> +
> +       pid = find_get_pid_ns(fid->pid, &init_pid_ns);
> +       if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) {
> +               put_pid(pid);
> +               return NULL;
> +       }
> +
> +       ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
> +       if (ret < 0)
> +               return ERR_PTR(ret);
> +
> +       mntput(path.mnt);
> +       return path.dentry;
>  }
>
>  static const struct export_operations pidfs_export_operations = {
>         .encode_fh = pidfs_encode_fh,
> +       .fh_to_dentry = pidfs_fh_to_dentry,
>  };
>
>  static int pidfs_init_inode(struct inode *inode, void *data)
> --
> 2.46.1
>
>
Jeff Layton Nov. 12, 2024, 11:51 p.m. UTC | #2
On Fri, 2024-11-01 at 13:54 +0000, Erin Shepherd wrote:
> This enables userspace to use name_to_handle_at to recover a pidfd
> to a process.
> 
> We stash the process' PID in the root pid namespace inside the handle,
> and use that to recover the pid (validating that pid->ino matches the
> value in the handle, i.e. that the pid has not been reused).
> 
> We use the root namespace in order to ensure that file handles can be
> moved across namespaces; however, we validate that the PID exists in
> the current namespace before returning the inode.
> 
> Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
> ---
>  fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index c8e7e9011550..2d66610ef385 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -348,23 +348,59 @@ static const struct dentry_operations pidfs_dentry_operations = {
>  	.d_prune	= stashed_dentry_prune,
>  };
>  
> -static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
> +#define PIDFD_FID_LEN 3
> +
> +struct pidfd_fid {
> +	u64 ino;
> +	s32 pid;
> +} __packed;
> +
> +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
>  			   struct inode *parent)
>  {
>  	struct pid *pid = inode->i_private;
> -	
> -	if (*max_len < 2) {
> -		*max_len = 2;
> +	struct pidfd_fid *fid = (struct pidfd_fid *)fh;
> +
> +	if (*max_len < PIDFD_FID_LEN) {
> +		*max_len = PIDFD_FID_LEN;
>  		return FILEID_INVALID;
>  	}
>  
> -	*max_len = 2;
> -	*(u64 *)fh = pid->ino;
> -	return FILEID_KERNFS;
> +	fid->ino = pid->ino;
> +	fid->pid = pid_nr(pid);

I worry a little here. A container being able to discover its pid in
the root namespace seems a little sketchy. This makes that fairly
simple to figure out.

Maybe generate a random 32 bit value at boot time, and then xor that
into this? Then you could just reverse that and look up the pid.

> +	*max_len = PIDFD_FID_LEN;
> +	return FILEID_INO64_GEN;
> +}
> +
> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
> +					 struct fid *gen_fid,
> +					 int fh_len, int fh_type)
> +{
> +	int ret;
> +	struct path path;
> +	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
> +	struct pid *pid;
> +
> +	if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
> +		return NULL;
> +
> +	pid = find_get_pid_ns(fid->pid, &init_pid_ns);
> +	if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) {
> +		put_pid(pid);
> +		return NULL;
> +	}
> +
> +	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +
> +	mntput(path.mnt);
> +	return path.dentry;
>  }
>  
>  static const struct export_operations pidfs_export_operations = {
>  	.encode_fh = pidfs_encode_fh,
> +	.fh_to_dentry = pidfs_fh_to_dentry,
>  };
>  
>  static int pidfs_init_inode(struct inode *inode, void *data)
Amir Goldstein Nov. 13, 2024, 8:01 a.m. UTC | #3
On Wed, Nov 13, 2024 at 1:34 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Fri, 2024-11-01 at 13:54 +0000, Erin Shepherd wrote:
> > This enables userspace to use name_to_handle_at to recover a pidfd
> > to a process.
> >
> > We stash the process' PID in the root pid namespace inside the handle,
> > and use that to recover the pid (validating that pid->ino matches the
> > value in the handle, i.e. that the pid has not been reused).
> >
> > We use the root namespace in order to ensure that file handles can be
> > moved across namespaces; however, we validate that the PID exists in
> > the current namespace before returning the inode.
> >
> > Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
> > ---
> >  fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 43 insertions(+), 7 deletions(-)
> >
> > diff --git a/fs/pidfs.c b/fs/pidfs.c
> > index c8e7e9011550..2d66610ef385 100644
> > --- a/fs/pidfs.c
> > +++ b/fs/pidfs.c
> > @@ -348,23 +348,59 @@ static const struct dentry_operations pidfs_dentry_operations = {
> >       .d_prune        = stashed_dentry_prune,
> >  };
> >
> > -static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
> > +#define PIDFD_FID_LEN 3
> > +
> > +struct pidfd_fid {
> > +     u64 ino;
> > +     s32 pid;
> > +} __packed;
> > +
> > +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
> >                          struct inode *parent)
> >  {
> >       struct pid *pid = inode->i_private;
> > -
> > -     if (*max_len < 2) {
> > -             *max_len = 2;
> > +     struct pidfd_fid *fid = (struct pidfd_fid *)fh;
> > +
> > +     if (*max_len < PIDFD_FID_LEN) {
> > +             *max_len = PIDFD_FID_LEN;
> >               return FILEID_INVALID;
> >       }
> >
> > -     *max_len = 2;
> > -     *(u64 *)fh = pid->ino;
> > -     return FILEID_KERNFS;
> > +     fid->ino = pid->ino;
> > +     fid->pid = pid_nr(pid);
>
> I worry a little here. A container being able to discover its pid in
> the root namespace seems a little sketchy. This makes that fairly
> simple to figure out.
>
> Maybe generate a random 32 bit value at boot time, and then xor that
> into this? Then you could just reverse that and look up the pid.
>

I don't like playing pseudo cryptographic games, we are not
crypto experts so we are bound to lose in this game.

My thinking is the other way around -
- encode FILEID_INO32_GEN with pid_nr + i_generation
- pid_nr is obviously not unique across pidns and reusable
  but that makes it just like i_ino across filesystems
- the resulting file handle is thus usable only in the pidns where
  it was encoded - is that a bad thing?

Erin,

You write that "To ensure file handles are invariant and can move
between pid namespaces, we stash a pid from the initial namespace
inside the file handle."

Why is it a requirement for userspace that pidfs file handles are
invariant to pidns?

Thanks,
Amir.
Erin Shepherd Nov. 13, 2024, 10:11 a.m. UTC | #4
On 13/11/2024 09:01, Amir Goldstein wrote:

> I don't like playing pseudo cryptographic games, we are not
> crypto experts so we are bound to lose in this game.

I agree. It would be one thing to obfusficate things in order to prevent
userspace from relying upon something that's not ABI; it would be another
to do so with the intent of hiding data. If we wanted to do that, we'd
need to actually encrypt the PID (with e.g. AES-CTR(key, iv=inode_nr))

> My thinking is the other way around -
> - encode FILEID_INO32_GEN with pid_nr + i_generation
> - pid_nr is obviously not unique across pidns and reusable
>   but that makes it just like i_ino across filesystems
> - the resulting file handle is thus usable only in the pidns where
>   it was encoded - is that a bad thing?
>
> Erin,
>
> You write that "To ensure file handles are invariant and can move
> between pid namespaces, we stash a pid from the initial namespace
> inside the file handle."
>
> Why is it a requirement for userspace that pidfs file handles are
> invariant to pidns?

I don't think it's a requirement, but I do think its useful - it is nice if
a service inside a pidns can pass you a file handle and you can restore it and
things are fine (consider also handles stored on the filesystem, as a better
analog for PID files)

But I too was uncertain about exposing root namespace PIDs to containers. I
have no objections to limiting restore of file handles to the same pid ns -
though I think we should defnitely document that such a limitation may be
lifted in the future.

- Erin
Christian Brauner Nov. 13, 2024, 12:09 p.m. UTC | #5
On Fri, Nov 01, 2024 at 01:54:52PM +0000, Erin Shepherd wrote:
> This enables userspace to use name_to_handle_at to recover a pidfd
> to a process.
> 
> We stash the process' PID in the root pid namespace inside the handle,
> and use that to recover the pid (validating that pid->ino matches the
> value in the handle, i.e. that the pid has not been reused).
> 
> We use the root namespace in order to ensure that file handles can be
> moved across namespaces; however, we validate that the PID exists in
> the current namespace before returning the inode.
> 
> Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
> ---
>  fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index c8e7e9011550..2d66610ef385 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -348,23 +348,59 @@ static const struct dentry_operations pidfs_dentry_operations = {
>  	.d_prune	= stashed_dentry_prune,
>  };
>  
> -static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
> +#define PIDFD_FID_LEN 3
> +
> +struct pidfd_fid {
> +	u64 ino;
> +	s32 pid;
> +} __packed;
> +
> +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
>  			   struct inode *parent)
>  {
>  	struct pid *pid = inode->i_private;
> -	
> -	if (*max_len < 2) {
> -		*max_len = 2;
> +	struct pidfd_fid *fid = (struct pidfd_fid *)fh;
> +
> +	if (*max_len < PIDFD_FID_LEN) {
> +		*max_len = PIDFD_FID_LEN;
>  		return FILEID_INVALID;
>  	}
>  
> -	*max_len = 2;
> -	*(u64 *)fh = pid->ino;
> -	return FILEID_KERNFS;
> +	fid->ino = pid->ino;
> +	fid->pid = pid_nr(pid);

Hm, a pidfd comes in two flavours:

(1) thread-group leader pidfd: pidfd_open(<pid>, 0)
(2) thread pidfd:              pidfd_open(<pid>, PIDFD_THREAD)

In your current scheme fid->pid = pid_nr(pid) means that you always
encode a pidfs file handle for a thread pidfd no matter if the provided
pidfd was a thread-group leader pidfd or a thread pidfd. This is very
likely wrong as it means users that use a thread-group pidfd get a
thread-specific pid back.

I think we need to encode (1) and (2) in the pidfs file handle so users
always get back the correct type of pidfd.

That very likely means name_to_handle_at() needs to encode this into the
pidfs file handle.

We need to think a bit how to do this as we need access to the file so
we can tell (1) and (2) apart. It shouldn't be that big of a deal. For
pidfds we don't need any path-based lookup anyway. IOW, AT_EMPTY_PATH is
the only valid case. Starting with v6.13 we'll have getname_maybe_null()
so access to the file is roughly:

struct path path;
struct filename *fname;
unsigned in f_flags = 0;

fname = getname_maybe_null(name, flag & AT_EMPTY_PATH);
if (fname) {
        ret = filename_lookup(dfd, fname, lookup_flags, &path, NULL);
        if (ret)
                return ret;
} else {
	CLASS(fd, f)(dfd);
	if (fd_empty(f))
		return -EBADF;
	path = fd_file(f)->f_path;
	if (pidfd_pid(fd_file(f))
		f_flags = fd_file(f)->f_flags;
	path_get(&path);
}

and then a thread pidfd is reconginzable as f_flags & PIDFD_THREAD/O_EXCL.

The question again is how to plumb this through to the export_operations
encoding function.

> +	*max_len = PIDFD_FID_LEN;
> +	return FILEID_INO64_GEN;
> +}
> +
> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
> +					 struct fid *gen_fid,
> +					 int fh_len, int fh_type)
> +{
> +	int ret;
> +	struct path path;
> +	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
> +	struct pid *pid;
> +
> +	if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
> +		return NULL;
> +
> +	pid = find_get_pid_ns(fid->pid, &init_pid_ns);
> +	if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) {
> +		put_pid(pid);
> +		return NULL;
> +	}

I think we can avoid the premature reference bump and do:

scoped_guard(rcu) {
        struct pid *pid;

	pid = find_pid_ns(fid->pid, &init_pid_ns);
	if (!pid)
		return NULL;

	/* Did the pid get recycled? */
	if (pid->ino != fid->ino)
		return NULL;

	/* Must be resolvable in the caller's pid namespace. */
	if (pid_vnr(pid) == 0)
		return NULL;

	/* Ok, this is the pid we want. */
	get_pid(pid);
}

> +
> +	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +
> +	mntput(path.mnt);
> +	return path.dentry;
>  }
>  
>  static const struct export_operations pidfs_export_operations = {
>  	.encode_fh = pidfs_encode_fh,
> +	.fh_to_dentry = pidfs_fh_to_dentry,
>  };
>  
>  static int pidfs_init_inode(struct inode *inode, void *data)
> -- 
> 2.46.1
>
Christian Brauner Nov. 13, 2024, 12:21 p.m. UTC | #6
On Wed, Nov 13, 2024 at 11:11:47AM +0100, Erin Shepherd wrote:
> On 13/11/2024 09:01, Amir Goldstein wrote:
> 
> > I don't like playing pseudo cryptographic games, we are not
> > crypto experts so we are bound to lose in this game.
> 
> I agree. It would be one thing to obfusficate things in order to prevent
> userspace from relying upon something that's not ABI; it would be another
> to do so with the intent of hiding data. If we wanted to do that, we'd
> need to actually encrypt the PID (with e.g. AES-CTR(key, iv=inode_nr))
> 
> > My thinking is the other way around -
> > - encode FILEID_INO32_GEN with pid_nr + i_generation
> > - pid_nr is obviously not unique across pidns and reusable
> >   but that makes it just like i_ino across filesystems
> > - the resulting file handle is thus usable only in the pidns where
> >   it was encoded - is that a bad thing?
> >
> > Erin,
> >
> > You write that "To ensure file handles are invariant and can move
> > between pid namespaces, we stash a pid from the initial namespace
> > inside the file handle."
> >
> > Why is it a requirement for userspace that pidfs file handles are
> > invariant to pidns?
> 
> I don't think it's a requirement, but I do think its useful - it is nice if

It kind of is though, no? Because you need a reliable way to decode the
pidfs file handle to a struct pid. If one encodes pid numbers as seen
from the encoders pid namespace the decoder has no way of knowing what
pid namespace to resolve it in as the same pid number can obviously be
present in multiple pid namespace. So not encoding the global pid number
would be inherently ambiguous.

> a service inside a pidns can pass you a file handle and you can restore it and
> things are fine (consider also handles stored on the filesystem, as a better
> analog for PID files)
> 
> But I too was uncertain about exposing root namespace PIDs to containers. I
> have no objections to limiting restore of file handles to the same pid ns -
> though I think we should defnitely document that such a limitation may be
> lifted in the future.

The point is really just the provided pid needs to be resolvable in the
pid namespace of the caller. Encoding a global pid number means that
internally we can always resolve it as we know that we always encode
pids in the init pid namespace.

In a second step we can then verify that the struct pid we found based
on the pid number is a member of the pid namespace hierarchy of the
caller. If that is the case then the caller is allowed to get a pidfd
from open_by_handle_at() as they would also be able to get a pidfd via
pidfd_open().

So a container will never be able to a pidfd from a pid number that
resolves to a struct pid that is outside its pid namespace hierarchy.

Let me know if I misunderstood the concerns.
Erin Shepherd Nov. 13, 2024, 1:06 p.m. UTC | #7
On 13/11/2024 13:09, Christian Brauner wrote:

> Hm, a pidfd comes in two flavours:
>
> (1) thread-group leader pidfd: pidfd_open(<pid>, 0)
> (2) thread pidfd:              pidfd_open(<pid>, PIDFD_THREAD)
>
> In your current scheme fid->pid = pid_nr(pid) means that you always
> encode a pidfs file handle for a thread pidfd no matter if the provided
> pidfd was a thread-group leader pidfd or a thread pidfd. This is very
> likely wrong as it means users that use a thread-group pidfd get a
> thread-specific pid back.
>
> I think we need to encode (1) and (2) in the pidfs file handle so users
> always get back the correct type of pidfd.
>
> That very likely means name_to_handle_at() needs to encode this into the
> pidfs file handle.

I guess a question here is whether a pidfd handle encodes a handle to a pid
in a specific mode, or just to a pid in general? The thought had occurred
to me while I was working on this initially, but I felt like perhaps treating
it as a property of the file descriptor in general was better.

Currently open_by_handle_at always returns a thread-group pidfd (since
PIDFD_THREAD) isn't set, regardless of what type of pidfd you passed to
name_to_handle_at. I had thought that PIDFD_THREAD/O_EXCL would have been
passed through to f->f_flags on the restored pidfd, but upon checking I see that
it gets filtered out in do_dentry_open.

I feel like leaving it up to the caller of open_by_handle_at might be better
(because they are probably better informed about whether they want poll() to
inform them of thread or process exit) but I could lean either way.

>> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
>> +					 struct fid *gen_fid,
>> +					 int fh_len, int fh_type)
>> +{
>> +	int ret;
>> +	struct path path;
>> +	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
>> +	struct pid *pid;
>> +
>> +	if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
>> +		return NULL;
>> +
>> +	pid = find_get_pid_ns(fid->pid, &init_pid_ns);
>> +	if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) {
>> +		put_pid(pid);
>> +		return NULL;
>> +	}
> I think we can avoid the premature reference bump and do:
>
> scoped_guard(rcu) {
>         struct pid *pid;
>
> 	pid = find_pid_ns(fid->pid, &init_pid_ns);
> 	if (!pid)
> 		return NULL;
>
> 	/* Did the pid get recycled? */
> 	if (pid->ino != fid->ino)
> 		return NULL;
>
> 	/* Must be resolvable in the caller's pid namespace. */
> 	if (pid_vnr(pid) == 0)
> 		return NULL;
>
> 	/* Ok, this is the pid we want. */
> 	get_pid(pid);
> }

I can go with that if preferred. I was worried a bit about making the RCU
critical section too large, but of course I'm sure there are much larger
sections inside the kernel.

>> +
>> +	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
>> +	if (ret < 0)
>> +		return ERR_PTR(ret);
>> +
>> +	mntput(path.mnt);
>> +	return path.dentry;
>>  }

Similarly here i should probably refactor this into dentry_from_stashed in
order to avoid a needless bump-then-drop of path.mnt's reference count
Christian Brauner Nov. 13, 2024, 1:26 p.m. UTC | #8
On Wed, Nov 13, 2024 at 02:06:56PM +0100, Erin Shepherd wrote:
> On 13/11/2024 13:09, Christian Brauner wrote:
> 
> > Hm, a pidfd comes in two flavours:
> >
> > (1) thread-group leader pidfd: pidfd_open(<pid>, 0)
> > (2) thread pidfd:              pidfd_open(<pid>, PIDFD_THREAD)
> >
> > In your current scheme fid->pid = pid_nr(pid) means that you always
> > encode a pidfs file handle for a thread pidfd no matter if the provided
> > pidfd was a thread-group leader pidfd or a thread pidfd. This is very
> > likely wrong as it means users that use a thread-group pidfd get a
> > thread-specific pid back.
> >
> > I think we need to encode (1) and (2) in the pidfs file handle so users
> > always get back the correct type of pidfd.
> >
> > That very likely means name_to_handle_at() needs to encode this into the
> > pidfs file handle.
> 
> I guess a question here is whether a pidfd handle encodes a handle to a pid
> in a specific mode, or just to a pid in general? The thought had occurred
> to me while I was working on this initially, but I felt like perhaps treating
> it as a property of the file descriptor in general was better.
> 
> Currently open_by_handle_at always returns a thread-group pidfd (since
> PIDFD_THREAD) isn't set, regardless of what type of pidfd you passed to
> name_to_handle_at. I had thought that PIDFD_THREAD/O_EXCL would have been

I don't think you're returning a thread-groupd pidfd from
open_by_handle_at() in your scheme. After all you're encoding the tid in
pid_nr() so you'll always find the struct pid for the thread afaict. If
I'm wrong could you please explain how you think this works? I might
just be missing something obvious.

> passed through to f->f_flags on the restored pidfd, but upon checking I see that
> it gets filtered out in do_dentry_open.

It does, but note that __pidfd_prepare() raises it explicitly on the
file afterwards. So it works fine.

> 
> I feel like leaving it up to the caller of open_by_handle_at might be better
> (because they are probably better informed about whether they want poll() to
> inform them of thread or process exit) but I could lean either way.

So in order to decode a pidfs file handle you want the caller to have to
specify O_EXCL in the flags argument of open_by_handle_at()? Is that
your idea?

> 
> >> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
> >> +					 struct fid *gen_fid,
> >> +					 int fh_len, int fh_type)
> >> +{
> >> +	int ret;
> >> +	struct path path;
> >> +	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
> >> +	struct pid *pid;
> >> +
> >> +	if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
> >> +		return NULL;
> >> +
> >> +	pid = find_get_pid_ns(fid->pid, &init_pid_ns);
> >> +	if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) {
> >> +		put_pid(pid);
> >> +		return NULL;
> >> +	}
> > I think we can avoid the premature reference bump and do:
> >
> > scoped_guard(rcu) {
> >         struct pid *pid;
> >
> > 	pid = find_pid_ns(fid->pid, &init_pid_ns);
> > 	if (!pid)
> > 		return NULL;
> >
> > 	/* Did the pid get recycled? */
> > 	if (pid->ino != fid->ino)
> > 		return NULL;
> >
> > 	/* Must be resolvable in the caller's pid namespace. */
> > 	if (pid_vnr(pid) == 0)
> > 		return NULL;
> >
> > 	/* Ok, this is the pid we want. */
> > 	get_pid(pid);
> > }
> 
> I can go with that if preferred. I was worried a bit about making the RCU
> critical section too large, but of course I'm sure there are much larger
> sections inside the kernel.

This is perfectly fine. Don't worry about it.

> 
> >> +
> >> +	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
> >> +	if (ret < 0)
> >> +		return ERR_PTR(ret);
> >> +
> >> +	mntput(path.mnt);
> >> +	return path.dentry;
> >>  }
> 
> Similarly here i should probably refactor this into dentry_from_stashed in
> order to avoid a needless bump-then-drop of path.mnt's reference count

No, what you have now is fine. I wouldn't add a specific helper for
this. In contrast to the pid the pidfs mount never goes away.
Erin Shepherd Nov. 13, 2024, 1:48 p.m. UTC | #9
On 13/11/2024 14:26, Christian Brauner wrote:

> On Wed, Nov 13, 2024 at 02:06:56PM +0100, Erin Shepherd wrote:
>> On 13/11/2024 13:09, Christian Brauner wrote:
>>
>>> Hm, a pidfd comes in two flavours:
>>>
>>> (1) thread-group leader pidfd: pidfd_open(<pid>, 0)
>>> (2) thread pidfd:              pidfd_open(<pid>, PIDFD_THREAD)
>>>
>>> In your current scheme fid->pid = pid_nr(pid) means that you always
>>> encode a pidfs file handle for a thread pidfd no matter if the provided
>>> pidfd was a thread-group leader pidfd or a thread pidfd. This is very
>>> likely wrong as it means users that use a thread-group pidfd get a
>>> thread-specific pid back.
>>>
>>> I think we need to encode (1) and (2) in the pidfs file handle so users
>>> always get back the correct type of pidfd.
>>>
>>> That very likely means name_to_handle_at() needs to encode this into the
>>> pidfs file handle.
>> I guess a question here is whether a pidfd handle encodes a handle to a pid
>> in a specific mode, or just to a pid in general? The thought had occurred
>> to me while I was working on this initially, but I felt like perhaps treating
>> it as a property of the file descriptor in general was better.
>>
>> Currently open_by_handle_at always returns a thread-group pidfd (since
>> PIDFD_THREAD) isn't set, regardless of what type of pidfd you passed to
>> name_to_handle_at. I had thought that PIDFD_THREAD/O_EXCL would have been
> I don't think you're returning a thread-groupd pidfd from
> open_by_handle_at() in your scheme. After all you're encoding the tid in
> pid_nr() so you'll always find the struct pid for the thread afaict. If
> I'm wrong could you please explain how you think this works? I might
> just be missing something obvious.

Moudlo namespaces, the pid in fid->pid is the same one passed to pidfd_open().
In the root namespace, you could replace name_to_handle_at(...) with
pidfd_open(fid->pid, 0) and get the same result (if both are successful, at least).

The resulting pidfd points to the same struct pid. The only thing that should differ
is whether PIDFD_THREAD is set in f->f_flags.

>> I feel like leaving it up to the caller of open_by_handle_at might be better
>> (because they are probably better informed about whether they want poll() to
>> inform them of thread or process exit) but I could lean either way.
> So in order to decode a pidfs file handle you want the caller to have to
> specify O_EXCL in the flags argument of open_by_handle_at()? Is that
> your idea?

If they want a PIDFD_THREAD pidfd, yes. I see it as similar to O_RDONLY, where its a
flag that applies to the file descriptor but not to the underlying file.

While ideally we'd implement it from an API completeness perspective, practically I'm
not sure how often the option would ever be used. While there are hundreds of reasons
why you might want to track the state of another process, I struggle to think of cases
where Process A needs to track Process B's threads besides a debugger (and a debugger
is probably better off using ptrace), and it can happily track its own threads by just
holding onto the pidfd.
Christian Brauner Nov. 14, 2024, 10:29 a.m. UTC | #10
On Wed, Nov 13, 2024 at 02:48:43PM +0100, Erin Shepherd wrote:
> On 13/11/2024 14:26, Christian Brauner wrote:
> 
> > On Wed, Nov 13, 2024 at 02:06:56PM +0100, Erin Shepherd wrote:
> >> On 13/11/2024 13:09, Christian Brauner wrote:
> >>
> >>> Hm, a pidfd comes in two flavours:
> >>>
> >>> (1) thread-group leader pidfd: pidfd_open(<pid>, 0)
> >>> (2) thread pidfd:              pidfd_open(<pid>, PIDFD_THREAD)
> >>>
> >>> In your current scheme fid->pid = pid_nr(pid) means that you always
> >>> encode a pidfs file handle for a thread pidfd no matter if the provided
> >>> pidfd was a thread-group leader pidfd or a thread pidfd. This is very
> >>> likely wrong as it means users that use a thread-group pidfd get a
> >>> thread-specific pid back.
> >>>
> >>> I think we need to encode (1) and (2) in the pidfs file handle so users
> >>> always get back the correct type of pidfd.
> >>>
> >>> That very likely means name_to_handle_at() needs to encode this into the
> >>> pidfs file handle.
> >> I guess a question here is whether a pidfd handle encodes a handle to a pid
> >> in a specific mode, or just to a pid in general? The thought had occurred
> >> to me while I was working on this initially, but I felt like perhaps treating
> >> it as a property of the file descriptor in general was better.
> >>
> >> Currently open_by_handle_at always returns a thread-group pidfd (since
> >> PIDFD_THREAD) isn't set, regardless of what type of pidfd you passed to
> >> name_to_handle_at. I had thought that PIDFD_THREAD/O_EXCL would have been
> > I don't think you're returning a thread-groupd pidfd from
> > open_by_handle_at() in your scheme. After all you're encoding the tid in
> > pid_nr() so you'll always find the struct pid for the thread afaict. If
> > I'm wrong could you please explain how you think this works? I might
> > just be missing something obvious.
> 
> Moudlo namespaces, the pid in fid->pid is the same one passed to pidfd_open().
> In the root namespace, you could replace name_to_handle_at(...) with
> pidfd_open(fid->pid, 0) and get the same result (if both are successful, at least).
> 
> The resulting pidfd points to the same struct pid. The only thing that should differ
> is whether PIDFD_THREAD is set in f->f_flags.

I see what you mean but then there's another problem afaict.

Two cases:

(1) @pidfd_thread_group = pidfd_open(1234, 0)

    The pidfd_open() will succeed if the struct pid that 1234 resolves
    to is used as a thread-group leader.

(2) @pidfd_thread = pidfd_open(5678, PIDFD_THREAD)

    The pidfd_open() will succeed even if the struct pid that 5678
    resolves to isn't used as a thread-group leader.

    The resulting struct file will be marked as being a thread pidfd by
    raising O_EXCL.

(1') If (1) is passed to name_to_handle_at() a pidfs file handle is
     encoded for 1234. If later open_by_hande_at() is called then by
     default a thread-group leader pidfd is created. This is fine

(2') If (2) is passed to name_to_handle_at() a pidfs file handle is
     encoded for 5678. If later open_by_handle_at() is called then a
     thread-group leader pidfd will be created again.

So in (2') the caller has managed to create a thread-group leader pidfd
even though the struct pid isn't used as a thread-group leader pidfd.
Consequently, that pidfd is useless when passed to any of the pidfd_*()
system calls.

So basically, you need to verify that if O_EXCL isn't specified with
open_by_handle_at() that the struct pid that is resolved is used as a
thread-group leader and if not, refuse to create a pidfd.

Am I making sense?

> 
> >> I feel like leaving it up to the caller of open_by_handle_at might be better
> >> (because they are probably better informed about whether they want poll() to
> >> inform them of thread or process exit) but I could lean either way.
> > So in order to decode a pidfs file handle you want the caller to have to
> > specify O_EXCL in the flags argument of open_by_handle_at()? Is that
> > your idea?
> 
> If they want a PIDFD_THREAD pidfd, yes. I see it as similar to O_RDONLY, where its a
> flag that applies to the file descriptor but not to the underlying file.

This is probably fine.

> 
> While ideally we'd implement it from an API completeness perspective, practically I'm
> not sure how often the option would ever be used. While there are hundreds of reasons
> why you might want to track the state of another process, I struggle to think of cases
> where Process A needs to track Process B's threads besides a debugger (and a debugger
> is probably better off using ptrace), and it can happily track its own threads by just
> holding onto the pidfd.

We recently imlemented PIDFD_THREAD support because it is used inside
Netflix. I forgot the details thought tbh. So it's actually used. We
only implemented it once people requested it.
Erin Shepherd Nov. 14, 2024, 12:21 p.m. UTC | #11
On 14/11/2024 11:29, Christian Brauner wrote:
>> Moudlo namespaces, the pid in fid->pid is the same one passed to pidfd_open().
>> In the root namespace, you could replace name_to_handle_at(...) with
>> pidfd_open(fid->pid, 0) and get the same result (if both are successful, at least).
>>
>> The resulting pidfd points to the same struct pid. The only thing that should differ
>> is whether PIDFD_THREAD is set in f->f_flags.
> I see what you mean but then there's another problem afaict.
>
> Two cases:
>
> (1) @pidfd_thread_group = pidfd_open(1234, 0)
>
>     The pidfd_open() will succeed if the struct pid that 1234 resolves
>     to is used as a thread-group leader.
>
> (2) @pidfd_thread = pidfd_open(5678, PIDFD_THREAD)
>
>     The pidfd_open() will succeed even if the struct pid that 5678
>     resolves to isn't used as a thread-group leader.
>
>     The resulting struct file will be marked as being a thread pidfd by
>     raising O_EXCL.
>
> (1') If (1) is passed to name_to_handle_at() a pidfs file handle is
>      encoded for 1234. If later open_by_hande_at() is called then by
>      default a thread-group leader pidfd is created. This is fine
>
> (2') If (2) is passed to name_to_handle_at() a pidfs file handle is
>      encoded for 5678. If later open_by_handle_at() is called then a
>      thread-group leader pidfd will be created again.
>
> So in (2') the caller has managed to create a thread-group leader pidfd
> even though the struct pid isn't used as a thread-group leader pidfd.
> Consequently, that pidfd is useless when passed to any of the pidfd_*()
> system calls.
>
> So basically, you need to verify that if O_EXCL isn't specified with
> open_by_handle_at() that the struct pid that is resolved is used as a
> thread-group leader and if not, refuse to create a pidfd.
>
> Am I making sense?

Ah, I fully see what you mean now.

I could implement pidfs_file_operations.open and check the flags there, but
that runs into the issue of vfs_open resetting the flags afterwards so its
entirely pointless. If PIDFD_THREAD wasn't in the set O_CREAT / O_EXCL /
O_NOCTTY / O_TRUNC then this would be much easier, but alas; and its ABI now
too.

I guess the options are

1. Let an FS specify that it doesn't want O_EXCL cleared, but this is getting
   to be some gnarly VFS surgery, or
2. We just detect we're working on a pidfd early in open_by_handle_at and
   skip straight into dedicated logic.

I know you suggested (2) earlier and I increasingly think you're right about
it being the best approach. It also fits better with the special casing PIDFD_SELF
will want when that lands.

So I'll see what an implementation with that approach looks like.

>> If they want a PIDFD_THREAD pidfd, yes. I see it as similar to O_RDONLY, where its a
>> flag that applies to the file descriptor but not to the underlying file.
> This is probably fine.
>> While ideally we'd implement it from an API completeness perspective, practically I'm
>> not sure how often the option would ever be used. While there are hundreds of reasons
>> why you might want to track the state of another process, I struggle to think of cases
>> where Process A needs to track Process B's threads besides a debugger (and a debugger
>> is probably better off using ptrace), and it can happily track its own threads by just
>> holding onto the pidfd.
> We recently imlemented PIDFD_THREAD support because it is used inside
> Netflix. I forgot the details thought tbh. So it's actually used. We
> only implemented it once people requested it.
Oh, I entirely understand the utility of PIDFD_THREAD - I'm just not sure how mnay of
those use cases are cross-process (and in the cases where they are cross process, how many
of those uses would benefit from file handles vs fd passing)
diff mbox series

Patch

diff --git a/fs/pidfs.c b/fs/pidfs.c
index c8e7e9011550..2d66610ef385 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -348,23 +348,59 @@  static const struct dentry_operations pidfs_dentry_operations = {
 	.d_prune	= stashed_dentry_prune,
 };
 
-static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
+#define PIDFD_FID_LEN 3
+
+struct pidfd_fid {
+	u64 ino;
+	s32 pid;
+} __packed;
+
+static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
 			   struct inode *parent)
 {
 	struct pid *pid = inode->i_private;
-	
-	if (*max_len < 2) {
-		*max_len = 2;
+	struct pidfd_fid *fid = (struct pidfd_fid *)fh;
+
+	if (*max_len < PIDFD_FID_LEN) {
+		*max_len = PIDFD_FID_LEN;
 		return FILEID_INVALID;
 	}
 
-	*max_len = 2;
-	*(u64 *)fh = pid->ino;
-	return FILEID_KERNFS;
+	fid->ino = pid->ino;
+	fid->pid = pid_nr(pid);
+	*max_len = PIDFD_FID_LEN;
+	return FILEID_INO64_GEN;
+}
+
+static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
+					 struct fid *gen_fid,
+					 int fh_len, int fh_type)
+{
+	int ret;
+	struct path path;
+	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
+	struct pid *pid;
+
+	if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
+		return NULL;
+
+	pid = find_get_pid_ns(fid->pid, &init_pid_ns);
+	if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) {
+		put_pid(pid);
+		return NULL;
+	}
+
+	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	mntput(path.mnt);
+	return path.dentry;
 }
 
 static const struct export_operations pidfs_export_operations = {
 	.encode_fh = pidfs_encode_fh,
+	.fh_to_dentry = pidfs_fh_to_dentry,
 };
 
 static int pidfs_init_inode(struct inode *inode, void *data)