Message ID | 20241101135452.19359-5-erin.shepherd@e43.eu (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | pidfs: implement file handle support | expand |
On Fri, Nov 1, 2024 at 3:05 PM Erin Shepherd <erin.shepherd@e43.eu> wrote: > > This enables userspace to use name_to_handle_at to recover a pidfd > to a process. > > We stash the process' PID in the root pid namespace inside the handle, > and use that to recover the pid (validating that pid->ino matches the > value in the handle, i.e. that the pid has not been reused). > > We use the root namespace in order to ensure that file handles can be > moved across namespaces; however, we validate that the PID exists in > the current namespace before returning the inode. > > Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu> Functionally, this looks correct to me, so you may add: Reviewed-by: Amir Goldstein <amir73il@gmail.com> But I can't say that I am a good person to judge if this new functionality can expose new information to unpriv users or allow them to get access to processes that they could not get access to before. Thanks, Amir. > --- > fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 43 insertions(+), 7 deletions(-) > > diff --git a/fs/pidfs.c b/fs/pidfs.c > index c8e7e9011550..2d66610ef385 100644 > --- a/fs/pidfs.c > +++ b/fs/pidfs.c > @@ -348,23 +348,59 @@ static const struct dentry_operations pidfs_dentry_operations = { > .d_prune = stashed_dentry_prune, > }; > > -static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, > +#define PIDFD_FID_LEN 3 > + > +struct pidfd_fid { > + u64 ino; > + s32 pid; > +} __packed; > + > +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, > struct inode *parent) > { > struct pid *pid = inode->i_private; > - > - if (*max_len < 2) { > - *max_len = 2; > + struct pidfd_fid *fid = (struct pidfd_fid *)fh; > + > + if (*max_len < PIDFD_FID_LEN) { > + *max_len = PIDFD_FID_LEN; > return FILEID_INVALID; > } > > - *max_len = 2; > - *(u64 *)fh = pid->ino; > - return FILEID_KERNFS; > + fid->ino = pid->ino; > + fid->pid = pid_nr(pid); > + *max_len = PIDFD_FID_LEN; > + return FILEID_INO64_GEN; > +} > + > +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, > + struct fid *gen_fid, > + int fh_len, int fh_type) > +{ > + int ret; > + struct path path; > + struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid; > + struct pid *pid; > + > + if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN) > + return NULL; > + > + pid = find_get_pid_ns(fid->pid, &init_pid_ns); > + if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) { > + put_pid(pid); > + return NULL; > + } > + > + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); > + if (ret < 0) > + return ERR_PTR(ret); > + > + mntput(path.mnt); > + return path.dentry; > } > > static const struct export_operations pidfs_export_operations = { > .encode_fh = pidfs_encode_fh, > + .fh_to_dentry = pidfs_fh_to_dentry, > }; > > static int pidfs_init_inode(struct inode *inode, void *data) > -- > 2.46.1 > >
On Fri, 2024-11-01 at 13:54 +0000, Erin Shepherd wrote: > This enables userspace to use name_to_handle_at to recover a pidfd > to a process. > > We stash the process' PID in the root pid namespace inside the handle, > and use that to recover the pid (validating that pid->ino matches the > value in the handle, i.e. that the pid has not been reused). > > We use the root namespace in order to ensure that file handles can be > moved across namespaces; however, we validate that the PID exists in > the current namespace before returning the inode. > > Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu> > --- > fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 43 insertions(+), 7 deletions(-) > > diff --git a/fs/pidfs.c b/fs/pidfs.c > index c8e7e9011550..2d66610ef385 100644 > --- a/fs/pidfs.c > +++ b/fs/pidfs.c > @@ -348,23 +348,59 @@ static const struct dentry_operations pidfs_dentry_operations = { > .d_prune = stashed_dentry_prune, > }; > > -static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, > +#define PIDFD_FID_LEN 3 > + > +struct pidfd_fid { > + u64 ino; > + s32 pid; > +} __packed; > + > +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, > struct inode *parent) > { > struct pid *pid = inode->i_private; > - > - if (*max_len < 2) { > - *max_len = 2; > + struct pidfd_fid *fid = (struct pidfd_fid *)fh; > + > + if (*max_len < PIDFD_FID_LEN) { > + *max_len = PIDFD_FID_LEN; > return FILEID_INVALID; > } > > - *max_len = 2; > - *(u64 *)fh = pid->ino; > - return FILEID_KERNFS; > + fid->ino = pid->ino; > + fid->pid = pid_nr(pid); I worry a little here. A container being able to discover its pid in the root namespace seems a little sketchy. This makes that fairly simple to figure out. Maybe generate a random 32 bit value at boot time, and then xor that into this? Then you could just reverse that and look up the pid. > + *max_len = PIDFD_FID_LEN; > + return FILEID_INO64_GEN; > +} > + > +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, > + struct fid *gen_fid, > + int fh_len, int fh_type) > +{ > + int ret; > + struct path path; > + struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid; > + struct pid *pid; > + > + if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN) > + return NULL; > + > + pid = find_get_pid_ns(fid->pid, &init_pid_ns); > + if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) { > + put_pid(pid); > + return NULL; > + } > + > + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); > + if (ret < 0) > + return ERR_PTR(ret); > + > + mntput(path.mnt); > + return path.dentry; > } > > static const struct export_operations pidfs_export_operations = { > .encode_fh = pidfs_encode_fh, > + .fh_to_dentry = pidfs_fh_to_dentry, > }; > > static int pidfs_init_inode(struct inode *inode, void *data)
On Wed, Nov 13, 2024 at 1:34 AM Jeff Layton <jlayton@kernel.org> wrote: > > On Fri, 2024-11-01 at 13:54 +0000, Erin Shepherd wrote: > > This enables userspace to use name_to_handle_at to recover a pidfd > > to a process. > > > > We stash the process' PID in the root pid namespace inside the handle, > > and use that to recover the pid (validating that pid->ino matches the > > value in the handle, i.e. that the pid has not been reused). > > > > We use the root namespace in order to ensure that file handles can be > > moved across namespaces; however, we validate that the PID exists in > > the current namespace before returning the inode. > > > > Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu> > > --- > > fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- > > 1 file changed, 43 insertions(+), 7 deletions(-) > > > > diff --git a/fs/pidfs.c b/fs/pidfs.c > > index c8e7e9011550..2d66610ef385 100644 > > --- a/fs/pidfs.c > > +++ b/fs/pidfs.c > > @@ -348,23 +348,59 @@ static const struct dentry_operations pidfs_dentry_operations = { > > .d_prune = stashed_dentry_prune, > > }; > > > > -static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, > > +#define PIDFD_FID_LEN 3 > > + > > +struct pidfd_fid { > > + u64 ino; > > + s32 pid; > > +} __packed; > > + > > +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, > > struct inode *parent) > > { > > struct pid *pid = inode->i_private; > > - > > - if (*max_len < 2) { > > - *max_len = 2; > > + struct pidfd_fid *fid = (struct pidfd_fid *)fh; > > + > > + if (*max_len < PIDFD_FID_LEN) { > > + *max_len = PIDFD_FID_LEN; > > return FILEID_INVALID; > > } > > > > - *max_len = 2; > > - *(u64 *)fh = pid->ino; > > - return FILEID_KERNFS; > > + fid->ino = pid->ino; > > + fid->pid = pid_nr(pid); > > I worry a little here. A container being able to discover its pid in > the root namespace seems a little sketchy. This makes that fairly > simple to figure out. > > Maybe generate a random 32 bit value at boot time, and then xor that > into this? Then you could just reverse that and look up the pid. > I don't like playing pseudo cryptographic games, we are not crypto experts so we are bound to lose in this game. My thinking is the other way around - - encode FILEID_INO32_GEN with pid_nr + i_generation - pid_nr is obviously not unique across pidns and reusable but that makes it just like i_ino across filesystems - the resulting file handle is thus usable only in the pidns where it was encoded - is that a bad thing? Erin, You write that "To ensure file handles are invariant and can move between pid namespaces, we stash a pid from the initial namespace inside the file handle." Why is it a requirement for userspace that pidfs file handles are invariant to pidns? Thanks, Amir.
On 13/11/2024 09:01, Amir Goldstein wrote: > I don't like playing pseudo cryptographic games, we are not > crypto experts so we are bound to lose in this game. I agree. It would be one thing to obfusficate things in order to prevent userspace from relying upon something that's not ABI; it would be another to do so with the intent of hiding data. If we wanted to do that, we'd need to actually encrypt the PID (with e.g. AES-CTR(key, iv=inode_nr)) > My thinking is the other way around - > - encode FILEID_INO32_GEN with pid_nr + i_generation > - pid_nr is obviously not unique across pidns and reusable > but that makes it just like i_ino across filesystems > - the resulting file handle is thus usable only in the pidns where > it was encoded - is that a bad thing? > > Erin, > > You write that "To ensure file handles are invariant and can move > between pid namespaces, we stash a pid from the initial namespace > inside the file handle." > > Why is it a requirement for userspace that pidfs file handles are > invariant to pidns? I don't think it's a requirement, but I do think its useful - it is nice if a service inside a pidns can pass you a file handle and you can restore it and things are fine (consider also handles stored on the filesystem, as a better analog for PID files) But I too was uncertain about exposing root namespace PIDs to containers. I have no objections to limiting restore of file handles to the same pid ns - though I think we should defnitely document that such a limitation may be lifted in the future. - Erin
On Fri, Nov 01, 2024 at 01:54:52PM +0000, Erin Shepherd wrote: > This enables userspace to use name_to_handle_at to recover a pidfd > to a process. > > We stash the process' PID in the root pid namespace inside the handle, > and use that to recover the pid (validating that pid->ino matches the > value in the handle, i.e. that the pid has not been reused). > > We use the root namespace in order to ensure that file handles can be > moved across namespaces; however, we validate that the PID exists in > the current namespace before returning the inode. > > Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu> > --- > fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 43 insertions(+), 7 deletions(-) > > diff --git a/fs/pidfs.c b/fs/pidfs.c > index c8e7e9011550..2d66610ef385 100644 > --- a/fs/pidfs.c > +++ b/fs/pidfs.c > @@ -348,23 +348,59 @@ static const struct dentry_operations pidfs_dentry_operations = { > .d_prune = stashed_dentry_prune, > }; > > -static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, > +#define PIDFD_FID_LEN 3 > + > +struct pidfd_fid { > + u64 ino; > + s32 pid; > +} __packed; > + > +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, > struct inode *parent) > { > struct pid *pid = inode->i_private; > - > - if (*max_len < 2) { > - *max_len = 2; > + struct pidfd_fid *fid = (struct pidfd_fid *)fh; > + > + if (*max_len < PIDFD_FID_LEN) { > + *max_len = PIDFD_FID_LEN; > return FILEID_INVALID; > } > > - *max_len = 2; > - *(u64 *)fh = pid->ino; > - return FILEID_KERNFS; > + fid->ino = pid->ino; > + fid->pid = pid_nr(pid); Hm, a pidfd comes in two flavours: (1) thread-group leader pidfd: pidfd_open(<pid>, 0) (2) thread pidfd: pidfd_open(<pid>, PIDFD_THREAD) In your current scheme fid->pid = pid_nr(pid) means that you always encode a pidfs file handle for a thread pidfd no matter if the provided pidfd was a thread-group leader pidfd or a thread pidfd. This is very likely wrong as it means users that use a thread-group pidfd get a thread-specific pid back. I think we need to encode (1) and (2) in the pidfs file handle so users always get back the correct type of pidfd. That very likely means name_to_handle_at() needs to encode this into the pidfs file handle. We need to think a bit how to do this as we need access to the file so we can tell (1) and (2) apart. It shouldn't be that big of a deal. For pidfds we don't need any path-based lookup anyway. IOW, AT_EMPTY_PATH is the only valid case. Starting with v6.13 we'll have getname_maybe_null() so access to the file is roughly: struct path path; struct filename *fname; unsigned in f_flags = 0; fname = getname_maybe_null(name, flag & AT_EMPTY_PATH); if (fname) { ret = filename_lookup(dfd, fname, lookup_flags, &path, NULL); if (ret) return ret; } else { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; path = fd_file(f)->f_path; if (pidfd_pid(fd_file(f)) f_flags = fd_file(f)->f_flags; path_get(&path); } and then a thread pidfd is reconginzable as f_flags & PIDFD_THREAD/O_EXCL. The question again is how to plumb this through to the export_operations encoding function. > + *max_len = PIDFD_FID_LEN; > + return FILEID_INO64_GEN; > +} > + > +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, > + struct fid *gen_fid, > + int fh_len, int fh_type) > +{ > + int ret; > + struct path path; > + struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid; > + struct pid *pid; > + > + if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN) > + return NULL; > + > + pid = find_get_pid_ns(fid->pid, &init_pid_ns); > + if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) { > + put_pid(pid); > + return NULL; > + } I think we can avoid the premature reference bump and do: scoped_guard(rcu) { struct pid *pid; pid = find_pid_ns(fid->pid, &init_pid_ns); if (!pid) return NULL; /* Did the pid get recycled? */ if (pid->ino != fid->ino) return NULL; /* Must be resolvable in the caller's pid namespace. */ if (pid_vnr(pid) == 0) return NULL; /* Ok, this is the pid we want. */ get_pid(pid); } > + > + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); > + if (ret < 0) > + return ERR_PTR(ret); > + > + mntput(path.mnt); > + return path.dentry; > } > > static const struct export_operations pidfs_export_operations = { > .encode_fh = pidfs_encode_fh, > + .fh_to_dentry = pidfs_fh_to_dentry, > }; > > static int pidfs_init_inode(struct inode *inode, void *data) > -- > 2.46.1 >
On Wed, Nov 13, 2024 at 11:11:47AM +0100, Erin Shepherd wrote: > On 13/11/2024 09:01, Amir Goldstein wrote: > > > I don't like playing pseudo cryptographic games, we are not > > crypto experts so we are bound to lose in this game. > > I agree. It would be one thing to obfusficate things in order to prevent > userspace from relying upon something that's not ABI; it would be another > to do so with the intent of hiding data. If we wanted to do that, we'd > need to actually encrypt the PID (with e.g. AES-CTR(key, iv=inode_nr)) > > > My thinking is the other way around - > > - encode FILEID_INO32_GEN with pid_nr + i_generation > > - pid_nr is obviously not unique across pidns and reusable > > but that makes it just like i_ino across filesystems > > - the resulting file handle is thus usable only in the pidns where > > it was encoded - is that a bad thing? > > > > Erin, > > > > You write that "To ensure file handles are invariant and can move > > between pid namespaces, we stash a pid from the initial namespace > > inside the file handle." > > > > Why is it a requirement for userspace that pidfs file handles are > > invariant to pidns? > > I don't think it's a requirement, but I do think its useful - it is nice if It kind of is though, no? Because you need a reliable way to decode the pidfs file handle to a struct pid. If one encodes pid numbers as seen from the encoders pid namespace the decoder has no way of knowing what pid namespace to resolve it in as the same pid number can obviously be present in multiple pid namespace. So not encoding the global pid number would be inherently ambiguous. > a service inside a pidns can pass you a file handle and you can restore it and > things are fine (consider also handles stored on the filesystem, as a better > analog for PID files) > > But I too was uncertain about exposing root namespace PIDs to containers. I > have no objections to limiting restore of file handles to the same pid ns - > though I think we should defnitely document that such a limitation may be > lifted in the future. The point is really just the provided pid needs to be resolvable in the pid namespace of the caller. Encoding a global pid number means that internally we can always resolve it as we know that we always encode pids in the init pid namespace. In a second step we can then verify that the struct pid we found based on the pid number is a member of the pid namespace hierarchy of the caller. If that is the case then the caller is allowed to get a pidfd from open_by_handle_at() as they would also be able to get a pidfd via pidfd_open(). So a container will never be able to a pidfd from a pid number that resolves to a struct pid that is outside its pid namespace hierarchy. Let me know if I misunderstood the concerns.
On 13/11/2024 13:09, Christian Brauner wrote: > Hm, a pidfd comes in two flavours: > > (1) thread-group leader pidfd: pidfd_open(<pid>, 0) > (2) thread pidfd: pidfd_open(<pid>, PIDFD_THREAD) > > In your current scheme fid->pid = pid_nr(pid) means that you always > encode a pidfs file handle for a thread pidfd no matter if the provided > pidfd was a thread-group leader pidfd or a thread pidfd. This is very > likely wrong as it means users that use a thread-group pidfd get a > thread-specific pid back. > > I think we need to encode (1) and (2) in the pidfs file handle so users > always get back the correct type of pidfd. > > That very likely means name_to_handle_at() needs to encode this into the > pidfs file handle. I guess a question here is whether a pidfd handle encodes a handle to a pid in a specific mode, or just to a pid in general? The thought had occurred to me while I was working on this initially, but I felt like perhaps treating it as a property of the file descriptor in general was better. Currently open_by_handle_at always returns a thread-group pidfd (since PIDFD_THREAD) isn't set, regardless of what type of pidfd you passed to name_to_handle_at. I had thought that PIDFD_THREAD/O_EXCL would have been passed through to f->f_flags on the restored pidfd, but upon checking I see that it gets filtered out in do_dentry_open. I feel like leaving it up to the caller of open_by_handle_at might be better (because they are probably better informed about whether they want poll() to inform them of thread or process exit) but I could lean either way. >> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, >> + struct fid *gen_fid, >> + int fh_len, int fh_type) >> +{ >> + int ret; >> + struct path path; >> + struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid; >> + struct pid *pid; >> + >> + if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN) >> + return NULL; >> + >> + pid = find_get_pid_ns(fid->pid, &init_pid_ns); >> + if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) { >> + put_pid(pid); >> + return NULL; >> + } > I think we can avoid the premature reference bump and do: > > scoped_guard(rcu) { > struct pid *pid; > > pid = find_pid_ns(fid->pid, &init_pid_ns); > if (!pid) > return NULL; > > /* Did the pid get recycled? */ > if (pid->ino != fid->ino) > return NULL; > > /* Must be resolvable in the caller's pid namespace. */ > if (pid_vnr(pid) == 0) > return NULL; > > /* Ok, this is the pid we want. */ > get_pid(pid); > } I can go with that if preferred. I was worried a bit about making the RCU critical section too large, but of course I'm sure there are much larger sections inside the kernel. >> + >> + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); >> + if (ret < 0) >> + return ERR_PTR(ret); >> + >> + mntput(path.mnt); >> + return path.dentry; >> } Similarly here i should probably refactor this into dentry_from_stashed in order to avoid a needless bump-then-drop of path.mnt's reference count
On Wed, Nov 13, 2024 at 02:06:56PM +0100, Erin Shepherd wrote: > On 13/11/2024 13:09, Christian Brauner wrote: > > > Hm, a pidfd comes in two flavours: > > > > (1) thread-group leader pidfd: pidfd_open(<pid>, 0) > > (2) thread pidfd: pidfd_open(<pid>, PIDFD_THREAD) > > > > In your current scheme fid->pid = pid_nr(pid) means that you always > > encode a pidfs file handle for a thread pidfd no matter if the provided > > pidfd was a thread-group leader pidfd or a thread pidfd. This is very > > likely wrong as it means users that use a thread-group pidfd get a > > thread-specific pid back. > > > > I think we need to encode (1) and (2) in the pidfs file handle so users > > always get back the correct type of pidfd. > > > > That very likely means name_to_handle_at() needs to encode this into the > > pidfs file handle. > > I guess a question here is whether a pidfd handle encodes a handle to a pid > in a specific mode, or just to a pid in general? The thought had occurred > to me while I was working on this initially, but I felt like perhaps treating > it as a property of the file descriptor in general was better. > > Currently open_by_handle_at always returns a thread-group pidfd (since > PIDFD_THREAD) isn't set, regardless of what type of pidfd you passed to > name_to_handle_at. I had thought that PIDFD_THREAD/O_EXCL would have been I don't think you're returning a thread-groupd pidfd from open_by_handle_at() in your scheme. After all you're encoding the tid in pid_nr() so you'll always find the struct pid for the thread afaict. If I'm wrong could you please explain how you think this works? I might just be missing something obvious. > passed through to f->f_flags on the restored pidfd, but upon checking I see that > it gets filtered out in do_dentry_open. It does, but note that __pidfd_prepare() raises it explicitly on the file afterwards. So it works fine. > > I feel like leaving it up to the caller of open_by_handle_at might be better > (because they are probably better informed about whether they want poll() to > inform them of thread or process exit) but I could lean either way. So in order to decode a pidfs file handle you want the caller to have to specify O_EXCL in the flags argument of open_by_handle_at()? Is that your idea? > > >> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, > >> + struct fid *gen_fid, > >> + int fh_len, int fh_type) > >> +{ > >> + int ret; > >> + struct path path; > >> + struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid; > >> + struct pid *pid; > >> + > >> + if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN) > >> + return NULL; > >> + > >> + pid = find_get_pid_ns(fid->pid, &init_pid_ns); > >> + if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) { > >> + put_pid(pid); > >> + return NULL; > >> + } > > I think we can avoid the premature reference bump and do: > > > > scoped_guard(rcu) { > > struct pid *pid; > > > > pid = find_pid_ns(fid->pid, &init_pid_ns); > > if (!pid) > > return NULL; > > > > /* Did the pid get recycled? */ > > if (pid->ino != fid->ino) > > return NULL; > > > > /* Must be resolvable in the caller's pid namespace. */ > > if (pid_vnr(pid) == 0) > > return NULL; > > > > /* Ok, this is the pid we want. */ > > get_pid(pid); > > } > > I can go with that if preferred. I was worried a bit about making the RCU > critical section too large, but of course I'm sure there are much larger > sections inside the kernel. This is perfectly fine. Don't worry about it. > > >> + > >> + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); > >> + if (ret < 0) > >> + return ERR_PTR(ret); > >> + > >> + mntput(path.mnt); > >> + return path.dentry; > >> } > > Similarly here i should probably refactor this into dentry_from_stashed in > order to avoid a needless bump-then-drop of path.mnt's reference count No, what you have now is fine. I wouldn't add a specific helper for this. In contrast to the pid the pidfs mount never goes away.
On 13/11/2024 14:26, Christian Brauner wrote: > On Wed, Nov 13, 2024 at 02:06:56PM +0100, Erin Shepherd wrote: >> On 13/11/2024 13:09, Christian Brauner wrote: >> >>> Hm, a pidfd comes in two flavours: >>> >>> (1) thread-group leader pidfd: pidfd_open(<pid>, 0) >>> (2) thread pidfd: pidfd_open(<pid>, PIDFD_THREAD) >>> >>> In your current scheme fid->pid = pid_nr(pid) means that you always >>> encode a pidfs file handle for a thread pidfd no matter if the provided >>> pidfd was a thread-group leader pidfd or a thread pidfd. This is very >>> likely wrong as it means users that use a thread-group pidfd get a >>> thread-specific pid back. >>> >>> I think we need to encode (1) and (2) in the pidfs file handle so users >>> always get back the correct type of pidfd. >>> >>> That very likely means name_to_handle_at() needs to encode this into the >>> pidfs file handle. >> I guess a question here is whether a pidfd handle encodes a handle to a pid >> in a specific mode, or just to a pid in general? The thought had occurred >> to me while I was working on this initially, but I felt like perhaps treating >> it as a property of the file descriptor in general was better. >> >> Currently open_by_handle_at always returns a thread-group pidfd (since >> PIDFD_THREAD) isn't set, regardless of what type of pidfd you passed to >> name_to_handle_at. I had thought that PIDFD_THREAD/O_EXCL would have been > I don't think you're returning a thread-groupd pidfd from > open_by_handle_at() in your scheme. After all you're encoding the tid in > pid_nr() so you'll always find the struct pid for the thread afaict. If > I'm wrong could you please explain how you think this works? I might > just be missing something obvious. Moudlo namespaces, the pid in fid->pid is the same one passed to pidfd_open(). In the root namespace, you could replace name_to_handle_at(...) with pidfd_open(fid->pid, 0) and get the same result (if both are successful, at least). The resulting pidfd points to the same struct pid. The only thing that should differ is whether PIDFD_THREAD is set in f->f_flags. >> I feel like leaving it up to the caller of open_by_handle_at might be better >> (because they are probably better informed about whether they want poll() to >> inform them of thread or process exit) but I could lean either way. > So in order to decode a pidfs file handle you want the caller to have to > specify O_EXCL in the flags argument of open_by_handle_at()? Is that > your idea? If they want a PIDFD_THREAD pidfd, yes. I see it as similar to O_RDONLY, where its a flag that applies to the file descriptor but not to the underlying file. While ideally we'd implement it from an API completeness perspective, practically I'm not sure how often the option would ever be used. While there are hundreds of reasons why you might want to track the state of another process, I struggle to think of cases where Process A needs to track Process B's threads besides a debugger (and a debugger is probably better off using ptrace), and it can happily track its own threads by just holding onto the pidfd.
On Wed, Nov 13, 2024 at 02:48:43PM +0100, Erin Shepherd wrote: > On 13/11/2024 14:26, Christian Brauner wrote: > > > On Wed, Nov 13, 2024 at 02:06:56PM +0100, Erin Shepherd wrote: > >> On 13/11/2024 13:09, Christian Brauner wrote: > >> > >>> Hm, a pidfd comes in two flavours: > >>> > >>> (1) thread-group leader pidfd: pidfd_open(<pid>, 0) > >>> (2) thread pidfd: pidfd_open(<pid>, PIDFD_THREAD) > >>> > >>> In your current scheme fid->pid = pid_nr(pid) means that you always > >>> encode a pidfs file handle for a thread pidfd no matter if the provided > >>> pidfd was a thread-group leader pidfd or a thread pidfd. This is very > >>> likely wrong as it means users that use a thread-group pidfd get a > >>> thread-specific pid back. > >>> > >>> I think we need to encode (1) and (2) in the pidfs file handle so users > >>> always get back the correct type of pidfd. > >>> > >>> That very likely means name_to_handle_at() needs to encode this into the > >>> pidfs file handle. > >> I guess a question here is whether a pidfd handle encodes a handle to a pid > >> in a specific mode, or just to a pid in general? The thought had occurred > >> to me while I was working on this initially, but I felt like perhaps treating > >> it as a property of the file descriptor in general was better. > >> > >> Currently open_by_handle_at always returns a thread-group pidfd (since > >> PIDFD_THREAD) isn't set, regardless of what type of pidfd you passed to > >> name_to_handle_at. I had thought that PIDFD_THREAD/O_EXCL would have been > > I don't think you're returning a thread-groupd pidfd from > > open_by_handle_at() in your scheme. After all you're encoding the tid in > > pid_nr() so you'll always find the struct pid for the thread afaict. If > > I'm wrong could you please explain how you think this works? I might > > just be missing something obvious. > > Moudlo namespaces, the pid in fid->pid is the same one passed to pidfd_open(). > In the root namespace, you could replace name_to_handle_at(...) with > pidfd_open(fid->pid, 0) and get the same result (if both are successful, at least). > > The resulting pidfd points to the same struct pid. The only thing that should differ > is whether PIDFD_THREAD is set in f->f_flags. I see what you mean but then there's another problem afaict. Two cases: (1) @pidfd_thread_group = pidfd_open(1234, 0) The pidfd_open() will succeed if the struct pid that 1234 resolves to is used as a thread-group leader. (2) @pidfd_thread = pidfd_open(5678, PIDFD_THREAD) The pidfd_open() will succeed even if the struct pid that 5678 resolves to isn't used as a thread-group leader. The resulting struct file will be marked as being a thread pidfd by raising O_EXCL. (1') If (1) is passed to name_to_handle_at() a pidfs file handle is encoded for 1234. If later open_by_hande_at() is called then by default a thread-group leader pidfd is created. This is fine (2') If (2) is passed to name_to_handle_at() a pidfs file handle is encoded for 5678. If later open_by_handle_at() is called then a thread-group leader pidfd will be created again. So in (2') the caller has managed to create a thread-group leader pidfd even though the struct pid isn't used as a thread-group leader pidfd. Consequently, that pidfd is useless when passed to any of the pidfd_*() system calls. So basically, you need to verify that if O_EXCL isn't specified with open_by_handle_at() that the struct pid that is resolved is used as a thread-group leader and if not, refuse to create a pidfd. Am I making sense? > > >> I feel like leaving it up to the caller of open_by_handle_at might be better > >> (because they are probably better informed about whether they want poll() to > >> inform them of thread or process exit) but I could lean either way. > > So in order to decode a pidfs file handle you want the caller to have to > > specify O_EXCL in the flags argument of open_by_handle_at()? Is that > > your idea? > > If they want a PIDFD_THREAD pidfd, yes. I see it as similar to O_RDONLY, where its a > flag that applies to the file descriptor but not to the underlying file. This is probably fine. > > While ideally we'd implement it from an API completeness perspective, practically I'm > not sure how often the option would ever be used. While there are hundreds of reasons > why you might want to track the state of another process, I struggle to think of cases > where Process A needs to track Process B's threads besides a debugger (and a debugger > is probably better off using ptrace), and it can happily track its own threads by just > holding onto the pidfd. We recently imlemented PIDFD_THREAD support because it is used inside Netflix. I forgot the details thought tbh. So it's actually used. We only implemented it once people requested it.
On 14/11/2024 11:29, Christian Brauner wrote: >> Moudlo namespaces, the pid in fid->pid is the same one passed to pidfd_open(). >> In the root namespace, you could replace name_to_handle_at(...) with >> pidfd_open(fid->pid, 0) and get the same result (if both are successful, at least). >> >> The resulting pidfd points to the same struct pid. The only thing that should differ >> is whether PIDFD_THREAD is set in f->f_flags. > I see what you mean but then there's another problem afaict. > > Two cases: > > (1) @pidfd_thread_group = pidfd_open(1234, 0) > > The pidfd_open() will succeed if the struct pid that 1234 resolves > to is used as a thread-group leader. > > (2) @pidfd_thread = pidfd_open(5678, PIDFD_THREAD) > > The pidfd_open() will succeed even if the struct pid that 5678 > resolves to isn't used as a thread-group leader. > > The resulting struct file will be marked as being a thread pidfd by > raising O_EXCL. > > (1') If (1) is passed to name_to_handle_at() a pidfs file handle is > encoded for 1234. If later open_by_hande_at() is called then by > default a thread-group leader pidfd is created. This is fine > > (2') If (2) is passed to name_to_handle_at() a pidfs file handle is > encoded for 5678. If later open_by_handle_at() is called then a > thread-group leader pidfd will be created again. > > So in (2') the caller has managed to create a thread-group leader pidfd > even though the struct pid isn't used as a thread-group leader pidfd. > Consequently, that pidfd is useless when passed to any of the pidfd_*() > system calls. > > So basically, you need to verify that if O_EXCL isn't specified with > open_by_handle_at() that the struct pid that is resolved is used as a > thread-group leader and if not, refuse to create a pidfd. > > Am I making sense? Ah, I fully see what you mean now. I could implement pidfs_file_operations.open and check the flags there, but that runs into the issue of vfs_open resetting the flags afterwards so its entirely pointless. If PIDFD_THREAD wasn't in the set O_CREAT / O_EXCL / O_NOCTTY / O_TRUNC then this would be much easier, but alas; and its ABI now too. I guess the options are 1. Let an FS specify that it doesn't want O_EXCL cleared, but this is getting to be some gnarly VFS surgery, or 2. We just detect we're working on a pidfd early in open_by_handle_at and skip straight into dedicated logic. I know you suggested (2) earlier and I increasingly think you're right about it being the best approach. It also fits better with the special casing PIDFD_SELF will want when that lands. So I'll see what an implementation with that approach looks like. >> If they want a PIDFD_THREAD pidfd, yes. I see it as similar to O_RDONLY, where its a >> flag that applies to the file descriptor but not to the underlying file. > This is probably fine. >> While ideally we'd implement it from an API completeness perspective, practically I'm >> not sure how often the option would ever be used. While there are hundreds of reasons >> why you might want to track the state of another process, I struggle to think of cases >> where Process A needs to track Process B's threads besides a debugger (and a debugger >> is probably better off using ptrace), and it can happily track its own threads by just >> holding onto the pidfd. > We recently imlemented PIDFD_THREAD support because it is used inside > Netflix. I forgot the details thought tbh. So it's actually used. We > only implemented it once people requested it. Oh, I entirely understand the utility of PIDFD_THREAD - I'm just not sure how mnay of those use cases are cross-process (and in the cases where they are cross process, how many of those uses would benefit from file handles vs fd passing)
diff --git a/fs/pidfs.c b/fs/pidfs.c index c8e7e9011550..2d66610ef385 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -348,23 +348,59 @@ static const struct dentry_operations pidfs_dentry_operations = { .d_prune = stashed_dentry_prune, }; -static int pidfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, +#define PIDFD_FID_LEN 3 + +struct pidfd_fid { + u64 ino; + s32 pid; +} __packed; + +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, struct inode *parent) { struct pid *pid = inode->i_private; - - if (*max_len < 2) { - *max_len = 2; + struct pidfd_fid *fid = (struct pidfd_fid *)fh; + + if (*max_len < PIDFD_FID_LEN) { + *max_len = PIDFD_FID_LEN; return FILEID_INVALID; } - *max_len = 2; - *(u64 *)fh = pid->ino; - return FILEID_KERNFS; + fid->ino = pid->ino; + fid->pid = pid_nr(pid); + *max_len = PIDFD_FID_LEN; + return FILEID_INO64_GEN; +} + +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, + struct fid *gen_fid, + int fh_len, int fh_type) +{ + int ret; + struct path path; + struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid; + struct pid *pid; + + if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN) + return NULL; + + pid = find_get_pid_ns(fid->pid, &init_pid_ns); + if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0) { + put_pid(pid); + return NULL; + } + + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); + if (ret < 0) + return ERR_PTR(ret); + + mntput(path.mnt); + return path.dentry; } static const struct export_operations pidfs_export_operations = { .encode_fh = pidfs_encode_fh, + .fh_to_dentry = pidfs_fh_to_dentry, }; static int pidfs_init_inode(struct inode *inode, void *data)
This enables userspace to use name_to_handle_at to recover a pidfd to a process. We stash the process' PID in the root pid namespace inside the handle, and use that to recover the pid (validating that pid->ino matches the value in the handle, i.e. that the pid has not been reused). We use the root namespace in order to ensure that file handles can be moved across namespaces; however, we validate that the PID exists in the current namespace before returning the inode. Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu> --- fs/pidfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-)