diff mbox series

[RFC,06/10] pidfs: allow to retrieve exit information

Message ID 20250228-work-pidfs-kill_on_last_close-v1-6-5bd7e6bb428e@kernel.org (mailing list archive)
State New
Headers show
Series pidfs: provide information after task has been reaped | expand

Commit Message

Christian Brauner Feb. 28, 2025, 12:44 p.m. UTC
Some tools like systemd's jounral need to retrieve the exit and cgroup
information after a process has already been reaped. This can e.g.,
happen when retrieving a pidfd via SCM_PIDFD or SCM_PEERPIDFD.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c                 | 70 +++++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/pidfd.h |  3 +-
 2 files changed, 59 insertions(+), 14 deletions(-)

Comments

Mike Yuan March 2, 2025, 2:40 a.m. UTC | #1
On 2/28/25 13:44, Christian Brauner <brauner@kernel.org> wrote:

>  Some tools like systemd's jounral need to retrieve the exit and cgroup
>  information after a process has already been reaped. This can e.g.,
>  happen when retrieving a pidfd via SCM_PIDFD or SCM_PEERPIDFD.
>  
>  Signed-off-by: Christian Brauner <brauner@kernel.org>
>  ---
>   fs/pidfs.c                 | 70 +++++++++++++++++++++++++++++++++++++---------
>   include/uapi/linux/pidfd.h |  3 +-
>   2 files changed, 59 insertions(+), 14 deletions(-)
>  
>  diff --git a/fs/pidfs.c b/fs/pidfs.c
>  index 433f676c066c..e500bc4c5af2 100644
>  --- a/fs/pidfs.c
>  +++ b/fs/pidfs.c
>  @@ -32,11 +32,12 @@ static struct kmem_cache *pidfs_cachep __ro_after_init;
>    */
>   struct pidfs_exit_info {
>   	__u64 cgroupid;
>  -	__u64 exit_code;
>  +	__s32 exit_code;
>   };
>  
>   struct pidfs_inode {
>  -	struct pidfs_exit_info exit_info;
>  +	struct pidfs_exit_info __pei;
>  +	struct pidfs_exit_info *exit_info;
>   	struct inode vfs_inode;
>   };
>  
>  @@ -228,11 +229,14 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
>   	return poll_flags;
>   }
>  
>  -static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
>  +static long pidfd_info(struct file *file, struct task_struct *task,
>  +		       unsigned int cmd, unsigned long arg)
>   {
>   	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
>   	size_t usize = _IOC_SIZE(cmd);
>   	struct pidfd_info kinfo = {};
>  +	struct pidfs_exit_info *exit_info;
>  +	struct inode *inode = file_inode(file);
>   	struct user_namespace *user_ns;
>   	const struct cred *c;
>   	__u64 mask;
>  @@ -248,6 +252,39 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
>   	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
>   		return -EFAULT;
>  
>  +	exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
>  +	if (exit_info) {
>  +		/*
>  +		 * TODO: Oleg, I didn't see a reason for putting
>  +		 * retrieval of the exit status of a task behind some
>  +		 * form of permission check. Maybe there's some
>  +		 * potential concerns with seeing the exit status of a
>  +		 * SIGKILLed suid binary or something but even then I'm
>  +		 * not sure that's a problem.
>  +		 *
>  +		 * If we want this we could put this behind some *uid
>  +		 * check similar to what ptrace access does by recording
>  +		 * parts of the creds we'd need for checking this. But
>  +		 * only if we really need it.
>  +		 */
>  +		kinfo.exit_code = exit_info->exit_code;
>  +#ifdef CONFIG_CGROUPS
>  +		kinfo.cgroupid = exit_info->cgroupid;
>  +		kinfo.mask |= PIDFD_INFO_EXIT | PIDFD_INFO_CGROUPID;
>  +#endif
>  +	}
>  +
>  +	/*
>  +	 * If the task has already been reaped only exit information
>  +	 * can be provided. It's entirely possible that the task has
>  +	 * already been reaped but we managed to grab a reference to it
>  +	 * before that. So a full set of information about @task doesn't
>  +	 * mean it hasn't been waited upon. Similarly, a full set of
>  +	 * information doesn't mean that the task hasn't already exited.
>  +	 */
>  +	if (!task)
>  +		goto copy_out;
>  +
>   	c = get_task_cred(task);
>   	if (!c)
>   		return -ESRCH;
>  @@ -267,11 +304,13 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
>   	put_cred(c);
>  
>   #ifdef CONFIG_CGROUPS
>  -	rcu_read_lock();
>  -	cgrp = task_dfl_cgroup(task);
>  -	kinfo.cgroupid = cgroup_id(cgrp);
>  -	kinfo.mask |= PIDFD_INFO_CGROUPID;
>  -	rcu_read_unlock();
>  +	if (!kinfo.cgroupid) {
>  +		rcu_read_lock();
>  +		cgrp = task_dfl_cgroup(task);
>  +		kinfo.cgroupid = cgroup_id(cgrp);
>  +		kinfo.mask |= PIDFD_INFO_CGROUPID;
>  +		rcu_read_unlock();
>  +	}
>   #endif
>  
>   	/*
>  @@ -291,6 +330,7 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
>   	if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
>   		return -ESRCH;
>  
>  +copy_out:
>   	/*
>   	 * If userspace and the kernel have the same struct size it can just
>   	 * be copied. If userspace provides an older struct, only the bits that
>  @@ -341,12 +381,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>   	}
>  
>   	task = get_pid_task(pid, PIDTYPE_PID);
>  -	if (!task)
>  -		return -ESRCH;

Hmm, this breaks our current assumption/assertion on the API in systemd (see pidfd_get_pid_ioctl() in basic/pidfd-util.c).
Moreover, it now imposes an inconsistency: if the pidfd refers to a process from foreign pidns, the current impl treats it as if the process didn't exist, and returns -ESRCH. Now a truly exited task deviates from that...

I'd prefer to retain the current behavior of returning -ESRCH unless PIDFD_INFO_EXIT is specified in mask, in which case it's then guaranteed that -ESRCH would never be seen. IOW the caller should be explicit on what they want, which feels semantically more reasonable to me and probably even simpler?

>  
>   	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
>   	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
>  -		return pidfd_info(task, cmd, arg);
>  +		return pidfd_info(file, task, cmd, arg);
>  +
>  +	if (!task)
>  +		return -ESRCH;
>  
>   	if (arg)
>   		return -EINVAL;
>  @@ -486,7 +527,7 @@ void pidfs_exit(struct task_struct *tsk)
>   		struct cgroup *cgrp;
>   #endif
>   		inode = d_inode(dentry);
>  -		exit_info = &pidfs_i(inode)->exit_info;
>  +		exit_info = &pidfs_i(inode)->__pei;
>  
>   		/* TODO: Annoy Oleg to tell me how to do this correctly. */
>   		if (tsk->signal->flags & SIGNAL_GROUP_EXIT)
>  @@ -501,6 +542,8 @@ void pidfs_exit(struct task_struct *tsk)
>   		rcu_read_unlock();
>   #endif
>  
>  +		/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
>  +		smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
>   		dput(dentry);
>   	}
>   }
>  @@ -568,7 +611,8 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
>   	if (!pi)
>   		return NULL;
>  
>  -	memset(&pi->exit_info, 0, sizeof(pi->exit_info));
>  +	memset(&pi->__pei, 0, sizeof(pi->__pei));
>  +	pi->exit_info = NULL;
>  
>   	return &pi->vfs_inode;
>   }
>  diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
>  index e0abd0b18841..e5966f1a7743 100644
>  --- a/include/uapi/linux/pidfd.h
>  +++ b/include/uapi/linux/pidfd.h
>  @@ -20,6 +20,7 @@
>   #define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
>   #define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
>   #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
>  +#define PIDFD_INFO_EXIT			(1UL << 3) /* Always returned if available, even if not requested */
>  
>   #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
>  
>  @@ -86,7 +87,7 @@ struct pidfd_info {
>   	__u32 sgid;
>   	__u32 fsuid;
>   	__u32 fsgid;
>  -	__u32 spare0[1];
>  +	__s32 exit_code;
>   };
>  
>   #define PIDFS_IOCTL_MAGIC 0xFF
>  
>  --
>  2.47.2
>  
>
Christian Brauner March 2, 2025, 12:33 p.m. UTC | #2
On Sun, Mar 02, 2025 at 02:40:16AM +0000, Mike Yuan wrote:
> On 2/28/25 13:44, Christian Brauner <brauner@kernel.org> wrote:
> 
> >  Some tools like systemd's jounral need to retrieve the exit and cgroup
> >  information after a process has already been reaped. This can e.g.,
> >  happen when retrieving a pidfd via SCM_PIDFD or SCM_PEERPIDFD.
> >  
> >  Signed-off-by: Christian Brauner <brauner@kernel.org>
> >  ---
> >   fs/pidfs.c                 | 70 +++++++++++++++++++++++++++++++++++++---------
> >   include/uapi/linux/pidfd.h |  3 +-
> >   2 files changed, 59 insertions(+), 14 deletions(-)
> >  
> >  diff --git a/fs/pidfs.c b/fs/pidfs.c
> >  index 433f676c066c..e500bc4c5af2 100644
> >  --- a/fs/pidfs.c
> >  +++ b/fs/pidfs.c
> >  @@ -32,11 +32,12 @@ static struct kmem_cache *pidfs_cachep __ro_after_init;
> >    */
> >   struct pidfs_exit_info {
> >   	__u64 cgroupid;
> >  -	__u64 exit_code;
> >  +	__s32 exit_code;
> >   };
> >  
> >   struct pidfs_inode {
> >  -	struct pidfs_exit_info exit_info;
> >  +	struct pidfs_exit_info __pei;
> >  +	struct pidfs_exit_info *exit_info;
> >   	struct inode vfs_inode;
> >   };
> >  
> >  @@ -228,11 +229,14 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
> >   	return poll_flags;
> >   }
> >  
> >  -static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
> >  +static long pidfd_info(struct file *file, struct task_struct *task,
> >  +		       unsigned int cmd, unsigned long arg)
> >   {
> >   	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
> >   	size_t usize = _IOC_SIZE(cmd);
> >   	struct pidfd_info kinfo = {};
> >  +	struct pidfs_exit_info *exit_info;
> >  +	struct inode *inode = file_inode(file);
> >   	struct user_namespace *user_ns;
> >   	const struct cred *c;
> >   	__u64 mask;
> >  @@ -248,6 +252,39 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
> >   	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
> >   		return -EFAULT;
> >  
> >  +	exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
> >  +	if (exit_info) {
> >  +		/*
> >  +		 * TODO: Oleg, I didn't see a reason for putting
> >  +		 * retrieval of the exit status of a task behind some
> >  +		 * form of permission check. Maybe there's some
> >  +		 * potential concerns with seeing the exit status of a
> >  +		 * SIGKILLed suid binary or something but even then I'm
> >  +		 * not sure that's a problem.
> >  +		 *
> >  +		 * If we want this we could put this behind some *uid
> >  +		 * check similar to what ptrace access does by recording
> >  +		 * parts of the creds we'd need for checking this. But
> >  +		 * only if we really need it.
> >  +		 */
> >  +		kinfo.exit_code = exit_info->exit_code;
> >  +#ifdef CONFIG_CGROUPS
> >  +		kinfo.cgroupid = exit_info->cgroupid;
> >  +		kinfo.mask |= PIDFD_INFO_EXIT | PIDFD_INFO_CGROUPID;
> >  +#endif
> >  +	}
> >  +
> >  +	/*
> >  +	 * If the task has already been reaped only exit information
> >  +	 * can be provided. It's entirely possible that the task has
> >  +	 * already been reaped but we managed to grab a reference to it
> >  +	 * before that. So a full set of information about @task doesn't
> >  +	 * mean it hasn't been waited upon. Similarly, a full set of
> >  +	 * information doesn't mean that the task hasn't already exited.
> >  +	 */
> >  +	if (!task)
> >  +		goto copy_out;
> >  +
> >   	c = get_task_cred(task);
> >   	if (!c)
> >   		return -ESRCH;
> >  @@ -267,11 +304,13 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
> >   	put_cred(c);
> >  
> >   #ifdef CONFIG_CGROUPS
> >  -	rcu_read_lock();
> >  -	cgrp = task_dfl_cgroup(task);
> >  -	kinfo.cgroupid = cgroup_id(cgrp);
> >  -	kinfo.mask |= PIDFD_INFO_CGROUPID;
> >  -	rcu_read_unlock();
> >  +	if (!kinfo.cgroupid) {
> >  +		rcu_read_lock();
> >  +		cgrp = task_dfl_cgroup(task);
> >  +		kinfo.cgroupid = cgroup_id(cgrp);
> >  +		kinfo.mask |= PIDFD_INFO_CGROUPID;
> >  +		rcu_read_unlock();
> >  +	}
> >   #endif
> >  
> >   	/*
> >  @@ -291,6 +330,7 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
> >   	if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
> >   		return -ESRCH;
> >  
> >  +copy_out:
> >   	/*
> >   	 * If userspace and the kernel have the same struct size it can just
> >   	 * be copied. If userspace provides an older struct, only the bits that
> >  @@ -341,12 +381,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> >   	}
> >  
> >   	task = get_pid_task(pid, PIDTYPE_PID);
> >  -	if (!task)
> >  -		return -ESRCH;
> 
> Hmm, this breaks our current assumption/assertion on the API in
> systemd (see pidfd_get_pid_ioctl() in basic/pidfd-util.c).
> Moreover, it now imposes an inconsistency: if the pidfd refers to a
> process from foreign pidns, the current impl treats it as if the
> process didn't exist, and returns -ESRCH. Now a truly exited task
> deviates from that...

Thanks for spotting that. It should not be possible to retrieve
PIDFD_INFO_EXIT if the dead task is outside the pid namespace hierarchy.
That's easy to handle though.

> 
> I'd prefer to retain the current behavior of returning -ESRCH unless
> PIDFD_INFO_EXIT is specified in mask, in which case it's then
> guaranteed that -ESRCH would never be seen. IOW the caller should be
> explicit on what they want, which feels semantically more reasonable
> to me and probably even simpler?

Sure.

> 
> >  
> >   	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
> >   	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
> >  -		return pidfd_info(task, cmd, arg);
> >  +		return pidfd_info(file, task, cmd, arg);
> >  +
> >  +	if (!task)
> >  +		return -ESRCH;
> >  
> >   	if (arg)
> >   		return -EINVAL;
> >  @@ -486,7 +527,7 @@ void pidfs_exit(struct task_struct *tsk)
> >   		struct cgroup *cgrp;
> >   #endif
> >   		inode = d_inode(dentry);
> >  -		exit_info = &pidfs_i(inode)->exit_info;
> >  +		exit_info = &pidfs_i(inode)->__pei;
> >  
> >   		/* TODO: Annoy Oleg to tell me how to do this correctly. */
> >   		if (tsk->signal->flags & SIGNAL_GROUP_EXIT)
> >  @@ -501,6 +542,8 @@ void pidfs_exit(struct task_struct *tsk)
> >   		rcu_read_unlock();
> >   #endif
> >  
> >  +		/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
> >  +		smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
> >   		dput(dentry);
> >   	}
> >   }
> >  @@ -568,7 +611,8 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
> >   	if (!pi)
> >   		return NULL;
> >  
> >  -	memset(&pi->exit_info, 0, sizeof(pi->exit_info));
> >  +	memset(&pi->__pei, 0, sizeof(pi->__pei));
> >  +	pi->exit_info = NULL;
> >  
> >   	return &pi->vfs_inode;
> >   }
> >  diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> >  index e0abd0b18841..e5966f1a7743 100644
> >  --- a/include/uapi/linux/pidfd.h
> >  +++ b/include/uapi/linux/pidfd.h
> >  @@ -20,6 +20,7 @@
> >   #define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
> >   #define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
> >   #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
> >  +#define PIDFD_INFO_EXIT			(1UL << 3) /* Always returned if available, even if not requested */
> >  
> >   #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
> >  
> >  @@ -86,7 +87,7 @@ struct pidfd_info {
> >   	__u32 sgid;
> >   	__u32 fsuid;
> >   	__u32 fsgid;
> >  -	__u32 spare0[1];
> >  +	__s32 exit_code;
> >   };
> >  
> >   #define PIDFS_IOCTL_MAGIC 0xFF
> >  
> >  --
> >  2.47.2
> >  
> >
Oleg Nesterov March 2, 2025, 3:53 p.m. UTC | #3
On 02/28, Christian Brauner wrote:
>
> Some tools like systemd's jounral need to retrieve the exit and cgroup
> information after a process has already been reaped.
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

But unless I am totally confused do_exit() calls pidfd_exit() even
before exit_notify(), the exiting task is not even zombie yet. It
will reaped only when it passes exit_notify() and its parent does
wait().



And what about the multi-threaded case? Suppose the main thread
does sys_exit(0) and it has alive sub-threads.

In this case pidfd_info() will report kinfo.exit_code = 0.
And this is probably fine if (file->f_flags & PIDFD_THREAD) != 0.

But what if this file was created without PIDFD_THREAD? If another
thread does exit_group(1) after that, the process's exit code is
1 << 8, but it can't be retrieved.



Finally, sys_execve(). Suppose we have a main thread L and a
sub-thread T.

T execs and kill the leader L. L exits and populates
pidfs_i(inode)->exit_info.

T calls exchange_tids() in de_thread() and becomes the new leader
with the same (old) pid.

Now, T is very much alive, but pidfs_i(inode)->exit_info != NULL.

Or I am totally confused?



> +	exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
> +	if (exit_info) {
> +		/*
> +		 * TODO: Oleg, I didn't see a reason for putting
> +		 * retrieval of the exit status of a task behind some
> +		 * form of permission check.

Neither me.

Oleg.
Christian Brauner March 2, 2025, 4:29 p.m. UTC | #4
On Sun, Mar 02, 2025 at 04:53:46PM +0100, Oleg Nesterov wrote:
> On 02/28, Christian Brauner wrote:
> >
> > Some tools like systemd's jounral need to retrieve the exit and cgroup
> > information after a process has already been reaped.
>               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> But unless I am totally confused do_exit() calls pidfd_exit() even
> before exit_notify(), the exiting task is not even zombie yet. It
> will reaped only when it passes exit_notify() and its parent does
> wait().

The overall goal is that it's possible to retrieve exit status and
cgroupid even if the task has already been reaped.

It's intentionally placed before exit_notify(), i.e., before the task is
a zombie because exit_notify() wakes pidfd-pollers. Ideally, pidfd
pollers would be woken and then could use the PIDFD_GET_INFO ioctl to
retrieve the exit status.

It would however be fine to place it into exit_notify() if it's a better
fit there. If you have a preference let me know.

I don't see a reason why seeing the exit status before that would be an
issue. The only downside would be that some other task that just keeps
ioctl()ing in a loop would possible see the exit status before the
parent does. But I didn't think this would be a big issue.

> And what about the multi-threaded case? Suppose the main thread
> does sys_exit(0) and it has alive sub-threads.
> 
> In this case pidfd_info() will report kinfo.exit_code = 0.
> And this is probably fine if (file->f_flags & PIDFD_THREAD) != 0.

Yes.

> But what if this file was created without PIDFD_THREAD? If another
> thread does exit_group(1) after that, the process's exit code is
> 1 << 8, but it can't be retrieved.

Yes, I had raised that in an off-list discussion about this as well and
was unsure what the cleanest way of dealing with this would be.

My initial approach had been to not just have:

struct pidfs_exit_info {
        __u64 cgroupid;
        __s32 exit_code;
};


but to have:

struct pidfs_exit_info {
        __u64 cgroupid;
        __s32 exit_code;
	__u64 tg_cgroupid;
	__s32 tg_exit_code;
};

so that it would be possible to retrieve either depending on the type of
pidfd. Is that feasible?

> Finally, sys_execve(). Suppose we have a main thread L and a
> sub-thread T.
> 
> T execs and kill the leader L. L exits and populates
> pidfs_i(inode)->exit_info.
> 
> T calls exchange_tids() in de_thread() and becomes the new leader
> with the same (old) pid.
> 
> Now, T is very much alive, but pidfs_i(inode)->exit_info != NULL.

Yes, de_thread() is a good point. That nasty wrinkly I had really
ignored^wforgotten. We should not report an exit status in this case. I
think that's what you're agreeing with as well?

> Or I am totally confused?

No, you're right!

What's the best way of handling the de_thread() case? Would moving this
into exit_notify() be enough where we also handle
PIDFD_THREAD/~PIDFD_THREAD waking?

Thanks for the review!

> 
> 
> 
> > +	exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
> > +	if (exit_info) {
> > +		/*
> > +		 * TODO: Oleg, I didn't see a reason for putting
> > +		 * retrieval of the exit status of a task behind some
> > +		 * form of permission check.
> 
> Neither me.
> 
> Oleg.
>
Oleg Nesterov March 2, 2025, 5:21 p.m. UTC | #5
On 03/02, Christian Brauner wrote:
>
> On Sun, Mar 02, 2025 at 04:53:46PM +0100, Oleg Nesterov wrote:
> > On 02/28, Christian Brauner wrote:
> > >
> > > Some tools like systemd's jounral need to retrieve the exit and cgroup
> > > information after a process has already been reaped.
> >               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >
> > But unless I am totally confused do_exit() calls pidfd_exit() even
> > before exit_notify(), the exiting task is not even zombie yet. It
> > will reaped only when it passes exit_notify() and its parent does
> > wait().
>
> The overall goal is that it's possible to retrieve exit status and
> cgroupid even if the task has already been reaped.

OK, please see below...

> It's intentionally placed before exit_notify(), i.e., before the task is
> a zombie because exit_notify() wakes pidfd-pollers. Ideally, pidfd
> pollers would be woken and then could use the PIDFD_GET_INFO ioctl to
> retrieve the exit status.

This was more a less clear to me. But this doesn't match the "the task has
already been reaped" goal above...

> It would however be fine to place it into exit_notify() if it's a better
> fit there. If you have a preference let me know.
>
> I don't see a reason why seeing the exit status before that would be an
> issue.

The problem is that it is not clear how can we do this correctly.
Especialy considering the problem with exec...

> > But what if this file was created without PIDFD_THREAD? If another
> > thread does exit_group(1) after that, the process's exit code is
> > 1 << 8, but it can't be retrieved.
>
> Yes, I had raised that in an off-list discussion about this as well and
> was unsure what the cleanest way of dealing with this would be.

I am not sure too, but again, please see below.

> > Now, T is very much alive, but pidfs_i(inode)->exit_info != NULL.

...

> What's the best way of handling the de_thread() case? Would moving this
> into exit_notify() be enough where we also handle
> PIDFD_THREAD/~PIDFD_THREAD waking?

I don't think that moving pidfd_exit() into exit_notify() can solve any
problem.

But what if we move pidfd_exit() into release_task() paths? Called when
the task is reaped by the parent/debugger, or if a sub-thread auto-reaps.

Can the users of pidfd_info(PIDFD_INFO_EXIT) rely on POLLHUP from
release_task() -> detach_pid() -> __change_pid(new => NULL) ?

Oleg.
Christian Brauner March 2, 2025, 6:56 p.m. UTC | #6
On Sun, Mar 02, 2025 at 06:21:49PM +0100, Oleg Nesterov wrote:
> On 03/02, Christian Brauner wrote:
> >
> > On Sun, Mar 02, 2025 at 04:53:46PM +0100, Oleg Nesterov wrote:
> > > On 02/28, Christian Brauner wrote:
> > > >
> > > > Some tools like systemd's jounral need to retrieve the exit and cgroup
> > > > information after a process has already been reaped.
> > >               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > >
> > > But unless I am totally confused do_exit() calls pidfd_exit() even
> > > before exit_notify(), the exiting task is not even zombie yet. It
> > > will reaped only when it passes exit_notify() and its parent does
> > > wait().
> >
> > The overall goal is that it's possible to retrieve exit status and
> > cgroupid even if the task has already been reaped.
> 
> OK, please see below...
> 
> > It's intentionally placed before exit_notify(), i.e., before the task is
> > a zombie because exit_notify() wakes pidfd-pollers. Ideally, pidfd
> > pollers would be woken and then could use the PIDFD_GET_INFO ioctl to
> > retrieve the exit status.
> 
> This was more a less clear to me. But this doesn't match the "the task has
> already been reaped" goal above...
> 
> > It would however be fine to place it into exit_notify() if it's a better
> > fit there. If you have a preference let me know.
> >
> > I don't see a reason why seeing the exit status before that would be an
> > issue.
> 
> The problem is that it is not clear how can we do this correctly.
> Especialy considering the problem with exec...
> 
> > > But what if this file was created without PIDFD_THREAD? If another
> > > thread does exit_group(1) after that, the process's exit code is
> > > 1 << 8, but it can't be retrieved.
> >
> > Yes, I had raised that in an off-list discussion about this as well and
> > was unsure what the cleanest way of dealing with this would be.
> 
> I am not sure too, but again, please see below.
> 
> > > Now, T is very much alive, but pidfs_i(inode)->exit_info != NULL.
> 
> ...
> 
> > What's the best way of handling the de_thread() case? Would moving this
> > into exit_notify() be enough where we also handle
> > PIDFD_THREAD/~PIDFD_THREAD waking?
> 
> I don't think that moving pidfd_exit() into exit_notify() can solve any
> problem.
> 
> But what if we move pidfd_exit() into release_task() paths? Called when
> the task is reaped by the parent/debugger, or if a sub-thread auto-reaps.
> 
> Can the users of pidfd_info(PIDFD_INFO_EXIT) rely on POLLHUP from
> release_task() -> detach_pid() -> __change_pid(new => NULL) ?

Ok, so:

release_task()
-> __exit_signal()
   -> detach_pid()
      -> __change_pid()

That sounds good. So could we do something like:

diff --git a/kernel/exit.c b/kernel/exit.c
index cae475e7858c..66bb5c53454f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -127,8 +127,10 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 {
        nr_threads--;
        detach_pid(p, PIDTYPE_PID);
+       pidfs_exit(p); // record exit information for individual thread
        if (group_dead) {
                detach_pid(p, PIDTYPE_TGID);
+               pidfs_exit(p); // record exit information for thread-group leader
                detach_pid(p, PIDTYPE_PGID);
                detach_pid(p, PIDTYPE_SID);

I know, as written this won't work but I'm just trying to get the idea
across of recording exit information for both the individual thread and
the thread-group leader in __unhash_process().

That should tackle both problems, i.e., recording exit information for
both thread and thread-group leader as well as exec?
Oleg Nesterov March 2, 2025, 8:24 p.m. UTC | #7
Christian,

I am already sleeping. I'll try to reply right now, but quite possibly
I will need to correct myself tomorrow ;)

On 03/02, Christian Brauner wrote:
>
> Ok, so:
>
> release_task()
> -> __exit_signal()
>    -> detach_pid()
>       -> __change_pid()
>
> That sounds good. So could we do something like:

Yes, this is what I meant, except...

> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -127,8 +127,10 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
>  {
>         nr_threads--;
>         detach_pid(p, PIDTYPE_PID);
> +       pidfs_exit(p); // record exit information for individual thread

To me it would be better to do this in the caller, release_task().
But this is minor and I won't insist. Please see below.

>         if (group_dead) {
>                 detach_pid(p, PIDTYPE_TGID);
> +               pidfs_exit(p); // record exit information for thread-group leader

This looks pointless, task_pid(p) is the same.

> I know, as written this won't work but I'm just trying to get the idea
> across of recording exit information for both the individual thread and
> the thread-group leader in __unhash_process().
>
> That should tackle both problems, i.e., recording exit information for
> both thread and thread-group leader as well as exec?

This will fix the problem with mt-exec, but this won't help to discriminate
the leader-exit and the-whole-group-exit cases...

With this this (or something like this) change pidfd_info() can only report
the exit code of the already reaped thread/process, leader or not.

I mean... If the leader L exits using sys_exit() and it has the live sub-
threads, release_task(L) / __unhash_process(L) will be only called when
the last sub-thread exits and it (or debugger) does "goto repeat;" in
release_task() to finally reap the leader.

IOW. If someone does sys_pidfd_create(group-leader-pid, PIDFD_THREAD),
pidfd_info() won't report PIDFD_INFO_EXIT if the leader has exited using
sys_exit() before other threads.

But perhaps this is fine?

Let me repeat, I have no idea how and why people use pidfd ;)

Oleg.
Lennart Poettering March 3, 2025, 9:06 a.m. UTC | #8
On So, 02.03.25 21:24, Oleg Nesterov (oleg@redhat.com) wrote:

> This will fix the problem with mt-exec, but this won't help to discriminate
> the leader-exit and the-whole-group-exit cases...
>
> With this this (or something like this) change pidfd_info() can only report
> the exit code of the already reaped thread/process, leader or not.
>
> I mean... If the leader L exits using sys_exit() and it has the live sub-
> threads, release_task(L) / __unhash_process(L) will be only called when
> the last sub-thread exits and it (or debugger) does "goto repeat;" in
> release_task() to finally reap the leader.
>
> IOW. If someone does sys_pidfd_create(group-leader-pid, PIDFD_THREAD),
> pidfd_info() won't report PIDFD_INFO_EXIT if the leader has exited using
> sys_exit() before other threads.
>
> But perhaps this is fine?

I think this is fine, but I'd really like a way how userspace can
determine this state reliably. i.e. a zombie state where the exit
status is not available yet is a bit strange by classic UNIX
standards on some level, no?

But I guess that might not be a pidfd specific issue. i.e. I figure
classic waitid() with WNOHANG failing on a zombie process that is set
up like that is a bit weird too, no? Or how does that work there?
(pretty sure some userspace might not be expecting that...)

Lennart

--
Lennart Poettering, Berlin
Christian Brauner March 3, 2025, 11:32 a.m. UTC | #9
On Mon, Mar 03, 2025 at 10:06:31AM +0100, Lennart Poettering wrote:
> On So, 02.03.25 21:24, Oleg Nesterov (oleg@redhat.com) wrote:
> 
> > This will fix the problem with mt-exec, but this won't help to discriminate
> > the leader-exit and the-whole-group-exit cases...
> >
> > With this this (or something like this) change pidfd_info() can only report
> > the exit code of the already reaped thread/process, leader or not.

Yes, that's fine. I don't think we need to report exit status
information right after the task has exited. It's fine to only provide
it once it has been reaped and it makes things simpler afaict.

Pidfd polling allows waiting on either task exit or for a task to have
been reaped. So the contract for PIDFD_INFO_EXIT is simply that EPOLLHUP
must be observed before exit information can be retrieved.

This aligns with wait() as well, where reaping of a thread-group leader
that exited before the thread-group was empty is delayed until the
thread-group is empty.

I think that with PIDFD_INFO_EXIT autoreaping might actually become
usable because it means a parent can ignore SIGCHLD or set SA_NOCLDWAIT
and simply use pidfd polling and PIDFD_INFO_EXIT to get get status
information from its children. But the kernel will autocleanup right
away instead of delaying. If it's a subreaper there's probably some
wrinkle with grand-children that get reparented to it? But for the
non-subreaper case it should be very useful.

> > I mean... If the leader L exits using sys_exit() and it has the live sub-
> > threads, release_task(L) / __unhash_process(L) will be only called when
> > the last sub-thread exits and it (or debugger) does "goto repeat;" in
> > release_task() to finally reap the leader.
> >
> > IOW. If someone does sys_pidfd_create(group-leader-pid, PIDFD_THREAD),
> > pidfd_info() won't report PIDFD_INFO_EXIT if the leader has exited using
> > sys_exit() before other threads.
> >
> > But perhaps this is fine?
> 
> I think this is fine, but I'd really like a way how userspace can
> determine this state reliably. i.e. a zombie state where the exit
> status is not available yet is a bit strange by classic UNIX
> standards on some level, no?
> 
> But I guess that might not be a pidfd specific issue. i.e. I figure
> classic waitid() with WNOHANG failing on a zombie process that is set
> up like that is a bit weird too, no? Or how does that work there?
> (pretty sure some userspace might not be expecting that...)

Yes, how I read the code WNOHANG exhibits the same behavior (so does WNOWAIT):

        if (exit_state == EXIT_ZOMBIE) {
                /* we don't reap group leaders with subthreads */
                if (!delay_group_leader(p)) {
                        /*
                         * A zombie ptracee is only visible to its ptracer.
                         * Notification and reaping will be cascaded to the
                         * real parent when the ptracer detaches.
                         */
                        if (unlikely(ptrace) || likely(!p->ptrace))
                                return wait_task_zombie(wo, p);
                }
diff mbox series

Patch

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 433f676c066c..e500bc4c5af2 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -32,11 +32,12 @@  static struct kmem_cache *pidfs_cachep __ro_after_init;
  */
 struct pidfs_exit_info {
 	__u64 cgroupid;
-	__u64 exit_code;
+	__s32 exit_code;
 };
 
 struct pidfs_inode {
-	struct pidfs_exit_info exit_info;
+	struct pidfs_exit_info __pei;
+	struct pidfs_exit_info *exit_info;
 	struct inode vfs_inode;
 };
 
@@ -228,11 +229,14 @@  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	return poll_flags;
 }
 
-static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+static long pidfd_info(struct file *file, struct task_struct *task,
+		       unsigned int cmd, unsigned long arg)
 {
 	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
 	size_t usize = _IOC_SIZE(cmd);
 	struct pidfd_info kinfo = {};
+	struct pidfs_exit_info *exit_info;
+	struct inode *inode = file_inode(file);
 	struct user_namespace *user_ns;
 	const struct cred *c;
 	__u64 mask;
@@ -248,6 +252,39 @@  static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
 	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
 		return -EFAULT;
 
+	exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+	if (exit_info) {
+		/*
+		 * TODO: Oleg, I didn't see a reason for putting
+		 * retrieval of the exit status of a task behind some
+		 * form of permission check. Maybe there's some
+		 * potential concerns with seeing the exit status of a
+		 * SIGKILLed suid binary or something but even then I'm
+		 * not sure that's a problem.
+		 *
+		 * If we want this we could put this behind some *uid
+		 * check similar to what ptrace access does by recording
+		 * parts of the creds we'd need for checking this. But
+		 * only if we really need it.
+		 */
+		kinfo.exit_code = exit_info->exit_code;
+#ifdef CONFIG_CGROUPS
+		kinfo.cgroupid = exit_info->cgroupid;
+		kinfo.mask |= PIDFD_INFO_EXIT | PIDFD_INFO_CGROUPID;
+#endif
+	}
+
+	/*
+	 * If the task has already been reaped only exit information
+	 * can be provided. It's entirely possible that the task has
+	 * already been reaped but we managed to grab a reference to it
+	 * before that. So a full set of information about @task doesn't
+	 * mean it hasn't been waited upon. Similarly, a full set of
+	 * information doesn't mean that the task hasn't already exited.
+	 */
+	if (!task)
+		goto copy_out;
+
 	c = get_task_cred(task);
 	if (!c)
 		return -ESRCH;
@@ -267,11 +304,13 @@  static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
 	put_cred(c);
 
 #ifdef CONFIG_CGROUPS
-	rcu_read_lock();
-	cgrp = task_dfl_cgroup(task);
-	kinfo.cgroupid = cgroup_id(cgrp);
-	kinfo.mask |= PIDFD_INFO_CGROUPID;
-	rcu_read_unlock();
+	if (!kinfo.cgroupid) {
+		rcu_read_lock();
+		cgrp = task_dfl_cgroup(task);
+		kinfo.cgroupid = cgroup_id(cgrp);
+		kinfo.mask |= PIDFD_INFO_CGROUPID;
+		rcu_read_unlock();
+	}
 #endif
 
 	/*
@@ -291,6 +330,7 @@  static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
 	if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
 		return -ESRCH;
 
+copy_out:
 	/*
 	 * If userspace and the kernel have the same struct size it can just
 	 * be copied. If userspace provides an older struct, only the bits that
@@ -341,12 +381,13 @@  static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	}
 
 	task = get_pid_task(pid, PIDTYPE_PID);
-	if (!task)
-		return -ESRCH;
 
 	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
 	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
-		return pidfd_info(task, cmd, arg);
+		return pidfd_info(file, task, cmd, arg);
+
+	if (!task)
+		return -ESRCH;
 
 	if (arg)
 		return -EINVAL;
@@ -486,7 +527,7 @@  void pidfs_exit(struct task_struct *tsk)
 		struct cgroup *cgrp;
 #endif
 		inode = d_inode(dentry);
-		exit_info = &pidfs_i(inode)->exit_info;
+		exit_info = &pidfs_i(inode)->__pei;
 
 		/* TODO: Annoy Oleg to tell me how to do this correctly. */
 		if (tsk->signal->flags & SIGNAL_GROUP_EXIT)
@@ -501,6 +542,8 @@  void pidfs_exit(struct task_struct *tsk)
 		rcu_read_unlock();
 #endif
 
+		/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+		smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
 		dput(dentry);
 	}
 }
@@ -568,7 +611,8 @@  static struct inode *pidfs_alloc_inode(struct super_block *sb)
 	if (!pi)
 		return NULL;
 
-	memset(&pi->exit_info, 0, sizeof(pi->exit_info));
+	memset(&pi->__pei, 0, sizeof(pi->__pei));
+	pi->exit_info = NULL;
 
 	return &pi->vfs_inode;
 }
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index e0abd0b18841..e5966f1a7743 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -20,6 +20,7 @@ 
 #define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
 #define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
 #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
+#define PIDFD_INFO_EXIT			(1UL << 3) /* Always returned if available, even if not requested */
 
 #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
 
@@ -86,7 +87,7 @@  struct pidfd_info {
 	__u32 sgid;
 	__u32 fsuid;
 	__u32 fsgid;
-	__u32 spare0[1];
+	__s32 exit_code;
 };
 
 #define PIDFS_IOCTL_MAGIC 0xFF