diff mbox series

[v9] exec: Fix dead-lock in de_thread with ptrace_attach

Message ID AM8PR10MB470896FBC519ABCC20486958E4349@AM8PR10MB4708.EURPRD10.PROD.OUTLOOK.COM (mailing list archive)
State New
Headers show
Series [v9] exec: Fix dead-lock in de_thread with ptrace_attach | expand

Commit Message

Bernd Edlinger June 11, 2021, 3:55 p.m. UTC
This introduces signal->unsafe_execve_in_progress,
which is used to fix the case when at least one of the
sibling threads is traced, and therefore the tracer
process may dead-lock in ptrace_attach, but de_thread
will need to wait for the tracer to continue execution.

The solution is to detect this situation and allow
ptrace_attach to continue, while de_thread() is still
waiting for traced zombies to be eventually released.
When the current thread changed the ptrace status from
non-traced to traced, we can simply abort the whole
execve and restart it by returning -ERESTARTSYS.
This needs to be done before changing the thread leader,
because the PTRACE_EVENT_EXEC needs to know the old
thread pid.

Although it is technically after the point of no return,
we just have to reset bprm->point_of_no_return here,
since at this time only the other threads have received
a fatal signal, not the current thread.

From the user's point of view the whole execve was
simply delayed until after the ptrace_attach.

Other threads die quickly since the cred_guard_mutex
is released, but a deadly signal is already pending.
In case the mutex_lock_killable misses the signal,
->unsafe_execve_in_progress makes sure they release
the mutex immediately and return with -ERESTARTNOINTR.

This means there is no API change, unlike the previous
version of this patch which was discussed here:

https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de/

See tools/testing/selftests/ptrace/vmaccess.c
for a test case that gets fixed by this change.

Note that since the test case was originally designed to
test the ptrace_attach returning an error in this situation,
the test expectation needed to be adjusted, to allow the
API to succeed at the first attempt.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
---
 fs/exec.c                                 | 37 +++++++++++++++++++++++++++++--
 fs/proc/base.c                            |  6 +++++
 include/linux/sched/signal.h              | 13 +++++++++++
 kernel/ptrace.c                           |  9 ++++++++
 kernel/seccomp.c                          | 12 +++++++---
 tools/testing/selftests/ptrace/vmaccess.c | 25 ++++++++++++++-------
 6 files changed, 89 insertions(+), 13 deletions(-)

Comments

Andrew Morton June 11, 2021, 11:16 p.m. UTC | #1
On Fri, 11 Jun 2021 17:55:09 +0200 Bernd Edlinger <bernd.edlinger@hotmail.de> wrote:

> This introduces signal->unsafe_execve_in_progress,
> which is used to fix the case when at least one of the
> sibling threads is traced, and therefore the trace
> process may dead-lock in ptrace_attach, but de_thread
> will need to wait for the tracer to continue execution.
> 
> The solution is to detect this situation and allow
> ptrace_attach to continue, while de_thread() is still
> waiting for traced zombies to be eventually released.
> When the current thread changed the ptrace status from
> non-traced to traced, we can simply abort the whole
> execve and restart it by returning -ERESTARTSYS.
> This needs to be done before changing the thread leader,
> because the PTRACE_EVENT_EXEC needs to know the old
> thread pid.
> 
> Although it is technically after the point of no return,
> we just have to reset bprm->point_of_no_return here,
> since at this time only the other threads have received
> a fatal signal, not the current thread.
> 
> From the user's point of view the whole execve was
> simply delayed until after the ptrace_attach.
> 
> Other threads die quickly since the cred_guard_mutex
> is released, but a deadly signal is already pending.
> In case the mutex_lock_killable misses the signal,
> ->unsafe_execve_in_progress makes sure they release
> the mutex immediately and return with -ERESTARTNOINTR.
> 
> This means there is no API change, unlike the previous
> version of this patch which was discussed here:
> 
> https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de/
> 
> See tools/testing/selftests/ptrace/vmaccess.c
> for a test case that gets fixed by this change.
> 
> Note that since the test case was originally designed to
> test the ptrace_attach returning an error in this situation,
> the test expectation needed to be adjusted, to allow the
> API to succeed at the first attempt.
> 

err, sorry.  I replied to the v8 patch, not to v9.

--- a/fs/exec.c~exec-fix-dead-lock-in-de_thread-with-ptrace_attach-v9
+++ a/fs/exec.c
@@ -1056,29 +1056,31 @@ static int de_thread(struct task_struct
 		return -EAGAIN;
 	}
 
-	while_each_thread(tsk, t) {
-		if (unlikely(t->ptrace) && t != tsk->group_leader)
-			sig->unsafe_execve_in_progress = true;
-	}
-
 	sig->group_exit_task = tsk;
 	sig->notify_count = zap_other_threads(tsk);
 	if (!thread_group_leader(tsk))
 		sig->notify_count--;
-	spin_unlock_irq(lock);
 
-	if (unlikely(sig->unsafe_execve_in_progress))
+	while_each_thread(tsk, t) {
+		if (unlikely(t->ptrace) && t != tsk->group_leader)
+			sig->unsafe_execve_in_progress = true;
+	}
+
+	if (unlikely(sig->unsafe_execve_in_progress)) {
+		spin_unlock_irq(lock);
 		mutex_unlock(&sig->cred_guard_mutex);
+		spin_lock_irq(lock);
+	}
 
-	for (;;) {
-		set_current_state(TASK_KILLABLE);
-		if (!sig->notify_count)
-			break;
+	while (sig->notify_count) {
+		__set_current_state(TASK_KILLABLE);
+		spin_unlock_irq(lock);
 		schedule();
 		if (__fatal_signal_pending(tsk))
 			goto killed;
+		spin_lock_irq(lock);
 	}
-	__set_current_state(TASK_RUNNING);
+	spin_unlock_irq(lock);
 
 	if (unlikely(sig->unsafe_execve_in_progress)) {
 		if (mutex_lock_killable(&sig->cred_guard_mutex))
Bernd Edlinger June 12, 2021, 5:22 a.m. UTC | #2
On 6/12/21 1:16 AM, Andrew Morton wrote:
> On Fri, 11 Jun 2021 17:55:09 +0200 Bernd Edlinger <bernd.edlinger@hotmail.de> wrote:
> 
>> This introduces signal->unsafe_execve_in_progress,
>> which is used to fix the case when at least one of the
>> sibling threads is traced, and therefore the trace
>> process may dead-lock in ptrace_attach, but de_thread
>> will need to wait for the tracer to continue execution.
>>
>> The solution is to detect this situation and allow
>> ptrace_attach to continue, while de_thread() is still
>> waiting for traced zombies to be eventually released.
>> When the current thread changed the ptrace status from
>> non-traced to traced, we can simply abort the whole
>> execve and restart it by returning -ERESTARTSYS.
>> This needs to be done before changing the thread leader,
>> because the PTRACE_EVENT_EXEC needs to know the old
>> thread pid.
>>
>> Although it is technically after the point of no return,
>> we just have to reset bprm->point_of_no_return here,
>> since at this time only the other threads have received
>> a fatal signal, not the current thread.
>>
>> From the user's point of view the whole execve was
>> simply delayed until after the ptrace_attach.
>>
>> Other threads die quickly since the cred_guard_mutex
>> is released, but a deadly signal is already pending.
>> In case the mutex_lock_killable misses the signal,
>> ->unsafe_execve_in_progress makes sure they release
>> the mutex immediately and return with -ERESTARTNOINTR.
>>
>> This means there is no API change, unlike the previous
>> version of this patch which was discussed here:
>>
>> https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de/
>>
>> See tools/testing/selftests/ptrace/vmaccess.c
>> for a test case that gets fixed by this change.
>>
>> Note that since the test case was originally designed to
>> test the ptrace_attach returning an error in this situation,
>> the test expectation needed to be adjusted, to allow the
>> API to succeed at the first attempt.
>>
> 
> err, sorry.  I replied to the v8 patch, not to v9.
> 

Sorry for the confusion.

Originally the loop here was entered with
sighand locked and looked like this:

	while (sig->notify_count) {
		__set_current_state(TASK_KILLABLE);
		if (!sig->notify_count)
			break;
		spin_unlock_irq(lock);
		schedule();
		if (__fatal_signal_pending(tsk))
			goto killed;
	}
	spin_unlock_irq(lock);

v8 did this (it tried to avoid lots of spin-lock/unlocks):

	sig->group_exit_task = tsk;
	sig->notify_count = zap_other_threads(tsk);
	if (!thread_group_leader(tsk))
		sig->notify_count--;
	spin_unlock_irq(lock);

	if (unlikely(sig->unsafe_execve_in_progress))
		mutex_unlock(&sig->cred_guard_mutex);

	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!sig->notify_count)
			break;
		schedule();
		if (__fatal_signal_pending(tsk))
			goto killed;
	}

but here I overlooked that there is an execution path without
any spin-lock where sig->group_exit_task is set to NULL, which
could create a race with __exit_signal.

So v9 keeps the loop as it was, and instead does this:

	if (unlikely(sig->unsafe_execve_in_progress)) {
		spin_unlock_irq(lock);
		mutex_unlock(&sig->cred_guard_mutex);
		spin_lock_irq(lock);
	}

because I would not like to release the mutex while an
interrupt spin-lock is held.


Bernd.

> --- a/fs/exec.c~exec-fix-dead-lock-in-de_thread-with-ptrace_attach-v9
> +++ a/fs/exec.c
> @@ -1056,29 +1056,31 @@ static int de_thread(struct task_struct
>  		return -EAGAIN;
>  	}
>  
> -	while_each_thread(tsk, t) {
> -		if (unlikely(t->ptrace) && t != tsk->group_leader)
> -			sig->unsafe_execve_in_progress = true;
> -	}
> -
>  	sig->group_exit_task = tsk;
>  	sig->notify_count = zap_other_threads(tsk);
>  	if (!thread_group_leader(tsk))
>  		sig->notify_count--;
> -	spin_unlock_irq(lock);
>  
> -	if (unlikely(sig->unsafe_execve_in_progress))
> +	while_each_thread(tsk, t) {
> +		if (unlikely(t->ptrace) && t != tsk->group_leader)
> +			sig->unsafe_execve_in_progress = true;
> +	}
> +
> +	if (unlikely(sig->unsafe_execve_in_progress)) {
> +		spin_unlock_irq(lock);
>  		mutex_unlock(&sig->cred_guard_mutex);
> +		spin_lock_irq(lock);
> +	}
>  
> -	for (;;) {
> -		set_current_state(TASK_KILLABLE);
> -		if (!sig->notify_count)
> -			break;
> +	while (sig->notify_count) {
> +		__set_current_state(TASK_KILLABLE);
> +		spin_unlock_irq(lock);
>  		schedule();
>  		if (__fatal_signal_pending(tsk))
>  			goto killed;
> +		spin_lock_irq(lock);
>  	}
> -	__set_current_state(TASK_RUNNING);
> +	spin_unlock_irq(lock);
>  
>  	if (unlikely(sig->unsafe_execve_in_progress)) {
>  		if (mutex_lock_killable(&sig->cred_guard_mutex))
> _
>
Greg Kroah-Hartman June 12, 2021, 7:02 a.m. UTC | #3
On Fri, Jun 11, 2021 at 05:55:09PM +0200, Bernd Edlinger wrote:
> This introduces signal->unsafe_execve_in_progress,
> which is used to fix the case when at least one of the
> sibling threads is traced, and therefore the trace
> process may dead-lock in ptrace_attach, but de_thread
> will need to wait for the tracer to continue execution.
> 
> The solution is to detect this situation and allow
> ptrace_attach to continue, while de_thread() is still
> waiting for traced zombies to be eventually released.
> When the current thread changed the ptrace status from
> non-traced to traced, we can simply abort the whole
> execve and restart it by returning -ERESTARTSYS.
> This needs to be done before changing the thread leader,
> because the PTRACE_EVENT_EXEC needs to know the old
> thread pid.
> 
> Although it is technically after the point of no return,
> we just have to reset bprm->point_of_no_return here,
> since at this time only the other threads have received
> a fatal signal, not the current thread.
> 
> From the user's point of view the whole execve was
> simply delayed until after the ptrace_attach.
> 
> Other threads die quickly since the cred_guard_mutex
> is released, but a deadly signal is already pending.
> In case the mutex_lock_killable misses the signal,
> ->unsafe_execve_in_progress makes sure they release
> the mutex immediately and return with -ERESTARTNOINTR.
> 
> This means there is no API change, unlike the previous
> version of this patch which was discussed here:
> 
> https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de/
> 
> See tools/testing/selftests/ptrace/vmaccess.c
> for a test case that gets fixed by this change.
> 
> Note that since the test case was originally designed to
> test the ptrace_attach returning an error in this situation,
> the test expectation needed to be adjusted, to allow the
> API to succeed at the first attempt.
> 
> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
> ---
>  fs/exec.c                                 | 37 +++++++++++++++++++++++++++++--
>  fs/proc/base.c                            |  6 +++++
>  include/linux/sched/signal.h              | 13 +++++++++++
>  kernel/ptrace.c                           |  9 ++++++++
>  kernel/seccomp.c                          | 12 +++++++---
>  tools/testing/selftests/ptrace/vmaccess.c | 25 ++++++++++++++-------
>  6 files changed, 89 insertions(+), 13 deletions(-)
> 

<formletter>

This is not the correct way to submit patches for inclusion in the
stable kernel tree.  Please read:
    https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
for how to do this properly.

</formletter>
Eric W. Biederman June 14, 2021, 4:42 p.m. UTC | #4
Bernd Edlinger <bernd.edlinger@hotmail.de> writes:

> This introduces signal->unsafe_execve_in_progress,
> which is used to fix the case when at least one of the
> sibling threads is traced, and therefore the trace
> process may dead-lock in ptrace_attach, but de_thread
> will need to wait for the tracer to continue execution.

Userspace processes hang waiting for each other.  Not a proper kernel
deadlock.  Annoying but not horrible.  Definitely worth fixing if we can.

> The solution is to detect this situation and allow
> ptrace_attach to continue, while de_thread() is still
> waiting for traced zombies to be eventually released.
> When the current thread changed the ptrace status from
> non-traced to traced, we can simply abort the whole
> execve and restart it by returning -ERESTARTSYS.
> This needs to be done before changing the thread leader,
> because the PTRACE_EVENT_EXEC needs to know the old
> thread pid.

Except you are not detecting this situation.  Testing for t->ptrace
finds tasks that have completed their ptrace attach and no longer need
the cred_guard_mutex.

You almost discover the related problem that involves PTRACE_EVENT_EXEC.

It will probably help to have a full description of all of the
processes and states involved in the hang in your description
so you can show how your proposed change avoids the problem.

> Although it is technically after the point of no return,
> we just have to reset bprm->point_of_no_return here,
> since at this time only the other threads have received
> a fatal signal, not the current thread.

No.  If you have killed other threads we are most definitely past the
point where it is at all reasonable to return to userspace.
Perfunctorily killing other threads may leave them with locks held and
who knows what other problems.  Certainly it leaves the application
unable to process a failure from exec and continue on.

> From the user's point of view the whole execve was
> simply delayed until after the ptrace_attach.

Conceptually I like what you are trying to detect and do.
However your description unfortunately does not match the code.

If you can find a test for another process waiting to ptrace_attach
one of our threads before we enter into de_thread that would be a
reasonable time to do something, and would potentially make a nice
fix.


Eric

> Other threads die quickly since the cred_guard_mutex
> is released, but a deadly signal is already pending.
> In case the mutex_lock_killable misses the signal,
> ->unsafe_execve_in_progress makes sure they release
> the mutex immediately and return with -ERESTARTNOINTR.
>
> This means there is no API change, unlike the previous
> version of this patch which was discussed here:
>
> https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de/
>
> See tools/testing/selftests/ptrace/vmaccess.c
> for a test case that gets fixed by this change.
>
> Note that since the test case was originally designed to
> test the ptrace_attach returning an error in this situation,
> the test expectation needed to be adjusted, to allow the
> API to succeed at the first attempt.
>
> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
> ---
>  fs/exec.c                                 | 37 +++++++++++++++++++++++++++++--
>  fs/proc/base.c                            |  6 +++++
>  include/linux/sched/signal.h              | 13 +++++++++++
>  kernel/ptrace.c                           |  9 ++++++++
>  kernel/seccomp.c                          | 12 +++++++---
>  tools/testing/selftests/ptrace/vmaccess.c | 25 ++++++++++++++-------
>  6 files changed, 89 insertions(+), 13 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 8344fba..c7b1926 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1040,6 +1040,8 @@ static int de_thread(struct task_struct *tsk)
>  	struct signal_struct *sig = tsk->signal;
>  	struct sighand_struct *oldsighand = tsk->sighand;
>  	spinlock_t *lock = &oldsighand->siglock;
> +	unsigned int prev_ptrace = tsk->ptrace;
> +	struct task_struct *t = tsk;
>  
>  	if (thread_group_empty(tsk))
>  		goto no_thread_group;
> @@ -1062,6 +1064,17 @@ static int de_thread(struct task_struct *tsk)
>  	if (!thread_group_leader(tsk))
>  		sig->notify_count--;
>  
> +	while_each_thread(tsk, t) {
> +		if (unlikely(t->ptrace) && t != tsk->group_leader)
> +			sig->unsafe_execve_in_progress = true;
> +	}
> +
> +	if (unlikely(sig->unsafe_execve_in_progress)) {
> +		spin_unlock_irq(lock);
> +		mutex_unlock(&sig->cred_guard_mutex);
> +		spin_lock_irq(lock);
> +	}
> +
>  	while (sig->notify_count) {
>  		__set_current_state(TASK_KILLABLE);
>  		spin_unlock_irq(lock);
> @@ -1072,6 +1085,17 @@ static int de_thread(struct task_struct *tsk)
>  	}
>  	spin_unlock_irq(lock);
>  
> +	if (unlikely(sig->unsafe_execve_in_progress)) {
> +		if (mutex_lock_killable(&sig->cred_guard_mutex))
> +			goto killed;
> +		sig->unsafe_execve_in_progress = false;
> +		if (!prev_ptrace && tsk->ptrace) {
> +			sig->group_exit_task = NULL;
> +			sig->notify_count = 0;
> +			return -ERESTARTSYS;
> +		}
> +	}
> +
>  	/*
>  	 * At this point all other threads have exited, all we have to
>  	 * do is to wait for the thread group leader to become inactive,
> @@ -1255,8 +1279,11 @@ int begin_new_exec(struct linux_binprm * bprm)
>  	 * Make this the only thread in the thread group.
>  	 */
>  	retval = de_thread(me);
> -	if (retval)
> +	if (retval) {
> +		if (retval == -ERESTARTSYS)
> +			bprm->point_of_no_return = false;
>  		goto out;
> +	}
>  
>  	/*
>  	 * Cancel any io_uring activity across execve
> @@ -1466,6 +1493,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
>  	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
>  		return -ERESTARTNOINTR;
>  
> +	if (unlikely(current->signal->unsafe_execve_in_progress)) {
> +		mutex_unlock(&current->signal->cred_guard_mutex);
> +		return -ERESTARTNOINTR;
> +	}
> +
>  	bprm->cred = prepare_exec_creds();
>  	if (likely(bprm->cred))
>  		return 0;
> @@ -1482,7 +1514,8 @@ static void free_bprm(struct linux_binprm *bprm)
>  	}
>  	free_arg_pages(bprm);
>  	if (bprm->cred) {
> -		mutex_unlock(&current->signal->cred_guard_mutex);
> +		if (!current->signal->unsafe_execve_in_progress)
> +			mutex_unlock(&current->signal->cred_guard_mutex);
>  		abort_creds(bprm->cred);
>  	}
>  	if (bprm->file) {
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 3851bfc..3b2a55c 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2739,6 +2739,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
>  	if (rv < 0)
>  		goto out_free;
>  
> +	if (unlikely(current->signal->unsafe_execve_in_progress)) {
> +		mutex_unlock(&current->signal->cred_guard_mutex);
> +		rv = -ERESTARTNOINTR;
> +		goto out_free;
> +	}
> +
>  	rv = security_setprocattr(PROC_I(inode)->op.lsm,
>  				  file->f_path.dentry->d_name.name, page,
>  				  count);
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 3f6a0fc..220a083 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -214,6 +214,17 @@ struct signal_struct {
>  #endif
>  
>  	/*
> +	 * Set while execve is executing but is *not* holding
> +	 * cred_guard_mutex to avoid possible dead-locks.
> +	 * The cred_guard_mutex is released *after* de_thread() has
> +	 * called zap_other_threads(), therefore a fatal signal is
> +	 * guaranteed to be already pending in the unlikely event, that
> +	 * current->signal->unsafe_execve_in_progress happens to be
> +	 * true after the cred_guard_mutex was acquired.
> +	 */
> +	bool unsafe_execve_in_progress;
> +
> +	/*
>  	 * Thread is the potential origin of an oom condition; kill first on
>  	 * oom
>  	 */
> @@ -227,6 +238,8 @@ struct signal_struct {
>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>  					 * credential calculations
>  					 * (notably. ptrace)
> +					 * Held while execve runs, except when
> +					 * a sibling thread is being traced.
>  					 * Deprecated do not use in new code.
>  					 * Use exec_update_lock instead.
>  					 */
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 61db50f..0cbc1eb 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -468,6 +468,14 @@ static int ptrace_traceme(void)
>  {
>  	int ret = -EPERM;
>  
> +	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
> +		return -ERESTARTNOINTR;
> +
> +	if (unlikely(current->signal->unsafe_execve_in_progress)) {
> +		mutex_unlock(&current->signal->cred_guard_mutex);
> +		return -ERESTARTNOINTR;
> +	}
> +
>  	write_lock_irq(&tasklist_lock);
>  	/* Are we already being traced? */
>  	if (!current->ptrace) {
> @@ -483,6 +491,7 @@ static int ptrace_traceme(void)
>  		}
>  	}
>  	write_unlock_irq(&tasklist_lock);
> +	mutex_unlock(&current->signal->cred_guard_mutex);
>  
>  	return ret;
>  }
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 1d60fc2..b1389ee 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -1824,9 +1824,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
>  	 * Make sure we cannot change seccomp or nnp state via TSYNC
>  	 * while another thread is in the middle of calling exec.
>  	 */
> -	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
> -	    mutex_lock_killable(&current->signal->cred_guard_mutex))
> -		goto out_put_fd;
> +	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
> +		if (mutex_lock_killable(&current->signal->cred_guard_mutex))
> +			goto out_put_fd;
> +
> +		if (unlikely(current->signal->unsafe_execve_in_progress)) {
> +			mutex_unlock(&current->signal->cred_guard_mutex);
> +			goto out_put_fd;
> +		}
> +	}
>  
>  	spin_lock_irq(&current->sighand->siglock);
>  
> diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
> index 4db327b..c7c2242 100644
> --- a/tools/testing/selftests/ptrace/vmaccess.c
> +++ b/tools/testing/selftests/ptrace/vmaccess.c
> @@ -39,8 +39,15 @@ static void *thread(void *arg)
>  	f = open(mm, O_RDONLY);
>  	ASSERT_GE(f, 0);
>  	close(f);
> -	f = kill(pid, SIGCONT);
> -	ASSERT_EQ(f, 0);
> +	f = waitpid(-1, NULL, 0);
> +	ASSERT_NE(f, -1);
> +	ASSERT_NE(f, 0);
> +	ASSERT_NE(f, pid);
> +	f = waitpid(-1, NULL, 0);
> +	ASSERT_EQ(f, pid);
> +	f = waitpid(-1, NULL, 0);
> +	ASSERT_EQ(f, -1);
> +	ASSERT_EQ(errno, ECHILD);
>  }
>  
>  TEST(attach)
> @@ -57,22 +64,24 @@ static void *thread(void *arg)
>  
>  	sleep(1);
>  	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
> -	ASSERT_EQ(errno, EAGAIN);
> -	ASSERT_EQ(k, -1);
> +	ASSERT_EQ(k, 0);
>  	k = waitpid(-1, &s, WNOHANG);
>  	ASSERT_NE(k, -1);
>  	ASSERT_NE(k, 0);
>  	ASSERT_NE(k, pid);
>  	ASSERT_EQ(WIFEXITED(s), 1);
>  	ASSERT_EQ(WEXITSTATUS(s), 0);
> -	sleep(1);
> -	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
> -	ASSERT_EQ(k, 0);
>  	k = waitpid(-1, &s, 0);
>  	ASSERT_EQ(k, pid);
>  	ASSERT_EQ(WIFSTOPPED(s), 1);
>  	ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
> -	k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
> +	k = ptrace(PTRACE_CONT, pid, 0L, 0L);
> +	ASSERT_EQ(k, 0);
> +	k = waitpid(-1, &s, 0);
> +	ASSERT_EQ(k, pid);
> +	ASSERT_EQ(WIFSTOPPED(s), 1);
> +	ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
> +	k = ptrace(PTRACE_CONT, pid, 0L, 0L);
>  	ASSERT_EQ(k, 0);
>  	k = waitpid(-1, &s, 0);
>  	ASSERT_EQ(k, pid);
Bernd Edlinger June 15, 2021, 2:26 p.m. UTC | #5
Thanks for your review.

On 6/14/21 6:42 PM, Eric W. Biederman wrote:
> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> 
>> This introduces signal->unsafe_execve_in_progress,
>> which is used to fix the case when at least one of the
>> sibling threads is traced, and therefore the trace
>> process may dead-lock in ptrace_attach, but de_thread
>> will need to wait for the tracer to continue execution.
> 
> Userspace processes hang waiting for each other.  Not a proper kernel
> deadlock.  Annoying but not horrible.  Definitely worth fixing if we can.
> 

I wonder if I used the wrong term in the title.
Do you have a suggestion for better wording?

>> The solution is to detect this situation and allow
>> ptrace_attach to continue, while de_thread() is still
>> waiting for traced zombies to be eventually released.
>> When the current thread changed the ptrace status from
>> non-traced to traced, we can simply abort the whole
>> execve and restart it by returning -ERESTARTSYS.
>> This needs to be done before changing the thread leader,
>> because the PTRACE_EVENT_EXEC needs to know the old
>> thread pid.
> 
> Except you are not detecting this situation.  Testing for t->ptrace
> finds tasks that have completed their ptrace attach and no longer need
> the cred_guard_mutex.
> 

The first phase of de_thread needs co-operation from a user task
if and only if any task t except the thread leader has t->ptrace.
Taking tasks from RUNNING->EXIT_ZOMBIE only needs co-operation from kernel code
that is using mutex_lock_killable(&sig->cred_guard_mutex).
Tasks with !t->ptrace are children of the thread leader, and are automatically
sent to EXIT_DEAD, see kernel/exit.c (exit_notify):

        if (unlikely(tsk->ptrace)) {
                [...]
        } else if (thread_group_leader(tsk)) {
                autoreap = thread_group_empty(tsk) &&
                        do_notify_parent(tsk, tsk->exit_signal);
        } else {
                autoreap = true;
        }

        if (autoreap) {
                tsk->exit_state = EXIT_DEAD;
                list_add(&tsk->ptrace_entry, &dead);
        }

But tasks which are traced have a different parent, and will stay zombies
for as long as the tracer does not call waitpid or handle the SIGCHLD.

> You almost discover the related problem that involves PTRACE_EVENT_EXEC.
> 
> It will probably help to have a full description of all of the
> processes and states involved in the hang in your description
> so you can show how your proposed change avoids the problem.
> 

Ok, will try to do that.

>> Although it is technically after the point of no return,
>> we just have to reset bprm->point_of_no_return here,
>> since at this time only the other threads have received
>> a fatal signal, not the current thread.
> 
> No.  If you have killed other threads we are most definitely past the
> point where it is at all reasonable to return to userspace.
> Perfunctorily killing other threads may leave them with locks held and
> who knows what other problems.  Certainly it leaves the application
> unable to process a failure from exec and continue on.
> 

Yeah, I tend to agree.  I had assumed that returning -ERESTARTSYS will always
bounce-back to the same execve syscall, and that the restarted execve call
must succeed.  But especially the second assumption is not a given thing.

I wonder if that might work instead?

diff --git a/fs/exec.c b/fs/exec.c
index c7b1926..4490288 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1956,6 +1956,13 @@ static int do_execveat_common(int fd, struct filename *filename,
 out_free:
        free_bprm(bprm);
 
+       if (retval == -ERESTARTSYS && !fatal_signal_pending(current)) {
+               retval = do_execveat_common(fd, filename, argv, envp, flags);
+               if (retval < 0 && !fatal_signal_pending(current))
+                       force_sigsegv(SIGSEGV);
+               return retval;
+       }
+
 out_ret:
        putname(filename);
        return retval;
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
index c7c2242..3b7d81fb 100644
--- a/tools/testing/selftests/ptrace/vmaccess.c
+++ b/tools/testing/selftests/ptrace/vmaccess.c
@@ -74,13 +74,13 @@ static void *thread(void *arg)
        k = waitpid(-1, &s, 0);
        ASSERT_EQ(k, pid);
        ASSERT_EQ(WIFSTOPPED(s), 1);
-       ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
+       ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
        k = ptrace(PTRACE_CONT, pid, 0L, 0L);
        ASSERT_EQ(k, 0);
        k = waitpid(-1, &s, 0);
        ASSERT_EQ(k, pid);
        ASSERT_EQ(WIFSTOPPED(s), 1);
-       ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
+       ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
        k = ptrace(PTRACE_CONT, pid, 0L, 0L);
        ASSERT_EQ(k, 0);
        k = waitpid(-1, &s, 0);


Since the SIGSTOP from the PTRACE_ATTACH is not handled before the restart,
the SIGSTOP and SIGTRAP events in the test case are in reverse order, but
that is a possible outcome too.

>> From the user's point of view the whole execve was
>> simply delayed until after the ptrace_attach.
> 
> Conceptually I like what you are trying to detect and do.
> However your description unfortunately does not match the code.
> 
> If you can find a test for another process waiting to ptrace_attach
> one of our threads before we enter into de_thread that would be a
> reasonable time to do something, and would potentially make a nice
> fix.
> 

No, I don't see any way to do that.

Unfortunately the tracer may or may not decide to do the ptrace_attach
at any time, and it is usually the same process that is unable to do
the waitpid because it is hanging in the ptrace_attach.


Bernd.

> 
> Eric
> 
>> Other threads die quickly since the cred_guard_mutex
>> is released, but a deadly signal is already pending.
>> In case the mutex_lock_killable misses the signal,
>> ->unsafe_execve_in_progress makes sure they release
>> the mutex immediately and return with -ERESTARTNOINTR.
>>
>> This means there is no API change, unlike the previous
>> version of this patch which was discussed here:
>>
>> https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de/
>>
>> See tools/testing/selftests/ptrace/vmaccess.c
>> for a test case that gets fixed by this change.
>>
>> Note that since the test case was originally designed to
>> test the ptrace_attach returning an error in this situation,
>> the test expectation needed to be adjusted, to allow the
>> API to succeed at the first attempt.
>>
>> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
>> ---
>>  fs/exec.c                                 | 37 +++++++++++++++++++++++++++++--
>>  fs/proc/base.c                            |  6 +++++
>>  include/linux/sched/signal.h              | 13 +++++++++++
>>  kernel/ptrace.c                           |  9 ++++++++
>>  kernel/seccomp.c                          | 12 +++++++---
>>  tools/testing/selftests/ptrace/vmaccess.c | 25 ++++++++++++++-------
>>  6 files changed, 89 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/exec.c b/fs/exec.c
>> index 8344fba..c7b1926 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1040,6 +1040,8 @@ static int de_thread(struct task_struct *tsk)
>>  	struct signal_struct *sig = tsk->signal;
>>  	struct sighand_struct *oldsighand = tsk->sighand;
>>  	spinlock_t *lock = &oldsighand->siglock;
>> +	unsigned int prev_ptrace = tsk->ptrace;
>> +	struct task_struct *t = tsk;
>>  
>>  	if (thread_group_empty(tsk))
>>  		goto no_thread_group;
>> @@ -1062,6 +1064,17 @@ static int de_thread(struct task_struct *tsk)
>>  	if (!thread_group_leader(tsk))
>>  		sig->notify_count--;
>>  
>> +	while_each_thread(tsk, t) {
>> +		if (unlikely(t->ptrace) && t != tsk->group_leader)
>> +			sig->unsafe_execve_in_progress = true;
>> +	}
>> +
>> +	if (unlikely(sig->unsafe_execve_in_progress)) {
>> +		spin_unlock_irq(lock);
>> +		mutex_unlock(&sig->cred_guard_mutex);
>> +		spin_lock_irq(lock);
>> +	}
>> +
>>  	while (sig->notify_count) {
>>  		__set_current_state(TASK_KILLABLE);
>>  		spin_unlock_irq(lock);
>> @@ -1072,6 +1085,17 @@ static int de_thread(struct task_struct *tsk)
>>  	}
>>  	spin_unlock_irq(lock);
>>  
>> +	if (unlikely(sig->unsafe_execve_in_progress)) {
>> +		if (mutex_lock_killable(&sig->cred_guard_mutex))
>> +			goto killed;
>> +		sig->unsafe_execve_in_progress = false;
>> +		if (!prev_ptrace && tsk->ptrace) {
>> +			sig->group_exit_task = NULL;
>> +			sig->notify_count = 0;
>> +			return -ERESTARTSYS;
>> +		}
>> +	}
>> +
>>  	/*
>>  	 * At this point all other threads have exited, all we have to
>>  	 * do is to wait for the thread group leader to become inactive,
>> @@ -1255,8 +1279,11 @@ int begin_new_exec(struct linux_binprm * bprm)
>>  	 * Make this the only thread in the thread group.
>>  	 */
>>  	retval = de_thread(me);
>> -	if (retval)
>> +	if (retval) {
>> +		if (retval == -ERESTARTSYS)
>> +			bprm->point_of_no_return = false;
>>  		goto out;
>> +	}
>>  
>>  	/*
>>  	 * Cancel any io_uring activity across execve
>> @@ -1466,6 +1493,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
>>  	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
>>  		return -ERESTARTNOINTR;
>>  
>> +	if (unlikely(current->signal->unsafe_execve_in_progress)) {
>> +		mutex_unlock(&current->signal->cred_guard_mutex);
>> +		return -ERESTARTNOINTR;
>> +	}
>> +
>>  	bprm->cred = prepare_exec_creds();
>>  	if (likely(bprm->cred))
>>  		return 0;
>> @@ -1482,7 +1514,8 @@ static void free_bprm(struct linux_binprm *bprm)
>>  	}
>>  	free_arg_pages(bprm);
>>  	if (bprm->cred) {
>> -		mutex_unlock(&current->signal->cred_guard_mutex);
>> +		if (!current->signal->unsafe_execve_in_progress)
>> +			mutex_unlock(&current->signal->cred_guard_mutex);
>>  		abort_creds(bprm->cred);
>>  	}
>>  	if (bprm->file) {
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index 3851bfc..3b2a55c 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -2739,6 +2739,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
>>  	if (rv < 0)
>>  		goto out_free;
>>  
>> +	if (unlikely(current->signal->unsafe_execve_in_progress)) {
>> +		mutex_unlock(&current->signal->cred_guard_mutex);
>> +		rv = -ERESTARTNOINTR;
>> +		goto out_free;
>> +	}
>> +
>>  	rv = security_setprocattr(PROC_I(inode)->op.lsm,
>>  				  file->f_path.dentry->d_name.name, page,
>>  				  count);
>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>> index 3f6a0fc..220a083 100644
>> --- a/include/linux/sched/signal.h
>> +++ b/include/linux/sched/signal.h
>> @@ -214,6 +214,17 @@ struct signal_struct {
>>  #endif
>>  
>>  	/*
>> +	 * Set while execve is executing but is *not* holding
>> +	 * cred_guard_mutex to avoid possible dead-locks.
>> +	 * The cred_guard_mutex is released *after* de_thread() has
>> +	 * called zap_other_threads(), therefore a fatal signal is
>> +	 * guaranteed to be already pending in the unlikely event, that
>> +	 * current->signal->unsafe_execve_in_progress happens to be
>> +	 * true after the cred_guard_mutex was acquired.
>> +	 */
>> +	bool unsafe_execve_in_progress;
>> +
>> +	/*
>>  	 * Thread is the potential origin of an oom condition; kill first on
>>  	 * oom
>>  	 */
>> @@ -227,6 +238,8 @@ struct signal_struct {
>>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>>  					 * credential calculations
>>  					 * (notably. ptrace)
>> +					 * Held while execve runs, except when
>> +					 * a sibling thread is being traced.
>>  					 * Deprecated do not use in new code.
>>  					 * Use exec_update_lock instead.
>>  					 */
>> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
>> index 61db50f..0cbc1eb 100644
>> --- a/kernel/ptrace.c
>> +++ b/kernel/ptrace.c
>> @@ -468,6 +468,14 @@ static int ptrace_traceme(void)
>>  {
>>  	int ret = -EPERM;
>>  
>> +	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
>> +		return -ERESTARTNOINTR;
>> +
>> +	if (unlikely(current->signal->unsafe_execve_in_progress)) {
>> +		mutex_unlock(&current->signal->cred_guard_mutex);
>> +		return -ERESTARTNOINTR;
>> +	}
>> +
>>  	write_lock_irq(&tasklist_lock);
>>  	/* Are we already being traced? */
>>  	if (!current->ptrace) {
>> @@ -483,6 +491,7 @@ static int ptrace_traceme(void)
>>  		}
>>  	}
>>  	write_unlock_irq(&tasklist_lock);
>> +	mutex_unlock(&current->signal->cred_guard_mutex);
>>  
>>  	return ret;
>>  }
>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index 1d60fc2..b1389ee 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -1824,9 +1824,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
>>  	 * Make sure we cannot change seccomp or nnp state via TSYNC
>>  	 * while another thread is in the middle of calling exec.
>>  	 */
>> -	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
>> -	    mutex_lock_killable(&current->signal->cred_guard_mutex))
>> -		goto out_put_fd;
>> +	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
>> +		if (mutex_lock_killable(&current->signal->cred_guard_mutex))
>> +			goto out_put_fd;
>> +
>> +		if (unlikely(current->signal->unsafe_execve_in_progress)) {
>> +			mutex_unlock(&current->signal->cred_guard_mutex);
>> +			goto out_put_fd;
>> +		}
>> +	}
>>  
>>  	spin_lock_irq(&current->sighand->siglock);
>>  
>> diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
>> index 4db327b..c7c2242 100644
>> --- a/tools/testing/selftests/ptrace/vmaccess.c
>> +++ b/tools/testing/selftests/ptrace/vmaccess.c
>> @@ -39,8 +39,15 @@ static void *thread(void *arg)
>>  	f = open(mm, O_RDONLY);
>>  	ASSERT_GE(f, 0);
>>  	close(f);
>> -	f = kill(pid, SIGCONT);
>> -	ASSERT_EQ(f, 0);
>> +	f = waitpid(-1, NULL, 0);
>> +	ASSERT_NE(f, -1);
>> +	ASSERT_NE(f, 0);
>> +	ASSERT_NE(f, pid);
>> +	f = waitpid(-1, NULL, 0);
>> +	ASSERT_EQ(f, pid);
>> +	f = waitpid(-1, NULL, 0);
>> +	ASSERT_EQ(f, -1);
>> +	ASSERT_EQ(errno, ECHILD);
>>  }
>>  
>>  TEST(attach)
>> @@ -57,22 +64,24 @@ static void *thread(void *arg)
>>  
>>  	sleep(1);
>>  	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
>> -	ASSERT_EQ(errno, EAGAIN);
>> -	ASSERT_EQ(k, -1);
>> +	ASSERT_EQ(k, 0);
>>  	k = waitpid(-1, &s, WNOHANG);
>>  	ASSERT_NE(k, -1);
>>  	ASSERT_NE(k, 0);
>>  	ASSERT_NE(k, pid);
>>  	ASSERT_EQ(WIFEXITED(s), 1);
>>  	ASSERT_EQ(WEXITSTATUS(s), 0);
>> -	sleep(1);
>> -	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
>> -	ASSERT_EQ(k, 0);
>>  	k = waitpid(-1, &s, 0);
>>  	ASSERT_EQ(k, pid);
>>  	ASSERT_EQ(WIFSTOPPED(s), 1);
>>  	ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
>> -	k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
>> +	k = ptrace(PTRACE_CONT, pid, 0L, 0L);
>> +	ASSERT_EQ(k, 0);
>> +	k = waitpid(-1, &s, 0);
>> +	ASSERT_EQ(k, pid);
>> +	ASSERT_EQ(WIFSTOPPED(s), 1);
>> +	ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
>> +	k = ptrace(PTRACE_CONT, pid, 0L, 0L);
>>  	ASSERT_EQ(k, 0);
>>  	k = waitpid(-1, &s, 0);
>>  	ASSERT_EQ(k, pid);
Bernd Edlinger June 16, 2021, 9:31 p.m. UTC | #6
On 6/15/21 4:26 PM, Bernd Edlinger wrote:
> Thanks for your review.
> 
> On 6/14/21 6:42 PM, Eric W. Biederman wrote:
>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>
>>> This introduces signal->unsafe_execve_in_progress,
>>> which is used to fix the case when at least one of the
>>> sibling threads is traced, and therefore the trace
>>> process may dead-lock in ptrace_attach, but de_thread
>>> will need to wait for the tracer to continue execution.
>>
>> Userspace processes hang waiting for each other.  Not a proper kernel
>> deadlock.  Annoying but not horrible.  Definitely worth fixing if we can.
>>
> 
> I wonder if I used the wrong term in the title.
> Do you have a suggestion for better wording?
> 
>>> The solution is to detect this situation and allow
>>> ptrace_attach to continue, while de_thread() is still
>>> waiting for traced zombies to be eventually released.
>>> When the current thread changed the ptrace status from
>>> non-traced to traced, we can simply abort the whole
>>> execve and restart it by returning -ERESTARTSYS.
>>> This needs to be done before changing the thread leader,
>>> because the PTRACE_EVENT_EXEC needs to know the old
>>> thread pid.
>>
>> Except you are not detecting this situation.  Testing for t->ptrace
>> finds tasks that have completed their ptrace attach and no longer need
>> the cred_guard_mutex.
>>
> 
> The first phase of de_thread needs co-operation from a user task,
> if and only if any task t except the thread leader has t->ptrace.
> Taking tasks from RUNNING->EXIT_ZOMBIE only needs co-operation from kernel code,


Aehm, sorry, that is not correct, what I said here.

I totally overlooked ptrace(PTRACE_SEIZE, pid, 0L, PTRACE_O_TRACEEXIT)

and unfortunately this also prevents even the thread leader from entering the
EXIT_ZOMBIE state because do_exit does:

        ptrace_event(PTRACE_EVENT_EXIT, code);

unfortunately this sends an event to the tracer, and waits not only for
the tracer to call waitpid, but also needs a PTRACE_CONT before do_exit
can call exit_notify which does tsk->exit_state = EXIT_ZOMBIE.

So unfortunately this breaks my patch, so I have to withdraw it for now,
since I see no way to fix it.

I will clean-up my previous patch which changes the ptrace API to return
an error if an unsafe execve is detected, and send it to this list.


Thanks
Bernd.
Bernd Edlinger June 22, 2021, 5:10 a.m. UTC | #7
On 6/16/21 11:31 PM, Bernd Edlinger wrote:
> On 6/15/21 4:26 PM, Bernd Edlinger wrote:
>> The first phase of de_thread needs co-operation from a user task,
>> if and only if any task t except the thread leader has t->ptrace.
>> Taking tasks from RUNNING->EXIT_ZOMBIE only needs co-operation from kernel code,
> 
> 
> Aehm, sorry, that is not correct, what I said here.
> 
> I totally overlooked ptrace(PTRACE_SEIZE, pid, 0L, PTRACE_O_TRACEEXIT)
> 
> and unfortunately this also prevents even the thread leader to enter the
> EXIT_ZOMBIE state because do_exit does:
> 
>         ptrace_event(PTRACE_EVENT_EXIT, code);
> 
> unfortunately this sends an event to the tracer, and waits not only for
> the tracer to call waitpid, but also needs a PTRACE_CONT before do_exit
> can call exit_notify which does tsk->exit_state = EXIT_ZOMBIE.
> 

P.S:

I think there is something really odd in ptrace_stop().

If it is intentional (which I believe to be the case) to wait here after a
SIGKILL until the process enters the exit_state == EXIT_ZOMBIE, then aborting the
pending ptrace_stop() via sigkill_pending() is questionable, especially because
arch_ptrace_stop_needed() is defined as (0) in most architectures, only sparc and
ia64 do something here.

static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info)
        __releases(&current->sighand->siglock)
        __acquires(&current->sighand->siglock)
{
        bool gstop_done = false;

        if (arch_ptrace_stop_needed(exit_code, info)) {
                /*
                 * The arch code has something special to do before a
                 * ptrace stop.  This is allowed to block, e.g. for faults
                 * on user stack pages.  We can't keep the siglock while
                 * calling arch_ptrace_stop, so we must release it now.
                 * To preserve proper semantics, we must do this before
                 * any signal bookkeeping like checking group_stop_count.
                 * Meanwhile, a SIGKILL could come in before we retake the
                 * siglock.  That must prevent us from sleeping in TASK_TRACED.
                 * So after regaining the lock, we must check for SIGKILL.
                 */
                spin_unlock_irq(&current->sighand->siglock);
                arch_ptrace_stop(exit_code, info);
                spin_lock_irq(&current->sighand->siglock);
                if (sigkill_pending(current))
                        return;
        }

        set_special_state(TASK_TRACED);

After this point there is no sigkill_pending() or fatal_signal_pending(), just
a single freezable_schedule() which explains why this can even wait with a fatal
signal pending.  But if the code executes the if block above, the sigkill can
only be ignored if it happens immediately before the set_special_state(TASK_TRACED).

What do you think?


Bernd.
diff mbox series

Patch

diff --git a/fs/exec.c b/fs/exec.c
index 8344fba..c7b1926 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1040,6 +1040,8 @@  static int de_thread(struct task_struct *tsk)
 	struct signal_struct *sig = tsk->signal;
 	struct sighand_struct *oldsighand = tsk->sighand;
 	spinlock_t *lock = &oldsighand->siglock;
+	unsigned int prev_ptrace = tsk->ptrace;
+	struct task_struct *t = tsk;
 
 	if (thread_group_empty(tsk))
 		goto no_thread_group;
@@ -1062,6 +1064,17 @@  static int de_thread(struct task_struct *tsk)
 	if (!thread_group_leader(tsk))
 		sig->notify_count--;
 
+	while_each_thread(tsk, t) {
+		if (unlikely(t->ptrace) && t != tsk->group_leader)
+			sig->unsafe_execve_in_progress = true;
+	}
+
+	if (unlikely(sig->unsafe_execve_in_progress)) {
+		spin_unlock_irq(lock);
+		mutex_unlock(&sig->cred_guard_mutex);
+		spin_lock_irq(lock);
+	}
+
 	while (sig->notify_count) {
 		__set_current_state(TASK_KILLABLE);
 		spin_unlock_irq(lock);
@@ -1072,6 +1085,17 @@  static int de_thread(struct task_struct *tsk)
 	}
 	spin_unlock_irq(lock);
 
+	if (unlikely(sig->unsafe_execve_in_progress)) {
+		if (mutex_lock_killable(&sig->cred_guard_mutex))
+			goto killed;
+		sig->unsafe_execve_in_progress = false;
+		if (!prev_ptrace && tsk->ptrace) {
+			sig->group_exit_task = NULL;
+			sig->notify_count = 0;
+			return -ERESTARTSYS;
+		}
+	}
+
 	/*
 	 * At this point all other threads have exited, all we have to
 	 * do is to wait for the thread group leader to become inactive,
@@ -1255,8 +1279,11 @@  int begin_new_exec(struct linux_binprm * bprm)
 	 * Make this the only thread in the thread group.
 	 */
 	retval = de_thread(me);
-	if (retval)
+	if (retval) {
+		if (retval == -ERESTARTSYS)
+			bprm->point_of_no_return = false;
 		goto out;
+	}
 
 	/*
 	 * Cancel any io_uring activity across execve
@@ -1466,6 +1493,11 @@  static int prepare_bprm_creds(struct linux_binprm *bprm)
 	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
 		return -ERESTARTNOINTR;
 
+	if (unlikely(current->signal->unsafe_execve_in_progress)) {
+		mutex_unlock(&current->signal->cred_guard_mutex);
+		return -ERESTARTNOINTR;
+	}
+
 	bprm->cred = prepare_exec_creds();
 	if (likely(bprm->cred))
 		return 0;
@@ -1482,7 +1514,8 @@  static void free_bprm(struct linux_binprm *bprm)
 	}
 	free_arg_pages(bprm);
 	if (bprm->cred) {
-		mutex_unlock(&current->signal->cred_guard_mutex);
+		if (!current->signal->unsafe_execve_in_progress)
+			mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
 	if (bprm->file) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3851bfc..3b2a55c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2739,6 +2739,12 @@  static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	if (rv < 0)
 		goto out_free;
 
+	if (unlikely(current->signal->unsafe_execve_in_progress)) {
+		mutex_unlock(&current->signal->cred_guard_mutex);
+		rv = -ERESTARTNOINTR;
+		goto out_free;
+	}
+
 	rv = security_setprocattr(PROC_I(inode)->op.lsm,
 				  file->f_path.dentry->d_name.name, page,
 				  count);
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3f6a0fc..220a083 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -214,6 +214,17 @@  struct signal_struct {
 #endif
 
 	/*
+	 * Set while execve is executing but is *not* holding
+	 * cred_guard_mutex to avoid possible dead-locks.
+	 * The cred_guard_mutex is released *after* de_thread() has
+	 * called zap_other_threads(), therefore a fatal signal is
+	 * guaranteed to be already pending in the unlikely event, that
+	 * current->signal->unsafe_execve_in_progress happens to be
+	 * true after the cred_guard_mutex was acquired.
+	 */
+	bool unsafe_execve_in_progress;
+
+	/*
 	 * Thread is the potential origin of an oom condition; kill first on
 	 * oom
 	 */
@@ -227,6 +238,8 @@  struct signal_struct {
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
 					 * (notably. ptrace)
+					 * Held while execve runs, except when
+					 * a sibling thread is being traced.
 					 * Deprecated do not use in new code.
 					 * Use exec_update_lock instead.
 					 */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61db50f..0cbc1eb 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -468,6 +468,14 @@  static int ptrace_traceme(void)
 {
 	int ret = -EPERM;
 
+	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
+		return -ERESTARTNOINTR;
+
+	if (unlikely(current->signal->unsafe_execve_in_progress)) {
+		mutex_unlock(&current->signal->cred_guard_mutex);
+		return -ERESTARTNOINTR;
+	}
+
 	write_lock_irq(&tasklist_lock);
 	/* Are we already being traced? */
 	if (!current->ptrace) {
@@ -483,6 +491,7 @@  static int ptrace_traceme(void)
 		}
 	}
 	write_unlock_irq(&tasklist_lock);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 
 	return ret;
 }
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 1d60fc2..b1389ee 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1824,9 +1824,15 @@  static long seccomp_set_mode_filter(unsigned int flags,
 	 * Make sure we cannot change seccomp or nnp state via TSYNC
 	 * while another thread is in the middle of calling exec.
 	 */
-	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
-	    mutex_lock_killable(&current->signal->cred_guard_mutex))
-		goto out_put_fd;
+	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+		if (mutex_lock_killable(&current->signal->cred_guard_mutex))
+			goto out_put_fd;
+
+		if (unlikely(current->signal->unsafe_execve_in_progress)) {
+			mutex_unlock(&current->signal->cred_guard_mutex);
+			goto out_put_fd;
+		}
+	}
 
 	spin_lock_irq(&current->sighand->siglock);
 
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
index 4db327b..c7c2242 100644
--- a/tools/testing/selftests/ptrace/vmaccess.c
+++ b/tools/testing/selftests/ptrace/vmaccess.c
@@ -39,8 +39,15 @@  static void *thread(void *arg)
 	f = open(mm, O_RDONLY);
 	ASSERT_GE(f, 0);
 	close(f);
-	f = kill(pid, SIGCONT);
-	ASSERT_EQ(f, 0);
+	f = waitpid(-1, NULL, 0);
+	ASSERT_NE(f, -1);
+	ASSERT_NE(f, 0);
+	ASSERT_NE(f, pid);
+	f = waitpid(-1, NULL, 0);
+	ASSERT_EQ(f, pid);
+	f = waitpid(-1, NULL, 0);
+	ASSERT_EQ(f, -1);
+	ASSERT_EQ(errno, ECHILD);
 }
 
 TEST(attach)
@@ -57,22 +64,24 @@  static void *thread(void *arg)
 
 	sleep(1);
 	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
-	ASSERT_EQ(errno, EAGAIN);
-	ASSERT_EQ(k, -1);
+	ASSERT_EQ(k, 0);
 	k = waitpid(-1, &s, WNOHANG);
 	ASSERT_NE(k, -1);
 	ASSERT_NE(k, 0);
 	ASSERT_NE(k, pid);
 	ASSERT_EQ(WIFEXITED(s), 1);
 	ASSERT_EQ(WEXITSTATUS(s), 0);
-	sleep(1);
-	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
-	ASSERT_EQ(k, 0);
 	k = waitpid(-1, &s, 0);
 	ASSERT_EQ(k, pid);
 	ASSERT_EQ(WIFSTOPPED(s), 1);
 	ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
-	k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
+	k = ptrace(PTRACE_CONT, pid, 0L, 0L);
+	ASSERT_EQ(k, 0);
+	k = waitpid(-1, &s, 0);
+	ASSERT_EQ(k, pid);
+	ASSERT_EQ(WIFSTOPPED(s), 1);
+	ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
+	k = ptrace(PTRACE_CONT, pid, 0L, 0L);
 	ASSERT_EQ(k, 0);
 	k = waitpid(-1, &s, 0);
 	ASSERT_EQ(k, pid);