@@ -1040,6 +1040,7 @@ static int de_thread(struct task_struct *tsk)
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
+ struct task_struct *t = tsk;
if (thread_group_empty(tsk))
goto no_thread_group;
@@ -1062,6 +1063,17 @@ static int de_thread(struct task_struct *tsk)
if (!thread_group_leader(tsk))
sig->notify_count--;
+ while_each_thread(tsk, t) {
+ if (unlikely(t->ptrace))
+ sig->unsafe_execve_in_progress = true;
+ }
+
+ if (unlikely(sig->unsafe_execve_in_progress)) {
+ spin_unlock_irq(lock);
+ mutex_unlock(&sig->cred_guard_mutex);
+ spin_lock_irq(lock);
+ }
+
while (sig->notify_count) {
__set_current_state(TASK_KILLABLE);
spin_unlock_irq(lock);
@@ -1152,6 +1164,12 @@ static int de_thread(struct task_struct *tsk)
release_task(leader);
}
+ if (unlikely(sig->unsafe_execve_in_progress)) {
+ if (mutex_lock_killable(&sig->cred_guard_mutex))
+ goto killed;
+ sig->unsafe_execve_in_progress = false;
+ }
+
sig->group_exit_task = NULL;
sig->notify_count = 0;
@@ -1466,6 +1484,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
+ if (unlikely(current->signal->unsafe_execve_in_progress)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ return -ERESTARTNOINTR;
+ }
+
bprm->cred = prepare_exec_creds();
if (likely(bprm->cred))
return 0;
@@ -1482,7 +1505,8 @@ static void free_bprm(struct linux_binprm *bprm)
}
free_arg_pages(bprm);
if (bprm->cred) {
- mutex_unlock(¤t->signal->cred_guard_mutex);
+ if (!current->signal->unsafe_execve_in_progress)
+ mutex_unlock(¤t->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
if (bprm->file) {
@@ -2739,6 +2739,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
+ if (unlikely(current->signal->unsafe_execve_in_progress)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ rv = -ERESTARTNOINTR;
+ goto out_free;
+ }
+
rv = security_setprocattr(PROC_I(inode)->op.lsm,
file->f_path.dentry->d_name.name, page,
count);
@@ -214,6 +214,17 @@ struct signal_struct {
#endif
/*
+ * Set while execve is executing but is *not* holding
+ * cred_guard_mutex to avoid possible dead-locks.
+ * The cred_guard_mutex is released *after* de_thread() has
+ * called zap_other_threads(), therefore a fatal signal is
+ * guaranteed to be already pending in the unlikely event, that
+ * current->signal->unsafe_execve_in_progress happens to be
+ * true after the cred_guard_mutex was acquired.
+ */
+ bool unsafe_execve_in_progress;
+
+ /*
* Thread is the potential origin of an oom condition; kill first on
* oom
*/
@@ -227,6 +238,8 @@ struct signal_struct {
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace)
+ * Held while execve runs, except when
+ * a sibling thread is being traced.
* Deprecated do not use in new code.
* Use exec_update_lock instead.
*/
@@ -402,6 +402,21 @@ static int ptrace_attach(struct task_struct *task, long request,
if (task->ptrace)
goto unlock_tasklist;
+ /*
+ * It may happen that de_thread() has to release the
+ * cred_guard_mutex in order to prevent deadlocks.
+ * In that case unsafe_execve_in_progress will be set.
+ * If that happens you cannot assume that the usual
+ * guarantees implied by cred_guard_mutex are valid.
+ * Just return -EAGAIN in that case.
+ * The tracer is expected to call wait(2) and handle
+ * possible events before calling this API again.
+ */
+ retval = -EAGAIN;
+ if (unlikely(task->signal->unsafe_execve_in_progress) &&
+ task->in_execve)
+ goto unlock_tasklist;
+
if (seize)
flags |= PT_SEIZED;
task->ptrace = flags;
@@ -468,6 +483,14 @@ static int ptrace_traceme(void)
{
int ret = -EPERM;
+ if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
+ return -ERESTARTNOINTR;
+
+ if (unlikely(current->signal->unsafe_execve_in_progress)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ return -ERESTARTNOINTR;
+ }
+
write_lock_irq(&tasklist_lock);
/* Are we already being traced? */
if (!current->ptrace) {
@@ -483,6 +506,7 @@ static int ptrace_traceme(void)
}
}
write_unlock_irq(&tasklist_lock);
+ mutex_unlock(¤t->signal->cred_guard_mutex);
return ret;
}
@@ -1824,9 +1824,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
* Make sure we cannot change seccomp or nnp state via TSYNC
* while another thread is in the middle of calling exec.
*/
- if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
- mutex_lock_killable(¤t->signal->cred_guard_mutex))
- goto out_put_fd;
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+ if (mutex_lock_killable(¤t->signal->cred_guard_mutex))
+ goto out_put_fd;
+
+ if (unlikely(current->signal->unsafe_execve_in_progress)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ goto out_put_fd;
+ }
+ }
spin_lock_irq(¤t->sighand->siglock);
This introduces signal->unsafe_execve_in_progress, which is used to fix the case when at least one of the sibling threads is traced, and therefore the trace process may dead-lock in ptrace_attach, but de_thread will need to wait for the tracer to continue execution. All threads that are traced with PTRACE_O_TRACEEXIT send a PTRACE_EVENT_EXIT to the tracer, and wait for the tracer to do a PTRACE_CONT before reaching the ZOMBIE state. This wait state is not interrupted by zap_other_threads. All threads except the thread leader do also wait for the tracer to receive the exit signal with waitpid before de_thread can continue. When the cred_guard_mutex is held for so long time, any attempt to ptrace_attach this thread will block the tracer. The solution is to detect this situation and make ptrace_attach return -EAGAIN. Do that only after all other checks have been done, and only for the thread that does the execve, other threads can be allowed to proceed, since that has not influence the credentials of the new process. This means this is an API change, but only when some threads are being traced while execve happens in a non-traced thread. See tools/testing/selftests/ptrace/vmaccess.c for a test case that gets fixed by this change. Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de> --- fs/exec.c | 26 +++++++++++++++++++++++++- fs/proc/base.c | 6 ++++++ include/linux/sched/signal.h | 13 +++++++++++++ kernel/ptrace.c | 24 ++++++++++++++++++++++++ kernel/seccomp.c | 12 +++++++++--- 5 files changed, 77 insertions(+), 4 deletions(-) v10: Changes to previous version, make the PTRACE_ATTACH retun -EAGAIN, instead of execve return -ERESTARTSYS. Added some lessions learned to the description.