diff mbox series

[PATCHv2] exec: Fix a deadlock in ptrace

Message ID AM6PR03MB5170EB4427BF5C67EE98FF09E4E60@AM6PR03MB5170.eurprd03.prod.outlook.com (mailing list archive)
State New, archived
Headers show
Series [PATCHv2] exec: Fix a deadlock in ptrace | expand

Commit Message

Bernd Edlinger March 1, 2020, 8:34 p.m. UTC
This fixes a deadlock in the tracer when tracing a multi-threaded
application that calls execve while more than one thread are running.

I observed that when running strace on the gcc test suite, it always
blocks after a while, when expect calls execve, because other threads
have to be terminated.  They send ptrace events, but the strace is no
longer able to respond, since it is blocked in vm_access.

The deadlock is always happening when strace needs to access the
tracees process mmap, while another thread in the tracee starts to
execve a child process, but that cannot continue until the
PTRACE_EVENT_EXIT is handled and the WIFEXITED event is received:

strace          D    0 30614  30584 0x00000000
Call Trace:
__schedule+0x3ce/0x6e0
schedule+0x5c/0xd0
schedule_preempt_disabled+0x15/0x20
__mutex_lock.isra.13+0x1ec/0x520
__mutex_lock_killable_slowpath+0x13/0x20
mutex_lock_killable+0x28/0x30
mm_access+0x27/0xa0
process_vm_rw_core.isra.3+0xff/0x550
process_vm_rw+0xdd/0xf0
__x64_sys_process_vm_readv+0x31/0x40
do_syscall_64+0x64/0x220
entry_SYSCALL_64_after_hwframe+0x44/0xa9

expect          D    0 31933  30876 0x80004003
Call Trace:
__schedule+0x3ce/0x6e0
schedule+0x5c/0xd0
flush_old_exec+0xc4/0x770
load_elf_binary+0x35a/0x16c0
search_binary_handler+0x97/0x1d0
__do_execve_file.isra.40+0x5d4/0x8a0
__x64_sys_execve+0x49/0x60
do_syscall_64+0x64/0x220
entry_SYSCALL_64_after_hwframe+0x44/0xa9

The proposed solution is to have a second mutex that is
used in mm_access, so it is allowed to continue while the
dying threads are not yet terminated.

I also took the opportunity to improve the documentation
of prepare_creds, which is obviously out of sync.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
---
 Documentation/security/credentials.rst    | 18 ++++++------
 fs/exec.c                                 |  9 ++++++
 include/linux/binfmts.h                   |  6 +++-
 include/linux/sched/signal.h              |  1 +
 init/init_task.c                          |  1 +
 kernel/cred.c                             |  2 +-
 kernel/fork.c                             |  5 ++--
 mm/process_vm_access.c                    |  2 +-
 tools/testing/selftests/ptrace/Makefile   |  4 +--
 tools/testing/selftests/ptrace/vmaccess.c | 46 +++++++++++++++++++++++++++++++
 10 files changed, 79 insertions(+), 15 deletions(-)
 create mode 100644 tools/testing/selftests/ptrace/vmaccess.c

v2: adds a test case which passes when this patch is applied.

Comments

Eric W. Biederman March 2, 2020, 6:38 a.m. UTC | #1
Bernd Edlinger <bernd.edlinger@hotmail.de> writes:

> This fixes a deadlock in the tracer when tracing a multi-threaded
> application that calls execve while more than one thread are running.
>
> I observed that when running strace on the gcc test suite, it always
> blocks after a while, when expect calls execve, because other threads
> have to be terminated.  They send ptrace events, but the strace is no
> longer able to respond, since it is blocked in vm_access.
>
> The deadlock is always happening when strace needs to access the
> tracees process mmap, while another thread in the tracee starts to
> execve a child process, but that cannot continue until the
> PTRACE_EVENT_EXIT is handled and the WIFEXITED event is received:

I think your patch works, but I don't think to solve your case another
mutex is necessary.  Possibly it is justified, but I hesitate to
introduce yet another concept in the code.

Having read elsewhere in the thread that this does not solve the problem
Oleg has mentioned I am really hesitant to add more complexity to the
situation.


For your case there is a straight forward and local workaround.

When the current task is ptracing the target task don't bother with
cred_gaurd_mutex and ptrace_may_access in access_mm as those tests
have already passed.  Instead just confirm the ptrace status. AKA
the permission check in ptraces_access_vm.

I think something like this is all we need.

diff --git a/kernel/fork.c b/kernel/fork.c
index cee89229606a..b0ab98c84589 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1224,6 +1224,16 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
 	struct mm_struct *mm;
 	int err;
 
+	if (task->ptrace && (current == task->parent)) {
+		mm = get_task_mm(task);
+		if ((get_dumpable(mm) != SUID_DUMP_USER) &&
+		    !ptracer_capable(task, mm->user_ns)) {
+			mmput(mm);
+			mm = ERR_PTR(-EACCESS);
+		}
+		return mm;
+	}
+
 	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
 	if (err)
 		return ERR_PTR(err);

Does this solve your test case?

The patch above is short the approriate locking for the ptrace attached
check.  (tasklist_lock I think).  But is enough to illustrate the idea,
and it is probably a check we want in any event so that if the tracer
starts dropping privileges process_vm_readv and process_vm_writev will
still be usable by the tracer.

Eric


> strace          D    0 30614  30584 0x00000000
> Call Trace:
> __schedule+0x3ce/0x6e0
> schedule+0x5c/0xd0
> schedule_preempt_disabled+0x15/0x20
> __mutex_lock.isra.13+0x1ec/0x520
> __mutex_lock_killable_slowpath+0x13/0x20
> mutex_lock_killable+0x28/0x30
> mm_access+0x27/0xa0
> process_vm_rw_core.isra.3+0xff/0x550
> process_vm_rw+0xdd/0xf0
> __x64_sys_process_vm_readv+0x31/0x40
> do_syscall_64+0x64/0x220
> entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> expect          D    0 31933  30876 0x80004003
> Call Trace:
> __schedule+0x3ce/0x6e0
> schedule+0x5c/0xd0
> flush_old_exec+0xc4/0x770
> load_elf_binary+0x35a/0x16c0
> search_binary_handler+0x97/0x1d0
> __do_execve_file.isra.40+0x5d4/0x8a0
> __x64_sys_execve+0x49/0x60
> do_syscall_64+0x64/0x220
> entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> The proposed solution is to have a second mutex that is
> used in mm_access, so it is allowed to continue while the
> dying threads are not yet terminated.
>
> I also took the opportunity to improve the documentation
> of prepare_creds, which is obviously out of sync.
>
> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
> ---
>  Documentation/security/credentials.rst    | 18 ++++++------
>  fs/exec.c                                 |  9 ++++++
>  include/linux/binfmts.h                   |  6 +++-
>  include/linux/sched/signal.h              |  1 +
>  init/init_task.c                          |  1 +
>  kernel/cred.c                             |  2 +-
>  kernel/fork.c                             |  5 ++--
>  mm/process_vm_access.c                    |  2 +-
>  tools/testing/selftests/ptrace/Makefile   |  4 +--
>  tools/testing/selftests/ptrace/vmaccess.c | 46 +++++++++++++++++++++++++++++++
>  10 files changed, 79 insertions(+), 15 deletions(-)
>  create mode 100644 tools/testing/selftests/ptrace/vmaccess.c
>
> v2: adds a test case which passes when this patch is applied.
>
>
> diff --git a/Documentation/security/credentials.rst b/Documentation/security/credentials.rst
> index 282e79f..c98e0a8 100644
> --- a/Documentation/security/credentials.rst
> +++ b/Documentation/security/credentials.rst
> @@ -437,9 +437,13 @@ new set of credentials by calling::
>  
>  	struct cred *prepare_creds(void);
>  
> -this locks current->cred_replace_mutex and then allocates and constructs a
> -duplicate of the current process's credentials, returning with the mutex still
> -held if successful.  It returns NULL if not successful (out of memory).
> +this allocates and constructs a duplicate of the current process's credentials.
> +It returns NULL if not successful (out of memory).
> +
> +If called from __do_execve_file, the mutex current->signal->cred_guard_mutex
> +is acquired before this function gets called, and the mutex
> +current->signal->cred_change_mutex is acquired later, while the credentials
> +and the process mmap are actually changed.
>  
>  The mutex prevents ``ptrace()`` from altering the ptrace state of a process
>  while security checks on credentials construction and changing is taking place
> @@ -466,9 +470,8 @@ by calling::
>  
>  This will alter various aspects of the credentials and the process, giving the
>  LSM a chance to do likewise, then it will use ``rcu_assign_pointer()`` to
> -actually commit the new credentials to ``current->cred``, it will release
> -``current->cred_replace_mutex`` to allow ``ptrace()`` to take place, and it
> -will notify the scheduler and others of the changes.
> +actually commit the new credentials to ``current->cred``, and it will notify
> +the scheduler and others of the changes.
>  
>  This function is guaranteed to return 0, so that it can be tail-called at the
>  end of such functions as ``sys_setresuid()``.
> @@ -486,8 +489,7 @@ invoked::
>  
>  	void abort_creds(struct cred *new);
>  
> -This releases the lock on ``current->cred_replace_mutex`` that
> -``prepare_creds()`` got and then releases the new credentials.
> +This releases the new credentials.
>  
>  
>  A typical credentials alteration function would look something like this::
> diff --git a/fs/exec.c b/fs/exec.c
> index 74d88da..a6884e4 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1266,6 +1266,12 @@ int flush_old_exec(struct linux_binprm * bprm)
>  	if (retval)
>  		goto out;
>  
> +	retval = mutex_lock_killable(&current->signal->cred_change_mutex);
> +	if (retval)
> +		goto out;
> +
> +	bprm->called_flush_old_exec = 1;
> +
>  	/*
>  	 * Must be called _before_ exec_mmap() as bprm->mm is
>  	 * not visibile until then. This also enables the update
> @@ -1420,6 +1426,8 @@ static void free_bprm(struct linux_binprm *bprm)
>  {
>  	free_arg_pages(bprm);
>  	if (bprm->cred) {
> +		if (bprm->called_flush_old_exec)
> +			mutex_unlock(&current->signal->cred_change_mutex);
>  		mutex_unlock(&current->signal->cred_guard_mutex);
>  		abort_creds(bprm->cred);
>  	}
> @@ -1469,6 +1477,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>  	 * credentials; any time after this it may be unlocked.
>  	 */
>  	security_bprm_committed_creds(bprm);
> +	mutex_unlock(&current->signal->cred_change_mutex);
>  	mutex_unlock(&current->signal->cred_guard_mutex);
>  }
>  EXPORT_SYMBOL(install_exec_creds);
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index b40fc63..2e1318b 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -44,7 +44,11 @@ struct linux_binprm {
>  		 * exec has happened. Used to sanitize execution environment
>  		 * and to set AT_SECURE auxv for glibc.
>  		 */
> -		secureexec:1;
> +		secureexec:1,
> +		/*
> +		 * Set by flush_old_exec, when the cred_change_mutex is taken.
> +		 */
> +		called_flush_old_exec:1;
>  #ifdef __alpha__
>  	unsigned int taso:1;
>  #endif
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 8805025..37eeabe 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -225,6 +225,7 @@ struct signal_struct {
>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>  					 * credential calculations
>  					 * (notably. ptrace) */
> +	struct mutex cred_change_mutex; /* guard against credentials change */
>  } __randomize_layout;
>  
>  /*
> diff --git a/init/init_task.c b/init/init_task.c
> index 9e5cbe5..6cd9a0f 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -26,6 +26,7 @@
>  	.multiprocess	= HLIST_HEAD_INIT,
>  	.rlim		= INIT_RLIMITS,
>  	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
> +	.cred_change_mutex = __MUTEX_INITIALIZER(init_signals.cred_change_mutex),
>  #ifdef CONFIG_POSIX_TIMERS
>  	.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
>  	.cputimer	= {
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 809a985..e4c78de 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -676,7 +676,7 @@ void __init cred_init(void)
>   *
>   * Returns the new credentials or NULL if out of memory.
>   *
> - * Does not take, and does not return holding current->cred_replace_mutex.
> + * Does not take, and does not return holding ->cred_guard_mutex.
>   */
>  struct cred *prepare_kernel_cred(struct task_struct *daemon)
>  {
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 0808095..0395154 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1224,7 +1224,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
>  	struct mm_struct *mm;
>  	int err;
>  
> -	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
> +	err =  mutex_lock_killable(&task->signal->cred_change_mutex);
>  	if (err)
>  		return ERR_PTR(err);
>  
> @@ -1234,7 +1234,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
>  		mmput(mm);
>  		mm = ERR_PTR(-EACCES);
>  	}
> -	mutex_unlock(&task->signal->cred_guard_mutex);
> +	mutex_unlock(&task->signal->cred_change_mutex);
>  
>  	return mm;
>  }
> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>  
>  	mutex_init(&sig->cred_guard_mutex);
> +	mutex_init(&sig->cred_change_mutex);
>  
>  	return 0;
>  }
> diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
> index 357aa7b..b3e6eb5 100644
> --- a/mm/process_vm_access.c
> +++ b/mm/process_vm_access.c
> @@ -204,7 +204,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
>  	if (!mm || IS_ERR(mm)) {
>  		rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
>  		/*
> -		 * Explicitly map EACCES to EPERM as EPERM is a more a
> +		 * Explicitly map EACCES to EPERM as EPERM is a more
>  		 * appropriate error code for process_vw_readv/writev
>  		 */
>  		if (rc == -EACCES)
> diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile
> index c0b7f89..2f1f532 100644
> --- a/tools/testing/selftests/ptrace/Makefile
> +++ b/tools/testing/selftests/ptrace/Makefile
> @@ -1,6 +1,6 @@
>  # SPDX-License-Identifier: GPL-2.0-only
> -CFLAGS += -iquote../../../../include/uapi -Wall
> +CFLAGS += -std=c99 -pthread -iquote../../../../include/uapi -Wall
>  
> -TEST_GEN_PROGS := get_syscall_info peeksiginfo
> +TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess
>  
>  include ../lib.mk
> diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
> new file mode 100644
> index 0000000..ef08c9f
> --- /dev/null
> +++ b/tools/testing/selftests/ptrace/vmaccess.c
> @@ -0,0 +1,46 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Copyright (c) 2020 Bernd Edlinger <bernd.edlinger@hotmail.de>
> + * All rights reserved.
> + *
> + * Check whether /proc/$pid/mem can be accessed without causing deadlocks
> + * when de_thread is blocked with ->cred_guard_mutex held.
> + */
> +
> +#include "../kselftest_harness.h"
> +#include <stdio.h>
> +#include <fcntl.h>
> +#include <pthread.h>
> +#include <signal.h>
> +#include <unistd.h>
> +#include <sys/ptrace.h>
> +
> +static void *thread(void *arg)
> +{
> +	ptrace(PTRACE_TRACEME, 0, 0, 0);
> +	return NULL;
> +}
> +
> +TEST(vmaccess)
> +{
> +	int f, pid = fork();
> +	char mm[64];
> +
> +	if (!pid) {
> +		pthread_t pt;
> +		pthread_create(&pt, NULL, thread, NULL);
> +		pthread_join(pt, NULL);
> +		execlp("true", "true", NULL);
> +	}
> +
> +	sleep(1);
> +	sprintf(mm, "/proc/%d/mem", pid);
> +	f = open(mm, O_RDONLY);
> +	ASSERT_LE(0, f)
> +		close(f);
> +	/* this is not fixed! ptrace(PTRACE_ATTACH, pid, 0,0); */
> +	f = kill(pid, SIGCONT);
> +	ASSERT_EQ(0, f);
> +}
> +
> +TEST_HARNESS_MAIN
Oleg Nesterov March 2, 2020, 12:28 p.m. UTC | #2
On 03/01, Bernd Edlinger wrote:
>
> This fixes a deadlock in the tracer when tracing a multi-threaded
> application that calls execve while more than one thread are running.

Heh. Yes, known problem. See my attempt to fix it:
https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/

> @@ -1224,7 +1224,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
>  	struct mm_struct *mm;
>  	int err;
>  
> -	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
> +	err =  mutex_lock_killable(&task->signal->cred_change_mutex);

So if I understand correctly your patch doesn't fix other problems
with debugger waiting for cred_guard_mutex.

I too do not think this can justify the new mutex in signal_struct...

Oleg.
Bernd Edlinger March 2, 2020, 3:43 p.m. UTC | #3
On 3/2/20 7:38 AM, Eric W. Biederman wrote:
> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> 
>> This fixes a deadlock in the tracer when tracing a multi-threaded
>> application that calls execve while more than one thread are running.
>>
>> I observed that when running strace on the gcc test suite, it always
>> blocks after a while, when expect calls execve, because other threads
>> have to be terminated.  They send ptrace events, but the strace is no
>> longer able to respond, since it is blocked in vm_access.
>>
>> The deadlock is always happening when strace needs to access the
>> tracees process mmap, while another thread in the tracee starts to
>> execve a child process, but that cannot continue until the
>> PTRACE_EVENT_EXIT is handled and the WIFEXITED event is received:
> 
> I think your patch works, but I don't think to solve your case another
> mutex is necessary.  Possibly it is justified, but I hesitate to
> introduce yet another concept in the code.
> 
> Having read elsewhere in the thread that this does not solve the problem
> Oleg has mentioned I am really hesitant to add more complexity to the
> situation.
> 
> 
> For your case there is a straight forward and local workaround.
> 
> When the current task is ptracing the target task don't bother with
> cred_gaurd_mutex and ptrace_may_access in access_mm as those tests
> have already passed.  Instead just confirm the ptrace status. AKA
> the permission check in ptraces_access_vm.
> 
> I think something like this is all we need.
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cee89229606a..b0ab98c84589 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1224,6 +1224,16 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
>  	struct mm_struct *mm;
>  	int err;
>  
> +	if (task->ptrace && (current == task->parent)) {
> +		mm = get_task_mm(task);
> +		if ((get_dumpable(mm) != SUID_DUMP_USER) &&
> +		    !ptracer_capable(task, mm->user_ns)) {
> +			mmput(mm);
> +			mm = ERR_PTR(-EACCESS);
> +		}
> +		return mm;
> +	}
> +
>  	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
>  	if (err)
>  		return ERR_PTR(err);
> 
> Does this solve your test case?
> 

I tried this with s/EACCESS/EACCES/.

The test case in this patch is not fixed, but strace does not freeze,
at least with my setup where it did freeze repeatable.  That is
obviously because it bypasses the cred_guard_mutex.  But all other
process that access this file still freeze, and cannot be
interrupted except with kill -9.

However that smells like a denial of service, that this
simple test case which can be executed by guest, creates a /proc/$pid/mem
that freezes any process, even root, when it looks at it.
I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.


Bernd.
Bernd Edlinger March 2, 2020, 3:56 p.m. UTC | #4
On 3/2/20 1:28 PM, Oleg Nesterov wrote:
> On 03/01, Bernd Edlinger wrote:
>>
>> This fixes a deadlock in the tracer when tracing a multi-threaded
>> application that calls execve while more than one thread are running.
> 
> Heh. Yes, known problem. See my attempt to fix it:
> https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
> 
>> @@ -1224,7 +1224,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
>>  	struct mm_struct *mm;
>>  	int err;
>>  
>> -	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
>> +	err =  mutex_lock_killable(&task->signal->cred_change_mutex);
> 
> So if I understand correctly your patch doesn't fix other problems
> with debugger waiting for cred_guard_mutex.
> 

No, but I see this just as a first step.

> I too do not think this can justify the new mutex in signal_struct...
> 

I think for the vm_access the semantic of this mutex is clear, that it
prevents the credentials to change while it is held by vm_access,
and probably other places can take advantage of this mutex as well.

While on the other hand, the cred_guard_mutex is needed to avoid two
threads calling execve at the same time.  So that is needed as well.

What remains is probably making PTHREAD_ATTACH detect that the process
is currently in execve, and make that call fail in that situation.
I have not thought in depth about that problem, but it will probably
just need the right mutex to access current->in_execve.


That's at least how I see it.


Thanks
Bernd.
Eric W. Biederman March 2, 2020, 3:57 p.m. UTC | #5
Bernd Edlinger <bernd.edlinger@hotmail.de> writes:

>
> I tried this with s/EACCESS/EACCES/.
>
> The test case in this patch is not fixed, but strace does not freeze,
> at least with my setup where it did freeze repeatable.

Thanks, That is what I was aiming at.

So we have one method we can pursue to fix this in practice.

> That is
> obviously because it bypasses the cred_guard_mutex.  But all other
> process that access this file still freeze, and cannot be
> interrupted except with kill -9.
>
> However that smells like a denial of service, that this
> simple test case which can be executed by guest, creates a /proc/$pid/mem
> that freezes any process, even root, when it looks at it.
> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.

Yes.  Your the test case in your patch a variant of the original
problem.


I have been staring at this trying to understand the fundamentals of the
original deeper problem.

The current scope of cred_guard_mutex in exec is because being ptraced
causes suid exec to act differently.  So we need to know early if we are
ptraced.

If that case did not exist we could reduce the scope of the
cred_guard_mutex in exec to where your patch puts the cred_change_mutex.

I am starting to think reworking how we deal with ptrace and exec is the
way to solve this problem.

Eric
Bernd Edlinger March 2, 2020, 4:02 p.m. UTC | #6
On 3/2/20 4:57 PM, Eric W. Biederman wrote:
> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> 
>>
>> I tried this with s/EACCESS/EACCES/.
>>
>> The test case in this patch is not fixed, but strace does not freeze,
>> at least with my setup where it did freeze repeatable.
> 
> Thanks, That is what I was aiming at.
> 
> So we have one method we can pursue to fix this in practice.
> 
>> That is
>> obviously because it bypasses the cred_guard_mutex.  But all other
>> process that access this file still freeze, and cannot be
>> interrupted except with kill -9.
>>
>> However that smells like a denial of service, that this
>> simple test case which can be executed by guest, creates a /proc/$pid/mem
>> that freezes any process, even root, when it looks at it.
>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
> 
> Yes.  Your the test case in your patch a variant of the original
> problem.
> 
> 
> I have been staring at this trying to understand the fundamentals of the
> original deeper problem.
> 
> The current scope of cred_guard_mutex in exec is because being ptraced
> causes suid exec to act differently.  So we need to know early if we are
> ptraced.
> 

It has a second use, that it prevents two threads entering execve,
which would probably result in disaster.

> If that case did not exist we could reduce the scope of the
> cred_guard_mutex in exec to where your patch puts the cred_change_mutex.
> 
> I am starting to think reworking how we deal with ptrace and exec is the
> way to solve this problem.
> 
> Eric
>
Eric W. Biederman March 2, 2020, 4:17 p.m. UTC | #7
Bernd Edlinger <bernd.edlinger@hotmail.de> writes:

> On 3/2/20 4:57 PM, Eric W. Biederman wrote:
>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>> 
>>>
>>> I tried this with s/EACCESS/EACCES/.
>>>
>>> The test case in this patch is not fixed, but strace does not freeze,
>>> at least with my setup where it did freeze repeatable.
>> 
>> Thanks, That is what I was aiming at.
>> 
>> So we have one method we can pursue to fix this in practice.
>> 
>>> That is
>>> obviously because it bypasses the cred_guard_mutex.  But all other
>>> process that access this file still freeze, and cannot be
>>> interrupted except with kill -9.
>>>
>>> However that smells like a denial of service, that this
>>> simple test case which can be executed by guest, creates a /proc/$pid/mem
>>> that freezes any process, even root, when it looks at it.
>>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
>> 
>> Yes.  Your the test case in your patch a variant of the original
>> problem.
>> 
>> 
>> I have been staring at this trying to understand the fundamentals of the
>> original deeper problem.
>> 
>> The current scope of cred_guard_mutex in exec is because being ptraced
>> causes suid exec to act differently.  So we need to know early if we are
>> ptraced.
>> 
>
> It has a second use, that it prevents two threads entering execve,
> which would probably result in disaster.

Exec can fail with an error code up until de_thread.  de_thread causes
exec to fail with the error code -EAGAIN for the second thread to get
into de_thread.

So no.  The cred_guard_mutex is not needed for that case at all.

>> If that case did not exist we could reduce the scope of the
>> cred_guard_mutex in exec to where your patch puts the cred_change_mutex.
>> 
>> I am starting to think reworking how we deal with ptrace and exec is the
>> way to solve this problem.


I am 99% convinced that the fix is to move cred_guard_mutex down.

Then right after we take cred_guard_mutex do:
	if (ptraced) {
		use_original_creds();
	}

And call it a day.

The details suck but I am 99% certain that would solve everyones
problems, and not be too bad to audit either.

Eric
Jann Horn March 2, 2020, 4:43 p.m. UTC | #8
On Mon, Mar 2, 2020 at 5:19 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>
> > On 3/2/20 4:57 PM, Eric W. Biederman wrote:
> >> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> >>
> >>>
> >>> I tried this with s/EACCESS/EACCES/.
> >>>
> >>> The test case in this patch is not fixed, but strace does not freeze,
> >>> at least with my setup where it did freeze repeatable.
> >>
> >> Thanks, That is what I was aiming at.
> >>
> >> So we have one method we can pursue to fix this in practice.
> >>
> >>> That is
> >>> obviously because it bypasses the cred_guard_mutex.  But all other
> >>> process that access this file still freeze, and cannot be
> >>> interrupted except with kill -9.
> >>>
> >>> However that smells like a denial of service, that this
> >>> simple test case which can be executed by guest, creates a /proc/$pid/mem
> >>> that freezes any process, even root, when it looks at it.
> >>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
> >>
> >> Yes.  Your the test case in your patch a variant of the original
> >> problem.
> >>
> >>
> >> I have been staring at this trying to understand the fundamentals of the
> >> original deeper problem.
> >>
> >> The current scope of cred_guard_mutex in exec is because being ptraced
> >> causes suid exec to act differently.  So we need to know early if we are
> >> ptraced.
> >>
> >
> > It has a second use, that it prevents two threads entering execve,
> > which would probably result in disaster.
>
> Exec can fail with an error code up until de_thread.  de_thread causes
> exec to fail with the error code -EAGAIN for the second thread to get
> into de_thread.
>
> So no.  The cred_guard_mutex is not needed for that case at all.
>
> >> If that case did not exist we could reduce the scope of the
> >> cred_guard_mutex in exec to where your patch puts the cred_change_mutex.
> >>
> >> I am starting to think reworking how we deal with ptrace and exec is the
> >> way to solve this problem.
>
>
> I am 99% convinced that the fix is to move cred_guard_mutex down.

"move cred_guard_mutex down" as in "take it once we've already set up
the new process, past the point of no return"?

> Then right after we take cred_guard_mutex do:
>         if (ptraced) {
>                 use_original_creds();
>         }
>
> And call it a day.
>
> The details suck but I am 99% certain that would solve everyones
> problems, and not be too bad to audit either.

Ah, hmm, that sounds like it'll work fine at least when no LSMs are involved.

SELinux normally doesn't do the execution-degrading thing, it just
blocks the execution completely - see their selinux_bprm_set_creds()
hook. So I think they'd still need to set some state on the task that
says "we're currently in the middle of an execution where the target
task will run in context X", and then check against that in the
ptrace_may_access hook. Or I suppose they could just kill the task
near the end of execve, although that'd be kinda ugly.
Bernd Edlinger March 2, 2020, 5:01 p.m. UTC | #9
On 3/2/20 5:43 PM, Jann Horn wrote:
> On Mon, Mar 2, 2020 at 5:19 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>
>>> On 3/2/20 4:57 PM, Eric W. Biederman wrote:
>>>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>>>
>>>>>
>>>>> I tried this with s/EACCESS/EACCES/.
>>>>>
>>>>> The test case in this patch is not fixed, but strace does not freeze,
>>>>> at least with my setup where it did freeze repeatable.
>>>>
>>>> Thanks, That is what I was aiming at.
>>>>
>>>> So we have one method we can pursue to fix this in practice.
>>>>
>>>>> That is
>>>>> obviously because it bypasses the cred_guard_mutex.  But all other
>>>>> process that access this file still freeze, and cannot be
>>>>> interrupted except with kill -9.
>>>>>
>>>>> However that smells like a denial of service, that this
>>>>> simple test case which can be executed by guest, creates a /proc/$pid/mem
>>>>> that freezes any process, even root, when it looks at it.
>>>>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
>>>>
>>>> Yes.  Your the test case in your patch a variant of the original
>>>> problem.
>>>>
>>>>
>>>> I have been staring at this trying to understand the fundamentals of the
>>>> original deeper problem.
>>>>
>>>> The current scope of cred_guard_mutex in exec is because being ptraced
>>>> causes suid exec to act differently.  So we need to know early if we are
>>>> ptraced.
>>>>
>>>
>>> It has a second use, that it prevents two threads entering execve,
>>> which would probably result in disaster.
>>
>> Exec can fail with an error code up until de_thread.  de_thread causes
>> exec to fail with the error code -EAGAIN for the second thread to get
>> into de_thread.
>>
>> So no.  The cred_guard_mutex is not needed for that case at all.
>>
>>>> If that case did not exist we could reduce the scope of the
>>>> cred_guard_mutex in exec to where your patch puts the cred_change_mutex.
>>>>
>>>> I am starting to think reworking how we deal with ptrace and exec is the
>>>> way to solve this problem.
>>
>>
>> I am 99% convinced that the fix is to move cred_guard_mutex down.
> 
> "move cred_guard_mutex down" as in "take it once we've already set up
> the new process, past the point of no return"?
> 
>> Then right after we take cred_guard_mutex do:
>>         if (ptraced) {
>>                 use_original_creds();
>>         }
>>
>> And call it a day.
>>
>> The details suck but I am 99% certain that would solve everyones
>> problems, and not be too bad to audit either.
> 
> Ah, hmm, that sounds like it'll work fine at least when no LSMs are involved.
> 
> SELinux normally doesn't do the execution-degrading thing, it just
> blocks the execution completely - see their selinux_bprm_set_creds()
> hook. So I think they'd still need to set some state on the task that
> says "we're currently in the middle of an execution where the target
> task will run in context X", and then check against that in the
> ptrace_may_access hook. Or I suppose they could just kill the task
> near the end of execve, although that'd be kinda ugly.
> 

We have current->in_execve for that, right?
I think when the cred_guard_mutex is taken only in the critical section,
then PTRACE_ATTACH could take the guard_mutex, and look at current->in_execve,
and just return -EAGAIN in that case, right, everybody happy :)


Bernd.
Bernd Edlinger March 2, 2020, 5:13 p.m. UTC | #10
On 3/2/20 5:17 PM, Eric W. Biederman wrote:
> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> 
>> On 3/2/20 4:57 PM, Eric W. Biederman wrote:
>>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>>
>>>>
>>>> I tried this with s/EACCESS/EACCES/.
>>>>
>>>> The test case in this patch is not fixed, but strace does not freeze,
>>>> at least with my setup where it did freeze repeatable.
>>>
>>> Thanks, That is what I was aiming at.
>>>
>>> So we have one method we can pursue to fix this in practice.
>>>
>>>> That is
>>>> obviously because it bypasses the cred_guard_mutex.  But all other
>>>> process that access this file still freeze, and cannot be
>>>> interrupted except with kill -9.
>>>>
>>>> However that smells like a denial of service, that this
>>>> simple test case which can be executed by guest, creates a /proc/$pid/mem
>>>> that freezes any process, even root, when it looks at it.
>>>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
>>>
>>> Yes.  Your the test case in your patch a variant of the original
>>> problem.
>>>
>>>
>>> I have been staring at this trying to understand the fundamentals of the
>>> original deeper problem.
>>>
>>> The current scope of cred_guard_mutex in exec is because being ptraced
>>> causes suid exec to act differently.  So we need to know early if we are
>>> ptraced.
>>>
>>
>> It has a second use, that it prevents two threads entering execve,
>> which would probably result in disaster.
> 
> Exec can fail with an error code up until de_thread.  de_thread causes
> exec to fail with the error code -EAGAIN for the second thread to get
> into de_thread.
> 
> So no.  The cred_guard_mutex is not needed for that case at all.
> 

Okay, but that will reset current->in_execve, right?

>>> If that case did not exist we could reduce the scope of the
>>> cred_guard_mutex in exec to where your patch puts the cred_change_mutex.
>>>
>>> I am starting to think reworking how we deal with ptrace and exec is the
>>> way to solve this problem.
> 
> 
> I am 99% convinced that the fix is to move cred_guard_mutex down.
> 
> Then right after we take cred_guard_mutex do:
> 	if (ptraced) {
> 		use_original_creds();
> 	}
> 
> And call it a day.
> 
> The details suck but I am 99% certain that would solve everyones
> problems, and not be too bad to audit either.
> 
> Eric
>
Jann Horn March 2, 2020, 5:37 p.m. UTC | #11
On Mon, Mar 2, 2020 at 6:01 PM Bernd Edlinger <bernd.edlinger@hotmail.de> wrote:
> On 3/2/20 5:43 PM, Jann Horn wrote:
> > On Mon, Mar 2, 2020 at 5:19 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> >>
> >>> On 3/2/20 4:57 PM, Eric W. Biederman wrote:
> >>>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> >>>>
> >>>>>
> >>>>> I tried this with s/EACCESS/EACCES/.
> >>>>>
> >>>>> The test case in this patch is not fixed, but strace does not freeze,
> >>>>> at least with my setup where it did freeze repeatable.
> >>>>
> >>>> Thanks, That is what I was aiming at.
> >>>>
> >>>> So we have one method we can pursue to fix this in practice.
> >>>>
> >>>>> That is
> >>>>> obviously because it bypasses the cred_guard_mutex.  But all other
> >>>>> process that access this file still freeze, and cannot be
> >>>>> interrupted except with kill -9.
> >>>>>
> >>>>> However that smells like a denial of service, that this
> >>>>> simple test case which can be executed by guest, creates a /proc/$pid/mem
> >>>>> that freezes any process, even root, when it looks at it.
> >>>>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
> >>>>
> >>>> Yes.  Your the test case in your patch a variant of the original
> >>>> problem.
> >>>>
> >>>>
> >>>> I have been staring at this trying to understand the fundamentals of the
> >>>> original deeper problem.
> >>>>
> >>>> The current scope of cred_guard_mutex in exec is because being ptraced
> >>>> causes suid exec to act differently.  So we need to know early if we are
> >>>> ptraced.
> >>>>
> >>>
> >>> It has a second use, that it prevents two threads entering execve,
> >>> which would probably result in disaster.
> >>
> >> Exec can fail with an error code up until de_thread.  de_thread causes
> >> exec to fail with the error code -EAGAIN for the second thread to get
> >> into de_thread.
> >>
> >> So no.  The cred_guard_mutex is not needed for that case at all.
> >>
> >>>> If that case did not exist we could reduce the scope of the
> >>>> cred_guard_mutex in exec to where your patch puts the cred_change_mutex.
> >>>>
> >>>> I am starting to think reworking how we deal with ptrace and exec is the
> >>>> way to solve this problem.
> >>
> >>
> >> I am 99% convinced that the fix is to move cred_guard_mutex down.
> >
> > "move cred_guard_mutex down" as in "take it once we've already set up
> > the new process, past the point of no return"?
> >
> >> Then right after we take cred_guard_mutex do:
> >>         if (ptraced) {
> >>                 use_original_creds();
> >>         }
> >>
> >> And call it a day.
> >>
> >> The details suck but I am 99% certain that would solve everyones
> >> problems, and not be too bad to audit either.
> >
> > Ah, hmm, that sounds like it'll work fine at least when no LSMs are involved.
> >
> > SELinux normally doesn't do the execution-degrading thing, it just
> > blocks the execution completely - see their selinux_bprm_set_creds()
> > hook. So I think they'd still need to set some state on the task that
> > says "we're currently in the middle of an execution where the target
> > task will run in context X", and then check against that in the
> > ptrace_may_access hook. Or I suppose they could just kill the task
> > near the end of execve, although that'd be kinda ugly.
> >
>
> We have current->in_execve for that, right?
> I think when the cred_guard_mutex is taken only in the critical section,
> then PTRACE_ATTACH could take the guard_mutex, and look at current->in_execve,
> and just return -EAGAIN in that case, right, everybody happy :)

It's probably going to mean that things like strace will just randomly
fail to attach to processes if they happen to be in the middle of
execve... but I guess that works?
Christian Brauner March 2, 2020, 5:42 p.m. UTC | #12
<linux-doc@vger.kernel.org>,"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>,"linux-mm@kvack.org" <linux-mm@kvack.org>,"stable@vger.kernel.org" <stable@vger.kernel.org>,linux-security-module <linux-security-module@vger.kernel.org>
From: Christian Brauner <christian.brauner@ubuntu.com>
Message-ID: <9C3BF644-0F82-48C9-9116-8554204FB57D@ubuntu.com>

On March 2, 2020 6:37:27 PM GMT+01:00, Jann Horn <jannh@google.com> wrote:
>On Mon, Mar 2, 2020 at 6:01 PM Bernd Edlinger
><bernd.edlinger@hotmail.de> wrote:
>> On 3/2/20 5:43 PM, Jann Horn wrote:
>> > On Mon, Mar 2, 2020 at 5:19 PM Eric W. Biederman
><ebiederm@xmission.com> wrote:
>> >>
>> >> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>> >>
>> >>> On 3/2/20 4:57 PM, Eric W. Biederman wrote:
>> >>>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>> >>>>
>> >>>>>
>> >>>>> I tried this with s/EACCESS/EACCES/.
>> >>>>>
>> >>>>> The test case in this patch is not fixed, but strace does not
>freeze,
>> >>>>> at least with my setup where it did freeze repeatable.
>> >>>>
>> >>>> Thanks, That is what I was aiming at.
>> >>>>
>> >>>> So we have one method we can pursue to fix this in practice.
>> >>>>
>> >>>>> That is
>> >>>>> obviously because it bypasses the cred_guard_mutex.  But all
>other
>> >>>>> process that access this file still freeze, and cannot be
>> >>>>> interrupted except with kill -9.
>> >>>>>
>> >>>>> However that smells like a denial of service, that this
>> >>>>> simple test case which can be executed by guest, creates a
>/proc/$pid/mem
>> >>>>> that freezes any process, even root, when it looks at it.
>> >>>>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
>> >>>>
>> >>>> Yes.  Your the test case in your patch a variant of the original
>> >>>> problem.
>> >>>>
>> >>>>
>> >>>> I have been staring at this trying to understand the
>fundamentals of the
>> >>>> original deeper problem.
>> >>>>
>> >>>> The current scope of cred_guard_mutex in exec is because being
>ptraced
>> >>>> causes suid exec to act differently.  So we need to know early
>if we are
>> >>>> ptraced.
>> >>>>
>> >>>
>> >>> It has a second use, that it prevents two threads entering
>execve,
>> >>> which would probably result in disaster.
>> >>
>> >> Exec can fail with an error code up until de_thread.  de_thread
>causes
>> >> exec to fail with the error code -EAGAIN for the second thread to
>get
>> >> into de_thread.
>> >>
>> >> So no.  The cred_guard_mutex is not needed for that case at all.
>> >>
>> >>>> If that case did not exist we could reduce the scope of the
>> >>>> cred_guard_mutex in exec to where your patch puts the
>cred_change_mutex.
>> >>>>
>> >>>> I am starting to think reworking how we deal with ptrace and
>exec is the
>> >>>> way to solve this problem.
>> >>
>> >>
>> >> I am 99% convinced that the fix is to move cred_guard_mutex down.
>> >
>> > "move cred_guard_mutex down" as in "take it once we've already set
>up
>> > the new process, past the point of no return"?
>> >
>> >> Then right after we take cred_guard_mutex do:
>> >>         if (ptraced) {
>> >>                 use_original_creds();
>> >>         }
>> >>
>> >> And call it a day.
>> >>
>> >> The details suck but I am 99% certain that would solve everyones
>> >> problems, and not be too bad to audit either.
>> >
>> > Ah, hmm, that sounds like it'll work fine at least when no LSMs are
>involved.
>> >
>> > SELinux normally doesn't do the execution-degrading thing, it just
>> > blocks the execution completely - see their
>selinux_bprm_set_creds()
>> > hook. So I think they'd still need to set some state on the task
>that
>> > says "we're currently in the middle of an execution where the
>target
>> > task will run in context X", and then check against that in the
>> > ptrace_may_access hook. Or I suppose they could just kill the task
>> > near the end of execve, although that'd be kinda ugly.
>> >
>>
>> We have current->in_execve for that, right?
>> I think when the cred_guard_mutex is taken only in the critical
>section,
>> then PTRACE_ATTACH could take the guard_mutex, and look at
>current->in_execve,
>> and just return -EAGAIN in that case, right, everybody happy :)
>
>It's probably going to mean that things like strace will just randomly
>fail to attach to processes if they happen to be in the middle of
>execve... but I guess that works?

That sounds like an acceptable outcome.
We can at least risk it and if we regress
revert or come up with the more complex
solution suggested in another mail here?
Jann Horn March 2, 2020, 6:08 p.m. UTC | #13
On Mon, Mar 2, 2020 at 6:43 PM <christian@brauner.io> wrote:
> On March 2, 2020 6:37:27 PM GMT+01:00, Jann Horn <jannh@google.com> wrote:
> >On Mon, Mar 2, 2020 at 6:01 PM Bernd Edlinger
> ><bernd.edlinger@hotmail.de> wrote:
> >> On 3/2/20 5:43 PM, Jann Horn wrote:
> >> > On Mon, Mar 2, 2020 at 5:19 PM Eric W. Biederman
> ><ebiederm@xmission.com> wrote:
[...]
> >> >> I am 99% convinced that the fix is to move cred_guard_mutex down.
> >> >
> >> > "move cred_guard_mutex down" as in "take it once we've already set
> >up
> >> > the new process, past the point of no return"?
> >> >
> >> >> Then right after we take cred_guard_mutex do:
> >> >>         if (ptraced) {
> >> >>                 use_original_creds();
> >> >>         }
> >> >>
> >> >> And call it a day.
> >> >>
> >> >> The details suck but I am 99% certain that would solve everyones
> >> >> problems, and not be too bad to audit either.
> >> >
> >> > Ah, hmm, that sounds like it'll work fine at least when no LSMs are
> >involved.
> >> >
> >> > SELinux normally doesn't do the execution-degrading thing, it just
> >> > blocks the execution completely - see their
> >selinux_bprm_set_creds()
> >> > hook. So I think they'd still need to set some state on the task
> >that
> >> > says "we're currently in the middle of an execution where the
> >target
> >> > task will run in context X", and then check against that in the
> >> > ptrace_may_access hook. Or I suppose they could just kill the task
> >> > near the end of execve, although that'd be kinda ugly.
> >> >
> >>
> >> We have current->in_execve for that, right?
> >> I think when the cred_guard_mutex is taken only in the critical
> >section,
> >> then PTRACE_ATTACH could take the guard_mutex, and look at
> >current->in_execve,
> >> and just return -EAGAIN in that case, right, everybody happy :)
> >
> >It's probably going to mean that things like strace will just randomly
> >fail to attach to processes if they happen to be in the middle of
> >execve... but I guess that works?
>
> That sounds like an acceptable outcome.
> We can at least risk it and if we regress
> revert or come up with the more complex
> solution suggested in another mail here?

Yeah, sounds reasonable, I guess.
Eric W. Biederman March 2, 2020, 9:49 p.m. UTC | #14
Bernd Edlinger <bernd.edlinger@hotmail.de> writes:

> On 3/2/20 5:17 PM, Eric W. Biederman wrote:
>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>> 
>>> On 3/2/20 4:57 PM, Eric W. Biederman wrote:
>>>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>>>
>>>>>
>>>>> I tried this with s/EACCESS/EACCES/.
>>>>>
>>>>> The test case in this patch is not fixed, but strace does not freeze,
>>>>> at least with my setup where it did freeze repeatable.
>>>>
>>>> Thanks, That is what I was aiming at.
>>>>
>>>> So we have one method we can pursue to fix this in practice.
>>>>
>>>>> That is
>>>>> obviously because it bypasses the cred_guard_mutex.  But all other
>>>>> process that access this file still freeze, and cannot be
>>>>> interrupted except with kill -9.
>>>>>
>>>>> However that smells like a denial of service, that this
>>>>> simple test case which can be executed by guest, creates a /proc/$pid/mem
>>>>> that freezes any process, even root, when it looks at it.
>>>>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
>>>>
>>>> Yes.  Your the test case in your patch a variant of the original
>>>> problem.
>>>>
>>>>
>>>> I have been staring at this trying to understand the fundamentals of the
>>>> original deeper problem.
>>>>
>>>> The current scope of cred_guard_mutex in exec is because being ptraced
>>>> causes suid exec to act differently.  So we need to know early if we are
>>>> ptraced.
>>>>
>>>
>>> It has a second use, that it prevents two threads entering execve,
>>> which would probably result in disaster.
>> 
>> Exec can fail with an error code up until de_thread.  de_thread causes
>> exec to fail with the error code -EAGAIN for the second thread to get
>> into de_thread.
>> 
>> So no.  The cred_guard_mutex is not needed for that case at all.
>> 
>
> Okay, but that will reset current->in_execve, right?

Absolutely.

The error handling kicks in and exec_binprm fails with a negative
return code.  Then __do_excve_file cleans up and clears
current->in_execve.

Eric
Bernd Edlinger March 2, 2020, 10 p.m. UTC | #15
On 3/2/20 10:49 PM, Eric W. Biederman wrote:
> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> 
>> On 3/2/20 5:17 PM, Eric W. Biederman wrote:
>>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>>
>>>> On 3/2/20 4:57 PM, Eric W. Biederman wrote:
>>>>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>>>>
>>>>>>
>>>>>> I tried this with s/EACCESS/EACCES/.
>>>>>>
>>>>>> The test case in this patch is not fixed, but strace does not freeze,
>>>>>> at least with my setup where it did freeze repeatable.
>>>>>
>>>>> Thanks, That is what I was aiming at.
>>>>>
>>>>> So we have one method we can pursue to fix this in practice.
>>>>>
>>>>>> That is
>>>>>> obviously because it bypasses the cred_guard_mutex.  But all other
>>>>>> process that access this file still freeze, and cannot be
>>>>>> interrupted except with kill -9.
>>>>>>
>>>>>> However that smells like a denial of service, that this
>>>>>> simple test case which can be executed by guest, creates a /proc/$pid/mem
>>>>>> that freezes any process, even root, when it looks at it.
>>>>>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
>>>>>
>>>>> Yes.  Your the test case in your patch a variant of the original
>>>>> problem.
>>>>>
>>>>>
>>>>> I have been staring at this trying to understand the fundamentals of the
>>>>> original deeper problem.
>>>>>
>>>>> The current scope of cred_guard_mutex in exec is because being ptraced
>>>>> causes suid exec to act differently.  So we need to know early if we are
>>>>> ptraced.
>>>>>
>>>>
>>>> It has a second use, that it prevents two threads entering execve,
>>>> which would probably result in disaster.
>>>
>>> Exec can fail with an error code up until de_thread.  de_thread causes
>>> exec to fail with the error code -EAGAIN for the second thread to get
>>> into de_thread.
>>>
>>> So no.  The cred_guard_mutex is not needed for that case at all.
>>>
>>
>> Okay, but that will reset current->in_execve, right?
> 
> Absolutely.
> 
> The error handling kicks in and exec_binprm fails with a negative
> return code.  Then __do_excve_file cleans up and clears
> current->in_execve.
> 

Yes of course.  I was under the wrong impression that that value is
a kind of global, but it is a thread local.

So I think I need a new boolean see v3 of the patch, and soon v4 (with
just one comment fixed).

I'm currently executing the strace v5.5 testsuite, and every test
is passed so far.  I'll also look at gdb testsuite, before I send the
next version.


Thanks
Bernd.
Eric W. Biederman March 3, 2020, 8:13 p.m. UTC | #16
Jann Horn <jannh@google.com> writes:

> On Mon, Mar 2, 2020 at 5:19 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>
>> > On 3/2/20 4:57 PM, Eric W. Biederman wrote:
>> >> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>> >>
>> >>>
>> >>> I tried this with s/EACCESS/EACCES/.
>> >>>
>> >>> The test case in this patch is not fixed, but strace does not freeze,
>> >>> at least with my setup where it did freeze repeatable.
>> >>
>> >> Thanks, That is what I was aiming at.
>> >>
>> >> So we have one method we can pursue to fix this in practice.
>> >>
>> >>> That is
>> >>> obviously because it bypasses the cred_guard_mutex.  But all other
>> >>> process that access this file still freeze, and cannot be
>> >>> interrupted except with kill -9.
>> >>>
>> >>> However that smells like a denial of service, that this
>> >>> simple test case which can be executed by guest, creates a /proc/$pid/mem
>> >>> that freezes any process, even root, when it looks at it.
>> >>> I mean: "ln -s README /proc/$pid/mem" would be a nice bomb.
>> >>
>> >> Yes.  Your the test case in your patch a variant of the original
>> >> problem.
>> >>
>> >>
>> >> I have been staring at this trying to understand the fundamentals of the
>> >> original deeper problem.
>> >>
>> >> The current scope of cred_guard_mutex in exec is because being ptraced
>> >> causes suid exec to act differently.  So we need to know early if we are
>> >> ptraced.
>> >>
>> >
>> > It has a second use, that it prevents two threads entering execve,
>> > which would probably result in disaster.
>>
>> Exec can fail with an error code up until de_thread.  de_thread causes
>> exec to fail with the error code -EAGAIN for the second thread to get
>> into de_thread.
>>
>> So no.  The cred_guard_mutex is not needed for that case at all.
>>
>> >> If that case did not exist we could reduce the scope of the
>> >> cred_guard_mutex in exec to where your patch puts the cred_change_mutex.
>> >>
>> >> I am starting to think reworking how we deal with ptrace and exec is the
>> >> way to solve this problem.
>>
>>
>> I am 99% convinced that the fix is to move cred_guard_mutex down.
>
> "move cred_guard_mutex down" as in "take it once we've already set up
> the new process, past the point of no return"?
>
>> Then right after we take cred_guard_mutex do:
>>         if (ptraced) {
>>                 use_original_creds();
>>         }
>>
>> And call it a day.
>>
>> The details suck but I am 99% certain that would solve everyones
>> problems, and not be too bad to audit either.
>
> Ah, hmm, that sounds like it'll work fine at least when no LSMs are involved.
>
> SELinux normally doesn't do the execution-degrading thing, it just
> blocks the execution completely - see their selinux_bprm_set_creds()
> hook. So I think they'd still need to set some state on the task that
> says "we're currently in the middle of an execution where the target
> task will run in context X", and then check against that in the
> ptrace_may_access hook. Or I suppose they could just kill the task
> near the end of execve, although that'd be kinda ugly.


So I looked and it the selinux thing is complicated.  It is not
as simple as selinux does not do the execution degrading thing.

In the normal case selinux retains the same sid for no_new_privs
and for no_suid.   I don't know when selinux policy says to change sids
on exec.

My sense right now is that selinux is completely inconsistent in this
area and it might not be a bit deal at all to write a patch to make
selinux match the rest of exec and unconditionally do the execution
degrading.

Certainly attempting it and having the conversation is probably the best
way to illuminate this dark corner.

Eric
diff mbox series

Patch

diff --git a/Documentation/security/credentials.rst b/Documentation/security/credentials.rst
index 282e79f..c98e0a8 100644
--- a/Documentation/security/credentials.rst
+++ b/Documentation/security/credentials.rst
@@ -437,9 +437,13 @@  new set of credentials by calling::
 
 	struct cred *prepare_creds(void);
 
-this locks current->cred_replace_mutex and then allocates and constructs a
-duplicate of the current process's credentials, returning with the mutex still
-held if successful.  It returns NULL if not successful (out of memory).
+this allocates and constructs a duplicate of the current process's credentials.
+It returns NULL if not successful (out of memory).
+
+If called from __do_execve_file, the mutex current->signal->cred_guard_mutex
+is acquired before this function gets called, and the mutex
+current->signal->cred_change_mutex is acquired later, while the credentials
+and the process mmap are actually changed.
 
 The mutex prevents ``ptrace()`` from altering the ptrace state of a process
 while security checks on credentials construction and changing is taking place
@@ -466,9 +470,8 @@  by calling::
 
 This will alter various aspects of the credentials and the process, giving the
 LSM a chance to do likewise, then it will use ``rcu_assign_pointer()`` to
-actually commit the new credentials to ``current->cred``, it will release
-``current->cred_replace_mutex`` to allow ``ptrace()`` to take place, and it
-will notify the scheduler and others of the changes.
+actually commit the new credentials to ``current->cred``, and it will notify
+the scheduler and others of the changes.
 
 This function is guaranteed to return 0, so that it can be tail-called at the
 end of such functions as ``sys_setresuid()``.
@@ -486,8 +489,7 @@  invoked::
 
 	void abort_creds(struct cred *new);
 
-This releases the lock on ``current->cred_replace_mutex`` that
-``prepare_creds()`` got and then releases the new credentials.
+This releases the new credentials.
 
 
 A typical credentials alteration function would look something like this::
diff --git a/fs/exec.c b/fs/exec.c
index 74d88da..a6884e4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1266,6 +1266,12 @@  int flush_old_exec(struct linux_binprm * bprm)
 	if (retval)
 		goto out;
 
+	retval = mutex_lock_killable(&current->signal->cred_change_mutex);
+	if (retval)
+		goto out;
+
+	bprm->called_flush_old_exec = 1;
+
 	/*
 	 * Must be called _before_ exec_mmap() as bprm->mm is
 	 * not visibile until then. This also enables the update
@@ -1420,6 +1426,8 @@  static void free_bprm(struct linux_binprm *bprm)
 {
 	free_arg_pages(bprm);
 	if (bprm->cred) {
+		if (bprm->called_flush_old_exec)
+			mutex_unlock(&current->signal->cred_change_mutex);
 		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
@@ -1469,6 +1477,7 @@  void install_exec_creds(struct linux_binprm *bprm)
 	 * credentials; any time after this it may be unlocked.
 	 */
 	security_bprm_committed_creds(bprm);
+	mutex_unlock(&current->signal->cred_change_mutex);
 	mutex_unlock(&current->signal->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index b40fc63..2e1318b 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -44,7 +44,11 @@  struct linux_binprm {
 		 * exec has happened. Used to sanitize execution environment
 		 * and to set AT_SECURE auxv for glibc.
 		 */
-		secureexec:1;
+		secureexec:1,
+		/*
+		 * Set by flush_old_exec, when the cred_change_mutex is taken.
+		 */
+		called_flush_old_exec:1;
 #ifdef __alpha__
 	unsigned int taso:1;
 #endif
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 8805025..37eeabe 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -225,6 +225,7 @@  struct signal_struct {
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
 					 * (notably. ptrace) */
+	struct mutex cred_change_mutex; /* guard against credentials change */
 } __randomize_layout;
 
 /*
diff --git a/init/init_task.c b/init/init_task.c
index 9e5cbe5..6cd9a0f 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -26,6 +26,7 @@ 
 	.multiprocess	= HLIST_HEAD_INIT,
 	.rlim		= INIT_RLIMITS,
 	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
+	.cred_change_mutex = __MUTEX_INITIALIZER(init_signals.cred_change_mutex),
 #ifdef CONFIG_POSIX_TIMERS
 	.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
 	.cputimer	= {
diff --git a/kernel/cred.c b/kernel/cred.c
index 809a985..e4c78de 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -676,7 +676,7 @@  void __init cred_init(void)
  *
  * Returns the new credentials or NULL if out of memory.
  *
- * Does not take, and does not return holding current->cred_replace_mutex.
+ * Does not take, and does not return holding ->cred_guard_mutex.
  */
 struct cred *prepare_kernel_cred(struct task_struct *daemon)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 0808095..0395154 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1224,7 +1224,7 @@  struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
 	struct mm_struct *mm;
 	int err;
 
-	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
+	err =  mutex_lock_killable(&task->signal->cred_change_mutex);
 	if (err)
 		return ERR_PTR(err);
 
@@ -1234,7 +1234,7 @@  struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
 		mmput(mm);
 		mm = ERR_PTR(-EACCES);
 	}
-	mutex_unlock(&task->signal->cred_guard_mutex);
+	mutex_unlock(&task->signal->cred_change_mutex);
 
 	return mm;
 }
@@ -1594,6 +1594,7 @@  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
 	mutex_init(&sig->cred_guard_mutex);
+	mutex_init(&sig->cred_change_mutex);
 
 	return 0;
 }
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 357aa7b..b3e6eb5 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -204,7 +204,7 @@  static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
 	if (!mm || IS_ERR(mm)) {
 		rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
 		/*
-		 * Explicitly map EACCES to EPERM as EPERM is a more a
+		 * Explicitly map EACCES to EPERM as EPERM is a more
 		 * appropriate error code for process_vw_readv/writev
 		 */
 		if (rc == -EACCES)
diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile
index c0b7f89..2f1f532 100644
--- a/tools/testing/selftests/ptrace/Makefile
+++ b/tools/testing/selftests/ptrace/Makefile
@@ -1,6 +1,6 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -iquote../../../../include/uapi -Wall
+CFLAGS += -std=c99 -pthread -iquote../../../../include/uapi -Wall
 
-TEST_GEN_PROGS := get_syscall_info peeksiginfo
+TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess
 
 include ../lib.mk
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
new file mode 100644
index 0000000..ef08c9f
--- /dev/null
+++ b/tools/testing/selftests/ptrace/vmaccess.c
@@ -0,0 +1,46 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2020 Bernd Edlinger <bernd.edlinger@hotmail.de>
+ * All rights reserved.
+ *
+ * Check whether /proc/$pid/mem can be accessed without causing deadlocks
+ * when de_thread is blocked with ->cred_guard_mutex held.
+ */
+
+#include "../kselftest_harness.h"
+#include <stdio.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+
+static void *thread(void *arg)
+{
+	ptrace(PTRACE_TRACEME, 0, 0, 0);
+	return NULL;
+}
+
+TEST(vmaccess)
+{
+	int f, pid = fork();
+	char mm[64];
+
+	if (!pid) {
+		pthread_t pt;
+		pthread_create(&pt, NULL, thread, NULL);
+		pthread_join(pt, NULL);
+		execlp("true", "true", NULL);
+	}
+
+	sleep(1);
+	sprintf(mm, "/proc/%d/mem", pid);
+	f = open(mm, O_RDONLY);
+	ASSERT_LE(0, f)
+		close(f);
+	/* this is not fixed! ptrace(PTRACE_ATTACH, pid, 0,0); */
+	f = kill(pid, SIGCONT);
+	ASSERT_EQ(0, f);
+}
+
+TEST_HARNESS_MAIN