@@ -747,6 +747,14 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
+ /*
+ * When we exit and ->mm (the reference pinning ->mm's address space)
+ * goes away, we stash a reference to the mm_struct itself (counted via
+ * exit_mm->mm_count) in this member.
+ * This allows us to continue using the mm_struct for security checks
+ * and such even after the task has started exiting.
+ */
+ struct mm_struct *exit_mm;
/* Per-thread vma caching: */
struct vmacache vmacache;
@@ -476,6 +476,8 @@ static void exit_mm(void)
/* more a memory barrier than a real lock */
task_lock(current);
current->mm = NULL;
+ mmgrab(mm); /* for current->exit_mm */
+ current->exit_mm = mm;
mmap_read_unlock(mm);
enter_lazy_tlb(mm, current);
task_unlock(current);
@@ -438,32 +438,6 @@ void put_task_stack(struct task_struct *tsk)
}
#endif
-void free_task(struct task_struct *tsk)
-{
- scs_release(tsk);
-
-#ifndef CONFIG_THREAD_INFO_IN_TASK
- /*
- * The task is finally done with both the stack and thread_info,
- * so free both.
- */
- release_task_stack(tsk);
-#else
- /*
- * If the task had a separate stack allocation, it should be gone
- * by now.
- */
- WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
-#endif
- rt_mutex_debug_task_free(tsk);
- ftrace_graph_exit_task(tsk);
- arch_release_task_struct(tsk);
- if (tsk->flags & PF_KTHREAD)
- free_kthread_struct(tsk);
- free_task_struct(tsk);
-}
-EXPORT_SYMBOL(free_task);
-
#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
@@ -722,6 +696,34 @@ static inline void put_signal_struct(struct signal_struct *sig)
free_signal_struct(sig);
}
+void free_task(struct task_struct *tsk)
+{
+ scs_release(tsk);
+
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+ /*
+ * The task is finally done with both the stack and thread_info,
+ * so free both.
+ */
+ release_task_stack(tsk);
+#else
+ /*
+ * If the task had a separate stack allocation, it should be gone
+ * by now.
+ */
+ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
+#endif
+ rt_mutex_debug_task_free(tsk);
+ ftrace_graph_exit_task(tsk);
+ arch_release_task_struct(tsk);
+ if (tsk->flags & PF_KTHREAD)
+ free_kthread_struct(tsk);
+ if (tsk->exit_mm)
+ mmdrop_async(tsk->exit_mm);
+ free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
@@ -342,7 +342,17 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
* Pairs with a write barrier in commit_creds().
*/
smp_rmb();
+ /*
+ * Look up the target task's mm_struct. If it fails because the task is
+ * exiting and has gone through exit_mm(), we can instead use ->exit_mm
+ * as long as we only use members that are preserved by an mmgrab()
+ * reference.
+ * The only case in which both ->mm and ->exit_mm can be NULL should be
+ * kernel threads.
+ */
mm = task->mm;
+ if (!mm)
+ mm = task->exit_mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
!ptrace_has_cap(cred, mm->user_ns, mode)))
__ptrace_may_access() checks can happen on target tasks that are in the middle of do_exit(), past exit_mm(). At that point, the ->mm pointer has been NULLed out, and the mm_struct has been mmput(). Unfortunately, the mm_struct contains the dumpability and the user_ns in which the task last went through execve(), and we need those for __ptrace_may_access(). Currently, that problem is handled by failing open: If the ->mm is gone, we assume that the task was dumpable. In some edge cases, this could potentially expose access to things like /proc/$pid/fd/$fd of originally non-dumpable processes. (exit_files() comes after exit_mm(), so the file descriptor table is still there when we've gone through exit_mm().) One way to fix this would be to move mm->user_ns and the dumpability state over into the task_struct. However, that gets quite ugly if we want to preserve existing semantics because e.g. PR_SET_DUMPABLE and commit_creds() would then have to scan through all tasks sharing the mm_struct and keep them in sync manually - that'd be a bit error-prone and overcomplicated. (Moving these things into the signal_struct is not an option because that is kept across executions, and pre-execve co-threads will share the signal_struct that is also used by the task that has gone through execve().) I believe that this patch may be the least bad option to fix this - keep the mm_struct (but not process memory) around with an mmgrab() reference from exit_mm() until the task goes away completely. Note that this moves free_task() down in order to make mmdrop_async() available without a forward declaration. Cc: stable@vger.kernel.org Fixes: bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks") Signed-off-by: Jann Horn <jannh@google.com> --- include/linux/sched.h | 8 +++++++ kernel/exit.c | 2 ++ kernel/fork.c | 54 ++++++++++++++++++++++--------------------- kernel/ptrace.c | 10 ++++++++ 4 files changed, 48 insertions(+), 26 deletions(-)