diff mbox series

[v4,19/36] arm64/gcs: Allocate a new GCS for threads with GCS enabled

Message ID 20230807-arm64-gcs-v4-19-68cfa37f9069@kernel.org (mailing list archive)
State Superseded
Headers show
Series arm64/gcs: Provide support for GCS in userspace | expand

Checks

Context Check Description
conchuod/tree_selection fail Failed to apply to next/pending-fixes, riscv/for-next or riscv/master

Commit Message

Mark Brown Aug. 7, 2023, 10 p.m. UTC
We do not currently have a mechanism to specify a new GCS for a new
thread, so when a thread is created which has GCS enabled we allocate
one for it.  Since there is no current API for specifying the size of
the GCS we follow the extensively discussed x86 implementation and
allocate min(RLIMIT_STACK, 4G).  Since the GCS only stores the call
stack and not any variables this should be more than sufficient for
most applications.

When allocating the stack we initialise GCSPR_EL0 to point to one entry
below the end of the region allocated.  This keeps the top entry of the
stack zero, so software walking the GCS can easily detect the end of the
region.

Signed-off-by: Mark Brown <broonie@kernel.org>
---
 arch/arm64/include/asm/gcs.h |  7 ++++++
 arch/arm64/kernel/process.c  | 30 ++++++++++++++++++++++++
 arch/arm64/mm/gcs.c          | 56 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)

Comments

Catalin Marinas Aug. 11, 2023, 4:26 p.m. UTC | #1
On Mon, Aug 07, 2023 at 11:00:24PM +0100, Mark Brown wrote:
> diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
> index b0a67efc522b..1e059c37088d 100644
> --- a/arch/arm64/mm/gcs.c
> +++ b/arch/arm64/mm/gcs.c
> @@ -8,6 +8,62 @@
>  #include <asm/cpufeature.h>
>  #include <asm/page.h>
>  
> +static unsigned long alloc_gcs(unsigned long addr, unsigned long size,
> +			       unsigned long token_offset, bool set_res_tok)
> +{
> +	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
> +	struct mm_struct *mm = current->mm;
> +	unsigned long mapped_addr, unused;
> +
> +	if (addr)
> +		flags |= MAP_FIXED_NOREPLACE;
> +
> +	mmap_write_lock(mm);
> +	mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
> +			      VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);

Why not PROT_WRITE as well? I guess I need to check the x86 patches
since the do_mmap() called here has a different prototype than what's in
mainline.

This gets confusing since currently the VM_* flags are derived from the
PROT_* flags passed to mmap(). But you skip the PROT_WRITE in favour of
adding VM_WRITE directly.

I haven't followed the x86 discussion but did we run out of PROT_* bits
for a PROT_SHADOW_STACK?

> +	mmap_write_unlock(mm);
> +
> +	return mapped_addr;
> +}
> +
> +static unsigned long gcs_size(unsigned long size)
> +{
> +	if (size)
> +		return PAGE_ALIGN(size);
> +
> +	/* Allocate RLIMIT_STACK with limits of PAGE_SIZE..4G */
> +	size = PAGE_ALIGN(min_t(unsigned long long,
> +				rlimit(RLIMIT_STACK), SZ_4G));
> +	return max(PAGE_SIZE, size);
> +}

I saw Szabolcs commenting on the default size as well. Maybe we should
go for RLIMIT_STACK/2 but let's see how the other sub-thread is going.

> +
> +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
> +				     unsigned long clone_flags, size_t size)
> +{
> +	unsigned long addr;
> +
> +	if (!system_supports_gcs())
> +		return 0;
> +
> +	if (!task_gcs_el0_enabled(tsk))
> +		return 0;
> +
> +	if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
> +		return 0;

Is it safe for CLONE_VFORK not to get a new shadow stack? A syscall for
exec could push something to the stack. I guess the GCS pointer in the
parent stays the same, so it wouldn't matter.

That said, I think this check should be somewhere higher up in the
caller of gcs_alloc_thread_stack(). The copy_thread_gcs() function
already does most of the above checks. Is the GCS allocation called from
elsewhere as well?
Mark Brown Aug. 18, 2023, 8:15 p.m. UTC | #2
On Fri, Aug 11, 2023 at 05:26:03PM +0100, Catalin Marinas wrote:
> On Mon, Aug 07, 2023 at 11:00:24PM +0100, Mark Brown wrote:

> > +	mmap_write_lock(mm);
> > +	mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
> > +			      VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);

> Why not PROT_WRITE as well? I guess I need to check the x86 patches
> since the do_mmap() called here has a different prototype than what's in
> mainline.

> This gets confusing since currently the VM_* flags are derived from the
> PROT_* flags passed to mmap(). But you skip the PROT_WRITE in favour of
> adding VM_WRITE directly.

I have to confess that I inherited this from the x86 code and never
thought too hard about it.  I've got a horrible feeling the reasoning is
simply the way in which x86 fits shadow stack into the page tables
without having a mechanism like permission indirection; those
constraints don't apply to us.

> I haven't followed the x86 discussion but did we run out of PROT_* bits
> for a PROT_SHADOW_STACK?

It's more that there are security concerns with having a PROT_ flag,
especially in conjunction with needing to provide a token for stack
pivot - we not only need to map pages for the GCS, we also need to write
a cap token into it so that we can pivot to the new stack.  If the GCS
can ever be written to by userspace via normal means then that's an
issue for the basic protection model that the feature is trying to
implement.  If we have the PROT_ flag but try to check for bad uses of
it, that makes everything messy and complicated, which is especially
non-ideal for a feature with a security focus.  Having a more packaged
system call is easier for everyone.

More detail in the x86 patch that's currently in -next:

   https://lore.kernel.org/all/20230319001535.23210-34-rick.p.edgecombe@intel.com/

> > +	/* Allocate RLIMIT_STACK with limits of PAGE_SIZE..4G */
> > +	size = PAGE_ALIGN(min_t(unsigned long long,
> > +				rlimit(RLIMIT_STACK), SZ_4G));
> > +	return max(PAGE_SIZE, size);
> > +}

> I saw Szabolcs commenting on the default size as well. Maybe we should
> go for RLIMIT_STACK/2 but let's see how the other sub-thread is going.

I've updated it.

> > +	if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
> > +		return 0;

> Is it safe for CLONE_VFORK not to get a new shadow stack? A syscall for
> exec could push something to the stack. I guess the GCS pointer in the
> parent stays the same, so it wouldn't matter.

Yes, pushing should be fine just as for the regular stack.

> That said, I think this check should be somewhere higher up in the
> caller of gcs_alloc_thread_stack(). The copy_thread_gcs() function
> already does most of the above checks. Is the GCS allocation called from
> elsewhere as well?

That's the only place.  I've moved the above check into copy_thread_gcs(),
you're right that the other checks are redundant as they're done in the
caller already.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
index 04594ef59dad..4371a2f99b4a 100644
--- a/arch/arm64/include/asm/gcs.h
+++ b/arch/arm64/include/asm/gcs.h
@@ -58,6 +58,8 @@  static inline bool task_gcs_el0_enabled(struct task_struct *task)
 void gcs_set_el0_mode(struct task_struct *task);
 void gcs_free(struct task_struct *task);
 void gcs_preserve_current_state(void);
+unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
+			    unsigned long clone_flags, size_t size);
 
 #else
 
@@ -69,6 +71,11 @@  static inline bool task_gcs_el0_enabled(struct task_struct *task)
 static inline void gcs_set_el0_mode(struct task_struct *task) { }
 static inline void gcs_free(struct task_struct *task) { }
 static inline void gcs_preserve_current_state(void) { }
+static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
+				     unsigned long clone_flags, size_t size)
+{
+	return -ENOTSUPP;
+}
 
 #endif
 
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index b8a42471aea3..1de6371ca2d8 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -284,9 +284,34 @@  static void flush_gcs(void)
 	}
 }
 
+static int copy_thread_gcs(struct task_struct *p, unsigned long clone_flags,
+			   size_t stack_size)
+{
+	unsigned long gcs;
+
+	if (!system_supports_gcs())
+		return 0;
+
+	if (!task_gcs_el0_enabled(p))
+		return 0;
+
+	p->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0);
+
+	gcs = gcs_alloc_thread_stack(p, clone_flags, stack_size);
+	if (IS_ERR_VALUE(gcs))
+		return PTR_ERR((void *)gcs);
+
+	return 0;
+}
+
 #else
 
 static void flush_gcs(void) { }
+static int copy_thread_gcs(struct task_struct *p, unsigned long clone_flags,
+			   size_t stack_size)
+{
+	return 0;
+}
 
 #endif
 
@@ -368,6 +393,7 @@  int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	unsigned long stack_start = args->stack;
 	unsigned long tls = args->tls;
 	struct pt_regs *childregs = task_pt_regs(p);
+	int ret;
 
 	memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
 
@@ -409,6 +435,10 @@  int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 			p->thread.uw.tp_value = tls;
 			p->thread.tpidr2_el0 = 0;
 		}
+
+		ret = copy_thread_gcs(p, clone_flags, args->stack_size);
+		if (ret != 0)
+			return ret;
 	} else {
 		/*
 		 * A kthread has no context to ERET to, so ensure any buggy
diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index b0a67efc522b..1e059c37088d 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -8,6 +8,62 @@ 
 #include <asm/cpufeature.h>
 #include <asm/page.h>
 
+static unsigned long alloc_gcs(unsigned long addr, unsigned long size,
+			       unsigned long token_offset, bool set_res_tok)
+{
+	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+	struct mm_struct *mm = current->mm;
+	unsigned long mapped_addr, unused;
+
+	if (addr)
+		flags |= MAP_FIXED_NOREPLACE;
+
+	mmap_write_lock(mm);
+	mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+			      VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
+	mmap_write_unlock(mm);
+
+	return mapped_addr;
+}
+
+static unsigned long gcs_size(unsigned long size)
+{
+	if (size)
+		return PAGE_ALIGN(size);
+
+	/* Allocate RLIMIT_STACK with limits of PAGE_SIZE..4G */
+	size = PAGE_ALIGN(min_t(unsigned long long,
+				rlimit(RLIMIT_STACK), SZ_4G));
+	return max(PAGE_SIZE, size);
+}
+
+unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
+				     unsigned long clone_flags, size_t size)
+{
+	unsigned long addr;
+
+	if (!system_supports_gcs())
+		return 0;
+
+	if (!task_gcs_el0_enabled(tsk))
+		return 0;
+
+	if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
+		return 0;
+
+	size = gcs_size(size);
+
+	addr = alloc_gcs(0, size, 0, 0);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
+	tsk->thread.gcs_base = addr;
+	tsk->thread.gcs_size = size;
+	tsk->thread.gcspr_el0 = addr + size - sizeof(u64);
+
+	return addr;
+}
+
 /*
  * Apply the GCS mode configured for the specified task to the
  * hardware.