[v2,1/2] scs: switch to vmapped shadow stacks

Message ID 20201124195940.27061-2-samitolvanen@google.com
State New, archived
Series scs: switch to vmapped shadow stacks

Commit Message

Sami Tolvanen Nov. 24, 2020, 7:59 p.m. UTC
The kernel currently uses kmem_cache to allocate shadow call stacks,
which means an overflow may not be immediately detected and can
potentially result in another task's shadow stack being overwritten.

This change switches SCS to use virtually mapped shadow stacks for
tasks, which increases shadow stack size to a full page and provides
more robust overflow detection, similarly to VMAP_STACK.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
---
 include/linux/scs.h | 12 ++++-----
 kernel/scs.c        | 66 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 61 insertions(+), 17 deletions(-)
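
For reference, the guard pages that come with a vmalloc'ed stack mean an
overflow past SCS_SIZE now faults immediately instead of silently running
into a neighbouring slab object, while the end-of-stack magic check still
catches corruption within the mapping. A minimal sketch of that check,
using the __scs_magic()/SCS_END_MAGIC/task_scs_end_corrupted() names
visible in the hunks below (the bodies here are illustrative, not copied
from this patch):

static inline unsigned long *__scs_magic(void *s)
{
	/* The last slot of the shadow stack holds the end-of-stack marker. */
	return (unsigned long *)(s + SCS_SIZE) - 1;
}

static inline bool task_scs_end_corrupted(struct task_struct *tsk)
{
	unsigned long *magic = __scs_magic(task_scs(tsk));

	/* A clobbered marker indicates the shadow stack was overrun. */
	return READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC;
}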

Comments

Kees Cook Nov. 24, 2020, 10:04 p.m. UTC | #1
On Tue, Nov 24, 2020 at 11:59:39AM -0800, Sami Tolvanen wrote:
> The kernel currently uses kmem_cache to allocate shadow call stacks,
> which means an overflow may not be immediately detected and can
> potentially result in another task's shadow stack being overwritten.
> 
> This change switches SCS to use virtually mapped shadow stacks for
> tasks, which increases shadow stack size to a full page and provides
> more robust overflow detection, similarly to VMAP_STACK.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>

Reviewed-by: Kees Cook <keescook@chromium.org>
Will Deacon Nov. 30, 2020, 11:44 a.m. UTC | #2
On Tue, Nov 24, 2020 at 11:59:39AM -0800, Sami Tolvanen wrote:
> The kernel currently uses kmem_cache to allocate shadow call stacks,
> which means an overflow may not be immediately detected and can
> potentially result in another task's shadow stack being overwritten.
> 
> This change switches SCS to use virtually mapped shadow stacks for
> tasks, which increases shadow stack size to a full page and provides
> more robust overflow detection, similarly to VMAP_STACK.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
> ---
>  include/linux/scs.h | 12 ++++-----
>  kernel/scs.c        | 66 +++++++++++++++++++++++++++++++++++++--------
>  2 files changed, 61 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/scs.h b/include/linux/scs.h
> index 6dec390cf154..2a506c2a16f4 100644
> --- a/include/linux/scs.h
> +++ b/include/linux/scs.h
> @@ -15,12 +15,8 @@
>  
>  #ifdef CONFIG_SHADOW_CALL_STACK
>  
> -/*
> - * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit
> - * architecture) provided ~40% safety margin on stack usage while keeping
> - * memory allocation overhead reasonable.
> - */
> -#define SCS_SIZE		SZ_1K
> +#define SCS_ORDER		0
> +#define SCS_SIZE		(PAGE_SIZE << SCS_ORDER)
>  #define GFP_SCS			(GFP_KERNEL | __GFP_ZERO)
>  
>  /* An illegal pointer value to mark the end of the shadow stack. */
> @@ -33,6 +29,8 @@
>  #define task_scs(tsk)		(task_thread_info(tsk)->scs_base)
>  #define task_scs_sp(tsk)	(task_thread_info(tsk)->scs_sp)
>  
> +void *scs_alloc(int node);
> +void scs_free(void *s);
>  void scs_init(void);
>  int scs_prepare(struct task_struct *tsk, int node);
>  void scs_release(struct task_struct *tsk);
> @@ -61,6 +59,8 @@ static inline bool task_scs_end_corrupted(struct task_struct *tsk)
>  
>  #else /* CONFIG_SHADOW_CALL_STACK */
>  
> +static inline void *scs_alloc(int node) { return NULL; }
> +static inline void scs_free(void *s) {}
>  static inline void scs_init(void) {}
>  static inline void scs_task_reset(struct task_struct *tsk) {}
>  static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; }
> diff --git a/kernel/scs.c b/kernel/scs.c
> index 4ff4a7ba0094..25b0dd5aa0e2 100644
> --- a/kernel/scs.c
> +++ b/kernel/scs.c
> @@ -5,50 +5,94 @@
>   * Copyright (C) 2019 Google LLC
>   */
>  
> +#include <linux/cpuhotplug.h>
>  #include <linux/kasan.h>
>  #include <linux/mm.h>
>  #include <linux/scs.h>
> -#include <linux/slab.h>
> +#include <linux/vmalloc.h>
>  #include <linux/vmstat.h>
>  
> -static struct kmem_cache *scs_cache;
> -
>  static void __scs_account(void *s, int account)
>  {
> -	struct page *scs_page = virt_to_page(s);
> +	struct page *scs_page = vmalloc_to_page(s);
>  
>  	mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
>  			    account * (SCS_SIZE / SZ_1K));
>  }
>  
> -static void *scs_alloc(int node)
> +/* Matches NR_CACHED_STACKS for VMAP_STACK */
> +#define NR_CACHED_SCS 2
> +static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]);
> +
> +void *scs_alloc(int node)
>  {
> -	void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node);
> +	int i;
> +	void *s;
> +
> +	for (i = 0; i < NR_CACHED_SCS; i++) {
> +		s = this_cpu_xchg(scs_cache[i], NULL);
> +		if (s) {
> +			kasan_unpoison_vmalloc(s, SCS_SIZE);
> +			memset(s, 0, SCS_SIZE);
> +			goto out;
> +		}
> +	}
> +
> +	s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
> +				 GFP_SCS, PAGE_KERNEL, 0, node,
> +				 __builtin_return_address(0));
>  
>  	if (!s)
>  		return NULL;

Sorry I didn't spot this before, but if you put the xchg/vmalloc code
into a new __scs_alloc() function then you can drop the label and this
becomes:

	s = __scs_alloc(...);
	if (!s)
		return NULL;

	*__scs_magic(s) = SCS_END_MAGIC;
	...

With that:

Acked-by: Will Deacon <will@kernel.org>

Will
Sami Tolvanen Nov. 30, 2020, 8:03 p.m. UTC | #3
On Mon, Nov 30, 2020 at 3:44 AM Will Deacon <will@kernel.org> wrote:
>
> On Tue, Nov 24, 2020 at 11:59:39AM -0800, Sami Tolvanen wrote:
> > +void *scs_alloc(int node)
> >  {
> > -     void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node);
> > +     int i;
> > +     void *s;
> > +
> > +     for (i = 0; i < NR_CACHED_SCS; i++) {
> > +             s = this_cpu_xchg(scs_cache[i], NULL);
> > +             if (s) {
> > +                     kasan_unpoison_vmalloc(s, SCS_SIZE);
> > +                     memset(s, 0, SCS_SIZE);
> > +                     goto out;
> > +             }
> > +     }
> > +
> > +     s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
> > +                              GFP_SCS, PAGE_KERNEL, 0, node,
> > +                              __builtin_return_address(0));
> >
> >       if (!s)
> >               return NULL;
>
> Sorry I didn't spot this before, but if you put the xchg/vmalloc code
> into a new __scs_alloc() function then you can drop the label and this
> becomes:
>
>         s = __scs_alloc(...);
>         if (!s)
>                 return NULL;
>
>         *__scs_maghic(s) = SCS_ENG_MAGIC;
>         ...

Good point, I'll change this in v3.

Sami
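
For clarity, a minimal sketch of what the refactor suggested above might
look like, folding the per-CPU cache lookup and the vmalloc fallback into
a helper (the __scs_alloc() name comes from the review; this is an
illustration, not the actual v3 patch):

static void *__scs_alloc(int node)
{
	int i;
	void *s;

	/* Reuse a cached stack for this CPU if one is available. */
	for (i = 0; i < NR_CACHED_SCS; i++) {
		s = this_cpu_xchg(scs_cache[i], NULL);
		if (s) {
			kasan_unpoison_vmalloc(s, SCS_SIZE);
			memset(s, 0, SCS_SIZE);
			return s;
		}
	}

	/* Otherwise fall back to a fresh vmalloc allocation. */
	return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
				    GFP_SCS, PAGE_KERNEL, 0, node,
				    __builtin_return_address(0));
}

void *scs_alloc(int node)
{
	void *s = __scs_alloc(node);

	if (!s)
		return NULL;

	*__scs_magic(s) = SCS_END_MAGIC;

	/*
	 * Poison the allocation to catch unintentional accesses to
	 * the shadow stack when KASAN is enabled.
	 */
	kasan_poison_vmalloc(s, SCS_SIZE);
	__scs_account(s, 1);
	return s;
}

With the helper in place, the out: label in the v2 patch below is no
longer needed, which is the simplification Will asked for.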

Patch

diff --git a/include/linux/scs.h b/include/linux/scs.h
index 6dec390cf154..2a506c2a16f4 100644
--- a/include/linux/scs.h
+++ b/include/linux/scs.h
@@ -15,12 +15,8 @@ 
 
 #ifdef CONFIG_SHADOW_CALL_STACK
 
-/*
- * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit
- * architecture) provided ~40% safety margin on stack usage while keeping
- * memory allocation overhead reasonable.
- */
-#define SCS_SIZE		SZ_1K
+#define SCS_ORDER		0
+#define SCS_SIZE		(PAGE_SIZE << SCS_ORDER)
 #define GFP_SCS			(GFP_KERNEL | __GFP_ZERO)
 
 /* An illegal pointer value to mark the end of the shadow stack. */
@@ -33,6 +29,8 @@ 
 #define task_scs(tsk)		(task_thread_info(tsk)->scs_base)
 #define task_scs_sp(tsk)	(task_thread_info(tsk)->scs_sp)
 
+void *scs_alloc(int node);
+void scs_free(void *s);
 void scs_init(void);
 int scs_prepare(struct task_struct *tsk, int node);
 void scs_release(struct task_struct *tsk);
@@ -61,6 +59,8 @@  static inline bool task_scs_end_corrupted(struct task_struct *tsk)
 
 #else /* CONFIG_SHADOW_CALL_STACK */
 
+static inline void *scs_alloc(int node) { return NULL; }
+static inline void scs_free(void *s) {}
 static inline void scs_init(void) {}
 static inline void scs_task_reset(struct task_struct *tsk) {}
 static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; }
diff --git a/kernel/scs.c b/kernel/scs.c
index 4ff4a7ba0094..25b0dd5aa0e2 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -5,50 +5,94 @@ 
  * Copyright (C) 2019 Google LLC
  */
 
+#include <linux/cpuhotplug.h>
 #include <linux/kasan.h>
 #include <linux/mm.h>
 #include <linux/scs.h>
-#include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <linux/vmstat.h>
 
-static struct kmem_cache *scs_cache;
-
 static void __scs_account(void *s, int account)
 {
-	struct page *scs_page = virt_to_page(s);
+	struct page *scs_page = vmalloc_to_page(s);
 
 	mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
 			    account * (SCS_SIZE / SZ_1K));
 }
 
-static void *scs_alloc(int node)
+/* Matches NR_CACHED_STACKS for VMAP_STACK */
+#define NR_CACHED_SCS 2
+static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]);
+
+void *scs_alloc(int node)
 {
-	void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node);
+	int i;
+	void *s;
+
+	for (i = 0; i < NR_CACHED_SCS; i++) {
+		s = this_cpu_xchg(scs_cache[i], NULL);
+		if (s) {
+			kasan_unpoison_vmalloc(s, SCS_SIZE);
+			memset(s, 0, SCS_SIZE);
+			goto out;
+		}
+	}
+
+	s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
+				 GFP_SCS, PAGE_KERNEL, 0, node,
+				 __builtin_return_address(0));
 
 	if (!s)
 		return NULL;
 
+out:
 	*__scs_magic(s) = SCS_END_MAGIC;
 
 	/*
 	 * Poison the allocation to catch unintentional accesses to
 	 * the shadow stack when KASAN is enabled.
 	 */
-	kasan_poison_object_data(scs_cache, s);
+	kasan_poison_vmalloc(s, SCS_SIZE);
 	__scs_account(s, 1);
 	return s;
 }
 
-static void scs_free(void *s)
+void scs_free(void *s)
 {
+	int i;
+
 	__scs_account(s, -1);
-	kasan_unpoison_object_data(scs_cache, s);
-	kmem_cache_free(scs_cache, s);
+
+	/*
+	 * We cannot sleep as this can be called in interrupt context,
+	 * so use this_cpu_cmpxchg to update the cache, and vfree_atomic
+	 * to free the stack.
+	 */
+
+	for (i = 0; i < NR_CACHED_SCS; i++)
+		if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
+			return;
+
+	vfree_atomic(s);
+}
+
+static int scs_cleanup(unsigned int cpu)
+{
+	int i;
+	void **cache = per_cpu_ptr(scs_cache, cpu);
+
+	for (i = 0; i < NR_CACHED_SCS; i++) {
+		vfree(cache[i]);
+		cache[i] = NULL;
+	}
+
+	return 0;
 }
 
 void __init scs_init(void)
 {
-	scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL);
+	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL,
+			  scs_cleanup);
 }
 
 int scs_prepare(struct task_struct *tsk, int node)