@@ -37,6 +37,15 @@ static inline int shstk_update_last_frame(unsigned long val) { return 0; }
static inline bool shstk_is_enabled(void) { return false; }
#endif /* CONFIG_X86_USER_SHADOW_STACK */
+int arch_create_rstor_token(unsigned long ssp, unsigned long *token_addr);
+bool arch_cpu_supports_shadow_stack(void);
+bool arch_is_shstk_enabled(struct task_struct *task);
+void arch_set_shstk_base_size(struct task_struct *task, unsigned long base,
+ unsigned long size);
+void arch_get_shstk_base_size(struct task_struct *task, unsigned long *base,
+ unsigned long *size);
+void arch_set_shstk_ptr_and_enable(unsigned long ssp);
+void arch_set_thread_shstk_status(bool enable);
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SHSTK_H */
@@ -5,9 +5,6 @@
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
#define MAP_ABOVE4G 0x80 /* only map above 4GB */
-/* Flags for map_shadow_stack(2) */
-#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */
-
#include <asm-generic/mman.h>
#endif /* _ASM_X86_MMAN_H */
@@ -25,6 +25,7 @@
#include <asm/special_insns.h>
#include <asm/fpu/api.h>
#include <asm/prctl.h>
+#include <linux/usershstk.h>
#define SS_FRAME_SIZE 8
@@ -43,11 +44,56 @@ static void features_clr(unsigned long features)
current->thread.features &= ~features;
}
+bool arch_cpu_supports_shadow_stack(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_USER_SHSTK);
+}
+
+bool arch_is_shstk_enabled(struct task_struct *task)
+{
+ return features_enabled(ARCH_SHSTK_SHSTK);
+}
+
+void arch_set_shstk_base_size(struct task_struct *task, unsigned long base,
+ unsigned long size)
+{
+ struct thread_shstk *shstk = &task->thread.shstk;
+
+ shstk->base = base;
+ shstk->size = size;
+}
+
+void arch_get_shstk_base_size(struct task_struct *task, unsigned long *base,
+ unsigned long *size)
+{
+ struct thread_shstk *shstk = &task->thread.shstk;
+
+ *base = shstk->base;
+ *size = shstk->size;
+}
+
+
+void arch_set_shstk_ptr_and_enable(unsigned long ssp)
+{
+ fpregs_lock_and_load();
+ wrmsrl(MSR_IA32_PL3_SSP, ssp);
+ wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
+ fpregs_unlock();
+}
+
+void arch_set_thread_shstk_status(bool enable)
+{
+ if (enable)
+ features_set(ARCH_SHSTK_SHSTK);
+ else
+ features_clr(ARCH_SHSTK_SHSTK);
+}
+
/*
* Create a restore token on the shadow stack. A token is always 8-byte
* and aligned to 8.
*/
-static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
+int arch_create_rstor_token(unsigned long ssp, unsigned long *token_addr)
{
unsigned long addr;
@@ -72,118 +118,6 @@ static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
return 0;
}
-/*
- * VM_SHADOW_STACK will have a guard page. This helps userspace protect
- * itself from attacks. The reasoning is as follows:
- *
- * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
- * INCSSP instruction can increment the shadow stack pointer. It is the
- * shadow stack analog of an instruction like:
- *
- * addq $0x80, %rsp
- *
- * However, there is one important difference between an ADD on %rsp
- * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
- * memory of the first and last elements that were "popped". It can be
- * thought of as acting like this:
- *
- * READ_ONCE(ssp); // read+discard top element on stack
- * ssp += nr_to_pop * 8; // move the shadow stack
- * READ_ONCE(ssp-8); // read+discard last popped stack element
- *
- * The maximum distance INCSSP can move the SSP is 2040 bytes, before
- * it would read the memory. Therefore a single page gap will be enough
- * to prevent any operation from shifting the SSP to an adjacent stack,
- * since it would have to land in the gap at least once, causing a
- * fault.
- */
-static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
- unsigned long token_offset, bool set_res_tok)
-{
- int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
- struct mm_struct *mm = current->mm;
- unsigned long mapped_addr, unused;
-
- if (addr)
- flags |= MAP_FIXED_NOREPLACE;
-
- mmap_write_lock(mm);
- mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
- VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
- mmap_write_unlock(mm);
-
- if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
- goto out;
-
- if (create_rstor_token(mapped_addr + token_offset, NULL)) {
- vm_munmap(mapped_addr, size);
- return -EINVAL;
- }
-
-out:
- return mapped_addr;
-}
-
-static unsigned long adjust_shstk_size(unsigned long size)
-{
- if (size)
- return PAGE_ALIGN(size);
-
- return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
-}
-
-static void unmap_shadow_stack(u64 base, u64 size)
-{
- int r;
-
- r = vm_munmap(base, size);
-
- /*
- * mmap_write_lock_killable() failed with -EINTR. This means
- * the process is about to die and have it's MM cleaned up.
- * This task shouldn't ever make it back to userspace. In this
- * case it is ok to leak a shadow stack, so just exit out.
- */
- if (r == -EINTR)
- return;
-
- /*
- * For all other types of vm_munmap() failure, either the
- * system is out of memory or there is bug.
- */
- WARN_ON_ONCE(r);
-}
-
-static int shstk_setup(void)
-{
- struct thread_shstk *shstk = ¤t->thread.shstk;
- unsigned long addr, size;
-
- /* Already enabled */
- if (features_enabled(ARCH_SHSTK_SHSTK))
- return 0;
-
- /* Also not supported for 32 bit */
- if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
- return -EOPNOTSUPP;
-
- size = adjust_shstk_size(0);
- addr = alloc_shstk(0, size, 0, false);
- if (IS_ERR_VALUE(addr))
- return PTR_ERR((void *)addr);
-
- fpregs_lock_and_load();
- wrmsrl(MSR_IA32_PL3_SSP, addr + size);
- wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
- fpregs_unlock();
-
- shstk->base = addr;
- shstk->size = size;
- features_set(ARCH_SHSTK_SHSTK);
-
- return 0;
-}
-
void reset_thread_features(void)
{
memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk));
@@ -191,48 +125,6 @@ void reset_thread_features(void)
current->thread.features_locked = 0;
}
-unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
- unsigned long stack_size)
-{
- struct thread_shstk *shstk = &tsk->thread.shstk;
- unsigned long addr, size;
-
- /*
- * If shadow stack is not enabled on the new thread, skip any
- * switch to a new shadow stack.
- */
- if (!features_enabled(ARCH_SHSTK_SHSTK))
- return 0;
-
- /*
- * For CLONE_VFORK the child will share the parents shadow stack.
- * Make sure to clear the internal tracking of the thread shadow
- * stack so the freeing logic run for child knows to leave it alone.
- */
- if (clone_flags & CLONE_VFORK) {
- shstk->base = 0;
- shstk->size = 0;
- return 0;
- }
-
- /*
- * For !CLONE_VM the child will use a copy of the parents shadow
- * stack.
- */
- if (!(clone_flags & CLONE_VM))
- return 0;
-
- size = adjust_shstk_size(stack_size);
- addr = alloc_shstk(0, size, 0, false);
- if (IS_ERR_VALUE(addr))
- return addr;
-
- shstk->base = addr;
- shstk->size = size;
-
- return addr + size;
-}
-
static unsigned long get_user_shstk_addr(void)
{
unsigned long long ssp;
@@ -402,44 +294,6 @@ int restore_signal_shadow_stack(void)
return 0;
}
-void shstk_free(struct task_struct *tsk)
-{
- struct thread_shstk *shstk = &tsk->thread.shstk;
-
- if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
- !features_enabled(ARCH_SHSTK_SHSTK))
- return;
-
- /*
- * When fork() with CLONE_VM fails, the child (tsk) already has a
- * shadow stack allocated, and exit_thread() calls this function to
- * free it. In this case the parent (current) and the child share
- * the same mm struct.
- */
- if (!tsk->mm || tsk->mm != current->mm)
- return;
-
- /*
- * If shstk->base is NULL, then this task is not managing its
- * own shadow stack (CLONE_VFORK). So skip freeing it.
- */
- if (!shstk->base)
- return;
-
- /*
- * shstk->base is NULL for CLONE_VFORK child tasks, and so is
- * normal. But size = 0 on a shstk->base is not normal and
- * indicated an attempt to free the thread shadow stack twice.
- * Warn about it.
- */
- if (WARN_ON(!shstk->size))
- return;
-
- unmap_shadow_stack(shstk->base, shstk->size);
-
- shstk->size = 0;
-}
-
static int wrss_control(bool enable)
{
u64 msrval;
@@ -502,36 +356,6 @@ static int shstk_disable(void)
return 0;
}
-SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
-{
- bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
- unsigned long aligned_size;
-
- if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
- return -EOPNOTSUPP;
-
- if (flags & ~SHADOW_STACK_SET_TOKEN)
- return -EINVAL;
-
- /* If there isn't space for a token */
- if (set_tok && size < 8)
- return -ENOSPC;
-
- if (addr && addr < SZ_4G)
- return -ERANGE;
-
- /*
- * An overflow would result in attempting to write the restore token
- * to the wrong location. Not catastrophic, but just return the right
- * error code and block it.
- */
- aligned_size = PAGE_ALIGN(size);
- if (aligned_size < size)
- return -EOVERFLOW;
-
- return alloc_shstk(addr, aligned_size, size, set_tok);
-}
-
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
{
unsigned long features = arg2;
new file mode 100644
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SHSTK_H
+#define _SHSTK_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+unsigned long alloc_shstk(unsigned long addr, unsigned long size,
+ unsigned long token_offset, bool set_res_tok);
+int shstk_setup(void);
+int create_rstor_token(unsigned long ssp, unsigned long *token_addr);
+bool cpu_supports_shadow_stack(void);
+bool is_shstk_enabled(struct task_struct *task);
+void set_shstk_base_size(struct task_struct *task, unsigned long base,
+ unsigned long size);
+void get_shstk_base_size(struct task_struct *task, unsigned long *base,
+ unsigned long *size);
+void set_shstk_ptr_and_enable(unsigned long ssp);
+void set_thread_shstk_status(bool enable);
+unsigned long adjust_shstk_size(unsigned long size);
+void unmap_shadow_stack(u64 base, u64 size);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _SHSTK_H */
@@ -87,4 +87,7 @@
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)
+/* Flags for map_shadow_stack(2) */
+#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */
+
#endif /* __ASM_GENERIC_MMAN_COMMON_H */
@@ -140,6 +140,8 @@ KCOV_INSTRUMENT_stackleak.o := n
obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
+obj-$(CONFIG_ARCH_HAS_USER_SHADOW_STACK) += usershstk.o
+
$(obj)/configs.o: $(obj)/config_data.gz
targets += config_data config_data.gz
new file mode 100644
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * shstk.c - Intel shadow stack support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu <yu-cheng.yu@intel.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/compat.h>
+#include <linux/sizes.h>
+#include <linux/user.h>
+#include <linux/syscalls.h>
+#include <asm/shstk.h>
+#include <linux/usershstk.h>
+
+#define SHSTK_ENTRY_SIZE sizeof(void *)
+
+bool cpu_supports_shadow_stack(void)
+{
+ return arch_cpu_supports_shadow_stack();
+}
+
+bool is_shstk_enabled(struct task_struct *task)
+{
+ return arch_is_shstk_enabled(task);
+}
+
+void set_shstk_base_size(struct task_struct *task, unsigned long base,
+ unsigned long size)
+{
+ arch_set_shstk_base_size(task, base, size);
+}
+
+void get_shstk_base_size(struct task_struct *task, unsigned long *base,
+ unsigned long *size)
+{
+ arch_get_shstk_base_size(task, base, size);
+}
+
+void set_shstk_ptr_and_enable(unsigned long ssp)
+{
+ arch_set_shstk_ptr_and_enable(ssp);
+}
+
+void set_thread_shstk_status(bool enable)
+{
+ arch_set_thread_shstk_status(enable);
+}
+
+int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
+{
+ return arch_create_rstor_token(ssp, token_addr);
+}
+
+unsigned long adjust_shstk_size(unsigned long size)
+{
+ if (size)
+ return PAGE_ALIGN(size);
+
+ return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
+}
+
+void unmap_shadow_stack(u64 base, u64 size)
+{
+ int r;
+
+ r = vm_munmap(base, size);
+
+ /*
+ * mmap_write_lock_killable() failed with -EINTR. This means
+ * the process is about to die and have it's MM cleaned up.
+ * This task shouldn't ever make it back to userspace. In this
+ * case it is ok to leak a shadow stack, so just exit out.
+ */
+ if (r == -EINTR)
+ return;
+
+ /*
+ * For all other types of vm_munmap() failure, either the
+ * system is out of memory or there is bug.
+ */
+ WARN_ON_ONCE(r);
+}
+
+/*
+ * VM_SHADOW_STACK will have a guard page. This helps userspace protect
+ * itself from attacks. The reasoning is as follows:
+ *
+ * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
+ * INCSSP instruction can increment the shadow stack pointer. It is the
+ * shadow stack analog of an instruction like:
+ *
+ * addq $0x80, %rsp
+ *
+ * However, there is one important difference between an ADD on %rsp
+ * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
+ * memory of the first and last elements that were "popped". It can be
+ * thought of as acting like this:
+ *
+ * READ_ONCE(ssp); // read+discard top element on stack
+ * ssp += nr_to_pop * 8; // move the shadow stack
+ * READ_ONCE(ssp-8); // read+discard last popped stack element
+ *
+ * The maximum distance INCSSP can move the SSP is 2040 bytes, before
+ * it would read the memory. Therefore a single page gap will be enough
+ * to prevent any operation from shifting the SSP to an adjacent stack,
+ * since it would have to land in the gap at least once, causing a
+ * fault.
+ */
+unsigned long alloc_shstk(unsigned long addr, unsigned long size,
+ unsigned long token_offset, bool set_res_tok)
+{
+ int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+
+ flags |= IS_ENABLED(CONFIG_X86_64) ? MAP_ABOVE4G : 0;
+
+ struct mm_struct *mm = current->mm;
+ unsigned long mapped_addr, unused;
+
+ if (addr)
+ flags |= MAP_FIXED_NOREPLACE;
+
+ mmap_write_lock(mm);
+ mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+ VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
+ mmap_write_unlock(mm);
+
+ if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
+ goto out;
+
+ if (create_rstor_token(mapped_addr + token_offset, NULL)) {
+ vm_munmap(mapped_addr, size);
+ return -EINVAL;
+ }
+
+out:
+ return mapped_addr;
+}
+
+void shstk_free(struct task_struct *tsk)
+{
+ unsigned long base, size;
+
+ if (!cpu_supports_shadow_stack() ||
+ !is_shstk_enabled(current))
+ return;
+
+ /*
+ * When fork() with CLONE_VM fails, the child (tsk) already has a
+ * shadow stack allocated, and exit_thread() calls this function to
+ * free it. In this case the parent (current) and the child share
+ * the same mm struct.
+ */
+ if (!tsk->mm || tsk->mm != current->mm)
+ return;
+
+ get_shstk_base_size(tsk, &base, &size);
+ /*
+ * If shstk->base is NULL, then this task is not managing its
+ * own shadow stack (CLONE_VFORK). So skip freeing it.
+ */
+ if (!base)
+ return;
+
+ /*
+ * shstk->base is NULL for CLONE_VFORK child tasks, and so is
+ * normal. But size = 0 on a shstk->base is not normal and
+ * indicated an attempt to free the thread shadow stack twice.
+ * Warn about it.
+ */
+ if (WARN_ON(!size))
+ return;
+
+ unmap_shadow_stack(base, size);
+
+ set_shstk_base_size(tsk, 0, 0);
+}
+
+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
+{
+ bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
+ unsigned long aligned_size;
+
+ if (!cpu_supports_shadow_stack())
+ return -EOPNOTSUPP;
+
+ if (flags & ~SHADOW_STACK_SET_TOKEN)
+ return -EINVAL;
+
+ /* If there isn't space for a token */
+ if (set_tok && size < SHSTK_ENTRY_SIZE)
+ return -ENOSPC;
+
+ if (addr && (addr & (PAGE_SIZE - 1)))
+ return -EINVAL;
+
+ if (IS_ENABLED(CONFIG_X86_64) &&
+ addr && addr < SZ_4G)
+ return -ERANGE;
+
+ /*
+ * An overflow would result in attempting to write the restore token
+ * to the wrong location. Not catastrophic, but just return the right
+ * error code and block it.
+ */
+ aligned_size = PAGE_ALIGN(size);
+ if (aligned_size < size)
+ return -EOVERFLOW;
+
+ return alloc_shstk(addr, aligned_size, size, set_tok);
+}
+
+int shstk_setup(void)
+{
+ struct thread_shstk *shstk = ¤t->thread.shstk;
+ unsigned long addr, size;
+
+ /* Already enabled */
+ if (is_shstk_enabled(current))
+ return 0;
+
+ /* Also not supported for 32 bit */
+ if (!cpu_supports_shadow_stack() ||
+ (IS_ENABLED(CONFIG_X86_64) && in_ia32_syscall()))
+ return -EOPNOTSUPP;
+
+ size = adjust_shstk_size(0);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return PTR_ERR((void *)addr);
+
+ set_shstk_ptr_and_enable(addr + size);
+ set_shstk_base_size(current, addr, size);
+
+ set_thread_shstk_status(true);
+
+ return 0;
+}
+
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
+ unsigned long stack_size)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+ unsigned long addr, size;
+
+ if (!cpu_supports_shadow_stack())
+ return -EOPNOTSUPP;
+
+ /*
+ * If shadow stack is not enabled on the new thread, skip any
+ * switch to a new shadow stack.
+ */
+ if (!is_shstk_enabled(tsk))
+ return 0;
+
+ /*
+ * For CLONE_VFORK the child will share the parents shadow stack.
+ * Make sure to clear the internal tracking of the thread shadow
+ * stack so the freeing logic run for child knows to leave it alone.
+ */
+ if (clone_flags & CLONE_VFORK) {
+ set_shstk_base_size(tsk, 0, 0);
+ return 0;
+ }
+
+ /*
+ * For !CLONE_VM the child will use a copy of the parents shadow
+ * stack.
+ */
+ if (!(clone_flags & CLONE_VM))
+ return 0;
+
+ size = adjust_shstk_size(stack_size);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ set_shstk_base_size(tsk, addr, size);
+
+ return addr + size;
+}
CPU assisted shadow stack are supported by x86, arm64 and risc-v. In terms of enabling shadow stack feature for usermode code in kernel, they have following commonalities - Expose a user ABI (via a prctl) to allow user mode to explicitly ask for enabling shadow stack instead of by default enabling it. x86 series pre-dates arm64 or risc-v announcment of support, so it ended up doing a arch specific prctl instead of generic one. arm64 and risc-v have converged on using generic prctl and each of them can handle it appropriatley. - On fork or clone, shadow stack has to be COWed or not COWed depending on CLONE_VM was passed or not. Additionally if CLONE_VFORK was passed then same (parent one) shadow stack should be used. - To create shadow stack mappings, implement `map_shadow_stack` system call. This patch picks up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` config introduction and incorproate most of the common flows between different architectures. On a high level, shadow stack allocation and shadow stack de-allocation are base operations on virtual memory and common between architectures. Similarly shadow stack setup on prctl (arch specific or otherwise) is a common flow. Treatment of shadow stack virtual memory on `clone/fork` and implementaiton of `map_shadow_stack` is also converged into common flow. To implement these common flows, each architecture have arch-specific enabling mechanism as well as arch-specific data structures in task/ thread struct. So additionally this patch tries to abstract certain operation/helpers and allowing each architecture to have their arch_* implementation to implement the abstractions. Signed-off-by: Deepak Gupta <debug@rivosinc.com> --- arch/x86/include/asm/shstk.h | 9 + arch/x86/include/uapi/asm/mman.h | 3 - arch/x86/kernel/shstk.c | 270 ++++++------------------------ include/linux/usershstk.h | 25 +++ include/uapi/asm-generic/mman-common.h | 3 + kernel/Makefile | 2 + kernel/usershstk.c | 289 +++++++++++++++++++++++++++++++++ 7 files changed, 375 insertions(+), 226 deletions(-)