
[RFC,RFT,2/5] fork: Add shadow stack support to clone3()

Message ID 20231023-clone3-shadow-stack-v1-2-d867d0b5d4d0@kernel.org
State New
Series fork: Support shadow stacks in clone3()

Commit Message

Mark Brown Oct. 23, 2023, 1:20 p.m. UTC
Unlike with the normal stack there is no API for configuring the shadow
stack for a new thread; instead the kernel will dynamically allocate a new
shadow stack with the same size as the normal stack. This appears to be due
to the shadow stack series having been in development since before the more
extensible clone3() was added, rather than anything more deliberate.

Add parameters to clone3() specifying the address and size of a shadow
stack for the newly created process. We validate that the range specified
is accessible to userspace but do not validate that it has been mapped
appropriately for use as a shadow stack (normally via map_shadow_stack()).
If the shadow stack is specified in this way then the caller is responsible
for freeing the memory, as with the main stack. If no shadow stack is
specified then the existing implicit allocation and freeing behaviour is
maintained.

If the architecture does not support shadow stacks the shadow stack
parameters must be zero; architectures that do support the feature are
expected to impose the same requirement on individual systems that lack
shadow stack support.

Update the existing x86 implementation to pay attention to the newly added
arguments; in order to maintain compatibility we use the existing behaviour
if no shadow stack is specified. Minimal validation is done of the supplied
parameters; detailed enforcement is left to when the thread is executed.
Since we are now using four fields from the kernel_clone_args we pass that
struct into the shadow stack code rather than individual fields.

Signed-off-by: Mark Brown <broonie@kernel.org>
---
 arch/x86/include/asm/shstk.h | 11 +++++++----
 arch/x86/kernel/process.c    |  2 +-
 arch/x86/kernel/shstk.c      | 36 +++++++++++++++++++++++++++++++-----
 include/linux/sched/task.h   |  2 ++
 include/uapi/linux/sched.h   | 17 ++++++++++++++---
 kernel/fork.c                | 40 ++++++++++++++++++++++++++++++++++++++--
 6 files changed, 93 insertions(+), 15 deletions(-)
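
For illustration, a minimal sketch of how userspace might invoke the
extended interface. It assumes a kernel with this series applied, uapi
headers that define the new clone_args fields, and a libc that defines
SYS_clone3; the wrapper function itself is hypothetical:

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_VM */
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long spawn_with_shadow_stack(void *stack, size_t stack_size,
				    void *shstk, size_t shstk_size)
{
	struct clone_args args;

	memset(&args, 0, sizeof(args));
	/* A minimal thread-like child; real threads set more flags. */
	args.flags = CLONE_VM;
	args.stack = (uintptr_t)stack;
	args.stack_size = stack_size;
	/* New in this series: both fields zero, or both set. */
	args.shadow_stack = (uintptr_t)shstk;
	args.shadow_stack_size = shstk_size;

	/* sizeof(args) is CLONE_ARGS_SIZE_VER3 (104) with this series. */
	return syscall(SYS_clone3, &args, sizeof(args));
}

The shadow stack itself would normally have been created beforehand with
map_shadow_stack() so that it is mapped VM_SHADOW_STACK.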

Comments

Rick Edgecombe Oct. 23, 2023, 4:32 p.m. UTC | #1
+Some security folks

On Mon, 2023-10-23 at 14:20 +0100, Mark Brown wrote:
> Unlike with the normal stack there is no API for configuring the
> shadow
> stack for a new thread, instead the kernel will dynamically allocate
> a new
> shadow stack with the same size as the normal stack. This appears to
> be due
> to the shadow stack series having been in development since before
> the more
> extensible clone3() was added rather than anything more deliberate.
> 
> Add parameters to clone3() specifying the address and size of a
> shadow
> stack for the newly created process, we validate that the range
> specified
> is accessible to userspace but do not validate that it has been
> mapped
> appropriately for use as a shadow stack (normally via
> map_shadow_stack()).
> If the shadow stack is specified in this way then the caller is
> responsible
> for freeing the memory as with the main stack. If no shadow stack is
> specified then the existing implicit allocation and freeing behaviour
> is
> maintained.
> 
> If the architecture does not support shadow stacks the shadow stack
> parameters must be zero, architectures that do support the feature
> are
> expected to have the same requirement on individual systems that lack
> shadow stack support.
> 
> Update the existing x86 implementation to pay attention to the newly
> added
> arguments, in order to maintain compatibility we use the existing
> behaviour
> if no shadow stack is specified. Minimal validation is done of the
> supplied
> parameters, detailed enforcement is left to when the thread is
> executed.
> Since we are now using four fields from the kernel_clone_args we pass
> that
> into the shadow stack code rather than individual fields.

This will give userspace new powers, very close to a "set SSP" ability.
They could start a new thread on an active shadow stack, corrupt it,
etc.

One way to avoid this would be to have shstk_alloc_thread_stack()
consume a token on the shadow stack passed in the clone args. But it's
tricky because there is not a CMPXCHG, on x86 at least, that works with
shadow stack accesses. So the kernel would probably have to GUP the
page and do a normal CMPXCHG off of the direct map.
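
Roughly something like this untested sketch - the helper name and exact
GUP flags are illustrative only, not from this series:

static int consume_clone3_shstk_token(unsigned long token_addr,
				      unsigned long expected)
{
	unsigned long *tokenp;
	struct page *page;
	void *kaddr;
	int ret = -EINVAL;

	/*
	 * Pin the page holding the token; shadow stack memory can't be
	 * written with ordinary user-mode stores, so go via the kernel
	 * mapping instead.
	 */
	if (get_user_pages_fast(token_addr, 1, FOLL_WRITE, &page) != 1)
		return -EFAULT;

	kaddr = kmap_local_page(page);
	tokenp = kaddr + offset_in_page(token_addr);

	/* Atomically claim the token so racing clones can't both win. */
	if (try_cmpxchg(tokenp, &expected, 0UL))
		ret = 0;

	kunmap_local(kaddr);
	put_page(page);
	return ret;
}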

That said, it's already possible to get two threads on the same shadow
stack by unmapping one and mapping another shadow stack in the same
place, while the target thread is not doing a call/ret. I don't know if
there is anything we could do about that without serious compatibility
restrictions. But this patch would make it a bit more trivial.

I might lean towards the token solution, even if it becomes more
heavyweight to use clone3 in this way. It depends on whether the above is
worth defending.

> 
> Signed-off-by: Mark Brown <broonie@kernel.org>
> ---
>  arch/x86/include/asm/shstk.h | 11 +++++++----
>  arch/x86/kernel/process.c    |  2 +-
>  arch/x86/kernel/shstk.c      | 36 +++++++++++++++++++++++++++++++---
> --
>  include/linux/sched/task.h   |  2 ++
>  include/uapi/linux/sched.h   | 17 ++++++++++++++---
>  kernel/fork.c                | 40
> ++++++++++++++++++++++++++++++++++++++--
>  6 files changed, 93 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/include/asm/shstk.h
> b/arch/x86/include/asm/shstk.h
> index 42fee8959df7..8be7b0a909c3 100644
> --- a/arch/x86/include/asm/shstk.h
> +++ b/arch/x86/include/asm/shstk.h
> @@ -6,6 +6,7 @@
>  #include <linux/types.h>
>  
>  struct task_struct;
> +struct kernel_clone_args;
>  struct ksignal;
>  
>  #ifdef CONFIG_X86_USER_SHADOW_STACK
> @@ -16,8 +17,8 @@ struct thread_shstk {
>  
>  long shstk_prctl(struct task_struct *task, int option, unsigned long
> arg2);
>  void reset_thread_features(void);
> -unsigned long shstk_alloc_thread_stack(struct task_struct *p,
> unsigned long clone_flags,
> -                                      unsigned long stack_size);
> +unsigned long shstk_alloc_thread_stack(struct task_struct *p,
> +                                      const struct kernel_clone_args
> *args);
>  void shstk_free(struct task_struct *p);
>  int setup_signal_shadow_stack(struct ksignal *ksig);
>  int restore_signal_shadow_stack(void);
> @@ -26,8 +27,10 @@ static inline long shstk_prctl(struct task_struct
> *task, int option,
>                                unsigned long arg2) { return -EINVAL;
> }
>  static inline void reset_thread_features(void) {}
>  static inline unsigned long shstk_alloc_thread_stack(struct
> task_struct *p,
> -                                                    unsigned long
> clone_flags,
> -                                                    unsigned long
> stack_size) { return 0; }
> +                                                    const struct
> kernel_clone_args *args)
> +{
> +       return 0;
> +}
>  static inline void shstk_free(struct task_struct *p) {}
>  static inline int setup_signal_shadow_stack(struct ksignal *ksig) {
> return 0; }
>  static inline int restore_signal_shadow_stack(void) { return 0; }
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index b6f4e8399fca..a9ca80ea5056 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -207,7 +207,7 @@ int copy_thread(struct task_struct *p, const
> struct kernel_clone_args *args)
>          * is disabled, new_ssp will remain 0, and fpu_clone() will
> know not to
>          * update it.
>          */
> -       new_ssp = shstk_alloc_thread_stack(p, clone_flags, args-
> >stack_size);
> +       new_ssp = shstk_alloc_thread_stack(p, args);
>         if (IS_ERR_VALUE(new_ssp))
>                 return PTR_ERR((void *)new_ssp);
>  
> diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
> index 59e15dd8d0f8..3ae5c3d657dc 100644
> --- a/arch/x86/kernel/shstk.c
> +++ b/arch/x86/kernel/shstk.c
> @@ -191,18 +191,44 @@ void reset_thread_features(void)
>         current->thread.features_locked = 0;
>  }
>  
> -unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
> unsigned long clone_flags,
> -                                      unsigned long stack_size)
> +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
> +                                      const struct kernel_clone_args
> *args)
>  {
>         struct thread_shstk *shstk = &tsk->thread.shstk;
> +       unsigned long clone_flags = args->flags;
>         unsigned long addr, size;
>  
>         /*
>          * If shadow stack is not enabled on the new thread, skip any
> -        * switch to a new shadow stack.
> +        * implicit switch to a new shadow stack and reject attempts
> to
> +        * explicitly specify one.
>          */
> -       if (!features_enabled(ARCH_SHSTK_SHSTK))
> +       if (!features_enabled(ARCH_SHSTK_SHSTK)) {
> +               if (args->shadow_stack)
> +                       return (unsigned long)ERR_PTR(-EINVAL);
> +
>                 return 0;
> +       }
> +
> +       /*
> +        * If the user specified a shadow stack then do some basic
> +        * validation and use it.  The caller is responsible for
> +        * freeing the shadow stack.
> +        */
> +       if (args->shadow_stack) {
> +               addr = args->shadow_stack;
> +               size = args->shadow_stack_size;
> +
> +               if (!IS_ALIGNED(addr, 8))
> +                       return (unsigned long)ERR_PTR(-EINVAL);
> +               if (size < 8)
> +                       return (unsigned long)ERR_PTR(-EINVAL);
> +
> +               shstk->base = 0;
> +               shstk->size = 0;
> +
> +               return addr + size;
> +       }
>  
>         /*
>          * For CLONE_VFORK the child will share the parent's shadow
> stack.
> @@ -222,7 +248,7 @@ unsigned long shstk_alloc_thread_stack(struct
> task_struct *tsk, unsigned long cl
>         if (!(clone_flags & CLONE_VM))
>                 return 0;
>  
> -       size = adjust_shstk_size(stack_size);
> +       size = adjust_shstk_size(args->stack_size);
>         addr = alloc_shstk(0, size, 0, false);
>         if (IS_ERR_VALUE(addr))
>                 return addr;
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index a23af225c898..94e7cf62be51 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -41,6 +41,8 @@ struct kernel_clone_args {
>         void *fn_arg;
>         struct cgroup *cgrp;
>         struct css_set *cset;
> +       unsigned long shadow_stack;
> +       unsigned long shadow_stack_size;
>  };
>  
>  /*
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index 3bac0a8ceab2..1bd1b956834d 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -84,6 +84,14 @@
>   *                kernel's limit of nested PID namespaces.
>   * @cgroup:       If CLONE_INTO_CGROUP is specified set this to
>   *                a file descriptor for the cgroup.
> + * @shadow_stack: Specify the location of the shadow stack for the
> + *                child process.
> + *                Note, @shadow_stack is expected to point to the
> + *                lowest address. The stack direction will be
> + *                determined by the kernel and set up
> + *                appropriately based on @shadow_stack_size.
> + * @shadow_stack_size:   The size of the shadow stack for the child
> + *                       process.
>   *
>   * The structure is versioned by size and thus extensible.
>   * New struct members must go at the end of the struct and
> @@ -101,12 +109,15 @@ struct clone_args {
>         __aligned_u64 set_tid;
>         __aligned_u64 set_tid_size;
>         __aligned_u64 cgroup;
> +       __aligned_u64 shadow_stack;
> +       __aligned_u64 shadow_stack_size;
>  };
>  #endif
>  
> -#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
> -#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
> -#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
> +#define CLONE_ARGS_SIZE_VER0  64 /* sizeof first published struct */
> +#define CLONE_ARGS_SIZE_VER1  80 /* sizeof second published struct
> */
> +#define CLONE_ARGS_SIZE_VER2  88 /* sizeof third published struct */
> +#define CLONE_ARGS_SIZE_VER3 104 /* sizeof fourth published struct
> */
>  
>  /*
>   * Scheduling policies
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 3b6d20dfb9a8..bd61aa7353b0 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -3069,7 +3069,9 @@ noinline static int
> copy_clone_args_from_user(struct kernel_clone_args *kargs,
>                      CLONE_ARGS_SIZE_VER1);
>         BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
>                      CLONE_ARGS_SIZE_VER2);
> -       BUILD_BUG_ON(sizeof(struct clone_args) !=
> CLONE_ARGS_SIZE_VER2);
> +       BUILD_BUG_ON(offsetofend(struct clone_args,
> shadow_stack_size) !=
> +                    CLONE_ARGS_SIZE_VER3);
> +       BUILD_BUG_ON(sizeof(struct clone_args) !=
> CLONE_ARGS_SIZE_VER3);
>  
>         if (unlikely(usize > PAGE_SIZE))
>                 return -E2BIG;
> @@ -3112,6 +3114,8 @@ noinline static int
> copy_clone_args_from_user(struct kernel_clone_args *kargs,
>                 .tls            = args.tls,
>                 .set_tid_size   = args.set_tid_size,
>                 .cgroup         = args.cgroup,
> +               .shadow_stack   = args.shadow_stack,
> +               .shadow_stack_size      = args.shadow_stack_size,
>         };
>  
>         if (args.set_tid &&
> @@ -3152,6 +3156,38 @@ static inline bool clone3_stack_valid(struct
> kernel_clone_args *kargs)
>         return true;
>  }
>  
> +/**
> + * clone3_shadow_stack_valid - check and prepare shadow stack
> + * @kargs: kernel clone args
> + *
> + * Verify that the shadow stack arguments userspace gave us are
> sane.
> + */
> +static inline bool clone3_shadow_stack_valid(struct
> kernel_clone_args *kargs)
> +{
> +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
> +       if (kargs->shadow_stack) {
> +               if (!kargs->shadow_stack_size)
> +                       return false;
> +
> +               /*
> +                * This doesn't validate that the addresses are
> mapped
> +                * VM_SHADOW_STACK, just that they're mapped at all.
> +                */

It just checks the range, right?

> +               if (!access_ok((void __user *)kargs->shadow_stack,
> +                              kargs->shadow_stack_size))
> +                       return false;
> +       } else {
> +               if (kargs->shadow_stack_size)
> +                       return false;
> +       }
> +
> +       return true;
> +#else
> +       /* The architecture does not support shadow stacks */
> +       return !kargs->shadow_stack && !kargs->shadow_stack_size;
> +#endif
> +}
> +
>  static bool clone3_args_valid(struct kernel_clone_args *kargs)
>  {
>         /* Verify that no unknown flags are passed along. */
> @@ -3174,7 +3210,7 @@ static bool clone3_args_valid(struct
> kernel_clone_args *kargs)
>             kargs->exit_signal)
>                 return false;
>  
> -       if (!clone3_stack_valid(kargs))
> +       if (!clone3_stack_valid(kargs) ||
> !clone3_shadow_stack_valid(kargs))
>                 return false;
>  
>         return true;
>
Mark Brown Oct. 23, 2023, 6:32 p.m. UTC | #2
On Mon, Oct 23, 2023 at 04:32:22PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2023-10-23 at 14:20 +0100, Mark Brown wrote:

> +Some security folks

I *think* I captured everyone for future versions but I might've missed
some, it's a long Cc list.

> > Add parameters to clone3() specifying the address and size of a
> > shadow
> > stack for the newly created process, we validate that the range
> > specified
> > is accessible to userspace but do not validate that it has been
> > mapped
> > appropriately for use as a shadow stack (normally via
> > map_shadow_stack()).
> > If the shadow stack is specified in this way then the caller is
> > responsible
> > for freeing the memory as with the main stack. If no shadow stack is
> > specified then the existing implicit allocation and freeing behaviour
> > is
> > maintained.

> This will give userspace new powers, very close to a "set SSP" ability.
> They could start a new thread on an active shadow stack, corrupt it,
> etc.

That's true.

> One way to avoid this would be to have shstk_alloc_thread_stack()
> consume a token on the shadow stack passed in the clone args. But it's
> tricky because there is not a CMPXCHG, on x86 at least, that works with
> shadow stack accesses. So the kernel would probably have to GUP the
> page and do a normal CMPXCHG off of the direct map.

> That said, it's already possible to get two threads on the same shadow
> stack by unmapping one and mapping another shadow stack in the same
> place, while the target thread is not doing a call/ret. I don't know if
> there is anything we could do about that without serious compatibility
> restrictions. But this patch would make it a bit more trivial.

> I might lean towards the token solution, even if it becomes more heavy
> weight to use clone3 in this way. It depends on whether the above is
> worth defending.

Right.  We're already adding the cost of the extra map_shadow_stack() so
it doesn't seem that out of scope.  We could also allow clone3() to be
used for allocation, potentially removing the ability to specify the
address entirely and only specifying the size.  I did consider that
option but it felt awkward in the API, though equally the whole shadow
stack allocation thing is a bit that way.  That would avoid concerns
about placing and validating tokens entirely but would give less control
to userspace.

This also doesn't do anything to stop anyone trying to allocate sub-page
shadow stacks to save memory, with all the lack of overrun protection that
implies. That seems to me to be much more of a deliberate decision that
people might make, though, and a token would prevent it too unless write
access to the shadow stack is enabled.

> > +               /*
> > +                * This doesn't validate that the addresses are
> > mapped
> > +                * VM_SHADOW_STACK, just that they're mapped at all.
> > +                */

> It just checks the range, right?

Yes, same check as for the normal stack.
Rick Edgecombe Oct. 26, 2023, 5:10 p.m. UTC | #3
On Mon, 2023-10-23 at 19:32 +0100, Mark Brown wrote:
> Right.  We're already adding the cost of the extra map_shadow_stack()
> so
> it doesn't seem that out of scope.  We could also allow clone3() to
> be
> used for allocation, potentially removing the ability to specify the
> address entirely and only specifying the size.  I did consider that
> option but it felt awkward in the API, though equally the whole
> shadow
> stack allocation thing is a bit that way.  That would avoid concerns
> about placing and validating tokens entirely but gives less control
> to
> userspace.

There is also cost in the form of extra complexity. Not to throw FUD,
but GUP has been the source of thorny problems. And here we would be
doing it around security races. We're probably helped that shadow stack
is only private/anonymous memory, so maybe it's enough of a normal case
to not worry about it.

Still, there is some extra complexity, and I'm not sure if we really
need it. The justification seems to mostly be that it's not as flexible
as normal stacks with clone3.

I don't understand why doing size-only is awkward. Just because it
doesn't match the regular stack clone3 semantics?

> 
> This also doesn't do anything to stop anyone trying to allocate sub
> page
> shadow stacks if they're trying to save memory with all the lack of
> overrun protection that implies, though that seems to me to be much
> more
> of a deliberate decision that people might make, a token would
> prevent
> that too unless write access to the shadow stack is enabled.

Sorry, I'm not following. Sub-page shadow stacks?

> 
> > > +               /*
> > > +                * This doesn't validate that the addresses are
> > > mapped
> > > +                * VM_SHADOW_STACK, just that they're mapped at
> > > all.
> > > +                */
> 
> > It just checks the range, right?
> 
> Yes, same check as for the normal stack.

What looked wrong is that the comment says that it checks if the
addresses are mapped, but the code just does access_ok(). It's a minor
thing in any case.
Mark Brown Oct. 26, 2023, 5:53 p.m. UTC | #4
On Thu, Oct 26, 2023 at 05:10:47PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2023-10-23 at 19:32 +0100, Mark Brown wrote:

> > Right.  We're already adding the cost of the extra map_shadow_stack()
> > so
> > it doesn't seem that out of scope.  We could also allow clone3() to
> > be
> > used for allocation, potentially removing the ability to specify the
> > address entirely and only specifying the size.  I did consider that
> > option but it felt awkward in the API, though equally the whole
> > shadow
> > stack allocation thing is a bit that way.  That would avoid concerns
> > about placing and validating tokens entirely but gives less control
> > to
> > userspace.

> There is also cost in the form of extra complexity. Not to throw FUD,
> but GUP has been the source of thorny problems. And here we would be
> doing it around security races. We're probably helped that shadow stack
> is only private/anonymous memory, so maybe it's enough of a normal case
> to not worry about it.

> Still, there is some extra complexity, and I'm not sure if we really
> need it. The justification seems to mostly be that it's not as flexible
> as normal stacks with clone3.

I definitely agree on the complexity; trying to validate a token is
going to be more code doing fiddly things, and there's always the risk
that something will change around it and invalidate assumptions the code
makes.  Particularly given my inability to test x86 I'm certainly way
more happy pushing this series forward implementing size only than I am
doing token validation.

> I don't understand why doing size-only is awkward. Just because it
> doesn't match the regular stack clone3 semantics?

Basically, yes - we don't allocate userspace pages in clone3() for the
normal stack and we do offer userspace control over where to place
things.  There was some grumbling about this in the current ABI from the
arm64 side, though the limited control of the size is more of the issue
really.

I'm not sure placement control is essential, but the other bit of it is
the freeing of the shadow stack: especially if userspace is doing stack
switches, the current behaviour where we free the stack when the thread
is exiting doesn't feel great exactly.  It's mainly an issue for
programs that pivot stacks, which isn't the common case, but it is a
general sharp edge.

> > This also doesn't do anything to stop anyone trying to allocate sub
> > page
> > shadow stacks if they're trying to save memory with all the lack of
> > overrun protection that implies, though that seems to me to be much
> > more
> > of a deliberate decision that people might make, a token would
> > prevent
> > that too unless write access to the shadow stack is enabled.

> Sorry, I'm not following. Sub-page shadow stacks?

If someone decides to allocate a page of shadow stack then point thread
A at the first half of the page and thread B at the second half of the
page, nothing would stop them.  There are obvious issues with this but I
can see someone trying to do it in a system that creates lots of
threads and has memory constraints.
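
To make that concrete, something like this - hypothetical, and nothing in
the proposed checks (8 byte alignment, size of at least 8) would reject
it:

	struct clone_args args_a = { 0 }, args_b = { 0 };
	unsigned long page = syscall(__NR_map_shadow_stack, 0, 4096, 0);

	/* Thread A gets the first half of the page... */
	args_a.shadow_stack = page;
	args_a.shadow_stack_size = 2048;

	/* ...thread B the second half, with no guard between them. */
	args_b.shadow_stack = page + 2048;
	args_b.shadow_stack_size = 2048;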

> > > > +               /*
> > > > +                * This doesn't validate that the addresses are
> > > > mapped
> > > > +                * VM_SHADOW_STACK, just that they're mapped at
> > > > all.
> > > > +                */

> > > It just checks the range, right?

> > Yes, same check as for the normal stack.

> What looked wrong is that the comment says that it checks if the
> addresses are mapped, but the code just does access_ok(). It's a minor
> thing in any case.

Oh, I see, yes.
Deepak Gupta Oct. 26, 2023, 8:40 p.m. UTC | #5
On Thu, Oct 26, 2023 at 06:53:37PM +0100, Mark Brown wrote:
>On Thu, Oct 26, 2023 at 05:10:47PM +0000, Edgecombe, Rick P wrote:
>> On Mon, 2023-10-23 at 19:32 +0100, Mark Brown wrote:
>
>> > Right.  We're already adding the cost of the extra map_shadow_stack()
>> > so
>> > it doesn't seem that out of scope.  We could also allow clone3() to
>> > be
>> > used for allocation, potentially removing the ability to specify the
>> > address entirely and only specifying the size.  I did consider that
>> > option but it felt awkward in the API, though equally the whole
>> > shadow
>> > stack allocation thing is a bit that way.  That would avoid concerns
>> > about placing and validating tokens entirely but gives less control
>> > to
>> > userspace.
>
>> There is also cost in the form of extra complexity. Not to throw FUD,
>> but GUP has been the source of thorny problems. And here we would be
>> doing it around security races. We're probably helped that shadow stack
>> is only private/anonymous memory, so maybe it's enough of a normal case
>> to not worry about it.
>
>> Still, there is some extra complexity, and I'm not sure if we really
>> need it. The justification seems to mostly be that it's not as flexible
>> as normal stacks with clone3.
>
>I definitely agree on the complexity, trying to valdiate a token is
>going to be more code doing fiddly things and there's always the risk
>that something will change around it and invalidate assumptions the code
>makes.  Particularly given my inability to test x86 I'm certainly way
>more happy pushing this series forward implementing size only than I am
>doing token validation.
>

FWIW, from an arch-specific perspective, the RISC-V shadow stack extension
has `ssamoswap` to perform this token exchange. But I understand x86 has
this limitation (not sure about arm GCS).
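
As a rough illustration (mnemonic and operand order per the current
Zicfiss draft, so treat it as a sketch that needs a Zicfiss-aware
toolchain and may not match the final spec):

static inline unsigned long ssamoswap_d(unsigned long *ssp, unsigned long new)
{
	unsigned long old;

	/* Atomic swap that is legal on shadow stack memory: the old
	 * value of *ssp is returned and new is stored in its place. */
	asm volatile("ssamoswap.d %0, %2, (%1)"
		     : "=&r" (old)
		     : "r" (ssp), "r" (new)
		     : "memory");
	return old;
}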

From a security perspective:
Someone with the ability to execute clone3 with controlled input has
probably already achieved some level of control flow bending, because they
need to corrupt memory and then carefully control the register inputs to
clone3. Although if it is purely a data-oriented gadget, I think it is
possible.

Since this RFC is mostly concerned with the `size` of the shadow stack, I
think we should limit it to size only.

>> I don't understand why doing size-only is awkward. Just because it
>> doesn't match the regular stack clone3 semantics?
>
>Basically, yes - we don't allocate userpace pages in clone3() for the
>normal stack and we do offer userspace control over where to place
>things.  There was some grumbling about this in the current ABI from the
>arm64 side, though the limited control of the size is more of the issue
>really.
>
>I'm not sure placement control is essential but the other bit of it is
>the freeing of the shadow stack, especially if userspace is doing stack
>switches the current behaviour where we free the stack when the thread
>is exiting doesn't feel great exactly.  It's mainly an issue for
>programs that pivot stacks which isn't the common case but it is a
>general sharp edge.

In general, I am assuming such placement requirements emanate because the
regular stack holds data (local args, etc.) as well, and thus software may
make assumptions about how the stack frame is prepared and may worry about
layout and such. In the case of a shadow stack, it can only hold return
addresses and tokens (created by user mode itself). Both of them end up
there as a result of calls or the user software's own way of setting up
tokens.

>
>> > This also doesn't do anything to stop anyone trying to allocate sub
>> > page
>> > shadow stacks if they're trying to save memory with all the lack of
>> > overrun protection that implies, though that seems to me to be much
>> > more
>> > of a deliberate decision that people might make, a token would
>> > prevent
>> > that too unless write access to the shadow stack is enabled.
>
>> Sorry, I'm not following. Sub-page shadow stacks?
>
>If someone decides to allocate a page of shadow stack then point thread
>A at the first half of the page and thread B at the second half of the
>page nothing would stop them.  There are obvious issues with this but I
>can see someone trying to do it in a system that creates lots of
>threads and has memory constraints.
>
>> > > > +               /*
>> > > > +                * This doesn't validate that the addresses are
>> > > > mapped
>> > > > +                * VM_SHADOW_STACK, just that they're mapped at
>> > > > all.
>> > > > +                */
>
>> > > It just checks the range, right?
>
>> > Yes, same check as for the normal stack.
>
>> What looked wrong is that the comment says that it checks if the
>> addresses are mapped, but the code just does access_ok(). It's a minor
>> thing in any case.
>
>Oh, I see, yes.
Rick Edgecombe Oct. 26, 2023, 11:31 p.m. UTC | #6
On Thu, 2023-10-26 at 18:53 +0100, Mark Brown wrote:
> Particularly given my inability to test x86 I'm certainly way
> more happy pushing this series forward implementing size only than I
> am
> doing token validation.

I can help with testing/development once we get the plan settled on.
Rick Edgecombe Oct. 26, 2023, 11:32 p.m. UTC | #7
On Thu, 2023-10-26 at 13:40 -0700, Deepak Gupta wrote:
> 
> FWIW, from arch specific perspective, RISC-V shadow stack extension
> has
> `ssamoswap` to perform this token exchange. But I understand x86 has
> this
> limitation (not sure about arm GCS).
> 
>  From security perspective:--
> Someone having ability to execute clone3 with control on input,
> probably
> already achieved some level of control flow bending because they need
> to
> corrupt memory and then carefully control registers input to clone3.
> Although if it is purely a data oriented gadget, I think it is
> possible.

struct clone_args should be data somewhere, at least temporarily.

> 
> Since this RFC is mostly concerned about `size` of shadow stack. I
> think
> we should limit it to size only.

Seems reasonable to me. It still leaves open the option of adding a
shadow stack address field later AFAICT.
Szabolcs Nagy Oct. 27, 2023, 11:49 a.m. UTC | #8
The 10/26/2023 13:40, Deepak Gupta wrote:
> On Thu, Oct 26, 2023 at 06:53:37PM +0100, Mark Brown wrote:
> > I'm not sure placement control is essential but the other bit of it is
> > the freeing of the shadow stack, especially if userspace is doing stack
> > switches the current behaviour where we free the stack when the thread
> > is exiting doesn't feel great exactly.  It's mainly an issue for
> > programs that pivot stacks which isn't the common case but it is a
> > general sharp edge.
>
> In general, I am assuming such placement requirements emanate because
> regular stack holds data (local args, etc) as well and thus software may
> make assumptions about how stack frame is prepared and may worry about
> layout and such. In case of shadow stack, it can only hold return

no. the lifetime is the issue: a stack in principle can outlive
a thread and be resumed even after the original thread exited.
for that to work the shadow stack has to outlive the thread too.

(or the other way around: a stack can be freed before the thread
exits, if the thread pivots away from that stack.)

posix threads etc. don't allow this, but the linux syscall abi
(clone) does allow it.

i think it is reasonable to tie the shadow stack lifetime to the
thread lifetime, but this clearly introduces a limitation on how
the clone api can be used. such a constraint on the userspace
programming model is normally a bad decision, but given that most
software (including all posix conforming code) is not affected,
i think it is acceptable for an opt-in feature like shadow stack.

Deepak Gupta Oct. 27, 2023, 11:24 p.m. UTC | #9
On Fri, Oct 27, 2023 at 12:49:59PM +0100, Szabolcs.Nagy@arm.com wrote:
>The 10/26/2023 13:40, Deepak Gupta wrote:
>> On Thu, Oct 26, 2023 at 06:53:37PM +0100, Mark Brown wrote:
>> > I'm not sure placement control is essential but the other bit of it is
>> > the freeing of the shadow stack, especially if userspace is doing stack
>> > switches the current behaviour where we free the stack when the thread
>> > is exiting doesn't feel great exactly.  It's mainly an issue for
>> > programs that pivot stacks which isn't the common case but it is a
>> > general sharp edge.
>>
>> In general, I am assuming such placement requirements emanate because
>> regular stack holds data (local args, etc) as well and thus software may
>> make assumptions about how stack frame is prepared and may worry about
>> layout and such. In case of shadow stack, it can only hold return
>
>no. the lifetime is the issue: a stack in principle can outlive
>a thread and resumed even after the original thread exited.
>for that to work the shadow stack has to outlive the thread too.
>

I understand an application can pre-allocate a pool of stacks and re-use
them whenever it's spawning new threads using the clone3 system call.

However, once a new thread has been spawned how can it resume?
By resume I mean consume the callstack context from an earlier thread.
Or you meant something else by `resume` here?

Can you give an example of such an application or runtime where a newly
created thread consumes callstack context created by a departing thread?

>(or the other way around: a stack can be freed before the thread
>exits, if the thread pivots away from that stack.)

This is simply a thread saying that it is moving to a different stack.
Again, I'm interested in learning why a thread would do that. If I have to
speculate on reasons, I could think of a user runtime managing its own
pool of worker items (some people call them green threads), or the current
stack becoming too small.

JIT runtimes (and things like goroutines) do such things, but in
those cases the kernel has no idea about it. From the kernel's perspective
there is a main thread stack (the hosting thread for the JIT), and the
main thread can take a decision to switch stacks to execute JITted code.
But in that case all it needs is a shadow stack, and managing the lifetime
of such a shadow stack using `clone` wouldn't be helpful; perhaps
`map_shadow_stack` should be used to create an on-the-fly shadow stack.

Another case I can think of is a thread moving to a different stack
when the current stack is too small and it wants more memory. In such
cases as well, I imagine that particular thread would be issuing `mmap`
to allocate the larger memory, and thus that thread can very well
issue `map_shadow_stack`.

In both of these cases, a stack free actually means the thread (application)
issuing a system call to free the going-away stack memory. It can free the
going-away shadow stack memory in the same way using `unmap_shadow_stack`.
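
A sketch of that pattern, assuming uapi headers that define
__NR_map_shadow_stack and SHADOW_STACK_SET_TOKEN (the pivot onto the new
stack is arch-specific and elided):

#include <asm/mman.h>		/* SHADOW_STACK_SET_TOKEN */
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *grow_shadow_stack(void *old_ss, size_t old_size, size_t new_size)
{
	/* Ask for a restore token so we can pivot onto the new stack. */
	void *new_ss = (void *)syscall(__NR_map_shadow_stack, 0, new_size,
				       SHADOW_STACK_SET_TOKEN);

	if (new_ss == MAP_FAILED)
		return NULL;

	/* ...switch SSP onto new_ss (e.g. RSTORSSP on x86)... */

	munmap(old_ss, old_size);	/* frees the old shadow stack */
	return new_ss;
}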

Let me know if I misunderstood something or am missing some other use case
of a stack being freed before the thread exits.

>
>posix threads etc. don't allow this, but the linux syscall abi
>(clone) does allow it.
>
>i think it is reasonable to tie the shadow stack lifetime to the
>thread lifetime, but this clearly introduces a limitation on how
>the clone api can be used. such constraint on the userspace
>programming model is normally a bad decision, but given that most
>software (including all posix conforming code) is not affected,
>i think it is acceptable for an opt-in feature like shadow stack.
>
Szabolcs Nagy Oct. 30, 2023, 11:39 a.m. UTC | #10
The 10/27/2023 16:24, Deepak Gupta wrote:
> On Fri, Oct 27, 2023 at 12:49:59PM +0100, Szabolcs.Nagy@arm.com wrote:
> > no. the lifetime is the issue: a stack in principle can outlive
> > a thread and resumed even after the original thread exited.
> > for that to work the shadow stack has to outlive the thread too.
> 
> I understand an application can pre-allocate a pool of stack and re-use
> them whenever it's spawning new threads using clone3 system call.
> 
> However, once a new thread has been spawned how can it resume?

a thread can getcontext then exit. later another thread
can setcontext and execute on the stack of the exited
thread and return to a previous stack frame there.

(unlikely to work on runtimes where tls or thread id is
exposed and thus may be cached on the stack. so not for
posix.. but e.g. a go runtime could do this)
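
a contrived sketch of the pattern (undefined for posix threads as noted -
it assumes the exited thread's stack stays mapped, e.g. a caller-provided
stack - but it shows what freeing the shadow stack at thread exit would
break):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <ucontext.h>

static ucontext_t frame;
static volatile int resumed;

static void *worker(void *unused)
{
	getcontext(&frame);	/* save a frame on this thread's stack */
	if (resumed) {
		/* now executing on the exited thread's stack */
		printf("resumed a dead thread's frame\n");
		exit(0);
	}
	return NULL;		/* thread exits; its stack may outlive it */
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	pthread_join(&t, NULL);	/* worker is gone, its stack need not be */

	resumed = 1;
	setcontext(&frame);	/* jump back onto that stack */
}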

> By resume I mean consume the callstack context from an earlier thread.
> Or you meant something else by `resume` here?
> 
> Can you give an example of such an application or runtime where a newly
> created thread consumes callstack context created by going away thread?

my claim was not that existing runtimes are doing this,
but that the linux interface contract allows this and
tying the stack lifetime to the thread is a change of
contract.

> > (or the other way around: a stack can be freed before the thread
> > exits, if the thread pivots away from that stack.)
> 
> This is simply a thread saying that I am moving to a different stack.
> Again, interested in learning why would a thread do that. If I've to
> speculate on reasons, I could think of user runtime managing it's own
> pool of worker items (some people call them green threads) or current
> stack became too small.

switching stack is common, freeing the original stack may not be,
but there is nothing that prevents this and then the corresponding
shadow stack is clearly leaked if the kernel manages it. the amount
of leak is proportional to the number of live threads and the sum
of their original stack size which can be big.

but as i said i think this lifetime issue is minor compared
to other shadow stack issues, so it is ok if the shadow stack
is kernel managed.
Deepak Gupta Oct. 30, 2023, 6:20 p.m. UTC | #11
On Mon, Oct 30, 2023 at 11:39:17AM +0000, Szabolcs.Nagy@arm.com wrote:
>The 10/27/2023 16:24, Deepak Gupta wrote:
>> On Fri, Oct 27, 2023 at 12:49:59PM +0100, Szabolcs.Nagy@arm.com wrote:
>> > no. the lifetime is the issue: a stack in principle can outlive
>> > a thread and resumed even after the original thread exited.
>> > for that to work the shadow stack has to outlive the thread too.
>>
>> I understand an application can pre-allocate a pool of stack and re-use
>> them whenever it's spawning new threads using clone3 system call.
>>
>> However, once a new thread has been spawned how can it resume?
>
>a thread can getcontext then exit. later another thread
>can setcontext and execute on the stack of the exited
>thread and return to a previous stack frame there.
>
>(unlikely to work on runtimes where tls or thread id is
>exposed and thus may be cached on the stack. so not for
>posix.. but e.g. a go runtime could do this)

Aah, then as you mentioned, we basically need clear lifetime rules around
their creation and deletion, because `getcontext/swapcontext/setcontext`
can be updated to save a shadow stack token on the stack itself and use
that to resume. It's just the lifetime that needs to be managed.

>
>> By resume I mean consume the callstack context from an earlier thread.
>> Or you meant something else by `resume` here?
>>
>> Can you give an example of such an application or runtime where a newly
>> created thread consumes callstack context created by going away thread?
>
>my claim was not that existing runtimes are doing this,
>but that the linux interface contract allows this and
>tieing the stack lifetime to the thread is a change of
>contract.
>
>> > (or the other way around: a stack can be freed before the thread
>> > exits, if the thread pivots away from that stack.)
>>
>> This is simply a thread saying that I am moving to a different stack.
>> Again, interested in learning why would a thread do that. If I've to
>> speculate on reasons, I could think of user runtime managing it's own
>> pool of worker items (some people call them green threads) or current
>> stack became too small.
>
>switching stack is common, freeing the original stack may not be,
>but there is nothing that prevents this and then the corresponding
>shadow stack is clearly leaked if the kernel manages it. the amount
>of leak is proportional to the number of live threads and the sum
>of their original stack size which can be big.
>
>but as i said i think this lifetime issue is minor compared
>to other shadow stack issues, so it is ok if the shadow stack
>is kernel managed.

Patch

diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index 42fee8959df7..8be7b0a909c3 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -6,6 +6,7 @@ 
 #include <linux/types.h>
 
 struct task_struct;
+struct kernel_clone_args;
 struct ksignal;
 
 #ifdef CONFIG_X86_USER_SHADOW_STACK
@@ -16,8 +17,8 @@  struct thread_shstk {
 
 long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
 void reset_thread_features(void);
-unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags,
-				       unsigned long stack_size);
+unsigned long shstk_alloc_thread_stack(struct task_struct *p,
+				       const struct kernel_clone_args *args);
 void shstk_free(struct task_struct *p);
 int setup_signal_shadow_stack(struct ksignal *ksig);
 int restore_signal_shadow_stack(void);
@@ -26,8 +27,10 @@  static inline long shstk_prctl(struct task_struct *task, int option,
 			       unsigned long arg2) { return -EINVAL; }
 static inline void reset_thread_features(void) {}
 static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
-						     unsigned long clone_flags,
-						     unsigned long stack_size) { return 0; }
+						     const struct kernel_clone_args *args)
+{
+	return 0;
+}
 static inline void shstk_free(struct task_struct *p) {}
 static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
 static inline int restore_signal_shadow_stack(void) { return 0; }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b6f4e8399fca..a9ca80ea5056 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -207,7 +207,7 @@  int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	 * is disabled, new_ssp will remain 0, and fpu_clone() will know not to
 	 * update it.
 	 */
-	new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
+	new_ssp = shstk_alloc_thread_stack(p, args);
 	if (IS_ERR_VALUE(new_ssp))
 		return PTR_ERR((void *)new_ssp);
 
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 59e15dd8d0f8..3ae5c3d657dc 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -191,18 +191,44 @@  void reset_thread_features(void)
 	current->thread.features_locked = 0;
 }
 
-unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
-				       unsigned long stack_size)
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
+				       const struct kernel_clone_args *args)
 {
 	struct thread_shstk *shstk = &tsk->thread.shstk;
+	unsigned long clone_flags = args->flags;
 	unsigned long addr, size;
 
 	/*
 	 * If shadow stack is not enabled on the new thread, skip any
-	 * switch to a new shadow stack.
+	 * implicit switch to a new shadow stack and reject attempts to
+	 * explicitly specify one.
 	 */
-	if (!features_enabled(ARCH_SHSTK_SHSTK))
+	if (!features_enabled(ARCH_SHSTK_SHSTK)) {
+		if (args->shadow_stack)
+			return (unsigned long)ERR_PTR(-EINVAL);
+
 		return 0;
+	}
+
+	/*
+	 * If the user specified a shadow stack then do some basic
+	 * validation and use it.  The caller is responsible for
+	 * freeing the shadow stack.
+	 */
+	if (args->shadow_stack) {
+		addr = args->shadow_stack;
+		size = args->shadow_stack_size;
+
+		if (!IS_ALIGNED(addr, 8))
+			return (unsigned long)ERR_PTR(-EINVAL);
+		if (size < 8)
+			return (unsigned long)ERR_PTR(-EINVAL);
+
+		shstk->base = 0;
+		shstk->size = 0;
+
+		return addr + size;
+	}
 
 	/*
 	 * For CLONE_VFORK the child will share the parent's shadow stack.
@@ -222,7 +248,7 @@  unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl
 	if (!(clone_flags & CLONE_VM))
 		return 0;
 
-	size = adjust_shstk_size(stack_size);
+	size = adjust_shstk_size(args->stack_size);
 	addr = alloc_shstk(0, size, 0, false);
 	if (IS_ERR_VALUE(addr))
 		return addr;
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..94e7cf62be51 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -41,6 +41,8 @@  struct kernel_clone_args {
 	void *fn_arg;
 	struct cgroup *cgrp;
 	struct css_set *cset;
+	unsigned long shadow_stack;
+	unsigned long shadow_stack_size;
 };
 
 /*
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..1bd1b956834d 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -84,6 +84,14 @@ 
  *                kernel's limit of nested PID namespaces.
  * @cgroup:       If CLONE_INTO_CGROUP is specified set this to
  *                a file descriptor for the cgroup.
+ * @shadow_stack: Specify the location of the shadow stack for the
+ *                child process.
+ *                Note, @shadow_stack is expected to point to the
+ *                lowest address. The stack direction will be
+ *                determined by the kernel and set up
+ *                appropriately based on @shadow_stack_size.
+ * @shadow_stack_size:   The size of the shadow stack for the child
+ *                       process.
  *
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and
@@ -101,12 +109,15 @@  struct clone_args {
 	__aligned_u64 set_tid;
 	__aligned_u64 set_tid_size;
 	__aligned_u64 cgroup;
+	__aligned_u64 shadow_stack;
+	__aligned_u64 shadow_stack_size;
 };
 #endif
 
-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
-#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
-#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER0  64 /* sizeof first published struct */
+#define CLONE_ARGS_SIZE_VER1  80 /* sizeof second published struct */
+#define CLONE_ARGS_SIZE_VER2  88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER3 104 /* sizeof fourth published struct */
 
 /*
  * Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..bd61aa7353b0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3069,7 +3069,9 @@  noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 		     CLONE_ARGS_SIZE_VER1);
 	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
 		     CLONE_ARGS_SIZE_VER2);
-	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+	BUILD_BUG_ON(offsetofend(struct clone_args, shadow_stack_size) !=
+		     CLONE_ARGS_SIZE_VER3);
+	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER3);
 
 	if (unlikely(usize > PAGE_SIZE))
 		return -E2BIG;
@@ -3112,6 +3114,8 @@  noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 		.tls		= args.tls,
 		.set_tid_size	= args.set_tid_size,
 		.cgroup		= args.cgroup,
+		.shadow_stack	= args.shadow_stack,
+		.shadow_stack_size	= args.shadow_stack_size,
 	};
 
 	if (args.set_tid &&
@@ -3152,6 +3156,38 @@  static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
 	return true;
 }
 
+/**
+ * clone3_shadow_stack_valid - check and prepare shadow stack
+ * @kargs: kernel clone args
+ *
+ * Verify that the shadow stack arguments userspace gave us are sane.
+ */
+static inline bool clone3_shadow_stack_valid(struct kernel_clone_args *kargs)
+{
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
+	if (kargs->shadow_stack) {
+		if (!kargs->shadow_stack_size)
+			return false;
+
+		/*
+		 * This doesn't validate that the addresses are mapped
+		 * VM_SHADOW_STACK, just that they're mapped at all.
+		 */
+		if (!access_ok((void __user *)kargs->shadow_stack,
+			       kargs->shadow_stack_size))
+			return false;
+	} else {
+		if (kargs->shadow_stack_size)
+			return false;
+	}
+
+	return true;
+#else
+	/* The architecture does not support shadow stacks */
+	return !kargs->shadow_stack && !kargs->shadow_stack_size;
+#endif
+}
+
 static bool clone3_args_valid(struct kernel_clone_args *kargs)
 {
 	/* Verify that no unknown flags are passed along. */
@@ -3174,7 +3210,7 @@  static bool clone3_args_valid(struct kernel_clone_args *kargs)
 	    kargs->exit_signal)
 		return false;
 
-	if (!clone3_stack_valid(kargs))
+	if (!clone3_stack_valid(kargs) || !clone3_shadow_stack_valid(kargs))
 		return false;
 
 	return true;