diff mbox series

[18/35] mm: Add guard pages around a shadow stack.

Message ID 20220130211838.8382-19-rick.p.edgecombe@intel.com (mailing list archive)
State New
Headers show
Series Shadow stacks for userspace | expand

Commit Message

Edgecombe, Rick P Jan. 30, 2022, 9:18 p.m. UTC
From: Yu-cheng Yu <yu-cheng.yu@intel.com>

INCSSP(Q/D) increments shadow stack pointer and 'pops and discards' the
first and the last elements in the range, effectively touches those memory
areas.

The maximum moving distance by INCSSPQ is 255 * 8 = 2040 bytes and
255 * 4 = 1020 bytes by INCSSPD.  Both ranges are far from PAGE_SIZE.
Thus, putting a gap page on both ends of a shadow stack prevents INCSSP,
CALL, and RET from going beyond.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Kees Cook <keescook@chromium.org>
---

Yu-cheng v25:
 - Move SHADOW_STACK_GUARD_GAP to arch/x86/mm/mmap.c.

Yu-cheng v24:
 - Instead changing vm_*_gap(), create x86-specific versions.

 arch/x86/include/asm/page_types.h |  7 +++++
 arch/x86/mm/mmap.c                | 46 +++++++++++++++++++++++++++++++
 include/linux/mm.h                |  4 +++
 3 files changed, 57 insertions(+)

Comments

Dave Hansen Feb. 9, 2022, 10:23 p.m. UTC | #1
On 1/30/22 13:18, Rick Edgecombe wrote:
> INCSSP(Q/D) increments shadow stack pointer and 'pops and discards' the
> first and the last elements in the range, effectively touches those memory
> areas.

This is a pretty close copy of the instruction reference text for
INCSSP.  I'm feeling rather dense today, but that's just not making any
sense.

The pseudocode is more sensible in the SDM.  I think this needs a better
explanation:

	The INCSSP instruction increments the shadow stack pointer.  It
	is the shadow stack analog of an instruction like:

		addq	$0x80, %rsp

	However, there is one important difference between an ADD on
	%rsp and INCSSP.  In addition to modifying SSP, INCSSP also
	reads from the memory of the first and last elements that were
	"popped".  You can think of it as acting like this:

	READ_ONCE(ssp);       // read+discard top element on stack
	ssp += nr_to_pop * 8; // move the shadow stack
	READ_ONCE(ssp-8);     // read+discard last popped stack element
	

> The maximum moving distance by INCSSPQ is 255 * 8 = 2040 bytes and
> 255 * 4 = 1020 bytes by INCSSPD.  Both ranges are far from PAGE_SIZE.

... That maximum distance, combined with a guard page at the end of
a shadow stack, ensures that INCSSP will fault before it is able to move
across an entire guard page.

> Thus, putting a gap page on both ends of a shadow stack prevents INCSSP,
> CALL, and RET from going beyond.

> 
> diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
> index a506a411474d..e1533fdc08b4 100644
> --- a/arch/x86/include/asm/page_types.h
> +++ b/arch/x86/include/asm/page_types.h
> @@ -73,6 +73,13 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
>  
>  extern void initmem_init(void);
>  
> +#define vm_start_gap vm_start_gap
> +struct vm_area_struct;
> +extern unsigned long vm_start_gap(struct vm_area_struct *vma);
> +
> +#define vm_end_gap vm_end_gap
> +extern unsigned long vm_end_gap(struct vm_area_struct *vma);
> +
>  #endif	/* !__ASSEMBLY__ */
>  
>  #endif	/* _ASM_X86_PAGE_DEFS_H */
> diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
> index f3f52c5e2fd6..81f9325084d3 100644
> --- a/arch/x86/mm/mmap.c
> +++ b/arch/x86/mm/mmap.c
> @@ -250,3 +250,49 @@ bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
>  		return false;
>  	return true;
>  }
> +
> +/*
> + * Shadow stack pointer is moved by CALL, RET, and INCSSP(Q/D).  INCSSPQ
> + * moves shadow stack pointer up to 255 * 8 = ~2 KB (~1KB for INCSSPD) and
> + * touches the first and the last element in the range, which triggers a
> + * page fault if the range is not in a shadow stack.  Because of this,
> + * creating 4-KB guard pages around a shadow stack prevents these
> + * instructions from going beyond.
> + */
> +#define SHADOW_STACK_GUARD_GAP PAGE_SIZE
> +
> +unsigned long vm_start_gap(struct vm_area_struct *vma)
> +{
> +	unsigned long vm_start = vma->vm_start;
> +	unsigned long gap = 0;
> +
> +	if (vma->vm_flags & VM_GROWSDOWN)
> +		gap = stack_guard_gap;
> +	else if (vma->vm_flags & VM_SHADOW_STACK)
> +		gap = SHADOW_STACK_GUARD_GAP;
> +
> +	if (gap != 0) {
> +		vm_start -= gap;
> +		if (vm_start > vma->vm_start)
> +			vm_start = 0;
> +	}
> +	return vm_start;
> +}
> +
> +unsigned long vm_end_gap(struct vm_area_struct *vma)
> +{
> +	unsigned long vm_end = vma->vm_end;
> +	unsigned long gap = 0;
> +
> +	if (vma->vm_flags & VM_GROWSUP)
> +		gap = stack_guard_gap;
> +	else if (vma->vm_flags & VM_SHADOW_STACK)
> +		gap = SHADOW_STACK_GUARD_GAP;
> +
> +	if (gap != 0) {
> +		vm_end += gap;
> +		if (vm_end < vma->vm_end)
> +			vm_end = -PAGE_SIZE;
> +	}
> +	return vm_end;
> +}

First of all, __weak would be a lot better than these #ifdefs.

Second, I have the same basic objection to this as the maybe_mkwrite()
mess.  This is a forked copy of the code.  Instead of refactoring, it's
just copied-pasted-and-#ifdef'd.  Not so nice.

Isn't this just a matter of overriding 'stack_guard_gap' for
VM_SHADOW_STACK?  Why don't we just do this:

unsigned long stack_guard_gap(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return SHADOW_STACK_GUARD_GAP;

	return __stack_guard_gap;
}

Or, worst-case if people don't want 2 easily compiled-out lines added to
generic code, define:

unsigned long __weak stack_guard_gap(struct vm_area_struct *vma)
{
	return __stack_guard_gap;
}

in generic code, and put the top definition in arch/x86.
David Laight Feb. 10, 2022, 10:38 p.m. UTC | #2
From: Dave Hansen
> Sent: 09 February 2022 22:24
> 
> On 1/30/22 13:18, Rick Edgecombe wrote:
> > INCSSP(Q/D) increments shadow stack pointer and 'pops and discards' the
> > first and the last elements in the range, effectively touches those memory
> > areas.
> 
> This is a pretty close copy of the instruction reference text for
> INCSSP.  I'm feeling rather dense today, but that's just not making any
> sense.
> 
> The pseudocode is more sensible in the SDM.  I think this needs a better
> explanation:
> 
> 	The INCSSP instruction increments the shadow stack pointer.  It
> 	is the shadow stack analog of an instruction like:
> 
> 		addq	$0x80, %rsp
> 
> 	However, there is one important difference between an ADD on
> 	%rsp and INCSSP.  In addition to modifying SSP, INCSSP also
> 	reads from the memory of the first and last elements that were
> 	"popped".  You can think of it as acting like this:
> 
> 	READ_ONCE(ssp);       // read+discard top element on stack
> 	ssp += nr_to_pop * 8; // move the shadow stack
> 	READ_ONCE(ssp-8);     // read+discard last popped stack element
> 
> 
> > The maximum moving distance by INCSSPQ is 255 * 8 = 2040 bytes and
> > 255 * 4 = 1020 bytes by INCSSPD.  Both ranges are far from PAGE_SIZE.
> 
> ... That maximum distance, combined with a guard page at the end of
> a shadow stack, ensures that INCSSP will fault before it is able to move
> across an entire guard page.
> 
> > Thus, putting a gap page on both ends of a shadow stack prevents INCSSP,
> > CALL, and RET from going beyond.

Do you need a real guard page?
Or is it just enough to ensure that the adjacent page isn't another
shadow stack page?

Any other page will cause a fault because the PTE isn't readonly+dirty.

I'm not sure how common single page allocates are in Linux.
But adjacent shadow stacks may be rare anyway.
So a check against both adjacent PTE entries would suffice.
Or maybe always allocate an even (or odd) numbered page.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Dave Hansen Feb. 10, 2022, 10:43 p.m. UTC | #3
On 1/30/22 13:18, Rick Edgecombe wrote:
> INCSSP(Q/D) increments shadow stack pointer and 'pops and discards' the
> first and the last elements in the range, effectively touches those memory
> areas.
> 
> The maximum moving distance by INCSSPQ is 255 * 8 = 2040 bytes and
> 255 * 4 = 1020 bytes by INCSSPD.  Both ranges are far from PAGE_SIZE.
> Thus, putting a gap page on both ends of a shadow stack prevents INCSSP,
> CALL, and RET from going beyond.

What is the downside of not applying this patch?  The shadow stack gap
is 1MB instead of 4k?

That, frankly, doesn't seem too bad.  How badly do we *need* this patch?
Andy Lutomirski Feb. 10, 2022, 11:07 p.m. UTC | #4
On Thu, Feb 10, 2022 at 2:44 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 1/30/22 13:18, Rick Edgecombe wrote:
> > INCSSP(Q/D) increments shadow stack pointer and 'pops and discards' the
> > first and the last elements in the range, effectively touches those memory
> > areas.
> >
> > The maximum moving distance by INCSSPQ is 255 * 8 = 2040 bytes and
> > 255 * 4 = 1020 bytes by INCSSPD.  Both ranges are far from PAGE_SIZE.
> > Thus, putting a gap page on both ends of a shadow stack prevents INCSSP,
> > CALL, and RET from going beyond.
>
> What is the downside of not applying this patch?  The shadow stack gap
> is 1MB instead of 4k?
>
> That, frankly, doesn't seem too bad.  How badly do we *need* this patch?

1MB of per-thread guard address space in a 32-bit program may be a
show stopper.  Do we intend to support any of this for 32-bit?

--Andy
Edgecombe, Rick P Feb. 10, 2022, 11:40 p.m. UTC | #5
On Thu, 2022-02-10 at 15:07 -0800, Andy Lutomirski wrote:
> On Thu, Feb 10, 2022 at 2:44 PM Dave Hansen <dave.hansen@intel.com>
> wrote:
> > 
> > On 1/30/22 13:18, Rick Edgecombe wrote:
> > > INCSSP(Q/D) increments shadow stack pointer and 'pops and
> > > discards' the
> > > first and the last elements in the range, effectively touches
> > > those memory
> > > areas.
> > > 
> > > The maximum moving distance by INCSSPQ is 255 * 8 = 2040 bytes
> > > and
> > > 255 * 4 = 1020 bytes by INCSSPD.  Both ranges are far from
> > > PAGE_SIZE.
> > > Thus, putting a gap page on both ends of a shadow stack prevents
> > > INCSSP,
> > > CALL, and RET from going beyond.
> > 
> > What is the downside of not applying this patch?  The shadow stack
> > gap
> > is 1MB instead of 4k?
> > 
> > That, frankly, doesn't seem too bad.  How badly do we *need* this
> > patch?

Like just using VM_SHADOW_STACK | VM_GROWSDOWN to get a regular stack
sized gap? I think it could work. It also simplifies the mm->stack_vm
accounting.

It would no longer get a gap at the end though. I don't think it's
needed.

> 
> 1MB of per-thread guard address space in a 32-bit program may be a
> show stopper.  Do we intend to support any of this for 32-bit?

It is supported in the 32 bit compatibility mode, although IBT had
dropped it. I guess this was probably the reason.
Edgecombe, Rick P Feb. 10, 2022, 11:42 p.m. UTC | #6
On Thu, 2022-02-10 at 22:38 +0000, David Laight wrote:
> Do you need a real guard page?
> Or is it just enough to ensure that the adjacent page isn't another
> shadow stack page?
> 
> Any other page will cause a fault because the PTE isn't
> readonly+dirty.
> 
> I'm not sure how common single page allocates are in Linux.

I think it came from this discussion:

https://lore.kernel.org/lkml/CAG48ez1ytOfQyNZMNPFp7XqKcpd7_aRai9G5s7rx0V=8ZG+r2A@mail.gmail.com/#t

> But adjacent shadow stacks may be rare anyway.
> So a check against both adjacent PTE entries would suffice.
> Or maybe always allocate an even (or odd) numbered page.

It just needs to not be adjacent to shadow stack memory to do the job.
Would that be small to implement? It might be a tradeoff of code
complexity.
David Laight Feb. 11, 2022, 9:08 a.m. UTC | #7
From: Edgecombe, Rick P
> Sent: 10 February 2022 23:43
> 
> On Thu, 2022-02-10 at 22:38 +0000, David Laight wrote:
> > Do you need a real guard page?
> > Or is it just enough to ensure that the adjacent page isn't another
> > shadow stack page?
> >
> > Any other page will cause a fault because the PTE isn't
> > readonly+dirty.
> >
> > I'm not sure how common single page allocates are in Linux.
> 
> I think it came from this discussion:
> 
> https://lore.kernel.org/lkml/CAG48ez1ytOfQyNZMNPFp7XqKcpd7_aRai9G5s7rx0V=8ZG+r2A@mail.gmail.com/#t
> 
> > But adjacent shadow stacks may be rare anyway.
> > So a check against both adjacent PTE entries would suffice.
> > Or maybe always allocate an even (or odd) numbered page.
> 
> It just needs to not be adjacent to shadow stack memory to do the job.
> Would that be small to implement? It might be a tradeoff of code
> complexity.

That's what I thought.
Although the VA use for guard pages might be a problem in itself.

I'm not sure why I thought shadow stacks would be a single page.
For user space that 'only' allows 512 calls.
For kernel it is a massive waste of memory.
It is probably worth putting multiple kernel shadow stacks into the same page.
(Code that can overrun can do other stuff more easily.)

The hardware engineers failed to think about the implementation (again).
The shadow stack should (probably) run in the opposite direction to
the normal stack.
Then the shadow stack can be placed at the other end of the VA allocated
to a user space stack.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Andy Lutomirski Feb. 11, 2022, 5:54 p.m. UTC | #8
On 2/10/22 15:40, Edgecombe, Rick P wrote:
> On Thu, 2022-02-10 at 15:07 -0800, Andy Lutomirski wrote:
>> On Thu, Feb 10, 2022 at 2:44 PM Dave Hansen <dave.hansen@intel.com>
>> wrote:
>>>
>>> On 1/30/22 13:18, Rick Edgecombe wrote:
>>>> INCSSP(Q/D) increments shadow stack pointer and 'pops and
>>>> discards' the
>>>> first and the last elements in the range, effectively touches
>>>> those memory
>>>> areas.
>>>>
>>>> The maximum moving distance by INCSSPQ is 255 * 8 = 2040 bytes
>>>> and
>>>> 255 * 4 = 1020 bytes by INCSSPD.  Both ranges are far from
>>>> PAGE_SIZE.
>>>> Thus, putting a gap page on both ends of a shadow stack prevents
>>>> INCSSP,
>>>> CALL, and RET from going beyond.
>>>
>>> What is the downside of not applying this patch?  The shadow stack
>>> gap
>>> is 1MB instead of 4k?
>>>
>>> That, frankly, doesn't seem too bad.  How badly do we *need* this
>>> patch?
> 
> Like just using VM_SHADOW_STACK | VM_GROWSDOWN to get a regular stack
> sized gap? I think it could work. It also simplifies the mm->stack_vm
> accounting.

Seems not crazy.  Do we want automatically growing shadow stacks?  I 
don't really like the historical unix behavior where the main thread has 
a sort-of-infinite stack and every other thread has a fixed stack.

> 
> It would no longer get a gap at the end though. I don't think it's
> needed.
> 

I may have missed something about the oddball way the mm code works, but 
it seems if you have a gap at one end of every shadow stack, you 
automatically have a gap at the other end.
Edgecombe, Rick P Feb. 12, 2022, 12:10 a.m. UTC | #9
On Fri, 2022-02-11 at 09:54 -0800, Andy Lutomirski wrote:
> > Like just using VM_SHADOW_STACK | VM_GROWSDOWN to get a regular
> > stack
> > sized gap? I think it could work. It also simplifies the mm-
> > >stack_vm
> > accounting.
> 
> Seems not crazy.  Do we want automatically growing shadow stacks?  I 
> don't really like the historical unix behavior where the main thread
> has 
> a sort-of-infinite stack and every other thread has a fixed stack.

Ah! I was scratching my head why glibc did that - it's historical. Yea,
probably it's not needed and adding strange behavior.

> 
> > 
> > It would no longer get a gap at the end though. I don't think it's
> > needed.
> > 
> 
> I may have missed something about the oddball way the mm code works,
> but 
> it seems if you have a gap at one end of every shadow stack, you 
> automatically have a gap at the other end.

Right, we only need one, and this patch added a gap on both ends.

Per Andy's comment about the "oddball way" the mm code does the gaps -
The previous version of this (PROT_SHADOW_STACK) had an issue where if
you started with writable memory, then mprotect() with
PROT_SHADOW_STACK, the internal rb tree would get confused over the
sudden appearance of a gap. This new version follows closer to how
MAP_STACK avoids the problem I saw. But the way these guard gaps work
seem to barely avoid problems when you do things like split the vma by
mprotect()ing the middle of one. I wasn't sure if it's worth a
refactor. I guess the solution is quite old and there hasn't been
problems. I'm not even sure what the change would be, but it does feel
like adding to something fragile. Maybe shadow stack should just place
a guard page manually and not add any special shadow stack logic to the
mm code...

Other than that I'm inclined to remove the end gap and justify this
patch better in the commit log. Something like this (borrowing some
from Dave's comments):

	The architecture of shadow stack constrains the ability of 
	userspace to move the shadow stack pointer (ssp) in order to 
	prevent corrupting or switching to other shadow stacks. The 
	RSTORSSP instruction can move the ssp to different shadow stacks, 
	but it requires a specially placed token in order to switch shadow 
	stacks. However, the architecture does not prevent 
	incrementing or decrementing shadow stack pointer to wander 
	onto an adjacent shadow stack. To prevent this in software, 
	enforce guard pages at the beginning of shadow stack vmas, such
	that there will always be a gap between adjacent shadow stacks.

	Make the gap big enough so that no userspace ssp changing 
	operations (besides RSTORSSP), can move the ssp from one stack 
	to the next. The ssp can increment or decrement by CALL, RET 
	and INCSSP. CALL and RET can move the ssp by a maximum of 8 
	bytes, at which point the shadow stack would be accessed.

        The INCSSP instruction can also increment the shadow stack 
	pointer. It is the shadow stack analog of an instruction like:

                addq    $0x80, %rsp

        However, there is one important difference between an ADD on
        %rsp and INCSSP. In addition to modifying SSP, INCSSP also
        reads from the memory of the first and last elements that were
        "popped". It can be thought of as acting like this:

        READ_ONCE(ssp);       // read+discard top element on stack
        ssp += nr_to_pop * 8; // move the shadow stack
        READ_ONCE(ssp-8);     // read+discard last popped stack element

	The maximum distance INCSSP can move the ssp is 2040 bytes, 
	before it would read the memory. Therefore, a single page gap 
	will be enough to prevent any operation from shifting the ssp 
	to an adjacent gap, before the shadow stack would be read and 
	cause a fault.

	This could be accomplished by using VM_GROWSDOWN, but this has 
	two downsides.
	   1. VM_GROWSDOWN will have a 1MB gap which is on the large 
	      side for 32 bit address spaces
	   2. The behavior would allow shadow stacks to grow, which 
	      is unneeded and adds a strange difference to how most 
	      regular stacks work.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a506a411474d..e1533fdc08b4 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -73,6 +73,13 @@  bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
 
 extern void initmem_init(void);
 
+#define vm_start_gap vm_start_gap
+struct vm_area_struct;
+extern unsigned long vm_start_gap(struct vm_area_struct *vma);
+
+#define vm_end_gap vm_end_gap
+extern unsigned long vm_end_gap(struct vm_area_struct *vma);
+
 #endif	/* !__ASSEMBLY__ */
 
 #endif	/* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index f3f52c5e2fd6..81f9325084d3 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -250,3 +250,49 @@  bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
 		return false;
 	return true;
 }
+
+/*
+ * Shadow stack pointer is moved by CALL, RET, and INCSSP(Q/D).  INCSSPQ
+ * moves shadow stack pointer up to 255 * 8 = ~2 KB (~1KB for INCSSPD) and
+ * touches the first and the last element in the range, which triggers a
+ * page fault if the range is not in a shadow stack.  Because of this,
+ * creating 4-KB guard pages around a shadow stack prevents these
+ * instructions from going beyond.
+ */
+#define SHADOW_STACK_GUARD_GAP PAGE_SIZE
+
+unsigned long vm_start_gap(struct vm_area_struct *vma)
+{
+	unsigned long vm_start = vma->vm_start;
+	unsigned long gap = 0;
+
+	if (vma->vm_flags & VM_GROWSDOWN)
+		gap = stack_guard_gap;
+	else if (vma->vm_flags & VM_SHADOW_STACK)
+		gap = SHADOW_STACK_GUARD_GAP;
+
+	if (gap != 0) {
+		vm_start -= gap;
+		if (vm_start > vma->vm_start)
+			vm_start = 0;
+	}
+	return vm_start;
+}
+
+unsigned long vm_end_gap(struct vm_area_struct *vma)
+{
+	unsigned long vm_end = vma->vm_end;
+	unsigned long gap = 0;
+
+	if (vma->vm_flags & VM_GROWSUP)
+		gap = stack_guard_gap;
+	else if (vma->vm_flags & VM_SHADOW_STACK)
+		gap = SHADOW_STACK_GUARD_GAP;
+
+	if (gap != 0) {
+		vm_end += gap;
+		if (vm_end < vma->vm_end)
+			vm_end = -PAGE_SIZE;
+	}
+	return vm_end;
+}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b3cb3a17037b..e125358d7f75 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2797,6 +2797,7 @@  struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
 	return vma;
 }
 
+#ifndef vm_start_gap
 static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
 {
 	unsigned long vm_start = vma->vm_start;
@@ -2808,7 +2809,9 @@  static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
 	}
 	return vm_start;
 }
+#endif
 
+#ifndef vm_end_gap
 static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
 {
 	unsigned long vm_end = vma->vm_end;
@@ -2820,6 +2823,7 @@  static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
 	}
 	return vm_end;
 }
+#endif
 
 static inline unsigned long vma_pages(struct vm_area_struct *vma)
 {