diff mbox series

[v2,16/39] x86/mm: Update maybe_mkwrite() for shadow stack

Message ID 20220929222936.14584-17-rick.p.edgecombe@intel.com (mailing list archive)
State New
Headers show
Series Shadowstacks for userspace | expand

Commit Message

Edgecombe, Rick P Sept. 29, 2022, 10:29 p.m. UTC
From: Yu-cheng Yu <yu-cheng.yu@intel.com>

When serving a page fault, maybe_mkwrite() makes a PTE writable if there is
a write access to it, and its vma has VM_WRITE. Shadow stack accesses to
shadow stack vma's are also treated as write accesses by the fault handler.
This is because setting shadow stack memory makes it writable via some
instructions, so COW has to happen even for shadow stack reads.

So maybe_mkwrite() should continue to set VM_WRITE vma's as normally
writable, but also set VM_WRITE|VM_SHADOW_STACK vma's as shadow stack.

Do this by adding a pte_mkwrite_shstk() and a cross-arch stub. Check for
VM_SHADOW_STACK in maybe_mkwrite() and call pte_mkwrite_shstk()
accordingly.

Apply the same changes to maybe_pmd_mkwrite().

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Kees Cook <keescook@chromium.org>

---

v2:
 - Change to handle shadow stacks that are VM_WRITE|VM_SHADOW_STACK
 - Ditch arch specific maybe_mkwrite(), and make the code generic

Yu-cheng v29:
 - Remove likely()'s.

 arch/x86/include/asm/pgtable.h |  2 ++
 include/linux/mm.h             | 14 +++++++++++++-
 include/linux/pgtable.h        | 14 ++++++++++++++
 mm/huge_memory.c               |  9 ++++++++-
 mm/memory.c                    |  3 +--
 5 files changed, 38 insertions(+), 4 deletions(-)

Comments

Kees Cook Oct. 3, 2022, 6:22 p.m. UTC | #1
On Thu, Sep 29, 2022 at 03:29:13PM -0700, Rick Edgecombe wrote:
> From: Yu-cheng Yu <yu-cheng.yu@intel.com>
> 
> When serving a page fault, maybe_mkwrite() makes a PTE writable if there is
> a write access to it, and its vma has VM_WRITE. Shadow stack accesses to
> shadow stack vma's are also treated as write accesses by the fault handler.
> This is because setting shadow stack memory makes it writable via some
> instructions, so COW has to happen even for shadow stack reads.
> 
> So maybe_mkwrite() should continue to set VM_WRITE vma's as normally
> writable, but also set VM_WRITE|VM_SHADOW_STACK vma's as shadow stack.
> 
> Do this by adding a pte_mkwrite_shstk() and a cross-arch stub. Check for
> VM_SHADOW_STACK in maybe_mkwrite() and call pte_mkwrite_shstk()
> accordingly.
> 
> Apply the same changes to maybe_pmd_mkwrite().
> 
> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>

Reviewed-by: Kees Cook <keescook@chromium.org>
Kirill A . Shutemov Oct. 3, 2022, 11:53 p.m. UTC | #2
On Thu, Sep 29, 2022 at 03:29:13PM -0700, Rick Edgecombe wrote:
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 8cd413c5a329..fef14ab3abcb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -981,13 +981,25 @@ void free_compound_page(struct page *page);
>   * servicing faults for write access.  In the normal case, do always want
>   * pte_mkwrite.  But get_user_pages can cause write faults for mappings
>   * that do not have writing enabled, when used by access_process_vm.
> + *
> + * If a vma is shadow stack (a type of writable memory), mark the pte shadow
> + * stack.
>   */
> +#ifndef maybe_mkwrite
>  static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>  {
> -	if (likely(vma->vm_flags & VM_WRITE))
> +	if (!(vma->vm_flags & VM_WRITE))
> +		goto out;
> +
> +	if (vma->vm_flags & VM_SHADOW_STACK)
> +		pte = pte_mkwrite_shstk(pte);
> +	else
>  		pte = pte_mkwrite(pte);
> +
> +out:
>  	return pte;
>  }
> +#endif

Maybe take opportunity to move it to <linux/pgtable.h>? It is really not a
place for the helper.
Peter Zijlstra Oct. 14, 2022, 3:32 p.m. UTC | #3
On Thu, Sep 29, 2022 at 03:29:13PM -0700, Rick Edgecombe wrote:

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 8cd413c5a329..fef14ab3abcb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -981,13 +981,25 @@ void free_compound_page(struct page *page);
>   * servicing faults for write access.  In the normal case, do always want
>   * pte_mkwrite.  But get_user_pages can cause write faults for mappings
>   * that do not have writing enabled, when used by access_process_vm.
> + *
> + * If a vma is shadow stack (a type of writable memory), mark the pte shadow
> + * stack.
>   */
> +#ifndef maybe_mkwrite
>  static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>  {
> -	if (likely(vma->vm_flags & VM_WRITE))
> +	if (!(vma->vm_flags & VM_WRITE))
> +		goto out;
> +
> +	if (vma->vm_flags & VM_SHADOW_STACK)
> +		pte = pte_mkwrite_shstk(pte);
> +	else
>  		pte = pte_mkwrite(pte);
> +
> +out:
>  	return pte;
>  }
> +#endif

Why the #ifndef guard? There is no other implementation, nor does this
patch introduce one.

Also, wouldn't it be simpler to write it like:

static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (!(vma->vm_flags & VM_WRITE))
		return pte;

	if (vma->vm_flags & VM_SHADOW_STACK)
		return pte_mkwrite_shstk(pte);

	return pte_mkwrite(pte);
}

? (idem for the pmd version etc..)
Edgecombe, Rick P Oct. 14, 2022, 3:45 p.m. UTC | #4
On Fri, 2022-10-14 at 17:32 +0200, Peter Zijlstra wrote:
> On Thu, Sep 29, 2022 at 03:29:13PM -0700, Rick Edgecombe wrote:
> 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 8cd413c5a329..fef14ab3abcb 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -981,13 +981,25 @@ void free_compound_page(struct page *page);
> >    * servicing faults for write access.  In the normal case, do
> > always want
> >    * pte_mkwrite.  But get_user_pages can cause write faults for
> > mappings
> >    * that do not have writing enabled, when used by
> > access_process_vm.
> > + *
> > + * If a vma is shadow stack (a type of writable memory), mark the
> > pte shadow
> > + * stack.
> >    */
> > +#ifndef maybe_mkwrite
> >   static inline pte_t maybe_mkwrite(pte_t pte, struct
> > vm_area_struct *vma)
> >   {
> > -     if (likely(vma->vm_flags & VM_WRITE))
> > +     if (!(vma->vm_flags & VM_WRITE))
> > +             goto out;
> > +
> > +     if (vma->vm_flags & VM_SHADOW_STACK)
> > +             pte = pte_mkwrite_shstk(pte);
> > +     else
> >                pte = pte_mkwrite(pte);
> > +
> > +out:
> >        return pte;
> >   }
> > +#endif
> 
> Why the #ifndef guard? There is no other implementation, nor does
> this
> patch introduce one.

Oh yea, this series used to add another one, but I forgot to remove the
guards. Thanks.

> 
> Also, wouldn't it be simpler to write it like:
> 
> static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct
> *vma)
> {
>         if (!(vma->vm_flags & VM_WRITE))
>                 return pte;
> 
>         if (vma->vm_flags & VM_SHADOW_STACK)
>                 return pte_mkwrite_shstk(pte);
> 
>         return pte_mkwrite(pte);
> }

Yep, that looks better. Thanks.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 58c7bf9d7392..7a769c4dbc1c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -419,6 +419,7 @@  static inline pte_t pte_mkdirty(pte_t pte)
 	return pte_set_flags(pte, dirty | _PAGE_SOFT_DIRTY);
 }
 
+#define pte_mkwrite_shstk pte_mkwrite_shstk
 static inline pte_t pte_mkwrite_shstk(pte_t pte)
 {
 	/* pte_clear_cow() also sets Dirty=1 */
@@ -555,6 +556,7 @@  static inline pmd_t pmd_mkdirty(pmd_t pmd)
 	return pmd_set_flags(pmd, dirty | _PAGE_SOFT_DIRTY);
 }
 
+#define pmd_mkwrite_shstk pmd_mkwrite_shstk
 static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
 {
 	return pmd_clear_cow(pmd);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8cd413c5a329..fef14ab3abcb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -981,13 +981,25 @@  void free_compound_page(struct page *page);
  * servicing faults for write access.  In the normal case, do always want
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
+ *
+ * If a vma is shadow stack (a type of writable memory), mark the pte shadow
+ * stack.
  */
+#ifndef maybe_mkwrite
 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 {
-	if (likely(vma->vm_flags & VM_WRITE))
+	if (!(vma->vm_flags & VM_WRITE))
+		goto out;
+
+	if (vma->vm_flags & VM_SHADOW_STACK)
+		pte = pte_mkwrite_shstk(pte);
+	else
 		pte = pte_mkwrite(pte);
+
+out:
 	return pte;
 }
+#endif
 
 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 014ee8f0fbaa..21115b4895ca 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -480,6 +480,13 @@  static inline pte_t pte_sw_mkyoung(pte_t pte)
 #define pte_mk_savedwrite pte_mkwrite
 #endif
 
+#ifndef pte_mkwrite_shstk
+static inline pte_t pte_mkwrite_shstk(pte_t pte)
+{
+	return pte;
+}
+#endif
+
 #ifndef pte_clear_savedwrite
 #define pte_clear_savedwrite pte_wrprotect
 #endif
@@ -488,6 +495,13 @@  static inline pte_t pte_sw_mkyoung(pte_t pte)
 #define pmd_savedwrite pmd_write
 #endif
 
+#ifndef pmd_mkwrite_shstk
+static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
+{
+	return pmd;
+}
+#endif
+
 #ifndef pmd_mk_savedwrite
 #define pmd_mk_savedwrite pmd_mkwrite
 #endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9414ee57c5b..11fc69eb4717 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -554,8 +554,15 @@  __setup("transparent_hugepage=", setup_transparent_hugepage);
 
 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
-	if (likely(vma->vm_flags & VM_WRITE))
+	if (!(vma->vm_flags & VM_WRITE))
+		goto out;
+
+	if (vma->vm_flags & VM_SHADOW_STACK)
+		pmd = pmd_mkwrite_shstk(pmd);
+	else
 		pmd = pmd_mkwrite(pmd);
+
+out:
 	return pmd;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index 4ba73f5aa8bb..6e8379f6793c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4098,8 +4098,7 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
 	entry = mk_pte(page, vma->vm_page_prot);
 	entry = pte_sw_mkyoung(entry);
-	if (vma->vm_flags & VM_WRITE)
-		entry = pte_mkwrite(pte_mkdirty(entry));
+	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
 			&vmf->ptl);