diff mbox series

[v6,06/23] mm/shmem: Handle uffd-wp special pte in page fault handler

Message ID 20211115075522.73795-7-peterx@redhat.com (mailing list archive)
State New
Headers show
Series userfaultfd-wp: Support shmem and hugetlbfs | expand

Commit Message

Peter Xu Nov. 15, 2021, 7:55 a.m. UTC
File-backed memories are prone to unmap/swap so the ptes are always unstable,
because they can be easily faulted back later using the page cache.  This could
lead to uffd-wp getting lost when unmapping or swapping out such memory.  One
example is shmem.  PTE markers are needed to store that information.

This patch prepares for it by handling uffd-wp pte markers before they are used
elsewhere, so that the page fault handler can recognize uffd-wp pte markers.

The handling of uffd-wp pte markers is similar to a missing fault; it's just
that we'll handle this "missing fault" when we see the pte markers, and
meanwhile we need to make sure the marker information is kept while processing
the fault.

This is a slow path of uffd-wp handling, because zapping of wr-protected shmem
ptes should be rare.  So far it should only trigger in two conditions:

  (1) When trying to punch holes in shmem_fallocate(), there is an optimization
      to zap the pgtables before evicting the page.

  (2) When swapping out shmem pages.

Because of this, the page fault handling is simplified too by not sending the
wr-protect message in the 1st page fault; instead the page will be installed
read-only, so the uffd-wp message will be generated in the next fault, which
will trigger the do_wp_page() path of general uffd-wp handling.

Disable fault-around for all uffd-wp registered ranges for extra safety just
like uffd-minor fault, and clean the code up.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/linux/userfaultfd_k.h | 17 +++++++++
 mm/memory.c                   | 71 ++++++++++++++++++++++++++++++-----
 2 files changed, 79 insertions(+), 9 deletions(-)

Comments

Alistair Popple Dec. 16, 2021, 5:56 a.m. UTC | #1
On Monday, 15 November 2021 6:55:05 PM AEDT Peter Xu wrote:

[...]

> diff --git a/mm/memory.c b/mm/memory.c
> index d5966d9e24c3..e8557d43a87d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3452,6 +3452,43 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
>  	return 0;
>  }
>  
> +static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
> +{
> +	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> +				       vmf->address, &vmf->ptl);
> +	/*
> +	 * Be careful so that we will only recover a special uffd-wp pte into a
> +	 * none pte.  Otherwise it means the pte could have changed, so retry.
> +	 */
> +	if (is_pte_marker(*vmf->pte))
> +		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
> +	pte_unmap_unlock(vmf->pte, vmf->ptl);
> +	return 0;
> +}
> +
> +/*
> + * This is actually a page-missing access, but with uffd-wp special pte
> + * installed.  It means this pte was wr-protected before being unmapped.
> + */
> +static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
> +{
> +	/* Careful!  vmf->pte unmapped after return */
> +	if (!pte_unmap_same(vmf))

Hasn't vmf->pte already been unmapped by do_swap_page() by the time we get
here?

> +		return 0;
> +
> +	/*
> +	 * Just in case there're leftover special ptes even after the region
> +	 * got unregistered - we can simply clear them.  We can also do that
> +	 * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp
> +	 * ranges, but it should be more efficient to be done lazily here.
> +	 */
> +	if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
> +		return pte_marker_clear(vmf);
> +
> +	/* do_fault() can handle pte markers too like none pte */
> +	return do_fault(vmf);
> +}
> +
>  static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>  {
>  	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
> @@ -3465,8 +3502,11 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>  	if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
>  		return VM_FAULT_SIGBUS;
>  
> -	/* TODO: handle pte markers */
> -	return 0;
> +	if (marker & PTE_MARKER_UFFD_WP)

Can we make this check `marker == PTE_MARKER_UFFD_WP`? There is currently only
one user of pte markers, and from what I can tell pte_marker_handle_uffd_wp()
wouldn't do the correct thing if other users were added because it could clear
non-uffd-wp markers. I don't think it's worth making it do the right thing now,
but a comment noting that would be helpful.

> +		return pte_marker_handle_uffd_wp(vmf);
> +
> +	/* This is an unknown pte marker */
> +	return VM_FAULT_SIGBUS;
>  }
>  
>  /*
> @@ -3968,6 +4008,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
> +	bool uffd_wp = is_pte_marker_uffd_wp(vmf->orig_pte);
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>  	bool prefault = vmf->address != addr;
>  	pte_t entry;
> @@ -3982,6 +4023,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>  
>  	if (write)
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> +	if (unlikely(uffd_wp))
> +		entry = pte_mkuffd_wp(pte_wrprotect(entry));
>  	/* copy-on-write page */
>  	if (write && !(vma->vm_flags & VM_SHARED)) {
>  		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
> @@ -4155,9 +4198,21 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
>  	return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
>  }
>  
> +/* Return true if we should do read fault-around, false otherwise */
> +static inline bool should_fault_around(struct vm_fault *vmf)
> +{
> +	/* No ->map_pages?  No way to fault around... */
> +	if (!vmf->vma->vm_ops->map_pages)
> +		return false;
> +
> +	if (uffd_disable_fault_around(vmf->vma))
> +		return false;
> +
> +	return fault_around_bytes >> PAGE_SHIFT > 1;
> +}
> +
>  static vm_fault_t do_read_fault(struct vm_fault *vmf)
>  {
> -	struct vm_area_struct *vma = vmf->vma;
>  	vm_fault_t ret = 0;
>  
>  	/*
> @@ -4165,12 +4220,10 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
>  	 * if page by the offset is not ready to be mapped (cold cache or
>  	 * something).
>  	 */
> -	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
> -		if (likely(!userfaultfd_minor(vmf->vma))) {
> -			ret = do_fault_around(vmf);
> -			if (ret)
> -				return ret;
> -		}
> +	if (should_fault_around(vmf)) {
> +		ret = do_fault_around(vmf);
> +		if (ret)
> +			return ret;
>  	}
>  
>  	ret = __do_fault(vmf);
>
Peter Xu Dec. 16, 2021, 6:17 a.m. UTC | #2
On Thu, Dec 16, 2021 at 04:56:42PM +1100, Alistair Popple wrote:
> On Monday, 15 November 2021 6:55:05 PM AEDT Peter Xu wrote:
> 
> [...]
> 
> > diff --git a/mm/memory.c b/mm/memory.c
> > index d5966d9e24c3..e8557d43a87d 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3452,6 +3452,43 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
> >  	return 0;
> >  }
> >  
> > +static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
> > +{
> > +	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> > +				       vmf->address, &vmf->ptl);
> > +	/*
> > +	 * Be careful so that we will only recover a special uffd-wp pte into a
> > +	 * none pte.  Otherwise it means the pte could have changed, so retry.
> > +	 */
> > +	if (is_pte_marker(*vmf->pte))
> > +		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
> > +	pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * This is actually a page-missing access, but with uffd-wp special pte
> > + * installed.  It means this pte was wr-protected before being unmapped.
> > + */
> > +static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
> > +{
> > +	/* Careful!  vmf->pte unmapped after return */
> > +	if (!pte_unmap_same(vmf))
> 
> Hasn't vmf->pte already been unmapped by do_swap_page() by the time we get
> here?

Great catch, thanks!

It was needed before with the "swap special pte" version because that was
handled outside do_swap_page().  After the rebase I forgot to remove it.

I believe it didn't crash simply because we've got commit 2ca99358671a ("mm:
clear vmf->pte after pte_unmap_same() returns", 2021-11-06) very recently so it
just became a safe no-op, so all things will still work.

I'll drop it.

> 
> > +		return 0;
> > +
> > +	/*
> > +	 * Just in case there're leftover special ptes even after the region
> > +	 * got unregistered - we can simply clear them.  We can also do that
> > +	 * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp
> > +	 * ranges, but it should be more efficient to be done lazily here.
> > +	 */
> > +	if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
> > +		return pte_marker_clear(vmf);
> > +
> > +	/* do_fault() can handle pte markers too like none pte */
> > +	return do_fault(vmf);
> > +}
> > +
> >  static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> >  {
> >  	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
> > @@ -3465,8 +3502,11 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> >  	if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
> >  		return VM_FAULT_SIGBUS;
> >  
> > -	/* TODO: handle pte markers */
> > -	return 0;
> > +	if (marker & PTE_MARKER_UFFD_WP)
> 
> Can we make this check `marker == PTE_MARKER_UFFD_WP`? There is currently only
> one user of pte markers, and from what I can tell pte_marker_handle_uffd_wp()
> wouldn't do the correct thing if other users were added because it could clear
> non-uffd-wp markers. I don't think it's worth making it do the right thing now,
> but a comment noting that would be helpful.

Sure thing, and yeah I agree it's trivial and shouldn't matter in real-life.

I'll change it to "marker == PTE_MARKER_UFFD_WP" as you suggested, so if
there's surprise we'll get a sigbus.

Thanks,

> 
> > +		return pte_marker_handle_uffd_wp(vmf);
> > +
> > +	/* This is an unknown pte marker */
> > +	return VM_FAULT_SIGBUS;
> >  }
Alistair Popple Dec. 16, 2021, 6:30 a.m. UTC | #3
On Thursday, 16 December 2021 5:17:30 PM AEDT Peter Xu wrote:
> On Thu, Dec 16, 2021 at 04:56:42PM +1100, Alistair Popple wrote:
> > On Monday, 15 November 2021 6:55:05 PM AEDT Peter Xu wrote:
> > 
> > [...]
> > 
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index d5966d9e24c3..e8557d43a87d 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -3452,6 +3452,43 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
> > >  	return 0;
> > >  }
> > >  
> > > +static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
> > > +{
> > > +	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> > > +				       vmf->address, &vmf->ptl);
> > > +	/*
> > > +	 * Be careful so that we will only recover a special uffd-wp pte into a
> > > +	 * none pte.  Otherwise it means the pte could have changed, so retry.
> > > +	 */
> > > +	if (is_pte_marker(*vmf->pte))
> > > +		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
> > > +	pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * This is actually a page-missing access, but with uffd-wp special pte
> > > + * installed.  It means this pte was wr-protected before being unmapped.
> > > + */
> > > +static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
> > > +{
> > > +	/* Careful!  vmf->pte unmapped after return */
> > > +	if (!pte_unmap_same(vmf))
> > 
> > Hasn't vmf->pte already been unmapped by do_swap_page() by the time we get
> > here?
> 
> Great catch, thanks!
> 
> It was needed before with the "swap special pte" version because that was
> handled outside do_swap_page().  After the rebase I forgot to remove it.

No worries, and for what it's worth IMHO this version that handles it inside
do_swap_page() along with all the other "special" cases is much nicer.

> I believe it didn't crash simply because we've got commit 2ca99358671a ("mm:
> clear vmf->pte after pte_unmap_same() returns", 2021-11-06) very recently so it
> just became a safe no-op, so all things will still work.
> 
> I'll drop it.
> 
> > 
> > > +		return 0;
> > > +
> > > +	/*
> > > +	 * Just in case there're leftover special ptes even after the region
> > > +	 * got unregistered - we can simply clear them.  We can also do that
> > > +	 * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp
> > > +	 * ranges, but it should be more efficient to be done lazily here.
> > > +	 */
> > > +	if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
> > > +		return pte_marker_clear(vmf);
> > > +
> > > +	/* do_fault() can handle pte markers too like none pte */
> > > +	return do_fault(vmf);
> > > +}
> > > +
> > >  static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> > >  {
> > >  	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
> > > @@ -3465,8 +3502,11 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> > >  	if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
> > >  		return VM_FAULT_SIGBUS;
> > >  
> > > -	/* TODO: handle pte markers */
> > > -	return 0;
> > > +	if (marker & PTE_MARKER_UFFD_WP)
> > 
> > Can we make this check `marker == PTE_MARKER_UFFD_WP`? There is currently only
> > one user of pte markers, and from what I can tell pte_marker_handle_uffd_wp()
> > wouldn't do the correct thing if other users were added because it could clear
> > non-uffd-wp markers. I don't think it's worth making it do the right thing now,
> > but a comment noting that would be helpful.
> 
> Sure thing, and yeah I agree it's trivial and shouldn't matter in real-life.
> 
> I'll change it to "marker == PTE_MARKER_UFFD_WP" as you suggested, so if
> there's surprise we'll get a sigbus.
> 
> Thanks,
> 
> > 
> > > +		return pte_marker_handle_uffd_wp(vmf);
> > > +
> > > +	/* This is an unknown pte marker */
> > > +	return VM_FAULT_SIGBUS;
> > >  }
> 
>
diff mbox series

Patch

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 7d7ffec53ddb..05cec02140cb 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -96,6 +96,18 @@  static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
 	return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
 }
 
+/*
+ * Don't do fault around for either WP or MINOR registered uffd range.  For
+ * MINOR registered range, fault around will be a total disaster and ptes can
+ * be installed without notifications; for WP it should mostly be fine as long
+ * as the fault around checks for pte_none() before the installation, however
+ * to be super safe we just forbid it.
+ */
+static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
+}
+
 static inline bool userfaultfd_missing(struct vm_area_struct *vma)
 {
 	return vma->vm_flags & VM_UFFD_MISSING;
@@ -236,6 +248,11 @@  static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
 {
 }
 
+static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
+{
+	return false;
+}
+
 #endif /* CONFIG_USERFAULTFD */
 
 static inline bool is_pte_marker_uffd_wp(pte_t pte)
diff --git a/mm/memory.c b/mm/memory.c
index d5966d9e24c3..e8557d43a87d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3452,6 +3452,43 @@  static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 	return 0;
 }
 
+static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
+{
+	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+				       vmf->address, &vmf->ptl);
+	/*
+	 * Be careful so that we will only recover a special uffd-wp pte into a
+	 * none pte.  Otherwise it means the pte could have changed, so retry.
+	 */
+	if (is_pte_marker(*vmf->pte))
+		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	return 0;
+}
+
+/*
+ * This is actually a page-missing access, but with uffd-wp special pte
+ * installed.  It means this pte was wr-protected before being unmapped.
+ */
+static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
+{
+	/* Careful!  vmf->pte unmapped after return */
+	if (!pte_unmap_same(vmf))
+		return 0;
+
+	/*
+	 * Just in case there're leftover special ptes even after the region
+	 * got unregistered - we can simply clear them.  We can also do that
+	 * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp
+	 * ranges, but it should be more efficient to be done lazily here.
+	 */
+	if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+		return pte_marker_clear(vmf);
+
+	/* do_fault() can handle pte markers too like none pte */
+	return do_fault(vmf);
+}
+
 static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 {
 	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
@@ -3465,8 +3502,11 @@  static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 	if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
 		return VM_FAULT_SIGBUS;
 
-	/* TODO: handle pte markers */
-	return 0;
+	if (marker & PTE_MARKER_UFFD_WP)
+		return pte_marker_handle_uffd_wp(vmf);
+
+	/* This is an unknown pte marker */
+	return VM_FAULT_SIGBUS;
 }
 
 /*
@@ -3968,6 +4008,7 @@  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	bool uffd_wp = is_pte_marker_uffd_wp(vmf->orig_pte);
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool prefault = vmf->address != addr;
 	pte_t entry;
@@ -3982,6 +4023,8 @@  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	if (unlikely(uffd_wp))
+		entry = pte_mkuffd_wp(pte_wrprotect(entry));
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
@@ -4155,9 +4198,21 @@  static vm_fault_t do_fault_around(struct vm_fault *vmf)
 	return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
 }
 
+/* Return true if we should do read fault-around, false otherwise */
+static inline bool should_fault_around(struct vm_fault *vmf)
+{
+	/* No ->map_pages?  No way to fault around... */
+	if (!vmf->vma->vm_ops->map_pages)
+		return false;
+
+	if (uffd_disable_fault_around(vmf->vma))
+		return false;
+
+	return fault_around_bytes >> PAGE_SHIFT > 1;
+}
+
 static vm_fault_t do_read_fault(struct vm_fault *vmf)
 {
-	struct vm_area_struct *vma = vmf->vma;
 	vm_fault_t ret = 0;
 
 	/*
@@ -4165,12 +4220,10 @@  static vm_fault_t do_read_fault(struct vm_fault *vmf)
 	 * if page by the offset is not ready to be mapped (cold cache or
 	 * something).
 	 */
-	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-		if (likely(!userfaultfd_minor(vmf->vma))) {
-			ret = do_fault_around(vmf);
-			if (ret)
-				return ret;
-		}
+	if (should_fault_around(vmf)) {
+		ret = do_fault_around(vmf);
+		if (ret)
+			return ret;
 	}
 
 	ret = __do_fault(vmf);