diff mbox series

[v2,13/16] fsnotify: generate pre-content permission event on page fault

Message ID b8c3f0d9ed6d23f9a636919e28293cdbbe22e0db.1723144881.git.josef@toxicpanda.com (mailing list archive)
State New
Headers show
Series fanotify: add pre-content hooks | expand

Commit Message

Josef Bacik Aug. 8, 2024, 7:27 p.m. UTC
FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
on the faulting method.

This pre-content event is meant to be used by hierarchical storage
managers that want to fill in the file content on first read access.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 include/linux/mm.h |  2 +
 mm/filemap.c       | 97 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 92 insertions(+), 7 deletions(-)

Comments

Amir Goldstein Aug. 9, 2024, 10:34 a.m. UTC | #1
On Thu, Aug 8, 2024 at 9:28 PM Josef Bacik <josef@toxicpanda.com> wrote:
>
> FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> on the faulting method.
>
> This pre-content event is meant to be used by hierarchical storage
> managers that want to fill in the file content on first read access.
>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  include/linux/mm.h |  2 +
>  mm/filemap.c       | 97 ++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 92 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ab3d78116043..c33f3b7f7261 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3503,6 +3503,8 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf);
>  extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>                 pgoff_t start_pgoff, pgoff_t end_pgoff);
>  extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
> +extern vm_fault_t filemap_maybe_emit_fsnotify_event(struct vm_fault *vmf,
> +                                                   struct file **fpin);
>
>  extern unsigned long stack_guard_gap;
>  /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 8b1684b62177..3d232166b051 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -46,6 +46,7 @@
>  #include <linux/pipe_fs_i.h>
>  #include <linux/splice.h>
>  #include <linux/rcupdate_wait.h>
> +#include <linux/fsnotify.h>
>  #include <asm/pgalloc.h>
>  #include <asm/tlbflush.h>
>  #include "internal.h"
> @@ -3112,13 +3113,13 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
>   * that.  If we didn't pin a file then we return NULL.  The file that is
>   * returned needs to be fput()'ed when we're done with it.
>   */
> -static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
> +                                          struct file *fpin)
>  {
>         struct file *file = vmf->vma->vm_file;
>         struct file_ra_state *ra = &file->f_ra;
>         struct address_space *mapping = file->f_mapping;
>         DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
> -       struct file *fpin = NULL;
>         unsigned long vm_flags = vmf->vma->vm_flags;
>         unsigned int mmap_miss;
>
> @@ -3190,12 +3191,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
>   * was pinned if we have to drop the mmap_lock in order to do IO.
>   */
>  static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> -                                           struct folio *folio)
> +                                           struct folio *folio,
> +                                           struct file *fpin)
>  {
>         struct file *file = vmf->vma->vm_file;
>         struct file_ra_state *ra = &file->f_ra;
>         DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
> -       struct file *fpin = NULL;
>         unsigned int mmap_miss;
>
>         /* See comment in do_sync_mmap_readahead. */
> @@ -3260,6 +3261,72 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
>         return ret;
>  }
>
> +/**
> + * filemap_maybe_emit_fsnotify_event - maybe emit a pre-content event.
> + * @vmf:       struct vm_fault containing details of the fault.
> + * @fpin:      pointer to the struct file pointer that may be pinned.
> + *
> + * If we have pre-content watches on this file we will need to emit an event for
> + * this range.  We will handle dropping the lock and emitting the event.
> + *
> + * If FAULT_FLAG_RETRY_NOWAIT is set then we'll return VM_FAULT_RETRY.
> + *
> + * If no event was emitted then *fpin will be NULL and we will return 0.
> + *
> + * If any error occurred we will return VM_FAULT_SIGBUS, *fpin could still be
> + * set and will need to have fput() called on it.
> + *
> + * If we emitted the event then we will return 0 and *fpin will be set, this
> + * must have fput() called on it, and the caller must call VM_FAULT_RETRY after
> + * any other operations it does in order to re-fault the page and make sure the
> + * appropriate locking is maintained.
> + *
> + * Return: the appropriate vm_fault_t return code, 0 on success.
> + */
> +vm_fault_t filemap_maybe_emit_fsnotify_event(struct vm_fault *vmf,
> +                                            struct file **fpin)
> +{
> +       struct file *file = vmf->vma->vm_file;
> +       loff_t pos = vmf->pgoff << PAGE_SHIFT;
> +       int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;

You missed my comment about using MAY_ACCESS here
and alter fsnotify hook, so legacy FAN_ACCESS_PERM event
won't be generated from page fault.

Thanks,
Amir.

> +       int ret;
> +
> +       /*
> +        * We already did this and now we're retrying with everything locked,
> +        * don't emit the event and continue.
> +        */
> +       if (vmf->flags & FAULT_FLAG_TRIED)
> +               return 0;
> +
> +       /* No watches, return NULL. */
> +       if (!fsnotify_file_has_pre_content_watches(file))
> +               return 0;
> +
> +       /* We are NOWAIT, we can't wait, just return EAGAIN. */
> +       if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
> +               return VM_FAULT_RETRY;
> +
> +       /*
> +        * If this fails then we're not allowed to drop the fault lock, return a
> +        * SIGBUS so we don't errantly populate pagecache with bogus data for
> +        * this file.
> +        */
> +       *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
> +       if (*fpin == NULL)
> +               return VM_FAULT_SIGBUS | VM_FAULT_RETRY;
> +
> +       /*
> +        * We can't fput(*fpin) at this point because we could have been passed
> +        * in fpin from a previous call.
> +        */
> +       ret = fsnotify_file_area_perm(*fpin, mask, &pos, PAGE_SIZE);
> +       if (ret)
> +               return VM_FAULT_SIGBUS;
> +
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(filemap_maybe_emit_fsnotify_event);
> +
>  /**
>   * filemap_fault - read in file data for page fault handling
>   * @vmf:       struct vm_fault containing details of the fault
> @@ -3299,6 +3366,19 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
>         if (unlikely(index >= max_idx))
>                 return VM_FAULT_SIGBUS;
>
> +       /*
> +        * If we have pre-content watchers then we need to generate events on
> +        * page fault so that we can populate any data before the fault.
> +        */
> +       ret = filemap_maybe_emit_fsnotify_event(vmf, &fpin);
> +       if (unlikely(ret)) {
> +               if (fpin) {
> +                       fput(fpin);
> +                       ret |= VM_FAULT_RETRY;
> +               }
> +               return ret;
> +       }
> +
>         /*
>          * Do we have something in the page cache already?
>          */
> @@ -3309,21 +3389,24 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
>                  * the lock.
>                  */
>                 if (!(vmf->flags & FAULT_FLAG_TRIED))
> -                       fpin = do_async_mmap_readahead(vmf, folio);
> +                       fpin = do_async_mmap_readahead(vmf, folio, fpin);
>                 if (unlikely(!folio_test_uptodate(folio))) {
>                         filemap_invalidate_lock_shared(mapping);
>                         mapping_locked = true;
>                 }
>         } else {
>                 ret = filemap_fault_recheck_pte_none(vmf);
> -               if (unlikely(ret))
> +               if (unlikely(ret)) {
> +                       if (fpin)
> +                               goto out_retry;
>                         return ret;
> +               }
>
>                 /* No page in the page cache at all */
>                 count_vm_event(PGMAJFAULT);
>                 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
>                 ret = VM_FAULT_MAJOR;
> -               fpin = do_sync_mmap_readahead(vmf);
> +               fpin = do_sync_mmap_readahead(vmf, fpin);
>  retry_find:
>                 /*
>                  * See comment in filemap_create_folio() why we need
> --
> 2.43.0
>
Josef Bacik Aug. 9, 2024, 2:19 p.m. UTC | #2
On Fri, Aug 09, 2024 at 12:34:34PM +0200, Amir Goldstein wrote:
> On Thu, Aug 8, 2024 at 9:28 PM Josef Bacik <josef@toxicpanda.com> wrote:
> >
> > FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> > on the faulting method.
> >
> > This pre-content event is meant to be used by hierarchical storage
> > managers that want to fill in the file content on first read access.
> >
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > ---
> >  include/linux/mm.h |  2 +
> >  mm/filemap.c       | 97 ++++++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 92 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index ab3d78116043..c33f3b7f7261 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -3503,6 +3503,8 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf);
> >  extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
> >                 pgoff_t start_pgoff, pgoff_t end_pgoff);
> >  extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
> > +extern vm_fault_t filemap_maybe_emit_fsnotify_event(struct vm_fault *vmf,
> > +                                                   struct file **fpin);
> >
> >  extern unsigned long stack_guard_gap;
> >  /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 8b1684b62177..3d232166b051 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -46,6 +46,7 @@
> >  #include <linux/pipe_fs_i.h>
> >  #include <linux/splice.h>
> >  #include <linux/rcupdate_wait.h>
> > +#include <linux/fsnotify.h>
> >  #include <asm/pgalloc.h>
> >  #include <asm/tlbflush.h>
> >  #include "internal.h"
> > @@ -3112,13 +3113,13 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
> >   * that.  If we didn't pin a file then we return NULL.  The file that is
> >   * returned needs to be fput()'ed when we're done with it.
> >   */
> > -static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> > +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
> > +                                          struct file *fpin)
> >  {
> >         struct file *file = vmf->vma->vm_file;
> >         struct file_ra_state *ra = &file->f_ra;
> >         struct address_space *mapping = file->f_mapping;
> >         DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
> > -       struct file *fpin = NULL;
> >         unsigned long vm_flags = vmf->vma->vm_flags;
> >         unsigned int mmap_miss;
> >
> > @@ -3190,12 +3191,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> >   * was pinned if we have to drop the mmap_lock in order to do IO.
> >   */
> >  static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> > -                                           struct folio *folio)
> > +                                           struct folio *folio,
> > +                                           struct file *fpin)
> >  {
> >         struct file *file = vmf->vma->vm_file;
> >         struct file_ra_state *ra = &file->f_ra;
> >         DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
> > -       struct file *fpin = NULL;
> >         unsigned int mmap_miss;
> >
> >         /* See comment in do_sync_mmap_readahead. */
> > @@ -3260,6 +3261,72 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
> >         return ret;
> >  }
> >
> > +/**
> > + * filemap_maybe_emit_fsnotify_event - maybe emit a pre-content event.
> > + * @vmf:       struct vm_fault containing details of the fault.
> > + * @fpin:      pointer to the struct file pointer that may be pinned.
> > + *
> > + * If we have pre-content watches on this file we will need to emit an event for
> > + * this range.  We will handle dropping the lock and emitting the event.
> > + *
> > + * If FAULT_FLAG_RETRY_NOWAIT is set then we'll return VM_FAULT_RETRY.
> > + *
> > + * If no event was emitted then *fpin will be NULL and we will return 0.
> > + *
> > + * If any error occurred we will return VM_FAULT_SIGBUS, *fpin could still be
> > + * set and will need to have fput() called on it.
> > + *
> > + * If we emitted the event then we will return 0 and *fpin will be set, this
> > + * must have fput() called on it, and the caller must call VM_FAULT_RETRY after
> > + * any other operations it does in order to re-fault the page and make sure the
> > + * appropriate locking is maintained.
> > + *
> > + * Return: the appropriate vm_fault_t return code, 0 on success.
> > + */
> > +vm_fault_t filemap_maybe_emit_fsnotify_event(struct vm_fault *vmf,
> > +                                            struct file **fpin)
> > +{
> > +       struct file *file = vmf->vma->vm_file;
> > +       loff_t pos = vmf->pgoff << PAGE_SHIFT;
> > +       int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;
> 
> You missed my comment about using MAY_ACCESS here
> and alter fsnotify hook, so legacy FAN_ACCESS_PERM event
> won't be generated from page fault.

I did miss that, I'll fix it up in v3, thanks!

Josef
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ab3d78116043..c33f3b7f7261 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3503,6 +3503,8 @@  extern vm_fault_t filemap_fault(struct vm_fault *vmf);
 extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
+extern vm_fault_t filemap_maybe_emit_fsnotify_event(struct vm_fault *vmf,
+						    struct file **fpin);
 
 extern unsigned long stack_guard_gap;
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
diff --git a/mm/filemap.c b/mm/filemap.c
index 8b1684b62177..3d232166b051 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -46,6 +46,7 @@ 
 #include <linux/pipe_fs_i.h>
 #include <linux/splice.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/fsnotify.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -3112,13 +3113,13 @@  static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
  * that.  If we didn't pin a file then we return NULL.  The file that is
  * returned needs to be fput()'ed when we're done with it.
  */
-static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
+					   struct file *fpin)
 {
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	struct address_space *mapping = file->f_mapping;
 	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
-	struct file *fpin = NULL;
 	unsigned long vm_flags = vmf->vma->vm_flags;
 	unsigned int mmap_miss;
 
@@ -3190,12 +3191,12 @@  static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
  * was pinned if we have to drop the mmap_lock in order to do IO.
  */
 static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
-					    struct folio *folio)
+					    struct folio *folio,
+					    struct file *fpin)
 {
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
-	struct file *fpin = NULL;
 	unsigned int mmap_miss;
 
 	/* See comment in do_sync_mmap_readahead. */
@@ -3260,6 +3261,72 @@  static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
 	return ret;
 }
 
+/**
+ * filemap_maybe_emit_fsnotify_event - maybe emit a pre-content event.
+ * @vmf:	struct vm_fault containing details of the fault.
+ * @fpin:	pointer to the struct file pointer that may be pinned.
+ *
+ * If we have pre-content watches on this file we will need to emit an event for
+ * this range.  We will handle dropping the lock and emitting the event.
+ *
+ * If FAULT_FLAG_RETRY_NOWAIT is set then we'll return VM_FAULT_RETRY.
+ *
+ * If no event was emitted then *fpin will be NULL and we will return 0.
+ *
+ * If any error occurred we will return VM_FAULT_SIGBUS, *fpin could still be
+ * set and will need to have fput() called on it.
+ *
+ * If we emitted the event then we will return 0 and *fpin will be set, this
+ * must have fput() called on it, and the caller must call VM_FAULT_RETRY after
+ * any other operations it does in order to re-fault the page and make sure the
+ * appropriate locking is maintained.
+ *
+ * Return: the appropriate vm_fault_t return code, 0 on success.
+ */
+vm_fault_t filemap_maybe_emit_fsnotify_event(struct vm_fault *vmf,
+					     struct file **fpin)
+{
+	struct file *file = vmf->vma->vm_file;
+	loff_t pos = vmf->pgoff << PAGE_SHIFT;
+	int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;
+	int ret;
+
+	/*
+	 * We already did this and now we're retrying with everything locked,
+	 * don't emit the event and continue.
+	 */
+	if (vmf->flags & FAULT_FLAG_TRIED)
+		return 0;
+
+	/* No watches, return NULL. */
+	if (!fsnotify_file_has_pre_content_watches(file))
+		return 0;
+
+	/* We are NOWAIT, we can't wait, just return EAGAIN. */
+	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+		return VM_FAULT_RETRY;
+
+	/*
+	 * If this fails then we're not allowed to drop the fault lock, return a
+	 * SIGBUS so we don't errantly populate pagecache with bogus data for
+	 * this file.
+	 */
+	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+	if (*fpin == NULL)
+		return VM_FAULT_SIGBUS | VM_FAULT_RETRY;
+
+	/*
+	 * We can't fput(*fpin) at this point because we could have been passed
+	 * in fpin from a previous call.
+	 */
+	ret = fsnotify_file_area_perm(*fpin, mask, &pos, PAGE_SIZE);
+	if (ret)
+		return VM_FAULT_SIGBUS;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(filemap_maybe_emit_fsnotify_event);
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vmf:	struct vm_fault containing details of the fault
@@ -3299,6 +3366,19 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 	if (unlikely(index >= max_idx))
 		return VM_FAULT_SIGBUS;
 
+	/*
+	 * If we have pre-content watchers then we need to generate events on
+	 * page fault so that we can populate any data before the fault.
+	 */
+	ret = filemap_maybe_emit_fsnotify_event(vmf, &fpin);
+	if (unlikely(ret)) {
+		if (fpin) {
+			fput(fpin);
+			ret |= VM_FAULT_RETRY;
+		}
+		return ret;
+	}
+
 	/*
 	 * Do we have something in the page cache already?
 	 */
@@ -3309,21 +3389,24 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 		 * the lock.
 		 */
 		if (!(vmf->flags & FAULT_FLAG_TRIED))
-			fpin = do_async_mmap_readahead(vmf, folio);
+			fpin = do_async_mmap_readahead(vmf, folio, fpin);
 		if (unlikely(!folio_test_uptodate(folio))) {
 			filemap_invalidate_lock_shared(mapping);
 			mapping_locked = true;
 		}
 	} else {
 		ret = filemap_fault_recheck_pte_none(vmf);
-		if (unlikely(ret))
+		if (unlikely(ret)) {
+			if (fpin)
+				goto out_retry;
 			return ret;
+		}
 
 		/* No page in the page cache at all */
 		count_vm_event(PGMAJFAULT);
 		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
-		fpin = do_sync_mmap_readahead(vmf);
+		fpin = do_sync_mmap_readahead(vmf, fpin);
 retry_find:
 		/*
 		 * See comment in filemap_create_folio() why we need