
[v4,5/5] fsnotify: require inode lock held during child flag update

Message ID 20221111220614.991928-6-stephen.s.brennan@oracle.com (mailing list archive)
State New, archived
Series: fsnotify: fix softlockups iterating over d_subdirs

Commit Message

Stephen Brennan Nov. 11, 2022, 10:06 p.m. UTC
With the prior changes to fsnotify, it is now possible for
fsnotify_recalc_mask() to return before all child dentry flags have
been set. Imagine that two CPUs attempt to add a mark to an inode which
would require watching the children of that directory:

CPU 1:                                 CPU 2:

fsnotify_recalc_mask() {
  spin_lock();
  update_children = ...
  __fsnotify_recalc_mask();
  update_children = ...
  spin_unlock();
  // update_children is true!
  fsnotify_conn_set_children_dentry_flags() {
    // updating flags ...
    cond_resched();

                                       fsnotify_recalc_mask() {
                                         spin_lock();
                                         update_children = ...
                                         __fsnotify_recalc_mask();
                                         update_children = ...
                                         spin_unlock();
                                         // update_children is false
                                       }
                                       // returns to userspace, but
                                       // not all children are marked
    // continue updating flags
   }
}

To prevent this situation, hold the directory inode lock. This ensures
that any concurrent update to the mask will block until the update is
complete, so that we can guarantee that child flags are set prior to
returning.

Since the directory inode lock is now held during iteration over
d_subdirs, we are guaranteed that __d_move() cannot remove the dentry we
hold, so we no longer need to check whether we should retry the
iteration. We are also guaranteed that no cursors are moving through the
list, since dcache_readdir() holds the inode read lock. Simplify the
iteration by removing this logic.
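
With the lock held, fsnotify_recalc_mask() ends up reading roughly as
follows (a simplified sketch assembled from the diff below; the second
update_children assignment is reconstructed from the diagram above):

  void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
  {
          bool update_children;
          struct inode *inode = NULL;

          if (!conn)
                  return;

          /* Serialize concurrent recalcs of the same directory here. */
          if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
                  inode = fsnotify_conn_inode(conn);
                  inode_lock(inode);
          }
          spin_lock(&conn->lock);
          update_children = !fsnotify_conn_watches_children(conn);
          __fsnotify_recalc_mask(conn);
          update_children &= fsnotify_conn_watches_children(conn);
          spin_unlock(&conn->lock);

          /* Child dentry flags are fully set before the lock is dropped. */
          if (update_children)
                  fsnotify_conn_set_children_dentry_flags(conn);

          if (inode)
                  inode_unlock(inode);
  }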

Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
---
 fs/notify/fsnotify.c | 25 +++++++++----------------
 fs/notify/mark.c     |  8 ++++++++
 2 files changed, 17 insertions(+), 16 deletions(-)

Comments

Amir Goldstein Nov. 12, 2022, 9:42 a.m. UTC | #1

On Sat, Nov 12, 2022 at 12:06 AM Stephen Brennan
<stephen.s.brennan@oracle.com> wrote:
>
> With the prior changes to fsnotify, it is now possible for
> fsnotify_recalc_mask() to return before all child dentry flags have
> been set. Imagine that two CPUs attempt to add a mark to an inode which
> would require watching the children of that directory:
>
> CPU 1:                                 CPU 2:
>
> fsnotify_recalc_mask() {
>   spin_lock();
>   update_children = ...
>   __fsnotify_recalc_mask();
>   update_children = ...
>   spin_unlock();
>   // update_children is true!
>   fsnotify_conn_set_children_dentry_flags() {
>     // updating flags ...
>     cond_resched();
>
>                                        fsnotify_recalc_mask() {
>                                          spin_lock();
>                                          update_children = ...
>                                          __fsnotify_recalc_mask();
>                                          update_children = ...
>                                          spin_unlock();
>                                          // update_children is false
>                                        }
>                                        // returns to userspace, but
>                                        // not all children are marked
>     // continue updating flags
>    }
> }
>
> To prevent this situation, hold the directory inode lock. This ensures
> that any concurrent update to the mask will block until the update is
> complete, so that we can guarantee that child flags are set prior to
> returning.
>
> Since the directory inode lock is now held during iteration over
> d_subdirs, we are guaranteed that __d_move() cannot remove the dentry we
> hold, so we no longer need to check whether we should retry the
> iteration. We are also guaranteed that no cursors are moving through the
> list, since dcache_readdir() holds the inode read lock. Simplify the
> iteration by removing this logic.
>

I very much prefer to start the series with this patch and avoid
those details altogether.

> Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
> ---
>  fs/notify/fsnotify.c | 25 +++++++++----------------
>  fs/notify/mark.c     |  8 ++++++++
>  2 files changed, 17 insertions(+), 16 deletions(-)
>
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index 0ba61211456c..b5778775b88d 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -102,6 +102,8 @@ void fsnotify_sb_delete(struct super_block *sb)
>   * on a child we run all of our children and set a dentry flag saying that the
>   * parent cares.  Thus when an event happens on a child it can quickly tell
>   * if there is a need to find a parent and send the event to the parent.
> + *
> + * Context: inode locked exclusive
>   */
>  void fsnotify_update_children_dentry_flags(struct inode *inode, bool watched)
>  {
> @@ -124,22 +126,16 @@ void fsnotify_update_children_dentry_flags(struct inode *inode, bool watched)
>          * over d_subdirs which will allow us to sleep.
>          */
>         spin_lock(&alias->d_lock);
> -retry:
>         list_for_each_entry(child, &alias->d_subdirs, d_child) {
>                 /*
> -                * We need to hold a reference while we sleep. But we cannot
> -                * sleep holding a reference to a cursor, or we risk skipping
> -                * through the list.
> -                *
> -                * When we wake, dput() could free the dentry, invalidating the
> -                * list pointers.  We can't look at the list pointers until we
> -                * re-lock the parent, and we can't dput() once we have the
> -                * parent locked.  So the solution is to hold onto our reference
> -                * and free it the *next* time we drop alias->d_lock: either at
> -                * the end of the function, or at the time of the next sleep.
> +                * We need to hold a reference while we sleep. When we wake,
> +                * dput() could free the dentry, invalidating the list pointers.
> +                * We can't look at the list pointers until we re-lock the
> +                * parent, and we can't dput() once we have the parent locked.
> +                * So the solution is to hold onto our reference and free it the
> +                * *next* time we drop alias->d_lock: either at the end of the
> +                * function, or at the time of the next sleep.
>                  */
> -               if (child->d_flags & DCACHE_DENTRY_CURSOR)
> -                       continue;
>                 if (need_resched()) {
>                         dget(child);
>                         spin_unlock(&alias->d_lock);
> @@ -147,9 +143,6 @@ void fsnotify_update_children_dentry_flags(struct inode *inode, bool watched)
>                         last_ref = child;
>                         cond_resched();
>                         spin_lock(&alias->d_lock);
> -                       /* Check for races with __d_move() */
> -                       if (child->d_parent != alias)
> -                               goto retry;
>                 }
>
>                 /*
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index 6797a2952f87..f39cd88ad778 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -203,10 +203,15 @@ static void fsnotify_conn_set_children_dentry_flags(
>  void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>  {
>         bool update_children;
> +       struct inode *inode = NULL;
>
>         if (!conn)
>                 return;
>
> +       if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
> +               inode = fsnotify_conn_inode(conn);
> +               inode_lock(inode);
> +       }
>         spin_lock(&conn->lock);
>         update_children = !fsnotify_conn_watches_children(conn);
>         __fsnotify_recalc_mask(conn);
> @@ -219,6 +224,9 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>          */
>         if (update_children)
>                 fsnotify_conn_set_children_dentry_flags(conn);
> +
> +       if (inode)
> +               inode_unlock(inode);
>  }
>

Interesting.

I was imagining inode_lock taken only inside
fsnotify_conn_set_children_dentry_flags()

The reason is that removing the parent watch does not need
to be serialized for the lazy cleanup to work correctly.

Maybe I am missing something, or maybe it is just best practice
to serialize all parent state changes, to keep the mental model
of the code simpler and the same as the existing upstream one.

So I am NOT opposed to serializing fsnotify_recalc_mask();
I just wanted to hear opinions.

Thanks,
Amir.
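
A minimal sketch of the alternative Amir describes above, with the inode
lock pushed down into the helper (hypothetical; the helper body shown
here is assumed, not taken from the posted series):

  /* Hypothetical: serialize only the child dentry flag update itself. */
  static void fsnotify_conn_set_children_dentry_flags(
                                  struct fsnotify_mark_connector *conn)
  {
          struct inode *inode;

          if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
                  return;

          inode = fsnotify_conn_inode(conn);
          inode_lock(inode);
          /*
           * Only setting the flags needs the lock; clearing stale
           * PARENT_WATCHED flags can stay lazy, as noted above.
           */
          fsnotify_update_children_dentry_flags(inode, true);
          inode_unlock(inode);
  }

Note that this narrower lock by itself would not close the window in the
commit message (a second recalc could still see update_children == false
and return early), which may be why the posted version takes the lock
around the whole recalculation in fsnotify_recalc_mask().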

Patch

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 0ba61211456c..b5778775b88d 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -102,6 +102,8 @@  void fsnotify_sb_delete(struct super_block *sb)
  * on a child we run all of our children and set a dentry flag saying that the
  * parent cares.  Thus when an event happens on a child it can quickly tell
  * if there is a need to find a parent and send the event to the parent.
+ *
+ * Context: inode locked exclusive
  */
 void fsnotify_update_children_dentry_flags(struct inode *inode, bool watched)
 {
@@ -124,22 +126,16 @@  void fsnotify_update_children_dentry_flags(struct inode *inode, bool watched)
 	 * over d_subdirs which will allow us to sleep.
 	 */
 	spin_lock(&alias->d_lock);
-retry:
 	list_for_each_entry(child, &alias->d_subdirs, d_child) {
 		/*
-		 * We need to hold a reference while we sleep. But we cannot
-		 * sleep holding a reference to a cursor, or we risk skipping
-		 * through the list.
-		 *
-		 * When we wake, dput() could free the dentry, invalidating the
-		 * list pointers.  We can't look at the list pointers until we
-		 * re-lock the parent, and we can't dput() once we have the
-		 * parent locked.  So the solution is to hold onto our reference
-		 * and free it the *next* time we drop alias->d_lock: either at
-		 * the end of the function, or at the time of the next sleep.
+		 * We need to hold a reference while we sleep. When we wake,
+		 * dput() could free the dentry, invalidating the list pointers.
+		 * We can't look at the list pointers until we re-lock the
+		 * parent, and we can't dput() once we have the parent locked.
+		 * So the solution is to hold onto our reference and free it the
+		 * *next* time we drop alias->d_lock: either at the end of the
+		 * function, or at the time of the next sleep.
 		 */
-		if (child->d_flags & DCACHE_DENTRY_CURSOR)
-			continue;
 		if (need_resched()) {
 			dget(child);
 			spin_unlock(&alias->d_lock);
@@ -147,9 +143,6 @@  void fsnotify_update_children_dentry_flags(struct inode *inode, bool watched)
 			last_ref = child;
 			cond_resched();
 			spin_lock(&alias->d_lock);
-			/* Check for races with __d_move() */
-			if (child->d_parent != alias)
-				goto retry;
 		}
 
 		/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 6797a2952f87..f39cd88ad778 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -203,10 +203,15 @@  static void fsnotify_conn_set_children_dentry_flags(
 void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
 {
 	bool update_children;
+	struct inode *inode = NULL;
 
 	if (!conn)
 		return;
 
+	if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
+		inode = fsnotify_conn_inode(conn);
+		inode_lock(inode);
+	}
 	spin_lock(&conn->lock);
 	update_children = !fsnotify_conn_watches_children(conn);
 	__fsnotify_recalc_mask(conn);
@@ -219,6 +224,9 @@  void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
 	 */
 	if (update_children)
 		fsnotify_conn_set_children_dentry_flags(conn);
+
+	if (inode)
+		inode_unlock(inode);
 }
 
 /* Free all connectors queued for freeing once SRCU period ends */