
[v4] fs/namespace: defer RCU sync for MNT_DETACH umount

Message ID 20250408210350.749901-12-echanude@redhat.com (mailing list archive)
State New
Series [v4] fs/namespace: defer RCU sync for MNT_DETACH umount

Commit Message

Eric Chanudet April 8, 2025, 8:58 p.m. UTC
Defer releasing the detached file system in namespace_unlock() during a
lazy umount so the syscall returns faster.

When requesting MNT_DETACH, the caller does not expect the file system
to be shut down by the time the syscall returns. Calling
synchronize_rcu_expedited() has a significant cost on RT kernels, which
default to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
mounts on a separate list and release them from a workqueue after the
RCU grace period.

w/o patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )

w/ patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )

Signed-off-by: Alexander Larsson <alexl@redhat.com>
Signed-off-by: Lucas Karpinski <lkarpins@redhat.com>
Signed-off-by: Eric Chanudet <echanude@redhat.com>
---

Attempt to re-spin this series based on the feedback received in v3,
which pointed out the need to wait for the grace period in
namespace_unlock() before calling the deferred mntput().

v4:
- Use queue_rcu_work() to defer free_mounts() for lazy umounts
- Drop lazy_unlock global and refactor using a helper
v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/
- Removed unneeded code for lazy umount case.
- Don't block within interrupt context.
v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/
- Only defer releasing umount'ed filesystems for lazy umounts
v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/

 fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 7 deletions(-)
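
For orientation, here is a minimal sketch of the mechanism the commit
message describes, built on the rcu_work API from <linux/workqueue.h>.
The queue_deferred_free() helper name is illustrative; the actual patch
is quoted in full in the replies below and additionally falls back to
synchronize_rcu_expedited() when the allocation fails.

struct deferred_free_mounts {
	struct rcu_work rwork;		/* executes after an RCU grace period */
	struct hlist_head release_list;	/* detached mounts awaiting mntput() */
};

static void defer_free_mounts(struct work_struct *work)
{
	struct deferred_free_mounts *d;
	struct hlist_node *p;
	struct mount *m;

	d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
	hlist_for_each_entry_safe(m, p, &d->release_list, mnt_umount) {
		hlist_del(&m->mnt_umount);
		mntput(&m->mnt);	/* grace period has already elapsed */
	}
	kfree(d);
}

/* Illustrative helper: @head holds the already-detached mounts. */
static void queue_deferred_free(struct hlist_head *head)
{
	struct deferred_free_mounts *d = kmalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return;	/* the real patch waits synchronously in this case */

	hlist_move_list(head, &d->release_list);
	INIT_RCU_WORK(&d->rwork, defer_free_mounts);
	queue_rcu_work(system_wq, &d->rwork);
}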

Comments

Christian Brauner April 9, 2025, 10:37 a.m. UTC | #1
On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> Defer releasing the detached file-system when calling namespace_unlock()
> during a lazy umount to return faster.
> 
> When requesting MNT_DETACH, the caller does not expect the file-system
> to be shut down upon returning from the syscall. Calling
> synchronize_rcu_expedited() has a significant cost on RT kernel that
> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> mount in a separate list and put it on a workqueue to run post RCU
> grace-period.
> 
> w/o patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>     0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>     0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )
> 
> w/ patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>     0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>     0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )
> 
> Signed-off-by: Alexander Larsson <alexl@redhat.com>
> Signed-off-by: Lucas Karpinski <lkarpins@redhat.com>
> Signed-off-by: Eric Chanudet <echanude@redhat.com>
> ---
> 
> Attempt to re-spin this series based on the feedback received in v3 that
> pointed out the need to wait the grace-period in namespace_unlock()
> before calling the deferred mntput().

I still hate this with a passion because it adds another special-sauce
path into the unlock path. I've folded the following diff into it so it
at least doesn't start passing that pointless boolean and doesn't
introduce __namespace_unlock(). Just use a global variable and pick the
value off of it just as we do with the lists. Testing this now:

diff --git a/fs/namespace.c b/fs/namespace.c
index e5b0b920dd97..25599428706c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -82,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
 static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
-static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
-static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool unmounted_lazily;          /* protected by namespace_sem */
+static HLIST_HEAD(unmounted);          /* protected by namespace_sem */
+static LIST_HEAD(ex_mountpoints);      /* protected by namespace_sem */
 static DEFINE_SEQLOCK(mnt_ns_tree_lock);

 #ifdef CONFIG_FSNOTIFY
@@ -1807,17 +1808,18 @@ static void free_mounts(struct hlist_head *mount_list)

 static void defer_free_mounts(struct work_struct *work)
 {
-       struct deferred_free_mounts *d = container_of(
-               to_rcu_work(work), struct deferred_free_mounts, rwork);
+       struct deferred_free_mounts *d;

+       d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
        free_mounts(&d->release_list);
        kfree(d);
 }

-static void __namespace_unlock(bool lazy)
+static void namespace_unlock(void)
 {
        HLIST_HEAD(head);
        LIST_HEAD(list);
+       bool defer = unmounted_lazily;

        hlist_move_list(&unmounted, &head);
        list_splice_init(&ex_mountpoints, &list);
@@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)
        if (likely(hlist_empty(&head)))
                return;

-       if (lazy) {
-               struct deferred_free_mounts *d =
-                       kmalloc(sizeof(*d), GFP_KERNEL);
+       if (defer) {
+               struct deferred_free_mounts *d;

-               if (unlikely(!d))
-                       goto out;
-
-               hlist_move_list(&head, &d->release_list);
-               INIT_RCU_WORK(&d->rwork, defer_free_mounts);
-               queue_rcu_work(system_wq, &d->rwork);
-               return;
+               d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
+               if (d) {
+                       hlist_move_list(&head, &d->release_list);
+                       INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+                       queue_rcu_work(system_wq, &d->rwork);
+                       return;
+               }
        }
-
-out:
        synchronize_rcu_expedited();
        free_mounts(&head);
 }

-static inline void namespace_unlock(void)
-{
-       __namespace_unlock(false);
-}
-
 static inline void namespace_lock(void)
 {
        down_write(&namespace_sem);
@@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags)
        }
 out:
        unlock_mount_hash();
-       __namespace_unlock(flags & MNT_DETACH);
+       namespace_unlock();
        return retval;
 }


> 
> v4:
> - Use queue_rcu_work() to defer free_mounts() for lazy umounts
> - Drop lazy_unlock global and refactor using a helper
> v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/
> - Removed unneeded code for lazy umount case.
> - Don't block within interrupt context.
> v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/
> - Only defer releasing umount'ed filesystems for lazy umounts
> v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
> 
>  fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 45 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 14935a0500a2..e5b0b920dd97 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
>  static unsigned int mp_hash_mask __ro_after_init;
>  static unsigned int mp_hash_shift __ro_after_init;
>  
> +struct deferred_free_mounts {
> +	struct rcu_work rwork;
> +	struct hlist_head release_list;
> +};
> +
>  static __initdata unsigned long mhash_entries;
>  static int __init set_mhash_entries(char *str)
>  {
> @@ -1789,11 +1794,29 @@ static bool need_notify_mnt_list(void)
>  }
>  #endif
>  
> -static void namespace_unlock(void)
> +static void free_mounts(struct hlist_head *mount_list)
>  {
> -	struct hlist_head head;
>  	struct hlist_node *p;
>  	struct mount *m;
> +
> +	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
> +		hlist_del(&m->mnt_umount);
> +		mntput(&m->mnt);
> +	}
> +}
> +
> +static void defer_free_mounts(struct work_struct *work)
> +{
> +	struct deferred_free_mounts *d = container_of(
> +		to_rcu_work(work), struct deferred_free_mounts, rwork);
> +
> +	free_mounts(&d->release_list);
> +	kfree(d);
> +}
> +
> +static void __namespace_unlock(bool lazy)
> +{
> +	HLIST_HEAD(head);
>  	LIST_HEAD(list);
>  
>  	hlist_move_list(&unmounted, &head);
> @@ -1817,12 +1840,27 @@ static void namespace_unlock(void)
>  	if (likely(hlist_empty(&head)))
>  		return;
>  
> -	synchronize_rcu_expedited();
> +	if (lazy) {
> +		struct deferred_free_mounts *d =
> +			kmalloc(sizeof(*d), GFP_KERNEL);
>  
> -	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> -		hlist_del(&m->mnt_umount);
> -		mntput(&m->mnt);
> +		if (unlikely(!d))
> +			goto out;
> +
> +		hlist_move_list(&head, &d->release_list);
> +		INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> +		queue_rcu_work(system_wq, &d->rwork);
> +		return;
>  	}
> +
> +out:
> +	synchronize_rcu_expedited();
> +	free_mounts(&head);
> +}
> +
> +static inline void namespace_unlock(void)
> +{
> +	__namespace_unlock(false);
>  }
>  
>  static inline void namespace_lock(void)
> @@ -2056,7 +2094,7 @@ static int do_umount(struct mount *mnt, int flags)
>  	}
>  out:
>  	unlock_mount_hash();
> -	namespace_unlock();
> +	__namespace_unlock(flags & MNT_DETACH);
>  	return retval;
>  }
>  
> -- 
> 2.49.0
>
Mateusz Guzik April 9, 2025, 1:04 p.m. UTC | #2
On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> Defer releasing the detached file-system when calling namespace_unlock()
> during a lazy umount to return faster.
> 
> When requesting MNT_DETACH, the caller does not expect the file-system
> to be shut down upon returning from the syscall. Calling
> synchronize_rcu_expedited() has a significant cost on RT kernel that
> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> mount in a separate list and put it on a workqueue to run post RCU
> grace-period.
> 
> w/o patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>     0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>     0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )
> 
> w/ patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>     0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>     0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )
> 

Christian wants the patch done differently and posted his diff, so I'm
not going to comment on it.

I do have some feedback about the commit message though.

In v1 it points out a real user that runs into it, while this one does
not. So I would rewrite this and put in bench results from the actual
consumer -- as it is, one is left to wonder why patching up lazy unmount
is of any significance.

I had to look up what rcupdate.rcu_normal_after_boot=1 is. Docs claim it
makes everyone use normal grace periods, which explains the difference.
But without that, one is left to wonder if perhaps there is a perf bug in
RCU instead, where this is taking longer than it should despite the
option. Thus I would also note how the delay shows up.

v1 for reference:
> v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
Sebastian Andrzej Siewior April 9, 2025, 1:14 p.m. UTC | #3
On 2025-04-09 12:37:06 [+0200], Christian Brauner wrote:
> I still hate this with a passion because it adds another special-sauce
> path into the unlock path. I've folded the following diff into it so it
> at least doesn't start passing that pointless boolean and doesn't
> introduce __namespace_unlock(). Just use a global variable and pick the
> value off of it just as we do with the lists. Testing this now:

I tried to apply this on top of the previous one but all the chunks
failed.

One question: Do we need this lazy/MNT_DETACH case? Couldn't we handle
them all via queue_rcu_work()?
If so, couldn't we make deferred_free_mounts global and have two
release lists, say release_list and release_list_next_gp? The first one
would be used if queue_rcu_work() returns true, otherwise the second.
Then once defer_free_mounts() is done and release_list_next_gp is not
empty, it would move release_list_next_gp -> release_list and invoke
queue_rcu_work().
This would avoid the kmalloc, synchronize_rcu_expedited() and the
special-sauce.
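
To make that concrete, a rough sketch of one way such a two-list scheme
could look. The names (deferred_mounts_lock, deferred_mounts_work,
queue_deferred_mounts()) and the locking are hypothetical, and it is
simplified so that newly detached mounts always land on the
next-grace-period list, which can cost an extra grace period; as noted
further down the thread, all final mntput()s then go through a single
work item.

static DEFINE_SPINLOCK(deferred_mounts_lock);
static HLIST_HEAD(release_list);		/* covered by the queued grace period */
static HLIST_HEAD(release_list_next_gp);	/* waits for the next grace period */
static struct rcu_work deferred_mounts_work;	/* INIT_RCU_WORK(..., defer_free_mounts) at boot */

static void defer_free_mounts(struct work_struct *work)
{
	HLIST_HEAD(head);
	struct hlist_node *p;
	struct mount *m;

	/* Everything on release_list has had its grace period by now. */
	spin_lock(&deferred_mounts_lock);
	hlist_move_list(&release_list, &head);
	spin_unlock(&deferred_mounts_lock);

	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
		hlist_del(&m->mnt_umount);
		mntput(&m->mnt);
	}

	/* Promote entries that arrived in the meantime, if any. */
	spin_lock(&deferred_mounts_lock);
	if (!hlist_empty(&release_list_next_gp) &&
	    queue_rcu_work(system_unbound_wq, &deferred_mounts_work)) {
		/* A fresh grace period starts now and covers these. */
		hlist_move_list(&release_list_next_gp, &release_list);
	}
	spin_unlock(&deferred_mounts_lock);
}

/* Producer side, called from namespace_unlock() with @head already detached. */
static void queue_deferred_mounts(struct hlist_head *head)
{
	struct hlist_node *p;
	struct mount *m;

	spin_lock(&deferred_mounts_lock);
	hlist_for_each_entry_safe(m, p, head, mnt_umount) {
		hlist_del(&m->mnt_umount);
		hlist_add_head(&m->mnt_umount, &release_list_next_gp);
	}
	/* Make sure a worker is (or will be) on its way to pick these up. */
	queue_rcu_work(system_unbound_wq, &deferred_mounts_work);
	spin_unlock(&deferred_mounts_lock);
}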

> diff --git a/fs/namespace.c b/fs/namespace.c
> index e5b0b920dd97..25599428706c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)
> …
> +               d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
> +               if (d) {
> +                       hlist_move_list(&head, &d->release_list);
> +                       INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> +                       queue_rcu_work(system_wq, &d->rwork);

Couldn't we do system_unbound_wq?

Sebastian
Mateusz Guzik April 9, 2025, 2:02 p.m. UTC | #4
On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> them all via queue_rcu_work()?
> If so, couldn't we have make deferred_free_mounts global and have two
> release_list, say release_list and release_list_next_gp? The first one
> will be used if queue_rcu_work() returns true, otherwise the second.
> Then once defer_free_mounts() is done and release_list_next_gp not
> empty, it would move release_list_next_gp -> release_list and invoke
> queue_rcu_work().
> This would avoid the kmalloc, synchronize_rcu_expedited() and the
> special-sauce.
> 

To my understanding it was preferred for non-lazy unmount consumers to
wait until the mntput before returning.

That aside, if I understood your approach, it would de facto serialize
all of these?

As in with the posted patches you can have different worker threads
progress in parallel as they all get a private list to iterate.

With your proposal only one can do any work.

One has to assume with sufficient mount/unmount traffic this can
eventually get into trouble.
Sebastian Andrzej Siewior April 9, 2025, 2:25 p.m. UTC | #5
On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > them all via queue_rcu_work()?
> > If so, couldn't we have make deferred_free_mounts global and have two
> > release_list, say release_list and release_list_next_gp? The first one
> > will be used if queue_rcu_work() returns true, otherwise the second.
> > Then once defer_free_mounts() is done and release_list_next_gp not
> > empty, it would move release_list_next_gp -> release_list and invoke
> > queue_rcu_work().
> > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > special-sauce.
> > 
> 
> To my understanding it was preferred for non-lazy unmount consumers to
> wait until the mntput before returning.
> 
> That aside if I understood your approach it would de facto serialize all
> of these?
> 
> As in with the posted patches you can have different worker threads
> progress in parallel as they all get a private list to iterate.
> 
> With your proposal only one can do any work.
> 
> One has to assume with sufficient mount/unmount traffic this can
> eventually get into trouble.

Right, it would serialize them within the same worker thread. With one
worker for each put you would schedule multiple workers from the RCU
callback. Given the system_wq you will schedule them all on the CPU
which invokes the RCU callback. This kind of serializes it, too.

The mntput() callback uses spinlock_t for locking and then it frees
resources. It does not look like it waits for anything or takes ages.
So it might not be needed to split each put into its own worker on a
different CPU… One busy bee might be enough ;)

Sebastian
Christian Brauner April 9, 2025, 4:04 p.m. UTC | #6
On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > > them all via queue_rcu_work()?
> > > If so, couldn't we have make deferred_free_mounts global and have two
> > > release_list, say release_list and release_list_next_gp? The first one
> > > will be used if queue_rcu_work() returns true, otherwise the second.
> > > Then once defer_free_mounts() is done and release_list_next_gp not
> > > empty, it would move release_list_next_gp -> release_list and invoke
> > > queue_rcu_work().
> > > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > > special-sauce.
> > > 
> > 
> > To my understanding it was preferred for non-lazy unmount consumers to
> > wait until the mntput before returning.
> > 
> > That aside if I understood your approach it would de facto serialize all
> > of these?
> > 
> > As in with the posted patches you can have different worker threads
> > progress in parallel as they all get a private list to iterate.
> > 
> > With your proposal only one can do any work.
> > 
> > One has to assume with sufficient mount/unmount traffic this can
> > eventually get into trouble.
> 
> Right, it would serialize them within the same worker thread. With one
> worker for each put you would schedule multiple worker from the RCU
> callback. Given the system_wq you will schedule them all on the CPU
> which invokes the RCU callback. This kind of serializes it, too.
> 
> The mntput() callback uses spinlock_t for locking and then it frees
> resources. It does not look like it waits for something nor takes ages.
> So it might not be needed to split each put into its own worker on a
> different CPU… One busy bee might be enough ;)

Unmounting can trigger a very large number of mounts to be unmounted. If
you're on a container-heavy system, or have services that all propagate
to each other in different mount namespaces, mount propagation will
generate a ton of umounts. So this cannot be underestimated.

If a mount tree is torn down without MNT_DETACH it will pass UMOUNT_SYNC
to umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all
mounts during the unmount.

If a concurrent path lookup calls legitimize_mnt() on such a mount and
sees that MNT_SYNC_UMOUNT is set, it will back off as it knows that the
concurrent unmounter holds the last reference, and __legitimize_mnt()
can thus simply drop the reference count. The final mntput() will be
done by the umounter.

The synchronize_rcu() call in namespace_unlock() ensures that the
last mntput() doesn't happen until path walking has dropped out of RCU
mode.

Without it, it's possible that a non-MNT_DETACH umounter gets a spurious
EBUSY error because a concurrent lazy path walk suddenly puts the
last reference via mntput().

I'm unclear how that's handled in whatever it is you're proposing.
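
For illustration, a simplified sketch of the walker-side check being
described. The helper name rcu_walk_grab_mount_ref() is made up and this
is not the actual __legitimize_mnt() code, just the behaviour described
above: the RCU-mode walker speculatively takes a reference, backs off
when it catches a synchronous unmount, and leaves the final mntput() to
the umounter, which only does it after the grace period.

static bool rcu_walk_grab_mount_ref(struct mount *mnt)
{
	mnt_add_count(mnt, 1);				/* speculative reference */
	if (READ_ONCE(mnt->mnt.mnt_flags) & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);			/* umounter holds the last ref */
		return false;				/* bail out of the RCU walk */
	}
	return true;
}
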
Eric Chanudet April 9, 2025, 4:08 p.m. UTC | #7
> > > On Wed, Apr 09, 2025 at 12:37:06PM +0200, Christian Brauner wrote:
> > > > On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> > > > > Attempt to re-spin this series based on the feedback received in v3 that
> > > > > pointed out the need to wait the grace-period in namespace_unlock()
> > > > > before calling the deferred mntput().
> > > > 
> > > > I still hate this with a passion because it adds another special-sauce
> > > > path into the unlock path. I've folded the following diff into it so it
> > > > at least doesn't start passing that pointless boolean and doesn't
> > > > introduce __namespace_unlock(). Just use a global variable and pick the
> > > > value off of it just as we do with the lists. Testing this now:

My apologies, I went with the feedback from v3[1] and failed to parse
the context surrounding it.

[1] https://lore.kernel.org/all/Znx-WGU5Wx6RaJyD@casper.infradead.org/

> > > > @@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags)
> > > >         }
> > > >  out:
> > > >         unlock_mount_hash();
> > > > -       __namespace_unlock(flags & MNT_DETACH);
> > > > +       namespace_unlock();
> > > >         return retval;
> > > >  }
> > > > 
> > > > 

I believe you skipped setting unmounted_lazily in this hunk?
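
Presumably something along these lines in do_umount(), while
namespace_sem is still held (a sketch; the exact placement is assumed):

	/* Sketch: record that this unmount is lazy so that namespace_unlock()
	 * defers the final mntput()s instead of waiting for the grace period. */
	if (flags & MNT_DETACH)
		unmounted_lazily = true;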

With this, I have applied your patch for the following discussion and
down thread. Happy to send a v5, should this patch be deemed worth
pursuing.

On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > > them all via queue_rcu_work()?
> > > If so, couldn't we have make deferred_free_mounts global and have two
> > > release_list, say release_list and release_list_next_gp? The first one
> > > will be used if queue_rcu_work() returns true, otherwise the second.
> > > Then once defer_free_mounts() is done and release_list_next_gp not
> > > empty, it would move release_list_next_gp -> release_list and invoke
> > > queue_rcu_work().
> > > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > > special-sauce.
> > > 
> > 
> > To my understanding it was preferred for non-lazy unmount consumers to
> > wait until the mntput before returning.

Unless I misunderstand the statement, and from the previous thread[2],
this is a requirement of the user API.

[2] https://lore.kernel.org/all/Y8m+M%2FffIEEWbfmv@ZenIV/

> > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > On 2025-04-09 12:37:06 [+0200], Christian Brauner wrote:
> > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > index e5b0b920dd97..25599428706c 100644
> > > > --- a/fs/namespace.c
> > > > +++ b/fs/namespace.c
> > > > @@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)
> > > …
> > > > +               d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
> > > > +               if (d) {
> > > > +                       hlist_move_list(&head, &d->release_list);
> > > > +                       INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> > > > +                       queue_rcu_work(system_wq, &d->rwork);
> > > 
> > > Couldn't we do system_unbound_wq?

I think we can, afaict we don't need locality? I'll run some tests with
system_unbound_wq.

Thanks,
Christian Brauner April 9, 2025, 4:09 p.m. UTC | #8
On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-09 12:37:06 [+0200], Christian Brauner wrote:
> > I still hate this with a passion because it adds another special-sauce
> > path into the unlock path. I've folded the following diff into it so it
> > at least doesn't start passing that pointless boolean and doesn't
> > introduce __namespace_unlock(). Just use a global variable and pick the
> > value off of it just as we do with the lists. Testing this now:
> 
> I tried to apply this on top of the previous one but it all chunks
> failed.

See https://web.git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=vfs-6.16.mount
Eric Chanudet April 9, 2025, 4:41 p.m. UTC | #9
On Wed, Apr 09, 2025 at 03:04:43PM +0200, Mateusz Guzik wrote:
> On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> > Defer releasing the detached file-system when calling namespace_unlock()
> > during a lazy umount to return faster.
> > 
> > When requesting MNT_DETACH, the caller does not expect the file-system
> > to be shut down upon returning from the syscall. Calling
> > synchronize_rcu_expedited() has a significant cost on RT kernel that
> > defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> > mount in a separate list and put it on a workqueue to run post RCU
> > grace-period.
> > 
> > w/o patch, 6.15-rc1 PREEMPT_RT:
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
> >     0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
> >     0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )
> > 
> > w/ patch, 6.15-rc1 PREEMPT_RT:
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
> >     0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
> >     0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )
> > 
> 
> Christian wants the patch done differently and posted his diff, so I'm
> not going to comment on it.
> 
> I do have some feedback about the commit message though.
> 
> In v1 it points out a real user which runs into it, while this one does
> not. So I would rewrite this and put in bench results from the actual
> consumer -- as it is one is left to wonder why patching up lazy unmount
> is of any significance.

Certainly. Doing the test mentioned in v1 again with v4+Christian's
suggested changes:
- QEMU x86_64, 8cpus, PREEMPT_RT, w/o patch:
# perf stat -r 10 --table --null -- crun run test
    0.07584 +- 0.00440 seconds time elapsed  ( +-  5.80% )
- QEMU x86_64, 8cpus, PREEMPT_RT, w/ patch:
# perf stat -r 10 --table --null -- crun run test
    0.01421 +- 0.00387 seconds time elapsed  ( +- 27.26% )

I will add that to the commit message.

> I had to look up what rcupdate.rcu_normal_after_boot=1 is. Docs claim it
> makes everyone use normal grace-periods, which explains the difference.
> But without that one is left to wonder if perhaps there is a perf bug in
> RCU instead where this is taking longer than it should despite the
> option. Thus I would also denote how the delay shows up.

I tried the test above while trying to force expedited RCU on the
cmdline with:
    rcupdate.rcu_normal_after_boot=0 rcupdate.rcu_expedited=1

Unfortunately, rcupdate.rcu_normal_after_boot=0 has no effect and
rcupdate_announce_bootup_oddness() reports:
[    0.015251] 	No expedited grace period (rcu_normal_after_boot).

Which yielded similar results:
- QEMU x86_64, 8cpus, PREEMPT_RT, w/o patch:
# perf stat -r 10 --table --null -- crun run test
    0.07838 +- 0.00322 seconds time elapsed  ( +-  4.11% )
- QEMU x86_64, 8cpus, PREEMPT_RT, w/ patch:
# perf stat -r 10 --table --null -- crun run test
    0.01582 +- 0.00353 seconds time elapsed  ( +- 22.30% )

I don't think rcupdate.rcu_expedited=1 had an effect, but I have not
confirmed that yet.

> v1 for reference:
> > v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
Ian Kent April 10, 2025, 1:17 a.m. UTC | #10
On 9/4/25 18:37, Christian Brauner wrote:
> On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
>> Defer releasing the detached file-system when calling namespace_unlock()
>> during a lazy umount to return faster.
>>
>> When requesting MNT_DETACH, the caller does not expect the file-system
>> to be shut down upon returning from the syscall. Calling
>> synchronize_rcu_expedited() has a significant cost on RT kernel that
>> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
>> mount in a separate list and put it on a workqueue to run post RCU
>> grace-period.
>>
>> w/o patch, 6.15-rc1 PREEMPT_RT:
>> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>>      0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
>> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>>      0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )
>>
>> w/ patch, 6.15-rc1 PREEMPT_RT:
>> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>>      0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
>> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>>      0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )
>>
>> Signed-off-by: Alexander Larsson <alexl@redhat.com>
>> Signed-off-by: Lucas Karpinski <lkarpins@redhat.com>
>> Signed-off-by: Eric Chanudet <echanude@redhat.com>
>> ---
>>
>> Attempt to re-spin this series based on the feedback received in v3 that
>> pointed out the need to wait the grace-period in namespace_unlock()
>> before calling the deferred mntput().
> I still hate this with a passion because it adds another special-sauce
> path into the unlock path. I've folded the following diff into it so it
> at least doesn't start passing that pointless boolean and doesn't
> introduce __namespace_unlock(). Just use a global variable and pick the
> value off of it just as we do with the lists. Testing this now:

Yeah, it's painful, that's for sure.

I also agree with you about the parameter, changing the call signature
always rubbed me the wrong way, but I didn't push back on it mostly
because we needed to find a way to do it sensibly and it sounds like
that's still the case.

AFAICT what's needed is a way to synchronize umount with the lockless
path walk. Right now umount detaches the mounts concerned, then calls
the RCU synchronize (essentially sleeps) to ensure that any lockless
path walks see the umount before completing. But that RCU sync is, as we
can see, really wasteful, so we do need to find a viable way to
synchronize this.

Strictly speaking the synchronization problem exists for normal and
detached umounts, but if we can find a sound solution for detached
mounts perhaps we can extend it later (though right now that seems like
a stretch) ...

I'm not sure why, perhaps it's just me, but with this we don't seem to
be working well together to find a solution; I hope we can change that
this time around.

I was thinking of using a completion for this synchronization, but even
that would be messy because of the possibility of multiple processes
doing walks at the same time, which doesn't lend itself cleanly to using
a completion.

Do you have any ideas on how this could be done yourself?

Ian

>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index e5b0b920dd97..25599428706c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -82,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init;
>   static struct hlist_head *mountpoint_hashtable __ro_after_init;
>   static struct kmem_cache *mnt_cache __ro_after_init;
>   static DECLARE_RWSEM(namespace_sem);
> -static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
> -static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> +static bool unmounted_lazily;          /* protected by namespace_sem */
> +static HLIST_HEAD(unmounted);          /* protected by namespace_sem */
> +static LIST_HEAD(ex_mountpoints);      /* protected by namespace_sem */
>   static DEFINE_SEQLOCK(mnt_ns_tree_lock);
>
>   #ifdef CONFIG_FSNOTIFY
> @@ -1807,17 +1808,18 @@ static void free_mounts(struct hlist_head *mount_list)
>
>   static void defer_free_mounts(struct work_struct *work)
>   {
> -       struct deferred_free_mounts *d = container_of(
> -               to_rcu_work(work), struct deferred_free_mounts, rwork);
> +       struct deferred_free_mounts *d;
>
> +       d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
>          free_mounts(&d->release_list);
>          kfree(d);
>   }
>
> -static void __namespace_unlock(bool lazy)
> +static void namespace_unlock(void)
>   {
>          HLIST_HEAD(head);
>          LIST_HEAD(list);
> +       bool defer = unmounted_lazily;
>
>          hlist_move_list(&unmounted, &head);
>          list_splice_init(&ex_mountpoints, &list);
> @@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)
>          if (likely(hlist_empty(&head)))
>                  return;
>
> -       if (lazy) {
> -               struct deferred_free_mounts *d =
> -                       kmalloc(sizeof(*d), GFP_KERNEL);
> +       if (defer) {
> +               struct deferred_free_mounts *d;
>
> -               if (unlikely(!d))
> -                       goto out;
> -
> -               hlist_move_list(&head, &d->release_list);
> -               INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> -               queue_rcu_work(system_wq, &d->rwork);
> -               return;
> +               d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
> +               if (d) {
> +                       hlist_move_list(&head, &d->release_list);
> +                       INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> +                       queue_rcu_work(system_wq, &d->rwork);
> +                       return;
> +               }
>          }
> -
> -out:
>          synchronize_rcu_expedited();
>          free_mounts(&head);
>   }
>
> -static inline void namespace_unlock(void)
> -{
> -       __namespace_unlock(false);
> -}
> -
>   static inline void namespace_lock(void)
>   {
>          down_write(&namespace_sem);
> @@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags)
>          }
>   out:
>          unlock_mount_hash();
> -       __namespace_unlock(flags & MNT_DETACH);
> +       namespace_unlock();
>          return retval;
>   }
>
>
>> v4:
>> - Use queue_rcu_work() to defer free_mounts() for lazy umounts
>> - Drop lazy_unlock global and refactor using a helper
>> v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/
>> - Removed unneeded code for lazy umount case.
>> - Don't block within interrupt context.
>> v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/
>> - Only defer releasing umount'ed filesystems for lazy umounts
>> v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
>>
>>   fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++-------
>>   1 file changed, 45 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 14935a0500a2..e5b0b920dd97 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
>>   static unsigned int mp_hash_mask __ro_after_init;
>>   static unsigned int mp_hash_shift __ro_after_init;
>>   
>> +struct deferred_free_mounts {
>> +	struct rcu_work rwork;
>> +	struct hlist_head release_list;
>> +};
>> +
>>   static __initdata unsigned long mhash_entries;
>>   static int __init set_mhash_entries(char *str)
>>   {
>> @@ -1789,11 +1794,29 @@ static bool need_notify_mnt_list(void)
>>   }
>>   #endif
>>   
>> -static void namespace_unlock(void)
>> +static void free_mounts(struct hlist_head *mount_list)
>>   {
>> -	struct hlist_head head;
>>   	struct hlist_node *p;
>>   	struct mount *m;
>> +
>> +	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
>> +		hlist_del(&m->mnt_umount);
>> +		mntput(&m->mnt);
>> +	}
>> +}
>> +
>> +static void defer_free_mounts(struct work_struct *work)
>> +{
>> +	struct deferred_free_mounts *d = container_of(
>> +		to_rcu_work(work), struct deferred_free_mounts, rwork);
>> +
>> +	free_mounts(&d->release_list);
>> +	kfree(d);
>> +}
>> +
>> +static void __namespace_unlock(bool lazy)
>> +{
>> +	HLIST_HEAD(head);
>>   	LIST_HEAD(list);
>>   
>>   	hlist_move_list(&unmounted, &head);
>> @@ -1817,12 +1840,27 @@ static void namespace_unlock(void)
>>   	if (likely(hlist_empty(&head)))
>>   		return;
>>   
>> -	synchronize_rcu_expedited();
>> +	if (lazy) {
>> +		struct deferred_free_mounts *d =
>> +			kmalloc(sizeof(*d), GFP_KERNEL);
>>   
>> -	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>> -		hlist_del(&m->mnt_umount);
>> -		mntput(&m->mnt);
>> +		if (unlikely(!d))
>> +			goto out;
>> +
>> +		hlist_move_list(&head, &d->release_list);
>> +		INIT_RCU_WORK(&d->rwork, defer_free_mounts);
>> +		queue_rcu_work(system_wq, &d->rwork);
>> +		return;
>>   	}
>> +
>> +out:
>> +	synchronize_rcu_expedited();
>> +	free_mounts(&head);
>> +}
>> +
>> +static inline void namespace_unlock(void)
>> +{
>> +	__namespace_unlock(false);
>>   }
>>   
>>   static inline void namespace_lock(void)
>> @@ -2056,7 +2094,7 @@ static int do_umount(struct mount *mnt, int flags)
>>   	}
>>   out:
>>   	unlock_mount_hash();
>> -	namespace_unlock();
>> +	__namespace_unlock(flags & MNT_DETACH);
>>   	return retval;
>>   }
>>   
>> -- 
>> 2.49.0
>>
Ian Kent April 10, 2025, 3:04 a.m. UTC | #11
On 10/4/25 00:04, Christian Brauner wrote:
> On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
>> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
>>> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
>>>> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
>>>> them all via queue_rcu_work()?
>>>> If so, couldn't we have make deferred_free_mounts global and have two
>>>> release_list, say release_list and release_list_next_gp? The first one
>>>> will be used if queue_rcu_work() returns true, otherwise the second.
>>>> Then once defer_free_mounts() is done and release_list_next_gp not
>>>> empty, it would move release_list_next_gp -> release_list and invoke
>>>> queue_rcu_work().
>>>> This would avoid the kmalloc, synchronize_rcu_expedited() and the
>>>> special-sauce.
>>>>
>>> To my understanding it was preferred for non-lazy unmount consumers to
>>> wait until the mntput before returning.
>>>
>>> That aside if I understood your approach it would de facto serialize all
>>> of these?
>>>
>>> As in with the posted patches you can have different worker threads
>>> progress in parallel as they all get a private list to iterate.
>>>
>>> With your proposal only one can do any work.
>>>
>>> One has to assume with sufficient mount/unmount traffic this can
>>> eventually get into trouble.
>> Right, it would serialize them within the same worker thread. With one
>> worker for each put you would schedule multiple worker from the RCU
>> callback. Given the system_wq you will schedule them all on the CPU
>> which invokes the RCU callback. This kind of serializes it, too.
>>
>> The mntput() callback uses spinlock_t for locking and then it frees
>> resources. It does not look like it waits for something nor takes ages.
>> So it might not be needed to split each put into its own worker on a
>> different CPU… One busy bee might be enough ;)
> Unmounting can trigger very large number of mounts to be unmounted. If
> you're on a container heavy system or services that all propagate to
> each other in different mount namespaces mount propagation will generate
> a ton of umounts. So this cannot be underestimated.

Indeed yes, or shutting down autofs when it's using a direct mount map
with a few hundred (or thousand) entries.

For my part it's things like the targeted mount information and mounted
mounts listing system calls (done again recently by Miklos) and this
slow umount that are, IMHO, really important.

Ian

>
> If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
> umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
> during the unmount.
>
> If a concurrent path lookup calls legitimize_mnt() on such a mount and
> sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
> concurrent unmounter hold the last reference and it __legitimize_mnt()
> can thus simply drop the reference count. The final mntput() will be
> done by the umounter.
>
> The synchronize_rcu() call in namespace_unlock() takes care that the
> last mntput() doesn't happen until path walking has dropped out of RCU
> mode.
>
> Without it it's possible that a non-MNT_DETACH umounter gets a spurious
> EBUSY error because a concurrent lazy path walk will suddenly put the
> last reference via mntput().
>
> I'm unclear how that's handled in whatever it is you're proposing.
>
Sebastian Andrzej Siewior April 10, 2025, 8:28 a.m. UTC | #12
On 2025-04-09 18:04:21 [+0200], Christian Brauner wrote:
> On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> > > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > > > them all via queue_rcu_work()?
> > > > If so, couldn't we have make deferred_free_mounts global and have two
> > > > release_list, say release_list and release_list_next_gp? The first one
> > > > will be used if queue_rcu_work() returns true, otherwise the second.
> > > > Then once defer_free_mounts() is done and release_list_next_gp not
> > > > empty, it would move release_list_next_gp -> release_list and invoke
> > > > queue_rcu_work().
> > > > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > > > special-sauce.
> > > > 
> > > 
> > > To my understanding it was preferred for non-lazy unmount consumers to
> > > wait until the mntput before returning.
> > > 
> > > That aside if I understood your approach it would de facto serialize all
> > > of these?
> > > 
> > > As in with the posted patches you can have different worker threads
> > > progress in parallel as they all get a private list to iterate.
> > > 
> > > With your proposal only one can do any work.
> > > 
> > > One has to assume with sufficient mount/unmount traffic this can
> > > eventually get into trouble.
> > 
> > Right, it would serialize them within the same worker thread. With one
> > worker for each put you would schedule multiple worker from the RCU
> > callback. Given the system_wq you will schedule them all on the CPU
> > which invokes the RCU callback. This kind of serializes it, too.
> > 
> > The mntput() callback uses spinlock_t for locking and then it frees
> > resources. It does not look like it waits for something nor takes ages.
> > So it might not be needed to split each put into its own worker on a
> > different CPU… One busy bee might be enough ;)
> 
> Unmounting can trigger very large number of mounts to be unmounted. If
> you're on a container heavy system or services that all propagate to
> each other in different mount namespaces mount propagation will generate
> a ton of umounts. So this cannot be underestimated.

So you want to have two of these unmounts in two workers so you can split
them onto two CPUs in the best case. As of today, in order to get through
with umounts asap you accelerate the grace period. And after the wakeup
it may utilize more than one CPU.

> If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
> umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
> during the unmount.
> 
> If a concurrent path lookup calls legitimize_mnt() on such a mount and
> sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
> concurrent unmounter hold the last reference and it __legitimize_mnt()
> can thus simply drop the reference count. The final mntput() will be
> done by the umounter.
> 
> The synchronize_rcu() call in namespace_unlock() takes care that the
> last mntput() doesn't happen until path walking has dropped out of RCU
> mode.
> 
> Without it it's possible that a non-MNT_DETACH umounter gets a spurious
> EBUSY error because a concurrent lazy path walk will suddenly put the
> last reference via mntput().
> 
> I'm unclear how that's handled in whatever it is you're proposing.

Okay. So we can't do this for UMOUNT_SYNC callers, thank you for the
explanation. We could avoid the memory allocation and have one worker
take care of them all, but you are worried about what this would mean
for huge containers. Understandable. The s/system_wq/system_unbound_wq/
switch would make sense.

Sebastian
Christian Brauner April 10, 2025, 10:48 a.m. UTC | #13
On Thu, Apr 10, 2025 at 10:28:33AM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-09 18:04:21 [+0200], Christian Brauner wrote:
> > On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
> > > On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> > > > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > > > > them all via queue_rcu_work()?
> > > > > If so, couldn't we have make deferred_free_mounts global and have two
> > > > > release_list, say release_list and release_list_next_gp? The first one
> > > > > will be used if queue_rcu_work() returns true, otherwise the second.
> > > > > Then once defer_free_mounts() is done and release_list_next_gp not
> > > > > empty, it would move release_list_next_gp -> release_list and invoke
> > > > > queue_rcu_work().
> > > > > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > > > > special-sauce.
> > > > > 
> > > > 
> > > > To my understanding it was preferred for non-lazy unmount consumers to
> > > > wait until the mntput before returning.
> > > > 
> > > > That aside if I understood your approach it would de facto serialize all
> > > > of these?
> > > > 
> > > > As in with the posted patches you can have different worker threads
> > > > progress in parallel as they all get a private list to iterate.
> > > > 
> > > > With your proposal only one can do any work.
> > > > 
> > > > One has to assume with sufficient mount/unmount traffic this can
> > > > eventually get into trouble.
> > > 
> > > Right, it would serialize them within the same worker thread. With one
> > > worker for each put you would schedule multiple worker from the RCU
> > > callback. Given the system_wq you will schedule them all on the CPU
> > > which invokes the RCU callback. This kind of serializes it, too.
> > > 
> > > The mntput() callback uses spinlock_t for locking and then it frees
> > > resources. It does not look like it waits for something nor takes ages.
> > > So it might not be needed to split each put into its own worker on a
> > > different CPU… One busy bee might be enough ;)
> > 
> > Unmounting can trigger very large number of mounts to be unmounted. If
> > you're on a container heavy system or services that all propagate to
> > each other in different mount namespaces mount propagation will generate
> > a ton of umounts. So this cannot be underestimated.
> 
> So you want to have two of these unmounts in two worker so you can split
> them on two CPUs in best case. As of today, in order to get through with
> umounts asap you accelerate the grace period. And after the wake up may
> utilize more than one CPU.
> 
> > If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
> > umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
> > during the unmount.
> > 
> > If a concurrent path lookup calls legitimize_mnt() on such a mount and
> > sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
> > concurrent unmounter hold the last reference and it __legitimize_mnt()
> > can thus simply drop the reference count. The final mntput() will be
> > done by the umounter.
> > 
> > The synchronize_rcu() call in namespace_unlock() takes care that the
> > last mntput() doesn't happen until path walking has dropped out of RCU
> > mode.
> > 
> > Without it it's possible that a non-MNT_DETACH umounter gets a spurious
> > EBUSY error because a concurrent lazy path walk will suddenly put the
> > last reference via mntput().
> > 
> > I'm unclear how that's handled in whatever it is you're proposing.
> 
> Okay. So we can't do this for UMOUNT_SYNC callers, thank you for the
> explanation. We could avoid the memory allocation and have one worker to
> take care of them all but you are afraid what this would mean to huge
> container. Understandable. The s/system_wq/system_unbound_wq/ would make
> sense.

Don't get me wrong, if you have a clever idea here I'm all ears.
Ian Kent April 10, 2025, 1:58 p.m. UTC | #14
On 10/4/25 00:04, Christian Brauner wrote:
> On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
>> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
>>> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
>>>> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
>>>> them all via queue_rcu_work()?
>>>> If so, couldn't we have make deferred_free_mounts global and have two
>>>> release_list, say release_list and release_list_next_gp? The first one
>>>> will be used if queue_rcu_work() returns true, otherwise the second.
>>>> Then once defer_free_mounts() is done and release_list_next_gp not
>>>> empty, it would move release_list_next_gp -> release_list and invoke
>>>> queue_rcu_work().
>>>> This would avoid the kmalloc, synchronize_rcu_expedited() and the
>>>> special-sauce.
>>>>
>>> To my understanding it was preferred for non-lazy unmount consumers to
>>> wait until the mntput before returning.
>>>
>>> That aside if I understood your approach it would de facto serialize all
>>> of these?
>>>
>>> As in with the posted patches you can have different worker threads
>>> progress in parallel as they all get a private list to iterate.
>>>
>>> With your proposal only one can do any work.
>>>
>>> One has to assume with sufficient mount/unmount traffic this can
>>> eventually get into trouble.
>> Right, it would serialize them within the same worker thread. With one
>> worker for each put you would schedule multiple worker from the RCU
>> callback. Given the system_wq you will schedule them all on the CPU
>> which invokes the RCU callback. This kind of serializes it, too.
>>
>> The mntput() callback uses spinlock_t for locking and then it frees
>> resources. It does not look like it waits for something nor takes ages.
>> So it might not be needed to split each put into its own worker on a
>> different CPU… One busy bee might be enough ;)
> Unmounting can trigger very large number of mounts to be unmounted. If
> you're on a container heavy system or services that all propagate to
> each other in different mount namespaces mount propagation will generate
> a ton of umounts. So this cannot be underestimated.
>
> If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
> umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
> during the unmount.
>
> If a concurrent path lookup calls legitimize_mnt() on such a mount and
> sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
> concurrent unmounter hold the last reference and it __legitimize_mnt()
> can thus simply drop the reference count. The final mntput() will be
> done by the umounter.

In umount_tree() it looks like the unmounted mount remains hashed (i.e.
disconnect_mount() returns false), so can't it still race with an
rcu-walk regardless of the synchronize_rcu()?

Surely I'm missing something ...


Ian

>
> The synchronize_rcu() call in namespace_unlock() takes care that the
> last mntput() doesn't happen until path walking has dropped out of RCU
> mode.
>
> Without it it's possible that a non-MNT_DETACH umounter gets a spurious
> EBUSY error because a concurrent lazy path walk will suddenly put the
> last reference via mntput().
>
> I'm unclear how that's handled in whatever it is you're proposing.
>
Ian Kent April 11, 2025, 2:36 a.m. UTC | #15
On 10/4/25 21:58, Ian Kent wrote:
>
> On 10/4/25 00:04, Christian Brauner wrote:
>> On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior 
>> wrote:
>>> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
>>>> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior 
>>>> wrote:
>>>>> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we 
>>>>> handle
>>>>> them all via queue_rcu_work()?
>>>>> If so, couldn't we have make deferred_free_mounts global and have two
>>>>> release_list, say release_list and release_list_next_gp? The first 
>>>>> one
>>>>> will be used if queue_rcu_work() returns true, otherwise the second.
>>>>> Then once defer_free_mounts() is done and release_list_next_gp not
>>>>> empty, it would move release_list_next_gp -> release_list and invoke
>>>>> queue_rcu_work().
>>>>> This would avoid the kmalloc, synchronize_rcu_expedited() and the
>>>>> special-sauce.
>>>>>
>>>> To my understanding it was preferred for non-lazy unmount consumers to
>>>> wait until the mntput before returning.
>>>>
>>>> That aside if I understood your approach it would de facto 
>>>> serialize all
>>>> of these?
>>>>
>>>> As in with the posted patches you can have different worker threads
>>>> progress in parallel as they all get a private list to iterate.
>>>>
>>>> With your proposal only one can do any work.
>>>>
>>>> One has to assume with sufficient mount/unmount traffic this can
>>>> eventually get into trouble.
>>> Right, it would serialize them within the same worker thread. With one
>>> worker for each put you would schedule multiple worker from the RCU
>>> callback. Given the system_wq you will schedule them all on the CPU
>>> which invokes the RCU callback. This kind of serializes it, too.
>>>
>>> The mntput() callback uses spinlock_t for locking and then it frees
>>> resources. It does not look like it waits for something nor takes ages.
>>> So it might not be needed to split each put into its own worker on a
>>> different CPU… One busy bee might be enough ;)
>> Unmounting can trigger very large number of mounts to be unmounted. If
>> you're on a container heavy system or services that all propagate to
>> each other in different mount namespaces mount propagation will generate
>> a ton of umounts. So this cannot be underestimated.
>>
>> If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
>> umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
>> during the unmount.
>>
>> If a concurrent path lookup calls legitimize_mnt() on such a mount and
>> sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
>> concurrent unmounter hold the last reference and it __legitimize_mnt()
>> can thus simply drop the reference count. The final mntput() will be
>> done by the umounter.
>
> In umount_tree() it looks like the unmounted mount remains hashed (ie.
>
> disconnect_mount() returns false) so can't it still race with an rcu-walk
>
> regardless of the sybcronsize_rcu().
>
>
> Surely I'm missing something ...

And I am, please ignore this.

I misread the return of the mnt->mnt_parent->mnt.mnt_flags check in
disconnect_mount(), my bad.

>
>
> Ian
>
>>
>> The synchronize_rcu() call in namespace_unlock() takes care that the
>> last mntput() doesn't happen until path walking has dropped out of RCU
>> mode.
>>
>> Without it it's possible that a non-MNT_DETACH umounter gets a spurious
>> EBUSY error because a concurrent lazy path walk will suddenly put the
>> last reference via mntput().
>>
>> I'm unclear how that's handled in whatever it is you're proposing.
>>
>
Christian Brauner April 11, 2025, 3:17 p.m. UTC | #16
> With this, I have applied your patch for the following discussion and
> down thread. Happy to send a v5, should this patch be deemed worth
> pursuing.

I'll just switch to system_unbound_wq and there's no need to resend for
now. If the numbers somehow change significantly due to that change just
mention that. Thanks.
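
In terms of the v4 code that amounts to changing only the workqueue the
deferred work is queued on (sketch, assuming the v4 naming for d->rwork):

	/* Unbound workqueue: the deferred mntput()s are not pinned to the
	 * CPU that happens to run the RCU callback. */
	queue_rcu_work(system_unbound_wq, &d->rwork);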
Eric Chanudet April 11, 2025, 6:30 p.m. UTC | #17
On Fri, Apr 11, 2025 at 05:17:25PM +0200, Christian Brauner wrote:
> > With this, I have applied your patch for the following discussion and
> > down thread. Happy to send a v5, should this patch be deemed worth
> > pursuing.
> 
> I'll just switch to system_unbound_wq and there's no need to resend for
> now. If the numbers somehow change significantly due to that change just
> mention that. Thanks.
> 

Thank you, I do not see a significant change in the numbers with
system_unbound_wq compared to system_wq.

Best,
Mark Brown April 16, 2025, 10:11 p.m. UTC | #18
On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> Defer releasing the detached file-system when calling namespace_unlock()
> during a lazy umount to return faster.
> 
> When requesting MNT_DETACH, the caller does not expect the file-system
> to be shut down upon returning from the syscall. Calling
> synchronize_rcu_expedited() has a significant cost on RT kernel that
> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> mount in a separate list and put it on a workqueue to run post RCU
> grace-period.

For the past couple of days we've been seeing failures in a bunch of LTP
filesystem related tests on various arm64 systems.  The failures are
mostly (I think all) in the form:

20101 10:12:40.378045  tst_test.c:1833: TINFO: === Testing on vfat ===
20102 10:12:40.385091  tst_test.c:1170: TINFO: Formatting /dev/loop0 with vfat opts='' extra opts=''
20103 10:12:40.391032  mkfs.vfat: unable to open /dev/loop0: Device or resource busy
20104 10:12:40.395953  tst_test.c:1170: TBROK: mkfs.vfat failed with exit code 1

ie, a failure to stand up the test environment on the loopback device
all happening immediately after some other filesystem related test which
also used the loop device.  A bisect points to commit a6c7a78f1b6b97
which is this, which does look rather relevant.  LTP is obviously being
very much an edge case here.

Bisect log:

git bisect start
# status: waiting for both good and bad commits
# bad: [f660850bc246fef15ba78c81f686860324396628] Add linux-next specific files for 20250416
git bisect bad f660850bc246fef15ba78c81f686860324396628
# status: waiting for good commit(s), bad commit known
# good: [a6b9fbe391e8da36d2892590db4db4ff94005807] Merge branch 'for-linux-next-fixes' of https://gitlab.freedesktop.org/drm/misc/kernel.git
git bisect good a6b9fbe391e8da36d2892590db4db4ff94005807
# bad: [c017ce6f8d2939445ac473ada6a266aca0a0d6eb] Merge branch 'drm-next' of https://gitlab.freedesktop.org/drm/kernel.git
git bisect bad c017ce6f8d2939445ac473ada6a266aca0a0d6eb
# bad: [3efe6d22f422cbba9de75b53890c624a83dbb70a] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
git bisect bad 3efe6d22f422cbba9de75b53890c624a83dbb70a
# good: [ce44f781015a988baf21317f7822567a62a77a5f] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git
git bisect good ce44f781015a988baf21317f7822567a62a77a5f
# good: [64a47089f778b6e4bfaaf62d4384eaa2bcaf9b63] Merge branch 'overlayfs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git
git bisect good 64a47089f778b6e4bfaaf62d4384eaa2bcaf9b63
# good: [cdb4a05e60b2646d25f7227c7dfe5d54c3f3a173] Merge branch 'for-next' of git://github.com/openrisc/linux.git
git bisect good cdb4a05e60b2646d25f7227c7dfe5d54c3f3a173
# good: [00b7410736b1d46ab26c3b4e04eaa819e3f7448c] Merge branch 'vfs-6.16.misc' into vfs.all
git bisect good 00b7410736b1d46ab26c3b4e04eaa819e3f7448c
# bad: [a9d6e19f91b6600c02276cd7903f747a5389950c] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git
git bisect bad a9d6e19f91b6600c02276cd7903f747a5389950c
# bad: [03e1a90f178e3cea3e8864135046e31f4dbe5e2f] Merge branch 'vfs-6.16.mount' into vfs.all
git bisect bad 03e1a90f178e3cea3e8864135046e31f4dbe5e2f
# good: [a9d7de0f68b79e5e481967fc605698915a37ac13] Merge patch series "pidfs: ensure consistent ENOENT/ESRCH reporting"
git bisect good a9d7de0f68b79e5e481967fc605698915a37ac13
# bad: [675e87c588fc7d054c8f626fd59fcad6c534f4c0] selftests/mount_settattr: add missing STATX_MNT_ID_UNIQUE define
git bisect bad 675e87c588fc7d054c8f626fd59fcad6c534f4c0
# bad: [449f3214ce15b697277d5991f096140cf773e849] selftests/mount_settattr: don't define sys_open_tree() twice
git bisect bad 449f3214ce15b697277d5991f096140cf773e849
# bad: [a6c7a78f1b6b974a10fcf4646769ba8bf2596c58] fs/namespace: defer RCU sync for MNT_DETACH umount
git bisect bad a6c7a78f1b6b974a10fcf4646769ba8bf2596c58
# first bad commit: [a6c7a78f1b6b974a10fcf4646769ba8bf2596c58] fs/namespace: defer RCU sync for MNT_DETACH umount
Christian Brauner April 17, 2025, 9:01 a.m. UTC | #19
On Wed, Apr 16, 2025 at 11:11:51PM +0100, Mark Brown wrote:
> On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> > Defer releasing the detached file-system when calling namespace_unlock()
> > during a lazy umount to return faster.
> > 
> > When requesting MNT_DETACH, the caller does not expect the file-system
> > to be shut down upon returning from the syscall. Calling
> > synchronize_rcu_expedited() has a significant cost on RT kernel that
> > defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> > mount in a separate list and put it on a workqueue to run post RCU
> > grace-period.
> 
> For the past couple of days we've been seeing failures in a bunch of LTP
> filesystem related tests on various arm64 systems.  The failures are
> mostly (I think all) in the form:
> 
> 20101 10:12:40.378045  tst_test.c:1833: TINFO: === Testing on vfat ===
> 20102 10:12:40.385091  tst_test.c:1170: TINFO: Formatting /dev/loop0 with vfat opts='' extra opts=''
> 20103 10:12:40.391032  mkfs.vfat: unable to open /dev/loop0: Device or resource busy
> 20104 10:12:40.395953  tst_test.c:1170: TBROK: mkfs.vfat failed with exit code 1
> 
> ie, a failure to stand up the test environment on the loopback device
> all happening immediately after some other filesystem related test which
> also used the loop device.  A bisect points to commit a6c7a78f1b6b97
> which is this, which does look rather relevant.  LTP is obviously being
> very much an edge case here.

Hah, here's something I didn't consider and that I should've caught.

Look, on current mainline no matter if MNT_DETACH/UMOUNT_SYNC or
non-MNT_DETACH/UMOUNT_SYNC. The mntput() calls after the
synchronize_rcu_expedited() calls will end up in task_work():

        if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
                struct task_struct *task = current;
                if (likely(!(task->flags & PF_KTHREAD))) {
                        init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
                        if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
                                return;
                }
                if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
                        schedule_delayed_work(&delayed_mntput_work, 1);
                return;
        }

because all of those mntput()s are done from the task's context.

IOW, if userspace does umount(MNT_DETACH) and the task has returned to
userspace it is guaranteed that all calls to cleanup_mnt() are done.

With your change that simply isn't true anymore. The call to
queue_rcu_work() will offload those mntput() to be done from a kthread.
That in turn means all those mntputs end up on the delayed_mntput_work()
queue. So the mounts aren't cleaned up by the time the task returns to
userspace.
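
For readers without the patch in front of them, the deferral pattern being
described is roughly the following sketch. This is not the v4 code itself;
the work item and function names (deferred_unmount_rwork,
deferred_free_mounts) are illustrative stand-ins:

#include <linux/workqueue.h>

static struct rcu_work deferred_unmount_rwork;  /* illustrative */

/* Runs from a kworker, only after a full RCU grace period has elapsed. */
static void deferred_free_mounts(struct work_struct *work)
{
        /* walk the deferred list and mntput() each mount here */
}

static void defer_unmount_cleanup(void)
{
        INIT_RCU_WORK(&deferred_unmount_rwork, deferred_free_mounts);
        queue_rcu_work(system_unbound_wq, &deferred_unmount_rwork);
}

The relevant consequence is the one stated above: deferred_free_mounts()
executes in kworker context, so the task_work_add() path in the snippet is
never taken and the final cleanup_mnt() is no longer guaranteed to have run
by the time the umount(MNT_DETACH) caller returns to userspace.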

And that's likely problematic even for the explicit MNT_DETACH use-case
because it means EBUSY errors are a lot more likely to be seen by
concurrent mounters especially for loop devices.

And fwiw, this is exactly what I pointed out in a prior posting to this
patch series.

But we've also worsened that situation by doing the deferred thing for
any non-UMOUNT_SYNC. That which includes namespace exit. IOW, if the
last task in a new mount namespace exits it will drop_collected_mounts()
without UMOUNT_SYNC because we know that they aren't reachable anymore,
after all the mount namespace is dead.

But now we defer all cleanup to the kthread which means when the task
returns to userspace there's still mounts to be cleaned up.
Ian Kent April 17, 2025, 10:17 a.m. UTC | #20
On 17/4/25 17:01, Christian Brauner wrote:
> On Wed, Apr 16, 2025 at 11:11:51PM +0100, Mark Brown wrote:
>> On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
>>> Defer releasing the detached file-system when calling namespace_unlock()
>>> during a lazy umount to return faster.
>>>
>>> When requesting MNT_DETACH, the caller does not expect the file-system
>>> to be shut down upon returning from the syscall. Calling
>>> synchronize_rcu_expedited() has a significant cost on RT kernel that
>>> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
>>> mount in a separate list and put it on a workqueue to run post RCU
>>> grace-period.
>> For the past couple of days we've been seeing failures in a bunch of LTP
>> filesystem related tests on various arm64 systems.  The failures are
>> mostly (I think all) in the form:
>>
>> 20101 10:12:40.378045  tst_test.c:1833: TINFO: === Testing on vfat ===
>> 20102 10:12:40.385091  tst_test.c:1170: TINFO: Formatting /dev/loop0 with vfat opts='' extra opts=''
>> 20103 10:12:40.391032  mkfs.vfat: unable to open /dev/loop0: Device or resource busy
>> 20104 10:12:40.395953  tst_test.c:1170: TBROK: mkfs.vfat failed with exit code 1
>>
>> ie, a failure to stand up the test environment on the loopback device
>> all happening immediately after some other filesystem related test which
>> also used the loop device.  A bisect points to commit a6c7a78f1b6b97
>> which is this, which does look rather relevant.  LTP is obviously being
>> very much an edge case here.
> Hah, here's something I didn't consider and that I should've caught.
>
> Look, on current mainline no matter if MNT_DETACH/UMOUNT_SYNC or
> non-MNT_DETACH/UMOUNT_SYNC. The mntput() calls after the
> synchronize_rcu_expedited() calls will end up in task_work():
>
>          if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
>                  struct task_struct *task = current;
>                  if (likely(!(task->flags & PF_KTHREAD))) {
>                          init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
>                          if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
>                                  return;
>                  }
>                  if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
>                          schedule_delayed_work(&delayed_mntput_work, 1);
>                  return;
>          }
>
> because all of those mntput()s are done from the task's contect.
>
> IOW, if userspace does umount(MNT_DETACH) and the task has returned to
> userspace it is guaranteed that all calls to cleanup_mnt() are done.
>
> With your change that simply isn't true anymore. The call to
> queue_rcu_work() will offload those mntput() to be done from a kthread.
> That in turn means all those mntputs end up on the delayed_mntput_work()
> queue. So the mounts aren't cleaned up by the time the task returns to
> userspace.
>
> And that's likely problematic even for the explicit MNT_DETACH use-case
> because it means EBUSY errors are a lot more likely to be seen by
> concurrent mounters especially for loop devices.
>
> And fwiw, this is exactly what I pointed out in a prior posting to this
> patch series.

And I didn't understand what you said then, but this problem is more
understandable to me now.


>
> But we've also worsened that situation by doing the deferred thing for
> any non-UMOUNT_SYNC. That which includes namespace exit. IOW, if the
> last task in a new mount namespace exits it will drop_collected_mounts()
> without UMOUNT_SYNC because we know that they aren't reachable anymore,
> after all the mount namespace is dead.
>
> But now we defer all cleanup to the kthread which means when the task
> returns to userspace there's still mounts to be cleaned up.

Correct me if I'm wrong but the actual problem is that the mechanism used
to wait until there are no processes doing an rcu-walk on mounts in the
discard list is unnecessarily long according to what Eric has seen. So a
different way to know there are no processes doing an rcu-walk for one of
these mounts is needed.


There must be a better way to do this than the current rcu wait method ...


Ian
Christian Brauner April 17, 2025, 11:31 a.m. UTC | #21
On Thu, Apr 17, 2025 at 06:17:01PM +0800, Ian Kent wrote:
> 
> On 17/4/25 17:01, Christian Brauner wrote:
> > On Wed, Apr 16, 2025 at 11:11:51PM +0100, Mark Brown wrote:
> > > On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> > > > Defer releasing the detached file-system when calling namespace_unlock()
> > > > during a lazy umount to return faster.
> > > > 
> > > > When requesting MNT_DETACH, the caller does not expect the file-system
> > > > to be shut down upon returning from the syscall. Calling
> > > > synchronize_rcu_expedited() has a significant cost on RT kernel that
> > > > defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> > > > mount in a separate list and put it on a workqueue to run post RCU
> > > > grace-period.
> > > For the past couple of days we've been seeing failures in a bunch of LTP
> > > filesystem related tests on various arm64 systems.  The failures are
> > > mostly (I think all) in the form:
> > > 
> > > 20101 10:12:40.378045  tst_test.c:1833: TINFO: === Testing on vfat ===
> > > 20102 10:12:40.385091  tst_test.c:1170: TINFO: Formatting /dev/loop0 with vfat opts='' extra opts=''
> > > 20103 10:12:40.391032  mkfs.vfat: unable to open /dev/loop0: Device or resource busy
> > > 20104 10:12:40.395953  tst_test.c:1170: TBROK: mkfs.vfat failed with exit code 1
> > > 
> > > ie, a failure to stand up the test environment on the loopback device
> > > all happening immediately after some other filesystem related test which
> > > also used the loop device.  A bisect points to commit a6c7a78f1b6b97
> > > which is this, which does look rather relevant.  LTP is obviously being
> > > very much an edge case here.
> > Hah, here's something I didn't consider and that I should've caught.
> > 
> > Look, on current mainline no matter if MNT_DETACH/UMOUNT_SYNC or
> > non-MNT_DETACH/UMOUNT_SYNC. The mntput() calls after the
> > synchronize_rcu_expedited() calls will end up in task_work():
> > 
> >          if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
> >                  struct task_struct *task = current;
> >                  if (likely(!(task->flags & PF_KTHREAD))) {
> >                          init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
> >                          if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
> >                                  return;
> >                  }
> >                  if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
> >                          schedule_delayed_work(&delayed_mntput_work, 1);
> >                  return;
> >          }
> > 
> > because all of those mntput()s are done from the task's contect.
> > 
> > IOW, if userspace does umount(MNT_DETACH) and the task has returned to
> > userspace it is guaranteed that all calls to cleanup_mnt() are done.
> > 
> > With your change that simply isn't true anymore. The call to
> > queue_rcu_work() will offload those mntput() to be done from a kthread.
> > That in turn means all those mntputs end up on the delayed_mntput_work()
> > queue. So the mounts aren't cleaned up by the time the task returns to
> > userspace.
> > 
> > And that's likely problematic even for the explicit MNT_DETACH use-case
> > because it means EBUSY errors are a lot more likely to be seen by
> > concurrent mounters especially for loop devices.
> > 
> > And fwiw, this is exactly what I pointed out in a prior posting to this
> > patch series.
> 
> And I didn't understand what you said then but this problem is more
> 
> understandable to me now.
> 
> 
> > 
> > But we've also worsened that situation by doing the deferred thing for
> > any non-UMOUNT_SYNC. That which includes namespace exit. IOW, if the
> > last task in a new mount namespace exits it will drop_collected_mounts()
> > without UMOUNT_SYNC because we know that they aren't reachable anymore,
> > after all the mount namespace is dead.
> > 
> > But now we defer all cleanup to the kthread which means when the task
> > returns to userspace there's still mounts to be cleaned up.
> 
> Correct me if I'm wrong but the actual problem is that the mechanism used
> 
> to wait until there are no processes doing an rcu-walk on mounts in the
> 
> discard list is unnecessarily long according to what Eric has seen. So a

I think that the current approach is still salvageable but I need to test
this and currently LTP doesn't really compile for me.
Mark Brown April 17, 2025, 11:49 a.m. UTC | #22
On Thu, Apr 17, 2025 at 01:31:40PM +0200, Christian Brauner wrote:

> I think that the current approach is still salvagable but I need to test
> this and currently LTP doesn't really compile for me.

FWIW there's some rootfs images with LTP used by KernelCI available
here:

    https://storage.kernelci.org/images/rootfs/debian/bookworm-ltp/20240313.0/

Dunno if that's helpful or not.
Christian Brauner April 17, 2025, 3:12 p.m. UTC | #23
On Thu, Apr 17, 2025 at 01:31:40PM +0200, Christian Brauner wrote:
> On Thu, Apr 17, 2025 at 06:17:01PM +0800, Ian Kent wrote:
> > 
> > On 17/4/25 17:01, Christian Brauner wrote:
> > > On Wed, Apr 16, 2025 at 11:11:51PM +0100, Mark Brown wrote:
> > > > On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> > > > > Defer releasing the detached file-system when calling namespace_unlock()
> > > > > during a lazy umount to return faster.
> > > > > 
> > > > > When requesting MNT_DETACH, the caller does not expect the file-system
> > > > > to be shut down upon returning from the syscall. Calling
> > > > > synchronize_rcu_expedited() has a significant cost on RT kernel that
> > > > > defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> > > > > mount in a separate list and put it on a workqueue to run post RCU
> > > > > grace-period.
> > > > For the past couple of days we've been seeing failures in a bunch of LTP
> > > > filesystem related tests on various arm64 systems.  The failures are
> > > > mostly (I think all) in the form:
> > > > 
> > > > 20101 10:12:40.378045  tst_test.c:1833: TINFO: === Testing on vfat ===
> > > > 20102 10:12:40.385091  tst_test.c:1170: TINFO: Formatting /dev/loop0 with vfat opts='' extra opts=''
> > > > 20103 10:12:40.391032  mkfs.vfat: unable to open /dev/loop0: Device or resource busy
> > > > 20104 10:12:40.395953  tst_test.c:1170: TBROK: mkfs.vfat failed with exit code 1
> > > > 
> > > > ie, a failure to stand up the test environment on the loopback device
> > > > all happening immediately after some other filesystem related test which
> > > > also used the loop device.  A bisect points to commit a6c7a78f1b6b97
> > > > which is this, which does look rather relevant.  LTP is obviously being
> > > > very much an edge case here.
> > > Hah, here's something I didn't consider and that I should've caught.
> > > 
> > > Look, on current mainline no matter if MNT_DETACH/UMOUNT_SYNC or
> > > non-MNT_DETACH/UMOUNT_SYNC. The mntput() calls after the
> > > synchronize_rcu_expedited() calls will end up in task_work():
> > > 
> > >          if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
> > >                  struct task_struct *task = current;
> > >                  if (likely(!(task->flags & PF_KTHREAD))) {
> > >                          init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
> > >                          if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
> > >                                  return;
> > >                  }
> > >                  if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
> > >                          schedule_delayed_work(&delayed_mntput_work, 1);
> > >                  return;
> > >          }
> > > 
> > > because all of those mntput()s are done from the task's contect.
> > > 
> > > IOW, if userspace does umount(MNT_DETACH) and the task has returned to
> > > userspace it is guaranteed that all calls to cleanup_mnt() are done.
> > > 
> > > With your change that simply isn't true anymore. The call to
> > > queue_rcu_work() will offload those mntput() to be done from a kthread.
> > > That in turn means all those mntputs end up on the delayed_mntput_work()
> > > queue. So the mounts aren't cleaned up by the time the task returns to
> > > userspace.
> > > 
> > > And that's likely problematic even for the explicit MNT_DETACH use-case
> > > because it means EBUSY errors are a lot more likely to be seen by
> > > concurrent mounters especially for loop devices.
> > > 
> > > And fwiw, this is exactly what I pointed out in a prior posting to this
> > > patch series.
> > 
> > And I didn't understand what you said then but this problem is more
> > 
> > understandable to me now.

I mean I'm saying it could be problematic for the MNT_DETACH case. I'm
not sure how likely it is. If some process P1 does MNT_DETACH on a loop
device and then another process P2 wants to use that loop device and
sees EBUSY then we don't care. That can already happen. But even in this
case I'm not sure if there aren't subtle ways where this will bite us.

But there's two other problems:

(1) The real issue is with the same process P1 doing stupid stuff that
    just happened to work. For example if there's code out there that
    does a MNT_DETACH on a filesystem that uses a loop device
    immediately followed by the desire to reuse the loop device:

    It doesn't matter whether such code must in theory already be
    prepared to handle the case of seeing EBUSY after the MNT_DETACH. If
    this currently just happens to work because we guarantee that the
    last mntput() and cleanup_mnt() will have been done when the caller
    returns to userspace it's a uapi break plain and simple.

    This implicit guarantee isn't there anymore after your patch because
    the final mntput() is now done from the system_unbound_wq, which has
    the consequence that the cleanup_mnt() is done from the
    delayed_mntput_work workqueue. And that leads to problem number
    (2).

(2) If a userspace task is dealing with e.g., a broken NFS server and
    does a umount(MNT_DETACH) and that NFS server blocks indefinitely
    then right now it will be the task's problem that called the umount.
    It will simply hang and pay the price.

    With your patch however, that cleanup_mnt() and the
    deactivate_super() call it entails will be done from
    delayed_mntput_work...

    So if there's some userspace process with a broken NFS server and it
    does umount(MNT_DETACH) it will end up hanging every other
    umount(MNT_DETACH) on the system because the delayed_mntput_work
    workqueue (to my understanding) cannot make progress.

    So in essence this patch to me seems like handing a DOS vector for
    MNT_DETACH to userspace.
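
To make case (1) above concrete, a hypothetical userspace sequence of that
shape could look like the snippet below. The paths and the LOOP_CLR_FD step
are illustrative only; the point is that code like this currently relies on
the last mntput() having run by the time umount2() returns:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <unistd.h>
#include <linux/loop.h>

int main(void)
{
        /* lazy umount of a loop-backed filesystem, returns immediately */
        if (umount2("/mnt", MNT_DETACH))
                perror("umount2");

        /* ... immediately try to reuse the backing loop device ... */
        int fd = open("/dev/loop0", O_RDWR);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* may now fail with EBUSY if the final mntput() has not run yet */
        if (ioctl(fd, LOOP_CLR_FD, 0))
                perror("LOOP_CLR_FD");

        close(fd);
        return 0;
}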

> > 
> > 
> > > 
> > > But we've also worsened that situation by doing the deferred thing for
> > > any non-UMOUNT_SYNC. That which includes namespace exit. IOW, if the
> > > last task in a new mount namespace exits it will drop_collected_mounts()
> > > without UMOUNT_SYNC because we know that they aren't reachable anymore,
> > > after all the mount namespace is dead.
> > > 
> > > But now we defer all cleanup to the kthread which means when the task
> > > returns to userspace there's still mounts to be cleaned up.
> > 
> > Correct me if I'm wrong but the actual problem is that the mechanism used
> > 
> > to wait until there are no processes doing an rcu-walk on mounts in the
> > 
> > discard list is unnecessarily long according to what Eric has seen. So a
> 
> I think that the current approach is still salvagable but I need to test
> this and currently LTP doesn't really compile for me.

I managed to reproduce this and it is like I suspected. I just thought
"Oh well, if it's not UMOUNT_SYNC then we can just punt this to a
workqueue." which is obviously not going to work.

If a mount namespace gets cleaned up or if a detached mount tree is
cleaned up or if audit calls drop_collected_mounts() we obviously don't
want to defer the unmount. So the fix is to simply restrict this to
userspace actually requesting MNT_DETACH.

But I'm seeing way more substantial issues with this patch.
Christian Brauner April 17, 2025, 3:28 p.m. UTC | #24
>     So if there's some userspace process with a broken NFS server and it
>     does umount(MNT_DETACH) it will end up hanging every other
>     umount(MNT_DETACH) on the system because the dealyed_mntput_work
>     workqueue (to my understanding) cannot make progress.

Ok, "to my understanding" has been updated after going back and reading
the delayed work code. Luckily it's not as bad as I thought it is
because it's queued on system_wq which is multi-threaded so it's at
least not causing everyone with MNT_DETACH to get stuck. I'm still
skeptical how safe this all is.
Sebastian Andrzej Siewior April 17, 2025, 3:31 p.m. UTC | #25
On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> >     So if there's some userspace process with a broken NFS server and it
> >     does umount(MNT_DETACH) it will end up hanging every other
> >     umount(MNT_DETACH) on the system because the dealyed_mntput_work
> >     workqueue (to my understanding) cannot make progress.
> 
> Ok, "to my understanding" has been updated after going back and reading
> the delayed work code. Luckily it's not as bad as I thought it is
> because it's queued on system_wq which is multi-threaded so it's at
> least not causing everyone with MNT_DETACH to get stuck. I'm still
> skeptical how safe this all is. 

I would (again) throw system_unbound_wq into the game because the former
will remain on the CPU on which it has been enqueued (if speaking about
multi threading).

Sebastian
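
As a minimal illustration of that distinction (the work items and handler
here are made-up names, not anything in the patch):

#include <linux/workqueue.h>

static void example_cleanup(struct work_struct *work)
{
        /* the deferred mntput()s would happen here */
}

static DECLARE_WORK(bound_work, example_cleanup);
static DECLARE_WORK(unbound_work, example_cleanup);

static void enqueue_examples(void)
{
        /* per-CPU pool: runs on the CPU that queued it */
        queue_work(system_wq, &bound_work);

        /* unbound pool: the scheduler may place the worker on any CPU */
        queue_work(system_unbound_wq, &unbound_work);
}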
Christian Brauner April 17, 2025, 4:28 p.m. UTC | #26
On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > >     So if there's some userspace process with a broken NFS server and it
> > >     does umount(MNT_DETACH) it will end up hanging every other
> > >     umount(MNT_DETACH) on the system because the dealyed_mntput_work
> > >     workqueue (to my understanding) cannot make progress.
> > 
> > Ok, "to my understanding" has been updated after going back and reading
> > the delayed work code. Luckily it's not as bad as I thought it is
> > because it's queued on system_wq which is multi-threaded so it's at
> > least not causing everyone with MNT_DETACH to get stuck. I'm still
> > skeptical how safe this all is. 
> 
> I would (again) throw system_unbound_wq into the game because the former
> will remain on the CPU on which has been enqueued (if speaking about
> multi threading).

Yes, good point.

However, what about using polled grace periods?

A first simple-minded thing to do would be to record the grace period
after umount_tree() has finished and then check it in namespace_unlock():

diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..1e7ebcdd1ebc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
 static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
+static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
 static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_SEQLOCK(mnt_ns_tree_lock);
@@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
        struct hlist_head head;
        struct hlist_node *p;
        struct mount *m;
+       unsigned long unmount_seq = rcu_unmount_seq;
        LIST_HEAD(list);

        hlist_move_list(&unmounted, &head);
@@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
        if (likely(hlist_empty(&head)))
                return;

-       synchronize_rcu_expedited();
+       cond_synchronize_rcu_expedited(unmount_seq);

        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
@@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
                 */
                mnt_notify_add(p);
        }
+
+       rcu_unmount_seq = get_state_synchronize_rcu();
 }

 static void shrink_submounts(struct mount *mnt);


I'm not sure how much that would buy us. If it doesn't then it should be
possible to play with the following possibly strange idea:

diff --git a/fs/mount.h b/fs/mount.h
index 7aecf2a60472..51b86300dc50 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -61,6 +61,7 @@ struct mount {
                struct rb_node mnt_node; /* node in the ns->mounts rbtree */
                struct rcu_head mnt_rcu;
                struct llist_node mnt_llist;
+               unsigned long mnt_rcu_unmount_seq;
        };
 #ifdef CONFIG_SMP
        struct mnt_pcp __percpu *mnt_pcp;
diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..aae9df75beed 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
        struct hlist_head head;
        struct hlist_node *p;
        struct mount *m;
+       bool needs_synchronize_rcu = false;
        LIST_HEAD(list);

        hlist_move_list(&unmounted, &head);
@@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
        if (likely(hlist_empty(&head)))
                return;

-       synchronize_rcu_expedited();
+       hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
+               if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
+                       continue;
+
+               needs_synchronize_rcu = true;
+               break;
+       }
+
+       if (needs_synchronize_rcu)
+               synchronize_rcu_expedited();

        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
@@ -1923,8 +1933,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
                        }
                }
                change_mnt_propagation(p, MS_PRIVATE);
-               if (disconnect)
+               if (disconnect) {
+                       p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
                        hlist_add_head(&p->mnt_umount, &unmounted);
+               }

                /*
                 * At this point p->mnt_ns is NULL, notification will be queued

This would allow us to elide synchronize_rcu() calls if a grace period has
already elapsed since we moved that mount to the unmounted list.
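
For reference, the polled grace-period primitives used in the two diffs
above behave roughly as in this isolated sketch (retire_obj(), flush_objs()
and the cookie variable are illustrative names, not mount code):

#include <linux/rcupdate.h>

static unsigned long retire_cookie;

/* Snapshot the grace-period state when the object is retired. */
static void retire_obj(void)
{
        retire_cookie = get_state_synchronize_rcu();
}

static void flush_objs(void)
{
        /*
         * cond_synchronize_rcu_expedited() is a no-op if a full grace
         * period has already elapsed since the cookie was taken and
         * only blocks otherwise.
         */
        cond_synchronize_rcu_expedited(retire_cookie);

        /*
         * Alternatively, poll_state_synchronize_rcu() returns true once
         * a grace period has elapsed, so a caller can test first and
         * fall back to an explicit synchronize_rcu_expedited() if not:
         *
         *      if (!poll_state_synchronize_rcu(retire_cookie))
         *              synchronize_rcu_expedited();
         */

        /* safe to free the retired object(s) here */
}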
Eric Chanudet April 17, 2025, 10:33 p.m. UTC | #27
On Thu, Apr 17, 2025 at 06:28:20PM +0200, Christian Brauner wrote:
> On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > > > (1) The real issue is with the same process P1 doing stupid stuff that
> > > >     just happened to work. For example if there's code out there that
> > > >     does a MNT_DETACH on a filesystem that uses a loop device
> > > >     immediately followed by the desire to reuse the loop device:
> > > > 
> > > >     It doesn't matter whether such code must in theory already be
> > > >     prepared to handle the case of seeing EBUSY after the MNT_DETACH. If
> > > >     this currently just happens to work because we guarantee that the
> > > >     last mntput() and cleanup_mnt() will have been done when the caller
> > > >     returns to userspace it's a uapi break plain and simple.
> > > > 
> > > >     This implicit guarantee isn't there anymore after your patch because
> > > >     the final mntput() from is done from the system_unbound_wq which has
> > > >     the consequence that the cleanup_mnt() is done from the
> > > >     delayed_mntput_work workqeueue. And that leads to problem number
> > > >     (2).

The following script runs fine on mainline, but consistently fails with
the version of this patch after discussions upstream:
  #! /usr/bin/bash -eux

  mntpath="/mnt"
  umount -R "$mntpath" || true

  fspath="/root/fs.ext4"
  dd if=/dev/zero of="$fspath" bs=1M count=500
  mkfs.ext4 "$fspath"

  loopdev="$(losetup -f "$fspath" --show)"

  for i in $(seq 0 99); do
      mkfs.ext4 -Fq "$fspath"
      mount "$loopdev" "$mntpath"
      touch "$mntpath/f1"
      umount -l "$mntpath"
  done
  losetup -d "$loopdev"

Failure looks like:
+ for i in $(seq 0 99)
+ mkfs.ext4 -Fq /root/fs.ext4
/root/fs.ext4 contains a ext4 file system
    created on Thu Apr 17 20:42:04 2025
+ mount /dev/loop0 /mnt
+ touch /mnt/f1
+ umount -l /mnt
+ for i in $(seq 0 99)
+ mkfs.ext4 -Fq /root/fs.ext4
/root/fs.ext4 contains a ext4 file system
    created on Thu Apr 17 20:42:04 2025
+ mount /dev/loop0 /mnt
mount: /mnt: mount(2) system call failed: Structure needs cleaning.

[    9.352478] EXT4-fs (loop0): mounted filesystem 3c5c632e-24d1-4027-b378-f51e67972883 r/w with ordered data mode. Quota mode: none.
[    9.449121] EXT4-fs (loop0): unmounting filesystem 3c5c632e-24d1-4027-b378-f51e67972883.
[    9.462093] EXT4-fs (loop0): ext4_check_descriptors: Checksum for group 32 failed (10605!=64170)
[    9.462099] EXT4-fs (loop0): group descriptors corrupted!

Seems worse than EBUSY :(.

> However, what about using polled grace periods?
> 
> A first simple-minded thing to do would be to record the grace period
> after umount_tree() has finished and the check it in namespace_unlock():
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..1e7ebcdd1ebc 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
>  static struct hlist_head *mountpoint_hashtable __ro_after_init;
>  static struct kmem_cache *mnt_cache __ro_after_init;
>  static DECLARE_RWSEM(namespace_sem);
> +static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
>  static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
>  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>  static DEFINE_SEQLOCK(mnt_ns_tree_lock);
> @@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
>         struct hlist_head head;
>         struct hlist_node *p;
>         struct mount *m;
> +       unsigned long unmount_seq = rcu_unmount_seq;
>         LIST_HEAD(list);
> 
>         hlist_move_list(&unmounted, &head);
> @@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
>         if (likely(hlist_empty(&head)))
>                 return;
> 
> -       synchronize_rcu_expedited();
> +       cond_synchronize_rcu_expedited(unmount_seq);
> 
>         hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>                 hlist_del(&m->mnt_umount);
> @@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>                  */
>                 mnt_notify_add(p);
>         }
> +
> +       rcu_unmount_seq = get_state_synchronize_rcu();
>  }
> 
>  static void shrink_submounts(struct mount *mnt);
> 
> 
> I'm not sure how much that would buy us. If it doesn't then it should be
> possible to play with the following possibly strange idea:

Did not improve the lazy unmount, no corruption running the script.
QEMU x86_64, 8 cpus, PREEMPT_RT:
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount /mnt
          0.019498 +- 0.000944 seconds time elapsed  ( +-  4.84% )
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount -l /mnt
          0.020635 +- 0.000959 seconds time elapsed  ( +-  4.65% )

> diff --git a/fs/mount.h b/fs/mount.h
> index 7aecf2a60472..51b86300dc50 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -61,6 +61,7 @@ struct mount {
>                 struct rb_node mnt_node; /* node in the ns->mounts rbtree */
>                 struct rcu_head mnt_rcu;
>                 struct llist_node mnt_llist;
> +               unsigned long mnt_rcu_unmount_seq;
>         };
>  #ifdef CONFIG_SMP
>         struct mnt_pcp __percpu *mnt_pcp;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..aae9df75beed 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
>         struct hlist_head head;
>         struct hlist_node *p;
>         struct mount *m;
> +       bool needs_synchronize_rcu = false;
>         LIST_HEAD(list);
> 
>         hlist_move_list(&unmounted, &head);
> @@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
>         if (likely(hlist_empty(&head)))
>                 return;
> 
> -       synchronize_rcu_expedited();
> +       hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> +               if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> +                       continue;
> +
> +               needs_synchronize_rcu = true;
> +               break;
> +       }
> +
> +       if (needs_synchronize_rcu)
> +               synchronize_rcu_expedited();
> 
>         hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>                 hlist_del(&m->mnt_umount);
> @@ -1923,8 +1933,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>                         }
>                 }
>                 change_mnt_propagation(p, MS_PRIVATE);
> -               if (disconnect)
> +               if (disconnect) {
> +                       p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
>                         hlist_add_head(&p->mnt_umount, &unmounted);
> +               }
> 
>                 /*
>                  * At this point p->mnt_ns is NULL, notification will be queued
> 
> This would allow to elide synchronize rcu calls if they elapsed in the
> meantime since we moved that mount to the unmounted list.

Faster umount, lazy or not, no corruption running the script.
QEMU x86_64, 8 cpus, PREEMPT_RT:
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount  /mnt
          0.001482 +- 0.000121 seconds time elapsed  ( +-  8.17% )
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l /mnt
          0.002248 +- 0.000845 seconds time elapsed  ( +- 37.58% )

The crun test from v1 without your patch:
# perf stat -r 10 --null -- crun run test
           0.08166 +- 0.00476 seconds time elapsed  ( +-  5.83% )
The crun test from v1 with your patch:
# perf stat -r 10 --null -- crun run test
           0.01449 +- 0.00457 seconds time elapsed  ( +- 31.55% )

I have not run the LTP fs tests with that last patch yet, but that looks
like quite an improvement.

Best,
Ian Kent April 18, 2025, 12:31 a.m. UTC | #28
On 17/4/25 23:12, Christian Brauner wrote:
> On Thu, Apr 17, 2025 at 01:31:40PM +0200, Christian Brauner wrote:
>> On Thu, Apr 17, 2025 at 06:17:01PM +0800, Ian Kent wrote:
>>> On 17/4/25 17:01, Christian Brauner wrote:
>>>> On Wed, Apr 16, 2025 at 11:11:51PM +0100, Mark Brown wrote:
>>>>> On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
>>>>>> Defer releasing the detached file-system when calling namespace_unlock()
>>>>>> during a lazy umount to return faster.
>>>>>>
>>>>>> When requesting MNT_DETACH, the caller does not expect the file-system
>>>>>> to be shut down upon returning from the syscall. Calling
>>>>>> synchronize_rcu_expedited() has a significant cost on RT kernel that
>>>>>> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
>>>>>> mount in a separate list and put it on a workqueue to run post RCU
>>>>>> grace-period.
>>>>> For the past couple of days we've been seeing failures in a bunch of LTP
>>>>> filesystem related tests on various arm64 systems.  The failures are
>>>>> mostly (I think all) in the form:
>>>>>
>>>>> 20101 10:12:40.378045  tst_test.c:1833: TINFO: === Testing on vfat ===
>>>>> 20102 10:12:40.385091  tst_test.c:1170: TINFO: Formatting /dev/loop0 with vfat opts='' extra opts=''
>>>>> 20103 10:12:40.391032  mkfs.vfat: unable to open /dev/loop0: Device or resource busy
>>>>> 20104 10:12:40.395953  tst_test.c:1170: TBROK: mkfs.vfat failed with exit code 1
>>>>>
>>>>> ie, a failure to stand up the test environment on the loopback device
>>>>> all happening immediately after some other filesystem related test which
>>>>> also used the loop device.  A bisect points to commit a6c7a78f1b6b97
>>>>> which is this, which does look rather relevant.  LTP is obviously being
>>>>> very much an edge case here.
>>>> Hah, here's something I didn't consider and that I should've caught.
>>>>
>>>> Look, on current mainline no matter if MNT_DETACH/UMOUNT_SYNC or
>>>> non-MNT_DETACH/UMOUNT_SYNC. The mntput() calls after the
>>>> synchronize_rcu_expedited() calls will end up in task_work():
>>>>
>>>>           if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
>>>>                   struct task_struct *task = current;
>>>>                   if (likely(!(task->flags & PF_KTHREAD))) {
>>>>                           init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
>>>>                           if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
>>>>                                   return;
>>>>                   }
>>>>                   if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
>>>>                           schedule_delayed_work(&delayed_mntput_work, 1);
>>>>                   return;
>>>>           }
>>>>
>>>> because all of those mntput()s are done from the task's contect.
>>>>
>>>> IOW, if userspace does umount(MNT_DETACH) and the task has returned to
>>>> userspace it is guaranteed that all calls to cleanup_mnt() are done.
>>>>
>>>> With your change that simply isn't true anymore. The call to
>>>> queue_rcu_work() will offload those mntput() to be done from a kthread.
>>>> That in turn means all those mntputs end up on the delayed_mntput_work()
>>>> queue. So the mounts aren't cleaned up by the time the task returns to
>>>> userspace.
>>>>
>>>> And that's likely problematic even for the explicit MNT_DETACH use-case
>>>> because it means EBUSY errors are a lot more likely to be seen by
>>>> concurrent mounters especially for loop devices.
>>>>
>>>> And fwiw, this is exactly what I pointed out in a prior posting to this
>>>> patch series.
>>> And I didn't understand what you said then but this problem is more
>>>
>>> understandable to me now.
> I mean I'm saying it could be problematic for the MNT_DETACH case. I'm
> not sure how likely it is. If some process P1 does MNT_DETACH on a loop
> device and then another process P2 wants to use that loop device and
> sess EBUSY then we don't care. That can already happen. But even in this
> case I'm not sure if there aren't subtle ways where this will bite us.
>
> But there's two other problems:
>
> (1) The real issue is with the same process P1 doing stupid stuff that
>      just happened to work. For example if there's code out there that
>      does a MNT_DETACH on a filesystem that uses a loop device
>      immediately followed by the desire to reuse the loop device:
>
>      It doesn't matter whether such code must in theory already be
>      prepared to handle the case of seeing EBUSY after the MNT_DETACH. If
>      this currently just happens to work because we guarantee that the
>      last mntput() and cleanup_mnt() will have been done when the caller
>      returns to userspace it's a uapi break plain and simple.
>
>      This implicit guarantee isn't there anymore after your patch because
>      the final mntput() from is done from the system_unbound_wq which has
>      the consequence that the cleanup_mnt() is done from the
>      delayed_mntput_work workqeueue. And that leads to problem number
>      (2).

This is a bit puzzling to me.


All the mounts in the tree should be unhashed before any of these mntput()
calls so I didn't think it would be found. I'll need to look at the loop
device case to work out how it's finding (or holding onto) the stale mount
and concluding it's busy.


Although there was a place I was concerned about:

                 disconnect = disconnect_mount(p, how);
                 if (mnt_has_parent(p)) {
                         mnt_add_count(p->mnt_parent, -1);
                         if (!disconnect) {
                                 /* Don't forget about p */
                                list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
                         } else {
                                 umount_mnt(p);
                         }
                 }

here p ends up not being unhashed so I need to look again at this area and
try to understand the special cases.


>
> (2) If a userspace task is dealing with e.g., a broken NFS server and
>      does a umount(MNT_DETACH) and that NFS server blocks indefinitely
>      then right now it will be the task's problem that called the umount.
>      It will simply hang and pay the price.
>
>      With your patch however, that cleanup_mnt() and the
>      deactivate_super() call it entails will be done from
>      delayed_mntput_work...
>
>      So if there's some userspace process with a broken NFS server and it
>      does umount(MNT_DETACH) it will end up hanging every other
>      umount(MNT_DETACH) on the system because the dealyed_mntput_work
>      workqueue (to my understanding) cannot make progress.
>
>      So in essence this patch to me seems like handing a DOS vector for
>      MNT_DETACH to userspace.

Umm ... this is a really serious problem and can fairly easily happen;
as much as NFS has improved over the years it still happens.

But again, the problem is not so much waiting for rcu-walks to be done as
waiting longer than is needed ...


>
>>>
>>>> But we've also worsened that situation by doing the deferred thing for
>>>> any non-UMOUNT_SYNC. That which includes namespace exit. IOW, if the
>>>> last task in a new mount namespace exits it will drop_collected_mounts()
>>>> without UMOUNT_SYNC because we know that they aren't reachable anymore,
>>>> after all the mount namespace is dead.
>>>>
>>>> But now we defer all cleanup to the kthread which means when the task
>>>> returns to userspace there's still mounts to be cleaned up.
>>> Correct me if I'm wrong but the actual problem is that the mechanism used
>>>
>>> to wait until there are no processes doing an rcu-walk on mounts in the
>>>
>>> discard list is unnecessarily long according to what Eric has seen. So a
>> I think that the current approach is still salvagable but I need to test
>> this and currently LTP doesn't really compile for me.
> I managed to reproduce this and it is like I suspected. I just thought
> "Oh well, if it's not UMOUNT_SYNC then we can just punt this to a
> workqueue." which is obviously not going to work.
>
> If a mount namespace gets cleaned up or if a detached mount tree is
> cleaned up or if audit calls drop_collected_mounts() we obviously don't
> want to defer the unmount. So the fix is to simply restrict this to
> userspace actually requesting MNT_DETACH.
>
> But I'm seeing way more substantial issues with this patch.

Yeah, it's certainly much more difficult than it looks at first sight.


Ian
Ian Kent April 18, 2025, 1:13 a.m. UTC | #29
On 18/4/25 00:28, Christian Brauner wrote:
> On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior wrote:
>> On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
>>>>      So if there's some userspace process with a broken NFS server and it
>>>>      does umount(MNT_DETACH) it will end up hanging every other
>>>>      umount(MNT_DETACH) on the system because the dealyed_mntput_work
>>>>      workqueue (to my understanding) cannot make progress.
>>> Ok, "to my understanding" has been updated after going back and reading
>>> the delayed work code. Luckily it's not as bad as I thought it is
>>> because it's queued on system_wq which is multi-threaded so it's at
>>> least not causing everyone with MNT_DETACH to get stuck. I'm still
>>> skeptical how safe this all is.
>> I would (again) throw system_unbound_wq into the game because the former
>> will remain on the CPU on which has been enqueued (if speaking about
>> multi threading).
> Yes, good point.
>
> However, what about using polled grace periods?
>
> A first simple-minded thing to do would be to record the grace period
> after umount_tree() has finished and the check it in namespace_unlock():
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..1e7ebcdd1ebc 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
>   static struct hlist_head *mountpoint_hashtable __ro_after_init;
>   static struct kmem_cache *mnt_cache __ro_after_init;
>   static DECLARE_RWSEM(namespace_sem);
> +static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
>   static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
>   static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>   static DEFINE_SEQLOCK(mnt_ns_tree_lock);
> @@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
>          struct hlist_head head;
>          struct hlist_node *p;
>          struct mount *m;
> +       unsigned long unmount_seq = rcu_unmount_seq;
>          LIST_HEAD(list);
>
>          hlist_move_list(&unmounted, &head);
> @@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
>          if (likely(hlist_empty(&head)))
>                  return;
>
> -       synchronize_rcu_expedited();
> +       cond_synchronize_rcu_expedited(unmount_seq);
>
>          hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>                  hlist_del(&m->mnt_umount);
> @@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>                   */
>                  mnt_notify_add(p);
>          }
> +
> +       rcu_unmount_seq = get_state_synchronize_rcu();
>   }
>
>   static void shrink_submounts(struct mount *mnt);
>
>
> I'm not sure how much that would buy us. If it doesn't then it should be
> possible to play with the following possibly strange idea:
>
> diff --git a/fs/mount.h b/fs/mount.h
> index 7aecf2a60472..51b86300dc50 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -61,6 +61,7 @@ struct mount {
>                  struct rb_node mnt_node; /* node in the ns->mounts rbtree */
>                  struct rcu_head mnt_rcu;
>                  struct llist_node mnt_llist;
> +               unsigned long mnt_rcu_unmount_seq;
>          };
>   #ifdef CONFIG_SMP
>          struct mnt_pcp __percpu *mnt_pcp;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..aae9df75beed 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
>          struct hlist_head head;
>          struct hlist_node *p;
>          struct mount *m;
> +       bool needs_synchronize_rcu = false;
>          LIST_HEAD(list);
>
>          hlist_move_list(&unmounted, &head);
> @@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
>          if (likely(hlist_empty(&head)))
>                  return;
>
> -       synchronize_rcu_expedited();
> +       hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> +               if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> +                       continue;
> +
> +               needs_synchronize_rcu = true;
> +               break;
> +       }
> +
> +       if (needs_synchronize_rcu)
> +               synchronize_rcu_expedited();
>
>          hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>                  hlist_del(&m->mnt_umount);
> @@ -1923,8 +1933,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>                          }
>                  }
>                  change_mnt_propagation(p, MS_PRIVATE);
> -               if (disconnect)
> +               if (disconnect) {
> +                       p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
>                          hlist_add_head(&p->mnt_umount, &unmounted);
> +               }
>
>                  /*
>                   * At this point p->mnt_ns is NULL, notification will be queued
>
> This would allow to elide synchronize rcu calls if they elapsed in the
> meantime since we moved that mount to the unmounted list.

This last patch is a much better way to do this IMHO.

The approach is much more like many other places where we have "rcu check
before use" type code.

Ian
Ian Kent April 18, 2025, 1:20 a.m. UTC | #30
On 18/4/25 09:13, Ian Kent wrote:
>
> On 18/4/25 00:28, Christian Brauner wrote:
>> On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior 
>> wrote:
>>> On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
>>>>>      So if there's some userspace process with a broken NFS server 
>>>>> and it
>>>>>      does umount(MNT_DETACH) it will end up hanging every other
>>>>>      umount(MNT_DETACH) on the system because the dealyed_mntput_work
>>>>>      workqueue (to my understanding) cannot make progress.
>>>> Ok, "to my understanding" has been updated after going back and 
>>>> reading
>>>> the delayed work code. Luckily it's not as bad as I thought it is
>>>> because it's queued on system_wq which is multi-threaded so it's at
>>>> least not causing everyone with MNT_DETACH to get stuck. I'm still
>>>> skeptical how safe this all is.
>>> I would (again) throw system_unbound_wq into the game because the 
>>> former
>>> will remain on the CPU on which has been enqueued (if speaking about
>>> multi threading).
>> Yes, good point.
>>
>> However, what about using polled grace periods?
>>
>> A first simple-minded thing to do would be to record the grace period
>> after umount_tree() has finished and the check it in namespace_unlock():
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index d9ca80dcc544..1e7ebcdd1ebc 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable 
>> __ro_after_init;
>>   static struct hlist_head *mountpoint_hashtable __ro_after_init;
>>   static struct kmem_cache *mnt_cache __ro_after_init;
>>   static DECLARE_RWSEM(namespace_sem);
>> +static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
>>   static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
>>   static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>>   static DEFINE_SEQLOCK(mnt_ns_tree_lock);
>> @@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
>>          struct hlist_head head;
>>          struct hlist_node *p;
>>          struct mount *m;
>> +       unsigned long unmount_seq = rcu_unmount_seq;
>>          LIST_HEAD(list);
>>
>>          hlist_move_list(&unmounted, &head);
>> @@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
>>          if (likely(hlist_empty(&head)))
>>                  return;
>>
>> -       synchronize_rcu_expedited();
>> +       cond_synchronize_rcu_expedited(unmount_seq);
>>
>>          hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>>                  hlist_del(&m->mnt_umount);
>> @@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum 
>> umount_tree_flags how)
>>                   */
>>                  mnt_notify_add(p);
>>          }
>> +
>> +       rcu_unmount_seq = get_state_synchronize_rcu();
>>   }
>>
>>   static void shrink_submounts(struct mount *mnt);
>>
>>
>> I'm not sure how much that would buy us. If it doesn't then it should be
>> possible to play with the following possibly strange idea:
>>
>> diff --git a/fs/mount.h b/fs/mount.h
>> index 7aecf2a60472..51b86300dc50 100644
>> --- a/fs/mount.h
>> +++ b/fs/mount.h
>> @@ -61,6 +61,7 @@ struct mount {
>>                  struct rb_node mnt_node; /* node in the ns->mounts 
>> rbtree */
>>                  struct rcu_head mnt_rcu;
>>                  struct llist_node mnt_llist;
>> +               unsigned long mnt_rcu_unmount_seq;
>>          };
>>   #ifdef CONFIG_SMP
>>          struct mnt_pcp __percpu *mnt_pcp;
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index d9ca80dcc544..aae9df75beed 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
>>          struct hlist_head head;
>>          struct hlist_node *p;
>>          struct mount *m;
>> +       bool needs_synchronize_rcu = false;
>>          LIST_HEAD(list);
>>
>>          hlist_move_list(&unmounted, &head);
>> @@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
>>          if (likely(hlist_empty(&head)))
>>                  return;
>>
>> -       synchronize_rcu_expedited();
>> +       hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>> +               if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
>> +                       continue;
>> +
>> +               needs_synchronize_rcu = true;
>> +               break;
>> +       }
>> +
>> +       if (needs_synchronize_rcu)
>> +               synchronize_rcu_expedited();
>>
>>          hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>>                  hlist_del(&m->mnt_umount);
>> @@ -1923,8 +1933,10 @@ static void umount_tree(struct mount *mnt, 
>> enum umount_tree_flags how)
>>                          }
>>                  }
>>                  change_mnt_propagation(p, MS_PRIVATE);
>> -               if (disconnect)
>> +               if (disconnect) {
>> +                       p->mnt_rcu_unmount_seq = 
>> get_state_synchronize_rcu();
>>                          hlist_add_head(&p->mnt_umount, &unmounted);
>> +               }
>>
>>                  /*
>>                   * At this point p->mnt_ns is NULL, notification 
>> will be queued
>>
>> This would allow to elide synchronize rcu calls if they elapsed in the
>> meantime since we moved that mount to the unmounted list.
>
> This last patch is a much better way to do this IMHO.
>
> The approach is so much more like many other places we have "rcu check 
> before use" type code.

If there are several thousand mounts in the discard list, having two
loops could end up a bit slow.

I wonder if we could combine the two loops into one ... I'll think about 
that.
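
For illustration, a rough, untested sketch of where that probably lands: the
put loop cannot run before the grace period is assured, so the best a
"combined" version can likely do is make the first pass a cheap max() over
the cookies recorded at umount time (mnt_rcu_unmount_seq as in the diff
above) and pay for at most one conditional wait:

static void release_unmounted_sketch(struct hlist_head *head)
{
	struct hlist_node *p;
	struct mount *m;
	unsigned long newest_seq = 0;

	/* Pass 1: find the newest unmount-time grace-period cookie. */
	hlist_for_each_entry(m, head, mnt_umount)
		newest_seq = max(newest_seq, m->mnt_rcu_unmount_seq);

	/* Waits only if that cookie's grace period has not yet elapsed. */
	cond_synchronize_rcu_expedited(newest_seq);

	/* Pass 2: now it is safe to drop the references. */
	hlist_for_each_entry_safe(m, p, head, mnt_umount) {
		hlist_del(&m->mnt_umount);
		mntput(&m->mnt);
	}
}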


>
> Ian
>
>
Christian Brauner April 18, 2025, 8:47 a.m. UTC | #31
On Fri, Apr 18, 2025 at 09:20:52AM +0800, Ian Kent wrote:
> On 18/4/25 09:13, Ian Kent wrote:
> > 
> > On 18/4/25 00:28, Christian Brauner wrote:
> > > On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior
> > > wrote:
> > > > On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > > > > >      So if there's some userspace process with a broken
> > > > > > NFS server and it
> > > > > >      does umount(MNT_DETACH) it will end up hanging every other
> > > > > >      umount(MNT_DETACH) on the system because the delayed_mntput_work
> > > > > >      workqueue (to my understanding) cannot make progress.
> > > > > Ok, "to my understanding" has been updated after going back
> > > > > and reading
> > > > > the delayed work code. Luckily it's not as bad as I thought it is
> > > > > because it's queued on system_wq which is multi-threaded so it's at
> > > > > least not causing everyone with MNT_DETACH to get stuck. I'm still
> > > > > skeptical how safe this all is.
> > > > I would (again) throw system_unbound_wq into the game because
> > > > the former
> > > > will remain on the CPU on which it has been enqueued (if speaking about
> > > > multi-threading).
> > > Yes, good point.
> > > 
> > > However, what about using polled grace periods?
> > > 
> > > A first simple-minded thing to do would be to record the grace period
> > > > after umount_tree() has finished and then check it in namespace_unlock():
> > > 
> > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > index d9ca80dcc544..1e7ebcdd1ebc 100644
> > > --- a/fs/namespace.c
> > > +++ b/fs/namespace.c
> > > @@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable
> > > __ro_after_init;
> > >   static struct hlist_head *mountpoint_hashtable __ro_after_init;
> > >   static struct kmem_cache *mnt_cache __ro_after_init;
> > >   static DECLARE_RWSEM(namespace_sem);
> > > +static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
> > >   static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
> > >   static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> > >   static DEFINE_SEQLOCK(mnt_ns_tree_lock);
> > > @@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
> > >          struct hlist_head head;
> > >          struct hlist_node *p;
> > >          struct mount *m;
> > > +       unsigned long unmount_seq = rcu_unmount_seq;
> > >          LIST_HEAD(list);
> > > 
> > >          hlist_move_list(&unmounted, &head);
> > > @@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
> > >          if (likely(hlist_empty(&head)))
> > >                  return;
> > > 
> > > -       synchronize_rcu_expedited();
> > > +       cond_synchronize_rcu_expedited(unmount_seq);
> > > 
> > >          hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> > >                  hlist_del(&m->mnt_umount);
> > > @@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt,
> > > enum umount_tree_flags how)
> > >                   */
> > >                  mnt_notify_add(p);
> > >          }
> > > +
> > > +       rcu_unmount_seq = get_state_synchronize_rcu();
> > >   }
> > > 
> > >   static void shrink_submounts(struct mount *mnt);
> > > 
> > > 
> > > I'm not sure how much that would buy us. If it doesn't then it should be
> > > possible to play with the following possibly strange idea:
> > > 
> > > diff --git a/fs/mount.h b/fs/mount.h
> > > index 7aecf2a60472..51b86300dc50 100644
> > > --- a/fs/mount.h
> > > +++ b/fs/mount.h
> > > @@ -61,6 +61,7 @@ struct mount {
> > >                  struct rb_node mnt_node; /* node in the ns->mounts
> > > rbtree */
> > >                  struct rcu_head mnt_rcu;
> > >                  struct llist_node mnt_llist;
> > > +               unsigned long mnt_rcu_unmount_seq;
> > >          };
> > >   #ifdef CONFIG_SMP
> > >          struct mnt_pcp __percpu *mnt_pcp;
> > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > index d9ca80dcc544..aae9df75beed 100644
> > > --- a/fs/namespace.c
> > > +++ b/fs/namespace.c
> > > @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
> > >          struct hlist_head head;
> > >          struct hlist_node *p;
> > >          struct mount *m;
> > > +       bool needs_synchronize_rcu = false;
> > >          LIST_HEAD(list);
> > > 
> > >          hlist_move_list(&unmounted, &head);
> > > @@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
> > >          if (likely(hlist_empty(&head)))
> > >                  return;
> > > 
> > > -       synchronize_rcu_expedited();
> > > +       hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> > > +               if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> > > +                       continue;

This has a bug. This needs to be:

	/* A grace period has already elapsed. */
	if (poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
		continue;

	/* Oh oh, we have to pay up. */
	needs_synchronize_rcu = true;
	break;

which I'm pretty sure will eradicate most of the performance gain you've
seen, because fundamentally the two versions shouldn't be different. (Note:
I drafted this while on my way out the door.)

I would test the following version where we pay the cost of the
smp_mb() from poll_state_synchronize_rcu() exactly one time:

diff --git a/fs/mount.h b/fs/mount.h
index 7aecf2a60472..51b86300dc50 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -61,6 +61,7 @@ struct mount {
                struct rb_node mnt_node; /* node in the ns->mounts rbtree */
                struct rcu_head mnt_rcu;
                struct llist_node mnt_llist;
+               unsigned long mnt_rcu_unmount_seq;
        };
 #ifdef CONFIG_SMP
        struct mnt_pcp __percpu *mnt_pcp;
diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..dd367c54bc29 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
        struct hlist_head head;
        struct hlist_node *p;
        struct mount *m;
+       unsigned long mnt_rcu_unmount_seq = 0;
        LIST_HEAD(list);

        hlist_move_list(&unmounted, &head);
@@ -1817,7 +1818,10 @@ static void namespace_unlock(void)
        if (likely(hlist_empty(&head)))
                return;

-       synchronize_rcu_expedited();
+       hlist_for_each_entry_safe(m, p, &head, mnt_umount)
+               mnt_rcu_unmount_seq = max(m->mnt_rcu_unmount_seq, mnt_rcu_unmount_seq);
+
+       cond_synchronize_rcu_expedited(mnt_rcu_unmount_seq);

        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
@@ -1923,8 +1927,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
                        }
                }
                change_mnt_propagation(p, MS_PRIVATE);
-               if (disconnect)
+               if (disconnect) {
+                       p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
                        hlist_add_head(&p->mnt_umount, &unmounted);
+               }

                /*
                 * At this point p->mnt_ns is NULL, notification will be queued

If this doesn't help I had considered recording the rcu sequence number
during __legitimize_mnt() in the mounts. But we likely can't do that:
get_state_synchronize_rcu() is expensive because it inserts an
smp_mb(), and that would likely be noticeable during path lookup. This
would also hinge on the notion that the last store of the rcu sequence
number is guaranteed to be seen when we check them in namespace_unlock().

Another possibly insane idea (haven't fully thought it out but throwing
it out there to test): allocate a percpu counter for each mount and
increment it each time we enter __legitimize_mnt() and decrement it when
we leave __legitimize_mnt(). During umount_tree() check the percpu sum
for each mount after it's been added to the @unmounted list.

If we see any mount that has a non-zero count we set a global
@needs_synchronize_rcu to true and stop counting for the other mounts
(saving percpu summing cycles). Then call or elide
synchronize_rcu_expedited() based on the @needs_synchronize_rcu boolean
in namespace_unlock().

The percpu might make this cheap enough for __legitimize_mnt() to be
workable (ignoring any other pitfalls I've currently not had time to
wrap my head around).
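
Very roughly, and ignoring the pitfalls above, that bookkeeping could look
something like the following sketch (the mnt_lookup_refs field and the helper
names are invented purely for illustration, this is not a real patch):

/* Hypothetical extra field in struct mount, allocated with alloc_percpu(int):
 *	int __percpu *mnt_lookup_refs;
 */

static inline void legitimize_enter(struct mount *m)
{
	this_cpu_inc(*m->mnt_lookup_refs);	/* entering __legitimize_mnt() */
}

static inline void legitimize_exit(struct mount *m)
{
	this_cpu_dec(*m->mnt_lookup_refs);	/* leaving __legitimize_mnt() */
}

/* Called from umount_tree() after the mount was put on @unmounted. */
static bool mount_maybe_walked(struct mount *m)
{
	int cpu, sum = 0;

	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(m->mnt_lookup_refs, cpu);

	/* Non-zero sum: an RCU-walk may still be in flight, pay for the sync. */
	return sum != 0;
}
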
Christian Brauner April 18, 2025, 8:59 a.m. UTC | #32
On Fri, Apr 18, 2025 at 08:31:03AM +0800, Ian Kent wrote:
> On 17/4/25 23:12, Christian Brauner wrote:
> > On Thu, Apr 17, 2025 at 01:31:40PM +0200, Christian Brauner wrote:
> > > On Thu, Apr 17, 2025 at 06:17:01PM +0800, Ian Kent wrote:
> > > > On 17/4/25 17:01, Christian Brauner wrote:
> > > > > On Wed, Apr 16, 2025 at 11:11:51PM +0100, Mark Brown wrote:
> > > > > > On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> > > > > > > Defer releasing the detached file-system when calling namespace_unlock()
> > > > > > > during a lazy umount to return faster.
> > > > > > > 
> > > > > > > When requesting MNT_DETACH, the caller does not expect the file-system
> > > > > > > to be shut down upon returning from the syscall. Calling
> > > > > > > synchronize_rcu_expedited() has a significant cost on RT kernel that
> > > > > > > defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> > > > > > > mount in a separate list and put it on a workqueue to run post RCU
> > > > > > > grace-period.
> > > > > > For the past couple of days we've been seeing failures in a bunch of LTP
> > > > > > filesystem related tests on various arm64 systems.  The failures are
> > > > > > mostly (I think all) in the form:
> > > > > > 
> > > > > > 20101 10:12:40.378045  tst_test.c:1833: TINFO: === Testing on vfat ===
> > > > > > 20102 10:12:40.385091  tst_test.c:1170: TINFO: Formatting /dev/loop0 with vfat opts='' extra opts=''
> > > > > > 20103 10:12:40.391032  mkfs.vfat: unable to open /dev/loop0: Device or resource busy
> > > > > > 20104 10:12:40.395953  tst_test.c:1170: TBROK: mkfs.vfat failed with exit code 1
> > > > > > 
> > > > > > ie, a failure to stand up the test environment on the loopback device
> > > > > > all happening immediately after some other filesystem related test which
> > > > > > also used the loop device.  A bisect points to commit a6c7a78f1b6b97
> > > > > > which is this, which does look rather relevant.  LTP is obviously being
> > > > > > very much an edge case here.
> > > > > Hah, here's something I didn't consider and that I should've caught.
> > > > > 
> > > > > Look, on current mainline, no matter whether it's MNT_DETACH/UMOUNT_SYNC or
> > > > > non-MNT_DETACH/UMOUNT_SYNC, the mntput() calls after the
> > > > > synchronize_rcu_expedited() calls will end up in task_work():
> > > > > 
> > > > >           if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
> > > > >                   struct task_struct *task = current;
> > > > >                   if (likely(!(task->flags & PF_KTHREAD))) {
> > > > >                           init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
> > > > >                           if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
> > > > >                                   return;
> > > > >                   }
> > > > >                   if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
> > > > >                           schedule_delayed_work(&delayed_mntput_work, 1);
> > > > >                   return;
> > > > >           }
> > > > > 
> > > > > because all of those mntput()s are done from the task's context.
> > > > > 
> > > > > IOW, if userspace does umount(MNT_DETACH) and the task has returned to
> > > > > userspace it is guaranteed that all calls to cleanup_mnt() are done.
> > > > > 
> > > > > With your change that simply isn't true anymore. The call to
> > > > > queue_rcu_work() will offload those mntput() calls to a kthread.
> > > > > That in turn means all those mntputs end up on the delayed_mntput_work()
> > > > > queue. So the mounts aren't cleaned up by the time the task returns to
> > > > > userspace.
> > > > > 
> > > > > And that's likely problematic even for the explicit MNT_DETACH use-case
> > > > > because it means EBUSY errors are a lot more likely to be seen by
> > > > > concurrent mounters especially for loop devices.
> > > > > 
> > > > > And fwiw, this is exactly what I pointed out in a prior posting to this
> > > > > patch series.
> > > > And I didn't understand what you said then, but this problem is more
> > > > understandable to me now.
> > I mean I'm saying it could be problematic for the MNT_DETACH case. I'm
> > not sure how likely it is. If some process P1 does MNT_DETACH on a loop
> > device and then another process P2 wants to use that loop device and
> > sess EBUSY then we don't care. That can already happen. But even in this
> > case I'm not sure if there aren't subtle ways where this will bite us.
> > 
> > But there's two other problems:
> > 
> > (1) The real issue is with the same process P1 doing stupid stuff that
> >      just happened to work. For example if there's code out there that
> >      does a MNT_DETACH on a filesystem that uses a loop device
> >      immediately followed by the desire to reuse the loop device:
> > 
> >      It doesn't matter whether such code must in theory already be
> >      prepared to handle the case of seeing EBUSY after the MNT_DETACH. If
> >      this currently just happens to work because we guarantee that the
> >      last mntput() and cleanup_mnt() will have been done when the caller
> >      returns to userspace, it's a uapi break, plain and simple.
> > 
> >      This implicit guarantee isn't there anymore after your patch because
> >      the final mntput() is done from the system_unbound_wq, which has
> >      the consequence that the cleanup_mnt() is done from the
> >      delayed_mntput_work workqueue. And that leads to problem number
> >      (2).
> 
> This is a bit puzzling to me.
>
> All the mounts in the tree should be unhashed before any of these mntput()
> calls, so I didn't think it would be found. I'll need to look at the loop
> device case to work out how it's finding (or holding onto) the stale mount
> and concluding it's busy.

Say you do:

mount(/dev/loop0 /mnt);

Unmounting that thing with or without MNT_DETACH will have the following
effect (if no path lookup happens and it isn't kept busy otherwise):

After the task returns the loop device will be free again, because
deactivate_super() will have been called and the loop device is
released when the relevant filesystems release the claim on the block
device.

IOW, if the task that just returned wanted to reuse the same loop device
right after the umount() returned for another image file, it could. It
would succeed with or without MNT_DETACH, because the task_work means
that cleanup_mnt() will have been called when the task returns to
userspace.

But when we start offloading this to a workqueue, that "guarantee" isn't
there anymore specifically for MNT_DETACH. If the system is mighty busy
the system_unbound_wq that does the mntput() and the delayed_mntput_work
workqueue that would ultimately do the cleanup_mnt() and thus
deactivate_super() to release the loop device could be run way way after
the task has returned to userspace. Thus, the task could prepare a new
image file and whatever and then try to use the same loop device and it
would fail because the workqueue hasn't gotten around to the item yet.
And it would be pretty opaque to the caller. I have no idea how likely
that is to become a problem. I'm just saying that is a not-so-subtle
change in behavior that might be noticeable.
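
Concretely, the kind of user-space sequence that relies on this ordering looks
roughly like the following (illustrative only, simplified, no real error
handling; the device and mountpoint are made up):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
	/* Lazily detach whatever is mounted from /dev/loop0 on /mnt. */
	if (umount2("/mnt", MNT_DETACH) < 0)
		perror("umount2");

	/*
	 * On current mainline, cleanup_mnt() has run via task_work by the
	 * time umount2() returns, so the exclusive open below succeeds.
	 * If the final mntput() is instead deferred to a workqueue, the
	 * superblock may still hold the block device here and this open
	 * can fail with EBUSY, much like mkfs.vfat in the LTP report.
	 */
	int fd = open("/dev/loop0", O_RDWR | O_EXCL);
	if (fd < 0) {
		perror("open /dev/loop0");
		return 1;
	}
	close(fd);
	return 0;
}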
Christian Brauner April 18, 2025, 12:55 p.m. UTC | #33
On Fri, Apr 18, 2025 at 10:47:10AM +0200, Christian Brauner wrote:
> [...]

Appended is a version of the queue_rcu_work() patch that I think should
work... I've drafted it just now and folded all the changes into your
original patch. I have not at all tested this and I'm not really working
today because it's a public holiday but if you want to play with it
please do so.

I'm somewhat torn. I can see that this is painful on RT kernels and I
see that we should do something about it, but I'm not sure whether I'm
willing to incur these new shenanigans on non-RT kernels as well if we
don't have to, since the expedited grace-period sync works just fine on
non-RT kernels. So maybe we just keep it and special-case the RT case
with queue_rcu_work(). I haven't decided whether I want to do that or
not.
diff mbox series

Patch

diff --git a/fs/namespace.c b/fs/namespace.c
index 14935a0500a2..e5b0b920dd97 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -45,6 +45,11 @@  static unsigned int m_hash_shift __ro_after_init;
 static unsigned int mp_hash_mask __ro_after_init;
 static unsigned int mp_hash_shift __ro_after_init;
 
+struct deferred_free_mounts {
+	struct rcu_work rwork;
+	struct hlist_head release_list;
+};
+
 static __initdata unsigned long mhash_entries;
 static int __init set_mhash_entries(char *str)
 {
@@ -1789,11 +1794,29 @@  static bool need_notify_mnt_list(void)
 }
 #endif
 
-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list)
 {
-	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+
+	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
+}
+
+static void defer_free_mounts(struct work_struct *work)
+{
+	struct deferred_free_mounts *d = container_of(
+		to_rcu_work(work), struct deferred_free_mounts, rwork);
+
+	free_mounts(&d->release_list);
+	kfree(d);
+}
+
+static void __namespace_unlock(bool lazy)
+{
+	HLIST_HEAD(head);
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1817,12 +1840,27 @@  static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	if (lazy) {
+		struct deferred_free_mounts *d =
+			kmalloc(sizeof(*d), GFP_KERNEL);
 
-	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
-		hlist_del(&m->mnt_umount);
-		mntput(&m->mnt);
+		if (unlikely(!d))
+			goto out;
+
+		hlist_move_list(&head, &d->release_list);
+		INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+		queue_rcu_work(system_wq, &d->rwork);
+		return;
 	}
+
+out:
+	synchronize_rcu_expedited();
+	free_mounts(&head);
+}
+
+static inline void namespace_unlock(void)
+{
+	__namespace_unlock(false);
 }
 
 static inline void namespace_lock(void)
@@ -2056,7 +2094,7 @@  static int do_umount(struct mount *mnt, int flags)
 	}
 out:
 	unlock_mount_hash();
-	namespace_unlock();
+	__namespace_unlock(flags & MNT_DETACH);
 	return retval;
 }
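
For reference, the RT-only special-casing mentioned in the discussion could be
sketched on top of the lazy path above roughly like this (untested,
illustrative only; it reuses the helpers from the patch and picks
system_unbound_wq as suggested earlier in the thread), replacing the tail of
__namespace_unlock():

	/* Sketch: only PREEMPT_RT pays a big price for the expedited sync;
	 * everywhere else keep the synchronous behaviour and the implicit
	 * "cleanup done before returning to userspace" expectation. */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && lazy) {
		struct deferred_free_mounts *d = kmalloc(sizeof(*d), GFP_KERNEL);

		if (d) {
			hlist_move_list(&head, &d->release_list);
			INIT_RCU_WORK(&d->rwork, defer_free_mounts);
			queue_rcu_work(system_unbound_wq, &d->rwork);
			return;
		}
		/* Fall back to the synchronous path if allocation fails. */
	}

	synchronize_rcu_expedited();
	free_mounts(&head);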