[3/5] fs: lockless mntns rbtree lookup

Message ID: 20241210-work-mount-rbtree-lockless-v1-3-338366b9bbe4@kernel.org (mailing list archive)
State: New
Series: fs: lockless mntns rbtree lookup

Commit Message

Christian Brauner Dec. 10, 2024, 8:57 p.m. UTC
Currently we use a read-write lock but for the simple search case we can
make this lockless. Creating a new mount namespace is a rather rare
event compared with querying mounts in a foreign mount namespace. Once
this is picked up by, e.g., systemd to list mounts in another mount
namespace in its isolated services or in containers this will be used a
lot, so this seems worth doing.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mount.h     |  5 ++-
 fs/namespace.c | 99 ++++++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 73 insertions(+), 31 deletions(-)

Comments

Christian Brauner Dec. 11, 2024, 8:13 a.m. UTC | #1
Hey Peter,

I had a question for you and meant to Cc you but forgot. This makes one
of our rbtree uses lockless using the seqlock pattern. See below.

I saw that in 50a38035ed5c ("rbtree: provide rb_find_rcu() /
rb_find_add_rcu()") you added new _rcu() variants.

We're using another search function that allows us to walk the tree in
either direction:

        guard(read_lock)(&mnt_ns_tree_lock);
        for (;;) {
                struct rb_node *node;

                if (previous)
                        node = rb_prev(&mntns->mnt_ns_tree_node);
                else
                        node = rb_next(&mntns->mnt_ns_tree_node);
                if (!node)
                        return ERR_PTR(-ENOENT);

                mntns = node_to_mnt_ns(node);
                node = &mntns->mnt_ns_tree_node;

But afaict neither rb_prev() nor rb_next() are rcu safe. Have you ever
considered adding rcu safe variants for those two as well?

Thanks!
Christian

On Tue, Dec 10, 2024 at 09:57:59PM +0100, Christian Brauner wrote:
> Currently we use a read-write lock but for the simple search case we can
> make this lockless. Creating a new mount namespace is a rather rare
> event compared with querying mounts in a foreign mount namespace. Once
> this is picked up by, e.g., systemd to list mounts in another mount
> namespace in its isolated services or in containers this will be used a
> lot, so this seems worth doing.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/mount.h     |  5 ++-
>  fs/namespace.c | 99 ++++++++++++++++++++++++++++++++++++++++------------------
>  2 files changed, 73 insertions(+), 31 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index 185fc56afc13338f8185fe818051444d540cbd5b..3c3763d8ae821d6a117c528808dbc94d0251f964 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -16,7 +16,10 @@ struct mnt_namespace {
>  	u64 event;
>  	unsigned int		nr_mounts; /* # of mounts in the namespace */
>  	unsigned int		pending_mounts;
> -	struct rb_node		mnt_ns_tree_node; /* node in the mnt_ns_tree */
> +	union {
> +		struct rb_node	mnt_ns_tree_node; /* node in the mnt_ns_tree */
> +		struct rcu_head mnt_ns_rcu;
> +	};
>  	refcount_t		passive; /* number references not pinning @mounts */
>  } __randomize_layout;
>  
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 10fa18dd66018fadfdc9d18c59a851eed7bd55ad..21e990482c5b2e1844d17413b55b58803fa7b008 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
>  static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
>  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>  static DEFINE_RWLOCK(mnt_ns_tree_lock);
> +static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
> +
>  static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
>  
>  struct mount_kattr {
> @@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
>   */
>  __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
>  
> -static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
> -{
> -	u64 seq_b = ns->seq;
> -
> -	if (seq < seq_b)
> -		return -1;
> -	if (seq > seq_b)
> -		return 1;
> -	return 0;
> -}
> -
>  static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
>  {
>  	if (!node)
> @@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
>  	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
>  }
>  
> -static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
> +static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
>  {
>  	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
>  	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
>  	u64 seq_a = ns_a->seq;
> +	u64 seq_b = ns_b->seq;
>  
> -	return mnt_ns_cmp(seq_a, ns_b) < 0;
> +	if (seq_a < seq_b)
> +		return -1;
> +	if (seq_a > seq_b)
> +		return 1;
> +	return 0;
> +}
> +
> +static inline void mnt_ns_tree_write_lock(void)
> +{
> +	write_lock(&mnt_ns_tree_lock);
> +	write_seqcount_begin(&mnt_ns_tree_seqcount);
> +}
> +
> +static inline void mnt_ns_tree_write_unlock(void)
> +{
> +	write_seqcount_end(&mnt_ns_tree_seqcount);
> +	write_unlock(&mnt_ns_tree_lock);
>  }
>  
>  static void mnt_ns_tree_add(struct mnt_namespace *ns)
>  {
> -	guard(write_lock)(&mnt_ns_tree_lock);
> -	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
> +	struct rb_node *node;
> +
> +	mnt_ns_tree_write_lock();
> +	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
> +	mnt_ns_tree_write_unlock();
> +
> +	WARN_ON_ONCE(node);
>  }
>  
>  static void mnt_ns_release(struct mnt_namespace *ns)
> @@ -150,15 +163,24 @@ static void mnt_ns_release(struct mnt_namespace *ns)
>  }
>  DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
>  
> +static void mnt_ns_release_rcu(struct rcu_head *rcu)
> +{
> +	struct mnt_namespace *mnt_ns;
> +
> +	mnt_ns = container_of(rcu, struct mnt_namespace, mnt_ns_rcu);
> +	mnt_ns_release(mnt_ns);
> +}
> +
>  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
>  {
>  	/* remove from global mount namespace list */
>  	if (!is_anon_ns(ns)) {
> -		guard(write_lock)(&mnt_ns_tree_lock);
> +		mnt_ns_tree_write_lock();
>  		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
> +		mnt_ns_tree_write_unlock();
>  	}
>  
> -	mnt_ns_release(ns);
> +	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
>  }
>  
>  /*
> @@ -168,23 +190,23 @@ static void mnt_ns_tree_remove(struct mnt_namespace *ns)
>  static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
>  {
>  	struct rb_node *node = mnt_ns_tree.rb_node;
> -	struct mnt_namespace *ret = NULL;
> +	struct mnt_namespace *mnt_ns = NULL;
>  
> -	lockdep_assert_held(&mnt_ns_tree_lock);
> +	lockdep_assert(rcu_read_lock_held());
>  
>  	while (node) {
>  		struct mnt_namespace *n = node_to_mnt_ns(node);
>  
>  		if (mnt_ns_id <= n->seq) {
> -			ret = node_to_mnt_ns(node);
> +			mnt_ns = node_to_mnt_ns(node);
>  			if (mnt_ns_id == n->seq)
>  				break;
> -			node = node->rb_left;
> +			node = rcu_dereference_raw(node->rb_left);
>  		} else {
> -			node = node->rb_right;
> +			node = rcu_dereference_raw(node->rb_right);
>  		}
>  	}
> -	return ret;
> +	return mnt_ns;
>  }
>  
>  /*
> @@ -194,18 +216,35 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
>   * namespace the @namespace_sem must first be acquired. If the namespace has
>   * already shut down before acquiring @namespace_sem, {list,stat}mount() will
>   * see that the mount rbtree of the namespace is empty.
> + *
> + * Note the lookup is lockless, protected by a sequence counter. We only
> + * need to guard against false negatives as false positives aren't
> + * possible. So if we didn't find a mount namespace and the sequence
> + * counter has changed we need to retry. If the sequence counter is
> + * still the same we know the search actually failed.
>   */
>  static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
>  {
> -       struct mnt_namespace *ns;
> +	struct mnt_namespace *ns;
> +	unsigned int seq;
>  
> -       guard(read_lock)(&mnt_ns_tree_lock);
> -       ns = mnt_ns_find_id_at(mnt_ns_id);
> -       if (!ns || ns->seq != mnt_ns_id)
> -               return NULL;
> +	guard(rcu)();
> +	do {
> +		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
> +		ns = mnt_ns_find_id_at(mnt_ns_id);
> +		if (ns)
> +			break;
> +	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
>  
> -       refcount_inc(&ns->passive);
> -       return ns;
> +	if (!ns || ns->seq != mnt_ns_id)
> +		return NULL;
> +
> +	/*
> +	 * The last reference count is put after an RCU delay so we
> +	 * don't need to use refcount_inc_not_zero().
> +	 */
> +	refcount_inc(&ns->passive);
> +	return ns;
>  }
>  
>  static inline void lock_mount_hash(void)
> 
> -- 
> 2.45.2
>
Peter Zijlstra Dec. 11, 2024, 9:02 a.m. UTC | #2
On Wed, Dec 11, 2024 at 09:13:16AM +0100, Christian Brauner wrote:
> Hey Peter,
> 
> I had a question for you and meant to Cc you but forgot. This makes one
> of our rbtree uses lockless using the seqlock pattern. See below.
> 
> I saw that in 50a38035ed5c ("rbtree: provide rb_find_rcu() /
> rb_find_add_rcu()") you added new _rcu() variants.

The original patches are much older, see:

  d72da4a4d973 ("rbtree: Make lockless searches non-fatal")
  ade3f510f93a ("rbtree: Implement generic latch_tree")

(aw gawd, almost 10 years)

> We're using another search function that allows us to walk the tree in
> either direction:
> 
>         guard(read_lock)(&mnt_ns_tree_lock);
>         for (;;) {
>                 struct rb_node *node;
> 
>                 if (previous)
>                         node = rb_prev(&mntns->mnt_ns_tree_node);
>                 else
>                         node = rb_next(&mntns->mnt_ns_tree_node);
>                 if (!node)
>                         return ERR_PTR(-ENOENT);
> 
>                 mntns = node_to_mnt_ns(node);
>                 node = &mntns->mnt_ns_tree_node;
> 
> But afaict neither rb_prev() nor rb_next() are rcu safe. Have you ever
> considered adding rcu safe variants for those two as well?

Urgh, those are hard :-(

So back when I did the lockless lookups, I only ensured the child
pointers were 'stable'. I did not deal with the parent (and colour)
pointer.

The next/prev iterators very much rely on the parent pointer.
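
For reference, a slightly simplified paraphrase of rb_next() (assuming the
stock lib/rbtree.c logic, not copied verbatim) shows where that dependency
bites: once a node has no right child the walk climbs back up through
rb_parent(), and those parent pointers are exactly what the rotations don't
keep stable for lockless readers:

        /* Simplified paraphrase of rb_next(); rb_prev() is its mirror image. */
        struct rb_node *rb_next(const struct rb_node *node)
        {
                struct rb_node *parent;

                if (RB_EMPTY_NODE(node))
                        return NULL;

                /* Right child: successor is the leftmost node of that subtree. */
                if (node->rb_right) {
                        node = node->rb_right;
                        while (node->rb_left)
                                node = node->rb_left;
                        return (struct rb_node *)node;
                }

                /*
                 * No right child: climb via the parent pointers until we come
                 * up from a left child; that parent is the successor. These
                 * parent pointers are what a concurrent rotation may rewrite.
                 */
                while ((parent = rb_parent(node)) && node == parent->rb_right)
                        node = parent;

                return parent;
        }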

Someone would have to go through the tree rotations again and see if it
is possible to also update the parent pointers in such a way as to not
create temporary loops.

Notably, the thing you want to avoid is an interrupt doing a tree
traversal on the CPU that's doing the tree rotation getting stuck.

The other, possibly even harder option would be to (finally) implement
threaded RB trees, where the current NULL child pointers become a 'list'
pointer to the next (in-order) element. But that too makes rotations
'interesting' and must avoid creating loops.

But in all those cases you have to also consider the ramifications of
rb_next/prev hitting a modification, I suspect like with the simple
lookup, you can miss entire subtrees. So depending on the requirements,
this might not be suitable for you.

The far easier option might be to simply add a list_head along with the
rb_node and iterate that -- normally the problem with this is that you
can't easily move elements around in an RCU-list, but luck will have it
you don't move elements around. Your tree location is very static
afaict.
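
A minimal sketch of that list_head approach, just to make the idea concrete
(hypothetical field and helper names, forward direction only, not part of
this patch):

        static LIST_HEAD(mnt_ns_list);  /* protected by mnt_ns_tree_lock */

        struct mnt_namespace {
                /* ... */
                struct rb_node    mnt_ns_tree_node;
                struct list_head  mnt_ns_list_node;  /* kept sorted by ->seq */
                /* ... */
        };

        static void mnt_ns_tree_add(struct mnt_namespace *ns)
        {
                mnt_ns_tree_write_lock();
                rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
                /*
                 * Assumes namespaces are added in increasing ->seq order so
                 * appending keeps the list sorted; otherwise do an ordered
                 * insert under the write lock.
                 */
                list_add_tail_rcu(&ns->mnt_ns_list_node, &mnt_ns_list);
                mnt_ns_tree_write_unlock();
        }

        /* Lockless replacement for the rb_next() direction of the walk. */
        static struct mnt_namespace *mnt_ns_next(struct mnt_namespace *mntns)
        {
                struct list_head *node;
                struct mnt_namespace *next;

                guard(rcu)();
                node = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list_node));
                if (node == &mnt_ns_list)
                        return ERR_PTR(-ENOENT);

                next = list_entry(node, struct mnt_namespace, mnt_ns_list_node);
                /* Pin it before leaving the RCU section, cf. lookup_mnt_ns(). */
                refcount_inc(&next->passive);
                return next;
        }

The rb_prev() direction is the catch: list_del_rcu() poisons ->prev, so a
backwards walk under RCU would need either a delete variant that leaves
->prev intact or a locked (or seqcount-retry) fallback for that direction.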
Christian Brauner Dec. 11, 2024, 9:39 a.m. UTC | #3
On Wed, Dec 11, 2024 at 10:02:12AM +0100, Peter Zijlstra wrote:
> On Wed, Dec 11, 2024 at 09:13:16AM +0100, Christian Brauner wrote:
> > Hey Peter,
> > 
> > I had a question for you and meant to Cc you but forgot. This makes one
> > of our rbtree uses lockless using the seqlock pattern. See below.
> > 
> > I saw that in 50a38035ed5c ("rbtree: provide rb_find_rcu() /
> > rb_find_add_rcu()") you added new _rcu() variants.
> 
> The original patches are much older, see:
> 
>   d72da4a4d973 ("rbtree: Make lockless searches non-fatal")
>   ade3f510f93a ("rbtree: Implement generic latch_tree")
> 
> (aw gawd, almost 10 years)
> 
> > We're using another search function that allows us to walk the tree in
> > either direction:
> > 
> >         guard(read_lock)(&mnt_ns_tree_lock);
> >         for (;;) {
> >                 struct rb_node *node;
> > 
> >                 if (previous)
> >                         node = rb_prev(&mntns->mnt_ns_tree_node);
> >                 else
> >                         node = rb_next(&mntns->mnt_ns_tree_node);
> >                 if (!node)
> >                         return ERR_PTR(-ENOENT);
> > 
> >                 mntns = node_to_mnt_ns(node);
> >                 node = &mntns->mnt_ns_tree_node;
> > 
> > But afaict neither rb_prev() nor rb_next() are rcu safe. Have you ever
> > considered adding rcu safe variants for those two as well?
> 
> Urgh, those are hard :-(
> 
> So back when I did the lockless lookups, I only ensured the child
> pointers were 'stable'. I did not deal with the parent (and colour)
> pointer.
> 
> The next/prev iterators very much rely on the parent pointer.
> 
> Someone would have to go through the tree rotations again and see if it
> is possible to also update the parent pointers in such a way as to not
> create temporary loops.
> 
> Notably, the thing you want to avoid is an interrupt doing a tree
> traversal on the CPU that's doing the tree rotation getting stuck.
> 
> The other, possibly even harder option would be to (finally) implement
> threaded RB trees, where the current NULL child pointers become a 'list'
> pointer to the next (in-order) element. But that too makes rotations
> 'interesting' and must avoid creating loops.

Ah, "fun".

> 
> But in all those cases you have to also consider the ramifications of
> rb_next/prev hitting a modification, I suspect like with the simple
> lookup, you can miss entire subtrees. So depending on the requirements,

I think it's fine as long as we have the ability to detect that the tree
was modified and retry - like in the simple lookup case. I was happy to
see that you guarantee that only false negatives are possible. Last I
looked at this some time ago, Paul was much more pessimistic about
walking the tree with seqlock and rcu. It's good that this now works
afaik.

> this might not be suitable for you.
> 
> The far easier option might be to simply add a list_head along with the
> rb_node and iterate that -- normally the problem with this is that you
> can't easily move elements around in an RCU-list, but luck will have it
> you don't move elements around. Your tree location is very static

Good idea. Yeah, we don't move anything around. And our insertions are
done with a key - the 64-bit mount namespace id - that's unique for the
system lifetime.

> afaict.
>

Patch

diff --git a/fs/mount.h b/fs/mount.h
index 185fc56afc13338f8185fe818051444d540cbd5b..3c3763d8ae821d6a117c528808dbc94d0251f964 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -16,7 +16,10 @@  struct mnt_namespace {
 	u64 event;
 	unsigned int		nr_mounts; /* # of mounts in the namespace */
 	unsigned int		pending_mounts;
-	struct rb_node		mnt_ns_tree_node; /* node in the mnt_ns_tree */
+	union {
+		struct rb_node	mnt_ns_tree_node; /* node in the mnt_ns_tree */
+		struct rcu_head mnt_ns_rcu;
+	};
 	refcount_t		passive; /* number references not pinning @mounts */
 } __randomize_layout;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 10fa18dd66018fadfdc9d18c59a851eed7bd55ad..21e990482c5b2e1844d17413b55b58803fa7b008 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -79,6 +79,8 @@  static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
+
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
 
 struct mount_kattr {
@@ -105,17 +107,6 @@  EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-	u64 seq_b = ns->seq;
-
-	if (seq < seq_b)
-		return -1;
-	if (seq > seq_b)
-		return 1;
-	return 0;
-}
-
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
 	if (!node)
@@ -123,19 +114,41 @@  static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }
 
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
 	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
 	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
 	u64 seq_a = ns_a->seq;
+	u64 seq_b = ns_b->seq;
 
-	return mnt_ns_cmp(seq_a, ns_b) < 0;
+	if (seq_a < seq_b)
+		return -1;
+	if (seq_a > seq_b)
+		return 1;
+	return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+	write_lock(&mnt_ns_tree_lock);
+	write_seqcount_begin(&mnt_ns_tree_seqcount);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+	write_seqcount_end(&mnt_ns_tree_seqcount);
+	write_unlock(&mnt_ns_tree_lock);
 }
 
 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-	guard(write_lock)(&mnt_ns_tree_lock);
-	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+	struct rb_node *node;
+
+	mnt_ns_tree_write_lock();
+	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+	mnt_ns_tree_write_unlock();
+
+	WARN_ON_ONCE(node);
 }
 
 static void mnt_ns_release(struct mnt_namespace *ns)
@@ -150,15 +163,24 @@  static void mnt_ns_release(struct mnt_namespace *ns)
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
 
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+	struct mnt_namespace *mnt_ns;
+
+	mnt_ns = container_of(rcu, struct mnt_namespace, mnt_ns_rcu);
+	mnt_ns_release(mnt_ns);
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
 	/* remove from global mount namespace list */
 	if (!is_anon_ns(ns)) {
-		guard(write_lock)(&mnt_ns_tree_lock);
+		mnt_ns_tree_write_lock();
 		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+		mnt_ns_tree_write_unlock();
 	}
 
-	mnt_ns_release(ns);
+	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }
 
 /*
@@ -168,23 +190,23 @@  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
 {
 	struct rb_node *node = mnt_ns_tree.rb_node;
-	struct mnt_namespace *ret = NULL;
+	struct mnt_namespace *mnt_ns = NULL;
 
-	lockdep_assert_held(&mnt_ns_tree_lock);
+	lockdep_assert(rcu_read_lock_held());
 
 	while (node) {
 		struct mnt_namespace *n = node_to_mnt_ns(node);
 
 		if (mnt_ns_id <= n->seq) {
-			ret = node_to_mnt_ns(node);
+			mnt_ns = node_to_mnt_ns(node);
 			if (mnt_ns_id == n->seq)
 				break;
-			node = node->rb_left;
+			node = rcu_dereference_raw(node->rb_left);
 		} else {
-			node = node->rb_right;
+			node = rcu_dereference_raw(node->rb_right);
 		}
 	}
-	return ret;
+	return mnt_ns;
 }
 
 /*
@@ -194,18 +216,35 @@  static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
  * namespace the @namespace_sem must first be acquired. If the namespace has
  * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless, protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-       struct mnt_namespace *ns;
+	struct mnt_namespace *ns;
+	unsigned int seq;
 
-       guard(read_lock)(&mnt_ns_tree_lock);
-       ns = mnt_ns_find_id_at(mnt_ns_id);
-       if (!ns || ns->seq != mnt_ns_id)
-               return NULL;
+	guard(rcu)();
+	do {
+		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
+		ns = mnt_ns_find_id_at(mnt_ns_id);
+		if (ns)
+			break;
+	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
 
-       refcount_inc(&ns->passive);
-       return ns;
+	if (!ns || ns->seq != mnt_ns_id)
+		return NULL;
+
+	/*
+	 * The last reference count is put after an RCU delay so we
+	 * don't need to use refcount_inc_not_zero().
+	 */
+	refcount_inc(&ns->passive);
+	return ns;
 }
 
 static inline void lock_mount_hash(void)