Message ID: 20241210-work-mount-rbtree-lockless-v1-3-338366b9bbe4@kernel.org
State:      New
Series:     fs: lockless mntns rbtree lookup
Hey Peter,

I had a question for you and meant to Cc you but forgot. This makes one
of our rbtree uses lockless using the seqlock pattern. See below.

I saw that in 50a38035ed5c ("rbtree: provide rb_find_rcu() /
rb_find_add_rcu()") you added new _rcu() variants.

We're using another search function that allows us to walk the tree in
either direction:

        guard(read_lock)(&mnt_ns_tree_lock);
        for (;;) {
                struct rb_node *node;

                if (previous)
                        node = rb_prev(&mntns->mnt_ns_tree_node);
                else
                        node = rb_next(&mntns->mnt_ns_tree_node);
                if (!node)
                        return ERR_PTR(-ENOENT);

                mntns = node_to_mnt_ns(node);
                node = &mntns->mnt_ns_tree_node;

But afaict neither rb_prev() nor rb_next() are rcu safe. Have you ever
considered adding rcu safe variants for those two as well?

Thanks!
Christian

On Tue, Dec 10, 2024 at 09:57:59PM +0100, Christian Brauner wrote:
> Currently we use a read-write lock but for the simple search case we
> can make this lockless. Creating a new mount namespace is a rather
> rare event compared with querying mounts in a foreign mount namespace.
> Once this is picked up by e.g., systemd to list mounts in another
> mount namespace in its isolated services or in containers this will be
> used a lot so this seems worthwhile doing.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/mount.h     |  5 ++-
>  fs/namespace.c | 99 ++++++++++++++++++++++++++++++++++++++++------------------
>  2 files changed, 73 insertions(+), 31 deletions(-)
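For context, here is a simplified sketch of the in-order successor walk
that rb_next() performs; it is not the actual lib/rbtree.c code, just an
illustration of why the iterator depends on the parent pointer that the
lockless-lookup machinery does not keep stable (as Peter explains below).

#include <linux/rbtree.h>

/*
 * Simplified sketch of the in-order successor walk (the real code is
 * rb_next() in lib/rbtree.c, which additionally handles empty nodes).
 */
static struct rb_node *rb_next_sketch(const struct rb_node *node)
{
        struct rb_node *parent;

        if (node->rb_right) {
                /* Leftmost node of the right subtree. */
                node = node->rb_right;
                while (node->rb_left)
                        node = node->rb_left;
                return (struct rb_node *)node;
        }

        /*
         * No right subtree: climb up until we arrive from a left child.
         * This climb goes through the parent (and colour) word, which
         * the RCU-friendly rotations do not keep stable for readers.
         */
        while ((parent = rb_parent(node)) && node == parent->rb_right)
                node = parent;

        return parent;
}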
On Wed, Dec 11, 2024 at 09:13:16AM +0100, Christian Brauner wrote:
> Hey Peter,
>
> I had a question for you and meant to Cc you but forgot. This makes one
> of our rbtree uses lockless using the seqlock pattern. See below.
>
> I saw that in 50a38035ed5c ("rbtree: provide rb_find_rcu() /
> rb_find_add_rcu()") you added new _rcu() variants.

The original patches are much older, see:

  d72da4a4d973 ("rbtree: Make lockless searches non-fatal")
  ade3f510f93a ("rbtree: Implement generic latch_tree")

(aw gawd, almost 10 years)

> We're using another search function that allows us to walk the tree in
> either direction:
>
>         guard(read_lock)(&mnt_ns_tree_lock);
>         for (;;) {
>                 struct rb_node *node;
>
>                 if (previous)
>                         node = rb_prev(&mntns->mnt_ns_tree_node);
>                 else
>                         node = rb_next(&mntns->mnt_ns_tree_node);
>                 if (!node)
>                         return ERR_PTR(-ENOENT);
>
>                 mntns = node_to_mnt_ns(node);
>                 node = &mntns->mnt_ns_tree_node;
>
> But afaict neither rb_prev() nor rb_next() are rcu safe. Have you ever
> considered adding rcu safe variants for those two as well?

Urgh, those are hard :-(

So back when I did the lockless lookups, I only ensured the child
pointers were 'stable'. I did not deal with the parent (and colour)
pointer.

The next/prev iterators very much rely on the parent pointer.

Someone would have to go through the tree rotations again and see if it
is possible to also update the parent pointers in such a way as to not
create temporary loops.

Notably, the thing you want to avoid is an interrupt doing a tree
traversal on the CPU that's doing the tree rotation getting stuck.

The other, possibly even harder option would be to (finally) implement
threaded RB trees, where the current NULL child pointers become a 'list'
pointer to the next (in-order) element. But that too makes rotations
'interesting' and must avoid creating loops.

But in all those cases you have to also consider the ramifications of
rb_next/prev hitting a modification, I suspect like with the simple
lookup, you can miss entire subtrees. So depending on the requirements,
this might not be suitable for you.

The far easier option might be to simply add a list_head along with the
rb_node and iterate that -- normally the problem with this is that you
can't easily move elements around in an RCU-list, but luck will have it
you don't move elements around. Your tree location is very static
afaict.
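Peter's last suggestion is the direction the thread converges on. A
minimal sketch of what it could look like, purely as an illustration:
the struct, the mnt_ns_list field, and the mnt_ns_list_head list are
hypothetical names, and only the forward walk is shown because
list_del_rcu() poisons the ->prev pointer, so a backward walk would need
its own RCU-safe accessor.

#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/rbtree.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

/*
 * Sketch only: pair the rbtree node (keyed lookup by the 64-bit id)
 * with a list_head kept in the same order, so next/prev queries can
 * walk a list under RCU instead of calling rb_next()/rb_prev().
 * The struct, field and list names are illustrative, not the actual
 * fs/mount.h layout.
 */
struct mnt_namespace_sketch {
        u64                     seq;                    /* unique, monotonically increasing */
        struct rb_node          mnt_ns_tree_node;       /* keyed lookup, as today */
        struct list_head        mnt_ns_list;            /* hypothetical companion list */
};

static LIST_HEAD(mnt_ns_list_head);                     /* hypothetical, ordered by ->seq */

/* Writer side: call with mnt_ns_tree_write_lock() (from the patch) held. */
static void mnt_ns_list_add_sketch(struct mnt_namespace_sketch *ns)
{
        /*
         * New namespaces always get the largest id, so appending keeps
         * the list sorted and entries never move afterwards -- which is
         * what makes the RCU list workable here.
         */
        list_add_tail_rcu(&ns->mnt_ns_list, &mnt_ns_list_head);
}

/*
 * Reader side: forward walk to the next namespace, if any.  Backward
 * walks are deliberately not sketched, see the note above.
 */
static struct mnt_namespace_sketch *mnt_ns_next_sketch(struct mnt_namespace_sketch *ns)
{
        struct mnt_namespace_sketch *next;

        lockdep_assert(rcu_read_lock_held());

        next = list_entry_rcu(ns->mnt_ns_list.next, struct mnt_namespace_sketch, mnt_ns_list);
        if (&next->mnt_ns_list == &mnt_ns_list_head)
                return NULL;    /* ns was the last entry */
        return next;
}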
On Wed, Dec 11, 2024 at 10:02:12AM +0100, Peter Zijlstra wrote:
> On Wed, Dec 11, 2024 at 09:13:16AM +0100, Christian Brauner wrote:
> > Hey Peter,
> >
> > I had a question for you and meant to Cc you but forgot. This makes one
> > of our rbtree uses lockless using the seqlock pattern. See below.
> >
> > I saw that in 50a38035ed5c ("rbtree: provide rb_find_rcu() /
> > rb_find_add_rcu()") you added new _rcu() variants.
>
> The original patches are much older, see:
>
>   d72da4a4d973 ("rbtree: Make lockless searches non-fatal")
>   ade3f510f93a ("rbtree: Implement generic latch_tree")
>
> (aw gawd, almost 10 years)
>
> > We're using another search function that allows us to walk the tree in
> > either direction:
> >
> >         guard(read_lock)(&mnt_ns_tree_lock);
> >         for (;;) {
> >                 struct rb_node *node;
> >
> >                 if (previous)
> >                         node = rb_prev(&mntns->mnt_ns_tree_node);
> >                 else
> >                         node = rb_next(&mntns->mnt_ns_tree_node);
> >                 if (!node)
> >                         return ERR_PTR(-ENOENT);
> >
> >                 mntns = node_to_mnt_ns(node);
> >                 node = &mntns->mnt_ns_tree_node;
> >
> > But afaict neither rb_prev() nor rb_next() are rcu safe. Have you ever
> > considered adding rcu safe variants for those two as well?
>
> Urgh, those are hard :-(
>
> So back when I did the lockless lookups, I only ensured the child
> pointers were 'stable'. I did not deal with the parent (and colour)
> pointer.
>
> The next/prev iterators very much rely on the parent pointer.
>
> Someone would have to go through the tree rotations again and see if it
> is possible to also update the parent pointers in such a way as to not
> create temporary loops.
>
> Notably, the thing you want to avoid is an interrupt doing a tree
> traversal on the CPU that's doing the tree rotation getting stuck.
>
> The other, possibly even harder option would be to (finally) implement
> threaded RB trees, where the current NULL child pointers become a 'list'
> pointer to the next (in-order) element. But that too makes rotations
> 'interesting' and must avoid creating loops.

Ah, "fun".

> But in all those cases you have to also consider the ramifications of
> rb_next/prev hitting a modification, I suspect like with the simple
> lookup, you can miss entire subtrees. So depending on the requirements,

I think it's fine as long as we have the ability to detect that the tree
was modified and retry - like in the simple lookup case. I was happy to
see that you guarantee that only false negatives are possible. Last I
looked at this some time ago Paul was much more pessimistic about
walking the tree with seqlock and rcu. It's good that this now works
afaik.

> this might not be suitable for you.
>
> The far easier option might be to simply add a list_head along with the
> rb_node and iterate that -- normally the problem with this is that you
> can't easily move elements around in an RCU-list, but luck will have it
> you don't move elements around. Your tree location is very static

Good idea. Yeah, we don't move anything around. And our insertions are
done with a key that's unique - the 64bit mount id - for the system
lifetime.

> afaict.
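A small sketch of the property Christian relies on here, with an
illustrative counter name (the real allocation site in fs/namespace.c
may differ): each namespace gets a unique, monotonically increasing
64-bit id at creation, so its position in the tree, or in any companion
list, never changes after insertion.

#include <linux/atomic.h>
#include <linux/types.h>

/* Illustrative counter name only. */
static atomic64_t mnt_ns_id_ctr = ATOMIC64_INIT(0);

static u64 alloc_mnt_ns_id_sketch(void)
{
        /*
         * Ids are never reused for the lifetime of the system, so an
         * entry's position in the rbtree (or a companion list) is
         * fixed once it has been inserted.
         */
        return atomic64_inc_return(&mnt_ns_id_ctr);
}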
diff --git a/fs/mount.h b/fs/mount.h
index 185fc56afc13338f8185fe818051444d540cbd5b..3c3763d8ae821d6a117c528808dbc94d0251f964 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -16,7 +16,10 @@ struct mnt_namespace {
         u64 event;
         unsigned int nr_mounts; /* # of mounts in the namespace */
         unsigned int pending_mounts;
-        struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+        union {
+                struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+                struct rcu_head mnt_ns_rcu;
+        };
         refcount_t passive; /* number references not pinning @mounts */
 } __randomize_layout;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 10fa18dd66018fadfdc9d18c59a851eed7bd55ad..21e990482c5b2e1844d17413b55b58803fa7b008 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted); /* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
+
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
 
 struct mount_kattr {
@@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-        u64 seq_b = ns->seq;
-
-        if (seq < seq_b)
-                return -1;
-        if (seq > seq_b)
-                return 1;
-        return 0;
-}
-
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
         if (!node)
@@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
         return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }
 
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
         struct mnt_namespace *ns_a = node_to_mnt_ns(a);
         struct mnt_namespace *ns_b = node_to_mnt_ns(b);
         u64 seq_a = ns_a->seq;
+        u64 seq_b = ns_b->seq;
 
-        return mnt_ns_cmp(seq_a, ns_b) < 0;
+        if (seq_a < seq_b)
+                return -1;
+        if (seq_a > seq_b)
+                return 1;
+        return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+        write_lock(&mnt_ns_tree_lock);
+        write_seqcount_begin(&mnt_ns_tree_seqcount);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+        write_seqcount_end(&mnt_ns_tree_seqcount);
+        write_unlock(&mnt_ns_tree_lock);
 }
 
 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-        guard(write_lock)(&mnt_ns_tree_lock);
-        rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+        struct rb_node *node;
+
+        mnt_ns_tree_write_lock();
+        node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+        mnt_ns_tree_write_unlock();
+
+        WARN_ON_ONCE(node);
 }
 
 static void mnt_ns_release(struct mnt_namespace *ns)
@@ -150,15 +163,24 @@ static void mnt_ns_release(struct mnt_namespace *ns)
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
 
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+        struct mnt_namespace *mnt_ns;
+
+        mnt_ns = container_of(rcu, struct mnt_namespace, mnt_ns_rcu);
+        mnt_ns_release(mnt_ns);
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
         /* remove from global mount namespace list */
         if (!is_anon_ns(ns)) {
-                guard(write_lock)(&mnt_ns_tree_lock);
+                mnt_ns_tree_write_lock();
                 rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+                mnt_ns_tree_write_unlock();
         }
 
-        mnt_ns_release(ns);
+        call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }
 
 /*
@@ -168,23 +190,23 @@ static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
 {
         struct rb_node *node = mnt_ns_tree.rb_node;
-        struct mnt_namespace *ret = NULL;
+        struct mnt_namespace *mnt_ns = NULL;
 
-        lockdep_assert_held(&mnt_ns_tree_lock);
+        lockdep_assert(rcu_read_lock_held());
 
         while (node) {
                 struct mnt_namespace *n = node_to_mnt_ns(node);
 
                 if (mnt_ns_id <= n->seq) {
-                        ret = node_to_mnt_ns(node);
+                        mnt_ns = node_to_mnt_ns(node);
                         if (mnt_ns_id == n->seq)
                                 break;
-                        node = node->rb_left;
+                        node = rcu_dereference_raw(node->rb_left);
                 } else {
-                        node = node->rb_right;
+                        node = rcu_dereference_raw(node->rb_right);
                 }
         }
-        return ret;
+        return mnt_ns;
 }
 
 /*
@@ -194,18 +216,35 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
  * namespace the @namespace_sem must first be acquired. If the namespace has
  * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-        struct mnt_namespace *ns;
+        struct mnt_namespace *ns;
+        unsigned int seq;
 
-        guard(read_lock)(&mnt_ns_tree_lock);
-        ns = mnt_ns_find_id_at(mnt_ns_id);
-        if (!ns || ns->seq != mnt_ns_id)
-                return NULL;
+        guard(rcu)();
+        do {
+                seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
+                ns = mnt_ns_find_id_at(mnt_ns_id);
+                if (ns)
+                        break;
+        } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
 
-        refcount_inc(&ns->passive);
-        return ns;
+        if (!ns || ns->seq != mnt_ns_id)
+                return NULL;
+
+        /*
+         * The last reference count is put with after RCU delay so we
+         * don't need to use refcount_inc_not_zero().
+         */
+        refcount_inc(&ns->passive);
+        return ns;
 }
 
 static inline void lock_mount_hash(void)
Currently we use a read-write lock but for the simple search case we
can make this lockless. Creating a new mount namespace is a rather rare
event compared with querying mounts in a foreign mount namespace. Once
this is picked up by e.g., systemd to list mounts in another mount
namespace in its isolated services or in containers this will be used a
lot so this seems worthwhile doing.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mount.h     |  5 ++-
 fs/namespace.c | 99 ++++++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 73 insertions(+), 31 deletions(-)
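A hypothetical caller sketch (the function name is made up) showing how
the lockless lookup is meant to be consumed: lookup_mnt_ns() returns
with a passive reference held, and the DEFINE_FREE(mnt_ns_release, ...)
helper from the patch drops it automatically on every exit path.

#include <linux/cleanup.h>
#include <linux/errno.h>

/* Hypothetical caller, for illustration only. */
static int query_mnt_ns_sketch(u64 mnt_ns_id)
{
        struct mnt_namespace *ns __free(mnt_ns_release) = lookup_mnt_ns(mnt_ns_id);

        if (!ns)
                return -ENOENT;

        /* ... inspect the namespace, typically under namespace_sem ... */
        return 0;
}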