Message ID | 20241212-work-mount-rbtree-lockless-v2-3-4fe6cef02534@kernel.org (mailing list archive)
---|---
State | New
Series | fs: lockless mntns lookup
On Thu, Dec 12, 2024 at 12:56:02PM +0100, Christian Brauner wrote:

> @@ -16,7 +16,10 @@ struct mnt_namespace {
>  	u64 event;
>  	unsigned int nr_mounts; /* # of mounts in the namespace */
>  	unsigned int pending_mounts;
> -	struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
> +	union {
> +		struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
> +		struct rcu_head mnt_ns_rcu;
> +	};
>  	refcount_t passive; /* number references not pinning @mounts */
>  } __randomize_layout;

>  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
>  {
>  	/* remove from global mount namespace list */
>  	if (!is_anon_ns(ns)) {
> -		guard(write_lock)(&mnt_ns_tree_lock);
> +		mnt_ns_tree_write_lock();
>  		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
> +		mnt_ns_tree_write_unlock();
>  	}
> 
> -	mnt_ns_release(ns);
> +	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
>  }

I'm not sure that union is sane; the above means you're overwriting the
tree node while a concurrent lookup might still see the node and want to
descend from it.
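To spell out the race Peter is describing, here is an illustrative interleaving (not taken from the thread). Because struct rcu_head shares storage with the rb_node, queueing the RCU callback scribbles over the very pointers a concurrent lockless lookup may be about to follow:

/*
 * CPU0: mnt_ns_tree_remove()            CPU1: lockless lookup
 *
 * mnt_ns_tree_write_lock();
 * rb_erase(&ns->mnt_ns_tree_node, ...);
 * mnt_ns_tree_write_unlock();
 *                                       guard(rcu)();
 *                                       rb_find_rcu(...) walks the tree
 *                                       and can still reach @ns:
 *                                       rb_erase() does not wait for
 *                                       RCU readers to drain.
 * call_rcu(&ns->mnt_ns_rcu, ...);
 *   writes mnt_ns_rcu.next/.func,
 *   which alias the rb_node's parent
 *   and child pointers in the union...
 *                                       ...while CPU1 may be loading
 *                                       mnt_ns_tree_node.rb_left or
 *                                       .rb_right to descend further,
 *                                       i.e. it dereferences garbage.
 */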
On Thu, 2024-12-12 at 12:56 +0100, Christian Brauner wrote:
> Currently we use a read-write lock but for the simple search case we can
> make this lockless. Creating a new mount namespace is a rather rare
> event compared with querying mounts in a foreign mount namespace. Once
> this is picked up by e.g., systemd to list mounts in another mount in
> it's isolated services or in containers this will be used a lot so this
> seems worthwhile doing.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/mount.h     |   5 ++-
>  fs/namespace.c | 119 +++++++++++++++++++++++++++++++++++----------------------
>  2 files changed, 77 insertions(+), 47 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index 185fc56afc13338f8185fe818051444d540cbd5b..3c3763d8ae821d6a117c528808dbc94d0251f964 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -16,7 +16,10 @@ struct mnt_namespace {
>  	u64 event;
>  	unsigned int nr_mounts; /* # of mounts in the namespace */
>  	unsigned int pending_mounts;
> -	struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
> +	union {
> +		struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
> +		struct rcu_head mnt_ns_rcu;
> +	};

I second Peter's concerns about the union here. You really want to
union the rcu structure with something that isn't ever touched during
a search for the object.

>  	refcount_t passive; /* number references not pinning @mounts */
>  } __randomize_layout;
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 10fa18dd66018fadfdc9d18c59a851eed7bd55ad..9463b9ab95f0a5db32cfe5fc5564d7f25ce3e06f 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
>  static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
>  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>  static DEFINE_RWLOCK(mnt_ns_tree_lock);
> +static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
> +
>  static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
> 
>  struct mount_kattr {
> @@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
>   */
>  __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
> 
> -static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
> -{
> -	u64 seq_b = ns->seq;
> -
> -	if (seq < seq_b)
> -		return -1;
> -	if (seq > seq_b)
> -		return 1;
> -	return 0;
> -}
> -
>  static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
>  {
>  	if (!node)
> @@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
>  	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
>  }
> 
> -static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
> +static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
>  {
>  	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
>  	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
>  	u64 seq_a = ns_a->seq;
> +	u64 seq_b = ns_b->seq;
> +
> +	if (seq_a < seq_b)
> +		return -1;
> +	if (seq_a > seq_b)
> +		return 1;
> +	return 0;
> +}
> 
> -	return mnt_ns_cmp(seq_a, ns_b) < 0;
> +static inline void mnt_ns_tree_write_lock(void)
> +{
> +	write_lock(&mnt_ns_tree_lock);
> +	write_seqcount_begin(&mnt_ns_tree_seqcount);
> +}
> +
> +static inline void mnt_ns_tree_write_unlock(void)
> +{
> +	write_seqcount_end(&mnt_ns_tree_seqcount);
> +	write_unlock(&mnt_ns_tree_lock);
>  }
> 
>  static void mnt_ns_tree_add(struct mnt_namespace *ns)
>  {
> -	guard(write_lock)(&mnt_ns_tree_lock);
> -	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
> +	struct rb_node *node;
> +
> +	mnt_ns_tree_write_lock();
> +	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
> +	mnt_ns_tree_write_unlock();
> +
> +	WARN_ON_ONCE(node);
>  }
> 
>  static void mnt_ns_release(struct mnt_namespace *ns)
> @@ -150,41 +163,36 @@ static void mnt_ns_release(struct mnt_namespace *ns)
>  }
>  DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
> 
> +static void mnt_ns_release_rcu(struct rcu_head *rcu)
> +{
> +	struct mnt_namespace *mnt_ns;
> +
> +	mnt_ns = container_of(rcu, struct mnt_namespace, mnt_ns_rcu);
> +	mnt_ns_release(mnt_ns);
> +}
> +
>  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
>  {
>  	/* remove from global mount namespace list */
>  	if (!is_anon_ns(ns)) {
> -		guard(write_lock)(&mnt_ns_tree_lock);
> +		mnt_ns_tree_write_lock();
>  		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
> +		mnt_ns_tree_write_unlock();
>  	}
> 
> -	mnt_ns_release(ns);
> +	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
>  }
> 
> -/*
> - * Returns the mount namespace which either has the specified id, or has the
> - * next smallest id afer the specified one.
> - */
> -static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
> +static int mnt_ns_find(const void *key, const struct rb_node *node)
>  {
> -	struct rb_node *node = mnt_ns_tree.rb_node;
> -	struct mnt_namespace *ret = NULL;
> -
> -	lockdep_assert_held(&mnt_ns_tree_lock);
> -
> -	while (node) {
> -		struct mnt_namespace *n = node_to_mnt_ns(node);
> +	const u64 mnt_ns_id = *(u64 *)key;
> +	const struct mnt_namespace *ns = node_to_mnt_ns(node);
> 
> -		if (mnt_ns_id <= n->seq) {
> -			ret = node_to_mnt_ns(node);
> -			if (mnt_ns_id == n->seq)
> -				break;
> -			node = node->rb_left;
> -		} else {
> -			node = node->rb_right;
> -		}
> -	}
> -	return ret;
> +	if (mnt_ns_id < ns->seq)
> +		return -1;
> +	if (mnt_ns_id > ns->seq)
> +		return 1;
> +	return 0;
>  }
> 
>  /*
> @@ -194,18 +202,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
>   * namespace the @namespace_sem must first be acquired. If the namespace has
>   * already shut down before acquiring @namespace_sem, {list,stat}mount() will
>   * see that the mount rbtree of the namespace is empty.
> + *
> + * Note the lookup is lockless protected by a sequence counter. We only
> + * need to guard against false negatives as false positives aren't
> + * possible. So if we didn't find a mount namespace and the sequence
> + * counter has changed we need to retry. If the sequence counter is
> + * still the same we know the search actually failed.
>   */
>  static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
>  {
> -	struct mnt_namespace *ns;
> +	struct mnt_namespace *ns;
> +	struct rb_node *node;
> +	unsigned int seq;
> +
> +	guard(rcu)();
> +	do {
> +		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
> +		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
> +		if (node)
> +			break;
> +	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));

I don't really get the need for a seqcount here. Typically we use those
when you could potentially find an object that's in some sort of
inconsistent state.

Is that the case here? It seems like you'd either find the object or
not. It seems like either outcome is OK without the need to retry the
search?

> 
> -	guard(read_lock)(&mnt_ns_tree_lock);
> -	ns = mnt_ns_find_id_at(mnt_ns_id);
> -	if (!ns || ns->seq != mnt_ns_id)
> -		return NULL;
> +	if (!node)
> +		return NULL;
> 
> -	refcount_inc(&ns->passive);
> -	return ns;
> +	/*
> +	 * The last reference count is put with after RCU delay so we
> +	 * don't need to use refcount_inc_not_zero().
> +	 */
> +	ns = node_to_mnt_ns(node);
> +	refcount_inc(&ns->passive);
> +	return ns;
>  }
> 
>  static inline void lock_mount_hash(void)
> > +	guard(rcu)();
> > +	do {
> > +		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
> > +		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
> > +		if (node)
> > +			break;
> > +	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
> 
> I don't really get the need for a seqcount here. Typically we use those
> when you could potentially find an object that's in some sort of
> inconsistent state.
> 
> Is that the case here? It seems like you'd either find the object or
> not. It seems like either outcome is OK without the need to retry the
> search?

The sequence count is to detect concurrent changes to the rbtree. A
concurrent change might cause us to miss whole subtrees because the
tree could be rotated while we walk it. So it's not about the state of
the object but about the structure of the rbtree.
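To make that retry logic concrete, here is a minimal self-contained sketch of the same pattern outside of fs/namespace.c. The example_* names are made up for illustration; the seqcount, RCU, and rbtree helpers are the ones the patch itself uses. A hit can be trusted immediately, since any node reachable under RCU stays valid until the grace period ends, but a miss is only trustworthy if no writer ran while we walked the tree:

#include <linux/rbtree.h>
#include <linux/rcupdate.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RWLOCK(example_lock);
static seqcount_rwlock_t example_seq =
	SEQCNT_RWLOCK_ZERO(example_seq, &example_lock);
static struct rb_root example_tree = RB_ROOT;

struct example_obj {
	u64 id;
	struct rb_node node;
};

/* Comparator with the signature rb_find_rcu() expects. */
static int example_find(const void *key, const struct rb_node *node)
{
	u64 id = *(const u64 *)key;
	const struct example_obj *obj =
		rb_entry(node, struct example_obj, node);

	if (id < obj->id)
		return -1;
	if (id > obj->id)
		return 1;
	return 0;
}

static struct example_obj *example_lookup(u64 id)
{
	struct rb_node *node;
	unsigned int seq;

	guard(rcu)();
	do {
		/* Snapshot the tree's write generation. */
		seq = read_seqcount_begin(&example_seq);
		node = rb_find_rcu(&id, &example_tree, example_find);
		if (node)
			break;	/* positive result: always valid */
		/*
		 * Negative result: a concurrent rb_erase() or insertion
		 * may have rotated the tree and hidden a subtree from
		 * this walk, so retry if a writer ran in the meantime.
		 */
	} while (read_seqcount_retry(&example_seq, seq));

	if (!node)
		return NULL;
	return rb_entry(node, struct example_obj, node);
}

Writers would take example_lock and bump the seqcount around every tree modification, exactly as mnt_ns_tree_write_lock()/mnt_ns_tree_write_unlock() do in the patch.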
On Thu, Dec 12, 2024 at 01:09:29PM +0100, Peter Zijlstra wrote:
> On Thu, Dec 12, 2024 at 12:56:02PM +0100, Christian Brauner wrote:
> 
> > @@ -16,7 +16,10 @@ struct mnt_namespace {
> >  	u64 event;
> >  	unsigned int nr_mounts; /* # of mounts in the namespace */
> >  	unsigned int pending_mounts;
> > -	struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
> > +	union {
> > +		struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
> > +		struct rcu_head mnt_ns_rcu;
> > +	};
> >  	refcount_t passive; /* number references not pinning @mounts */
> >  } __randomize_layout;
> 
> >  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
> >  {
> >  	/* remove from global mount namespace list */
> >  	if (!is_anon_ns(ns)) {
> > -		guard(write_lock)(&mnt_ns_tree_lock);
> > +		mnt_ns_tree_write_lock();
> >  		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
> > +		mnt_ns_tree_write_unlock();
> >  	}
> > 
> > -	mnt_ns_release(ns);
> > +	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
> >  }
> 
> I'm not sure that union is sane; the above means you're overwriting the
> tree node while a concurrent lookup might still see the node and want to
> descend from it.

You're right, though. I'll fix this up. Thanks for the reviews!
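The follow-up fix itself is not shown in this thread. A minimal sketch of the direction both reviewers point at would be to give the rcu_head storage that no lookup ever dereferences, e.g. simply dropping the union so that call_rcu() can no longer clobber the live tree node:

struct mnt_namespace {
	/* ... */
	struct rb_node mnt_ns_tree_node;	/* node in the mnt_ns_tree */
	struct rcu_head mnt_ns_rcu;		/* separate slot, not a union */
	refcount_t passive;	/* number references not pinning @mounts */
} __randomize_layout;

This costs sizeof(struct rcu_head) per namespace; alternatively, the rcu_head could be unioned with a field that is provably never touched between rb_erase() and the end of the grace period, which is what Jeff suggests above.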
diff --git a/fs/mount.h b/fs/mount.h
index 185fc56afc13338f8185fe818051444d540cbd5b..3c3763d8ae821d6a117c528808dbc94d0251f964 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -16,7 +16,10 @@ struct mnt_namespace {
 	u64 event;
 	unsigned int nr_mounts; /* # of mounts in the namespace */
 	unsigned int pending_mounts;
-	struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+	union {
+		struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+		struct rcu_head mnt_ns_rcu;
+	};
 	refcount_t passive; /* number references not pinning @mounts */
 } __randomize_layout;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 10fa18dd66018fadfdc9d18c59a851eed7bd55ad..9463b9ab95f0a5db32cfe5fc5564d7f25ce3e06f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
+
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
 
 struct mount_kattr {
@@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-	u64 seq_b = ns->seq;
-
-	if (seq < seq_b)
-		return -1;
-	if (seq > seq_b)
-		return 1;
-	return 0;
-}
-
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
 	if (!node)
@@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }
 
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
 	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
 	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
 	u64 seq_a = ns_a->seq;
+	u64 seq_b = ns_b->seq;
+
+	if (seq_a < seq_b)
+		return -1;
+	if (seq_a > seq_b)
+		return 1;
+	return 0;
+}
 
-	return mnt_ns_cmp(seq_a, ns_b) < 0;
+static inline void mnt_ns_tree_write_lock(void)
+{
+	write_lock(&mnt_ns_tree_lock);
+	write_seqcount_begin(&mnt_ns_tree_seqcount);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+	write_seqcount_end(&mnt_ns_tree_seqcount);
+	write_unlock(&mnt_ns_tree_lock);
 }
 
 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-	guard(write_lock)(&mnt_ns_tree_lock);
-	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+	struct rb_node *node;
+
+	mnt_ns_tree_write_lock();
+	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+	mnt_ns_tree_write_unlock();
+
+	WARN_ON_ONCE(node);
 }
 
 static void mnt_ns_release(struct mnt_namespace *ns)
@@ -150,41 +163,36 @@ static void mnt_ns_release(struct mnt_namespace *ns)
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
 
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+	struct mnt_namespace *mnt_ns;
+
+	mnt_ns = container_of(rcu, struct mnt_namespace, mnt_ns_rcu);
+	mnt_ns_release(mnt_ns);
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
 	/* remove from global mount namespace list */
 	if (!is_anon_ns(ns)) {
-		guard(write_lock)(&mnt_ns_tree_lock);
+		mnt_ns_tree_write_lock();
 		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+		mnt_ns_tree_write_unlock();
 	}
 
-	mnt_ns_release(ns);
+	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }
 
-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id afer the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
 {
-	struct rb_node *node = mnt_ns_tree.rb_node;
-	struct mnt_namespace *ret = NULL;
-
-	lockdep_assert_held(&mnt_ns_tree_lock);
-
-	while (node) {
-		struct mnt_namespace *n = node_to_mnt_ns(node);
+	const u64 mnt_ns_id = *(u64 *)key;
+	const struct mnt_namespace *ns = node_to_mnt_ns(node);
 
-		if (mnt_ns_id <= n->seq) {
-			ret = node_to_mnt_ns(node);
-			if (mnt_ns_id == n->seq)
-				break;
-			node = node->rb_left;
-		} else {
-			node = node->rb_right;
-		}
-	}
-	return ret;
+	if (mnt_ns_id < ns->seq)
+		return -1;
+	if (mnt_ns_id > ns->seq)
+		return 1;
+	return 0;
 }
 
 /*
@@ -194,18 +202,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
  * namespace the @namespace_sem must first be acquired. If the namespace has
  * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-	struct mnt_namespace *ns;
+	struct mnt_namespace *ns;
+	struct rb_node *node;
+	unsigned int seq;
+
+	guard(rcu)();
+	do {
+		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
+		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+		if (node)
+			break;
+	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
 
-	guard(read_lock)(&mnt_ns_tree_lock);
-	ns = mnt_ns_find_id_at(mnt_ns_id);
-	if (!ns || ns->seq != mnt_ns_id)
-		return NULL;
+	if (!node)
+		return NULL;
 
-	refcount_inc(&ns->passive);
-	return ns;
+	/*
+	 * The last reference count is put with after RCU delay so we
+	 * don't need to use refcount_inc_not_zero().
+	 */
+	ns = node_to_mnt_ns(node);
+	refcount_inc(&ns->passive);
+	return ns;
 }
 
 static inline void lock_mount_hash(void)
Currently we use a read-write lock but for the simple search case we can
make this lockless. Creating a new mount namespace is a rather rare
event compared with querying mounts in a foreign mount namespace. Once
this is picked up by e.g. systemd to list mounts in another mount
namespace in its isolated services or in containers, this will be used
a lot, so this seems worthwhile doing.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mount.h     |   5 ++-
 fs/namespace.c | 119 +++++++++++++++++++++++++++++++++++----------------------
 2 files changed, 77 insertions(+), 47 deletions(-)
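For a sense of how this lookup ends up on a hot path: every listmount()/statmount() call that targets a foreign mount namespace passes a mount namespace id that the kernel resolves via lookup_mnt_ns(). A rough userspace sketch follows; the syscall number and struct layout are quoted from memory of the uapi headers, so verify them against <linux/mount.h> and your architecture's syscall table before relying on this:

#define _GNU_SOURCE
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_listmount
#define __NR_listmount 458	/* x86-64; assumption, check your headers */
#endif

/* Mirrors struct mnt_id_req from <linux/mount.h> (VER1 layout). */
struct mnt_id_req {
	uint32_t size;
	uint32_t spare;
	uint64_t mnt_id;
	uint64_t param;
	uint64_t mnt_ns_id;	/* resolved by lookup_mnt_ns() in the kernel */
};

#define LSMT_ROOT 0xffffffffffffffffULL	/* whole mount tree of the ns */

/* List mount ids in the mount namespace identified by mnt_ns_id. */
static long list_foreign_mounts(uint64_t mnt_ns_id, uint64_t *ids, size_t nr)
{
	struct mnt_id_req req = {
		.size      = sizeof(req),
		.mnt_id    = LSMT_ROOT,
		.mnt_ns_id = mnt_ns_id,
	};

	return syscall(__NR_listmount, &req, ids, nr, 0);
}

A monitor iterating over many containers issues this once per namespace, which is exactly the read-mostly pattern the lockless lookup optimizes for.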