Message ID | e5fdd78a90f5b00a75bd893962a70f52a2c015cd.1719243756.git.josef@toxicpanda.com (mailing list archive) |
---|---|
State | New |
Series | Support foreign mount namespace with statmount/listmount |
On Mon, 2024-06-24 at 11:49 -0400, Josef Bacik wrote:
> In order to allow for listmount() to be used on different namespaces we
> need a way to lookup a mount ns by its id. Keep a rbtree of the current
> !anonymous mount name spaces indexed by ID that we can use to look up
> the namespace.
>
> Co-developed-by: Christian Brauner <brauner@kernel.org>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/mount.h     |   2 +
>  fs/namespace.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 113 insertions(+), 2 deletions(-)

[...]

> +static void mnt_ns_release(struct mnt_namespace *ns)
> +{
> +        lockdep_assert_not_held(&mnt_ns_tree_lock);
> +

Why is it bad to hold this lock here? AFAICT, put_user_ns just does a
schedule_work when the counter goes to 0. Granted, I don't see a reason
why you would want to hold it here, but usually that sort of assertion
means that it _must_ be forbidden.
On Tue, Jun 25, 2024 at 09:03:03AM GMT, Jeff Layton wrote:
> On Mon, 2024-06-24 at 11:49 -0400, Josef Bacik wrote:
[...]
> > +static void mnt_ns_release(struct mnt_namespace *ns)
> > +{
> > +        lockdep_assert_not_held(&mnt_ns_tree_lock);
> > +
>
> Why is it bad to hold this lock here? AFAICT, put_user_ns just does a
> schedule_work when the counter goes to 0. Granted, I don't see a reason
> why you would want to hold it here, but usually that sort of assertion
> means that it _must_ be forbidden.

I just annotate locking assumptions liberally. There's no reason to take
the lock there and there's no current codepath that needs to hold it so
don't waste cycles and hold it when we don't have to.
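Christian's rationale reflects a common kernel idiom: lockdep assertions compile away entirely when CONFIG_LOCKDEP is disabled, so they are a zero-cost way to document a function's locking contract and to catch contract violations in debug builds. Below is a minimal sketch of the same pattern the patch uses; the example_* names are hypothetical, and only the lockdep, guard(), rwlock, and refcount APIs are real:

#include <linux/cleanup.h>
#include <linux/lockdep.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Hypothetical object; stands in for struct mnt_namespace. */
struct example_obj {
        u64 id;
        refcount_t ref;
};

static DEFINE_RWLOCK(example_lock);

/*
 * Contract: callers must NOT hold example_lock, since this path can free
 * the object. The assertion is a no-op with CONFIG_LOCKDEP=n; with
 * lockdep enabled it splats if a caller breaks the rule.
 */
static void example_release(struct example_obj *obj)
{
        lockdep_assert_not_held(&example_lock);

        if (refcount_dec_and_test(&obj->ref))
                kfree(obj);
}

/* Contract: callers MUST hold example_lock, at least for reading. */
static struct example_obj *example_find(struct example_obj *obj, u64 id)
{
        lockdep_assert_held(&example_lock);

        return (obj && obj->id == id) ? obj : NULL;
}

/* Caller side: the scoped guard drops the read lock at end of scope. */
static struct example_obj *example_lookup(struct example_obj *obj, u64 id)
{
        guard(read_lock)(&example_lock);

        return example_find(obj, id);
}

As in mnt_ns_release(), the assertion on the release path does not mean a deadlock exists today; it records that no caller should ever need the lock there, so future callers do not take it needlessly.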
diff --git a/fs/mount.h b/fs/mount.h
index 4adce73211ae..ad4b1ddebb54 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -16,6 +16,8 @@ struct mnt_namespace {
         u64 event;
         unsigned int nr_mounts; /* # of mounts in the namespace */
         unsigned int pending_mounts;
+        struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+        refcount_t passive; /* number references not pinning @mounts */
 } __randomize_layout;

 struct mnt_pcp {
diff --git a/fs/namespace.c b/fs/namespace.c
index 45df82f2a059..babdebdb0a9c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted); /* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by namespace_sem */

 struct mount_kattr {
         unsigned int attr_set;
@@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

+static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
+{
+        u64 seq_b = ns->seq;
+
+        if (seq < seq_b)
+                return -1;
+        if (seq > seq_b)
+                return 1;
+        return 0;
+}
+
+static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
+{
+        if (!node)
+                return NULL;
+        return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
+}
+
+static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+{
+        struct mnt_namespace *ns_a = node_to_mnt_ns(a);
+        struct mnt_namespace *ns_b = node_to_mnt_ns(b);
+        u64 seq_a = ns_a->seq;
+
+        return mnt_ns_cmp(seq_a, ns_b) < 0;
+}
+
+static void mnt_ns_tree_add(struct mnt_namespace *ns)
+{
+        guard(write_lock)(&mnt_ns_tree_lock);
+        rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+}
+
+static void mnt_ns_release(struct mnt_namespace *ns)
+{
+        lockdep_assert_not_held(&mnt_ns_tree_lock);
+
+        /* keep alive for {list,stat}mount() */
+        if (refcount_dec_and_test(&ns->passive)) {
+                put_user_ns(ns->user_ns);
+                kfree(ns);
+        }
+}
+DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+
+static void mnt_ns_tree_remove(struct mnt_namespace *ns)
+{
+        /* remove from global mount namespace list */
+        if (!is_anon_ns(ns)) {
+                guard(write_lock)(&mnt_ns_tree_lock);
+                rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+        }
+
+        mnt_ns_release(ns);
+}
+
+/*
+ * Returns the mount namespace which either has the specified id, or has the
+ * next smallest id afer the specified one.
+ */
+static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+{
+        struct rb_node *node = mnt_ns_tree.rb_node;
+        struct mnt_namespace *ret = NULL;
+
+        lockdep_assert_held(&mnt_ns_tree_lock);
+
+        while (node) {
+                struct mnt_namespace *n = node_to_mnt_ns(node);
+
+                if (mnt_ns_id <= n->seq) {
+                        ret = node_to_mnt_ns(node);
+                        if (mnt_ns_id == n->seq)
+                                break;
+                        node = node->rb_left;
+                } else {
+                        node = node->rb_right;
+                }
+        }
+        return ret;
+}
+
+/*
+ * Lookup a mount namespace by id and take a passive reference count. Taking a
+ * passive reference means the mount namespace can be emptied if e.g., the last
+ * task holding an active reference exits. To access the mounts of the
+ * namespace the @namespace_sem must first be acquired. If the namespace has
+ * already shut down before acquiring @namespace_sem, {list,stat}mount() will
+ * see that the mount rbtree of the namespace is empty.
+ */
+static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
+{
+        struct mnt_namespace *ns;
+
+        guard(read_lock)(&mnt_ns_tree_lock);
+        ns = mnt_ns_find_id_at(mnt_ns_id);
+        if (!ns || ns->seq != mnt_ns_id)
+                return NULL;
+
+        refcount_inc(&ns->passive);
+        return ns;
+}
+
 static inline void lock_mount_hash(void)
 {
         write_seqlock(&mount_lock);
@@ -3736,8 +3841,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
         if (!is_anon_ns(ns))
                 ns_free_inum(&ns->ns);
         dec_mnt_namespaces(ns->ucounts);
-        put_user_ns(ns->user_ns);
-        kfree(ns);
+        mnt_ns_tree_remove(ns);
 }

 /*
@@ -3776,7 +3880,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
         if (!anon)
                 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
         refcount_set(&new_ns->ns.count, 1);
+        refcount_set(&new_ns->passive, 1);
         new_ns->mounts = RB_ROOT;
+        RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
         init_waitqueue_head(&new_ns->poll);
         new_ns->user_ns = get_user_ns(user_ns);
         new_ns->ucounts = ucounts;
@@ -3853,6 +3959,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                 while (p->mnt.mnt_root != q->mnt.mnt_root)
                         p = next_mnt(skip_mnt_tree(p), old);
         }
+        mnt_ns_tree_add(new_ns);
         namespace_unlock();

         if (rootmnt)
@@ -5208,6 +5315,8 @@ static void __init init_mount_tree(void)

         set_fs_pwd(current->fs, &root);
         set_fs_root(current->fs, &root);
+
+        mnt_ns_tree_add(ns);
 }

 void __init mnt_init(void)
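The rbtree added here exists so that statmount() and listmount() can resolve a mount namespace ID passed in from userspace. For context, here is a hedged userspace sketch of how a caller might drive this once the rest of the series lands. The mnt_ns_id field of struct mnt_id_req, MNT_ID_REQ_SIZE_VER1, and LSMT_ROOT are assumed from later patches in this series and the uapi headers of a kernel carrying them; the syscall number fallback is for x86_64. All of these are assumptions, not a stable contract:

/*
 * Hypothetical sketch: list the mounts of a foreign mount namespace by
 * its 64-bit namespace ID. Requires uapi headers from a kernel with this
 * series applied (struct mnt_id_req.mnt_ns_id, MNT_ID_REQ_SIZE_VER1).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

#ifndef __NR_listmount
#define __NR_listmount 458              /* x86_64; check your arch */
#endif

static ssize_t do_listmount(uint64_t mnt_ns_id, uint64_t *ids, size_t nr)
{
        struct mnt_id_req req;

        memset(&req, 0, sizeof(req));
        req.size = MNT_ID_REQ_SIZE_VER1;
        req.mnt_id = LSMT_ROOT;         /* start at the namespace's root mount */
        req.mnt_ns_id = mnt_ns_id;      /* 0 is expected to mean the caller's own ns */
        return syscall(__NR_listmount, &req, ids, nr, 0);
}

int main(int argc, char **argv)
{
        uint64_t mnt_ns_id = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
        uint64_t ids[64];
        ssize_t n = do_listmount(mnt_ns_id, ids, 64);

        if (n < 0) {
                perror("listmount");
                return 1;
        }
        for (ssize_t i = 0; i < n; i++)
                printf("mount id: %llu\n", (unsigned long long)ids[i]);
        return 0;
}

Note how this maps onto the kernel side above: the mnt_ns_id travels through lookup_mnt_ns(), which takes only a passive reference, so a namespace whose last task has exited still resolves but reports an empty mount rbtree.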