Message ID | e5fdd78a90f5b00a75bd893962a70f52a2c015cd.1719243756.git.josef@toxicpanda.com (mailing list archive) |
---|---|
State | New |
Series | Support foreign mount namespace with statmount/listmount |
On Mon, 2024-06-24 at 11:49 -0400, Josef Bacik wrote:
> In order to allow for listmount() to be used on different namespaces we
> need a way to lookup a mount ns by its id. Keep a rbtree of the current
> !anonymous mount name spaces indexed by ID that we can use to look up
> the namespace.
>
> Co-developed-by: Christian Brauner <brauner@kernel.org>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/mount.h     |   2 +
>  fs/namespace.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 113 insertions(+), 2 deletions(-)

[...]

> +static void mnt_ns_release(struct mnt_namespace *ns)
> +{
> +        lockdep_assert_not_held(&mnt_ns_tree_lock);
> +

Why is it bad to hold this lock here? AFAICT, put_user_ns just does a
schedule_work when the counter goes to 0. Granted, I don't see a reason
why you would want to hold it here, but usually that sort of assertion
means that it _must_ be forbidden.
On Tue, Jun 25, 2024 at 09:03:03AM GMT, Jeff Layton wrote:
> On Mon, 2024-06-24 at 11:49 -0400, Josef Bacik wrote:
[...]
> > +static void mnt_ns_release(struct mnt_namespace *ns)
> > +{
> > +        lockdep_assert_not_held(&mnt_ns_tree_lock);
> > +
>
> Why is it bad to hold this lock here? AFAICT, put_user_ns just does a
> schedule_work when the counter goes to 0. Granted, I don't see a reason
> why you would want to hold it here, but usually that sort of assertion
> means that it _must_ be forbidden.

I just annotate locking assumptions liberally. There's no reason to take
the lock there and there's no current codepath that needs to hold it so
don't waste cycles and hold it when we don't have to.
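Christian's rationale reflects a common kernel idiom: lockdep assertions compile away entirely when CONFIG_LOCKDEP is disabled, so they are a zero-cost way to document a function's locking contract and to catch contract violations in debug builds. Below is a minimal sketch of the same pattern the patch uses; the example_* names are hypothetical, and only the lockdep, guard(), rwlock, and refcount APIs are real:

#include <linux/cleanup.h>
#include <linux/lockdep.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Hypothetical object; stands in for struct mnt_namespace. */
struct example_obj {
        u64 id;
        refcount_t ref;
};

static DEFINE_RWLOCK(example_lock);

/*
 * Contract: callers must NOT hold example_lock, since this path can free
 * the object. The assertion is a no-op with CONFIG_LOCKDEP=n; with
 * lockdep enabled it splats if a caller breaks the rule.
 */
static void example_release(struct example_obj *obj)
{
        lockdep_assert_not_held(&example_lock);

        if (refcount_dec_and_test(&obj->ref))
                kfree(obj);
}

/* Contract: callers MUST hold example_lock, at least for reading. */
static struct example_obj *example_find(struct example_obj *obj, u64 id)
{
        lockdep_assert_held(&example_lock);

        return (obj && obj->id == id) ? obj : NULL;
}

/* Caller side: the scoped guard drops the read lock at end of scope. */
static struct example_obj *example_lookup(struct example_obj *obj, u64 id)
{
        guard(read_lock)(&example_lock);

        return example_find(obj, id);
}

As in mnt_ns_release(), the assertion on the release path does not mean a deadlock exists today; it records that no caller should ever need the lock there, so future callers do not take it needlessly.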
diff --git a/fs/mount.h b/fs/mount.h
index 4adce73211ae..ad4b1ddebb54 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -16,6 +16,8 @@ struct mnt_namespace {
         u64 event;
         unsigned int nr_mounts; /* # of mounts in the namespace */
         unsigned int pending_mounts;
+        struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+        refcount_t passive; /* number references not pinning @mounts */
 } __randomize_layout;

 struct mnt_pcp {
diff --git a/fs/namespace.c b/fs/namespace.c
index 45df82f2a059..babdebdb0a9c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted); /* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by namespace_sem */

 struct mount_kattr {
         unsigned int attr_set;
@@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

+static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
+{
+        u64 seq_b = ns->seq;
+
+        if (seq < seq_b)
+                return -1;
+        if (seq > seq_b)
+                return 1;
+        return 0;
+}
+
+static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
+{
+        if (!node)
+                return NULL;
+        return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
+}
+
+static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+{
+        struct mnt_namespace *ns_a = node_to_mnt_ns(a);
+        struct mnt_namespace *ns_b = node_to_mnt_ns(b);
+        u64 seq_a = ns_a->seq;
+
+        return mnt_ns_cmp(seq_a, ns_b) < 0;
+}
+
+static void mnt_ns_tree_add(struct mnt_namespace *ns)
+{
+        guard(write_lock)(&mnt_ns_tree_lock);
+        rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+}
+
+static void mnt_ns_release(struct mnt_namespace *ns)
+{
+        lockdep_assert_not_held(&mnt_ns_tree_lock);
+
+        /* keep alive for {list,stat}mount() */
+        if (refcount_dec_and_test(&ns->passive)) {
+                put_user_ns(ns->user_ns);
+                kfree(ns);
+        }
+}
+DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+
+static void mnt_ns_tree_remove(struct mnt_namespace *ns)
+{
+        /* remove from global mount namespace list */
+        if (!is_anon_ns(ns)) {
+                guard(write_lock)(&mnt_ns_tree_lock);
+                rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+        }
+
+        mnt_ns_release(ns);
+}
+
+/*
+ * Returns the mount namespace which either has the specified id, or has the
+ * next smallest id afer the specified one.
+ */
+static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+{
+        struct rb_node *node = mnt_ns_tree.rb_node;
+        struct mnt_namespace *ret = NULL;
+
+        lockdep_assert_held(&mnt_ns_tree_lock);
+
+        while (node) {
+                struct mnt_namespace *n = node_to_mnt_ns(node);
+
+                if (mnt_ns_id <= n->seq) {
+                        ret = node_to_mnt_ns(node);
+                        if (mnt_ns_id == n->seq)
+                                break;
+                        node = node->rb_left;
+                } else {
+                        node = node->rb_right;
+                }
+        }
+        return ret;
+}
+
+/*
+ * Lookup a mount namespace by id and take a passive reference count. Taking a
+ * passive reference means the mount namespace can be emptied if e.g., the last
+ * task holding an active reference exits. To access the mounts of the
+ * namespace the @namespace_sem must first be acquired. If the namespace has
+ * already shut down before acquiring @namespace_sem, {list,stat}mount() will
+ * see that the mount rbtree of the namespace is empty.
+ */
+static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
+{
+        struct mnt_namespace *ns;
+
+        guard(read_lock)(&mnt_ns_tree_lock);
+        ns = mnt_ns_find_id_at(mnt_ns_id);
+        if (!ns || ns->seq != mnt_ns_id)
+                return NULL;
+
+        refcount_inc(&ns->passive);
+        return ns;
+}
+
 static inline void lock_mount_hash(void)
 {
         write_seqlock(&mount_lock);
@@ -3736,8 +3841,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
         if (!is_anon_ns(ns))
                 ns_free_inum(&ns->ns);
         dec_mnt_namespaces(ns->ucounts);
-        put_user_ns(ns->user_ns);
-        kfree(ns);
+        mnt_ns_tree_remove(ns);
 }

 /*
@@ -3776,7 +3880,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
         if (!anon)
                 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
         refcount_set(&new_ns->ns.count, 1);
+        refcount_set(&new_ns->passive, 1);
         new_ns->mounts = RB_ROOT;
+        RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
         init_waitqueue_head(&new_ns->poll);
         new_ns->user_ns = get_user_ns(user_ns);
         new_ns->ucounts = ucounts;
@@ -3853,6 +3959,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                 while (p->mnt.mnt_root != q->mnt.mnt_root)
                         p = next_mnt(skip_mnt_tree(p), old);
         }
+        mnt_ns_tree_add(new_ns);
         namespace_unlock();

         if (rootmnt)
@@ -5208,6 +5315,8 @@ static void __init init_mount_tree(void)

         set_fs_pwd(current->fs, &root);
         set_fs_root(current->fs, &root);
+
+        mnt_ns_tree_add(ns);
 }

 void __init mnt_init(void)
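The rbtree added here exists so that statmount() and listmount() can resolve a mount namespace ID passed in from userspace. For context, here is a hedged userspace sketch of how a caller might drive this once the rest of the series lands. The mnt_ns_id field of struct mnt_id_req, MNT_ID_REQ_SIZE_VER1, and LSMT_ROOT are assumed from later patches in this series and the uapi headers of a kernel carrying them; the syscall number fallback is for x86_64. All of these are assumptions, not a stable contract:

/*
 * Hypothetical sketch: list the mounts of a foreign mount namespace by
 * its 64-bit namespace ID. Requires uapi headers from a kernel with this
 * series applied (struct mnt_id_req.mnt_ns_id, MNT_ID_REQ_SIZE_VER1).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

#ifndef __NR_listmount
#define __NR_listmount 458              /* x86_64; check your arch */
#endif

static ssize_t do_listmount(uint64_t mnt_ns_id, uint64_t *ids, size_t nr)
{
        struct mnt_id_req req;

        memset(&req, 0, sizeof(req));
        req.size = MNT_ID_REQ_SIZE_VER1;
        req.mnt_id = LSMT_ROOT;         /* start at the namespace's root mount */
        req.mnt_ns_id = mnt_ns_id;      /* 0 is expected to mean the caller's own ns */
        return syscall(__NR_listmount, &req, ids, nr, 0);
}

int main(int argc, char **argv)
{
        uint64_t mnt_ns_id = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
        uint64_t ids[64];
        ssize_t n = do_listmount(mnt_ns_id, ids, 64);

        if (n < 0) {
                perror("listmount");
                return 1;
        }
        for (ssize_t i = 0; i < n; i++)
                printf("mount id: %llu\n", (unsigned long long)ids[i]);
        return 0;
}

Note how this maps onto the kernel side above: the mnt_ns_id travels through lookup_mnt_ns(), which takes only a passive reference, so a namespace whose last task has exited still resolves but reports an empty mount rbtree.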