diff mbox series

[RFC,1/3] inode: add fastpath for filesystem user namespace retrieval

Message ID 20250416-work-mnt_idmap-s_user_ns-v1-1-273bef3a61ec@kernel.org (mailing list archive)
State New
Headers show
Series mnt_idmapping: avoid pointer chase & inline low-level helpers | expand

Commit Message

Christian Brauner April 16, 2025, 1:17 p.m. UTC
We currently always chase a pointer inode->i_sb->s_user_ns whenever we
need to map a uid/gid which is noticeable during path lookup as noticed
by Linus in [1]. In the majority of cases we don't need to bother with
that pointer chase because the inode won't be located on a filesystem
that's mounted in a user namespace. The user namespace of the superblock
cannot ever change once it's mounted. So introduce and raise IOP_USERNS
on all inodes and check for that flag in i_user_ns() when we retrieve
the user namespace.

Link: https://lore.kernel.org/CAHk-=whJgRDtxTudTQ9HV8BFw5-bBsu+c8Ouwd_PrPqPB6_KEQ@mail.gmail.com [1]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c                    |  6 ++++++
 fs/mnt_idmapping.c            | 14 --------------
 include/linux/fs.h            |  5 ++++-
 include/linux/mnt_idmapping.h | 14 ++++++++++++++
 4 files changed, 24 insertions(+), 15 deletions(-)

Comments

Mateusz Guzik April 16, 2025, 1:49 p.m. UTC | #1
On Wed, Apr 16, 2025 at 3:17 PM Christian Brauner <brauner@kernel.org> wrote:
>
> We currently always chase a pointer inode->i_sb->s_user_ns whenever we
> need to map a uid/gid which is noticeable during path lookup as noticed
> by Linus in [1]. In the majority of cases we don't need to bother with
> that pointer chase because the inode won't be located on a filesystem
> that's mounted in a user namespace. The user namespace of the superblock
> cannot ever change once it's mounted. So introduce and raise IOP_USERNS
> on all inodes and check for that flag in i_user_ns() when we retrieve
> the user namespace.
>
> Link: https://lore.kernel.org/CAHk-=whJgRDtxTudTQ9HV8BFw5-bBsu+c8Ouwd_PrPqPB6_KEQ@mail.gmail.com [1]
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/inode.c                    |  6 ++++++
>  fs/mnt_idmapping.c            | 14 --------------
>  include/linux/fs.h            |  5 ++++-
>  include/linux/mnt_idmapping.h | 14 ++++++++++++++
>  4 files changed, 24 insertions(+), 15 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 99318b157a9a..7335d05dd7d5 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -245,6 +245,8 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
>                 inode->i_opflags |= IOP_XATTR;
>         if (sb->s_type->fs_flags & FS_MGTIME)
>                 inode->i_opflags |= IOP_MGTIME;
> +       if (unlikely(!initial_idmapping(i_user_ns(inode))))
> +               inode->i_opflags |= IOP_USERNS;
>         i_uid_write(inode, 0);
>         i_gid_write(inode, 0);
>         atomic_set(&inode->i_writecount, 0);
> @@ -1864,6 +1866,10 @@ static void iput_final(struct inode *inode)
>
>         WARN_ON(inode->i_state & I_NEW);
>
> +       /* This is security sensitive so catch missing IOP_USERNS. */
> +       VFS_WARN_ON_ONCE(!initial_idmapping(i_user_ns(inode)) &&
> +                        !(inode->i_opflags & IOP_USERNS));
> +
>         if (op->drop_inode)
>                 drop = op->drop_inode(inode);
>         else
> diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
> index a37991fdb194..8f7ae908ea16 100644
> --- a/fs/mnt_idmapping.c
> +++ b/fs/mnt_idmapping.c
> @@ -42,20 +42,6 @@ struct mnt_idmap invalid_mnt_idmap = {
>  };
>  EXPORT_SYMBOL_GPL(invalid_mnt_idmap);
>
> -/**
> - * initial_idmapping - check whether this is the initial mapping
> - * @ns: idmapping to check
> - *
> - * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
> - * [...], 1000 to 1000 [...].
> - *
> - * Return: true if this is the initial mapping, false if not.
> - */
> -static inline bool initial_idmapping(const struct user_namespace *ns)
> -{
> -       return ns == &init_user_ns;
> -}
> -
>  /**
>   * make_vfsuid - map a filesystem kuid according to an idmapping
>   * @idmap: the mount's idmapping
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 016b0fe1536e..d28384d5b752 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -663,6 +663,7 @@ is_uncached_acl(struct posix_acl *acl)
>  #define IOP_DEFAULT_READLINK   0x0010
>  #define IOP_MGTIME     0x0020
>  #define IOP_CACHED_LINK        0x0040
> +#define IOP_USERNS     0x0080
>
>  /*
>   * Keep mostly read-only and often accessed (especially for
> @@ -1454,7 +1455,9 @@ struct super_block {
>
>  static inline struct user_namespace *i_user_ns(const struct inode *inode)
>  {
> -       return inode->i_sb->s_user_ns;
> +       if (unlikely(inode->i_opflags & IOP_USERNS))
> +               return inode->i_sb->s_user_ns;
> +       return &init_user_ns;
>  }
>
>  /* Helper functions so that in most cases filesystems will
> diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
> index e71a6070a8f8..85553b3a7904 100644
> --- a/include/linux/mnt_idmapping.h
> +++ b/include/linux/mnt_idmapping.h
> @@ -25,6 +25,20 @@ static_assert(sizeof(vfsgid_t) == sizeof(kgid_t));
>  static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val));
>  static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val));
>
> +/**
> + * initial_idmapping - check whether this is the initial mapping
> + * @ns: idmapping to check
> + *
> + * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
> + * [...], 1000 to 1000 [...].
> + *
> + * Return: true if this is the initial mapping, false if not.
> + */
> +static inline bool initial_idmapping(const struct user_namespace *ns)
> +{
> +       return ns == &init_user_ns;
> +}
> +
>  static inline bool is_valid_mnt_idmap(const struct mnt_idmap *idmap)
>  {
>         return idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap;
>
> --
> 2.47.2
>

I don't have an opinion.

But if going this way, i think you want the assert for correctly
applied flag when fetching the ns, not just iput_final.
Christian Brauner April 16, 2025, 2:14 p.m. UTC | #2
On Wed, Apr 16, 2025 at 03:49:43PM +0200, Mateusz Guzik wrote:
> On Wed, Apr 16, 2025 at 3:17 PM Christian Brauner <brauner@kernel.org> wrote:
> >
> > We currently always chase a pointer inode->i_sb->s_user_ns whenever we
> > need to map a uid/gid which is noticeable during path lookup as noticed
> > by Linus in [1]. In the majority of cases we don't need to bother with
> > that pointer chase because the inode won't be located on a filesystem
> > that's mounted in a user namespace. The user namespace of the superblock
> > cannot ever change once it's mounted. So introduce and raise IOP_USERNS
> > on all inodes and check for that flag in i_user_ns() when we retrieve
> > the user namespace.
> >
> > Link: https://lore.kernel.org/CAHk-=whJgRDtxTudTQ9HV8BFw5-bBsu+c8Ouwd_PrPqPB6_KEQ@mail.gmail.com [1]
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > ---
> >  fs/inode.c                    |  6 ++++++
> >  fs/mnt_idmapping.c            | 14 --------------
> >  include/linux/fs.h            |  5 ++++-
> >  include/linux/mnt_idmapping.h | 14 ++++++++++++++
> >  4 files changed, 24 insertions(+), 15 deletions(-)
> >
> > diff --git a/fs/inode.c b/fs/inode.c
> > index 99318b157a9a..7335d05dd7d5 100644
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > @@ -245,6 +245,8 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
> >                 inode->i_opflags |= IOP_XATTR;
> >         if (sb->s_type->fs_flags & FS_MGTIME)
> >                 inode->i_opflags |= IOP_MGTIME;
> > +       if (unlikely(!initial_idmapping(i_user_ns(inode))))
> > +               inode->i_opflags |= IOP_USERNS;
> >         i_uid_write(inode, 0);
> >         i_gid_write(inode, 0);
> >         atomic_set(&inode->i_writecount, 0);
> > @@ -1864,6 +1866,10 @@ static void iput_final(struct inode *inode)
> >
> >         WARN_ON(inode->i_state & I_NEW);
> >
> > +       /* This is security sensitive so catch missing IOP_USERNS. */
> > +       VFS_WARN_ON_ONCE(!initial_idmapping(i_user_ns(inode)) &&
> > +                        !(inode->i_opflags & IOP_USERNS));
> > +
> >         if (op->drop_inode)
> >                 drop = op->drop_inode(inode);
> >         else
> > diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
> > index a37991fdb194..8f7ae908ea16 100644
> > --- a/fs/mnt_idmapping.c
> > +++ b/fs/mnt_idmapping.c
> > @@ -42,20 +42,6 @@ struct mnt_idmap invalid_mnt_idmap = {
> >  };
> >  EXPORT_SYMBOL_GPL(invalid_mnt_idmap);
> >
> > -/**
> > - * initial_idmapping - check whether this is the initial mapping
> > - * @ns: idmapping to check
> > - *
> > - * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
> > - * [...], 1000 to 1000 [...].
> > - *
> > - * Return: true if this is the initial mapping, false if not.
> > - */
> > -static inline bool initial_idmapping(const struct user_namespace *ns)
> > -{
> > -       return ns == &init_user_ns;
> > -}
> > -
> >  /**
> >   * make_vfsuid - map a filesystem kuid according to an idmapping
> >   * @idmap: the mount's idmapping
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index 016b0fe1536e..d28384d5b752 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -663,6 +663,7 @@ is_uncached_acl(struct posix_acl *acl)
> >  #define IOP_DEFAULT_READLINK   0x0010
> >  #define IOP_MGTIME     0x0020
> >  #define IOP_CACHED_LINK        0x0040
> > +#define IOP_USERNS     0x0080
> >
> >  /*
> >   * Keep mostly read-only and often accessed (especially for
> > @@ -1454,7 +1455,9 @@ struct super_block {
> >
> >  static inline struct user_namespace *i_user_ns(const struct inode *inode)
> >  {
> > -       return inode->i_sb->s_user_ns;
> > +       if (unlikely(inode->i_opflags & IOP_USERNS))
> > +               return inode->i_sb->s_user_ns;
> > +       return &init_user_ns;
> >  }
> >
> >  /* Helper functions so that in most cases filesystems will
> > diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
> > index e71a6070a8f8..85553b3a7904 100644
> > --- a/include/linux/mnt_idmapping.h
> > +++ b/include/linux/mnt_idmapping.h
> > @@ -25,6 +25,20 @@ static_assert(sizeof(vfsgid_t) == sizeof(kgid_t));
> >  static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val));
> >  static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val));
> >
> > +/**
> > + * initial_idmapping - check whether this is the initial mapping
> > + * @ns: idmapping to check
> > + *
> > + * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
> > + * [...], 1000 to 1000 [...].
> > + *
> > + * Return: true if this is the initial mapping, false if not.
> > + */
> > +static inline bool initial_idmapping(const struct user_namespace *ns)
> > +{
> > +       return ns == &init_user_ns;
> > +}
> > +
> >  static inline bool is_valid_mnt_idmap(const struct mnt_idmap *idmap)
> >  {
> >         return idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap;
> >
> > --
> > 2.47.2
> >
> 
> I don't have an opinion.
> 
> But if going this way, i think you want the assert for correctly
> applied flag when fetching the ns, not just iput_final.

Sure.
diff mbox series

Patch

diff --git a/fs/inode.c b/fs/inode.c
index 99318b157a9a..7335d05dd7d5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -245,6 +245,8 @@  int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
 		inode->i_opflags |= IOP_XATTR;
 	if (sb->s_type->fs_flags & FS_MGTIME)
 		inode->i_opflags |= IOP_MGTIME;
+	if (unlikely(!initial_idmapping(i_user_ns(inode))))
+		inode->i_opflags |= IOP_USERNS;
 	i_uid_write(inode, 0);
 	i_gid_write(inode, 0);
 	atomic_set(&inode->i_writecount, 0);
@@ -1864,6 +1866,10 @@  static void iput_final(struct inode *inode)
 
 	WARN_ON(inode->i_state & I_NEW);
 
+	/* This is security sensitive so catch missing IOP_USERNS. */
+	VFS_WARN_ON_ONCE(!initial_idmapping(i_user_ns(inode)) &&
+			 !(inode->i_opflags & IOP_USERNS));
+
 	if (op->drop_inode)
 		drop = op->drop_inode(inode);
 	else
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index a37991fdb194..8f7ae908ea16 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -42,20 +42,6 @@  struct mnt_idmap invalid_mnt_idmap = {
 };
 EXPORT_SYMBOL_GPL(invalid_mnt_idmap);
 
-/**
- * initial_idmapping - check whether this is the initial mapping
- * @ns: idmapping to check
- *
- * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
- * [...], 1000 to 1000 [...].
- *
- * Return: true if this is the initial mapping, false if not.
- */
-static inline bool initial_idmapping(const struct user_namespace *ns)
-{
-	return ns == &init_user_ns;
-}
-
 /**
  * make_vfsuid - map a filesystem kuid according to an idmapping
  * @idmap: the mount's idmapping
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 016b0fe1536e..d28384d5b752 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -663,6 +663,7 @@  is_uncached_acl(struct posix_acl *acl)
 #define IOP_DEFAULT_READLINK	0x0010
 #define IOP_MGTIME	0x0020
 #define IOP_CACHED_LINK	0x0040
+#define IOP_USERNS	0x0080
 
 /*
  * Keep mostly read-only and often accessed (especially for
@@ -1454,7 +1455,9 @@  struct super_block {
 
 static inline struct user_namespace *i_user_ns(const struct inode *inode)
 {
-	return inode->i_sb->s_user_ns;
+	if (unlikely(inode->i_opflags & IOP_USERNS))
+		return inode->i_sb->s_user_ns;
+	return &init_user_ns;
 }
 
 /* Helper functions so that in most cases filesystems will
diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index e71a6070a8f8..85553b3a7904 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -25,6 +25,20 @@  static_assert(sizeof(vfsgid_t) == sizeof(kgid_t));
 static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val));
 static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val));
 
+/**
+ * initial_idmapping - check whether this is the initial mapping
+ * @ns: idmapping to check
+ *
+ * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
+ * [...], 1000 to 1000 [...].
+ *
+ * Return: true if this is the initial mapping, false if not.
+ */
+static inline bool initial_idmapping(const struct user_namespace *ns)
+{
+	return ns == &init_user_ns;
+}
+
 static inline bool is_valid_mnt_idmap(const struct mnt_idmap *idmap)
 {
 	return idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap;