diff mbox series

[15/15] overlayfs: make use of ->layers safe in rcu pathwalk

Message ID 20231002023711.GP3389589@ZenIV (mailing list archive)
State New, archived
Headers show
Series [01/15] rcu pathwalk: prevent bogus hard errors from may_lookup() | expand

Commit Message

Al Viro Oct. 2, 2023, 2:37 a.m. UTC
ovl_permission() accesses ->layers[...].mnt; we can't have ->layers
freed without an RCU delay on fs shutdown.  Fortunately, kern_unmount_array()
used to drop those mounts does include an RCU delay, so freeing is
delayed; unfortunately, the array passed to kern_unmount_array() is
formed by mangling ->layers contents and that happens without any
delays.

Use a separate array instead; local if we have a few layers,
kmalloc'ed if there's a lot of them.  If allocation fails,
fall back to kern_unmount() for individual mounts; it's
not a fast path by any stretch of imagination.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/ovl_entry.h |  1 -
 fs/overlayfs/params.c    | 26 ++++++++++++++++++++------
 2 files changed, 20 insertions(+), 7 deletions(-)

Comments

Amir Goldstein Oct. 2, 2023, 6:40 a.m. UTC | #1
On Mon, Oct 2, 2023 at 5:37 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> ovl_permission() accesses ->layers[...].mnt; we can't have ->layers
> freed without an RCU delay on fs shutdown.  Fortunately, kern_unmount_array()
> used to drop those mounts does include an RCU delay, so freeing is
> delayed; unfortunately, the array passed to kern_unmount_array() is
> formed by mangling ->layers contents and that happens without any
> delays.
>
> Use a separate array instead; local if we have a few layers,
> kmalloc'ed if there's a lot of them.  If allocation fails,
> fall back to kern_unmount() for individual mounts; it's
> not a fast path by any stretch of imagination.

If that is the case, then having 3 different code paths seems
quite an excessive over optimization...

I think there is a better way -
layout the mounts array linearly in ofs->mounts[] to begin with,
remove .mnt out of ovl_layer and use ofs->mounts[layer->idx] to
get to a layer's mount.

I can try to write this patch to see how it ends up looking.

Thanks,
Amir.

>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---
>  fs/overlayfs/ovl_entry.h |  1 -
>  fs/overlayfs/params.c    | 26 ++++++++++++++++++++------
>  2 files changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
> index e9539f98e86a..618b63bb7987 100644
> --- a/fs/overlayfs/ovl_entry.h
> +++ b/fs/overlayfs/ovl_entry.h
> @@ -30,7 +30,6 @@ struct ovl_sb {
>  };
>
>  struct ovl_layer {
> -       /* ovl_free_fs() relies on @mnt being the first member! */
>         struct vfsmount *mnt;
>         /* Trap in ovl inode cache */
>         struct inode *trap;
> diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
> index b9355bb6d75a..ab594fd407b4 100644
> --- a/fs/overlayfs/params.c
> +++ b/fs/overlayfs/params.c
> @@ -738,8 +738,15 @@ int ovl_init_fs_context(struct fs_context *fc)
>  void ovl_free_fs(struct ovl_fs *ofs)
>  {
>         struct vfsmount **mounts;
> +       struct vfsmount *m[16];
> +       unsigned n = ofs->numlayer;
>         unsigned i;
>
> +       if (n > 16)
> +               mounts = kmalloc_array(n, sizeof(struct mount *), GFP_KERNEL);
> +       else
> +               mounts = m;
> +
>         iput(ofs->workbasedir_trap);
>         iput(ofs->indexdir_trap);
>         iput(ofs->workdir_trap);
> @@ -752,14 +759,21 @@ void ovl_free_fs(struct ovl_fs *ofs)
>         if (ofs->upperdir_locked)
>                 ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);
>
> -       /* Hack!  Reuse ofs->layers as a vfsmount array before freeing it */
> -       mounts = (struct vfsmount **) ofs->layers;

If we are getting rid of the hack, we should remove the associated
static_assert() statements in ovl_entry.h.

> -       for (i = 0; i < ofs->numlayer; i++) {
> +       for (i = 0; i < n; i++) {
>                 iput(ofs->layers[i].trap);
> -               mounts[i] = ofs->layers[i].mnt;
> -               kfree(ofs->layers[i].name);
> +               if (unlikely(!mounts))
> +                       kern_unmount(ofs->layers[i].mnt);
> +               else
> +                       mounts[i] = ofs->layers[i].mnt;
>         }
> -       kern_unmount_array(mounts, ofs->numlayer);
> +       if (mounts) {
> +               kern_unmount_array(mounts, n);
> +               if (mounts != m)
> +                       kfree(mounts);
> +       }
> +       // by this point we had an RCU delay from kern_unmount{_array,}()
> +       for (i = 0; i < n; i++)
> +               kfree(ofs->layers[i].name);
>         kfree(ofs->layers);
>         for (i = 0; i < ofs->numfs; i++)
>                 free_anon_bdev(ofs->fs[i].pseudo_dev);
> --
> 2.39.2
>
Al Viro Oct. 2, 2023, 7:23 a.m. UTC | #2
On Mon, Oct 02, 2023 at 09:40:15AM +0300, Amir Goldstein wrote:
> On Mon, Oct 2, 2023 at 5:37 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > ovl_permission() accesses ->layers[...].mnt; we can't have ->layers
> > freed without an RCU delay on fs shutdown.  Fortunately, kern_unmount_array()
> > used to drop those mounts does include an RCU delay, so freeing is
> > delayed; unfortunately, the array passed to kern_unmount_array() is
> > formed by mangling ->layers contents and that happens without any
> > delays.
> >
> > Use a separate array instead; local if we have a few layers,
> > kmalloc'ed if there's a lot of them.  If allocation fails,
> > fall back to kern_unmount() for individual mounts; it's
> > not a fast path by any stretch of imagination.
> 
> If that is the case, then having 3 different code paths seems
> quite an excessive over optimization...
> 
> I think there is a better way -
> layout the mounts array linearly in ofs->mounts[] to begin with,
> remove .mnt out of ovl_layer and use ofs->mounts[layer->idx] to
> get to a layer's mount.
> 
> I can try to write this patch to see how it ends up looking.

Fine by me; about the only problem I see is the cache footprint...
How many layers do you consider the normal use, actually?
Amir Goldstein Oct. 2, 2023, 8:53 a.m. UTC | #3
On Mon, Oct 2, 2023 at 10:23 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Mon, Oct 02, 2023 at 09:40:15AM +0300, Amir Goldstein wrote:
> > On Mon, Oct 2, 2023 at 5:37 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > ovl_permission() accesses ->layers[...].mnt; we can't have ->layers
> > > freed without an RCU delay on fs shutdown.  Fortunately, kern_unmount_array()
> > > used to drop those mounts does include an RCU delay, so freeing is
> > > delayed; unfortunately, the array passed to kern_unmount_array() is
> > > formed by mangling ->layers contents and that happens without any
> > > delays.
> > >
> > > Use a separate array instead; local if we have a few layers,
> > > kmalloc'ed if there's a lot of them.  If allocation fails,
> > > fall back to kern_unmount() for individual mounts; it's
> > > not a fast path by any stretch of imagination.
> >
> > If that is the case, then having 3 different code paths seems
> > quite an excessive over optimization...
> >
> > I think there is a better way -
> > layout the mounts array linearly in ofs->mounts[] to begin with,
> > remove .mnt out of ovl_layer and use ofs->mounts[layer->idx] to
> > get to a layer's mount.
> >
> > I can try to write this patch to see how it ends up looking.
>
> Fine by me; about the only problem I see is the cache footprint...

I've also considered just allocating the extra space for
ofs->mounts[] at super creation time rather than on super destruction.
I just cannot get myself to be bothered with this cleanup code
because of saving memory of ovl_fs.

However, looking closer, we have a wasteful layer->name pointer,
which is only ever used for ovl_show_options() (to print the original
requested layer path from mount options).

So I am inclined to move these rarely accessed pointers to
ofs->layer_names[], which can be used for the temp array for
kern_unmount_array() because freeing name does not need
RCU delay AFAICT (?).

I will take a swing at it.

> How many layers do you consider the normal use, actually?

Define "normal".
Currently, we have #define OVL_MAX_STACK 500
and being able to create an overlay with many layers was
cited as one of the requirements to drive the move to new mount api:

https://lore.kernel.org/linux-unionfs/20230605-fs-overlayfs-mount_api-v3-0-730d9646b27d@kernel.org/

I am not really familiar with the user workloads that need that many layers
and why. It's obviously not the common case.

Thanks,
Amir.
Al Viro Oct. 3, 2023, 8:47 p.m. UTC | #4
On Mon, Oct 02, 2023 at 11:53:23AM +0300, Amir Goldstein wrote:

> I've also considered just allocating the extra space for
> ofs->mounts[] at super creation time rather than on super destruction.
> I just cannot get myself to be bothered with this cleanup code
> because of saving memory of ovl_fs.
> 
> However, looking closer, we have a wasteful layer->name pointer,
> which is only ever used for ovl_show_options() (to print the original
> requested layer path from mount options).
> 
> So I am inclined to move these rarely accessed pointers to
> ofs->layer_names[], which can be used for the temp array for
> kern_unmount_array() because freeing name does not need
> RCU delay AFAICT (?).

AFAICS, it doesn't.  The only user after the setup is done is
->show_options(), i.e. show_vfsmnt() and show_mountinfo().
Anyone who tries to use those without making sure that
struct mount is not going to come apart under them will
have far worse troubles...

FWIW, I'm perfectly fine with having these fixes go through your tree;
they are independent from the rest of the series.
diff mbox series

Patch

diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index e9539f98e86a..618b63bb7987 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -30,7 +30,6 @@  struct ovl_sb {
 };
 
 struct ovl_layer {
-	/* ovl_free_fs() relies on @mnt being the first member! */
 	struct vfsmount *mnt;
 	/* Trap in ovl inode cache */
 	struct inode *trap;
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index b9355bb6d75a..ab594fd407b4 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -738,8 +738,15 @@  int ovl_init_fs_context(struct fs_context *fc)
 void ovl_free_fs(struct ovl_fs *ofs)
 {
 	struct vfsmount **mounts;
+	struct vfsmount *m[16];
+	unsigned n = ofs->numlayer;
 	unsigned i;
 
+	if (n > 16)
+		mounts = kmalloc_array(n, sizeof(struct mount *), GFP_KERNEL);
+	else
+		mounts = m;
+
 	iput(ofs->workbasedir_trap);
 	iput(ofs->indexdir_trap);
 	iput(ofs->workdir_trap);
@@ -752,14 +759,21 @@  void ovl_free_fs(struct ovl_fs *ofs)
 	if (ofs->upperdir_locked)
 		ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);
 
-	/* Hack!  Reuse ofs->layers as a vfsmount array before freeing it */
-	mounts = (struct vfsmount **) ofs->layers;
-	for (i = 0; i < ofs->numlayer; i++) {
+	for (i = 0; i < n; i++) {
 		iput(ofs->layers[i].trap);
-		mounts[i] = ofs->layers[i].mnt;
-		kfree(ofs->layers[i].name);
+		if (unlikely(!mounts))
+			kern_unmount(ofs->layers[i].mnt);
+		else
+			mounts[i] = ofs->layers[i].mnt;
 	}
-	kern_unmount_array(mounts, ofs->numlayer);
+	if (mounts) {
+		kern_unmount_array(mounts, n);
+		if (mounts != m)
+			kfree(mounts);
+	}
+	// by this point we had an RCU delay from kern_unmount{_array,}()
+	for (i = 0; i < n; i++)
+		kfree(ofs->layers[i].name);
 	kfree(ofs->layers);
 	for (i = 0; i < ofs->numfs; i++)
 		free_anon_bdev(ofs->fs[i].pseudo_dev);