diff mbox

[REVIEW] mnt: In umount propagation reparent in a separate pass

Message ID 871srpj2yp.fsf_-_@xmission.com (mailing list archive)
State New, archived
Headers show

Commit Message

Eric W. Biederman May 15, 2017, 8:10 p.m. UTC
It was observed that in some pathlogical cases that the current code
does not unmount everything it should.  After investigation it was
determined that the issue is that mnt_change_mntpoint can can change
which mounts are available to be unmounted during mount propagation
which is wrong.

The trivial reproducer is:
$ cat ./pathological.sh

mount -t tmpfs test-base /mnt
cd /mnt
mkdir 1 2 1/1
mount --bind 1 1
mount --make-shared 1
mount --bind 1 2
mount --bind 1/1 1/1
mount --bind 1/1 1/1
echo
grep test-base /proc/self/mountinfo
umount 1/1
echo
grep test-base /proc/self/mountinfo

$ unshare -Urm ./pathological.sh

The expected output looks like:
46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000

46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000

The output without the fix looks like:
46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000

46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000

That last mount in the output was in the propgation tree to be unmounted but
was missed because the mnt_change_mountpoint changed it's parent before the walk
through the mount propagation tree observed it.

Cc: stable@vger.kernel.org
Fixes: 1064f874abc0 ("mnt: Tuck mounts under others instead of creating shadow/side mounts.")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/mount.h     |  1 +
 fs/namespace.c |  1 +
 fs/pnode.c     | 35 ++++++++++++++++++++++++++++++-----
 3 files changed, 32 insertions(+), 5 deletions(-)

Comments

Andrey Vagin May 15, 2017, 11:12 p.m. UTC | #1
On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
> 
> It was observed that in some pathlogical cases that the current code
> does not unmount everything it should.  After investigation it was
> determined that the issue is that mnt_change_mntpoint can can change
> which mounts are available to be unmounted during mount propagation
> which is wrong.
> 
> The trivial reproducer is:
> $ cat ./pathological.sh
> 
> mount -t tmpfs test-base /mnt
> cd /mnt
> mkdir 1 2 1/1
> mount --bind 1 1
> mount --make-shared 1
> mount --bind 1 2
> mount --bind 1/1 1/1
> mount --bind 1/1 1/1
> echo
> grep test-base /proc/self/mountinfo
> umount 1/1
> echo
> grep test-base /proc/self/mountinfo
> 
> $ unshare -Urm ./pathological.sh
> 
> The expected output looks like:
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> The output without the fix looks like:
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> That last mount in the output was in the propgation tree to be unmounted but
> was missed because the mnt_change_mountpoint changed it's parent before the walk
> through the mount propagation tree observed it.
> 

It works for me and the patch looks correct. Thanks!

Acked-by: Andrei Vagin <avagin@virtuozzo.com>

> Cc: stable@vger.kernel.org
> Fixes: 1064f874abc0 ("mnt: Tuck mounts under others instead of creating shadow/side mounts.")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/mount.h     |  1 +
>  fs/namespace.c |  1 +
>  fs/pnode.c     | 35 ++++++++++++++++++++++++++++++-----
>  3 files changed, 32 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index bf1fda6eed8f..ede5a1d5cf99 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -58,6 +58,7 @@ struct mount {
>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> +	struct list_head mnt_reparent;	/* reparent list entry */
>  #ifdef CONFIG_FSNOTIFY
>  	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
>  	__u32 mnt_fsnotify_mask;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 8bd3e4d448b9..51e49866e1fe 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> +		INIT_LIST_HEAD(&mnt->mnt_reparent);
>  		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
>  	}
>  	return mnt;
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 5bc7896d122a..52aca0a118ff 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -439,7 +439,7 @@ static void mark_umount_candidates(struct mount *mnt)
>   * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
>   * parent propagates to.
>   */
> -static void __propagate_umount(struct mount *mnt)
> +static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
>  {
>  	struct mount *parent = mnt->mnt_parent;
>  	struct mount *m;
> @@ -464,17 +464,38 @@ static void __propagate_umount(struct mount *mnt)
>  		 */
>  		topper = find_topper(child);
>  		if (topper)
> -			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
> -					      topper);
> +			list_add_tail(&topper->mnt_reparent, to_reparent);
>  
> -		if (list_empty(&child->mnt_mounts)) {
> +		if (topper || list_empty(&child->mnt_mounts)) {
>  			list_del_init(&child->mnt_child);
> +			list_del_init(&child->mnt_reparent);
>  			child->mnt.mnt_flags |= MNT_UMOUNT;
>  			list_move_tail(&child->mnt_list, &mnt->mnt_list);
>  		}
>  	}
>  }
>  
> +static void reparent_mounts(struct list_head *to_reparent)
> +{
> +	while (!list_empty(to_reparent)) {
> +		struct mount *mnt, *parent;
> +		struct mountpoint *mp;
> +
> +		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
> +		list_del_init(&mnt->mnt_reparent);
> +
> +		/* Where should this mount be reparented to? */
> +		mp = mnt->mnt_mp;
> +		parent = mnt->mnt_parent;
> +		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
> +			mp = parent->mnt_mp;
> +			parent = parent->mnt_parent;
> +		}
> +
> +		mnt_change_mountpoint(parent, mp, mnt);
> +	}
> +}
> +
>  /*
>   * collect all mounts that receive propagation from the mount in @list,
>   * and return these additional mounts in the same list.
> @@ -485,11 +506,15 @@ static void __propagate_umount(struct mount *mnt)
>  int propagate_umount(struct list_head *list)
>  {
>  	struct mount *mnt;
> +	LIST_HEAD(to_reparent);
>  
>  	list_for_each_entry_reverse(mnt, list, mnt_list)
>  		mark_umount_candidates(mnt);
>  
>  	list_for_each_entry(mnt, list, mnt_list)
> -		__propagate_umount(mnt);
> +		__propagate_umount(mnt, &to_reparent);
> +
> +	reparent_mounts(&to_reparent);
> +
>  	return 0;
>  }
> -- 
> 2.10.1
>
Ram Pai May 22, 2017, 8:15 a.m. UTC | #2
On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
> 
> It was observed that in some pathlogical cases that the current code
> does not unmount everything it should.  After investigation it was
> determined that the issue is that mnt_change_mntpoint can can change
> which mounts are available to be unmounted during mount propagation
> which is wrong.
> 
> The trivial reproducer is:
> $ cat ./pathological.sh
> 
> mount -t tmpfs test-base /mnt
> cd /mnt
> mkdir 1 2 1/1
> mount --bind 1 1
> mount --make-shared 1
> mount --bind 1 2
> mount --bind 1/1 1/1
> mount --bind 1/1 1/1
> echo
> grep test-base /proc/self/mountinfo
> umount 1/1
> echo
> grep test-base /proc/self/mountinfo
> 
> $ unshare -Urm ./pathological.sh
> 
> The expected output looks like:
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> The output without the fix looks like:
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> That last mount in the output was in the propgation tree to be unmounted but
> was missed because the mnt_change_mountpoint changed it's parent before the walk
> through the mount propagation tree observed it.
> 

Looks patch correct to me.
Reviewed-by: Ram Pai <linuxram@us.ibm.com>

BTW: The logic of find_topper() looks not-so-accurate to me. Why dont we
explicitly flag tucked mounts with MNT_TUCKED, and use that information
to determine if the child is really a topper?  Currently we determine
the topper if it entirely is covering. How do we diambiguate that from an
entirely-covering-mount that is explicitly mounted by the administrator?
A topper situation is applicable only when tucked, right?


> Cc: stable@vger.kernel.org
> Fixes: 1064f874abc0 ("mnt: Tuck mounts under others instead of creating shadow/side mounts.")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/mount.h     |  1 +
>  fs/namespace.c |  1 +
>  fs/pnode.c     | 35 ++++++++++++++++++++++++++++++-----
>  3 files changed, 32 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index bf1fda6eed8f..ede5a1d5cf99 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -58,6 +58,7 @@ struct mount {
>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> +	struct list_head mnt_reparent;	/* reparent list entry */
>  #ifdef CONFIG_FSNOTIFY
>  	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
>  	__u32 mnt_fsnotify_mask;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 8bd3e4d448b9..51e49866e1fe 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> +		INIT_LIST_HEAD(&mnt->mnt_reparent);
>  		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
>  	}
>  	return mnt;
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 5bc7896d122a..52aca0a118ff 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -439,7 +439,7 @@ static void mark_umount_candidates(struct mount *mnt)
>   * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
>   * parent propagates to.
>   */
> -static void __propagate_umount(struct mount *mnt)
> +static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
>  {
>  	struct mount *parent = mnt->mnt_parent;
>  	struct mount *m;
> @@ -464,17 +464,38 @@ static void __propagate_umount(struct mount *mnt)
>  		 */
>  		topper = find_topper(child);
>  		if (topper)
> -			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
> -					      topper);
> +			list_add_tail(&topper->mnt_reparent, to_reparent);
> 
> -		if (list_empty(&child->mnt_mounts)) {
> +		if (topper || list_empty(&child->mnt_mounts)) {
>  			list_del_init(&child->mnt_child);
> +			list_del_init(&child->mnt_reparent);
>  			child->mnt.mnt_flags |= MNT_UMOUNT;
>  			list_move_tail(&child->mnt_list, &mnt->mnt_list);
>  		}
>  	}
>  }
> 
> +static void reparent_mounts(struct list_head *to_reparent)
> +{
> +	while (!list_empty(to_reparent)) {
> +		struct mount *mnt, *parent;
> +		struct mountpoint *mp;
> +
> +		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
> +		list_del_init(&mnt->mnt_reparent);
> +
> +		/* Where should this mount be reparented to? */
> +		mp = mnt->mnt_mp;
> +		parent = mnt->mnt_parent;
> +		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
> +			mp = parent->mnt_mp;
> +			parent = parent->mnt_parent;
> +		}
> +
> +		mnt_change_mountpoint(parent, mp, mnt);
> +	}
> +}
> +
>  /*
>   * collect all mounts that receive propagation from the mount in @list,
>   * and return these additional mounts in the same list.
> @@ -485,11 +506,15 @@ static void __propagate_umount(struct mount *mnt)
>  int propagate_umount(struct list_head *list)
>  {
>  	struct mount *mnt;
> +	LIST_HEAD(to_reparent);
> 
>  	list_for_each_entry_reverse(mnt, list, mnt_list)
>  		mark_umount_candidates(mnt);
> 
>  	list_for_each_entry(mnt, list, mnt_list)
> -		__propagate_umount(mnt);
> +		__propagate_umount(mnt, &to_reparent);
> +
> +	reparent_mounts(&to_reparent);
> +
>  	return 0;
>  }
> -- 
> 2.10.1
Eric W. Biederman May 22, 2017, 6:33 p.m. UTC | #3
Ram Pai <linuxram@us.ibm.com> writes:

> On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
>> 
>> It was observed that in some pathlogical cases that the current code
>> does not unmount everything it should.  After investigation it was
>> determined that the issue is that mnt_change_mntpoint can can change
>> which mounts are available to be unmounted during mount propagation
>> which is wrong.
>> 
>> The trivial reproducer is:
>> $ cat ./pathological.sh
>> 
>> mount -t tmpfs test-base /mnt
>> cd /mnt
>> mkdir 1 2 1/1
>> mount --bind 1 1
>> mount --make-shared 1
>> mount --bind 1 2
>> mount --bind 1/1 1/1
>> mount --bind 1/1 1/1
>> echo
>> grep test-base /proc/self/mountinfo
>> umount 1/1
>> echo
>> grep test-base /proc/self/mountinfo
>> 
>> $ unshare -Urm ./pathological.sh
>> 
>> The expected output looks like:
>> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 
>> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 
>> The output without the fix looks like:
>> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 
>> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 
>> That last mount in the output was in the propgation tree to be unmounted but
>> was missed because the mnt_change_mountpoint changed it's parent before the walk
>> through the mount propagation tree observed it.
>> 
>
> Looks patch correct to me.
> Reviewed-by: Ram Pai <linuxram@us.ibm.com>
>
> BTW: The logic of find_topper() looks not-so-accurate to me. Why dont we
> explicitly flag tucked mounts with MNT_TUCKED, and use that information
> to determine if the child is really a topper?  Currently we determine
> the topper if it entirely is covering. How do we diambiguate that from an
> entirely-covering-mount that is explicitly mounted by the administrator?
> A topper situation is applicable only when tucked, right?

In the current code explictly does not care about the difference.
The code just restricts untucking mounts of any kind to umount
propagation.

This is where we have previously disagreed.

A short summary of our previous discussions:
Eric Biederman: find_topper makes tucked mounts ordinary mounts and is simple.
Eric Biederman: I don't see a compelling case for a MNT_TUCKED flag
Eric Biederman: I think the change is a nice behavioral improvement
Ram Pai: a MNT_TUCKED flag would perfectly preserve existing behavior
Ram Pai: find_topper while not perfect is better than the previous
         very special case for side/shadow mounts

With respect to backwards compatibility the set of bugs I am fixing
shows that it is possible to have some very egregious bugs in this
area and in practice no one cares.


Without a MNT_TUCKED flag I can readily tell what the following
code should do by simply inspection of the of the mount
propgation information in /proc/self/mountinfo:

$ mount -t tmpfs test-base /mnt
$ cd /mnt
$ mkdir -p 1 2 1/1
$ mount --bind 1 1
$ mount --make-shared 1
$ mount --bind 1 2
$ mount --bind 1/1 1/1
$ mount --bind 1/1 1/1
$ umount 1/1

Before the umount /proc/self/mountinfo shows:
 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=502,gid=502
 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502

So it is clear to me that umount /mnt/1/1 should just leave:
/mnt
/mnt/1
/mnt/2

I would argue that is what the code should always have done.

I believe the code with a MNT_TUCKED flag would leave:
/mnt
/mnt/1
/mnt/1/1
/mnt/2
But in truth it makes my head hurt to think about it.
I don't see that MNT_TUCKED adds anything except aditional code
complexity.

I don't actually see what the value is in keeping mounts that you can
not use (because they are overmounted) around.

If the scenarios we were talking about were all limited to perfoming a
mount and then undoing that mount I could almost see some value in a
MNT_TUCKED flag.  Given that one of the justications for tucking mounts
in the first place is what happens when you umount something on a slave
mount I really don't like it.  As now I get the question what happens
on a slave mount where a mount has been propagated and tucked, and
then the topper is unmounted and a new topper is added.  Should unmount
on the parent untuck the propagated mount or leave it there?  It was
propagated it was tucked but it wasn't tucked under what is currenty on
top.

I much prefer the current semantics where we just say mount propagation
can tuck and untuck things, and the history of how the mount tree got
into its current shape is not important.

Given how difficult it has been to make this code performant and correct
I am not particularly eager to add complexity for unnecessary bug
compatibility.  But if it creates a breaking regression for something
(other than a regression test) I am willing to add MNT_TUCKED.

Eric
Ram Pai May 22, 2017, 10:34 p.m. UTC | #4
On Mon, May 22, 2017 at 01:33:05PM -0500, Eric W. Biederman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> 
> > On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
> >> 
> >> It was observed that in some pathlogical cases that the current code
> >> does not unmount everything it should.  After investigation it was
> >> determined that the issue is that mnt_change_mntpoint can can change
> >> which mounts are available to be unmounted during mount propagation
> >> which is wrong.
> >> 
> >> The trivial reproducer is:
> >> $ cat ./pathological.sh
> >> 
> >> mount -t tmpfs test-base /mnt
> >> cd /mnt
> >> mkdir 1 2 1/1
> >> mount --bind 1 1
> >> mount --make-shared 1
> >> mount --bind 1 2
> >> mount --bind 1/1 1/1
> >> mount --bind 1/1 1/1
> >> echo
> >> grep test-base /proc/self/mountinfo
> >> umount 1/1
> >> echo
> >> grep test-base /proc/self/mountinfo
> >> 
> >> $ unshare -Urm ./pathological.sh
> >> 
> >> The expected output looks like:
> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 
> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 
> >> The output without the fix looks like:
> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 
> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 
> >> That last mount in the output was in the propgation tree to be unmounted but
> >> was missed because the mnt_change_mountpoint changed it's parent before the walk
> >> through the mount propagation tree observed it.
> >> 
> >
> > Looks patch correct to me.
> > Reviewed-by: Ram Pai <linuxram@us.ibm.com>
> >
> > BTW: The logic of find_topper() looks not-so-accurate to me. Why dont we
> > explicitly flag tucked mounts with MNT_TUCKED, and use that information
> > to determine if the child is really a topper?  Currently we determine
> > the topper if it entirely is covering. How do we diambiguate that from an
> > entirely-covering-mount that is explicitly mounted by the administrator?
> > A topper situation is applicable only when tucked, right?
> 
> In the current code explictly does not care about the difference.
> The code just restricts untucking mounts of any kind to umount
> propagation.
> 
> This is where we have previously disagreed.
> 
> A short summary of our previous discussions:
> Eric Biederman: find_topper makes tucked mounts ordinary mounts and is simple.
> Eric Biederman: I don't see a compelling case for a MNT_TUCKED flag
> Eric Biederman: I think the change is a nice behavioral improvement
> Ram Pai: a MNT_TUCKED flag would perfectly preserve existing behavior
> Ram Pai: find_topper while not perfect is better than the previous
>          very special case for side/shadow mounts
> 
> With respect to backwards compatibility the set of bugs I am fixing
> shows that it is possible to have some very egregious bugs in this
> area and in practice no one cares.
> 
> 
> Without a MNT_TUCKED flag I can readily tell what the following
> code should do by simply inspection of the of the mount
> propgation information in /proc/self/mountinfo:
> 
Step 1>  $ mount -t tmpfs test-base /mnt
Step 2>  $ cd /mnt
Step 3>  $ mkdir -p 1 2 1/1
Step 4>  $ mount --bind 1 1
Step 5>  $ mount --make-shared 1
Step 6>  $ mount --bind 1 2
Step 7>  $ mount --bind 1/1 1/1
Step 8>  $ mount --bind 1/1 1/1
Step 9>  $ umount 1/1
> 
> Before the umount /proc/self/mountinfo shows:
>  46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=502,gid=502
>  47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
> 
> So it is clear to me that umount /mnt/1/1 should just leave:
> /mnt
> /mnt/1
> /mnt/2

This is one other place where we disagree....  I expect things to peel
back to the state the trees were, when the Step 7 was executed. which
is 
/mnt 
/mnt/1
/mnt/2
/mnt/1/1
/mnt/2/1
And all tucked mounts disapper.

Dont get me wrong. I dont think we will agree because we have different
expections. There is no standard on what to expect. Someone
authoritative; may be Al Viro, has to define what to expect. 

> 
> I would argue that is what the code should always have done.
> 
> I believe the code with a MNT_TUCKED flag would leave:
> /mnt
> /mnt/1
> /mnt/1/1
> /mnt/2
> But in truth it makes my head hurt to think about it.

Yes it is extremely mind-bending; sometimes mind-bleeding. :-(

> I don't see that MNT_TUCKED adds anything except aditional code
> complexity.
> 
> I don't actually see what the value is in keeping mounts that you can
> not use (because they are overmounted) around.

I argue that MNT_TUCKED leaves markers that can be used to determine
what can be taken out and what needs to be kept.

I will stop here and say.. there is value in marking TUCKED mounts.
Someone will run into some obscure issue in the future; probably a
decade from now, and the same story will repeat.

I wish there was a mathematical formula, where you plugin the operation
and a state of the trees, and the new state of the mount-trees emerge.

For now your patches look good to me.
RP

> 
> If the scenarios we were talking about were all limited to perfoming a
> mount and then undoing that mount I could almost see some value in a
> MNT_TUCKED flag.  Given that one of the justications for tucking mounts
> in the first place is what happens when you umount something on a slave
> mount I really don't like it.  As now I get the question what happens
> on a slave mount where a mount has been propagated and tucked, and
> then the topper is unmounted and a new topper is added.  Should unmount
> on the parent untuck the propagated mount or leave it there?  It was
> propagated it was tucked but it wasn't tucked under what is currenty on
> top.
> 
> I much prefer the current semantics where we just say mount propagation
> can tuck and untuck things, and the history of how the mount tree got
> into its current shape is not important.
> 
> Given how difficult it has been to make this code performant and correct
> I am not particularly eager to add complexity for unnecessary bug
> compatibility.  But if it creates a breaking regression for something
> (other than a regression test) I am willing to add MNT_TUCKED.
> 
> Eric
Eric W. Biederman May 23, 2017, 1:58 p.m. UTC | #5
Ram Pai <linuxram@us.ibm.com> writes:

> On Mon, May 22, 2017 at 01:33:05PM -0500, Eric W. Biederman wrote:
>> Ram Pai <linuxram@us.ibm.com> writes:
>> 
>> > On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
>> >> 
>> >> It was observed that in some pathlogical cases that the current code
>> >> does not unmount everything it should.  After investigation it was
>> >> determined that the issue is that mnt_change_mntpoint can can change
>> >> which mounts are available to be unmounted during mount propagation
>> >> which is wrong.
>> >> 
>> >> The trivial reproducer is:
>> >> $ cat ./pathological.sh
>> >> 
>> >> mount -t tmpfs test-base /mnt
>> >> cd /mnt
>> >> mkdir 1 2 1/1
>> >> mount --bind 1 1
>> >> mount --make-shared 1
>> >> mount --bind 1 2
>> >> mount --bind 1/1 1/1
>> >> mount --bind 1/1 1/1
>> >> echo
>> >> grep test-base /proc/self/mountinfo
>> >> umount 1/1
>> >> echo
>> >> grep test-base /proc/self/mountinfo
>> >> 
>> >> $ unshare -Urm ./pathological.sh
>> >> 
>> >> The expected output looks like:
>> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 
>> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 
>> >> The output without the fix looks like:
>> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 
>> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 
>> >> That last mount in the output was in the propgation tree to be unmounted but
>> >> was missed because the mnt_change_mountpoint changed it's parent before the walk
>> >> through the mount propagation tree observed it.
>> >> 
>> >
>> > Looks patch correct to me.
>> > Reviewed-by: Ram Pai <linuxram@us.ibm.com>
>> >
>> > BTW: The logic of find_topper() looks not-so-accurate to me. Why dont we
>> > explicitly flag tucked mounts with MNT_TUCKED, and use that information
>> > to determine if the child is really a topper?  Currently we determine
>> > the topper if it entirely is covering. How do we diambiguate that from an
>> > entirely-covering-mount that is explicitly mounted by the administrator?
>> > A topper situation is applicable only when tucked, right?
>> 
>> In the current code explictly does not care about the difference.
>> The code just restricts untucking mounts of any kind to umount
>> propagation.
>> 
>> This is where we have previously disagreed.
>> 
>> A short summary of our previous discussions:
>> Eric Biederman: find_topper makes tucked mounts ordinary mounts and is simple.
>> Eric Biederman: I don't see a compelling case for a MNT_TUCKED flag
>> Eric Biederman: I think the change is a nice behavioral improvement
>> Ram Pai: a MNT_TUCKED flag would perfectly preserve existing behavior
>> Ram Pai: find_topper while not perfect is better than the previous
>>          very special case for side/shadow mounts
>> 
>> With respect to backwards compatibility the set of bugs I am fixing
>> shows that it is possible to have some very egregious bugs in this
>> area and in practice no one cares.
>> 
>> 
>> Without a MNT_TUCKED flag I can readily tell what the following
>> code should do by simply inspection of the of the mount
>> propgation information in /proc/self/mountinfo:
>> 
> Step 1>  $ mount -t tmpfs test-base /mnt
> Step 2>  $ cd /mnt
> Step 3>  $ mkdir -p 1 2 1/1
> Step 4>  $ mount --bind 1 1
> Step 5>  $ mount --make-shared 1
> Step 6>  $ mount --bind 1 2
> Step 7>  $ mount --bind 1/1 1/1
> Step 8>  $ mount --bind 1/1 1/1
> Step 9>  $ umount 1/1
>> 
>> Before the umount /proc/self/mountinfo shows:
>>  46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=502,gid=502
>>  47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>> 
>> So it is clear to me that umount /mnt/1/1 should just leave:
>> /mnt
>> /mnt/1
>> /mnt/2
>
> This is one other place where we disagree....  I expect things to peel
> back to the state the trees were, when the Step 7 was executed. which
> is 
> /mnt 
> /mnt/1
> /mnt/2
> /mnt/1/1
> /mnt/2/1
> And all tucked mounts disapper.
>
> Dont get me wrong. I dont think we will agree because we have different
> expections. There is no standard on what to expect. Someone
> authoritative; may be Al Viro, has to define what to expect. 
>
>> 
>> I would argue that is what the code should always have done.
>> 
>> I believe the code with a MNT_TUCKED flag would leave:
>> /mnt
>> /mnt/1
>> /mnt/1/1
>> /mnt/2
>> But in truth it makes my head hurt to think about it.
>
> Yes it is extremely mind-bending; sometimes mind-bleeding. :-(
>
>> I don't see that MNT_TUCKED adds anything except aditional code
>> complexity.
>> 
>> I don't actually see what the value is in keeping mounts that you can
>> not use (because they are overmounted) around.
>
> I argue that MNT_TUCKED leaves markers that can be used to determine
> what can be taken out and what needs to be kept.
>
> I will stop here and say.. there is value in marking TUCKED mounts.
> Someone will run into some obscure issue in the future; probably a
> decade from now, and the same story will repeat.
>
> I wish there was a mathematical formula, where you plugin the operation
> and a state of the trees, and the new state of the mount-trees emerge.
>
> For now your patches look good to me.

Then let me ask you this.  Please look at the two successor patches to
this.  I am confident in them but I also know I am human and may have
missed something.  

Then on top of those et's look at marking tucked mounts.  If we write
the code and examine motivating cases where the behavior differs we
should be able to make an informed choice.

I say motivating cases as there are use cases with slave mounts that
motivated the support of tucking mounts.  I figure if we go back through
and examine those we can at least see if there are any cases where
without marking them we have a practical issue.  Or perhaps we will
see that for all cases that we can think of that matter there are no
differences.

I am motivated to solve this because after fixing the perfomance issues
we need find a way for some set of mount namespaces to recreate their
mount propgation tree on a different machine.   That is needed for
CRIU and it may be needed for the plan9 case where logging into a remote
system you could bring all of your filesystems with you into your own
personal mount namespace.

Eric
diff mbox

Patch

diff --git a/fs/mount.h b/fs/mount.h
index bf1fda6eed8f..ede5a1d5cf99 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,6 +58,7 @@  struct mount {
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
 	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+	struct list_head mnt_reparent;	/* reparent list entry */
 #ifdef CONFIG_FSNOTIFY
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index 8bd3e4d448b9..51e49866e1fe 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -236,6 +236,7 @@  static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
+		INIT_LIST_HEAD(&mnt->mnt_reparent);
 		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
 	}
 	return mnt;
diff --git a/fs/pnode.c b/fs/pnode.c
index 5bc7896d122a..52aca0a118ff 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -439,7 +439,7 @@  static void mark_umount_candidates(struct mount *mnt)
  * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
  * parent propagates to.
  */
-static void __propagate_umount(struct mount *mnt)
+static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
 {
 	struct mount *parent = mnt->mnt_parent;
 	struct mount *m;
@@ -464,17 +464,38 @@  static void __propagate_umount(struct mount *mnt)
 		 */
 		topper = find_topper(child);
 		if (topper)
-			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
-					      topper);
+			list_add_tail(&topper->mnt_reparent, to_reparent);
 
-		if (list_empty(&child->mnt_mounts)) {
+		if (topper || list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
+			list_del_init(&child->mnt_reparent);
 			child->mnt.mnt_flags |= MNT_UMOUNT;
 			list_move_tail(&child->mnt_list, &mnt->mnt_list);
 		}
 	}
 }
 
+static void reparent_mounts(struct list_head *to_reparent)
+{
+	while (!list_empty(to_reparent)) {
+		struct mount *mnt, *parent;
+		struct mountpoint *mp;
+
+		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
+		list_del_init(&mnt->mnt_reparent);
+
+		/* Where should this mount be reparented to? */
+		mp = mnt->mnt_mp;
+		parent = mnt->mnt_parent;
+		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
+			mp = parent->mnt_mp;
+			parent = parent->mnt_parent;
+		}
+
+		mnt_change_mountpoint(parent, mp, mnt);
+	}
+}
+
 /*
  * collect all mounts that receive propagation from the mount in @list,
  * and return these additional mounts in the same list.
@@ -485,11 +506,15 @@  static void __propagate_umount(struct mount *mnt)
 int propagate_umount(struct list_head *list)
 {
 	struct mount *mnt;
+	LIST_HEAD(to_reparent);
 
 	list_for_each_entry_reverse(mnt, list, mnt_list)
 		mark_umount_candidates(mnt);
 
 	list_for_each_entry(mnt, list, mnt_list)
-		__propagate_umount(mnt);
+		__propagate_umount(mnt, &to_reparent);
+
+	reparent_mounts(&to_reparent);
+
 	return 0;
 }