diff mbox

mount -o noexdev

Message ID 20160508183542.GA16724@p183.telecom.by (mailing list archive)
State New, archived
Headers show

Commit Message

Alexey Dobriyan May 8, 2016, 6:35 p.m. UTC
Searching for "rename bind mount exdev" shows that failure with EXDEV
seems somewhat unintuitive behaviour. Allow users to bypass
this restriction with "-o noexdev" flag if the source of operation is on
such mount.

Keep old semantics default so "mount --bind /tmp /tmp" works.

"mount --bind" will inherit the "noexdev" flag from the parent mount, but it
can be cleared with mount(MS_REMOUNT), so it is possible to create an exclave
with regular mount point crossing rules inside a mount with relaxed mount
point rules.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 fs/namei.c              |    4 ++--
 fs/namespace.c          |    2 ++
 fs/proc_namespace.c     |    1 +
 include/linux/mount.h   |   15 ++++++++++++++-
 include/uapi/linux/fs.h |    1 +
 5 files changed, 20 insertions(+), 3 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Al Viro May 8, 2016, 7:46 p.m. UTC | #1
On Sun, May 08, 2016 at 09:35:42PM +0300, Alexey Dobriyan wrote:
> Searching for "rename bind mount exdev" shows that failure with EXDEV
> seems somewhat unintuitive behaviour. Allow users to bypass
> this restriction with "-o noexdev" flag if the source of operation is on
> such mount.
> 
> Keep old semantics default so "mount --bind /tmp /tmp" works.
> 
> "mount --bind" will inherit "noexdev" flag from parent mount but it can
> be cleared with mount(MS_REMOUNT) so it is possible to create exclave
> with regular mount point crossing rules inside mount with relaxed mount
> point rules.

NAK.  At least until you bother to explore the consequences of such
rename for vfsmounts involved.  Hint: look at the semantics of ..
and mountpoint crossing.

It's a bloody bad idea; we have to cope with attackers who'd managed to
do that kind of rename using a mount of a bigger subtree, but that's
"cope" - it's not a normal situation and the price is non-trivial.

... and before you go into "if you don't want it, don't mount that way, what's
the problem?", consider our, ah, noble adversaries who'd been very clear
regarding their treatment of any optional features.  I do _not_ want to
end up with the situation when systemd-infested distributions run the setups
that use this thing and any reports along the lines "it's trivial to degrade
the performance on that setup" get bounced our way.  With "no, we are not
going to stop depending on that feature; if the kernel folks had a problem with
it, they shouldn't have merged it in the first place" tacked on top of those
reports.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4118,7 +4118,7 @@  retry:
 		goto out;
 
 	error = -EXDEV;
-	if (old_path.mnt != new_path.mnt)
+	if (!mnt_can_cross(old_path.mnt, new_path.mnt))
 		goto out_dput;
 	error = may_linkat(&old_path);
 	if (unlikely(error))
@@ -4379,7 +4379,7 @@  retry:
 	}
 
 	error = -EXDEV;
-	if (old_path.mnt != new_path.mnt)
+	if (!mnt_can_cross(old_path.mnt, new_path.mnt))
 		goto exit2;
 
 	error = -EBUSY;
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2708,6 +2708,8 @@  long do_mount(const char *dev_name, const char __user *dir_name,
 		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
 	if (flags & MS_RDONLY)
 		mnt_flags |= MNT_READONLY;
+	if (flags & MS_NOEXDEV)
+		mnt_flags |= MNT_NOEXDEV;
 
 	/* The default atime for remount is preservation */
 	if ((flags & MS_REMOUNT) &&
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -67,6 +67,7 @@  static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 		{ MNT_NOATIME, ",noatime" },
 		{ MNT_NODIRATIME, ",nodiratime" },
 		{ MNT_RELATIME, ",relatime" },
+		{ MNT_NOEXDEV, ",noexdev" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -28,6 +28,7 @@  struct mnt_namespace;
 #define MNT_NODIRATIME	0x10
 #define MNT_RELATIME	0x20
 #define MNT_READONLY	0x40	/* does the user want this to be r/o? */
+#define MNT_NOEXDEV	0x80	/* allow link(), rename() to cross mount point */
 
 #define MNT_SHRINKABLE	0x100
 #define MNT_WRITE_HOLD	0x200
@@ -44,7 +45,7 @@  struct mnt_namespace;
 #define MNT_SHARED_MASK	(MNT_UNBINDABLE)
 #define MNT_USER_SETTABLE_MASK  (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
 				 | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
-				 | MNT_READONLY)
+				 | MNT_READONLY | MNT_NOEXDEV)
 #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
 
 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
@@ -95,4 +96,16 @@  extern void mark_mounts_for_expiry(struct list_head *mounts);
 
 extern dev_t name_to_dev_t(const char *name);
 
+/*
+ * Can operation be done in mnt1 => mnt2 direction?
+ * Not symmetric relation!
+ */
+static inline bool mnt_can_cross(struct vfsmount *mnt1, struct vfsmount *mnt2)
+{
+	if (mnt1 == mnt2)
+		return true;
+	if ((mnt1->mnt_flags & MNT_NOEXDEV) && mnt1->mnt_sb == mnt2->mnt_sb)
+		return true;
+	return false;
+}
 #endif /* _LINUX_MOUNT_H */
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -130,6 +130,7 @@  struct inodes_stat_t {
 #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
 #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
+#define MS_NOEXDEV	(1<<26) /* Allow link(), rename() to cross mount point */
 
 /* These sb flags are internal to the kernel */
 #define MS_NOSEC	(1<<28)