[GIT,PULL] User namespace related changes for v4.3

Message ID	87io876aac.fsf@x220.int.ebiederm.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-fsdevel-owner@kernel.org> From: ebiederm@xmission.com (Eric W. Biederman) To: Linus Torvalds <torvalds@linux-foundation.org> Cc: <linux-kernel@vger.kernel.org>, Linux Containers <containers@lists.linux-foundation.org>, <linux-fsdevel@vger.kernel.org> User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/24.3 (gnu/linux) Date: Sat, 22 Aug 2015 02:40:27 -0500 Message-ID: <87io876aac.fsf@x220.int.ebiederm.org> MIME-Version: 1.0 Content-Type: text/plain parse: 2.1 (0.1%), extract_message_metadata: 19 (0.9%), get_uri_detail_list: 16 (0.7%), tests_pri_-1000: 6 (0.3%), tests_pri_-950: 2.2 (0.1%), tests_pri_-900: 1.76 (0.1%), tests_pri_-400: 86 (4.1%), check_bayes: 83 (4.0%), b_tokenize: 43 (2.1%), b_tok_get_all: 21 (1.0%), b_comp_prob: 8 (0.4%), b_tok_touch_all: 6 (0.3%), b_finish: 1.01 (0.0%), tests_pri_0: 1929 (92.7%), tests_pri_500: 8 (0.4%), rewrite_mail: 0.00 (0.0%) Subject: [GIT PULL] User namespace related changes for v4.3 Sender: linux-fsdevel-owner@vger.kernel.org Precedence: bulk

diff --git a/fs/exec.c b/fs/exec.c index 1977c2a553ac..b06623a9347f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -98,6 +98,12 @@ static inline void put_binfmt(struct linux_binfmt * fmt) module_put(fmt->module); } +bool path_noexec(const struct path *path) +{ + return (path->mnt->mnt_flags & MNT_NOEXEC) || + (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); +} + #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to @@ -132,7 +138,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) goto exit; error = -EACCES; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&file->f_path)) goto exit; fsnotify_open(file); @@ -777,7 +783,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) if (!S_ISREG(file_inode(file)->i_mode)) goto exit; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&file->f_path)) goto exit; err = deny_write_access(file); diff --git a/fs/libfs.c b/fs/libfs.c index 102edfd39000..c7cbfb092e94 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1185,7 +1185,7 @@ void make_empty_dir_inode(struct inode *inode) inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_rdev = 0; - inode->i_size = 2; + inode->i_size = 0; inode->i_blkbits = PAGE_SHIFT; inode->i_blocks = 0; diff --git a/fs/namespace.c b/fs/namespace.c index c7cb8a526c05..ce428cadd41f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3194,6 +3194,8 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) down_read(&namespace_sem); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; + int mnt_flags; + if (mnt->mnt.mnt_sb->s_type != type) continue; @@ -3203,17 +3205,30 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; + /* Read the mount flags and filter out flags that + * may safely be ignored. + */ + mnt_flags = mnt->mnt.mnt_flags; + if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC) + mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC); + /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ - if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; - if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + if ((mnt_flags & MNT_LOCK_NODEV) && !(new_flags & MNT_NODEV)) continue; - if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && - ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) + if ((mnt_flags & MNT_LOCK_NOSUID) && + !(new_flags & MNT_NOSUID)) + continue; + if ((mnt_flags & MNT_LOCK_NOEXEC) && + !(new_flags & MNT_NOEXEC)) + continue; + if ((mnt_flags & MNT_LOCK_ATIME) && + ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; /* This mount is not fully visible if there are any @@ -3223,16 +3238,18 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ - if (!(mnt->mnt.mnt_flags & MNT_LOCKED)) + if (!(mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanetly empty? */ if (!is_empty_dir_inode(inode)) goto next; } /* Preserve the locked attributes */ - *new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \ - MNT_LOCK_NODEV | \ - MNT_LOCK_ATIME); + *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ + MNT_LOCK_NODEV | \ + MNT_LOCK_NOSUID | \ + MNT_LOCK_NOEXEC | \ + MNT_LOCK_ATIME); visible = true; goto found; next: ; diff --git a/fs/nsfs.c b/fs/nsfs.c index 99521e7c492b..e4905fbf3396 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -4,6 +4,7 @@ #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/ktime.h> +#include <linux/seq_file.h> static struct vfsmount *nsfs_mnt; @@ -136,9 +137,18 @@ out_invalid: return ERR_PTR(-EINVAL); } +static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + + return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); +} + static const struct super_operations nsfs_ops = { .statfs = simple_statfs, .evict_inode = nsfs_evict, + .show_path = nsfs_show_path, }; static struct dentry *nsfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) diff --git a/fs/open.c b/fs/open.c index e33dab287fa0..b6f1e96a7c0b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -377,7 +377,7 @@ retry: * with the "noexec" flag. */ res = -EACCES; - if (path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&path)) goto out_path_release; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 68feb0f70e63..361ab4ee42fc 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } sb->s_flags |= MS_ACTIVE; + /* User space would break if executables appear on proc */ + sb->s_iflags |= SB_I_NOEXEC; } return dget(sb->s_root); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 1c6ac6fcee9f..f3db82071cfb 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -40,6 +40,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, SYSFS_MAGIC, &new_sb, ns); if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); + else if (new_sb) + /* Userspace would break if executables appear on sysfs */ + root->d_sb->s_iflags |= SB_I_NOEXEC; + return root; } diff --git a/include/linux/fs.h b/include/linux/fs.h index a0653e560c26..42912f8d286e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1244,6 +1244,7 @@ struct mm_struct; /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ +#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ /* Possible states of 'frozen' field */ enum { @@ -3030,4 +3031,6 @@ static inline bool dir_relax(struct inode *inode) return !IS_DEADDIR(inode); } +extern bool path_noexec(const struct path *path); + #endif /* _LINUX_FS_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 1bfefc6f96a4..2c72b8a8ae24 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1273,10 +1273,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* * If the new process will be in a different pid or user namespace - * do not allow it to share a thread group or signal handlers or - * parent with the forking task. + * do not allow it to share a thread group with the forking task. */ - if (clone_flags & CLONE_SIGHAND) { + if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != current->nsproxy->pid_ns_for_children)) @@ -1866,13 +1865,21 @@ static int check_unshare_flags(unsigned long unshare_flags) CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* - * Not implemented, but pretend it works if there is nothing to - * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND - * needs to unshare vm. + * Not implemented, but pretend it works if there is nothing + * to unshare. Note that unsharing the address space or the + * signal handlers also need to unshare the signal queues (aka + * CLONE_THREAD). */ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { - /* FIXME: get_task_mm() increments ->mm_users */ - if (atomic_read(&current->mm->mm_users) > 1) + if (!thread_group_empty(current)) + return -EINVAL; + } + if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { + if (atomic_read(&current->sighand->count) > 1) + return -EINVAL; + } + if (unshare_flags & CLONE_VM) { + if (!current_is_single_threaded()) return -EINVAL; } @@ -1936,21 +1943,22 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) int err; /* - * If unsharing a user namespace must also unshare the thread. + * If unsharing a user namespace must also unshare the thread group + * and unshare the filesystem root and working directories. */ if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; /* - * If unsharing a thread from a thread group, must also unshare vm. - */ - if (unshare_flags & CLONE_THREAD) - unshare_flags |= CLONE_VM; - /* * If unsharing vm, must also unshare signal handlers. */ if (unshare_flags & CLONE_VM) unshare_flags |= CLONE_SIGHAND; /* + * If unsharing a signal handlers, must also unshare the signal queues. + */ + if (unshare_flags & CLONE_SIGHAND) + unshare_flags |= CLONE_THREAD; + /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) diff --git a/kernel/sys.c b/kernel/sys.c index 259fda25eb6b..fa2f2f671a5c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * overall picture. */ err = -EACCES; - if (!S_ISREG(inode->i_mode) || - exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) goto exit; err = inode_permission(inode, MAY_EXEC); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 4109f8320684..f65a0a06a8c0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -976,8 +976,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) if (user_ns == current_user_ns()) return -EINVAL; - /* Threaded processes may not enter a different user namespace */ - if (atomic_read(&current->mm->mm_users) > 1) + /* Tasks that share a thread group must share a user namespace */ + if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) diff --git a/mm/mmap.c b/mm/mmap.c index aa632ade2be7..f126923ce683 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1268,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, * mounted, in which case we dont add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) - if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) + if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; if (!(flags & MAP_FIXED)) @@ -1337,7 +1337,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; diff --git a/mm/nommu.c b/mm/nommu.c index 58ea3643b9e9..ce17abf087ff 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1035,7 +1035,7 @@ static int validate_mmap_request(struct file *file, /* handle executable mappings and implied executable * mappings */ - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (path_noexec(&file->f_path)) { if (prot & PROT_EXEC) return -EPERM; } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { diff --git a/security/security.c b/security/security.c index 595fffab48b0..062f3c997fdc 100644 --- a/security/security.c +++ b/security/security.c @@ -776,7 +776,7 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot) * ditto if it's not on noexec mount, except that on !MMU we need * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case */ - if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) { + if (!path_noexec(&file->f_path)) { #ifndef CONFIG_MMU if (file->f_op->mmap_capabilities) { unsigned caps = file->f_op->mmap_capabilities(file);

[GIT,PULL] User namespace related changes for v4.3

Commit Message

Patch