Message ID | 20221206152358.1966099-4-jeffxu@google.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | mm/memfd: introduce MFD_NOEXEC_SEAL and MFD_EXEC | expand |
On Tue, Dec 06, 2022 at 03:23:55PM +0000, jeffxu@chromium.org wrote: > --- a/kernel/pid_namespace.c > +++ b/kernel/pid_namespace.c > @@ -110,6 +110,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > ns->ucounts = ucounts; > ns->pid_allocated = PIDNS_ADDING; > > +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > + ns->memfd_noexec_scope = > + task_active_pid_ns(current)->memfd_noexec_scope; > +#endif .c files should never have #if in them. Can't you put this in a .h file properly so that this does not get really messy over time? > + > return ns; > > out_free_idr: > @@ -255,6 +260,45 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) > return; > } > > +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) Same here. thanks, greg k-h
On Tue, Dec 6, 2022 at 8:04 AM Greg KH <gregkh@linuxfoundation.org> wrote: > > On Tue, Dec 06, 2022 at 03:23:55PM +0000, jeffxu@chromium.org wrote: > > --- a/kernel/pid_namespace.c > > +++ b/kernel/pid_namespace.c > > @@ -110,6 +110,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > > ns->ucounts = ucounts; > > ns->pid_allocated = PIDNS_ADDING; > > > > +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > > + ns->memfd_noexec_scope = > > + task_active_pid_ns(current)->memfd_noexec_scope; > > +#endif > > .c files should never have #if in them. Can't you put this in a .h file > properly so that this does not get really messy over time? > > Thanks for reviewing. It seems to me that checking for CONFIG_XXX is common in c code in kernel/ path. Do you have a sample code pattern (link/function) that I can follow? Thanks Jeff > > > + > > return ns; > > > > out_free_idr: > > @@ -255,6 +260,45 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) > > return; > > } > > > > +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > > Same here. > > thanks, > > greg k-h
On Tue, Dec 06, 2022 at 08:26:30AM -0800, Jeff Xu wrote: > On Tue, Dec 6, 2022 at 8:04 AM Greg KH <gregkh@linuxfoundation.org> wrote: > > > > On Tue, Dec 06, 2022 at 03:23:55PM +0000, jeffxu@chromium.org wrote: > > > --- a/kernel/pid_namespace.c > > > +++ b/kernel/pid_namespace.c > > > @@ -110,6 +110,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > > > ns->ucounts = ucounts; > > > ns->pid_allocated = PIDNS_ADDING; > > > > > > +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > > > + ns->memfd_noexec_scope = > > > + task_active_pid_ns(current)->memfd_noexec_scope; > > > +#endif > > > > .c files should never have #if in them. Can't you put this in a .h file > > properly so that this does not get really messy over time? > > > > > Thanks for reviewing. > It seems to me that checking for CONFIG_XXX is common in c code in > kernel/ path. Maybe, but please don't make it any worse if at all possible. It's tough to maintain code like that. > Do you have a sample code pattern (link/function) that I can follow? Any of the zillions of #if statements in .h files :) thanks, greg k-h
On Tue, Dec 6, 2022 at 8:35 AM Greg KH <gregkh@linuxfoundation.org> wrote: > > On Tue, Dec 06, 2022 at 08:26:30AM -0800, Jeff Xu wrote: > > On Tue, Dec 6, 2022 at 8:04 AM Greg KH <gregkh@linuxfoundation.org> wrote: > > > > > > On Tue, Dec 06, 2022 at 03:23:55PM +0000, jeffxu@chromium.org wrote: > > > > --- a/kernel/pid_namespace.c > > > > +++ b/kernel/pid_namespace.c > > > > @@ -110,6 +110,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > > > > ns->ucounts = ucounts; > > > > ns->pid_allocated = PIDNS_ADDING; > > > > > > > > +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > > > > + ns->memfd_noexec_scope = > > > > + task_active_pid_ns(current)->memfd_noexec_scope; > > > > +#endif > > > > > > .c files should never have #if in them. Can't you put this in a .h file > > > properly so that this does not get really messy over time? > > > > > > > > Thanks for reviewing. > > It seems to me that checking for CONFIG_XXX is common in c code in > > kernel/ path. > > Maybe, but please don't make it any worse if at all possible. It's > tough to maintain code like that. > > > Do you have a sample code pattern (link/function) that I can follow? > > Any of the zillions of #if statements in .h files :) > Thanks. I will take the approach of having real/stub implementation in the h file, and the c file using it without a compile flag. Please kindly let me know if this is not right. Thanks Jeff > thanks, > > greg k-h
On Tue, Dec 06, 2022 at 09:48:55AM -0800, Jeff Xu wrote: > On Tue, Dec 6, 2022 at 8:35 AM Greg KH <gregkh@linuxfoundation.org> wrote: > > > > On Tue, Dec 06, 2022 at 08:26:30AM -0800, Jeff Xu wrote: > > > On Tue, Dec 6, 2022 at 8:04 AM Greg KH <gregkh@linuxfoundation.org> wrote: > > > > > > > > On Tue, Dec 06, 2022 at 03:23:55PM +0000, jeffxu@chromium.org wrote: > > > > > --- a/kernel/pid_namespace.c > > > > > +++ b/kernel/pid_namespace.c > > > > > @@ -110,6 +110,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > > > > > ns->ucounts = ucounts; > > > > > ns->pid_allocated = PIDNS_ADDING; > > > > > > > > > > +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > > > > > + ns->memfd_noexec_scope = > > > > > + task_active_pid_ns(current)->memfd_noexec_scope; > > > > > +#endif > > > > > > > > .c files should never have #if in them. Can't you put this in a .h file > > > > properly so that this does not get really messy over time? > > > > > > > > > > > Thanks for reviewing. > > > It seems to me that checking for CONFIG_XXX is common in c code in > > > kernel/ path. > > > > Maybe, but please don't make it any worse if at all possible. It's > > tough to maintain code like that. > > > > > Do you have a sample code pattern (link/function) that I can follow? > > > > Any of the zillions of #if statements in .h files :) > > > Thanks. > I will take the approach of having real/stub implementation in the h > file, and the c file using it without a compile flag. > Please kindly let me know if this is not right. Right; for example: in .h: #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) static inline void ns_copy_memfd_scope(... dst, ... src) { dst->memfd_noexec_scope = src->memfd_noexec_scope; } #else static inline void ns_set_memfd_scope(... ns, ... scope) { } #endif in .c: ns_copy_memfd_scope(ns, task_active_pid_ns(current));
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 07481bb87d4e..a4789a7b34a9 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -16,6 +16,21 @@ struct fs_pin; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +/* + * sysctl for vm.memfd_noexec + * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_EXEC was set. + * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_NOEXEC_SEAL was set. + * 2: memfd_create() without MFD_NOEXEC_SEAL will be + * rejected. + */ +#define MEMFD_NOEXEC_SCOPE_EXEC 0 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 +#endif + struct pid_namespace { struct idr idr; struct rcu_head rcu; @@ -31,6 +46,10 @@ struct pid_namespace { struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + /* sysctl for vm.memfd_noexec */ + int memfd_noexec_scope; +#endif } __randomize_layout; extern struct pid_namespace init_pid_ns; diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 7a8a26751c23..273a4e15dfcf 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -8,6 +8,10 @@ #define MFD_CLOEXEC 0x0001U #define MFD_ALLOW_SEALING 0x0002U #define MFD_HUGETLB 0x0004U +/* not executable and sealed to prevent changing to executable. */ +#define MFD_NOEXEC_SEAL 0x0008U +/* executable */ +#define MFD_EXEC 0x0010U /* * Huge page size encoding when MFD_HUGETLB is specified, and a huge page diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f4f8cb0435b4..2b7563ddd22c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -110,6 +110,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + ns->memfd_noexec_scope = + task_active_pid_ns(current)->memfd_noexec_scope; +#endif + return ns; out_free_idr: @@ -255,6 +260,45 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct ctl_table table_copy; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + table_copy = *table; + if (ns != &init_pid_ns) + table_copy.data = &ns->memfd_noexec_scope; + + /* + * set minimum to current value, the effect is only bigger + * value is accepted. + */ + if (*(int *)table_copy.data > *(int *)table_copy.extra1) + table_copy.extra1 = table_copy.data; + + return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table_vm[] = { + { + .procname = "memfd_noexec", + .data = &init_pid_ns.memfd_noexec_scope, + .maxlen = sizeof(init_pid_ns.memfd_noexec_scope), + .mode = 0644, + .proc_handler = pid_mfd_noexec_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; +static struct ctl_path vm_path[] = { { .procname = "vm", }, { } }; +#endif + #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -455,6 +499,10 @@ static __init int pid_namespaces_init(void) #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); #endif + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + register_sysctl_paths(vm_path, pid_ns_ctl_table_vm); +#endif return 0; } diff --git a/mm/memfd.c b/mm/memfd.c index 4ebeab94aa74..ec70675a7069 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -18,6 +18,7 @@ #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/memfd.h> +#include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> /* @@ -263,12 +264,14 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { + char comm[TASK_COMM_LEN]; + struct pid_namespace *ns; unsigned int *file_seals; struct file *file; int fd, error; @@ -285,6 +288,39 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; } + /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ + if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) + return -EINVAL; + + if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { +#ifdef CONFIG_SYSCTL + int sysctl = MEMFD_NOEXEC_SCOPE_EXEC; + + ns = task_active_pid_ns(current); + if (ns) + sysctl = ns->memfd_noexec_scope; + + switch (sysctl) { + case MEMFD_NOEXEC_SCOPE_EXEC: + flags |= MFD_EXEC; + break; + case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL: + flags |= MFD_NOEXEC_SEAL; + break; + default: + pr_warn_ratelimited( + "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n", + task_pid_nr(current), get_task_comm(comm, current)); + return -EINVAL; + } +#else + flags |= MFD_EXEC; +#endif + pr_warn_ratelimited( + "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n", + task_pid_nr(current), get_task_comm(comm, current)); + } + /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); if (len <= 0) @@ -328,7 +364,15 @@ SYSCALL_DEFINE2(memfd_create, file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; - if (flags & MFD_ALLOW_SEALING) { + if (flags & MFD_NOEXEC_SEAL) { + struct inode *inode = file_inode(file); + + inode->i_mode &= ~0111; + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } else if (flags & MFD_ALLOW_SEALING) { + /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); *file_seals &= ~F_SEAL_SEAL; }