@@ -483,6 +483,7 @@ GLOBAL(\label)
PTREGSCALL stub32_execveat, compat_sys_execveat
PTREGSCALL stub32_fork, sys_fork
PTREGSCALL stub32_vfork, sys_vfork
+ PTREGSCALL stub32_clone4, compat_sys_clone4
ALIGN
GLOBAL(stub32_clone)
@@ -520,6 +520,7 @@ END(\label)
FORK_LIKE clone
FORK_LIKE fork
FORK_LIKE vfork
+ FORK_LIKE clone4
FIXED_FRAME stub_iopl, sys_iopl
ENTRY(stub_execve)
@@ -365,3 +365,4 @@
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
358 i386 execveat sys_execveat stub32_execveat
+359 i386 clone4 sys_clone4 stub32_clone4
@@ -329,6 +329,7 @@
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat stub_execveat
+323 64 clone4 stub_clone4
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -368,3 +369,4 @@
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
545 x32 execveat stub_x32_execveat
+546 x32 clone4 stub32_clone4
@@ -293,6 +293,14 @@ struct compat_old_sigaction {
};
#endif
+struct compat_clone4_args {
+ compat_uptr_t ptid;
+ compat_uptr_t ctid;
+ compat_ulong_t stack_start;
+ compat_ulong_t stack_size;
+ compat_ulong_t tls;
+};
+
struct compat_statfs;
struct compat_statfs64;
struct compat_old_linux_dirent;
@@ -713,6 +721,10 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
int, const char __user *);
+
+asmlinkage long compat_sys_clone4(unsigned, unsigned, compat_ulong_t,
+ struct compat_clone4_args __user *);
+
#else
#define is_compat_task() (0)
@@ -1,6 +1,8 @@
#ifndef _UAPI_LINUX_SCHED_H
#define _UAPI_LINUX_SCHED_H
+#include <linux/types.h>
+
/*
* cloning flags:
*/
@@ -18,11 +20,8 @@
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
-#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
-/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
- and is now available for re-use. */
#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
@@ -31,6 +30,34 @@
#define CLONE_IO 0x80000000 /* Clone io context */
/*
+ * Old flags, unused by current clone. clone does not return EINVAL for these
+ * flags, so they can't easily be reused. clone4 can use them.
+ */
+#define CLONE_PID 0x00001000
+#define CLONE_DETACHED 0x00400000
+#define CLONE_STOPPED 0x02000000
+
+/*
+ * Valid flags for clone and for clone4
+ */
+#define CLONE_VALID_FLAGS (0xffffffffULL & ~(CLONE_PID | CLONE_DETACHED | CLONE_STOPPED))
+#define CLONE4_VALID_FLAGS CLONE_VALID_FLAGS
+
+/*
+ * Structure passed to clone4 for additional arguments. Initialized to 0,
+ * then overwritten with arguments from userspace, so arguments not supplied by
+ * userspace will remain 0. New versions of the kernel may safely append new
+ * arguments to the end.
+ */
+struct clone4_args {
+ __kernel_pid_t __user *ptid;
+ __kernel_pid_t __user *ctid;
+ __kernel_ulong_t stack_start;
+ __kernel_ulong_t stack_size;
+ __kernel_ulong_t tls;
+};
+
+/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
@@ -1511,6 +1511,16 @@ config EVENTFD
If unsure, say Y.
+config CLONE4
+ bool "Enable clone4() system call" if EXPERT
+ depends on HAVE_COPY_THREAD_TLS
+ default y
+ help
+ Enable the clone4() system call, which supports passing additional
+ flags.
+
+ If unsure, say Y.
+
# syscall, maps, verifier
config BPF_SYSCALL
bool "Enable bpf() system call" if EXPERT
@@ -1187,7 +1187,7 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static struct task_struct *copy_process(unsigned long clone_flags,
+static struct task_struct *copy_process(u64 clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
@@ -1198,6 +1198,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
int retval;
struct task_struct *p;
+ if (clone_flags & ~CLONE4_VALID_FLAGS)
+ return ERR_PTR(-EINVAL);
+
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1630,7 +1633,7 @@ struct task_struct *fork_idle(int cpu)
* it and waits for it to finish using the VM if required.
*/
static long _do_fork(
- unsigned long clone_flags,
+ u64 clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
@@ -1701,6 +1704,15 @@ static long _do_fork(
return nr;
}
+/*
+ * Convenience function for callers passing unsigned long flags, to prevent old
+ * syscall entry points from unexpectedly returning EINVAL.
+ */
+static inline u64 squelch_clone_flags(unsigned long clone_flags)
+{
+	/* Drop the stale CLONE_PID/CLONE_DETACHED/CLONE_STOPPED bits (and any
+	 * high bits) rather than keeping them: legacy clone ignored these. */
+	return (u32)clone_flags & CLONE_VALID_FLAGS;
+}
+
#ifndef CONFIG_HAVE_COPY_THREAD_TLS
/* For compatibility with architectures that call do_fork directly rather than
* using the syscall entry points below. */
@@ -1710,7 +1722,8 @@ long do_fork(unsigned long clone_flags,
int __user *parent_tidptr,
int __user *child_tidptr)
{
- return _do_fork(clone_flags, stack_start, stack_size,
+ return _do_fork(squelch_clone_flags(clone_flags),
+ stack_start, stack_size,
parent_tidptr, child_tidptr, 0);
}
#endif
@@ -1768,10 +1781,49 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
unsigned long, tls)
#endif
{
- return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+ return _do_fork(squelch_clone_flags(clone_flags), newsp, 0,
+ parent_tidptr, child_tidptr, tls);
}
#endif
+#ifdef CONFIG_CLONE4
+SYSCALL_DEFINE4(clone4, unsigned, flags_high, unsigned, flags_low,
+		unsigned long, args_size, struct clone4_args __user *, args)
+{
+	struct clone4_args kargs = {};
+	if (args_size > sizeof(kargs)) {
+		return -EINVAL;
+	} else if (args_size) {
+		/* copy_from_user returns the number of bytes NOT copied
+		 * (never negative); any nonzero result means -EFAULT. */
+		if (copy_from_user(&kargs, args, args_size))
+			return -EFAULT;
+	}
+	return _do_fork((u64)flags_high << 32 | flags_low,
+			kargs.stack_start, kargs.stack_size,
+			kargs.ptid, kargs.ctid, kargs.tls);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(clone4, unsigned, flags_high, unsigned, flags_low,
+		       compat_ulong_t, args_size,
+		       struct compat_clone4_args __user *, args)
+{
+	struct compat_clone4_args kargs = {};
+	if (args_size > sizeof(kargs)) {
+		return -EINVAL;
+	} else if (args_size) {
+		/* copy_from_user returns the number of bytes NOT copied
+		 * (never negative); any nonzero result means -EFAULT. */
+		if (copy_from_user(&kargs, args, args_size))
+			return -EFAULT;
+	}
+	return _do_fork((u64)flags_high << 32 | flags_low,
+			kargs.stack_start, kargs.stack_size,
+			compat_ptr(kargs.ptid), compat_ptr(kargs.ctid),
+			kargs.tls);
+}
+#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_CLONE4 */
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
@@ -159,6 +159,8 @@ cond_syscall(sys_uselib);
cond_syscall(sys_fadvise64);
cond_syscall(sys_fadvise64_64);
cond_syscall(sys_madvise);
+cond_syscall(sys_clone4);
+cond_syscall(compat_sys_clone4);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);