diff mbox

[v2,7/7] clone4: Add a CLONE_FD flag to get task exit notification via fd

Message ID fdec4b70c7cd34e2eacf6a0e41d36f606a696da1.1426376419.git.josh@joshtriplett.org (mailing list archive)
State New, archived
Headers show

Commit Message

Josh Triplett March 15, 2015, 8 a.m. UTC
When passed CLONE_FD, clone4 hands the caller a file descriptor
referring to the new process.  When the new process exits, the file
descriptor becomes readable, producing a structure containing the exit
status, exit code, and user/system times.  The file descriptor also
works in epoll, poll, and select.

This allows libraries to safely launch and manage child processes on
behalf of a caller, without taking over or interfering with process-wide
signal handling.  Without this, such a library would need to take over
or cooperate with the entire process's SIGCHLD handling, either via a
signal handler or a signalfd.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Thiago Macieira <thiago.macieira@intel.com>
---
 include/linux/compat.h     |   2 +
 include/linux/sched.h      |   5 ++
 include/uapi/linux/sched.h |  16 +++++-
 init/Kconfig               |  11 +++++
 kernel/Makefile            |   1 +
 kernel/clonefd.c           | 121 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/clonefd.h           |  32 ++++++++++++
 kernel/exit.c              |   4 ++
 kernel/fork.c              |  22 +++++++--
 9 files changed, 209 insertions(+), 5 deletions(-)
 create mode 100644 kernel/clonefd.c
 create mode 100644 kernel/clonefd.h

Comments

David Drysdale March 23, 2015, 5:38 p.m. UTC | #1
On Sun, Mar 15, 2015 at 8:00 AM, Josh Triplett <josh@joshtriplett.org> wrote:
> diff --git a/include/linux/compat.h b/include/linux/compat.h
> index 6c4a68d..c90df5a 100644
> --- a/include/linux/compat.h
> +++ b/include/linux/compat.h
> @@ -299,6 +299,8 @@ struct compat_clone4_args {
>         compat_ulong_t stack_start;
>         compat_ulong_t stack_size;
>         compat_ulong_t tls;
> +       compat_uptr_t clonefd;
> +       u32 clonefd_flags;
>  };
>
>  struct compat_statfs;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 9daa017..1dc680b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1374,6 +1374,11 @@ struct task_struct {
>
>         unsigned autoreap:1; /* Do not become a zombie on exit */
>
> +#ifdef CONFIG_CLONEFD
> +       unsigned clonefd:1; /* Notify clonefd_wqh on exit */
> +       wait_queue_head_t clonefd_wqh;
> +#endif
> +
>         unsigned long atomic_flags; /* Flags needing atomic access. */
>
>         struct restart_block restart_block;

Idle thought: are there any concerns about the occupancy
impact of adding a wait_queue_head to every task_struct,
whether it has a clonefd or not?

I guess we could reduce the size somewhat by just
storing a struct file *clonefd_file in the task, and then have
a separate structure (with the wqh and a task_struct*) referenced
by file->private_data.  Not sure whether the added complication
would be worthwhile, though.

> diff --git a/kernel/clonefd.c b/kernel/clonefd.c
> new file mode 100644
> index 0000000..eac560c
> --- /dev/null
> +++ b/kernel/clonefd.c
> @@ -0,0 +1,121 @@
> +/*
> + * Support functions for CLONE_FD
> + *
> + * Copyright (c) 2015 Intel Corporation
> + * Original authors: Josh Triplett <josh@joshtriplett.org>
> + *                   Thiago Macieira <thiago@macieira.org>
> + */
> +#include <linux/anon_inodes.h>
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/poll.h>
> +#include <linux/slab.h>
> +#include "clonefd.h"
> +
> +static int clonefd_release(struct inode *inode, struct file *file)
> +{
> +       put_task_struct(file->private_data);
> +       return 0;
> +}
> +
> +static unsigned int clonefd_poll(struct file *file, poll_table *wait)
> +{
> +       struct task_struct *p = file->private_data;
> +       poll_wait(file, &p->clonefd_wqh, wait);
> +       return p->exit_state ? (POLLIN | POLLRDNORM | POLLHUP) : 0;
> +}
> +
> +static ssize_t clonefd_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> +{
> +       struct task_struct *p = file->private_data;
> +       int ret = 0;
> +
> +       /* EOF after first read */
> +       if (*ppos)
> +               return 0;
> +
> +       if (file->f_flags & O_NONBLOCK)
> +               ret = -EAGAIN;
> +       else
> +               ret = wait_event_interruptible(p->clonefd_wqh, p->exit_state);
> +
> +       if (p->exit_state) {
> +               struct clonefd_info info = {};
> +               cputime_t utime, stime;
> +               task_exit_code_status(p->exit_code, &info.code, &info.status);
> +               info.code &= ~__SI_MASK;
> +               task_cputime(p, &utime, &stime);
> +               info.utime = cputime_to_clock_t(utime + p->signal->utime);
> +               info.stime = cputime_to_clock_t(stime + p->signal->stime);
> +               ret = simple_read_from_buffer(buf, count, ppos, &info, sizeof(info));
> +       }
> +       return ret;
> +}
> +
> +static struct file_operations clonefd_fops = {
> +       .release = clonefd_release,
> +       .poll = clonefd_poll,
> +       .read = clonefd_read,
> +       .llseek = no_llseek,
> +};

It might be nice to include a show_fdinfo() implementation that shows
(say) the pid that the clonefd refers to.  E.g. something like:

static void clonefd_show_fdinfo(struct seq_file *m, struct file *file)
{
    struct task_struct *p = file->private_data;

    seq_printf(m, "tid:\t%d\n", task_tgid_vnr(p));
}

> +
> +/* Do process exit notification for clonefd. */
> +void clonefd_do_notify(struct task_struct *p)
> +{
> +       if (p->clonefd)
> +               wake_up_all(&p->clonefd_wqh);
> +}
> +
> +/* Handle the CLONE_FD case for copy_process. */
> +int clonefd_do_clone(u64 clone_flags, struct task_struct *p,
> +                    struct clone4_args *args, struct clonefd_setup *setup)
> +{
> +       int flags;
> +       struct file *file;
> +       int fd;
> +
> +       p->clonefd = !!(clone_flags & CLONE_FD);
> +       if (!p->clonefd)
> +               return 0;
> +
> +       if (args->clonefd_flags & ~(O_CLOEXEC | O_NONBLOCK))
> +               return -EINVAL;
> +

Maybe also check for (args->clonefd == NULL) in advance, and
return -EINVAL or -EFAULT?

> +       init_waitqueue_head(&p->clonefd_wqh);
> +
> +       get_task_struct(p);
> +       flags = O_RDONLY | FMODE_ATOMIC_POS | args->clonefd_flags;
> +       file = anon_inode_getfile("[process]", &clonefd_fops, p, flags);
> +       if (IS_ERR(file)) {
> +               put_task_struct(p);
> +               return PTR_ERR(file);
> +       }
> +
> +       fd = get_unused_fd_flags(flags);
> +       if (fd < 0) {
> +               fput(file);
> +               return fd;
> +       }
> +
> +       setup->fd = fd;
> +       setup->file = file;
> +       return 0;
> +}
> +
> +/* Clean up clonefd information after a partially complete clone */
> +void clonefd_cleanup_failed_clone(struct clonefd_setup *setup)
> +{
> +       if (setup->file) {
> +               put_unused_fd(setup->fd);
> +               fput(setup->file);
> +       }
> +}
> +
> +/* Finish setting up the clonefd */
> +void clonefd_install_fd(struct clone4_args *args, struct clonefd_setup *setup)
> +{
> +       if (setup->file) {
> +               fd_install(setup->fd, setup->file);
> +               put_user(setup->fd, args->clonefd);
> +       }
> +}
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josh Triplett March 25, 2015, 2:53 p.m. UTC | #2
On Mon, Mar 23, 2015 at 05:38:45PM +0000, David Drysdale wrote:
> On Sun, Mar 15, 2015 at 8:00 AM, Josh Triplett <josh@joshtriplett.org> wrote:
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 9daa017..1dc680b 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1374,6 +1374,11 @@ struct task_struct {
> >
> >         unsigned autoreap:1; /* Do not become a zombie on exit */
> >
> > +#ifdef CONFIG_CLONEFD
> > +       unsigned clonefd:1; /* Notify clonefd_wqh on exit */
> > +       wait_queue_head_t clonefd_wqh;
> > +#endif
> > +
> >         unsigned long atomic_flags; /* Flags needing atomic access. */
> >
> >         struct restart_block restart_block;
> 
> Idle thought: are there any concerns about the occupancy
> impact of adding a wait_queue_head to every task_struct,
> whether it has a clonefd or not?
> 
> I guess we could reduce the size somewhat by just
> storing a struct file *clonefd_file in the task, and then have
> a separate structure (with the wqh and a task_struct*) referenced
> by file->private_data.  Not sure whether the added complication
> would be worthwhile, though.

My original patches did exactly that (minus the reference back to the
task_struct).  However, there are a couple of problems with that
approach.  First, it assumes that a task_struct has only a single file
referencing it, but in the future I'd like to support obtaining a
clonefd for an existing task.  Second, the task_struct really shouldn't
have a reference to the actual struct file, when it only needs the
wait_queue_head_t.

Also, AFAICT a wait_queue_head_t is normally (in the absence of kernel
lock debugging options) the size of two pointers.  Adding an indirection
and an extra allocation to change that to the size of one pointer seems
iffy, especially when looking at the rest of what's directly in
task_struct that's far larger.

> > --- /dev/null
> > +++ b/kernel/clonefd.c
> > @@ -0,0 +1,121 @@
> > +/*
> > + * Support functions for CLONE_FD
> > + *
> > + * Copyright (c) 2015 Intel Corporation
> > + * Original authors: Josh Triplett <josh@joshtriplett.org>
> > + *                   Thiago Macieira <thiago@macieira.org>
> > + */
> > +#include <linux/anon_inodes.h>
> > +#include <linux/file.h>
> > +#include <linux/fs.h>
> > +#include <linux/poll.h>
> > +#include <linux/slab.h>
> > +#include "clonefd.h"
> > +
> > +static int clonefd_release(struct inode *inode, struct file *file)
> > +{
> > +       put_task_struct(file->private_data);
> > +       return 0;
> > +}
> > +
> > +static unsigned int clonefd_poll(struct file *file, poll_table *wait)
> > +{
> > +       struct task_struct *p = file->private_data;
> > +       poll_wait(file, &p->clonefd_wqh, wait);
> > +       return p->exit_state ? (POLLIN | POLLRDNORM | POLLHUP) : 0;
> > +}
> > +
> > +static ssize_t clonefd_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> > +{
> > +       struct task_struct *p = file->private_data;
> > +       int ret = 0;
> > +
> > +       /* EOF after first read */
> > +       if (*ppos)
> > +               return 0;
> > +
> > +       if (file->f_flags & O_NONBLOCK)
> > +               ret = -EAGAIN;
> > +       else
> > +               ret = wait_event_interruptible(p->clonefd_wqh, p->exit_state);
> > +
> > +       if (p->exit_state) {
> > +               struct clonefd_info info = {};
> > +               cputime_t utime, stime;
> > +               task_exit_code_status(p->exit_code, &info.code, &info.status);
> > +               info.code &= ~__SI_MASK;
> > +               task_cputime(p, &utime, &stime);
> > +               info.utime = cputime_to_clock_t(utime + p->signal->utime);
> > +               info.stime = cputime_to_clock_t(stime + p->signal->stime);
> > +               ret = simple_read_from_buffer(buf, count, ppos, &info, sizeof(info));
> > +       }
> > +       return ret;
> > +}
> > +
> > +static struct file_operations clonefd_fops = {
> > +       .release = clonefd_release,
> > +       .poll = clonefd_poll,
> > +       .read = clonefd_read,
> > +       .llseek = no_llseek,
> > +};
> 
> It might be nice to include a show_fdinfo() implementation that shows
> (say) the pid that the clonefd refers to.  E.g. something like:
> 
> static void clonefd_show_fdinfo(struct seq_file *m, struct file *file)
> {
>     struct task_struct *p = file->private_data;
> 
>     seq_printf(m, "tid:\t%d\n", task_tgid_vnr(p));
> }

I thought about that, but that would add a couple of additional ifdefs
(CONFIG_PROC_FS), for an informational file of minimal value.  More
importantly, I don't want to add that until after adding an ioctl or
similar to programmatically obtain the pid from a clonefd; otherwise,
someone might try to use fdinfo as the "API" to do so, which would be
all kinds of awful.

So I'd prefer to add fdinfo in a future extension of clonefd, rather
than in the initial patch series.

> > +
> > +/* Do process exit notification for clonefd. */
> > +void clonefd_do_notify(struct task_struct *p)
> > +{
> > +       if (p->clonefd)
> > +               wake_up_all(&p->clonefd_wqh);
> > +}
> > +
> > +/* Handle the CLONE_FD case for copy_process. */
> > +int clonefd_do_clone(u64 clone_flags, struct task_struct *p,
> > +                    struct clone4_args *args, struct clonefd_setup *setup)
> > +{
> > +       int flags;
> > +       struct file *file;
> > +       int fd;
> > +
> > +       p->clonefd = !!(clone_flags & CLONE_FD);
> > +       if (!p->clonefd)
> > +               return 0;
> > +
> > +       if (args->clonefd_flags & ~(O_CLOEXEC | O_NONBLOCK))
> > +               return -EINVAL;
> > +
> 
> Maybe also check for (args->clonefd == NULL) in advance, and
> return -EINVAL or -EFAULT?

That wouldn't be consistent with how clone treats its various other
out argument pointers.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sergey Senozhatsky April 6, 2015, 8:30 a.m. UTC | #3
On (03/15/15 01:00), Josh Triplett wrote:
[..]
> +
> +/* Handle the CLONE_FD case for copy_process. */
> +int clonefd_do_clone(u64 clone_flags, struct task_struct *p,
> +		     struct clone4_args *args, struct clonefd_setup *setup)
> +{
> +	int flags;
> +	struct file *file;
> +	int fd;
> +
> +	p->clonefd = !!(clone_flags & CLONE_FD);
> +	if (!p->clonefd)
> +		return 0;
> +
> +	if (args->clonefd_flags & ~(O_CLOEXEC | O_NONBLOCK))
> +		return -EINVAL;
> +
> +	init_waitqueue_head(&p->clonefd_wqh);
> +
> +	get_task_struct(p);
> +	flags = O_RDONLY | FMODE_ATOMIC_POS | args->clonefd_flags;
> +	file = anon_inode_getfile("[process]", &clonefd_fops, p, flags);
> +	if (IS_ERR(file)) {
> +		put_task_struct(p);
> +		return PTR_ERR(file);
> +	}
> +
> +	fd = get_unused_fd_flags(flags);
> +	if (fd < 0) {

+		put_task_struct(p); ?

> +		fput(file);
> +		return fd;
> +	}
> +
> +	setup->fd = fd;
> +	setup->file = file;
> +	return 0;
> +}
[..]

	-ss
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josh Triplett April 6, 2015, 9:31 a.m. UTC | #4
On Mon, Apr 06, 2015 at 05:30:35PM +0900, Sergey Senozhatsky wrote:
> On (03/15/15 01:00), Josh Triplett wrote:
> [..]
> > +
> > +/* Handle the CLONE_FD case for copy_process. */
> > +int clonefd_do_clone(u64 clone_flags, struct task_struct *p,
> > +		     struct clone4_args *args, struct clonefd_setup *setup)
> > +{
> > +	int flags;
> > +	struct file *file;
> > +	int fd;
> > +
> > +	p->clonefd = !!(clone_flags & CLONE_FD);
> > +	if (!p->clonefd)
> > +		return 0;
> > +
> > +	if (args->clonefd_flags & ~(O_CLOEXEC | O_NONBLOCK))
> > +		return -EINVAL;
> > +
> > +	init_waitqueue_head(&p->clonefd_wqh);
> > +
> > +	get_task_struct(p);
> > +	flags = O_RDONLY | FMODE_ATOMIC_POS | args->clonefd_flags;
> > +	file = anon_inode_getfile("[process]", &clonefd_fops, p, flags);
> > +	if (IS_ERR(file)) {
> > +		put_task_struct(p);
> > +		return PTR_ERR(file);
> > +	}
> > +
> > +	fd = get_unused_fd_flags(flags);
> > +	if (fd < 0) {
> 
> +		put_task_struct(p); ?

No, once anon_inode_getfile has succeeded, the file owns the reference
to the task_struct, so fput(file) will call the release function which
calls put_task_struct.  Only the failure case for anon_inode_getfile
needs to call put_task_struct directly.

> > +		fput(file);
> > +		return fd;
> > +	}

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 6c4a68d..c90df5a 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -299,6 +299,8 @@  struct compat_clone4_args {
 	compat_ulong_t stack_start;
 	compat_ulong_t stack_size;
 	compat_ulong_t tls;
+	compat_uptr_t clonefd;
+	u32 clonefd_flags;
 };
 
 struct compat_statfs;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9daa017..1dc680b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1374,6 +1374,11 @@  struct task_struct {
 
 	unsigned autoreap:1; /* Do not become a zombie on exit */
 
+#ifdef CONFIG_CLONEFD
+	unsigned clonefd:1; /* Notify clonefd_wqh on exit */
+	wait_queue_head_t clonefd_wqh;
+#endif
+
 	unsigned long atomic_flags; /* Flags needing atomic access. */
 
 	struct restart_block restart_block;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index f606c0a..86627f0 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -41,6 +41,7 @@ 
  * Flags that only work with clone4.
  */
 #define CLONE_AUTOREAP	0x00001000	/* Automatically reap the process */
+#define CLONE_FD	0x00400000	/* Signal exit via file descriptor */
 
 #ifdef __KERNEL__
 /*
@@ -48,10 +49,21 @@ 
  * list above, but not exposed to userspace.
  */
 #define CLONE_VALID_FLAGS	(0xffffffffULL & ~(CLONE_PID | CLONE_DETACHED | CLONE_STOPPED))
-#define CLONE4_VALID_FLAGS	(CLONE_VALID_FLAGS | CLONE_AUTOREAP)
+#define CLONE4_VALID_FLAGS	(CLONE_VALID_FLAGS | CLONE_AUTOREAP | \
+				 (IS_ENABLED(CONFIG_CLONEFD) ? CLONE_FD : 0))
 #endif /* __KERNEL__ */
 
 /*
+ * Structure read from CLONE_FD file descriptor after process exits
+ */
+struct clonefd_info {
+	__s32 code;
+	__s32 status;
+	__u64 utime;
+	__u64 stime;
+};
+
+/*
  * Structure passed to clone4 for additional arguments.  Initialized to 0,
  * then overwritten with arguments from userspace, so arguments not supplied by
  * userspace will remain 0.  New versions of the kernel may safely append new
@@ -63,6 +75,8 @@  struct clone4_args {
 	__kernel_ulong_t stack_start;
 	__kernel_ulong_t stack_size;
 	__kernel_ulong_t tls;
+	int __user *clonefd;
+	__u32 clonefd_flags;
 };
 
 /*
diff --git a/init/Kconfig b/init/Kconfig
index 3ab6649..b444280 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1521,6 +1521,17 @@  config CLONE4
 
 	  If unsure, say Y.
 
+config CLONEFD
+	bool "Enable CLONE_FD flag for clone4()" if EXPERT
+	depends on CLONE4
+	select ANON_INODES
+	default y
+	help
+	  Enable the CLONE_FD flag for clone4(), which creates a file descriptor
+	  to receive child exit events rather than receiving a signal.
+
+	  If unsure, say Y.
+
 # syscall, maps, verifier
 config BPF_SYSCALL
 	bool "Enable bpf() system call" if EXPERT
diff --git a/kernel/Makefile b/kernel/Makefile
index 1408b33..368986c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -29,6 +29,7 @@  obj-y += rcu/
 obj-y += livepatch/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
+obj-$(CONFIG_CLONEFD) += clonefd.o
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/clonefd.c b/kernel/clonefd.c
new file mode 100644
index 0000000..eac560c
--- /dev/null
+++ b/kernel/clonefd.c
@@ -0,0 +1,161 @@ 
+/*
+ * Support functions for CLONE_FD
+ *
+ * Copyright (c) 2015 Intel Corporation
+ * Original authors: Josh Triplett <josh@joshtriplett.org>
+ *                   Thiago Macieira <thiago@macieira.org>
+ */
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include "clonefd.h"
+
+/* Drop the task reference that clonefd_do_clone() handed to the file. */
+static int clonefd_release(struct inode *inode, struct file *file)
+{
+	put_task_struct(file->private_data);
+	return 0;
+}
+
+/* Report the fd as readable (and hung up) once the task has an exit state. */
+static unsigned int clonefd_poll(struct file *file, poll_table *wait)
+{
+	struct task_struct *p = file->private_data;
+
+	poll_wait(file, &p->clonefd_wqh, wait);
+	return p->exit_state ? (POLLIN | POLLRDNORM | POLLHUP) : 0;
+}
+
+/*
+ * Read the exit information of the task as a struct clonefd_info.
+ *
+ * Blocks until the task has an exit state unless O_NONBLOCK is set, in which
+ * case -EAGAIN is returned while the task is still running.  An interrupted
+ * wait returns the error from wait_event_interruptible().  Once the file
+ * position is non-zero, every further read returns 0 (EOF).
+ */
+static ssize_t clonefd_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct task_struct *p = file->private_data;
+	ssize_t ret;
+
+	/* EOF after first read */
+	if (*ppos)
+		return 0;
+
+	if (file->f_flags & O_NONBLOCK)
+		ret = -EAGAIN;
+	else
+		ret = wait_event_interruptible(p->clonefd_wqh, p->exit_state);
+
+	/*
+	 * Checked separately from the wait so that a non-blocking read
+	 * succeeds once the task has exited.
+	 */
+	if (p->exit_state) {
+		struct clonefd_info info = {};
+		cputime_t utime, stime;
+
+		task_exit_code_status(p->exit_code, &info.code, &info.status);
+		info.code &= ~__SI_MASK;
+		task_cputime(p, &utime, &stime);
+		info.utime = cputime_to_clock_t(utime + p->signal->utime);
+		info.stime = cputime_to_clock_t(stime + p->signal->stime);
+		ret = simple_read_from_buffer(buf, count, ppos, &info, sizeof(info));
+	}
+	return ret;
+}
+
+static const struct file_operations clonefd_fops = {
+	.release = clonefd_release,
+	.poll = clonefd_poll,
+	.read = clonefd_read,
+	.llseek = no_llseek,
+};
+
+/* Do process exit notification for clonefd. */
+void clonefd_do_notify(struct task_struct *p)
+{
+	if (p->clonefd)
+		wake_up_all(&p->clonefd_wqh);
+}
+
+/*
+ * Handle the CLONE_FD case for copy_process.
+ *
+ * Records in p->clonefd whether CLONE_FD was requested.  When it was, creates
+ * the anonymous file and reserves a descriptor for it, returning both through
+ * @setup; the descriptor is only installed later by clonefd_install_fd().
+ * Returns 0 on success (including when CLONE_FD is not set) or a negative
+ * errno, in which case nothing is left for the caller to clean up.
+ */
+int clonefd_do_clone(u64 clone_flags, struct task_struct *p,
+		     struct clone4_args *args, struct clonefd_setup *setup)
+{
+	int flags;
+	struct file *file;
+	int fd;
+
+	p->clonefd = !!(clone_flags & CLONE_FD);
+	if (!p->clonefd)
+		return 0;
+
+	if (args->clonefd_flags & ~(O_CLOEXEC | O_NONBLOCK))
+		return -EINVAL;
+
+	init_waitqueue_head(&p->clonefd_wqh);
+
+	/* The file owns this reference; clonefd_release() drops it. */
+	get_task_struct(p);
+	flags = O_RDONLY | args->clonefd_flags;
+	file = anon_inode_getfile("[process]", &clonefd_fops, p, flags);
+	if (IS_ERR(file)) {
+		put_task_struct(p);
+		return PTR_ERR(file);
+	}
+
+	/*
+	 * FMODE_ATOMIC_POS is an f_mode bit rather than an open flag, so it
+	 * is set on the file directly instead of being passed in flags.
+	 */
+	file->f_mode |= FMODE_ATOMIC_POS;
+
+	fd = get_unused_fd_flags(flags);
+	if (fd < 0) {
+		/* fput() ends in clonefd_release(), which drops the task ref */
+		fput(file);
+		return fd;
+	}
+
+	setup->fd = fd;
+	setup->file = file;
+	return 0;
+}
+
+/*
+ * Clean up clonefd information after a partially complete clone.
+ * @setup may be NULL: fork_idle() passes no clonefd_setup to copy_process().
+ */
+void clonefd_cleanup_failed_clone(struct clonefd_setup *setup)
+{
+	if (setup && setup->file) {
+		put_unused_fd(setup->fd);
+		fput(setup->file);
+	}
+}
+
+/*
+ * Finish setting up the clonefd: publish the file under the reserved
+ * descriptor and store the descriptor number at args->clonefd.  The result
+ * of put_user() is not checked, in line with how clone treats its other
+ * out argument pointers.
+ */
+void clonefd_install_fd(struct clone4_args *args, struct clonefd_setup *setup)
+{
+	if (setup->file) {
+		fd_install(setup->fd, setup->file);
+		put_user(setup->fd, args->clonefd);
+	}
+}
diff --git a/kernel/clonefd.h b/kernel/clonefd.h
new file mode 100644
index 0000000..2d8a67c
--- /dev/null
+++ b/kernel/clonefd.h
@@ -0,0 +1,37 @@ 
+/*
+ * Support functions for CLONE_FD
+ *
+ * Copyright (c) 2015 Intel Corporation
+ * Original authors: Josh Triplett <josh@joshtriplett.org>
+ *                   Thiago Macieira <thiago@macieira.org>
+ */
+#ifndef _KERNEL_CLONEFD_H
+#define _KERNEL_CLONEFD_H
+
+#include <linux/sched.h>
+
+#ifdef CONFIG_CLONEFD
+/* File and reserved descriptor handed from copy_process() to _do_fork(). */
+struct clonefd_setup {
+	int fd;
+	struct file *file;
+};
+int clonefd_do_clone(u64 clone_flags, struct task_struct *p,
+		     struct clone4_args *args, struct clonefd_setup *setup);
+void clonefd_cleanup_failed_clone(struct clonefd_setup *setup);
+void clonefd_install_fd(struct clone4_args *args, struct clonefd_setup *setup);
+void clonefd_do_notify(struct task_struct *p);
+#else /* CONFIG_CLONEFD */
+/* No-op stand-ins so callers need no #ifdef when CLONE_FD is compiled out. */
+struct clonefd_setup {};
+static inline int clonefd_do_clone(u64 clone_flags, struct task_struct *p,
+				   struct clone4_args *args, struct clonefd_setup *setup)
+{
+	return 0;
+}
+static inline void clonefd_cleanup_failed_clone(struct clonefd_setup *setup) {}
+static inline void clonefd_install_fd(struct clone4_args *args, struct clonefd_setup *setup) {}
+static inline void clonefd_do_notify(struct task_struct *p) {}
+#endif /* CONFIG_CLONEFD */
+
+#endif /* _KERNEL_CLONEFD_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index feff10b..83278b8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,6 +59,8 @@ 
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+#include "clonefd.h"
+
 static void exit_mm(struct task_struct *tsk);
 
 static void __unhash_process(struct task_struct *p, bool group_dead)
@@ -615,6 +617,8 @@  static void exit_notify(struct task_struct *tsk, int group_dead)
 	if (tsk->exit_state == EXIT_DEAD)
 		list_add(&tsk->ptrace_entry, &dead);
 
+	clonefd_do_notify(tsk);
+
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
 		wake_up_process(tsk->signal->group_exit_task);
diff --git a/kernel/fork.c b/kernel/fork.c
index c297e5e..8fdf0ac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,8 @@ 
 #define CREATE_TRACE_POINTS
 #include <trace/events/task.h>
 
+#include "clonefd.h"
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -1190,7 +1192,8 @@  init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
 static struct task_struct *copy_process(u64 clone_flags,
 					struct clone4_args *args,
 					struct pid *pid,
-					int trace)
+					int trace,
+					struct clonefd_setup *clonefd_setup)
 {
 	int retval;
 	struct task_struct *p;
@@ -1413,6 +1416,10 @@  static struct task_struct *copy_process(u64 clone_flags,
 			goto bad_fork_cleanup_io;
 	}
 
+	retval = clonefd_do_clone(clone_flags, p, args, clonefd_setup);
+	if (retval)
+		goto bad_fork_free_pid;
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->ctid : NULL;
 	/*
 	 * Clear TID on mm_release()?
@@ -1507,7 +1514,7 @@  static struct task_struct *copy_process(u64 clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_pid;
+		goto bad_fork_cleanup_clonefd;
 	}
 
 	if (likely(p->pid)) {
@@ -1559,6 +1566,8 @@  static struct task_struct *copy_process(u64 clone_flags,
 
 	return p;
 
+bad_fork_cleanup_clonefd:
+	clonefd_cleanup_failed_clone(clonefd_setup);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
@@ -1617,7 +1626,7 @@  struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
 	struct clone4_args args = {};
-	task = copy_process(CLONE_VM, &args, &init_struct_pid, 0);
+	task = copy_process(CLONE_VM, &args, &init_struct_pid, 0, NULL);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task->pids);
 		init_idle(task, cpu);
@@ -1637,6 +1646,7 @@  static long _do_fork(u64 clone_flags, struct clone4_args *args)
 	struct task_struct *p;
 	int trace = 0;
 	long nr;
+	struct clonefd_setup clonefd_setup = {};
 
 	/*
 	 * Determine whether and which event to report to ptracer.  When
@@ -1656,7 +1666,7 @@  static long _do_fork(u64 clone_flags, struct clone4_args *args)
 			trace = 0;
 	}
 
-	p = copy_process(clone_flags, args, NULL, trace);
+	p = copy_process(clone_flags, args, NULL, trace, &clonefd_setup);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
@@ -1679,6 +1689,8 @@  static long _do_fork(u64 clone_flags, struct clone4_args *args)
 			get_task_struct(p);
 		}
 
+		clonefd_install_fd(args, &clonefd_setup);
+
 		wake_up_new_task(p);
 
 		/* forking complete and child started to run, tell ptracer */
@@ -1822,6 +1834,8 @@  COMPAT_SYSCALL_DEFINE4(clone4, unsigned, flags_high, unsigned, flags_low,
 	kargs.stack_start = compat_kargs.stack_start;
 	kargs.stack_size = compat_kargs.stack_size;
 	kargs.tls = compat_kargs.tls;
+	kargs.clonefd = compat_ptr(compat_kargs.clonefd);
+	kargs.clonefd_flags = compat_kargs.clonefd_flags;
 	return _do_fork(flags, &kargs);
 }
 #endif /* CONFIG_COMPAT */