diff mbox

[2/2] pid_ns: Introduce ioctl to set vector of ns_last_pid's on ns hierarchy

Message ID 149245057248.17600.1341652606136269734.stgit@localhost.localdomain (mailing list archive)
State New, archived
Headers show

Commit Message

Kirill Tkhai April 17, 2017, 5:36 p.m. UTC
While implementing nested pid namespace support in CRIU
(checkpoint-restore in userspace tool) we ran into
the situation that it's impossible to create a task with
a specific NSpid efficiently. After commit 49f4d8b93ccf
"pidns: Capture the user namespace and filter ns_last_pid"
it is impossible to set ns_last_pid on any pid namespace
except the task's active pid_ns (before the commit it was possible
to write to pid_ns_for_children). Thus, if a restored task
in a container has more than one pid_ns level, the restorer
code must have a helper task for every pid namespace
of the task's pid_ns hierarchy.

This is a big problem, because communication with
a helper for every pid_ns in the hierarchy is not cheap
and performs poorly, as it implies many helper wakeups
to create a single task (regardless of how you communicate
with the helpers). This patch tries to solve the problem.

It introduces a new pid_ns ns_ioctl(PIDNS_REQ_SET_LAST_PID_VEC),
which allows writing a vector of last pids on the pid_ns hierarchy.
The vector is passed as a ":"-delimited string of pids,
written in reverse order. The first number corresponds to
the opened namespace's ns_last_pid, the second to its parent's, etc.
So, if you have a pid namespace hierarchy like:

pid_ns1 (grand father)
  |
  v
pid_ns2 (father)
  |
  v
pid_ns3 (child)

and the ns of a task in pid_ns3 is open, then the corresponding
vector will be "last_ns_pid3:last_ns_pid2:last_ns_pid1". This
vector may be shorter and contain fewer levels, for example,
"last_ns_pid3:last_ns_pid2" or even "last_ns_pid3", depending
on which levels you want to populate.

To write to a pid_ns's ns_last_pid we check that the writer task
has CAP_SYS_ADMIN permissions in this pid_ns's user_ns.

One note about struct pidns_ioc_req. It's made extensible and
may be expanded in the future. Only the always-present fields exist
at the moment; future fields and their sizes may be determined
from pidns_ioc_req::req by future code.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 include/uapi/linux/nsfs.h |    9 +++++
 kernel/pid_namespace.c    |   88 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

Comments

Serge E. Hallyn April 19, 2017, 8:27 p.m. UTC | #1
Quoting Kirill Tkhai (ktkhai@virtuozzo.com):
> On implementing of nested pid namespaces support in CRIU
> (checkpoint-restore in userspace tool) we run into
> the situation, that it's impossible to create a task with
> specific NSpid effectively. After commit 49f4d8b93ccf
> "pidns: Capture the user namespace and filter ns_last_pid"
> it is impossible to set ns_last_pid on any pid namespace,
> except task's active pid_ns (before the commit it was possible
> to write to pid_ns_for_children). Thus, if a restored task
> in a container has more than one pid_ns levels, the restorer
> code must have a task helper for every pid namespace
> of the task's pid_ns hierarhy.
> 
> This is a big problem, because of communication with
> a helper for every pid_ns in the hierarchy is not cheap
> and not performance-good as it implies many helpers wakeups
> to create a single task (independently, how you communicate
> with the helpers). This patch tries to decide the problem.
> 
> It introduces a new pid_ns ns_ioctl(PIDNS_REQ_SET_LAST_PID_VEC),
> which allows to write a vector of last pids on pid_ns hierarchy.
> The vector is passed as a ":"-delimited string with pids,
> written in reverse order. The first number corresponds to
> the opened namespace ns_last_pid, the second is to its parent, etc.
> So, if you have the pid namespaces hierarchy like:
> 
> pid_ns1 (grand father)
>   |
>   v
> pid_ns2 (father)
>   |
>   v
> pid_ns3 (child)
> 
> and the ns of task's of pid_ns3 is open, then the corresponding
> vector will be "last_ns_pid3:last_ns_pid2:last_ns_pid1". This
> vector may be short and it may contain less levels, for example,
> "last_ns_pid3:last_ns_pid2" or even "last_ns_pid3", in dependence
> of which levels you want to populate.
> 
> To write in a pid_ns's ns_last_pid we check that the writer task
> has CAP_SYS_ADMIN permittions in this pid_ns's user_ns.
> 
> One note about struct pidns_ioc_req. It's made extensible and
> may expanded in the future. The always existing fields present
> at the moment, the future fields and they sizes may be determined
> by pidns_ioc_req::req by the future code.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>

Reviewed-by: Serge Hallyn <serge@hallyn.com>

(for both patches)

> ---
>  include/uapi/linux/nsfs.h |    9 +++++
>  kernel/pid_namespace.c    |   88 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 97 insertions(+)
> 
> diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
> index 544bbb661475..37bb4af917b5 100644
> --- a/include/uapi/linux/nsfs.h
> +++ b/include/uapi/linux/nsfs.h
> @@ -17,4 +17,13 @@
>  /* Execute namespace-specific ioctl */
>  #define NS_SPECIFIC_IOC		_IO(NSIO, 0x5)
>  
> +struct pidns_ioc_req {
> +/* Set vector of last pids in namespace hierarchy */
> +#define PIDNS_REQ_SET_LAST_PID_VEC	0x1
> +	unsigned int req;
> +	void __user *data;
> +	unsigned int data_size;
> +	char std_fields[0];
> +};
> +
>  #endif /* __LINUX_NSFS_H */
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index de461aa0bf9a..0e86fa15cd92 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -21,6 +21,8 @@
>  #include <linux/export.h>
>  #include <linux/sched/task.h>
>  #include <linux/sched/signal.h>
> +#include <linux/vmalloc.h>
> +#include <uapi/linux/nsfs.h>
>  
>  struct pid_cache {
>  	int nr_ids;
> @@ -428,6 +430,91 @@ static struct ns_common *pidns_get_parent(struct ns_common *ns)
>  	return &get_pid_ns(pid_ns)->ns;
>  }
>  
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
> +			     struct pidns_ioc_req *req)
> +{
> +	char *str, *p;
> +	int ret = 0;
> +	pid_t pid;
> +
> +	read_lock(&tasklist_lock);
> +	if (!pid_ns->child_reaper)
> +		ret = -EINVAL;
> +	read_unlock(&tasklist_lock);
> +	if (ret)
> +		return ret;
> +
> +	if (req->data_size >= PAGE_SIZE)
> +		return -EINVAL;
> +	str = vmalloc(req->data_size + 1);
> +	if (!str)
> +		return -ENOMEM;
> +	if (copy_from_user(str, req->data, req->data_size)) {
> +		ret = -EFAULT;
> +		goto out_vfree;
> +	}
> +	str[req->data_size] = '\0';
> +
> +	p = str;
> +	while (p && *p != '\0') {
> +		if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) {
> +			ret = -EPERM;
> +			goto out_vfree;
> +		}
> +
> +		if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) {
> +			ret = -EINVAL;
> +			goto out_vfree;
> +		}
> +
> +		/* Write directly: see the comment in pid_ns_ctl_handler() */
> +		pid_ns->last_pid = pid;
> +
> +		p = strchr(p, ':');
> +		pid_ns = pid_ns->parent;
> +		if (p) {
> +			if (!pid_ns) {
> +				ret = -EINVAL;
> +				goto out_vfree;
> +			}
> +			p++;
> +		}
> +	}
> +
> +	ret = 0;
> +out_vfree:
> +	vfree(str);
> +	return ret;
> +}
> +#else	/* CONFIG_CHECKPOINT_RESTORE */
> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
> +			     struct pidns_ioc_req *req)
> +{
> +	return -ENOTTY;
> +}
> +#endif	/* CONFIG_CHECKPOINT_RESTORE */
> +
> +static long pidns_ioctl(struct ns_common *ns, unsigned long arg)
> +{
> +	struct pid_namespace *pid_ns = to_pid_ns(ns);
> +	struct pidns_ioc_req user_req;
> +	int ret;
> +
> +	ret = copy_from_user(&user_req, (void *)arg,
> +			     offsetof(struct pidns_ioc_req, std_fields));
> +	if (ret)
> +		return ret;
> +
> +	switch (user_req.req) {
> +	case PIDNS_REQ_SET_LAST_PID_VEC:
> +		return set_last_pid_vec(pid_ns, &user_req);
> +	default:
> +		return -ENOTTY;
> +	}
> +	return 0;
> +}
> +
>  static struct user_namespace *pidns_owner(struct ns_common *ns)
>  {
>  	return to_pid_ns(ns)->user_ns;
> @@ -441,6 +528,7 @@ const struct proc_ns_operations pidns_operations = {
>  	.install	= pidns_install,
>  	.owner		= pidns_owner,
>  	.get_parent	= pidns_get_parent,
> +	.ns_ioctl	= pidns_ioctl,
>  };
>  
>  static __init int pid_namespaces_init(void)
Cyrill Gorcunov April 24, 2017, 7:03 p.m. UTC | #2
On Mon, Apr 17, 2017 at 8:36 PM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> On implementing of nested pid namespaces support in CRIU
> (checkpoint-restore in userspace tool) we run into
> the situation, that it's impossible to create a task with
> specific NSpid effectively. After commit 49f4d8b93ccf
> "pidns: Capture the user namespace and filter ns_last_pid"
> it is impossible to set ns_last_pid on any pid namespace,
> except task's active pid_ns (before the commit it was possible
> to write to pid_ns_for_children). Thus, if a restored task
> in a container has more than one pid_ns levels, the restorer
> code must have a task helper for every pid namespace
> of the task's pid_ns hierarhy.
>
> This is a big problem, because of communication with
> a helper for every pid_ns in the hierarchy is not cheap
> and not performance-good as it implies many helpers wakeups
> to create a single task (independently, how you communicate
> with the helpers). This patch tries to decide the problem.
>
> It introduces a new pid_ns ns_ioctl(PIDNS_REQ_SET_LAST_PID_VEC),
> which allows to write a vector of last pids on pid_ns hierarchy.
> The vector is passed as a ":"-delimited string with pids,
> written in reverse order. The first number corresponds to
> the opened namespace ns_last_pid, the second is to its parent, etc.
> So, if you have the pid namespaces hierarchy like:
>
> pid_ns1 (grand father)
>   |
>   v
> pid_ns2 (father)
>   |
>   v
> pid_ns3 (child)
>
> and the ns of task's of pid_ns3 is open, then the corresponding
> vector will be "last_ns_pid3:last_ns_pid2:last_ns_pid1". This
> vector may be short and it may contain less levels, for example,
> "last_ns_pid3:last_ns_pid2" or even "last_ns_pid3", in dependence
> of which levels you want to populate.
>
> To write in a pid_ns's ns_last_pid we check that the writer task
> has CAP_SYS_ADMIN permittions in this pid_ns's user_ns.
>
> One note about struct pidns_ioc_req. It's made extensible and
> may expanded in the future. The always existing fields present
> at the moment, the future fields and they sizes may be determined
> by pidns_ioc_req::req by the future code.
>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Oleg Nesterov April 26, 2017, 3:53 p.m. UTC | #3
On 04/17, Kirill Tkhai wrote:
>
> +struct pidns_ioc_req {
> +/* Set vector of last pids in namespace hierarchy */
> +#define PIDNS_REQ_SET_LAST_PID_VEC	0x1
> +	unsigned int req;
> +	void __user *data;
> +	unsigned int data_size;
> +	char std_fields[0];
> +};

see below,

> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
> +			     struct pidns_ioc_req *req)
> +{
> +	char *str, *p;
> +	int ret = 0;
> +	pid_t pid;
> +
> +	read_lock(&tasklist_lock);
> +	if (!pid_ns->child_reaper)
> +		ret = -EINVAL;
> +	read_unlock(&tasklist_lock);
> +	if (ret)
> +		return ret;

why do you need to check ->child_reaper under tasklist_lock? this looks pointless.

In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
there must be at least one task in this namespace, otherwise you can't open a file
which has f_op == ns_file_operations, no?

> +	if (req->data_size >= PAGE_SIZE)
> +		return -EINVAL;
> +	str = vmalloc(req->data_size + 1);

then I don't understand why it makes sense to use vmalloc()

> +	if (!str)
> +		return -ENOMEM;
> +	if (copy_from_user(str, req->data, req->data_size)) {
> +		ret = -EFAULT;
> +		goto out_vfree;
> +	}
> +	str[req->data_size] = '\0';
> +
> +	p = str;
> +	while (p && *p != '\0') {
> +		if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) {
> +			ret = -EPERM;
> +			goto out_vfree;
> +		}
> +
> +		if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) {
> +			ret = -EINVAL;
> +			goto out_vfree;
> +		}

Well, this is ioctl(), do we really want to parse the strings?

Can't we make

	struct pidns_ioc_req {
		...
		int nr_pids;
		pid_t  pids[0];
	}

and just use get_user() in a loop? This way we can avoid vmalloc() or anything
else altogether.

Oleg.
Kirill Tkhai April 26, 2017, 4:11 p.m. UTC | #4
On 26.04.2017 18:53, Oleg Nesterov wrote:
> On 04/17, Kirill Tkhai wrote:
>>
>> +struct pidns_ioc_req {
>> +/* Set vector of last pids in namespace hierarchy */
>> +#define PIDNS_REQ_SET_LAST_PID_VEC	0x1
>> +	unsigned int req;
>> +	void __user *data;
>> +	unsigned int data_size;
>> +	char std_fields[0];
>> +};
> 
> see below,
> 
>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
>> +			     struct pidns_ioc_req *req)
>> +{
>> +	char *str, *p;
>> +	int ret = 0;
>> +	pid_t pid;
>> +
>> +	read_lock(&tasklist_lock);
>> +	if (!pid_ns->child_reaper)
>> +		ret = -EINVAL;
>> +	read_unlock(&tasklist_lock);
>> +	if (ret)
>> +		return ret;
> 
> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
> 
> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
> there must be at least one task in this namespace, otherwise you can't open a file
> which has f_op == ns_file_operations, no?

Sure, it's impossible to pick a pid_ns if it has no tasks. I added
the check under the impression of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
but here it's completely wrong. It will be removed in v2.
 
>> +	if (req->data_size >= PAGE_SIZE)
>> +		return -EINVAL;
>> +	str = vmalloc(req->data_size + 1);
> 
> then I don't understand why it makes sense to use vmalloc()
> 
>> +	if (!str)
>> +		return -ENOMEM;
>> +	if (copy_from_user(str, req->data, req->data_size)) {
>> +		ret = -EFAULT;
>> +		goto out_vfree;
>> +	}
>> +	str[req->data_size] = '\0';
>> +
>> +	p = str;
>> +	while (p && *p != '\0') {
>> +		if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) {
>> +			ret = -EPERM;
>> +			goto out_vfree;
>> +		}
>> +
>> +		if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) {
>> +			ret = -EINVAL;
>> +			goto out_vfree;
>> +		}
> 
> Well, this is ioctl(), do we really want to parse the strings?
> 
> Can't we make
> 
> 	struct pidns_ioc_req {
> 		...
> 		int nr_pids;
> 		pid_t  pids[0];
> 	}
> 
> and just use get_user() in a loop? This way we can avoid vmalloc() or anything
> else altogether.

Since it's a generic structure for different types of requests, it may be extended
in the future. We won't be able to add new fields if we compose the structure the way
you suggested, will we?
Eric W. Biederman April 26, 2017, 4:32 p.m. UTC | #5
Kirill Tkhai <ktkhai@virtuozzo.com> writes:

> On 26.04.2017 19:11, Kirill Tkhai wrote:
>> On 26.04.2017 18:53, Oleg Nesterov wrote:
>>> On 04/17, Kirill Tkhai wrote:
>>>>
>>>> +struct pidns_ioc_req {
>>>> +/* Set vector of last pids in namespace hierarchy */
>>>> +#define PIDNS_REQ_SET_LAST_PID_VEC	0x1
>>>> +	unsigned int req;
>>>> +	void __user *data;
>>>> +	unsigned int data_size;
>>>> +	char std_fields[0];
>>>> +};
>>>
>>> see below,
>>>
>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
>>>> +			     struct pidns_ioc_req *req)
>>>> +{
>>>> +	char *str, *p;
>>>> +	int ret = 0;
>>>> +	pid_t pid;
>>>> +
>>>> +	read_lock(&tasklist_lock);
>>>> +	if (!pid_ns->child_reaper)
>>>> +		ret = -EINVAL;
>>>> +	read_unlock(&tasklist_lock);
>>>> +	if (ret)
>>>> +		return ret;
>>>
>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
>>>
>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
>>> there must be at least one task in this namespace, otherwise you can't open a file
>>> which has f_op == ns_file_operations, no?
>> 
>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
>> it under impression of
>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
>> but here it's completely wrong. It will be removed in v2.
>>  
>>>> +	if (req->data_size >= PAGE_SIZE)
>>>> +		return -EINVAL;
>>>> +	str = vmalloc(req->data_size + 1);
>>>
>>> then I don't understand why it makes sense to use vmalloc()
>>>
>>>> +	if (!str)
>>>> +		return -ENOMEM;
>>>> +	if (copy_from_user(str, req->data, req->data_size)) {
>>>> +		ret = -EFAULT;
>>>> +		goto out_vfree;
>>>> +	}
>>>> +	str[req->data_size] = '\0';
>>>> +
>>>> +	p = str;
>>>> +	while (p && *p != '\0') {
>>>> +		if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) {
>>>> +			ret = -EPERM;
>>>> +			goto out_vfree;
>>>> +		}
>>>> +
>>>> +		if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) {
>>>> +			ret = -EINVAL;
>>>> +			goto out_vfree;
>>>> +		}
>>>
>>> Well, this is ioctl(), do we really want to parse the strings?
>>>
>>> Can't we make
>>>
>>> 	struct pidns_ioc_req {
>>> 		...
>>> 		int nr_pids;
>>> 		pid_t  pids[0];
>>> 	}
>>>
>>> and just use get_user() in a loop? This way we can avoid vmalloc() or anything
>>> else altogether.
>> 
>> Since it's a generic structure for different types of the requests, it may be extended
>> in the future. We won't be able to add new fields, if we compose the structure the way
>> you suggested, will we?
>
> Though, we may go this way if just do the fields generic:
>
> struct pidns_ioc_req {
>         unsigned int req;
>         unsigned int data_size;
>         union {
> 	        pid_t pid[0];
> 	};
> };
>
> Ok, I'll rework the patchset in this way.

You don't need that.  That is what new ioctl numbers are for.

Interfaces to the kernel don't need to become multiplexors to prepare
for the future when there is already an appropriate multiplexing
interface in place.

Eric
Kirill Tkhai April 26, 2017, 4:33 p.m. UTC | #6
On 26.04.2017 19:11, Kirill Tkhai wrote:
> On 26.04.2017 18:53, Oleg Nesterov wrote:
>> On 04/17, Kirill Tkhai wrote:
>>>
>>> +struct pidns_ioc_req {
>>> +/* Set vector of last pids in namespace hierarchy */
>>> +#define PIDNS_REQ_SET_LAST_PID_VEC	0x1
>>> +	unsigned int req;
>>> +	void __user *data;
>>> +	unsigned int data_size;
>>> +	char std_fields[0];
>>> +};
>>
>> see below,
>>
>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
>>> +			     struct pidns_ioc_req *req)
>>> +{
>>> +	char *str, *p;
>>> +	int ret = 0;
>>> +	pid_t pid;
>>> +
>>> +	read_lock(&tasklist_lock);
>>> +	if (!pid_ns->child_reaper)
>>> +		ret = -EINVAL;
>>> +	read_unlock(&tasklist_lock);
>>> +	if (ret)
>>> +		return ret;
>>
>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
>>
>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
>> there must be at least one task in this namespace, otherwise you can't open a file
>> which has f_op == ns_file_operations, no?
> 
> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
> it under impression of
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
> but here it's completely wrong. It will be removed in v2.
>  
>>> +	if (req->data_size >= PAGE_SIZE)
>>> +		return -EINVAL;
>>> +	str = vmalloc(req->data_size + 1);
>>
>> then I don't understand why it makes sense to use vmalloc()
>>
>>> +	if (!str)
>>> +		return -ENOMEM;
>>> +	if (copy_from_user(str, req->data, req->data_size)) {
>>> +		ret = -EFAULT;
>>> +		goto out_vfree;
>>> +	}
>>> +	str[req->data_size] = '\0';
>>> +
>>> +	p = str;
>>> +	while (p && *p != '\0') {
>>> +		if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) {
>>> +			ret = -EPERM;
>>> +			goto out_vfree;
>>> +		}
>>> +
>>> +		if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) {
>>> +			ret = -EINVAL;
>>> +			goto out_vfree;
>>> +		}
>>
>> Well, this is ioctl(), do we really want to parse the strings?
>>
>> Can't we make
>>
>> 	struct pidns_ioc_req {
>> 		...
>> 		int nr_pids;
>> 		pid_t  pids[0];
>> 	}
>>
>> and just use get_user() in a loop? This way we can avoid vmalloc() or anything
>> else altogether.
> 
> Since it's a generic structure for different types of the requests, it may be extended
> in the future. We won't be able to add new fields, if we compose the structure the way
> you suggested, will we?

Though, we may go this way if just do the fields generic:

struct pidns_ioc_req {
        unsigned int req;
        unsigned int data_size;
        union {
	        pid_t pid[0];
	};
};

Ok, I'll rework the patchset in this way.
Kirill Tkhai April 26, 2017, 4:43 p.m. UTC | #7
On 26.04.2017 19:32, Eric W. Biederman wrote:
> Kirill Tkhai <ktkhai@virtuozzo.com> writes:
> 
>> On 26.04.2017 19:11, Kirill Tkhai wrote:
>>> On 26.04.2017 18:53, Oleg Nesterov wrote:
>>>> On 04/17, Kirill Tkhai wrote:
>>>>>
>>>>> +struct pidns_ioc_req {
>>>>> +/* Set vector of last pids in namespace hierarchy */
>>>>> +#define PIDNS_REQ_SET_LAST_PID_VEC	0x1
>>>>> +	unsigned int req;
>>>>> +	void __user *data;
>>>>> +	unsigned int data_size;
>>>>> +	char std_fields[0];
>>>>> +};
>>>>
>>>> see below,
>>>>
>>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
>>>>> +			     struct pidns_ioc_req *req)
>>>>> +{
>>>>> +	char *str, *p;
>>>>> +	int ret = 0;
>>>>> +	pid_t pid;
>>>>> +
>>>>> +	read_lock(&tasklist_lock);
>>>>> +	if (!pid_ns->child_reaper)
>>>>> +		ret = -EINVAL;
>>>>> +	read_unlock(&tasklist_lock);
>>>>> +	if (ret)
>>>>> +		return ret;
>>>>
>>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
>>>>
>>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
>>>> there must be at least one task in this namespace, otherwise you can't open a file
>>>> which has f_op == ns_file_operations, no?
>>>
>>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
>>> it under impression of
>>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
>>> but here it's completely wrong. It will be removed in v2.
>>>  
>>>>> +	if (req->data_size >= PAGE_SIZE)
>>>>> +		return -EINVAL;
>>>>> +	str = vmalloc(req->data_size + 1);
>>>>
>>>> then I don't understand why it makes sense to use vmalloc()
>>>>
>>>>> +	if (!str)
>>>>> +		return -ENOMEM;
>>>>> +	if (copy_from_user(str, req->data, req->data_size)) {
>>>>> +		ret = -EFAULT;
>>>>> +		goto out_vfree;
>>>>> +	}
>>>>> +	str[req->data_size] = '\0';
>>>>> +
>>>>> +	p = str;
>>>>> +	while (p && *p != '\0') {
>>>>> +		if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) {
>>>>> +			ret = -EPERM;
>>>>> +			goto out_vfree;
>>>>> +		}
>>>>> +
>>>>> +		if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) {
>>>>> +			ret = -EINVAL;
>>>>> +			goto out_vfree;
>>>>> +		}
>>>>
>>>> Well, this is ioctl(), do we really want to parse the strings?
>>>>
>>>> Can't we make
>>>>
>>>> 	struct pidns_ioc_req {
>>>> 		...
>>>> 		int nr_pids;
>>>> 		pid_t  pids[0];
>>>> 	}
>>>>
>>>> and just use get_user() in a loop? This way we can avoid vmalloc() or anything
>>>> else altogether.
>>>
>>> Since it's a generic structure for different types of the requests, it may be extended
>>> in the future. We won't be able to add new fields, if we compose the structure the way
>>> you suggested, will we?
>>
>> Though, we may go this way if just do the fields generic:
>>
>> struct pidns_ioc_req {
>>         unsigned int req;
>>         unsigned int data_size;
>>         union {
>> 	        pid_t pid[0];
>> 	};
>> };
>>
>> Ok, I'll rework the patchset in this way.
> 
> You don't need that.  That is what new ioctl numbers are for.
> 
> Interfaces to the kernel don't need to become multiplexors to prepare
> for the future when there is already an appropriate multiplexing
> interface in place.

That is, do you suggest not introducing NS_SPECIFIC_IOC from the first patch,
and instead adding PIDNS_REQ_SET_LAST_PID_VEC to the list of generic ns ioctls?

...
#define NS_GET_OWNER_UID		_IO(NSIO, 0x4)
#define PIDNS_REQ_SET_LAST_PID_VEC	_IO(NSIO, 0x5)
Eric W. Biederman April 26, 2017, 5:01 p.m. UTC | #8
Kirill Tkhai <ktkhai@virtuozzo.com> writes:

> On 26.04.2017 19:32, Eric W. Biederman wrote:
>> Kirill Tkhai <ktkhai@virtuozzo.com> writes:
>> 
>>> On 26.04.2017 19:11, Kirill Tkhai wrote:
>>>> On 26.04.2017 18:53, Oleg Nesterov wrote:
>>>>> On 04/17, Kirill Tkhai wrote:
>>>>>>
>>>>>> +struct pidns_ioc_req {
>>>>>> +/* Set vector of last pids in namespace hierarchy */
>>>>>> +#define PIDNS_REQ_SET_LAST_PID_VEC	0x1
>>>>>> +	unsigned int req;
>>>>>> +	void __user *data;
>>>>>> +	unsigned int data_size;
>>>>>> +	char std_fields[0];
>>>>>> +};
>>>>>
>>>>> see below,
>>>>>
>>>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
>>>>>> +			     struct pidns_ioc_req *req)
>>>>>> +{
>>>>>> +	char *str, *p;
>>>>>> +	int ret = 0;
>>>>>> +	pid_t pid;
>>>>>> +
>>>>>> +	read_lock(&tasklist_lock);
>>>>>> +	if (!pid_ns->child_reaper)
>>>>>> +		ret = -EINVAL;
>>>>>> +	read_unlock(&tasklist_lock);
>>>>>> +	if (ret)
>>>>>> +		return ret;
>>>>>
>>>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
>>>>>
>>>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
>>>>> there must be at least one task in this namespace, otherwise you can't open a file
>>>>> which has f_op == ns_file_operations, no?
>>>>
>>>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
>>>> it under impression of
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
>>>> but here it's completely wrong. It will be removed in v2.
>>>>  
>>>>>> +	if (req->data_size >= PAGE_SIZE)
>>>>>> +		return -EINVAL;
>>>>>> +	str = vmalloc(req->data_size + 1);
>>>>>
>>>>> then I don't understand why it makes sense to use vmalloc()
>>>>>
>>>>>> +	if (!str)
>>>>>> +		return -ENOMEM;
>>>>>> +	if (copy_from_user(str, req->data, req->data_size)) {
>>>>>> +		ret = -EFAULT;
>>>>>> +		goto out_vfree;
>>>>>> +	}
>>>>>> +	str[req->data_size] = '\0';
>>>>>> +
>>>>>> +	p = str;
>>>>>> +	while (p && *p != '\0') {
>>>>>> +		if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) {
>>>>>> +			ret = -EPERM;
>>>>>> +			goto out_vfree;
>>>>>> +		}
>>>>>> +
>>>>>> +		if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) {
>>>>>> +			ret = -EINVAL;
>>>>>> +			goto out_vfree;
>>>>>> +		}
>>>>>
>>>>> Well, this is ioctl(), do we really want to parse the strings?
>>>>>
>>>>> Can't we make
>>>>>
>>>>> 	struct pidns_ioc_req {
>>>>> 		...
>>>>> 		int nr_pids;
>>>>> 		pid_t  pids[0];
>>>>> 	}
>>>>>
>>>>> and just use get_user() in a loop? This way we can avoid vmalloc() or anything
>>>>> else altogether.
>>>>
>>>> Since it's a generic structure for different types of the requests, it may be extended
>>>> in the future. We won't be able to add new fields, if we compose the structure the way
>>>> you suggested, will we?
>>>
>>> Though, we may go this way if just do the fields generic:
>>>
>>> struct pidns_ioc_req {
>>>         unsigned int req;
>>>         unsigned int data_size;
>>>         union {
>>> 	        pid_t pid[0];
>>> 	};
>>> };
>>>
>>> Ok, I'll rework the patchset in this way.
>> 
>> You don't need that.  That is what new ioctl numbers are for.
>> 
>> Interfaces to the kernel don't need to become multiplexors to prepare
>> for the future when there is already an appropriate multiplexing
>> interface in place.
>
> That is, do you suggest to not introduce NS_SPECIFIC_IO from the first patch,
> and add PIDNS_REQ_SET_LAST_PID_VEC to the list of generic ns ioctls?
>
> ...
> #define NS_GET_OWNER_UID		_IO(NSIO, 0x4)
> #define PIDNS_REQ_SET_LAST_PID_VEC	_IO(NSIO, 0x5)

I have not looked at your proposal in detail.  But if we are going to do
this with ioctls there are enough that we should not need to play games.
There are 4 billion of them and 4194304 dedicated for namespace
operations.  Strictly it is 256 ioctls plus 14 bits dedicated for size.
Even that seems plenty.

Please let's make things as simple as we can.

Eric
Oleg Nesterov April 27, 2017, 4:12 p.m. UTC | #9
On 04/26, Kirill Tkhai wrote:
>
> On 26.04.2017 18:53, Oleg Nesterov wrote:
> >
> >> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
> >> +			     struct pidns_ioc_req *req)
> >> +{
> >> +	char *str, *p;
> >> +	int ret = 0;
> >> +	pid_t pid;
> >> +
> >> +	read_lock(&tasklist_lock);
> >> +	if (!pid_ns->child_reaper)
> >> +		ret = -EINVAL;
> >> +	read_unlock(&tasklist_lock);
> >> +	if (ret)
> >> +		return ret;
> >
> > why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
> >
> > In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
> > there must be at least one task in this namespace, otherwise you can't open a file
> > which has f_op == ns_file_operations, no?
>
> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
> it under impression of
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
> but here it's completely wrong. It will be removed in v2.

Hmm. But if I read this commit correctly then we really need to check
pid_ns->child_reaper != NULL ?

Currently we can't pick an "empty" pid_ns. But after the commit above a task
can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its
/proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ?

Or I am totally confused?

Oleg.
Oleg Nesterov April 27, 2017, 4:16 p.m. UTC | #10
On 04/26, Kirill Tkhai wrote:
>
> On 26.04.2017 18:53, Oleg Nesterov wrote:
> >>
> >> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
> >> +			     struct pidns_ioc_req *req)
> >> +{
> >> +	char *str, *p;
> >> +	int ret = 0;
> >> +	pid_t pid;
> >> +
> >> +	read_lock(&tasklist_lock);
> >> +	if (!pid_ns->child_reaper)
> >> +		ret = -EINVAL;
> >> +	read_unlock(&tasklist_lock);
> >> +	if (ret)
> >> +		return ret;
> > 
> > why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
> > 
> > In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
> > there must be at least one task in this namespace, otherwise you can't open a file
> > which has f_op == ns_file_operations, no?
> 
> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
> it under impression of
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
> but here it's completely wrong. It will be removed in v2.

Hmm. But if I read this commit correctly then we really need to check
pid_ns->child_reaper != NULL ?

Currently we can't pick an "empty" pid_ns. But after the commit above a task
can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its
/proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ?

Or I am totally confused?

Oleg.
Kirill Tkhai April 27, 2017, 4:17 p.m. UTC | #11
On 27.04.2017 19:12, Oleg Nesterov wrote:
> On 04/26, Kirill Tkhai wrote:
>>
>> On 26.04.2017 18:53, Oleg Nesterov wrote:
>>>
>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
>>>> +			     struct pidns_ioc_req *req)
>>>> +{
>>>> +	char *str, *p;
>>>> +	int ret = 0;
>>>> +	pid_t pid;
>>>> +
>>>> +	read_lock(&tasklist_lock);
>>>> +	if (!pid_ns->child_reaper)
>>>> +		ret = -EINVAL;
>>>> +	read_unlock(&tasklist_lock);
>>>> +	if (ret)
>>>> +		return ret;
>>>
>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
>>>
>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
>>> there must be at least one task in this namespace, otherwise you can't open a file
>>> which has f_op == ns_file_operations, no?
>>
>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
>> it under impression of
>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
>> but here it's completely wrong. It will be removed in v2.
> 
> Hmm. But if I read this commit correctly then we really need to check
> pid_ns->child_reaper != NULL ?
> 
> Currently we can't pick an "empty" pid_ns. But after the commit above a task
> can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its
> /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ?

Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(),
because pid_for_children is available to open only after the 1st alloc_pid().
So, it's impossible to call ioctl() on it.
 
> Or I am totally confused?
Oleg Nesterov April 27, 2017, 4:22 p.m. UTC | #12
On 04/27, Kirill Tkhai wrote:
>
> On 27.04.2017 19:12, Oleg Nesterov wrote:
> > On 04/26, Kirill Tkhai wrote:
> >>
> >> On 26.04.2017 18:53, Oleg Nesterov wrote:
> >>>
> >>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
> >>>> +			     struct pidns_ioc_req *req)
> >>>> +{
> >>>> +	char *str, *p;
> >>>> +	int ret = 0;
> >>>> +	pid_t pid;
> >>>> +
> >>>> +	read_lock(&tasklist_lock);
> >>>> +	if (!pid_ns->child_reaper)
> >>>> +		ret = -EINVAL;
> >>>> +	read_unlock(&tasklist_lock);
> >>>> +	if (ret)
> >>>> +		return ret;
> >>>
> >>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
> >>>
> >>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
> >>> there must be at least one task in this namespace, otherwise you can't open a file
> >>> which has f_op == ns_file_operations, no?
> >>
> >> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
> >> it under impression of
> >> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
> >> but here it's completely wrong. It will be removed in v2.
> > 
> > Hmm. But if I read this commit correctly then we really need to check
> > pid_ns->child_reaper != NULL ?
> > 
> > Currently we can't pick an "empty" pid_ns. But after the commit above a task
> > can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its
> > /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ?
> 
> Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(),
> because pid_for_children is available to open only after the 1st alloc_pid().
> So, it's impossible to call ioctl() on it.

Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get().

But note that it doesn't need tasklist_lock too.

Oleg.
Eric W. Biederman April 27, 2017, 4:39 p.m. UTC | #13
Kirill Tkhai <ktkhai@virtuozzo.com> writes:

> On 27.04.2017 19:12, Oleg Nesterov wrote:
>> On 04/26, Kirill Tkhai wrote:
>>>
>>> On 26.04.2017 18:53, Oleg Nesterov wrote:
>>>>
>>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
>>>>> +			     struct pidns_ioc_req *req)
>>>>> +{
>>>>> +	char *str, *p;
>>>>> +	int ret = 0;
>>>>> +	pid_t pid;
>>>>> +
>>>>> +	read_lock(&tasklist_lock);
>>>>> +	if (!pid_ns->child_reaper)
>>>>> +		ret = -EINVAL;
>>>>> +	read_unlock(&tasklist_lock);
>>>>> +	if (ret)
>>>>> +		return ret;
>>>>
>>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
>>>>
>>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
>>>> there must be at least one task in this namespace, otherwise you can't open a file
>>>> which has f_op == ns_file_operations, no?
>>>
>>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
>>> it under impression of
>>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
>>> but here it's completely wrong. It will be removed in v2.
>> 
>> Hmm. But if I read this commit correctly then we really need to check
>> pid_ns->child_reaper != NULL ?
>> 
>> Currently we can't pick an "empty" pid_ns. But after the commit above a task
>> can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its
>> /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ?
>
> Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(),
> because pid_for_children is available to open only after the 1st alloc_pid().
> So, it's impossible to call ioctl() on it.

That sounds reasonable.

There is definitely the chance of the child_reaper dying after we have
joined a pid namespace.    So child_reaper can be stale if not NULL.

As long as we don't mess up the first pid allocation I don't
see any reason why we should care about last_pid in a pid_namespace.
And this ioctl can be used to set all of the other pids on the first
pid allocation by calling it in the parent pid namespace.

There is still the chance of racing with a pid reaper dying.   Why do we
care about child_reaper in this case?

Changing last_pid is completely pointless if child_reaper is dead or
missing but why would we care?

Although looking at it we probably want to call set_last_pid just to
be consistent with everything else.

Eric
Kirill Tkhai April 28, 2017, 9:17 a.m. UTC | #14
On 27.04.2017 19:22, Oleg Nesterov wrote:
> On 04/27, Kirill Tkhai wrote:
>>
>> On 27.04.2017 19:12, Oleg Nesterov wrote:
>>> On 04/26, Kirill Tkhai wrote:
>>>>
>>>> On 26.04.2017 18:53, Oleg Nesterov wrote:
>>>>>
>>>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
>>>>>> +			     struct pidns_ioc_req *req)
>>>>>> +{
>>>>>> +	char *str, *p;
>>>>>> +	int ret = 0;
>>>>>> +	pid_t pid;
>>>>>> +
>>>>>> +	read_lock(&tasklist_lock);
>>>>>> +	if (!pid_ns->child_reaper)
>>>>>> +		ret = -EINVAL;
>>>>>> +	read_unlock(&tasklist_lock);
>>>>>> +	if (ret)
>>>>>> +		return ret;
>>>>>
>>>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
>>>>>
>>>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
>>>>> there must be at least one task in this namespace, otherwise you can't open a file
>>>>> which has f_op == ns_file_operations, no?
>>>>
>>>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
>>>> it under impression of
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
>>>> but here it's completely wrong. It will be removed in v2.
>>>
>>> Hmm. But if I read this commit correctly then we really need to check
>>> pid_ns->child_reaper != NULL ?
>>>
>>> Currently we can't pick an "empty" pid_ns. But after the commit above a task
>>> can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its
>>> /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ?
>>
>> Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(),
>> because pid_for_children is available to open only after the 1st alloc_pid().
>> So, it's impossible to call ioctl() on it.
> 
> Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get().
> 
> But note that it doesn't need tasklist_lock too.

Hm, are there possible strange situations with memory ordering, where we see
the ns->child_reaper of an already dead ns that occupied the same memory?
Do we have to use some memory barriers here?
Kirill Tkhai April 28, 2017, 9:22 a.m. UTC | #15
On 27.04.2017 19:39, Eric W. Biederman wrote:
> Kirill Tkhai <ktkhai@virtuozzo.com> writes:
> 
>> On 27.04.2017 19:12, Oleg Nesterov wrote:
>>> On 04/26, Kirill Tkhai wrote:
>>>>
>>>> On 26.04.2017 18:53, Oleg Nesterov wrote:
>>>>>
>>>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns,
>>>>>> +			     struct pidns_ioc_req *req)
>>>>>> +{
>>>>>> +	char *str, *p;
>>>>>> +	int ret = 0;
>>>>>> +	pid_t pid;
>>>>>> +
>>>>>> +	read_lock(&tasklist_lock);
>>>>>> +	if (!pid_ns->child_reaper)
>>>>>> +		ret = -EINVAL;
>>>>>> +	read_unlock(&tasklist_lock);
>>>>>> +	if (ret)
>>>>>> +		return ret;
>>>>>
>>>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless.
>>>>>
>>>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL,
>>>>> there must be at least one task in this namespace, otherwise you can't open a file
>>>>> which has f_op == ns_file_operations, no?
>>>>
>>>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added
>>>> it under impression of
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00
>>>> but here it's completely wrong. It will be removed in v2.
>>>
>>> Hmm. But if I read this commit correctly then we really need to check
>>> pid_ns->child_reaper != NULL ?
>>>
>>> Currently we can't pick an "empty" pid_ns. But after the commit above a task
>>> can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its
>>> /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ?
>>
>> Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(),
>> because pid_for_children is available to open only after the 1st alloc_pid().
>> So, it's impossible to call ioctl() on it.
> 
> That sounds reasonable.
> 
> There is definitely the chance of the child_reaper dying after we have
> joined a pid namespace.    So child_reaper can be stale if not NULL.
> 
> As long as we don't mess up the first pid allocation I don't
> see any reason why we should care about last_pid in a pid_namespace.
> And this ioctl can be used to set all of the other pids on the first
> pid allocation by calling it in the parent pid namespace.
> 
> There is still the chance of racing with a pid reaper dying.   Why do we
> care about child_reaper in this case?
> 
> Changing last_pid is completely pointless if child_reaper is dead or
> missing but why would we care?

I agree with you, there is no reason we should care about a dead child_reaper.
The protection is already in place in pidns_for_children_get(). It is only needed to
prohibit creation of the first task with pid != 1, which would lead to a child_reaper-less
pid namespace.
Oleg Nesterov May 2, 2017, 4:33 p.m. UTC | #16
sorry for delay, vacation...

On 04/28, Kirill Tkhai wrote:
>
> On 27.04.2017 19:22, Oleg Nesterov wrote:
> >
> > Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get().
> >
> > But note that it doesn't need tasklist_lock too.
>
> Hm, are there possible strange situations with memory ordering, when we see
> ns->child_reaper of already died ns, which was placed in the same memory?
> Do we have to use some memory barriers here?

Could you spell it out, please? I don't understand your concerns...

I don't see how, say,

	static struct ns_common *pidns_for_children_get(struct task_struct *task)
	{
		struct ns_common *ns = NULL;
		struct pid_namespace *pid_ns;

		task_lock(task);
		if (task->nsproxy) {
			pid_ns = task->nsproxy->pid_ns_for_children;
			if (pid_ns->child_reaper) {
				ns = &pid_ns->ns;
				get_pid_ns(ns);
			}
		}
		task_unlock(task);

		return ns;
	}

can be wrong. It also looks cleaner to me.

->child_reaper is not stable without tasklist_lock; it can be dead, etc., but
we do not care?

Oleg.
Eric W. Biederman May 2, 2017, 5:22 p.m. UTC | #17
Oleg Nesterov <oleg@redhat.com> writes:

> sorry for delay, vacation...
>
> On 04/28, Kirill Tkhai wrote:
>>
>> On 27.04.2017 19:22, Oleg Nesterov wrote:
>> >
>> > Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get().
>> >
>> > But note that it doesn't need tasklist_lock too.
>>
>> Hm, are there possible strange situations with memory ordering, when we see
>> ns->child_reaper of already died ns, which was placed in the same memory?
>> Do we have to use some memory barriers here?
>
> Could you spell please? I don't understand your concerns...
>
> I don't see how, say,
>
> 	static struct ns_common *pidns_for_children_get(struct task_struct *task)
> 	{
> 		struct ns_common *ns = NULL;
> 		struct pid_namespace *pid_ns;
>
> 		task_lock(task);
> 		if (task->nsproxy) {
> 			pid_ns = task->nsproxy->pid_ns_for_children;
> 			if (pid_ns->child_reaper) {
> 				ns = &pid_ns->ns;
> 				get_pid_ns(ns);
> 			}
> 		}
> 		task_unlock(task);
>
> 		return ns;
> 	}
>
> can be wrong. It also looks more clean to me.
>
> ->child_reaper is not stable without tasklist, it can be dead/etc, but
> we do not care?

It breaks a number of assumptions if you can join a pid namespace before
an init process is created in that pid namespace.  Checking for
child_reaper is a bit heavy handed but appears to ensure all of the
assumptions of initial pid namespace creation have been met.

Which means your simplified pidns_for_children_get is a bit insufficient.

Eric
Kirill Tkhai May 2, 2017, 5:33 p.m. UTC | #18
On 02.05.2017 19:33, Oleg Nesterov wrote:
> sorry for delay, vacation...
> 
> On 04/28, Kirill Tkhai wrote:
>>
>> On 27.04.2017 19:22, Oleg Nesterov wrote:
>>>
>>> Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get().
>>>
>>> But note that it doesn't need tasklist_lock too.
>>
>> Hm, are there possible strange situations with memory ordering, when we see
>> ns->child_reaper of already died ns, which was placed in the same memory?
>> Do we have to use some memory barriers here?
> 
> Could you spell please? I don't understand your concerns...
> 
> I don't see how, say,
> 
> 	static struct ns_common *pidns_for_children_get(struct task_struct *task)
> 	{
> 		struct ns_common *ns = NULL;
> 		struct pid_namespace *pid_ns;
> 
> 		task_lock(task);
> 		if (task->nsproxy) {
> 			pid_ns = task->nsproxy->pid_ns_for_children;
> 			if (pid_ns->child_reaper) {
> 				ns = &pid_ns->ns;
> 				get_pid_ns(ns);
> 			}
> 		}
> 		task_unlock(task);
> 
> 		return ns;
> 	}
> 
> can be wrong. It also looks more clean to me.
> 
> ->child_reaper is not stable without tasklist, it can be dead/etc, but
> we do not care?

I mean the following. We had a pid_ns1 with a child_reaper set. Then
it died, and a new pid_ns2 was allocated in the same memory.

A task on another cpu opens the pid_for_children file, and because
there is no memory ordering, it sees pid_ns1->child_reaper
when it opens pid_ns2.

I forgot: what guarantees that this situation is impossible? What guarantees
that the renewed content of pid_ns2 becomes visible on another cpu no later
than the moment we are able to open it?
Eric W. Biederman May 2, 2017, 9:13 p.m. UTC | #19
Kirill Tkhai <ktkhai@virtuozzo.com> writes:

> On 02.05.2017 19:33, Oleg Nesterov wrote:
>> sorry for delay, vacation...
>> 
>> On 04/28, Kirill Tkhai wrote:
>>>
>>> On 27.04.2017 19:22, Oleg Nesterov wrote:
>>>>
>>>> Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get().
>>>>
>>>> But note that it doesn't need tasklist_lock too.
>>>
>>> Hm, are there possible strange situations with memory ordering, when we see
>>> ns->child_reaper of already died ns, which was placed in the same memory?
>>> Do we have to use some memory barriers here?
>> 
>> Could you spell please? I don't understand your concerns...
>> 
>> I don't see how, say,
>> 
>> 	static struct ns_common *pidns_for_children_get(struct task_struct *task)
>> 	{
>> 		struct ns_common *ns = NULL;
>> 		struct pid_namespace *pid_ns;
>> 
>> 		task_lock(task);
>> 		if (task->nsproxy) {
>> 			pid_ns = task->nsproxy->pid_ns_for_children;
>> 			if (pid_ns->child_reaper) {
                            ^^^^^^^^^^^^^^^^^^^^
                            Oleg my apologies I missed this line earlier.
                            This does look like a valid way to skip read_lock(&tasklist_lock);
>> 				ns = &pid_ns->ns;
>> 				get_pid_ns(ns);
                                ^^^^^^^^^^^^^ This needs to be:
                                get_pid_ns(pid_ns);
                                
>> 			}
>> 		}
>> 		task_unlock(task);
>> 
>> 		return ns;
>> 	}
>> 
>> can be wrong. It also looks more clean to me.
>> 
>> ->child_reaper is not stable without tasklist, it can be dead/etc, but
>> we do not care?
>
> I mean the following. We had a pid_ns1 with a child_reaper set. Then
> it became dead, and a new pid_ns2 were allocated in the same memory.

task->nsproxy->pid_ns_for_children is always changed with
task_lock(task) held.  See switch_task_namespaces (used by unshare and
setns).  This also gives us the guarantee that the pid_ns reference
won't be freed/reused in any form until task_lock(task) is dropped.

> A task on another cpu opens the pid_for_children file, and because
> of there is no memory ordering, it sees pid_ns1->child_reaper,
> when it opens pid_ns2.
>
> I forgot, what guarantees this situation is impossible? What guarantees,
> the renewed content of pid_ns2 on another cpu is seen not later, than
> we can't open it?

Eric
Kirill Tkhai May 3, 2017, 10:20 a.m. UTC | #20
On 03.05.2017 00:13, Eric W. Biederman wrote:
> Kirill Tkhai <ktkhai@virtuozzo.com> writes:
> 
>> On 02.05.2017 19:33, Oleg Nesterov wrote:
>>> sorry for delay, vacation...
>>>
>>> On 04/28, Kirill Tkhai wrote:
>>>>
>>>> On 27.04.2017 19:22, Oleg Nesterov wrote:
>>>>>
>>>>> Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get().
>>>>>
>>>>> But note that it doesn't need tasklist_lock too.
>>>>
>>>> Hm, are there possible strange situations with memory ordering, when we see
>>>> ns->child_reaper of already died ns, which was placed in the same memory?
>>>> Do we have to use some memory barriers here?
>>>
>>> Could you spell please? I don't understand your concerns...
>>>
>>> I don't see how, say,
>>>
>>> 	static struct ns_common *pidns_for_children_get(struct task_struct *task)
>>> 	{
>>> 		struct ns_common *ns = NULL;
>>> 		struct pid_namespace *pid_ns;
>>>
>>> 		task_lock(task);
>>> 		if (task->nsproxy) {
>>> 			pid_ns = task->nsproxy->pid_ns_for_children;
>>> 			if (pid_ns->child_reaper) {
>                             ^^^^^^^^^^^^^^^^^^^^
>                             Oleg my apologies I missed this line earlier.
>                             This does look like a valid way to skip read_lock(&tasklist_lock);
>>> 				ns = &pid_ns->ns;
>>> 				get_pid_ns(ns);
>                                 ^^^^^^^^^^^^^ This needs to be:
>                                 get_pid_ns(pid_ns);
>                                 
>>> 			}
>>> 		}
>>> 		task_unlock(task);
>>>
>>> 		return ns;
>>> 	}
>>>
>>> can be wrong. It also looks more clean to me.
>>>
>>> ->child_reaper is not stable without tasklist, it can be dead/etc, but
>>> we do not care?
>>
>> I mean the following. We had a pid_ns1 with a child_reaper set. Then
>> it became dead, and a new pid_ns2 were allocated in the same memory.
> 
> task->nsproxy->pid_ns_for_children is always changed with
> task_lock(task) held.  See switch_task_namespaces (used by unshare and
> setns).  This also gives us the guarantee that the pid_ns reference
> won't be freed/reused in any for until task_lock(task) is dropped.

Now I've checked kmem_cache_zalloc() and it looks like it zeroes the memory
contents synchronously on allocation (it seems there is no pre-zeroed memory
for GFP_ZERO cases).

So, the zeroing happens before switch_task_namespaces() (and task_unlock())
and we're really safe after task_lock() in pidns_for_children_get().

Ok, I'll send new version of the patchset.
diff mbox

Patch

diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 544bbb661475..37bb4af917b5 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -17,4 +17,13 @@ 
 /* Execute namespace-specific ioctl */
 #define NS_SPECIFIC_IOC		_IO(NSIO, 0x5)
 
+struct pidns_ioc_req {
+/* Set vector of last pids in namespace hierarchy */
+#define PIDNS_REQ_SET_LAST_PID_VEC	0x1
+	unsigned int req;
+	void __user *data;
+	unsigned int data_size;
+	char std_fields[0];
+};
+
 #endif /* __LINUX_NSFS_H */
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index de461aa0bf9a..0e86fa15cd92 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,8 @@ 
 #include <linux/export.h>
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
+#include <linux/vmalloc.h>
+#include <uapi/linux/nsfs.h>
 
 struct pid_cache {
 	int nr_ids;
@@ -428,6 +430,91 @@  static struct ns_common *pidns_get_parent(struct ns_common *ns)
 	return &get_pid_ns(pid_ns)->ns;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long set_last_pid_vec(struct pid_namespace *pid_ns,
+			     struct pidns_ioc_req *req)
+{
+	char *str, *p;
+	int ret = 0;
+	pid_t pid;
+
+	read_lock(&tasklist_lock);
+	if (!pid_ns->child_reaper)
+		ret = -EINVAL;
+	read_unlock(&tasklist_lock);
+	if (ret)
+		return ret;
+
+	if (req->data_size >= PAGE_SIZE)
+		return -EINVAL;
+	str = vmalloc(req->data_size + 1);
+	if (!str)
+		return -ENOMEM;
+	if (copy_from_user(str, req->data, req->data_size)) {
+		ret = -EFAULT;
+		goto out_vfree;
+	}
+	str[req->data_size] = '\0';
+
+	p = str;
+	while (p && *p != '\0') {
+		if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) {
+			ret = -EPERM;
+			goto out_vfree;
+		}
+
+		if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) {
+			ret = -EINVAL;
+			goto out_vfree;
+		}
+
+		/* Write directly: see the comment in pid_ns_ctl_handler() */
+		pid_ns->last_pid = pid;
+
+		p = strchr(p, ':');
+		pid_ns = pid_ns->parent;
+		if (p) {
+			if (!pid_ns) {
+				ret = -EINVAL;
+				goto out_vfree;
+			}
+			p++;
+		}
+	}
+
+	ret = 0;
+out_vfree:
+	vfree(str);
+	return ret;
+}
+#else	/* CONFIG_CHECKPOINT_RESTORE */
+static long set_last_pid_vec(struct pid_namespace *pid_ns,
+			     struct pidns_ioc_req *req)
+{
+	return -ENOTTY;
+}
+#endif	/* CONFIG_CHECKPOINT_RESTORE */
+
+static long pidns_ioctl(struct ns_common *ns, unsigned long arg)
+{
+	struct pid_namespace *pid_ns = to_pid_ns(ns);
+	struct pidns_ioc_req user_req;
+	int ret;
+
+	ret = copy_from_user(&user_req, (void *)arg,
+			     offsetof(struct pidns_ioc_req, std_fields));
+	if (ret)
+		return ret;
+
+	switch (user_req.req) {
+	case PIDNS_REQ_SET_LAST_PID_VEC:
+		return set_last_pid_vec(pid_ns, &user_req);
+	default:
+		return -ENOTTY;
+	}
+	return 0;
+}
+
 static struct user_namespace *pidns_owner(struct ns_common *ns)
 {
 	return to_pid_ns(ns)->user_ns;
@@ -441,6 +528,7 @@  const struct proc_ns_operations pidns_operations = {
 	.install	= pidns_install,
 	.owner		= pidns_owner,
 	.get_parent	= pidns_get_parent,
+	.ns_ioctl	= pidns_ioctl,
 };
 
 static __init int pid_namespaces_init(void)