Message ID | 149245057248.17600.1341652606136269734.stgit@localhost.localdomain (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Quoting Kirill Tkhai (ktkhai@virtuozzo.com): > On implementing of nested pid namespaces support in CRIU > (checkpoint-restore in userspace tool) we run into > the situation, that it's impossible to create a task with > specific NSpid effectively. After commit 49f4d8b93ccf > "pidns: Capture the user namespace and filter ns_last_pid" > it is impossible to set ns_last_pid on any pid namespace, > except task's active pid_ns (before the commit it was possible > to write to pid_ns_for_children). Thus, if a restored task > in a container has more than one pid_ns levels, the restorer > code must have a task helper for every pid namespace > of the task's pid_ns hierarhy. > > This is a big problem, because of communication with > a helper for every pid_ns in the hierarchy is not cheap > and not performance-good as it implies many helpers wakeups > to create a single task (independently, how you communicate > with the helpers). This patch tries to decide the problem. > > It introduces a new pid_ns ns_ioctl(PIDNS_REQ_SET_LAST_PID_VEC), > which allows to write a vector of last pids on pid_ns hierarchy. > The vector is passed as a ":"-delimited string with pids, > written in reverse order. The first number corresponds to > the opened namespace ns_last_pid, the second is to its parent, etc. > So, if you have the pid namespaces hierarchy like: > > pid_ns1 (grand father) > | > v > pid_ns2 (father) > | > v > pid_ns3 (child) > > and the ns of task's of pid_ns3 is open, then the corresponding > vector will be "last_ns_pid3:last_ns_pid2:last_ns_pid1". This > vector may be short and it may contain less levels, for example, > "last_ns_pid3:last_ns_pid2" or even "last_ns_pid3", in dependence > of which levels you want to populate. > > To write in a pid_ns's ns_last_pid we check that the writer task > has CAP_SYS_ADMIN permittions in this pid_ns's user_ns. > > One note about struct pidns_ioc_req. It's made extensible and > may expanded in the future. 
The always existing fields present > at the moment, the future fields and they sizes may be determined > by pidns_ioc_req::req by the future code. > > Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com> Reviewed-by: Serge Hallyn <serge@hallyn.com> (for both patches) > --- > include/uapi/linux/nsfs.h | 9 +++++ > kernel/pid_namespace.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 97 insertions(+) > > diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h > index 544bbb661475..37bb4af917b5 100644 > --- a/include/uapi/linux/nsfs.h > +++ b/include/uapi/linux/nsfs.h > @@ -17,4 +17,13 @@ > /* Execute namespace-specific ioctl */ > #define NS_SPECIFIC_IOC _IO(NSIO, 0x5) > > +struct pidns_ioc_req { > +/* Set vector of last pids in namespace hierarchy */ > +#define PIDNS_REQ_SET_LAST_PID_VEC 0x1 > + unsigned int req; > + void __user *data; > + unsigned int data_size; > + char std_fields[0]; > +}; > + > #endif /* __LINUX_NSFS_H */ > diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c > index de461aa0bf9a..0e86fa15cd92 100644 > --- a/kernel/pid_namespace.c > +++ b/kernel/pid_namespace.c > @@ -21,6 +21,8 @@ > #include <linux/export.h> > #include <linux/sched/task.h> > #include <linux/sched/signal.h> > +#include <linux/vmalloc.h> > +#include <uapi/linux/nsfs.h> > > struct pid_cache { > int nr_ids; > @@ -428,6 +430,91 @@ static struct ns_common *pidns_get_parent(struct ns_common *ns) > return &get_pid_ns(pid_ns)->ns; > } > > +#ifdef CONFIG_CHECKPOINT_RESTORE > +static long set_last_pid_vec(struct pid_namespace *pid_ns, > + struct pidns_ioc_req *req) > +{ > + char *str, *p; > + int ret = 0; > + pid_t pid; > + > + read_lock(&tasklist_lock); > + if (!pid_ns->child_reaper) > + ret = -EINVAL; > + read_unlock(&tasklist_lock); > + if (ret) > + return ret; > + > + if (req->data_size >= PAGE_SIZE) > + return -EINVAL; > + str = vmalloc(req->data_size + 1); > + if (!str) > + return -ENOMEM; > + if (copy_from_user(str, req->data, req->data_size)) { 
> + ret = -EFAULT; > + goto out_vfree; > + } > + str[req->data_size] = '\0'; > + > + p = str; > + while (p && *p != '\0') { > + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) { > + ret = -EPERM; > + goto out_vfree; > + } > + > + if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) { > + ret = -EINVAL; > + goto out_vfree; > + } > + > + /* Write directly: see the comment in pid_ns_ctl_handler() */ > + pid_ns->last_pid = pid; > + > + p = strchr(p, ':'); > + pid_ns = pid_ns->parent; > + if (p) { > + if (!pid_ns) { > + ret = -EINVAL; > + goto out_vfree; > + } > + p++; > + } > + } > + > + ret = 0; > +out_vfree: > + vfree(str); > + return ret; > +} > +#else /* CONFIG_CHECKPOINT_RESTORE */ > +static long set_last_pid_vec(struct pid_namespace *pid_ns, > + struct pidns_ioc_req *req) > +{ > + return -ENOTTY; > +} > +#endif /* CONFIG_CHECKPOINT_RESTORE */ > + > +static long pidns_ioctl(struct ns_common *ns, unsigned long arg) > +{ > + struct pid_namespace *pid_ns = to_pid_ns(ns); > + struct pidns_ioc_req user_req; > + int ret; > + > + ret = copy_from_user(&user_req, (void *)arg, > + offsetof(struct pidns_ioc_req, std_fields)); > + if (ret) > + return ret; > + > + switch (user_req.req) { > + case PIDNS_REQ_SET_LAST_PID_VEC: > + return set_last_pid_vec(pid_ns, &user_req); > + default: > + return -ENOTTY; > + } > + return 0; > +} > + > static struct user_namespace *pidns_owner(struct ns_common *ns) > { > return to_pid_ns(ns)->user_ns; > @@ -441,6 +528,7 @@ const struct proc_ns_operations pidns_operations = { > .install = pidns_install, > .owner = pidns_owner, > .get_parent = pidns_get_parent, > + .ns_ioctl = pidns_ioctl, > }; > > static __init int pid_namespaces_init(void)
On Mon, Apr 17, 2017 at 8:36 PM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote: > On implementing of nested pid namespaces support in CRIU > (checkpoint-restore in userspace tool) we run into > the situation, that it's impossible to create a task with > specific NSpid effectively. After commit 49f4d8b93ccf > "pidns: Capture the user namespace and filter ns_last_pid" > it is impossible to set ns_last_pid on any pid namespace, > except task's active pid_ns (before the commit it was possible > to write to pid_ns_for_children). Thus, if a restored task > in a container has more than one pid_ns levels, the restorer > code must have a task helper for every pid namespace > of the task's pid_ns hierarhy. > > This is a big problem, because of communication with > a helper for every pid_ns in the hierarchy is not cheap > and not performance-good as it implies many helpers wakeups > to create a single task (independently, how you communicate > with the helpers). This patch tries to decide the problem. > > It introduces a new pid_ns ns_ioctl(PIDNS_REQ_SET_LAST_PID_VEC), > which allows to write a vector of last pids on pid_ns hierarchy. > The vector is passed as a ":"-delimited string with pids, > written in reverse order. The first number corresponds to > the opened namespace ns_last_pid, the second is to its parent, etc. > So, if you have the pid namespaces hierarchy like: > > pid_ns1 (grand father) > | > v > pid_ns2 (father) > | > v > pid_ns3 (child) > > and the ns of task's of pid_ns3 is open, then the corresponding > vector will be "last_ns_pid3:last_ns_pid2:last_ns_pid1". This > vector may be short and it may contain less levels, for example, > "last_ns_pid3:last_ns_pid2" or even "last_ns_pid3", in dependence > of which levels you want to populate. > > To write in a pid_ns's ns_last_pid we check that the writer task > has CAP_SYS_ADMIN permittions in this pid_ns's user_ns. > > One note about struct pidns_ioc_req. It's made extensible and > may expanded in the future. 
The always existing fields present > at the moment, the future fields and they sizes may be determined > by pidns_ioc_req::req by the future code. > > Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com> Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
On 04/17, Kirill Tkhai wrote: > > +struct pidns_ioc_req { > +/* Set vector of last pids in namespace hierarchy */ > +#define PIDNS_REQ_SET_LAST_PID_VEC 0x1 > + unsigned int req; > + void __user *data; > + unsigned int data_size; > + char std_fields[0]; > +}; see below, > +static long set_last_pid_vec(struct pid_namespace *pid_ns, > + struct pidns_ioc_req *req) > +{ > + char *str, *p; > + int ret = 0; > + pid_t pid; > + > + read_lock(&tasklist_lock); > + if (!pid_ns->child_reaper) > + ret = -EINVAL; > + read_unlock(&tasklist_lock); > + if (ret) > + return ret; why do you need to check ->child_reaper under tasklist_lock? this looks pointless. In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, there must be at least one task in this namespace, otherwise you can't open a file which has f_op == ns_file_operations, no? > + if (req->data_size >= PAGE_SIZE) > + return -EINVAL; > + str = vmalloc(req->data_size + 1); then I don't understand why it makes sense to use vmalloc() > + if (!str) > + return -ENOMEM; > + if (copy_from_user(str, req->data, req->data_size)) { > + ret = -EFAULT; > + goto out_vfree; > + } > + str[req->data_size] = '\0'; > + > + p = str; > + while (p && *p != '\0') { > + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) { > + ret = -EPERM; > + goto out_vfree; > + } > + > + if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) { > + ret = -EINVAL; > + goto out_vfree; > + } Well, this is ioctl(), do we really want to parse the strings? Can't we make struct pidns_ioc_req { ... int nr_pids; pid_t pids[0]; } and just use get_user() in a loop? This way we can avoid vmalloc() or anything else altogether. Oleg.
On 26.04.2017 18:53, Oleg Nesterov wrote: > On 04/17, Kirill Tkhai wrote: >> >> +struct pidns_ioc_req { >> +/* Set vector of last pids in namespace hierarchy */ >> +#define PIDNS_REQ_SET_LAST_PID_VEC 0x1 >> + unsigned int req; >> + void __user *data; >> + unsigned int data_size; >> + char std_fields[0]; >> +}; > > see below, > >> +static long set_last_pid_vec(struct pid_namespace *pid_ns, >> + struct pidns_ioc_req *req) >> +{ >> + char *str, *p; >> + int ret = 0; >> + pid_t pid; >> + >> + read_lock(&tasklist_lock); >> + if (!pid_ns->child_reaper) >> + ret = -EINVAL; >> + read_unlock(&tasklist_lock); >> + if (ret) >> + return ret; > > why do you need to check ->child_reaper under tasklist_lock? this looks pointless. > > In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, > there must be at least one task in this namespace, otherwise you can't open a file > which has f_op == ns_file_operations, no? Sure, it's impossible to pick a pid_ns if it has no tasks. I added the check under the impression of https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 but here it's completely wrong. It will be removed in v2. >> + if (req->data_size >= PAGE_SIZE) >> + return -EINVAL; >> + str = vmalloc(req->data_size + 1); > > then I don't understand why it makes sense to use vmalloc() > >> + if (!str) >> + return -ENOMEM; >> + if (copy_from_user(str, req->data, req->data_size)) { >> + ret = -EFAULT; >> + goto out_vfree; >> + } >> + str[req->data_size] = '\0'; >> + >> + p = str; >> + while (p && *p != '\0') { >> + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) { >> + ret = -EPERM; >> + goto out_vfree; >> + } >> + >> + if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) { >> + ret = -EINVAL; >> + goto out_vfree; >> + } > > Well, this is ioctl(), do we really want to parse the strings? > > Can't we make > > struct pidns_ioc_req { > ...
> int nr_pids; > pid_t pids[0]; > } > > and just use get_user() in a loop? This way we can avoid vmalloc() or anything > else altogether. Since it's a generic structure for different types of requests, it may be extended in the future. We won't be able to add new fields if we compose the structure the way you suggested, will we?
Kirill Tkhai <ktkhai@virtuozzo.com> writes: > On 26.04.2017 19:11, Kirill Tkhai wrote: >> On 26.04.2017 18:53, Oleg Nesterov wrote: >>> On 04/17, Kirill Tkhai wrote: >>>> >>>> +struct pidns_ioc_req { >>>> +/* Set vector of last pids in namespace hierarchy */ >>>> +#define PIDNS_REQ_SET_LAST_PID_VEC 0x1 >>>> + unsigned int req; >>>> + void __user *data; >>>> + unsigned int data_size; >>>> + char std_fields[0]; >>>> +}; >>> >>> see below, >>> >>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns, >>>> + struct pidns_ioc_req *req) >>>> +{ >>>> + char *str, *p; >>>> + int ret = 0; >>>> + pid_t pid; >>>> + >>>> + read_lock(&tasklist_lock); >>>> + if (!pid_ns->child_reaper) >>>> + ret = -EINVAL; >>>> + read_unlock(&tasklist_lock); >>>> + if (ret) >>>> + return ret; >>> >>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless. >>> >>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, >>> there must be at least one task in this namespace, otherwise you can't open a file >>> which has f_op == ns_file_operations, no? >> >> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added >> it under impression of >> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 >> but here it's completely wrong. It will be removed in v2. 
>> >>>> + if (req->data_size >= PAGE_SIZE) >>>> + return -EINVAL; >>>> + str = vmalloc(req->data_size + 1); >>> >>> then I don't understand why it makes sense to use vmalloc() >>> >>>> + if (!str) >>>> + return -ENOMEM; >>>> + if (copy_from_user(str, req->data, req->data_size)) { >>>> + ret = -EFAULT; >>>> + goto out_vfree; >>>> + } >>>> + str[req->data_size] = '\0'; >>>> + >>>> + p = str; >>>> + while (p && *p != '\0') { >>>> + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) { >>>> + ret = -EPERM; >>>> + goto out_vfree; >>>> + } >>>> + >>>> + if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) { >>>> + ret = -EINVAL; >>>> + goto out_vfree; >>>> + } >>> >>> Well, this is ioctl(), do we really want to parse the strings? >>> >>> Can't we make >>> >>> struct pidns_ioc_req { >>> ... >>> int nr_pids; >>> pid_t pids[0]; >>> } >>> >>> and just use get_user() in a loop? This way we can avoid vmalloc() or anything >>> else altogether. >> >> Since it's a generic structure for different types of the requests, it may be extended >> in the future. We won't be able to add new fields, if we compose the structure the way >> you suggested, will we? > > Though, we may go this way if just do the fields generic: > > struct pidns_ioc_req { > unsigned int req; > unsigned int data_size; > union { > pid_t pid[0]; > }; > }; > > Ok, I'll rework the patchset in this way. You don't need that. That is what new ioctl numbers are for. Interfaces to the kernel don't need to become multiplexors to prepare for the future when there is already an appropriate multiplexing interface in place. Eric
On 26.04.2017 19:11, Kirill Tkhai wrote: > On 26.04.2017 18:53, Oleg Nesterov wrote: >> On 04/17, Kirill Tkhai wrote: >>> >>> +struct pidns_ioc_req { >>> +/* Set vector of last pids in namespace hierarchy */ >>> +#define PIDNS_REQ_SET_LAST_PID_VEC 0x1 >>> + unsigned int req; >>> + void __user *data; >>> + unsigned int data_size; >>> + char std_fields[0]; >>> +}; >> >> see below, >> >>> +static long set_last_pid_vec(struct pid_namespace *pid_ns, >>> + struct pidns_ioc_req *req) >>> +{ >>> + char *str, *p; >>> + int ret = 0; >>> + pid_t pid; >>> + >>> + read_lock(&tasklist_lock); >>> + if (!pid_ns->child_reaper) >>> + ret = -EINVAL; >>> + read_unlock(&tasklist_lock); >>> + if (ret) >>> + return ret; >> >> why do you need to check ->child_reaper under tasklist_lock? this looks pointless. >> >> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, >> there must be at least one task in this namespace, otherwise you can't open a file >> which has f_op == ns_file_operations, no? > > Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added > it under impression of > https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 > but here it's completely wrong. It will be removed in v2. 
> >>> + if (req->data_size >= PAGE_SIZE) >>> + return -EINVAL; >>> + str = vmalloc(req->data_size + 1); >> >> then I don't understand why it makes sense to use vmalloc() >> >>> + if (!str) >>> + return -ENOMEM; >>> + if (copy_from_user(str, req->data, req->data_size)) { >>> + ret = -EFAULT; >>> + goto out_vfree; >>> + } >>> + str[req->data_size] = '\0'; >>> + >>> + p = str; >>> + while (p && *p != '\0') { >>> + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) { >>> + ret = -EPERM; >>> + goto out_vfree; >>> + } >>> + >>> + if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) { >>> + ret = -EINVAL; >>> + goto out_vfree; >>> + } >> >> Well, this is ioctl(), do we really want to parse the strings? >> >> Can't we make >> >> struct pidns_ioc_req { >> ... >> int nr_pids; >> pid_t pids[0]; >> } >> >> and just use get_user() in a loop? This way we can avoid vmalloc() or anything >> else altogether. > > Since it's a generic structure for different types of the requests, it may be extended > in the future. We won't be able to add new fields, if we compose the structure the way > you suggested, will we? Though, we may go this way if just do the fields generic: struct pidns_ioc_req { unsigned int req; unsigned int data_size; union { pid_t pid[0]; }; }; Ok, I'll rework the patchset in this way.
On 26.04.2017 19:32, Eric W. Biederman wrote: > Kirill Tkhai <ktkhai@virtuozzo.com> writes: > >> On 26.04.2017 19:11, Kirill Tkhai wrote: >>> On 26.04.2017 18:53, Oleg Nesterov wrote: >>>> On 04/17, Kirill Tkhai wrote: >>>>> >>>>> +struct pidns_ioc_req { >>>>> +/* Set vector of last pids in namespace hierarchy */ >>>>> +#define PIDNS_REQ_SET_LAST_PID_VEC 0x1 >>>>> + unsigned int req; >>>>> + void __user *data; >>>>> + unsigned int data_size; >>>>> + char std_fields[0]; >>>>> +}; >>>> >>>> see below, >>>> >>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns, >>>>> + struct pidns_ioc_req *req) >>>>> +{ >>>>> + char *str, *p; >>>>> + int ret = 0; >>>>> + pid_t pid; >>>>> + >>>>> + read_lock(&tasklist_lock); >>>>> + if (!pid_ns->child_reaper) >>>>> + ret = -EINVAL; >>>>> + read_unlock(&tasklist_lock); >>>>> + if (ret) >>>>> + return ret; >>>> >>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless. >>>> >>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, >>>> there must be at least one task in this namespace, otherwise you can't open a file >>>> which has f_op == ns_file_operations, no? >>> >>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added >>> it under impression of >>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 >>> but here it's completely wrong. It will be removed in v2. 
>>> >>>>> + if (req->data_size >= PAGE_SIZE) >>>>> + return -EINVAL; >>>>> + str = vmalloc(req->data_size + 1); >>>> >>>> then I don't understand why it makes sense to use vmalloc() >>>> >>>>> + if (!str) >>>>> + return -ENOMEM; >>>>> + if (copy_from_user(str, req->data, req->data_size)) { >>>>> + ret = -EFAULT; >>>>> + goto out_vfree; >>>>> + } >>>>> + str[req->data_size] = '\0'; >>>>> + >>>>> + p = str; >>>>> + while (p && *p != '\0') { >>>>> + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) { >>>>> + ret = -EPERM; >>>>> + goto out_vfree; >>>>> + } >>>>> + >>>>> + if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) { >>>>> + ret = -EINVAL; >>>>> + goto out_vfree; >>>>> + } >>>> >>>> Well, this is ioctl(), do we really want to parse the strings? >>>> >>>> Can't we make >>>> >>>> struct pidns_ioc_req { >>>> ... >>>> int nr_pids; >>>> pid_t pids[0]; >>>> } >>>> >>>> and just use get_user() in a loop? This way we can avoid vmalloc() or anything >>>> else altogether. >>> >>> Since it's a generic structure for different types of the requests, it may be extended >>> in the future. We won't be able to add new fields, if we compose the structure the way >>> you suggested, will we? >> >> Though, we may go this way if just do the fields generic: >> >> struct pidns_ioc_req { >> unsigned int req; >> unsigned int data_size; >> union { >> pid_t pid[0]; >> }; >> }; >> >> Ok, I'll rework the patchset in this way. > > You don't need that. That is what new ioctl numbers are for. > > Interfaces to the kernel don't need to become multiplexors to prepare > for the future when there is already an appropriate multiplexing > interface in place. That is, do you suggest to not introduce NS_SPECIFIC_IO from the first patch, and add PIDNS_REQ_SET_LAST_PID_VEC to the list of generic ns ioctls? ... #define NS_GET_OWNER_UID _IO(NSIO, 0x4) #define PIDNS_REQ_SET_LAST_PID_VEC _IO(NSIO, 0x5)
Kirill Tkhai <ktkhai@virtuozzo.com> writes: > On 26.04.2017 19:32, Eric W. Biederman wrote: >> Kirill Tkhai <ktkhai@virtuozzo.com> writes: >> >>> On 26.04.2017 19:11, Kirill Tkhai wrote: >>>> On 26.04.2017 18:53, Oleg Nesterov wrote: >>>>> On 04/17, Kirill Tkhai wrote: >>>>>> >>>>>> +struct pidns_ioc_req { >>>>>> +/* Set vector of last pids in namespace hierarchy */ >>>>>> +#define PIDNS_REQ_SET_LAST_PID_VEC 0x1 >>>>>> + unsigned int req; >>>>>> + void __user *data; >>>>>> + unsigned int data_size; >>>>>> + char std_fields[0]; >>>>>> +}; >>>>> >>>>> see below, >>>>> >>>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns, >>>>>> + struct pidns_ioc_req *req) >>>>>> +{ >>>>>> + char *str, *p; >>>>>> + int ret = 0; >>>>>> + pid_t pid; >>>>>> + >>>>>> + read_lock(&tasklist_lock); >>>>>> + if (!pid_ns->child_reaper) >>>>>> + ret = -EINVAL; >>>>>> + read_unlock(&tasklist_lock); >>>>>> + if (ret) >>>>>> + return ret; >>>>> >>>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless. >>>>> >>>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, >>>>> there must be at least one task in this namespace, otherwise you can't open a file >>>>> which has f_op == ns_file_operations, no? >>>> >>>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added >>>> it under impression of >>>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 >>>> but here it's completely wrong. It will be removed in v2. 
>>>> >>>>>> + if (req->data_size >= PAGE_SIZE) >>>>>> + return -EINVAL; >>>>>> + str = vmalloc(req->data_size + 1); >>>>> >>>>> then I don't understand why it makes sense to use vmalloc() >>>>> >>>>>> + if (!str) >>>>>> + return -ENOMEM; >>>>>> + if (copy_from_user(str, req->data, req->data_size)) { >>>>>> + ret = -EFAULT; >>>>>> + goto out_vfree; >>>>>> + } >>>>>> + str[req->data_size] = '\0'; >>>>>> + >>>>>> + p = str; >>>>>> + while (p && *p != '\0') { >>>>>> + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) { >>>>>> + ret = -EPERM; >>>>>> + goto out_vfree; >>>>>> + } >>>>>> + >>>>>> + if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) { >>>>>> + ret = -EINVAL; >>>>>> + goto out_vfree; >>>>>> + } >>>>> >>>>> Well, this is ioctl(), do we really want to parse the strings? >>>>> >>>>> Can't we make >>>>> >>>>> struct pidns_ioc_req { >>>>> ... >>>>> int nr_pids; >>>>> pid_t pids[0]; >>>>> } >>>>> >>>>> and just use get_user() in a loop? This way we can avoid vmalloc() or anything >>>>> else altogether. >>>> >>>> Since it's a generic structure for different types of the requests, it may be extended >>>> in the future. We won't be able to add new fields, if we compose the structure the way >>>> you suggested, will we? >>> >>> Though, we may go this way if just do the fields generic: >>> >>> struct pidns_ioc_req { >>> unsigned int req; >>> unsigned int data_size; >>> union { >>> pid_t pid[0]; >>> }; >>> }; >>> >>> Ok, I'll rework the patchset in this way. >> >> You don't need that. That is what new ioctl numbers are for. >> >> Interfaces to the kernel don't need to become multiplexors to prepare >> for the future when there is already an appropriate multiplexing >> interface in place. > > That is, do you suggest to not introduce NS_SPECIFIC_IO from the first patch, > and add PIDNS_REQ_SET_LAST_PID_VEC to the list of generic ns ioctls? > > ... 
> #define NS_GET_OWNER_UID _IO(NSIO, 0x4) > #define PIDNS_REQ_SET_LAST_PID_VEC _IO(NSIO, 0x5) I have not looked at your proposal in detail. But if we are going to do this with ioctls there are enough that we should not need to play games. There are 4 billion of them and 4194304 dedicated for namespace operations. Strictly it is 256 ioctls plus 14 bits dedicated for size. Even that seems plenty. Please let's make things as simple as we can. Eric
On 04/26, Kirill Tkhai wrote: > > On 26.04.2017 18:53, Oleg Nesterov wrote: > > > >> +static long set_last_pid_vec(struct pid_namespace *pid_ns, > >> + struct pidns_ioc_req *req) > >> +{ > >> + char *str, *p; > >> + int ret = 0; > >> + pid_t pid; > >> + > >> + read_lock(&tasklist_lock); > >> + if (!pid_ns->child_reaper) > >> + ret = -EINVAL; > >> + read_unlock(&tasklist_lock); > >> + if (ret) > >> + return ret; > > > > why do you need to check ->child_reaper under tasklist_lock? this looks pointless. > > > > In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, > > there must be at least one task in this namespace, otherwise you can't open a file > > which has f_op == ns_file_operations, no? > > Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added > it under impression of > https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 > but here it's completely wrong. It will be removed in v2. Hmm. But if I read this commit correctly then we really need to check pid_ns->child_reaper != NULL ? Currently we can't pick an "empty" pid_ns. But after the commit above a task can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ? Or I am totally confused? Oleg.
On 04/26, Kirill Tkhai wrote: > > On 26.04.2017 18:53, Oleg Nesterov wrote: > >> > >> +static long set_last_pid_vec(struct pid_namespace *pid_ns, > >> + struct pidns_ioc_req *req) > >> +{ > >> + char *str, *p; > >> + int ret = 0; > >> + pid_t pid; > >> + > >> + read_lock(&tasklist_lock); > >> + if (!pid_ns->child_reaper) > >> + ret = -EINVAL; > >> + read_unlock(&tasklist_lock); > >> + if (ret) > >> + return ret; > > > > why do you need to check ->child_reaper under tasklist_lock? this looks pointless. > > > > In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, > > there must be at least one task in this namespace, otherwise you can't open a file > > which has f_op == ns_file_operations, no? > > Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added > it under impression of > https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 > but here it's completely wrong. It will be removed in v2. Hmm. But if I read this commit correctly then we really need to check pid_ns->child_reaper != NULL ? Currently we can't pick an "empty" pid_ns. But after the commit above a task can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ? Or I am totally confused? Oleg.
On 27.04.2017 19:12, Oleg Nesterov wrote: > On 04/26, Kirill Tkhai wrote: >> >> On 26.04.2017 18:53, Oleg Nesterov wrote: >>> >>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns, >>>> + struct pidns_ioc_req *req) >>>> +{ >>>> + char *str, *p; >>>> + int ret = 0; >>>> + pid_t pid; >>>> + >>>> + read_lock(&tasklist_lock); >>>> + if (!pid_ns->child_reaper) >>>> + ret = -EINVAL; >>>> + read_unlock(&tasklist_lock); >>>> + if (ret) >>>> + return ret; >>> >>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless. >>> >>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, >>> there must be at least one task in this namespace, otherwise you can't open a file >>> which has f_op == ns_file_operations, no? >> >> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added >> it under impression of >> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 >> but here it's completely wrong. It will be removed in v2. > > Hmm. But if I read this commit correctly then we really need to check > pid_ns->child_reaper != NULL ? > > Currently we can't pick an "empty" pid_ns. But after the commit above a task > can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its > /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ? Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(), because pid_for_children is available to open only after the 1st alloc_pid(). So, it's impossible to call ioctl() on it. > Or I am totally confused?
On 04/27, Kirill Tkhai wrote: > > On 27.04.2017 19:12, Oleg Nesterov wrote: > > On 04/26, Kirill Tkhai wrote: > >> > >> On 26.04.2017 18:53, Oleg Nesterov wrote: > >>> > >>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns, > >>>> + struct pidns_ioc_req *req) > >>>> +{ > >>>> + char *str, *p; > >>>> + int ret = 0; > >>>> + pid_t pid; > >>>> + > >>>> + read_lock(&tasklist_lock); > >>>> + if (!pid_ns->child_reaper) > >>>> + ret = -EINVAL; > >>>> + read_unlock(&tasklist_lock); > >>>> + if (ret) > >>>> + return ret; > >>> > >>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless. > >>> > >>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, > >>> there must be at least one task in this namespace, otherwise you can't open a file > >>> which has f_op == ns_file_operations, no? > >> > >> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added > >> it under impression of > >> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 > >> but here it's completely wrong. It will be removed in v2. > > > > Hmm. But if I read this commit correctly then we really need to check > > pid_ns->child_reaper != NULL ? > > > > Currently we can't pick an "empty" pid_ns. But after the commit above a task > > can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its > > /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ? > > Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(), > because pid_for_children is available to open only after the 1st alloc_pid(). > So, it's impossible to call ioctl() on it. Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get(). But note that it doesn't need tasklist_lock too. Oleg.
Kirill Tkhai <ktkhai@virtuozzo.com> writes: > On 27.04.2017 19:12, Oleg Nesterov wrote: >> On 04/26, Kirill Tkhai wrote: >>> >>> On 26.04.2017 18:53, Oleg Nesterov wrote: >>>> >>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns, >>>>> + struct pidns_ioc_req *req) >>>>> +{ >>>>> + char *str, *p; >>>>> + int ret = 0; >>>>> + pid_t pid; >>>>> + >>>>> + read_lock(&tasklist_lock); >>>>> + if (!pid_ns->child_reaper) >>>>> + ret = -EINVAL; >>>>> + read_unlock(&tasklist_lock); >>>>> + if (ret) >>>>> + return ret; >>>> >>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless. >>>> >>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, >>>> there must be at least one task in this namespace, otherwise you can't open a file >>>> which has f_op == ns_file_operations, no? >>> >>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added >>> it under impression of >>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 >>> but here it's completely wrong. It will be removed in v2. >> >> Hmm. But if I read this commit correctly then we really need to check >> pid_ns->child_reaper != NULL ? >> >> Currently we can't pick an "empty" pid_ns. But after the commit above a task >> can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its >> /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ? > > Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(), > because pid_for_children is available to open only after the 1st alloc_pid(). > So, it's impossible to call ioctl() on it. That sounds reasonable. There is definitely the chance of the child_reaper dying after we have joined a pid namespace. So child_reaper can be stale if not NULL. 
As long as we don't mess up the first pid allocation I don't see any reason why we should care about last_pid in a pid_namespace. And this ioctl can be used to set all of the other pids on the first pid allocation by calling it in the parent pid namespace. There is still the chance of racing with a pid reaper dying. Why do we care about child_reaper in this case? Changing last_pid is completely pointless if child_reaper is dead or missing but why would we care? Although looking at it we probably want to call set_last_pid just to be consistent with everything else. Eric
On 27.04.2017 19:22, Oleg Nesterov wrote: > On 04/27, Kirill Tkhai wrote: >> >> On 27.04.2017 19:12, Oleg Nesterov wrote: >>> On 04/26, Kirill Tkhai wrote: >>>> >>>> On 26.04.2017 18:53, Oleg Nesterov wrote: >>>>> >>>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns, >>>>>> + struct pidns_ioc_req *req) >>>>>> +{ >>>>>> + char *str, *p; >>>>>> + int ret = 0; >>>>>> + pid_t pid; >>>>>> + >>>>>> + read_lock(&tasklist_lock); >>>>>> + if (!pid_ns->child_reaper) >>>>>> + ret = -EINVAL; >>>>>> + read_unlock(&tasklist_lock); >>>>>> + if (ret) >>>>>> + return ret; >>>>> >>>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless. >>>>> >>>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, >>>>> there must be at least one task in this namespace, otherwise you can't open a file >>>>> which has f_op == ns_file_operations, no? >>>> >>>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added >>>> it under impression of >>>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 >>>> but here it's completely wrong. It will be removed in v2. >>> >>> Hmm. But if I read this commit correctly then we really need to check >>> pid_ns->child_reaper != NULL ? >>> >>> Currently we can't pick an "empty" pid_ns. But after the commit above a task >>> can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its >>> /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ? >> >> Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(), >> because pid_for_children is available to open only after the 1st alloc_pid(). >> So, it's impossible to call ioctl() on it. > > Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get(). > > But note that it doesn't need tasklist_lock too. 
Hm, are there possible strange situations with memory ordering, where we see the ns->child_reaper of an already dead ns that was placed in the same memory? Do we have to use some memory barriers here?
On 27.04.2017 19:39, Eric W. Biederman wrote: > Kirill Tkhai <ktkhai@virtuozzo.com> writes: > >> On 27.04.2017 19:12, Oleg Nesterov wrote: >>> On 04/26, Kirill Tkhai wrote: >>>> >>>> On 26.04.2017 18:53, Oleg Nesterov wrote: >>>>> >>>>>> +static long set_last_pid_vec(struct pid_namespace *pid_ns, >>>>>> + struct pidns_ioc_req *req) >>>>>> +{ >>>>>> + char *str, *p; >>>>>> + int ret = 0; >>>>>> + pid_t pid; >>>>>> + >>>>>> + read_lock(&tasklist_lock); >>>>>> + if (!pid_ns->child_reaper) >>>>>> + ret = -EINVAL; >>>>>> + read_unlock(&tasklist_lock); >>>>>> + if (ret) >>>>>> + return ret; >>>>> >>>>> why do you need to check ->child_reaper under tasklist_lock? this looks pointless. >>>>> >>>>> In fact I do not understand how it is possible to hit pid_ns->child_reaper == NULL, >>>>> there must be at least one task in this namespace, otherwise you can't open a file >>>>> which has f_op == ns_file_operations, no? >>>> >>>> Sure, it's impossible to pick a pid_ns, if there is no the pid_ns's tasks. I added >>>> it under impression of >>>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dfda351c729733a401981e8738ce497eaffcaa00 >>>> but here it's completely wrong. It will be removed in v2. >>> >>> Hmm. But if I read this commit correctly then we really need to check >>> pid_ns->child_reaper != NULL ? >>> >>> Currently we can't pick an "empty" pid_ns. But after the commit above a task >>> can do sys_unshare(CLONE_NEWPID), another (or the same) task can open its >>> /proc/$pid/ns/pid_for_children and call ns_ioctl() before the 1st alloc_pid() ? >> >> Another task can't open /proc/$pid/ns/pid_for_children before the 1st alloc_pid(), >> because pid_for_children is available to open only after the 1st alloc_pid(). >> So, it's impossible to call ioctl() on it. > > That sounds reasonable. > > There is definitely the chance of the child_reaper dying after we have > joined a pid namespace. So child_reaper can be stale if not NULL. 
> > As long as we don't mess up the first pid allocation I don't > see any reason why we should care about last_pid in a pid_namespace. > And this ioctl can be used to set all of the other pids on the first > pid allocation by calling it in the parent pid namespace. > > There is still the chance of racing with a pid reaper dying. Why do we > care about child_reaper in this case? > > Changing last_pid is completely pointless if child_reaper is dead or > missing but why would we care? I agree with you, there is no reason we should care about a dead child_reaper. The protection is already in place in pidns_for_children_get(). It is only needed to prohibit creation of the first task with pid != 1, which would lead to a child_reaper-less pid namespace.
sorry for delay, vacation... On 04/28, Kirill Tkhai wrote: > > On 27.04.2017 19:22, Oleg Nesterov wrote: > > > > Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get(). > > > > But note that it doesn't need tasklist_lock too. > > Hm, are there possible strange situations with memory ordering, when we see > ns->child_reaper of already died ns, which was placed in the same memory? > Do we have to use some memory barriers here? Could you spell please? I don't understand your concerns... I don't see how, say, static struct ns_common *pidns_for_children_get(struct task_struct *task) { struct ns_common *ns = NULL; struct pid_namespace *pid_ns; task_lock(task); if (task->nsproxy) { pid_ns = task->nsproxy->pid_ns_for_children; if (pid_ns->child_reaper) { ns = &pid_ns->ns; get_pid_ns(ns); } } task_unlock(task); return ns; } can be wrong. It also looks more clean to me. ->child_reaper is not stable without tasklist, it can be dead/etc, but we do not care? Oleg.
Oleg Nesterov <oleg@redhat.com> writes: > sorry for delay, vacation... > > On 04/28, Kirill Tkhai wrote: >> >> On 27.04.2017 19:22, Oleg Nesterov wrote: >> > >> > Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get(). >> > >> > But note that it doesn't need tasklist_lock too. >> >> Hm, are there possible strange situations with memory ordering, when we see >> ns->child_reaper of already died ns, which was placed in the same memory? >> Do we have to use some memory barriers here? > > Could you spell please? I don't understand your concerns... > > I don't see how, say, > > static struct ns_common *pidns_for_children_get(struct task_struct *task) > { > struct ns_common *ns = NULL; > struct pid_namespace *pid_ns; > > task_lock(task); > if (task->nsproxy) { > pid_ns = task->nsproxy->pid_ns_for_children; > if (pid_ns->child_reaper) { > ns = &pid_ns->ns; > get_pid_ns(ns); > } > } > task_unlock(task); > > return ns; > } > > can be wrong. It also looks more clean to me. > > ->child_reaper is not stable without tasklist, it can be dead/etc, but > we do not care? It breaks a number of assumptions if you can join a pid namespace before an init process is created in that pid namespace. Checking for child_reaper is a bit heavy handed but appears to ensure all of the assumptions of initial pid namespace creation have been met. Which means your simplified pidns_for_children_get is a bit insufficient. Eric
On 02.05.2017 19:33, Oleg Nesterov wrote: > sorry for delay, vacation... > > On 04/28, Kirill Tkhai wrote: >> >> On 27.04.2017 19:22, Oleg Nesterov wrote: >>> >>> Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get(). >>> >>> But note that it doesn't need tasklist_lock too. >> >> Hm, are there possible strange situations with memory ordering, when we see >> ns->child_reaper of already died ns, which was placed in the same memory? >> Do we have to use some memory barriers here? > > Could you spell please? I don't understand your concerns... > > I don't see how, say, > > static struct ns_common *pidns_for_children_get(struct task_struct *task) > { > struct ns_common *ns = NULL; > struct pid_namespace *pid_ns; > > task_lock(task); > if (task->nsproxy) { > pid_ns = task->nsproxy->pid_ns_for_children; > if (pid_ns->child_reaper) { > ns = &pid_ns->ns; > get_pid_ns(ns); > } > } > task_unlock(task); > > return ns; > } > > can be wrong. It also looks more clean to me. > > ->child_reaper is not stable without tasklist, it can be dead/etc, but > we do not care? I mean the following. We had a pid_ns1 with a child_reaper set. Then it became dead, and a new pid_ns2 were allocated in the same memory. A task on another cpu opens the pid_for_children file, and because of there is no memory ordering, it sees pid_ns1->child_reaper, when it opens pid_ns2. I forgot, what guarantees this situation is impossible? What guarantees, the renewed content of pid_ns2 on another cpu is seen not later, than we can't open it?
Kirill Tkhai <ktkhai@virtuozzo.com> writes: > On 02.05.2017 19:33, Oleg Nesterov wrote: >> sorry for delay, vacation... >> >> On 04/28, Kirill Tkhai wrote: >>> >>> On 27.04.2017 19:22, Oleg Nesterov wrote: >>>> >>>> Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get(). >>>> >>>> But note that it doesn't need tasklist_lock too. >>> >>> Hm, are there possible strange situations with memory ordering, when we see >>> ns->child_reaper of already died ns, which was placed in the same memory? >>> Do we have to use some memory barriers here? >> >> Could you spell please? I don't understand your concerns... >> >> I don't see how, say, >> >> static struct ns_common *pidns_for_children_get(struct task_struct *task) >> { >> struct ns_common *ns = NULL; >> struct pid_namespace *pid_ns; >> >> task_lock(task); >> if (task->nsproxy) { >> pid_ns = task->nsproxy->pid_ns_for_children; >> if (pid_ns->child_reaper) { ^^^^^^^^^^^^^^^^^^^^ Oleg my apologies I missed this line earlier. This does look like a valid way to skip read_lock(&tasklist_lock); >> ns = &pid_ns->ns; >> get_pid_ns(ns); ^^^^^^^^^^^^^ This needs to be: get_pid_ns(pid_ns); >> } >> } >> task_unlock(task); >> >> return ns; >> } >> >> can be wrong. It also looks more clean to me. >> >> ->child_reaper is not stable without tasklist, it can be dead/etc, but >> we do not care? > > I mean the following. We had a pid_ns1 with a child_reaper set. Then > it became dead, and a new pid_ns2 were allocated in the same memory. task->nsproxy->pid_ns_for_children is always changed with task_lock(task) held. See switch_task_namespaces (used by unshare and setns). This also gives us the guarantee that the pid_ns reference won't be freed/reused in any for until task_lock(task) is dropped. > A task on another cpu opens the pid_for_children file, and because > of there is no memory ordering, it sees pid_ns1->child_reaper, > when it opens pid_ns2. > > I forgot, what guarantees this situation is impossible? 
What guarantees, > the renewed content of pid_ns2 on another cpu is seen not later, than > we can't open it? Eric
On 03.05.2017 00:13, Eric W. Biederman wrote: > Kirill Tkhai <ktkhai@virtuozzo.com> writes: > >> On 02.05.2017 19:33, Oleg Nesterov wrote: >>> sorry for delay, vacation... >>> >>> On 04/28, Kirill Tkhai wrote: >>>> >>>> On 27.04.2017 19:22, Oleg Nesterov wrote: >>>>> >>>>> Ah, OK, I didn't notice the ns->child_reaper check in pidns_for_children_get(). >>>>> >>>>> But note that it doesn't need tasklist_lock too. >>>> >>>> Hm, are there possible strange situations with memory ordering, when we see >>>> ns->child_reaper of already died ns, which was placed in the same memory? >>>> Do we have to use some memory barriers here? >>> >>> Could you spell please? I don't understand your concerns... >>> >>> I don't see how, say, >>> >>> static struct ns_common *pidns_for_children_get(struct task_struct *task) >>> { >>> struct ns_common *ns = NULL; >>> struct pid_namespace *pid_ns; >>> >>> task_lock(task); >>> if (task->nsproxy) { >>> pid_ns = task->nsproxy->pid_ns_for_children; >>> if (pid_ns->child_reaper) { > ^^^^^^^^^^^^^^^^^^^^ > Oleg my apologies I missed this line earlier. > This does look like a valid way to skip read_lock(&tasklist_lock); >>> ns = &pid_ns->ns; >>> get_pid_ns(ns); > ^^^^^^^^^^^^^ This needs to be: > get_pid_ns(pid_ns); > >>> } >>> } >>> task_unlock(task); >>> >>> return ns; >>> } >>> >>> can be wrong. It also looks more clean to me. >>> >>> ->child_reaper is not stable without tasklist, it can be dead/etc, but >>> we do not care? >> >> I mean the following. We had a pid_ns1 with a child_reaper set. Then >> it became dead, and a new pid_ns2 were allocated in the same memory. > > task->nsproxy->pid_ns_for_children is always changed with > task_lock(task) held. See switch_task_namespaces (used by unshare and > setns). This also gives us the guarantee that the pid_ns reference > won't be freed/reused in any for until task_lock(task) is dropped. 
Now I've checked kmem_cache_zalloc() and it looks like it zeroes the cache memory content synchronously on allocation (it seems there is no pre-zeroed memory for GFP_ZERO cases). So, the zeroing happens before switch_task_namespaces() (and task_unlock()) and we're really safe after task_lock() in pidns_for_children_get(). Ok, I'll send a new version of the patchset.
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 544bbb661475..37bb4af917b5 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -17,4 +17,13 @@
 /* Execute namespace-specific ioctl */
 #define NS_SPECIFIC_IOC _IO(NSIO, 0x5)
 
+struct pidns_ioc_req {
+/* Set vector of last pids in namespace hierarchy */
+#define PIDNS_REQ_SET_LAST_PID_VEC	0x1
+	unsigned int req;
+	void __user *data;
+	unsigned int data_size;
+	char std_fields[0];
+};
+
 #endif /* __LINUX_NSFS_H */
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index de461aa0bf9a..0e86fa15cd92 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,8 @@
 #include <linux/export.h>
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
+#include <linux/vmalloc.h>
+#include <uapi/linux/nsfs.h>
 
 struct pid_cache {
 	int nr_ids;
@@ -428,6 +430,91 @@ static struct ns_common *pidns_get_parent(struct ns_common *ns)
 	return &get_pid_ns(pid_ns)->ns;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * Parse the ":"-delimited pid vector at req->data and store each value in
+ * ->last_pid, walking from @pid_ns up through its parents. The caller needs
+ * CAP_SYS_ADMIN in the user_ns of every namespace that is written.
+ */
+static long set_last_pid_vec(struct pid_namespace *pid_ns,
+			     struct pidns_ioc_req *req)
+{
+	char *str, *p;
+	int ret = 0;
+	pid_t pid;
+
+	/* A NULL test needs no tasklist_lock: the result is only a snapshot */
+	if (!pid_ns->child_reaper)
+		return -EINVAL;
+
+	if (req->data_size >= PAGE_SIZE)
+		return -EINVAL;
+	str = vmalloc(req->data_size + 1);
+	if (!str)
+		return -ENOMEM;
+	if (copy_from_user(str, req->data, req->data_size)) {
+		ret = -EFAULT;
+		goto out_vfree;
+	}
+	str[req->data_size] = '\0';
+
+	p = str;
+	while (p && *p != '\0') {
+		if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) {
+			ret = -EPERM;
+			goto out_vfree;
+		}
+
+		if (sscanf(p, "%d", &pid) != 1 || pid < 0 || pid > pid_max) {
+			ret = -EINVAL;
+			goto out_vfree;
+		}
+
+		/* Write directly: see the comment in pid_ns_ctl_handler() */
+		pid_ns->last_pid = pid;
+
+		p = strchr(p, ':');
+		pid_ns = pid_ns->parent;
+		if (p) {
+			if (!pid_ns) {
+				ret = -EINVAL;
+				goto out_vfree;
+			}
+			p++;
+		}
+	}
+
+out_vfree:
+	vfree(str);
+	return ret;
+}
+#else /* CONFIG_CHECKPOINT_RESTORE */
+static long set_last_pid_vec(struct pid_namespace *pid_ns,
+			     struct pidns_ioc_req *req)
+{
+	return -ENOTTY;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+/* Handle a pid namespace specific ioctl; @arg points to a pidns_ioc_req */
+static long pidns_ioctl(struct ns_common *ns, unsigned long arg)
+{
+	struct pid_namespace *pid_ns = to_pid_ns(ns);
+	struct pidns_ioc_req user_req;
+
+	/* copy_from_user() returns the number of bytes left, not an errno */
+	if (copy_from_user(&user_req, (void __user *)arg,
+			   offsetof(struct pidns_ioc_req, std_fields)))
+		return -EFAULT;
+
+	switch (user_req.req) {
+	case PIDNS_REQ_SET_LAST_PID_VEC:
+		return set_last_pid_vec(pid_ns, &user_req);
+	default:
+		return -ENOTTY;
+	}
+}
+
 static struct user_namespace *pidns_owner(struct ns_common *ns)
 {
 	return to_pid_ns(ns)->user_ns;
@@ -441,6 +528,7 @@ const struct proc_ns_operations pidns_operations = {
 	.install	= pidns_install,
 	.owner		= pidns_owner,
 	.get_parent	= pidns_get_parent,
+	.ns_ioctl	= pidns_ioctl,
 };
 
 static __init int pid_namespaces_init(void)
While implementing nested pid namespace support in CRIU (the checkpoint-restore in userspace tool) we ran into the situation that it's impossible to create a task with a specific NSpid efficiently. After commit 49f4d8b93ccf "pidns: Capture the user namespace and filter ns_last_pid" it is impossible to set ns_last_pid on any pid namespace except the task's active pid_ns (before the commit it was possible to write to pid_ns_for_children). Thus, if a restored task in a container has more than one pid_ns level, the restorer code must have a helper task for every pid namespace of the task's pid_ns hierarchy. This is a big problem, because communication with a helper for every pid_ns in the hierarchy is not cheap and performs poorly, as it implies many helper wakeups to create a single task (regardless of how you communicate with the helpers). This patch tries to solve the problem. It introduces a new pid_ns ns_ioctl(PIDNS_REQ_SET_LAST_PID_VEC), which allows writing a vector of last pids to the pid_ns hierarchy. The vector is passed as a ":"-delimited string of pids, written in reverse order. The first number corresponds to the opened namespace's ns_last_pid, the second to its parent's, etc. So, if you have a pid namespace hierarchy like: pid_ns1 (grand father) | v pid_ns2 (father) | v pid_ns3 (child) and the ns of a task of pid_ns3 is open, then the corresponding vector will be "last_ns_pid3:last_ns_pid2:last_ns_pid1". This vector may be shorter and contain fewer levels, for example, "last_ns_pid3:last_ns_pid2" or even "last_ns_pid3", depending on which levels you want to populate. To write to a pid_ns's ns_last_pid we check that the writer task has CAP_SYS_ADMIN permissions in this pid_ns's user_ns. One note about struct pidns_ioc_req: it's made extensible and may be expanded in the future. Only the always-present fields exist at the moment; future fields and their sizes may be determined from pidns_ioc_req::req by future code.
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com> --- include/uapi/linux/nsfs.h | 9 +++++ kernel/pid_namespace.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+)