diff mbox

[2/2] pidns: Expose task pid_ns_for_children to userspace

Message ID 148440329770.30622.16593902895676160550.stgit@localhost.localdomain (mailing list archive)
State New, archived
Headers show

Commit Message

Kirill Tkhai Jan. 14, 2017, 2:15 p.m. UTC
For correct checkpointing/restoring of a task from userspace,
it is necessary to know the task's pid_ns_for_children. Currently,
there is no sane way to do that (the only possible trick
is to force the task to create a new child and to analyze the
child's /proc/[pid]/ns/pid link, which is very inefficient).

The patch exposes pid_ns_for_children in the ns directory
in the standard way, under the name "pid_for_children":

~# ls /proc/5531/ns -l | grep pid
lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836]
lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286]

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 fs/proc/namespaces.c    |    1 +
 include/linux/proc_ns.h |    1 +
 kernel/pid_namespace.c  |   25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+)


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Andrey Vagin Jan. 17, 2017, 8 p.m. UTC | #1
On Sat, Jan 14, 2017 at 05:15:04PM +0300, Kirill Tkhai wrote:
> For correct checkpointing/restoring of a task from userspace
> it's need to know the task's pid_ns_for_children. Currently,
> there is no a sane way to do that (the only possible trick
> is to force the task create a new child and to analize the
> child's /proc/[pid]/ns/pid link, that is performance-stupid).
> 
> The patch exposes pid_ns_for_children to ns directory
> in standard way with the name "pid_for_children":
> 
> ~# ls /proc/5531/ns -l | grep pid
> lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836]
> lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286]
>

Cc: linux-api, Michael Kerrisk

Acked-by: Andrei Vagin <avagin@virtuozzo.com>

> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  fs/proc/namespaces.c    |    1 +
>  include/linux/proc_ns.h |    1 +
>  kernel/pid_namespace.c  |   25 +++++++++++++++++++++++++
>  3 files changed, 27 insertions(+)
> 
> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
> index 766f0c637ad1..3803b24ca220 100644
> --- a/fs/proc/namespaces.c
> +++ b/fs/proc/namespaces.c
> @@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = {
>  #endif
>  #ifdef CONFIG_PID_NS
>  	&pidns_operations,
> +	&pidns_for_children_operations,
>  #endif
>  #ifdef CONFIG_USER_NS
>  	&userns_operations,
> diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
> index 88dba3b53375..58ab28d81fc2 100644
> --- a/include/linux/proc_ns.h
> +++ b/include/linux/proc_ns.h
> @@ -27,6 +27,7 @@ extern const struct proc_ns_operations netns_operations;
>  extern const struct proc_ns_operations utsns_operations;
>  extern const struct proc_ns_operations ipcns_operations;
>  extern const struct proc_ns_operations pidns_operations;
> +extern const struct proc_ns_operations pidns_for_children_operations;
>  extern const struct proc_ns_operations userns_operations;
>  extern const struct proc_ns_operations mntns_operations;
>  extern const struct proc_ns_operations cgroupns_operations;
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index df9e8e9e0be7..cbe950d4a11e 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -369,6 +369,20 @@ static struct ns_common *pidns_get(struct task_struct *task)
>  	return ns ? &ns->ns : NULL;
>  }
>  
> +static struct ns_common *pidns_for_children_get(struct task_struct *task)
> +{
> +	struct pid_namespace *ns = NULL;
> +
> +	task_lock(task);
> +	if (task->nsproxy) {
> +		ns = task->nsproxy->pid_ns_for_children;
> +		get_pid_ns(ns);
> +	}
> +	task_unlock(task);
> +
> +	return ns ? &ns->ns : NULL;
> +}
> +
>  static void pidns_put(struct ns_common *ns)
>  {
>  	put_pid_ns(to_pid_ns(ns));
> @@ -438,6 +452,17 @@ const struct proc_ns_operations pidns_operations = {
>  	.get_parent	= pidns_get_parent,
>  };
>  
> +const struct proc_ns_operations pidns_for_children_operations = {
> +	.name		= "pid_for_children",
> +	.real_ns_name	= "pid",
> +	.type		= CLONE_NEWPID,
> +	.get		= pidns_for_children_get,
> +	.put		= pidns_put,
> +	.install	= pidns_install,
> +	.owner		= pidns_owner,
> +	.get_parent	= pidns_get_parent,
> +};
> +
>  static __init int pid_namespaces_init(void)
>  {
>  	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alban Crequy Jan. 23, 2017, 9:49 p.m. UTC | #2
On 14 January 2017 at 15:15, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> For correct checkpointing/restoring of a task from userspace
> it's need to know the task's pid_ns_for_children. Currently,
> there is no a sane way to do that (the only possible trick
> is to force the task create a new child and to analize the
> child's /proc/[pid]/ns/pid link, that is performance-stupid).
>
> The patch exposes pid_ns_for_children to ns directory
> in standard way with the name "pid_for_children":
>
> ~# ls /proc/5531/ns -l | grep pid
> lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836]
> lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286]
>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>

What happens if a process, after unsharing CLONE_NEWPID, does not
fork but instead lets another process open the new "pid_for_children"
link and then call setns()+fork()? Is that other process allowed to
create the "pid 1" in the new pid namespace? Is that also allowed if the
other process lives in a sibling pid namespace? If so, that would break
what pid_namespaces(7) says:

    "the parental relationship between processes mirrors the parental
     relationship between PID namespaces: the parent of a process is
     either in the same namespace or resides in the immediate parent
     PID namespace."
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirill Tkhai Jan. 24, 2017, 8:35 a.m. UTC | #3
On 24.01.2017 00:49, Alban Crequy wrote:
> On 14 January 2017 at 15:15, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>> For correct checkpointing/restoring of a task from userspace
>> it's need to know the task's pid_ns_for_children. Currently,
>> there is no a sane way to do that (the only possible trick
>> is to force the task create a new child and to analize the
>> child's /proc/[pid]/ns/pid link, that is performance-stupid).
>>
>> The patch exposes pid_ns_for_children to ns directory
>> in standard way with the name "pid_for_children":
>>
>> ~# ls /proc/5531/ns -l | grep pid
>> lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836]
>> lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286]
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> 
> What's happening if a process, after unsharing CLONE_NEWPID, does not
> fork but instead let another process open the new "pid_for_children"
> and then setns()+fork()? Is that other process allowed to create the
> "pid 1" in the new pid namespaces? Is that also allowed if the other
> process lives in a sibling pid namespace? If so, that would break what
> pid_namespaces(7) says:
> 
>     "the parental relationship between processes mirrors the parental
>      relationship between PID namespaces: the parent of a process is
>      either in the same namespace or resides in the immediate parent
>      PID namespace."
> 

You can setns() into a pid_ns only if your active pid_ns is a (grand)parent
of the target pid_ns. So the situation you described is not possible.
See pidns_install() for the details.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirill Tkhai Jan. 30, 2017, 3:10 p.m. UTC | #4
ping

On 14.01.2017 17:15, Kirill Tkhai wrote:
> For correct checkpointing/restoring of a task from userspace
> it's need to know the task's pid_ns_for_children. Currently,
> there is no a sane way to do that (the only possible trick
> is to force the task create a new child and to analize the
> child's /proc/[pid]/ns/pid link, that is performance-stupid).
> 
> The patch exposes pid_ns_for_children to ns directory
> in standard way with the name "pid_for_children":
> 
> ~# ls /proc/5531/ns -l | grep pid
> lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836]
> lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286]
> 
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  fs/proc/namespaces.c    |    1 +
>  include/linux/proc_ns.h |    1 +
>  kernel/pid_namespace.c  |   25 +++++++++++++++++++++++++
>  3 files changed, 27 insertions(+)
> 
> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
> index 766f0c637ad1..3803b24ca220 100644
> --- a/fs/proc/namespaces.c
> +++ b/fs/proc/namespaces.c
> @@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = {
>  #endif
>  #ifdef CONFIG_PID_NS
>  	&pidns_operations,
> +	&pidns_for_children_operations,
>  #endif
>  #ifdef CONFIG_USER_NS
>  	&userns_operations,
> diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
> index 88dba3b53375..58ab28d81fc2 100644
> --- a/include/linux/proc_ns.h
> +++ b/include/linux/proc_ns.h
> @@ -27,6 +27,7 @@ extern const struct proc_ns_operations netns_operations;
>  extern const struct proc_ns_operations utsns_operations;
>  extern const struct proc_ns_operations ipcns_operations;
>  extern const struct proc_ns_operations pidns_operations;
> +extern const struct proc_ns_operations pidns_for_children_operations;
>  extern const struct proc_ns_operations userns_operations;
>  extern const struct proc_ns_operations mntns_operations;
>  extern const struct proc_ns_operations cgroupns_operations;
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index df9e8e9e0be7..cbe950d4a11e 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -369,6 +369,20 @@ static struct ns_common *pidns_get(struct task_struct *task)
>  	return ns ? &ns->ns : NULL;
>  }
>  
> +static struct ns_common *pidns_for_children_get(struct task_struct *task)
> +{
> +	struct pid_namespace *ns = NULL;
> +
> +	task_lock(task);
> +	if (task->nsproxy) {
> +		ns = task->nsproxy->pid_ns_for_children;
> +		get_pid_ns(ns);
> +	}
> +	task_unlock(task);
> +
> +	return ns ? &ns->ns : NULL;
> +}
> +
>  static void pidns_put(struct ns_common *ns)
>  {
>  	put_pid_ns(to_pid_ns(ns));
> @@ -438,6 +452,17 @@ const struct proc_ns_operations pidns_operations = {
>  	.get_parent	= pidns_get_parent,
>  };
>  
> +const struct proc_ns_operations pidns_for_children_operations = {
> +	.name		= "pid_for_children",
> +	.real_ns_name	= "pid",
> +	.type		= CLONE_NEWPID,
> +	.get		= pidns_for_children_get,
> +	.put		= pidns_put,
> +	.install	= pidns_install,
> +	.owner		= pidns_owner,
> +	.get_parent	= pidns_get_parent,
> +};
> +
>  static __init int pid_namespaces_init(void)
>  {
>  	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 766f0c637ad1..3803b24ca220 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -23,6 +23,7 @@  static const struct proc_ns_operations *ns_entries[] = {
 #endif
 #ifdef CONFIG_PID_NS
 	&pidns_operations,
+	&pidns_for_children_operations,
 #endif
 #ifdef CONFIG_USER_NS
 	&userns_operations,
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 88dba3b53375..58ab28d81fc2 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -27,6 +27,7 @@  extern const struct proc_ns_operations netns_operations;
 extern const struct proc_ns_operations utsns_operations;
 extern const struct proc_ns_operations ipcns_operations;
 extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
 extern const struct proc_ns_operations userns_operations;
 extern const struct proc_ns_operations mntns_operations;
 extern const struct proc_ns_operations cgroupns_operations;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index df9e8e9e0be7..cbe950d4a11e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -369,6 +369,20 @@  static struct ns_common *pidns_get(struct task_struct *task)
 	return ns ? &ns->ns : NULL;
 }
 
+static struct ns_common *pidns_for_children_get(struct task_struct *task)
+{
+	struct pid_namespace *ns = NULL;
+
+	task_lock(task);
+	if (task->nsproxy) {
+		ns = task->nsproxy->pid_ns_for_children;
+		get_pid_ns(ns);
+	}
+	task_unlock(task);
+
+	return ns ? &ns->ns : NULL;
+}
+
 static void pidns_put(struct ns_common *ns)
 {
 	put_pid_ns(to_pid_ns(ns));
@@ -438,6 +452,17 @@  const struct proc_ns_operations pidns_operations = {
 	.get_parent	= pidns_get_parent,
 };
 
+const struct proc_ns_operations pidns_for_children_operations = {
+	.name		= "pid_for_children",
+	.real_ns_name	= "pid",
+	.type		= CLONE_NEWPID,
+	.get		= pidns_for_children_get,
+	.put		= pidns_put,
+	.install	= pidns_install,
+	.owner		= pidns_owner,
+	.get_parent	= pidns_get_parent,
+};
+
 static __init int pid_namespaces_init(void)
 {
 	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);