Message ID | 20241105031024.3866383-1-yun.zhou@windriver.com (mailing list archive) |
---|---|
State | Handled Elsewhere |
Headers | show |
Series | [v2] kernel: add pid_max to pid_namespace | expand |
On Tue, 5 Nov 2024 11:10:24 +0800 Yun Zhou <yun.zhou@windriver.com> wrote: > It is necessary to have a different pid_max in different containers. > For example, multiple containers are running on a host, one of which > is Android, and its 32 bit bionic libc only accepts pid <= 65535. So > it requires the global pid_max <= 65535. This will cause configuration > conflicts with other containers and also limit the maximum number of > tasks for the entire system. > > Signed-off-by: Yun Zhou <yun.zhou@windriver.com> Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org> -- Steve > --- > - Remove sentinels from ctl_table arrays. > v1 - https://lore.kernel.org/all/20241030052933.1041408-1-yun.zhou@windriver.com/ > --- > include/linux/pid_namespace.h | 1 + > kernel/pid.c | 12 +++++------ > kernel/pid_namespace.c | 34 ++++++++++++++++++++++++++----- > kernel/sysctl.c | 9 -------- > kernel/trace/pid_list.c | 2 +- > kernel/trace/trace.h | 2 -- > kernel/trace/trace_sched_switch.c | 2 +- > 7 files changed, 38 insertions(+), 24 deletions(-)
On Tue, Nov 05, 2024 at 11:10:24AM +0800, Yun Zhou wrote: > It is necessary to have a different pid_max in different containers. > For example, multiple containers are running on a host, one of which > is Android, and its 32 bit bionic libc only accepts pid <= 65535. So > it requires the global pid_max <= 65535. This will cause configuration > conflicts with other containers and also limit the maximum number of > tasks for the entire system. > > Signed-off-by: Yun Zhou <yun.zhou@windriver.com> > --- > - Remove sentinels from ctl_table arrays. > v1 - https://lore.kernel.org/all/20241030052933.1041408-1-yun.zhou@windriver.com/ > --- > include/linux/pid_namespace.h | 1 + > kernel/pid.c | 12 +++++------ > kernel/pid_namespace.c | 34 ++++++++++++++++++++++++++----- > kernel/sysctl.c | 9 -------- > kernel/trace/pid_list.c | 2 +- > kernel/trace/trace.h | 2 -- > kernel/trace/trace_sched_switch.c | 2 +- > 7 files changed, 38 insertions(+), 24 deletions(-) ... > diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c > index d70ab49d5b4a..a5a8254825d5 100644 > --- a/kernel/pid_namespace.c > +++ b/kernel/pid_namespace.c > @@ -111,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > ns->user_ns = get_user_ns(user_ns); > ns->ucounts = ucounts; > ns->pid_allocated = PIDNS_ADDING; > + ns->pid_max = parent_pid_ns->pid_max; > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); > #endif > @@ -280,19 +281,44 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, > > return ret; > } > +#endif /* CONFIG_CHECKPOINT_RESTORE */ > + > +static int pid_max_ns_ctl_handler(const struct ctl_table *table, int write, > + void *buffer, size_t *lenp, loff_t *ppos) > +{ > + struct pid_namespace *pid_ns = task_active_pid_ns(current); > + struct ctl_table tmp = *table; > + > + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) > + return -EPERM; > + > 
+ tmp.data = &pid_ns->pid_max; > + if (pid_ns->parent) > + tmp.extra2 = &pid_ns->parent->pid_max; > + > + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); > +} > > -extern int pid_max; > static struct ctl_table pid_ns_ctl_table[] = { > +#ifdef CONFIG_CHECKPOINT_RESTORE > { > .procname = "ns_last_pid", > .maxlen = sizeof(int), > .mode = 0666, /* permissions are checked in the handler */ > .proc_handler = pid_ns_ctl_handler, > .extra1 = SYSCTL_ZERO, > - .extra2 = &pid_max, > + .extra2 = &init_pid_ns.pid_max, > }, > -}; > #endif /* CONFIG_CHECKPOINT_RESTORE */ > + { > + .procname = "pid_max", > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = pid_max_ns_ctl_handler, > + .extra1 = &pid_max_min, > + .extra2 = &pid_max_max, > + }, > +}; I see here that the sysctls are without sentinel. Reviewed-by: Joel Granados <joel.granados@kernel.org>
On Tue, Nov 05, 2024 at 11:10:24AM +0800, Yun Zhou wrote: > It is necessary to have a different pid_max in different containers. > For example, multiple containers are running on a host, one of which > is Android, and its 32 bit bionic libc only accepts pid <= 65535. So > it requires the global pid_max <= 65535. This will cause configuration > conflicts with other containers and also limit the maximum number of > tasks for the entire system. > > Signed-off-by: Yun Zhou <yun.zhou@windriver.com> > --- Fwiw, I've done a patch like this years ago and then Alex revived it in [1] including selftests! There's downsides to consider: [1]: https://lore.kernel.org/lkml/20240222160915.315255-1-aleksandr.mikhalitsyn@canonical.com > - Remove sentinels from ctl_table arrays. > v1 - https://lore.kernel.org/all/20241030052933.1041408-1-yun.zhou@windriver.com/ > --- > include/linux/pid_namespace.h | 1 + > kernel/pid.c | 12 +++++------ > kernel/pid_namespace.c | 34 ++++++++++++++++++++++++++----- > kernel/sysctl.c | 9 -------- > kernel/trace/pid_list.c | 2 +- > kernel/trace/trace.h | 2 -- > kernel/trace/trace_sched_switch.c | 2 +- > 7 files changed, 38 insertions(+), 24 deletions(-) > > diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h > index f9f9931e02d6..064cfe2542fc 100644 > --- a/include/linux/pid_namespace.h > +++ b/include/linux/pid_namespace.h > @@ -38,6 +38,7 @@ struct pid_namespace { > struct ucounts *ucounts; > int reboot; /* group exit code if this pidns was rebooted */ > struct ns_common ns; > + int pid_max; > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > int memfd_noexec_scope; > #endif > diff --git a/kernel/pid.c b/kernel/pid.c > index 2715afb77eab..f8026a61436b 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -60,8 +60,6 @@ struct pid init_struct_pid = { > }, } > }; > > -int pid_max = PID_MAX_DEFAULT; > - > int pid_max_min = RESERVED_PIDS + 1; > int pid_max_max = PID_MAX_LIMIT; > /* > @@ -78,6 +76,7 @@ static u64 pidfs_ino 
= RESERVED_PIDS; > */ > struct pid_namespace init_pid_ns = { > .ns.count = REFCOUNT_INIT(2), > + .pid_max = PID_MAX_DEFAULT, > .idr = IDR_INIT(init_pid_ns.idr), > .pid_allocated = PIDNS_ADDING, > .level = 0, > @@ -198,7 +197,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > tid = set_tid[ns->level - i]; > > retval = -EINVAL; > - if (tid < 1 || tid >= pid_max) > + if (tid < 1 || tid >= tmp->pid_max) > goto out_free; > /* > * Also fail if a PID != 1 is requested and > @@ -238,7 +237,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > * a partially initialized PID (see below). > */ > nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, > - pid_max, GFP_ATOMIC); > + tmp->pid_max, GFP_ATOMIC); > } > spin_unlock_irq(&pidmap_lock); > idr_preload_end(); > @@ -653,11 +652,12 @@ void __init pid_idr_init(void) > BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); > > /* bump default and minimum pid_max based on number of cpus */ > - pid_max = min(pid_max_max, max_t(int, pid_max, > + init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max, > PIDS_PER_CPU_DEFAULT * num_possible_cpus())); > pid_max_min = max_t(int, pid_max_min, > PIDS_PER_CPU_MIN * num_possible_cpus()); > - pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); > + pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, > + pid_max_min); > > idr_init(&init_pid_ns.idr); > > diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c > index d70ab49d5b4a..a5a8254825d5 100644 > --- a/kernel/pid_namespace.c > +++ b/kernel/pid_namespace.c > @@ -111,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > ns->user_ns = get_user_ns(user_ns); > ns->ucounts = ucounts; > ns->pid_allocated = PIDNS_ADDING; > + ns->pid_max = parent_pid_ns->pid_max; > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); > #endif > @@ -280,19 +281,44 @@ static int 
pid_ns_ctl_handler(const struct ctl_table *table, int write, > > return ret; > } > +#endif /* CONFIG_CHECKPOINT_RESTORE */ > + > +static int pid_max_ns_ctl_handler(const struct ctl_table *table, int write, > + void *buffer, size_t *lenp, loff_t *ppos) > +{ > + struct pid_namespace *pid_ns = task_active_pid_ns(current); > + struct ctl_table tmp = *table; > + > + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) > + return -EPERM; > + > + tmp.data = &pid_ns->pid_max; > + if (pid_ns->parent) > + tmp.extra2 = &pid_ns->parent->pid_max; > + > + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); > +} > > -extern int pid_max; > static struct ctl_table pid_ns_ctl_table[] = { > +#ifdef CONFIG_CHECKPOINT_RESTORE > { > .procname = "ns_last_pid", > .maxlen = sizeof(int), > .mode = 0666, /* permissions are checked in the handler */ > .proc_handler = pid_ns_ctl_handler, > .extra1 = SYSCTL_ZERO, > - .extra2 = &pid_max, > + .extra2 = &init_pid_ns.pid_max, > }, > -}; > #endif /* CONFIG_CHECKPOINT_RESTORE */ > + { > + .procname = "pid_max", > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = pid_max_ns_ctl_handler, > + .extra1 = &pid_max_min, > + .extra2 = &pid_max_max, > + }, > +}; > > int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) > { > @@ -449,9 +475,7 @@ static __init int pid_namespaces_init(void) > { > pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); > > -#ifdef CONFIG_CHECKPOINT_RESTORE > register_sysctl_init("kernel", pid_ns_ctl_table); > -#endif > > register_pid_ns_sysctl_table_vm(); > return 0; > diff --git a/kernel/sysctl.c b/kernel/sysctl.c > index 79e6cb1d5c48..676a0d675e7f 100644 > --- a/kernel/sysctl.c > +++ b/kernel/sysctl.c > @@ -1804,15 +1804,6 @@ static struct ctl_table kern_table[] = { > .proc_handler = proc_dointvec, > }, > #endif > - { > - .procname = "pid_max", > - .data = &pid_max, > - .maxlen = sizeof (int), > - .mode = 0644, > - .proc_handler = proc_dointvec_minmax, > - .extra1 = &pid_max_min, 
> - .extra2 = &pid_max_max, > - }, > { > .procname = "panic_on_oops", > .data = &panic_on_oops, > diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c > index 4966e6bbdf6f..c62b9b3cfb3d 100644 > --- a/kernel/trace/pid_list.c > +++ b/kernel/trace/pid_list.c > @@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) > int i; > > /* According to linux/thread.h, pids can be no bigger that 30 bits */ > - WARN_ON_ONCE(pid_max > (1 << 30)); > + WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30)); > > pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); > if (!pid_list) > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h > index c866991b9c78..e51851d64e4d 100644 > --- a/kernel/trace/trace.h > +++ b/kernel/trace/trace.h > @@ -715,8 +715,6 @@ extern unsigned long tracing_thresh; > > /* PID filtering */ > > -extern int pid_max; > - > bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, > pid_t search_pid); > bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, > diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c > index 8a407adb0e1c..c20c80abe065 100644 > --- a/kernel/trace/trace_sched_switch.c > +++ b/kernel/trace/trace_sched_switch.c > @@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void) > if (tgid_map) > return 0; > > - tgid_map_max = pid_max; > + tgid_map_max = init_pid_ns.pid_max; > map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), > GFP_KERNEL); > if (!map) > -- > 2.27.0 >
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index f9f9931e02d6..064cfe2542fc 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -38,6 +38,7 @@ struct pid_namespace { struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; + int pid_max; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) int memfd_noexec_scope; #endif diff --git a/kernel/pid.c b/kernel/pid.c index 2715afb77eab..f8026a61436b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -60,8 +60,6 @@ struct pid init_struct_pid = { }, } }; -int pid_max = PID_MAX_DEFAULT; - int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; /* @@ -78,6 +76,7 @@ static u64 pidfs_ino = RESERVED_PIDS; */ struct pid_namespace init_pid_ns = { .ns.count = REFCOUNT_INIT(2), + .pid_max = PID_MAX_DEFAULT, .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, @@ -198,7 +197,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tid = set_tid[ns->level - i]; retval = -EINVAL; - if (tid < 1 || tid >= pid_max) + if (tid < 1 || tid >= tmp->pid_max) goto out_free; /* * Also fail if a PID != 1 is requested and @@ -238,7 +237,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * a partially initialized PID (see below). 
*/ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, - pid_max, GFP_ATOMIC); + tmp->pid_max, GFP_ATOMIC); } spin_unlock_irq(&pidmap_lock); idr_preload_end(); @@ -653,11 +652,12 @@ void __init pid_idr_init(void) BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ - pid_max = min(pid_max_max, max_t(int, pid_max, + init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max, PIDS_PER_CPU_DEFAULT * num_possible_cpus())); pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus()); - pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); + pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, + pid_max_min); idr_init(&init_pid_ns.idr); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index d70ab49d5b4a..a5a8254825d5 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -111,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; + ns->pid_max = parent_pid_ns->pid_max; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif @@ -280,19 +281,44 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, return ret; } +#endif /* CONFIG_CHECKPOINT_RESTORE */ + +static int pid_max_ns_ctl_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(current); + struct ctl_table tmp = *table; + + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) + return -EPERM; + + tmp.data = &pid_ns->pid_max; + if (pid_ns->parent) + tmp.extra2 = &pid_ns->parent->pid_max; + + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); +} -extern int pid_max; static struct ctl_table pid_ns_ctl_table[] = { +#ifdef CONFIG_CHECKPOINT_RESTORE { 
.procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &pid_max, + .extra2 = &init_pid_ns.pid_max, }, -}; #endif /* CONFIG_CHECKPOINT_RESTORE */ + { + .procname = "pid_max", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = pid_max_ns_ctl_handler, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, +}; int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { @@ -449,9 +475,7 @@ static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); -#ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_init("kernel", pid_ns_ctl_table); -#endif register_pid_ns_sysctl_table_vm(); return 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 79e6cb1d5c48..676a0d675e7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1804,15 +1804,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "pid_max", - .data = &pid_max, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &pid_max_min, - .extra2 = &pid_max_max, - }, { .procname = "panic_on_oops", .data = &panic_on_oops, diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 4966e6bbdf6f..c62b9b3cfb3d 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) int i; /* According to linux/thread.h, pids can be no bigger that 30 bits */ - WARN_ON_ONCE(pid_max > (1 << 30)); + WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30)); pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); if (!pid_list) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c866991b9c78..e51851d64e4d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -715,8 +715,6 @@ extern unsigned long tracing_thresh; /* PID filtering */ -extern int pid_max; - bool 
trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8a407adb0e1c..c20c80abe065 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void) if (tgid_map) return 0; - tgid_map_max = pid_max; + tgid_map_max = init_pid_ns.pid_max; map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), GFP_KERNEL); if (!map)
It is necessary to be able to set a different pid_max in different containers. For example, when multiple containers run on one host and one of them is Android, whose 32-bit bionic libc only accepts pid <= 65535, the global pid_max has to be set to <= 65535. This causes configuration conflicts with the other containers and also limits the maximum number of tasks for the entire system. Signed-off-by: Yun Zhou <yun.zhou@windriver.com> --- - Remove sentinels from ctl_table arrays. v1 - https://lore.kernel.org/all/20241030052933.1041408-1-yun.zhou@windriver.com/ --- include/linux/pid_namespace.h | 1 + kernel/pid.c | 12 +++++------ kernel/pid_namespace.c | 34 ++++++++++++++++++++++++++----- kernel/sysctl.c | 9 -------- kernel/trace/pid_list.c | 2 +- kernel/trace/trace.h | 2 -- kernel/trace/trace_sched_switch.c | 2 +- 7 files changed, 38 insertions(+), 24 deletions(-)