@@ -108,6 +108,9 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type);
+extern int pid_max;
+extern int pid_max_min, pid_max_max;
+
/*
* look up a PID in the hash table. Must be called with the tasklist_lock
* or rcu_read_lock() held.
@@ -30,7 +30,6 @@ struct pid_namespace {
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
- int pid_max;
struct pid_namespace *parent;
#ifdef CONFIG_BSD_PROCESS_ACCT
struct fs_pin *bacct;
@@ -39,14 +38,9 @@ struct pid_namespace {
struct ucounts *ucounts;
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
- struct work_struct work;
-#ifdef CONFIG_SYSCTL
- struct ctl_table_set set;
- struct ctl_table_header *sysctls;
-#if defined(CONFIG_MEMFD_CREATE)
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
int memfd_noexec_scope;
#endif
-#endif
} __randomize_layout;
extern struct pid_namespace init_pid_ns;
@@ -123,8 +117,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);
-int register_pidns_sysctls(struct pid_namespace *pidns);
-void unregister_pidns_sysctls(struct pid_namespace *pidns);
static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
{
@@ -61,8 +61,10 @@ struct pid init_struct_pid = {
}, }
};
-static int pid_max_min = RESERVED_PIDS + 1;
-static int pid_max_max = PID_MAX_LIMIT;
+int pid_max = PID_MAX_DEFAULT;
+
+int pid_max_min = RESERVED_PIDS + 1;
+int pid_max_max = PID_MAX_LIMIT;
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -81,7 +83,6 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
- .pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
@@ -190,7 +191,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
for (i = ns->level; i >= 0; i--) {
int tid = 0;
- int pid_max = READ_ONCE(tmp->pid_max);
if (set_tid_size) {
tid = set_tid[ns->level - i];
@@ -644,118 +644,17 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return fd;
}
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
-{
- return &task_active_pid_ns(current)->set;
-}
-
-static int set_is_seen(struct ctl_table_set *set)
-{
- return &task_active_pid_ns(current)->set == set;
-}
-
-static int pid_table_root_permissions(struct ctl_table_header *head,
- const struct ctl_table *table)
-{
- struct pid_namespace *pidns =
- container_of(head->set, struct pid_namespace, set);
- int mode = table->mode;
-
- if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
- uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
- mode = (mode & S_IRWXU) >> 6;
- else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
- mode = (mode & S_IRWXG) >> 3;
- else
- mode = mode & S_IROTH;
- return (mode << 6) | (mode << 3) | mode;
-}
-
-static void pid_table_root_set_ownership(struct ctl_table_header *head,
- kuid_t *uid, kgid_t *gid)
-{
- struct pid_namespace *pidns =
- container_of(head->set, struct pid_namespace, set);
- kuid_t ns_root_uid;
- kgid_t ns_root_gid;
-
- ns_root_uid = make_kuid(pidns->user_ns, 0);
- if (uid_valid(ns_root_uid))
- *uid = ns_root_uid;
-
- ns_root_gid = make_kgid(pidns->user_ns, 0);
- if (gid_valid(ns_root_gid))
- *gid = ns_root_gid;
-}
-
-static struct ctl_table_root pid_table_root = {
- .lookup = pid_table_root_lookup,
- .permissions = pid_table_root_permissions,
- .set_ownership = pid_table_root_set_ownership,
-};
-
-static const struct ctl_table pid_table[] = {
- {
- .procname = "pid_max",
- .data = &init_pid_ns.pid_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &pid_max_min,
- .extra2 = &pid_max_max,
- },
-};
-#endif
-
-int register_pidns_sysctls(struct pid_namespace *pidns)
-{
-#ifdef CONFIG_SYSCTL
- struct ctl_table *tbl;
-
- setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
-
- tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
- if (!tbl)
- return -ENOMEM;
- tbl->data = &pidns->pid_max;
- pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
- PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
-
- pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
- ARRAY_SIZE(pid_table));
- if (!pidns->sysctls) {
- kfree(tbl);
- retire_sysctl_set(&pidns->set);
- return -ENOMEM;
- }
-#endif
- return 0;
-}
-
-void unregister_pidns_sysctls(struct pid_namespace *pidns)
-{
-#ifdef CONFIG_SYSCTL
- const struct ctl_table *tbl;
-
- tbl = pidns->sysctls->ctl_table_arg;
- unregister_sysctl_table(pidns->sysctls);
- retire_sysctl_set(&pidns->set);
- kfree(tbl);
-#endif
-}
-
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
- init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
- PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+ pid_max = min(pid_max_max, max_t(int, pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
idr_init(&init_pid_ns.idr);
@@ -766,16 +665,6 @@ void __init pid_idr_init(void)
NULL);
}
-static __init int pid_namespace_sysctl_init(void)
-{
-#ifdef CONFIG_SYSCTL
- /* "kernel" directory will have already been initialized. */
- BUG_ON(register_pidns_sysctls(&init_pid_ns));
-#endif
- return 0;
-}
-subsys_initcall(pid_namespace_sysctl_init);
-
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
struct file *file;
@@ -70,8 +70,6 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}
-static void destroy_pid_namespace_work(struct work_struct *work);
-
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
@@ -107,27 +105,17 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
goto out_free_idr;
ns->ns.ops = &pidns_operations;
- ns->pid_max = parent_pid_ns->pid_max;
- err = register_pidns_sysctls(ns);
- if (err)
- goto out_free_inum;
-
refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
- INIT_WORK(&ns->work, destroy_pid_namespace_work);
-
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
-
return ns;
-out_free_inum:
- ns_free_inum(&ns->ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@@ -149,28 +137,12 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
- unregister_pidns_sysctls(ns);
-
ns_free_inum(&ns->ns);
idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
-static void destroy_pid_namespace_work(struct work_struct *work)
-{
- struct pid_namespace *ns =
- container_of(work, struct pid_namespace, work);
-
- do {
- struct pid_namespace *parent;
-
- parent = ns->parent;
- destroy_pid_namespace(ns);
- ns = parent;
- } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
-}
-
struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
@@ -183,8 +155,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
void put_pid_ns(struct pid_namespace *ns)
{
- if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
- schedule_work(&ns->work);
+ struct pid_namespace *parent;
+
+ while (ns != &init_pid_ns) {
+ parent = ns->parent;
+ if (!refcount_dec_and_test(&ns->ns.count))
+ break;
+ destroy_pid_namespace(ns);
+ ns = parent;
+ }
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@@ -295,7 +274,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next;
- tmp.extra2 = &pid_ns->pid_max;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (!ret && write)
idr_set_cursor(&pid_ns->idr, next + 1);
@@ -303,6 +281,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
return ret;
}
+extern int pid_max;
static const struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
@@ -310,7 +289,7 @@ static const struct ctl_table pid_ns_ctl_table[] = {
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &init_pid_ns.pid_max,
+ .extra2 = &pid_max,
},
};
#endif /* CONFIG_CHECKPOINT_RESTORE */
@@ -1803,6 +1803,15 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+ {
+ .procname = "pid_max",
+ .data = &pid_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
{
.procname = "panic_on_oops",
.data = &panic_on_oops,
@@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */
- WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
+ WARN_ON_ONCE(pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list)
@@ -732,6 +732,8 @@ extern unsigned long tracing_thresh;
/* PID filtering */
+extern int pid_max;
+
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
@@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
if (tgid_map)
return 0;
- tgid_map_max = init_pid_ns.pid_max;
+ tgid_map_max = pid_max;
map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
GFP_KERNEL);
if (!map)
This reverts commit 7863dcc72d0f4b13a641065670426435448b3d80. It is already difficult for users to troubleshoot which of multiple pid limits restricts their workload. I'm afraid making pid_max per-(hierarchical-)NS will contribute to confusion. Also, the implementation copies the limit upon creation from parent, this pattern showed cumbersome with some attributes in legacy cgroup controllers -- it's subject to race condition between parent's limit modification and children creation and once copied it must be changed in the descendant. This is very similar to what pids.max of a cgroup (already) does that can be used as an alternative. Link: https://lore.kernel.org/r/bnxhqrq7tip6jl2hu6jsvxxogdfii7ugmafbhgsogovrchxfyp@kagotkztqurt/ Signed-off-by: Michal Koutný <mkoutny@suse.com> --- include/linux/pid.h | 3 + include/linux/pid_namespace.h | 10 +-- kernel/pid.c | 125 ++---------------------------- kernel/pid_namespace.c | 43 +++------- kernel/sysctl.c | 9 +++ kernel/trace/pid_list.c | 2 +- kernel/trace/trace.h | 2 + kernel/trace/trace_sched_switch.c | 2 +- 8 files changed, 35 insertions(+), 161 deletions(-)