diff mbox series

[1/2] Revert "pid: allow pid_max to be set per pid namespace"

Message ID 20250221170249.890014-2-mkoutny@suse.com (mailing list archive)
State New
Headers show
Series Alternative "pid_max" for 32-bit userspace | expand

Commit Message

Michal Koutný Feb. 21, 2025, 5:02 p.m. UTC
This reverts commit 7863dcc72d0f4b13a641065670426435448b3d80.

It is already difficult for users to troubleshoot which of multiple pid
limits restricts their workload. I'm afraid making pid_max
per-(hierarchical-)NS will contribute to confusion.
Also, the implementation copies the limit upon creation from
parent, this pattern showed cumbersome with some attributes in legacy
cgroup controllers -- it's subject to race condition between parent's
limit modification and children creation and once copied it must be
changed in the descendant.

This is very similar to what pids.max of a cgroup (already) does that
can be used as an alternative.

Link: https://lore.kernel.org/r/bnxhqrq7tip6jl2hu6jsvxxogdfii7ugmafbhgsogovrchxfyp@kagotkztqurt/
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 include/linux/pid.h               |   3 +
 include/linux/pid_namespace.h     |  10 +--
 kernel/pid.c                      | 125 ++----------------------------
 kernel/pid_namespace.c            |  43 +++-------
 kernel/sysctl.c                   |   9 +++
 kernel/trace/pid_list.c           |   2 +-
 kernel/trace/trace.h              |   2 +
 kernel/trace/trace_sched_switch.c |   2 +-
 8 files changed, 35 insertions(+), 161 deletions(-)
diff mbox series

Patch

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 98837a1ff0f33..fe575fcdb4afa 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -108,6 +108,9 @@  extern void exchange_tids(struct task_struct *task, struct task_struct *old);
 extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 			 enum pid_type);
 
+extern int pid_max;
+extern int pid_max_min, pid_max_max;
+
 /*
  * look up a PID in the hash table. Must be called with the tasklist_lock
  * or rcu_read_lock() held.
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 7c67a58111998..f9f9931e02d6a 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -30,7 +30,6 @@  struct pid_namespace {
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
 	unsigned int level;
-	int pid_max;
 	struct pid_namespace *parent;
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct fs_pin *bacct;
@@ -39,14 +38,9 @@  struct pid_namespace {
 	struct ucounts *ucounts;
 	int reboot;	/* group exit code if this pidns was rebooted */
 	struct ns_common ns;
-	struct work_struct	work;
-#ifdef CONFIG_SYSCTL
-	struct ctl_table_set	set;
-	struct ctl_table_header *sysctls;
-#if defined(CONFIG_MEMFD_CREATE)
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	int memfd_noexec_scope;
 #endif
-#endif
 } __randomize_layout;
 
 extern struct pid_namespace init_pid_ns;
@@ -123,8 +117,6 @@  static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
 void pidhash_init(void);
 void pid_idr_init(void);
-int register_pidns_sysctls(struct pid_namespace *pidns);
-void unregister_pidns_sysctls(struct pid_namespace *pidns);
 
 static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
 {
diff --git a/kernel/pid.c b/kernel/pid.c
index 924084713be8b..aa2a7d4da4555 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -61,8 +61,10 @@  struct pid init_struct_pid = {
 	}, }
 };
 
-static int pid_max_min = RESERVED_PIDS + 1;
-static int pid_max_max = PID_MAX_LIMIT;
+int pid_max = PID_MAX_DEFAULT;
+
+int pid_max_min = RESERVED_PIDS + 1;
+int pid_max_max = PID_MAX_LIMIT;
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -81,7 +83,6 @@  struct pid_namespace init_pid_ns = {
 #ifdef CONFIG_PID_NS
 	.ns.ops = &pidns_operations,
 #endif
-	.pid_max = PID_MAX_DEFAULT,
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
 #endif
@@ -190,7 +191,6 @@  struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 
 	for (i = ns->level; i >= 0; i--) {
 		int tid = 0;
-		int pid_max = READ_ONCE(tmp->pid_max);
 
 		if (set_tid_size) {
 			tid = set_tid[ns->level - i];
@@ -644,118 +644,17 @@  SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 	return fd;
 }
 
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
-{
-	return &task_active_pid_ns(current)->set;
-}
-
-static int set_is_seen(struct ctl_table_set *set)
-{
-	return &task_active_pid_ns(current)->set == set;
-}
-
-static int pid_table_root_permissions(struct ctl_table_header *head,
-				      const struct ctl_table *table)
-{
-	struct pid_namespace *pidns =
-		container_of(head->set, struct pid_namespace, set);
-	int mode = table->mode;
-
-	if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
-	    uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
-		mode = (mode & S_IRWXU) >> 6;
-	else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
-		mode = (mode & S_IRWXG) >> 3;
-	else
-		mode = mode & S_IROTH;
-	return (mode << 6) | (mode << 3) | mode;
-}
-
-static void pid_table_root_set_ownership(struct ctl_table_header *head,
-					 kuid_t *uid, kgid_t *gid)
-{
-	struct pid_namespace *pidns =
-		container_of(head->set, struct pid_namespace, set);
-	kuid_t ns_root_uid;
-	kgid_t ns_root_gid;
-
-	ns_root_uid = make_kuid(pidns->user_ns, 0);
-	if (uid_valid(ns_root_uid))
-		*uid = ns_root_uid;
-
-	ns_root_gid = make_kgid(pidns->user_ns, 0);
-	if (gid_valid(ns_root_gid))
-		*gid = ns_root_gid;
-}
-
-static struct ctl_table_root pid_table_root = {
-	.lookup		= pid_table_root_lookup,
-	.permissions	= pid_table_root_permissions,
-	.set_ownership	= pid_table_root_set_ownership,
-};
-
-static const struct ctl_table pid_table[] = {
-	{
-		.procname	= "pid_max",
-		.data		= &init_pid_ns.pid_max,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &pid_max_min,
-		.extra2		= &pid_max_max,
-	},
-};
-#endif
-
-int register_pidns_sysctls(struct pid_namespace *pidns)
-{
-#ifdef CONFIG_SYSCTL
-	struct ctl_table *tbl;
-
-	setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
-
-	tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
-	if (!tbl)
-		return -ENOMEM;
-	tbl->data = &pidns->pid_max;
-	pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
-			     PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
-
-	pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
-						 ARRAY_SIZE(pid_table));
-	if (!pidns->sysctls) {
-		kfree(tbl);
-		retire_sysctl_set(&pidns->set);
-		return -ENOMEM;
-	}
-#endif
-	return 0;
-}
-
-void unregister_pidns_sysctls(struct pid_namespace *pidns)
-{
-#ifdef CONFIG_SYSCTL
-	const struct ctl_table *tbl;
-
-	tbl = pidns->sysctls->ctl_table_arg;
-	unregister_sysctl_table(pidns->sysctls);
-	retire_sysctl_set(&pidns->set);
-	kfree(tbl);
-#endif
-}
-
 void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
-	init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
-				  PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+	pid_max = min(pid_max_max, max_t(int, pid_max,
+				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
 	pid_max_min = max_t(int, pid_max_min,
 				PIDS_PER_CPU_MIN * num_possible_cpus());
-	pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
+	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
 
 	idr_init(&init_pid_ns.idr);
 
@@ -766,16 +665,6 @@  void __init pid_idr_init(void)
 			NULL);
 }
 
-static __init int pid_namespace_sysctl_init(void)
-{
-#ifdef CONFIG_SYSCTL
-	/* "kernel" directory will have already been initialized. */
-	BUG_ON(register_pidns_sysctls(&init_pid_ns));
-#endif
-	return 0;
-}
-subsys_initcall(pid_namespace_sysctl_init);
-
 static struct file *__pidfd_fget(struct task_struct *task, int fd)
 {
 	struct file *file;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 8f6cfec87555a..0f23285be4f92 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -70,8 +70,6 @@  static void dec_pid_namespaces(struct ucounts *ucounts)
 	dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
 }
 
-static void destroy_pid_namespace_work(struct work_struct *work);
-
 static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
 	struct pid_namespace *parent_pid_ns)
 {
@@ -107,27 +105,17 @@  static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 		goto out_free_idr;
 	ns->ns.ops = &pidns_operations;
 
-	ns->pid_max = parent_pid_ns->pid_max;
-	err = register_pidns_sysctls(ns);
-	if (err)
-		goto out_free_inum;
-
 	refcount_set(&ns->ns.count, 1);
 	ns->level = level;
 	ns->parent = get_pid_ns(parent_pid_ns);
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
 	ns->pid_allocated = PIDNS_ADDING;
-	INIT_WORK(&ns->work, destroy_pid_namespace_work);
-
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
 #endif
-
 	return ns;
 
-out_free_inum:
-	ns_free_inum(&ns->ns);
 out_free_idr:
 	idr_destroy(&ns->idr);
 	kmem_cache_free(pid_ns_cachep, ns);
@@ -149,28 +137,12 @@  static void delayed_free_pidns(struct rcu_head *p)
 
 static void destroy_pid_namespace(struct pid_namespace *ns)
 {
-	unregister_pidns_sysctls(ns);
-
 	ns_free_inum(&ns->ns);
 
 	idr_destroy(&ns->idr);
 	call_rcu(&ns->rcu, delayed_free_pidns);
 }
 
-static void destroy_pid_namespace_work(struct work_struct *work)
-{
-	struct pid_namespace *ns =
-		container_of(work, struct pid_namespace, work);
-
-	do {
-		struct pid_namespace *parent;
-
-		parent = ns->parent;
-		destroy_pid_namespace(ns);
-		ns = parent;
-	} while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
-}
-
 struct pid_namespace *copy_pid_ns(unsigned long flags,
 	struct user_namespace *user_ns, struct pid_namespace *old_ns)
 {
@@ -183,8 +155,15 @@  struct pid_namespace *copy_pid_ns(unsigned long flags,
 
 void put_pid_ns(struct pid_namespace *ns)
 {
-	if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
-		schedule_work(&ns->work);
+	struct pid_namespace *parent;
+
+	while (ns != &init_pid_ns) {
+		parent = ns->parent;
+		if (!refcount_dec_and_test(&ns->ns.count))
+			break;
+		destroy_pid_namespace(ns);
+		ns = parent;
+	}
 }
 EXPORT_SYMBOL_GPL(put_pid_ns);
 
@@ -295,7 +274,6 @@  static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
 	next = idr_get_cursor(&pid_ns->idr) - 1;
 
 	tmp.data = &next;
-	tmp.extra2 = &pid_ns->pid_max;
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 	if (!ret && write)
 		idr_set_cursor(&pid_ns->idr, next + 1);
@@ -303,6 +281,7 @@  static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
 	return ret;
 }
 
+extern int pid_max;
 static const struct ctl_table pid_ns_ctl_table[] = {
 	{
 		.procname = "ns_last_pid",
@@ -310,7 +289,7 @@  static const struct ctl_table pid_ns_ctl_table[] = {
 		.mode = 0666, /* permissions are checked in the handler */
 		.proc_handler = pid_ns_ctl_handler,
 		.extra1 = SYSCTL_ZERO,
-		.extra2 = &init_pid_ns.pid_max,
+		.extra2 = &pid_max,
 	},
 };
 #endif	/* CONFIG_CHECKPOINT_RESTORE */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cb57da499ebb1..bb739608680f2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1803,6 +1803,15 @@  static const struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+	{
+		.procname	= "pid_max",
+		.data		= &pid_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &pid_max_min,
+		.extra2		= &pid_max_max,
+	},
 	{
 		.procname	= "panic_on_oops",
 		.data		= &panic_on_oops,
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index c62b9b3cfb3d8..4966e6bbdf6f3 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -414,7 +414,7 @@  struct trace_pid_list *trace_pid_list_alloc(void)
 	int i;
 
 	/* According to linux/thread.h, pids can be no bigger that 30 bits */
-	WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
+	WARN_ON_ONCE(pid_max > (1 << 30));
 
 	pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
 	if (!pid_list)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9c21ba45b7af6..46c65402ad7e5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -732,6 +732,8 @@  extern unsigned long tracing_thresh;
 
 /* PID filtering */
 
+extern int pid_max;
+
 bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
 			     pid_t search_pid);
 bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb49f7279dc80..573b5d8e8a28e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -442,7 +442,7 @@  int trace_alloc_tgid_map(void)
 	if (tgid_map)
 		return 0;
 
-	tgid_map_max = init_pid_ns.pid_max;
+	tgid_map_max = pid_max;
 	map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
 		       GFP_KERNEL);
 	if (!map)