@@ -1878,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
- atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) {
+ is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
retval = -EAGAIN;
goto out_ret;
}
@@ -372,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
#define task_uid(task) (task_cred_xxx((task), uid))
#define task_euid(task) (task_cred_xxx((task), euid))
+#define task_ucounts(task) (task_cred_xxx((task), ucounts))
#define current_cred_xxx(xxx) \
({ \
@@ -388,6 +389,7 @@ static inline void put_cred(const struct cred *_cred)
#define current_fsgid() (current_cred_xxx(fsgid))
#define current_cap() (current_cred_xxx(cap_effective))
#define current_user() (current_cred_xxx(user))
+#define current_ucounts() (current_cred_xxx(ucounts))
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
@@ -12,7 +12,6 @@
*/
struct user_struct {
refcount_t __count; /* reference count */
- atomic_t processes; /* How many processes does this user have? */
atomic_t sigpending; /* How many pending signals does this user have? */
#ifdef CONFIG_FANOTIFY
atomic_t fanotify_listeners;
@@ -50,9 +50,12 @@ enum ucount_type {
UCOUNT_INOTIFY_INSTANCES,
UCOUNT_INOTIFY_WATCHES,
#endif
+ UCOUNT_RLIMIT_NPROC,
UCOUNT_COUNTS,
};
+#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC
+
struct user_namespace {
struct uid_gid_map uid_map;
struct uid_gid_map gid_map;
@@ -107,6 +110,16 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
void put_ucounts(struct ucounts *ucounts);
+static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type)
+{
+ return atomic_long_read(&ucounts->ucount[type]);
+}
+
+bool inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
+bool inc_rlimit_ucounts_and_test(struct ucounts *ucounts, enum ucount_type type, long v, long max);
+void dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
+
#ifdef CONFIG_USER_NS
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -360,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred));
- atomic_inc(&p->cred->user->processes);
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
@@ -395,8 +395,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif
- atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(new, 2);
validate_creds(new);
return 0;
@@ -496,12 +496,12 @@ int commit_creds(struct cred *new)
* in set_user().
*/
alter_cred_subscribers(new, 2);
- if (new->user != old->user)
- atomic_inc(&new->user->processes);
+ if (new->user != old->user || new->user_ns != old->user_ns)
+ inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
- atomic_dec(&old->user->processes);
+ dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(old, -2);
/* send notifications */
@@ -188,7 +188,7 @@ void release_task(struct task_struct *p)
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
- atomic_dec(&__task_cred(p)->user->processes);
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock();
cgroup_release(p);
@@ -819,9 +819,11 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
- for (i = 0; i < UCOUNT_COUNTS; i++)
+ for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2;
+ init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
+
#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
NULL, free_vm_stack_cache);
@@ -1972,8 +1974,7 @@ static __latent_entropy struct task_struct *copy_process(
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
- if (atomic_read(&p->real_cred->user->processes) >=
- task_rlimit(p, RLIMIT_NPROC)) {
+ if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
@@ -2376,7 +2377,7 @@ static __latent_entropy struct task_struct *copy_process(
#endif
delayacct_tsk_free(p);
bad_fork_cleanup_count:
- atomic_dec(&p->cred->user->processes);
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
exit_creds(p);
bad_fork_free:
p->state = TASK_DEAD;
@@ -473,7 +473,7 @@ static int set_user(struct cred *new)
* for programs doing set*uid()+execve() by harmlessly deferring the
* failure to the execve() stage.
*/
- if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
+ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
new_user != INIT_USER)
current->flags |= PF_NPROC_EXCEEDED;
else
@@ -80,6 +80,7 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"),
#endif
+ { },
{ }
};
#endif /* CONFIG_SYSCTL */
@@ -240,6 +241,55 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
put_ucounts(ucounts);
}
+bool inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+ struct ucounts *iter;
+ bool overlimit = false;
+
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long max = READ_ONCE(iter->ns->ucount_max[type]);
+ long new = atomic_long_add_return(v, &iter->ucount[type]);
+ if (new < 0 || new > max)
+ overlimit = true;
+ }
+
+ return overlimit;
+}
+
+bool inc_rlimit_ucounts_and_test(struct ucounts *ucounts, enum ucount_type type,
+ long v, long max)
+{
+ bool overlimit = inc_rlimit_ucounts(ucounts, type, v);
+ if (!overlimit) {
+ long new = get_ucounts_value(ucounts, type);
+ if ( new < 0 || new > max)
+ overlimit = true;
+ }
+ return overlimit;
+}
+
+void dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+ struct ucounts *iter;
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long dec = atomic_long_add_return(-v, &iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+ }
+}
+
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
+{
+ struct ucounts *iter;
+ if (get_ucounts_value(ucounts, type) > max)
+ return true;
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ max = READ_ONCE(iter->ns->ucount_max[type]);
+ if (get_ucounts_value(iter, type) > max)
+ return true;
+ }
+ return false;
+}
+
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
@@ -256,6 +306,7 @@ static __init int user_namespace_sysctl_init(void)
BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
hlist_add_ucounts(&init_ucounts);
+ inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
subsys_initcall(user_namespace_sysctl_init);
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
.__count = REFCOUNT_INIT(1),
- .processes = ATOMIC_INIT(1),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
.uid = GLOBAL_ROOT_UID,
@@ -118,9 +118,10 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
INIT_WORK(&ns->work, free_user_ns);
- for (i = 0; i < UCOUNT_COUNTS; i++) {
+ for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
+ ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. To illustrate the impact of rlimits, let's say there is a program that does not fork. Some service-A wants to run this program as user X in multiple containers. Since the program never fork the service wants to set RLIMIT_NPROC=1. service-A \- program (uid=1000, container1, rlimit_nproc=1) \- program (uid=1000, container2, rlimit_nproc=1) The service-A sets RLIMIT_NPROC=1 and runs the program in container1. When the service-A tries to run a program with RLIMIT_NPROC=1 in container2 it fails since user X already has one running process. We cannot use existing inc_ucounts / dec_ucounts because they do not allow us to exceed the maximum for the counter. Some rlimits can be overlimited by root or if the user has the appropriate capability. Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com> --- fs/exec.c | 2 +- include/linux/cred.h | 2 ++ include/linux/sched/user.h | 1 - include/linux/user_namespace.h | 13 +++++++++ kernel/cred.c | 10 +++---- kernel/exit.c | 2 +- kernel/fork.c | 9 +++--- kernel/sys.c | 2 +- kernel/ucount.c | 51 ++++++++++++++++++++++++++++++++++ kernel/user.c | 1 - kernel/user_namespace.c | 3 +- 11 files changed, 81 insertions(+), 15 deletions(-)