@@ -62,3 +62,12 @@ well as medium and long term trends. The total absolute stall time is
tracked and exported as well, to allow detection of latency spikes
which wouldn't necessarily make a dent in the time averages, or to
average trends over custom time frames.
+
+Cgroup2 interface
+=================
+
+In a system with a CONFIG_CGROUPS=y kernel and the cgroup2 filesystem
+mounted, pressure stall information is also tracked for tasks grouped
+into cgroups. Each cgroup directory below the cgroup2 mountpoint
+contains cpu.pressure, memory.pressure, and io.pressure files; the
+format is the same as the /proc/pressure/ files.
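
As a rough illustration of the new interface, here is a minimal
userspace sketch that samples one of the per-cgroup pressure files.
The /sys/fs/cgroup mount point, the "workload" group name, and the
values in the comment are assumptions made for the example; only the
line format, shared with /proc/pressure/, comes from the patch.

/*
 * Hypothetical example: read memory pressure for one cgroup.
 * Assumes cgroup2 is mounted at /sys/fs/cgroup and that a group
 * named "workload" exists; adjust both for the system at hand.
 */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/fs/cgroup/workload/memory.pressure", "r");

	if (!f) {
		perror("memory.pressure");
		return 1;
	}
	/*
	 * Each line uses the /proc/pressure/ format, e.g.:
	 *   some avg10=0.22 avg60=0.17 avg300=1.11 total=927587
	 *   full avg10=0.00 avg60=0.03 avg300=0.40 total=183974
	 */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}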
@@ -963,6 +963,12 @@ All time durations are in microseconds.
$PERIOD duration. "max" for $MAX indicates no limit. If only
one number is written, $MAX is updated.
+ cpu.pressure
+ A read-only nested-key file which exists on non-root cgroups.
+
+ Shows pressure stall information for CPU. See
+ Documentation/accounting/psi.txt for details.
+
Memory
------
@@ -1199,6 +1205,12 @@ PAGE_SIZE multiple when read back.
Swap usage hard limit. If a cgroup's swap usage reaches this
limit, anonymous memory of the cgroup will not be swapped out.
+ memory.pressure
+ A read-only nested-key file which exists on non-root cgroups.
+
+ Shows pressure stall information for memory. See
+ Documentation/accounting/psi.txt for details.
+
Usage Guidelines
~~~~~~~~~~~~~~~~
@@ -1334,6 +1346,12 @@ IO Interface Files
8:16 rbps=2097152 wbps=max riops=max wiops=max
+ io.pressure
+ A read-only nested-key file which exists on non-root cgroups.
+
+ Shows pressure stall information for IO. See
+ Documentation/accounting/psi.txt for details.
+
Writeback
~~~~~~~~~
@@ -20,6 +20,7 @@
#include <linux/u64_stats_sync.h>
#include <linux/workqueue.h>
#include <linux/bpf-cgroup.h>
+#include <linux/psi_types.h>
#ifdef CONFIG_CGROUPS
@@ -424,6 +425,9 @@ struct cgroup {
/* used to schedule release agent */
struct work_struct release_agent_work;
+ /* used to track pressure stalls */
+ struct psi_group psi;
+
/* used to store eBPF programs */
struct cgroup_bpf bpf;
@@ -627,6 +627,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
pr_cont_kernfs_path(cgrp->kn);
}
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+ return &cgrp->psi;
+}
+
static inline void cgroup_init_kthreadd(void)
{
/*
@@ -680,6 +685,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
return NULL;
}
+static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ return NULL;
+}
+
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+ return NULL;
+}
+
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
struct cgroup *ancestor)
{
@@ -4,6 +4,9 @@
#include <linux/psi_types.h>
#include <linux/sched.h>
+struct seq_file;
+struct css_set;
+
#ifdef CONFIG_PSI
extern bool psi_disabled;
@@ -15,6 +18,14 @@ void psi_task_change(struct task_struct *task, u64 now, int clear, int set);
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);
+int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
+
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgrp);
+void psi_cgroup_free(struct cgroup *cgrp);
+void cgroup_move_task(struct task_struct *p, struct css_set *to);
+#endif
+
#else /* CONFIG_PSI */
static inline void psi_init(void) {}
@@ -22,6 +33,20 @@ static inline void psi_init(void) {}
static inline void psi_memstall_enter(unsigned long *flags) {}
static inline void psi_memstall_leave(unsigned long *flags) {}
+#ifdef CONFIG_CGROUPS
+static inline int psi_cgroup_alloc(struct cgroup *cgrp)
+{
+ return 0;
+}
+static inline void psi_cgroup_free(struct cgroup *cgrp)
+{
+}
+static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
+{
+ rcu_assign_pointer(p->cgroups, to);
+}
+#endif
+
#endif /* CONFIG_PSI */
#endif /* _LINUX_PSI_H */
@@ -468,6 +468,10 @@ config PSI
the share of walltime in which some or all tasks in the system are
delayed due to contention of the respective resource.
+ In kernels with cgroup support (cgroup2 only), cgroups will
+ have cpu.pressure, memory.pressure, and io.pressure files,
+ which aggregate pressure stalls for the grouped tasks only.
+
For more details see Documentation/accounting/psi.txt.
Say N if unsure.
@@ -54,6 +54,7 @@
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
+#include <linux/psi.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -826,7 +827,7 @@ static void css_set_move_task(struct task_struct *task,
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
- rcu_assign_pointer(task->cgroups, to_cset);
+ cgroup_move_task(task, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
}
@@ -3388,6 +3389,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
return ret;
}
+#ifdef CONFIG_PSI
+static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
+{
+ return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
+}
+static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
+{
+ return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
+}
+static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
+{
+ return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
+}
+#endif
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -4499,6 +4515,23 @@ static struct cftype cgroup_base_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
},
+#ifdef CONFIG_PSI
+ {
+ .name = "io.pressure",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_io_pressure_show,
+ },
+ {
+ .name = "memory.pressure",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_memory_pressure_show,
+ },
+ {
+ .name = "cpu.pressure",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_cpu_pressure_show,
+ },
+#endif
{ } /* terminate */
};
@@ -4559,6 +4592,7 @@ static void css_free_rwork_fn(struct work_struct *work)
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
+ psi_cgroup_free(cgrp);
if (cgroup_on_dfl(cgrp))
cgroup_stat_exit(cgrp);
kfree(cgrp);
@@ -4805,10 +4839,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
- ret = cgroup_bpf_inherit(cgrp);
+
+ ret = psi_cgroup_alloc(cgrp);
if (ret)
goto out_idr_free;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_psi_free;
+
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4846,6 +4885,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
return cgrp;
+out_psi_free:
+ psi_cgroup_free(cgrp);
out_idr_free:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_stat_exit:
@@ -464,6 +464,9 @@ static void psi_group_change(struct psi_group *group, int cpu, u64 now,
void psi_task_change(struct task_struct *task, u64 now, int clear, int set)
{
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgroup, *parent;
+#endif
int cpu = task_cpu(task);
if (psi_disabled)
@@ -485,6 +488,18 @@ void psi_task_change(struct task_struct *task, u64 now, int clear, int set)
task->psi_flags |= set;
psi_group_change(&psi_system, cpu, now, clear, set);
+
+#ifdef CONFIG_CGROUPS
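+	/*
+	 * Propagate the change to each cgroup on the task's path through
+	 * the default hierarchy; the root cgroup is skipped, since its
+	 * aggregate is already covered by the system-wide group above.
+	 */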
+ cgroup = task->cgroups->dfl_cgrp;
+ while (cgroup && (parent = cgroup_parent(cgroup))) {
+ struct psi_group *group;
+
+ group = cgroup_psi(cgroup);
+ psi_group_change(group, cpu, now, clear, set);
+
+ cgroup = parent;
+ }
+#endif
}
/**
@@ -551,8 +566,70 @@ void psi_memstall_leave(unsigned long *flags)
rq_unlock_irq(rq, &rf);
}
-static int psi_show(struct seq_file *m, struct psi_group *group,
- enum psi_res res)
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgroup)
+{
+ cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
+ if (!cgroup->psi.pcpu)
+ return -ENOMEM;
+ psi_group_init(&cgroup->psi);
+ return 0;
+}
+
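+/* Stop the pending averaging work before freeing the per-cpu state. */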
+void psi_cgroup_free(struct cgroup *cgroup)
+{
+ cancel_delayed_work_sync(&cgroup->psi.clock_work);
+ free_percpu(cgroup->psi.pcpu);
+}
+
+/**
+ * cgroup_move_task - move task to a different cgroup
+ * @task: the task
+ * @to: the target css_set
+ *
+ * Move task to a new cgroup and safely migrate its associated stall
+ * state between the different groups.
+ *
+ * This function acquires the task's rq lock to lock out concurrent
+ * changes to the task's scheduling state and - in case the task is
+ * running - concurrent changes to its stall state.
+ */
+void cgroup_move_task(struct task_struct *task, struct css_set *to)
+{
+ unsigned int task_flags = 0;
+ struct rq_flags rf;
+ struct rq *rq;
+ u64 now;
+
+ rq = task_rq_lock(task, &rf);
+
+ if (task_on_rq_queued(task))
+ task_flags = TSK_RUNNING;
+ else if (task->in_iowait)
+ task_flags = TSK_IOWAIT;
+ if (task->flags & PF_MEMSTALL)
+ task_flags |= TSK_MEMSTALL;
+
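+	/*
+	 * Clear the task's live stall state so it is removed from the
+	 * old cgroup hierarchy's aggregates; it is re-applied against
+	 * the new hierarchy after the css_set switch below.
+	 */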
+ if (task_flags) {
+ update_rq_clock(rq);
+ now = rq_clock(rq);
+ psi_task_change(task, now, task_flags, 0);
+ }
+
+ /*
+ * Lame to do this here, but the scheduler cannot be locked
+ * from the outside, so we move cgroups from inside sched/.
+ */
+ rcu_assign_pointer(task->cgroups, to);
+
+ if (task_flags)
+ psi_task_change(task, now, 0, task_flags);
+
+ task_rq_unlock(rq, task, &rf);
+}
+#endif /* CONFIG_CGROUPS */
+
+int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
int full;