@@ -12,6 +12,7 @@ enum scx_consts {
SCX_EXIT_BT_LEN = 64,
SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
};
enum scx_exit_kind {
@@ -48,6 +49,9 @@ struct scx_exit_info {
/* informational message */
char *msg;
+
+ /* debug dump */
+ char *dump;
};
/* sched_ext_ops.flags */
@@ -330,6 +334,12 @@ struct sched_ext_ops {
*/
u32 timeout_ms;
+ /**
+ * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
/**
* name - BPF scheduler's name
*
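On the BPF side, a scheduler opts in at definition time; leaving the field at zero selects the 32k default at load. A minimal sketch (the scheduler name and length here are illustrative, not from the patch):

        SEC(".struct_ops.link")
        struct sched_ext_ops sketch_ops = {
                .exit_dump_len  = 65536,        /* 0 would select SCX_EXIT_DUMP_DFL_LEN */
                .name           = "sketch",
        };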
@@ -2888,12 +2898,13 @@ static void scx_ops_bypass(bool bypass)
static void free_exit_info(struct scx_exit_info *ei)
{
+ kfree(ei->dump);
kfree(ei->msg);
kfree(ei->bt);
kfree(ei);
}
-static struct scx_exit_info *alloc_exit_info(void)
+static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
{
struct scx_exit_info *ei;
@@ -2903,8 +2914,9 @@ static struct scx_exit_info *alloc_exit_info(void)
ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
+ ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
- if (!ei->bt || !ei->msg) {
+ if (!ei->bt || !ei->msg || !ei->dump) {
free_exit_info(ei);
return NULL;
}
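free_exit_info() doubles as the unwind path for a partially constructed object here: kfree(NULL) is a no-op, so the new kfree(ei->dump) needs no NULL check when an earlier allocation failed.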
@@ -3104,8 +3116,101 @@ static void scx_ops_disable(enum scx_exit_kind kind)
schedule_scx_ops_disable_work();
}
+static void scx_dump_task(struct seq_buf *s, struct task_struct *p, char marker,
+ unsigned long now)
+{
+ static unsigned long bt[SCX_EXIT_BT_LEN]; /* static: too big for the stack; dumping is serialized by the error irq_work */
+ char dsq_id_buf[19] = "(n/a)"; /* "0x" + 16 hex digits + NUL */
+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
+ unsigned int bt_len;
+ size_t avail, used;
+ char *buf;
+
+ if (p->scx.dsq)
+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
+ (unsigned long long)p->scx.dsq->id);
+
+ seq_buf_printf(s, "\n %c%c %s[%d] %+ldms\n",
+ marker, task_state_to_char(p), p->comm, p->pid,
+ jiffies_delta_msecs(p->scx.runnable_at, now));
+ seq_buf_printf(s, " scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu\n",
+ scx_get_task_state(p),
+ p->scx.flags & ~SCX_TASK_STATE_MASK,
+ ops_state & SCX_OPSS_STATE_MASK,
+ ops_state >> SCX_OPSS_QSEQ_SHIFT);
+ seq_buf_printf(s, " sticky/holding_cpu=%d/%d dsq_id=%s\n",
+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+ seq_buf_printf(s, " cpus=%*pb\n\n", cpumask_pr_args(p->cpus_ptr));
+
+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
+
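+ /* borrow the seq_buf tail for stack_trace_snprint() and commit the
+ * bytes it consumed; committing -1 marks the seq_buf overflowed
+ */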
+ avail = seq_buf_get_buf(s, &buf);
+ used = stack_trace_snprint(buf, avail, bt, bt_len, 3);
+ seq_buf_commit(s, used < avail ? used : -1);
+}
+
+static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
+{
+ const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+ unsigned long now = jiffies;
+ struct seq_buf s;
+ size_t avail, used;
+ char *buf;
+ int cpu;
+
+ if (dump_len <= sizeof(trunc_marker))
+ return;
+
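+ /* reserve room for the truncation marker (NUL included) up front so
+ * it always fits if the dump overflows
+ */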
+ seq_buf_init(&s, ei->dump, dump_len - sizeof(trunc_marker));
+
+ seq_buf_printf(&s, "%s[%d] triggered exit kind %d:\n %s (%s)\n\n",
+ current->comm, current->pid, ei->kind, ei->reason, ei->msg);
+ seq_buf_printf(&s, "Backtrace:\n");
+ avail = seq_buf_get_buf(&s, &buf);
+ used = stack_trace_snprint(buf, avail, ei->bt, ei->bt_len, 1);
+ seq_buf_commit(&s, used < avail ? used : -1);
+
+ seq_buf_printf(&s, "\nRunqueue states\n");
+ seq_buf_printf(&s, "---------------\n");
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct task_struct *p;
+
+ rq_lock(rq, &rf);
+
+ if (list_empty(&rq->scx.runnable_list) &&
+ rq->curr->sched_class == &idle_sched_class)
+ goto next;
+
+ seq_buf_printf(&s, "\nCPU %-4d: nr_run=%u ops_qseq=%lu\n",
+ cpu, rq->scx.nr_running, rq->scx.ops_qseq);
+ seq_buf_printf(&s, " curr=%s[%d] class=%ps\n",
+ rq->curr->comm, rq->curr->pid,
+ rq->curr->sched_class);
+
+ if (rq->curr->sched_class == &ext_sched_class)
+ scx_dump_task(&s, rq->curr, '*', now);
+
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
+ scx_dump_task(&s, p, ' ', now);
+ next:
+ rq_unlock(rq, &rf);
+ }
+
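+ /* on overflow, seq_buf_used() is capped at the size passed to
+ * seq_buf_init(), so the copy below stays within the dump_len
+ * allocation
+ */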
+ if (seq_buf_has_overflowed(&s))
+ memcpy(ei->dump + seq_buf_used(&s) - 1, trunc_marker,
+ sizeof(trunc_marker));
+}
+
static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
{
+ struct scx_exit_info *ei = scx_exit_info;
+
+ if (ei->kind >= SCX_EXIT_ERROR)
+ scx_dump_state(ei, scx_ops.exit_dump_len);
+
schedule_scx_ops_disable_work();
}
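Note that the dump is generated from the error irq_work rather than in the error path itself: errors can be raised from scheduling contexts where it isn't safe to take the rq locks that scx_dump_state() needs for its per-CPU walk. That is also why ei->kind and ei->reason are seeded early below, before scx_ops_disable_workfn() sets them for real.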
@@ -3131,6 +3236,13 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
va_end(args);
+ /*
+ * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
+ * in scx_ops_disable_workfn().
+ */
+ ei->kind = kind;
+ ei->reason = scx_exit_reason(ei->kind);
+
irq_work_queue(&scx_ops_error_irq_work);
}
@@ -3192,7 +3304,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
if (ret < 0)
goto err;
- scx_exit_info = alloc_exit_info();
+ scx_exit_info = alloc_exit_info(ops->exit_dump_len);
if (!scx_exit_info) {
ret = -ENOMEM;
goto err_del;
@@ -3572,6 +3684,10 @@ static int bpf_scx_init_member(const struct btf_type *t,
return -E2BIG;
ops->timeout_ms = *(u32 *)(udata + moff);
return 1;
+ case offsetof(struct sched_ext_ops, exit_dump_len):
+ ops->exit_dump_len =
+ *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
+ return 1;
}
return 0;
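The `?:` above is the GNU "elvis" extension; spelled out, the default substitution is:

        u32 v = *(u32 *)(udata + moff);

        ops->exit_dump_len = v ? v : SCX_EXIT_DUMP_DFL_LEN;

Returning 1 rather than 0 tells the struct_ops init path that the member has been consumed and needs no default handling.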
@@ -126,7 +126,8 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
* and attach it, backward compatibility is automatically maintained where
* reasonable.
*
- * - sched_ext_ops.tick(): Ignored on older kernels with a warning.
+ * - ops.tick(): Ignored on older kernels with a warning.
+ * - ops.exit_dump_len: Cleared to zero on older kernels with a warning.
*/
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \
@@ -136,7 +137,13 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
__skel; \
})
-#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({ \
+#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
+ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
+ if (!__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \
+ (__skel)->struct_ops.__ops_name->exit_dump_len) { \
+ fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \
+ (__skel)->struct_ops.__ops_name->exit_dump_len = 0; \
+ } \
if (!__COMPAT_struct_has_field("sched_ext_ops", "tick") && \
(__skel)->struct_ops.__ops_name->tick) { \
fprintf(stderr, "WARNING: kernel doesn't support ops.tick()\n"); \
@@ -13,6 +13,7 @@
enum uei_sizes {
UEI_REASON_LEN = 128,
UEI_MSG_LEN = 1024,
+ UEI_DUMP_DFL_LEN = 32768,
};
struct user_exit_info {
@@ -28,6 +29,8 @@ struct user_exit_info {
#include <bpf/bpf_core_read.h>
#define UEI_DEFINE(__name) \
+ char RESIZABLE_ARRAY(data, __name##_dump); \
+ const volatile u32 __name##_dump_len; \
struct user_exit_info __name SEC(".data")
#define UEI_RECORD(__uei_name, __ei) ({ \
@@ -35,6 +38,8 @@ struct user_exit_info {
sizeof(__uei_name.reason), (__ei)->reason); \
bpf_probe_read_kernel_str(__uei_name.msg, \
sizeof(__uei_name.msg), (__ei)->msg); \
+ bpf_probe_read_kernel_str(__uei_name##_dump, \
+ __uei_name##_dump_len, (__ei)->dump); \
if (bpf_core_field_exists((__ei)->exit_code)) \
__uei_name.exit_code = (__ei)->exit_code; \
/* use __sync to force memory barrier */ \
@@ -47,6 +52,13 @@ struct user_exit_info {
#include <stdio.h>
#include <stdbool.h>
+/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
+#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \
+ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
+ (__skel)->rodata->__uei_name##_dump_len = __len; \
+ RESIZE_ARRAY(data, __uei_name##_dump, __len); \
+})
+
#define UEI_EXITED(__skel, __uei_name) ({ \
/* use __sync to force memory barrier */ \
__sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \
@@ -54,6 +66,13 @@ struct user_exit_info {
#define UEI_REPORT(__skel, __uei_name) ({ \
struct user_exit_info *__uei = &(__skel)->data->__uei_name; \
+ char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \
+ if (__uei_dump[0] != '\0') { \
+ fputs("\nDEBUG DUMP\n", stderr); \
+ fputs("================================================================================\n\n", stderr); \
+ fputs(__uei_dump, stderr); \
+ fputs("\n================================================================================\n\n", stderr); \
+ } \
fprintf(stderr, "EXIT: %s", __uei->reason); \
if (__uei->msg[0] != '\0') \
fprintf(stderr, " (%s)", __uei->msg); \
@@ -20,7 +20,7 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n"
-" [-d PID] [-p] [-v]\n"
+" [-d PID] [-D LEN] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -28,6 +28,7 @@ const char help_fmt[] =
" -T COUNT Stall every COUNT'th kernel thread\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
+" -D LEN Set scx_exit_info.dump buffer length\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
@@ -59,7 +60,7 @@ int main(int argc, char **argv)
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
- while ((opt = getopt(argc, argv, "s:e:t:T:b:d:pvh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:pvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -81,6 +82,9 @@ int main(int argc, char **argv)
if (skel->rodata->disallow_tgid < 0)
skel->rodata->disallow_tgid = getpid();
break;
+ case 'D':
+ skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0);
+ break;
case 'p':
skel->rodata->switch_partial = true;
skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL;
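Combined with the existing -e flag, this exercises the whole path end to end: for example, `scx_qmap -D 65536 -e 1` sizes the dump buffer at 64k and forces an scx_bpf_error() exit after the first enqueue.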
@@ -94,7 +98,7 @@ int main(int argc, char **argv)
}
}
- SCX_OPS_LOAD(skel, qmap_ops, scx_qmap);
+ SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops);
while (!exit_req && !UEI_EXITED(skel, uei)) {
@@ -80,7 +80,7 @@ int main(int argc, char **argv)
}
}
- SCX_OPS_LOAD(skel, simple_ops, scx_simple);
+ SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
link = SCX_OPS_ATTACH(skel, simple_ops);
while (!exit_req && !UEI_EXITED(skel, uei)) {