new file mode 100644
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sched_ext
+
+#if !defined(_TRACE_SCHED_EXT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SCHED_EXT_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(sched_ext_dump,
+
+ TP_PROTO(const char *line),
+
+ TP_ARGS(line),
+
+ TP_STRUCT__entry(
+ __string(line, line)
+ ),
+
+ TP_fast_assign(
+ __assign_str(line);
+ ),
+
+ TP_printk("%s",
+ __get_str(line)
+ )
+);
+
+#endif /* _TRACE_SCHED_EXT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
@@ -12,6 +12,7 @@ enum scx_consts {
SCX_EXIT_BT_LEN = 64,
SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
};
enum scx_exit_kind {
@@ -48,6 +49,9 @@ struct scx_exit_info {
/* informational message */
char *msg;
+
+ /* debug dump */
+ char *dump;
};
/* sched_ext_ops.flags */
@@ -105,6 +109,17 @@ struct scx_exit_task_args {
bool cancelled;
};
+/*
+ * Informational context provided to dump operations.
+ */
+struct scx_dump_ctx {
+ enum scx_exit_kind kind;
+ s64 exit_code;
+ const char *reason;
+ u64 at_ns;
+ u64 at_jiffies;
+};
+
/**
* struct sched_ext_ops - Operation table for BPF scheduler implementation
*
@@ -296,6 +311,36 @@ struct sched_ext_ops {
*/
void (*disable)(struct task_struct *p);
+ /**
+ * dump - Dump BPF scheduler state on error
+ * @ctx: debug dump context
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+ */
+ void (*dump)(struct scx_dump_ctx *ctx);
+
+ /**
+ * dump_cpu - Dump BPF scheduler state for a CPU on error
+ * @ctx: debug dump context
+ * @cpu: CPU to generate debug dump for
+ * @idle: @cpu is currently idle without any runnable tasks
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @cpu. If @idle is %true and this operation doesn't produce any
+ * output, @cpu is skipped for dump.
+ */
+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+ /**
+ * dump_task - Dump BPF scheduler state for a runnable task on error
+ * @ctx: debug dump context
+ * @p: runnable task to generate debug dump for
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @p.
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
/*
* All online ops must come before ops.init().
*/
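
As an illustration of the skip-on-idle behavior documented for ops.dump_cpu() above, a minimal BPF-side sketch could look like the following. Only BPF_STRUCT_OPS() and scx_bpf_dump() (both provided by the scx common BPF headers touched later in this patch) are part of the actual API; foo_dump_cpu, FOO_MAX_CPUS and nr_dispatched are hypothetical names for this example.

	#define FOO_MAX_CPUS 64
	u64 nr_dispatched[FOO_MAX_CPUS];	/* hypothetical per-CPU dispatch counter */

	void BPF_STRUCT_OPS(foo_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
	{
		/* emitting no output for an idle CPU lets the kernel omit it from the dump */
		if (idle)
			return;
		if (cpu >= 0 && cpu < FOO_MAX_CPUS)
			scx_bpf_dump("FOO: nr_dispatched=%llu", nr_dispatched[cpu]);
	}
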
@@ -330,6 +375,12 @@ struct sched_ext_ops {
*/
u32 timeout_ms;
+ /**
+ * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
/**
* name - BPF scheduler's name
*
@@ -567,10 +618,27 @@ struct scx_bstr_buf {
static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
static struct scx_bstr_buf scx_exit_bstr_buf;
+/* ops debug dump */
+struct scx_dump_data {
+ s32 cpu;
+ bool first;
+ s32 cursor;
+ struct seq_buf *s;
+ const char *prefix;
+ struct scx_bstr_buf buf;
+};
+
+static struct scx_dump_data scx_dump_data = {
+ .cpu = -1,
+};
+
/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;
static struct kobject *scx_root_kobj;
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched_ext.h>
+
static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
s64 exit_code,
const char *fmt, ...);
@@ -2897,12 +2965,13 @@ static void scx_ops_bypass(bool bypass)
static void free_exit_info(struct scx_exit_info *ei)
{
+ kfree(ei->dump);
kfree(ei->msg);
kfree(ei->bt);
kfree(ei);
}
-static struct scx_exit_info *alloc_exit_info(void)
+static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
{
struct scx_exit_info *ei;
@@ -2912,8 +2981,9 @@ static struct scx_exit_info *alloc_exit_info(void)
ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
+ ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
- if (!ei->bt || !ei->msg) {
+ if (!ei->bt || !ei->msg || !ei->dump) {
free_exit_info(ei);
return NULL;
}
@@ -3125,8 +3195,274 @@ static void scx_ops_disable(enum scx_exit_kind kind)
schedule_scx_ops_disable_work();
}
+static void dump_newline(struct seq_buf *s)
+{
+ trace_sched_ext_dump("");
+
+ /* @s may be zero sized and seq_buf triggers WARN if so */
+ if (s->size)
+ seq_buf_putc(s, '\n');
+}
+
+static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
+{
+ va_list args;
+
+#ifdef CONFIG_TRACEPOINTS
+ if (trace_sched_ext_dump_enabled()) {
+ /* protected by scx_dump_state()::dump_lock */
+ static char line_buf[SCX_EXIT_MSG_LEN];
+
+ va_start(args, fmt);
+ vscnprintf(line_buf, sizeof(line_buf), fmt, args);
+ va_end(args);
+
+ trace_sched_ext_dump(line_buf);
+ }
+#endif
+ /* @s may be zero sized and seq_buf triggers WARN if so */
+ if (s->size) {
+ va_start(args, fmt);
+ seq_buf_vprintf(s, fmt, args);
+ va_end(args);
+
+ seq_buf_putc(s, '\n');
+ }
+}
+
+static void dump_stack_trace(struct seq_buf *s, const char *prefix,
+ const unsigned long *bt, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; i++)
+ dump_line(s, "%s%pS", prefix, (void *)bt[i]);
+}
+
+static void ops_dump_init(struct seq_buf *s, const char *prefix)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+
+ lockdep_assert_irqs_disabled();
+
+ dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */
+ dd->first = true;
+ dd->cursor = 0;
+ dd->s = s;
+ dd->prefix = prefix;
+}
+
+static void ops_dump_flush(void)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+ char *line = dd->buf.line;
+
+ if (!dd->cursor)
+ return;
+
+ /*
+ * There's something to flush and this is the first line. Insert a blank
+ * line to distinguish the ops dump.
+ */
+ if (dd->first) {
+ dump_newline(dd->s);
+ dd->first = false;
+ }
+
+ /*
+ * There may be multiple lines in $line. Scan and emit each line
+ * separately.
+ */
+ while (true) {
+ char *end = line;
+ char c;
+
+ while (*end != '\n' && *end != '\0')
+ end++;
+
+ /*
+ * If $line overflowed, it may not have a newline at the end.
+ * Always emit with a newline.
+ */
+ c = *end;
+ *end = '\0';
+ dump_line(dd->s, "%s%s", dd->prefix, line);
+ if (c == '\0')
+ break;
+
+ /* move to the next line */
+ end++;
+ if (*end == '\0')
+ break;
+ line = end;
+ }
+
+ dd->cursor = 0;
+}
+
+static void ops_dump_exit(void)
+{
+ ops_dump_flush();
+ scx_dump_data.cpu = -1;
+}
+
+static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
+ struct task_struct *p, char marker)
+{
+ static unsigned long bt[SCX_EXIT_BT_LEN];
+ char dsq_id_buf[19] = "(n/a)";
+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
+ unsigned int bt_len;
+
+ if (p->scx.dsq)
+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
+ (unsigned long long)p->scx.dsq->id);
+
+ dump_newline(s);
+ dump_line(s, " %c%c %s[%d] %+ldms",
+ marker, task_state_to_char(p), p->comm, p->pid,
+ jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
+ dump_line(s, " scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu",
+ scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
+ ops_state & SCX_OPSS_STATE_MASK,
+ ops_state >> SCX_OPSS_QSEQ_SHIFT);
+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+ dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
+
+ if (SCX_HAS_OP(dump_task)) {
+ ops_dump_init(s, " ");
+ SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+ ops_dump_exit();
+ }
+
+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
+ if (bt_len) {
+ dump_newline(s);
+ dump_stack_trace(s, " ", bt, bt_len);
+ }
+}
+
+static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
+{
+ static DEFINE_SPINLOCK(dump_lock);
+ static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+ struct scx_dump_ctx dctx = {
+ .kind = ei->kind,
+ .exit_code = ei->exit_code,
+ .reason = ei->reason,
+ .at_ns = ktime_get_ns(),
+ .at_jiffies = jiffies,
+ };
+ struct seq_buf s;
+ unsigned long flags;
+ char *buf;
+ int cpu;
+
+ spin_lock_irqsave(&dump_lock, flags);
+
+ seq_buf_init(&s, ei->dump, dump_len);
+
+ if (ei->kind == SCX_EXIT_NONE) {
+ dump_line(&s, "Debug dump triggered by %s", ei->reason);
+ } else {
+ dump_line(&s, "%s[%d] triggered exit kind %d:",
+ current->comm, current->pid, ei->kind);
+ dump_line(&s, " %s (%s)", ei->reason, ei->msg);
+ dump_newline(&s);
+ dump_line(&s, "Backtrace:");
+ dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
+ }
+
+ if (SCX_HAS_OP(dump)) {
+ ops_dump_init(&s, "");
+ SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+ ops_dump_exit();
+ }
+
+ dump_newline(&s);
+ dump_line(&s, "CPU states");
+ dump_line(&s, "----------");
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct task_struct *p;
+ struct seq_buf ns;
+ size_t avail, used;
+ bool idle;
+
+ rq_lock(rq, &rf);
+
+ idle = list_empty(&rq->scx.runnable_list) &&
+ rq->curr->sched_class == &idle_sched_class;
+
+ if (idle && !SCX_HAS_OP(dump_cpu))
+ goto next;
+
+ /*
+ * We don't yet know whether ops.dump_cpu() will produce output
+ * and we may want to skip the default CPU dump if it doesn't.
+ * Use a nested seq_buf to generate the standard dump so that we
+ * can decide whether to commit later.
+ */
+ avail = seq_buf_get_buf(&s, &buf);
+ seq_buf_init(&ns, buf, avail);
+
+ dump_newline(&ns);
+ dump_line(&ns, "CPU %-4d: nr_run=%u ops_qseq=%lu",
+ cpu, rq->scx.nr_running, rq->scx.ops_qseq);
+ dump_line(&ns, " curr=%s[%d] class=%ps",
+ rq->curr->comm, rq->curr->pid,
+ rq->curr->sched_class);
+
+ used = seq_buf_used(&ns);
+ if (SCX_HAS_OP(dump_cpu)) {
+ ops_dump_init(&ns, " ");
+ SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+ ops_dump_exit();
+ }
+
+ /*
+ * If idle && nothing generated by ops.dump_cpu(), there's
+ * nothing interesting. Skip.
+ */
+ if (idle && used == seq_buf_used(&ns))
+ goto next;
+
+ /*
+ * $s may already have overflowed when $ns was created. If so,
+ * calling commit on it will trigger BUG.
+ */
+ if (avail) {
+ seq_buf_commit(&s, seq_buf_used(&ns));
+ if (seq_buf_has_overflowed(&ns))
+ seq_buf_set_overflow(&s);
+ }
+
+ if (rq->curr->sched_class == &ext_sched_class)
+ scx_dump_task(&s, &dctx, rq->curr, '*');
+
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
+ scx_dump_task(&s, &dctx, p, ' ');
+ next:
+ rq_unlock(rq, &rf);
+ }
+
+ if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
+ memcpy(ei->dump + dump_len - sizeof(trunc_marker),
+ trunc_marker, sizeof(trunc_marker));
+
+ spin_unlock_irqrestore(&dump_lock, flags);
+}
+
static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
{
+ struct scx_exit_info *ei = scx_exit_info;
+
+ if (ei->kind >= SCX_EXIT_ERROR)
+ scx_dump_state(ei, scx_ops.exit_dump_len);
+
schedule_scx_ops_disable_work();
}
@@ -3152,6 +3488,13 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
va_end(args);
+ /*
+ * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
+ * in scx_ops_disable_workfn().
+ */
+ ei->kind = kind;
+ ei->reason = scx_exit_reason(ei->kind);
+
irq_work_queue(&scx_ops_error_irq_work);
}
@@ -3213,7 +3556,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ret < 0)
goto err;
- scx_exit_info = alloc_exit_info();
+ scx_exit_info = alloc_exit_info(ops->exit_dump_len);
if (!scx_exit_info) {
ret = -ENOMEM;
goto err_del;
@@ -3592,6 +3935,10 @@ static int bpf_scx_init_member(const struct btf_type *t,
return -E2BIG;
ops->timeout_ms = *(u32 *)(udata + moff);
return 1;
+ case offsetof(struct sched_ext_ops, exit_dump_len):
+ ops->exit_dump_len =
+ *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
+ return 1;
}
return 0;
@@ -3723,6 +4070,21 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
.enable_mask = SYSRQ_ENABLE_RTNICE,
};
+static void sysrq_handle_sched_ext_dump(u8 key)
+{
+ struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
+
+ if (scx_enabled())
+ scx_dump_state(&ei, 0);
+}
+
+static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
+ .handler = sysrq_handle_sched_ext_dump,
+ .help_msg = "dump-sched-ext(D)",
+ .action_msg = "Trigger sched_ext debug dump",
+ .enable_mask = SYSRQ_ENABLE_RTNICE,
+};
+
/**
* print_scx_info - print out sched_ext scheduler state
* @log_lvl: the log level to use when printing
@@ -3793,6 +4155,7 @@ void __init init_sched_ext_class(void)
}
register_sysrq_key('S', &sysrq_sched_ext_reset_op);
+ register_sysrq_key('D', &sysrq_sched_ext_dump_op);
INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
}
@@ -4218,6 +4581,57 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
+/**
+ * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
+ * @fmt: format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * To be called through the scx_bpf_dump() helper from ops.dump(),
+ * ops.dump_cpu() and ops.dump_task() to generate extra debug dump specific
+ * to the BPF scheduler.
+ *
+ * The extra dump may be multiple lines. A single line may be split over
+ * multiple calls. The last line is automatically terminated.
+ */
+__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
+ u32 data__sz)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+ struct scx_bstr_buf *buf = &dd->buf;
+ s32 ret;
+
+ if (raw_smp_processor_id() != dd->cpu) {
+ scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ return;
+ }
+
+ /* append the formatted string to the line buf */
+ ret = __bstr_format(buf->data, buf->line + dd->cursor,
+ sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
+ if (ret < 0) {
+ dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
+ dd->prefix, fmt, data, data__sz, ret);
+ return;
+ }
+
+ dd->cursor += ret;
+ dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
+
+ if (!dd->cursor)
+ return;
+
+ /*
+ * If the line buf overflowed or ends in a newline, flush it into the
+ * dump. This allows the caller to build a single line over multiple
+ * calls. As ops_dump_flush() can also handle multiple lines in the line
+ * buf, the only case which can lead to an unexpected truncation is when
+ * the caller repeatedly emits newlines in the middle of a line rather
+ * than at the end. Don't do that.
+ */
+ if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
+ ops_dump_flush();
+}
+
/**
* scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
*
@@ -4426,6 +4840,7 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
@@ -38,6 +38,7 @@ s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
+void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
@@ -97,6 +98,17 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
___scx_bpf_bstr_format_checker(fmt, ##args); \
})
+/*
+ * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments
+ * instead of an array of u64. To be used from ops.dump() and friends.
+ */
+#define scx_bpf_dump(fmt, args...) \
+({ \
+ scx_bpf_bstr_preamble(fmt, args) \
+ scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \
+ ___scx_bpf_bstr_format_checker(fmt, ##args); \
+})
+
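
For illustration only — the line-split-across-calls behavior described in the scx_bpf_dump_bstr() kernel-doc can be used from ops.dump() as in the sketch below. foo_dump, foo_nr_pending and foo_pending_pid are made-up names; scx_bpf_dump(), BPF_STRUCT_OPS() and bpf_for() are the documented helpers.

	u32 foo_nr_pending;		/* hypothetical state maintained by the scheduler */
	s32 foo_pending_pid[16];

	void BPF_STRUCT_OPS(foo_dump, struct scx_dump_ctx *dctx)
	{
		s32 i;

		/* one output line assembled across several scx_bpf_dump() calls */
		scx_bpf_dump("FOO: pending pids:");
		bpf_for(i, 0, foo_nr_pending) {
			if (i >= 16)
				break;
			scx_bpf_dump(" %d", foo_pending_pid[i]);
		}
		scx_bpf_dump("\n");

		/* the last line doesn't need an explicit newline */
		scx_bpf_dump("FOO: %u pid(s) pending", foo_nr_pending);
	}
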
#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
BPF_PROG(name, ##args)
@@ -111,16 +111,23 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
* is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
* and attach it, backward compatibility is automatically maintained where
* reasonable.
+ *
+ * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
+ * the current minimum required kernel version.
*/
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \
\
+ SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \
+ "sched_ext_ops.dump() missing, kernel too old?"); \
+ \
__skel = __scx_name##__open(); \
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
__skel; \
})
-#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({ \
+#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
+ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \
})
@@ -13,6 +13,7 @@
enum uei_sizes {
UEI_REASON_LEN = 128,
UEI_MSG_LEN = 1024,
+ UEI_DUMP_DFL_LEN = 32768,
};
struct user_exit_info {
@@ -28,6 +29,8 @@ struct user_exit_info {
#include <bpf/bpf_core_read.h>
#define UEI_DEFINE(__name) \
+ char RESIZABLE_ARRAY(data, __name##_dump); \
+ const volatile u32 __name##_dump_len; \
struct user_exit_info __name SEC(".data")
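
For context, the BPF-side pairing used by the bundled example schedulers looks roughly like the sketch below (foo_exit is a stand-in name); with this change, UEI_RECORD() also snapshots the kernel's debug dump into the resizable array declared by UEI_DEFINE().

	UEI_DEFINE(uei);

	void BPF_STRUCT_OPS(foo_exit, struct scx_exit_info *ei)
	{
		UEI_RECORD(uei, ei);	/* copies reason, msg and now the debug dump */
	}
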
#define UEI_RECORD(__uei_name, __ei) ({ \
@@ -35,6 +38,8 @@ struct user_exit_info {
sizeof(__uei_name.reason), (__ei)->reason); \
bpf_probe_read_kernel_str(__uei_name.msg, \
sizeof(__uei_name.msg), (__ei)->msg); \
+ bpf_probe_read_kernel_str(__uei_name##_dump, \
+ __uei_name##_dump_len, (__ei)->dump); \
if (bpf_core_field_exists((__ei)->exit_code)) \
__uei_name.exit_code = (__ei)->exit_code; \
/* use __sync to force memory barrier */ \
@@ -47,6 +52,13 @@ struct user_exit_info {
#include <stdio.h>
#include <stdbool.h>
+/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
+#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \
+ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
+ (__skel)->rodata->__uei_name##_dump_len = __len; \
+ RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \
+})
+
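
As a usage note, a loader that doesn't go through SCX_OPS_LOAD() could do the sizing by hand before loading the skeleton; a sketch assuming the scx_qmap skeleton and the qmap_ops/uei names used elsewhere in this patch:

	UEI_SET_SIZE(skel, qmap_ops, uei);
	SCX_BUG_ON(scx_qmap__load(skel), "Failed to load skel");
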
#define UEI_EXITED(__skel, __uei_name) ({ \
/* use __sync to force memory barrier */ \
__sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \
@@ -54,6 +66,13 @@ struct user_exit_info {
#define UEI_REPORT(__skel, __uei_name) ({ \
struct user_exit_info *__uei = &(__skel)->data->__uei_name; \
+ char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \
+ if (__uei_dump[0] != '\0') { \
+ fputs("\nDEBUG DUMP\n", stderr); \
+ fputs("================================================================================\n\n", stderr); \
+ fputs(__uei_dump, stderr); \
+ fputs("\n================================================================================\n\n", stderr); \
+ } \
fprintf(stderr, "EXIT: %s", __uei->reason); \
if (__uei->msg[0] != '\0') \
fprintf(stderr, " (%s)", __uei->msg); \
@@ -33,6 +33,7 @@ const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_batch;
const volatile s32 disallow_tgid;
+const volatile bool suppress_dump;
u32 test_error_cnt;
@@ -258,6 +259,56 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
return -ENOMEM;
}
+void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
+{
+ s32 i, pid;
+
+ if (suppress_dump)
+ return;
+
+ bpf_for(i, 0, 5) {
+ void *fifo;
+
+ if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
+ return;
+
+ scx_bpf_dump("QMAP FIFO[%d]:", i);
+ bpf_repeat(4096) {
+ if (bpf_map_pop_elem(fifo, &pid))
+ break;
+ scx_bpf_dump(" %d", pid);
+ }
+ scx_bpf_dump("\n");
+ }
+}
+
+void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
+{
+ u32 zero = 0;
+ struct cpu_ctx *cpuc;
+
+ if (suppress_dump || idle)
+ return;
+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
+ return;
+
+ scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu",
+ cpuc->dsp_idx, cpuc->dsp_cnt);
+}
+
+void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
+{
+ struct task_ctx *taskc;
+
+ if (suppress_dump)
+ return;
+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
+ return;
+
+ scx_bpf_dump("QMAP: force_local=%d",
+ taskc->force_local);
+}
+
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
return scx_bpf_create_dsq(SHARED_DSQ, -1);
@@ -274,6 +325,9 @@ SCX_OPS_DEFINE(qmap_ops,
.dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch,
.init_task = (void *)qmap_init_task,
+ .dump = (void *)qmap_dump,
+ .dump_cpu = (void *)qmap_dump_cpu,
+ .dump_task = (void *)qmap_dump_task,
.init = (void *)qmap_init,
.exit = (void *)qmap_exit,
.timeout_ms = 5000U,
@@ -20,7 +20,7 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n"
-" [-d PID] [-p] [-v]\n"
+" [-d PID] [-D LEN] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -28,6 +28,8 @@ const char help_fmt[] =
" -T COUNT Stall every COUNT'th kernel thread\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
+" -D LEN Set scx_exit_info.dump buffer length\n"
+" -S Suppress qmap-specific debug dump\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
@@ -59,7 +61,7 @@ int main(int argc, char **argv)
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
- while ((opt = getopt(argc, argv, "s:e:t:T:b:d:pvh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:Spvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -81,6 +83,12 @@ int main(int argc, char **argv)
if (skel->rodata->disallow_tgid < 0)
skel->rodata->disallow_tgid = getpid();
break;
+ case 'D':
+ skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0);
+ break;
+ case 'S':
+ skel->rodata->suppress_dump = true;
+ break;
case 'p':
skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
break;
@@ -93,7 +101,7 @@ int main(int argc, char **argv)
}
}
- SCX_OPS_LOAD(skel, qmap_ops, scx_qmap);
+ SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
while (!exit_req && !UEI_EXITED(skel, uei)) {
@@ -80,7 +80,7 @@ int main(int argc, char **argv)
}
}
- SCX_OPS_LOAD(skel, simple_ops, scx_simple);
+ SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple);
while (!exit_req && !UEI_EXITED(skel, uei)) {