@@ -8,6 +8,7 @@
enum scx_consts {
SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
SCX_EXIT_BT_LEN = 64,
@@ -598,6 +599,7 @@ static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx);
static struct kset *scx_kset;
static struct kobject *scx_root_kobj;
+static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
s64 exit_code,
const char *fmt, ...);
@@ -1840,6 +1842,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
struct scx_rq *scx_rq = &rq->scx;
struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ int nr_loops = SCX_DSP_MAX_LOOPS;
bool has_tasks = false;
lockdep_assert_rq_held(rq);
@@ -1896,6 +1899,20 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
goto has_tasks;
if (consume_dispatch_q(rq, rf, &scx_dsq_global))
goto has_tasks;
+
+ /*
+ * ops.dispatch() can trap us in this loop by repeatedly
+ * dispatching ineligible tasks. Break out once in a while to
+ * allow the watchdog to run. As IRQ can't be enabled in
+ * balance(), we want to complete this scheduling cycle and then
+ * start a new one. IOW, we want to call resched_curr() on the
+ * next, most likely idle, task, not the current one. Use
+ * scx_bpf_kick_cpu() for deferred kicking.
+ */
+ if (unlikely(!--nr_loops)) {
+ scx_bpf_kick_cpu(cpu_of(rq), 0);
+ break;
+ }
} while (dspc->nr_tasks);
goto out;
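(The hunks above land in the sched_ext core, kernel/sched/ext.c; the hunks below update the scx_qmap example scheduler to exercise them.) Condensed into a standalone sketch, the new guard looks roughly like the following; local_dsq_has_work(), call_ops_dispatch() and dispatched_anything() are hypothetical stand-ins for the real consume/dispatch calls, not kernel API:

static bool bounded_dispatch_sketch(struct rq *rq)
{
	int nr_loops = SCX_DSP_MAX_LOOPS;	/* 32, per the first hunk */

	do {
		/* Found something runnable on a DSQ? We're done. */
		if (local_dsq_has_work(rq))		/* hypothetical */
			return true;

		/* Ask the BPF scheduler for more; it may misbehave. */
		call_ops_dispatch(rq);			/* hypothetical */

		/*
		 * IRQs stay disabled throughout balance(), so rather than
		 * poking the current task, defer a kick: the resched then
		 * lands on the next, most likely idle, pick, a fresh
		 * scheduling cycle starts, and the watchdog gets to run.
		 */
		if (unlikely(!--nr_loops)) {
			scx_bpf_kick_cpu(cpu_of(rq), 0);
			return false;
		}
	} while (dispatched_anything(rq));		/* hypothetical */

	return false;
}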
@@ -31,6 +31,7 @@ char _license[] SEC("license") = "GPL";
const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
+const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile s32 disallow_tgid;
const volatile bool switch_partial;
@@ -198,6 +199,20 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
if (scx_bpf_consume(SHARED_DSQ))
return;
+ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
+ /*
+ * PID 2 should be kthreadd, which should mostly be idle and off
+ * the scheduler. Let's keep dispatching it to force the kernel
+ * to call this function over and over again.
+ */
+ p = bpf_task_from_pid(2);
+ if (p) {
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
+ bpf_task_release(p);
+ return;
+ }
+ }
+
if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
scx_bpf_error("failed to look up cpu_ctx");
return;
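The bpf_task_from_pid()/bpf_task_release() pair above follows the BPF referenced-pointer contract: the lookup returns an acquired task reference or NULL, and every non-NULL result must be released before the program exits or the verifier refuses to load it. A minimal standalone sketch of that contract, assuming vmlinux.h/libbpf and a tracing program type (the kfunc prototypes below mirror the kernel's but are normally supplied by a common header; check them against your tree):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* kfunc prototypes; usually pulled in from a shared header */
struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
void bpf_task_release(struct task_struct *p) __ksym;

SEC("tp_btf/sched_switch")
int kthreadd_peek(u64 *ctx)
{
	struct task_struct *p;

	p = bpf_task_from_pid(2);	/* PID 2: kthreadd */
	if (!p)
		return 0;		/* lookup failed; nothing to release */

	bpf_printk("kthreadd state=%u", p->__state);
	bpf_task_release(p);		/* mandatory for acquired references */
	return 0;
}

char _license[] SEC("license") = "GPL";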
@@ -19,13 +19,14 @@ const char help_fmt[] =
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
-"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n"
+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
" [-d PID] [-D LEN] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
@@ -60,7 +61,7 @@ int main(int argc, char **argv)
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
- while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:pvh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:d:D:pvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -74,6 +75,9 @@ int main(int argc, char **argv)
case 'T':
skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0);
break;
+ case 'l':
+ skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0);
+ break;
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
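With the flag wired through, invoking the loader as "scx_qmap -l 1000" (as root) should make qmap_dispatch() start feeding kthreadd back to the local DSQ after 1000 normal dispatches. That reproduces exactly the pathological ops.dispatch() behavior described in the first hunks, and the SCX_DSP_MAX_LOOPS bailout is what keeps balance_scx() from spinning in it indefinitely.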