summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--kernel/sched/ext.c95
-rw-r--r--kernel/sched/ext_idle.c2
-rw-r--r--kernel/sched/sched.h3
-rw-r--r--tools/testing/selftests/sched_ext/Makefile1
-rw-r--r--tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c68
-rw-r--r--tools/testing/selftests/sched_ext/cyclic_kick_wait.c194
6 files changed, 337 insertions, 26 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 26a6ac2f8826..d5bdcdb3f700 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
{
struct scx_sched *sch = scx_root;
- /* see kick_cpus_irq_workfn() */
+ /* see kick_sync_wait_bal_cb() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
update_curr_scx(rq);
@@ -2447,6 +2447,48 @@ switch_class:
switch_class(rq, next);
}
+static void kick_sync_wait_bal_cb(struct rq *rq)
+{
+ struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
+ unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
+ bool waited;
+ s32 cpu;
+
+ /*
+ * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
+ * — a target CPU may be waiting for us to process an IPI (e.g. TLB
+ * flush) while we wait for its kick_sync to advance.
+ *
+ * Also, keep advancing our own kick_sync so that new kick_sync waits
+ * targeting us, which can start after we drop the lock, cannot form
+ * cyclic dependencies.
+ */
+retry:
+ waited = false;
+ for_each_cpu(cpu, rq->scx.cpus_to_sync) {
+ /*
+ * smp_load_acquire() pairs with smp_store_release() on
+ * kick_sync updates on the target CPUs.
+ */
+ if (cpu == cpu_of(rq) ||
+ smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
+ cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
+ continue;
+ }
+
+ raw_spin_rq_unlock_irq(rq);
+ while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+ cpu_relax();
+ }
+ raw_spin_rq_lock_irq(rq);
+ waited = true;
+ }
+
+ if (waited)
+ goto retry;
+}
+
static struct task_struct *first_local_task(struct rq *rq)
{
return list_first_entry_or_null(&rq->scx.local_dsq.list,
@@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
bool keep_prev;
struct task_struct *p;
- /* see kick_cpus_irq_workfn() */
+ /* see kick_sync_wait_bal_cb() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
rq_modified_begin(rq, &ext_sched_class);
@@ -2471,6 +2513,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
maybe_queue_balance_callback(rq);
/*
+ * Defer to a balance callback which can drop rq lock and enable
+ * IRQs. Waiting directly in the pick path would deadlock against
+ * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
+ */
+ if (unlikely(rq->scx.kick_sync_pending)) {
+ rq->scx.kick_sync_pending = false;
+ queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
+ kick_sync_wait_bal_cb);
+ }
+
+ /*
* If any higher-priority sched class enqueued a runnable task on
* this rq during balance_one(), abort and return RETRY_TASK, so
* that the scheduler loop can restart.
@@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
if (!cpumask_empty(rq->scx.cpus_to_wait))
dump_line(&ns, " cpus_to_wait : %*pb",
cpumask_pr_args(rq->scx.cpus_to_wait));
+ if (!cpumask_empty(rq->scx.cpus_to_sync))
+ dump_line(&ns, " cpus_to_sync : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_sync));
used = seq_buf_used(&ns);
if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
if (cur_class == &ext_sched_class) {
+ cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
ksyncs[cpu] = rq->scx.kick_sync;
should_wait = true;
- } else {
- cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
resched_curr(rq);
@@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
}
- if (!should_wait)
- return;
-
- for_each_cpu(cpu, this_scx->cpus_to_wait) {
- unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
-
- /*
- * Busy-wait until the task running at the time of kicking is no
- * longer running. This can be used to implement e.g. core
- * scheduling.
- *
- * smp_cond_load_acquire() pairs with store_releases in
- * pick_task_scx() and put_prev_task_scx(). The former breaks
- * the wait if SCX's scheduling path is entered even if the same
- * task is picked subsequently. The latter is necessary to break
- * the wait when $cpu is taken by a higher sched class.
- */
- if (cpu != cpu_of(this_rq))
- smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
-
- cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ /*
+ * Can't wait in hardirq — kick_sync can't advance, deadlocking if
+ * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
+ */
+ if (should_wait) {
+ raw_spin_rq_lock(this_rq);
+ this_scx->kick_sync_pending = true;
+ resched_curr(this_rq);
+ raw_spin_rq_unlock(this_rq);
}
}
@@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index ba298ac3ce6c..0ae93cd64004 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -543,7 +543,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
* piled up on it even if there is an idle core elsewhere on
* the system.
*/
- waker_node = cpu_to_node(cpu);
+ waker_node = scx_cpu_node_if_enabled(cpu);
if (!(current->flags & PF_EXITING) &&
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca..1ef9ba480f51 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,9 +805,12 @@ struct scx_rq {
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
+ cpumask_var_t cpus_to_sync;
+ bool kick_sync_pending;
unsigned long kick_sync;
local_t reenq_local_deferred;
struct balance_callback deferred_bal_cb;
+ struct balance_callback kick_sync_bal_cb;
struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
struct scx_dispatch_q bypass_dsq;
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 006300ac6dff..1c9ca328cca1 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -188,6 +188,7 @@ auto-test-targets := \
rt_stall \
test_example \
total_bw \
+ cyclic_kick_wait \
testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
diff --git a/tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c b/tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c
new file mode 100644
index 000000000000..cb34d3335917
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Stress concurrent SCX_KICK_WAIT calls to reproduce wait-cycle deadlock.
+ *
+ * Three CPUs are designated from userspace. Every enqueue from one of the
+ * three CPUs kicks the next CPU in the ring with SCX_KICK_WAIT, creating a
+ * persistent A -> B -> C -> A wait cycle pressure.
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+const volatile s32 test_cpu_a;
+const volatile s32 test_cpu_b;
+const volatile s32 test_cpu_c;
+
+u64 nr_enqueues;
+u64 nr_wait_kicks;
+
+UEI_DEFINE(uei);
+
+static s32 target_cpu(s32 cpu)
+{
+ if (cpu == test_cpu_a)
+ return test_cpu_b;
+ if (cpu == test_cpu_b)
+ return test_cpu_c;
+ if (cpu == test_cpu_c)
+ return test_cpu_a;
+ return -1;
+}
+
+void BPF_STRUCT_OPS(cyclic_kick_wait_enqueue, struct task_struct *p,
+ u64 enq_flags)
+{
+ s32 this_cpu = bpf_get_smp_processor_id();
+ s32 tgt;
+
+ __sync_fetch_and_add(&nr_enqueues, 1);
+
+ if (p->flags & PF_KTHREAD) {
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
+ enq_flags | SCX_ENQ_PREEMPT);
+ return;
+ }
+
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+
+ tgt = target_cpu(this_cpu);
+ if (tgt < 0 || tgt == this_cpu)
+ return;
+
+ __sync_fetch_and_add(&nr_wait_kicks, 1);
+ scx_bpf_kick_cpu(tgt, SCX_KICK_WAIT);
+}
+
+void BPF_STRUCT_OPS(cyclic_kick_wait_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops cyclic_kick_wait_ops = {
+ .enqueue = cyclic_kick_wait_enqueue,
+ .exit = cyclic_kick_wait_exit,
+ .name = "cyclic_kick_wait",
+ .timeout_ms = 1000U,
+};
diff --git a/tools/testing/selftests/sched_ext/cyclic_kick_wait.c b/tools/testing/selftests/sched_ext/cyclic_kick_wait.c
new file mode 100644
index 000000000000..c2e5aa9de715
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/cyclic_kick_wait.c
@@ -0,0 +1,194 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Test SCX_KICK_WAIT forward progress under cyclic wait pressure.
+ *
+ * SCX_KICK_WAIT busy-waits until the target CPU enters the scheduling path.
+ * If multiple CPUs form a wait cycle (A waits for B, B waits for C, C waits
+ * for A), all CPUs deadlock unless the implementation breaks the cycle.
+ *
+ * This test creates that scenario: three CPUs are arranged in a ring. The BPF
+ * scheduler's ops.enqueue() kicks the next CPU in the ring with SCX_KICK_WAIT
+ * on every enqueue. Userspace pins 4 worker threads per CPU that loop calling
+ * sched_yield(), generating a steady stream of enqueues and thus sustained
+ * A->B->C->A kick_wait cycle pressure. The test passes if the system remains
+ * responsive for 5 seconds without the scheduler being killed by the watchdog.
+ */
+#define _GNU_SOURCE
+
+#include <bpf/bpf.h>
+#include <errno.h>
+#include <pthread.h>
+#include <sched.h>
+#include <scx/common.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "scx_test.h"
+#include "cyclic_kick_wait.bpf.skel.h"
+
+#define WORKERS_PER_CPU 4
+#define NR_TEST_CPUS 3
+#define NR_WORKERS (NR_TEST_CPUS * WORKERS_PER_CPU)
+
+struct worker_ctx {
+ pthread_t tid;
+ int cpu;
+ volatile bool stop;
+ volatile __u64 iters;
+ bool started;
+};
+
+static void *worker_fn(void *arg)
+{
+ struct worker_ctx *worker = arg;
+ cpu_set_t mask;
+
+ CPU_ZERO(&mask);
+ CPU_SET(worker->cpu, &mask);
+
+ if (sched_setaffinity(0, sizeof(mask), &mask))
+ return (void *)(uintptr_t)errno;
+
+ while (!worker->stop) {
+ sched_yield();
+ worker->iters++;
+ }
+
+ return NULL;
+}
+
+static int join_worker(struct worker_ctx *worker)
+{
+ void *ret;
+ struct timespec ts;
+ int err;
+
+ if (!worker->started)
+ return 0;
+
+ if (clock_gettime(CLOCK_REALTIME, &ts))
+ return -errno;
+
+ ts.tv_sec += 2;
+ err = pthread_timedjoin_np(worker->tid, &ret, &ts);
+ if (err == ETIMEDOUT)
+ pthread_detach(worker->tid);
+ if (err)
+ return -err;
+
+ if ((uintptr_t)ret)
+ return -(int)(uintptr_t)ret;
+
+ return 0;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct cyclic_kick_wait *skel;
+
+ skel = cyclic_kick_wait__open();
+ SCX_FAIL_IF(!skel, "Failed to open skel");
+ SCX_ENUM_INIT(skel);
+
+ *ctx = skel;
+ return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct cyclic_kick_wait *skel = ctx;
+ struct worker_ctx workers[NR_WORKERS] = {};
+ struct bpf_link *link = NULL;
+ enum scx_test_status status = SCX_TEST_PASS;
+ int test_cpus[NR_TEST_CPUS];
+ int nr_cpus = 0;
+ cpu_set_t mask;
+ int ret, i;
+
+ if (sched_getaffinity(0, sizeof(mask), &mask)) {
+ SCX_ERR("Failed to get affinity (%d)", errno);
+ return SCX_TEST_FAIL;
+ }
+
+ for (i = 0; i < CPU_SETSIZE; i++) {
+ if (CPU_ISSET(i, &mask))
+ test_cpus[nr_cpus++] = i;
+ if (nr_cpus == NR_TEST_CPUS)
+ break;
+ }
+
+ if (nr_cpus < NR_TEST_CPUS)
+ return SCX_TEST_SKIP;
+
+ skel->rodata->test_cpu_a = test_cpus[0];
+ skel->rodata->test_cpu_b = test_cpus[1];
+ skel->rodata->test_cpu_c = test_cpus[2];
+
+ if (cyclic_kick_wait__load(skel)) {
+ SCX_ERR("Failed to load skel");
+ return SCX_TEST_FAIL;
+ }
+
+ link = bpf_map__attach_struct_ops(skel->maps.cyclic_kick_wait_ops);
+ if (!link) {
+ SCX_ERR("Failed to attach scheduler");
+ return SCX_TEST_FAIL;
+ }
+
+ for (i = 0; i < NR_WORKERS; i++)
+ workers[i].cpu = test_cpus[i / WORKERS_PER_CPU];
+
+ for (i = 0; i < NR_WORKERS; i++) {
+ ret = pthread_create(&workers[i].tid, NULL, worker_fn, &workers[i]);
+ if (ret) {
+ SCX_ERR("Failed to create worker thread %d (%d)", i, ret);
+ status = SCX_TEST_FAIL;
+ goto out;
+ }
+ workers[i].started = true;
+ }
+
+ sleep(5);
+
+ if (skel->data->uei.kind != EXIT_KIND(SCX_EXIT_NONE)) {
+ SCX_ERR("Scheduler exited unexpectedly (kind=%llu code=%lld)",
+ (unsigned long long)skel->data->uei.kind,
+ (long long)skel->data->uei.exit_code);
+ status = SCX_TEST_FAIL;
+ }
+
+out:
+ for (i = 0; i < NR_WORKERS; i++)
+ workers[i].stop = true;
+
+ for (i = 0; i < NR_WORKERS; i++) {
+ ret = join_worker(&workers[i]);
+ if (ret && status == SCX_TEST_PASS) {
+ SCX_ERR("Failed to join worker thread %d (%d)", i, ret);
+ status = SCX_TEST_FAIL;
+ }
+ }
+
+ if (link)
+ bpf_link__destroy(link);
+
+ return status;
+}
+
+static void cleanup(void *ctx)
+{
+ struct cyclic_kick_wait *skel = ctx;
+
+ cyclic_kick_wait__destroy(skel);
+}
+
+struct scx_test cyclic_kick_wait = {
+ .name = "cyclic_kick_wait",
+ .description = "Verify SCX_KICK_WAIT forward progress under a 3-CPU wait cycle",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&cyclic_kick_wait)