mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-05-05 09:57:21 +02:00
sched_ext: Add basic building blocks for nested sub-scheduler dispatching
This is an early-stage partial implementation that demonstrates the core building blocks for nested sub-scheduler dispatching. While significant work remains in the enqueue path and other areas, this patch establishes the fundamental mechanisms needed for hierarchical scheduler operation.

The key building blocks introduced include:

- Private stack support for ops.dispatch() to prevent stack overflow when walking down nested schedulers during dispatch operations
- scx_bpf_sub_dispatch() kfunc that allows parent schedulers to trigger dispatch operations on their direct child schedulers
- Proper parent-child relationship validation to ensure dispatch requests are only made to legitimate child schedulers
- Updated scx_dispatch_sched() to handle both nested and non-nested invocations with appropriate kf_mask handling

The qmap scheduler is updated to demonstrate the functionality by calling scx_bpf_sub_dispatch() on registered child schedulers when it has no tasks in its own queues.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
This commit is contained in:
+105
-15
@@ -2444,8 +2444,14 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
|
||||
rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
|
||||
}
|
||||
|
||||
static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
|
||||
struct task_struct *prev)
|
||||
/*
|
||||
* One user of this function is scx_bpf_sub_dispatch() which can be called
|
||||
* recursively as sub-sched dispatches nest. Always inline to reduce stack usage
|
||||
* from the call frame.
|
||||
*/
|
||||
static __always_inline bool
|
||||
scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
|
||||
struct task_struct *prev, bool nested)
|
||||
{
|
||||
struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
|
||||
int nr_loops = SCX_DSP_MAX_LOOPS;
|
||||
@@ -2499,8 +2505,23 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
|
||||
do {
|
||||
dspc->nr_tasks = 0;
|
||||
|
||||
SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
|
||||
prev_on_sch ? prev : NULL);
|
||||
if (nested) {
|
||||
/*
|
||||
* If nested, don't update kf_mask as the originating
|
||||
* invocation would already have set it up.
|
||||
*/
|
||||
SCX_CALL_OP(sch, 0, dispatch, rq, cpu,
|
||||
prev_on_sch ? prev : NULL);
|
||||
} else {
|
||||
/*
|
||||
* If not nested, stash @prev so that nested invocations
|
||||
* can access it.
|
||||
*/
|
||||
rq->scx.sub_dispatch_prev = prev;
|
||||
SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
|
||||
prev_on_sch ? prev : NULL);
|
||||
rq->scx.sub_dispatch_prev = NULL;
|
||||
}
|
||||
|
||||
flush_dispatch_buf(sch, rq);
|
||||
|
||||
@@ -2541,7 +2562,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
|
||||
|
||||
static int balance_one(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
struct scx_sched *sch = scx_root, *pos;
|
||||
struct scx_sched *sch = scx_root;
|
||||
s32 cpu = cpu_of(rq);
|
||||
|
||||
lockdep_assert_rq_held(rq);
|
||||
@@ -2585,13 +2606,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
|
||||
if (rq->scx.local_dsq.nr)
|
||||
goto has_tasks;
|
||||
|
||||
/*
|
||||
* TEMPORARY - Dispatch all scheds. This will be replaced by BPF-driven
|
||||
* hierarchical operation.
|
||||
*/
|
||||
list_for_each_entry_rcu(pos, &scx_sched_all, all)
|
||||
if (scx_dispatch_sched(pos, rq, prev))
|
||||
goto has_tasks;
|
||||
if (scx_dispatch_sched(sch, rq, prev, false))
|
||||
goto has_tasks;
|
||||
|
||||
/*
|
||||
* Didn't find another task to run. Keep running @prev unless
|
||||
@@ -4942,9 +4958,8 @@ static void scx_sub_disable(struct scx_sched *sch)
|
||||
|
||||
/*
|
||||
* Guarantee forward progress and wait for descendants to be disabled.
|
||||
* To limit
|
||||
* disruptions, $parent is not bypassed. Tasks are fully prepped and
|
||||
* then inserted back into $parent.
|
||||
* To limit disruptions, $parent is not bypassed. Tasks are fully
|
||||
* prepped and then inserted back into $parent.
|
||||
*/
|
||||
scx_bypass(sch, true);
|
||||
drain_descendants(sch);
|
||||
@@ -6580,6 +6595,20 @@ static int bpf_scx_init_member(const struct btf_type *t,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_EXT_SUB_SCHED
|
||||
static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
|
||||
{
|
||||
struct scx_sched *sch;
|
||||
|
||||
guard(rcu)();
|
||||
sch = scx_prog_sched(prog->aux);
|
||||
if (unlikely(!sch))
|
||||
return;
|
||||
|
||||
scx_error(sch, "dispatch recursion detected");
|
||||
}
|
||||
#endif /* CONFIG_EXT_SUB_SCHED */
|
||||
|
||||
static int bpf_scx_check_member(const struct btf_type *t,
|
||||
const struct btf_member *member,
|
||||
const struct bpf_prog *prog)
|
||||
@@ -6605,6 +6634,22 @@ static int bpf_scx_check_member(const struct btf_type *t,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_EXT_SUB_SCHED
|
||||
/*
|
||||
* Enable private stack for operations that can nest along the
|
||||
* hierarchy.
|
||||
*
|
||||
* XXX - Ideally, we should only do this for scheds that allow
|
||||
* sub-scheds and sub-scheds themselves but I don't know how to access
|
||||
* struct_ops from here.
|
||||
*/
|
||||
switch (moff) {
|
||||
case offsetof(struct sched_ext_ops, dispatch):
|
||||
prog->aux->priv_stack_requested = true;
|
||||
prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
|
||||
}
|
||||
#endif /* CONFIG_EXT_SUB_SCHED */
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -7583,6 +7628,48 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
|
||||
p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_EXT_SUB_SCHED
|
||||
/**
|
||||
* scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
|
||||
* @cgroup_id: cgroup ID of the child scheduler to dispatch
|
||||
* @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
|
||||
*
|
||||
* Allows a parent scheduler to trigger dispatching on one of its direct
|
||||
* child schedulers. The child scheduler runs its dispatch operation to
|
||||
* move tasks from dispatch queues to the local runqueue.
|
||||
*
|
||||
* Returns: true on success, false if cgroup_id is invalid, not a direct
|
||||
* child, or caller lacks dispatch permission.
|
||||
*/
|
||||
__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux)
|
||||
{
|
||||
struct rq *this_rq = this_rq();
|
||||
struct scx_sched *parent, *child;
|
||||
|
||||
guard(rcu)();
|
||||
parent = scx_prog_sched(aux);
|
||||
if (unlikely(!parent))
|
||||
return false;
|
||||
|
||||
if (!scx_kf_allowed(parent, SCX_KF_DISPATCH))
|
||||
return false;
|
||||
|
||||
child = scx_find_sub_sched(cgroup_id);
|
||||
|
||||
if (unlikely(!child))
|
||||
return false;
|
||||
|
||||
if (unlikely(scx_parent(child) != parent)) {
|
||||
scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
|
||||
cgroup_id);
|
||||
return false;
|
||||
}
|
||||
|
||||
return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
|
||||
true);
|
||||
}
|
||||
#endif /* CONFIG_EXT_SUB_SCHED */
|
||||
|
||||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
|
||||
@@ -7593,6 +7680,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
|
||||
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
|
||||
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
|
||||
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
|
||||
#ifdef CONFIG_EXT_SUB_SCHED
|
||||
BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS)
|
||||
#endif
|
||||
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
|
||||
|
||||
static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
|
||||
|
||||
@@ -805,6 +805,9 @@ struct scx_rq {
|
||||
cpumask_var_t cpus_to_preempt;
|
||||
cpumask_var_t cpus_to_wait;
|
||||
unsigned long kick_sync;
|
||||
|
||||
struct task_struct *sub_dispatch_prev;
|
||||
|
||||
struct llist_head deferred_reenq_locals;
|
||||
struct balance_callback deferred_bal_cb;
|
||||
struct irq_work deferred_irq_work;
|
||||
|
||||
@@ -101,6 +101,7 @@ struct rq *scx_bpf_locked_rq(void) __ksym;
|
||||
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
|
||||
u64 scx_bpf_now(void) __ksym __weak;
|
||||
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
|
||||
bool scx_bpf_sub_dispatch(u64 cgroup_id) __ksym __weak;
|
||||
|
||||
/*
|
||||
* Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
|
||||
|
||||
@@ -48,6 +48,9 @@ const volatile bool suppress_dump;
|
||||
u64 nr_highpri_queued;
|
||||
u32 test_error_cnt;
|
||||
|
||||
#define MAX_SUB_SCHEDS 8
|
||||
u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
|
||||
|
||||
UEI_DEFINE(uei);
|
||||
|
||||
struct qmap {
|
||||
@@ -451,6 +454,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
|
||||
cpuc->dsp_cnt = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
|
||||
if (sub_sched_cgroup_ids[i] &&
|
||||
scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* No other tasks. @prev will keep running. Update its core_sched_seq as
|
||||
* if the task were enqueued and dispatched immediately.
|
||||
@@ -895,7 +904,32 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
|
||||
|
||||
s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
|
||||
{
|
||||
return 0;
|
||||
s32 i;
|
||||
|
||||
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
|
||||
if (!sub_sched_cgroup_ids[i]) {
|
||||
sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
|
||||
bpf_printk("attaching sub-sched[%d] on %s",
|
||||
i, args->cgroup_path);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
|
||||
{
|
||||
s32 i;
|
||||
|
||||
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
|
||||
if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
|
||||
sub_sched_cgroup_ids[i] = 0;
|
||||
bpf_printk("detaching sub-sched[%d] on %s",
|
||||
i, args->cgroup_path);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SCX_OPS_DEFINE(qmap_ops,
|
||||
@@ -914,6 +948,7 @@ SCX_OPS_DEFINE(qmap_ops,
|
||||
.cgroup_set_weight = (void *)qmap_cgroup_set_weight,
|
||||
.cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
|
||||
.sub_attach = (void *)qmap_sub_attach,
|
||||
.sub_detach = (void *)qmap_sub_detach,
|
||||
.cpu_online = (void *)qmap_cpu_online,
|
||||
.cpu_offline = (void *)qmap_cpu_offline,
|
||||
.init = (void *)qmap_init,
|
||||
|
||||
Reference in New Issue
Block a user