mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-06-21 15:43:21 +02:00
5839c07e89
Add single-bit test and iterator over set cids in an scx_cmask. v2: Bound scx_cmask_for_each_cid() to the active span. (sashiko AI) Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Andrea Righi <arighi@nvidia.com>
272 lines
9.1 KiB
C
272 lines
9.1 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Topological CPU IDs (cids)
|
|
* --------------------------
|
|
*
|
|
* Raw cpu numbers are clumsy for sharding work and communication across
|
|
* topology units, especially from BPF: the space can be sparse, numerical
|
|
* closeness doesn't imply topological closeness (x86 hyperthreading often puts
|
|
* SMT siblings far apart), and a range of cpu ids doesn't mean anything.
|
|
* Sub-scheds make this acute - cpu allocation, revocation and other state are
|
|
* constantly communicated across sub-scheds, and passing whole cpumasks scales
|
|
* poorly with cpu count. cpumasks are also awkward in BPF: a variable-length
|
|
* kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences
|
|
* for every op.
|
|
*
|
|
* cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or
|
|
* NUMA node get contiguous cid ranges, so a topology unit becomes a (start,
|
|
* length) slice of cid space. Communication can pass a slice instead of a
|
|
* cpumask, and BPF code can process, for example, a u64 word's worth of cids at
|
|
* a time.
|
|
*
|
|
* The mapping is built once at root scheduler enable time by walking the
|
|
* topology of online cpus only. Going by online cpus is out of necessity:
|
|
* depending on the arch, topology info isn't reliably available for offline
|
|
* cpus. The expected usage model is restarting the scheduler on hotplug events
|
|
* so the mapping is rebuilt against the new online set. A scheduler that wants
|
|
* to handle hotplug without a restart can provide its own cid and shard mapping
|
|
* through the override interface.
|
|
*
|
|
* Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
|
|
* Copyright (c) 2026 Tejun Heo <tj@kernel.org>
|
|
*/
|
|
#ifndef _KERNEL_SCHED_EXT_CID_H
|
|
#define _KERNEL_SCHED_EXT_CID_H
|
|
|
|
struct scx_sched;
|
|
|
|
/*
|
|
* Cid space (total is always num_possible_cpus()) is laid out with
|
|
* topology-annotated cids first, then no-topo cids at the tail. The
|
|
* topology-annotated block covers the cpus that were online when scx_cid_init()
|
|
* ran and remains valid even after those cpus go offline. The tail block covers
|
|
* possible-but-not-online cpus and carries all-(-1) topo info (see
|
|
* scx_cid_topo); callers detect it via the -1 sentinels.
|
|
*
|
|
* See the comment above the table definitions in ext_cid.c for the
|
|
* memory-ordering and visibility contract.
|
|
*/
|
|
extern s16 *scx_cid_to_cpu_tbl;
|
|
extern s16 *scx_cpu_to_cid_tbl;
|
|
extern struct scx_cid_topo *scx_cid_topo;
|
|
extern struct btf_id_set8 scx_kfunc_ids_init;
|
|
|
|
void scx_cmask_clear(struct scx_cmask *m);
|
|
void scx_cmask_fill(struct scx_cmask *m);
|
|
void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src);
|
|
void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src);
|
|
void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src);
|
|
void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src);
|
|
void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src);
|
|
void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src);
|
|
bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super);
|
|
bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b);
|
|
bool scx_cmask_empty(const struct scx_cmask *m);
|
|
s32 scx_cid_init(struct scx_sched *sch);
|
|
int scx_cid_kfunc_init(void);
|
|
void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);
|
|
|
|
/**
|
|
* cid_valid - Verify a cid value, to be used on ops input args
|
|
* @sch: scx_sched to abort on error
|
|
* @cid: cid which came from a BPF ops
|
|
*
|
|
* Return true if @cid is in [0, num_possible_cpus()). On failure, trigger
|
|
* scx_error() and return false.
|
|
*/
|
|
static inline bool cid_valid(struct scx_sched *sch, s32 cid)
|
|
{
|
|
if (likely(cid >= 0 && cid < num_possible_cpus()))
|
|
return true;
|
|
scx_error(sch, "invalid cid %d", cid);
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* __scx_cid_to_cpu - Unchecked cid->cpu table lookup
|
|
* @cid: cid to look up. Must be in [0, num_possible_cpus()).
|
|
*
|
|
* Intended for callsites that have already validated @cid and that hold a
|
|
* non-NULL @sch from scx_prog_sched() - a live sched implies the table has
|
|
* been allocated, so no NULL check is needed here.
|
|
*/
|
|
static inline s32 __scx_cid_to_cpu(s32 cid)
|
|
{
|
|
/* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */
|
|
return READ_ONCE(scx_cid_to_cpu_tbl)[cid];
|
|
}
|
|
|
|
/**
|
|
* __scx_cpu_to_cid - Unchecked cpu->cid table lookup
|
|
* @cpu: cpu to look up. Must be a valid possible cpu id.
|
|
*
|
|
* Same usage constraints as __scx_cid_to_cpu().
|
|
*/
|
|
static inline s32 __scx_cpu_to_cid(s32 cpu)
|
|
{
|
|
return READ_ONCE(scx_cpu_to_cid_tbl)[cpu];
|
|
}
|
|
|
|
/**
|
|
* scx_cid_to_cpu - Translate @cid to its cpu
|
|
* @sch: scx_sched for error reporting
|
|
* @cid: cid to look up
|
|
*
|
|
* Return the cpu for @cid or a negative errno on failure. Invalid cid triggers
|
|
* scx_error() on @sch. The cid arrays are allocated on first scheduler enable
|
|
* and never freed, so the returned cpu is stable for the lifetime of the loaded
|
|
* scheduler.
|
|
*/
|
|
static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid)
|
|
{
|
|
if (!cid_valid(sch, cid))
|
|
return -EINVAL;
|
|
return __scx_cid_to_cpu(cid);
|
|
}
|
|
|
|
/**
|
|
* scx_cpu_to_cid - Translate @cpu to its cid
|
|
* @sch: scx_sched for error reporting
|
|
* @cpu: cpu to look up
|
|
*
|
|
* Return the cid for @cpu or a negative errno on failure. Invalid cpu triggers
|
|
* scx_error() on @sch. Same lifetime guarantee as scx_cid_to_cpu().
|
|
*/
|
|
static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu)
|
|
{
|
|
if (!scx_cpu_valid(sch, cpu, NULL))
|
|
return -EINVAL;
|
|
return __scx_cpu_to_cid(cpu);
|
|
}
|
|
|
|
/**
|
|
* scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form
|
|
*/
|
|
static inline bool scx_is_cid_type(void)
|
|
{
|
|
return static_branch_unlikely(&__scx_is_cid_type);
|
|
}
|
|
|
|
static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m)
|
|
{
|
|
return likely(cid >= m->base && cid < m->base + m->nr_cids);
|
|
}
|
|
|
|
/* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */
|
|
static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m)
|
|
{
|
|
return (u64 *)&m->bits[cid / 64 - m->base / 64];
|
|
}
|
|
|
|
/**
|
|
* __scx_cmask_init - Initialize @m with explicit storage capacity
|
|
* @m: cmask to initialize
|
|
* @base: first cid of the active range
|
|
* @nr_cids: number of cids in the active range
|
|
* @alloc_cids: storage capacity in cids, at least @nr_cids
|
|
*
|
|
* Use when storage is sized larger than the initial active range. All of
|
|
* bits[] is zeroed.
|
|
*/
|
|
static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids,
|
|
u32 alloc_cids)
|
|
{
|
|
if (WARN_ON_ONCE(alloc_cids < nr_cids))
|
|
nr_cids = alloc_cids;
|
|
|
|
m->base = base;
|
|
m->nr_cids = nr_cids;
|
|
m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids);
|
|
memset(m->bits, 0, m->alloc_words * sizeof(u64));
|
|
}
|
|
|
|
/**
|
|
* scx_cmask_init - Initialize @m on tight storage
|
|
* @m: cmask to initialize
|
|
* @base: first cid of the active range
|
|
* @nr_cids: number of cids in the active range
|
|
*
|
|
* All of bits[] is zeroed.
|
|
*/
|
|
static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
|
|
{
|
|
__scx_cmask_init(m, base, nr_cids, nr_cids);
|
|
}
|
|
|
|
/**
|
|
* scx_cmask_reframe - Reshape @m's active range without resizing storage
|
|
* @m: cmask to reframe
|
|
* @base: new active range base
|
|
* @nr_cids: new active range length, must fit within @m->alloc_words
|
|
*
|
|
* Body bits within the new range become garbage - only the head and tail
|
|
* words are zeroed to keep the padding invariant.
|
|
*/
|
|
static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
|
|
{
|
|
if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
|
|
return;
|
|
|
|
if (nr_cids) {
|
|
u32 last_word = ((base & 63) + nr_cids - 1) / 64;
|
|
|
|
m->bits[0] = 0;
|
|
m->bits[last_word] = 0;
|
|
}
|
|
|
|
m->base = base;
|
|
m->nr_cids = nr_cids;
|
|
}
|
|
|
|
static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m)
|
|
{
|
|
if (!__scx_cmask_contains(cid, m))
|
|
return;
|
|
*__scx_cmask_word(cid, m) |= BIT_U64(cid & 63);
|
|
}
|
|
|
|
/**
|
|
* scx_cmask_test - test whether @cid is set in @m
|
|
* @cid: cid to test
|
|
* @m: cmask to test
|
|
*
|
|
* Return %false if @cid is outside @m's active range. Otherwise return the
|
|
* bit's value. Read via READ_ONCE so callers can race set/clear writers.
|
|
*/
|
|
static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m)
|
|
{
|
|
if (!__scx_cmask_contains(cid, m))
|
|
return false;
|
|
return READ_ONCE(*__scx_cmask_word(cid, m)) & BIT_U64(cid & 63);
|
|
}
|
|
|
|
/*
|
|
* Words of bits[] the active range spans, 0 if empty. Tighter than the storage
|
|
* SCX_CMASK_NR_WORDS() sizes for the worst-case base alignment.
|
|
*/
|
|
static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m)
|
|
{
|
|
if (!m->nr_cids)
|
|
return 0;
|
|
return ((m->base & 63) + m->nr_cids - 1) / 64 + 1;
|
|
}
|
|
|
|
/**
|
|
* scx_cmask_for_each_cid - iterate set cids in @m
|
|
* @cid: s32 loop var that receives each set cid in turn
|
|
* @m: cmask to iterate
|
|
*
|
|
* Visits set bits within @m's active range in ascending order. Scans only the
|
|
* words the active range spans, where head and tail padding is kept zero, so
|
|
* no per-cid range check is needed.
|
|
*/
|
|
#define scx_cmask_for_each_cid(cid, m) \
|
|
for (u64 __bs = (m)->base & ~63u, __wi = 0, \
|
|
__nw = scx_cmask_nr_used_words(m); \
|
|
__wi < __nw; __wi++) \
|
|
for (u64 __w = READ_ONCE((m)->bits[__wi]); \
|
|
__w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true); \
|
|
__w &= __w - 1)
|
|
|
|
#endif /* _KERNEL_SCHED_EXT_CID_H */
|