ktstr/workload/config/sched.rs
//! Linux scheduling-class + sched-policy declarative types for the
//! workload pipeline.
//!
//! Holds [`SchedPolicy`] (the per-task `sched_setattr` shape),
//! [`SchedClass`] (the coarse class identifier consumed by
//! `WorkType::AsymmetricWaker`), and four orthogonal knobs used by
//! specific work types: [`FutexLockMode`] (PI vs plain futex for
//! `WorkType::PriorityInversion`), [`ReapMode`] (`SIGCHLD = SIG_IGN`
//! auto-reap vs blocking `waitpid` for `WorkType::CgroupAttachStorm`),
//! [`WakeMechanism`] (pipe vs futex wake between stages of
//! `WorkType::WakeChain`), and [`AluWidth`] (scalar / SIMD width for
//! `WorkType::AluHot`).
//!
//! These types are declarative — the kernel-call helper that applies
//! them (`set_sched_policy` in `worker/sched.rs`) lives in the
//! [`crate::workload::worker`] submodule. The pure
//! [`SchedClass::to_policy`] mapping is defined in this module, next
//! to the types it converts between.
15
16use std::time::Duration;
17
18use super::humantime_serde_helper;
19
/// Linux scheduling policy for a worker process.
///
/// `Fifo`, `RoundRobin`, and `Deadline` all require `CAP_SYS_NICE`
/// (`user_check_sched_setscheduler` in `kernel/sched/syscalls.c`
/// routes rt_policy and dl_policy through `req_priv`). `Normal`,
/// `Batch`, and (entering) `Idle` are unprivileged transitions for
/// fair-policy tasks. Priority values for `Fifo`/`RoundRobin` are
/// clamped to 1-99.
///
/// NOTE(review): this type stores the priority verbatim — nothing in
/// this module clamps it. The clamp is presumably applied where the
/// policy is installed (`set_sched_policy`); confirm there.
///
/// `Normal` is the [`Default`]. The serde wire form uses snake_case
/// variant names (`normal`, `batch`, `idle`, `fifo`, `round_robin`,
/// `deadline`, `ext`).
#[derive(
    Debug, Clone, Copy, Default, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize,
)]
#[serde(rename_all = "snake_case")]
pub enum SchedPolicy {
    /// `SCHED_NORMAL` (CFS/EEVDF).
    #[default]
    Normal,
    /// `SCHED_BATCH`.
    Batch,
    /// `SCHED_IDLE`.
    Idle,
    /// `SCHED_FIFO` with the given priority (1-99).
    Fifo(u32),
    /// `SCHED_RR` with the given priority (1-99).
    RoundRobin(u32),
    /// `SCHED_DEADLINE` with explicit `runtime`, `deadline`, and
    /// `period`. Applied via `sched_setattr(2)`.
    ///
    /// Each field is a [`Duration`] — the nanosecond representation
    /// the kernel requires is materialised at the syscall site, so
    /// callers express intent in idiomatic Rust units
    /// (`Duration::from_micros(100)`, `Duration::from_millis(1)`,
    /// etc.) and don't have to thread integer-nanosecond literals
    /// through their test fixtures.
    ///
    /// On the wire each field goes through the parent module's
    /// `humantime_serde_helper` (presumably a human-readable duration
    /// string — confirm against the helper).
    ///
    /// Constraints (from `__checkparam_dl` in
    /// `kernel/sched/deadline.c`):
    /// - `deadline != Duration::ZERO`.
    /// - `runtime` must be at least 1024 ns (the kernel's
    ///   `DL_SCALE` floor); shorter runtimes are silently truncated
    ///   inside the kernel and break bandwidth accounting.
    /// - `runtime <= deadline`.
    /// - `period == Duration::ZERO` is legal — the kernel
    ///   substitutes `deadline` for the period when zero. When
    ///   non-zero, `deadline <= period`.
    /// - The effective period (`period` if non-zero, else
    ///   `deadline`) is checked against
    ///   `/proc/sys/kernel/sched_deadline_period_min_us` (default
    ///   100us = 100_000 ns) and
    ///   `/proc/sys/kernel/sched_deadline_period_max_us` (default
    ///   `1 << 22` us = 4_194_304_000 ns), inclusive. Both sysctls
    ///   are runtime-tunable; this crate does not pre-validate the
    ///   sysctl range and lets the kernel surface out-of-range
    ///   values as `EINVAL`.
    /// - The nanosecond count of `deadline` and `period` must each
    ///   fit in 63 bits (`< 1 << 63`, i.e. `<= i64::MAX` ns ≈ 292
    ///   years) — the kernel uses bit 63 internally. Any longer
    ///   `Duration` is rejected at the syscall site.
    ///
    /// Transitions to/from `Deadline` always require `CAP_SYS_NICE`.
    /// Tasks set to `Deadline` get exclusive bandwidth on the
    /// admission-controlled root domain; oversubscription returns
    /// `EBUSY` (see `sched_dl_overflow` in `kernel/sched/deadline.c`).
    ///
    /// `set_sched_policy` validates the structural constraints
    /// (zero-deadline, DL_SCALE floor, ordering, top-bit) before
    /// invoking `sched_setattr` so a malformed `Deadline` fails
    /// fast in user space rather than tunneling an `EINVAL`
    /// through the syscall.
    Deadline {
        /// Runtime budget per period.
        #[serde(with = "humantime_serde_helper")]
        runtime: Duration,
        /// Relative deadline from period start.
        #[serde(with = "humantime_serde_helper")]
        deadline: Duration,
        /// Period. `Duration::ZERO` means "use `deadline` as the
        /// period" per the kernel's `__checkparam_dl` substitution.
        #[serde(with = "humantime_serde_helper")]
        period: Duration,
    },
    /// `SCHED_EXT` — routes the worker through the loaded sched_ext BPF
    /// scheduler. Applied via `sched_setattr(2)` with `sched_policy =
    /// SCHED_EXT` (7); glibc does not wrap `SCHED_EXT`, so
    /// `set_sched_policy` issues the raw syscall. `SCHED_EXT` is a valid
    /// policy whenever the kernel is built with `CONFIG_SCHED_CLASS_EXT`,
    /// so the syscall SUCCEEDS whether or not a scheduler is attached:
    /// attached, the task routes to `ext_sched_class`; with none attached
    /// `task_should_scx` is false so it silently falls back to
    /// `fair_sched_class` (still `policy == SCHED_EXT`). It `EINVAL`s only
    /// on a kernel built WITHOUT `CONFIG_SCHED_CLASS_EXT`.
    /// `scx_check_setscheduler` (kernel/sched/ext.c) returns `EACCES` when
    /// the task carries `scx.disallow`. No priority or deadline
    /// parameters apply.
    ///
    /// Unlike `Normal` under a switch-all scheduler — which the kernel
    /// reroutes to the ext class via `task_should_scx` WITHOUT changing
    /// the task's policy — `Ext` sets `policy == SCHED_EXT` explicitly,
    /// so the task is BPF-scheduled even under a `SCX_OPS_SWITCH_PARTIAL`
    /// scheduler that leaves SCHED_OTHER tasks in fair. That is what
    /// makes a SCHED_EXT worker a switch-mode-agnostic "the BPF scheduler
    /// dispatched me" probe.
    Ext,
}
123
124impl SchedPolicy {
125 /// `SCHED_FIFO` with the given priority (1-99).
126 pub const fn fifo(priority: u32) -> Self {
127 SchedPolicy::Fifo(priority)
128 }
129
130 /// `SCHED_RR` with the given priority (1-99).
131 pub const fn round_robin(priority: u32) -> Self {
132 SchedPolicy::RoundRobin(priority)
133 }
134
135 /// `SCHED_DEADLINE` with the given runtime / deadline / period.
136 /// See [`SchedPolicy::Deadline`] for parameter constraints.
137 ///
138 /// All three arguments share the same [`Duration`] type. The
139 /// canonical order is `(runtime, deadline, period)` — runtime
140 /// budget first, then the relative deadline, then the period.
141 /// For tests that need to make the order obvious at the call
142 /// site, prefer the struct-literal form
143 /// `SchedPolicy::Deadline { runtime: ..., deadline: ...,
144 /// period: ... }` which carries the field names through the
145 /// reader's eye.
146 ///
147 /// ```
148 /// # use std::time::Duration;
149 /// # use ktstr::workload::SchedPolicy;
150 /// // Convenience constructor — canonical (runtime, deadline, period) order.
151 /// let p = SchedPolicy::deadline(
152 /// Duration::from_micros(500), // runtime
153 /// Duration::from_millis(1), // deadline
154 /// Duration::from_millis(10), // period
155 /// );
156 /// // Struct-literal form — names elide positional confusion.
157 /// let q = SchedPolicy::Deadline {
158 /// runtime: Duration::from_micros(500),
159 /// deadline: Duration::from_millis(1),
160 /// period: Duration::from_millis(10),
161 /// };
162 /// assert!(matches!(p, SchedPolicy::Deadline { .. }));
163 /// assert!(matches!(q, SchedPolicy::Deadline { .. }));
164 /// ```
165 pub const fn deadline(runtime: Duration, deadline: Duration, period: Duration) -> Self {
166 SchedPolicy::Deadline {
167 runtime,
168 deadline,
169 period,
170 }
171 }
172}
173
/// Whether `WorkType::PriorityInversion` uses a PI-aware mutex
/// or a plain futex.
///
/// `Pi` exercises `FUTEX_LOCK_PI` and the rt_mutex priority-boost
/// chain (`kernel/futex/pi.c`). When the low-priority lock holder
/// is preempted by a medium-priority worker, the kernel boosts
/// the holder to the high-priority waiter's priority for the
/// duration of the hold — both unblocking `high` and pinning
/// `medium` from preempting it. `Plain` uses a non-PI futex so
/// the inversion is left unrepaired and the scheduler must
/// surface the stall.
///
/// Carried as a typed wrapper rather than a `bool` to avoid
/// positional-argument confusion at call sites and so the
/// failure-dump diagnostic names the choice explicitly
/// ("pi_mode = Pi" vs "pi_mode = Plain") instead of a bare
/// boolean. The serde wire form is snake_case (`"pi"` / `"plain"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FutexLockMode {
    /// `FUTEX_LOCK_PI` with rt_mutex PI chain.
    Pi,
    /// Plain futex (no PI boost). The default — exercises the
    /// uncorrected inversion the workload exists to surface.
    #[default]
    Plain,
}
201
/// How a [`WorkType::CgroupAttachStorm`](crate::workload::WorkType::CgroupAttachStorm)
/// worker reaps the transient children it forks each iteration.
///
/// Carried as a typed enum rather than a `bool` so call sites name the
/// choice explicitly (`SigIgn` / `Waitpid`) instead of a bare
/// `reap: true` / `false`, and so the failure-dump diagnostic names it.
/// The serde wire form is snake_case (`"sig_ign"` / `"waitpid"`);
/// `SigIgn` is the [`Default`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ReapMode {
    /// Install `SIGCHLD = SIG_IGN` once at worker entry so each forked
    /// child auto-reaps in its own exit path, concurrent with the
    /// parent's `cgroup.procs` write. This reap-vs-write race is the
    /// variant's reason to exist, so it is the default.
    #[default]
    SigIgn,
    /// The parent blocking-`waitpid`s each child after writing its pid —
    /// the non-racing control shape (mirrors the reaper in
    /// [`WorkType::ForkExit`](crate::workload::WorkType::ForkExit)), so
    /// the same primitive serves as an A/B baseline against the `SigIgn`
    /// race.
    Waitpid,
}
225
/// Wake mechanism between stages of a `WorkType::WakeChain`.
///
/// Carried as a typed enum rather than a `bool` so call sites
/// name the choice explicitly (`Pipe` / `Futex`) instead of a
/// bare `sync: true` / `sync: false`. The serde wire format is
/// `"pipe"` / `"futex"` (snake_case); `Pipe` is the [`Default`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum WakeMechanism {
    /// Anon-pipe ring (`depth` pipes per chain). Wakes carry
    /// `WF_SYNC` via `wake_up_interruptible_sync_poll`, biasing
    /// scheduler placement against migration. Tests the
    /// `SCX_WAKE_SYNC` path that scx variants must respect. The
    /// default — see `WakeChain` in `WorkType` for the kernel
    /// call-chain citations.
    #[default]
    Pipe,
    /// Single shared futex word per chain. The active stage
    /// advances the word and `FUTEX_WAKE`s; the stage whose
    /// `pos` matches runs, others re-park. No `WF_SYNC`.
    Futex,
}
248
/// ALU/SIMD execution width for `WorkType::AluHot`.
///
/// Selects the widest data-path the worker exercises per
/// multiply chain. The serde wire form is snake_case (`"scalar"`,
/// `"vec128"`, `"vec256"`, `"vec512"`, `"amx"`, `"widest"`).
///
/// # Current behaviour
///
/// Today every variant executes the same scalar four-stream
/// multiply chain. The width selector is still preserved on the
/// wire (the `WorkType::AluHot` / `WorkPhase::AluHot` config
/// carries `width`) so a downstream classifier can distinguish
/// runs that requested SIMD from runs that requested scalar even
/// though the dispatch is uniform. Once SIMD intrinsics land
/// per-arm, wider variants will drive more functional-unit
/// pressure and (for AVX-512 / AMX) draw the package into a
/// frequency-throttled mode the kernel scheduler must observe.
///
/// # Default semantics
///
/// `Scalar` is the type-level Rust default (the
/// `#[derive(Default)]` fallback that serde uses when an
/// `AluWidth` field is missing on the wire — keeps backward-
/// compat for older capture data). `Widest` is the
/// workload-level default the
/// `super::defaults::ALU_HOT_WIDTH` constant resolves at runtime
/// via `resolve_alu_width`: tests that take
/// `WorkType::from_name("AluHot")` get the host's widest
/// available data-path, not the type-level scalar fallback.
/// The asymmetry is deliberate — type-level Default favours
/// "always available everywhere"; workload-level default
/// favours "stress the host as hard as it can run."
///
/// # Resolution rules
///
/// `Widest` is a runtime-resolved sentinel: at worker entry the
/// dispatch arm probes the host CPU via
/// [`std::is_x86_feature_detected!`] (x86_64) and picks the
/// widest available variant in the order
/// `Amx > Vec512 > Vec256 > Vec128 > Scalar`. On `aarch64` only
/// `Scalar` and `Vec128` (NEON) are available; `Vec256` /
/// `Vec512` / `Amx` are absent and `Widest` resolves to NEON
/// when present, falling back to `Scalar`. A configured value
/// that the host cannot run is downgraded to the next-widest
/// available variant with a one-shot `tracing::warn!` so the
/// test still produces useful telemetry rather than
/// hard-failing — silent downgrade without the warn would
/// mask the host capability gap. See [`Amx`](Self::Amx) for why
/// that variant is currently always downgraded.
///
/// # Frequency throttle on x86_64
///
/// On Intel client / server SKUs the AVX-512 license raises the
/// per-core voltage and lowers the all-core turbo for the
/// package; running [`Vec512`](Self::Vec512) workers under one
/// scheduler while other workers run under another biases the
/// comparison because the throttle is package-wide, not
/// per-task. Tests that A/B-compare schedulers under
/// [`Vec512`](Self::Vec512) or [`Amx`](Self::Amx) need the
/// runs serialized on the same package — the framework does
/// not currently coordinate this serialization across worker
/// groups.
#[derive(
    Clone, Copy, Debug, Default, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize,
)]
#[serde(rename_all = "snake_case")]
pub enum AluWidth {
    /// 64-bit scalar integer multiply chain. Drives the integer
    /// pipeline only; no SIMD or AVX licensing involved.
    /// Available on every supported architecture.
    #[default]
    Scalar,
    /// 128-bit vector integer multiply chain (SSE2 on x86_64,
    /// NEON on aarch64). The widest baseline both architectures
    /// support; a reasonable default when the test cares about
    /// "vectorized ALU" without architecture-specific tuning.
    Vec128,
    /// 256-bit vector integer multiply chain (AVX2 on x86_64).
    /// Not available on aarch64 — falls back to `Vec128`
    /// (NEON) at worker entry with a one-shot warn.
    Vec256,
    /// 512-bit vector integer multiply chain (AVX-512F on
    /// x86_64). Triggers the package-wide frequency throttle
    /// described in the type-level doc. Not available on
    /// aarch64 — falls back to `Vec128` (NEON) at worker entry.
    Vec512,
    /// AMX tile multiply chain (x86_64 server SKUs with AMX-INT8
    /// or AMX-BF16). The widest data-path on x86_64. The kernel
    /// gates it with XFD: the first AMX instruction raises a #NM
    /// trap that `arch/x86/kernel/traps.c::handle_xfd_event`
    /// handles, calling
    /// `arch/x86/kernel/fpu/xstate.c::__xfd_enable_feature` to
    /// allocate the dynamic XSAVE area lazily — which adds a
    /// one-time per-task latency spike on first use.
    ///
    /// AMX additionally requires
    /// `arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA)` per
    /// process before the first AMX instruction (an
    /// `arch_prctl(2)` request, not `prctl(2)`); the framework
    /// does NOT issue this call, so AMX is not currently runnable
    /// end-to-end. `resolve_alu_width` therefore downgrades
    /// `AluWidth::Amx` to the host's widest stable-detectable
    /// variant.
    ///
    /// Not available on aarch64 — falls back to `Vec128`.
    Amx,
    /// Resolve to the widest variant the host supports at
    /// worker entry. See the type-level doc for the resolution
    /// order. Useful as a default when the test author wants
    /// "as much ALU pressure as the host can sustain" without
    /// hardcoding an architecture or feature level.
    Widest,
}
366
/// Coarse Linux scheduling class identifier.
///
/// Maps to one of four of the kernel's six core scheduler classes:
/// `fair_sched_class` (CFS / EEVDF — covers `SCHED_NORMAL`,
/// `SCHED_BATCH`, `SCHED_IDLE`), `rt_sched_class` (covers
/// `SCHED_FIFO` and `SCHED_RR`), `dl_sched_class` (covers
/// `SCHED_DEADLINE`), and `ext_sched_class` (covers `SCHED_EXT`
/// when sched_ext is loaded). The class is a coarser concept
/// than [`SchedPolicy`] — `Cfs` covers Normal/Batch/Idle, `Rt`
/// covers Fifo/RoundRobin — and is what
/// `WorkType::AsymmetricWaker` consumes when it wants to
/// describe a waker / wakee pair without specifying priority
/// values. When a per-worker class is applied,
/// `SchedClass::to_policy` maps the variant to the equivalent
/// [`SchedPolicy`] (using a default priority where applicable)
/// and routes through `set_sched_policy`.
///
/// `Cfs` is the [`Default`]. The serde wire form uses snake_case
/// variant names (`cfs`, `batch`, `idle`, `rt`, `deadline`, `ext`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SchedClass {
    /// `fair_sched_class` — `SCHED_NORMAL` (CFS / EEVDF). The
    /// default; matches a freshly-forked task before any policy
    /// override.
    #[default]
    Cfs,
    /// `fair_sched_class` — `SCHED_BATCH` (background-friendly
    /// fair task with longer wakeup latency targets).
    Batch,
    /// `fair_sched_class` — `SCHED_IDLE` (lowest fair-class
    /// weight; runs only when nothing else is runnable).
    Idle,
    /// `rt_sched_class` — `SCHED_FIFO` at default priority
    /// `RT_DEFAULT_PRIO`. Requires `CAP_SYS_NICE`. For explicit
    /// priority control use [`SchedPolicy::Fifo`] directly.
    Rt,
    /// `dl_sched_class` — `SCHED_DEADLINE`. Maps to a
    /// minimum-bandwidth deadline reservation
    /// ([`SchedClass::default_deadline_reservation`]) so
    /// `SchedClass::Deadline` is constructible without picking
    /// runtime/deadline/period. Callers needing precise
    /// reservations should use [`SchedPolicy::Deadline`]
    /// directly.
    Deadline,
    /// `ext_sched_class` — `SCHED_EXT`. Routes the worker
    /// through the loaded sched_ext BPF scheduler. Under
    /// switch-all (the default scx-ktstr regime), this is the
    /// same effective class as `Cfs` because every fair-policy
    /// task already reroutes to ext via `task_should_scx` (see
    /// kernel/sched/ext.c). `Cfs` is preserved as the explicit
    /// "I want fair semantics" knob the user expresses; `Ext`
    /// maps to [`SchedPolicy::Ext`], which sets `policy == SCHED_EXT`
    /// on the task_struct so it is BPF-scheduled even under a
    /// `SCX_OPS_SWITCH_PARTIAL` scheduler.
    Ext,
}
421
/// Real-time priority used when [`SchedClass::Rt`] is mapped to a
/// [`SchedPolicy`]. Picked at the middle of the 1..=99 valid range
/// so the worker neither preempts every other RT task in the system
/// nor sits at the floor; tests that need a specific RT priority
/// must construct [`SchedPolicy::Fifo`] directly.
const RT_DEFAULT_PRIO: u32 = 50;
428
429impl SchedClass {
430 /// Resolve to an equivalent [`SchedPolicy`]. `Rt` uses
431 /// `RT_DEFAULT_PRIO`; `Deadline` uses the minimum-bandwidth
432 /// reservation (2us runtime, 1ms deadline, 10ms period — passes
433 /// `__checkparam_dl` and the default sysctl bounds).
434 /// `Ext` maps to [`SchedPolicy::Ext`], which issues `sched_setattr`
435 /// with `sched_policy = SCHED_EXT` (7) so the kernel reads
436 /// `policy == SCHED_EXT`. The task is BPF-scheduled only when a
437 /// sched_ext scheduler is attached; with none attached the syscall
438 /// still succeeds and the task runs in `fair_sched_class`.
439 pub const fn to_policy(self) -> SchedPolicy {
440 match self {
441 SchedClass::Cfs => SchedPolicy::Normal,
442 SchedClass::Ext => SchedPolicy::Ext,
443 SchedClass::Batch => SchedPolicy::Batch,
444 SchedClass::Idle => SchedPolicy::Idle,
445 SchedClass::Rt => SchedPolicy::Fifo(RT_DEFAULT_PRIO),
446 SchedClass::Deadline => Self::default_deadline_reservation(),
447 }
448 }
449
450 /// Minimum-bandwidth `SCHED_DEADLINE` reservation that passes
451 /// `__checkparam_dl`'s `runtime >= (1 << DL_SCALE)` (1024ns)
452 /// floor and the kernel's default `sched_deadline_period_min_us`
453 /// (100us). 2us runtime, 1ms deadline, 10ms period — bandwidth
454 /// fraction 0.0002, well below admission-control limits.
455 pub const fn default_deadline_reservation() -> SchedPolicy {
456 SchedPolicy::Deadline {
457 runtime: Duration::from_micros(2),
458 deadline: Duration::from_millis(1),
459 period: Duration::from_millis(10),
460 }
461 }
462}