ktstr/workload/config/
sched.rs

//! Linux scheduling-class + sched-policy declarative types for the
//! workload pipeline.
//!
//! Holds [`SchedPolicy`] (the per-task `sched_setattr` shape),
//! [`SchedClass`] (the coarse class identifier consumed by
//! `WorkType::AsymmetricWaker`), and four orthogonal knobs used by
//! specific work types: [`FutexLockMode`] (PI vs plain futex for
//! `WorkType::PriorityInversion`), [`ReapMode`] (auto-reap vs
//! `waitpid` for `WorkType::CgroupAttachStorm`), [`WakeMechanism`]
//! (pipe vs futex wake between stages of `WorkType::WakeChain`), and
//! [`AluWidth`] (scalar / SIMD width for `WorkType::AluHot`).
//!
//! These types are declarative — the kernel-call helper that applies
//! them (`set_sched_policy` in `worker/sched.rs`) lives in the
//! [`crate::workload::worker`] submodule. The class-to-policy mapping,
//! [`SchedClass::to_policy`], is defined in this module.
15
16use std::time::Duration;
17
18use super::humantime_serde_helper;
19
20/// Linux scheduling policy for a worker process.
21///
22/// `Fifo`, `RoundRobin`, and `Deadline` all require `CAP_SYS_NICE`
23/// (`user_check_sched_setscheduler` in `kernel/sched/syscalls.c`
24/// routes rt_policy and dl_policy through `req_priv`). `Normal`,
25/// `Batch`, and (entering) `Idle` are unprivileged transitions for
26/// fair-policy tasks. Priority values for `Fifo`/`RoundRobin` are
27/// clamped to 1-99.
28#[derive(
29    Debug, Clone, Copy, Default, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize,
30)]
31#[serde(rename_all = "snake_case")]
32pub enum SchedPolicy {
33    /// `SCHED_NORMAL` (CFS/EEVDF).
34    #[default]
35    Normal,
36    /// `SCHED_BATCH`.
37    Batch,
38    /// `SCHED_IDLE`.
39    Idle,
40    /// `SCHED_FIFO` with the given priority (1-99).
41    Fifo(u32),
42    /// `SCHED_RR` with the given priority (1-99).
43    RoundRobin(u32),
44    /// `SCHED_DEADLINE` with explicit `runtime`, `deadline`, and
45    /// `period`. Applied via `sched_setattr(2)`.
46    ///
47    /// Each field is a [`Duration`] — the nanosecond representation
48    /// the kernel requires is materialised at the syscall site, so
49    /// callers express intent in idiomatic Rust units
50    /// (`Duration::from_micros(100)`, `Duration::from_millis(1)`,
51    /// etc.) and don't have to thread integer-nanosecond literals
52    /// through their test fixtures.
53    ///
54    /// Constraints (from `__checkparam_dl` in
55    /// `kernel/sched/deadline.c`):
56    /// - `deadline != Duration::ZERO`.
57    /// - `runtime` must be at least 1024 ns (the kernel's
58    ///   `DL_SCALE` floor); shorter runtimes are silently truncated
59    ///   inside the kernel and break bandwidth accounting.
60    /// - `runtime <= deadline`.
61    /// - `period == Duration::ZERO` is legal — the kernel
62    ///   substitutes `deadline` for the period when zero. When
63    ///   non-zero, `deadline <= period`.
64    /// - The effective period (`period` if non-zero, else
65    ///   `deadline`) is checked against
66    ///   `/proc/sys/kernel/sched_deadline_period_min_us` (default
67    ///   100us = 100_000 ns) and
68    ///   `/proc/sys/kernel/sched_deadline_period_max_us` (default
69    ///   `1 << 22` us = 4_194_304_000 ns), inclusive. Both sysctls
70    ///   are runtime-tunable; this crate does not pre-validate the
71    ///   sysctl range and lets the kernel surface out-of-range
72    ///   values as `EINVAL`.
73    /// - The nanosecond count of `deadline` and `period` must each
74    ///   fit in 63 bits (`< 1 << 63`, i.e. `<= i64::MAX` ns ≈ 292
75    ///   years) — the kernel uses bit 63 internally. Any longer
76    ///   `Duration` is rejected at the syscall site.
77    ///
78    /// Transitions to/from `Deadline` always require `CAP_SYS_NICE`.
79    /// Tasks set to `Deadline` get exclusive bandwidth on the
80    /// admission-controlled root domain; oversubscription returns
81    /// `EBUSY` (see `sched_dl_overflow` in `kernel/sched/deadline.c`).
82    ///
83    /// `set_sched_policy` validates the structural constraints
84    /// (zero-deadline, DL_SCALE floor, ordering, top-bit) before
85    /// invoking `sched_setattr` so a malformed `Deadline` fails
86    /// fast in user space rather than tunneling an `EINVAL`
87    /// through the syscall.
88    Deadline {
89        /// Runtime budget per period.
90        #[serde(with = "humantime_serde_helper")]
91        runtime: Duration,
92        /// Relative deadline from period start.
93        #[serde(with = "humantime_serde_helper")]
94        deadline: Duration,
95        /// Period. `Duration::ZERO` means "use `deadline` as the
96        /// period" per the kernel's `__checkparam_dl` substitution.
97        #[serde(with = "humantime_serde_helper")]
98        period: Duration,
99    },
100    /// `SCHED_EXT` — routes the worker through the loaded sched_ext BPF
101    /// scheduler. Applied via `sched_setattr(2)` with `sched_policy =
102    /// SCHED_EXT` (7); glibc does not wrap `SCHED_EXT`, so
103    /// `set_sched_policy` issues the raw syscall. `SCHED_EXT` is a valid
104    /// policy whenever the kernel is built with `CONFIG_SCHED_CLASS_EXT`,
105    /// so the syscall SUCCEEDS whether or not a scheduler is attached:
106    /// attached, the task routes to `ext_sched_class`; with none attached
107    /// `task_should_scx` is false so it silently falls back to
108    /// `fair_sched_class` (still `policy == SCHED_EXT`). It `EINVAL`s only
109    /// on a kernel built WITHOUT `CONFIG_SCHED_CLASS_EXT`.
110    /// `scx_check_setscheduler` (kernel/sched/ext.c) returns `EACCES` when
111    /// the task carries `scx.disallow`. No priority or deadline
112    /// parameters apply.
113    ///
114    /// Unlike `Normal` under a switch-all scheduler — which the kernel
115    /// reroutes to the ext class via `task_should_scx` WITHOUT changing
116    /// the task's policy — `Ext` sets `policy == SCHED_EXT` explicitly,
117    /// so the task is BPF-scheduled even under a `SCX_OPS_SWITCH_PARTIAL`
118    /// scheduler that leaves SCHED_OTHER tasks in fair. That is what
119    /// makes a SCHED_EXT worker a switch-mode-agnostic "the BPF scheduler
120    /// dispatched me" probe.
121    Ext,
122}
123
124impl SchedPolicy {
125    /// `SCHED_FIFO` with the given priority (1-99).
126    pub const fn fifo(priority: u32) -> Self {
127        SchedPolicy::Fifo(priority)
128    }
129
130    /// `SCHED_RR` with the given priority (1-99).
131    pub const fn round_robin(priority: u32) -> Self {
132        SchedPolicy::RoundRobin(priority)
133    }
134
135    /// `SCHED_DEADLINE` with the given runtime / deadline / period.
136    /// See [`SchedPolicy::Deadline`] for parameter constraints.
137    ///
138    /// All three arguments share the same [`Duration`] type. The
139    /// canonical order is `(runtime, deadline, period)` — runtime
140    /// budget first, then the relative deadline, then the period.
141    /// For tests that need to make the order obvious at the call
142    /// site, prefer the struct-literal form
143    /// `SchedPolicy::Deadline { runtime: ..., deadline: ...,
144    /// period: ... }` which carries the field names through the
145    /// reader's eye.
146    ///
147    /// ```
148    /// # use std::time::Duration;
149    /// # use ktstr::workload::SchedPolicy;
150    /// // Convenience constructor — canonical (runtime, deadline, period) order.
151    /// let p = SchedPolicy::deadline(
152    ///     Duration::from_micros(500), // runtime
153    ///     Duration::from_millis(1),   // deadline
154    ///     Duration::from_millis(10),  // period
155    /// );
156    /// // Struct-literal form — names elide positional confusion.
157    /// let q = SchedPolicy::Deadline {
158    ///     runtime: Duration::from_micros(500),
159    ///     deadline: Duration::from_millis(1),
160    ///     period: Duration::from_millis(10),
161    /// };
162    /// assert!(matches!(p, SchedPolicy::Deadline { .. }));
163    /// assert!(matches!(q, SchedPolicy::Deadline { .. }));
164    /// ```
165    pub const fn deadline(runtime: Duration, deadline: Duration, period: Duration) -> Self {
166        SchedPolicy::Deadline {
167            runtime,
168            deadline,
169            period,
170        }
171    }
172}
173
174/// Whether `WorkType::PriorityInversion` uses a PI-aware mutex
175/// or a plain futex.
176///
177/// `Pi` exercises `FUTEX_LOCK_PI` and the rt_mutex priority-boost
178/// chain (`kernel/futex/pi.c`). When the low-priority lock holder
179/// is preempted by a medium-priority worker, the kernel boosts
180/// the holder to the high-priority waiter's priority for the
181/// duration of the hold — both unblocking `high` and pinning
182/// `medium` from preempting it. `Plain` uses a non-PI futex so
183/// the inversion is left unrepaired and the scheduler must
184/// surface the stall.
185///
186/// Carried as a typed wrapper rather than a `bool` to avoid
187/// positional-argument confusion at call sites and so the
188/// failure-dump diagnostic names the choice explicitly
189/// ("pi_mode = Pi" vs "pi_mode = Plain") instead of a bare
190/// boolean.
191#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
192#[serde(rename_all = "snake_case")]
193pub enum FutexLockMode {
194    /// `FUTEX_LOCK_PI` with rt_mutex PI chain.
195    Pi,
196    /// Plain futex (no PI boost). The default — exercises the
197    /// uncorrected inversion the workload exists to surface.
198    #[default]
199    Plain,
200}
201
202/// How a [`WorkType::CgroupAttachStorm`](crate::workload::WorkType::CgroupAttachStorm)
203/// worker reaps the transient children it forks each iteration.
204///
205/// Carried as a typed enum rather than a `bool` so call sites name the
206/// choice explicitly (`SigIgn` / `Waitpid`) instead of a bare
207/// `reap: true` / `false`, and so the failure-dump diagnostic names it.
208/// The serde wire form is snake_case (`"sig_ign"` / `"waitpid"`).
209#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
210#[serde(rename_all = "snake_case")]
211pub enum ReapMode {
212    /// Install `SIGCHLD = SIG_IGN` once at worker entry so each forked
213    /// child auto-reaps in its own exit path, concurrent with the
214    /// parent's `cgroup.procs` write. This reap-vs-write race is the
215    /// variant's reason to exist, so it is the default.
216    #[default]
217    SigIgn,
218    /// The parent blocking-`waitpid`s each child after writing its pid —
219    /// the non-racing control shape (mirrors the reaper in
220    /// [`WorkType::ForkExit`](crate::workload::WorkType::ForkExit)), so
221    /// the same primitive serves as an A/B baseline against the `SigIgn`
222    /// race.
223    Waitpid,
224}
225
226/// Wake mechanism between stages of a `WorkType::WakeChain`.
227///
228/// Carried as a typed enum rather than a `bool` so call sites
229/// name the choice explicitly (`Pipe` / `Futex`) instead of a
230/// bare `sync: true` / `sync: false`. The serde wire format is
231/// `"pipe"` / `"futex"` (snake_case).
232#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
233#[serde(rename_all = "snake_case")]
234pub enum WakeMechanism {
235    /// Anon-pipe ring (`depth` pipes per chain). Wakes carry
236    /// `WF_SYNC` via `wake_up_interruptible_sync_poll`, biasing
237    /// scheduler placement against migration. Tests the
238    /// `SCX_WAKE_SYNC` path that scx variants must respect. The
239    /// default — see `WakeChain` in `WorkType` for the kernel
240    /// call-chain citations.
241    #[default]
242    Pipe,
243    /// Single shared futex word per chain. The active stage
244    /// advances the word and `FUTEX_WAKE`s; the stage whose
245    /// `pos` matches runs, others re-park. No `WF_SYNC`.
246    Futex,
247}
248
249/// ALU/SIMD execution width for `WorkType::AluHot`.
250///
251/// Selects the widest data-path the worker exercises per
252/// multiply chain. Today every variant executes the same scalar
253/// four-stream multiply chain — the width selector is preserved
254/// on the wire so a downstream classifier can distinguish runs
255/// that requested SIMD from runs that requested scalar even
256/// though the dispatch is uniform. Wider variants WILL drive
257/// more functional-unit pressure and (for AVX-512 / AMX) draw
258/// the package into a frequency-throttled mode the kernel
259/// scheduler must observe once SIMD intrinsics land per-arm.
260/// The serde wire form is snake_case (`"scalar"`, `"vec128"`,
261/// `"vec256"`, `"vec512"`, `"amx"`, `"widest"`).
262///
263/// # Current behaviour
264///
265/// All widths run the same four-stream scalar multiply path;
266/// the width selector is preserved on the wire (the
267/// `WorkType::AluHot` / `WorkPhase::AluHot` config carries
268/// `width`) so a downstream classifier can distinguish runs
269/// that requested SIMD from runs that requested scalar even
270/// though the dispatch is uniform.
271///
272/// # Default semantics
273///
274/// `Scalar` is the type-level Rust default (the
275/// `#[derive(Default)]` fallback that serde uses when an
276/// `AluWidth` field is missing on the wire — keeps backward-
277/// compat for older capture data). `Widest` is the
278/// workload-level default the
279/// `super::defaults::ALU_HOT_WIDTH` constant resolves at runtime
280/// via `resolve_alu_width`: tests that take
281/// `WorkType::from_name("AluHot")` get the host's widest
282/// available data-path, not the type-level scalar fallback.
283/// The asymmetry is deliberate — type-level Default favours
284/// "always available everywhere"; workload-level default
285/// favours "stress the host as hard as it can run."
286///
287/// # Resolution rules
288///
289/// `Widest` is a runtime-resolved sentinel: at worker entry the
290/// dispatch arm probes the host CPU via
291/// [`std::is_x86_feature_detected!`] (x86_64) and picks the
292/// widest available variant in the order
293/// `Amx > Vec512 > Vec256 > Vec128 > Scalar`. On `aarch64` only
294/// `Scalar` and `Vec128` (NEON) are available; `Vec256` /
295/// `Vec512` / `Amx` are absent and `Widest` resolves to NEON
296/// when present, falling back to `Scalar`. A configured value
297/// that the host cannot run is downgraded to the next-widest
298/// available variant with a one-shot `tracing::warn!` so the
299/// test still produces useful telemetry rather than
300/// hard-failing — silent downgrade without the warn would
301/// mask the host capability gap.
302///
303/// # Frequency throttle on x86_64
304///
305/// On Intel client / server SKUs the AVX-512 license raises the
306/// per-core voltage and lowers the all-core turbo for the
307/// package; running [`Vec512`](Self::Vec512) workers under one
308/// scheduler while other workers run under another biases the
309/// comparison because the throttle is package-wide, not
310/// per-task. Tests that A/B-compare schedulers under
311/// [`Vec512`](Self::Vec512) or [`Amx`](Self::Amx) need the
312/// runs serialized on the same package — the framework does
313/// not currently coordinate this serialization across worker
314/// groups.
315#[derive(
316    Clone, Copy, Debug, Default, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize,
317)]
318#[serde(rename_all = "snake_case")]
319pub enum AluWidth {
320    /// 64-bit scalar integer multiply chain. Drives the integer
321    /// pipeline only; no SIMD or AVX licensing involved.
322    /// Available on every supported architecture.
323    #[default]
324    Scalar,
325    /// 128-bit vector integer multiply chain (SSE2 on x86_64,
326    /// NEON on aarch64). The widest baseline both architectures
327    /// support; a reasonable default when the test cares about
328    /// "vectorized ALU" without architecture-specific tuning.
329    Vec128,
330    /// 256-bit vector integer multiply chain (AVX2 on x86_64).
331    /// Not available on aarch64 — falls back to `Vec128`
332    /// (NEON) at worker entry with a one-shot warn.
333    Vec256,
334    /// 512-bit vector integer multiply chain (AVX-512F on
335    /// x86_64). Triggers the package-wide frequency throttle
336    /// described above. Not available on aarch64 — falls back
337    /// to `Vec128` (NEON) at worker entry.
338    Vec512,
339    /// AMX tile multiply chain (x86_64 server SKUs with AMX-INT8
340    /// or AMX-BF16). The widest data-path on x86_64; uses XFD
341    /// gating in the kernel
342    /// (the first AMX instruction raises a #NM trap that
343    /// `arch/x86/kernel/traps.c::handle_xfd_event` handles,
344    /// calling `arch/x86/kernel/fpu/xstate.c::__xfd_enable_feature`
345    /// to allocate the dynamic XSAVE area) so the kernel allocates
346    /// the dynamic XSAVE area lazily — adds a one-time per-task
347    /// latency spike on first use.
348    ///
349    /// AMX additionally requires
350    /// `prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA)` per
351    /// process before the first AMX instruction; the framework
352    /// does NOT issue this prctl, so AMX is not yet runnable.
353    /// `resolve_alu_width` therefore downgrades `AluWidth::Amx`
354    /// to the host's widest stable-detectable variant; AMX is
355    /// not currently runnable end-to-end on this framework.
356    ///
357    /// Not available on aarch64 — falls back to `Vec128`.
358    Amx,
359    /// Resolve to the widest variant the host supports at
360    /// worker entry. See the type-level doc for the resolution
361    /// order. Useful as a default when the test author wants
362    /// "as much ALU pressure as the host can sustain" without
363    /// hardcoding an architecture or feature level.
364    Widest,
365}
366
367/// Coarse Linux scheduling class identifier.
368///
369/// Maps to one of the kernel's six core scheduler classes:
370/// `fair_sched_class` (CFS / EEVDF — covers `SCHED_NORMAL`,
371/// `SCHED_BATCH`, `SCHED_IDLE`), `rt_sched_class` (covers
372/// `SCHED_FIFO` and `SCHED_RR`), `dl_sched_class` (covers
373/// `SCHED_DEADLINE`), and `ext_sched_class` (covers `SCHED_EXT`
374/// when sched_ext is loaded). The class is a coarser concept
375/// than [`SchedPolicy`] — `Cfs` covers Normal/Batch/Idle, `Rt`
376/// covers Fifo/RoundRobin — and is what
377/// `WorkType::AsymmetricWaker` consumes when it wants to
378/// describe a waker / wakee pair without specifying priority
379/// values. When a per-worker class is applied,
380/// `SchedClass::to_policy` maps the variant to the equivalent
381/// [`SchedPolicy`] (using a default priority where applicable)
382/// and routes through `set_sched_policy`.
383#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
384#[serde(rename_all = "snake_case")]
385pub enum SchedClass {
386    /// `fair_sched_class` — `SCHED_NORMAL` (CFS / EEVDF). The
387    /// default; matches a freshly-forked task before any policy
388    /// override.
389    #[default]
390    Cfs,
391    /// `fair_sched_class` — `SCHED_BATCH` (background-friendly
392    /// fair task with longer wakeup latency targets).
393    Batch,
394    /// `fair_sched_class` — `SCHED_IDLE` (lowest fair-class
395    /// weight; runs only when nothing else is runnable).
396    Idle,
397    /// `rt_sched_class` — `SCHED_FIFO` at default priority
398    /// `RT_DEFAULT_PRIO`. Requires `CAP_SYS_NICE`. For explicit
399    /// priority control use [`SchedPolicy::Fifo`] directly.
400    Rt,
401    /// `dl_sched_class` — `SCHED_DEADLINE`. Maps to a
402    /// minimum-bandwidth deadline reservation
403    /// ([`SchedClass::default_deadline_reservation`]) so
404    /// `SchedClass::Deadline` is constructible without picking
405    /// runtime/deadline/period. Callers needing precise
406    /// reservations should use [`SchedPolicy::Deadline`]
407    /// directly.
408    Deadline,
409    /// `ext_sched_class` — `SCHED_EXT`. Routes the worker
410    /// through the loaded sched_ext BPF scheduler. Under
411    /// switch-all (the default scx-ktstr regime), this is the
412    /// same effective class as `Cfs` because every fair-policy
413    /// task already reroutes to ext via `task_should_scx` (see
414    /// kernel/sched/ext.c). `Cfs` is preserved as the explicit
415    /// "I want fair semantics" knob the user expresses; `Ext`
416    /// maps to [`SchedPolicy::Ext`], which sets `policy == SCHED_EXT`
417    /// on the task_struct so it is BPF-scheduled even under a
418    /// `SCX_OPS_SWITCH_PARTIAL` scheduler.
419    Ext,
420}
421
/// Priority [`SchedClass::Rt`] uses when it is mapped to a
/// [`SchedPolicy`]. It is the midpoint of the valid 1..=99 range, so
/// the worker neither preempts every other RT task in the system nor
/// sits at the floor. Tests that need a specific RT priority must
/// construct [`SchedPolicy::Fifo`] directly.
const RT_DEFAULT_PRIO: u32 = (1 + 99) / 2;
428
429impl SchedClass {
430    /// Resolve to an equivalent [`SchedPolicy`]. `Rt` uses
431    /// `RT_DEFAULT_PRIO`; `Deadline` uses the minimum-bandwidth
432    /// reservation (2us runtime, 1ms deadline, 10ms period — passes
433    /// `__checkparam_dl` and the default sysctl bounds).
434    /// `Ext` maps to [`SchedPolicy::Ext`], which issues `sched_setattr`
435    /// with `sched_policy = SCHED_EXT` (7) so the kernel reads
436    /// `policy == SCHED_EXT`. The task is BPF-scheduled only when a
437    /// sched_ext scheduler is attached; with none attached the syscall
438    /// still succeeds and the task runs in `fair_sched_class`.
439    pub const fn to_policy(self) -> SchedPolicy {
440        match self {
441            SchedClass::Cfs => SchedPolicy::Normal,
442            SchedClass::Ext => SchedPolicy::Ext,
443            SchedClass::Batch => SchedPolicy::Batch,
444            SchedClass::Idle => SchedPolicy::Idle,
445            SchedClass::Rt => SchedPolicy::Fifo(RT_DEFAULT_PRIO),
446            SchedClass::Deadline => Self::default_deadline_reservation(),
447        }
448    }
449
450    /// Minimum-bandwidth `SCHED_DEADLINE` reservation that passes
451    /// `__checkparam_dl`'s `runtime >= (1 << DL_SCALE)` (1024ns)
452    /// floor and the kernel's default `sched_deadline_period_min_us`
453    /// (100us). 2us runtime, 1ms deadline, 10ms period — bandwidth
454    /// fraction 0.0002, well below admission-control limits.
455    pub const fn default_deadline_reservation() -> SchedPolicy {
456        SchedPolicy::Deadline {
457            runtime: Duration::from_micros(2),
458            deadline: Duration::from_millis(1),
459            period: Duration::from_millis(10),
460        }
461    }
462}