ktstr/scenario/ops/types/
limits.rs

1//! Per-controller cgroup v2 limit structs ([`CpuLimits`],
2//! [`MemoryLimits`], [`IoLimits`], [`PidsLimits`]) — the typed
3//! knob-bundles attached to [`super::CgroupDef`]. Each maps directly
4//! onto one or more `cgroup.*` / `*.max` / `*.weight` files; the
5//! corresponding `CgroupDef::*` builder methods (e.g. `cpu_quota`,
6//! `memory_max`, `io_weight`, `pids_max`) populate these structs
7//! lazily via `get_or_insert_with(*::default)`.
8//!
9//! [`CpuLimits::default`] is the lone hand-written `Default` impl in
10//! this file; see the impl's doc for the kernel-period footgun it
11//! avoids.
12
13#[allow(unused_imports)] // referenced by intra-doc links
14use super::CgroupDef;
15
16// ---------------------------------------------------------------------------
17// Cgroup v2 resource limits
18// ---------------------------------------------------------------------------
19
/// Knobs for the cgroup v2 CPU controller: the `cpu.max` bandwidth
/// cap and the `cpu.weight` proportional share. Every field starts
/// out meaning "inherit from parent"; the framework writes a knob
/// only when the field backing it is `Some`.
///
/// Populate through [`CgroupDef::cpu_quota_pct`],
/// [`CgroupDef::cpu_quota`] or [`CgroupDef::cpu_weight`], and drop a
/// cap again with [`CgroupDef::cpu_unlimited`].
///
/// `quota` and `weight` are independent options because the kernel
/// lets them coexist (see "CPU Interface Files" in
/// `Documentation/admin-guide/cgroup-v2.rst`): `weight` skews the
/// relative CPU share, while `quota` imposes an absolute ceiling per
/// `period`. A single declaration can therefore say "this cgroup
/// gets at most 50% of one CPU AND should lose to a heavier sibling
/// under contention".
///
/// Nothing is validated here. Checks run at `apply_setup` time and
/// any violation surfaces as `anyhow::bail!`, so a misconfigured
/// CgroupDef fails before any worker spawns.
#[derive(Debug, Clone, Eq, PartialEq)]
#[non_exhaustive]
pub struct CpuLimits {
    /// Quota half of `cpu.max`, in microseconds. `None` stands for
    /// "max" (no upper bound); `Some(q)` grants the cgroup `q` µs of
    /// CPU time in every `period`. A quota larger than the period is
    /// legal and lets the cgroup run on several CPUs at once (e.g.
    /// quota 200_000 / period 100_000 = up to 2 CPUs of throughput).
    ///
    /// The [`CgroupDef::cpu_quota_pct`] convenience builder leaves
    /// `period` at its 100_000 µs (100 ms) default. Reach for
    /// [`CgroupDef::cpu_quota`] when a different period is needed
    /// (e.g. tighter control loops with 10 ms periods for
    /// latency-sensitive scheduler tests).
    pub max_quota_us: Option<u64>,
    // NOTE(review): the struct-level doc says a knob is written only
    // when its field is `Some`, while the doc below says the period
    // is both "ignored" and written as `"max <period>"` when no quota
    // is set. Confirm against `apply_setup` which one holds.
    /// Period half of `cpu.max`. Required whenever `max_quota_us` is
    /// `Some`. Ignored when `max_quota_us` is `None` (the framework
    /// writes `"max <period>"` so the period stays recorded for
    /// diagnostics).
    pub max_period_us: u64,
    /// Relative-share weight written to `cpu.weight` (range
    /// 1..=10000, default 100). `None` keeps the kernel default.
    /// The larger the value, the larger the share this cgroup wins
    /// when the parent cgroup's CPU is contended.
    pub weight: Option<u32>,
}
64
/// Knobs for the cgroup v2 memory controller: `memory.max`,
/// `memory.high`, `memory.low` and `memory.swap.max`. All four
/// fields default to `None` (inherit from parent / no limit).
///
/// The usual way to fill this in is the builder family
/// [`CgroupDef::memory_max`], [`CgroupDef::memory_high`],
/// [`CgroupDef::memory_low`], [`CgroupDef::memory_unlimited`],
/// [`CgroupDef::memory_swap_max`] and
/// [`CgroupDef::memory_swap_unlimited`]. Those keep test code in
/// chain position and route each per-knob value through the
/// framework's validation seam at `apply_setup`. Build a
/// [`MemoryLimits`] by hand only to copy one between
/// [`CgroupDef`]s.
#[derive(Debug, Default, Clone, Eq, PartialEq)]
#[non_exhaustive]
pub struct MemoryLimits {
    // NOTE(review): the struct-level doc describes `None` as
    // "inherit from parent / no limit", while the field docs say
    // `None` writes `"max"` (or `"0"` for `low`). Confirm against
    // `apply_setup` whether a `None` field is written at all.
    /// Hard ceiling in bytes, written to `memory.max`. Going over it
    /// invokes the cgroup OOM killer (see "Memory Interface Files"
    /// in `Documentation/admin-guide/cgroup-v2.rst`). `None` writes
    /// `"max"` (no hard limit).
    pub max: Option<u64>,
    /// Soft throttle threshold in bytes, written to `memory.high`.
    /// Going over it triggers reclaim throttling but NOT OOM-kill.
    /// `None` writes `"max"`.
    pub high: Option<u64>,
    /// Soft protection threshold in bytes, written to `memory.low`.
    /// The kernel preferentially reclaims FROM other cgroups before
    /// it reclaims this cgroup's memory below `low`. `None` writes
    /// `"0"` (no protection).
    pub low: Option<u64>,
    // NOTE(review): the first paragraph below says `None` writes
    // `"max"`, but the `CONFIG_SWAP=n` section says the write is only
    // emitted when `swap_max.is_some()`. Both statements come from
    // the previous revision of this doc; confirm which is current.
    /// Cap on the cgroup's swap usage in bytes, written to
    /// `memory.swap.max`. `None` writes `"max"` (no swap cap, the
    /// kernel default). On the kernel side the wire value goes
    /// through `page_counter_memparse` (`swap_max_write` in
    /// `mm/memcontrol.c`), which accepts the literal `"max"` or a
    /// decimal byte count.
    ///
    /// # `CONFIG_SWAP=n` kernel detection
    ///
    /// The `memory.swap.max` file is present only on kernels built
    /// with `CONFIG_SWAP=y`. Swap-disabled builds lack the file and
    /// the wire-time write returns ENOENT. Because the framework
    /// emits the write only when `swap_max.is_some()` — an explicit
    /// opt-in that matches the per-knob semantics of the pids block
    /// — tests that never call [`CgroupDef::memory_swap_max`] /
    /// [`CgroupDef::memory_swap_unlimited`] succeed verbatim on a
    /// swap-disabled kernel.
    ///
    /// **`swap_max = Some(N)` on a `CONFIG_SWAP=n` kernel surfaces
    /// as a hard scenario failure**: the ENOENT raised by
    /// `set_memory_swap_max`'s `write_with_timeout` is propagated by
    /// `apply_setup` up the error chain, with the `memory.swap.max`
    /// filename in the context. A test that targets the swap
    /// controller must therefore either (a) gate the swap_max call
    /// on a host probe, or (b) require a test kernel built with
    /// `CONFIG_SWAP=y` and document that requirement on the test.
    ///
    /// # ktstr's kernel config and swap
    ///
    /// `ktstr.kconfig` — the project-level kernel-config fragment
    /// that `cargo ktstr` merges into the test kernel's defconfig —
    /// does NOT pin `CONFIG_SWAP=y`: swap is not a test-framework
    /// requirement, and many test scenarios run faster without it.
    /// Tests that call `memory_swap_max` consequently have two
    /// options. They can extend the per-test kconfig fragment
    /// (passed alongside `ktstr.kconfig` at kernel-build time), or
    /// they can detect swap support at scenario-setup time by
    /// reading `/proc/swaps` (a missing file or empty body indicates
    /// no swap subsystem) or `/proc/config.gz` (search for
    /// `CONFIG_SWAP=y`). The framework does NOT auto-detect, because
    /// host probing is policy that belongs to the test author, not
    /// the workload runner.
    pub swap_max: Option<u64>,
}
136
/// Limits for the pids controller, i.e. the single `pids.max` knob.
/// The default is `None` (inherit from parent — typically `"max"`,
/// no ceiling).
///
/// Lowering the limit never kills anything: per the kernel's
/// `pids_max_write`, tasks that already exist survive even when the
/// limit lands below the current task count. Only later `fork()` /
/// `clone()` calls are blocked, once the cgroup's task count meets
/// the limit. That makes the knob useful for fork-bomb and
/// task-count-ceiling tests.
///
/// # Per-WorkType thread-budget guidance
///
/// Every task inside the cgroup — process AND thread — counts
/// against `pids.max`. A limit sized below the workload's natural
/// task budget produces silent fork failures, which surface as
/// `WorkloadConfig`-level workers refusing to start.
///
/// **Most variants spawn exactly one task per worker** — their
/// [`worker_main`](crate::workload) dispatch arm neither spawns
/// helper threads nor forks children. Two variants are exceptions
/// and run internal helper threads inside the worker process:
/// `Schbench` (`message_threads` message threads, each spawning
/// `worker_threads` worker threads, plus a control thread) and
/// `Taobench` (`client_threads` client threads + `slow_threads`
/// dispatcher threads). Their per-worker task counts are
/// config/CPU-sized, not 1. The per-worker budget therefore depends
/// on three things: the [`CloneMode`](crate::workload::CloneMode)
/// (whether each worker is a process or a thread sharing the
/// parent's tgid), the variant's internal helper-thread topology,
/// and whether the variant transiently forks short-lived children
/// inside its own loop. The columns below capture all three:
///
/// | Variant | Steady-state tasks | Transient peak |
/// |---------|--------------------|----------------|
/// | `SpinWait`, `YieldHeavy`, `Mixed` | 1/worker | — |
/// | `Bursty`, `IdleChurn` | 1/worker | — |
/// | `IoSyncWrite`, `IoRandRead`, `IoConvoy` | 1/worker | — |
/// | `CachePressure`, `CacheYield`, `CachePipe` | 1/worker | — |
/// | `PageFaultChurn` | 1/worker | — |
/// | `AffinityChurn`, `PolicyChurn`, `NiceSweep` | 1/worker | — |
/// | `NumaWorkingSetSweep`, `NumaMigrationChurn`, `CgroupChurn` | 1/worker | — |
/// | `Sequence` | 1/worker | — |
/// | `AluHot`, `SmtSiblingSpin`, `IpcVariance` | 1/worker | — |
/// | `PipeIo`, `FutexPingPong`, `AsymmetricWaker`, `SignalStorm` | 1/worker | — |
/// | `FutexFanOut`, `FanOutCompute` | 1/worker | — |
/// | `ThunderingHerd`, `MutexContention`, `WakeChain` | 1/worker | — |
/// | `PriorityInversion`, `ProducerConsumerImbalance` | 1/worker | — |
/// | `RtStarvation`, `PreemptStorm`, `EpollStorm` | 1/worker | — |
/// | `CrossAffinityChurn`, `TimerLatency`, `NetTraffic`, `IrqWake` | 1/worker | — |
/// | `ForkExit` | 1/worker | +1/worker (waitpid'd before next iter) |
/// | `CgroupAttachStorm` | 1/worker | +1/worker (forked child per iter, `_exit`s + auto-reaped) |
/// | `Schbench`, `Taobench` | >1/worker (internal helper threads, config/CPU-sized) | — |
/// | `Custom` | 1/worker | depends on user closure (see below) |
///
/// **`CloneMode::Fork`** (the default): every worker is its own
/// process placed in the cgroup, so one `WorkSpec` contributes
/// exactly `num_workers` tasks. With `ForkExit` the instantaneous
/// peak is `2 × num_workers` (each parent forks one child,
/// waitpid's, repeats).
///
/// **`CloneMode::Thread`**: every worker is a thread sharing the
/// test runner's tgid. Since the pids controller counts each thread
/// as a task, one `WorkSpec` contributes `num_workers + 1` tasks
/// (workers + the parent task). `ForkExit` is rejected at spawn
/// time under Thread mode (see
/// [`WorkType::ForkExit`](crate::workload::WorkType::ForkExit)).
///
/// **`Custom`**: the user closure runs in a single task per worker
/// (1/worker, identical to every other variant). Any fork/clone the
/// closure issues inside its loop adds to the cgroup's task count
/// for as long as the resulting child lives, so `pids.max` must
/// reserve headroom equal to the closure's peak child count per
/// worker. Under `CloneMode::Fork` the framework reaps
/// closure-spawned descendants at teardown via
/// `killpg(worker_pid, SIGKILL)` against the worker's per-process
/// group, which leaves transient children bounded by the closure
/// itself. Under `CloneMode::Thread` the worker shares the test
/// runner's pgid and `killpg`-based cleanup is unavailable; the
/// closure owns whatever helpers it spawns and must reap them
/// explicitly before returning the
/// [`WorkerReport`](crate::workload::WorkerReport).
///
/// **Sizing rule**: `pids.max ≥ Σ(steady-state + transient)` over
/// every [`WorkSpec`](crate::workload::WorkSpec) in the cgroup,
/// plus headroom for `cgroup.procs` migration scratch tasks and any
/// payload-binary helper processes the test attaches via
/// [`CgroupDef::workload`] (e.g. `stress-ng` spawns one task per
/// `--cpu N`). Tests with composed `WorkSpec` groups must sum
/// across every group — the framework does NOT auto-derive a budget
/// from the work spec.
///
/// # Parent-cgroup hierarchical charging
///
/// `pids.max` is a per-cgroup ceiling, yet each fork/clone is
/// charged to every ancestor up to (but not including) the
/// unified-hierarchy root. The kernel's `pids_can_fork` calls
/// `pids_try_charge`, which loops
/// `for (p = pids; parent_pids(p); p = parent_pids(p))` and charges
/// each level (kernel/cgroup/pids.c). The loop's `parent_pids(p)`
/// termination condition is why root is NOT charged. EAGAIN
/// propagates from the FIRST level (leaf-to-root traversal order)
/// whose post-charge counter exceeds its limit, so a child cgroup
/// with `pids.max = 1024` still hits EAGAIN when a parent two
/// levels up sits at its own ceiling.
///
/// Sizing rule for nested test trees: the *effective* limit is
/// `min(pids.max)` along the path from the test cgroup up to the
/// pids-controlled root, NOT just the value set on the test cgroup
/// itself. When ktstr runs under a delegated parent slice (systemd
/// `user.slice`, container runtime cgroup, ktstr's own build
/// sandbox), inspect the parent's `pids.max` before sizing the test
/// cgroup — a generous test-cgroup setting is silently shadowed by
/// a tighter ancestor.
///
/// # `pids.max(0)` is rejected at apply_setup, not type-level
///
/// `Some(0)` would silently halt every fork/clone inside the
/// cgroup, including the worker spawn itself for `CloneMode::Fork`
/// and the `ForkExit` per-iteration child fork. Because the kernel
/// accepts the value (it's a legitimate `pids_max_write` input),
/// the bail lives in `apply_setup` and fires at scenario-setup
/// time. Promoting it to a type-level invariant (e.g. `NonZeroU64`)
/// would force every numeric literal through a non-`const`
/// constructor and ripple into every test fixture. The runtime bail
/// keeps the surface ergonomic while still surfacing the
/// foot-cannon before any worker spawns.
///
/// Set via [`CgroupDef::pids_max`] or [`CgroupDef::pids_unlimited`].
/// Those builders are the preferred entry point because they route
/// the per-knob value through the framework's validation seam at
/// `apply_setup`. Build a [`PidsLimits`] by hand only to copy one
/// between [`CgroupDef`]s.
#[derive(Debug, Default, Clone, Eq, PartialEq)]
#[non_exhaustive]
pub struct PidsLimits {
    // NOTE(review): the struct-level doc describes `None` as
    // "inherit from parent", while the doc below says `None` writes
    // the literal `"max"`. Confirm against `apply_setup` which one
    // holds.
    /// Task-count ceiling written to `pids.max`. `None` writes the
    /// literal string `"max"` (the kernel's `PIDS_MAX_STR` sentinel
    /// for unlimited); `Some(n)` writes the decimal `n`. The kernel
    /// answers EINVAL for negative values and for values
    /// `>= PIDS_MAX (PID_MAX_LIMIT + 1, typically ~4M on 64-bit)`.
    /// The framework's `apply_setup` rejects `Some(0)` before the
    /// syscall, because a 0 limit silently halts every fork or
    /// clone inside the cgroup, blocking both worker spawn under
    /// `CloneMode::Fork` and `ForkExit`'s per-iteration child fork.
    pub max: Option<u64>,
}
284
/// Limits for the IO controller; only `io.weight` is exposed.
/// Per-device throughput caps (`io.max`) are deliberately left out:
/// that interface needs a major:minor device-id lookup which has no
/// in-tree consumer. Surface it when a concrete use case lands.
///
/// Set via [`CgroupDef::io_weight`], which is the preferred entry
/// point because it routes the per-knob value through the
/// framework's validation seam at `apply_setup`. Build an
/// [`IoLimits`] by hand only to copy one between [`CgroupDef`]s.
#[derive(Debug, Default, Clone, Eq, PartialEq)]
#[non_exhaustive]
pub struct IoLimits {
    /// Relative-share weight written to `io.weight` (range
    /// 1..=10000, default 100). `None` keeps the kernel default.
    pub weight: Option<u16>,
}
302
303impl Default for CpuLimits {
304    /// `cpu.max` quota off, period 100 ms (the kernel default for
305    /// `cpu.max`'s second column), `cpu.weight` unset. Matches the
306    /// initial state used by the four `CgroupDef::cpu_*` builders;
307    /// changing the default period only edits here.
308    ///
309    /// The derived `Default` would produce `max_period_us: 0` which
310    /// `apply_setup` rejects (kernel requires period > 0). Manual
311    /// impl avoids that footgun for `..Default::default()` callers.
312    fn default() -> Self {
313        Self {
314            max_quota_us: None,
315            max_period_us: 100_000,
316            weight: None,
317        }
318    }
319}