ktstr/scenario/ops/types/cgroup_def.rs
1//! Declarative cgroup blueprint — [`CgroupDef`] struct + the
2//! full builder-method surface (Group I per-WorkSpec setters,
3//! Group II `default_*` merges, Group III in-place `pcomm`
4//! stamping, plus the cpu / memory / io / pids controller knobs).
5//! See the type-level doc on [`CgroupDef`] for the per-controller
6//! summary table and the three builder-pattern groups.
7//!
8//! `CgroupDef` deliberately has NO `Default` impl — see the note at
9//! the foot of this file (and `tests::assert_not_impl_default!` in
10//! `super::tests`) for the rationale (`name = "cg_0"` would
11//! silently collide with the conventional first cgroup name).
12
13use std::borrow::Cow;
14use std::collections::BTreeSet;
15use std::time::Duration;
16
17use crate::workload::{WorkSpec, WorkType};
18
19#[allow(unused_imports)] // referenced by intra-doc links
20use super::Op;
21use super::{CpuLimits, CpusetSpec, IoLimits, MemoryLimits, PidsLimits};
22
23// ---------------------------------------------------------------------------
24// CgroupDef
25// ---------------------------------------------------------------------------
26
/// Declarative cgroup definition: name + cpuset + synthetic
/// [`WorkSpec`] groups + optional userspace [`Payload`](crate::test_support::Payload).
///
/// Bundles the ops that always go together (AddCgroup + SetCpuset +
/// Spawn) into a single value. The executor creates the cgroup, optionally
/// sets its cpuset, spawns workers for each [`WorkSpec`] entry, and moves
/// them into the cgroup.
///
/// Multiple [`WorkSpec`] entries run in parallel within the cgroup. Each
/// entry spawns its own set of worker processes. The optional
/// [`Self::payload`] slot holds a *single* userspace binary that runs
/// alongside those synthetic [`WorkSpec`] groups: `works` is plural,
/// `payload` is singular — a distinction the older "workload(s)"
/// wording blurred.
///
/// Use `CgroupDef` in `Step::with_defs` for scenarios where cgroups are
/// created once and run for the step duration. Use `Op::add_cgroup` +
/// `Op::spawn(SpawnPlacement::cgroup(name), work)` directly when you
/// need mid-step cgroup creation, removal, or other dynamic operations
/// between spawn and collect.
///
/// # Resource controllers overview
///
/// `CgroupDef` exposes one builder method per cgroup v2 controller
/// knob, each writing the corresponding `cgroup.*` / `*.max` /
/// `*.weight` file at `apply_setup` time. The full surface:
///
/// | Controller | One-line description | Builder methods | Underlying file(s) |
/// |------------|----------------------|-----------------|--------------------|
/// | cpuset | Bind to a CPU subset and NUMA-node memory affinity. | [`Self::cpuset`], [`Self::cpuset_mems`] | `cpuset.cpus`, `cpuset.mems` |
/// | cpu | Bandwidth ceiling (`cpu.max` quota/period) plus relative-share weight. | [`Self::cpu_quota_pct`], [`Self::cpu_quota`], [`Self::cpu_unlimited`], [`Self::cpu_weight`] | `cpu.max`, `cpu.weight` |
/// | memory | Hard ceiling, soft throttle threshold, soft protection floor, swap cap. | [`Self::memory_max`], [`Self::memory_high`], [`Self::memory_low`], [`Self::memory_swap_max`], [`Self::memory_swap_unlimited`], [`Self::memory_unlimited`] | `memory.max`, `memory.high`, `memory.low`, `memory.swap.max` |
/// | io | Relative IO share (BFQ / io.cost) when the io controller is enabled. | [`Self::io_weight`] | `io.weight` |
/// | pids | Task-count ceiling — fork(2)/clone(2) returns EAGAIN once the cap is hit. | [`Self::pids_max`], [`Self::pids_unlimited`] | `pids.max` |
/// | freeze | Pause/resume every task in the cgroup mid-run via the JOBCTL freeze path. | (Op-level) [`Op::freeze_cgroup`], [`Op::unfreeze_cgroup`] | `cgroup.freeze` |
///
/// `CgroupDef` covers steady-state resource limits — knobs that
/// hold for the cgroup's whole lifetime. The freeze knob is
/// intentionally exposed at the [`Op`] layer instead, because
/// freeze/unfreeze describe transitions over time (suspend
/// mid-step, resume later) rather than the cgroup's identity; see
/// the "See also" section below for the full Op-variants list.
///
/// All builders are additive — a `CgroupDef` accumulates an
/// optional [`CpuLimits`] / [`MemoryLimits`] / [`IoLimits`] /
/// [`PidsLimits`] block. When a block is set (e.g. `def.memory`
/// is `Some`), **all** knobs in that block are written —
/// `None`-valued fields emit their kernel-default sentinel
/// (`"max"` for `memory.max`/`memory.high`, `"0"` for
/// `memory.low`). Only `memory.swap.max` is gated: `None` means
/// no write (for `CONFIG_SWAP=n` compatibility). The "*_unlimited"
/// builders explicitly rewind a knob to its sentinel value
/// (`"max"` / `"0"`) so a base `CgroupDef` factory can cap a
/// resource and a per-test extension can clear that cap without
/// rewriting the whole `CgroupDef`.
///
/// Validation runs at `apply_setup` time (before any worker
/// spawn): out-of-range weights, `cpu.max period == 0`, and
/// `pids.max == Some(0)` all produce actionable bails before the
/// syscall fires. The kernel is the final authority on
/// per-controller numeric ranges; framework-level checks catch
/// only the footguns documented per-builder.
///
/// # Builder semantics
///
/// The setters fall into three groups:
///
/// **Group I — per-WorkSpec setters that target `works[0]`:**
/// [`workers`](Self::workers), [`workers_pct`](Self::workers_pct),
/// [`work_type`](Self::work_type), [`sched_policy`](Self::sched_policy),
/// [`affinity`](Self::affinity), [`mem_policy`](Self::mem_policy),
/// [`mpol_flags`](Self::mpol_flags). Each mutates `self.works[0]`,
/// auto-inserting a default [`WorkSpec`] when `works` is empty. There
/// is NO cgroup-level default for these knobs — per-group identity (or
/// per-group cpuset validation) makes fan-out semantically ambiguous.
/// Use [`work`](Self::work) + per-`WorkSpec` setters for multi-group
/// cgroups.
///
/// **Group II — cgroup-level `default_*` merge:**
/// [`nice`](Self::nice), [`comm`](Self::comm), [`uid`](Self::uid),
/// [`gid`](Self::gid), [`numa_node`](Self::numa_node). Each stores a
/// value in a `default_*` field on `CgroupDef`. Every [`WorkSpec`] in
/// [`works`](Self::works) whose corresponding `Option`-typed field is
/// `None` inherits the default at [`merged_works`](Self::merged_works)
/// time — ORDER-INDEPENDENT with [`work`](Self::work). `Some(_)`
/// (including `Some(0)`) opts out.
///
/// **Group III — [`pcomm`](Self::pcomm):** mutates `works` in-place
/// at call time, NOT order-independent — by design. See
/// [`pcomm`](Self::pcomm) for the coalescing rationale.
///
/// Other setters ([`cpuset`](Self::cpuset),
/// [`cpuset_mems`](Self::cpuset_mems), the
/// [`cpu_quota`](Self::cpu_quota) / [`memory_max`](Self::memory_max)
/// / [`io_weight`](Self::io_weight) / [`pids_max`](Self::pids_max)
/// controller families, [`workload`](Self::workload),
/// [`swappable`](Self::swappable)) set cgroup-level state directly
/// and do not belong to any of the three groups above.
///
/// # See also
///
/// `CgroupDef` only expresses the steady-state shape of a cgroup
/// (name, cpuset, resource limits, work groups, payload). State
/// changes that need to happen DURING a step — without tearing the
/// cgroup down and recreating it — go through dedicated [`Op`]
/// variants instead:
///
/// * [`Op::FreezeCgroup`] / [`Op::UnfreezeCgroup`] — pause and
///   resume every task in the cgroup via `cgroup.freeze` (the
///   kernel-side asynchronous freeze path; not a SIGSTOP).
///   Useful for scheduler suspend/resume tests that observe
///   how the scheduler handles a workload that goes idle
///   mid-step. **Do not freeze a cgroup hosting the test's own
///   observers** — see the deadlock warning on
///   [`Op::FreezeCgroup`].
/// * [`Op::SetCpuset`] — re-pin an existing cgroup's cpuset to
///   exercise the scheduler's response to a moving CPU mask
///   without disrupting the worker tasks themselves.
/// * [`Op::AddCgroup`] / [`Op::RemoveCgroup`] — add or destroy
///   cgroups mid-step when a `CgroupDef`'s lifecycle is
///   tied to step duration but the test wants a different
///   (e.g. nested) cgroup to appear or disappear partway
///   through.
///
/// These describe transitions over time rather than the cgroup's
/// identity, which is why they live as `Op` variants alongside
/// the rest of the operation vocabulary rather than as
/// `CgroupDef` builders.
///
/// # Examples
///
/// ```
/// # use ktstr::scenario::ops::{CgroupDef, CpusetSpec};
/// # use ktstr::workload::{WorkSpec, WorkType};
/// // Single work group via convenience methods.
/// let def = CgroupDef::named("workers")
///     .cpuset(CpusetSpec::disjoint(0, 2))
///     .workers(4)
///     .work_type(WorkType::SpinWait);
///
/// assert_eq!(def.name, "workers");
/// assert_eq!(def.works[0].num_workers, Some(4));
///
/// // Multiple concurrent work groups via .work().
/// let def = CgroupDef::named("mixed")
///     .work(WorkSpec::default().workers(4).work_type(WorkType::SpinWait))
///     .work(WorkSpec::default().workers(2).work_type(WorkType::YieldHeavy));
///
/// assert_eq!(def.works.len(), 2);
///
/// // Synthetic work + userspace binary side-by-side via .workload(&X).
/// // The binary runs inside the same cgroup as the WorkSpec handles;
/// // both spawn in apply_setup, the WorkSpec groups first, then the
/// // Payload after the cpuset settles.
/// # use ktstr::test_support::Payload;
/// # const BENCH: Payload = Payload::binary("bench", "bench");
/// let def = CgroupDef::named("io_and_spin")
///     .cpuset(CpusetSpec::disjoint(0, 2))
///     .workers(2)
///     .work_type(WorkType::SpinWait)
///     .workload(&BENCH);
///
/// assert!(def.payload.is_some());
/// assert_eq!(def.works[0].num_workers, Some(2));
/// ```
#[derive(Clone, Debug)]
pub struct CgroupDef {
    /// Cgroup name relative to the scenario's parent cgroup. Must be a
    /// valid cgroupfs filename.
    pub name: Cow<'static, str>,
    /// Optional cpuset assignment. `None` inherits the parent cgroup's
    /// cpuset (typically the scenario's usable CPU set).
    pub cpuset: Option<CpusetSpec>,
    /// WorkSpec groups to spawn. Empty means use a single default WorkSpec
    /// (SpinWait, Normal, `ctx.workers_per_cgroup` workers — defaults to 1
    /// from `CtxBuilder` unless the scenario overrides it explicitly).
    pub works: Vec<WorkSpec>,
    /// When true, the gauntlet work_type override replaces each WorkSpec's
    /// work_type (applied per-WorkSpec via resolve_work_type).
    pub swappable: bool,
    /// Optional userspace [`Payload`](crate::test_support::Payload) to
    /// launch inside this cgroup.
    ///
    /// **Spawn order within `apply_setup`**: the cgroup is created
    /// (`add_cgroup_no_cpuset`), its cpuset is resolved + set, then
    /// each `WorkSpec` entry is spawned and moved into the cgroup in
    /// declaration order, and finally — after every synthetic
    /// `WorkSpec` handle has started — the `Payload` is spawned via
    /// `PayloadRun::new(ctx, p).in_cgroup(name).spawn()`. This
    /// fixed order lets the cgroup cpuset and mempolicy settle on
    /// the `WorkSpec` handles before the binary inherits placement, so
    /// the binary sees a stable topology. Once spawned, all three
    /// (cgroup, works, payload) run concurrently until teardown.
    ///
    /// Only
    /// [`PayloadKind::Binary`](crate::test_support::PayloadKind::Binary)
    /// payloads are accepted — scheduler-kind payloads are rejected
    /// at construction time via [`Self::workload`]. The payload is
    /// killed at step-teardown (before cgroup removal) so the cgroup
    /// removal does not fail with EBUSY.
    pub payload: Option<&'static crate::test_support::Payload>,
    /// Optional cpuset.mems NUMA node binding. `None` inherits the
    /// parent cgroup's `cpuset.mems`. Set via
    /// [`Self::cpuset_mems`].
    pub cpuset_mems: Option<BTreeSet<usize>>,
    /// Optional cpu controller limits (`cpu.max`, `cpu.weight`).
    /// `None` leaves both kernel defaults in place. Populated by
    /// [`Self::cpu_quota_pct`] / [`Self::cpu_quota`] /
    /// [`Self::cpu_unlimited`] / [`Self::cpu_weight`].
    pub cpu: Option<CpuLimits>,
    /// Optional memory controller limits (`memory.max`,
    /// `memory.high`, `memory.low`, `memory.swap.max`). `None`
    /// leaves all four at the kernel defaults. Populated by
    /// [`Self::memory_max`] / [`Self::memory_high`] /
    /// [`Self::memory_low`] / [`Self::memory_swap_max`] /
    /// [`Self::memory_unlimited`].
    pub memory: Option<MemoryLimits>,
    /// Optional io controller limits (`io.weight`). `None` leaves
    /// the kernel default in place. Set via [`Self::io_weight`].
    pub io: Option<IoLimits>,
    /// Optional pids controller limits (`pids.max`). `None` leaves
    /// the kernel default in place (no ceiling). Set via
    /// [`Self::pids_max`].
    pub pids: Option<PidsLimits>,
    /// Cgroup-level default for [`WorkSpec::nice`]. When `Some(n)`,
    /// every [`WorkSpec`] in [`Self::works`] whose own `nice` field
    /// is `None` (the framework's "skip setpriority(2)" state — see
    /// [`WorkloadConfig::nice`](crate::workload::WorkloadConfig::nice))
    /// inherits `Some(n)` at apply-setup time. Set via [`Self::nice`];
    /// merged in [`Self::merged_works`].
    ///
    /// Order-independent with [`Self::work`]: `def.work(spec).nice(n)`
    /// and `def.nice(n).work(spec)` produce identical effective
    /// `WorkSpec` values because the merge runs at `merged_works()`
    /// call time, not at builder-method call time.
    pub default_nice: Option<i32>,
    /// Cgroup-level default for [`WorkSpec::comm`]. Merged into any
    /// [`WorkSpec`] whose own `comm` is `None` at apply-setup time.
    /// Set via [`Self::comm`]; merged in [`Self::merged_works`].
    pub default_comm: Option<Cow<'static, str>>,
    /// Cgroup-level default for [`WorkSpec::uid`]. Merged into any
    /// [`WorkSpec`] whose own `uid` is `None` at apply-setup time.
    /// Set via [`Self::uid`]; merged in [`Self::merged_works`].
    pub default_uid: Option<u32>,
    /// Cgroup-level default for [`WorkSpec::gid`]. Merged into any
    /// [`WorkSpec`] whose own `gid` is `None` at apply-setup time.
    /// Set via [`Self::gid`]; merged in [`Self::merged_works`].
    pub default_gid: Option<u32>,
    /// Cgroup-level default for [`WorkSpec::numa_node`]. Merged into
    /// any [`WorkSpec`] whose own `numa_node` is `None` at
    /// apply-setup time. Set via [`Self::numa_node`]; merged in
    /// [`Self::merged_works`].
    pub default_numa_node: Option<u32>,
}
277
278impl CgroupDef {
279 /// Create a CgroupDef with defaults (empty works, no cpuset).
280 ///
281 /// `apply_setup` fills an empty `works` slice with one default
282 /// [`WorkSpec`] (SpinWait, SCHED_NORMAL, `ctx.workers_per_cgroup`
283 /// workers — defaults to 1 from `CtxBuilder`). For an empty
284 /// move-target cgroup with no workers, declare it via
285 /// [`Op::AddCgroup`] at step or Backdrop level. For the common
286 /// `CgroupDef::named(name).workers(ctx.workers_per_cgroup)`
287 /// pattern use [`Ctx::cgroup_def`](crate::scenario::Ctx::cgroup_def).
288 #[must_use = "dropping a CgroupDef discards the cgroup specification"]
289 pub fn named(name: impl Into<Cow<'static, str>>) -> Self {
290 Self {
291 name: name.into(),
292 cpuset: None,
293 works: vec![],
294 swappable: false,
295 payload: None,
296 cpuset_mems: None,
297 cpu: None,
298 memory: None,
299 io: None,
300 pids: None,
301 default_nice: None,
302 default_comm: None,
303 default_uid: None,
304 default_gid: None,
305 default_numa_node: None,
306 }
307 }
308
309 /// Set [`Self::cpuset`]; see [`Op::SetCpuset`] for mid-run changes.
310 #[must_use = "builder methods consume self; bind the result"]
311 pub fn cpuset(mut self, cpus: CpusetSpec) -> Self {
312 self.cpuset = Some(cpus);
313 self
314 }
315
316 /// Append a [`WorkSpec`] group (multiple calls yield concurrent groups within this cgroup).
317 #[must_use = "builder methods consume self; bind the result"]
318 pub fn work(mut self, w: WorkSpec) -> Self {
319 self.works.push(w);
320 self
321 }
322
323 /// Ensure `works[0]` exists for single-WorkSpec builder methods.
324 fn ensure_default_work(&mut self) {
325 if self.works.is_empty() {
326 self.works.push(WorkSpec::default());
327 }
328 }
329
330 /// Set [`WorkSpec::num_workers`] on `works[0]` (Group I).
331 ///
332 /// `n` MUST be `>= 1`. `n == 0` is rejected at apply-setup time
333 /// by `resolve_num_workers` (before any worker spawn) with an
334 /// actionable diagnostic naming the cgroup;
335 /// [`WorkloadConfig::validate`](crate::workload::WorkloadConfig::validate)
336 /// is the downstream defense-in-depth gate. A zero-worker spawn
337 /// would silently produce no workload load, vacuously passing
338 /// scheduler assertions that rely on observable contention. Pass
339 /// `n >= 1`; for fraction-of-cpuset sizing use [`Self::workers_pct`].
340 #[must_use = "builder methods consume self; bind the result"]
341 pub fn workers(mut self, n: usize) -> Self {
342 self.ensure_default_work();
343 self.works[0].num_workers = Some(n);
344 self
345 }
346
347 /// Set [`WorkSpec::workers_pct`] on `works[0]` (Group I). Resolved
348 /// against the cgroup's cpuset at apply-setup via
349 /// `ceil(cpuset_cpus * pct)`. Mutually exclusive with
350 /// [`Self::workers`] — see [`WorkSpec::workers_pct`].
351 ///
352 /// # Panics
353 ///
354 /// Panics when `pct` is NaN, infinite, or `<= 0.0`. Extreme
355 /// finite values (e.g. `1e100`) pass the gate and saturate to
356 /// `usize::MAX` via the `as` cast in `resolve_workers_pct`
357 /// (RFC 2484 / Rust 1.45+) — attempting to spawn that many
358 /// workers would OOM the host. Keep `pct` near the intended
359 /// oversubscription factor (e.g. `1.0`, `2.0`, `4.0`).
360 #[must_use = "builder methods consume self; bind the result"]
361 pub fn workers_pct(mut self, pct: f64) -> Self {
362 assert!(
363 pct.is_finite() && pct > 0.0,
364 "CgroupDef::workers_pct({pct}): pct must be finite and > 0.0",
365 );
366 self.ensure_default_work();
367 self.works[0].workers_pct = Some(pct);
368 self
369 }
370
371 /// Set [`WorkSpec::work_type`] on `works[0]` (Group I).
372 #[must_use = "builder methods consume self; bind the result"]
373 pub fn work_type(mut self, wt: WorkType) -> Self {
374 self.ensure_default_work();
375 self.works[0].work_type = wt;
376 self
377 }
378
379 /// Set [`WorkSpec::sched_policy`] on `works[0]` (Group I).
380 #[must_use = "builder methods consume self; bind the result"]
381 pub fn sched_policy(mut self, p: crate::workload::SchedPolicy) -> Self {
382 self.ensure_default_work();
383 self.works[0].sched_policy = p;
384 self
385 }
386
387 /// Set [`WorkSpec::affinity`] on `works[0]` (Group I).
388 #[must_use = "builder methods consume self; bind the result"]
389 pub fn affinity(mut self, a: crate::workload::AffinityIntent) -> Self {
390 self.ensure_default_work();
391 self.works[0].affinity = a;
392 self
393 }
394
395 /// Set [`WorkSpec::mem_policy`] on `works[0]` (Group I). Validated
396 /// against the resolved cpuset per-group.
397 #[must_use = "builder methods consume self; bind the result"]
398 pub fn mem_policy(mut self, p: crate::workload::MemPolicy) -> Self {
399 self.ensure_default_work();
400 self.works[0].mem_policy = p;
401 self
402 }
403
404 /// Set [`WorkSpec::mpol_flags`] on `works[0]` (Group I).
405 #[must_use = "builder methods consume self; bind the result"]
406 pub fn mpol_flags(mut self, f: crate::workload::MpolFlags) -> Self {
407 self.ensure_default_work();
408 self.works[0].mpol_flags = f;
409 self
410 }
411
    /// Group II: record `n` as the cgroup-level nice default by storing
    /// `Some(n)` in [`Self::default_nice`]. Does not touch `works`.
    ///
    /// A [`WorkSpec`] opts out of this default by carrying its own
    /// value: `WorkSpec::nice(0)` stores `Some(0)`, so that worker's
    /// nice is explicitly set to 0 via `setpriority(2)` rather than
    /// inheriting `n`.
    #[must_use = "builder methods consume self; bind the result"]
    pub const fn nice(mut self, n: i32) -> Self {
        self.default_nice = Some(n);
        self
    }
421
422 /// Set [`Self::default_comm`] (Group II).
423 ///
424 /// # Panics
425 ///
426 /// Panics on programmer-error inputs — mirrors
427 /// [`crate::workload::WorkSpec::pcomm`]'s `# Panics`:
428 /// - Empty string.
429 /// - Interior NUL byte.
430 /// - More than 15 bytes (`TASK_COMM_LEN - 1` cap).
431 ///
432 /// See
433 /// `validate_task_comm_string` (in `crate::workload`)
434 /// for the centralized rationale; `name.len()` is the BYTE
435 /// length (UTF-8 multi-byte chars count as their byte width).
436 #[must_use = "builder methods consume self; bind the result"]
437 pub fn comm(mut self, name: impl Into<std::borrow::Cow<'static, str>>) -> Self {
438 let name: std::borrow::Cow<'static, str> = name.into();
439 crate::workload::validate_task_comm_string("CgroupDef::comm", &name);
440 self.default_comm = Some(name);
441 self
442 }
443
    /// Group II: record `uid` as the cgroup-level uid default by storing
    /// `Some(uid)` in [`Self::default_uid`]. Does not touch `works`.
    #[must_use = "builder methods consume self; bind the result"]
    pub const fn uid(mut self, uid: u32) -> Self {
        self.default_uid = Some(uid);
        self
    }
450
    /// Group II: record `gid` as the cgroup-level gid default by storing
    /// `Some(gid)` in [`Self::default_gid`]. Does not touch `works`.
    #[must_use = "builder methods consume self; bind the result"]
    pub const fn gid(mut self, gid: u32) -> Self {
        self.default_gid = Some(gid);
        self
    }
457
458 /// Set the thread-group leader's comm on every WorkSpec in
459 /// this CgroupDef. Each affected WorkSpec gets `pcomm =
460 /// Some(name)`; existing per-WorkSpec `pcomm` values (set
461 /// before this call) are overwritten. Calling on an empty
462 /// `works` list pushes a default WorkSpec carrying the value.
463 ///
464 /// The pcomm string is applied via `prctl(PR_SET_NAME)` on
465 /// the forked thread-group leader. The builder rejects > 15
466 /// bytes (TASK_COMM_LEN-1) at construction so the
467 /// `task->group_leader->comm == pcomm` invariant the framework
468 /// relies on holds exactly.
469 /// Setting this triggers the fork-then-thread spawn path in
470 /// `apply_setup`: WorkSpecs sharing a `pcomm` value coalesce
471 /// into ONE thread-group leader per group; every worker
472 /// thread inside observes `task->group_leader->comm == pcomm`.
473 /// Each worker thread additionally sets its own `task->comm`
474 /// via `.comm()` on the per-WorkSpec [`WorkSpec::comm`] at
475 /// thread creation time.
476 ///
477 /// `pcomm` lives ONLY on [`WorkSpec`] — there is no
478 /// CgroupDef-level field. This builder writes the value into
479 /// every WorkSpec directly so `apply_setup` has a single
480 /// authoritative source per WorkSpec.
481 ///
482 /// **Not order-independent with [`Self::work`] — by design.**
483 /// Unlike Group II setters, `pcomm` mutates `works` in-place when
484 /// called: it stamps every WorkSpec that EXISTS at call time and
485 /// then returns. WorkSpecs added via subsequent [`Self::work`]
486 /// calls are not retroactively touched, and a WorkSpec that
487 /// already carried its own `pcomm` is OVERWRITTEN if it was
488 /// pushed before `.pcomm(..)` ran. This is intentional — `pcomm`
489 /// determines the thread-group leader's coalescing key in
490 /// `apply_setup`, so the framework needs the value baked onto
491 /// each WorkSpec by the time `merged_works()` runs. Storing it
492 /// as a default and merging at read time would break the
493 /// coalescing contract for the empty-works case (the synthesised
494 /// `WorkSpec::default()` would have to carry the pcomm without
495 /// distinguishing "default" from "explicit override").
496 ///
497 /// # Panics
498 ///
499 /// Panics on programmer-error inputs — mirrors
500 /// [`crate::workload::WorkSpec::pcomm`]'s `# Panics`:
501 /// - Empty string.
502 /// - Interior NUL byte.
503 /// - More than 15 bytes (`TASK_COMM_LEN - 1` cap).
504 ///
505 /// See
506 /// `validate_task_comm_string` (in `crate::workload`)
507 /// for the centralized rationale; `name.len()` is the BYTE
508 /// length (UTF-8 multi-byte chars count as their byte width).
509 #[must_use = "builder methods consume self; bind the result"]
510 pub fn pcomm(mut self, name: impl Into<Cow<'static, str>>) -> Self {
511 let name: Cow<'static, str> = name.into();
512 // Validate ONCE before the in-place loop so a bad input
513 // never partially writes the `works` vec — the per-builder
514 // assert fires with a single named site rather than firing
515 // mid-mutation across N entries.
516 crate::workload::validate_task_comm_string("CgroupDef::pcomm", &name);
517 if self.works.is_empty() {
518 self.works.push(WorkSpec::default());
519 }
520 for w in &mut self.works {
521 w.pcomm = Some(name.clone());
522 }
523 self
524 }
525
    /// Group II: record `node` as the cgroup-level NUMA-node default by
    /// storing `Some(node)` in [`Self::default_numa_node`]. Does not
    /// touch `works`.
    #[must_use = "builder methods consume self; bind the result"]
    pub const fn numa_node(mut self, node: u32) -> Self {
        self.default_numa_node = Some(node);
        self
    }
532
    /// Overwrite [`Self::swappable`] with the given flag. When the flag
    /// is true, the gauntlet work_type override replaces each
    /// WorkSpec's work_type (see the field docs).
    #[must_use = "builder methods consume self; bind the result"]
    pub const fn swappable(mut self, swappable: bool) -> Self {
        self.swappable = swappable;
        self
    }
539
540 /// Attach a userspace payload binary that runs inside this cgroup
541 /// alongside any synthetic [`WorkSpec`] groups. The payload spawns
542 /// when the step enters `apply_setup` and is killed during
543 /// step-teardown so the cgroup can be removed cleanly.
544 ///
545 /// # Panics
546 ///
547 /// Panics when `p.is_scheduler()` (i.e. `p` is a scheduler-kind
548 /// [`Payload`](crate::test_support::Payload) — `KERNEL_DEFAULT`
549 /// or any other `PayloadKind::Scheduler*` variant). Only
550 /// [`PayloadKind::Binary`](crate::test_support::PayloadKind::Binary)
551 /// payloads are accepted; `CgroupDef.workload` is for userspace
552 /// binary payloads only, and scheduler placement uses
553 /// `#[ktstr_test(scheduler = ...)]` instead.
554 ///
555 /// **Why panic at declaration time, not at spawn time?** Three
556 /// reasons, all of which favor failing fast:
557 /// 1. **Discovery-time surfacing.** `CgroupDef` builders run
558 /// during test construction, which nextest's `--list`
559 /// invocation reaches BEFORE any VM boot. A panic here
560 /// emits a full backtrace inside the test binary and
561 /// surfaces the offending call site immediately; a deferred
562 /// runtime error would require a KVM-capable host + a
563 /// kernel image + an initramfs build to observe — a 30+
564 /// second feedback loop for what is purely a
565 /// typed-API misuse.
566 /// 2. **No side effects.** The panic happens before
567 /// `CgroupDef.payload = Some(p)` assignment runs, so the
568 /// in-progress builder is left in its prior (no-payload)
569 /// state. A caller that catches the panic via
570 /// `catch_unwind` sees a valid CgroupDef either way.
571 /// 3. **Scheduler-kind is always a programming error here.**
572 /// `Payload::KERNEL_DEFAULT` in `CgroupDef::workload` is never a
573 /// legitimate use case — it means the author confused the
574 /// `scheduler` slot (test-level) with the `workload` slot
575 /// (cgroup-level). There is no recovery path; the only
576 /// resolution is editing the source.
577 ///
578 /// Scheduler-kind payloads in the step-level `Op::RunPayload`
579 /// path bail with an `anyhow::Error` instead of panicking —
580 /// that path runs during scenario execution where one bad op
581 /// should not crash a whole test run.
582 #[must_use = "builder methods consume self; bind the result"]
583 pub fn workload(mut self, p: &'static crate::test_support::Payload) -> Self {
584 assert!(
585 !p.is_scheduler(),
586 "CgroupDef::workload called with a scheduler-kind Payload ({}); \
587 CgroupDef.workload is for userspace binary payloads only. \
588 Use #[ktstr_test(scheduler = ...)] for scheduler placement.",
589 p.name,
590 );
591 self.payload = Some(p);
592 self
593 }
594
595 /// Bind `cpuset.mems` for this cgroup. Mirrors [`Self::cpuset`]
596 /// for NUMA memory placement: the cgroup's tasks may only
597 /// allocate memory on the listed NUMA nodes. `None` (default)
598 /// inherits the parent's `cpuset.mems`.
599 ///
600 /// Required when the cgroup spans CPUs on a NUMA node whose
601 /// memory is NOT in the parent's `cpuset.mems` — allocations
602 /// from the cgroup's tasks are constrained to the parent's
603 /// allowed nodes per kernel/cgroup/cpuset.c. The framework
604 /// writes `cpuset.mems` immediately after `cpuset.cpus` so the
605 /// binding is in effect before any worker is moved in.
606 #[must_use = "builder methods consume self; bind the result"]
607 pub fn cpuset_mems(mut self, nodes: BTreeSet<usize>) -> Self {
608 self.cpuset_mems = Some(nodes);
609 self
610 }
611
612 /// Set `cpu.max` quota as a percentage of one CPU's
613 /// throughput, with a default 100 ms `period`. `100` means
614 /// "one full CPU" (quota=100_000, period=100_000); `200` means
615 /// "two CPUs". Use [`Self::cpu_quota`] for non-default periods.
616 #[must_use = "builder methods consume self; bind the result"]
617 pub fn cpu_quota_pct(mut self, pct: u32) -> Self {
618 let cpu = self.cpu.get_or_insert_with(CpuLimits::default);
619 cpu.max_period_us = 100_000;
620 cpu.max_quota_us = Some((pct as u64) * 1_000);
621 self
622 }
623
624 /// Set `cpu.max` quota and period directly. `quota` may exceed
625 /// `period` (multi-CPU concurrency, see [`CpuLimits::max_quota_us`]).
626 /// Both arguments are converted to microseconds; sub-microsecond
627 /// fractions in the supplied [`Duration`]s are truncated.
628 #[must_use = "builder methods consume self; bind the result"]
629 pub fn cpu_quota(mut self, quota: Duration, period: Duration) -> Self {
630 let cpu = self.cpu.get_or_insert_with(CpuLimits::default);
631 cpu.max_quota_us = Some(quota.as_micros() as u64);
632 cpu.max_period_us = period.as_micros() as u64;
633 self
634 }
635
636 /// Clear any previously-set `cpu.max` quota (writes `"max"`),
637 /// leaving `cpu.weight` (if set) intact. Useful when a base
638 /// CgroupDef builder applied a default cap and the test wants
639 /// only weight-based bias.
640 #[must_use = "builder methods consume self; bind the result"]
641 pub fn cpu_unlimited(mut self) -> Self {
642 let cpu = self.cpu.get_or_insert_with(CpuLimits::default);
643 cpu.max_quota_us = None;
644 self
645 }
646
647 /// Set `cpu.weight` (`CGROUP_WEIGHT_MIN..=CGROUP_WEIGHT_MAX`,
648 /// 1..=10000; `CGROUP_WEIGHT_DFL` = 100; enforced by
649 /// `cpu_weight_write_u64` in kernel/sched/core.c). Larger values
650 /// get a larger share under contention. Independent of `cpu.max`.
651 #[must_use = "builder methods consume self; bind the result"]
652 pub fn cpu_weight(mut self, weight: u32) -> Self {
653 let cpu = self.cpu.get_or_insert_with(CpuLimits::default);
654 cpu.weight = Some(weight);
655 self
656 }
657
658 /// Set `memory.max` hard ceiling in bytes. Crossing this
659 /// triggers reclaim first (`try_charge_memcg` in
660 /// mm/memcontrol.c); the cgroup OOM killer fires only after
661 /// `MAX_RECLAIM_RETRIES` failed retries, and is skipped when
662 /// the allocation carries `__GFP_NORETRY` or
663 /// `__GFP_RETRY_MAYFAIL`.
664 #[must_use = "builder methods consume self; bind the result"]
665 pub fn memory_max(mut self, bytes: u64) -> Self {
666 let m = self.memory.get_or_insert_with(MemoryLimits::default);
667 m.max = Some(bytes);
668 self
669 }
670
671 /// Set `memory.high` soft throttle threshold in bytes. Crossing
672 /// this triggers reclaim throttling but NOT OOM-kill — per
673 /// `__mem_cgroup_handle_over_high` in mm/memcontrol.c:
674 /// "memory.high enforcement isn't as strict, and there is no
675 /// OOM killer involved".
676 #[must_use = "builder methods consume self; bind the result"]
677 pub fn memory_high(mut self, bytes: u64) -> Self {
678 let m = self.memory.get_or_insert_with(MemoryLimits::default);
679 m.high = Some(bytes);
680 self
681 }
682
683 /// Set `memory.low` soft protection threshold in bytes.
684 /// Reclaim prefers other cgroups before this one's memory
685 /// drops below `low`.
686 #[must_use = "builder methods consume self; bind the result"]
687 pub fn memory_low(mut self, bytes: u64) -> Self {
688 let m = self.memory.get_or_insert_with(MemoryLimits::default);
689 m.low = Some(bytes);
690 self
691 }
692
693 /// Clear all three memory limits (writes `"max"` for max/high
694 /// and `"0"` for low). Equivalent to leaving `memory` unset
695 /// at construction; provided for symmetry with
696 /// [`Self::cpu_unlimited`].
697 #[must_use = "builder methods consume self; bind the result"]
698 pub fn memory_unlimited(mut self) -> Self {
699 self.memory = Some(MemoryLimits::default());
700 self
701 }
702
703 /// Set `io.weight` (`CGROUP_WEIGHT_MIN..=CGROUP_WEIGHT_MAX`,
704 /// 1..=10000; `CGROUP_WEIGHT_DFL` = 100; enforced by
705 /// `ioc_weight_write` in block/blk-iocost.c). Biases relative
706 /// IO share when the io controller is enabled. `io.max`
707 /// per-device caps are not surfaced — see [`IoLimits`].
708 #[must_use = "builder methods consume self; bind the result"]
709 pub fn io_weight(mut self, weight: u16) -> Self {
710 let io = self.io.get_or_insert_with(IoLimits::default);
711 io.weight = Some(weight);
712 self
713 }
714
715 /// Set `memory.swap.max` ceiling in bytes. The kernel parses the
716 /// wire value via `page_counter_memparse` and accepts a decimal
717 /// byte count (`swap_max_write` in `mm/memcontrol.c`). Distinct
718 /// from `memory.max`: this caps how much of the cgroup's memory
719 /// can spill to swap, separate from total memory consumption.
720 #[must_use = "builder methods consume self; bind the result"]
721 pub fn memory_swap_max(mut self, bytes: u64) -> Self {
722 let m = self.memory.get_or_insert_with(MemoryLimits::default);
723 m.swap_max = Some(bytes);
724 self
725 }
726
727 /// Clear any previously-set `memory.swap.max` (writes `"max"`).
728 /// Mirrors [`Self::cpu_unlimited`] / [`Self::memory_unlimited`]
729 /// for a single memory-knob unset; useful when a base
730 /// `CgroupDef` builder applied a swap cap and the test wants to
731 /// remove only that knob while preserving `memory.max`/`high`/
732 /// `low`.
733 ///
734 /// No-ops when `self.memory == None` — the default state already
735 /// means "no swap cap" (apply_setup emits no memory writes for an
736 /// unset `memory` field), so creating a fresh `MemoryLimits` just
737 /// to set `swap_max = None` would (a) be redundant and (b)
738 /// trigger 3 unwanted writes for `memory.max` / `memory.high` /
739 /// `memory.low` at apply_setup time. The no-op short-circuit
740 /// keeps "fresh CgroupDef + memory_swap_unlimited()" semantically
741 /// identical to "fresh CgroupDef".
742 #[must_use = "builder methods consume self; bind the result"]
743 pub fn memory_swap_unlimited(mut self) -> Self {
744 if let Some(m) = self.memory.as_mut() {
745 m.swap_max = None;
746 }
747 self
748 }
749
750 /// Set `pids.max` task-count ceiling. `n` is the maximum number
751 /// of processes the cgroup may host before subsequent
752 /// `fork()` / `clone()` calls return EAGAIN. Existing tasks are
753 /// NOT killed when the limit lands below the current count
754 /// (per the `pids_max_write` kernel comment: "Limit updates
755 /// don't need to be mutex'd, since it isn't critical that any
756 /// racing fork()s follow the new limit").
757 ///
758 /// `n = 0` is rejected at `apply_setup` time: a 0-limit cgroup
759 /// halts every fork/clone inside, including the worker spawn
760 /// under `CloneMode::Fork` and the `ForkExit` per-iteration
761 /// child fork. There is no kernel sentinel for "no fork ever";
762 /// `pids_max=0` silently fails every `fork()` inside with
763 /// `EAGAIN`, which is almost certainly a configuration bug.
764 #[must_use = "builder methods consume self; bind the result"]
765 pub fn pids_max(mut self, n: u64) -> Self {
766 let pids = self.pids.get_or_insert_with(PidsLimits::default);
767 pids.max = Some(n);
768 self
769 }
770
771 /// Clear any previously-set `pids.max` (writes `"max"`).
772 /// Mirrors [`Self::cpu_unlimited`] / [`Self::memory_unlimited`].
773 #[must_use = "builder methods consume self; bind the result"]
774 pub fn pids_unlimited(mut self) -> Self {
775 let pids = self.pids.get_or_insert_with(PidsLimits::default);
776 pids.max = None;
777 self
778 }
779
780 /// Materialize [`Self::works`] with cgroup-level defaults
781 /// merged into each entry. Called by `apply_setup` to resolve
782 /// the per-WorkSpec values before spawning workers.
783 ///
784 /// For every [`WorkSpec`] in [`Self::works`] (or a single
785 /// [`WorkSpec::default()`] when `works` is empty, matching
786 /// `apply_setup`'s default-substitution rule), each cgroup-level
787 /// default in [`Self::default_nice`] / [`Self::default_comm`] /
788 /// [`Self::default_uid`] / [`Self::default_gid`] /
789 /// [`Self::default_numa_node`] fills the corresponding
790 /// `WorkSpec` field when that field is "unset" at the WorkSpec
791 /// level.
792 ///
793 /// "Unset" means `None` for every `Option`-typed field —
794 /// `nice`, `comm`, `uid`, `gid`, `numa_node` are all
795 /// `Option<_>`. The framework's "skip setpriority(2)" state per
796 /// [`WorkloadConfig::nice`](crate::workload::WorkloadConfig::nice)
797 /// is `None`. A `WorkSpec` that explicitly sets `Some(n)`
798 /// (including `Some(0)`) keeps its value; the cgroup-level
799 /// default applies only when the WorkSpec is at the framework
800 /// default of `None`.
801 ///
802 /// `pcomm` is NOT propagated through `merged_works`. The
803 /// [`Self::pcomm`] convenience method writes `pcomm` directly
804 /// into every WorkSpec at builder time so coalescing in
805 /// `apply_setup` reads the per-WorkSpec value (the
806 /// authoritative source).
807 ///
808 /// Decoupling this merge from the convenience-method call sites
809 /// makes the builder order-independent —
810 /// `def.nice(5).work(spec)` and `def.work(spec).nice(5)`
811 /// produce identical effective `WorkSpec` values.
812 pub fn merged_works(&self) -> Vec<WorkSpec> {
813 let base: Vec<WorkSpec> = if self.works.is_empty() {
814 vec![WorkSpec::default()]
815 } else {
816 self.works.clone()
817 };
818 base.into_iter()
819 .map(|mut w| {
820 if w.nice.is_none()
821 && let Some(n) = self.default_nice
822 {
823 w.nice = Some(n);
824 }
825 if w.comm.is_none() {
826 w.comm = self.default_comm.clone();
827 }
828 if w.uid.is_none() {
829 w.uid = self.default_uid;
830 }
831 if w.gid.is_none() {
832 w.gid = self.default_gid;
833 }
834 if w.numa_node.is_none() {
835 w.numa_node = self.default_numa_node;
836 }
837 w
838 })
839 .collect()
840 }
841}
842
843// `CgroupDef` deliberately has NO `Default` impl. The previous
844// derived/hand-rolled default produced `name = "cg_0"`, which
845// collides with the conventional first cgroup name in nearly every
846// scenario (a test calling `..Default::default()` would silently
847// share a cgroup with the scenario's first named entry). Forcing
848// every construction site to go through [`CgroupDef::named`] makes
849// the name explicit and eliminates the footgun. The pattern is
850// documented in the type-level docstring and operator-facing
851// guidance at `doc/guide/src/architecture/workload-handle.md` under
852// the spread-default warning. The compile-time pin of the absence
853// lives in the `#[cfg(test)]` mod below (`assert_not_impl_default!`
854// from `src/test_macros.rs`).
855
#[cfg(test)]
mod tests {
    use super::*;

    // Compile-time pin: `CgroupDef` must not implement `Default`.
    assert_not_impl_default!(CgroupDef);

    /// `CgroupDef::comm` must refuse an empty name.
    #[test]
    #[should_panic(expected = "CgroupDef::comm: empty string rejected")]
    fn cgroup_def_comm_rejects_empty() {
        let _unused = CgroupDef::named("cg").comm("");
    }

    /// `CgroupDef::comm` must refuse a name with an embedded NUL.
    #[test]
    #[should_panic(expected = "interior NUL byte")]
    fn cgroup_def_comm_rejects_interior_nul() {
        let _unused = CgroupDef::named("cg").comm("foo\0bar");
    }

    /// Validate-on-builder contract for `pcomm`. An earlier
    /// `CgroupDef::pcomm` assigned `w.pcomm = Some(...)` directly
    /// and so skipped the asserts in `WorkSpec::pcomm`. Both
    /// builders now go through the shared
    /// `validate_task_comm_string` helper; this test FAILS against
    /// the pre-helper implementation and PASSES against the
    /// post-helper one.
    #[test]
    #[should_panic(expected = "CgroupDef::pcomm: empty string rejected")]
    fn cgroup_def_pcomm_rejects_empty() {
        let _unused = CgroupDef::named("cg").pcomm("");
    }

    /// `CgroupDef::pcomm` must refuse a name with an embedded NUL.
    #[test]
    #[should_panic(expected = "interior NUL byte")]
    fn cgroup_def_pcomm_rejects_interior_nul() {
        let _unused = CgroupDef::named("cg").pcomm("foo\0bar");
    }

    // Per-builder boundary pins. If a later refactor routes
    // `CgroupDef::comm` or `CgroupDef::pcomm` around the shared
    // `validate_task_comm_string` helper, the tests below catch it
    // even while the helper-level tests keep passing.

    /// A 15-byte name is accepted by `comm` and stored as the
    /// cgroup-level default.
    #[test]
    fn cgroup_def_comm_accepts_15_byte_boundary() {
        let longest_ok = "a".repeat(15);
        let built = CgroupDef::named("cg").comm(longest_ok.clone());
        assert_eq!(built.default_comm.as_deref(), Some(longest_ok.as_str()));
    }

    /// A 16-byte name is rejected by `comm`.
    #[test]
    #[should_panic(expected = "16 bytes")]
    fn cgroup_def_comm_rejects_16_byte_overflow() {
        let _unused = CgroupDef::named("cg").comm("a".repeat(16));
    }

    /// A 15-byte name is accepted by `pcomm` and stamped onto the
    /// WorkSpec.
    #[test]
    fn cgroup_def_pcomm_accepts_15_byte_boundary() {
        let longest_ok = "a".repeat(15);
        let built = CgroupDef::named("cg").pcomm(longest_ok.clone());
        // pcomm stamps every WorkSpec; the default works list holds
        // exactly one entry.
        assert_eq!(built.works.len(), 1);
        assert_eq!(built.works[0].pcomm.as_deref(), Some(longest_ok.as_str()));
    }

    /// A 16-byte name is rejected by `pcomm`.
    #[test]
    #[should_panic(expected = "16 bytes")]
    fn cgroup_def_pcomm_rejects_16_byte_overflow() {
        let _unused = CgroupDef::named("cg").pcomm("a".repeat(16));
    }
}