ktstr/scenario/ops/types/
cgroup_def.rs

1//! Declarative cgroup blueprint — [`CgroupDef`] struct + the
2//! full builder-method surface (Group I per-WorkSpec setters,
3//! Group II `default_*` merges, Group III in-place `pcomm`
4//! stamping, plus the cpu / memory / io / pids controller knobs).
5//! See the type-level doc on [`CgroupDef`] for the per-controller
6//! summary table and the three builder-pattern groups.
7//!
8//! `CgroupDef` deliberately has NO `Default` impl — see the note at
9//! the foot of this file (and `tests::assert_not_impl_default!` in
10//! `super::tests`) for the rationale (`name = "cg_0"` would
11//! silently collide with the conventional first cgroup name).
12
13use std::borrow::Cow;
14use std::collections::BTreeSet;
15use std::time::Duration;
16
17use crate::workload::{WorkSpec, WorkType};
18
19#[allow(unused_imports)] // referenced by intra-doc links
20use super::Op;
21use super::{CpuLimits, CpusetSpec, IoLimits, MemoryLimits, PidsLimits};
22
23// ---------------------------------------------------------------------------
24// CgroupDef
25// ---------------------------------------------------------------------------
26
27/// Declarative cgroup definition: name + cpuset + synthetic
28/// [`WorkSpec`] groups + optional userspace [`Payload`](crate::test_support::Payload).
29///
30/// Bundles the ops that always go together (AddCgroup + SetCpuset +
31/// Spawn) into a single value. The executor creates the cgroup, optionally
32/// sets its cpuset, spawns workers for each [`WorkSpec`] entry, and moves
33/// them into the cgroup.
34///
35/// Multiple [`WorkSpec`] entries run in parallel within the cgroup. Each
36/// entry spawns its own set of worker processes. The optional
37/// [`Self::payload`] slot is a *single* userspace binary that runs
38/// alongside those synthetic [`WorkSpec`] groups (hence "plural works,
39/// singular payload" — the pluralization in the legacy "workload(s)"
40/// prose elided this distinction).
41///
42/// Use `CgroupDef` in `Step::with_defs` for scenarios where cgroups are
43/// created once and run for the step duration. Use `Op::add_cgroup` +
44/// `Op::spawn(SpawnPlacement::cgroup(name), work)` directly when you
45/// need mid-step cgroup creation, removal, or other dynamic operations
46/// between spawn and collect.
47///
48/// # Resource controllers overview
49///
50/// `CgroupDef` exposes one builder method per cgroup v2 controller
51/// knob, each writing the corresponding `cgroup.*` / `*.max` /
52/// `*.weight` file at `apply_setup` time. The full surface:
53///
54/// | Controller | One-line description | Builder methods | Underlying file(s) |
55/// |------------|----------------------|-----------------|--------------------|
56/// | cpuset | Bind to a CPU subset and NUMA-node memory affinity. | [`Self::cpuset`], [`Self::cpuset_mems`] | `cpuset.cpus`, `cpuset.mems` |
57/// | cpu    | Bandwidth ceiling (`cpu.max` quota/period) plus relative-share weight. | [`Self::cpu_quota_pct`], [`Self::cpu_quota`], [`Self::cpu_unlimited`], [`Self::cpu_weight`] | `cpu.max`, `cpu.weight` |
58/// | memory | Hard ceiling, soft throttle threshold, soft protection floor, swap cap. | [`Self::memory_max`], [`Self::memory_high`], [`Self::memory_low`], [`Self::memory_swap_max`], [`Self::memory_swap_unlimited`], [`Self::memory_unlimited`] | `memory.max`, `memory.high`, `memory.low`, `memory.swap.max` |
59/// | io     | Relative IO share (BFQ / io.cost) when the io controller is enabled. | [`Self::io_weight`] | `io.weight` |
60/// | pids   | Task-count ceiling — fork(2)/clone(2) returns EAGAIN once the cap is hit. | [`Self::pids_max`], [`Self::pids_unlimited`] | `pids.max` |
61/// | freeze | Pause/resume every task in the cgroup mid-run via the JOBCTL freeze path. | (Op-level) [`Op::freeze_cgroup`], [`Op::unfreeze_cgroup`] | `cgroup.freeze` |
62///
63/// `CgroupDef` covers steady-state resource limits — knobs that
64/// hold for the cgroup's whole lifetime. The freeze knob is
65/// intentionally exposed at the [`Op`] layer instead, because
66/// freeze/unfreeze describe transitions over time (suspend
67/// mid-step, resume later) rather than the cgroup's identity; see
68/// the "See also" section below for the full Op-variants list.
69///
70/// All builders are additive — a `CgroupDef` accumulates an
71/// optional [`CpuLimits`] / [`MemoryLimits`] / [`IoLimits`] /
72/// [`PidsLimits`] block. When a block is set (e.g. `def.memory`
73/// is `Some`), **all** knobs in that block are written —
74/// `None`-valued fields emit their kernel-default sentinel
75/// (`"max"` for `memory.max`/`memory.high`, `"0"` for
76/// `memory.low`). Only `memory.swap.max` is gated: `None` means
77/// no write (for `CONFIG_SWAP=n` compatibility). The "*_unlimited"
78/// builders explicitly rewind a knob to its sentinel value
79/// (`"max"` / `"0"`) so a base `CgroupDef` factory can cap a
80/// resource and a per-test extension can clear that cap without
81/// rewriting the whole `CgroupDef`.
82///
83/// Validation runs at `apply_setup` time (before any worker
84/// spawn): out-of-range weights, `cpu.max period == 0`, and
85/// `pids.max == Some(0)` all produce actionable bails before the
86/// syscall fires. The kernel is the final authority on
87/// per-controller numeric ranges; framework-level checks catch
88/// only the foot-cannons documented per-builder.
89///
90/// # Builder semantics
91///
92/// The setters fall into three groups:
93///
94/// **Group I — per-WorkSpec fan to `works[0]`:**
95/// [`workers`](Self::workers), [`workers_pct`](Self::workers_pct),
96/// [`work_type`](Self::work_type), [`sched_policy`](Self::sched_policy),
97/// [`affinity`](Self::affinity), [`mem_policy`](Self::mem_policy),
98/// [`mpol_flags`](Self::mpol_flags). Each mutates `self.works[0]`,
99/// auto-inserting a default [`WorkSpec`] when `works` is empty. There
100/// is NO cgroup-level default for these knobs — per-group identity (or
101/// per-group cpuset validation) makes fan-out semantically ambiguous.
102/// Use [`work`](Self::work) + per-`WorkSpec` setters for multi-group
103/// cgroups.
104///
105/// **Group II — cgroup-level `default_*` merge:**
106/// [`nice`](Self::nice), [`comm`](Self::comm), [`uid`](Self::uid),
107/// [`gid`](Self::gid), [`numa_node`](Self::numa_node). Each stores a
108/// value in a `default_*` field on `CgroupDef`. Every [`WorkSpec`] in
109/// [`works`](Self::works) whose corresponding `Option`-typed field is
110/// `None` inherits the default at [`merged_works`](Self::merged_works)
111/// time — ORDER-INDEPENDENT with [`work`](Self::work). `Some(_)`
112/// (including `Some(0)`) opts out.
113///
114/// **Group III — [`pcomm`](Self::pcomm):** mutates `works` in-place
115/// at call time, NOT order-independent — by design. See
116/// [`pcomm`](Self::pcomm) for the coalescing rationale.
117///
118/// Other setters ([`cpuset`](Self::cpuset),
119/// [`cpuset_mems`](Self::cpuset_mems), the
120/// [`cpu_quota`](Self::cpu_quota) / [`memory_max`](Self::memory_max)
121/// / [`io_weight`](Self::io_weight) / [`pids_max`](Self::pids_max)
122/// controller families, [`workload`](Self::workload),
123/// [`swappable`](Self::swappable)) set cgroup-level state directly
124/// and do not participate in either merge pattern.
125///
126/// # See also
127///
128/// `CgroupDef` only expresses the steady-state shape of a cgroup
129/// (name, cpuset, work groups, payload). State changes that need
130/// to happen DURING a step — without tearing the cgroup down and
131/// recreating it — go through dedicated [`Op`] variants instead:
132///
133/// * [`Op::FreezeCgroup`] / [`Op::UnfreezeCgroup`] — pause and
134///   resume every task in the cgroup via `cgroup.freeze` (the
135///   kernel-side asynchronous freeze path; not a SIGSTOP).
136///   Useful for scheduler suspend/resume tests that observe
137///   how the scheduler handles a workload that goes idle
138///   mid-step. **Do not freeze a cgroup hosting the test's own
139///   observers** — see the deadlock warning on
140///   [`Op::FreezeCgroup`].
141/// * [`Op::SetCpuset`] — re-pin an existing cgroup's cpuset to
142///   exercise the scheduler's response to a moving CPU mask
143///   without disrupting the worker tasks themselves.
144/// * [`Op::AddCgroup`] / [`Op::RemoveCgroup`] — add or destroy
145///   cgroups mid-step when a `CgroupDef`'s lifecycle is
146///   tied to step duration but the test wants a different
147///   (e.g. nested) cgroup to appear or disappear partway
148///   through.
149///
150/// These describe transitions over time rather than the cgroup's
151/// identity, which is why they live as `Op` variants alongside
152/// the rest of the operation vocabulary rather than as
153/// `CgroupDef` builders.
154///
155/// ```
156/// # use ktstr::scenario::ops::{CgroupDef, CpusetSpec};
157/// # use ktstr::workload::{WorkSpec, WorkType};
158/// // Single work group via convenience methods.
159/// let def = CgroupDef::named("workers")
160///     .cpuset(CpusetSpec::disjoint(0, 2))
161///     .workers(4)
162///     .work_type(WorkType::SpinWait);
163///
164/// assert_eq!(def.name, "workers");
165/// assert_eq!(def.works[0].num_workers, Some(4));
166///
167/// // Multiple concurrent work groups via .work().
168/// let def = CgroupDef::named("mixed")
169///     .work(WorkSpec::default().workers(4).work_type(WorkType::SpinWait))
170///     .work(WorkSpec::default().workers(2).work_type(WorkType::YieldHeavy));
171///
172/// assert_eq!(def.works.len(), 2);
173///
174/// // Synthetic work + userspace binary side-by-side via .workload(&X).
175/// // The binary runs inside the same cgroup as the WorkSpec handles;
176/// // both spawn in apply_setup, the WorkSpec groups first, then the
177/// // Payload after the cpuset settles.
178/// # use ktstr::test_support::Payload;
179/// # const BENCH: Payload = Payload::binary("bench", "bench");
180/// let def = CgroupDef::named("io_and_spin")
181///     .cpuset(CpusetSpec::disjoint(0, 2))
182///     .workers(2)
183///     .work_type(WorkType::SpinWait)
184///     .workload(&BENCH);
185///
186/// assert!(def.payload.is_some());
187/// assert_eq!(def.works[0].num_workers, Some(2));
188/// ```
189#[derive(Clone, Debug)]
190pub struct CgroupDef {
191    /// Cgroup name relative to the scenario's parent cgroup. Must be a
192    /// valid cgroupfs filename.
193    pub name: Cow<'static, str>,
194    /// Optional cpuset assignment. `None` inherits the parent cgroup's
195    /// cpuset (typically the scenario's usable CPU set).
196    pub cpuset: Option<CpusetSpec>,
197    /// WorkSpec groups to spawn. Empty means use a single default WorkSpec
198    /// (SpinWait, Normal, `ctx.workers_per_cgroup` workers — defaults to 1
199    /// from `CtxBuilder` unless the scenario overrides it explicitly).
200    pub works: Vec<WorkSpec>,
201    /// When true, the gauntlet work_type override replaces each WorkSpec's
202    /// work_type (applied per-WorkSpec via resolve_work_type).
203    pub swappable: bool,
204    /// Optional userspace [`Payload`](crate::test_support::Payload) to
205    /// launch inside this cgroup.
206    ///
207    /// **Spawn order within `apply_setup`**: the cgroup is created
208    /// (`add_cgroup_no_cpuset`), its cpuset is resolved + set, then
209    /// each `WorkSpec` entry is spawned and moved into the cgroup in
210    /// declaration order, and finally — after every synthetic
211    /// `WorkSpec` handle has started — the `Payload` is spawned via
212    /// `PayloadRun::new(ctx, p).in_cgroup(name).spawn()`. This
213    /// fixed order lets the cgroup cpuset and mempolicy settle on
214    /// the `WorkSpec` handles before the binary inherits placement, so
215    /// the binary sees a stable topology. Once spawned, all three
216    /// (cgroup, works, payload) run concurrently until teardown.
217    ///
218    /// Only
219    /// [`PayloadKind::Binary`](crate::test_support::PayloadKind::Binary)
220    /// payloads are accepted — scheduler-kind payloads are rejected
221    /// at construction time via [`Self::workload`]. The payload is
222    /// killed at step-teardown (before cgroup removal) so the cgroup
223    /// removal does not fail with EBUSY.
224    pub payload: Option<&'static crate::test_support::Payload>,
225    /// Optional cpuset.mems NUMA node binding. `None` inherits the
226    /// parent cgroup's `cpuset.mems`. Set via
227    /// [`Self::cpuset_mems`].
228    pub cpuset_mems: Option<BTreeSet<usize>>,
229    /// Optional cpu controller limits (`cpu.max`, `cpu.weight`).
230    /// `None` leaves both kernel defaults in place. Set via
231    /// [`Self::cpu_quota_pct`] / [`Self::cpu_quota`] /
232    /// [`Self::cpu_weight`].
233    pub cpu: Option<CpuLimits>,
234    /// Optional memory controller limits (`memory.max`,
235    /// `memory.high`, `memory.low`, `memory.swap.max`). `None`
236    /// leaves all four at the kernel defaults. Set via
237    /// [`Self::memory_max`] / [`Self::memory_high`] /
238    /// [`Self::memory_low`] / [`Self::memory_swap_max`].
239    pub memory: Option<MemoryLimits>,
240    /// Optional io controller limits (`io.weight`). `None` leaves
241    /// the kernel default in place. Set via [`Self::io_weight`].
242    pub io: Option<IoLimits>,
243    /// Optional pids controller limits (`pids.max`). `None` leaves
244    /// the kernel default in place (no ceiling). Set via
245    /// [`Self::pids_max`].
246    pub pids: Option<PidsLimits>,
247    /// Cgroup-level default for [`WorkSpec::nice`]. When `Some(n)`,
248    /// every [`WorkSpec`] in [`Self::works`] whose own `nice` field
249    /// is `None` (the framework's "skip setpriority(2)" state — see
250    /// [`WorkloadConfig::nice`](crate::workload::WorkloadConfig::nice))
251    /// inherits `Some(n)` at apply-setup time. Set via [`Self::nice`];
252    /// merged in [`Self::merged_works`].
253    ///
254    /// Order-independent with [`Self::work`]: `def.work(spec).nice(n)`
255    /// and `def.nice(n).work(spec)` produce identical effective
256    /// `WorkSpec` values because the merge runs at `merged_works()`
257    /// call time, not at builder-method call time.
258    pub default_nice: Option<i32>,
259    /// Cgroup-level default for [`WorkSpec::comm`]. Merged into any
260    /// [`WorkSpec`] whose own `comm` is `None` at apply-setup time.
261    /// Set via [`Self::comm`]; merged in [`Self::merged_works`].
262    pub default_comm: Option<Cow<'static, str>>,
263    /// Cgroup-level default for [`WorkSpec::uid`]. Merged into any
264    /// [`WorkSpec`] whose own `uid` is `None` at apply-setup time.
265    /// Set via [`Self::uid`]; merged in [`Self::merged_works`].
266    pub default_uid: Option<u32>,
267    /// Cgroup-level default for [`WorkSpec::gid`]. Merged into any
268    /// [`WorkSpec`] whose own `gid` is `None` at apply-setup time.
269    /// Set via [`Self::gid`]; merged in [`Self::merged_works`].
270    pub default_gid: Option<u32>,
271    /// Cgroup-level default for [`WorkSpec::numa_node`]. Merged into
272    /// any [`WorkSpec`] whose own `numa_node` is `None` at
273    /// apply-setup time. Set via [`Self::numa_node`]; merged in
274    /// [`Self::merged_works`].
275    pub default_numa_node: Option<u32>,
276}
277
278impl CgroupDef {
279    /// Create a CgroupDef with defaults (empty works, no cpuset).
280    ///
281    /// `apply_setup` fills an empty `works` slice with one default
282    /// [`WorkSpec`] (SpinWait, SCHED_NORMAL, `ctx.workers_per_cgroup`
283    /// workers — defaults to 1 from `CtxBuilder`). For an empty
284    /// move-target cgroup with no workers, declare it via
285    /// [`Op::AddCgroup`] at step or Backdrop level. For the common
286    /// `CgroupDef::named(name).workers(ctx.workers_per_cgroup)`
287    /// pattern use [`Ctx::cgroup_def`](crate::scenario::Ctx::cgroup_def).
288    #[must_use = "dropping a CgroupDef discards the cgroup specification"]
289    pub fn named(name: impl Into<Cow<'static, str>>) -> Self {
290        Self {
291            name: name.into(),
292            cpuset: None,
293            works: vec![],
294            swappable: false,
295            payload: None,
296            cpuset_mems: None,
297            cpu: None,
298            memory: None,
299            io: None,
300            pids: None,
301            default_nice: None,
302            default_comm: None,
303            default_uid: None,
304            default_gid: None,
305            default_numa_node: None,
306        }
307    }
308
309    /// Set [`Self::cpuset`]; see [`Op::SetCpuset`] for mid-run changes.
310    #[must_use = "builder methods consume self; bind the result"]
311    pub fn cpuset(mut self, cpus: CpusetSpec) -> Self {
312        self.cpuset = Some(cpus);
313        self
314    }
315
316    /// Append a [`WorkSpec`] group (multiple calls yield concurrent groups within this cgroup).
317    #[must_use = "builder methods consume self; bind the result"]
318    pub fn work(mut self, w: WorkSpec) -> Self {
319        self.works.push(w);
320        self
321    }
322
323    /// Ensure `works[0]` exists for single-WorkSpec builder methods.
324    fn ensure_default_work(&mut self) {
325        if self.works.is_empty() {
326            self.works.push(WorkSpec::default());
327        }
328    }
329
330    /// Set [`WorkSpec::num_workers`] on `works[0]` (Group I).
331    ///
332    /// `n` MUST be `>= 1`. `n == 0` is rejected at apply-setup time
333    /// by `resolve_num_workers` (before any worker spawn) with an
334    /// actionable diagnostic naming the cgroup;
335    /// [`WorkloadConfig::validate`](crate::workload::WorkloadConfig::validate)
336    /// is the downstream defense-in-depth gate. A zero-worker spawn
337    /// would silently produce no workload load, vacuously passing
338    /// scheduler assertions that rely on observable contention. Pass
339    /// `n >= 1`; for fraction-of-cpuset sizing use [`Self::workers_pct`].
340    #[must_use = "builder methods consume self; bind the result"]
341    pub fn workers(mut self, n: usize) -> Self {
342        self.ensure_default_work();
343        self.works[0].num_workers = Some(n);
344        self
345    }
346
347    /// Set [`WorkSpec::workers_pct`] on `works[0]` (Group I). Resolved
348    /// against the cgroup's cpuset at apply-setup via
349    /// `ceil(cpuset_cpus * pct)`. Mutually exclusive with
350    /// [`Self::workers`] — see [`WorkSpec::workers_pct`].
351    ///
352    /// # Panics
353    ///
354    /// Panics when `pct` is NaN, infinite, or `<= 0.0`. Extreme
355    /// finite values (e.g. `1e100`) pass the gate and saturate to
356    /// `usize::MAX` via the `as` cast in `resolve_workers_pct`
357    /// (RFC 2484 / Rust 1.45+) — attempting to spawn that many
358    /// workers would OOM the host. Keep `pct` near the intended
359    /// oversubscription factor (e.g. `1.0`, `2.0`, `4.0`).
360    #[must_use = "builder methods consume self; bind the result"]
361    pub fn workers_pct(mut self, pct: f64) -> Self {
362        assert!(
363            pct.is_finite() && pct > 0.0,
364            "CgroupDef::workers_pct({pct}): pct must be finite and > 0.0",
365        );
366        self.ensure_default_work();
367        self.works[0].workers_pct = Some(pct);
368        self
369    }
370
371    /// Set [`WorkSpec::work_type`] on `works[0]` (Group I).
372    #[must_use = "builder methods consume self; bind the result"]
373    pub fn work_type(mut self, wt: WorkType) -> Self {
374        self.ensure_default_work();
375        self.works[0].work_type = wt;
376        self
377    }
378
379    /// Set [`WorkSpec::sched_policy`] on `works[0]` (Group I).
380    #[must_use = "builder methods consume self; bind the result"]
381    pub fn sched_policy(mut self, p: crate::workload::SchedPolicy) -> Self {
382        self.ensure_default_work();
383        self.works[0].sched_policy = p;
384        self
385    }
386
387    /// Set [`WorkSpec::affinity`] on `works[0]` (Group I).
388    #[must_use = "builder methods consume self; bind the result"]
389    pub fn affinity(mut self, a: crate::workload::AffinityIntent) -> Self {
390        self.ensure_default_work();
391        self.works[0].affinity = a;
392        self
393    }
394
395    /// Set [`WorkSpec::mem_policy`] on `works[0]` (Group I). Validated
396    /// against the resolved cpuset per-group.
397    #[must_use = "builder methods consume self; bind the result"]
398    pub fn mem_policy(mut self, p: crate::workload::MemPolicy) -> Self {
399        self.ensure_default_work();
400        self.works[0].mem_policy = p;
401        self
402    }
403
404    /// Set [`WorkSpec::mpol_flags`] on `works[0]` (Group I).
405    #[must_use = "builder methods consume self; bind the result"]
406    pub fn mpol_flags(mut self, f: crate::workload::MpolFlags) -> Self {
407        self.ensure_default_work();
408        self.works[0].mpol_flags = f;
409        self
410    }
411
412    /// Set [`Self::default_nice`] (Group II). Note: `WorkSpec::nice(0)`
413    /// stores `Some(0)` and opts out of this default — the worker's
414    /// nice is explicitly set to 0 via `setpriority(2)` rather than
415    /// inheriting.
416    #[must_use = "builder methods consume self; bind the result"]
417    pub const fn nice(mut self, n: i32) -> Self {
418        self.default_nice = Some(n);
419        self
420    }
421
422    /// Set [`Self::default_comm`] (Group II).
423    ///
424    /// # Panics
425    ///
426    /// Panics on programmer-error inputs — mirrors
427    /// [`crate::workload::WorkSpec::pcomm`]'s `# Panics`:
428    /// - Empty string.
429    /// - Interior NUL byte.
430    /// - More than 15 bytes (`TASK_COMM_LEN - 1` cap).
431    ///
432    /// See
433    /// `validate_task_comm_string` (in `crate::workload`)
434    /// for the centralized rationale; `name.len()` is the BYTE
435    /// length (UTF-8 multi-byte chars count as their byte width).
436    #[must_use = "builder methods consume self; bind the result"]
437    pub fn comm(mut self, name: impl Into<std::borrow::Cow<'static, str>>) -> Self {
438        let name: std::borrow::Cow<'static, str> = name.into();
439        crate::workload::validate_task_comm_string("CgroupDef::comm", &name);
440        self.default_comm = Some(name);
441        self
442    }
443
444    /// Set [`Self::default_uid`] (Group II).
445    #[must_use = "builder methods consume self; bind the result"]
446    pub const fn uid(mut self, uid: u32) -> Self {
447        self.default_uid = Some(uid);
448        self
449    }
450
451    /// Set [`Self::default_gid`] (Group II).
452    #[must_use = "builder methods consume self; bind the result"]
453    pub const fn gid(mut self, gid: u32) -> Self {
454        self.default_gid = Some(gid);
455        self
456    }
457
458    /// Set the thread-group leader's comm on every WorkSpec in
459    /// this CgroupDef. Each affected WorkSpec gets `pcomm =
460    /// Some(name)`; existing per-WorkSpec `pcomm` values (set
461    /// before this call) are overwritten. Calling on an empty
462    /// `works` list pushes a default WorkSpec carrying the value.
463    ///
464    /// The pcomm string is applied via `prctl(PR_SET_NAME)` on
465    /// the forked thread-group leader. The builder rejects > 15
466    /// bytes (TASK_COMM_LEN-1) at construction so the
467    /// `task->group_leader->comm == pcomm` invariant the framework
468    /// relies on holds exactly.
469    /// Setting this triggers the fork-then-thread spawn path in
470    /// `apply_setup`: WorkSpecs sharing a `pcomm` value coalesce
471    /// into ONE thread-group leader per group; every worker
472    /// thread inside observes `task->group_leader->comm == pcomm`.
473    /// Each worker thread additionally sets its own `task->comm`
474    /// via `.comm()` on the per-WorkSpec [`WorkSpec::comm`] at
475    /// thread creation time.
476    ///
477    /// `pcomm` lives ONLY on [`WorkSpec`] — there is no
478    /// CgroupDef-level field. This builder writes the value into
479    /// every WorkSpec directly so `apply_setup` has a single
480    /// authoritative source per WorkSpec.
481    ///
482    /// **Not order-independent with [`Self::work`] — by design.**
483    /// Unlike Group II setters, `pcomm` mutates `works` in-place when
484    /// called: it stamps every WorkSpec that EXISTS at call time and
485    /// then returns. WorkSpecs added via subsequent [`Self::work`]
486    /// calls are not retroactively touched, and a WorkSpec that
487    /// already carried its own `pcomm` is OVERWRITTEN if it was
488    /// pushed before `.pcomm(..)` ran. This is intentional — `pcomm`
489    /// determines the thread-group leader's coalescing key in
490    /// `apply_setup`, so the framework needs the value baked onto
491    /// each WorkSpec by the time `merged_works()` runs. Storing it
492    /// as a default and merging at read time would break the
493    /// coalescing contract for the empty-works case (the synthesised
494    /// `WorkSpec::default()` would have to carry the pcomm without
495    /// distinguishing "default" from "explicit override").
496    ///
497    /// # Panics
498    ///
499    /// Panics on programmer-error inputs — mirrors
500    /// [`crate::workload::WorkSpec::pcomm`]'s `# Panics`:
501    /// - Empty string.
502    /// - Interior NUL byte.
503    /// - More than 15 bytes (`TASK_COMM_LEN - 1` cap).
504    ///
505    /// See
506    /// `validate_task_comm_string` (in `crate::workload`)
507    /// for the centralized rationale; `name.len()` is the BYTE
508    /// length (UTF-8 multi-byte chars count as their byte width).
509    #[must_use = "builder methods consume self; bind the result"]
510    pub fn pcomm(mut self, name: impl Into<Cow<'static, str>>) -> Self {
511        let name: Cow<'static, str> = name.into();
512        // Validate ONCE before the in-place loop so a bad input
513        // never partially writes the `works` vec — the per-builder
514        // assert fires with a single named site rather than firing
515        // mid-mutation across N entries.
516        crate::workload::validate_task_comm_string("CgroupDef::pcomm", &name);
517        if self.works.is_empty() {
518            self.works.push(WorkSpec::default());
519        }
520        for w in &mut self.works {
521            w.pcomm = Some(name.clone());
522        }
523        self
524    }
525
526    /// Set [`Self::default_numa_node`] (Group II).
527    #[must_use = "builder methods consume self; bind the result"]
528    pub const fn numa_node(mut self, node: u32) -> Self {
529        self.default_numa_node = Some(node);
530        self
531    }
532
533    /// Set [`Self::swappable`] (gauntlet work_type override).
534    #[must_use = "builder methods consume self; bind the result"]
535    pub const fn swappable(mut self, swappable: bool) -> Self {
536        self.swappable = swappable;
537        self
538    }
539
540    /// Attach a userspace payload binary that runs inside this cgroup
541    /// alongside any synthetic [`WorkSpec`] groups. The payload spawns
542    /// when the step enters `apply_setup` and is killed during
543    /// step-teardown so the cgroup can be removed cleanly.
544    ///
545    /// # Panics
546    ///
547    /// Panics when `p.is_scheduler()` (i.e. `p` is a scheduler-kind
548    /// [`Payload`](crate::test_support::Payload) — `KERNEL_DEFAULT`
549    /// or any other `PayloadKind::Scheduler*` variant). Only
550    /// [`PayloadKind::Binary`](crate::test_support::PayloadKind::Binary)
551    /// payloads are accepted; `CgroupDef.workload` is for userspace
552    /// binary payloads only, and scheduler placement uses
553    /// `#[ktstr_test(scheduler = ...)]` instead.
554    ///
555    /// **Why panic at declaration time, not at spawn time?** Three
556    /// reasons, all of which favor failing fast:
557    /// 1. **Discovery-time surfacing.** `CgroupDef` builders run
558    ///    during test construction, which nextest's `--list`
559    ///    invocation reaches BEFORE any VM boot. A panic here
560    ///    emits a full backtrace inside the test binary and
561    ///    surfaces the offending call site immediately; a deferred
562    ///    runtime error would require a KVM-capable host + a
563    ///    kernel image + an initramfs build to observe — a 30+
564    ///    second feedback loop for what is purely a
565    ///    typed-API misuse.
566    /// 2. **No side effects.** The panic happens before
567    ///    `CgroupDef.payload = Some(p)` assignment runs, so the
568    ///    in-progress builder is left in its prior (no-payload)
569    ///    state. A caller that catches the panic via
570    ///    `catch_unwind` sees a valid CgroupDef either way.
571    /// 3. **Scheduler-kind is always a programming error here.**
572    ///    `Payload::KERNEL_DEFAULT` in `CgroupDef::workload` is never a
573    ///    legitimate use case — it means the author confused the
574    ///    `scheduler` slot (test-level) with the `workload` slot
575    ///    (cgroup-level). There is no recovery path; the only
576    ///    resolution is editing the source.
577    ///
578    /// Scheduler-kind payloads in the step-level `Op::RunPayload`
579    /// path bail with an `anyhow::Error` instead of panicking —
580    /// that path runs during scenario execution where one bad op
581    /// should not crash a whole test run.
582    #[must_use = "builder methods consume self; bind the result"]
583    pub fn workload(mut self, p: &'static crate::test_support::Payload) -> Self {
584        assert!(
585            !p.is_scheduler(),
586            "CgroupDef::workload called with a scheduler-kind Payload ({}); \
587             CgroupDef.workload is for userspace binary payloads only. \
588             Use #[ktstr_test(scheduler = ...)] for scheduler placement.",
589            p.name,
590        );
591        self.payload = Some(p);
592        self
593    }
594
595    /// Bind `cpuset.mems` for this cgroup. Mirrors [`Self::cpuset`]
596    /// for NUMA memory placement: the cgroup's tasks may only
597    /// allocate memory on the listed NUMA nodes. `None` (default)
598    /// inherits the parent's `cpuset.mems`.
599    ///
600    /// Required when the cgroup spans CPUs on a NUMA node whose
601    /// memory is NOT in the parent's `cpuset.mems` — allocations
602    /// from the cgroup's tasks are constrained to the parent's
603    /// allowed nodes per kernel/cgroup/cpuset.c. The framework
604    /// writes `cpuset.mems` immediately after `cpuset.cpus` so the
605    /// binding is in effect before any worker is moved in.
606    #[must_use = "builder methods consume self; bind the result"]
607    pub fn cpuset_mems(mut self, nodes: BTreeSet<usize>) -> Self {
608        self.cpuset_mems = Some(nodes);
609        self
610    }
611
612    /// Set `cpu.max` quota as a percentage of one CPU's
613    /// throughput, with a default 100 ms `period`. `100` means
614    /// "one full CPU" (quota=100_000, period=100_000); `200` means
615    /// "two CPUs". Use [`Self::cpu_quota`] for non-default periods.
616    #[must_use = "builder methods consume self; bind the result"]
617    pub fn cpu_quota_pct(mut self, pct: u32) -> Self {
618        let cpu = self.cpu.get_or_insert_with(CpuLimits::default);
619        cpu.max_period_us = 100_000;
620        cpu.max_quota_us = Some((pct as u64) * 1_000);
621        self
622    }
623
624    /// Set `cpu.max` quota and period directly. `quota` may exceed
625    /// `period` (multi-CPU concurrency, see [`CpuLimits::max_quota_us`]).
626    /// Both arguments are converted to microseconds; sub-microsecond
627    /// fractions in the supplied [`Duration`]s are truncated.
628    #[must_use = "builder methods consume self; bind the result"]
629    pub fn cpu_quota(mut self, quota: Duration, period: Duration) -> Self {
630        let cpu = self.cpu.get_or_insert_with(CpuLimits::default);
631        cpu.max_quota_us = Some(quota.as_micros() as u64);
632        cpu.max_period_us = period.as_micros() as u64;
633        self
634    }
635
636    /// Clear any previously-set `cpu.max` quota (writes `"max"`),
637    /// leaving `cpu.weight` (if set) intact. Useful when a base
638    /// CgroupDef builder applied a default cap and the test wants
639    /// only weight-based bias.
640    #[must_use = "builder methods consume self; bind the result"]
641    pub fn cpu_unlimited(mut self) -> Self {
642        let cpu = self.cpu.get_or_insert_with(CpuLimits::default);
643        cpu.max_quota_us = None;
644        self
645    }
646
647    /// Set `cpu.weight` (`CGROUP_WEIGHT_MIN..=CGROUP_WEIGHT_MAX`,
648    /// 1..=10000; `CGROUP_WEIGHT_DFL` = 100; enforced by
649    /// `cpu_weight_write_u64` in kernel/sched/core.c). Larger values
650    /// get a larger share under contention. Independent of `cpu.max`.
651    #[must_use = "builder methods consume self; bind the result"]
652    pub fn cpu_weight(mut self, weight: u32) -> Self {
653        let cpu = self.cpu.get_or_insert_with(CpuLimits::default);
654        cpu.weight = Some(weight);
655        self
656    }
657
658    /// Set `memory.max` hard ceiling in bytes. Crossing this
659    /// triggers reclaim first (`try_charge_memcg` in
660    /// mm/memcontrol.c); the cgroup OOM killer fires only after
661    /// `MAX_RECLAIM_RETRIES` failed retries, and is skipped when
662    /// the allocation carries `__GFP_NORETRY` or
663    /// `__GFP_RETRY_MAYFAIL`.
664    #[must_use = "builder methods consume self; bind the result"]
665    pub fn memory_max(mut self, bytes: u64) -> Self {
666        let m = self.memory.get_or_insert_with(MemoryLimits::default);
667        m.max = Some(bytes);
668        self
669    }
670
671    /// Set `memory.high` soft throttle threshold in bytes. Crossing
672    /// this triggers reclaim throttling but NOT OOM-kill — per
673    /// `__mem_cgroup_handle_over_high` in mm/memcontrol.c:
674    /// "memory.high enforcement isn't as strict, and there is no
675    /// OOM killer involved".
676    #[must_use = "builder methods consume self; bind the result"]
677    pub fn memory_high(mut self, bytes: u64) -> Self {
678        let m = self.memory.get_or_insert_with(MemoryLimits::default);
679        m.high = Some(bytes);
680        self
681    }
682
683    /// Set `memory.low` soft protection threshold in bytes.
684    /// Reclaim prefers other cgroups before this one's memory
685    /// drops below `low`.
686    #[must_use = "builder methods consume self; bind the result"]
687    pub fn memory_low(mut self, bytes: u64) -> Self {
688        let m = self.memory.get_or_insert_with(MemoryLimits::default);
689        m.low = Some(bytes);
690        self
691    }
692
693    /// Clear all three memory limits (writes `"max"` for max/high
694    /// and `"0"` for low). Equivalent to leaving `memory` unset
695    /// at construction; provided for symmetry with
696    /// [`Self::cpu_unlimited`].
697    #[must_use = "builder methods consume self; bind the result"]
698    pub fn memory_unlimited(mut self) -> Self {
699        self.memory = Some(MemoryLimits::default());
700        self
701    }
702
703    /// Set `io.weight` (`CGROUP_WEIGHT_MIN..=CGROUP_WEIGHT_MAX`,
704    /// 1..=10000; `CGROUP_WEIGHT_DFL` = 100; enforced by
705    /// `ioc_weight_write` in block/blk-iocost.c). Biases relative
706    /// IO share when the io controller is enabled. `io.max`
707    /// per-device caps are not surfaced — see [`IoLimits`].
708    #[must_use = "builder methods consume self; bind the result"]
709    pub fn io_weight(mut self, weight: u16) -> Self {
710        let io = self.io.get_or_insert_with(IoLimits::default);
711        io.weight = Some(weight);
712        self
713    }
714
715    /// Set `memory.swap.max` ceiling in bytes. The kernel parses the
716    /// wire value via `page_counter_memparse` and accepts a decimal
717    /// byte count (`swap_max_write` in `mm/memcontrol.c`). Distinct
718    /// from `memory.max`: this caps how much of the cgroup's memory
719    /// can spill to swap, separate from total memory consumption.
720    #[must_use = "builder methods consume self; bind the result"]
721    pub fn memory_swap_max(mut self, bytes: u64) -> Self {
722        let m = self.memory.get_or_insert_with(MemoryLimits::default);
723        m.swap_max = Some(bytes);
724        self
725    }
726
727    /// Clear any previously-set `memory.swap.max` (writes `"max"`).
728    /// Mirrors [`Self::cpu_unlimited`] / [`Self::memory_unlimited`]
729    /// for a single memory-knob unset; useful when a base
730    /// `CgroupDef` builder applied a swap cap and the test wants to
731    /// remove only that knob while preserving `memory.max`/`high`/
732    /// `low`.
733    ///
734    /// No-ops when `self.memory == None` — the default state already
735    /// means "no swap cap" (apply_setup emits no memory writes for an
736    /// unset `memory` field), so creating a fresh `MemoryLimits` just
737    /// to set `swap_max = None` would (a) be redundant and (b)
738    /// trigger 3 unwanted writes for `memory.max` / `memory.high` /
739    /// `memory.low` at apply_setup time. The no-op short-circuit
740    /// keeps "fresh CgroupDef + memory_swap_unlimited()" semantically
741    /// identical to "fresh CgroupDef".
742    #[must_use = "builder methods consume self; bind the result"]
743    pub fn memory_swap_unlimited(mut self) -> Self {
744        if let Some(m) = self.memory.as_mut() {
745            m.swap_max = None;
746        }
747        self
748    }
749
750    /// Set `pids.max` task-count ceiling. `n` is the maximum number
751    /// of processes the cgroup may host before subsequent
752    /// `fork()` / `clone()` calls return EAGAIN. Existing tasks are
753    /// NOT killed when the limit lands below the current count
754    /// (per the `pids_max_write` kernel comment: "Limit updates
755    /// don't need to be mutex'd, since it isn't critical that any
756    /// racing fork()s follow the new limit").
757    ///
758    /// `n = 0` is rejected at `apply_setup` time: a 0-limit cgroup
759    /// halts every fork/clone inside, including the worker spawn
760    /// under `CloneMode::Fork` and the `ForkExit` per-iteration
761    /// child fork. There is no kernel sentinel for "no fork ever";
762    /// `pids_max=0` silently fails every `fork()` inside with
763    /// `EAGAIN`, which is almost certainly a configuration bug.
764    #[must_use = "builder methods consume self; bind the result"]
765    pub fn pids_max(mut self, n: u64) -> Self {
766        let pids = self.pids.get_or_insert_with(PidsLimits::default);
767        pids.max = Some(n);
768        self
769    }
770
771    /// Clear any previously-set `pids.max` (writes `"max"`).
772    /// Mirrors [`Self::cpu_unlimited`] / [`Self::memory_unlimited`].
773    #[must_use = "builder methods consume self; bind the result"]
774    pub fn pids_unlimited(mut self) -> Self {
775        let pids = self.pids.get_or_insert_with(PidsLimits::default);
776        pids.max = None;
777        self
778    }
779
780    /// Materialize [`Self::works`] with cgroup-level defaults
781    /// merged into each entry. Called by `apply_setup` to resolve
782    /// the per-WorkSpec values before spawning workers.
783    ///
784    /// For every [`WorkSpec`] in [`Self::works`] (or a single
785    /// [`WorkSpec::default()`] when `works` is empty, matching
786    /// `apply_setup`'s default-substitution rule), each cgroup-level
787    /// default in [`Self::default_nice`] / [`Self::default_comm`] /
788    /// [`Self::default_uid`] / [`Self::default_gid`] /
789    /// [`Self::default_numa_node`] fills the corresponding
790    /// `WorkSpec` field when that field is "unset" at the WorkSpec
791    /// level.
792    ///
793    /// "Unset" means `None` for every `Option`-typed field —
794    /// `nice`, `comm`, `uid`, `gid`, `numa_node` are all
795    /// `Option<_>`. The framework's "skip setpriority(2)" state per
796    /// [`WorkloadConfig::nice`](crate::workload::WorkloadConfig::nice)
797    /// is `None`. A `WorkSpec` that explicitly sets `Some(n)`
798    /// (including `Some(0)`) keeps its value; the cgroup-level
799    /// default applies only when the WorkSpec is at the framework
800    /// default of `None`.
801    ///
802    /// `pcomm` is NOT propagated through `merged_works`. The
803    /// [`Self::pcomm`] convenience method writes `pcomm` directly
804    /// into every WorkSpec at builder time so coalescing in
805    /// `apply_setup` reads the per-WorkSpec value (the
806    /// authoritative source).
807    ///
808    /// Decoupling this merge from the convenience-method call sites
809    /// makes the builder order-independent —
810    /// `def.nice(5).work(spec)` and `def.work(spec).nice(5)`
811    /// produce identical effective `WorkSpec` values.
812    pub fn merged_works(&self) -> Vec<WorkSpec> {
813        let base: Vec<WorkSpec> = if self.works.is_empty() {
814            vec![WorkSpec::default()]
815        } else {
816            self.works.clone()
817        };
818        base.into_iter()
819            .map(|mut w| {
820                if w.nice.is_none()
821                    && let Some(n) = self.default_nice
822                {
823                    w.nice = Some(n);
824                }
825                if w.comm.is_none() {
826                    w.comm = self.default_comm.clone();
827                }
828                if w.uid.is_none() {
829                    w.uid = self.default_uid;
830                }
831                if w.gid.is_none() {
832                    w.gid = self.default_gid;
833                }
834                if w.numa_node.is_none() {
835                    w.numa_node = self.default_numa_node;
836                }
837                w
838            })
839            .collect()
840    }
841}
842
843// `CgroupDef` deliberately has NO `Default` impl. The previous
844// derived/hand-rolled default produced `name = "cg_0"`, which
845// collides with the conventional first cgroup name in nearly every
846// scenario (a test calling `..Default::default()` would silently
847// share a cgroup with the scenario's first named entry). Forcing
848// every construction site to go through [`CgroupDef::named`] makes
849// the name explicit and eliminates the footgun. The pattern is
850// documented in the type-level docstring and operator-facing
851// guidance at `doc/guide/src/architecture/workload-handle.md` under
852// the spread-default warning. The compile-time pin of the absence
853// lives in the `#[cfg(test)]` mod below (`assert_not_impl_default!`
854// from `src/test_macros.rs`).
855
#[cfg(test)]
mod tests {
    use super::*;

    // Compile-time pin: `CgroupDef` must not implement `Default`.
    assert_not_impl_default!(CgroupDef);

    /// `CgroupDef::comm` must panic on an empty string.
    #[test]
    #[should_panic(expected = "CgroupDef::comm: empty string rejected")]
    fn cgroup_def_comm_rejects_empty() {
        drop(CgroupDef::named("cg").comm(""));
    }

    /// `CgroupDef::comm` must panic on a string containing a NUL byte.
    #[test]
    #[should_panic(expected = "interior NUL byte")]
    fn cgroup_def_comm_rejects_interior_nul() {
        drop(CgroupDef::named("cg").comm("foo\0bar"));
    }

    /// Pins the validate-on-builder contract. `CgroupDef::pcomm` used
    /// to assign `w.pcomm = Some(...)` directly, which skipped the
    /// asserts in `WorkSpec::pcomm`. Both builders now go through a
    /// shared `validate_task_comm_string` helper, so this test FAILS
    /// against the pre-helper implementation and PASSES against the
    /// post-helper one.
    #[test]
    #[should_panic(expected = "CgroupDef::pcomm: empty string rejected")]
    fn cgroup_def_pcomm_rejects_empty() {
        drop(CgroupDef::named("cg").pcomm(""));
    }

    /// `CgroupDef::pcomm` must panic on a string containing a NUL byte.
    #[test]
    #[should_panic(expected = "interior NUL byte")]
    fn cgroup_def_pcomm_rejects_interior_nul() {
        drop(CgroupDef::named("cg").pcomm("foo\0bar"));
    }

    /// Per-builder boundary pin: if a future refactor routes
    /// `CgroupDef::comm` or `CgroupDef::pcomm` around the shared
    /// `validate_task_comm_string` helper, it shows up here even when
    /// the helper-level tests still pass. A 15-byte name is accepted
    /// and stored as the cgroup-level default.
    #[test]
    fn cgroup_def_comm_accepts_15_byte_boundary() {
        let longest_ok = "a".repeat(15);
        let built = CgroupDef::named("cg").comm(longest_ok.clone());
        assert_eq!(built.default_comm.as_deref(), Some(longest_ok.as_str()));
    }

    /// A 16-byte name passed to `CgroupDef::comm` must panic.
    #[test]
    #[should_panic(expected = "16 bytes")]
    fn cgroup_def_comm_rejects_16_byte_overflow() {
        drop(CgroupDef::named("cg").comm("a".repeat(16)));
    }

    /// A 15-byte name passed to `CgroupDef::pcomm` is accepted and
    /// lands on the WorkSpec itself.
    #[test]
    fn cgroup_def_pcomm_accepts_15_byte_boundary() {
        let longest_ok = "a".repeat(15);
        let built = CgroupDef::named("cg").pcomm(longest_ok.clone());
        // pcomm stamps every WorkSpec; default works is one entry.
        assert_eq!(built.works.len(), 1);
        assert_eq!(built.works[0].pcomm.as_deref(), Some(longest_ok.as_str()));
    }

    /// A 16-byte name passed to `CgroupDef::pcomm` must panic.
    #[test]
    #[should_panic(expected = "16 bytes")]
    fn cgroup_def_pcomm_rejects_16_byte_overflow() {
        drop(CgroupDef::named("cg").pcomm("a".repeat(16)));
    }
}