ktstr/
cgroup.rs

1//! Cgroup v2 filesystem operations for test cgroup management.
2//!
3//! Creates, configures, and removes cgroups under a parent path
4//! (default `/sys/fs/cgroup/ktstr`). Provides cpuset assignment,
5//! task migration, and cleanup.
6//!
7//! # Walk root (cgroup-v2 delegation)
8//!
9//! [`CgroupManager`] carries a `walk_root` that bounds two operations:
10//! - [`CgroupManager::setup`] walks every ancestor's
11//!   `cgroup.subtree_control` from `walk_root` down to `parent`;
12//! - [`CgroupManager::drain_tasks`] / `cleanup_recursive` drain pids
13//!   into `{walk_root}/cgroup.procs` (a writable root that is exempt
14//!   from the kernel's no-internal-process constraint).
15//!
16//! `walk_root` defaults to `/sys/fs/cgroup` (Mode A: root-owned cgroup
17//! tree). [`CgroupManager::with_walk_root`] retargets it for Mode B/C
18//! delegation (systemd `Delegate=yes`, container `nsdelegate`) where
19//! the operator owns `subtree_control` writes only inside a delegated
//! subtree. [`CgroupManager::with_walk_root`] rejects a `parent` that
//! is not at or below the new `walk_root`, so the strip-prefix walk
//! cannot escape it ([`CgroupManager::new`] itself performs no check).
22//!
23//! # Controller surface
24//!
//! [`CgroupManager`] enables the caller-supplied controller set in
//! `cgroup.subtree_control` once, at [`CgroupManager::setup`] time, so
//! every method that writes a controller knob succeeds without per-call
//! lazy enablement (which would race against concurrent sibling cgroup
//! creation). The controllers that can be enabled and the knobs each
//! one exposes map to:
31//!
32//! | Controller | `setup` writes | Methods that touch the controller's files |
33//! |------------|----------------|-------------------------------------------|
34//! | `cpuset`   | when `Controller::Cpuset` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `cpuset`/`cpuset_mems`) | `Self::set_cpuset`, `Self::set_cpuset_mems`, `Self::clear_cpuset`, `Self::clear_cpuset_mems` |
35//! | `cpu`      | when `Controller::Cpu` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `cpu`) | `Self::set_cpu_max`, `Self::set_cpu_weight` |
36//! | `memory`   | when `Controller::Memory` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `memory`) | `Self::set_memory_max`, `Self::set_memory_high`, `Self::set_memory_low`, `Self::set_memory_swap_max` |
37//! | `pids`     | when `Controller::Pids` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `pids`) | `Self::set_pids_max` |
38//! | `io`       | when `Controller::Io` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `io`) | `Self::set_io_weight` |
39//! | (cgroup-core) | not gated   | `Self::set_freeze`, `Self::move_task`, `Self::move_tasks` |
40//!
41//! `cgroup.freeze` and `cgroup.procs` are cgroup-core files exposed on
42//! every non-root cgroup automatically; they do not require a
43//! controller in `subtree_control`. `memory.swap.max` only exists when
44//! the kernel was built with `CONFIG_SWAP=y` — the file is absent on
45//! swap-disabled kernels and a write returns ENOENT (callers route
46//! through the wire-time error chain).
47//!
48//! # Untrusted-name validation
49//!
50//! Cgroup names flow into [`Path::join`] under `parent` to address
51//! files inside cgroupfs. `validate_cgroup_name` rejects shapes that
52//! would escape that parent (`..`, absolute leading `/`, `NUL`) or
53//! that produce invisible cgroupfs entries (leading `.`); other ASCII
54//! is passed through to the kernel which is the final authority on
55//! per-component validity. Every public method that takes a `name`
56//! validates it before any filesystem write.
57
58use crate::topology::TestTopology;
59use anyhow::{Context, Result, anyhow, bail};
60use std::collections::BTreeSet;
61use std::fs;
62use std::path::{Path, PathBuf};
63use std::sync::atomic::{AtomicUsize, Ordering};
64use std::sync::mpsc;
65use std::time::Duration;
66
/// The cgroup v2 controllers [`CgroupManager::setup`] knows how to
/// switch on through `cgroup.subtree_control`.
///
/// Every variant stands for one literal token parsed by the kernel's
/// `cgroup_subtree_control_write`. The list covers exactly the
/// controllers whose files the framework's [`CgroupOps`] surface writes
/// (cpuset, cpu, memory, pids, io). Cgroup-core knobs such as
/// `cgroup.freeze` and `cgroup.procs` are not gated by a controller and
/// therefore have no variant.
///
/// `setup` takes a `BTreeSet<Controller>`: sets merge cleanly across
/// nested CgroupDef declarations, and because `BTreeSet` iterates in
/// `Ord` order the rendered subtree_control write is identical from run
/// to run. The derived `Ord` follows the declaration order below, so
/// reordering variants changes that rendered order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Controller {
    /// `+cpuset` — children gain `cpuset.cpus`, `cpuset.cpus.effective`,
    /// `cpuset.mems` and `cpuset.mems.effective`.
    Cpuset,
    /// `+cpu` — children gain `cpu.max`, `cpu.weight`,
    /// `cpu.weight.nice`, `cpu.stat` and `cpu.pressure`.
    Cpu,
    /// `+memory` — children gain `memory.max`, `memory.high`,
    /// `memory.low`, `memory.min`, `memory.current`, `memory.swap.max`,
    /// `memory.events`, `memory.stat` and `memory.pressure`.
    Memory,
    /// `+pids` — children gain `pids.max`, `pids.current` and
    /// `pids.events`.
    Pids,
    /// `+io` — children gain `io.max`, `io.weight`, `io.bfq.weight`,
    /// `io.stat` and `io.pressure`.
    Io,
}

impl Controller {
    /// Bare controller name as the kernel spells it in
    /// `cgroup.subtree_control`, i.e. without the `+`/`-` prefix.
    ///
    /// NOTE(review): earlier docs pointed at
    /// `Self::as_subtree_control_add` for the prefixed token; no such
    /// method is defined in this impl block — confirm it exists
    /// elsewhere or drop the reference.
    pub fn name(self) -> &'static str {
        match self {
            Self::Cpuset => "cpuset",
            Self::Cpu => "cpu",
            Self::Memory => "memory",
            Self::Pids => "pids",
            Self::Io => "io",
        }
    }
}
114
/// Default deadline for a single cgroup filesystem write.
///
/// Such writes normally finish in under a millisecond; two seconds is
/// long enough to flag only genuine hangs, yet short enough that the
/// test result is still meaningful afterwards. Spelled in milliseconds,
/// the unit the timeout error message reports.
const CGROUP_WRITE_TIMEOUT: Duration = Duration::from_millis(2_000);
118
119/// Write `data` to `path` with a timeout. Spawns a thread for the blocking
120/// `fs::write` and waits on a channel. If the write does not complete within
121/// `timeout`, returns an error (the spawned thread may still be blocked in
122/// the kernel but will not prevent the caller from making progress).
123///
124/// # Stranded-writer thread semantics
125///
126/// On timeout the helper returns `Err` while the spawned thread stays
127/// blocked in the kernel inside `fs::write` — typically inside the
128/// cgroupfs `cgroup_kn_lock_live` / `cgroup_mutex` lock acquisition or
129/// the per-file `kn->active` semaphore. The host-side fd to `path` is
130/// owned by the spawned thread, so:
131///
132/// - **Per-file lock retention.** While the writer is blocked, the
133///   target cgroupfs file's `kn->active` (kernfs's per-knob writer
134///   semaphore) remains held by the stranded thread. Concurrent
135///   writes to the SAME file from any thread in the same process —
136///   including this same caller's retry — will queue behind the
137///   stranded write inside the kernel. Writes to OTHER files in the
138///   same cgroup are unaffected (kernfs holds `kn->active`
139///   per-knob, not per-cgroup).
140/// - **Thread-handle drop.** The `JoinHandle` returned by
141///   `thread::spawn` is dropped when the helper returns. Rust's
142///   `JoinHandle::Drop` implementation detaches the thread without
143///   waiting; the thread continues to run and is implicitly joined
144///   when the kernel write eventually unblocks (or when the process
145///   exits).
146/// - **Bounded leak under wedged cgroupfs.** A genuinely-wedged
147///   cgroupfs (e.g. a stuck filesystem driver in the kernel) would
148///   accumulate threads at a rate of one per timed-out write site.
149///   The 2s per-write timeout caps the per-site stall to 2s; the
150///   total accumulation is driven by how many distinct write sites
151///   the scenario hits, not by elapsed wall-clock time alone.
152///   Operators noticing stranded `<defunct>` cgroupfs writers in
153///   `ps` should investigate whether the underlying kernel cgroup
154///   subsystem is hung; the framework's own teardown does not
155///   block on these stranded threads.
156///
157/// Each stranded thread holds the file's `kn->active` until the
158/// kernel write returns. The OS-level memory cost per stranded
159/// thread is the default Rust thread stack (8 MiB on Linux, mostly
160/// virtual until touched).
161fn write_with_timeout(path: &Path, data: &str, timeout: Duration) -> Result<()> {
162    let display = path.display().to_string();
163    let path = path.to_owned();
164    let data = data.to_owned();
165    let (tx, rx) = mpsc::channel();
166    std::thread::spawn(move || {
167        let result = fs::write(&path, &data);
168        let _ = tx.send(result);
169    });
170    match rx.recv_timeout(timeout) {
171        Ok(Ok(())) => Ok(()),
172        Ok(Err(e)) => {
173            let errno_suffix = e
174                .raw_os_error()
175                .and_then(crate::errno_name)
176                .map(|name| format!(" ({name})"))
177                .unwrap_or_default();
178            Err(e).with_context(|| format!("write {display}{errno_suffix}"))
179        }
180        Err(_) => bail!(
181            "cgroup write to {display} timed out after {}ms",
182            timeout.as_millis()
183        ),
184    }
185}
186
187/// Validate a cgroup name before joining it onto the parent path.
188///
189/// Rejects shapes that would either escape the parent directory
190/// (`..` component, absolute leading `/`, embedded NUL) or produce
191/// a hidden / invisible cgroupfs entry (leading `.`). Empty names
192/// are also rejected — `parent.join("")` returns `parent`, which
193/// would let a caller accidentally clobber the parent's own
194/// `cpuset.cpus` / `cgroup.subtree_control` files via a method
195/// that expected to address a child.
196///
197/// Permits `/` only as a path separator between non-empty
198/// components (nested cgroups like `"cg_0/narrow"`); a leading
199/// `/` is rejected because `Path::join` would replace `parent`
200/// entirely with the absolute path.
201///
202/// Beyond these structural checks the kernel is the final authority
203/// on per-component validity: cgroupfs rejects names containing
204/// newlines or names colliding with reserved knobs (`cgroup.procs`,
205/// `cpuset.cpus`, etc.) at `mkdir` time with EINVAL / EEXIST. Those
206/// failures surface through the regular `fs::create_dir_all` /
207/// `fs::write` error chain.
208pub(crate) fn validate_cgroup_name(name: &str) -> Result<()> {
209    if name.is_empty() {
210        bail!("cgroup name must not be empty");
211    }
212    if name.starts_with('/') {
213        bail!(
214            "cgroup name '{name}' starts with '/' — would escape the \
215             managed parent via Path::join (absolute paths replace the \
216             join base)"
217        );
218    }
219    if name.contains('\0') {
220        bail!("cgroup name '{name}' contains a NUL byte");
221    }
222    // Per-component checks run before the whole-name leading-dot
223    // check so a component like `..` matches the more specific
224    // path-traversal diagnostic instead of the generic hidden-entry
225    // one. The ordering matters for error messages — `'..' component`
226    // is what callers grep for.
227    for component in name.split('/') {
228        if component.is_empty() {
229            bail!(
230                "cgroup name '{name}' contains an empty path component \
231                 (consecutive '/') — Path::join would emit a malformed path"
232            );
233        }
234        if component == ".." {
235            bail!(
236                "cgroup name '{name}' contains a '..' component — \
237                 would escape the managed parent via Path::join"
238            );
239        }
240        if component == "." {
241            bail!(
242                "cgroup name '{name}' contains a '.' component — \
243                 ambiguous self-reference, refuse before fs writes"
244            );
245        }
246        if component.starts_with('.') {
247            bail!(
248                "cgroup name '{name}' contains a leading-dot component \
249                 ('{component}') — produces a hidden cgroupfs entry"
250            );
251        }
252    }
253    Ok(())
254}
255
256/// Walk an `anyhow::Error` chain and return the first
257/// `std::io::Error`'s raw errno, if any. Shared helper for errno
258/// classification across cgroup orchestration — both this module's
259/// ESRCH/EBUSY checks and [`crate::vmm::cgroup_sandbox`]'s
260/// EACCES/EPERM/EBUSY branches walk the same chain shape.
261pub(crate) fn anyhow_first_io_errno(err: &anyhow::Error) -> Option<i32> {
262    err.chain()
263        .find_map(|cause| cause.downcast_ref::<std::io::Error>())
264        .and_then(|io| io.raw_os_error())
265}
266
267/// ESRCH: task exited between listing and migration
268/// (`cgroup_procs_write_start` -> `find_task_by_vpid` returns NULL).
269fn is_esrch(err: &anyhow::Error) -> bool {
270    anyhow_first_io_errno(err) == Some(libc::ESRCH)
271}
272
273/// EBUSY: either the cgroup v2 no-internal-process constraint
274/// (`cgroup_migrate_vet_dst` when `subtree_control` is set) or a
275/// transient rejection from a sched_ext BPF `cgroup_prep_move`
276/// callback (`scx_cgroup_can_attach`).
277fn is_ebusy(err: &anyhow::Error) -> bool {
278    anyhow_first_io_errno(err) == Some(libc::EBUSY)
279}
280
281/// Snapshot the cgroup-tree state at the moment a cpuset.cpus
282/// write fails, for diagnostic attachment to the returned error.
283///
284/// Captures (per the diagnostic contract on
285/// [`CgroupManager::set_cpuset`]):
286/// - the parent's `cgroup.controllers` (controllers AVAILABLE for
287///   children — confirms whether subtree_control already
288///   propagated to this child)
289/// - the parent's `cgroup.subtree_control` (controllers ENABLED
290///   for children — what setup() last wrote)
291/// - the child's `cgroup.controllers` (the set children of the
292///   CHILD inherit — useful for nested cgroups)
293/// - whether `cpuset.cpus` exists at the child (distinguishes a
294///   "controller never propagated" failure mode from a
295///   "kernel rejected this specific value" failure mode)
296/// - the child's directory listing (so an unexpected presence/
297///   absence of any cgroupfs knob is visible)
298///
299/// Read failures inside the snapshot are folded into the snapshot
300/// string as `<read failed: {err}>` rather than propagating —
301/// the caller's error path is what the caller cares about; the
302/// snapshot is best-effort instrumentation.
303fn capture_cpuset_state(parent: &Path, name: &str) -> String {
304    let child = parent.join(name);
305    let parent_controllers = read_or_label(&parent.join("cgroup.controllers"));
306    let parent_subtree_control = read_or_label(&parent.join("cgroup.subtree_control"));
307    let child_controllers = read_or_label(&child.join("cgroup.controllers"));
308    let cpuset_cpus_exists = child.join("cpuset.cpus").exists();
309    let child_listing = match fs::read_dir(&child) {
310        Ok(entries) => {
311            let mut names: Vec<String> = entries
312                .filter_map(|e| e.ok())
313                .map(|e| e.file_name().to_string_lossy().into_owned())
314                .collect();
315            names.sort_unstable();
316            format!("[{}]", names.join(", "))
317        }
318        Err(e) => format!("<read_dir failed: {e}>"),
319    };
320    format!(
321        "cgroup-state-snapshot: \
322         parent={} name={} \
323         parent.cgroup.controllers={:?} \
324         parent.cgroup.subtree_control={:?} \
325         child.cgroup.controllers={:?} \
326         child.cpuset.cpus.exists={} \
327         child.listing={}",
328        parent.display(),
329        name,
330        parent_controllers,
331        parent_subtree_control,
332        child_controllers,
333        cpuset_cpus_exists,
334        child_listing,
335    )
336}
337
/// Read `path` into a string for a snapshot, never failing.
///
/// On success the contents are returned with surrounding whitespace
/// trimmed. On failure the result is the placeholder
/// `<read failed: {err}>`, so a missing or permission-denied field in
/// [`capture_cpuset_state`] is labelled instead of aborting the whole
/// snapshot.
fn read_or_label(path: &Path) -> String {
    fs::read_to_string(path).map_or_else(
        |err| format!("<read failed: {err}>"),
        |contents| contents.trim().to_string(),
    )
}
349
/// Cap on the number of outstanding (failed and not yet offset by a
/// success) [`CgroupManager::remove_cgroup`] calls the manager will
/// tolerate before refusing further removes.
///
/// A churn workload (rapid create→remove cycles) may legitimately
/// race the freeze/drain path and see EBUSY/ENOENT on individual
/// remove calls — those are absorbed and the un-removed cgroup is
/// counted toward `outstanding_removes`. When the counter exceeds
/// this cap, subsequent [`CgroupManager::remove_cgroup`] calls
/// return Err immediately so the loop driving the churn (e.g.
/// `custom_cgroup_rapid_churn` in scenario/dynamic.rs) can bail
/// instead of accumulating cgroupfs entries unboundedly. Successful
/// removes decrement the counter, so the failures need not be
/// consecutive to count, and a transient stall that eventually clears
/// does not strand the manager in the bailed state.
const MAX_OUTSTANDING_REMOVES: usize = 10;
365
/// RAII manager for cgroup v2 filesystem operations.
///
/// Creates, configures, and removes cgroups under a parent directory.
/// Provides cpuset assignment and task migration.
///
/// # Outstanding-remove tracking
///
/// `outstanding_removes` counts cgroups whose
/// [`Self::remove_cgroup`] call failed (the directory still exists
/// in the cgroupfs tree). It increments on every removal failure,
/// decrements on every removal success, and gates further calls:
/// once the count exceeds `MAX_OUTSTANDING_REMOVES`,
/// [`Self::remove_cgroup`] returns Err without attempting the
/// underlying writes. The counter is `AtomicUsize` because
/// scenario code holds the manager behind `&dyn CgroupOps` and
/// shares it across threads via `&self` borrows.
///
/// # Walk root
///
/// `walk_root` bounds the cgroup-fs walk for two operations:
/// 1. [`Self::setup`] walks every ancestor's `cgroup.subtree_control`
///    between `walk_root` and `parent`.
/// 2. [`Self::drain_tasks`] and `cleanup_recursive` drain pids into
///    `{walk_root}/cgroup.procs` (the writable root exempt from the
///    no-internal-process constraint).
///
/// Defaults to `/sys/fs/cgroup` in [`Self::new`] for Mode A (root-owned
/// cgroup tree). Override via [`Self::with_walk_root`] for cgroup-v2
/// user delegation (Mode B/C: systemd `Delegate=yes`, container
/// `nsdelegate`). The override is validated against `parent` when
/// [`Self::with_walk_root`] is called — if `parent` is not at or below
/// `walk_root`, the chained call returns an error rather than letting
/// the strip-prefix walk fall through to an opaque cgroupfs EACCES at
/// the delegation boundary. [`Self::new`] alone performs no such check.
#[derive(Debug)]
pub struct CgroupManager {
    /// Directory under which this manager creates its cgroups; set by
    /// [`Self::new`] and exposed through [`Self::parent_path`].
    parent: PathBuf,
    /// Upper bound of the subtree_control walk and destination of pid
    /// drains; `/sys/fs/cgroup` unless replaced by
    /// [`Self::with_walk_root`].
    walk_root: PathBuf,
    /// Number of removals that failed and have not yet been offset by a
    /// successful one; starts at zero.
    outstanding_removes: AtomicUsize,
}
406
407/// Free-function inner of [`CgroupManager::move_tasks`] —
408/// extracted so the per-pid migration loop + ESRCH tolerance +
409/// all-vanished bail can be unit-tested without a real
410/// cgroupfs (which is what surfaces the kernel-side ESRCH that
411/// the bail guards against). The per-pid write closure is
412/// caller-supplied: production callers route through
413/// [`CgroupManager::move_task_with_retry`] (which talks to
414/// real `cgroup.procs` files); unit tests pass a closure that
415/// synthesises [`std::io::Error::from_raw_os_error`]`(libc::ESRCH)`
416/// for selected pids so the partial-vanish (allowed) and
417/// all-vanished (bail) paths are both directly observable.
418///
419/// The empty-slice exemption (`pids.is_empty() -> Ok`) is
420/// preserved here so the documented "no move requested" form
421/// (post-Drop diagnostic, post-mortem capture) stays a clean
422/// no-op rather than tripping the all-vanished gate.
423fn move_tasks_inner<W>(name: &str, pids: &[libc::pid_t], mut write_one: W) -> Result<()>
424where
425    W: FnMut(&str, libc::pid_t) -> Result<()>,
426{
427    let mut vanished = 0usize;
428    for &pid in pids {
429        if let Err(e) = write_one(name, pid) {
430            if is_esrch(&e) {
431                tracing::warn!(pid, cgroup = name, "task vanished during migration");
432                vanished += 1;
433                continue;
434            }
435            return Err(e);
436        }
437    }
438    if !pids.is_empty() && vanished == pids.len() {
439        anyhow::bail!(
440            "move_tasks to '{name}': ALL {n} pid(s) ESRCH'd before \
441             migration completed (pids: {pids:?}). Likely causes: \
442             (a) `WorkloadHandle::spawn` child pre_exec init-panic \
443             cascade (uid/gid/mempolicy/cgroup-handshake failure \
444             between fork and the start-pipe read — the parent is \
445             blocked on the start-pipe waiting for the child to \
446             reach work-ready and only observes the child's death \
447             via SIGCHLD reap, by which point the pid has already \
448             vanished from any cgroup it was placed in); (b) \
449             scheduler-attach-time cgroup-pull (sched_ext init may \
450             move existing tasks out of test-created cgroups); \
451             (c) external signal (SIGKILL from operator OR \
452             OOM-killer). The silent-Ok path this bail replaces \
453             was a no-silent-drops violation: a downstream \
454             `cgroup.procs` read would see 0 pids with no signal \
455             that ANY migration was even attempted. If the caller \
456             LEGITIMATELY moves an already-vanished cohort \
457             (post-Drop diagnostic), pass an empty pids slice \
458             instead — the empty-slice path returns Ok cleanly \
459             without bailing.",
460            n = pids.len(),
461        );
462    }
463    Ok(())
464}
465
466impl CgroupManager {
    /// Default cgroup-fs root that [`Self::new`] stores as the walk
    /// root. Override per instance via [`Self::with_walk_root`] for
    /// cgroup-v2 user delegation. Also the root that
    /// `Self::default_root_requires_root` compares against when
    /// deciding whether root privileges are needed.
    const DEFAULT_WALK_ROOT: &'static str = "/sys/fs/cgroup";
471
472    /// Create a manager rooted at the given cgroup v2 path.
473    ///
474    /// The walk root defaults to `/sys/fs/cgroup` (Mode A: root-owned
475    /// cgroup tree). For cgroup-v2 user delegation (Mode B/C), chain
476    /// [`Self::with_walk_root`] before any [`Self::setup`] call.
477    pub fn new(parent: &str) -> Self {
478        Self {
479            parent: PathBuf::from(parent),
480            walk_root: PathBuf::from(Self::DEFAULT_WALK_ROOT),
481            outstanding_removes: AtomicUsize::new(0),
482        }
483    }
484
485    /// Retarget the cgroup-fs walk root used by [`Self::setup`] and
486    /// [`Self::drain_tasks`].
487    ///
488    /// `root` becomes the upper bound of the
489    /// `cgroup.subtree_control` enable walk and the destination
490    /// `{root}/cgroup.procs` for pid drains. Use for cgroup-v2 user
491    /// delegation (Mode B/C) where the operator owns
492    /// `subtree_control` writes only inside the delegated subtree and
493    /// a blind walk from `/sys/fs/cgroup` would EACCES at the
494    /// `user.slice` / container-root boundary.
495    ///
496    /// Returns an error when:
497    /// - **Either `parent` or `root` contains a `..` component** —
498    ///   [`Path::starts_with`](std::path::Path::starts_with) is component-based and treats `..`
499    ///   as a literal segment, so `/sys/fs/cgroup/op/../escape` would
500    ///   component-prefix `/sys/fs/cgroup/op` while the kernel
501    ///   resolves the path to `/sys/fs/cgroup/escape` (outside the
502    ///   delegation root). Rejecting `..` upfront keeps the prefix
503    ///   invariant honest against canonical-vs-component drift.
504    /// - **The manager's `parent` is not at or below `root`** —
505    ///   without the prefix invariant the `Self::setup_under_root`
506    ///   strip-prefix gate would silently skip the subtree_control
507    ///   walk and the caller would see downstream EACCES on the
508    ///   first `set_*` write. Surfaces the misconfiguration upfront
509    ///   with both paths in the error message.
510    pub fn with_walk_root(mut self, root: impl Into<PathBuf>) -> Result<Self> {
511        let root = root.into();
512        // Reject `..` components on either side. `PathBuf::starts_with`
513        // is component-based and treats `..` as a literal segment, so
514        // `/sys/fs/cgroup/operator/../escape` would pass the prefix
515        // check below while the kernel resolves the path to
516        // `/sys/fs/cgroup/escape` (outside walk_root). Either side
517        // carrying `..` is a misconfiguration; bail upfront before the
518        // canonical-vs-component mismatch becomes a downstream EACCES.
519        for (path, label) in [
520            (self.parent.as_path(), "parent"),
521            (root.as_path(), "walk_root"),
522        ] {
523            if path
524                .components()
525                .any(|c| matches!(c, std::path::Component::ParentDir))
526            {
527                bail!(
528                    "CgroupManager::with_walk_root: {label} {path:?} contains `..` components; \
529                     parent and walk_root must be normalized absolute paths because \
530                     PathBuf::starts_with is component-based and `/a/b/../c` is treated as \
531                     starting with `/a/b/..` not the kernel-resolved `/a/c` — the prefix \
532                     invariant would be silently violated",
533                );
534            }
535        }
536        if !self.parent.starts_with(&root) {
537            bail!(
538                "CgroupManager::with_walk_root: parent {:?} is not below walk_root {:?}; \
539                 the subtree_control walk must originate at a root that contains the parent — \
540                 either lower walk_root to a prefix of parent or raise parent to a descendant of \
541                 walk_root",
542                self.parent,
543                root,
544            );
545        }
546        self.walk_root = root;
547        Ok(self)
548    }
549
550    /// Path to the parent cgroup directory.
551    pub fn parent_path(&self) -> &std::path::Path {
552        &self.parent
553    }
554
555    /// Path to the cgroup-fs root [`Self::setup`] walks down from and
556    /// [`Self::drain_tasks`] drains pids to. See [`Self::with_walk_root`].
557    pub fn walk_root(&self) -> &std::path::Path {
558        &self.walk_root
559    }
560
    /// Count of un-removed cgroups currently tracked by this
    /// manager — incremented when [`Self::remove_cgroup`] fails,
    /// decremented when it succeeds. Exposed for tests and for
    /// callers that want to inspect the budget without forcing a
    /// remove attempt.
    ///
    /// The value is a point-in-time snapshot; concurrent removes on
    /// other threads may change it immediately after it is read.
    pub fn outstanding_removes(&self) -> usize {
        // NOTE(review): Relaxed ordering — presumably sufficient because
        // the count is only an advisory budget; confirm no caller relies
        // on it to synchronize other memory.
        self.outstanding_removes.load(Ordering::Relaxed)
    }
569
    /// Create the parent directory and enable the requested cgroup
    /// controllers in every ancestor `cgroup.subtree_control` between
    /// `self.walk_root` (default `/sys/fs/cgroup`) and `self.parent`.
    ///
    /// Thin wrapper: delegates to `Self::setup_under_root` with
    /// `self.walk_root` as the root.
    ///
    /// Pass the controllers the test actually needs — empty set means
    /// "create the parent dir, write nothing to subtree_control". The
    /// scenario runtime computes the controller union from
    /// [`CgroupDef`](crate::scenario::ops::CgroupDef) declarations
    /// (cpuset/cpuset_mems → [`Controller::Cpuset`], cpu →
    /// [`Controller::Cpu`], memory → [`Controller::Memory`], pids →
    /// [`Controller::Pids`], io → [`Controller::Io`]) so a test
    /// that never sets a memory limit never enables `+memory` and
    /// vice versa. `cgroup.freeze` and `cgroup.procs` are
    /// cgroup-core, ungated by any controller, and need no entry.
    ///
    /// # Walk root
    ///
    /// The ancestor walk stops at `self.walk_root` so cgroup-v2 user
    /// delegation (Mode B/C) does not attempt subtree_control writes
    /// above the delegation boundary. [`Self::with_walk_root`]
    /// retargets the walk and, at that point, validates that
    /// `self.parent` is at or below the new `walk_root`;
    /// [`Self::new`] performs no such validation for the default root.
    ///
    /// # Availability check
    ///
    /// Each requested controller is verified against
    /// `{walk_root}/cgroup.controllers` before any write. A
    /// requested controller missing from the kernel's available set
    /// surfaces as `controller {ctrl} not available; cgroup.controllers
    /// = {available:?}` rather than the bare ENOENT/EACCES the
    /// downstream `set_*` write would otherwise emit.
    ///
    /// # Error propagation
    ///
    /// All filesystem writes propagate via `?`. A user inspecting
    /// `RUST_BACKTRACE=1` output sees the exact subtree_control path
    /// that failed and the underlying errno, instead of a swallowed
    /// `tracing::warn!` followed by a downstream EACCES at the
    /// controller-knob write site.
    pub fn setup(&self, controllers: &BTreeSet<Controller>) -> Result<()> {
        self.setup_under_root(controllers, &self.walk_root)
    }
612
613    /// Does managing cgroups require root privileges for this
614    /// `(root, parent, euid)`? True only when `root` is the kernel-owned
615    /// default walk root (`/sys/fs/cgroup`), `parent` is actually under
616    /// that root (a real cgroupfs operation — create_dir_all of the
617    /// parent, or the subtree_control walk, that EACCESes for a non-root
618    /// euid), AND the euid is non-root. A `parent` OUTSIDE the root (e.g.
619    /// a tmpdir — the non-cgroup-path early-bail that creates a dir and
620    /// skips the walk) touches no cgroupfs and needs no root. A delegated
621    /// walk root (set via [`Self::with_walk_root`]) is exempt: cgroup-v2
622    /// delegation grants the delegatee write access to
623    /// `cgroup.subtree_control` inside the delegated subtree, so a
624    /// non-root euid can manage it (Documentation/admin-guide/cgroup-v2.rst,
625    /// Delegation). Pure + takes `parent`/`euid` explicitly so the
626    /// privilege gate is unit-tested regardless of the test runner's own
627    /// euid and working directory.
628    fn default_root_requires_root(root: &Path, parent: &Path, euid: u32) -> bool {
629        root == Path::new(Self::DEFAULT_WALK_ROOT) && parent.starts_with(root) && euid != 0
630    }
631
632    /// Inner setup that takes the cgroup-fs root as an explicit
633    /// argument so tests can drive the controller-enable path against
634    /// a tmpdir without touching `/sys/fs/cgroup`. Production
635    /// [`Self::setup`] threads `self.walk_root` (defaults to
636    /// `/sys/fs/cgroup` via [`Self::new`], overridable via
637    /// [`Self::with_walk_root`]). The strip-prefix gate stays — if
638    /// the parent is outside the supplied root, directory creation
639    /// still happens but no subtree_control walk fires (matches the
640    /// existing "non-cgroup-mount" early-bail).
641    fn setup_under_root(&self, controllers: &BTreeSet<Controller>, root: &Path) -> Result<()> {
642        // Managing cgroups under the kernel-owned default walk root
643        // (/sys/fs/cgroup, Mode A) requires root: create_dir_all of a
644        // parent UNDER /sys/fs/cgroup, or the subtree_control walk below,
645        // would EACCES for a non-root caller with an errno that buries
646        // the cause. Fail fast here so the message names the fix. Gated
647        // on the parent being under the root: a parent OUTSIDE it (the
648        // non-cgroup-path early-bail — create a dir, skip the walk)
649        // touches no cgroupfs and needs no root. Checked at setup (first
650        // real cgroup use), NOT at manager construction: host_only tests
651        // that never create a cgroup (macro-attribute fixtures,
652        // host-topology reads, nested-VM verifier orchestration) must not
653        // fail for a resource they never touch. A delegated walk root
654        // (Mode B/C via with_walk_root) is exempt — the operator owns
655        // subtree_control inside the delegated subtree.
656        let euid = unsafe { libc::geteuid() };
657        if Self::default_root_requires_root(root, &self.parent, euid) {
658            return Err(anyhow!(
659                "CgroupManager::setup: cannot manage cgroups under the \
660                 kernel-owned default walk root {root:?} as a non-root \
661                 process (euid {euid}); run as root, or for cgroup-v2 \
662                 user delegation set a delegated walk root via \
663                 CgroupManager::with_walk_root (a systemd Delegate=yes \
664                 subtree or a container nsdelegate root) — when driven by \
665                 cargo-ktstr, set the {walk_env} env var to that delegated \
666                 root",
667                walk_env = crate::KTSTR_CGROUP_WALK_ROOT_ENV,
668            ));
669        }
670        // No controllers to enable means no subtree_control walk, and the
671        // parent cgroup is only needed when the scenario actually creates
672        // child cgroups -- which `create_cgroup`'s `create_dir_all` makes
673        // lazily -- or enables controllers. Return BEFORE the eager parent
674        // mkdir so a cgroup-free scenario (no CgroupDefs, no workloads --
675        // e.g. snapshot-bridge tests, host-topology reads, macro-attribute
676        // fixtures) runs without root or a cgroup fs. Previously this mkdir
677        // fired unconditionally and EACCES'd a non-root caller (or a
678        // deliberately-unwritable dummy parent like `/nonexistent/...`).
679        if controllers.is_empty() {
680            return Ok(());
681        }
682        if !self.parent.exists() {
683            fs::create_dir_all(&self.parent)
684                .with_context(|| format!("mkdir {}", self.parent.display()))?;
685        }
686        if let Ok(rel) = self.parent.strip_prefix(root) {
687            let available_path = root.join("cgroup.controllers");
688            if available_path.exists() {
689                let raw = fs::read_to_string(&available_path).with_context(|| {
690                    format!("read cgroup.controllers: {}", available_path.display())
691                })?;
692                let available: BTreeSet<&str> = raw.split_whitespace().collect();
693                for c in controllers {
694                    if !available.contains(c.name()) {
695                        return Err(anyhow!(
696                            "cgroup controller '{}' not available at {}; \
697                             cgroup.controllers reports {:?}. CONFIG_{}_CONTROLLER \
698                             may be unset, or the controller is masked at this \
699                             level of the hierarchy",
700                            c.name(),
701                            available_path.display(),
702                            available,
703                            c.name().to_uppercase(),
704                        ));
705                    }
706                }
707            }
708            let line: String = controllers
709                .iter()
710                .map(|c| format!("+{}", c.name()))
711                .collect::<Vec<_>>()
712                .join(" ");
713            let mut cur = root.to_path_buf();
714            for c in rel.components() {
715                let sc = cur.join("cgroup.subtree_control");
716                if sc.exists() {
717                    write_with_timeout(&sc, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
718                        format!("enable controllers '{line}' at {}", sc.display())
719                    })?;
720                }
721                cur = cur.join(c);
722            }
723            let sc = self.parent.join("cgroup.subtree_control");
724            if sc.exists() {
725                write_with_timeout(&sc, &line, CGROUP_WRITE_TIMEOUT)
726                    .with_context(|| format!("enable controllers '{line}' at {}", sc.display()))?;
727            }
728        }
729        Ok(())
730    }
731
732    /// Create a child cgroup directory.
733    ///
734    /// For nested paths (e.g. `"cg_0/narrow"`), enables `+cpuset` on
735    /// each intermediate cgroup's `subtree_control` so the leaf has
736    /// `cpuset.cpus` / `cpuset.mems` files available. The kernel
737    /// requires each parent to have the controller in
738    /// `subtree_control` for its children to have the corresponding
739    /// files (`cgroup_control()` returns `parent->subtree_control`).
740    ///
741    /// # Limitation: only `+cpuset` is propagated through nested
742    /// intermediates
743    ///
744    /// `Self::enable_subtree_cpuset` writes ONLY `+cpuset` to each
745    /// intermediate's `cgroup.subtree_control`; the `+cpu` /
746    /// `+memory` / `+pids` / `+io` controllers enabled by
747    /// [`Self::setup`] cover only the manager's parent cgroup, not
748    /// arbitrary intermediate cgroups created via nested
749    /// `create_cgroup` calls. As a result, a nested leaf like
750    /// `"cg_0/narrow"` exposes `cpuset.*` knobs but NOT
751    /// `memory.max` / `pids.max` / `io.weight`. If a future
752    /// [`CgroupDef`](crate::scenario::ops::CgroupDef) addresses such
753    /// a leaf with a memory/pids/io knob, the corresponding
754    /// `set_*` write will return ENOENT.
755    ///
756    /// Today's in-tree consumers (host topology cpuset locks,
757    /// `BuildSandbox`, scenario ops) only nest cgroups for cpuset
758    /// scoping, so this matches the actual surface the framework
759    /// exercises. Extending `Self::enable_subtree_cpuset` to
760    /// propagate the remaining controllers across intermediates is
761    /// straightforward (write the same controller list as
762    /// [`Self::setup`] uses) but is deferred until a use case
763    /// concretely needs it; without one, the wider write would
764    /// race against concurrent sibling cgroup creation under the
765    /// same intermediate without buying anything.
766    pub fn create_cgroup(&self, name: &str) -> Result<()> {
767        validate_cgroup_name(name)?;
768        let p = self.parent.join(name);
769        if !p.exists() {
770            fs::create_dir_all(&p).with_context(|| format!("mkdir {}", p.display()))?;
771        }
772        self.enable_subtree_cpuset(name);
773        Ok(())
774    }
775
776    /// Enable a controller on the parent cgroup's `cgroup.subtree_control`.
777    ///
778    /// Writes `+{controller}` to `{parent}/cgroup.subtree_control` so
779    /// children created under the parent inherit the controller and
780    /// expose the corresponding `*.cpus`, `*.mems`, etc. files. No-op
781    /// (returns `Ok`) when the subtree_control file does not exist —
782    /// callers treat that as "parent is not a cgroup v2 node" and
783    /// degrade elsewhere.
784    ///
785    /// Unlike [`Self::setup`] and `Self::enable_subtree_cpuset`,
786    /// which swallow write failures via `tracing::warn!`, this method
787    /// propagates the underlying [`std::io::Error`] so callers can
788    /// classify errnos (EACCES/EPERM for permission, EBUSY for a
789    /// peer holding the subtree) via `anyhow_first_io_errno` and
790    /// map them to operator-facing degrade variants. Used by
791    /// `crate::vmm::cgroup_sandbox::BuildSandbox::try_create` under
792    /// the `--cpu-cap` hard-error contract.
793    pub fn add_parent_subtree_controller(&self, controller: &str) -> Result<()> {
794        let p = self.parent.join("cgroup.subtree_control");
795        if !p.exists() {
796            return Ok(());
797        }
798        write_with_timeout(&p, &format!("+{controller}"), CGROUP_WRITE_TIMEOUT)
799    }
800
801    /// Drain tasks from a child cgroup and remove it.
802    ///
803    /// Auto-unfreezes the cgroup before draining: a frozen cgroup that
804    /// reaches teardown (e.g. a step body issues `Op::FreezeCgroup` and
805    /// never pairs it with `Op::UnfreezeCgroup`) would migrate its
806    /// frozen tasks to the cgroup root via `drain_tasks` and rely on
807    /// the kernel's `cgroup_freezer_migrate_task` to clear the JOBCTL
808    /// freeze bit when the destination cgroup is unfrozen. The kernel
809    /// path is correct, but writing `cgroup.freeze=0` first makes the
810    /// teardown deterministic regardless of who froze the cgroup and
811    /// when. Tolerates ENOENT on the freeze file (cgroup directory
812    /// already gone, or `CONFIG_CGROUP_FREEZE` absent on legacy
813    /// kernels) silently — only non-ENOENT failures warn.
814    ///
815    /// # Post-drain settle window
816    ///
817    /// Between [`Self::drain_tasks`] and `rmdir`,
818    /// `remove_cgroup_inner` calls `wait_for_cgroup_unpopulated` with
819    /// a 1s budget. Writes to `cgroup.procs` queue the task move but
820    /// the source cgroup's populated state only clears once the
821    /// per-task css_set switch completes — `rmdir` returns EBUSY
822    /// while the cgroup is still populated. Rather than a blind
823    /// sleep, the wait is event-driven: it blocks on an
824    /// inotify(IN_MODIFY) watch of the cgroup's `cgroup.events` file
825    /// and returns as soon as that file reports `populated 0`, so it
826    /// wakes on the actual kernel state-transition write.
827    ///
828    /// The wait falls through to `rmdir` on deadline (or when
829    /// `cgroup.events` is absent / inotify setup fails), so a
830    /// genuinely stuck-populated cgroup still surfaces the same
831    /// EBUSY error from the subsequent `rmdir`.
832    ///
833    /// # Outstanding-remove cap
834    ///
835    /// A churn workload (rapid create→remove cycles) may legitimately
836    /// race freeze/drain and see EBUSY/ENOENT on individual remove
837    /// calls. Each failed remove increments
838    /// [`Self::outstanding_removes`]; once the counter exceeds
839    /// `MAX_OUTSTANDING_REMOVES`, the next call returns Err
840    /// without attempting any filesystem writes — bounding the peak
841    /// resident cgroup leak to that cap regardless of how long the
842    /// scenario runs. Successful removes decrement the counter, so a
843    /// transient stall that eventually clears (e.g. RCU drain
844    /// catches up between iterations) does not strand the manager
845    /// in the bailed state.
846    ///
847    /// A `name` whose directory does not exist returns `Ok(())`
848    /// without touching the counter — the cgroup was already
849    /// reaped (e.g. by [`Self::cleanup_all`] or a prior remove),
850    /// so it is not "outstanding".
851    pub fn remove_cgroup(&self, name: &str) -> Result<()> {
852        validate_cgroup_name(name)?;
853        let outstanding = self.outstanding_removes.load(Ordering::Relaxed);
854        if outstanding > MAX_OUTSTANDING_REMOVES {
855            bail!(
856                "remove_cgroup '{name}' refused: {outstanding} cgroups outstanding \
857                 (cap {MAX_OUTSTANDING_REMOVES}); cgroup.procs draining wedged or \
858                 churn loop outpacing the kernel's RCU grace period — bailing to \
859                 avoid unbounded cgroupfs accumulation"
860            );
861        }
862        let p = self.parent.join(name);
863        if !p.exists() {
864            return Ok(());
865        }
866        match self.remove_cgroup_inner(name, &p) {
867            Ok(()) => {
868                // Successful remove: decrement (saturating at 0 so a
869                // remove of a cgroup we never failed-to-remove does
870                // not underflow the counter into usize::MAX).
871                self.outstanding_removes
872                    .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |n| {
873                        Some(n.saturating_sub(1))
874                    })
875                    .ok();
876                Ok(())
877            }
878            Err(err) => {
879                self.outstanding_removes.fetch_add(1, Ordering::Relaxed);
880                Err(err)
881            }
882        }
883    }
884
885    /// Inner body of [`Self::remove_cgroup`] — exists so the public
886    /// method can wrap the unfreeze/drain/rmdir result in the
887    /// outstanding-counter bookkeeping without duplicating the
888    /// sequence in success and failure arms.
889    ///
890    /// Gates the pre-drain unfreeze on `cgroup.freeze` existence to
891    /// match [`cleanup_recursive`]'s same-file gate. `set_freeze`
892    /// goes through `fs::write` which CREATES the file when it does
893    /// not exist (open(O_WRONLY | O_CREAT | O_TRUNC)), so an
894    /// unconditional call would plant a stray 1-byte file under any
895    /// non-cgroupfs directory and cause the subsequent
896    /// `fs::remove_dir(p)` to fail with ENOTEMPTY. On a real cgroup
897    /// v2 tree the file is always present (cgroup-core, ungated by
898    /// controllers); on a legacy kernel without `CONFIG_CGROUP_FREEZE`
899    /// or on a non-cgroup directory entry the file is absent and the
900    /// unfreeze step is a no-op.
901    fn remove_cgroup_inner(&self, name: &str, p: &Path) -> Result<()> {
902        if p.join("cgroup.freeze").exists()
903            && let Err(err) = self.set_freeze(name, false)
904            && anyhow_first_io_errno(&err) != Some(libc::ENOENT)
905        {
906            tracing::warn!(
907                cgroup = name,
908                err = %format!("{err:#}"),
909                "remove_cgroup: pre-drain unfreeze failed; drain may strand frozen tasks at root"
910            );
911        }
912        self.drain_tasks(name)?;
913        // Wait for the kernel to reflect the empty state via
914        // cgroup.events `populated 0` (event-driven via inotify on
915        // the events file) before attempting rmdir. The legacy
916        // 50 ms blind sleep was a hopeful settle: too short under
917        // load (rmdir EBUSY) and too long on a quiet host (wasted
918        // tens of ms × every cgroup teardown). Falls through to
919        // rmdir on deadline so the caller still sees the same
920        // EBUSY error if the cgroup is genuinely stuck-populated;
921        // 1 s ceiling matches the prior pessimistic upper bound on
922        // a settling cgroup.
923        wait_for_cgroup_unpopulated(p, std::time::Duration::from_secs(1));
924        fs::remove_dir(p).with_context(|| format!("rmdir {}", p.display()))
925    }
926
927    /// Write `cpuset.cpus` for a child cgroup.
928    ///
929    /// On write failure, captures and emits a snapshot of the
930    /// cgroup-tree state at the moment of failure: the parent's
931    /// `cgroup.controllers` (controllers AVAILABLE to children),
932    /// the parent's `cgroup.subtree_control` (controllers ENABLED
933    /// for children), the child's `cgroup.controllers` (the
934    /// inheritance ROOT for children of the child), the
935    /// `cpuset.cpus` file's existence, and a directory listing of
936    /// the child cgroup's knob files. The capture lets a kernel /
937    /// hierarchy-state bug surface as a focused diagnostic instead
938    /// of a bare `EACCES` at the write site.
939    pub fn set_cpuset(&self, name: &str, cpus: &BTreeSet<usize>) -> Result<()> {
940        validate_cgroup_name(name)?;
941        let p = self.parent.join(name).join("cpuset.cpus");
942        match write_with_timeout(&p, &TestTopology::cpuset_string(cpus), CGROUP_WRITE_TIMEOUT) {
943            Ok(()) => Ok(()),
944            Err(e) => {
945                let snapshot = capture_cpuset_state(&self.parent, name);
946                Err(e.context(snapshot))
947            }
948        }
949    }
950
951    /// Enable `+cpuset` on `cgroup.subtree_control` for each ancestor
952    /// of the leaf in a nested cgroup path. For `"cg_0/narrow"`, writes
953    /// `+cpuset` to `{parent}/cgroup.subtree_control` and
954    /// `{parent}/cg_0/cgroup.subtree_control`. No-op for
955    /// single-component paths.
956    fn enable_subtree_cpuset(&self, name: &str) {
957        let components: Vec<&str> = name.split('/').collect();
958        if components.len() < 2 {
959            return;
960        }
961        let mut cur = self.parent.clone();
962        for c in &components[..components.len() - 1] {
963            let sc = cur.join("cgroup.subtree_control");
964            if sc.exists()
965                && let Err(e) = write_with_timeout(&sc, "+cpuset", CGROUP_WRITE_TIMEOUT)
966            {
967                tracing::warn!(path = %sc.display(), err = %e, "failed to enable cpuset");
968            }
969            cur = cur.join(c);
970        }
971        // Write at the last intermediate (direct parent of the leaf).
972        let sc = cur.join("cgroup.subtree_control");
973        if sc.exists()
974            && let Err(e) = write_with_timeout(&sc, "+cpuset", CGROUP_WRITE_TIMEOUT)
975        {
976            tracing::warn!(path = %sc.display(), err = %e, "failed to enable cpuset");
977        }
978    }
979
980    /// Clear `cpuset.cpus` for a child cgroup (empty string = inherit parent).
981    pub fn clear_cpuset(&self, name: &str) -> Result<()> {
982        validate_cgroup_name(name)?;
983        let p = self.parent.join(name).join("cpuset.cpus");
984        write_with_timeout(&p, "", CGROUP_WRITE_TIMEOUT).with_context(|| {
985            format!("cgroup '{name}': clear cpuset.cpus (write empty string for inherit-parent)")
986        })
987    }
988
989    /// Write `cpuset.mems` for a child cgroup. Constrains which NUMA
990    /// nodes the cgroup's tasks can allocate memory on.
991    ///
992    /// Shape mirrors `set_cpuset` exactly — [`TestTopology::cpuset_string`]
993    /// range-compact-formats the node set, `write_with_timeout` bounds
994    /// the filesystem-write at 2s. Used by `BuildSandbox` under the
995    /// `--cpu-cap` flow to bind build memory to the NUMA nodes hosting
996    /// the locked LLCs, avoiding cross-socket DRAM latency for gcc's
997    /// symbol tables and linker working sets.
998    ///
999    /// # Ordering contract
1000    ///
1001    /// Caller MUST have already called [`Self::set_cpuset`] (or
1002    /// equivalent direct write to `cpuset.cpus`) and — when running
1003    /// under a parent that may narrow the set — MUST have read back
1004    /// `cpuset.cpus.effective` to detect kernel-side narrowing
1005    /// BEFORE invoking this method. The per-knob ordering is
1006    /// load-bearing: `crate::vmm::cgroup_sandbox::BuildSandbox`
1007    /// interleaves `cpuset.cpus.effective` readback between the
1008    /// `cpuset.cpus` and `cpuset.mems` writes to abort on narrowing
1009    /// under the `--cpu-cap` hard-error contract; folding the two
1010    /// writes into a single helper would erase that gate.
1011    ///
1012    /// A cgroup whose `cpuset.cpus` is set should also have a
1013    /// non-empty `cpuset.mems.effective` before any task is migrated
1014    /// into it: the half-configured shape (cpus set locally, no
1015    /// nodemask anywhere up the hierarchy) is suspicious enough that
1016    /// the framework refuses it. The kernel itself does NOT
1017    /// SIGKILL on first allocation — `guarantee_online_mems`
1018    /// (`kernel/cgroup/cpuset.c`) walks UP via `parent_cs(cs)` until
1019    /// `effective_mems` intersects `node_states[N_MEMORY]`, and the
1020    /// top cpuset always has online memory, so the walk always finds
1021    /// a non-empty mask. The actual kernel behavior under a fully
1022    /// empty hierarchy is path-dependent (parent-walk fallback
1023    /// generally succeeds; degenerate states without any online
1024    /// memory may OOM). cgroup v2's `cpuset_can_attach_check` only
1025    /// rejects empty `effective_cpus`, not empty `effective_mems`.
1026    /// In cgroup v2, the local `cpuset.mems` file is normally empty
1027    /// (the cgroup inherits from its parent via `effective_mems`),
1028    /// so reading the local file alone would falsely flag every
1029    /// inheriting child. [`Self::move_task`] enforces the gate at
1030    /// runtime by reading the cgroup's `cpuset.cpus` and
1031    /// `cpuset.mems.effective` files before each migration and
1032    /// refusing the write if `cpuset.cpus` is non-empty while
1033    /// `cpuset.mems.effective` is empty — surfacing a focused
1034    /// error rather than letting a half-configured cgroup through
1035    /// to the kernel's path-dependent behavior.
1036    pub fn set_cpuset_mems(&self, name: &str, nodes: &BTreeSet<usize>) -> Result<()> {
1037        validate_cgroup_name(name)?;
1038        let p = self.parent.join(name).join("cpuset.mems");
1039        let nodes_str = TestTopology::cpuset_string(nodes);
1040        write_with_timeout(&p, &nodes_str, CGROUP_WRITE_TIMEOUT).with_context(|| {
1041            format!(
1042                "cgroup '{name}': set cpuset.mems='{nodes_str}' (requires +cpuset in parent cgroup.subtree_control)"
1043            )
1044        })
1045    }
1046
1047    /// Clear `cpuset.mems` for a child cgroup (empty string = inherit parent).
1048    /// Parallels `clear_cpuset`; callers use it only when tearing
1049    /// down a cpuset-restricted cgroup that needs to accept a
1050    /// fresh task binding with a different NUMA budget.
1051    pub fn clear_cpuset_mems(&self, name: &str) -> Result<()> {
1052        validate_cgroup_name(name)?;
1053        let p = self.parent.join(name).join("cpuset.mems");
1054        write_with_timeout(&p, "", CGROUP_WRITE_TIMEOUT).with_context(|| {
1055            format!("cgroup '{name}': clear cpuset.mems (write empty string for inherit-parent)")
1056        })
1057    }
1058
1059    /// Write `cpu.max` for a child cgroup. `quota_us = None` writes
1060    /// `"max <period_us>"` (no upper bound — same as a freshly
1061    /// created cgroup); `Some(q)` writes `"<q> <period_us>"`.
1062    ///
1063    /// Per the kernel's cgroup v2 docs ("Documentation/admin-guide/
1064    /// cgroup-v2.rst", "CPU Interface Files"): each period the
1065    /// cgroup gets `quota` microseconds of CPU time across its
1066    /// CPUs, and is throttled until the next period boundary once
1067    /// the quota is exhausted. `quota` MAY exceed `period` to let
1068    /// the cgroup use multiple CPUs concurrently (e.g. quota
1069    /// 200_000 / period 100_000 = up to 2 CPUs of throughput).
1070    ///
1071    /// Requires `+cpu` in the parent's `cgroup.subtree_control`;
1072    /// missing controller surfaces as ENOENT on the file (handled
1073    /// generically by `write_with_timeout`'s error path with the
1074    /// errno suffix).
1075    pub fn set_cpu_max(&self, name: &str, quota_us: Option<u64>, period_us: u64) -> Result<()> {
1076        validate_cgroup_name(name)?;
1077        let p = self.parent.join(name).join("cpu.max");
1078        let line = match quota_us {
1079            Some(q) => format!("{q} {period_us}"),
1080            None => format!("max {period_us}"),
1081        };
1082        write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1083            format!(
1084                "cgroup '{name}': set cpu.max='{line}' (requires +cpu in parent cgroup.subtree_control)"
1085            )
1086        })
1087    }
1088
1089    /// Write `cpu.weight` for a child cgroup (cgroup v2 weight,
1090    /// range 1..=10000, default 100). Used together with sibling
1091    /// cgroups to bias relative CPU share inside the parent's
1092    /// quota. Independent from `cpu.max` — weights govern share
1093    /// when CPU is contended, max enforces an absolute ceiling.
1094    ///
1095    /// Per "Documentation/admin-guide/cgroup-v2.rst" the legacy
1096    /// "shares" knob is `cpu.weight.nice` (mapped from nice value);
1097    /// this method targets the canonical `cpu.weight` knob.
1098    pub fn set_cpu_weight(&self, name: &str, weight: u32) -> Result<()> {
1099        validate_cgroup_name(name)?;
1100        let p = self.parent.join(name).join("cpu.weight");
1101        write_with_timeout(&p, &weight.to_string(), CGROUP_WRITE_TIMEOUT).with_context(|| {
1102            format!(
1103                "cgroup '{name}': set cpu.weight={weight} (requires +cpu in parent cgroup.subtree_control)"
1104            )
1105        })
1106    }
1107
1108    /// Write `memory.max` for a child cgroup. `bytes = None` writes
1109    /// `"max"` (no hard limit). When the cgroup's RSS exceeds the
1110    /// limit, the kernel OOM-kills tasks per the documented
1111    /// `memory.max` semantics. Requires `+memory` in the parent's
1112    /// `cgroup.subtree_control`.
1113    pub fn set_memory_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1114        validate_cgroup_name(name)?;
1115        let p = self.parent.join(name).join("memory.max");
1116        let line = match bytes {
1117            Some(b) => b.to_string(),
1118            None => "max".to_string(),
1119        };
1120        write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1121            format!(
1122                "cgroup '{name}': set memory.max='{line}' (requires +memory in parent cgroup.subtree_control)"
1123            )
1124        })
1125    }
1126
1127    /// Write `memory.high` for a child cgroup. `bytes = None`
1128    /// writes `"max"` (no high-water mark). Crossing the high
1129    /// threshold triggers reclaim throttling but NOT OOM-kill,
1130    /// distinguishing it from `memory.max`.
1131    pub fn set_memory_high(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1132        validate_cgroup_name(name)?;
1133        let p = self.parent.join(name).join("memory.high");
1134        let line = match bytes {
1135            Some(b) => b.to_string(),
1136            None => "max".to_string(),
1137        };
1138        write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1139            format!(
1140                "cgroup '{name}': set memory.high='{line}' (requires +memory in parent cgroup.subtree_control)"
1141            )
1142        })
1143    }
1144
1145    /// Write `memory.low` for a child cgroup. `bytes = None` writes
1146    /// `"0"` (no low-water protection). The kernel preferentially
1147    /// reclaims FROM other cgroups before reclaiming this cgroup's
1148    /// memory below `memory.low`; not a hard reservation.
1149    pub fn set_memory_low(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1150        validate_cgroup_name(name)?;
1151        let p = self.parent.join(name).join("memory.low");
1152        let line = match bytes {
1153            Some(b) => b.to_string(),
1154            None => "0".to_string(),
1155        };
1156        write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1157            format!(
1158                "cgroup '{name}': set memory.low='{line}' (requires +memory in parent cgroup.subtree_control)"
1159            )
1160        })
1161    }
1162
1163    /// Write `io.weight` for a child cgroup (cgroup v2 weight,
1164    /// range 1..=10000, default 100). Biases relative IO share
1165    /// across sibling cgroups when the io controller is enabled
1166    /// in the parent's `cgroup.subtree_control`. The kernel's BFQ
1167    /// or io.cost backend (whichever is active) applies the
1168    /// weight when contending devices are saturated.
1169    ///
1170    /// `io.max` (per-device throughput cap) is intentionally NOT
1171    /// surfaced here — the per-device interface needs major:minor
1172    /// device-id lookup which has no in-tree consumer; surface it
1173    /// when a concrete use case lands.
1174    pub fn set_io_weight(&self, name: &str, weight: u16) -> Result<()> {
1175        validate_cgroup_name(name)?;
1176        let p = self.parent.join(name).join("io.weight");
1177        write_with_timeout(&p, &weight.to_string(), CGROUP_WRITE_TIMEOUT).with_context(|| {
1178            format!(
1179                "cgroup '{name}': set io.weight={weight} (requires +io in parent cgroup.subtree_control)"
1180            )
1181        })
1182    }
1183
1184    /// Write `cgroup.freeze` for a child cgroup. `frozen = true` writes
1185    /// `"1"`, `frozen = false` writes `"0"`.
1186    ///
1187    /// `cgroup.freeze` is a cgroup-core file exposed on every non-root
1188    /// cgroup automatically — it is NOT gated by `cgroup.subtree_control`.
1189    /// The kernel's `cgroup_freeze_write` parses the value via
1190    /// `kstrtoint`, rejects anything outside `{0, 1}` with `-ERANGE`,
1191    /// and dispatches `cgroup_freeze(cgrp, freeze)`. Writing `1` to a
1192    /// cgroup containing tasks transitions every task in the subtree to
1193    /// the frozen state; writing `0` releases. The transition is
1194    /// asynchronous — `cgroup.events`'s `frozen` field reaches `1` once
1195    /// every task has parked.
1196    pub fn set_freeze(&self, name: &str, frozen: bool) -> Result<()> {
1197        validate_cgroup_name(name)?;
1198        let p = self.parent.join(name).join("cgroup.freeze");
1199        let line = if frozen { "1" } else { "0" };
1200        write_with_timeout(&p, line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1201            format!("cgroup '{name}': set cgroup.freeze='{line}' (cgroup-core file, no controller required)")
1202        })
1203    }
1204
1205    /// Write `pids.max` for a child cgroup. `max = None` writes `"max"`
1206    /// (the kernel's `PIDS_MAX_STR` sentinel for unlimited);
1207    /// `Some(n)` writes the decimal `n`.
1208    ///
1209    /// Per the kernel's `pids_max_write`: the parser short-circuits to
1210    /// the unlimited limit when `buf == PIDS_MAX_STR`; otherwise
1211    /// `kstrtoll(buf, 0, &limit)` parses a signed integer and rejects
1212    /// `< 0` or `>= PIDS_MAX` with `-EINVAL`. The update is atomic
1213    /// (`atomic64_set(&pids->limit, limit)`); existing tasks are NOT
1214    /// killed when the limit lands below the current task count — only
1215    /// future `fork()` / `clone()` calls are blocked.
1216    ///
1217    /// Requires `+pids` in the parent's `cgroup.subtree_control`;
1218    /// [`Self::setup`] enables it unconditionally so this write
1219    /// succeeds on every ktstr-managed cgroup tree.
1220    pub fn set_pids_max(&self, name: &str, max: Option<u64>) -> Result<()> {
1221        validate_cgroup_name(name)?;
1222        let p = self.parent.join(name).join("pids.max");
1223        let line = match max {
1224            Some(n) => n.to_string(),
1225            None => "max".to_string(),
1226        };
1227        write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1228            format!(
1229                "cgroup '{name}': set pids.max='{line}' (requires +pids in parent cgroup.subtree_control)"
1230            )
1231        })
1232    }
1233
1234    /// Write `memory.swap.max` for a child cgroup. `bytes = None` writes
1235    /// `"max"` (no swap cap); `Some(b)` writes the decimal byte count.
1236    ///
1237    /// Per the kernel's `swap_max_write`: the value is parsed via
1238    /// `page_counter_memparse(buf, "max", &max)`, which accepts the
1239    /// literal `"max"` token for unlimited or a numeric byte count.
1240    /// The store is `xchg(&memcg->swap.max, max)` — atomic, with no
1241    /// failure path beyond the parse.
1242    ///
1243    /// Requires `+memory` in the parent's `cgroup.subtree_control`;
1244    /// [`Self::setup`] enables it unconditionally.
1245    ///
1246    /// Requires CONFIG_SWAP=y in the test kernel. The file does not
1247    /// exist on swapless builds; the write returns ENOENT.
1248    pub fn set_memory_swap_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1249        validate_cgroup_name(name)?;
1250        let p = self.parent.join(name).join("memory.swap.max");
1251        let line = match bytes {
1252            Some(b) => b.to_string(),
1253            None => "max".to_string(),
1254        };
1255        write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1256            format!(
1257                "cgroup '{name}': set memory.swap.max='{line}' (requires +memory in parent cgroup.subtree_control; file absent on CONFIG_SWAP=n kernels)"
1258            )
1259        })
1260    }
1261
1262    /// Move a single task into a child cgroup via `cgroup.procs`.
1263    ///
1264    /// `move_task` is host-side scenario orchestration, never
1265    /// invoked from a vCPU thread, so the bare `fs::read_to_string`
1266    /// reads in `Self::check_cpuset_ordering` are not bounded by
1267    /// the freeze-rendezvous timeout. A wedged cgroupfs read here
1268    /// would stall the orchestrator thread, not a vCPU.
1269    ///
1270    /// # cpuset ordering gate
1271    ///
1272    /// Before issuing the `cgroup.procs` write, the method reads the
1273    /// destination's `cpuset.cpus` (the local-write knob the caller
1274    /// either set or did not) and `cpuset.mems.effective` (the
1275    /// kernel's effective view, inheritance-aware). The gate
1276    /// refuses migrations into a cgroup whose `cpuset.cpus` is set
1277    /// but `cpuset.mems.effective` reads empty — a half-configured
1278    /// state we surface as a focused error rather than letting it
1279    /// through to the kernel.
1280    ///
1281    /// The kernel's behavior on the half-configured shape is
1282    /// path-dependent: `guarantee_online_mems`
1283    /// (`kernel/cgroup/cpuset.c`) walks UP via `parent_cs(cs)`
1284    /// until `effective_mems` intersects `node_states[N_MEMORY]`,
1285    /// and the top cpuset always has online memory, so the walk
1286    /// generally succeeds; the empty-nodemask OOM path is reachable
1287    /// only in degenerate hierarchies. cgroup v2's
1288    /// `cpuset_can_attach_check` rejects only empty `effective_cpus`
1289    /// (not empty `effective_mems`), so a v2 attach into a cgroup
1290    /// with empty `effective_mems` is not a hard kernel error
1291    /// either. The framework refuses the migration anyway because
1292    /// the half-configured shape almost always reflects a missing
1293    /// [`Self::set_cpuset_mems`] call; surfacing it directly is
1294    /// more debuggable than letting it become whatever the kernel
1295    /// happens to do on this particular hierarchy.
1296    ///
1297    /// # Why `cpuset.mems.effective`, not `cpuset.mems`
1298    ///
1299    /// In cgroup v2, the local `cpuset.mems` file echoes
1300    /// `cs->mems_allowed` — the LOCAL nodemask, which is empty by
1301    /// default until the caller explicitly writes it. The kernel's
1302    /// allocation path uses `cs->effective_mems` instead, which
1303    /// inherits from the parent when the local mask is empty (per
1304    /// `cpuset_common_seq_show`'s FILE_EFFECTIVE_MEMLIST branch and
1305    /// `guarantee_online_mems`'s `parent_cs(cs)` walk). A gate that
1306    /// reads the local file would falsely flag every inheriting
1307    /// child as half-configured even though the kernel sees a
1308    /// perfectly valid `effective_mems` from the parent. The
1309    /// effective view captures both "this cgroup wrote `cpuset.mems`
1310    /// directly" and "this cgroup inherits a non-empty mask from
1311    /// its parent" without false positives.
1312    ///
1313    /// Both reads are best-effort — a cgroup without cpuset
1314    /// controllers (`cpuset.cpus` does not exist) bypasses the
1315    /// gate, matching the kernel's "no cpuset constraints to
1316    /// enforce" path. Read errors on either knob are absorbed: the
1317    /// gate exists to catch the configured-but-half-configured
1318    /// shape, not to fight cgroupfs read failures. If
1319    /// `cpuset.mems.effective` cannot be read for any reason, the
1320    /// gate degrades to "accept" — it cannot make a sound decision
1321    /// without the kernel's effective view.
1322    pub fn move_task(&self, name: &str, pid: libc::pid_t) -> Result<()> {
1323        validate_cgroup_name(name)?;
1324        self.check_cpuset_ordering(name)?;
1325        let p = self.parent.join(name).join("cgroup.procs");
1326        write_with_timeout(&p, &pid.to_string(), CGROUP_WRITE_TIMEOUT)
1327    }
1328
1329    /// Verify that a cgroup's `cpuset.cpus` /
1330    /// `cpuset.mems.effective` are in a consistent state before
1331    /// admitting a task migration into it.
1332    ///
1333    /// Returns `Err` only when the destination has `cpuset.cpus`
1334    /// non-empty AND `cpuset.mems.effective` reads empty — a
1335    /// half-configured shape we surface as a focused error rather
1336    /// than letting through. The kernel's behavior in this state is
1337    /// path-dependent: `guarantee_online_mems` (`kernel/cgroup/
1338    /// cpuset.c`) walks UP via `parent_cs(cs)` until effective_mems
1339    /// intersects `node_states[N_MEMORY]` and the top cpuset always
1340    /// has online memory, so the parent-walk fallback usually
1341    /// succeeds; degenerate hierarchies may OOM. cgroup v2's
1342    /// `cpuset_can_attach_check` rejects only empty `effective_cpus`,
1343    /// not empty `effective_mems`. All other shapes (no cpuset
1344    /// controller, local cpus empty, effective mems non-empty
1345    /// whether locally written or parent-inherited) are accepted.
1346    ///
1347    /// Read failures on either knob are absorbed (the gate degrades
1348    /// to "accept" rather than blocking on any cgroupfs read
1349    /// error). The effective-view file is the source of truth
1350    /// because in cgroup v2 the local `cpuset.mems` is normally
1351    /// empty (the cgroup inherits from its parent via
1352    /// `effective_mems`); reading the local file would emit false
1353    /// positives for every child that inherits a parent's NUMA
1354    /// budget without writing its own.
1355    fn check_cpuset_ordering(&self, name: &str) -> Result<()> {
1356        let cpus_path = self.parent.join(name).join("cpuset.cpus");
1357        let mems_effective_path = self.parent.join(name).join("cpuset.mems.effective");
1358        let cpus = match fs::read_to_string(&cpus_path) {
1359            Ok(s) => s,
1360            Err(_) => return Ok(()),
1361        };
1362        // `cpuset.cpus` is empty when the cgroup inherits from its
1363        // parent — no constraint imposed locally, so the
1364        // `cpuset.mems` invariant doesn't apply.
1365        if cpus.trim().is_empty() {
1366            return Ok(());
1367        }
1368        let mems_effective = match fs::read_to_string(&mems_effective_path) {
1369            Ok(s) => s,
1370            Err(_) => return Ok(()),
1371        };
1372        if mems_effective.trim().is_empty() {
1373            bail!(
1374                "move_task into '{name}' refused: cpuset.cpus is set ({}) \
1375                 but cpuset.mems.effective reads empty — half-configured \
1376                 cgroup. The kernel's behavior here is path-dependent \
1377                 (guarantee_online_mems walks up to find a non-empty \
1378                 ancestor mask; the empty-nodemask OOM path is reachable \
1379                 only in degenerate hierarchies), but the framework \
1380                 surfaces a focused error rather than letting the \
1381                 migration through. Call set_cpuset_mems on this cgroup \
1382                 or widen an ancestor's cpuset.mems before move_task",
1383                cpus.trim(),
1384            );
1385        }
1386        Ok(())
1387    }
1388
1389    /// Write `child_pid` to `<cgroup_name>/cgroup.procs` during the
1390    /// payload-spawn cgroup-sync handshake.
1391    ///
1392    /// Distinct from [`Self::move_task`]: this is the
1393    /// placement-before-exec write that runs while the child is
1394    /// paused in pre_exec between `fork(2)` and `execve(2)`. The
1395    /// `move_task` cpuset-ordering gate does NOT apply here —
1396    /// placement runs before cpuset is finalised at scenario setup
1397    /// time, and the gate would reject otherwise-valid spawn
1398    /// requests. Callers that need the gate (post-spawn migration)
1399    /// invoke [`Self::move_task`] / [`Self::move_tasks`] instead.
1400    ///
1401    /// Uses the same `write_with_timeout` shape as the other
1402    /// `cgroup.procs` write sites so a wedged cgroupfs is bounded
1403    /// to `CGROUP_WRITE_TIMEOUT` rather than blocking the parent
1404    /// indefinitely.
1405    pub fn place_task_during_handshake(
1406        &self,
1407        cgroup_name: &str,
1408        child_pid: libc::pid_t,
1409    ) -> Result<()> {
1410        validate_cgroup_name(cgroup_name)?;
1411        let cgroup_procs_path = self.parent.join(cgroup_name).join("cgroup.procs");
1412        let line = format!("{child_pid}\n");
1413        write_with_timeout(&cgroup_procs_path, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1414            format!(
1415                "place pid {child_pid} into cgroup '{cgroup_name}' via {} during cgroup-sync handshake",
1416                cgroup_procs_path.display(),
1417            )
1418        })
1419    }
1420
1421    /// Move multiple tasks into a child cgroup by PID.
1422    ///
1423    /// Tolerates per-pid ESRCH (a task that exited between the listing
1424    /// snapshot and the migration write) and logs a warn for each
1425    /// vanished pid — partial migration is a legitimate outcome when
1426    /// one of N workers has voluntarily exited. Retries EBUSY up to
1427    /// 3 times with 100ms backoff for transient rejections from
1428    /// sched_ext BPF `cgroup_prep_move` callbacks
1429    /// (`scx_cgroup_can_attach`). Propagates EBUSY after retries
1430    /// exhausted. Propagates all other errors immediately.
1431    ///
1432    /// # All-vanished bail
1433    ///
1434    /// When `pids` is non-empty AND every supplied pid ESRCH'd, this
1435    /// fn bails with an actionable diagnostic rather than silently
1436    /// returning Ok. The silent-Ok path violates the project's
1437    /// no-silent-drops rule (any data loss must fail loudly):
1438    /// a downstream consumer reading the destination
1439    /// `cgroup.procs` would see 0 pids and have no idea whether
1440    /// the migration was supposed to move 0 or N — masking a real
1441    /// test-setup regression (e.g. `WorkloadHandle::spawn` child
1442    /// pre_exec init-panic cascade that killed every paused worker
1443    /// before move_tasks ran) behind a downstream-state empty-read.
1444    ///
1445    /// A test that LEGITIMATELY moves only already-exited workers
1446    /// (post-Drop diagnostic, post-mortem capture) should pass an
1447    /// empty `pids` slice rather than calling with non-empty + all
1448    /// pre-vanished — the empty-slice path is the documented "no
1449    /// move requested" form that returns Ok cleanly.
1450    pub fn move_tasks(&self, name: &str, pids: &[libc::pid_t]) -> Result<()> {
1451        validate_cgroup_name(name)?;
1452        move_tasks_inner(name, pids, |n, pid| self.move_task_with_retry(n, pid))
1453    }
1454
1455    /// Move a single task with bounded EBUSY retry.
1456    fn move_task_with_retry(&self, name: &str, pid: libc::pid_t) -> Result<()> {
1457        const MAX_RETRIES: u32 = 3;
1458        const RETRY_DELAY: Duration = Duration::from_millis(100);
1459
1460        for attempt in 0..MAX_RETRIES {
1461            match self.move_task(name, pid) {
1462                Ok(()) => return Ok(()),
1463                Err(e) if is_ebusy(&e) && attempt + 1 < MAX_RETRIES => {
1464                    tracing::debug!(
1465                        pid,
1466                        cgroup = name,
1467                        attempt = attempt + 1,
1468                        "EBUSY on cgroup.procs write, retrying"
1469                    );
1470                    std::thread::sleep(RETRY_DELAY);
1471                }
1472                Err(e) => return Err(e),
1473            }
1474        }
1475        unreachable!()
1476    }
1477
1478    /// Clear `subtree_control` on a child cgroup by writing an empty
1479    /// string. Disables all controllers for the cgroup's children.
1480    ///
1481    /// Required before moving tasks into a cgroup that has
1482    /// `subtree_control` set: the kernel's no-internal-process
1483    /// constraint (`cgroup_migrate_vet_dst`) returns EBUSY when
1484    /// tasks are written to `cgroup.procs` of a cgroup with
1485    /// controllers in `subtree_control`.
1486    pub fn clear_subtree_control(&self, name: &str) -> Result<()> {
1487        validate_cgroup_name(name)?;
1488        let p = self.parent.join(name).join("cgroup.subtree_control");
1489        if !p.exists() {
1490            return Ok(());
1491        }
1492        // Read current controllers and disable each one.
1493        let content = fs::read_to_string(&p).with_context(|| format!("read {}", p.display()))?;
1494        let content = content.trim();
1495        if content.is_empty() {
1496            return Ok(());
1497        }
1498        // Each controller name needs a "-" prefix to disable.
1499        let disable: Vec<String> = content
1500            .split_whitespace()
1501            .map(|c| format!("-{c}"))
1502            .collect();
1503        let disable_str = disable.join(" ");
1504        write_with_timeout(&p, &disable_str, CGROUP_WRITE_TIMEOUT)
1505            .with_context(|| format!("clear subtree_control on {name}"))
1506    }
1507
1508    /// Move all tasks from a child cgroup to the walk-root cgroup.
1509    ///
1510    /// Drains to `{self.walk_root}/cgroup.procs` instead of the
1511    /// parent because the parent has `subtree_control` set (enabling
1512    /// cpuset for children), and the kernel's no-internal-process
1513    /// constraint rejects writes to `cgroup.procs` when
1514    /// `subtree_control` is active. The walk-root cgroup is the
1515    /// uppermost cgroup the operator can write to without crossing
1516    /// the delegation boundary; under Mode A it is the canonical
1517    /// `/sys/fs/cgroup` root (exempt from the no-internal-process
1518    /// constraint), under Mode B/C it is the delegated subtree root
1519    /// (which also has procs-writability inside the delegation).
1520    pub fn drain_tasks(&self, name: &str) -> Result<()> {
1521        validate_cgroup_name(name)?;
1522        let src = self.parent.join(name).join("cgroup.procs");
1523        if !src.exists() {
1524            return Ok(());
1525        }
1526        let dst = self.walk_root.join("cgroup.procs");
1527        drain_pids_to_root(&src, &dst, name);
1528        Ok(())
1529    }
1530
1531    /// Read `cgroup.procs` of `name`, returning the thread-group
1532    /// leaders (PIDs) currently in the cgroup.
1533    ///
1534    /// Distinct from [`Self::drain_tasks`]:
1535    /// - `drain_tasks` MIGRATES tasks to the walk-root and treats a
1536    ///   missing `cgroup.procs` file as a no-op (`Ok(())`) so
1537    ///   best-effort teardown of an already-rmdir'd cgroup is safe.
1538    /// - `read_procs` is a READ accessor for assertions
1539    ///   ([`Op::CaptureCgroupProcs`](crate::scenario::ops::Op::CaptureCgroupProcs)
1540    ///   and direct callers). A missing `cgroup.procs` file is a
1541    ///   real error (cgroup doesn't exist, typo'd name, race with
1542    ///   teardown) — propagating it lets the caller distinguish
1543    ///   "empty cgroup" from "no such cgroup."
1544    ///
1545    /// # Semantics
1546    ///
1547    /// - Returns thread-group leaders (PIDs / TGIDs) as the kernel
1548    ///   exposes them via `cgroup_procs_show` in `kernel/cgroup/cgroup.c`.
1549    ///   For per-thread TIDs the kernel exposes `cgroup.threads`; this
1550    ///   method reads ONLY `cgroup.procs`.
1551    /// - Non-atomic snapshot as exposed by the kernel's pidlist
1552    ///   iteration (`cgroup_procs_show` / `css_task_iter_next` in
1553    ///   `kernel/cgroup/cgroup.c`): the kernel walks the css_set's
1554    ///   task list one entry at a time, so a task that joins or exits
1555    ///   mid-read can appear in the next read but not this one (or
1556    ///   vice versa). The userspace `fs::read_to_string` here returns
1557    ///   when seq_file signals EOF; the per-pid atomicity is a kernel
1558    ///   property, not an impl one. Callers asserting on membership
1559    ///   of a stable task set (e.g. SpinWait workers spawned in the
1560    ///   prior op) are unaffected.
1561    /// - Empty cgroup: returns `Ok(Vec::new())` (kernel emits an
1562    ///   empty file, not an error). Lets callers distinguish "no
1563    ///   tasks" from "no such cgroup."
1564    /// - Malformed pid lines: skipped with a `tracing::warn!`
1565    ///   naming the offending line, matching
1566    ///   `drain_pids_to_root`'s tolerance. The kernel never emits
1567    ///   such lines today; the tolerance exists so a future kernel
1568    ///   gaining a header or comment line surfaces as a warn
1569    ///   instead of an opaque parse error.
1570    pub fn read_procs(&self, name: &str) -> Result<Vec<libc::pid_t>> {
1571        validate_cgroup_name(name)?;
1572        let procs_path = self.parent.join(name).join("cgroup.procs");
1573        let content = fs::read_to_string(&procs_path).with_context(|| {
1574            format!(
1575                "read cgroup.procs from '{}' (cgroup name '{name}'); the cgroup may not \
1576                 exist or may have been removed (check that `Op::AddCgroup(name)` or a \
1577                 `CgroupDef` covers this name, and that the test's `workload_root_cgroup` \
1578                 is correct)",
1579                procs_path.display(),
1580            )
1581        })?;
1582        let mut pids = Vec::new();
1583        for line in content.lines() {
1584            let trimmed = line.trim();
1585            if trimmed.is_empty() {
1586                continue;
1587            }
1588            match trimmed.parse::<libc::pid_t>() {
1589                Ok(pid) => pids.push(pid),
1590                Err(e) => {
1591                    tracing::warn!(
1592                        path = %procs_path.display(),
1593                        cgroup = name,
1594                        line = trimmed,
1595                        err = %e,
1596                        "read_procs: malformed pid line; skipping",
1597                    );
1598                }
1599            }
1600        }
1601        Ok(pids)
1602    }
1603
1604    /// Remove all child cgroups under the parent (keeps the parent itself).
1605    ///
1606    /// Returns `Ok` even when individual filesystem probes fail; callers
1607    /// treat cleanup as best-effort teardown (see the runner's warn-
1608    /// and-continue in `src/runner.rs`). Per-entry `read_dir` /
1609    /// `DirEntry` / `file_type` errors are surfaced via
1610    /// `tracing::warn!` — mirrors `CgroupGroup::drop` so a failure
1611    /// shows up in logs instead of silently leaving children behind.
1612    ///
1613    /// # Outer-read_dir failure semantic
1614    ///
1615    /// When `read_dir(self.parent)` itself fails — e.g. the parent
1616    /// directory is unreadable, the cgroup mount has been unmounted
1617    /// out from under us, or a stat-side IO error fires — the
1618    /// failure is surfaced via `tracing::warn!` and the function
1619    /// still returns `Ok(())`. The deliberate semantic here is
1620    /// "teardown that observes a hostile filesystem state must
1621    /// not block scenario completion": a hard `Err` would propagate
1622    /// up through the runner's teardown and abort the whole test
1623    /// run on a transient cgroupfs failure that the operator can
1624    /// follow up on by reading the warn line.
1625    ///
1626    /// Production callers (the runner's drop path, scenario teardown)
1627    /// already log-and-continue on `cleanup_all` errors, so the
1628    /// always-Ok return is consistent with how every consumer
1629    /// already treats the result. Operators who need to detect
1630    /// teardown leakage should grep `tracing` output for
1631    /// `"cleanup_all: read_dir failed"` rather than relying on a
1632    /// non-zero exit; the warn includes both the offending path and
1633    /// the underlying io::Error.
1634    pub fn cleanup_all(&self) -> Result<()> {
1635        if !self.parent.exists() {
1636            return Ok(());
1637        }
1638        let walk_root = self.walk_root.clone();
1639        if let Err(err) = for_each_child_dir(&self.parent, "cleanup_all", |p| {
1640            cleanup_recursive(p, &walk_root)
1641        }) {
1642            tracing::warn!(
1643                parent = %self.parent.display(),
1644                err = %err,
1645                "cleanup_all: read_dir failed; child cgroups may remain under parent",
1646            );
1647        }
1648        Ok(())
1649    }
1650}
1651
1652/// Abstraction over the cgroup v2 filesystem surface used by the
1653/// scenario runtime. The production implementation is [`CgroupManager`],
1654/// which translates each method into real writes under `/sys/fs/cgroup`.
1655///
1656/// Extracted so `scenario::ops::apply_setup` and related orchestration
1657/// code can be unit-tested against an in-memory double: tests construct
1658/// a recording or failure-injecting implementor, drive `apply_setup`
1659/// against it, and assert on the recorded call sequence without
1660/// touching the host cgroup hierarchy.
1661///
1662/// Object-safe by design — scenario code holds the trait object behind
1663/// `&dyn CgroupOps` rather than being generic. Callers keep writing
1664/// `ctx.cgroups.set_cpuset(...)` with no syntactic change; dynamic
1665/// dispatch resolves to `CgroupManager` in production and to the
1666/// test double under `#[cfg(test)]`. The per-call indirect-call cost
1667/// is dominated by the filesystem I/O the trait abstracts over.
1668pub trait CgroupOps {
1669    /// Path to the parent cgroup directory. See
1670    /// [`CgroupManager::parent_path`].
1671    fn parent_path(&self) -> &Path;
1672    /// Create the parent directory and enable controllers. See
1673    /// [`CgroupManager::setup`].
1674    fn setup(&self, controllers: &BTreeSet<Controller>) -> Result<()>;
1675    /// Create a child cgroup. See [`CgroupManager::create_cgroup`].
1676    fn create_cgroup(&self, name: &str) -> Result<()>;
1677    /// Drain and remove a child cgroup. See
1678    /// [`CgroupManager::remove_cgroup`].
1679    fn remove_cgroup(&self, name: &str) -> Result<()>;
1680    /// Write `cpuset.cpus`. See [`CgroupManager::set_cpuset`].
1681    fn set_cpuset(&self, name: &str, cpus: &BTreeSet<usize>) -> Result<()>;
1682    /// Clear `cpuset.cpus` (inherit from parent). See
1683    /// [`CgroupManager::clear_cpuset`].
1684    fn clear_cpuset(&self, name: &str) -> Result<()>;
1685    /// Write `cpuset.mems`. See [`CgroupManager::set_cpuset_mems`].
1686    fn set_cpuset_mems(&self, name: &str, nodes: &BTreeSet<usize>) -> Result<()>;
1687    /// Clear `cpuset.mems` (inherit from parent). See
1688    /// [`CgroupManager::clear_cpuset_mems`].
1689    fn clear_cpuset_mems(&self, name: &str) -> Result<()>;
1690    /// Write `cpu.max`. See [`CgroupManager::set_cpu_max`].
1691    fn set_cpu_max(&self, name: &str, quota_us: Option<u64>, period_us: u64) -> Result<()>;
1692    /// Write `cpu.weight`. See [`CgroupManager::set_cpu_weight`].
1693    fn set_cpu_weight(&self, name: &str, weight: u32) -> Result<()>;
1694    /// Write `memory.max`. See [`CgroupManager::set_memory_max`].
1695    fn set_memory_max(&self, name: &str, bytes: Option<u64>) -> Result<()>;
1696    /// Write `memory.high`. See [`CgroupManager::set_memory_high`].
1697    fn set_memory_high(&self, name: &str, bytes: Option<u64>) -> Result<()>;
1698    /// Write `memory.low`. See [`CgroupManager::set_memory_low`].
1699    fn set_memory_low(&self, name: &str, bytes: Option<u64>) -> Result<()>;
1700    /// Write `io.weight`. See [`CgroupManager::set_io_weight`].
1701    fn set_io_weight(&self, name: &str, weight: u16) -> Result<()>;
1702    /// Write `cgroup.freeze`. See [`CgroupManager::set_freeze`].
1703    fn set_freeze(&self, name: &str, frozen: bool) -> Result<()>;
1704    /// Write `pids.max`. See [`CgroupManager::set_pids_max`].
1705    fn set_pids_max(&self, name: &str, max: Option<u64>) -> Result<()>;
1706    /// Write `memory.swap.max`. See
1707    /// [`CgroupManager::set_memory_swap_max`].
1708    fn set_memory_swap_max(&self, name: &str, bytes: Option<u64>) -> Result<()>;
1709    /// Move a single task via `cgroup.procs`. See
1710    /// [`CgroupManager::move_task`].
1711    fn move_task(&self, name: &str, pid: libc::pid_t) -> Result<()>;
1712    /// Move multiple tasks (tolerates ESRCH, retries EBUSY). See
1713    /// [`CgroupManager::move_tasks`].
1714    fn move_tasks(&self, name: &str, pids: &[libc::pid_t]) -> Result<()>;
1715    /// Place a single task into a child cgroup's `cgroup.procs`
1716    /// during the payload-spawn cgroup-sync handshake.
1717    ///
1718    /// Distinct from [`Self::move_task`] / [`Self::move_tasks`]:
1719    /// those run post-spawn for synthetic workers whose pids are
1720    /// already in their final cgroup-permissive state. This method
1721    /// runs INSIDE the two-pipe handshake between the child's
1722    /// pre_exec pid-notify and the parent's release-signal write,
1723    /// when the child is paused between `fork(2)` and `execve(2)`.
1724    /// The write MUST land BEFORE the release byte so the child's
1725    /// `execve` lands in the destination cgroup — this is the
1726    /// placement-before-exec invariant required to keep tasks like
1727    /// `Op::RunPayload { cgroup: Some(name), ... }` from briefly
1728    /// inheriting the parent's cgroup at exec time.
1729    ///
1730    /// # Caller contract
1731    ///
1732    /// - MUST be invoked exactly once during the handshake between
1733    ///   pid-notify and release-signal.
1734    /// - Failure MUST propagate to the caller, which is responsible
1735    ///   for dropping the release pipe to unblock the child with
1736    ///   EOF so it bails out of pre_exec rather than execve'ing
1737    ///   into an unspecified cgroup.
1738    /// - The `cgroup_name` argument is the user-facing name the
1739    ///   test author passed in `Op::RunPayload { cgroup: Some(name),
1740    ///   ... }` or `PayloadRun::in_cgroup(name)` — NOT a derived
1741    ///   absolute path. The implementation derives the
1742    ///   `cgroup.procs` path from this name plus its own
1743    ///   parent-path knowledge.
1744    ///
1745    /// See [`CgroupManager::place_task_during_handshake`].
1746    fn place_task_during_handshake(&self, cgroup_name: &str, child_pid: libc::pid_t) -> Result<()>;
1747    /// Clear `cgroup.subtree_control` on a child. See
1748    /// [`CgroupManager::clear_subtree_control`].
1749    fn clear_subtree_control(&self, name: &str) -> Result<()>;
1750    /// Drain tasks from a child to the cgroup root. See
1751    /// [`CgroupManager::drain_tasks`].
1752    fn drain_tasks(&self, name: &str) -> Result<()>;
1753    /// Read `cgroup.procs` of a child, returning thread-group leaders.
1754    /// See [`CgroupManager::read_procs`].
1755    fn read_procs(&self, name: &str) -> Result<Vec<libc::pid_t>>;
1756    /// Remove all child cgroups under the parent. See
1757    /// [`CgroupManager::cleanup_all`].
1758    fn cleanup_all(&self) -> Result<()>;
1759}
1760
1761// Thin forwarding trait impl: inherent `CgroupManager` methods hold the
1762// real bodies; this trait impl exists so scenario code can hold
1763// `&dyn CgroupOps` for test-double injection without threading a generic
1764// through every caller. Trait default methods cannot access the private
1765// fields, and macro-generated delegation would lose Go-To-Definition.
1766impl CgroupOps for CgroupManager {
1767    fn parent_path(&self) -> &Path {
1768        CgroupManager::parent_path(self)
1769    }
1770    fn setup(&self, controllers: &BTreeSet<Controller>) -> Result<()> {
1771        CgroupManager::setup(self, controllers)
1772    }
1773    fn create_cgroup(&self, name: &str) -> Result<()> {
1774        CgroupManager::create_cgroup(self, name)
1775    }
1776    fn remove_cgroup(&self, name: &str) -> Result<()> {
1777        CgroupManager::remove_cgroup(self, name)
1778    }
1779    fn set_cpuset(&self, name: &str, cpus: &BTreeSet<usize>) -> Result<()> {
1780        CgroupManager::set_cpuset(self, name, cpus)
1781    }
1782    fn clear_cpuset(&self, name: &str) -> Result<()> {
1783        CgroupManager::clear_cpuset(self, name)
1784    }
1785    fn set_cpuset_mems(&self, name: &str, nodes: &BTreeSet<usize>) -> Result<()> {
1786        CgroupManager::set_cpuset_mems(self, name, nodes)
1787    }
1788    fn clear_cpuset_mems(&self, name: &str) -> Result<()> {
1789        CgroupManager::clear_cpuset_mems(self, name)
1790    }
1791    fn set_cpu_max(&self, name: &str, quota_us: Option<u64>, period_us: u64) -> Result<()> {
1792        CgroupManager::set_cpu_max(self, name, quota_us, period_us)
1793    }
1794    fn set_cpu_weight(&self, name: &str, weight: u32) -> Result<()> {
1795        CgroupManager::set_cpu_weight(self, name, weight)
1796    }
1797    fn set_memory_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1798        CgroupManager::set_memory_max(self, name, bytes)
1799    }
1800    fn set_memory_high(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1801        CgroupManager::set_memory_high(self, name, bytes)
1802    }
1803    fn set_memory_low(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1804        CgroupManager::set_memory_low(self, name, bytes)
1805    }
1806    fn set_io_weight(&self, name: &str, weight: u16) -> Result<()> {
1807        CgroupManager::set_io_weight(self, name, weight)
1808    }
1809    fn set_freeze(&self, name: &str, frozen: bool) -> Result<()> {
1810        CgroupManager::set_freeze(self, name, frozen)
1811    }
1812    fn set_pids_max(&self, name: &str, max: Option<u64>) -> Result<()> {
1813        CgroupManager::set_pids_max(self, name, max)
1814    }
1815    fn set_memory_swap_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1816        CgroupManager::set_memory_swap_max(self, name, bytes)
1817    }
1818    fn move_task(&self, name: &str, pid: libc::pid_t) -> Result<()> {
1819        CgroupManager::move_task(self, name, pid)
1820    }
1821    fn move_tasks(&self, name: &str, pids: &[libc::pid_t]) -> Result<()> {
1822        CgroupManager::move_tasks(self, name, pids)
1823    }
1824    fn place_task_during_handshake(&self, cgroup_name: &str, child_pid: libc::pid_t) -> Result<()> {
1825        CgroupManager::place_task_during_handshake(self, cgroup_name, child_pid)
1826    }
1827    fn clear_subtree_control(&self, name: &str) -> Result<()> {
1828        CgroupManager::clear_subtree_control(self, name)
1829    }
1830    fn drain_tasks(&self, name: &str) -> Result<()> {
1831        CgroupManager::drain_tasks(self, name)
1832    }
1833    fn read_procs(&self, name: &str) -> Result<Vec<libc::pid_t>> {
1834        CgroupManager::read_procs(self, name)
1835    }
1836    fn cleanup_all(&self) -> Result<()> {
        // Forwards to the path-qualified `CgroupManager::cleanup_all` with
        // no extra arguments and returns its `Result` as-is.
1837        CgroupManager::cleanup_all(self)
1838    }
1839}
1840
1841/// Block until the cgroup at `cgroup_dir` reports `populated 0` via
1842/// its `cgroup.events` file, or until `budget` elapses. Event-driven
1843/// via inotify(IN_MODIFY) on the events file so the wait wakes on
1844/// the actual kernel state-transition write rather than a blind
1845/// sleep. Callers use the return value to decide whether to proceed
1846/// (cgroup empty — rmdir will succeed) or to fall through and let
1847/// the subsequent rmdir surface EBUSY for a genuinely-stuck cgroup.
1848///
1849/// Best-effort: a missing `cgroup.events` file (legacy kernels
1850/// without cgroup v2 events, non-cgroupfs paths threaded into this
1851/// helper by a test fixture, races where the parent dir was already
1852/// removed) returns `false` without waiting — the caller falls
1853/// through to its rmdir attempt which will surface the actual
1854/// error. inotify_init / add_watch failures degrade silently to a
1855/// short blocking sleep for the remaining budget.
1856fn wait_for_cgroup_unpopulated(cgroup_dir: &Path, budget: std::time::Duration) -> bool {
1857    use nix::poll::{PollFd, PollFlags, PollTimeout, poll};
1858    use nix::sys::inotify::{AddWatchFlags, InitFlags, Inotify};
1859    use std::os::unix::io::AsFd;
1860
1861    let events_path = cgroup_dir.join("cgroup.events");
1862    // Tight initial check so a cgroup that's already empty
1863    // (extremely common — most drain_tasks call sites finish
1864    // synchronously) returns immediately without setting up inotify
1865    // or sleeping.
1866    if cgroup_events_reports_unpopulated(&events_path) {
1867        return true;
1868    }
1869    let deadline = std::time::Instant::now() + budget;
1870    // Inotify on the events file. IN_MODIFY fires every time the
1871    // kernel updates the populated count (1 → 0 transition included).
1872    // IN_NONBLOCK so read_events returns EAGAIN when empty — we
1873    // drive wake-vs-timeout via poll(2).
1874    let inotify_result =
1875        Inotify::init(InitFlags::IN_CLOEXEC | InitFlags::IN_NONBLOCK).and_then(|i| {
1876            i.add_watch(&events_path, AddWatchFlags::IN_MODIFY)?;
1877            Ok(i)
1878        });
1879    loop {
1880        if cgroup_events_reports_unpopulated(&events_path) {
1881            return true;
1882        }
1883        let now = std::time::Instant::now();
1884        if now >= deadline {
1885            return false;
1886        }
1887        let remaining_ms = deadline
1888            .duration_since(now)
1889            .as_millis()
1890            .min(u16::MAX as u128) as u16;
1891        match inotify_result.as_ref() {
1892            Ok(inotify) => {
1893                let fd = inotify.as_fd();
1894                let mut pollfds = [PollFd::new(fd, PollFlags::POLLIN)];
1895                let _ = poll(&mut pollfds, PollTimeout::from(remaining_ms));
1896                let _ = inotify.read_events();
1897            }
1898            Err(_) => {
1899                // Inotify unavailable on this path (legacy kernel,
1900                // missing events file, transient race). Fall back
1901                // to a brief blocking sleep so the loop still makes
1902                // progress under the deadline.
1903                std::thread::sleep(
1904                    std::time::Duration::from_millis(10).min(deadline.duration_since(now)),
1905                );
1906            }
1907        }
1908    }
1909}
1910
/// Report whether the `cgroup.events` file at `events_path` currently
/// carries a `populated 0` record.
///
/// A line counts only when it splits on whitespace into exactly the
/// two fields `populated` and `0`. An unreadable file, a
/// `populated 1` record, or no `populated` record at all yields
/// `false`, which lets the caller keep waiting. The file is a short
/// flat key/value listing, so it is simply re-read in full on every
/// call rather than parsed incrementally.
fn cgroup_events_reports_unpopulated(events_path: &Path) -> bool {
    let Ok(contents) = fs::read_to_string(events_path) else {
        return false;
    };
    for record in contents.lines() {
        let mut fields = record.split_whitespace();
        let key_matches = fields.next() == Some("populated");
        let value_matches = fields.next() == Some("0");
        // A third field disqualifies the line, same as a wrong key/value.
        if key_matches && value_matches && fields.next().is_none() {
            return true;
        }
    }
    false
}
1924
1925/// Drain all tasks from `procs_path` to `dst` (the walk-root
1926/// `cgroup.procs`).
1927///
1928/// `dst` must be the `cgroup.procs` file at the cgroup-fs root the
1929/// caller is permitted to write to (under Mode A: `/sys/fs/cgroup`;
1930/// under Mode B/C: the delegated subtree root the operator owns).
1931/// The walk-root cgroup is exempt from (or above) the
1932/// no-internal-process constraint inside its delegation, so writes
1933/// to its `cgroup.procs` succeed even when intermediate cgroups have
1934/// `subtree_control` set.
1935///
1936/// ESRCH (task exited) is silently tolerated; other errors are
1937/// logged. A `read_to_string` failure or a malformed pid line is
1938/// surfaced via `tracing::warn!` — silently dropping either would
1939/// hide a cgroup that still contains tasks and send it into cleanup,
1940/// which then fails with EBUSY and compounds the confusion.
1941fn drain_pids_to_root(procs_path: &Path, dst: &Path, context: &str) {
1942    let content = match fs::read_to_string(procs_path) {
1943        Ok(c) => c,
1944        Err(e) => {
1945            tracing::warn!(
1946                path = %procs_path.display(),
1947                cgroup = context,
1948                err = %e,
1949                "drain_pids_to_root: read_to_string failed; tasks may remain in cgroup",
1950            );
1951            return;
1952        }
1953    };
1954    for line in content.lines() {
1955        let trimmed = line.trim();
1956        if trimmed.is_empty() {
1957            continue;
1958        }
1959        let pid: u32 = match trimmed.parse() {
1960            Ok(p) => p,
1961            Err(e) => {
1962                tracing::warn!(
1963                    path = %procs_path.display(),
1964                    cgroup = context,
1965                    line = trimmed,
1966                    err = %e,
1967                    "drain_pids_to_root: malformed pid line; skipping",
1968                );
1969                continue;
1970            }
1971        };
1972        if let Err(e) = write_with_timeout(dst, &pid.to_string(), CGROUP_WRITE_TIMEOUT)
1973            && !is_esrch(&e)
1974        {
1975            tracing::warn!(pid, cgroup = context, err = %e, "failed to drain task");
1976        }
1977    }
1978}
1979
1980/// Iterate the direct child directories of `path`, calling `f` on each.
1981///
1982/// `context` is a short caller name (e.g. `"cleanup_all"`,
1983/// `"cleanup_recursive"`) that is prefixed into every per-entry
1984/// `tracing::warn!` message so operators grepping logs for
1985/// `"cleanup_all: "` still see both the outer read_dir failure (which
1986/// stays with the caller) and the per-entry `DirEntry` / `file_type`
1987/// warnings emitted here.
1988///
1989/// `read_dir` failure is surfaced to the caller via `Err`; the caller
1990/// owns the top-level warn message. Non-directory entries are skipped.
1991/// Per-entry errors are logged and the iteration continues.
1992///
1993/// The structured log field key is normalized to `path =` at this
1994/// boundary; `cleanup_all`'s outer warn still uses `parent =` for the
1995/// top-level read_dir failure since that warn is emitted by the
1996/// caller, not here.
1997fn for_each_child_dir(path: &Path, context: &str, mut f: impl FnMut(&Path)) -> std::io::Result<()> {
1998    for entry in fs::read_dir(path)? {
1999        let entry = match entry {
2000            Ok(e) => e,
2001            Err(err) => {
2002                tracing::warn!(
2003                    path = %path.display(),
2004                    err = %err,
2005                    "{context}: dir entry read failed; skipping",
2006                );
2007                continue;
2008            }
2009        };
2010        match entry.file_type() {
2011            Ok(t) if t.is_dir() => f(&entry.path()),
2012            Ok(_) => {}
2013            Err(err) => tracing::warn!(
2014                path = %entry.path().display(),
2015                err = %err,
2016                "{context}: file_type read failed; skipping entry",
2017            ),
2018        }
2019    }
2020    Ok(())
2021}
2022
2023/// Depth-first removal of `path` and every descendant cgroup
2024/// directory. Drains each cgroup's pids to `{walk_root}/cgroup.procs`
2025/// before rmdir.
2026///
2027/// `walk_root` mirrors [`CgroupManager::walk_root`]: under Mode A it
2028/// is `/sys/fs/cgroup` (the canonical cgroup-v2 mount); under Mode
2029/// B/C it is the delegated subtree root the operator owns. Threaded
2030/// through the recursion so every descendant drain targets the
2031/// caller's writable root and never the canonical
2032/// `/sys/fs/cgroup/cgroup.procs` (which would EACCES under
2033/// delegation).
2034fn cleanup_recursive(path: &std::path::Path, walk_root: &Path) {
2035    // Depth-first: clean children before parent
2036    if let Err(err) = for_each_child_dir(path, "cleanup_recursive", |child| {
2037        cleanup_recursive(child, walk_root)
2038    }) {
2039        tracing::warn!(
2040            path = %path.display(),
2041            err = %err,
2042            "cleanup_recursive: read_dir failed; child cgroups may remain",
2043        );
2044    }
2045    // Auto-unfreeze before draining tasks. Mirrors
2046    // `CgroupManager::remove_cgroup`'s pre-drain unfreeze, but for
2047    // defense-in-depth and source-cgroup state hygiene rather than
2048    // for correctness: the kernel's `cgroup_freezer_migrate_task`
2049    // path DOES unfreeze tasks when they migrate to an unfrozen
2050    // destination (the cgroup root is always unfrozen), so frozen
2051    // tasks would not actually strand at the root. The explicit
2052    // pre-drain `cgroup.freeze=0` write is still worthwhile because
2053    // it (a) makes the source cgroup's transient state visible in
2054    // tracing / `cgroup.events` before the directory disappears,
2055    // (b) avoids a brief frozen-counter churn while migration
2056    // batches advance, and (c) makes the teardown path symmetric
2057    // with `remove_cgroup` so operators reading either function
2058    // see the same auto-unfreeze step.
2059    //
2060    // Gate on existence: `fs::write` on a regular filesystem
2061    // CREATES the file when it doesn't exist (open(O_WRONLY |
2062    // O_CREAT | O_TRUNC)), so unconditionally writing
2063    // `cgroup.freeze` would plant a stray 1-byte file under any
2064    // non-cgroupfs directory and cause the subsequent
2065    // `fs::remove_dir(path)` to fail with ENOTEMPTY. On a real
2066    // cgroup v2 tree the file is always present (cgroup-core,
2067    // ungated by controllers); on a legacy kernel without
2068    // `CONFIG_CGROUP_FREEZE` or on a non-cgroup directory entry
2069    // the file is absent and the unfreeze step is a no-op.
2070    let freeze_path = path.join("cgroup.freeze");
2071    if freeze_path.exists()
2072        && let Err(err) = write_with_timeout(&freeze_path, "0", CGROUP_WRITE_TIMEOUT)
2073    {
2074        tracing::warn!(
2075            path = %path.display(),
2076            err = %format!("{err:#}"),
2077            "cleanup_recursive: pre-drain unfreeze failed; source-cgroup state-hygiene step skipped",
2078        );
2079    }
2080    drain_pids_to_root(
2081        &path.join("cgroup.procs"),
2082        &walk_root.join("cgroup.procs"),
2083        &path.display().to_string(),
2084    );
2085    // Wait event-driven on cgroup.events `populated 0` rather than
2086    // a blind 10 ms sleep — see `wait_for_cgroup_unpopulated`'s doc
2087    // for the rationale. 1 s deadline matches `remove_cgroup_inner`.
2088    wait_for_cgroup_unpopulated(path, std::time::Duration::from_secs(1));
2089    if let Err(err) = fs::remove_dir(path) {
2090        tracing::warn!(
2091            path = %path.display(),
2092            err = %err,
2093            "cleanup_recursive: remove_dir failed; cgroup directory may remain",
2094        );
2095    }
2096}
2097
2098#[cfg(test)]
2099#[path = "cgroup_tests.rs"]
2100mod tests;