ktstr/cgroup.rs
1//! Cgroup v2 filesystem operations for test cgroup management.
2//!
3//! Creates, configures, and removes cgroups under a parent path
4//! (default `/sys/fs/cgroup/ktstr`). Provides cpuset assignment,
5//! task migration, and cleanup.
6//!
7//! # Walk root (cgroup-v2 delegation)
8//!
9//! [`CgroupManager`] carries a `walk_root` that bounds two operations:
10//! - [`CgroupManager::setup`] walks every ancestor's
11//! `cgroup.subtree_control` from `walk_root` down to `parent`;
12//! - [`CgroupManager::drain_tasks`] / `cleanup_recursive` drain pids
13//! into `{walk_root}/cgroup.procs` (a writable root that is exempt
14//! from the kernel's no-internal-process constraint).
15//!
16//! `walk_root` defaults to `/sys/fs/cgroup` (Mode A: root-owned cgroup
17//! tree). [`CgroupManager::with_walk_root`] retargets it for Mode B/C
18//! delegation (systemd `Delegate=yes`, container `nsdelegate`) where
19//! the operator owns `subtree_control` writes only inside a delegated
//! subtree. `with_walk_root` enforces that `parent` is at or below
//! the new `walk_root` so the strip-prefix walk cannot escape;
//! [`CgroupManager::new`] performs no such check.
22//!
23//! # Controller surface
24//!
//! [`CgroupManager`] enables the caller-supplied controller set in
//! `cgroup.subtree_control` at `Self::setup` time so every method
//! that writes a controller knob succeeds without per-call lazy
//! enablement (which would race against concurrent sibling cgroup
//! creation). The controllers that can be requested and the knobs
//! each one exposes map to:
31//!
32//! | Controller | `setup` writes | Methods that touch the controller's files |
33//! |------------|----------------|-------------------------------------------|
34//! | `cpuset` | when `Controller::Cpuset` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `cpuset`/`cpuset_mems`) | `Self::set_cpuset`, `Self::set_cpuset_mems`, `Self::clear_cpuset`, `Self::clear_cpuset_mems` |
35//! | `cpu` | when `Controller::Cpu` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `cpu`) | `Self::set_cpu_max`, `Self::set_cpu_weight` |
36//! | `memory` | when `Controller::Memory` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `memory`) | `Self::set_memory_max`, `Self::set_memory_high`, `Self::set_memory_low`, `Self::set_memory_swap_max` |
37//! | `pids` | when `Controller::Pids` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `pids`) | `Self::set_pids_max` |
38//! | `io` | when `Controller::Io` in the set passed to `setup` (runtime adds it when a `CgroupDef` declares `io`) | `Self::set_io_weight` |
39//! | (cgroup-core) | not gated | `Self::set_freeze`, `Self::move_task`, `Self::move_tasks` |
40//!
41//! `cgroup.freeze` and `cgroup.procs` are cgroup-core files exposed on
42//! every non-root cgroup automatically; they do not require a
43//! controller in `subtree_control`. `memory.swap.max` only exists when
44//! the kernel was built with `CONFIG_SWAP=y` — the file is absent on
45//! swap-disabled kernels and a write returns ENOENT (callers route
46//! through the wire-time error chain).
47//!
48//! # Untrusted-name validation
49//!
50//! Cgroup names flow into [`Path::join`] under `parent` to address
51//! files inside cgroupfs. `validate_cgroup_name` rejects shapes that
52//! would escape that parent (`..`, absolute leading `/`, `NUL`) or
53//! that produce invisible cgroupfs entries (leading `.`); other ASCII
54//! is passed through to the kernel which is the final authority on
55//! per-component validity. Every public method that takes a `name`
56//! validates it before any filesystem write.
57
58use crate::topology::TestTopology;
59use anyhow::{Context, Result, anyhow, bail};
60use std::collections::BTreeSet;
61use std::fs;
62use std::path::{Path, PathBuf};
63use std::sync::atomic::{AtomicUsize, Ordering};
64use std::sync::mpsc;
65use std::time::Duration;
66
/// Cgroup v2 controllers that [`CgroupManager::setup`] can enable in
/// `cgroup.subtree_control`.
///
/// Each variant maps to a literal token the kernel parses in
/// `cgroup_subtree_control_write`. The enum is exhaustive over the
/// controllers the framework's [`CgroupOps`] surface actually writes
/// to (cpuset, cpu, memory, pids, io); cgroup-core knobs
/// (`cgroup.freeze`, `cgroup.procs`) are not gated by any controller
/// and never appear here.
///
/// Callers pass a `BTreeSet<Controller>` to `setup` — sets compose
/// naturally across nested CgroupDef declarations and the deterministic
/// `BTreeSet` iteration order keeps the rendered subtree_control write
/// stable between runs.
///
/// The derived `Ord` follows declaration order, so the order of the
/// variants below is also the order in which their tokens appear in
/// the rendered subtree_control line. Reordering variants changes
/// that line.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Controller {
    /// `+cpuset` — gates `cpuset.cpus`, `cpuset.cpus.effective`,
    /// `cpuset.mems`, `cpuset.mems.effective` files on every child.
    Cpuset,
    /// `+cpu` — gates `cpu.max`, `cpu.weight`, `cpu.weight.nice`,
    /// `cpu.stat`, `cpu.pressure` files on every child.
    Cpu,
    /// `+memory` — gates `memory.max`, `memory.high`, `memory.low`,
    /// `memory.min`, `memory.current`, `memory.swap.max`,
    /// `memory.events`, `memory.stat`, `memory.pressure` files.
    Memory,
    /// `+pids` — gates `pids.max`, `pids.current`, `pids.events` files.
    Pids,
    /// `+io` — gates `io.max`, `io.weight`, `io.bfq.weight`,
    /// `io.stat`, `io.pressure` files.
    Io,
}
99
100impl Controller {
101 /// Kernel token written to `cgroup.subtree_control` (the bare name
102 /// without the `+`/`-` prefix; see `Self::as_subtree_control_add`
103 /// for the full token).
104 pub fn name(self) -> &'static str {
105 match self {
106 Controller::Cpuset => "cpuset",
107 Controller::Cpu => "cpu",
108 Controller::Memory => "memory",
109 Controller::Pids => "pids",
110 Controller::Io => "io",
111 }
112 }
113}
114
/// Default timeout for cgroup filesystem writes. Normally <1ms; 2s catches
/// real hangs without waiting so long the test result is meaningless.
///
/// Handed to [`write_with_timeout`] by, e.g., the
/// `cgroup.subtree_control` writes in `setup_under_root`.
const CGROUP_WRITE_TIMEOUT: Duration = Duration::from_secs(2);
118
119/// Write `data` to `path` with a timeout. Spawns a thread for the blocking
120/// `fs::write` and waits on a channel. If the write does not complete within
121/// `timeout`, returns an error (the spawned thread may still be blocked in
122/// the kernel but will not prevent the caller from making progress).
123///
124/// # Stranded-writer thread semantics
125///
126/// On timeout the helper returns `Err` while the spawned thread stays
127/// blocked in the kernel inside `fs::write` — typically inside the
128/// cgroupfs `cgroup_kn_lock_live` / `cgroup_mutex` lock acquisition or
129/// the per-file `kn->active` semaphore. The host-side fd to `path` is
130/// owned by the spawned thread, so:
131///
132/// - **Per-file lock retention.** While the writer is blocked, the
133/// target cgroupfs file's `kn->active` (kernfs's per-knob writer
134/// semaphore) remains held by the stranded thread. Concurrent
135/// writes to the SAME file from any thread in the same process —
136/// including this same caller's retry — will queue behind the
137/// stranded write inside the kernel. Writes to OTHER files in the
138/// same cgroup are unaffected (kernfs holds `kn->active`
139/// per-knob, not per-cgroup).
/// - **Thread-handle drop.** The writer thread's `JoinHandle` is
///   dropped as soon as the thread is spawned. Dropping a
///   `JoinHandle` detaches the thread without waiting; the thread
///   keeps running, is never joined, and finishes on its own once
///   the kernel write unblocks (or is torn down when the process
///   exits).
146/// - **Bounded leak under wedged cgroupfs.** A genuinely-wedged
147/// cgroupfs (e.g. a stuck filesystem driver in the kernel) would
148/// accumulate threads at a rate of one per timed-out write site.
149/// The 2s per-write timeout caps the per-site stall to 2s; the
150/// total accumulation is driven by how many distinct write sites
151/// the scenario hits, not by elapsed wall-clock time alone.
///   Operators noticing stranded cgroupfs writer threads in
///   `ps -L` (typically blocked in uninterruptible `D` sleep — not
///   `<defunct>`, which marks zombie processes) should investigate
///   whether the underlying kernel cgroup subsystem is hung; the
///   framework's own teardown does not block on these stranded
///   threads.
156///
/// Each stranded thread holds the file's `kn->active` until the
/// kernel write returns. The OS-level memory cost per stranded
/// thread is the default stack of a Rust-spawned thread (2 MiB
/// unless `RUST_MIN_STACK` overrides it, mostly virtual until
/// touched) — not the 8 MiB main-thread default.
161fn write_with_timeout(path: &Path, data: &str, timeout: Duration) -> Result<()> {
162 let display = path.display().to_string();
163 let path = path.to_owned();
164 let data = data.to_owned();
165 let (tx, rx) = mpsc::channel();
166 std::thread::spawn(move || {
167 let result = fs::write(&path, &data);
168 let _ = tx.send(result);
169 });
170 match rx.recv_timeout(timeout) {
171 Ok(Ok(())) => Ok(()),
172 Ok(Err(e)) => {
173 let errno_suffix = e
174 .raw_os_error()
175 .and_then(crate::errno_name)
176 .map(|name| format!(" ({name})"))
177 .unwrap_or_default();
178 Err(e).with_context(|| format!("write {display}{errno_suffix}"))
179 }
180 Err(_) => bail!(
181 "cgroup write to {display} timed out after {}ms",
182 timeout.as_millis()
183 ),
184 }
185}
186
187/// Validate a cgroup name before joining it onto the parent path.
188///
189/// Rejects shapes that would either escape the parent directory
190/// (`..` component, absolute leading `/`, embedded NUL) or produce
191/// a hidden / invisible cgroupfs entry (leading `.`). Empty names
192/// are also rejected — `parent.join("")` returns `parent`, which
193/// would let a caller accidentally clobber the parent's own
194/// `cpuset.cpus` / `cgroup.subtree_control` files via a method
195/// that expected to address a child.
196///
197/// Permits `/` only as a path separator between non-empty
198/// components (nested cgroups like `"cg_0/narrow"`); a leading
199/// `/` is rejected because `Path::join` would replace `parent`
200/// entirely with the absolute path.
201///
202/// Beyond these structural checks the kernel is the final authority
203/// on per-component validity: cgroupfs rejects names containing
204/// newlines or names colliding with reserved knobs (`cgroup.procs`,
205/// `cpuset.cpus`, etc.) at `mkdir` time with EINVAL / EEXIST. Those
206/// failures surface through the regular `fs::create_dir_all` /
207/// `fs::write` error chain.
208pub(crate) fn validate_cgroup_name(name: &str) -> Result<()> {
209 if name.is_empty() {
210 bail!("cgroup name must not be empty");
211 }
212 if name.starts_with('/') {
213 bail!(
214 "cgroup name '{name}' starts with '/' — would escape the \
215 managed parent via Path::join (absolute paths replace the \
216 join base)"
217 );
218 }
219 if name.contains('\0') {
220 bail!("cgroup name '{name}' contains a NUL byte");
221 }
222 // Per-component checks run before the whole-name leading-dot
223 // check so a component like `..` matches the more specific
224 // path-traversal diagnostic instead of the generic hidden-entry
225 // one. The ordering matters for error messages — `'..' component`
226 // is what callers grep for.
227 for component in name.split('/') {
228 if component.is_empty() {
229 bail!(
230 "cgroup name '{name}' contains an empty path component \
231 (consecutive '/') — Path::join would emit a malformed path"
232 );
233 }
234 if component == ".." {
235 bail!(
236 "cgroup name '{name}' contains a '..' component — \
237 would escape the managed parent via Path::join"
238 );
239 }
240 if component == "." {
241 bail!(
242 "cgroup name '{name}' contains a '.' component — \
243 ambiguous self-reference, refuse before fs writes"
244 );
245 }
246 if component.starts_with('.') {
247 bail!(
248 "cgroup name '{name}' contains a leading-dot component \
249 ('{component}') — produces a hidden cgroupfs entry"
250 );
251 }
252 }
253 Ok(())
254}
255
256/// Walk an `anyhow::Error` chain and return the first
257/// `std::io::Error`'s raw errno, if any. Shared helper for errno
258/// classification across cgroup orchestration — both this module's
259/// ESRCH/EBUSY checks and [`crate::vmm::cgroup_sandbox`]'s
260/// EACCES/EPERM/EBUSY branches walk the same chain shape.
261pub(crate) fn anyhow_first_io_errno(err: &anyhow::Error) -> Option<i32> {
262 err.chain()
263 .find_map(|cause| cause.downcast_ref::<std::io::Error>())
264 .and_then(|io| io.raw_os_error())
265}
266
267/// ESRCH: task exited between listing and migration
268/// (`cgroup_procs_write_start` -> `find_task_by_vpid` returns NULL).
269fn is_esrch(err: &anyhow::Error) -> bool {
270 anyhow_first_io_errno(err) == Some(libc::ESRCH)
271}
272
273/// EBUSY: either the cgroup v2 no-internal-process constraint
274/// (`cgroup_migrate_vet_dst` when `subtree_control` is set) or a
275/// transient rejection from a sched_ext BPF `cgroup_prep_move`
276/// callback (`scx_cgroup_can_attach`).
277fn is_ebusy(err: &anyhow::Error) -> bool {
278 anyhow_first_io_errno(err) == Some(libc::EBUSY)
279}
280
281/// Snapshot the cgroup-tree state at the moment a cpuset.cpus
282/// write fails, for diagnostic attachment to the returned error.
283///
284/// Captures (per the diagnostic contract on
285/// [`CgroupManager::set_cpuset`]):
286/// - the parent's `cgroup.controllers` (controllers AVAILABLE for
287/// children — confirms whether subtree_control already
288/// propagated to this child)
289/// - the parent's `cgroup.subtree_control` (controllers ENABLED
290/// for children — what setup() last wrote)
291/// - the child's `cgroup.controllers` (the set children of the
292/// CHILD inherit — useful for nested cgroups)
293/// - whether `cpuset.cpus` exists at the child (distinguishes a
294/// "controller never propagated" failure mode from a
295/// "kernel rejected this specific value" failure mode)
296/// - the child's directory listing (so an unexpected presence/
297/// absence of any cgroupfs knob is visible)
298///
299/// Read failures inside the snapshot are folded into the snapshot
300/// string as `<read failed: {err}>` rather than propagating —
301/// the caller's error path is what the caller cares about; the
302/// snapshot is best-effort instrumentation.
303fn capture_cpuset_state(parent: &Path, name: &str) -> String {
304 let child = parent.join(name);
305 let parent_controllers = read_or_label(&parent.join("cgroup.controllers"));
306 let parent_subtree_control = read_or_label(&parent.join("cgroup.subtree_control"));
307 let child_controllers = read_or_label(&child.join("cgroup.controllers"));
308 let cpuset_cpus_exists = child.join("cpuset.cpus").exists();
309 let child_listing = match fs::read_dir(&child) {
310 Ok(entries) => {
311 let mut names: Vec<String> = entries
312 .filter_map(|e| e.ok())
313 .map(|e| e.file_name().to_string_lossy().into_owned())
314 .collect();
315 names.sort_unstable();
316 format!("[{}]", names.join(", "))
317 }
318 Err(e) => format!("<read_dir failed: {e}>"),
319 };
320 format!(
321 "cgroup-state-snapshot: \
322 parent={} name={} \
323 parent.cgroup.controllers={:?} \
324 parent.cgroup.subtree_control={:?} \
325 child.cgroup.controllers={:?} \
326 child.cpuset.cpus.exists={} \
327 child.listing={}",
328 parent.display(),
329 name,
330 parent_controllers,
331 parent_subtree_control,
332 child_controllers,
333 cpuset_cpus_exists,
334 child_listing,
335 )
336}
337
338/// Read `path` to a string for snapshotting, returning a
339/// `<...>` placeholder if the read fails. Used by
340/// [`capture_cpuset_state`] so a missing or permission-denied
341/// snapshot field shows up as a labeled placeholder rather than
342/// killing the whole snapshot.
fn read_or_label(path: &Path) -> String {
    // Success: surrounding whitespace (cgroupfs knobs end in '\n') is
    // trimmed. Failure: the io error is folded into a placeholder.
    fs::read_to_string(path)
        .map(|contents| contents.trim().to_owned())
        .unwrap_or_else(|e| format!("<read failed: {e}>"))
}
349
/// Cap on the number of successive [`CgroupManager::remove_cgroup`]
/// failures the manager will tolerate before bailing further removes.
///
/// Compared against the manager's `outstanding_removes` counter.
///
/// A churn workload (rapid create→remove cycles) may legitimately
/// race the freeze/drain path and see EBUSY/ENOENT on individual
/// remove calls — those are absorbed and the un-removed cgroup is
/// counted toward `outstanding_removes`. When the counter exceeds
/// this cap, subsequent [`CgroupManager::remove_cgroup`] calls
/// return Err immediately so the loop driving the churn (e.g.
/// `custom_cgroup_rapid_churn` in scenario/dynamic.rs) can bail
/// instead of accumulating cgroupfs entries unboundedly. Successful
/// removes decrement the counter, so a transient stall that
/// eventually clears does not strand the manager in the bailed
/// state.
const MAX_OUTSTANDING_REMOVES: usize = 10;
365
366/// RAII manager for cgroup v2 filesystem operations.
367///
368/// Creates, configures, and removes cgroups under a parent directory.
369/// Provides cpuset assignment and task migration.
370///
371/// # Outstanding-remove tracking
372///
373/// `outstanding_removes` counts cgroups whose
374/// [`Self::remove_cgroup`] call failed (the directory still exists
375/// in the cgroupfs tree). It increments on every removal failure,
376/// decrements on every removal success, and gates further calls:
377/// once the count exceeds `MAX_OUTSTANDING_REMOVES`,
378/// [`Self::remove_cgroup`] returns Err without attempting the
379/// underlying writes. The counter is `AtomicUsize` because
380/// scenario code holds the manager behind `&dyn CgroupOps` and
381/// shares it across threads via `&self` borrows.
382///
383/// # Walk root
384///
385/// `walk_root` bounds the cgroup-fs walk for two operations:
386/// 1. [`Self::setup`] walks every ancestor's `cgroup.subtree_control`
387/// between `walk_root` and `parent`.
388/// 2. [`Self::drain_tasks`] and `cleanup_recursive` drain pids into
389/// `{walk_root}/cgroup.procs` (the writable root exempt from the
390/// no-internal-process constraint).
391///
392/// Defaults to `/sys/fs/cgroup` in [`Self::new`] for Mode A (root-owned
393/// cgroup tree). Override via [`Self::with_walk_root`] for cgroup-v2
394/// user delegation (Mode B/C: systemd `Delegate=yes`, container
395/// `nsdelegate`). The override is validated against `parent` at
396/// construction — if `parent` is not at or below `walk_root`, the
397/// chained call returns an error rather than letting the strip-prefix
398/// walk fall through to an opaque cgroupfs EACCES at the delegation
399/// boundary.
#[derive(Debug)]
pub struct CgroupManager {
    /// Parent cgroup directory this manager operates under; exposed
    /// read-only through `parent_path`.
    parent: PathBuf,
    /// Upper bound of the `cgroup.subtree_control` walk performed by
    /// `setup`. Initialised to `/sys/fs/cgroup` by `new` and replaced
    /// by `with_walk_root`.
    walk_root: PathBuf,
    /// Number of cgroups whose removal failed and is still
    /// outstanding; read with `Ordering::Relaxed` by the
    /// `outstanding_removes` accessor.
    outstanding_removes: AtomicUsize,
}
406
407/// Free-function inner of [`CgroupManager::move_tasks`] —
408/// extracted so the per-pid migration loop + ESRCH tolerance +
409/// all-vanished bail can be unit-tested without a real
410/// cgroupfs (which is what surfaces the kernel-side ESRCH that
411/// the bail guards against). The per-pid write closure is
412/// caller-supplied: production callers route through
413/// [`CgroupManager::move_task_with_retry`] (which talks to
414/// real `cgroup.procs` files); unit tests pass a closure that
415/// synthesises [`std::io::Error::from_raw_os_error`]`(libc::ESRCH)`
416/// for selected pids so the partial-vanish (allowed) and
417/// all-vanished (bail) paths are both directly observable.
418///
419/// The empty-slice exemption (`pids.is_empty() -> Ok`) is
420/// preserved here so the documented "no move requested" form
421/// (post-Drop diagnostic, post-mortem capture) stays a clean
422/// no-op rather than tripping the all-vanished gate.
423fn move_tasks_inner<W>(name: &str, pids: &[libc::pid_t], mut write_one: W) -> Result<()>
424where
425 W: FnMut(&str, libc::pid_t) -> Result<()>,
426{
427 let mut vanished = 0usize;
428 for &pid in pids {
429 if let Err(e) = write_one(name, pid) {
430 if is_esrch(&e) {
431 tracing::warn!(pid, cgroup = name, "task vanished during migration");
432 vanished += 1;
433 continue;
434 }
435 return Err(e);
436 }
437 }
438 if !pids.is_empty() && vanished == pids.len() {
439 anyhow::bail!(
440 "move_tasks to '{name}': ALL {n} pid(s) ESRCH'd before \
441 migration completed (pids: {pids:?}). Likely causes: \
442 (a) `WorkloadHandle::spawn` child pre_exec init-panic \
443 cascade (uid/gid/mempolicy/cgroup-handshake failure \
444 between fork and the start-pipe read — the parent is \
445 blocked on the start-pipe waiting for the child to \
446 reach work-ready and only observes the child's death \
447 via SIGCHLD reap, by which point the pid has already \
448 vanished from any cgroup it was placed in); (b) \
449 scheduler-attach-time cgroup-pull (sched_ext init may \
450 move existing tasks out of test-created cgroups); \
451 (c) external signal (SIGKILL from operator OR \
452 OOM-killer). The silent-Ok path this bail replaces \
453 was a no-silent-drops violation: a downstream \
454 `cgroup.procs` read would see 0 pids with no signal \
455 that ANY migration was even attempted. If the caller \
456 LEGITIMATELY moves an already-vanished cohort \
457 (post-Drop diagnostic), pass an empty pids slice \
458 instead — the empty-slice path returns Ok cleanly \
459 without bailing.",
460 n = pids.len(),
461 );
462 }
463 Ok(())
464}
465
466impl CgroupManager {
    /// Default cgroup-fs root used by [`Self::new`]. Override per
    /// instance via [`Self::with_walk_root`] for cgroup-v2 user
    /// delegation. `default_root_requires_root` compares the walk
    /// root against this value to decide whether the root-privilege
    /// gate applies.
    const DEFAULT_WALK_ROOT: &'static str = "/sys/fs/cgroup";
471
472 /// Create a manager rooted at the given cgroup v2 path.
473 ///
474 /// The walk root defaults to `/sys/fs/cgroup` (Mode A: root-owned
475 /// cgroup tree). For cgroup-v2 user delegation (Mode B/C), chain
476 /// [`Self::with_walk_root`] before any [`Self::setup`] call.
477 pub fn new(parent: &str) -> Self {
478 Self {
479 parent: PathBuf::from(parent),
480 walk_root: PathBuf::from(Self::DEFAULT_WALK_ROOT),
481 outstanding_removes: AtomicUsize::new(0),
482 }
483 }
484
485 /// Retarget the cgroup-fs walk root used by [`Self::setup`] and
486 /// [`Self::drain_tasks`].
487 ///
488 /// `root` becomes the upper bound of the
489 /// `cgroup.subtree_control` enable walk and the destination
490 /// `{root}/cgroup.procs` for pid drains. Use for cgroup-v2 user
491 /// delegation (Mode B/C) where the operator owns
492 /// `subtree_control` writes only inside the delegated subtree and
493 /// a blind walk from `/sys/fs/cgroup` would EACCES at the
494 /// `user.slice` / container-root boundary.
495 ///
496 /// Returns an error when:
497 /// - **Either `parent` or `root` contains a `..` component** —
498 /// [`Path::starts_with`](std::path::Path::starts_with) is component-based and treats `..`
499 /// as a literal segment, so `/sys/fs/cgroup/op/../escape` would
500 /// component-prefix `/sys/fs/cgroup/op` while the kernel
501 /// resolves the path to `/sys/fs/cgroup/escape` (outside the
502 /// delegation root). Rejecting `..` upfront keeps the prefix
503 /// invariant honest against canonical-vs-component drift.
504 /// - **The manager's `parent` is not at or below `root`** —
505 /// without the prefix invariant the `Self::setup_under_root`
506 /// strip-prefix gate would silently skip the subtree_control
507 /// walk and the caller would see downstream EACCES on the
508 /// first `set_*` write. Surfaces the misconfiguration upfront
509 /// with both paths in the error message.
510 pub fn with_walk_root(mut self, root: impl Into<PathBuf>) -> Result<Self> {
511 let root = root.into();
512 // Reject `..` components on either side. `PathBuf::starts_with`
513 // is component-based and treats `..` as a literal segment, so
514 // `/sys/fs/cgroup/operator/../escape` would pass the prefix
515 // check below while the kernel resolves the path to
516 // `/sys/fs/cgroup/escape` (outside walk_root). Either side
517 // carrying `..` is a misconfiguration; bail upfront before the
518 // canonical-vs-component mismatch becomes a downstream EACCES.
519 for (path, label) in [
520 (self.parent.as_path(), "parent"),
521 (root.as_path(), "walk_root"),
522 ] {
523 if path
524 .components()
525 .any(|c| matches!(c, std::path::Component::ParentDir))
526 {
527 bail!(
528 "CgroupManager::with_walk_root: {label} {path:?} contains `..` components; \
529 parent and walk_root must be normalized absolute paths because \
530 PathBuf::starts_with is component-based and `/a/b/../c` is treated as \
531 starting with `/a/b/..` not the kernel-resolved `/a/c` — the prefix \
532 invariant would be silently violated",
533 );
534 }
535 }
536 if !self.parent.starts_with(&root) {
537 bail!(
538 "CgroupManager::with_walk_root: parent {:?} is not below walk_root {:?}; \
539 the subtree_control walk must originate at a root that contains the parent — \
540 either lower walk_root to a prefix of parent or raise parent to a descendant of \
541 walk_root",
542 self.parent,
543 root,
544 );
545 }
546 self.walk_root = root;
547 Ok(self)
548 }
549
550 /// Path to the parent cgroup directory.
551 pub fn parent_path(&self) -> &std::path::Path {
552 &self.parent
553 }
554
555 /// Path to the cgroup-fs root [`Self::setup`] walks down from and
556 /// [`Self::drain_tasks`] drains pids to. See [`Self::with_walk_root`].
557 pub fn walk_root(&self) -> &std::path::Path {
558 &self.walk_root
559 }
560
561 /// Count of un-removed cgroups currently tracked by this
562 /// manager — incremented when [`Self::remove_cgroup`] fails,
563 /// decremented when it succeeds. Exposed for tests and for
564 /// callers that want to inspect the budget without forcing a
565 /// remove attempt.
566 pub fn outstanding_removes(&self) -> usize {
567 self.outstanding_removes.load(Ordering::Relaxed)
568 }
569
570 /// Create the parent directory and enable the requested cgroup
571 /// controllers in every ancestor `cgroup.subtree_control` between
572 /// `self.walk_root` (default `/sys/fs/cgroup`) and `self.parent`.
573 ///
    /// Pass the controllers the test actually needs — with an empty
    /// set, `setup` returns right after the privilege check: no parent
    /// mkdir and no subtree_control write. The
576 /// scenario runtime computes the controller union from
577 /// [`CgroupDef`](crate::scenario::ops::CgroupDef) declarations
578 /// (cpuset/cpuset_mems → [`Controller::Cpuset`], cpu →
579 /// [`Controller::Cpu`], memory → [`Controller::Memory`], pids →
580 /// [`Controller::Pids`], io → [`Controller::Io`]) so a test
581 /// that never sets a memory limit never enables `+memory` and
582 /// vice versa. `cgroup.freeze` and `cgroup.procs` are
583 /// cgroup-core, ungated by any controller, and need no entry.
584 ///
585 /// # Walk root
586 ///
587 /// The ancestor walk stops at `self.walk_root` so cgroup-v2 user
588 /// delegation (Mode B/C) does not attempt subtree_control writes
589 /// above the delegation boundary. [`Self::with_walk_root`]
    /// retargets the walk and validates that `self.parent` is at or
    /// below the new `walk_root`; [`Self::new`] performs no such check.
592 ///
593 /// # Availability check
594 ///
595 /// Each requested controller is verified against
596 /// `{walk_root}/cgroup.controllers` before any write. A
597 /// requested controller missing from the kernel's available set
598 /// surfaces as `controller {ctrl} not available; cgroup.controllers
599 /// = {available:?}` rather than the bare ENOENT/EACCES the
600 /// downstream `set_*` write would otherwise emit.
601 ///
602 /// # Error propagation
603 ///
604 /// All filesystem writes propagate via `?`. A user inspecting
605 /// `RUST_BACKTRACE=1` output sees the exact subtree_control path
606 /// that failed and the underlying errno, instead of a swallowed
607 /// `tracing::warn!` followed by a downstream EACCES at the
608 /// controller-knob write site.
609 pub fn setup(&self, controllers: &BTreeSet<Controller>) -> Result<()> {
610 self.setup_under_root(controllers, &self.walk_root)
611 }
612
613 /// Does managing cgroups require root privileges for this
614 /// `(root, parent, euid)`? True only when `root` is the kernel-owned
615 /// default walk root (`/sys/fs/cgroup`), `parent` is actually under
616 /// that root (a real cgroupfs operation — create_dir_all of the
617 /// parent, or the subtree_control walk, that EACCESes for a non-root
618 /// euid), AND the euid is non-root. A `parent` OUTSIDE the root (e.g.
619 /// a tmpdir — the non-cgroup-path early-bail that creates a dir and
620 /// skips the walk) touches no cgroupfs and needs no root. A delegated
621 /// walk root (set via [`Self::with_walk_root`]) is exempt: cgroup-v2
622 /// delegation grants the delegatee write access to
623 /// `cgroup.subtree_control` inside the delegated subtree, so a
624 /// non-root euid can manage it (Documentation/admin-guide/cgroup-v2.rst,
625 /// Delegation). Pure + takes `parent`/`euid` explicitly so the
626 /// privilege gate is unit-tested regardless of the test runner's own
627 /// euid and working directory.
628 fn default_root_requires_root(root: &Path, parent: &Path, euid: u32) -> bool {
629 root == Path::new(Self::DEFAULT_WALK_ROOT) && parent.starts_with(root) && euid != 0
630 }
631
632 /// Inner setup that takes the cgroup-fs root as an explicit
633 /// argument so tests can drive the controller-enable path against
634 /// a tmpdir without touching `/sys/fs/cgroup`. Production
635 /// [`Self::setup`] threads `self.walk_root` (defaults to
636 /// `/sys/fs/cgroup` via [`Self::new`], overridable via
637 /// [`Self::with_walk_root`]). The strip-prefix gate stays — if
638 /// the parent is outside the supplied root, directory creation
639 /// still happens but no subtree_control walk fires (matches the
640 /// existing "non-cgroup-mount" early-bail).
641 fn setup_under_root(&self, controllers: &BTreeSet<Controller>, root: &Path) -> Result<()> {
642 // Managing cgroups under the kernel-owned default walk root
643 // (/sys/fs/cgroup, Mode A) requires root: create_dir_all of a
644 // parent UNDER /sys/fs/cgroup, or the subtree_control walk below,
645 // would EACCES for a non-root caller with an errno that buries
646 // the cause. Fail fast here so the message names the fix. Gated
647 // on the parent being under the root: a parent OUTSIDE it (the
648 // non-cgroup-path early-bail — create a dir, skip the walk)
649 // touches no cgroupfs and needs no root. Checked at setup (first
650 // real cgroup use), NOT at manager construction: host_only tests
651 // that never create a cgroup (macro-attribute fixtures,
652 // host-topology reads, nested-VM verifier orchestration) must not
653 // fail for a resource they never touch. A delegated walk root
654 // (Mode B/C via with_walk_root) is exempt — the operator owns
655 // subtree_control inside the delegated subtree.
656 let euid = unsafe { libc::geteuid() };
657 if Self::default_root_requires_root(root, &self.parent, euid) {
658 return Err(anyhow!(
659 "CgroupManager::setup: cannot manage cgroups under the \
660 kernel-owned default walk root {root:?} as a non-root \
661 process (euid {euid}); run as root, or for cgroup-v2 \
662 user delegation set a delegated walk root via \
663 CgroupManager::with_walk_root (a systemd Delegate=yes \
664 subtree or a container nsdelegate root) — when driven by \
665 cargo-ktstr, set the {walk_env} env var to that delegated \
666 root",
667 walk_env = crate::KTSTR_CGROUP_WALK_ROOT_ENV,
668 ));
669 }
670 // No controllers to enable means no subtree_control walk, and the
671 // parent cgroup is only needed when the scenario actually creates
672 // child cgroups -- which `create_cgroup`'s `create_dir_all` makes
673 // lazily -- or enables controllers. Return BEFORE the eager parent
674 // mkdir so a cgroup-free scenario (no CgroupDefs, no workloads --
675 // e.g. snapshot-bridge tests, host-topology reads, macro-attribute
676 // fixtures) runs without root or a cgroup fs. Previously this mkdir
677 // fired unconditionally and EACCES'd a non-root caller (or a
678 // deliberately-unwritable dummy parent like `/nonexistent/...`).
679 if controllers.is_empty() {
680 return Ok(());
681 }
682 if !self.parent.exists() {
683 fs::create_dir_all(&self.parent)
684 .with_context(|| format!("mkdir {}", self.parent.display()))?;
685 }
686 if let Ok(rel) = self.parent.strip_prefix(root) {
687 let available_path = root.join("cgroup.controllers");
688 if available_path.exists() {
689 let raw = fs::read_to_string(&available_path).with_context(|| {
690 format!("read cgroup.controllers: {}", available_path.display())
691 })?;
692 let available: BTreeSet<&str> = raw.split_whitespace().collect();
693 for c in controllers {
694 if !available.contains(c.name()) {
695 return Err(anyhow!(
696 "cgroup controller '{}' not available at {}; \
697 cgroup.controllers reports {:?}. CONFIG_{}_CONTROLLER \
698 may be unset, or the controller is masked at this \
699 level of the hierarchy",
700 c.name(),
701 available_path.display(),
702 available,
703 c.name().to_uppercase(),
704 ));
705 }
706 }
707 }
708 let line: String = controllers
709 .iter()
710 .map(|c| format!("+{}", c.name()))
711 .collect::<Vec<_>>()
712 .join(" ");
713 let mut cur = root.to_path_buf();
714 for c in rel.components() {
715 let sc = cur.join("cgroup.subtree_control");
716 if sc.exists() {
717 write_with_timeout(&sc, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
718 format!("enable controllers '{line}' at {}", sc.display())
719 })?;
720 }
721 cur = cur.join(c);
722 }
723 let sc = self.parent.join("cgroup.subtree_control");
724 if sc.exists() {
725 write_with_timeout(&sc, &line, CGROUP_WRITE_TIMEOUT)
726 .with_context(|| format!("enable controllers '{line}' at {}", sc.display()))?;
727 }
728 }
729 Ok(())
730 }
731
732 /// Create a child cgroup directory.
733 ///
734 /// For nested paths (e.g. `"cg_0/narrow"`), enables `+cpuset` on
735 /// each intermediate cgroup's `subtree_control` so the leaf has
736 /// `cpuset.cpus` / `cpuset.mems` files available. The kernel
737 /// requires each parent to have the controller in
738 /// `subtree_control` for its children to have the corresponding
739 /// files (`cgroup_control()` returns `parent->subtree_control`).
740 ///
741 /// # Limitation: only `+cpuset` is propagated through nested
742 /// intermediates
743 ///
744 /// `Self::enable_subtree_cpuset` writes ONLY `+cpuset` to each
745 /// intermediate's `cgroup.subtree_control`; the `+cpu` /
746 /// `+memory` / `+pids` / `+io` controllers enabled by
747 /// [`Self::setup`] cover only the manager's parent cgroup, not
748 /// arbitrary intermediate cgroups created via nested
749 /// `create_cgroup` calls. As a result, a nested leaf like
750 /// `"cg_0/narrow"` exposes `cpuset.*` knobs but NOT
751 /// `memory.max` / `pids.max` / `io.weight`. If a future
752 /// [`CgroupDef`](crate::scenario::ops::CgroupDef) addresses such
753 /// a leaf with a memory/pids/io knob, the corresponding
754 /// `set_*` write will return ENOENT.
755 ///
756 /// Today's in-tree consumers (host topology cpuset locks,
757 /// `BuildSandbox`, scenario ops) only nest cgroups for cpuset
758 /// scoping, so this matches the actual surface the framework
759 /// exercises. Extending `Self::enable_subtree_cpuset` to
760 /// propagate the remaining controllers across intermediates is
761 /// straightforward (write the same controller list as
762 /// [`Self::setup`] uses) but is deferred until a use case
763 /// concretely needs it; without one, the wider write would
764 /// race against concurrent sibling cgroup creation under the
765 /// same intermediate without buying anything.
766 pub fn create_cgroup(&self, name: &str) -> Result<()> {
767 validate_cgroup_name(name)?;
768 let p = self.parent.join(name);
769 if !p.exists() {
770 fs::create_dir_all(&p).with_context(|| format!("mkdir {}", p.display()))?;
771 }
772 self.enable_subtree_cpuset(name);
773 Ok(())
774 }
775
776 /// Enable a controller on the parent cgroup's `cgroup.subtree_control`.
777 ///
778 /// Writes `+{controller}` to `{parent}/cgroup.subtree_control` so
779 /// children created under the parent inherit the controller and
780 /// expose the corresponding `*.cpus`, `*.mems`, etc. files. No-op
781 /// (returns `Ok`) when the subtree_control file does not exist —
782 /// callers treat that as "parent is not a cgroup v2 node" and
783 /// degrade elsewhere.
784 ///
785 /// Unlike [`Self::setup`] and `Self::enable_subtree_cpuset`,
786 /// which swallow write failures via `tracing::warn!`, this method
787 /// propagates the underlying [`std::io::Error`] so callers can
788 /// classify errnos (EACCES/EPERM for permission, EBUSY for a
789 /// peer holding the subtree) via `anyhow_first_io_errno` and
790 /// map them to operator-facing degrade variants. Used by
791 /// `crate::vmm::cgroup_sandbox::BuildSandbox::try_create` under
792 /// the `--cpu-cap` hard-error contract.
793 pub fn add_parent_subtree_controller(&self, controller: &str) -> Result<()> {
794 let p = self.parent.join("cgroup.subtree_control");
795 if !p.exists() {
796 return Ok(());
797 }
798 write_with_timeout(&p, &format!("+{controller}"), CGROUP_WRITE_TIMEOUT)
799 }
800
801 /// Drain tasks from a child cgroup and remove it.
802 ///
803 /// Auto-unfreezes the cgroup before draining: a frozen cgroup that
804 /// reaches teardown (e.g. a step body issues `Op::FreezeCgroup` and
805 /// never pairs it with `Op::UnfreezeCgroup`) would migrate its
806 /// frozen tasks to the cgroup root via `drain_tasks` and rely on
807 /// the kernel's `cgroup_freezer_migrate_task` to clear the JOBCTL
808 /// freeze bit when the destination cgroup is unfrozen. The kernel
809 /// path is correct, but writing `cgroup.freeze=0` first makes the
810 /// teardown deterministic regardless of who froze the cgroup and
811 /// when. Tolerates ENOENT on the freeze file (cgroup directory
812 /// already gone, or `CONFIG_CGROUP_FREEZE` absent on legacy
813 /// kernels) silently — only non-ENOENT failures warn.
814 ///
815 /// # Post-drain settle window
816 ///
817 /// Between [`Self::drain_tasks`] and `rmdir`,
818 /// `remove_cgroup_inner` calls `wait_for_cgroup_unpopulated` with
819 /// a 1s budget. Writes to `cgroup.procs` queue the task move but
820 /// the source cgroup's populated state only clears once the
821 /// per-task css_set switch completes — `rmdir` returns EBUSY
822 /// while the cgroup is still populated. Rather than a blind
823 /// sleep, the wait is event-driven: it blocks on an
824 /// inotify(IN_MODIFY) watch of the cgroup's `cgroup.events` file
825 /// and returns as soon as that file reports `populated 0`, so it
826 /// wakes on the actual kernel state-transition write.
827 ///
828 /// The wait falls through to `rmdir` on deadline (or when
829 /// `cgroup.events` is absent / inotify setup fails), so a
830 /// genuinely stuck-populated cgroup still surfaces the same
831 /// EBUSY error from the subsequent `rmdir`.
832 ///
833 /// # Outstanding-remove cap
834 ///
835 /// A churn workload (rapid create→remove cycles) may legitimately
836 /// race freeze/drain and see EBUSY/ENOENT on individual remove
837 /// calls. Each failed remove increments
838 /// [`Self::outstanding_removes`]; once the counter exceeds
839 /// `MAX_OUTSTANDING_REMOVES`, the next call returns Err
840 /// without attempting any filesystem writes — bounding the peak
841 /// resident cgroup leak to that cap regardless of how long the
842 /// scenario runs. Successful removes decrement the counter, so a
843 /// transient stall that eventually clears (e.g. RCU drain
844 /// catches up between iterations) does not strand the manager
845 /// in the bailed state.
846 ///
847 /// A `name` whose directory does not exist returns `Ok(())`
848 /// without touching the counter — the cgroup was already
849 /// reaped (e.g. by [`Self::cleanup_all`] or a prior remove),
850 /// so it is not "outstanding".
851 pub fn remove_cgroup(&self, name: &str) -> Result<()> {
852 validate_cgroup_name(name)?;
853 let outstanding = self.outstanding_removes.load(Ordering::Relaxed);
854 if outstanding > MAX_OUTSTANDING_REMOVES {
855 bail!(
856 "remove_cgroup '{name}' refused: {outstanding} cgroups outstanding \
857 (cap {MAX_OUTSTANDING_REMOVES}); cgroup.procs draining wedged or \
858 churn loop outpacing the kernel's RCU grace period — bailing to \
859 avoid unbounded cgroupfs accumulation"
860 );
861 }
862 let p = self.parent.join(name);
863 if !p.exists() {
864 return Ok(());
865 }
866 match self.remove_cgroup_inner(name, &p) {
867 Ok(()) => {
868 // Successful remove: decrement (saturating at 0 so a
869 // remove of a cgroup we never failed-to-remove does
870 // not underflow the counter into usize::MAX).
871 self.outstanding_removes
872 .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |n| {
873 Some(n.saturating_sub(1))
874 })
875 .ok();
876 Ok(())
877 }
878 Err(err) => {
879 self.outstanding_removes.fetch_add(1, Ordering::Relaxed);
880 Err(err)
881 }
882 }
883 }
884
885 /// Inner body of [`Self::remove_cgroup`] — exists so the public
886 /// method can wrap the unfreeze/drain/rmdir result in the
887 /// outstanding-counter bookkeeping without duplicating the
888 /// sequence in success and failure arms.
889 ///
890 /// Gates the pre-drain unfreeze on `cgroup.freeze` existence to
891 /// match [`cleanup_recursive`]'s same-file gate. `set_freeze`
892 /// goes through `fs::write` which CREATES the file when it does
893 /// not exist (open(O_WRONLY | O_CREAT | O_TRUNC)), so an
894 /// unconditional call would plant a stray 1-byte file under any
895 /// non-cgroupfs directory and cause the subsequent
896 /// `fs::remove_dir(p)` to fail with ENOTEMPTY. On a real cgroup
897 /// v2 tree the file is always present (cgroup-core, ungated by
898 /// controllers); on a legacy kernel without `CONFIG_CGROUP_FREEZE`
899 /// or on a non-cgroup directory entry the file is absent and the
900 /// unfreeze step is a no-op.
901 fn remove_cgroup_inner(&self, name: &str, p: &Path) -> Result<()> {
902 if p.join("cgroup.freeze").exists()
903 && let Err(err) = self.set_freeze(name, false)
904 && anyhow_first_io_errno(&err) != Some(libc::ENOENT)
905 {
906 tracing::warn!(
907 cgroup = name,
908 err = %format!("{err:#}"),
909 "remove_cgroup: pre-drain unfreeze failed; drain may strand frozen tasks at root"
910 );
911 }
912 self.drain_tasks(name)?;
913 // Wait for the kernel to reflect the empty state via
914 // cgroup.events `populated 0` (event-driven via inotify on
915 // the events file) before attempting rmdir. The legacy
916 // 50 ms blind sleep was a hopeful settle: too short under
917 // load (rmdir EBUSY) and too long on a quiet host (wasted
918 // tens of ms × every cgroup teardown). Falls through to
919 // rmdir on deadline so the caller still sees the same
920 // EBUSY error if the cgroup is genuinely stuck-populated;
921 // 1 s ceiling matches the prior pessimistic upper bound on
922 // a settling cgroup.
923 wait_for_cgroup_unpopulated(p, std::time::Duration::from_secs(1));
924 fs::remove_dir(p).with_context(|| format!("rmdir {}", p.display()))
925 }
926
927 /// Write `cpuset.cpus` for a child cgroup.
928 ///
929 /// On write failure, captures and emits a snapshot of the
930 /// cgroup-tree state at the moment of failure: the parent's
931 /// `cgroup.controllers` (controllers AVAILABLE to children),
932 /// the parent's `cgroup.subtree_control` (controllers ENABLED
933 /// for children), the child's `cgroup.controllers` (the
934 /// inheritance ROOT for children of the child), the
935 /// `cpuset.cpus` file's existence, and a directory listing of
936 /// the child cgroup's knob files. The capture lets a kernel /
937 /// hierarchy-state bug surface as a focused diagnostic instead
938 /// of a bare `EACCES` at the write site.
939 pub fn set_cpuset(&self, name: &str, cpus: &BTreeSet<usize>) -> Result<()> {
940 validate_cgroup_name(name)?;
941 let p = self.parent.join(name).join("cpuset.cpus");
942 match write_with_timeout(&p, &TestTopology::cpuset_string(cpus), CGROUP_WRITE_TIMEOUT) {
943 Ok(()) => Ok(()),
944 Err(e) => {
945 let snapshot = capture_cpuset_state(&self.parent, name);
946 Err(e.context(snapshot))
947 }
948 }
949 }
950
951 /// Enable `+cpuset` on `cgroup.subtree_control` for each ancestor
952 /// of the leaf in a nested cgroup path. For `"cg_0/narrow"`, writes
953 /// `+cpuset` to `{parent}/cgroup.subtree_control` and
954 /// `{parent}/cg_0/cgroup.subtree_control`. No-op for
955 /// single-component paths.
956 fn enable_subtree_cpuset(&self, name: &str) {
957 let components: Vec<&str> = name.split('/').collect();
958 if components.len() < 2 {
959 return;
960 }
961 let mut cur = self.parent.clone();
962 for c in &components[..components.len() - 1] {
963 let sc = cur.join("cgroup.subtree_control");
964 if sc.exists()
965 && let Err(e) = write_with_timeout(&sc, "+cpuset", CGROUP_WRITE_TIMEOUT)
966 {
967 tracing::warn!(path = %sc.display(), err = %e, "failed to enable cpuset");
968 }
969 cur = cur.join(c);
970 }
971 // Write at the last intermediate (direct parent of the leaf).
972 let sc = cur.join("cgroup.subtree_control");
973 if sc.exists()
974 && let Err(e) = write_with_timeout(&sc, "+cpuset", CGROUP_WRITE_TIMEOUT)
975 {
976 tracing::warn!(path = %sc.display(), err = %e, "failed to enable cpuset");
977 }
978 }
979
980 /// Clear `cpuset.cpus` for a child cgroup (empty string = inherit parent).
981 pub fn clear_cpuset(&self, name: &str) -> Result<()> {
982 validate_cgroup_name(name)?;
983 let p = self.parent.join(name).join("cpuset.cpus");
984 write_with_timeout(&p, "", CGROUP_WRITE_TIMEOUT).with_context(|| {
985 format!("cgroup '{name}': clear cpuset.cpus (write empty string for inherit-parent)")
986 })
987 }
988
989 /// Write `cpuset.mems` for a child cgroup. Constrains which NUMA
990 /// nodes the cgroup's tasks can allocate memory on.
991 ///
992 /// Shape mirrors `set_cpuset` exactly — [`TestTopology::cpuset_string`]
993 /// range-compact-formats the node set, `write_with_timeout` bounds
994 /// the filesystem-write at 2s. Used by `BuildSandbox` under the
995 /// `--cpu-cap` flow to bind build memory to the NUMA nodes hosting
996 /// the locked LLCs, avoiding cross-socket DRAM latency for gcc's
997 /// symbol tables and linker working sets.
998 ///
999 /// # Ordering contract
1000 ///
1001 /// Caller MUST have already called [`Self::set_cpuset`] (or
1002 /// equivalent direct write to `cpuset.cpus`) and — when running
1003 /// under a parent that may narrow the set — MUST have read back
1004 /// `cpuset.cpus.effective` to detect kernel-side narrowing
1005 /// BEFORE invoking this method. The per-knob ordering is
1006 /// load-bearing: `crate::vmm::cgroup_sandbox::BuildSandbox`
1007 /// interleaves `cpuset.cpus.effective` readback between the
1008 /// `cpuset.cpus` and `cpuset.mems` writes to abort on narrowing
1009 /// under the `--cpu-cap` hard-error contract; folding the two
1010 /// writes into a single helper would erase that gate.
1011 ///
1012 /// A cgroup whose `cpuset.cpus` is set should also have a
1013 /// non-empty `cpuset.mems.effective` before any task is migrated
1014 /// into it: the half-configured shape (cpus set locally, no
1015 /// nodemask anywhere up the hierarchy) is suspicious enough that
1016 /// the framework refuses it. The kernel itself does NOT
1017 /// SIGKILL on first allocation — `guarantee_online_mems`
1018 /// (`kernel/cgroup/cpuset.c`) walks UP via `parent_cs(cs)` until
1019 /// `effective_mems` intersects `node_states[N_MEMORY]`, and the
1020 /// top cpuset always has online memory, so the walk always finds
1021 /// a non-empty mask. The actual kernel behavior under a fully
1022 /// empty hierarchy is path-dependent (parent-walk fallback
1023 /// generally succeeds; degenerate states without any online
1024 /// memory may OOM). cgroup v2's `cpuset_can_attach_check` only
1025 /// rejects empty `effective_cpus`, not empty `effective_mems`.
1026 /// In cgroup v2, the local `cpuset.mems` file is normally empty
1027 /// (the cgroup inherits from its parent via `effective_mems`),
1028 /// so reading the local file alone would falsely flag every
1029 /// inheriting child. [`Self::move_task`] enforces the gate at
1030 /// runtime by reading the cgroup's `cpuset.cpus` and
1031 /// `cpuset.mems.effective` files before each migration and
1032 /// refusing the write if `cpuset.cpus` is non-empty while
1033 /// `cpuset.mems.effective` is empty — surfacing a focused
1034 /// error rather than letting a half-configured cgroup through
1035 /// to the kernel's path-dependent behavior.
1036 pub fn set_cpuset_mems(&self, name: &str, nodes: &BTreeSet<usize>) -> Result<()> {
1037 validate_cgroup_name(name)?;
1038 let p = self.parent.join(name).join("cpuset.mems");
1039 let nodes_str = TestTopology::cpuset_string(nodes);
1040 write_with_timeout(&p, &nodes_str, CGROUP_WRITE_TIMEOUT).with_context(|| {
1041 format!(
1042 "cgroup '{name}': set cpuset.mems='{nodes_str}' (requires +cpuset in parent cgroup.subtree_control)"
1043 )
1044 })
1045 }
1046
1047 /// Clear `cpuset.mems` for a child cgroup (empty string = inherit parent).
1048 /// Parallels `clear_cpuset`; callers use it only when tearing
1049 /// down a cpuset-restricted cgroup that needs to accept a
1050 /// fresh task binding with a different NUMA budget.
1051 pub fn clear_cpuset_mems(&self, name: &str) -> Result<()> {
1052 validate_cgroup_name(name)?;
1053 let p = self.parent.join(name).join("cpuset.mems");
1054 write_with_timeout(&p, "", CGROUP_WRITE_TIMEOUT).with_context(|| {
1055 format!("cgroup '{name}': clear cpuset.mems (write empty string for inherit-parent)")
1056 })
1057 }
1058
1059 /// Write `cpu.max` for a child cgroup. `quota_us = None` writes
1060 /// `"max <period_us>"` (no upper bound — same as a freshly
1061 /// created cgroup); `Some(q)` writes `"<q> <period_us>"`.
1062 ///
1063 /// Per the kernel's cgroup v2 docs ("Documentation/admin-guide/
1064 /// cgroup-v2.rst", "CPU Interface Files"): each period the
1065 /// cgroup gets `quota` microseconds of CPU time across its
1066 /// CPUs, and is throttled until the next period boundary once
1067 /// the quota is exhausted. `quota` MAY exceed `period` to let
1068 /// the cgroup use multiple CPUs concurrently (e.g. quota
1069 /// 200_000 / period 100_000 = up to 2 CPUs of throughput).
1070 ///
1071 /// Requires `+cpu` in the parent's `cgroup.subtree_control`;
1072 /// missing controller surfaces as ENOENT on the file (handled
1073 /// generically by `write_with_timeout`'s error path with the
1074 /// errno suffix).
1075 pub fn set_cpu_max(&self, name: &str, quota_us: Option<u64>, period_us: u64) -> Result<()> {
1076 validate_cgroup_name(name)?;
1077 let p = self.parent.join(name).join("cpu.max");
1078 let line = match quota_us {
1079 Some(q) => format!("{q} {period_us}"),
1080 None => format!("max {period_us}"),
1081 };
1082 write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1083 format!(
1084 "cgroup '{name}': set cpu.max='{line}' (requires +cpu in parent cgroup.subtree_control)"
1085 )
1086 })
1087 }
1088
1089 /// Write `cpu.weight` for a child cgroup (cgroup v2 weight,
1090 /// range 1..=10000, default 100). Used together with sibling
1091 /// cgroups to bias relative CPU share inside the parent's
1092 /// quota. Independent from `cpu.max` — weights govern share
1093 /// when CPU is contended, max enforces an absolute ceiling.
1094 ///
1095 /// Per "Documentation/admin-guide/cgroup-v2.rst" the legacy
1096 /// "shares" knob is `cpu.weight.nice` (mapped from nice value);
1097 /// this method targets the canonical `cpu.weight` knob.
1098 pub fn set_cpu_weight(&self, name: &str, weight: u32) -> Result<()> {
1099 validate_cgroup_name(name)?;
1100 let p = self.parent.join(name).join("cpu.weight");
1101 write_with_timeout(&p, &weight.to_string(), CGROUP_WRITE_TIMEOUT).with_context(|| {
1102 format!(
1103 "cgroup '{name}': set cpu.weight={weight} (requires +cpu in parent cgroup.subtree_control)"
1104 )
1105 })
1106 }
1107
1108 /// Write `memory.max` for a child cgroup. `bytes = None` writes
1109 /// `"max"` (no hard limit). When the cgroup's RSS exceeds the
1110 /// limit, the kernel OOM-kills tasks per the documented
1111 /// `memory.max` semantics. Requires `+memory` in the parent's
1112 /// `cgroup.subtree_control`.
1113 pub fn set_memory_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1114 validate_cgroup_name(name)?;
1115 let p = self.parent.join(name).join("memory.max");
1116 let line = match bytes {
1117 Some(b) => b.to_string(),
1118 None => "max".to_string(),
1119 };
1120 write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1121 format!(
1122 "cgroup '{name}': set memory.max='{line}' (requires +memory in parent cgroup.subtree_control)"
1123 )
1124 })
1125 }
1126
1127 /// Write `memory.high` for a child cgroup. `bytes = None`
1128 /// writes `"max"` (no high-water mark). Crossing the high
1129 /// threshold triggers reclaim throttling but NOT OOM-kill,
1130 /// distinguishing it from `memory.max`.
1131 pub fn set_memory_high(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1132 validate_cgroup_name(name)?;
1133 let p = self.parent.join(name).join("memory.high");
1134 let line = match bytes {
1135 Some(b) => b.to_string(),
1136 None => "max".to_string(),
1137 };
1138 write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1139 format!(
1140 "cgroup '{name}': set memory.high='{line}' (requires +memory in parent cgroup.subtree_control)"
1141 )
1142 })
1143 }
1144
1145 /// Write `memory.low` for a child cgroup. `bytes = None` writes
1146 /// `"0"` (no low-water protection). The kernel preferentially
1147 /// reclaims FROM other cgroups before reclaiming this cgroup's
1148 /// memory below `memory.low`; not a hard reservation.
1149 pub fn set_memory_low(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1150 validate_cgroup_name(name)?;
1151 let p = self.parent.join(name).join("memory.low");
1152 let line = match bytes {
1153 Some(b) => b.to_string(),
1154 None => "0".to_string(),
1155 };
1156 write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1157 format!(
1158 "cgroup '{name}': set memory.low='{line}' (requires +memory in parent cgroup.subtree_control)"
1159 )
1160 })
1161 }
1162
1163 /// Write `io.weight` for a child cgroup (cgroup v2 weight,
1164 /// range 1..=10000, default 100). Biases relative IO share
1165 /// across sibling cgroups when the io controller is enabled
1166 /// in the parent's `cgroup.subtree_control`. The kernel's BFQ
1167 /// or io.cost backend (whichever is active) applies the
1168 /// weight when contending devices are saturated.
1169 ///
1170 /// `io.max` (per-device throughput cap) is intentionally NOT
1171 /// surfaced here — the per-device interface needs major:minor
1172 /// device-id lookup which has no in-tree consumer; surface it
1173 /// when a concrete use case lands.
1174 pub fn set_io_weight(&self, name: &str, weight: u16) -> Result<()> {
1175 validate_cgroup_name(name)?;
1176 let p = self.parent.join(name).join("io.weight");
1177 write_with_timeout(&p, &weight.to_string(), CGROUP_WRITE_TIMEOUT).with_context(|| {
1178 format!(
1179 "cgroup '{name}': set io.weight={weight} (requires +io in parent cgroup.subtree_control)"
1180 )
1181 })
1182 }
1183
1184 /// Write `cgroup.freeze` for a child cgroup. `frozen = true` writes
1185 /// `"1"`, `frozen = false` writes `"0"`.
1186 ///
1187 /// `cgroup.freeze` is a cgroup-core file exposed on every non-root
1188 /// cgroup automatically — it is NOT gated by `cgroup.subtree_control`.
1189 /// The kernel's `cgroup_freeze_write` parses the value via
1190 /// `kstrtoint`, rejects anything outside `{0, 1}` with `-ERANGE`,
1191 /// and dispatches `cgroup_freeze(cgrp, freeze)`. Writing `1` to a
1192 /// cgroup containing tasks transitions every task in the subtree to
1193 /// the frozen state; writing `0` releases. The transition is
1194 /// asynchronous — `cgroup.events`'s `frozen` field reaches `1` once
1195 /// every task has parked.
1196 pub fn set_freeze(&self, name: &str, frozen: bool) -> Result<()> {
1197 validate_cgroup_name(name)?;
1198 let p = self.parent.join(name).join("cgroup.freeze");
1199 let line = if frozen { "1" } else { "0" };
1200 write_with_timeout(&p, line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1201 format!("cgroup '{name}': set cgroup.freeze='{line}' (cgroup-core file, no controller required)")
1202 })
1203 }
1204
1205 /// Write `pids.max` for a child cgroup. `max = None` writes `"max"`
1206 /// (the kernel's `PIDS_MAX_STR` sentinel for unlimited);
1207 /// `Some(n)` writes the decimal `n`.
1208 ///
1209 /// Per the kernel's `pids_max_write`: the parser short-circuits to
1210 /// the unlimited limit when `buf == PIDS_MAX_STR`; otherwise
1211 /// `kstrtoll(buf, 0, &limit)` parses a signed integer and rejects
1212 /// `< 0` or `>= PIDS_MAX` with `-EINVAL`. The update is atomic
1213 /// (`atomic64_set(&pids->limit, limit)`); existing tasks are NOT
1214 /// killed when the limit lands below the current task count — only
1215 /// future `fork()` / `clone()` calls are blocked.
1216 ///
1217 /// Requires `+pids` in the parent's `cgroup.subtree_control`;
1218 /// [`Self::setup`] enables it unconditionally so this write
1219 /// succeeds on every ktstr-managed cgroup tree.
1220 pub fn set_pids_max(&self, name: &str, max: Option<u64>) -> Result<()> {
1221 validate_cgroup_name(name)?;
1222 let p = self.parent.join(name).join("pids.max");
1223 let line = match max {
1224 Some(n) => n.to_string(),
1225 None => "max".to_string(),
1226 };
1227 write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1228 format!(
1229 "cgroup '{name}': set pids.max='{line}' (requires +pids in parent cgroup.subtree_control)"
1230 )
1231 })
1232 }
1233
1234 /// Write `memory.swap.max` for a child cgroup. `bytes = None` writes
1235 /// `"max"` (no swap cap); `Some(b)` writes the decimal byte count.
1236 ///
1237 /// Per the kernel's `swap_max_write`: the value is parsed via
1238 /// `page_counter_memparse(buf, "max", &max)`, which accepts the
1239 /// literal `"max"` token for unlimited or a numeric byte count.
1240 /// The store is `xchg(&memcg->swap.max, max)` — atomic, with no
1241 /// failure path beyond the parse.
1242 ///
1243 /// Requires `+memory` in the parent's `cgroup.subtree_control`;
1244 /// [`Self::setup`] enables it unconditionally.
1245 ///
1246 /// Requires CONFIG_SWAP=y in the test kernel. The file does not
1247 /// exist on swapless builds; the write returns ENOENT.
1248 pub fn set_memory_swap_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1249 validate_cgroup_name(name)?;
1250 let p = self.parent.join(name).join("memory.swap.max");
1251 let line = match bytes {
1252 Some(b) => b.to_string(),
1253 None => "max".to_string(),
1254 };
1255 write_with_timeout(&p, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1256 format!(
1257 "cgroup '{name}': set memory.swap.max='{line}' (requires +memory in parent cgroup.subtree_control; file absent on CONFIG_SWAP=n kernels)"
1258 )
1259 })
1260 }
1261
1262 /// Move a single task into a child cgroup via `cgroup.procs`.
1263 ///
1264 /// `move_task` is host-side scenario orchestration, never
1265 /// invoked from a vCPU thread, so the bare `fs::read_to_string`
1266 /// reads in `Self::check_cpuset_ordering` are not bounded by
1267 /// the freeze-rendezvous timeout. A wedged cgroupfs read here
1268 /// would stall the orchestrator thread, not a vCPU.
1269 ///
1270 /// # cpuset ordering gate
1271 ///
1272 /// Before issuing the `cgroup.procs` write, the method reads the
1273 /// destination's `cpuset.cpus` (the local-write knob the caller
1274 /// either set or did not) and `cpuset.mems.effective` (the
1275 /// kernel's effective view, inheritance-aware). The gate
1276 /// refuses migrations into a cgroup whose `cpuset.cpus` is set
1277 /// but `cpuset.mems.effective` reads empty — a half-configured
1278 /// state we surface as a focused error rather than letting it
1279 /// through to the kernel.
1280 ///
1281 /// The kernel's behavior on the half-configured shape is
1282 /// path-dependent: `guarantee_online_mems`
1283 /// (`kernel/cgroup/cpuset.c`) walks UP via `parent_cs(cs)`
1284 /// until `effective_mems` intersects `node_states[N_MEMORY]`,
1285 /// and the top cpuset always has online memory, so the walk
1286 /// generally succeeds; the empty-nodemask OOM path is reachable
1287 /// only in degenerate hierarchies. cgroup v2's
1288 /// `cpuset_can_attach_check` rejects only empty `effective_cpus`
1289 /// (not empty `effective_mems`), so a v2 attach into a cgroup
1290 /// with empty `effective_mems` is not a hard kernel error
1291 /// either. The framework refuses the migration anyway because
1292 /// the half-configured shape almost always reflects a missing
1293 /// [`Self::set_cpuset_mems`] call; surfacing it directly is
1294 /// more debuggable than letting it become whatever the kernel
1295 /// happens to do on this particular hierarchy.
1296 ///
1297 /// # Why `cpuset.mems.effective`, not `cpuset.mems`
1298 ///
1299 /// In cgroup v2, the local `cpuset.mems` file echoes
1300 /// `cs->mems_allowed` — the LOCAL nodemask, which is empty by
1301 /// default until the caller explicitly writes it. The kernel's
1302 /// allocation path uses `cs->effective_mems` instead, which
1303 /// inherits from the parent when the local mask is empty (per
1304 /// `cpuset_common_seq_show`'s FILE_EFFECTIVE_MEMLIST branch and
1305 /// `guarantee_online_mems`'s `parent_cs(cs)` walk). A gate that
1306 /// reads the local file would falsely flag every inheriting
1307 /// child as half-configured even though the kernel sees a
1308 /// perfectly valid `effective_mems` from the parent. The
1309 /// effective view captures both "this cgroup wrote `cpuset.mems`
1310 /// directly" and "this cgroup inherits a non-empty mask from
1311 /// its parent" without false positives.
1312 ///
1313 /// Both reads are best-effort — a cgroup without cpuset
1314 /// controllers (`cpuset.cpus` does not exist) bypasses the
1315 /// gate, matching the kernel's "no cpuset constraints to
1316 /// enforce" path. Read errors on either knob are absorbed: the
1317 /// gate exists to catch the configured-but-half-configured
1318 /// shape, not to fight cgroupfs read failures. If
1319 /// `cpuset.mems.effective` cannot be read for any reason, the
1320 /// gate degrades to "accept" — it cannot make a sound decision
1321 /// without the kernel's effective view.
1322 pub fn move_task(&self, name: &str, pid: libc::pid_t) -> Result<()> {
1323 validate_cgroup_name(name)?;
1324 self.check_cpuset_ordering(name)?;
1325 let p = self.parent.join(name).join("cgroup.procs");
1326 write_with_timeout(&p, &pid.to_string(), CGROUP_WRITE_TIMEOUT)
1327 }
1328
1329 /// Verify that a cgroup's `cpuset.cpus` /
1330 /// `cpuset.mems.effective` are in a consistent state before
1331 /// admitting a task migration into it.
1332 ///
1333 /// Returns `Err` only when the destination has `cpuset.cpus`
1334 /// non-empty AND `cpuset.mems.effective` reads empty — a
1335 /// half-configured shape we surface as a focused error rather
1336 /// than letting through. The kernel's behavior in this state is
1337 /// path-dependent: `guarantee_online_mems` (`kernel/cgroup/
1338 /// cpuset.c`) walks UP via `parent_cs(cs)` until effective_mems
1339 /// intersects `node_states[N_MEMORY]` and the top cpuset always
1340 /// has online memory, so the parent-walk fallback usually
1341 /// succeeds; degenerate hierarchies may OOM. cgroup v2's
1342 /// `cpuset_can_attach_check` rejects only empty `effective_cpus`,
1343 /// not empty `effective_mems`. All other shapes (no cpuset
1344 /// controller, local cpus empty, effective mems non-empty
1345 /// whether locally written or parent-inherited) are accepted.
1346 ///
1347 /// Read failures on either knob are absorbed (the gate degrades
1348 /// to "accept" rather than blocking on any cgroupfs read
1349 /// error). The effective-view file is the source of truth
1350 /// because in cgroup v2 the local `cpuset.mems` is normally
1351 /// empty (the cgroup inherits from its parent via
1352 /// `effective_mems`); reading the local file would emit false
1353 /// positives for every child that inherits a parent's NUMA
1354 /// budget without writing its own.
1355 fn check_cpuset_ordering(&self, name: &str) -> Result<()> {
1356 let cpus_path = self.parent.join(name).join("cpuset.cpus");
1357 let mems_effective_path = self.parent.join(name).join("cpuset.mems.effective");
1358 let cpus = match fs::read_to_string(&cpus_path) {
1359 Ok(s) => s,
1360 Err(_) => return Ok(()),
1361 };
1362 // `cpuset.cpus` is empty when the cgroup inherits from its
1363 // parent — no constraint imposed locally, so the
1364 // `cpuset.mems` invariant doesn't apply.
1365 if cpus.trim().is_empty() {
1366 return Ok(());
1367 }
1368 let mems_effective = match fs::read_to_string(&mems_effective_path) {
1369 Ok(s) => s,
1370 Err(_) => return Ok(()),
1371 };
1372 if mems_effective.trim().is_empty() {
1373 bail!(
1374 "move_task into '{name}' refused: cpuset.cpus is set ({}) \
1375 but cpuset.mems.effective reads empty — half-configured \
1376 cgroup. The kernel's behavior here is path-dependent \
1377 (guarantee_online_mems walks up to find a non-empty \
1378 ancestor mask; the empty-nodemask OOM path is reachable \
1379 only in degenerate hierarchies), but the framework \
1380 surfaces a focused error rather than letting the \
1381 migration through. Call set_cpuset_mems on this cgroup \
1382 or widen an ancestor's cpuset.mems before move_task",
1383 cpus.trim(),
1384 );
1385 }
1386 Ok(())
1387 }
1388
1389 /// Write `child_pid` to `<cgroup_name>/cgroup.procs` during the
1390 /// payload-spawn cgroup-sync handshake.
1391 ///
1392 /// Distinct from [`Self::move_task`]: this is the
1393 /// placement-before-exec write that runs while the child is
1394 /// paused in pre_exec between `fork(2)` and `execve(2)`. The
1395 /// `move_task` cpuset-ordering gate does NOT apply here —
1396 /// placement runs before cpuset is finalised at scenario setup
1397 /// time, and the gate would reject otherwise-valid spawn
1398 /// requests. Callers that need the gate (post-spawn migration)
1399 /// invoke [`Self::move_task`] / [`Self::move_tasks`] instead.
1400 ///
1401 /// Uses the same `write_with_timeout` shape as the other
1402 /// `cgroup.procs` write sites so a wedged cgroupfs is bounded
1403 /// to `CGROUP_WRITE_TIMEOUT` rather than blocking the parent
1404 /// indefinitely.
1405 pub fn place_task_during_handshake(
1406 &self,
1407 cgroup_name: &str,
1408 child_pid: libc::pid_t,
1409 ) -> Result<()> {
1410 validate_cgroup_name(cgroup_name)?;
1411 let cgroup_procs_path = self.parent.join(cgroup_name).join("cgroup.procs");
1412 let line = format!("{child_pid}\n");
1413 write_with_timeout(&cgroup_procs_path, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
1414 format!(
1415 "place pid {child_pid} into cgroup '{cgroup_name}' via {} during cgroup-sync handshake",
1416 cgroup_procs_path.display(),
1417 )
1418 })
1419 }
1420
1421 /// Move multiple tasks into a child cgroup by PID.
1422 ///
1423 /// Tolerates per-pid ESRCH (a task that exited between the listing
1424 /// snapshot and the migration write) and logs a warn for each
1425 /// vanished pid — partial migration is a legitimate outcome when
1426 /// one of N workers has voluntarily exited. Retries EBUSY up to
1427 /// 3 times with 100ms backoff for transient rejections from
1428 /// sched_ext BPF `cgroup_prep_move` callbacks
1429 /// (`scx_cgroup_can_attach`). Propagates EBUSY after retries
1430 /// exhausted. Propagates all other errors immediately.
1431 ///
1432 /// # All-vanished bail
1433 ///
1434 /// When `pids` is non-empty AND every supplied pid ESRCH'd, this
1435 /// fn bails with an actionable diagnostic rather than silently
1436 /// returning Ok. The silent-Ok path violates the project's
1437 /// no-silent-drops rule (any data loss must fail loudly):
1438 /// a downstream consumer reading the destination
1439 /// `cgroup.procs` would see 0 pids and have no idea whether
1440 /// the migration was supposed to move 0 or N — masking a real
1441 /// test-setup regression (e.g. `WorkloadHandle::spawn` child
1442 /// pre_exec init-panic cascade that killed every paused worker
1443 /// before move_tasks ran) behind a downstream-state empty-read.
1444 ///
1445 /// A test that LEGITIMATELY moves only already-exited workers
1446 /// (post-Drop diagnostic, post-mortem capture) should pass an
1447 /// empty `pids` slice rather than calling with non-empty + all
1448 /// pre-vanished — the empty-slice path is the documented "no
1449 /// move requested" form that returns Ok cleanly.
1450 pub fn move_tasks(&self, name: &str, pids: &[libc::pid_t]) -> Result<()> {
1451 validate_cgroup_name(name)?;
1452 move_tasks_inner(name, pids, |n, pid| self.move_task_with_retry(n, pid))
1453 }
1454
1455 /// Move a single task with bounded EBUSY retry.
1456 fn move_task_with_retry(&self, name: &str, pid: libc::pid_t) -> Result<()> {
1457 const MAX_RETRIES: u32 = 3;
1458 const RETRY_DELAY: Duration = Duration::from_millis(100);
1459
1460 for attempt in 0..MAX_RETRIES {
1461 match self.move_task(name, pid) {
1462 Ok(()) => return Ok(()),
1463 Err(e) if is_ebusy(&e) && attempt + 1 < MAX_RETRIES => {
1464 tracing::debug!(
1465 pid,
1466 cgroup = name,
1467 attempt = attempt + 1,
1468 "EBUSY on cgroup.procs write, retrying"
1469 );
1470 std::thread::sleep(RETRY_DELAY);
1471 }
1472 Err(e) => return Err(e),
1473 }
1474 }
1475 unreachable!()
1476 }
1477
1478 /// Clear `subtree_control` on a child cgroup by writing an empty
1479 /// string. Disables all controllers for the cgroup's children.
1480 ///
1481 /// Required before moving tasks into a cgroup that has
1482 /// `subtree_control` set: the kernel's no-internal-process
1483 /// constraint (`cgroup_migrate_vet_dst`) returns EBUSY when
1484 /// tasks are written to `cgroup.procs` of a cgroup with
1485 /// controllers in `subtree_control`.
1486 pub fn clear_subtree_control(&self, name: &str) -> Result<()> {
1487 validate_cgroup_name(name)?;
1488 let p = self.parent.join(name).join("cgroup.subtree_control");
1489 if !p.exists() {
1490 return Ok(());
1491 }
1492 // Read current controllers and disable each one.
1493 let content = fs::read_to_string(&p).with_context(|| format!("read {}", p.display()))?;
1494 let content = content.trim();
1495 if content.is_empty() {
1496 return Ok(());
1497 }
1498 // Each controller name needs a "-" prefix to disable.
1499 let disable: Vec<String> = content
1500 .split_whitespace()
1501 .map(|c| format!("-{c}"))
1502 .collect();
1503 let disable_str = disable.join(" ");
1504 write_with_timeout(&p, &disable_str, CGROUP_WRITE_TIMEOUT)
1505 .with_context(|| format!("clear subtree_control on {name}"))
1506 }
1507
1508 /// Move all tasks from a child cgroup to the walk-root cgroup.
1509 ///
1510 /// Drains to `{self.walk_root}/cgroup.procs` instead of the
1511 /// parent because the parent has `subtree_control` set (enabling
1512 /// cpuset for children), and the kernel's no-internal-process
1513 /// constraint rejects writes to `cgroup.procs` when
1514 /// `subtree_control` is active. The walk-root cgroup is the
1515 /// uppermost cgroup the operator can write to without crossing
1516 /// the delegation boundary; under Mode A it is the canonical
1517 /// `/sys/fs/cgroup` root (exempt from the no-internal-process
1518 /// constraint), under Mode B/C it is the delegated subtree root
1519 /// (which also has procs-writability inside the delegation).
1520 pub fn drain_tasks(&self, name: &str) -> Result<()> {
1521 validate_cgroup_name(name)?;
1522 let src = self.parent.join(name).join("cgroup.procs");
1523 if !src.exists() {
1524 return Ok(());
1525 }
1526 let dst = self.walk_root.join("cgroup.procs");
1527 drain_pids_to_root(&src, &dst, name);
1528 Ok(())
1529 }
1530
1531 /// Read `cgroup.procs` of `name`, returning the thread-group
1532 /// leaders (PIDs) currently in the cgroup.
1533 ///
1534 /// Distinct from [`Self::drain_tasks`]:
1535 /// - `drain_tasks` MIGRATES tasks to the walk-root and treats a
1536 /// missing `cgroup.procs` file as a no-op (`Ok(())`) so
1537 /// best-effort teardown of an already-rmdir'd cgroup is safe.
1538 /// - `read_procs` is a READ accessor for assertions
1539 /// ([`Op::CaptureCgroupProcs`](crate::scenario::ops::Op::CaptureCgroupProcs)
1540 /// and direct callers). A missing `cgroup.procs` file is a
1541 /// real error (cgroup doesn't exist, typo'd name, race with
1542 /// teardown) — propagating it lets the caller distinguish
1543 /// "empty cgroup" from "no such cgroup."
1544 ///
1545 /// # Semantics
1546 ///
1547 /// - Returns thread-group leaders (PIDs / TGIDs) as the kernel
1548 /// exposes them via `cgroup_procs_show` in `kernel/cgroup/cgroup.c`.
1549 /// For per-thread TIDs the kernel exposes `cgroup.threads`; this
1550 /// method reads ONLY `cgroup.procs`.
1551 /// - Non-atomic snapshot as exposed by the kernel's pidlist
1552 /// iteration (`cgroup_procs_show` / `css_task_iter_next` in
1553 /// `kernel/cgroup/cgroup.c`): the kernel walks the css_set's
1554 /// task list one entry at a time, so a task that joins or exits
1555 /// mid-read can appear in the next read but not this one (or
1556 /// vice versa). The userspace `fs::read_to_string` here returns
1557 /// when seq_file signals EOF; the per-pid atomicity is a kernel
1558 /// property, not an impl one. Callers asserting on membership
1559 /// of a stable task set (e.g. SpinWait workers spawned in the
1560 /// prior op) are unaffected.
1561 /// - Empty cgroup: returns `Ok(Vec::new())` (kernel emits an
1562 /// empty file, not an error). Lets callers distinguish "no
1563 /// tasks" from "no such cgroup."
1564 /// - Malformed pid lines: skipped with a `tracing::warn!`
1565 /// naming the offending line, matching
1566 /// `drain_pids_to_root`'s tolerance. The kernel never emits
1567 /// such lines today; the tolerance exists so a future kernel
1568 /// gaining a header or comment line surfaces as a warn
1569 /// instead of an opaque parse error.
1570 pub fn read_procs(&self, name: &str) -> Result<Vec<libc::pid_t>> {
1571 validate_cgroup_name(name)?;
1572 let procs_path = self.parent.join(name).join("cgroup.procs");
1573 let content = fs::read_to_string(&procs_path).with_context(|| {
1574 format!(
1575 "read cgroup.procs from '{}' (cgroup name '{name}'); the cgroup may not \
1576 exist or may have been removed (check that `Op::AddCgroup(name)` or a \
1577 `CgroupDef` covers this name, and that the test's `workload_root_cgroup` \
1578 is correct)",
1579 procs_path.display(),
1580 )
1581 })?;
1582 let mut pids = Vec::new();
1583 for line in content.lines() {
1584 let trimmed = line.trim();
1585 if trimmed.is_empty() {
1586 continue;
1587 }
1588 match trimmed.parse::<libc::pid_t>() {
1589 Ok(pid) => pids.push(pid),
1590 Err(e) => {
1591 tracing::warn!(
1592 path = %procs_path.display(),
1593 cgroup = name,
1594 line = trimmed,
1595 err = %e,
1596 "read_procs: malformed pid line; skipping",
1597 );
1598 }
1599 }
1600 }
1601 Ok(pids)
1602 }
1603
1604 /// Remove all child cgroups under the parent (keeps the parent itself).
1605 ///
1606 /// Returns `Ok` even when individual filesystem probes fail; callers
1607 /// treat cleanup as best-effort teardown (see the runner's warn-
1608 /// and-continue in `src/runner.rs`). Per-entry `read_dir` /
1609 /// `DirEntry` / `file_type` errors are surfaced via
1610 /// `tracing::warn!` — mirrors `CgroupGroup::drop` so a failure
1611 /// shows up in logs instead of silently leaving children behind.
1612 ///
1613 /// # Outer-read_dir failure semantic
1614 ///
1615 /// When `read_dir(self.parent)` itself fails — e.g. the parent
1616 /// directory is unreadable, the cgroup mount has been unmounted
1617 /// out from under us, or a stat-side IO error fires — the
1618 /// failure is surfaced via `tracing::warn!` and the function
1619 /// still returns `Ok(())`. The deliberate semantic here is
1620 /// "teardown that observes a hostile filesystem state must
1621 /// not block scenario completion": a hard `Err` would propagate
1622 /// up through the runner's teardown and abort the whole test
1623 /// run on a transient cgroupfs failure that the operator can
1624 /// follow up on by reading the warn line.
1625 ///
1626 /// Production callers (the runner's drop path, scenario teardown)
1627 /// already log-and-continue on `cleanup_all` errors, so the
1628 /// always-Ok return is consistent with how every consumer
1629 /// already treats the result. Operators who need to detect
1630 /// teardown leakage should grep `tracing` output for
1631 /// `"cleanup_all: read_dir failed"` rather than relying on a
1632 /// non-zero exit; the warn includes both the offending path and
1633 /// the underlying io::Error.
1634 pub fn cleanup_all(&self) -> Result<()> {
1635 if !self.parent.exists() {
1636 return Ok(());
1637 }
1638 let walk_root = self.walk_root.clone();
1639 if let Err(err) = for_each_child_dir(&self.parent, "cleanup_all", |p| {
1640 cleanup_recursive(p, &walk_root)
1641 }) {
1642 tracing::warn!(
1643 parent = %self.parent.display(),
1644 err = %err,
1645 "cleanup_all: read_dir failed; child cgroups may remain under parent",
1646 );
1647 }
1648 Ok(())
1649 }
1650}
1651
/// Abstraction over the cgroup v2 filesystem surface used by the
/// scenario runtime. The production implementation is [`CgroupManager`],
/// which translates each method into real writes under `/sys/fs/cgroup`.
///
/// Extracted so `scenario::ops::apply_setup` and related orchestration
/// code can be unit-tested against an in-memory double: tests construct
/// a recording or failure-injecting implementor, drive `apply_setup`
/// against it, and assert on the recorded call sequence without
/// touching the host cgroup hierarchy.
///
/// Object-safe by design — scenario code holds the trait object behind
/// `&dyn CgroupOps` rather than being generic. Callers keep writing
/// `ctx.cgroups.set_cpuset(...)` with no syntactic change; dynamic
/// dispatch resolves to `CgroupManager` in production and to the
/// test double under `#[cfg(test)]`. The per-call indirect-call cost
/// is dominated by the filesystem I/O the trait abstracts over.
///
/// Every `name` / `cgroup_name` argument is a child-cgroup name
/// relative to [`Self::parent_path`], not an absolute path.
pub trait CgroupOps {
    /// Path to the parent cgroup directory. See
    /// [`CgroupManager::parent_path`].
    fn parent_path(&self) -> &Path;
    /// Create the parent directory and enable controllers. See
    /// [`CgroupManager::setup`].
    fn setup(&self, controllers: &BTreeSet<Controller>) -> Result<()>;
    /// Create a child cgroup. See [`CgroupManager::create_cgroup`].
    fn create_cgroup(&self, name: &str) -> Result<()>;
    /// Drain and remove a child cgroup. See
    /// [`CgroupManager::remove_cgroup`].
    fn remove_cgroup(&self, name: &str) -> Result<()>;
    /// Write `cpuset.cpus`. See [`CgroupManager::set_cpuset`].
    fn set_cpuset(&self, name: &str, cpus: &BTreeSet<usize>) -> Result<()>;
    /// Clear `cpuset.cpus` (inherit from parent). See
    /// [`CgroupManager::clear_cpuset`].
    fn clear_cpuset(&self, name: &str) -> Result<()>;
    /// Write `cpuset.mems`. See [`CgroupManager::set_cpuset_mems`].
    fn set_cpuset_mems(&self, name: &str, nodes: &BTreeSet<usize>) -> Result<()>;
    /// Clear `cpuset.mems` (inherit from parent). See
    /// [`CgroupManager::clear_cpuset_mems`].
    fn clear_cpuset_mems(&self, name: &str) -> Result<()>;
    /// Write `cpu.max`. See [`CgroupManager::set_cpu_max`].
    fn set_cpu_max(&self, name: &str, quota_us: Option<u64>, period_us: u64) -> Result<()>;
    /// Write `cpu.weight`. See [`CgroupManager::set_cpu_weight`].
    fn set_cpu_weight(&self, name: &str, weight: u32) -> Result<()>;
    /// Write `memory.max`. See [`CgroupManager::set_memory_max`].
    fn set_memory_max(&self, name: &str, bytes: Option<u64>) -> Result<()>;
    /// Write `memory.high`. See [`CgroupManager::set_memory_high`].
    fn set_memory_high(&self, name: &str, bytes: Option<u64>) -> Result<()>;
    /// Write `memory.low`. See [`CgroupManager::set_memory_low`].
    fn set_memory_low(&self, name: &str, bytes: Option<u64>) -> Result<()>;
    /// Write `io.weight`. See [`CgroupManager::set_io_weight`].
    fn set_io_weight(&self, name: &str, weight: u16) -> Result<()>;
    /// Write `cgroup.freeze`. See [`CgroupManager::set_freeze`].
    fn set_freeze(&self, name: &str, frozen: bool) -> Result<()>;
    /// Write `pids.max`. See [`CgroupManager::set_pids_max`].
    fn set_pids_max(&self, name: &str, max: Option<u64>) -> Result<()>;
    /// Write `memory.swap.max`. See
    /// [`CgroupManager::set_memory_swap_max`].
    fn set_memory_swap_max(&self, name: &str, bytes: Option<u64>) -> Result<()>;
    /// Move a single task via `cgroup.procs`. The production
    /// implementation applies its cpuset-ordering gate before the
    /// write. See [`CgroupManager::move_task`].
    fn move_task(&self, name: &str, pid: libc::pid_t) -> Result<()>;
    /// Move multiple tasks (tolerates ESRCH, retries EBUSY). See
    /// [`CgroupManager::move_tasks`].
    fn move_tasks(&self, name: &str, pids: &[libc::pid_t]) -> Result<()>;
    /// Place a single task into a child cgroup's `cgroup.procs`
    /// during the payload-spawn cgroup-sync handshake.
    ///
    /// Distinct from [`Self::move_task`] / [`Self::move_tasks`]:
    /// those run post-spawn for synthetic workers whose pids are
    /// already in their final cgroup-permissive state. This method
    /// runs INSIDE the two-pipe handshake between the child's
    /// pre_exec pid-notify and the parent's release-signal write,
    /// when the child is paused between `fork(2)` and `execve(2)`.
    /// The write MUST land BEFORE the release byte so the child's
    /// `execve` lands in the destination cgroup — this is the
    /// placement-before-exec invariant required to keep tasks like
    /// `Op::RunPayload { cgroup: Some(name), ... }` from briefly
    /// inheriting the parent's cgroup at exec time.
    ///
    /// # Caller contract
    ///
    /// - MUST be invoked exactly once during the handshake between
    ///   pid-notify and release-signal.
    /// - Failure MUST propagate to the caller, which is responsible
    ///   for dropping the release pipe to unblock the child with
    ///   EOF so it bails out of pre_exec rather than execve'ing
    ///   into an unspecified cgroup.
    /// - The `cgroup_name` argument is the user-facing name the
    ///   test author passed in `Op::RunPayload { cgroup: Some(name),
    ///   ... }` or `PayloadRun::in_cgroup(name)` — NOT a derived
    ///   absolute path. The implementation derives the
    ///   `cgroup.procs` path from this name plus its own
    ///   parent-path knowledge.
    ///
    /// See [`CgroupManager::place_task_during_handshake`].
    fn place_task_during_handshake(&self, cgroup_name: &str, child_pid: libc::pid_t) -> Result<()>;
    /// Clear `cgroup.subtree_control` on a child by disabling every
    /// controller currently enabled there. See
    /// [`CgroupManager::clear_subtree_control`].
    fn clear_subtree_control(&self, name: &str) -> Result<()>;
    /// Drain tasks from a child to the walk-root cgroup; the
    /// production implementation treats a missing `cgroup.procs` as
    /// a no-op. See [`CgroupManager::drain_tasks`].
    fn drain_tasks(&self, name: &str) -> Result<()>;
    /// Read `cgroup.procs` of a child, returning thread-group leaders.
    /// Unlike [`Self::drain_tasks`], the production implementation
    /// reports a missing file as an error. See
    /// [`CgroupManager::read_procs`].
    fn read_procs(&self, name: &str) -> Result<Vec<libc::pid_t>>;
    /// Remove all child cgroups under the parent (best-effort
    /// teardown; the parent itself is kept). See
    /// [`CgroupManager::cleanup_all`].
    fn cleanup_all(&self) -> Result<()>;
}
1760
1761// Thin forwarding trait impl: inherent `CgroupManager` methods hold the
1762// real bodies; this trait impl exists so scenario code can hold
1763// `&dyn CgroupOps` for test-double injection without threading a generic
1764// through every caller. Trait default methods cannot access the private
1765// fields, and macro-generated delegation would lose Go-To-Definition.
1766impl CgroupOps for CgroupManager {
1767 fn parent_path(&self) -> &Path {
1768 CgroupManager::parent_path(self)
1769 }
1770 fn setup(&self, controllers: &BTreeSet<Controller>) -> Result<()> {
1771 CgroupManager::setup(self, controllers)
1772 }
1773 fn create_cgroup(&self, name: &str) -> Result<()> {
1774 CgroupManager::create_cgroup(self, name)
1775 }
1776 fn remove_cgroup(&self, name: &str) -> Result<()> {
1777 CgroupManager::remove_cgroup(self, name)
1778 }
1779 fn set_cpuset(&self, name: &str, cpus: &BTreeSet<usize>) -> Result<()> {
1780 CgroupManager::set_cpuset(self, name, cpus)
1781 }
1782 fn clear_cpuset(&self, name: &str) -> Result<()> {
1783 CgroupManager::clear_cpuset(self, name)
1784 }
1785 fn set_cpuset_mems(&self, name: &str, nodes: &BTreeSet<usize>) -> Result<()> {
1786 CgroupManager::set_cpuset_mems(self, name, nodes)
1787 }
1788 fn clear_cpuset_mems(&self, name: &str) -> Result<()> {
1789 CgroupManager::clear_cpuset_mems(self, name)
1790 }
1791 fn set_cpu_max(&self, name: &str, quota_us: Option<u64>, period_us: u64) -> Result<()> {
1792 CgroupManager::set_cpu_max(self, name, quota_us, period_us)
1793 }
1794 fn set_cpu_weight(&self, name: &str, weight: u32) -> Result<()> {
1795 CgroupManager::set_cpu_weight(self, name, weight)
1796 }
1797 fn set_memory_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1798 CgroupManager::set_memory_max(self, name, bytes)
1799 }
1800 fn set_memory_high(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1801 CgroupManager::set_memory_high(self, name, bytes)
1802 }
1803 fn set_memory_low(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1804 CgroupManager::set_memory_low(self, name, bytes)
1805 }
1806 fn set_io_weight(&self, name: &str, weight: u16) -> Result<()> {
1807 CgroupManager::set_io_weight(self, name, weight)
1808 }
1809 fn set_freeze(&self, name: &str, frozen: bool) -> Result<()> {
1810 CgroupManager::set_freeze(self, name, frozen)
1811 }
1812 fn set_pids_max(&self, name: &str, max: Option<u64>) -> Result<()> {
1813 CgroupManager::set_pids_max(self, name, max)
1814 }
1815 fn set_memory_swap_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
1816 CgroupManager::set_memory_swap_max(self, name, bytes)
1817 }
1818 fn move_task(&self, name: &str, pid: libc::pid_t) -> Result<()> {
1819 CgroupManager::move_task(self, name, pid)
1820 }
1821 fn move_tasks(&self, name: &str, pids: &[libc::pid_t]) -> Result<()> {
1822 CgroupManager::move_tasks(self, name, pids)
1823 }
1824 fn place_task_during_handshake(&self, cgroup_name: &str, child_pid: libc::pid_t) -> Result<()> {
1825 CgroupManager::place_task_during_handshake(self, cgroup_name, child_pid)
1826 }
1827 fn clear_subtree_control(&self, name: &str) -> Result<()> {
1828 CgroupManager::clear_subtree_control(self, name)
1829 }
1830 fn drain_tasks(&self, name: &str) -> Result<()> {
1831 CgroupManager::drain_tasks(self, name)
1832 }
1833 fn read_procs(&self, name: &str) -> Result<Vec<libc::pid_t>> {
1834 CgroupManager::read_procs(self, name)
1835 }
1836 fn cleanup_all(&self) -> Result<()> {
1837 CgroupManager::cleanup_all(self)
1838 }
1839}
1840
1841/// Block until the cgroup at `cgroup_dir` reports `populated 0` via
1842/// its `cgroup.events` file, or until `budget` elapses. Event-driven
1843/// via inotify(IN_MODIFY) on the events file so the wait wakes on
1844/// the actual kernel state-transition write rather than a blind
1845/// sleep. Callers use the return value to decide whether to proceed
1846/// (cgroup empty — rmdir will succeed) or to fall through and let
1847/// the subsequent rmdir surface EBUSY for a genuinely-stuck cgroup.
1848///
1849/// Best-effort: a missing `cgroup.events` file (legacy kernels
1850/// without cgroup v2 events, non-cgroupfs paths threaded into this
1851/// helper by a test fixture, races where the parent dir was already
1852/// removed) returns `false` without waiting — the caller falls
1853/// through to its rmdir attempt which will surface the actual
1854/// error. inotify_init / add_watch failures degrade silently to a
1855/// short blocking sleep for the remaining budget.
1856fn wait_for_cgroup_unpopulated(cgroup_dir: &Path, budget: std::time::Duration) -> bool {
1857 use nix::poll::{PollFd, PollFlags, PollTimeout, poll};
1858 use nix::sys::inotify::{AddWatchFlags, InitFlags, Inotify};
1859 use std::os::unix::io::AsFd;
1860
1861 let events_path = cgroup_dir.join("cgroup.events");
1862 // Tight initial check so a cgroup that's already empty
1863 // (extremely common — most drain_tasks call sites finish
1864 // synchronously) returns immediately without setting up inotify
1865 // or sleeping.
1866 if cgroup_events_reports_unpopulated(&events_path) {
1867 return true;
1868 }
1869 let deadline = std::time::Instant::now() + budget;
1870 // Inotify on the events file. IN_MODIFY fires every time the
1871 // kernel updates the populated count (1 → 0 transition included).
1872 // IN_NONBLOCK so read_events returns EAGAIN when empty — we
1873 // drive wake-vs-timeout via poll(2).
1874 let inotify_result =
1875 Inotify::init(InitFlags::IN_CLOEXEC | InitFlags::IN_NONBLOCK).and_then(|i| {
1876 i.add_watch(&events_path, AddWatchFlags::IN_MODIFY)?;
1877 Ok(i)
1878 });
1879 loop {
1880 if cgroup_events_reports_unpopulated(&events_path) {
1881 return true;
1882 }
1883 let now = std::time::Instant::now();
1884 if now >= deadline {
1885 return false;
1886 }
1887 let remaining_ms = deadline
1888 .duration_since(now)
1889 .as_millis()
1890 .min(u16::MAX as u128) as u16;
1891 match inotify_result.as_ref() {
1892 Ok(inotify) => {
1893 let fd = inotify.as_fd();
1894 let mut pollfds = [PollFd::new(fd, PollFlags::POLLIN)];
1895 let _ = poll(&mut pollfds, PollTimeout::from(remaining_ms));
1896 let _ = inotify.read_events();
1897 }
1898 Err(_) => {
1899 // Inotify unavailable on this path (legacy kernel,
1900 // missing events file, transient race). Fall back
1901 // to a brief blocking sleep so the loop still makes
1902 // progress under the deadline.
1903 std::thread::sleep(
1904 std::time::Duration::from_millis(10).min(deadline.duration_since(now)),
1905 );
1906 }
1907 }
1908 }
1909}
1910
/// Report whether the `cgroup.events` file at `events_path` currently
/// contains a `populated 0` entry.
///
/// A line counts only when its whitespace-separated fields are
/// exactly `populated` followed by `0`. Any read error, a
/// `populated 1` line, or the absence of a `populated` line yields
/// `false`, so the caller can simply keep waiting. The events file is
/// a small (~50 byte) flat key/value listing; re-reading it in full
/// on every poll iteration is cheap and avoids stateful parsing edge
/// cases.
fn cgroup_events_reports_unpopulated(events_path: &Path) -> bool {
    let Ok(contents) = fs::read_to_string(events_path) else {
        return false;
    };
    contents.lines().any(|entry| {
        let mut fields = entry.split_whitespace();
        // Exactly two fields: the key and the value, nothing trailing.
        matches!(
            (fields.next(), fields.next(), fields.next()),
            (Some("populated"), Some("0"), None)
        )
    })
}
1924
1925/// Drain all tasks from `procs_path` to `dst` (the walk-root
1926/// `cgroup.procs`).
1927///
1928/// `dst` must be the `cgroup.procs` file at the cgroup-fs root the
1929/// caller is permitted to write to (under Mode A: `/sys/fs/cgroup`;
1930/// under Mode B/C: the delegated subtree root the operator owns).
1931/// The walk-root cgroup is exempt from (or above) the
1932/// no-internal-process constraint inside its delegation, so writes
1933/// to its `cgroup.procs` succeed even when intermediate cgroups have
1934/// `subtree_control` set.
1935///
1936/// ESRCH (task exited) is silently tolerated; other errors are
1937/// logged. A `read_to_string` failure or a malformed pid line is
1938/// surfaced via `tracing::warn!` — silently dropping either would
1939/// hide a cgroup that still contains tasks and send it into cleanup,
1940/// which then fails with EBUSY and compounds the confusion.
1941fn drain_pids_to_root(procs_path: &Path, dst: &Path, context: &str) {
1942 let content = match fs::read_to_string(procs_path) {
1943 Ok(c) => c,
1944 Err(e) => {
1945 tracing::warn!(
1946 path = %procs_path.display(),
1947 cgroup = context,
1948 err = %e,
1949 "drain_pids_to_root: read_to_string failed; tasks may remain in cgroup",
1950 );
1951 return;
1952 }
1953 };
1954 for line in content.lines() {
1955 let trimmed = line.trim();
1956 if trimmed.is_empty() {
1957 continue;
1958 }
1959 let pid: u32 = match trimmed.parse() {
1960 Ok(p) => p,
1961 Err(e) => {
1962 tracing::warn!(
1963 path = %procs_path.display(),
1964 cgroup = context,
1965 line = trimmed,
1966 err = %e,
1967 "drain_pids_to_root: malformed pid line; skipping",
1968 );
1969 continue;
1970 }
1971 };
1972 if let Err(e) = write_with_timeout(dst, &pid.to_string(), CGROUP_WRITE_TIMEOUT)
1973 && !is_esrch(&e)
1974 {
1975 tracing::warn!(pid, cgroup = context, err = %e, "failed to drain task");
1976 }
1977 }
1978}
1979
1980/// Iterate the direct child directories of `path`, calling `f` on each.
1981///
1982/// `context` is a short caller name (e.g. `"cleanup_all"`,
1983/// `"cleanup_recursive"`) that is prefixed into every per-entry
1984/// `tracing::warn!` message so operators grepping logs for
1985/// `"cleanup_all: "` still see both the outer read_dir failure (which
1986/// stays with the caller) and the per-entry `DirEntry` / `file_type`
1987/// warnings emitted here.
1988///
1989/// `read_dir` failure is surfaced to the caller via `Err`; the caller
1990/// owns the top-level warn message. Non-directory entries are skipped.
1991/// Per-entry errors are logged and the iteration continues.
1992///
1993/// The structured log field key is normalized to `path =` at this
1994/// boundary; `cleanup_all`'s outer warn still uses `parent =` for the
1995/// top-level read_dir failure since that warn is emitted by the
1996/// caller, not here.
1997fn for_each_child_dir(path: &Path, context: &str, mut f: impl FnMut(&Path)) -> std::io::Result<()> {
1998 for entry in fs::read_dir(path)? {
1999 let entry = match entry {
2000 Ok(e) => e,
2001 Err(err) => {
2002 tracing::warn!(
2003 path = %path.display(),
2004 err = %err,
2005 "{context}: dir entry read failed; skipping",
2006 );
2007 continue;
2008 }
2009 };
2010 match entry.file_type() {
2011 Ok(t) if t.is_dir() => f(&entry.path()),
2012 Ok(_) => {}
2013 Err(err) => tracing::warn!(
2014 path = %entry.path().display(),
2015 err = %err,
2016 "{context}: file_type read failed; skipping entry",
2017 ),
2018 }
2019 }
2020 Ok(())
2021}
2022
2023/// Depth-first removal of `path` and every descendant cgroup
2024/// directory. Drains each cgroup's pids to `{walk_root}/cgroup.procs`
2025/// before rmdir.
2026///
2027/// `walk_root` mirrors [`CgroupManager::walk_root`]: under Mode A it
2028/// is `/sys/fs/cgroup` (the canonical cgroup-v2 mount); under Mode
2029/// B/C it is the delegated subtree root the operator owns. Threaded
2030/// through the recursion so every descendant drain targets the
2031/// caller's writable root and never the canonical
2032/// `/sys/fs/cgroup/cgroup.procs` (which would EACCES under
2033/// delegation).
2034fn cleanup_recursive(path: &std::path::Path, walk_root: &Path) {
2035 // Depth-first: clean children before parent
2036 if let Err(err) = for_each_child_dir(path, "cleanup_recursive", |child| {
2037 cleanup_recursive(child, walk_root)
2038 }) {
2039 tracing::warn!(
2040 path = %path.display(),
2041 err = %err,
2042 "cleanup_recursive: read_dir failed; child cgroups may remain",
2043 );
2044 }
2045 // Auto-unfreeze before draining tasks. Mirrors
2046 // `CgroupManager::remove_cgroup`'s pre-drain unfreeze, but for
2047 // defense-in-depth and source-cgroup state hygiene rather than
2048 // for correctness: the kernel's `cgroup_freezer_migrate_task`
2049 // path DOES unfreeze tasks when they migrate to an unfrozen
2050 // destination (the cgroup root is always unfrozen), so frozen
2051 // tasks would not actually strand at the root. The explicit
2052 // pre-drain `cgroup.freeze=0` write is still worthwhile because
2053 // it (a) makes the source cgroup's transient state visible in
2054 // tracing / `cgroup.events` before the directory disappears,
2055 // (b) avoids a brief frozen-counter churn while migration
2056 // batches advance, and (c) makes the teardown path symmetric
2057 // with `remove_cgroup` so operators reading either function
2058 // see the same auto-unfreeze step.
2059 //
2060 // Gate on existence: `fs::write` on a regular filesystem
2061 // CREATES the file when it doesn't exist (open(O_WRONLY |
2062 // O_CREAT | O_TRUNC)), so unconditionally writing
2063 // `cgroup.freeze` would plant a stray 1-byte file under any
2064 // non-cgroupfs directory and cause the subsequent
2065 // `fs::remove_dir(path)` to fail with ENOTEMPTY. On a real
2066 // cgroup v2 tree the file is always present (cgroup-core,
2067 // ungated by controllers); on a legacy kernel without
2068 // `CONFIG_CGROUP_FREEZE` or on a non-cgroup directory entry
2069 // the file is absent and the unfreeze step is a no-op.
2070 let freeze_path = path.join("cgroup.freeze");
2071 if freeze_path.exists()
2072 && let Err(err) = write_with_timeout(&freeze_path, "0", CGROUP_WRITE_TIMEOUT)
2073 {
2074 tracing::warn!(
2075 path = %path.display(),
2076 err = %format!("{err:#}"),
2077 "cleanup_recursive: pre-drain unfreeze failed; source-cgroup state-hygiene step skipped",
2078 );
2079 }
2080 drain_pids_to_root(
2081 &path.join("cgroup.procs"),
2082 &walk_root.join("cgroup.procs"),
2083 &path.display().to_string(),
2084 );
2085 // Wait event-driven on cgroup.events `populated 0` rather than
2086 // a blind 10 ms sleep — see `wait_for_cgroup_unpopulated`'s doc
2087 // for the rationale. 1 s deadline matches `remove_cgroup_inner`.
2088 wait_for_cgroup_unpopulated(path, std::time::Duration::from_secs(1));
2089 if let Err(err) = fs::remove_dir(path) {
2090 tracing::warn!(
2091 path = %path.display(),
2092 err = %err,
2093 "cleanup_recursive: remove_dir failed; cgroup directory may remain",
2094 );
2095 }
2096}
2097
2098#[cfg(test)]
2099#[path = "cgroup_tests.rs"]
2100mod tests;