ktstr/test_support/
runtime.rs

1//! Runtime configuration primitives shared by `eval` and `probe`.
2//!
3//! `eval` calls `probe::attempt_auto_repro` from its failure path,
4//! so items shared between the two siblings live here to avoid a
5//! circular import chain. All items are `pub(crate)` and remain
6//! internal to `test_support`.
7
8use std::os::unix::fs::PermissionsExt;
9use std::path::{Path, PathBuf};
10use std::sync::OnceLock;
11use std::time::Duration;
12
13use super::entry::KtstrTestEntry;
14
/// Stable PathBuf for the process-owned config scratch directory.
///
/// Populated once by [`scratch_dir`] on first access and read back by
/// the `atexit` cleanup handler, which looks the path up via
/// `SCRATCH_PATH.get()`. Kept in a
/// separate `OnceLock` from the `TempDir` itself so the `atexit`
/// cleanup handler can read the path through `extern "C"` without
/// involving the `tempfile::TempDir` value (whose `Drop` would
/// otherwise never run — see the "leak bound" note on
/// [`scratch_dir`]).
static SCRATCH_PATH: OnceLock<PathBuf> = OnceLock::new();
24
25/// Process-owned scratch directory for all inline-config tempfile
26/// writes — both [`config_content_parts`] (in-VM eval path) and
27/// [`crate::export::export_test`]'s `config_content_addition`
28/// (host-side .run packaging path).
29///
30/// Created lazily on first access via `tempfile::Builder` with
31/// explicit `0o700` mode (overrides the crate default of umask-
32/// restricted `0o777`-via-`mkdir(2)`, which on a standard
33/// `umask=0o022` host yields `0o755` and would expose directory
34/// listings + filename predictability to other uids). The
35/// directory is a random-suffixed subdirectory of
36/// `std::env::temp_dir()`, owned by the current uid.
37///
38/// Both call sites share this single directory because the
39/// security and leak-bound properties are identical for both
40/// purposes, and a single `OnceLock` + single `atexit` handler is
41/// simpler than maintaining parallel scratch dirs that diverge
42/// silently. Filenames are independently prefixed at each call
43/// site (`ktstr-config-{hash:016x}.json` for the eval path,
44/// `ktstr-export-config-{hash:016x}-{basename}` for the export
45/// path) so the two purposes can be visually distinguished inside
46/// the same directory.
47///
48/// Two properties matter:
49///
50/// 1. **Symlink defense.** /tmp is sticky-bit world-writable, so an
51///    attacker can pre-plant a symlink at the predictable content-
52///    addressed path and have us write to wherever it points. A
53///    per-process 0o700 subdirectory blocks every cross-uid access
54///    mode (read, list, write, traverse); only our process can
55///    create or replace files inside it, which eliminates the
56///    symlink-attack surface for the tempfile-write path.
57///
58/// 2. **Leak bound.** Rust does NOT run `Drop` impls on values
59///    stored in `static` slots at process exit — so the
60///    `tempfile::TempDir`'s built-in cleanup would never fire here.
61///    Instead, the path is registered with `libc::atexit`
62///    (POSIX-spec process-exit handler) so a clean exit
63///    (`exit(3)`, fall-off-`main`) triggers
64///    [`std::fs::remove_dir_all`] on the directory. Crash, abort,
65///    SIGKILL, or panic-`abort` skip the atexit handler and leak
66///    the directory; the residual is bounded by the number of
67///    such ungraceful exits and the directory contents are
68///    text-sized config files. The tempdir's random suffix
69///    prevents collisions across runs, so accumulated leak dirs
70///    don't interfere with future runs.
71pub(crate) fn scratch_dir() -> &'static Path {
72    SCRATCH_PATH
73        .get_or_init(|| {
74            let td = tempfile::Builder::new()
75                .prefix("ktstr-config-")
76                .permissions(std::fs::Permissions::from_mode(0o700))
77                .tempdir()
78                .expect("create ktstr config scratch directory");
79            // `keep()` consumes the TempDir without running its
80            // Drop's cleanup (it flips the cleanup flag and returns
81            // the bare PathBuf we own). The atexit registration
82            // below takes over cleanup responsibility.
83            let path = td.keep();
84            // SAFETY: `cleanup_scratch_dir` has the required
85            // `extern "C" fn()` signature that `libc::atexit`
86            // accepts. The `unsafe` block here is required because
87            // `libc::atexit` itself is an `unsafe extern "C"` FFI
88            // call (the callback signature itself is plain
89            // `extern "C" fn()`, not `unsafe`). Registering more
90            // than once is the caller's responsibility;
91            // `OnceLock::get_or_init` guarantees this runs exactly
92            // once per process.
93            let rc = unsafe { libc::atexit(cleanup_scratch_dir) };
94            assert_eq!(
95                rc, 0,
96                "libc::atexit registration for ktstr config scratch dir failed"
97            );
98            path
99        })
100        .as_path()
101}
102
103/// Process-exit handler registered via `libc::atexit` by
104/// [`scratch_dir`] on first init. Removes the scratch directory and
105/// every config file inside it. Errors are ignored — by the time
106/// this runs the process is exiting and there is nowhere to surface
107/// a failure (no `eprintln!` ordering guarantees from inside an
108/// atexit handler, and panicking would be unsound across the C ABI
109/// boundary).
110extern "C" fn cleanup_scratch_dir() {
111    if let Some(path) = SCRATCH_PATH.get() {
112        let _ = std::fs::remove_dir_all(path);
113    }
114}
115
/// Reports whether verbose failure output is enabled, i.e. whether
/// `RUST_BACKTRACE` is exactly `"1"` or `"full"`. Any other value, an
/// unset variable, or a non-Unicode value all yield `false`.
///
/// When true, the full guest kernel console is appended to the
/// `--- diagnostics ---` section of a failed test, and auto-repro
/// forwards the repro VM's COM1/COM2 output to the host terminal in
/// real time. The scheduler-log and sched_ext-dump sections of a
/// failure are emitted regardless of this flag.
pub(crate) fn verbose() -> bool {
    matches!(
        std::env::var("RUST_BACKTRACE").as_deref(),
        Ok("1" | "full")
    )
}
128
129/// True when `KTSTR_NO_PERF_MODE` is set to a NON-EMPTY value.
130///
131/// Centralises the perf-mode-disabled check used by the dispatch
132/// gauntlet routes (`run_named_test`, `run_gauntlet_test`, and the
133/// verifier-cell listing route `list_verifier_cells_all`, all in
134/// `super::dispatch`) and the eval entry path
135/// (`super::eval::run_ktstr_test_inner_impl`). All four sites
136/// previously called `std::env::var("KTSTR_NO_PERF_MODE").is_ok()`
137/// directly, which returned true for `KTSTR_NO_PERF_MODE=` (empty
138/// string set, e.g. via `unset`/`set` interplay in CI shells, or a
139/// `--env KTSTR_NO_PERF_MODE` Docker pass-through with no value) —
140/// silently skipping every `performance_mode` test. Requiring a
141/// non-empty value matches operator intent ("set it to something to
142/// disable perf mode") and rejects the empty-string accident.
143///
144/// `cargo ktstr test --no-perf-mode` exports `KTSTR_NO_PERF_MODE=1`
145/// (a non-empty value), so the existing CLI surface is unaffected.
146pub(crate) fn no_perf_mode_active() -> bool {
147    std::env::var(crate::KTSTR_NO_PERF_MODE_ENV)
148        .map(|v| !v.is_empty())
149        .unwrap_or(false)
150}
151
152/// True when `KTSTR_BYPASS_LLC_LOCKS` is set to a NON-EMPTY value.
153///
154/// Centralises the bypass check used at 7 reader sites:
155/// `vmm/builder.rs:1199`, `cli/kernel_build/build.rs:102` +
156/// `:488` (the latter the inverse `!bypass_llc_locks_active()`
157/// form), `bin/cargo_ktstr/kernel/mod.rs:720`,
158/// `bin/cargo_ktstr/misc/shell.rs:181`, and `bin/ktstr.rs:652` +
159/// `:1267`. All sites previously spelled the same
160/// `.ok().is_some_and(|v| !v.is_empty())` inline; centralising
161/// eliminates the drift hazard and matches the
162/// `no_perf_mode_active` shape so the empty-string contract is
163/// uniformly enforced.
164///
165/// Set via `--bypass-llc-locks` CLI flag or
166/// `KTSTR_BYPASS_LLC_LOCKS=1` direct export. Empty
167/// (`KTSTR_BYPASS_LLC_LOCKS=` from a Docker `--env` pass-through
168/// without value) does NOT activate per the empty-as-unset
169/// contract — preventing a stray export from silently disabling
170/// LLC flock contention enforcement in CI.
171pub fn bypass_llc_locks_active() -> bool {
172    std::env::var(crate::KTSTR_BYPASS_LLC_LOCKS_ENV)
173        .ok()
174        .is_some_and(|v| !v.is_empty())
175}
176
177/// Effective no-perf-mode for a given test entry. The env override
178/// `KTSTR_NO_PERF_MODE` and the per-entry [`KtstrTestEntry::no_perf_mode`]
179/// attribute are OR'd: either source forces the no-perf path
180/// (cpuset/LLC locking still applies, but vCPU pinning, hugepages,
181/// NUMA mbind, RT scheduling, and KVM exit suppression are all
182/// skipped). The env override is the operator-level switch; the
183/// per-entry attribute lets a test author opt the test out
184/// permanently — e.g. tests that exercise wild virtual topologies
185/// the host hardware can't possibly satisfy under perf-mode pinning.
186pub(crate) fn no_perf_mode_for_entry(entry: &KtstrTestEntry) -> bool {
187    no_perf_mode_active() || entry.no_perf_mode
188}
189
190/// True when `KTSTR_PERF_ONLY` is set to a NON-EMPTY value.
191///
192/// Mirrors [`no_perf_mode_active`]'s empty-as-unset contract (see
193/// [`crate::KTSTR_PERF_ONLY_ENV`]): any non-empty value restricts the
194/// run to `performance_mode` tests, an empty value does not.
195/// Consulted at the dispatch named/gauntlet routes and the eval entry
196/// to skip non-perf entries before VM boot. Set by the mergebase
197/// perf-delta subcommand.
198pub(crate) fn perf_only_active() -> bool {
199    std::env::var(crate::KTSTR_PERF_ONLY_ENV)
200        .map(|v| !v.is_empty())
201        .unwrap_or(false)
202}
203
204/// Whether `perf_only_active()` requires SKIPPING this entry: perf-only
205/// mode is on and the entry is not a `performance_mode` test. A
206/// `performance_mode` entry is always kept (it is the selection
207/// target); every other entry is skipped so a perf-delta run measures
208/// only the perf-configured tests.
209pub(crate) fn perf_only_skips_entry(entry: &KtstrTestEntry) -> bool {
210    perf_only_active() && !entry.performance_mode
211}
212
213/// Derive initramfs archive path, host path, and guest path from a
214/// scheduler's `config_file`. Returns `None` when no config file is set.
215pub(crate) fn config_file_parts(entry: &KtstrTestEntry) -> Option<(String, PathBuf, String)> {
216    let config_path = entry.scheduler.config_file?;
217    let file_name = Path::new(config_path)
218        .file_name()
219        .and_then(|n| n.to_str())
220        .expect("config_file must have a valid filename");
221    let archive_path = format!("include-files/{file_name}");
222    let guest_path = format!("/include-files/{file_name}");
223    Some((archive_path, PathBuf::from(config_path), guest_path))
224}
225
226/// Stable u64 hash of arbitrary string content.
227///
228/// Used by the config-content tempfile path code, but suitable for
229/// any content-addressed naming site that needs determinism across
230/// rustc bumps.
231///
232/// Uses `siphasher::sip::SipHasher13::new_with_keys(0, 0)` rather
233/// than `std::collections::hash_map::DefaultHasher` because the std
234/// algorithm is explicitly unspecified across rustc versions (see
235/// workspace `Cargo.toml` for the dep-line rationale). The explicit
236/// `new_with_keys(0, 0)` form matches the project's other
237/// stable-hash sites (`src/test_support/sidecar/mod.rs`, `build.rs`)
238/// so a future audit of zero-keyed SipHasher13 callers finds every
239/// instance via one grep. Same content always produces the same u64
240/// across toolchain upgrades, so cached artifacts stay reproducible
241/// across machines and rustc bumps.
242pub(crate) fn content_hash(content: &str) -> u64 {
243    use std::hash::{Hash, Hasher};
244    let mut hasher = siphasher::sip::SipHasher13::new_with_keys(0, 0);
245    content.hash(&mut hasher);
246    hasher.finish()
247}
248
249/// Resolve inline config content into a temp file on disk, returning
250/// `(archive_path, host_path, guest_path, sched_args)` where
251/// `sched_args` are the CLI args derived from the scheduler's
252/// `config_file_def` arg template. Returns `None` when the entry has
253/// no `config_content`.
254pub(crate) fn config_content_parts(
255    entry: &KtstrTestEntry,
256) -> Option<(String, PathBuf, String, Vec<String>)> {
257    use std::io::Write as _;
258    let content = entry.config_content?;
259    let (arg_template, guest_path) = entry.scheduler.config_file_def?;
260    let archive_path = guest_path.trim_start_matches('/').to_string();
261    let hash = content_hash(content);
262    let dir = scratch_dir();
263    // Write to a uniquely-named scratch file, then atomic-rename to the
264    // canonical content-addressed path:
265    //   - Scratch acquisition via `NamedTempFile::new_in` uses
266    //     `mkstemp(3)` semantics: random suffix, opened O_EXCL so no
267    //     pre-existing file can be subverted as the write target.
268    //   - The atomic `persist` rename is the cross-thread / cross-process
269    //     race fix. Two writers of the same content race their renames
270    //     to the canonical path; the last writer wins, but since `hash`
271    //     is content-addressed both wrote byte-identical content, so the
272    //     winner's bytes match the loser's. No torn writes are possible
273    //     because `rename(2)` is atomic at the inode level — readers
274    //     either see the old inode or the new one, never a partial blend.
275    //   - On panic between `new_in` and `persist`, NamedTempFile's `Drop`
276    //     unlinks the scratch file. No `/tmp` leak from in-process aborts.
277    let canonical = dir.join(format!("ktstr-config-{hash:016x}.json"));
278    let mut scratch =
279        tempfile::NamedTempFile::new_in(dir).expect("create ktstr config scratch file");
280    scratch
281        .as_file_mut()
282        .write_all(content.as_bytes())
283        .expect("write ktstr config content to scratch");
284    scratch
285        .persist(&canonical)
286        .expect("atomic-rename ktstr config scratch to canonical path");
287    let expanded = arg_template.replace("{file}", guest_path);
288    let sched_args: Vec<String> = expanded.split_whitespace().map(|s| s.to_string()).collect();
289    Some((archive_path, canonical, guest_path.to_string(), sched_args))
290}
291
292/// Build the shared `cmdline=` string appended to every ktstr_test
293/// guest boot. Per-scheduler sysctls, per-scheduler kargs,
294/// `RUST_BACKTRACE` / `RUST_LOG` propagation, and the host-resolved
295/// `KTSTR_SIDECAR_DIR` so the guest's `sidecar_dir()` returns the
296/// SAME path the host's freeze coordinator writes to. Without that
297/// propagation, host and guest each compute the run directory
298/// independently — the host walks `gix::discover` from a real
299/// workspace cwd and produces `{kernel}-{commit}` whereas the
300/// guest's cwd is `/` (no git repo, no kernel env), yielding the
301/// `unknown-unknown` fallback. Anything the two VM-launch sites
302/// (`run_ktstr_test_inner` and `attempt_auto_repro`) previously
303/// re-implemented side-by-side lives here.
304pub(crate) fn build_cmdline_extra(entry: &KtstrTestEntry) -> String {
305    let mut parts: Vec<String> = Vec::new();
306    for s in entry.scheduler.sysctls {
307        parts.push(format!("sysctl.{}={}", s.key(), s.value()));
308    }
309    for &karg in entry.scheduler.kargs {
310        parts.push(karg.to_string());
311    }
312    // Per-test KASLR opt-out (see `KtstrTestEntry.kaslr` doc). The base
313    // cmdline `base_guest_cmdline` at `src/vmm/setup/mod.rs` does NOT
314    // inject `nokaslr` by default — KASLR is on. A test that needs determinism sets `kaslr = false` in
315    // its `#[ktstr_test]` attribute; that lands the token here, where it
316    // composes with any operator-supplied `Scheduler::kargs(&["nokaslr"])`
317    // above (kernel parses the flag as a bool; duplicates are harmless).
318    // Mirrored guest-side by `vmm::rust_init::create_cgroup_parent_from_sched_args`
319    // and `monitor::symbols::resolve_page_offset`, both of which handle the
320    // `nokaslr` case via the live-publisher fall back to `DEFAULT_PAGE_OFFSET`.
321    if !entry.kaslr {
322        parts.push("nokaslr".to_string());
323    }
324    if let Ok(bt) = std::env::var("RUST_BACKTRACE") {
325        parts.push(format!("RUST_BACKTRACE={bt}"));
326    }
327    if let Ok(log) = std::env::var("RUST_LOG") {
328        parts.push(format!("RUST_LOG={log}"));
329    }
330    // Propagate the host-resolved sidecar dir so the guest scenario
331    // computes the same path the host's freeze coordinator wrote to
332    // (e.g. when a test reads `sidecar_dir().join("foo.json")` from
333    // inside the guest, the path matches the host's writer site).
334    // The host resolves via the OnceLock-cached project commit walk
335    // from the workspace cwd; the guest's cwd is `/` and would
336    // otherwise fall back to `unknown-unknown`. Sidecar dir paths
337    // are filesystem-safe ASCII (kernel version + 7-char hex
338    // commit, optional `-dirty` suffix), so the cmdline-as-token
339    // shape is sound — no escaping needed for whitespace.
340    //
341    // Absolutize via `current_dir().join()` when the resolved path
342    // is relative (the default-branch shape:
343    // `target/ktstr/{kernel}-{commit}` against the host cwd). The
344    // guest's cwd is `/`, so a relative token would resolve there
345    // instead of at the host's workspace root — the propagation
346    // must carry the FULL absolute path so the guest's
347    // `sidecar_dir()` reports the same string the host's writer
348    // site used. Falls back to the raw resolved path when the cwd
349    // probe fails (extremely rare; happens only when the process's
350    // cwd was rmdir'd while alive — a metadata probe has no
351    // recourse, leave the path as-is).
352    let resolved = super::sidecar::sidecar_dir();
353    let absolute = if resolved.is_absolute() {
354        resolved
355    } else {
356        std::env::current_dir()
357            .map(|cwd| cwd.join(&resolved))
358            .unwrap_or(resolved)
359    };
360    if let Some(s) = absolute.to_str() {
361        parts.push(format!("KTSTR_SIDECAR_DIR={s}"));
362    }
363    parts.join(" ")
364}
365
/// Attaches a wprof configuration to `builder` when the entry asks
/// for it.
///
/// - When `entry.wprof` is false the builder is returned untouched.
/// - Otherwise the configuration is loaded with
///   `WprofConfig::from_env`; if `entry.wprof_args` is set, its
///   whitespace-separated tokens replace `config.args`; the result is
///   installed via `builder.wprof(Some(config))`.
///
/// `label` names the calling VM-launch path and is only used to
/// prefix the error message.
///
/// # Errors
///
/// Returns an error when `WprofConfig::from_env` fails.
#[cfg(feature = "wprof")]
pub(crate) fn attach_wprof_if_requested(
    builder: crate::vmm::KtstrVmBuilder,
    entry: &KtstrTestEntry,
    label: &'static str,
) -> anyhow::Result<crate::vmm::KtstrVmBuilder> {
    if entry.wprof {
        let mut config = match crate::vmm::wprof::WprofConfig::from_env() {
            Ok(loaded) => loaded,
            Err(e) => {
                return Err(anyhow::anyhow!(
                    "ktstr_test: {label}: wprof requested by \
                     #[ktstr_test(wprof)] but WprofConfig::from_env failed: \
                     {e:#}. Ensure cargo-ktstr's install_env exported \
                     KTSTR_WPROF_PATH and the path is readable."
                ));
            }
        };
        // A per-test argument string fully replaces the env-derived args.
        if let Some(overrides) = entry.wprof_args {
            config.args = overrides.split_whitespace().map(String::from).collect();
        }
        Ok(builder.wprof(Some(config)))
    } else {
        Ok(builder)
    }
}
388
389/// Derive the test VM's memory floor from a CPU count + entry.
390///
391/// Returns `max(cpus * 64, 256, entry.memory_mib)`. When the
392/// `wprof` feature is enabled and `entry.wprof` is true, bumps
393/// to `WPROF_MIN_MEMORY_MIB` if below that floor.
394///
395/// The returned value is the LOWER BOUND on guest memory; the
396/// VM builder ultimately uses `.memory_deferred_min(mib)` which
397/// also accounts for the initramfs size, so the final boot memory
398/// may exceed this value.
399pub(crate) fn derive_test_memory_mib(cpus: u32, entry: &KtstrTestEntry) -> u32 {
400    let raw = (cpus * 64).max(256).max(entry.memory_mib);
401    #[cfg(feature = "wprof")]
402    {
403        use crate::vmm::wprof::{WPROF_MIN_MEMORY_MIB, apply_wprof_memory_floor};
404        let mem = apply_wprof_memory_floor(raw, entry.wprof);
405        if mem != raw {
406            tracing::info!(
407                test = %entry.name,
408                requested_mib = raw,
409                floored_mib = WPROF_MIN_MEMORY_MIB,
410                "wprof enabled; memory_mib floored to \
411                 WPROF_MIN_MEMORY_MIB"
412            );
413        }
414        mem
415    }
416    #[cfg(not(feature = "wprof"))]
417    raw
418}
419
420/// Resolve the VM topology and memory size from an optional
421/// TopoOverride.
422///
423/// Returns `(topology, memory_mib)` where `topology` is the
424/// `vmm::topology::Topology` passed to the VM builder and `memory_mib`
425/// is the LOWER BOUND on guest memory (the builder's
426/// `.memory_deferred_min(mib)` may raise the actual allocation
427/// to fit the initramfs). When `topo` is `Some`, both come from
428/// the override and the memory is honored verbatim (per the
429/// override-is-verbatim contract — see `topo.rs:42-44`). When
430/// `topo` is `None`, the topology comes from `entry.topology` and
431/// memory is derived by [`derive_test_memory_mib`]. Shared with
432/// `attempt_auto_repro` so the repro VM always sizes memory the
433/// same way as the first VM — reproducibility requires identical
434/// topology, including the wprof floor when applicable.
435///
436/// When the `wprof` feature is enabled and `entry.wprof` is true,
437/// a TopoOverride with memory_mib below the wprof floor triggers
438/// a warn-level log but is still honored verbatim.
439pub(crate) fn resolve_vm_topology(
440    entry: &KtstrTestEntry,
441    topo: Option<&super::topo::TopoOverride>,
442) -> (crate::vmm::topology::Topology, u32) {
443    match topo {
444        Some(t) => {
445            #[cfg(feature = "wprof")]
446            if entry.wprof && t.memory_mib < crate::vmm::wprof::WPROF_MIN_MEMORY_MIB {
447                tracing::warn!(
448                    test = %entry.name,
449                    override_mib = t.memory_mib,
450                    wprof_min_mib = crate::vmm::wprof::WPROF_MIN_MEMORY_MIB,
451                    "wprof enabled with TopoOverride.memory_mib below \
452                     WPROF_MIN_MEMORY_MIB; honoring the override per the \
453                     override-is-verbatim contract, but wprof may OOM-kill \
454                     mid-run"
455                );
456            }
457            (crate::vmm::topology::Topology::from(t), t.memory_mib)
458        }
459        None => {
460            let cpus = entry.topology.total_cpus();
461            let mem = derive_test_memory_mib(cpus, entry);
462            (entry.topology, mem)
463        }
464    }
465}
466
467/// Append per-scheduler `sched_args` entries shared by both VM-launch
468/// paths: `--config <guest_path>` if the scheduler declared one, the
469/// cgroup-parent switch, the scheduler's own fixed args, and
470/// per-entry extra args. Active-flag dispatch and probe-specific args
471/// remain at the call site because they differ between the paths.
472///
473/// The caller owns the `include_files` binding on the builder;
474/// `config_file_parts` and the guest-path push are returned separately
475/// so the caller decides whether to attach include files (production
476/// does, probe-only repro pipelines that already pass `include_files`
477/// can skip it).
478/// Concrete absolute-path example used by the panic messages that
479/// reject malformed `--cell-parent-cgroup` values — names the
480/// scheduler's declared default when one exists, falls back to a
481/// canonical `/ktstr` literal otherwise. The operator gets a
482/// copy-pasteable shape regardless of whether the scheduler is
483/// cell-aware. Centralised so both rejection arms (Value-invalid and
484/// MissingValue) display the same example.
485fn cgroup_parent_example(entry: &KtstrTestEntry) -> String {
486    entry
487        .scheduler
488        .cgroup_parent
489        .map(|p| p.as_str().to_string())
490        .unwrap_or_else(|| "/ktstr".to_string())
491}
492
493pub(crate) fn append_base_sched_args(entry: &KtstrTestEntry, args: &mut Vec<String>) {
494    // Fail-fast on a malformed user-supplied `--cell-parent-cgroup`
495    // value before the auto-inject branch. The host-side consumer
496    // `resolve_cgroup_root` (defined in `test_support::args`, used by
497    // the probe/setup path at `probe.rs::build_dispatch_ctx_parts`)
498    // interpolates the value into a
499    // `/sys/fs/cgroup{path}` literal and hands the result to
500    // `CgroupManager::new`, which has NO host-root guard — any path
501    // that doesn't start with `/` lands inside the host cgroup root
502    // (e.g. `""` → `/sys/fs/cgroup`, `"my_test"` →
503    // `/sys/fs/cgroupmy_test`) and corrupts unrelated cgroup state
504    // when subsequent `cgroups.setup(...)` calls run. The guest-side
505    // sibling `vmm::rust_init::create_cgroup_parent_from_sched_args`
506    // happens to be safe-by-coincidence for the empty case because
507    // `enable_subtree_controllers_to` early-returns when leaf equals
508    // the cgroup root — but probe.rs has no such gate, so the host
509    // fail-fast is what actually protects against corruption.
510    //
511    // The check is universal — independent of whether the scheduler
512    // declares a default `cgroup_parent` — because both routes
513    // (`extra_sched_args` from the test author, `sched_args` from
514    // the scheduler def) flow through the same parse + chain below,
515    // and the corruption risk is identical regardless of who
516    // supplied the bad value. Operator sees the message at test
517    // setup time, before any cgroup ops run.
518    match super::args::parse_cell_parent_cgroup(
519        entry
520            .scheduler
521            .sched_args
522            .iter()
523            .chain(entry.extra_sched_args.iter())
524            .copied(),
525    ) {
526        super::args::CellParentCgroupArg::Value(path)
527            if !super::args::cell_parent_path_is_valid(path) =>
528        {
529            let example = cgroup_parent_example(entry);
530            let mut fixes = format!(
531                "supply an absolute path under `/` with at least one non-`.`/`..` \
532                 segment (e.g. `{example}`) for the per-test cgroup root"
533            );
534            if let Some(default) = entry.scheduler.cgroup_parent {
535                fixes.push_str(&format!(
536                    " or omit the flag entirely (the framework will auto-inject \
537                     the scheduler's default `cgroup_parent = {default}`)"
538                ));
539            }
540            panic!(
541                "test `{}` supplies `--cell-parent-cgroup` with a value `{:?}` \
542                 (via `extra_sched_args` on the test or `sched_args` in the \
543                 scheduler def) that does not start with `/`, is `/` alone, or \
544                 contains `.`/`..` segments that normalize back to the host \
545                 cgroup root; {fixes}. Empty, bare `/`, relative, or paths \
546                 like `/.`, `/foo/..`, `/./bar/..` all resolve to a path \
547                 equal to or inside `/sys/fs/cgroup` (e.g. empty → \
548                 `/sys/fs/cgroup`, `/` → `/sys/fs/cgroup/`, `/.` → \
549                 `/sys/fs/cgroup` after canonicalization) and corrupt \
550                 unrelated cgroup state when the probe-side `CgroupManager` \
551                 operates on the resolved path. This gate mirrors the \
552                 const-eval check in `CgroupPath::new` so runtime values \
553                 share the validation contract that compile-time \
554                 declarations already pass.",
555                entry.name, path,
556            );
557        }
558        super::args::CellParentCgroupArg::MissingValue => {
559            let example = cgroup_parent_example(entry);
560            let mut fixes = format!(
561                "either remove the bare `--cell-parent-cgroup` and let the \
562                 framework auto-inject the scheduler's default (when one is \
563                 declared), or supply a value (e.g. `--cell-parent-cgroup={example}` \
564                 in combined form, or `--cell-parent-cgroup` followed by an \
565                 absolute path in two-token form)"
566            );
567            if entry.scheduler.cgroup_parent.is_none() {
568                fixes.push_str(
569                    "; the scheduler in this test declares no default \
570                     `cgroup_parent`, so an absolute-path value is required",
571                );
572            }
573            panic!(
574                "test `{}` supplies a bare `--cell-parent-cgroup` (via \
575                 `extra_sched_args` on the test or `sched_args` in the \
576                 scheduler def) with no following value; {fixes}. The \
577                 framework intercepts this here because letting it through \
578                 would silently combine with the framework's auto-inject \
579                 (when a default exists) and trip clap's `cannot be used \
580                 multiple times` diagnostic — a confusing error that buries \
581                 the actual missing-value mistake.",
582                entry.name,
583            );
584        }
585        super::args::CellParentCgroupArg::Value(_) => {
586            // User-supplied valid path — flows through the
587            // `args.extend(...)` calls below. Skip the auto-inject so
588            // clap doesn't reject the duplicate flag with `cannot be
589            // used multiple times`.
590        }
591        super::args::CellParentCgroupArg::Absent => {
592            // `cgroup_parent` controls the cgroup root where the
593            // framework places test cgroups (`resolve_cgroup_root`
594            // returns `/sys/fs/cgroup{cgroup_parent}` for guest
595            // CgroupManager). It does NOT auto-inject
596            // `--cell-parent-cgroup` into the scheduler's argv —
597            // cell-aware schedulers (scx_mitosis et al.) interpret
598            // that flag by enabling userspace_managed_cell_mode and
599            // starting an inotify-driven CellManager that can
600            // interfere with the host-side periodic-capture
601            // pipeline. If a scheduler genuinely needs
602            // `--cell-parent-cgroup`, the scheduler declaration's
603            // own `sched_args` array (or the per-test
604            // `extra_sched_args`) must include it explicitly. The
605            // guest-side `create_cgroup_parent_from_sched_args`
606            // mkdir + subtree-controller setup still fires when
607            // `--cell-parent-cgroup` is present in `/sched_args` —
608            // it's gated on the flag's presence, not on whether the
609            // framework injected it vs. the user added it manually.
610        }
611    }
612    args.extend(entry.scheduler.sched_args.iter().map(|s| s.to_string()));
613    args.extend(entry.extra_sched_args.iter().map(|s| s.to_string()));
614}
615
616/// Retry budget for the guest's `vmm::rust_init::send_sys_rdy_with_retry`
617/// loop. Boot-to-readiness wall time is a fixed base PLUS per-vCPU
618/// work: the virtio-console multiport handshake (DEVICE_READY →
619/// PORT_ADD → PORT_READY → PORT_OPEN per `drivers/char/virtio_console.c`)
620/// issues per-CPU work whose wall time grows roughly linearly with
621/// topology size, on top of a fixed device-enumeration / first-CPU
622/// cost. The budget is therefore ADDITIVE — `BASE_MS + vcpus *
623/// PER_VCPU_MS` — not `max(BASE, scaled)`.
624///
625/// The earlier `max` form left the per-vCPU term dead below ~67 vCPUs
626/// (since `vcpus * 150` only clears the 10 s floor at 67 vCPUs), so a
627/// 64-vCPU VM got the same 10 s budget as a 1-vCPU VM. Under host
628/// contention a 64-vCPU handshake was observed at ~10 s and timed out
629/// by ~8 ms — the disk-template gauntlet flake. The additive base
630/// gives every topology ~10 s of fixed headroom on top of its linear
631/// per-vCPU term (64 vCPUs → 19.6 s), so a slow handshake under load
632/// no longer races the floor.
633///
634/// Capped at 90 s as a sanity bound on a genuinely-stuck boot's
635/// guest-side retry loop — NOT to protect the host watchdog. The
636/// watchdog deadline IS derived from this budget ([`vm_boot_headroom`]
637/// feeds [`vm_timeout_from_entry`]), so the budget can never "blow" a
638/// deadline it defines; the old 30 s cap's stated rationale was
639/// inverted. That 30 s cap truncated the additive term above
640/// ~133 vCPUs — a 256-vCPU guest wants `10_000 + 256×150 = 48_400` ms
641/// but was clamped to 30_000 ms, starving the widest topologies of
642/// boot budget exactly where overcommit makes the boot slowest (the
643/// wide-SMP boot-timeout class). 90 s admits the full additive budget
644/// up to the 512-vCPU `MAX_VCPUS` (`10_000 + 512×150 = 86_800` ms);
645/// only pathological counts above 533 vCPUs clip.
646///
647/// The const-fn signature lets both the host (`vm_boot_headroom`,
648/// `vm_timeout_from_entry`) and the guest (`vmm::rust_init`) compute
649/// the same budget without trans-VM coordination — the guest reads
650/// its own vCPU count from `/sys/devices/system/cpu/online`. The guest
651/// uses this UN-scaled budget (it cannot read host overcommit); on an
652/// oversubscribed boot its `send_sys_rdy_with_retry` loop may exhaust
653/// and WARN, but that is non-fatal (the host monitor's `data_valid`
654/// gate keeps reads safe). The host's [`vm_timeout_from_entry`] is the
655/// authoritative deadline and DOES scale by the overcommit ratio.
pub(crate) const fn sys_rdy_budget_ms(vcpus: u32) -> u64 {
    // Fixed share of the budget that every topology receives.
    const FIXED_MS: u64 = 10_000;
    // Linear share, charged once per vCPU.
    const EACH_VCPU_MS: u64 = 150;
    // Upper bound on the returned budget.
    const CEILING_MS: u64 = 90_000;

    // Additive form: base PLUS the per-vCPU term (not a `max` of the two).
    let linear_ms = EACH_VCPU_MS.saturating_mul(vcpus as u64);
    let uncapped_ms = FIXED_MS.saturating_add(linear_ms);

    // `const fn` context: clamp with a plain branch.
    if uncapped_ms <= CEILING_MS {
        uncapped_ms
    } else {
        CEILING_MS
    }
}
664
665/// Headroom for kernel init, scheduler attach, and BPF verifier time
666/// — the post-sys_rdy phase of guest startup. Distinct from
667/// [`sys_rdy_budget_ms`]'s base + per-vCPU budget (the pre-sys_rdy
668/// virtio-console handshake budget); the two add together to form
669/// the full [`vm_boot_headroom`].
670const KERNEL_INIT_HEADROOM: Duration = Duration::from_secs(10);
671
672/// Total boot headroom: covers kernel init + scheduler attach + BPF
673/// verifier time ([`KERNEL_INIT_HEADROOM`]) plus the guest's scaled
674/// `send_sys_rdy` retry loop ([`sys_rdy_budget_ms`]) before the
675/// workload phase begins. Scales with vCPU count so the host timeout
676/// doesn't fire while the guest is still inside its sys_rdy budget.
677pub(crate) fn vm_boot_headroom(vcpus: u32) -> Duration {
678    KERNEL_INIT_HEADROOM + Duration::from_millis(sys_rdy_budget_ms(vcpus))
679}
680
681/// Worst-case host-side latency the guest's `wait_for_map_write` latch
682/// blocks on before a `bpf_map_write` test's workload runs: the host
683/// builds the BPF-map accessor (ELF + BTF parse + symbol HashMap, ~4 s
684/// on a debug vmlinux per the freeze-coord accessor-init comment) in a
685/// retry loop bounded by a 30 s `phase1_deadline`. Under heavy `-j16`
686/// host-compile contention the parse scales and a cold vmlinux read adds
687/// seconds, so the latch can block up to that deadline. Added to the
688/// workload budget for any entry declaring a `bpf_map_write` — a
689/// framework cost every such test pays, not a per-test concern the
690/// author must remember to budget for.
691const COLD_BTF_PHASE1_BUDGET: Duration = Duration::from_secs(30);
692
693/// Oversubscription ratio at or beyond which a default/no-perf (auto)
694/// overcommit is SKIPPED rather than booted. Above it the host
695/// time-slices the vCPU threads so heavily the boot would race even the
696/// oversub-scaled [`vm_timeout_from_entry`] deadline, so the dispatch
697/// (`run_ktstr_test_inner_impl`) skips with a "host topology
698/// insufficient" signal instead of hard-failing. Set above the 4× the
699/// `cpu_budget` overcommit tests deliberately exercise (those carry an
700/// explicit `cpu_budget`, so they are NOT auto-collapse and never skip
701/// on this ratio) and far above the ~1.3× a 256-vCPU guest hits on a
702/// 192-CPU CI runner — which therefore RUNS and validates wide-SMP boot
703/// rather than skipping, so the boot invariant is never masked.
704pub(crate) const OVERCOMMIT_SKIP_RATIO: f64 = 6.0;
705
706/// Ceiling on the boot-headroom oversubscription multiplier in
707/// [`vm_timeout_from_entry`]. The boot-headroom term is multiplied by
708/// the host overcommit ratio (an oversubscribed boot's wall-clock grows
709/// ~linearly with the ratio as the host time-slices the vCPU threads),
710/// clamped here so the deadline stays bounded. Kept equal to
711/// [`OVERCOMMIT_SKIP_RATIO`] so the scaled deadline always covers every
712/// ratio that is allowed to run: auto-collapse beyond the ratio skips,
713/// and an explicit `cpu_budget` deeper than this clamp is a deliberate
714/// extreme the author budgets for via `duration`/`watchdog_timeout`.
715const OVERCOMMIT_HEADROOM_CAP: f64 = OVERCOMMIT_SKIP_RATIO;
716
717/// Stricter skip ratio for the `expect_auto_repro` chain. That inversion
718/// boots a SECOND wide-SMP VM which must replay the forced failure and land
719/// a shape-valid `.repro.wprof.pb` — a far more fragile path under host
720/// time-slicing than a single boot: the repro VM's system-wide wprof
721/// capture over hundreds of vCPUs stops reliably producing a transportable
722/// trace once the host oversubscribes (the boots themselves still finish
723/// inside the oversub-scaled deadline; the trace transport is what breaks).
724/// So this chain auto-skips well below the generic [`OVERCOMMIT_SKIP_RATIO`],
725/// while single-VM wide-SMP BOOT tests keep running (and validating boot) up
726/// to the generic cap. Tuned to sit between the ~1.3x a 256-vCPU guest hits
727/// on the 192-CPU wide-SMP design-target runner (which still RUNS the
728/// auto-repro hop, so it is validated there) and the ~2.7x of a 96-CPU host
729/// (which skips cleanly instead of hard-failing — the "overcommit OR
730/// auto-skip, never hard-fail" contract).
731pub(crate) const EXPECT_AUTO_REPRO_SKIP_RATIO: f64 = 2.0;
732
733/// Host overcommit ratio for a `vcpus`-wide guest given the host's
734/// allowed-CPU count and the test's optional explicit `cpu_budget`:
735/// vCPUs divided by the host CPUs the vCPU threads actually land on.
736/// With an explicit `cpu_budget` the threads collapse onto
737/// `min(cpu_budget, allowed)` (the per-test cap); without one the
738/// default/no-perf path collapses onto the whole allowed cpuset
739/// (`no_perf_cpu_budget`'s `vcpus.min(allowed)` floor when
740/// `allowed < vcpus`, else a fitting 1:1 pin). Under
741/// `KTSTR_CARGO_TEST_MODE` the planner ignores the explicit budget and
742/// masks to the full allowed cpuset, so with a `cpu_budget` the
743/// returned ratio is an UPPER bound there — deadline-safe, since
744/// over-estimating only lengthens the timeout (CI runs `cargo ktstr
745/// test` with that mode OFF, where the budget IS enforced and the ratio
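/// is exact). Floored at 1.0 (a fitting host is never under-subscribed
/// for timeout purposes) and divide-by-zero-guarded: a zero
/// `allowed_cpus` (unenumerable cpuset) or a zero `cpu_budget` is
/// clamped to ONE host CPU, so the ratio in that case is `vcpus`
/// (floored at 1.0) rather than 1.0 — an over-estimate, which is
/// deadline-safe. Pure over `(vcpus, allowed_cpus, cpu_budget)` so the
/// scaling is unit-testable without reading the host cpuset.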
pub(crate) fn overcommit_ratio(vcpus: u32, allowed_cpus: usize, cpu_budget: Option<u32>) -> f64 {
    // A zero allowed-CPU count is clamped to one CPU so the division
    // below can never see a zero denominator.
    let host_cpus = allowed_cpus.max(1);
    // An explicit budget narrows the landing set to `min(budget, host)`;
    // the trailing clamp covers a zero budget.
    let landing_cpus = cpu_budget
        .map_or(host_cpus, |budget| host_cpus.min(budget as usize))
        .max(1);
    // Floor at 1.0: a host that fits the guest is reported as 1:1.
    let ratio = f64::from(vcpus) / landing_cpus as f64;
    ratio.max(1.0)
}
760
761/// Reason to auto-skip an over-oversubscribed default/no-perf run, or
762/// `None` to run it. The default/no-perf path collapses the vCPU threads
763/// onto the allowed cpuset (`build_overcommit_run_locks` /
764/// `no_perf_cpu_budget`); at or beyond [`OVERCOMMIT_SKIP_RATIO`] the host
765/// time-slices so hard the boot would race even the oversub-scaled
766/// [`vm_timeout_from_entry`] deadline, so the dispatch
767/// (`run_ktstr_test_inner_impl`) skips with this reason — the "overcommit
768/// OR auto-skip, never hard-fail" contract. Returns `None` (runs) for:
769/// the fitting / mildly-oversubscribed case (< the ratio, e.g. a 256-vCPU
770/// guest at ~1.3x on a 192-CPU CI runner, so wide-SMP boot is VALIDATED
771/// there, never masked); an explicit `cpu_budget` (a deliberate
772/// oversubscription opt-in for contention testing always runs, its deeper
773/// ratio being the author's choice); and an empty (unenumerable) cpuset
774/// (no ratio is computable, so the overcommit warning is the sole
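/// signal). When `expect_auto_repro` is set, the stricter
/// [`EXPECT_AUTO_REPRO_SKIP_RATIO`] replaces [`OVERCOMMIT_SKIP_RATIO`]
/// as the cap. Pure over `(vcpus, allowed_cpus, cpu_budget,
/// expect_auto_repro)` so the skip boundary is unit-testable without
/// booting a VM.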
777pub(crate) fn overcommit_skip_reason(
778    vcpus: u32,
779    allowed_cpus: usize,
780    cpu_budget: Option<u32>,
781    expect_auto_repro: bool,
782) -> Option<String> {
783    if cpu_budget.is_some() || allowed_cpus == 0 {
784        return None;
785    }
786    let oversub = overcommit_ratio(vcpus, allowed_cpus, None);
787    // The two-VM expect_auto_repro inversion uses a much stricter cap
788    // ([`EXPECT_AUTO_REPRO_SKIP_RATIO`]) than a single-VM boot test: its
789    // repro-VM wprof-trace transport is fragile under time-slicing, so it
790    // skips at an oversubscription a boot-only wide-SMP test still runs at.
791    let skip_ratio = if expect_auto_repro {
792        EXPECT_AUTO_REPRO_SKIP_RATIO
793    } else {
794        OVERCOMMIT_SKIP_RATIO
795    };
796    if oversub < skip_ratio {
797        return None;
798    }
799    let chain = if expect_auto_repro {
800        " for the expect_auto_repro inversion chain"
801    } else {
802        ""
803    };
804    Some(format!(
805        "host topology insufficient: {vcpus} vCPUs auto-collapse onto \
806         {allowed_cpus} allowed host CPUs = {oversub:.1}x oversubscription \
807         (>= {skip_ratio:.0}x skip cap{chain}); widen the process cpuset \
808         or shrink the guest topology"
809    ))
810}
811
812/// Derive the host-side VM timeout from the test entry's watchdog and
813/// duration. Adds vCPU-scaled boot headroom so the workload gets its
814/// full duration even after a slow boot on a large topology, then
815/// multiplies THAT headroom by the host [`overcommit_ratio`] (clamped
816/// to [`OVERCOMMIT_HEADROOM_CAP`]) so an oversubscribed boot — whose
817/// wall-clock grows ~linearly with the ratio as the host time-slices
818/// the vCPU threads — still finishes inside the deadline. Only the boot
819/// headroom scales: `base` is the guest's own workload/watchdog budget
820/// plus the absolute host-side [`COLD_BTF_PHASE1_BUDGET`], neither of
821/// which is the AP-bring-up wall time the ratio models; the headroom's
822/// slack absorbs the small extra host wall-clock a short workload
823/// accrues under time-slicing. [`COLD_BTF_PHASE1_BUDGET`] is added when
824/// the entry declares a `bpf_map_write` (the guest blocks on the host's
825/// cold-BTF accessor build before the workload starts), and
826/// [`crate::vmm::freeze_coord::WPROF_SHIP_GRACE`] when it declares
827/// `wprof` (a crashing scheduler's late Phase-5 trace ship is held for
828/// that window before teardown).
829///
830/// `booted_vcpus` is the vCPU count of the topology the VM actually
831/// boots (`resolve_vm_topology(entry, topo).0.total_cpus()`), NOT the
832/// declared `entry.topology`: under a `TopoOverride` (gauntlet preset /
833/// `--ktstr-topo`) they diverge, and both the vCPU-scaled boot headroom
834/// and the oversubscription multiplier must scale to the topology that
835/// boots — otherwise the watchdog fires mid-boot on a larger-than-declared
836/// preset.
837pub(crate) fn vm_timeout_from_entry(
838    entry: &super::entry::KtstrTestEntry,
839    booted_vcpus: u32,
840) -> Duration {
841    let mut base = entry
842        .watchdog_timeout
843        .max(entry.duration)
844        .max(Duration::from_secs(1));
845    if !entry.bpf_map_write.is_empty() {
846        base += COLD_BTF_PHASE1_BUDGET;
847    }
848    // A wprof entry's scheduler may crash; on an error-class exit the
849    // freeze coordinator holds the VM open up to `WPROF_SHIP_GRACE` for
850    // the guest's late Phase-5 wprof trace ship before killing. Add that
851    // window to the host budget so a late crash's full ship grace fits
852    // inside the watchdog deadline (mirrors COLD_BTF_PHASE1_BUDGET).
853    if entry.wprof {
854        base += crate::vmm::freeze_coord::WPROF_SHIP_GRACE;
855    }
856    let vcpus = booted_vcpus;
857    let oversub = overcommit_ratio(
858        vcpus,
859        crate::vmm::host_topology::host_allowed_cpus().len(),
860        entry.cpu_budget,
861    )
862    .min(OVERCOMMIT_HEADROOM_CAP);
863    base + vm_boot_headroom(vcpus).mul_f64(oversub)
864}
865
866/// Configure the ktstr_test VM builder prefix shared by the main
867/// test path ([`super::eval::run_ktstr_test_inner`]) and the
868/// auto-repro path ([`super::probe::attempt_auto_repro`]).
869///
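/// Applies, in order: kernel, init binary, topology, memory floor,
/// guest cmdline, guest argv, host-side timeout, workload duration,
/// perf-mode disable flag, optional per-test CPU budget, optional
/// scheduler binary, staged schedulers, the opt-in jemalloc probe and
/// alloc-worker binaries, every queued BPF map write and BPF map
/// watch, optional disk, networks, snapshot count, the optional
/// workload-root and scheduler-parent cgroups, and the scheduler
/// watchdog timeout.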
874///
875/// The caller owns the divergent tail. `run_ktstr_test_inner`
876/// additionally wires `performance_mode`,
877/// `sched_enable_cmds`/`sched_disable_cmds` for kernel-built
878/// schedulers, and `monitor_thresholds`. `attempt_auto_repro`
879/// additionally wires `include_files` plus base `sched_args`.
880#[allow(clippy::too_many_arguments)]
881pub(crate) fn build_vm_builder_base(
882    entry: &KtstrTestEntry,
883    kernel: &Path,
884    ktstr_bin: &Path,
885    scheduler: Option<&Path>,
886    staged_schedulers: &[(String, std::path::PathBuf, Vec<String>)],
887    vm_topology: crate::vmm::topology::Topology,
888    memory_mib: u32,
889    cmdline_extra: &str,
890    guest_args: &[String],
891    no_perf_mode: bool,
892) -> crate::vmm::KtstrVmBuilder {
893    // The base builder deliberately does NOT set
894    // `failure_dump_path` — the per-VM target is caller-specific
895    // (primary vs auto-repro). Stale-file pre-clear lives at the
896    // dispatch sites (`test_support::eval` for primary;
897    // `test_support::probe::attempt_auto_repro` for repro), not
898    // inside the setter or this base call. The setter is pure
899    // (no FS side effects); placing the pre-clear in the dispatch
900    // layer prevents the auto-repro path's reuse of this base
901    // builder from accidentally erasing the primary dump that
902    // just landed.
903    let mut builder = crate::vmm::KtstrVm::builder()
904        .kernel(kernel)
905        .init_binary(ktstr_bin)
906        .topology(vm_topology)
907        .memory_deferred_min(memory_mib)
908        .cmdline(cmdline_extra)
909        .run_args(guest_args)
910        .timeout(vm_timeout_from_entry(entry, vm_topology.total_cpus()))
911        .workload_duration(entry.duration)
912        .no_perf_mode(no_perf_mode);
913
914    // Per-test no-perf CPU budget override (#[ktstr_test(cpu_budget = N)]).
915    // None leaves the builder's auto-size (vCPU count) in place; only the
916    // no-perf path consumes it.
917    if let Some(budget) = entry.cpu_budget {
918        builder = builder.cpu_budget(budget);
919    }
920
921    if let Some(sched_path) = scheduler {
922        builder = builder.scheduler_binary(sched_path);
923    }
924
925    // Push each pre-resolved staged scheduler into the builder's
926    // staging set. Caller is responsible for running each entry
927    // through the resolve_scheduler cascade so this fn stays
928    // infallible (sibling to the boot-time `scheduler: Option<&Path>`
929    // shape which is also caller-resolved). KernelBuiltin / Eevdf
930    // staged entries (no binary to resolve) are skipped at the
931    // caller side; only resolved (name, host_binary, sched_args)
932    // tuples reach this loop.
933    for (name, host_path, sched_args) in staged_schedulers {
934        builder = builder.staged_scheduler(name.clone(), host_path.clone(), sched_args.clone());
935    }
936
937    // Opt-in jemalloc-probe wiring. An integration test that needs
938    // the probe (see `tests/jemalloc_probe_tests.rs`) sets
939    // `KTSTR_JEMALLOC_PROBE_BINARY` to the absolute host path of
940    // `ktstr-jemalloc-probe` via `#[ctor]` before the test harness
941    // dispatches. When set, the probe is packed into every VM's
942    // base initramfs; the init binary stays stripped because the
943    // paired alloc-worker carries DWARF. Absent env var = existing
944    // behavior (no probe).
945    //
946    // Required ctor shape in a new test file that needs the probe
947    // in the guest — paste verbatim, adjust the two binary names.
948    // Either ctor form works (ktstr re-exports both): the proc-macro
949    // attribute shown below, or the declarative
950    // `::ktstr::__private::ctor::declarative::ctor! { ... }` block
951    // form (ktstr's own in-tree sites use the declarative form per
952    // src/test_support/dispatch.rs).
953    //
954    // ```ignore
955    // #[::ktstr::__private::ctor::ctor(unsafe, crate_path = ::ktstr::__private::ctor)]
956    // fn set_probe_binary_env_var() {
957    //     // SAFETY: ctor runs before any `#[ktstr_test]` thread or
958    //     // probe thread spawns; glibc's `__environ` mutation is
959    //     // single-threaded here.
960    //     unsafe {
961    //         std::env::set_var(
962    //             ::ktstr::KTSTR_JEMALLOC_PROBE_BINARY_ENV,
963    //             env!("CARGO_BIN_EXE_ktstr-jemalloc-probe"),
964    //         );
965    //         std::env::set_var(
966    //             ::ktstr::KTSTR_JEMALLOC_ALLOC_WORKER_BINARY_ENV,
967    //             env!("CARGO_BIN_EXE_ktstr-jemalloc-alloc-worker"),
968    //         );
969    //     }
970    // }
971    // ```
972    //
973    // Declarative-form equivalent (no `crate_path = ` plumbing required
974    // because the macro_rules! expansion resolves paths via `$crate`):
975    //
976    // ```ignore
977    // ::ktstr::__private::ctor::declarative::ctor! {
978    // #[ctor(unsafe)]
979    // fn set_probe_binary_env_var() {
980    //     // SAFETY: same as proc-macro form above.
981    //     unsafe {
982    //         std::env::set_var(
983    //             ::ktstr::KTSTR_JEMALLOC_PROBE_BINARY_ENV,
984    //             env!("CARGO_BIN_EXE_ktstr-jemalloc-probe"),
985    //         );
986    //         std::env::set_var(
987    //             ::ktstr::KTSTR_JEMALLOC_ALLOC_WORKER_BINARY_ENV,
988    //             env!("CARGO_BIN_EXE_ktstr-jemalloc-alloc-worker"),
989    //         );
990    //     }
991    // }
992    // }
993    // ```
994    //
995    // The `crate_path = ::ktstr::__private::ctor` argument is
996    // non-negotiable: `#[ctor::ctor(unsafe)]` without the
997    // re-export path panics at compile time because the `ctor`
998    // crate is not listed in the test crate's direct deps. ktstr
999    // re-exports `ctor` under `__private::ctor` exactly so test
1000    // authors do not need to add it themselves. ctor 1.0 also
1001    // mandates the `unsafe` marker as the first attribute
1002    // argument; bare `#[ctor::ctor]` no longer compiles.
1003    if let Ok(probe_path) = std::env::var(crate::KTSTR_JEMALLOC_PROBE_BINARY_ENV)
1004        && !probe_path.is_empty()
1005    {
1006        // Pack the probe binary into the guest initramfs at
1007        // `/bin/ktstr-jemalloc-probe`. Closed-loop probe tests run
1008        // the probe via `--pid <alloc_worker_pid>` against the
1009        // paired `ktstr-jemalloc-alloc-worker` target; DWARF comes
1010        // from the worker's own ELF, not the init's.
1011        builder = builder.jemalloc_probe_binary(std::path::PathBuf::from(probe_path));
1012    }
1013    if let Ok(worker_path) = std::env::var(crate::KTSTR_JEMALLOC_ALLOC_WORKER_BINARY_ENV)
1014        && !worker_path.is_empty()
1015    {
1016        // Pack the jemalloc-alloc-worker binary alongside the
1017        // probe. Only the cross-process closed-loop test sets
1018        // this; scheduler-only tests leave the env var unset and
1019        // skip the wiring.
1020        builder = builder.jemalloc_alloc_worker_binary(std::path::PathBuf::from(worker_path));
1021    }
1022
1023    for bpf_write in entry.bpf_map_write {
1024        builder = builder.bpf_map_write(
1025            bpf_write.map_name_suffix(),
1026            bpf_write.field(),
1027            bpf_write.value(),
1028        );
1029    }
1030
1031    for watch in entry.watch_bpf_maps {
1032        builder = builder.watch_bpf_map(
1033            watch.map_name_suffix(),
1034            watch.field(),
1035            watch.agg(),
1036            watch.label(),
1037        );
1038    }
1039
1040    if let Some(disk_cfg) = entry.disk.clone() {
1041        builder = builder.disk(disk_cfg);
1042    }
1043
1044    for net_cfg in entry.networks {
1045        builder = builder.network(*net_cfg);
1046    }
1047
1048    builder = builder.num_snapshots(entry.num_snapshots);
1049
1050    if let Some(root) = entry.workload_root_cgroup {
1051        builder = builder.workload_root_cgroup(root.as_str().to_string());
1052    }
1053    if let Some(parent) = entry.scheduler.cgroup_parent {
1054        builder = builder.scheduler_cgroup_parent(parent.as_str().to_string());
1055    }
1056
1057    builder.watchdog_timeout(entry.watchdog_timeout)
1058}
1059
1060#[cfg(test)]
1061mod tests {
1062    use super::super::entry::Scheduler;
1063    use super::super::test_helpers::{EnvVarGuard, lock_env};
1064    use super::*;
1065
1066    /// RAII pin of the host allowed-CPU set so [`vm_timeout_from_entry`]'s
1067    /// overcommit scaling is deterministic regardless of the runner's
1068    /// real cpuset. Mirrors `host_topology::tests::AllowedCpusGuard`
1069    /// (which is private to that test module). Sets the thread-local
1070    /// `ALLOWED_CPUS_OVERRIDE` and clears it on drop so a panicking test
1071    /// never leaks the override to a sibling sharing the thread.
1072    struct AllowedCpusPin;
1073    impl AllowedCpusPin {
1074        fn new(cpus: Vec<usize>) -> Self {
1075            crate::vmm::host_topology::ALLOWED_CPUS_OVERRIDE.with(|p| *p.borrow_mut() = Some(cpus));
1076            AllowedCpusPin
1077        }
1078    }
1079    impl Drop for AllowedCpusPin {
1080        fn drop(&mut self) {
1081            crate::vmm::host_topology::ALLOWED_CPUS_OVERRIDE.with(|p| *p.borrow_mut() = None);
1082        }
1083    }
1084
1085    #[test]
1086    fn vm_timeout_from_entry_adds_cold_btf_budget_for_bpf_map_write() {
1087        use super::super::entry::{BpfMapWrite, KtstrTestEntry};
1088        static W: BpfMapWrite = BpfMapWrite::new(".bss", "crash", 0);
1089        static WS: &[&BpfMapWrite] = &[&W];
1090        let no_write = KtstrTestEntry {
1091            name: "no_write",
1092            ..KtstrTestEntry::DEFAULT
1093        };
1094        let with_write = KtstrTestEntry {
1095            name: "with_write",
1096            bpf_map_write: WS,
1097            ..KtstrTestEntry::DEFAULT
1098        };
1099        // The cold-BTF phase-1 budget is added to the workload base only
1100        // when the entry declares a host-side bpf_map_write; the delta
1101        // between an otherwise-identical pair is exactly that budget.
1102        assert_eq!(
1103            vm_timeout_from_entry(&with_write, with_write.topology.total_cpus()),
1104            vm_timeout_from_entry(&no_write, no_write.topology.total_cpus())
1105                + COLD_BTF_PHASE1_BUDGET,
1106            "bpf_map_write entries get the cold-BTF phase-1 budget added",
1107        );
1108    }
1109
1110    #[test]
1111    fn no_perf_mode_active_true_when_env_set_to_value() {
1112        let _l = lock_env();
1113        let _g = EnvVarGuard::set(crate::KTSTR_NO_PERF_MODE_ENV, "1");
1114        assert!(no_perf_mode_active());
1115    }
1116
1117    #[test]
1118    fn no_perf_mode_active_false_when_env_unset() {
1119        let _l = lock_env();
1120        let _g = EnvVarGuard::remove(crate::KTSTR_NO_PERF_MODE_ENV);
1121        assert!(!no_perf_mode_active());
1122    }
1123
    /// Regression pin: empty-string-as-unset contract. Before the
    /// env-var-sweep cleanup, the bare `is_ok()` reader returned true
    /// on `KTSTR_NO_PERF_MODE=` (set but empty — e.g. a Docker
    /// `--env KTSTR_NO_PERF_MODE` pass-through fired without a value),
    /// silently flipping perf-mode OFF for every `performance_mode`
    /// test. The helper now treats empty-as-unset; this test pins that
    /// contract for ALL consumer sites (shell-mode VM at lib.rs,
    /// verifier at verifier.rs, dispatch + eval) since they all route
    /// through this helper.
1135    #[test]
1136    fn no_perf_mode_active_false_when_env_set_to_empty_string() {
1137        let _l = lock_env();
1138        let _g = EnvVarGuard::set(crate::KTSTR_NO_PERF_MODE_ENV, "");
1139        assert!(
1140            !no_perf_mode_active(),
1141            "empty-string env must be treated as UNSET — a regression \
1142             here flips perf-mode for every consumer that routes \
1143             through no_perf_mode_active",
1144        );
1145    }
1146
1147    #[test]
1148    fn perf_only_active_true_when_env_set_to_value() {
1149        let _l = lock_env();
1150        let _g = EnvVarGuard::set(crate::KTSTR_PERF_ONLY_ENV, "1");
1151        assert!(perf_only_active());
1152    }
1153
1154    #[test]
1155    fn perf_only_active_false_when_env_unset() {
1156        let _l = lock_env();
1157        let _g = EnvVarGuard::remove(crate::KTSTR_PERF_ONLY_ENV);
1158        assert!(!perf_only_active());
1159    }
1160
1161    /// Empty-as-unset contract (mirrors `no_perf_mode_active`): a
1162    /// `KTSTR_PERF_ONLY=` pass-through must NOT silently skip every
1163    /// non-perf test.
1164    #[test]
1165    fn perf_only_active_false_when_env_set_to_empty_string() {
1166        let _l = lock_env();
1167        let _g = EnvVarGuard::set(crate::KTSTR_PERF_ONLY_ENV, "");
1168        assert!(
1169            !perf_only_active(),
1170            "empty-string env must be treated as UNSET",
1171        );
1172    }
1173
1174    /// Selection logic: with perf-only active, a non-performance_mode
1175    /// entry is skipped while a performance_mode entry is kept. When
1176    /// perf-only is inactive, neither is skipped.
1177    #[test]
1178    fn perf_only_skips_entry_keeps_perf_skips_others() {
1179        use super::super::entry::KtstrTestEntry;
1180        let perf = KtstrTestEntry {
1181            name: "perf",
1182            performance_mode: true,
1183            ..KtstrTestEntry::DEFAULT
1184        };
1185        let plain = KtstrTestEntry {
1186            name: "plain",
1187            performance_mode: false,
1188            ..KtstrTestEntry::DEFAULT
1189        };
1190
1191        let _l = lock_env();
1192        {
1193            let _g = EnvVarGuard::set(crate::KTSTR_PERF_ONLY_ENV, "1");
1194            assert!(
1195                !perf_only_skips_entry(&perf),
1196                "a performance_mode test is the selection target, never skipped",
1197            );
1198            assert!(
1199                perf_only_skips_entry(&plain),
1200                "a non-performance_mode test must be skipped under perf-only",
1201            );
1202        }
1203        let _g = EnvVarGuard::remove(crate::KTSTR_PERF_ONLY_ENV);
1204        assert!(!perf_only_skips_entry(&perf));
1205        assert!(
1206            !perf_only_skips_entry(&plain),
1207            "perf-only inactive => nothing is skipped on this axis",
1208        );
1209    }
1210
1211    #[test]
1212    fn bypass_llc_locks_active_true_when_env_set_to_value() {
1213        let _l = lock_env();
1214        let _g = EnvVarGuard::set(crate::KTSTR_BYPASS_LLC_LOCKS_ENV, "1");
1215        assert!(bypass_llc_locks_active());
1216    }
1217
1218    #[test]
1219    fn bypass_llc_locks_active_false_when_env_unset() {
1220        let _l = lock_env();
1221        let _g = EnvVarGuard::remove(crate::KTSTR_BYPASS_LLC_LOCKS_ENV);
1222        assert!(!bypass_llc_locks_active());
1223    }
1224
1225    /// Regression pin: empty-string-as-unset contract for
1226    /// KTSTR_BYPASS_LLC_LOCKS. A bare `KTSTR_BYPASS_LLC_LOCKS=`
1227    /// (CI shell / Docker `--env` pass-through without value)
1228    /// must NOT activate the bypass. The helper enforces this
1229    /// uniformly for all 7 reader sites (vmm/builder.rs,
1230    /// cli/kernel_build/build.rs ×2, bin/ktstr.rs ×2,
1231    /// bin/cargo_ktstr/{kernel/mod, misc/shell}) — a regression
1232    /// here flips the contention contract for every caller.
1233    #[test]
1234    fn bypass_llc_locks_active_false_when_env_set_to_empty_string() {
1235        let _l = lock_env();
1236        let _g = EnvVarGuard::set(crate::KTSTR_BYPASS_LLC_LOCKS_ENV, "");
1237        assert!(
1238            !bypass_llc_locks_active(),
1239            "empty-string env must be treated as UNSET per the contract \
1240             shared with no_perf_mode_active — a regression here flips \
1241             LLC flock contention enforcement for every reader",
1242        );
1243    }
1244
1245    #[test]
1246    fn config_file_parts_nested_path() {
1247        static SCHED: Scheduler = Scheduler::named("cfg").config_file("configs/my_sched.toml");
1248        let entry = KtstrTestEntry {
1249            name: "cfg_test",
1250            scheduler: &SCHED,
1251            ..KtstrTestEntry::DEFAULT
1252        };
1253        let (archive, host, guest) = config_file_parts(&entry).unwrap();
1254        assert_eq!(archive, "include-files/my_sched.toml");
1255        assert_eq!(host, PathBuf::from("configs/my_sched.toml"));
1256        assert_eq!(guest, "/include-files/my_sched.toml");
1257    }
1258
1259    #[test]
1260    fn config_file_parts_bare_filename() {
1261        static SCHED: Scheduler = Scheduler::named("cfg").config_file("config.toml");
1262        let entry = KtstrTestEntry {
1263            name: "cfg_bare",
1264            scheduler: &SCHED,
1265            ..KtstrTestEntry::DEFAULT
1266        };
1267        let (archive, host, guest) = config_file_parts(&entry).unwrap();
1268        assert_eq!(archive, "include-files/config.toml");
1269        assert_eq!(host, PathBuf::from("config.toml"));
1270        assert_eq!(guest, "/include-files/config.toml");
1271    }
1272
1273    #[test]
1274    fn config_file_parts_none_when_unset() {
1275        let entry = KtstrTestEntry {
1276            name: "no_cfg",
1277            ..KtstrTestEntry::DEFAULT
1278        };
1279        assert!(config_file_parts(&entry).is_none());
1280    }
1281
1282    // -- build_cmdline_extra --
1283
1284    use super::super::entry::{KtstrTestEntry, Sysctl};
1285
1286    #[test]
1287    fn build_cmdline_extra_default_is_sidecar_only() {
1288        let _lock = lock_env();
1289        // Make sure the env does not inject spurious RUST_BACKTRACE /
1290        // RUST_LOG entries that would break the default assertion.
1291        let _env_bt = EnvVarGuard::remove("RUST_BACKTRACE");
1292        let _env_log = EnvVarGuard::remove("RUST_LOG");
1293        // Pin KTSTR_SIDECAR_DIR so the propagation token shape is
1294        // stable across tests; without the override, the call falls
1295        // through to the `{kernel}-{commit}` resolver whose output
1296        // depends on the test process's git state.
1297        let _env_sd = EnvVarGuard::set(crate::KTSTR_SIDECAR_DIR_ENV, "/tmp/ktstr-test");
1298
1299        let entry = KtstrTestEntry {
1300            name: "cmdline_test",
1301            ..KtstrTestEntry::DEFAULT
1302        };
1303        let out = build_cmdline_extra(&entry);
1304        assert_eq!(out, "KTSTR_SIDECAR_DIR=/tmp/ktstr-test");
1305    }
1306
1307    #[test]
1308    fn build_cmdline_extra_appends_sysctls_kargs() {
1309        let _lock = lock_env();
1310        let _env_bt = EnvVarGuard::remove("RUST_BACKTRACE");
1311        let _env_log = EnvVarGuard::remove("RUST_LOG");
1312        let _env_sd = EnvVarGuard::set(crate::KTSTR_SIDECAR_DIR_ENV, "/tmp/ktstr-test");
1313
1314        static SYSCTLS: &[Sysctl] = &[Sysctl::new("kernel.foo", "1")];
1315        static SCHED: Scheduler = Scheduler::named("s").sysctls(SYSCTLS).kargs(&["quiet"]);
1316        let entry = KtstrTestEntry {
1317            name: "cmd",
1318            scheduler: &SCHED,
1319            ..KtstrTestEntry::DEFAULT
1320        };
1321        let out = build_cmdline_extra(&entry);
1322        assert_eq!(
1323            out,
1324            "sysctl.kernel.foo=1 quiet KTSTR_SIDECAR_DIR=/tmp/ktstr-test"
1325        );
1326    }
1327
1328    #[test]
1329    fn build_cmdline_extra_propagates_rust_env() {
1330        let _lock = lock_env();
1331        let _env_bt = EnvVarGuard::set("RUST_BACKTRACE", "1");
1332        let _env_log = EnvVarGuard::set("RUST_LOG", "debug");
1333        let _env_sd = EnvVarGuard::set(crate::KTSTR_SIDECAR_DIR_ENV, "/tmp/ktstr-test");
1334
1335        let entry = KtstrTestEntry {
1336            name: "cmd",
1337            ..KtstrTestEntry::DEFAULT
1338        };
1339        let out = build_cmdline_extra(&entry);
1340        assert!(
1341            out.contains("RUST_BACKTRACE=1"),
1342            "expected RUST_BACKTRACE propagation: {out}"
1343        );
1344        assert!(
1345            out.contains("RUST_LOG=debug"),
1346            "expected RUST_LOG propagation: {out}"
1347        );
1348        assert!(
1349            out.contains("KTSTR_SIDECAR_DIR=/tmp/ktstr-test"),
1350            "expected KTSTR_SIDECAR_DIR propagation: {out}"
1351        );
1352    }
1353
1354    #[test]
1355    fn build_cmdline_extra_propagates_sidecar_dir() {
1356        let _lock = lock_env();
1357        let _env_bt = EnvVarGuard::remove("RUST_BACKTRACE");
1358        let _env_log = EnvVarGuard::remove("RUST_LOG");
1359        // Explicit override path proves the token shape is exactly
1360        // `KTSTR_SIDECAR_DIR=<path>` and uses the override verbatim
1361        // (host's `sidecar_dir()` honours the env var as the
1362        // operator-chosen override slot).
1363        let _env_sd = EnvVarGuard::set(crate::KTSTR_SIDECAR_DIR_ENV, "/explicit/sidecar/dir");
1364
1365        let entry = KtstrTestEntry {
1366            name: "cmd",
1367            ..KtstrTestEntry::DEFAULT
1368        };
1369        let out = build_cmdline_extra(&entry);
1370        assert_eq!(out, "KTSTR_SIDECAR_DIR=/explicit/sidecar/dir");
1371    }
1372
1373    // -- resolve_vm_topology --
1374
1375    #[test]
1376    fn resolve_vm_topology_override_is_verbatim() {
1377        let entry = KtstrTestEntry {
1378            name: "topo_test",
1379            ..KtstrTestEntry::DEFAULT
1380        };
1381        let over = super::super::topo::TopoOverride {
1382            numa_nodes: 2,
1383            llcs: 4,
1384            cores: 8,
1385            threads: 2,
1386            memory_mib: 4096,
1387        };
1388        let (topo, mem) = resolve_vm_topology(&entry, Some(&over));
1389        assert_eq!(mem, 4096);
1390        assert_eq!(topo.llcs, 4);
1391        assert_eq!(topo.cores_per_llc, 8);
1392        assert_eq!(topo.threads_per_core, 2);
1393        assert_eq!(topo.numa_nodes, 2);
1394    }
1395
1396    #[test]
1397    fn resolve_vm_topology_none_floors_memory_at_256() {
1398        // Tiny topology: 1*1*1=1 cpu -> 64 MiB raw, entry.memory_mib=0,
1399        // floor = max(64, 256, 0) = 256.
1400        //
1401        // Override memory_mib explicitly to 0 — KtstrTestEntry::DEFAULT
1402        // sets memory_mib=2048, which would bypass the floor entirely
1403        // and leave this test vacuously passing regardless of the
1404        // max(…, 256, …) branch. Setting memory_mib=0 makes the 256
1405        // floor the exact lower bound the assertion verifies.
1406        let entry = KtstrTestEntry {
1407            name: "tiny",
1408            memory_mib: 0,
1409            ..KtstrTestEntry::DEFAULT
1410        };
1411        let (_topo, mem) = resolve_vm_topology(&entry, None);
1412        assert_eq!(mem, 256, "memory floor = 256 MiB, got {mem}");
1413    }
1414
1415    #[test]
1416    fn resolve_vm_topology_none_honors_entry_memory_mib() {
1417        // Entry with explicit memory_mib above the cpu*64 and 256 floors.
1418        let entry = KtstrTestEntry {
1419            name: "mem",
1420            memory_mib: 8192,
1421            ..KtstrTestEntry::DEFAULT
1422        };
1423        let (_topo, mem) = resolve_vm_topology(&entry, None);
1424        assert_eq!(mem, 8192);
1425    }
1426
1427    #[cfg(feature = "wprof")]
1428    #[test]
1429    fn resolve_vm_topology_wprof_floors_memory_on_entry_path() {
1430        // Entry with wprof=true and memory below the wprof floor.
1431        // The entry-derived path must raise memory to WPROF_MIN_MEMORY_MIB.
1432        let entry = KtstrTestEntry {
1433            name: "wprof_floor",
1434            memory_mib: 512,
1435            wprof: true,
1436            ..KtstrTestEntry::DEFAULT
1437        };
1438        let (_topo, mem) = resolve_vm_topology(&entry, None);
1439        assert_eq!(
1440            mem,
1441            crate::vmm::wprof::WPROF_MIN_MEMORY_MIB,
1442            "wprof=true must bump memory to >= WPROF_MIN_MEMORY_MIB \
1443             ({}), got {mem}",
1444            crate::vmm::wprof::WPROF_MIN_MEMORY_MIB,
1445        );
1446    }
1447
1448    #[cfg(feature = "wprof")]
1449    #[test]
1450    fn resolve_vm_topology_wprof_no_bump_when_already_above_floor() {
1451        // Entry already above the wprof floor — must be honored unchanged.
1452        let entry = KtstrTestEntry {
1453            name: "wprof_high",
1454            memory_mib: 8192,
1455            wprof: true,
1456            ..KtstrTestEntry::DEFAULT
1457        };
1458        let (_topo, mem) = resolve_vm_topology(&entry, None);
1459        assert_eq!(
1460            mem, 8192,
1461            "memory_mib above WPROF_MIN_MEMORY_MIB must be honored \
1462             unchanged, got {mem}"
1463        );
1464    }
1465
1466    #[test]
1467    fn resolve_vm_topology_wprof_disabled_does_not_floor() {
1468        // wprof=false: the wprof floor must NOT apply, even when
1469        // entry.memory_mib falls below WPROF_MIN_MEMORY_MIB. Only
1470        // the universal 256 floor + cpu*64 derivation apply.
1471        let entry = KtstrTestEntry {
1472            name: "no_wprof",
1473            memory_mib: 512,
1474            wprof: false,
1475            ..KtstrTestEntry::DEFAULT
1476        };
1477        let (_topo, mem) = resolve_vm_topology(&entry, None);
1478        assert_eq!(
1479            mem, 512,
1480            "wprof=false must not invoke the WPROF_MIN_MEMORY_MIB \
1481             floor, got {mem}"
1482        );
1483    }
1484
1485    #[test]
1486    fn derive_test_memory_mib_baseline_without_wprof() {
1487        let entry = KtstrTestEntry {
1488            name: "baseline",
1489            memory_mib: 0,
1490            ..KtstrTestEntry::DEFAULT
1491        };
1492        let mem = derive_test_memory_mib(2, &entry);
1493        assert_eq!(mem, 256, "2 cpus * 64 = 128, floor 256 wins");
1494    }
1495
1496    #[cfg(feature = "wprof")]
1497    #[test]
1498    fn resolve_vm_topology_wprof_no_bump_at_exact_floor() {
1499        // Boundary case: derived memory equals WPROF_MIN_MEMORY_MIB
1500        // exactly. The handler uses strict `<` so 2048 passes through
1501        // unchanged. A regression that flipped to `<=` would be a
1502        // 2048→2048 no-op (still unobservable), but a regression
1503        // that flipped to `>` (or `>= ... { raw } else { FLOOR }`)
1504        // would catastrophically floor every test. This test pins
1505        // the strict-less-than direction.
1506        let entry = KtstrTestEntry {
1507            name: "wprof_exact",
1508            memory_mib: crate::vmm::wprof::WPROF_MIN_MEMORY_MIB,
1509            wprof: true,
1510            ..KtstrTestEntry::DEFAULT
1511        };
1512        let (_topo, mem) = resolve_vm_topology(&entry, None);
1513        assert_eq!(
1514            mem,
1515            crate::vmm::wprof::WPROF_MIN_MEMORY_MIB,
1516            "memory_mib equal to WPROF_MIN_MEMORY_MIB must pass \
1517             through unchanged (strict-less-than floor condition); \
1518             got {mem}"
1519        );
1520    }
1521
1522    #[cfg(feature = "wprof")]
1523    #[test]
1524    fn resolve_vm_topology_wprof_floors_zero_entry_memory_mib() {
1525        // Edge case: entry.memory_mib=0 with wprof=true. The raw
1526        // derivation `max(cpus*64, 256, 0)` resolves to 256 on the
1527        // default 1-CPU topology, which is well below the floor.
1528        // wprof must bump to WPROF_MIN_MEMORY_MIB.
1529        let entry = KtstrTestEntry {
1530            name: "wprof_zero_mib",
1531            memory_mib: 0,
1532            wprof: true,
1533            ..KtstrTestEntry::DEFAULT
1534        };
1535        let (_topo, mem) = resolve_vm_topology(&entry, None);
1536        assert_eq!(
1537            mem,
1538            crate::vmm::wprof::WPROF_MIN_MEMORY_MIB,
1539            "entry.memory_mib=0 with wprof=true must floor to \
1540             WPROF_MIN_MEMORY_MIB ({}); got {mem}",
1541            crate::vmm::wprof::WPROF_MIN_MEMORY_MIB,
1542        );
1543    }
1544
1545    #[cfg(feature = "wprof")]
1546    #[test]
1547    fn derive_test_memory_mib_helper_applies_wprof_floor() {
1548        // Direct test of the derivation helper used by BOTH
1549        // resolve_vm_topology AND the dispatch.rs sites that
1550        // construct TopoOverride from CLI / preset topology
1551        // (run_ktstr_test_with_topo_str, run_gauntlet_test).
1552        // Pins that the helper applies the wprof floor — a
1553        // regression that re-inlined the formula at the dispatch
1554        // sites without the wprof check would silently bypass
1555        // the floor when `cargo ktstr test --ktstr-topo` runs
1556        // against a wprof-tagged test.
1557        let entry = KtstrTestEntry {
1558            name: "helper",
1559            memory_mib: 0,
1560            wprof: true,
1561            ..KtstrTestEntry::DEFAULT
1562        };
1563        let mem = derive_test_memory_mib(2, &entry);
1564        assert_eq!(
1565            mem,
1566            crate::vmm::wprof::WPROF_MIN_MEMORY_MIB,
1567            "helper must floor wprof memory regardless of caller; got {mem}"
1568        );
1569
1570        // wprof=false: derivation returns the raw formula
1571        // without any floor.
1572        let entry_no_wprof = KtstrTestEntry {
1573            wprof: false,
1574            ..entry
1575        };
1576        let mem = derive_test_memory_mib(2, &entry_no_wprof);
1577        assert_eq!(
1578            mem, 256,
1579            "helper with wprof=false must NOT apply the floor; \
1580             expected max(2*64, 256, 0)=256, got {mem}"
1581        );
1582    }
1583
1584    #[cfg(feature = "wprof")]
1585    #[test]
1586    fn resolve_vm_topology_override_with_wprof_honors_override_verbatim() {
1587        // The override-is-verbatim contract: a TopoOverride with
1588        // memory_mib below WPROF_MIN_MEMORY_MIB is honored as the
1589        // operator's explicit choice. A warn-level log fires (not
1590        // verified in this unit test — tracing capture is out of
1591        // scope here) but the boot memory matches the override.
1592        let entry = KtstrTestEntry {
1593            name: "override_wprof",
1594            wprof: true,
1595            ..KtstrTestEntry::DEFAULT
1596        };
1597        let over = super::super::topo::TopoOverride {
1598            numa_nodes: 1,
1599            llcs: 1,
1600            cores: 1,
1601            threads: 1,
1602            memory_mib: 512,
1603        };
1604        let (_topo, mem) = resolve_vm_topology(&entry, Some(&over));
1605        assert_eq!(
1606            mem, 512,
1607            "TopoOverride.memory_mib must be honored verbatim even \
1608             with wprof enabled, got {mem}"
1609        );
1610    }
1611
1612    // -- append_base_sched_args --
1613
1614    #[test]
1615    fn append_base_sched_args_empty_when_none_set() {
1616        let entry = KtstrTestEntry {
1617            name: "nosched",
1618            ..KtstrTestEntry::DEFAULT
1619        };
1620        let mut args = Vec::new();
1621        append_base_sched_args(&entry, &mut args);
1622        assert!(args.is_empty(), "no sched args expected: {args:?}");
1623    }
1624
1625    /// `cgroup_parent` does NOT auto-inject `--cell-parent-cgroup`
1626    /// into the scheduler argv — the two concerns are decoupled.
1627    /// The scheduler-def `sched_args` and the per-test
1628    /// `extra_sched_args` flow through unchanged; the `cgroup_parent`
1629    /// setting controls the framework's cgroup root but never
1630    /// modifies the scheduler's CLI invocation.
1631    #[test]
1632    fn append_base_sched_args_does_not_auto_inject_cell_parent_cgroup() {
1633        static SCHED: Scheduler = Scheduler::named("s")
1634            .cgroup_parent("/sys/fs/cgroup/ktstr")
1635            .sched_args(&["-v", "--flag"]);
1636        let entry = KtstrTestEntry {
1637            name: "sched",
1638            scheduler: &SCHED,
1639            extra_sched_args: &["--extra"],
1640            ..KtstrTestEntry::DEFAULT
1641        };
1642        let mut args = Vec::new();
1643        append_base_sched_args(&entry, &mut args);
1644        assert_eq!(
1645            args,
1646            vec![
1647                "-v".to_string(),
1648                "--flag".to_string(),
1649                "--extra".to_string(),
1650            ],
1651            "cgroup_parent must not auto-inject --cell-parent-cgroup; \
1652             only sched_args + extra_sched_args reach the scheduler"
1653        );
1654    }
1655
1656    /// User-passed `--cell-parent-cgroup /user` via `extra_sched_args`
1657    /// suppresses the auto-inject so clap inside the scheduler binary
1658    /// doesn't reject the duplicate.
1659    #[test]
1660    fn append_base_sched_args_dedupes_extra_split_form() {
1661        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1662        let entry = KtstrTestEntry {
1663            name: "sched",
1664            scheduler: &SCHED,
1665            extra_sched_args: &["--cell-parent-cgroup", "/user"],
1666            ..KtstrTestEntry::DEFAULT
1667        };
1668        let mut args = Vec::new();
1669        append_base_sched_args(&entry, &mut args);
1670        assert_eq!(
1671            args,
1672            vec!["--cell-parent-cgroup".to_string(), "/user".to_string()],
1673            "auto-inject must be skipped when extra_sched_args carries \
1674             --cell-parent-cgroup in two-token form"
1675        );
1676    }
1677
1678    /// Combined form (`--cell-parent-cgroup=/user`) must also suppress
1679    /// the auto-inject.
1680    #[test]
1681    fn append_base_sched_args_dedupes_extra_combined_form() {
1682        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1683        let entry = KtstrTestEntry {
1684            name: "sched",
1685            scheduler: &SCHED,
1686            extra_sched_args: &["--cell-parent-cgroup=/user"],
1687            ..KtstrTestEntry::DEFAULT
1688        };
1689        let mut args = Vec::new();
1690        append_base_sched_args(&entry, &mut args);
1691        assert_eq!(
1692            args,
1693            vec!["--cell-parent-cgroup=/user".to_string()],
1694            "auto-inject must be skipped when extra_sched_args carries \
1695             --cell-parent-cgroup in combined `=` form"
1696        );
1697    }
1698
1699    /// Scheduler-def `sched_args` carrying `--cell-parent-cgroup`
1700    /// also suppresses the auto-inject.
1701    #[test]
1702    fn append_base_sched_args_dedupes_scheduler_sched_args() {
1703        static SCHED: Scheduler = Scheduler::named("s")
1704            .cgroup_parent("/sys/fs/cgroup/ktstr")
1705            .sched_args(&["--cell-parent-cgroup", "/user"]);
1706        let entry = KtstrTestEntry {
1707            name: "sched",
1708            scheduler: &SCHED,
1709            ..KtstrTestEntry::DEFAULT
1710        };
1711        let mut args = Vec::new();
1712        append_base_sched_args(&entry, &mut args);
1713        assert_eq!(
1714            args,
1715            vec!["--cell-parent-cgroup".to_string(), "/user".to_string()],
1716            "auto-inject must be skipped when scheduler.sched_args carries \
1717             --cell-parent-cgroup"
1718        );
1719    }
1720
1721    /// Scheduler-def `sched_args` carrying the combined `=` form also
1722    /// suppresses the auto-inject — completes the {source × form}
1723    /// 2×2 matrix.
1724    #[test]
1725    fn append_base_sched_args_dedupes_scheduler_sched_args_combined_form() {
1726        static SCHED: Scheduler = Scheduler::named("s")
1727            .cgroup_parent("/sys/fs/cgroup/ktstr")
1728            .sched_args(&["--cell-parent-cgroup=/user"]);
1729        let entry = KtstrTestEntry {
1730            name: "sched",
1731            scheduler: &SCHED,
1732            ..KtstrTestEntry::DEFAULT
1733        };
1734        let mut args = Vec::new();
1735        append_base_sched_args(&entry, &mut args);
1736        assert_eq!(
1737            args,
1738            vec!["--cell-parent-cgroup=/user".to_string()],
1739            "auto-inject must be skipped when scheduler.sched_args carries \
1740             --cell-parent-cgroup in combined `=` form"
1741        );
1742    }
1743
1744    /// When BOTH scheduler.sched_args AND extra_sched_args carry
1745    /// `--cell-parent-cgroup`, the framework's auto-inject is
1746    /// suppressed (`.any()` short-circuits on first match) but the
1747    /// user's duplicates flow through unchanged. The framework does
1748    /// not dedupe user-supplied duplicates — clap inside the
1749    /// scheduler binary will reject them with "cannot be used
1750    /// multiple times", as it should. Pin: the framework correctly
1751    /// avoids ADDING a third copy.
1752    #[test]
1753    fn append_base_sched_args_does_not_dedupe_user_dupes() {
1754        static SCHED: Scheduler = Scheduler::named("s")
1755            .cgroup_parent("/sys/fs/cgroup/ktstr")
1756            .sched_args(&["--cell-parent-cgroup", "/sched"]);
1757        let entry = KtstrTestEntry {
1758            name: "sched",
1759            scheduler: &SCHED,
1760            extra_sched_args: &["--cell-parent-cgroup", "/extra"],
1761            ..KtstrTestEntry::DEFAULT
1762        };
1763        let mut args = Vec::new();
1764        append_base_sched_args(&entry, &mut args);
1765        assert_eq!(
1766            args,
1767            vec![
1768                "--cell-parent-cgroup".to_string(),
1769                "/sched".to_string(),
1770                "--cell-parent-cgroup".to_string(),
1771                "/extra".to_string(),
1772            ],
1773            "framework auto-inject is suppressed; both user-supplied \
1774             entries flow through unchanged (user owns the dup)"
1775        );
1776    }
1777
1778    /// Empty combined value (`--cell-parent-cgroup=`) is rejected at
1779    /// the framework gate with an actionable panic that names the
1780    /// offending test and points the operator at the right fix.
1781    /// Empty values would resolve to `/sys/fs/cgroup` (the host
1782    /// cgroup root) downstream — guaranteed to corrupt unrelated
1783    /// cgroup state — so the framework rejects rather than letting
1784    /// clap surface a generic "value required" error after the
1785    /// cgroup hierarchy has already been built.
1786    #[test]
1787    #[should_panic(expected = "that does not start with `/`")]
1788    fn append_base_sched_args_panics_on_empty_combined_value_via_extra() {
1789        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1790        let entry = KtstrTestEntry {
1791            name: "sched",
1792            scheduler: &SCHED,
1793            extra_sched_args: &["--cell-parent-cgroup="],
1794            ..KtstrTestEntry::DEFAULT
1795        };
1796        let mut args = Vec::new();
1797        append_base_sched_args(&entry, &mut args);
1798    }
1799
1800    /// Two-token form with an empty value as the second token
1801    /// (`["--cell-parent-cgroup", ""]`) is rejected by the same gate.
1802    /// Covers the second route into `parse_cell_parent_cgroup` so a
1803    /// future refactor that switches the empty-detection logic on
1804    /// only one form gets caught.
1805    #[test]
1806    #[should_panic(expected = "that does not start with `/`")]
1807    fn append_base_sched_args_panics_on_empty_two_token_value_via_extra() {
1808        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1809        let entry = KtstrTestEntry {
1810            name: "sched_two_token",
1811            scheduler: &SCHED,
1812            extra_sched_args: &["--cell-parent-cgroup", ""],
1813            ..KtstrTestEntry::DEFAULT
1814        };
1815        let mut args = Vec::new();
1816        append_base_sched_args(&entry, &mut args);
1817    }
1818
1819    /// Bad value via the scheduler-def's own `sched_args` rather than
1820    /// the test's `extra_sched_args` — the chain at the parser site
1821    /// covers both sources, so the gate fires regardless of origin.
1822    /// Pins both the combined form and the scheduler origin.
1823    #[test]
1824    #[should_panic(expected = "that does not start with `/`")]
1825    fn append_base_sched_args_panics_on_empty_combined_value_via_scheduler_sched_args() {
1826        static SCHED: Scheduler = Scheduler::named("s")
1827            .cgroup_parent("/sys/fs/cgroup/ktstr")
1828            .sched_args(&["--cell-parent-cgroup="]);
1829        let entry = KtstrTestEntry {
1830            name: "sched_in_def",
1831            scheduler: &SCHED,
1832            ..KtstrTestEntry::DEFAULT
1833        };
1834        let mut args = Vec::new();
1835        append_base_sched_args(&entry, &mut args);
1836    }
1837
1838    /// Two-token form via the scheduler-def origin — completes the
1839    /// 2-source × 2-form matrix together with the three siblings.
1840    #[test]
1841    #[should_panic(expected = "that does not start with `/`")]
1842    fn append_base_sched_args_panics_on_empty_two_token_value_via_scheduler_sched_args() {
1843        static SCHED: Scheduler = Scheduler::named("s")
1844            .cgroup_parent("/sys/fs/cgroup/ktstr")
1845            .sched_args(&["--cell-parent-cgroup", ""]);
1846        let entry = KtstrTestEntry {
1847            name: "sched_in_def_two_token",
1848            scheduler: &SCHED,
1849            ..KtstrTestEntry::DEFAULT
1850        };
1851        let mut args = Vec::new();
1852        append_base_sched_args(&entry, &mut args);
1853    }
1854
1855    /// Empty-value gate fires even when the scheduler-def has no
1856    /// `cgroup_parent` default. Without the universal gate the empty
1857    /// value would slip through and corrupt unrelated host cgroup
1858    /// state at the downstream `resolve_cgroup_root` interpolation.
1859    #[test]
1860    #[should_panic(expected = "that does not start with `/`")]
1861    fn append_base_sched_args_panics_on_empty_combined_value_no_scheduler_cgroup_parent() {
1862        static SCHED: Scheduler = Scheduler::named("s");
1863        let entry = KtstrTestEntry {
1864            name: "no_default_cgroup",
1865            scheduler: &SCHED,
1866            extra_sched_args: &["--cell-parent-cgroup="],
1867            ..KtstrTestEntry::DEFAULT
1868        };
1869        let mut args = Vec::new();
1870        append_base_sched_args(&entry, &mut args);
1871    }
1872
1873    /// Two-token form, no scheduler default — completes the
1874    /// no-default matrix together with the combined-form sibling.
1875    #[test]
1876    #[should_panic(expected = "that does not start with `/`")]
1877    fn append_base_sched_args_panics_on_empty_two_token_value_no_scheduler_cgroup_parent() {
1878        static SCHED: Scheduler = Scheduler::named("s");
1879        let entry = KtstrTestEntry {
1880            name: "no_default_cgroup_two_token",
1881            scheduler: &SCHED,
1882            extra_sched_args: &["--cell-parent-cgroup", ""],
1883            ..KtstrTestEntry::DEFAULT
1884        };
1885        let mut args = Vec::new();
1886        append_base_sched_args(&entry, &mut args);
1887    }
1888
1889    /// Relative path (no leading `/`) is rejected by the same gate.
1890    /// Pins the broader contract (the message explicitly promises
1891    /// "absolute path under `/`"); empty is just one case of
1892    /// non-absolute.
1893    #[test]
1894    #[should_panic(expected = "that does not start with `/`")]
1895    fn append_base_sched_args_panics_on_relative_path_value() {
1896        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1897        let entry = KtstrTestEntry {
1898            name: "relative_path",
1899            scheduler: &SCHED,
1900            extra_sched_args: &["--cell-parent-cgroup=my_test"],
1901            ..KtstrTestEntry::DEFAULT
1902        };
1903        let mut args = Vec::new();
1904        append_base_sched_args(&entry, &mut args);
1905    }
1906
1907    /// Two-token form of the relative-path case. Closes the matrix
1908    /// gap: combined-form was pinned by the sibling above but a
1909    /// future refactor that split path validation between the
1910    /// combined and two-token branches could regress one form
1911    /// without test catching.
1912    #[test]
1913    #[should_panic(expected = "that does not start with `/`")]
1914    fn append_base_sched_args_panics_on_relative_path_value_two_token() {
1915        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1916        let entry = KtstrTestEntry {
1917            name: "relative_path_two_token",
1918            scheduler: &SCHED,
1919            extra_sched_args: &["--cell-parent-cgroup", "my_test"],
1920            ..KtstrTestEntry::DEFAULT
1921        };
1922        let mut args = Vec::new();
1923        append_base_sched_args(&entry, &mut args);
1924    }
1925
1926    /// `/.` is absolute and has more than one character, so a naive
1927    /// `starts_with('/') && len > 1` check passes — but the kernel
1928    /// canonicalizes `/sys/fs/cgroup/.` back to `/sys/fs/cgroup`
1929    /// (host cgroup root), corrupting unrelated cgroup state.
1930    /// `Path::components` strips the trailing `.`, yielding `[RootDir]`
1931    /// — the validator rejects via the "has no Normal component"
1932    /// check, not the CurDir arm (see `cell_parent_path_is_valid`).
1933    #[test]
1934    #[should_panic(expected = "contains `.`/`..` segments")]
1935    fn append_base_sched_args_panics_on_dot_normalizing_to_root() {
1936        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1937        let entry = KtstrTestEntry {
1938            name: "dot_normalize",
1939            scheduler: &SCHED,
1940            extra_sched_args: &["--cell-parent-cgroup=/."],
1941            ..KtstrTestEntry::DEFAULT
1942        };
1943        let mut args = Vec::new();
1944        append_base_sched_args(&entry, &mut args);
1945    }
1946
1947    /// `/foo/..` canonicalizes back to `/` → `/sys/fs/cgroup`. Same
1948    /// host-root corruption risk as the empty/bare-slash cases. The
1949    /// component-based gate rejects any `..` (ParentDir) segment.
1950    #[test]
1951    #[should_panic(expected = "contains `.`/`..` segments")]
1952    fn append_base_sched_args_panics_on_parent_dir_normalizing_to_root() {
1953        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1954        let entry = KtstrTestEntry {
1955            name: "parent_dir_normalize",
1956            scheduler: &SCHED,
1957            extra_sched_args: &["--cell-parent-cgroup=/foo/.."],
1958            ..KtstrTestEntry::DEFAULT
1959        };
1960        let mut args = Vec::new();
1961        append_base_sched_args(&entry, &mut args);
1962    }
1963
1964    /// Mixed `/./bar/..` — both kinds of normalizing segment in one
1965    /// path. `Path::components` strips the leading `/.`, yielding
1966    /// `[RootDir, Normal("bar"), ParentDir]`; the validator reaches
1967    /// the `ParentDir` and rejects via that arm. The `/.` never
1968    /// surfaces as a CurDir component.
1969    #[test]
1970    #[should_panic(expected = "contains `.`/`..` segments")]
1971    fn append_base_sched_args_panics_on_mixed_normalize_segments() {
1972        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1973        let entry = KtstrTestEntry {
1974            name: "mixed_normalize",
1975            scheduler: &SCHED,
1976            extra_sched_args: &["--cell-parent-cgroup=/./bar/.."],
1977            ..KtstrTestEntry::DEFAULT
1978        };
1979        let mut args = Vec::new();
1980        append_base_sched_args(&entry, &mut args);
1981    }
1982
1983    /// `/foo/./bar` is ACCEPTED — `Path::components` normalizes away
1984    /// every `CurDir` segment (see `cell_parent_path_is_valid` for
1985    /// the full per-position behavior); the canonical form
1986    /// `/foo/bar` is a real non-root path. Pin the accept path so a
1987    /// future refactor to a stricter `.contains("/./")` text check
1988    /// is caught. Also assert the user value flows through verbatim
1989    /// — a regression that canonicalized the path before forwarding
1990    /// would silently rewrite `/foo/./bar` to `/foo/bar`.
1991    #[test]
1992    fn append_base_sched_args_accepts_embedded_dot_segment() {
1993        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
1994        let entry = KtstrTestEntry {
1995            name: "embedded_dot_ok",
1996            scheduler: &SCHED,
1997            extra_sched_args: &["--cell-parent-cgroup=/foo/./bar"],
1998            ..KtstrTestEntry::DEFAULT
1999        };
2000        let mut args = Vec::new();
2001        append_base_sched_args(&entry, &mut args);
2002        assert!(
2003            args.iter().any(|a| a == "--cell-parent-cgroup=/foo/./bar"),
2004            "user value must pass through verbatim (no canonicalization); args: {args:?}",
2005        );
2006    }
2007
2008    /// Bare `/..` is the most damaging path-normalize edge:
2009    /// downstream interpolation `/sys/fs/cgroup/..` canonicalizes to
2010    /// `/sys/fs` — escapes the cgroup hierarchy entirely. The
2011    /// component walk hits `ParentDir` immediately after `RootDir`
2012    /// (no Normal segment between them) and rejects via the
2013    /// ParentDir arm.
2014    #[test]
2015    #[should_panic(expected = "contains `.`/`..` segments")]
2016    fn append_base_sched_args_panics_on_bare_parent_dir() {
2017        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
2018        let entry = KtstrTestEntry {
2019            name: "bare_parent_dir",
2020            scheduler: &SCHED,
2021            extra_sched_args: &["--cell-parent-cgroup=/.."],
2022            ..KtstrTestEntry::DEFAULT
2023        };
2024        let mut args = Vec::new();
2025        append_base_sched_args(&entry, &mut args);
2026    }
2027
2028    /// Mid-path `/foo/../bar` — ParentDir sits BETWEEN Normal
2029    /// segments. Different shape from `/foo/..` (trailing
2030    /// ParentDir): a regression that bailed only on
2031    /// `path.ends_with("/..")` would slip this past. Downstream
2032    /// interpolation `/sys/fs/cgroup/foo/../bar` canonicalizes to
2033    /// `/sys/fs/cgroup/bar` — an unintended sibling directory the
2034    /// test author didn't ask for. Component walk catches ParentDir
2035    /// in any position.
2036    #[test]
2037    #[should_panic(expected = "contains `.`/`..` segments")]
2038    fn append_base_sched_args_panics_on_mid_path_parent_dir() {
2039        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
2040        let entry = KtstrTestEntry {
2041            name: "mid_path_parent_dir",
2042            scheduler: &SCHED,
2043            extra_sched_args: &["--cell-parent-cgroup=/foo/../bar"],
2044            ..KtstrTestEntry::DEFAULT
2045        };
2046        let mut args = Vec::new();
2047        append_base_sched_args(&entry, &mut args);
2048    }
2049
2050    /// Bare `/` slips a naive `starts_with('/')` check but resolves
2051    /// downstream to `/sys/fs/cgroup/` — semantically the host cgroup
2052    /// root, same corruption risk as the empty case. The gate mirrors
2053    /// `CgroupPath::new`'s const-eval contract (rejects both
2054    /// no-leading-slash AND `"/"` alone) so runtime values share the
2055    /// same validation as compile-time declarations.
2056    #[test]
2057    #[should_panic(expected = "is `/` alone")]
2058    fn append_base_sched_args_panics_on_bare_slash_value() {
2059        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
2060        let entry = KtstrTestEntry {
2061            name: "bare_slash",
2062            scheduler: &SCHED,
2063            extra_sched_args: &["--cell-parent-cgroup=/"],
2064            ..KtstrTestEntry::DEFAULT
2065        };
2066        let mut args = Vec::new();
2067        append_base_sched_args(&entry, &mut args);
2068    }
2069
2070    /// Combined-form empty value via scheduler-def `sched_args`
2071    /// when the scheduler also has NO `cgroup_parent` default. Closes
2072    /// the matrix intersection: a future refactor that gates the
2073    /// scheduler-def-source check on `cgroup_parent.is_some()` would
2074    /// pass the other 6 empty tests but regress this cell.
2075    #[test]
2076    #[should_panic(expected = "that does not start with `/`")]
2077    fn append_base_sched_args_panics_on_empty_combined_value_in_scheduler_sched_args_no_default() {
2078        static SCHED: Scheduler = Scheduler::named("s").sched_args(&["--cell-parent-cgroup="]);
2079        let entry = KtstrTestEntry {
2080            name: "scheduler_def_origin_no_default",
2081            scheduler: &SCHED,
2082            ..KtstrTestEntry::DEFAULT
2083        };
2084        let mut args = Vec::new();
2085        append_base_sched_args(&entry, &mut args);
2086    }
2087
2088    /// Two-token-form sibling of the above — completes the
2089    /// 2-form coverage for the scheduler-def-origin × no-default
2090    /// intersection.
2091    #[test]
2092    #[should_panic(expected = "that does not start with `/`")]
2093    fn append_base_sched_args_panics_on_empty_two_token_value_in_scheduler_sched_args_no_default() {
2094        static SCHED: Scheduler = Scheduler::named("s").sched_args(&["--cell-parent-cgroup", ""]);
2095        let entry = KtstrTestEntry {
2096            name: "scheduler_def_origin_two_token_no_default",
2097            scheduler: &SCHED,
2098            ..KtstrTestEntry::DEFAULT
2099        };
2100        let mut args = Vec::new();
2101        append_base_sched_args(&entry, &mut args);
2102    }
2103
2104    /// Bare `--cell-parent-cgroup` flag with no following token
2105    /// (two-token form, trailing in argv) is rejected at the
2106    /// framework gate via the `CellParentCgroupArg::MissingValue`
2107    /// arm. Previously this shape parsed as "absent", triggered the
2108    /// auto-inject, and produced two copies of the flag in the final
2109    /// argv that clap then rejected with a confused "cannot be used
2110    /// multiple times" diagnostic. The gate intercepts here so the
2111    /// operator gets a "missing value" message anchored to their
2112    /// declaration.
2113    #[test]
2114    #[should_panic(expected = "supplies a bare `--cell-parent-cgroup`")]
2115    fn append_base_sched_args_panics_on_missing_value_via_extra() {
2116        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
2117        let entry = KtstrTestEntry {
2118            name: "missing_value_extra",
2119            scheduler: &SCHED,
2120            extra_sched_args: &["--cell-parent-cgroup"],
2121            ..KtstrTestEntry::DEFAULT
2122        };
2123        let mut args = Vec::new();
2124        append_base_sched_args(&entry, &mut args);
2125    }
2126
2127    /// Bare flag preceded by an unrelated trailing token still trips
2128    /// the MissingValue arm — the parser walks the chain in order,
2129    /// hits the bare flag, and `iter.next()` returns None at end of
2130    /// stream regardless of which unrelated tokens came before it.
2131    #[test]
2132    #[should_panic(expected = "supplies a bare `--cell-parent-cgroup`")]
2133    fn append_base_sched_args_panics_on_missing_value_after_other_flag() {
2134        static SCHED: Scheduler = Scheduler::named("s").cgroup_parent("/sys/fs/cgroup/ktstr");
2135        let entry = KtstrTestEntry {
2136            name: "missing_value_after_other",
2137            scheduler: &SCHED,
2138            extra_sched_args: &["--other-flag", "--cell-parent-cgroup"],
2139            ..KtstrTestEntry::DEFAULT
2140        };
2141        let mut args = Vec::new();
2142        append_base_sched_args(&entry, &mut args);
2143    }
2144
2145    /// Bare flag in the scheduler-def's `sched_args` also trips
2146    /// MissingValue — the parser chains both sources and the
2147    /// universal gate handles them identically.
2148    #[test]
2149    #[should_panic(expected = "supplies a bare `--cell-parent-cgroup`")]
2150    fn append_base_sched_args_panics_on_missing_value_in_scheduler_sched_args() {
2151        static SCHED: Scheduler = Scheduler::named("s")
2152            .cgroup_parent("/sys/fs/cgroup/ktstr")
2153            .sched_args(&["--cell-parent-cgroup"]);
2154        let entry = KtstrTestEntry {
2155            name: "missing_value_scheduler_def",
2156            scheduler: &SCHED,
2157            ..KtstrTestEntry::DEFAULT
2158        };
2159        let mut args = Vec::new();
2160        append_base_sched_args(&entry, &mut args);
2161    }
2162
2163    /// Bare flag with no scheduler default `cgroup_parent`. The
2164    /// universal gate must still fire — the panic message in this
2165    /// case omits the "let the framework auto-inject" suggestion
2166    /// (no default to inject) and adds a hint that an absolute path
2167    /// is required for cell-aware schedulers without a declared
2168    /// default.
2169    #[test]
2170    #[should_panic(expected = "supplies a bare `--cell-parent-cgroup`")]
2171    fn append_base_sched_args_panics_on_missing_value_no_scheduler_cgroup_parent() {
2172        static SCHED: Scheduler = Scheduler::named("s");
2173        let entry = KtstrTestEntry {
2174            name: "missing_value_no_default",
2175            scheduler: &SCHED,
2176            extra_sched_args: &["--cell-parent-cgroup"],
2177            ..KtstrTestEntry::DEFAULT
2178        };
2179        let mut args = Vec::new();
2180        append_base_sched_args(&entry, &mut args);
2181    }
2182
2183    /// Bare flag via scheduler-def `sched_args` with no default
2184    /// `cgroup_parent`. Closes the matrix intersection: a future
2185    /// refactor that gated the MissingValue check on
2186    /// `cgroup_parent.is_some()` (mirroring an earlier regression
2187    /// fixed for Value-invalid) would pass the other 4 MissingValue
2188    /// tests but regress this cell.
2189    #[test]
2190    #[should_panic(expected = "supplies a bare `--cell-parent-cgroup`")]
2191    fn append_base_sched_args_panics_on_missing_value_in_scheduler_sched_args_no_default() {
2192        static SCHED: Scheduler = Scheduler::named("s").sched_args(&["--cell-parent-cgroup"]);
2193        let entry = KtstrTestEntry {
2194            name: "missing_value_scheduler_def_no_default",
2195            scheduler: &SCHED,
2196            ..KtstrTestEntry::DEFAULT
2197        };
2198        let mut args = Vec::new();
2199        append_base_sched_args(&entry, &mut args);
2200    }
2201
2202    /// Bare flag after another flag, with no scheduler default.
2203    /// Completes the after-other-flag × default matrix together with
2204    /// the sibling test that has a default.
2205    #[test]
2206    #[should_panic(expected = "supplies a bare `--cell-parent-cgroup`")]
2207    fn append_base_sched_args_panics_on_missing_value_after_other_flag_no_default() {
2208        static SCHED: Scheduler = Scheduler::named("s");
2209        let entry = KtstrTestEntry {
2210            name: "missing_value_after_other_no_default",
2211            scheduler: &SCHED,
2212            extra_sched_args: &["--other-flag", "--cell-parent-cgroup"],
2213            ..KtstrTestEntry::DEFAULT
2214        };
2215        let mut args = Vec::new();
2216        append_base_sched_args(&entry, &mut args);
2217    }
2218
2219    // -- build_vm_builder_base --
2220
2221    /// Kernel-path surfaces in the builder's "kernel not found" error.
2222    /// Proves the `kernel()` setter is wired through the helper.
2223    #[test]
2224    fn build_vm_builder_base_propagates_kernel_path() {
2225        // build()'s no-perf path reads KTSTR_BYPASS_LLC_LOCKS + KTSTR_CPU_CAP
2226        // before the validation checks. Under the shared env lock, pin
2227        // bypass=1 + cpu_cap unset so build() short-circuits the slot/LLC
2228        // acquire path (no acquire_llc_plan contention; cpu_cap=None avoids
2229        // the bypass+cpu_cap bail), leaving the asserted error the only outcome.
2230        let _l = lock_env();
2231        let _g = EnvVarGuard::set(crate::KTSTR_BYPASS_LLC_LOCKS_ENV, "1");
2232        let _c = EnvVarGuard::remove(crate::KTSTR_CPU_CAP_ENV);
2233        let entry = KtstrTestEntry {
2234            name: "vmb_kernel_path",
2235            ..KtstrTestEntry::DEFAULT
2236        };
2237        let exe = crate::resolve_current_exe().unwrap();
2238        let missing_kernel =
2239            PathBuf::from("/nonexistent/build_vm_builder_base_test_kernel.bzImage");
2240        let result = build_vm_builder_base(
2241            &entry,
2242            &missing_kernel,
2243            &exe,
2244            None,
2245            &[],
2246            crate::vmm::topology::Topology::new(1, 1, 1, 1),
2247            256,
2248            "",
2249            &["run".to_string()],
2250            true,
2251        )
2252        .build();
2253        // `KtstrVm` does not implement Debug, so `.unwrap_err()` is not
2254        // available — collapse Ok into a panic to extract the error by hand.
2255        let err = match result {
2256            Ok(_) => panic!("builder.build() unexpectedly succeeded for missing kernel"),
2257            Err(e) => e,
2258        };
2259        let msg = format!("{err}");
2260        assert!(
2261            msg.contains("kernel not found"),
2262            "expected kernel not found error, got: {msg}",
2263        );
2264        assert!(
2265            msg.contains("build_vm_builder_base_test_kernel"),
2266            "expected the fake kernel path to appear in the error, got: {msg}",
2267        );
2268    }
2269
2270    /// A zero-`llcs` topology is forwarded to the builder and surfaces
2271    /// as a validation error. Proves `topology()` is wired through.
2272    #[test]
2273    fn build_vm_builder_base_propagates_topology_validation() {
2274        // See build_vm_builder_base_propagates_kernel_path: pin bypass=1 +
2275        // cpu_cap unset under the shared env lock so build() short-circuits
2276        // the no-perf slot/LLC path and the asserted error is deterministic.
2277        let _l = lock_env();
2278        let _g = EnvVarGuard::set(crate::KTSTR_BYPASS_LLC_LOCKS_ENV, "1");
2279        let _c = EnvVarGuard::remove(crate::KTSTR_CPU_CAP_ENV);
2280        let entry = KtstrTestEntry {
2281            name: "vmb_topology",
2282            ..KtstrTestEntry::DEFAULT
2283        };
2284        let exe = crate::resolve_current_exe().unwrap();
2285        let bad_topology = crate::vmm::topology::Topology {
2286            llcs: 0,
2287            cores_per_llc: 1,
2288            threads_per_core: 1,
2289            numa_nodes: 1,
2290            nodes: None,
2291            distances: None,
2292        };
2293        let result = build_vm_builder_base(
2294            &entry,
2295            &exe,
2296            &exe,
2297            None,
2298            &[],
2299            bad_topology,
2300            256,
2301            "",
2302            &["run".to_string()],
2303            true,
2304        )
2305        .build();
2306        let err = match result {
2307            Ok(_) => panic!("builder.build() unexpectedly succeeded for zero-llcs topology"),
2308            Err(e) => e,
2309        };
2310        let msg = format!("{err}");
2311        assert!(
2312            msg.contains("llcs must be > 0"),
2313            "expected topology validation error, got: {msg}",
2314        );
2315    }
2316
2317    /// An optional scheduler binary is attached when `Some(path)`
2318    /// is supplied, surfacing as a "scheduler binary not found"
2319    /// error when the path is missing.
2320    #[test]
2321    fn build_vm_builder_base_propagates_scheduler_binary() {
2322        // See build_vm_builder_base_propagates_kernel_path: pin bypass=1 +
2323        // cpu_cap unset under the shared env lock so build() short-circuits
2324        // the no-perf slot/LLC path and the asserted error is deterministic.
2325        let _l = lock_env();
2326        let _g = EnvVarGuard::set(crate::KTSTR_BYPASS_LLC_LOCKS_ENV, "1");
2327        let _c = EnvVarGuard::remove(crate::KTSTR_CPU_CAP_ENV);
2328        let entry = KtstrTestEntry {
2329            name: "vmb_scheduler",
2330            ..KtstrTestEntry::DEFAULT
2331        };
2332        let exe = crate::resolve_current_exe().unwrap();
2333        let missing_scheduler = PathBuf::from("/nonexistent/build_vm_builder_base_test_scheduler");
2334        let result = build_vm_builder_base(
2335            &entry,
2336            &exe,
2337            &exe,
2338            Some(&missing_scheduler),
2339            &[],
2340            crate::vmm::topology::Topology::new(1, 1, 1, 1),
2341            256,
2342            "",
2343            &["run".to_string()],
2344            true,
2345        )
2346        .build();
2347        let err = match result {
2348            Ok(_) => panic!("builder.build() unexpectedly succeeded for missing scheduler"),
2349            Err(e) => e,
2350        };
2351        let msg = format!("{err}");
2352        assert!(
2353            msg.contains("scheduler binary not found"),
2354            "expected scheduler binary error, got: {msg}",
2355        );
2356        assert!(
2357            msg.contains("build_vm_builder_base_test_scheduler"),
2358            "expected the fake scheduler path to appear, got: {msg}",
2359        );
2360    }
2361
2362    // -- vm_timeout_from_entry tests --
2363
2364    #[test]
2365    fn vm_timeout_from_entry_uses_watchdog_when_largest() {
2366        // DEFAULT topology = 2 vCPUs → sys_rdy_budget_ms = 10_300
2367        // (10_000 base + 2×150) → vm_boot_headroom = 20.3 s; base =
2368        // max(60s, 30s, 1s) = 60s. Pin a wide cpuset so 2 vCPUs are
2369        // never oversubscribed (overcommit_ratio floors to 1.0).
2370        let _pin = AllowedCpusPin::new((0..256).collect());
2371        let entry = KtstrTestEntry {
2372            name: "wdog",
2373            watchdog_timeout: Duration::from_secs(60),
2374            duration: Duration::from_secs(30),
2375            ..KtstrTestEntry::DEFAULT
2376        };
2377        assert_eq!(
2378            vm_timeout_from_entry(&entry, entry.topology.total_cpus()),
2379            Duration::from_millis(80_300)
2380        );
2381    }
2382
2383    #[test]
2384    fn vm_timeout_from_entry_uses_duration_when_largest() {
2385        let _pin = AllowedCpusPin::new((0..256).collect());
2386        let entry = KtstrTestEntry {
2387            name: "dur",
2388            watchdog_timeout: Duration::from_secs(5),
2389            duration: Duration::from_secs(120),
2390            ..KtstrTestEntry::DEFAULT
2391        };
2392        // base = max(5s, 120s, 1s) = 120s; vm_boot_headroom(2) = 20.3 s.
2393        assert_eq!(
2394            vm_timeout_from_entry(&entry, entry.topology.total_cpus()),
2395            Duration::from_millis(140_300)
2396        );
2397    }
2398
2399    #[test]
2400    fn vm_timeout_from_entry_floor_when_both_small() {
2401        // base floors at 1 s; vm_boot_headroom for 2 vCPUs is 20.3 s.
2402        let _pin = AllowedCpusPin::new((0..256).collect());
2403        let entry = KtstrTestEntry {
2404            name: "tiny",
2405            watchdog_timeout: Duration::from_millis(10),
2406            duration: Duration::from_millis(50),
2407            ..KtstrTestEntry::DEFAULT
2408        };
2409        assert_eq!(
2410            vm_timeout_from_entry(&entry, entry.topology.total_cpus()),
2411            Duration::from_millis(21_300)
2412        );
2413    }
2414
2415    #[test]
2416    fn vm_timeout_from_default_entry() {
2417        // DEFAULT watchdog = 5 s, duration = 12 s → base = 12 s.
2418        // vm_boot_headroom for 2 vCPUs = 20.3 s → 32.3 s total.
2419        let _pin = AllowedCpusPin::new((0..256).collect());
2420        let entry = KtstrTestEntry {
2421            name: "default",
2422            ..KtstrTestEntry::DEFAULT
2423        };
2424        assert_eq!(
2425            vm_timeout_from_entry(&entry, entry.topology.total_cpus()),
2426            Duration::from_millis(32_300)
2427        );
2428    }
2429
2430    #[test]
2431    fn vm_timeout_from_entry_scales_headroom_with_topology() {
2432        // A reported case: numa=1, llcs=7, cores=9, threads=2 → 126 vCPUs.
2433        // sys_rdy_budget_ms(126) = 28_900 ms (10_000 base + 126×150) →
2434        // vm_boot_headroom = 38.9 s. base = max(5 s watchdog, 12 s
2435        // duration, 1 s) = 12 s → total = 50.9 s.
2436        // Pins the `entry.topology.total_cpus()` → `vm_boot_headroom` wiring.
2437        // Pin a 256-CPU cpuset so 126 vCPUs are not oversubscribed
2438        // (overcommit_ratio = 126/256 floors to 1.0); the oversub
2439        // multiplier is exercised separately below.
2440        let _pin = AllowedCpusPin::new((0..256).collect());
2441        let entry = KtstrTestEntry {
2442            name: "large_topo",
2443            topology: crate::vmm::topology::Topology {
2444                llcs: 7,
2445                cores_per_llc: 9,
2446                threads_per_core: 2,
2447                numa_nodes: 1,
2448                nodes: None,
2449                distances: None,
2450            },
2451            ..KtstrTestEntry::DEFAULT
2452        };
2453        assert_eq!(
2454            vm_timeout_from_entry(&entry, entry.topology.total_cpus()),
2455            Duration::from_millis(50_900)
2456        );
2457    }
2458
2459    #[test]
2460    fn vm_timeout_from_entry_scales_on_booted_not_declared_vcpus() {
2461        // Under a TopoOverride the VM boots a different vCPU count than
2462        // entry.topology declares; the boot-headroom deadline must scale
2463        // to the BOOTED count passed in, not entry.topology. A default
2464        // 2-vCPU entry "booted" at 126 vCPUs must get the 126-vCPU
2465        // headroom (50.9 s, matching the declared-126 case above), not
2466        // the declared 2-vCPU headroom (32.3 s). Pin a wide cpuset so
2467        // neither count is oversubscribed (multiplier = 1.0). The
2468        // declared-vs-booted gap is otherwise untested — every other
2469        // vm_timeout test passes entry.topology.total_cpus() (declared ==
2470        // booted).
2471        let _pin = AllowedCpusPin::new((0..256).collect());
2472        let entry = KtstrTestEntry {
2473            name: "booted_override",
2474            ..KtstrTestEntry::DEFAULT
2475        };
2476        assert_eq!(
2477            vm_timeout_from_entry(&entry, 126),
2478            Duration::from_millis(50_900),
2479        );
2480        assert_eq!(
2481            vm_timeout_from_entry(&entry, entry.topology.total_cpus()),
2482            Duration::from_millis(32_300),
2483        );
2484        assert_ne!(
2485            vm_timeout_from_entry(&entry, 126),
2486            vm_timeout_from_entry(&entry, entry.topology.total_cpus()),
2487            "the deadline must key on the booted count, not the declared entry.topology",
2488        );
2489    }
2490
2491    // -- overcommit_ratio / oversub-scaled vm_timeout --
2492
2493    #[test]
2494    fn overcommit_ratio_floors_at_one_for_fitting_host() {
2495        // vCPUs <= allowed → not oversubscribed → 1.0 (never < 1).
2496        assert_eq!(overcommit_ratio(8, 192, None), 1.0);
2497        assert_eq!(overcommit_ratio(192, 192, None), 1.0);
2498    }
2499
2500    #[test]
2501    fn overcommit_ratio_auto_collapse_uses_allowed_cpuset() {
2502        // No explicit cpu_budget: the vCPU threads collapse onto the
2503        // whole allowed cpuset. 256 vCPUs on 192 allowed = the CI
2504        // wide-SMP case (~1.33x).
2505        let r = overcommit_ratio(256, 192, None);
2506        assert!((r - 256.0 / 192.0).abs() < 1e-9, "got {r}");
2507    }
2508
2509    #[test]
2510    fn overcommit_ratio_explicit_budget_collapses_onto_min_budget_allowed() {
2511        // Explicit cpu_budget caps the host CPUs the vCPU threads land
2512        // on (the deliberate _overcommit test): 256 / min(64, 192) = 4x.
2513        assert_eq!(overcommit_ratio(256, 192, Some(64)), 4.0);
2514        // A budget wider than the allowed set clamps to allowed.
2515        let r = overcommit_ratio(256, 192, Some(1000));
2516        assert!((r - 256.0 / 192.0).abs() < 1e-9, "got {r}");
2517    }
2518
2519    #[test]
2520    fn overcommit_ratio_guards_empty_cpuset() {
2521        // An unenumerable cpuset (allowed_cpus = 0) must not divide by
2522        // zero — treat it as a 1-CPU host.
2523        assert_eq!(overcommit_ratio(8, 0, None), 8.0);
2524    }
2525
2526    #[test]
2527    fn overcommit_skip_reason_skips_severe_auto_collapse() {
2528        // 256 vCPUs auto-collapse onto 8 host CPUs = 32x ≥ 6x → skip.
2529        // (boot-only path: expect_auto_repro = false.)
2530        let r = overcommit_skip_reason(256, 8, None, false);
2531        assert!(
2532            r.as_deref()
2533                .is_some_and(|m| m.contains("host topology insufficient")),
2534            "32x auto-collapse must skip with the typed reason, got {r:?}",
2535        );
2536    }
2537
2538    #[test]
2539    fn overcommit_skip_reason_runs_ci_wide_smp_ratio() {
2540        // 256 vCPUs on a 192-CPU CI runner = 1.33x < 6x → RUNS (None),
2541        // so wide-SMP boot is validated there, never masked.
2542        assert_eq!(overcommit_skip_reason(256, 192, None, false), None);
2543    }
2544
2545    #[test]
2546    fn overcommit_skip_reason_never_skips_explicit_budget() {
2547        // An explicit cpu_budget is a deliberate oversubscription opt-in
2548        // (contention testing): even 256 vCPUs on 8 host CPUs runs.
2549        assert_eq!(overcommit_skip_reason(256, 8, Some(4), false), None);
2550    }
2551
2552    #[test]
2553    fn overcommit_skip_reason_runs_on_empty_cpuset() {
2554        // An unenumerable cpuset (allowed = 0) cannot compute a ratio →
2555        // does not skip; the overcommit warning is the sole signal there.
2556        assert_eq!(overcommit_skip_reason(256, 0, None, false), None);
2557    }
2558
2559    #[test]
2560    fn overcommit_skip_reason_boundary_is_inclusive_at_cap() {
2561        // ≥ cap skips, just-below runs. 48 vCPUs on 8 = exactly 6.0x → skip;
2562        // 47 on 8 = 5.875x < 6.0 → run. (boot-only path.)
2563        assert!(overcommit_skip_reason(48, 8, None, false).is_some());
2564        assert_eq!(overcommit_skip_reason(47, 8, None, false), None);
2565    }
2566
2567    #[test]
2568    fn overcommit_skip_reason_expect_auto_repro_uses_stricter_cap() {
2569        // The expect_auto_repro inversion chain skips at a much lower
2570        // ratio (EXPECT_AUTO_REPRO_SKIP_RATIO = 2.0x) than a boot-only
2571        // wide test. Pins the failing CI case: 256 vCPUs on a 96-CPU host
2572        // = 2.67x.
2573        //   - 2.67x WITH expect_auto_repro -> SKIP (the two-VM wprof chain
2574        //     cannot run cleanly under that time-slicing).
2575        let skip = overcommit_skip_reason(256, 96, None, true);
2576        assert!(
2577            skip.as_deref().is_some_and(
2578                |m| m.contains("host topology insufficient") && m.contains("expect_auto_repro")
2579            ),
2580            "2.67x with expect_auto_repro must skip naming the chain, got {skip:?}",
2581        );
2582        //   - same 2.67x WITHOUT expect_auto_repro -> RUNS (a single-VM
2583        //     wide-SMP boot test still validates boot at 2.67x < 6.0x).
2584        assert_eq!(overcommit_skip_reason(256, 96, None, false), None);
2585        //   - the 192-CPU design-target runner (256/192 = 1.33x) RUNS the
2586        //     auto-repro hop even with expect_auto_repro (1.33x < 2.0x), so
2587        //     the >255 inversion is still validated there.
2588        assert_eq!(overcommit_skip_reason(256, 192, None, true), None);
2589        //   - an explicit cpu_budget stays a deliberate opt-in: no skip
2590        //     even with expect_auto_repro.
2591        assert_eq!(overcommit_skip_reason(256, 8, Some(4), true), None);
2592        //   - boundary: exactly 2.0x with expect_auto_repro skips (>= is
2593        //     inclusive); just below runs.
2594        assert!(overcommit_skip_reason(16, 8, None, true).is_some());
2595        assert_eq!(overcommit_skip_reason(15, 8, None, true), None);
2596    }
2597
2598    #[test]
2599    fn vm_timeout_scales_boot_headroom_by_overcommit_ratio() {
2600        // 256 vCPUs (16 LLCs × 16 cores) on a 64-CPU allowed cpuset =
2601        // 4x auto-collapse. The boot headroom (58.4 s) scales by 4x;
2602        // base = max(5 s watchdog, 12 s duration, 1 s) = 12 s →
2603        // 12 + 58.4×4 = 245.6 s. Pins that the ratio multiplies ONLY
2604        // the headroom, not base.
2605        let _pin = AllowedCpusPin::new((0..64).collect());
2606        let entry = KtstrTestEntry {
2607            name: "oversub",
2608            topology: crate::vmm::topology::Topology {
2609                llcs: 16,
2610                cores_per_llc: 16,
2611                threads_per_core: 1,
2612                numa_nodes: 1,
2613                nodes: None,
2614                distances: None,
2615            },
2616            ..KtstrTestEntry::DEFAULT
2617        };
2618        // 12_000 + 58_400 × 4 = 245_600 ms.
2619        assert_eq!(
2620            vm_timeout_from_entry(&entry, entry.topology.total_cpus()),
2621            Duration::from_millis(245_600)
2622        );
2623    }
2624
2625    #[test]
2626    fn vm_timeout_overcommit_multiplier_clamps_at_cap() {
2627        // 256 vCPUs on an 8-CPU cpuset = 32x, but the headroom
2628        // multiplier clamps at OVERCOMMIT_HEADROOM_CAP (6x) so the
2629        // deadline stays bounded: 12 + 58.4×6 = 362.4 s. (Such a host
2630        // auto-SKIPS upstream; this pins the clamp independently.)
2631        let _pin = AllowedCpusPin::new((0..8).collect());
2632        let entry = KtstrTestEntry {
2633            name: "clamp",
2634            topology: crate::vmm::topology::Topology {
2635                llcs: 16,
2636                cores_per_llc: 16,
2637                threads_per_core: 1,
2638                numa_nodes: 1,
2639                nodes: None,
2640                distances: None,
2641            },
2642            ..KtstrTestEntry::DEFAULT
2643        };
2644        // 12_000 + 58_400 × 6 = 362_400 ms.
2645        assert_eq!(
2646            vm_timeout_from_entry(&entry, entry.topology.total_cpus()),
2647            Duration::from_millis(362_400)
2648        );
2649    }
2650
2651    // -- sys_rdy_budget_ms / vm_boot_headroom --
2652
2653    #[test]
2654    fn sys_rdy_budget_ms_base_plus_linear_per_vcpu() {
2655        // Additive: 10_000 ms base + vcpus × 150. Every topology gets
2656        // the base PLUS its per-vCPU term — no dead floor below 67
2657        // vCPUs (the bug that gave a 64-vCPU VM the same 10 s as a
2658        // 1-vCPU VM).
2659        assert_eq!(sys_rdy_budget_ms(1), 10_150);
2660        assert_eq!(sys_rdy_budget_ms(32), 14_800);
2661        assert_eq!(sys_rdy_budget_ms(66), 19_900);
2662    }
2663
2664    #[test]
2665    fn sys_rdy_budget_ms_scales_linearly_in_band() {
2666        // 10_000 ms base + vcpus × 150, in the band below the 90 s cap.
2667        assert_eq!(sys_rdy_budget_ms(67), 20_050);
2668        // The 126-vCPU case lands at 28.9 s.
2669        assert_eq!(sys_rdy_budget_ms(126), 28_900);
2670        // 256-vCPU wide-SMP gets its FULL additive budget (48.4 s) — the
2671        // case the old 30 s cap truncated to 30 s, starving the boot.
2672        assert_eq!(sys_rdy_budget_ms(256), 48_400);
2673    }
2674
2675    #[test]
2676    fn sys_rdy_budget_ms_caps_at_ninety_seconds() {
2677        // The 512-vCPU MAX_VCPUS topology gets its full additive budget
2678        // (10_000 + 512×150 = 86_800 ms), comfortably under the cap.
2679        assert_eq!(sys_rdy_budget_ms(512), 86_800);
2680        // 533 vCPUs is the last under the 90 s cap (10_000 + 533×150 =
2681        // 89_950); 534 is the first clipped (10_000 + 534×150 = 90_100
2682        // → 90_000). Only pathological >533-vCPU counts clip.
2683        assert_eq!(sys_rdy_budget_ms(533), 89_950);
2684        assert_eq!(sys_rdy_budget_ms(534), 90_000);
2685        assert_eq!(sys_rdy_budget_ms(u32::MAX), 90_000);
2686    }
2687
2688    #[test]
2689    fn sys_rdy_budget_ms_zero_returns_base() {
2690        // Guest fallback when /sys/devices/system/cpu/online is missing:
2691        // 0 vCPUs → the bare 10_000 ms base (no per-vCPU term).
2692        assert_eq!(sys_rdy_budget_ms(0), 10_000);
2693    }
2694
2695    #[test]
2696    fn vm_boot_headroom_is_ten_plus_sys_rdy_budget() {
2697        // KERNEL_INIT_HEADROOM (10 s) + sys_rdy_budget_ms(vcpus).
2698        assert_eq!(vm_boot_headroom(1), Duration::from_millis(20_150));
2699        assert_eq!(vm_boot_headroom(126), Duration::from_millis(38_900));
2700        // 256-vCPU wide-SMP: 10 s + 48.4 s = 58.4 s (un-oversubscribed
2701        // headroom; vm_timeout_from_entry scales THIS by the host
2702        // overcommit ratio).
2703        assert_eq!(vm_boot_headroom(256), Duration::from_millis(58_400));
2704        // 512-vCPU MAX_VCPUS budget (86.8 s) → 96.8 s headroom, uncapped
2705        // under the 90 s ceiling.
2706        assert_eq!(vm_boot_headroom(512), Duration::from_millis(96_800));
2707    }
2708
2709    /// Two calls to `content_hash` with the same input must return
2710    /// the same u64. Pins the within-process determinism invariant
2711    /// against a future regression that swaps in a per-call-seeded
2712    /// hasher — e.g. `std::hash::RandomState::new().build_hasher()`,
2713    /// which increments its keys per call within a process, or any
2714    /// time/thread-id-seeded scheme. Note: swapping to std's
2715    /// `DefaultHasher::new()` would NOT regress this test —
2716    /// `DefaultHasher` is itself `SipHasher13::new_with_keys(0, 0)`
2717    /// and therefore deterministic; the cross-rustc-version
2718    /// stability regression class is caught by the value-pin
2719    /// follow-up, not this assertion.
2720    #[test]
2721    fn content_hash_is_deterministic_across_calls() {
2722        let input = "scheduler config payload";
2723        assert_eq!(content_hash(input), content_hash(input));
2724    }
2725
2726    /// Distinct inputs must produce distinct hashes. Catches a trivial
2727    /// regression (constant-returning hasher) that the determinism
2728    /// test alone would silently accept.
2729    #[test]
2730    fn content_hash_differs_for_distinct_inputs() {
2731        assert_ne!(content_hash("alpha"), content_hash("beta"));
2732    }
2733
2734    /// Cross-toolchain stability pin: every `content_hash` output must
2735    /// equal the SipHasher13(keys=0,0) value emitted at commit time.
2736    /// Pins the algorithm choice — a future swap to a different
2737    /// stable hasher (e.g. xxhash, fxhash) would silently regenerate
2738    /// every content-addressed cache filename on disk, breaking cache
2739    /// hit rates without surfacing as a failed test. The companion
2740    /// `content_hash_is_deterministic_across_calls` pin guards
2741    /// within-process determinism; this pin guards cross-process /
2742    /// cross-toolchain / cross-machine stability.
2743    #[test]
2744    fn content_hash_value_pin() {
2745        // SipHasher13(keys=0,0) over the four corpora below. If any
2746        // assertion fails, the algorithm or its seeding changed —
2747        // STOP. `content_hash` names the inline-config tempfile in
2748        // `config_content_parts` at src/test_support/runtime.rs and
2749        // the export-config tempfile in `export.rs`; flipping the
2750        // hashes silently regenerates those filenames on every
2751        // process, breaking any future scheme that tries to dedup
2752        // across runs and breaking intra-run reproducibility if a
2753        // caller comes to depend on stable byte equality across
2754        // identical inputs. Update only after intentional algorithm
2755        // migration. The four corpora — empty + two short ASCII +
2756        // one realistic config payload — span the cases the
2757        // algorithm needs to handle correctly.
2758        assert_eq!(content_hash(""), 0x30406ea523c53def);
2759        assert_eq!(content_hash("alpha"), 0x3c87f3c3317bd39a);
2760        assert_eq!(content_hash("beta"), 0xbb8fd2aa1487d7ac);
2761        assert_eq!(content_hash("scheduler config payload"), 0xc678971ba48d5f80);
2762    }
2763
2764    /// Per-content-hash inline-config files MUST land inside the
2765    /// per-process `scratch_dir()` subtree, NOT bare
2766    /// `std::env::temp_dir()`. The 0o700 process-owned subdirectory
2767    /// blocks the cross-uid symlink-replacement attack on
2768    /// predictable content-addressed filenames in shared `/tmp`. A
2769    /// future "simplification" that reverts the path to bare
2770    /// `std::env::temp_dir().join(...)` silently restores the
2771    /// attack surface; this test fails loudly first.
2772    #[test]
2773    fn config_content_parts_writes_inside_process_scratch_dir() {
2774        use crate::assert::Assert;
2775        use crate::scenario::Ctx;
2776        use crate::test_support::entry::{
2777            KtstrTestEntry, Scheduler, SchedulerSpec, TopologyConstraints,
2778        };
2779        use crate::vmm::topology::Topology;
2780
2781        static SCHED: Scheduler = Scheduler {
2782            name: "config_parts_test_sched",
2783            binary: SchedulerSpec::Discover("nope"),
2784            sysctls: &[],
2785            kargs: &[],
2786            assert: Assert::NO_OVERRIDES,
2787            cgroup_parent: None,
2788            sched_args: &[],
2789            topology: Topology {
2790                llcs: 1,
2791                cores_per_llc: 1,
2792                threads_per_core: 1,
2793                numa_nodes: 1,
2794                nodes: None,
2795                distances: None,
2796            },
2797            constraints: TopologyConstraints::DEFAULT,
2798            config_file: None,
2799            config_file_def: Some(("--config={file}", "/include-files/p.json")),
2800            kernels: &[],
2801        };
2802        fn func(_: &Ctx) -> anyhow::Result<crate::assert::AssertResult> {
2803            Ok(crate::assert::AssertResult::pass())
2804        }
2805        let entry = KtstrTestEntry {
2806            name: "scratch_dir_path_test",
2807            func,
2808            scheduler: &SCHED,
2809            config_content: Some("{\"sentinel\":42}"),
2810            ..KtstrTestEntry::DEFAULT
2811        };
2812        let (_, host_path, _, _) =
2813            config_content_parts(&entry).expect("config_content_parts returns Some");
2814        assert!(
2815            host_path.starts_with(scratch_dir()),
2816            "config tempfile must live inside the process-owned scratch dir, \
2817             not bare std::env::temp_dir(): got host_path={host_path:?}, \
2818             scratch_dir={:?}",
2819            scratch_dir()
2820        );
2821    }
2822
2823    /// Two same-content calls produce the SAME canonical path
2824    /// (content-addressed naming idempotence). Callers using the
2825    /// returned PathBuf for downstream dedup decisions rely on this
2826    /// — a regression that breaks the content-hash → path mapping
2827    /// would silently spam the scratch dir with per-call distinct
2828    /// names instead of reusing the canonical entry.
2829    #[test]
2830    fn config_content_parts_same_content_same_canonical_path() {
2831        use crate::assert::Assert;
2832        use crate::scenario::Ctx;
2833        use crate::test_support::entry::{
2834            KtstrTestEntry, Scheduler, SchedulerSpec, TopologyConstraints,
2835        };
2836        use crate::vmm::topology::Topology;
2837
2838        static SCHED: Scheduler = Scheduler {
2839            name: "config_parts_idempotent_sched",
2840            binary: SchedulerSpec::Discover("nope"),
2841            sysctls: &[],
2842            kargs: &[],
2843            assert: Assert::NO_OVERRIDES,
2844            cgroup_parent: None,
2845            sched_args: &[],
2846            topology: Topology {
2847                llcs: 1,
2848                cores_per_llc: 1,
2849                threads_per_core: 1,
2850                numa_nodes: 1,
2851                nodes: None,
2852                distances: None,
2853            },
2854            constraints: TopologyConstraints::DEFAULT,
2855            config_file: None,
2856            config_file_def: Some(("--config={file}", "/include-files/p.json")),
2857            kernels: &[],
2858        };
2859        fn func(_: &Ctx) -> anyhow::Result<crate::assert::AssertResult> {
2860            Ok(crate::assert::AssertResult::pass())
2861        }
2862        let entry = KtstrTestEntry {
2863            name: "idempotent_path_test",
2864            func,
2865            scheduler: &SCHED,
2866            config_content: Some("{\"idempotent\":true}"),
2867            ..KtstrTestEntry::DEFAULT
2868        };
2869        let (_, p1, _, _) = config_content_parts(&entry).expect("first call returns Some");
2870        let (_, p2, _, _) = config_content_parts(&entry).expect("second call returns Some");
2871        assert_eq!(
2872            p1, p2,
2873            "same content_content -> same canonical path; content-addressed naming \
2874             must be idempotent across calls"
2875        );
2876        // The filename component encodes the content hash via the
2877        // `ktstr-config-{hash:016x}.json` template; verify the prefix
2878        // so a future filename-template change is caught.
2879        let name = p1.file_name().and_then(|n| n.to_str()).unwrap_or("");
2880        assert!(
2881            name.starts_with("ktstr-config-") && name.ends_with(".json"),
2882            "canonical filename must follow `ktstr-config-{{hash}}.json` template, got: {name}"
2883        );
2884    }
2885}