ktstr/test_support/sidecar/mod.rs
1//! Per-run sidecar JSON — the durable record of a ktstr test outcome.
2//!
3//! Every test (pass, fail, or skip) writes a [`SidecarResult`] to a
4//! JSON file under the run's sidecar directory; downstream analysis
5//! (`cargo ktstr stats`, CI dashboards) aggregates those files to
6//! compute pass/fail rates, verifier stats, callback profiles, and
7//! KVM stats across gauntlet variants.
8//!
9//! Responsibilities owned by this module:
10//! - [`SidecarResult`]: the on-disk schema. Writer-side: every field
11//! is always emitted — `null` for `None`, `[]` for empty `Vec` —
12//! with no `skip_serializing_if` and no `serde(default)`. Reader-
13//! side: serde's native `Option<T>` deserialize tolerates absence
14//! (a missing key parses as `None`); non-`Option` fields (e.g.
15//! `test_name`, `passed`, `stats`) are hard-required and a missing
16//! key fails deserialize. The contract is intentionally asymmetric
17//! so a future producer that drops an `Option` field still parses
18//! on older readers, while the current writer guarantees full
19//! round-trip symmetry. Pre-1.0: old sidecar JSON is disposable;
20//! regenerate by re-running the test rather than relying on the
21//! reader-side tolerance for migration.
22//! - [`collect_sidecars`]: load every `*.ktstr.json` under a directory
23//! (one level of subdirectories for per-job gauntlet layouts).
24//! - [`write_sidecar`] / [`write_skip_sidecar`]: serialize one run to
25//! disk; variant-hash the discriminating fields so gauntlet variants
26//! don't clobber each other.
//! - [`sidecar_dir`], [`runs_root`], [`newest_run_dir`]: resolve where
//!   sidecars live (the `KTSTR_SIDECAR_DIR` env override, or
//!   `{target}/ktstr/{kernel}-{project_commit}` where
//!   `{project_commit}` is the project tree's HEAD short hex from
//!   [`detect_project_commit`], suffixed `-dirty` when the index
//!   or worktree carries uncommitted changes).
33//! - [`format_run_dirname`]: render the
34//! `{kernel}-{project_commit}` leaf name from the resolved
35//! kernel + commit slots, substituting the literal `unknown`
36//! when either probe returned `None` so the dirname stays
37//! filesystem-safe (see the unknown-commit collision
38//! semantics in the runs guide).
39//! - [`is_run_directory`]: predicate consumed by run-listing
40//! walkers ([`newest_run_dir`] here, `sorted_run_entries` in
41//! `crate::stats`). Filters non-directories and dotfile
42//! subdirectories (notably the `.locks/` flock-sentinel
43//! subdirectory) so the lock infrastructure cannot pollute
44//! `cargo ktstr stats list` output or claim the "most recent
45//! run" bucket.
46//! - [`pre_clear_run_dir_once`]: shallow-wipe `*.ktstr.json` files
47//! in the run directory at the FIRST write of each test
48//! process so a re-run at the same `{kernel}-{project_commit}`
49//! key produces a last-writer-wins snapshot rather than an
50//! append-only archive. Subsequent writes in the same process
51//! are gated by an internal `Mutex<HashSet<PathBuf>>` so only
52//! the first call per key per process clears.
53//! - [`acquire_run_dir_flock`]: cross-process `LOCK_EX` on the
54//! per-run-key sentinel
55//! (`{runs_root}/.locks/{key}.lock`) held for the duration of
56//! the pre-clear + serialize + write cycle. Two concurrent
57//! ktstr processes targeting the same key serialize through
58//! this lock so neither tears the other's mid-write
59//! sidecars. The override branch (operator-chosen
60//! `KTSTR_SIDECAR_DIR`) skips the flock for the same reason
61//! it skips pre-clear: the operator owns the directory's
62//! contents.
63//! - [`warn_unknown_project_commit_once`]: one-shot stderr warning
64//! on first sidecar write when `detect_project_commit` returns
65//! `None` (test process not in a git repo) so concurrent or
66//! successive non-git runs colliding on `{kernel}-unknown`
67//! surface the disambiguation hint
68//! (`KTSTR_SIDECAR_DIR=…` or place the tree under git) at
69//! first invocation rather than as a silent collision.
70//! - [`format_verifier_stats`], [`format_callback_profile`],
71//! [`format_kvm_stats`]: human-readable summaries from a
72//! `Vec<SidecarResult>` for CLI output.
73//! - [`detect_kernel_version`]: read the kernel version from
74//! `KTSTR_KERNEL` cache metadata for sidecar-dir naming and the
75//! `kernel_version` field, with fallback to
76//! `include/config/kernel.release` in the kernel source tree
77//! when the cache metadata is absent or does not carry a
78//! version (e.g. a raw source-tree path set in `KTSTR_KERNEL`
79//! rather than a cache key).
80//! - [`detect_kernel_commit`]: read the kernel SOURCE TREE's git
81//! HEAD short hex (with `-dirty` suffix when worktree differs
82//! from the index or HEAD differs from the index) for the
83//! `kernel_commit` field. Distinct from `kernel_version`
84//! (release string from `kernel.release`) and `project_commit`
85//! (ktstr framework HEAD): this records "what kernel commit
86//! produced this run" so two runs of the same `kernel_version`
87//! but different WIP source trees compare distinctly.
88
89use std::path::PathBuf;
90
91use anyhow::Context;
92
93use crate::assert::{AssertResult, ScenarioStats};
94use crate::monitor::MonitorSummary;
95use crate::sync::MutexExt;
96use crate::test_support::PayloadMetrics;
97use crate::timeline::StimulusEvent;
98use crate::vmm;
99
100use super::entry::KtstrTestEntry;
101use super::timefmt::{generate_run_id, now_iso8601};
102
103/// Test result sidecar written to KTSTR_SIDECAR_DIR for post-run analysis.
104#[derive(Debug, serde::Serialize, serde::Deserialize)]
105pub struct SidecarResult {
106 /// Fully qualified test name (matches `KtstrTestEntry::name`,
107 /// the bare function name without the `ktstr/` nextest prefix).
108 pub test_name: String,
109 /// Rendered topology label (e.g. `1n2l4c1t`) for the variant this
110 /// sidecar describes.
111 pub topology: String,
112 /// Scheduler name (matches `Scheduler::name`); `"eevdf"` for
113 /// tests run without an scx scheduler.
114 pub scheduler: String,
115 /// Best-effort git commit of the scheduler binary used for this
116 /// run. Currently ALWAYS `None` for every `SchedulerSpec`
117 /// variant — no variant today has a reliable commit source.
118 /// The field is reserved on the schema so stats tooling can
119 /// enrich it once a reliable source exists (e.g. a
120 /// `--version` probe or ELF-note read on the resolved
121 /// scheduler binary). See
122 /// [`crate::test_support::SchedulerSpec::scheduler_commit`]
123 /// for the full per-variant rationale.
124 ///
125 /// Writer always emits (`"scheduler_commit": null` on absence).
126 /// Reader-side: serde's native `Option<T>` deserialize tolerates
127 /// absence (a missing key parses as `None`); see the module-level
128 /// doc for the full asymmetric contract that governs every
129 /// nullable on this struct.
130 pub scheduler_commit: Option<String>,
131 /// How the userspace scheduler binary was resolved for this run —
132 /// the snake_case [`crate::test_support::ResolveSource::as_str`] tag
133 /// (`"path"`, `"env_var"`, `"path_lookup"`, `"sibling_dir"`,
134 /// `"target_debug"`, `"target_release"`, `"auto_built"`,
135 /// `"not_found"`). Provenance, not identity: distinct from
136 /// [`SidecarResult::scheduler_commit`] (the binary's git commit) —
137 /// this records the discovery PATH, so the stats CLI can answer "was
138 /// this run's scheduler auto-built from the workspace HEAD, or
139 /// resolved from a possibly-stale `target/` or `$PATH` binary?".
140 /// `"auto_built"` is the only tag whose source commit is known to
141 /// match the workspace tree; every other tag carries the stale-binary
142 /// hazard documented on the [`crate::test_support::ResolveSource`]
143 /// variant.
144 ///
145 /// Writer always emits (`"resolve_source": null` on absence — the
146 /// skip-sidecar path resolves no binary). Reader-side: serde's native
147 /// `Option<T>` deserialize tolerates absence (a missing key parses as
148 /// `None`); see the module-level doc for the full asymmetric
149 /// contract. Excluded from `sidecar_variant_hash` for the same
150 /// cross-host grouping reason as `scheduler_commit` / `run_source`:
151 /// two runs of the same semantic variant resolved via different
152 /// discovery paths must still bucket together.
153 pub resolve_source: Option<String>,
154 /// Best-effort git HEAD of the ktstr project tree at sidecar-
155 /// write time. Captured by `detect_project_commit` via
156 /// `gix::discover` from the test process's current working
157 /// directory; walks up to find the enclosing repo and reads
158 /// HEAD short-hex, suffixing `-dirty` when index-vs-HEAD or
159 /// worktree-vs-index changes are observed (submodules ignored,
160 /// matching the [`crate::fetch::local_source`] dirty-detection
161 /// pattern). `None` when cwd is not inside any git repo, or
162 /// when the gix probe fails for any reason — this is metadata,
163 /// not a gate, so probe failure must not abort the run.
164 ///
165 /// Distinct from [`SidecarResult::scheduler_commit`]: that
166 /// field tracks the userspace scheduler binary's commit
167 /// (currently always `None` per its own doc); this field
168 /// tracks the ktstr framework / test-runner commit, so the
169 /// stats CLI can answer "which version of the harness produced
170 /// this sidecar?" without inspecting the scheduler.
171 ///
172 /// Writer always emits (`"project_commit": null` on absence).
173 /// Reader-side: serde's native `Option<T>` deserialize tolerates
174 /// absence (a missing key parses as `None`) — see the module-
175 /// level doc for the full asymmetric contract. Excluded from
176 /// `sidecar_variant_hash` for the same cross-host grouping
177 /// reason `scheduler_commit` is excluded: two runs of the same
178 /// semantic variant on different ktstr commits must still bucket
179 /// together so `perf-delta` can diff them; the commit-drift
180 /// detection inspects this field directly via `--project-commit`
181 /// / `--a-project-commit` / `--b-project-commit`.
182 pub project_commit: Option<String>,
183 /// Binary payload name (matches `Payload::name` when
184 /// `entry.payload` is set). `None` when the test declared no
185 /// binary payload. Writer always emits (`"payload": null` on
186 /// absence); reader-side, serde's native `Option<T>` deserialize
187 /// tolerates absence — see the module-level doc for the full
188 /// asymmetric contract.
189 pub payload: Option<String>,
190 /// Per-payload extracted metrics collected from `ctx.payload(X).run()`
191 /// / `.spawn().wait()` call sites during the test body.
192 ///
193 /// One [`PayloadMetrics`] per invocation, in the order the calls
194 /// ran. Empty when no payload calls were made (scheduler-only
195 /// tests, or a binary-only test where the body bailed before
196 /// running the payload). Writer always emits (`"metrics": []` in
197 /// that case); reader-side, this `Vec` field is hard-required —
198 /// non-`Option` fields fail deserialize on absence. See the
199 /// module-level doc for the full contract.
200 pub metrics: Vec<PayloadMetrics>,
201 /// True when the run is a real pass — every assertion that
202 /// ran produced a positive verdict. Mirrors
203 /// [`crate::assert::AssertResult::is_pass`]. Mutually
204 /// exclusive with [`Self::skipped`] and [`Self::inconclusive`]:
205 /// the three bits `(passed, skipped, inconclusive)` form a
206 /// strict 4-state encoding where at most one is set per
207 /// record. The fourth state — Fail — is the all-false case
208 /// (no dedicated bit; [`Self::is_fail`] derives it). A real
209 /// pass requires `!skipped && !inconclusive` AND at least one
210 /// observed assertion (the empty / all-skip case routes
211 /// through [`Self::skipped`] instead).
212 pub passed: bool,
213 /// True when the run was skipped (e.g. topology mismatch,
214 /// missing resource, in-VM `AssertResult::skip` return).
215 /// Mutually exclusive with [`Self::passed`] (Pass requires a
216 /// real assertion; an all-skip stream is Skip, not Pass) and
217 /// with [`Self::inconclusive`]. Stats tooling subtracts
218 /// `skipped` runs from "pass count" so non-executions are not
219 /// reported as passes.
220 pub skipped: bool,
221 /// True when at least one assertion was [`Outcome::Inconclusive`](crate::assert::Outcome::Inconclusive) —
222 /// the run ran but a zero-denominator ratio gate could not be
223 /// evaluated (e.g. zero iterations across all workers under a
224 /// `max_migration_ratio` check). Mutually exclusive with
225 /// [`Self::passed`] and [`Self::skipped`]; in the
226 /// `Fail > Inconclusive > Pass > Skip` lattice, Inconclusive
227 /// dominates Pass/Skip but loses to Fail, so a run with both
228 /// Inconclusive and Fail outcomes records `inconclusive = false,
229 /// passed = false` (Fail wins) — `inconclusive = true` requires
230 /// `!is_fail() && !is_pass() && !is_skip()`.
231 ///
232 /// Distinct from `passed = false` (Fail) and `skipped = true`
233 /// (precondition unmet) so CI gates and stats tooling can
234 /// triage zero-denominator runs as "workload didn't produce
235 /// the signal the assertion needed" rather than misclassifying
236 /// them as silent passes (prior to the [`Outcome::Inconclusive`](crate::assert::Outcome::Inconclusive)
237 /// variant the zero-denominator case fell out as Pass) or as
238 /// hard failures.
239 pub inconclusive: bool,
240 /// True when the persisted verdict (`passed`/`skipped`/
241 /// `inconclusive`) is the POST-inversion FINAL outcome of a run
242 /// whose underlying scenario actually failed — i.e. an
243 /// `expect_err` / `expect_auto_repro` test whose induced failure was
244 /// inverted to a pass. Set by the sidecar finalize
245 /// (`finalize_sidecar_verdict`) after dispatch resolves the
246 /// verdict; `false` for an ordinary pass/skip/fail.
247 ///
248 /// The verdict bits carry the FINAL outcome so the footer, `stats`
249 /// analysis, and `replay` match nextest's exit code. This flag
250 /// preserves the one fact that overwrite loses: that the run's
251 /// telemetry is failure-mode-dominated (a deliberately short /
252 /// stalled run). `perf-delta` ORs it into its exclusion guard so
253 /// an inverted-to-pass row is still kept OUT of the regression math
254 /// (its induced-crash telemetry is not real scheduler behavior).
255 pub expected_failure: bool,
256 /// Aggregate per-cgroup statistics merged across every worker.
257 pub stats: ScenarioStats,
258 /// Monitor summary. `None` means the monitor loop did not run
259 /// (host-only tests, early VM failure) or sample collection
260 /// produced no valid data. Writer always emits (`"monitor": null`
261 /// on absence); reader-side, serde's native `Option<T>`
262 /// deserialize tolerates absence — see the module-level doc.
263 pub monitor: Option<MonitorSummary>,
264 /// Periodic-capture coverage for this run: how many periodic snapshot
265 /// boundaries actually fired (`periodic_fired`) out of the configured
266 /// `num_snapshots` target (`periodic_target`). Carried verbatim from
267 /// [`crate::prelude::VmResult`] so cross-run tooling can read the
268 /// coverage off the persisted sidecar (previously only the in-memory
269 /// result exposed it). `0`/`0` for runs with no periodic captures
270 /// configured. Hard-required `u32` fields — old sidecars predating
271 /// them re-generate on the next run (sidecar data is disposable).
272 pub periodic_fired: u32,
273 /// See [`Self::periodic_fired`].
274 pub periodic_target: u32,
275 /// Guest vCPU count and the effective host-CPU budget the vCPU threads
276 /// ran on, carried verbatim from [`crate::prelude::VmResult`]. Drive
277 /// the `cpu-budget` comparison Dimension (cross-budget runs are not
278 /// paired — confining 32 vCPUs to 4 host CPUs measures something else)
279 /// and the overcommit marker: `cpu_budget < vcpus` means the host
280 /// time-sliced the guest's vCPUs, confounding the timing metrics
281 /// (wake-latency / off-CPU / run-delay — schedstat run_delay tracks
282 /// rq->clock, which follows the guest TSC and is not steal-adjusted,
283 /// so the off-host window inflates it for tasks waiting across it).
284 /// Hard-required `u32`
285 /// (old sidecars re-generate; sidecar data is disposable). EXCLUDED
286 /// from `sidecar_variant_hash`: a budget change is a different
287 /// measurement, separated downstream by the Dimension, not the
288 /// identity bucket.
289 pub vcpus: u32,
290 /// See [`Self::vcpus`].
291 pub cpu_budget: u32,
292 /// Ordered stimulus events published by the guest step executor
293 /// while the scenario ran.
294 pub stimulus_events: Vec<StimulusEvent>,
295 /// WorkSpec type label used for post-hoc filtering and A/B comparison
296 /// (distinct from the `WorkType` enum — this is the text name).
297 pub work_type: String,
298 /// Per-BPF-program verifier statistics captured from the VM's
299 /// scheduler (when one was loaded). Empty when no scheduler
300 /// programs were inspected. Writer always emits as
301 /// `"verifier_stats": []` in that case; reader-side, this `Vec`
302 /// field is hard-required (non-`Option` fields fail deserialize
303 /// on absence). See the module-level doc.
304 pub verifier_stats: Vec<crate::monitor::bpf_prog::ProgVerifierStats>,
305 /// Aggregate per-vCPU KVM stats read after VM exit. `None` when
306 /// the VM did not run (host-only tests) or KVM stats were
307 /// unavailable. Writer always emits (`"kvm_stats": null` on
308 /// absence); reader-side, serde's native `Option<T>` deserialize
309 /// tolerates absence — see the module-level doc.
310 pub kvm_stats: Option<crate::vmm::KvmStatsTotals>,
311 /// Effective sysctls active during this test run, recorded as raw
312 /// `sysctl.key=value` cmdline strings. Writer always emits as
313 /// `"sysctls": []` when none; reader-side, this `Vec` field is
314 /// hard-required (non-`Option` fields fail deserialize on
315 /// absence). See the module-level doc.
316 pub sysctls: Vec<String>,
317 /// Effective kernel command-line args active during this test run.
318 /// Writer always emits as `"kargs": []` when none; reader-side,
319 /// this `Vec` field is hard-required (non-`Option` fields fail
320 /// deserialize on absence). See the module-level doc.
321 pub kargs: Vec<String>,
322 /// Kernel version of the VM under test (from cache metadata,
323 /// e.g. `"6.14.2"`). Populated from the cache entry's
324 /// `metadata.json` version field, with fallback to the kernel
325 /// source tree's `include/config/kernel.release` when
326 /// `KTSTR_KERNEL` points at a raw source path rather than a
327 /// cache key; `None` for host-only tests or when neither
328 /// source yields a version string. The host's running kernel
329 /// release is carried separately in `host.kernel_release`.
330 /// Writer always emits (`"kernel_version": null` on absence);
331 /// reader-side, serde's native `Option<T>` deserialize tolerates
332 /// absence — see the module-level doc for the full asymmetric
333 /// contract.
334 pub kernel_version: Option<String>,
335 /// Kernel SOURCE TREE git HEAD short hex (7 chars via
336 /// `oid::to_hex_with_len(7)`), with `-dirty` suffix appended
337 /// when HEAD-vs-index or index-vs-worktree changes are
338 /// observed. Probes via `gix::open` against the kernel
339 /// directory resolved from `KTSTR_KERNEL` (not `gix::discover`
340 /// — the kernel dir is explicit, not walked-up). Captured by
341 /// `detect_kernel_commit` at sidecar-write time.
342 ///
343 /// Distinct from sibling fields:
344 /// - [`SidecarResult::kernel_version`] — release string read
345 /// from cache metadata or `include/config/kernel.release`,
346 /// e.g. `"6.14.2"`. Two runs of `6.14.2` from a clean
347 /// tree and a `-dirty` worktree at the same HEAD share
348 /// `kernel_version` but differ on `kernel_commit`.
349 /// - [`SidecarResult::project_commit`] — ktstr framework
350 /// HEAD captured from the test process's cwd. Tracks
351 /// "what version of the harness produced this sidecar?"
352 /// independently of the kernel under test.
353 /// - [`SidecarResult::scheduler_commit`] — userspace
354 /// scheduler binary's commit (currently always `None`).
355 ///
356 /// `None` when:
357 /// - `KTSTR_KERNEL` is unset or empty;
358 /// - the resolved `KernelId` is `Version` / `CacheKey` whose
359 /// underlying source is `Tarball` / `Git` (no source tree
360 /// on disk to probe);
361 /// - the resolved kernel directory is not a git repository
362 /// (`gix::open` fails);
363 /// - HEAD cannot be read (unborn HEAD on a fresh `git init`
364 /// with zero commits);
365 /// - any other gix probe failure — metadata, not a gate.
366 ///
367 /// Writer always emits (`"kernel_commit": null` on absence);
368 /// reader-side, serde's native `Option<T>` deserialize tolerates
369 /// absence — see the module-level doc for the full asymmetric
370 /// contract. Excluded from `sidecar_variant_hash` for the same
371 /// cross-host grouping reason `scheduler_commit` and
372 /// `project_commit` are excluded: two runs of the same semantic
373 /// variant on different kernel-source HEADs must still bucket
374 /// together so `perf-delta` can diff them; the commit-drift
375 /// detection inspects this field directly via the
376 /// `--kernel-commit` filter.
377 pub kernel_commit: Option<String>,
378 /// ISO 8601 timestamp of when this test run started.
379 pub timestamp: String,
380 /// Unique identifier for the test run. Composed as
381 /// `{run_id_timestamp}-{counter}` — the `YYYYMMDDTHHMMSSZ`
382 /// process-start stamp followed by a process-local monotonic
383 /// counter. Every sidecar produced in one `cargo ktstr test`
384 /// invocation shares the same timestamp prefix; the counter
385 /// distinguishes concurrent gauntlet variants within that
386 /// invocation. Distinct from the run DIRECTORY name (keyed
387 /// `{kernel}-{project_commit}`, see [`sidecar_dir`]) — the
388 /// directory groups runs by what they tested, the `run_id`
389 /// groups sidecars by which process emitted them.
390 pub run_id: String,
391 /// Host context — static-ish runtime state (CPU model,
392 /// memory size, THP policy, kernel release, host cmdline,
393 /// scheduler tunables). Populated by production sidecar
394 /// writers.
395 ///
396 /// `None` causes:
397 /// - **test-fixture path**: not the production sidecar
398 /// writer (production writers always populate `host`).
399 /// - **pre-enrichment archive**: sidecar predates the
400 /// host-context landing — re-run the test to regenerate
401 /// under the current schema (no migration shim exists
402 /// per the pre-1.0 disposable-data contract).
403 ///
404 /// Deliberately excluded from the variant hash so
405 /// gauntlet variants on different hosts collapse into the same
406 /// hash bucket.
407 ///
408 /// No serde attributes: writer always emits (`"host": null` when
409 /// `None`); reader-side, serde's native `Option<T>` deserialize
410 /// tolerates absence (a missing key parses as `None`). The
411 /// asymmetric contract is crate-wide — see the module-level doc.
412 /// Pre-1.0, sidecar data is disposable, so regenerate by
413 /// re-running the test rather than carrying a compat shim for
414 /// older JSON; the reader-side tolerance exists so an in-flight
415 /// schema rename of an `Option` field does not break parsing of
416 /// older sidecars during the same producer-version, not as a
417 /// long-term migration story.
418 pub host: Option<crate::host_context::HostContext>,
419 /// Wall-clock milliseconds spent in
420 /// `KtstrVm::collect_results` — the host-side
421 /// teardown window from BSP exit through SHM drain (mirrors
422 /// [`VmResult::cleanup_duration`](crate::vmm::VmResult::cleanup_duration);
423 /// `Duration` is converted to `u64` ms here because every other
424 /// timing field on this struct that lands in a sidecar-comparison
425 /// CLI uses integer ms or seconds, and JSON has no native
426 /// `Duration`). `None` when the run was killed by the watchdog
427 /// before `collect_results` returned, or for the `host_only` /
428 /// host-only-stub paths that never boot a VM. Writer always emits
429 /// (`"cleanup_duration_ms": null` on absence); reader-side,
430 /// serde's native `Option<T>` deserialize tolerates absence — see
431 /// the module-level doc for the full asymmetric contract.
432 pub cleanup_duration_ms: Option<u64>,
433 /// Provenance tag for this sidecar — distinguishes a developer's
434 /// local run from a CI run so cross-environment comparisons in
435 /// `perf-delta` can narrow on (or contrast across) the run
436 /// environment without inferring it from `host`.
437 ///
438 /// Recorded by `detect_run_source` at sidecar-write time:
439 /// - `Some("ci")` when `KTSTR_CI_ENV` is set non-empty (CI runner
440 /// scripts export it before invoking the test binary; local
441 /// runs never set it).
442 /// - `Some("local")` otherwise — the default for any sidecar
443 /// produced by a developer-driven invocation.
444 /// - The third documented value (`"archive"`) is NEVER written
445 /// here: a sidecar cannot know it will later be archived. The
446 /// stats CLI applies the `"archive"` tag at LOAD time when its
447 /// `--dir` flag points at a non-default pool root, overriding
448 /// whatever was on disk via `apply_archive_source_override`.
449 ///
450 /// `Option<String>` (rather than an enum) keeps the schema
451 /// extensible without a serde-version bump if a future producer
452 /// wants a new tag (e.g. `"benchmark"`); the consumer side
453 /// treats unknown values the same as known ones — they are
454 /// strings the operator can pass via `--run-source` to filter on.
455 /// Writer always emits (`"run_source": null` on absence);
456 /// reader-side, serde's native `Option<T>` deserialize tolerates
457 /// absence — see the module-level doc for the full asymmetric
458 /// contract. Excluded from `sidecar_variant_hash` for the same
459 /// cross-host grouping reason `host` is excluded — two runs of
460 /// the same semantic variant from different environments must
461 /// still bucket together so `perf-delta` can pair them; `--run-source`
462 /// is the explicit knob for source-aware narrowing.
463 ///
464 /// Field name `run_source` (renamed from `source`) disambiguates
465 /// from [`crate::cache::KernelSource`] / `KernelMetadata.source`
466 /// — those describe the kernel build's input (tarball / git /
467 /// local), this describes the run-environment provenance.
468 ///
469 /// **On-disk JSON key changed from `"source"` to `"run_source"`
470 /// in the field rename.** No `#[serde(alias = "source")]` is
471 /// in place: archived sidecars written before the rename carry
472 /// the `"source"` key, which the current schema treats as an
473 /// unknown field. Because `SidecarResult`'s derive does NOT
474 /// set `deny_unknown_fields`, the deserialize does not fail
475 /// outright — instead serde silently DROPS the stale `"source"`
476 /// payload and lands `run_source = None` (since `Option<T>`'s
477 /// "tolerate absence" rule kicks in for the missing
478 /// `"run_source"` field). The data is lost, not preserved. This
479 /// is deliberate per the project's pre-1.0 disposable-data
480 /// contract: re-running tests regenerates sidecars under the
481 /// new key rather than carrying compat shims forward. Consumers
482 /// who need the run-source classification on archived JSON
483 /// must either rename the key in-place before deserialize, or
484 /// re-run the test to regenerate the sidecar with the new
485 /// schema. Tooling that runs against the renamed schema and
486 /// observes a `None` `run_source` cannot distinguish "sidecar
487 /// pre-dates the field" from "sidecar pre-dates the rename and
488 /// lost its tag" — both lower-bound at `None` for filter
489 /// purposes.
490 pub run_source: Option<String>,
491 /// Per-test [`crate::test_support::PerfDeltaAssertion`]s declared on the
492 /// entry, serialized so `cargo ktstr perf-delta --noise-adjust`'s host-side
493 /// compare can enforce them across commits (the entry registry in the parent
494 /// process describes only HEAD's tests, not a baseline/cached sidecar's
495 /// commit, so the declaration must travel WITH the run). Empty when the test
496 /// declared none. Inert here — a normal `cargo ktstr test` writes them but
497 /// never gates on them; only the `--noise-adjust` compare consults them (the
498 /// scalar compare warns that declared gates were skipped).
499 ///
500 /// Writer always emits (`"perf_delta_assertions": []` on absence); reader-
501 /// side this `Vec` field is hard-required (non-`Option` fails deserialize on
502 /// absence) — see the module-level doc for the full contract.
503 pub perf_delta_assertions: Vec<PerfDeltaAssertionRecord>,
504}
505
506/// Owned, serialized mirror of [`crate::test_support::PerfDeltaAssertion`]. The
507/// public declaration type uses `&'static str` (so it stays const/E0493-safe on
508/// the entry) and therefore cannot `Deserialize` into an owned value; this
509/// `String`-backed record is the sidecar carrier the perf-delta compare reads.
510/// `pub` because it is a field of the `pub` [`SidecarResult`] (constructed
511/// across the workspace, including by the `cargo-ktstr` binary crate); the
512/// author-facing declaration type is [`crate::test_support::PerfDeltaAssertion`].
513#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
514pub struct PerfDeltaAssertionRecord {
515 /// Registry metric name this assertion gates (see `stats list-metrics`).
516 pub metric: String,
517 /// Pinned regression direction, or `None` to inherit the registry polarity.
518 pub direction: Option<crate::test_support::Polarity>,
519 /// Relative-regression override (percent), or `None` for `default_rel`.
520 pub max_regression_pct: Option<f64>,
521 /// Absolute-materiality override, or `None` for `default_abs`.
522 pub min_abs: Option<f64>,
523 /// Phase scope (`step_index`), or `None` to gate the aggregate value.
524 pub phase: Option<u16>,
525}
526
527impl From<&crate::test_support::PerfDeltaAssertion> for PerfDeltaAssertionRecord {
528 fn from(a: &crate::test_support::PerfDeltaAssertion) -> Self {
529 Self {
530 metric: a.metric().to_string(),
531 direction: a.direction(),
532 max_regression_pct: a.max_regression_pct(),
533 min_abs: a.min_abs(),
534 phase: a.phase(),
535 }
536 }
537}
538
539impl SidecarResult {
540 /// Convenience accessor mirroring
541 /// [`crate::assert::AssertResult::is_pass`]. SidecarResult is the
542 /// wire-format mirror of an AssertResult; this method exposes the
543 /// same is_pass / is_fail / is_skip / is_inconclusive vocabulary
544 /// so consumers can swap between the two without re-learning
545 /// field names.
546 ///
547 /// Returns true only when the run reached a real Pass — neither
548 /// skipped, inconclusive, nor failed. The triple-conjunct guard
549 /// matches AssertResult's `Fail > Inconclusive > Pass > Skip`
550 /// dominance under the strict 4-state mutex this struct encodes.
551 /// CI gates that want "ship-on-pass" semantics call this method
552 /// and only this method.
553 ///
554 /// Part of the `is_pass` / `is_fail` / `is_inconclusive` /
555 /// `is_skip` vocabulary uniform across the verdict surfaces:
556 /// [`crate::assert::AssertResult::is_pass`] / `Self::is_pass` /
557 /// [`crate::assert::Outcome::is_pass`] / `MonitorVerdict::is_pass`
558 /// (in the `monitor` module, which is `pub(crate)`) /
559 /// `Verdict::is_pass` (re-exported at [`crate::assert::Verdict`]) /
560 /// `GauntletRow::is_pass` (in the `stats` module, which is
561 /// `pub(crate)`).
562 pub fn is_pass(&self) -> bool {
563 self.passed && !self.skipped && !self.inconclusive
564 }
565 /// Convenience accessor mirroring
566 /// [`crate::assert::AssertResult::is_fail`]. The four-state
567 /// encoding uses three stored bits `(passed, skipped,
568 /// inconclusive)` in strict mutual exclusion (at most one
569 /// set); Fail is the all-false derived state, no dedicated
570 /// bit. `is_fail` reads "none of the three bits are set",
571 /// which under `Fail > Inconclusive > Pass > Skip` dominance
572 /// correctly resolves a mixed Fail+Inconclusive stream as
573 /// Fail.
574 pub fn is_fail(&self) -> bool {
575 !self.passed && !self.skipped && !self.inconclusive
576 }
577 /// Convenience accessor mirroring
578 /// [`crate::assert::AssertResult::is_skip`].
579 pub fn is_skip(&self) -> bool {
580 self.skipped
581 }
582 /// Convenience accessor mirroring
583 /// [`crate::assert::AssertResult::is_inconclusive`]. True when
584 /// the run could not be evaluated (zero-denominator ratio gate);
585 /// false on real Pass, real Fail, or Skip. CI gates that gate
586 /// on "did we get a real verdict?" should test
587 /// `r.is_pass() || r.is_fail()` and treat both `is_skip()` and
588 /// `is_inconclusive()` as "couldn't measure".
589 pub fn is_inconclusive(&self) -> bool {
590 self.inconclusive
591 }
592}
593
#[cfg(test)]
impl SidecarResult {
    /// Fully populated [`SidecarResult`] for unit tests.
    ///
    /// Every field carries a sensible default, so a call site only
    /// names the fields it wants to vary and fills the rest with
    /// struct-update syntax:
    ///
    /// ```ignore
    /// let sc = SidecarResult {
    ///     test_name: "my_test".to_string(),
    ///     passed: false,
    ///     ..SidecarResult::test_fixture()
    /// };
    /// ```
    ///
    /// The defaults describe a passing EEVDF run on a minimal
    /// `1n1l1c1t` topology with no payload and no VM telemetry:
    /// `test_name="t"`, `topology="1n1l1c1t"`, `scheduler="eevdf"`,
    /// `work_type="SpinWait"`, `passed=true`, `skipped=false`,
    /// `inconclusive=false`, every [`Option`] is `None`, every [`Vec`]
    /// is empty, `stats` is `ScenarioStats::default()`, and
    /// `timestamp` / `run_id` are empty strings.
    ///
    /// **Use this instead of a local `base = || SidecarResult { ... }`
    /// closure.** Such a closure duplicates the default set and goes
    /// stale as soon as [`SidecarResult`] gains a field; this fixture
    /// is the one place the defaults are kept.
    ///
    /// **Hash-stability tests must not lean on these defaults for the
    /// hash-participating fields** (`topology`, `scheduler`,
    /// `payload`, `work_type`, `sysctls`, `kargs`). A test that pins a
    /// [`sidecar_variant_hash`] output against a literal constant has
    /// to spell out each of those fields itself, so that a later
    /// change to the defaults here cannot silently shift the pinned
    /// value.
    pub(crate) fn test_fixture() -> SidecarResult {
        Self {
            // Identity of the test and the variant it ran as.
            test_name: String::from("t"),
            perf_delta_assertions: Vec::new(),
            topology: String::from("1n1l1c1t"),
            scheduler: String::from("eevdf"),
            scheduler_commit: None,
            resolve_source: None,
            project_commit: None,
            payload: None,
            metrics: Vec::new(),
            // Verdict flags: a plain Pass.
            passed: true,
            skipped: false,
            inconclusive: false,
            expected_failure: false,
            // Measurements and telemetry.
            stats: crate::assert::ScenarioStats::default(),
            monitor: None,
            periodic_fired: 0,
            periodic_target: 0,
            vcpus: 1,
            cpu_budget: 1,
            stimulus_events: Vec::new(),
            work_type: String::from("SpinWait"),
            verifier_stats: Vec::new(),
            kvm_stats: None,
            sysctls: Vec::new(),
            kargs: Vec::new(),
            // Provenance of the run.
            kernel_version: None,
            kernel_commit: None,
            timestamp: String::new(),
            run_id: String::new(),
            host: None,
            cleanup_duration_ms: None,
            run_source: None,
        }
    }
}
665
666/// Predicate: is `path` a ktstr sidecar JSON filename?
667///
668/// True iff the path's extension is `json` AND the path's
669/// FILENAME COMPONENT (`Path::file_name`) contains `.ktstr.` —
670/// matching the on-disk shape produced by [`write_sidecar`]
671/// (`<test>-<variant_hash>.ktstr.json`). Both gates are required:
672/// bare `*.json` files (cargo cache, stray fixtures) and non-json
673/// files whose name happens to contain `.ktstr.` (e.g. a log)
674/// are excluded.
675///
676/// The filename-component check (rather than full-path string)
677/// is load-bearing: a parent directory like
678/// `target/foo.ktstr.bar/extra.json` would falsely match a
679/// whole-path `contains(".ktstr.")` while NOT being a sidecar.
680/// `Path::file_name()` returns only the trailing component, so
681/// `.ktstr.` in any ancestor segment cannot trigger the predicate.
682///
683/// Single source of truth for "is this file a sidecar?" — used
684/// by [`collect_sidecars_with_errors`]'s parsing walker and by
685/// the explain-sidecar file-count walker
686/// (`crate::cli::stats_cmds::explain_sidecar::count_sidecar_files`). Both
687/// walkers MUST agree on the predicate so `walked` (count) and
688/// `valid + errors` (parse outcomes) reconcile against each
689/// other; a divergence would let a file count toward `walked`
690/// without contributing to either bucket, manifesting as a
691/// silent-drop count that has no source.
692pub(crate) fn is_sidecar_filename(path: &std::path::Path) -> bool {
693 path.extension().and_then(|e| e.to_str()) == Some("json")
694 && path
695 .file_name()
696 .and_then(|n| n.to_str())
697 .is_some_and(|n| n.contains(".ktstr."))
698}
699
700/// Scan a directory for ktstr sidecar JSON files. Recurses one level
701/// into subdirectories to handle per-job gauntlet layouts.
702///
703/// Convenience wrapper over [`collect_sidecars_with_errors`] for
704/// single-directory callers that only need the parsed sidecars and not
705/// the per-file parse-failure list. Emits ONE aggregated
706/// [`warn_skipped_sidecars`] summary for that directory when stale
707/// sidecars were dropped. Multi-directory walkers must NOT use this in a
708/// loop (it would print one summary per directory) — they call
709/// [`collect_sidecars_with_errors`] per directory and aggregate the counts
710/// into a single summary (see `collect_pool`).
711pub(crate) fn collect_sidecars(dir: &std::path::Path) -> Vec<SidecarResult> {
712 let (sidecars, parse_errors, _io_errors) = collect_sidecars_with_errors(dir);
713 warn_skipped_sidecars(dir, parse_errors.len());
714 sidecars
715}
716
/// Print one aggregated stderr line for the stale or unparseable
/// sidecars a walk skipped; print nothing when `skipped == 0`.
///
/// A sidecar written before a schema field was added fails to
/// deserialize and is dropped (sidecar data is disposable —
/// re-running the test regenerates it). This helper folds what would
/// otherwise be one `eprintln!` per file into a single line. The
/// per-file detail remains available in the parse-error Vec returned
/// by [`collect_sidecars_with_errors`], which
/// `cargo ktstr stats explain-sidecar` renders.
///
/// `pub(crate)` so that multi-directory walkers outside this module
/// (e.g. `stats::analyze::sorted_run_entries`) can sum
/// `parse_errors.len()` across run directories and emit ONE
/// pool-wide summary instead of one per directory.
pub(crate) fn warn_skipped_sidecars(dir: &std::path::Path, skipped: usize) {
    // Nothing was dropped: stay silent.
    if skipped == 0 {
        return;
    }
    eprintln!(
        "ktstr_test: skipped {skipped} stale sidecar(s) under {} (older \
         schema — re-run the affected tests to regenerate; \
         `cargo ktstr stats explain-sidecar --run <run>` shows per-file \
         detail)",
        dir.display(),
    );
}
740
/// One sidecar file that was read successfully but failed to parse.
///
/// Returned by [`collect_sidecars_with_errors`] and carried through
/// `crate::cli::WalkStats::errors` to the renderers.
///
/// Deliberately a named-field struct, not a
/// `(PathBuf, String, Option<String>)` tuple: call sites iterate with
/// `for err in errors` and read `err.path` / `err.raw_error` /
/// `err.enriched_message` by name, which rules out the
/// tuple-position-swap class of bug where positional fields can be
/// destructured in the wrong order with no help from the compiler.
pub(crate) struct SidecarParseError {
    /// On-disk location of the sidecar JSON that failed to parse.
    pub path: PathBuf,
    /// The serde error text, verbatim. Left raw so parse errors stay
    /// grep-friendly; surfaced through the JSON channel as the
    /// `error` key.
    pub raw_error: String,
    /// Operator-facing remediation prose produced by
    /// [`enriched_parse_error_message`]: `Some(...)` for known
    /// schema-drift cases (currently the `host` missing-field
    /// pattern), `None` otherwise. Surfaced through the JSON channel
    /// as `enriched_message`.
    pub enriched_message: Option<String>,
}
765
/// One sidecar candidate that could not be read from disk.
///
/// Returned by [`collect_sidecars_with_errors`] and carried through
/// `crate::cli::WalkStats::io_errors` to the renderers.
///
/// Records a file whose name matched the sidecar predicate but for
/// which `std::fs::read_to_string` failed before parsing could start
/// (permission denied, mid-rotate truncation, broken symlink, and so
/// on). This is distinct from [`SidecarParseError`], which means "the
/// file was read but its JSON did not parse"; keeping the two apart
/// lets dashboard consumers triage filesystem incidents separately
/// from schema drift.
///
/// A named-field struct shaped like [`SidecarParseError`], so the
/// renderer side iterates by field name without tuple-position
/// fragility. There is no `enriched_message` field because there is
/// no remediation catalog for IO failures — the cause varies per host
/// (fix permissions, fix the filesystem, retry the test).
pub(crate) struct SidecarIoError {
    /// On-disk path that the predicate accepted as a sidecar candidate.
    pub path: PathBuf,
    /// The `std::io::Error` Display text, verbatim. Surfaced through
    /// the JSON channel as the `error` key on
    /// `crate::cli::WalkIoError` entries, and through the text
    /// channel as the `error: ...` line under the trailing
    /// `io errors` block.
    pub raw_error: String,
}
793
/// Test-only doorway to [`enriched_parse_error_message`].
///
/// Lets `cli::tests` exercise the enrichment-pattern logic directly
/// against synthetic error strings. The helper it forwards to stays
/// private, so production code keeps going through
/// [`collect_sidecars_with_errors`].
///
/// Both arguments are passed through unchanged and the helper's
/// result is returned as-is.
#[cfg(test)]
pub(crate) fn enriched_parse_error_message_for_test(
    path: &std::path::Path,
    raw_error: &str,
) -> Option<String> {
    enriched_parse_error_message(path, raw_error)
}
806
/// Produce the operator-facing enrichment for a serde parse-error
/// message, if one applies.
///
/// The only enriched case today is the `host` missing-field
/// schema-drift diagnostic: `raw_error` must contain both
/// `missing field` and `` `host` ``. Every other message yields
/// `None`, so consumers can branch on "an enrichment exists" without
/// re-implementing the match.
///
/// Kept separate from the parse path of
/// [`collect_sidecars_with_errors`] so that the prose is computed in
/// one place and stored in the `enriched_message` field of the
/// returned [`SidecarParseError`] — parse failures are surfaced only
/// through that Vec, not through a separate stderr channel.
///
/// Matching on the Display text is deliberate. serde's typed-error
/// surface for `missing field "X"` is not stable across serde_json
/// versions, whereas the rendered message is, so a
/// regression-resilient check costs no more than a string search.
fn enriched_parse_error_message(path: &std::path::Path, raw_error: &str) -> Option<String> {
    let reports_missing_field = raw_error.contains("missing field");
    let names_host = raw_error.contains("`host`");
    if !(reports_missing_field && names_host) {
        return None;
    }
    Some(format!(
        "ktstr_test: skipping {}: {raw_error} — the `host` field \
         was added to SidecarResult; pre-1.0 policy is \
         disposable-sidecar: re-run the test to regenerate this \
         file under the current schema (no migration shim exists)",
        path.display(),
    ))
}
837
838/// Scan a directory for ktstr sidecar JSON files, returning the
839/// parsed sidecars, a [`SidecarParseError`] record (named fields
840/// `path`, `raw_error`, `enriched_message`) for every file that
841/// passed the filename predicate but failed to deserialize, and a
842/// [`SidecarIoError`] record (named fields `path`, `raw_error`)
843/// for every file that passed the predicate but whose
844/// `read_to_string` failed before parsing could begin. Recurses
845/// one level into subdirectories to handle per-job gauntlet
846/// layouts.
847///
848/// Parse failures are captured ONLY in the returned parse-errors vec —
849/// this walker no longer logs per file. Each failure is a
850/// [`SidecarParseError`] record (named fields `path`, `raw_error`,
851/// `enriched_message`) for structured callers (`explain-sidecar`'s walker
852/// output). Both raw and enriched are exposed so dashboard consumers can
853/// pick: raw for parse-error grepping, enriched for human-facing
854/// remediation prose. Callers that only need the sidecars aggregate
855/// `parse_errors.len()` and emit one [`warn_skipped_sidecars`] summary
856/// (see [`collect_sidecars`] / `collect_pool`) rather than one line per
857/// file.
858///
859/// IO failures (third return) get a single eprintln line plus a
860/// structured [`SidecarIoError`] record. Distinguished from
861/// parse failures so dashboard consumers can triage filesystem
862/// incidents (permission denied, mid-rotate truncation, broken
863/// symlink) apart from schema drift. With this third channel,
864/// every predicate-matching file lands in exactly one of the
865/// three returned vecs — the prior implicit
866/// `walked - valid - parse_errors.len()` silent-drop count is
867/// now zero by construction.
868///
869/// Callers that don't need structured errors should use
870/// [`collect_sidecars`].
871pub(crate) fn collect_sidecars_with_errors(
872 dir: &std::path::Path,
873) -> (
874 Vec<SidecarResult>,
875 Vec<SidecarParseError>,
876 Vec<SidecarIoError>,
877) {
878 let mut sidecars = Vec::new();
879 let mut parse_errors: Vec<SidecarParseError> = Vec::new();
880 let mut io_errors: Vec<SidecarIoError> = Vec::new();
881 let entries = match std::fs::read_dir(dir) {
882 Ok(e) => e,
883 Err(e) => {
884 tracing::warn!(
885 dir = %dir.display(),
886 error = %e,
887 "ktstr_test: collect_sidecars_with_errors cannot read root dir",
888 );
889 return (sidecars, parse_errors, io_errors);
890 }
891 };
892 let mut subdirs = Vec::new();
893 let try_load = |path: &std::path::Path,
894 out: &mut Vec<SidecarResult>,
895 parse_errs: &mut Vec<SidecarParseError>,
896 io_errs: &mut Vec<SidecarIoError>| {
897 if !is_sidecar_filename(path) {
898 return;
899 }
900 let data = match std::fs::read_to_string(path) {
901 Ok(d) => d,
902 Err(e) => {
903 let raw = e.to_string();
904 eprintln!("ktstr_test: cannot read {}: {raw}", path.display());
905 io_errs.push(SidecarIoError {
906 path: path.to_path_buf(),
907 raw_error: raw,
908 });
909 return;
910 }
911 };
912 match serde_json::from_str::<SidecarResult>(&data) {
913 Ok(sc) => out.push(sc),
914 Err(e) => {
915 let raw = e.to_string();
916 let enriched = enriched_parse_error_message(path, &raw);
917 // Capture (do not log) the per-file skip: callers emit one
918 // aggregated `warn_skipped_sidecars` summary so a directory
919 // of stale sidecars produces a single line, not a flood.
920 // `cargo ktstr stats explain-sidecar --run <run>` renders the
921 // per-file detail (raw + enriched remediation) from this Vec.
922 parse_errs.push(SidecarParseError {
923 path: path.to_path_buf(),
924 raw_error: raw,
925 enriched_message: enriched,
926 });
927 }
928 }
929 };
930 for entry in entries {
931 let entry = match entry {
932 Ok(e) => e,
933 Err(e) => {
934 tracing::warn!(
935 dir = %dir.display(),
936 error = %e,
937 "ktstr_test: skipping unreadable DirEntry while collecting sidecars",
938 );
939 continue;
940 }
941 };
942 let path = entry.path();
943 if path.is_dir() {
944 subdirs.push(path);
945 continue;
946 }
947 try_load(&path, &mut sidecars, &mut parse_errors, &mut io_errors);
948 }
949 for sub in subdirs {
950 let sub_entries = match std::fs::read_dir(&sub) {
951 Ok(e) => e,
952 Err(e) => {
953 tracing::warn!(
954 subdir = %sub.display(),
955 error = %e,
956 "ktstr_test: skipping unreadable subdirectory while collecting sidecars",
957 );
958 continue;
959 }
960 };
961 for entry in sub_entries {
962 let entry = match entry {
963 Ok(e) => e,
964 Err(e) => {
965 tracing::warn!(
966 subdir = %sub.display(),
967 error = %e,
968 "ktstr_test: skipping unreadable DirEntry in sidecar subdirectory",
969 );
970 continue;
971 }
972 };
973 try_load(
974 &entry.path(),
975 &mut sidecars,
976 &mut parse_errors,
977 &mut io_errors,
978 );
979 }
980 }
981 (sidecars, parse_errors, io_errors)
982}
983
984/// Pool every sidecar JSON under every run directory at `root`.
985///
986/// Walks each immediate subdirectory of `root` (one per run, named
987/// `{kernel}-{project_commit}` by [`sidecar_dir`] where
988/// `{project_commit}` is the project tree's HEAD short hex with
989/// `-dirty` suffix when the worktree differs from HEAD) and
990/// concatenates the sidecars each one yields via
991/// `collect_sidecars_with_errors` (per directory, so the per-directory
992/// stale-sidecar skip counts aggregate into one pool-wide summary). The
993/// result is a flat
994/// `Vec<SidecarResult>` covering every recorded run on disk —
995/// `cargo ktstr perf-delta`'s pool-driven sourcing reads it
996/// once, applies the typed `--a-*` / `--b-*` filters in memory,
997/// and partitions the survivors into A/B sides.
998///
999/// `root` is typically [`runs_root`]; pass an alternate path when
1000/// comparing archived sidecar trees copied off a CI host (the
1001/// `--dir` escape hatch on `perf-delta`).
1002///
1003/// Returns an empty Vec when `root` does not exist or contains no
1004/// run directories. Per-run failure (a corrupt sidecar, a partial
1005/// directory) is counted and skipped — pool-collection never aborts
1006/// on a single bad file, and emits ONE aggregated
1007/// `warn_skipped_sidecars` summary for the whole walk rather than a
1008/// per-file line.
1009///
1010/// Performance: this is a full filesystem walk over `root`. On a
1011/// host with many archived runs (dozens to hundreds), each
1012/// invocation re-reads every sidecar JSON. The cost is acceptable
1013/// for the current operator workflow (one comparison per
1014/// session) but is taskifyable if it becomes a hot path — a
1015/// directory-name fast-path could skip runs whose
1016/// `{kernel}-{project_commit}` prefix does not match the active
1017/// `--a-kernel` / `--b-kernel` filter.
1018pub fn collect_pool(root: &std::path::Path) -> Vec<SidecarResult> {
1019 let entries = match std::fs::read_dir(root) {
1020 Ok(e) => e,
1021 Err(e) => {
1022 tracing::warn!(
1023 root = %root.display(),
1024 error = %e,
1025 "ktstr_test: collect_pool cannot read root; returning empty pool",
1026 );
1027 return Vec::new();
1028 }
1029 };
1030 let mut pool = Vec::new();
1031 let mut skipped = 0usize;
1032 for entry in entries {
1033 let entry = match entry {
1034 Ok(e) => e,
1035 Err(e) => {
1036 tracing::warn!(
1037 root = %root.display(),
1038 error = %e,
1039 "ktstr_test: skipping unreadable DirEntry while collecting pool",
1040 );
1041 continue;
1042 }
1043 };
1044 let path = entry.path();
1045 if path.is_dir() {
1046 // `collect_sidecars_with_errors` already handles "one level of
1047 // subdirectories for per-job gauntlet layouts" inside each run
1048 // directory, so the two-level `{root}/{run_dir}/{job_subdir}`
1049 // shape works without a third walker level. Use the
1050 // error-returning variant (not `collect_sidecars`, which emits
1051 // its own per-directory summary) so the skip counts aggregate
1052 // into ONE pool-wide summary below.
1053 let (sidecars, parse_errors, _io_errors) = collect_sidecars_with_errors(&path);
1054 pool.extend(sidecars);
1055 skipped += parse_errors.len();
1056 }
1057 }
1058 warn_skipped_sidecars(root, skipped);
1059 pool
1060}
1061
1062/// BPF verifier complexity limit (BPF_COMPLEXITY_LIMIT_INSNS).
1063const VERIFIER_INSN_LIMIT: u32 = 1_000_000;
1064
1065/// Percentage of the verifier limit that triggers a warning.
1066const VERIFIER_WARN_PCT: f64 = 75.0;
1067
1068/// Aggregate BPF verifier stats across sidecars into a summary table.
1069///
1070/// verified_insns is deterministic for a given binary, so per-program
1071/// values are deduplicated (max across observations). Flags programs
1072/// using >=75% of the 1M verifier complexity limit.
1073pub(crate) fn format_verifier_stats(sidecars: &[SidecarResult]) -> String {
1074 use std::collections::BTreeMap;
1075
1076 let mut by_name: BTreeMap<&str, u32> = BTreeMap::new();
1077 for sc in sidecars {
1078 for info in &sc.verifier_stats {
1079 let entry = by_name.entry(&info.name).or_insert(0);
1080 *entry = (*entry).max(info.verified_insns);
1081 }
1082 }
1083
1084 if by_name.is_empty() {
1085 return String::new();
1086 }
1087
1088 let mut out = String::from("\n=== BPF VERIFIER STATS ===\n\n");
1089 out.push_str(&format!(
1090 " {:<24} {:>12} {:>8}\n",
1091 "program", "verified", "limit%"
1092 ));
1093 out.push_str(&format!(" {:-<24} {:-<12} {:-<8}\n", "", "", ""));
1094
1095 let mut warnings = Vec::new();
1096 let mut total: u64 = 0;
1097
1098 for (&name, &verified_insns) in &by_name {
1099 let pct = (verified_insns as f64 / VERIFIER_INSN_LIMIT as f64) * 100.0;
1100 let flag = if pct >= VERIFIER_WARN_PCT { " !" } else { "" };
1101 out.push_str(&format!(
1102 " {:<24} {:>12} {:>7.1}%{flag}\n",
1103 name, verified_insns, pct,
1104 ));
1105 if pct >= VERIFIER_WARN_PCT {
1106 warnings.push(format!(
1107 " {name}: {pct:.1}% of 1M limit ({verified_insns} verified insns)",
1108 ));
1109 }
1110 total += verified_insns as u64;
1111 }
1112
1113 out.push_str(&format!("\n total verified insns: {total}\n"));
1114
1115 if !warnings.is_empty() {
1116 out.push_str("\nWARNING: programs near verifier complexity limit:\n");
1117 for w in &warnings {
1118 out.push_str(w);
1119 out.push('\n');
1120 }
1121 }
1122
1123 out
1124}
1125
1126/// Per-test BPF callback profile from monitor prog_stats_deltas.
1127///
1128/// Shows per-program invocation count, total CPU time, and average
1129/// nanoseconds per call. Each test's profile is printed independently.
1130pub(crate) fn format_callback_profile(sidecars: &[SidecarResult]) -> String {
1131 let mut out = String::new();
1132
1133 for sc in sidecars {
1134 let deltas = match sc
1135 .monitor
1136 .as_ref()
1137 .and_then(|m| m.prog_stats_deltas.as_ref())
1138 {
1139 Some(d) if !d.is_empty() => d,
1140 _ => continue,
1141 };
1142
1143 if out.is_empty() {
1144 out.push_str("\n=== BPF CALLBACK PROFILE ===\n");
1145 }
1146 out.push_str(&format!("\n {} ({}):\n", sc.test_name, sc.topology));
1147 out.push_str(&format!(
1148 " {:<24} {:>12} {:>14} {:>12}\n",
1149 "program", "cnt", "total_ns", "avg_ns"
1150 ));
1151 out.push_str(&format!(
1152 " {:-<24} {:-<12} {:-<14} {:-<12}\n",
1153 "", "", "", ""
1154 ));
1155 for d in deltas {
1156 out.push_str(&format!(
1157 " {:<24} {:>12} {:>14} {:>12.0}\n",
1158 d.name, d.cnt, d.nsecs, d.nsecs_per_call,
1159 ));
1160 }
1161 }
1162
1163 out
1164}
1165
1166/// Aggregate KVM stats across sidecars into a compact summary.
1167///
1168/// Averages each stat across all tests that returned `Some(KvmStatsTotals)`.
1169/// Tests without KVM stats (non-VM tests, old kernels) are excluded
1170/// from the denominator.
1171pub(crate) fn format_kvm_stats(sidecars: &[SidecarResult]) -> String {
1172 let with_stats: Vec<&crate::vmm::KvmStatsTotals> = sidecars
1173 .iter()
1174 .filter_map(|sc| sc.kvm_stats.as_ref())
1175 .collect();
1176
1177 if with_stats.is_empty() {
1178 return String::new();
1179 }
1180
1181 let n_vms = with_stats.len();
1182
1183 // Compute cross-VM averages for each stat.
1184 let vm_avg = |name: &str| -> u64 {
1185 let sum: u64 = with_stats.iter().map(|d| d.avg(name)).sum();
1186 sum / n_vms as u64
1187 };
1188
1189 let exits = vm_avg("exits");
1190 let halt = vm_avg("halt_exits");
1191 let halt_wait_ns = vm_avg("halt_wait_ns");
1192 let preempted = vm_avg("preemption_reported");
1193 let signal = vm_avg("signal_exits");
1194 let hypercalls = vm_avg("hypercalls");
1195
1196 // Halt poll efficiency across all vCPUs and VMs.
1197 let total_poll_ok: u64 = with_stats
1198 .iter()
1199 .map(|d| d.sum("halt_successful_poll"))
1200 .sum();
1201 let total_poll_try: u64 = with_stats
1202 .iter()
1203 .map(|d| d.sum("halt_attempted_poll"))
1204 .sum();
1205
1206 if exits == 0 {
1207 return String::new();
1208 }
1209
1210 let halt_wait_ms = halt_wait_ns as f64 / 1_000_000.0;
1211 let poll_pct = if total_poll_try > 0 {
1212 (total_poll_ok as f64 / total_poll_try as f64) * 100.0
1213 } else {
1214 0.0
1215 };
1216
1217 let mut out = format!("\n=== KVM STATS (avg across {n_vms} VMs) ===\n\n");
1218 out.push_str(&format!(
1219 " exits/vcpu {:>7} halt/vcpu {:>5} halt_wait_ms {:>7.1}\n",
1220 exits, halt, halt_wait_ms,
1221 ));
1222 out.push_str(&format!(
1223 " poll_ok% {:>6.1}% preempted/vcpu {:>4} signal/vcpu {:>7}\n",
1224 poll_pct, preempted, signal,
1225 ));
1226 if hypercalls > 0 {
1227 out.push_str(&format!(" hypercalls/vcpu {:>4}\n", hypercalls));
1228 }
1229
1230 // Trust warnings.
1231 if preempted > 0 {
1232 let total: u64 = with_stats
1233 .iter()
1234 .map(|d| d.sum("preemption_reported"))
1235 .sum();
1236 out.push_str(&format!(
1237 "\n WARNING: {total} host preemptions detected \
1238 -- timing results may be unreliable\n",
1239 ));
1240 }
1241
1242 out
1243}
1244
1245/// Resolve the sidecar output directory for the current test process.
1246///
1247/// Override: `KTSTR_SIDECAR_DIR` (used as-is when non-empty). When
1248/// the override is set, `serialize_and_write_sidecar` ALSO skips
1249/// the per-directory pre-clear so any pre-existing sidecars in
1250/// the operator-chosen directory are preserved verbatim — see
1251/// `sidecar_dir_override`.
1252///
1253/// Default: `{CARGO_TARGET_DIR or "target"}/ktstr/{kernel}-{project_commit}/`,
1254/// where `{kernel}` is the version detected from `KTSTR_KERNEL`'s
1255/// metadata (or `"unknown"` when no kernel is set / detection fails)
1256/// and `{project_commit}` is the project-tree HEAD short hex from
1257/// `detect_project_commit` (with `-dirty` suffix when the worktree
1258/// differs from HEAD), or `"unknown"` when the test process is not
1259/// running inside a git repository or the probe fails. Every sidecar
1260/// written from the same `cargo ktstr test` invocation lands in the
1261/// same directory; two runs sharing the same kernel + project commit
1262/// (e.g. re-running the same suite without committing changes) reuse
1263/// the same directory, with the second run pre-clearing any
1264/// `*.ktstr.json` files left by the first via
1265/// `pre_clear_run_dir_once` — the directory is a last-writer-wins
1266/// snapshot keyed on (kernel, project commit), not an append-only
1267/// archive of every invocation.
1268pub fn sidecar_dir() -> PathBuf {
1269 sidecar_dir_override().unwrap_or_else(resolve_default_sidecar_dir)
1270}
1271
1272/// Compute the default-path sidecar directory:
1273/// `{runs_root}/{kernel}-{project_commit}` where `{kernel}` and
1274/// `{project_commit}` come from [`detect_kernel_version`] and
1275/// [`detect_project_commit`] respectively, with `"unknown"`
1276/// substituted via [`format_run_dirname`] when either probe
1277/// returns `None`. Emits the one-shot
1278/// [`warn_unknown_project_commit_once`] stderr warning when the
1279/// project commit probe falls back to `"unknown"` (operators in
1280/// this state lose the per-commit run-directory discriminator).
1281///
1282/// Shared by [`sidecar_dir`] and the default-path branch of
1283/// [`serialize_and_write_sidecar`] so both call sites resolve the
1284/// same kernel/commit/warn/format chain through one place.
1285/// `serialize_and_write_sidecar` cannot call [`sidecar_dir`]
1286/// directly because it needs a single-read of
1287/// [`sidecar_dir_override`] (gated against the env-var flipping
1288/// mid-call between the dir-resolve and the pre-clear gate); the
1289/// helper supplies the default-branch body so the override read
1290/// stays at one site.
1291fn resolve_default_sidecar_dir() -> PathBuf {
1292 let kernel = detect_kernel_version();
1293 let commit = detect_project_commit();
1294 if commit.is_none() {
1295 warn_unknown_project_commit_once();
1296 }
1297 runs_root().join(format_run_dirname(kernel.as_deref(), commit.as_deref()))
1298}
1299
/// Render the run-directory leaf name `{kernel}-{commit}` from two
/// optional components.
///
/// A `None` in either slot becomes the literal `"unknown"` sentinel:
/// a non-git cwd produces `"{kernel}-unknown"` and a missing kernel
/// produces `"unknown-{project_commit}"`. The function is pure over
/// its two inputs and does no I/O, so unit tests can pin every shape
/// (clean, dirty, missing-kernel, missing-commit, both missing)
/// without driving the [`detect_kernel_version`] /
/// [`detect_project_commit`] OnceLocks.
///
/// SENTINEL ASYMMETRY: the on-disk dirname uses `"unknown"` for a
/// missing value, whereas the in-memory
/// [`SidecarResult::project_commit`] /
/// [`SidecarResult::kernel_version`] fields stay `None` (`null` in
/// JSON). A `project_commit` filter for a specific commit will NOT
/// match a sidecar whose `project_commit` is `None`; omit the filter
/// to include `None`-commit rows. The asymmetry is deliberate: the
/// dirname needs a filesystem-safe sentinel, while the JSON field
/// preserves the original probe outcome for downstream tooling that
/// distinguishes "no probe ran" from "probe ran but found nothing."
fn format_run_dirname(kernel: Option<&str>, commit: Option<&str>) -> String {
    const UNKNOWN: &str = "unknown";
    [kernel.unwrap_or(UNKNOWN), commit.unwrap_or(UNKNOWN)].join("-")
}
1324
1325/// Resolve the parent directory that holds all test-run subdirectories.
1326///
1327/// Resolution order:
1328/// 1. [`crate::KTSTR_RUNS_ROOT_ENV`] (absolute) — the `cargo ktstr`
1329/// orchestrator stamps this once at startup so its footer / `stats`
1330/// / `replay` reads AND the child test processes' sidecar writes
1331/// resolve the SAME directory regardless of CWD. This is the
1332/// primary path under `cargo ktstr`.
1333/// 2. `{CARGO_TARGET_DIR}/ktstr` when that env is set non-empty.
1334/// 3. `target/ktstr` (CWD-relative) — the raw `cargo nextest run`
1335/// fallback. CWD-relative is fragile across a Cargo workspace (the
1336/// test binary's CWD is the package dir, which differs from a
1337/// workspace-root invocation), which is exactly why the
1338/// orchestrator pins the absolute override above; raw nextest has
1339/// no footer to mismatch, so the fallback is acceptable there.
1340///
1341/// Used by `cargo ktstr stats` / `replay` and the post-run footer to
1342/// enumerate runs without reconstructing a specific run key.
1343pub fn runs_root() -> PathBuf {
1344 if let Some(root) = std::env::var_os(crate::KTSTR_RUNS_ROOT_ENV).filter(|v| !v.is_empty()) {
1345 return PathBuf::from(root);
1346 }
1347 let target = std::env::var("CARGO_TARGET_DIR")
1348 .ok()
1349 .filter(|d| !d.is_empty())
1350 .map(PathBuf::from)
1351 .unwrap_or_else(|| PathBuf::from("target"));
1352 target.join("ktstr")
1353}
1354
/// Predicate: is `entry` a candidate run directory under
/// [`runs_root`]?
///
/// True iff `entry`'s path is a directory AND its filename does NOT
/// start with a `.` byte. The dotfile filter keeps the flock sentinel
/// subdirectory ([`crate::flock::LOCK_DIR_NAME`] = `.locks`), and any
/// other operator-created or filesystem-reserved dot-directory, out
/// of the run-listing walkers ([`newest_run_dir`] here,
/// `sorted_run_entries` in `crate::stats`). Without it the lock
/// infrastructure would pollute `cargo ktstr stats list` output or
/// claim the "most recent run" bucket.
///
/// The first byte is inspected through `as_encoded_bytes`, which is
/// OS-string-safe (no UTF-8 round-trip) and behaves cleanly on
/// non-UTF-8 names that would trip up a
/// `to_str().starts_with('.')` chain.
///
/// Single source of truth for "is this a run-dir entry?": both
/// run-listing call sites must go through this predicate, so that a
/// future relocation of `.locks/` (or any newly reserved dotfile) is
/// a one-place change.
pub(crate) fn is_run_directory(entry: &std::fs::DirEntry) -> bool {
    let path = entry.path();
    if !path.is_dir() {
        return false;
    }
    // A path with no filename component counts as "not dot-prefixed".
    let dot_prefixed = path
        .file_name()
        .is_some_and(|name| name.as_encoded_bytes().starts_with(b"."));
    !dot_prefixed
}
1384
1385/// Find the most recently modified run directory under [`runs_root`].
1386///
1387/// Used by bare `cargo ktstr stats` (no subcommand) when
1388/// `KTSTR_SIDECAR_DIR` isn't set: the stats command doesn't itself
1389/// run a kernel, so it can't reconstruct the
1390/// `{kernel}-{project_commit}` key that the test process used.
1391/// Picking the newest subdirectory by mtime mirrors "show me the
1392/// report from my last test run."
1393///
1394/// Dotfile-prefixed entries (notably the flock sentinel
1395/// subdirectory `.locks/`) are excluded via `is_run_directory`
1396/// so the lock infrastructure cannot claim the "most recent
1397/// run" bucket — `.locks/`'s mtime tracks per-write flock
1398/// activity and would otherwise eclipse the actual newest run
1399/// dir on every default-path sidecar write.
1400pub fn newest_run_dir() -> Option<PathBuf> {
1401 let root = runs_root();
1402 let entries = std::fs::read_dir(&root).ok()?;
1403 entries
1404 .filter_map(|e| e.ok())
1405 .filter(is_run_directory)
1406 .max_by_key(|e| e.metadata().and_then(|m| m.modified()).ok())
1407 .map(|e| e.path())
1408}
1409
/// The on-disk artifacts one failed test left in a single run
/// directory, as shown by the `cargo ktstr test` post-run footer.
///
/// `scheduler` and `topology` are copied from a FAILING variant's
/// `.ktstr.json` sidecar. They are `None` when the test failed BEFORE
/// it wrote one, for example a scheduler BPF-load failure that only
/// produced a placeholder `.failure-dump.json` through
/// [`crate::test_support::eval`] and never reached [`write_sidecar`].
/// Every `Option` path is `Some` only when the artifact exists AND was
/// written during the current run; that freshness check is the mtime
/// gate [`summarize_run_artifacts`] applies through
/// `summarize_one_run_dir`.
pub(crate) struct FailedTest {
    /// Bare test function name, i.e. the prefix shared by this test's
    /// artifact filenames.
    pub(crate) test_name: String,
    /// Scheduler under test, taken from a FAILING variant's
    /// `.ktstr.json` sidecar. `None` when no variant sidecar recorded a
    /// failure (a dump-only, pre-sidecar failure).
    pub(crate) scheduler: Option<String>,
    /// Topology label from the same failing variant that supplied
    /// `scheduler`; `None` under the same condition. When a gauntlet
    /// test has several failing variants this is one representative;
    /// `stats_sidecars` lists the complete per-variant set.
    pub(crate) topology: Option<String>,
    /// A `{test}-{variant_hash}.failure-dump.json` path. The slot
    /// holds a single path: whichever variant the run-dir scan
    /// classified last, in unsorted `read_dir` order. The failure
    /// verdict is computed from the per-variant `dump_hashes` set, not
    /// from this path.
    pub(crate) failure_dump: Option<PathBuf>,
    /// A `{test}-{variant_hash}.repro.failure-dump.json` path from the
    /// auto-repro retry, again for whichever variant the scan
    /// classified last.
    pub(crate) repro_failure_dump: Option<PathBuf>,
    /// All `{test}-{variant_hash}.ktstr.json` stats sidecars of this
    /// test, sorted. There is one per gauntlet variant because distinct
    /// variant hashes coexist. Empty for a dump-only failure.
    pub(crate) stats_sidecars: Vec<PathBuf>,
    /// A `{test}-{variant_hash}.wprof.pb` trace.
    pub(crate) wprof: Option<PathBuf>,
    /// A `{test}-{variant_hash}.repro.wprof.pb` trace (auto-repro
    /// retry).
    pub(crate) repro_wprof: Option<PathBuf>,
    /// `true` when ANY variant sidecar of this test is `is_fail()`.
    /// `cargo ktstr replay --filter <name>` selects from `is_fail`
    /// sidecars (see `replay.rs::select_failed_names`), so such a test
    /// will be re-run. `false` for a dump-only failure: with no sidecar
    /// there is nothing for replay's pool selection to find.
    pub(crate) replayable: bool,
}
1456
1457/// Per-run-directory artifact summary for the post-run footer.
1458pub(crate) struct RunDirSummary {
1459 /// The `{runs_root}/{kernel}-{project_commit}` run directory.
1460 pub(crate) dir: PathBuf,
1461 /// Failed tests in this dir, ordered by `test_name`.
1462 pub(crate) failed: Vec<FailedTest>,
1463 /// Count of `.ktstr.json` stats sidecars written this run
1464 /// (every executed VM test that reached [`write_sidecar`],
1465 /// pass or fail).
1466 pub(crate) stats_sidecars: usize,
1467 /// Count of `.wprof.pb` traces written this run (excludes the
1468 /// `.repro.wprof.pb` auto-repro variant).
1469 pub(crate) wprof_traces: usize,
1470}
1471
/// The five kinds of per-test artifact a run directory can contain,
/// as recognised by `classify_run_artifact` from the filename suffix.
enum RunArtifactKind {
    /// `*.failure-dump.json`.
    FailureDump,
    /// `*.repro.failure-dump.json` (auto-repro retry).
    ReproFailureDump,
    /// `*.ktstr.json` stats sidecar.
    StatsSidecar,
    /// `*.wprof.pb` trace.
    Wprof,
    /// `*.repro.wprof.pb` trace (auto-repro retry).
    ReproWprof,
}
1480
/// Split a `{test}-{16-hex variant hash}` stem into `(test, hash)`.
///
/// Test function names are Rust identifiers and never contain `-`, so
/// the LAST `-` in the stem is the variant-hash separator.
///
/// The trailing token counts as a hash only when it is exactly 16
/// ASCII hex digits. The digits are checked explicitly because
/// `u64::from_str_radix` on its own also accepts a leading `+` sign,
/// which would let a 16-character token such as `+123456789abcdef`
/// pass as a hash.
///
/// When the token is not a valid hash the function returns
/// `(stem, 0)`: the whole stem is the test name and the hash is `0`.
/// That way a NON-variant dump (a stale file from before variant
/// keying, or a future writer that omits the hash) still classifies
/// under its full prefix instead of vanishing (the "no silent drops"
/// rule). The mtime gate already excludes stale prior-run files; this
/// fallback removes the remaining silent-drop risk.
fn split_variant_stem(stem: &str) -> (&str, u64) {
    stem.rsplit_once('-')
        .filter(|(_, token)| token.len() == 16 && token.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|(test, token)| {
            // Sixteen hex digits always fit in a `u64`; `ok()` only
            // keeps this free of a panic path.
            u64::from_str_radix(token, 16).ok().map(|hash| (test, hash))
        })
        .unwrap_or((stem, 0))
}
1500
1501/// Parse a run-directory filename into `(test_name, variant_hash, kind)`.
1502///
1503/// Returns `None` for filenames that are not a recognized per-test
1504/// artifact — `.ktstr.json.tmp.<pid>.<run_id>` atomic-write staging
1505/// residue, stray non-ktstr files, or a `.ktstr.json` whose stem
1506/// lacks the `-{16-hex variant hash}` suffix [`write_sidecar`]
1507/// always appends.
1508///
1509/// The `variant_hash` lets the footer correlate each artifact with the
1510/// SAME-variant sidecar (a gauntlet test's per-preset dumps + sidecars
1511/// carry distinct hashes): a failure dump whose variant has no parsed
1512/// sidecar is a per-variant pre-sidecar failure even when a sibling
1513/// preset passed. failure-dump / wprof names fall back to `(stem, 0)`
1514/// when un-hashed (see [`split_variant_stem`]); a `.ktstr.json` sidecar
1515/// is ALWAYS variant-keyed by [`write_sidecar`], so a non-hashed one is
1516/// malformed and is dropped (`None`).
1517///
1518/// Suffix order is load-bearing: the `.repro.` shapes are checked
1519/// BEFORE their bare counterparts so `{test}-{hash}.repro.failure-dump.json`
1520/// classifies as [`RunArtifactKind::ReproFailureDump`] with
1521/// `test_name = {test}` rather than the bare-`.failure-dump.json`
1522/// branch stripping less and yielding `{test}-{hash}.repro`.
1523fn classify_run_artifact(name: &str) -> Option<(&str, u64, RunArtifactKind)> {
1524 if let Some(stem) = name.strip_suffix(".repro.failure-dump.json") {
1525 let (test, hash) = split_variant_stem(stem);
1526 return Some((test, hash, RunArtifactKind::ReproFailureDump));
1527 }
1528 if let Some(stem) = name.strip_suffix(".failure-dump.json") {
1529 let (test, hash) = split_variant_stem(stem);
1530 return Some((test, hash, RunArtifactKind::FailureDump));
1531 }
1532 if let Some(stem) = name.strip_suffix(".repro.wprof.pb") {
1533 let (test, hash) = split_variant_stem(stem);
1534 return Some((test, hash, RunArtifactKind::ReproWprof));
1535 }
1536 if let Some(stem) = name.strip_suffix(".wprof.pb") {
1537 let (test, hash) = split_variant_stem(stem);
1538 return Some((test, hash, RunArtifactKind::Wprof));
1539 }
1540 if let Some(stem) = name.strip_suffix(".ktstr.json") {
1541 // A sidecar is ALWAYS `{test}-{16-hex}` ({:016x} in
1542 // serialize_and_write_sidecar). A stem without a valid hash
1543 // suffix is a hand-named / malformed file — drop it (unlike the
1544 // dump arms, there's no un-hashed-sidecar writer to be lenient
1545 // for).
1546 let (test, hash) = stem.rsplit_once('-')?;
1547 if hash.len() == 16
1548 && let Ok(h) = u64::from_str_radix(hash, 16)
1549 {
1550 return Some((test, h, RunArtifactKind::StatsSidecar));
1551 }
1552 }
1553 None
1554}
1555
1556/// Summarize the per-test artifacts a single run directory holds,
1557/// counting only files written at or after `since`.
1558///
1559/// The mtime gate is the freshness boundary: a run directory is
1560/// keyed `{kernel}-{project_commit}` (see [`sidecar_dir`]), so
1561/// re-running the same suite reuses the directory, and
1562/// [`pre_clear_run_dir_once`] wipes only `*.ktstr.json` — stale
1563/// `*.failure-dump.json` / `*.wprof.pb` from an earlier run linger.
1564/// Filtering on `mtime >= since` (where `since` is captured before
1565/// the nextest build+run begins, so genuine artifacts — written
1566/// after the build — sort comfortably after it) keeps a stale dump
1567/// from a prior run from surfacing as a current failure.
1568///
1569/// Returns `None` when the directory holds no fresh artifacts (it
1570/// belongs to an earlier run, or cannot be read).
1571fn summarize_one_run_dir(
1572 dir: &std::path::Path,
1573 since: std::time::SystemTime,
1574) -> Option<RunDirSummary> {
1575 use std::collections::{BTreeMap, BTreeSet};
1576 #[derive(Default)]
1577 struct Acc {
1578 // The writer names these per-variant (hashed, `{test}-{hash}.…`);
1579 // each is a single Option collapsed to whichever variant the
1580 // read_dir scan classified last (unsorted order). The fail
1581 // signal below does NOT rely on them — it keys off the
1582 // per-variant `dump_hashes` set.
1583 failure_dump: Option<PathBuf>,
1584 repro_failure_dump: Option<PathBuf>,
1585 wprof: Option<PathBuf>,
1586 repro_wprof: Option<PathBuf>,
1587 // EVERY variant's stats sidecar (distinct variant-hash
1588 // filenames coexist), so a passing variant cannot mask a
1589 // failing sibling.
1590 stats_sidecars: Vec<PathBuf>,
1591 // OR of `is_fail` across all of this name's variant sidecars.
1592 // Post-finalize the sidecar carries the FINAL (post-inversion)
1593 // verdict, so a passing expect_err / expect_auto_repro test
1594 // reads `false` here even though its scenario failed.
1595 any_fail: bool,
1596 // Variant hashes whose stats sidecar PARSED, and variant hashes
1597 // that left a failure dump. The gate is PER-VARIANT: a dump whose
1598 // variant has no parsed sidecar is a pre-sidecar failure
1599 // (scheduler load / VM boot crash) for THAT preset and flags
1600 // FAILED even when a sibling preset's sidecar parsed; a dump whose
1601 // variant DID parse a (final, non-failing) sidecar is an
1602 // expected-failure run whose dump must NOT flag — the sidecar's
1603 // finalized verdict already classified it. (A gauntlet test's
1604 // per-preset dumps + sidecars carry distinct variant hashes.)
1605 parsed_sidecar_hashes: BTreeSet<u64>,
1606 dump_hashes: BTreeSet<u64>,
1607 // (scheduler, topology) of the FIRST failing variant seen,
1608 // for the FAILED block header; `None` when no variant sidecar
1609 // parsed as a failure (a dump-only pre-sidecar failure).
1610 fail_variant: Option<(String, String)>,
1611 }
1612 let entries = std::fs::read_dir(dir).ok()?;
1613 let mut by_test: BTreeMap<String, Acc> = BTreeMap::new();
1614 let mut stats_sidecars = 0usize;
1615 let mut wprof_traces = 0usize;
1616 for entry in entries.flatten() {
1617 let Ok(meta) = entry.metadata() else {
1618 continue;
1619 };
1620 if !meta.is_file() {
1621 continue;
1622 }
1623 match meta.modified() {
1624 Ok(m) if m >= since => {}
1625 _ => continue,
1626 }
1627 let path = entry.path();
1628 let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1629 continue;
1630 };
1631 let Some((test, variant_hash, kind)) = classify_run_artifact(name) else {
1632 continue;
1633 };
1634 let acc = by_test.entry(test.to_string()).or_default();
1635 match kind {
1636 RunArtifactKind::FailureDump => {
1637 acc.dump_hashes.insert(variant_hash);
1638 acc.failure_dump = Some(path);
1639 }
1640 RunArtifactKind::ReproFailureDump => {
1641 acc.dump_hashes.insert(variant_hash);
1642 acc.repro_failure_dump = Some(path);
1643 }
1644 RunArtifactKind::Wprof => {
1645 wprof_traces += 1;
1646 acc.wprof = Some(path);
1647 }
1648 RunArtifactKind::ReproWprof => acc.repro_wprof = Some(path),
1649 RunArtifactKind::StatsSidecar => {
1650 stats_sidecars += 1;
1651 // Accumulate EVERY variant (never overwrite by bare
1652 // name) and OR the fail signal, so one gauntlet
1653 // variant's pass cannot mask a failing sibling.
1654 match std::fs::read_to_string(&path)
1655 .ok()
1656 .and_then(|s| serde_json::from_str::<SidecarResult>(&s).ok())
1657 {
1658 Some(sc) => {
1659 acc.parsed_sidecar_hashes.insert(variant_hash);
1660 if sc.is_fail() {
1661 acc.any_fail = true;
1662 if acc.fail_variant.is_none() {
1663 acc.fail_variant = Some((sc.scheduler, sc.topology));
1664 }
1665 }
1666 }
1667 None => {
1668 // Counted in `stats_sidecars` but
1669 // unclassifiable. Warn so the count and the
1670 // failed list cannot silently disagree (a
1671 // corrupt `is_fail` sidecar would otherwise be
1672 // swallowed).
1673 tracing::warn!(
1674 path = %path.display(),
1675 "ktstr footer: unreadable/unparseable stats sidecar — \
1676 counted but not classified",
1677 );
1678 }
1679 }
1680 acc.stats_sidecars.push(path);
1681 }
1682 }
1683 }
1684 if by_test.is_empty() {
1685 return None;
1686 }
1687 let mut failed = Vec::new();
1688 for (test_name, mut acc) in by_test {
1689 // A test FAILED this run if ANY of its variant sidecars records
1690 // `is_fail` (the FINAL post-inversion verdict), OR it left a
1691 // failure dump WITHOUT any parsed sidecar. A dump with no sidecar
1692 // is a pre-sidecar failure (scheduler load / VM boot) that never
1693 // reached `write_sidecar` — it must still flag. But a dump
1694 // alongside a parsed, non-failing sidecar is an expected-failure
1695 // run (expect_err / expect_auto_repro) whose induced-crash dump
1696 // must NOT flag: the sidecar's finalized verdict is authoritative.
1697 // The `is_fail` aggregate covers in-VM assertion failures,
1698 // including one failing gauntlet variant among passing siblings.
1699 // Per-variant dump gate: a failure dump whose variant has NO
1700 // parsed sidecar is a pre-sidecar failure for that preset and
1701 // flags — even when a SIBLING preset's sidecar parsed. (If every
1702 // dump-variant has a parsed sidecar, dump_hashes ⊆ parsed and the
1703 // sidecars' finalized verdicts are authoritative.) Closes the
1704 // mixed-gauntlet masking the old test-name-granularity gate had.
1705 let dump_only_failure = !acc.dump_hashes.is_subset(&acc.parsed_sidecar_hashes);
1706 if !acc.any_fail && !dump_only_failure {
1707 continue;
1708 }
1709 let (scheduler, topology) = match acc.fail_variant {
1710 Some((sch, topo)) => (Some(sch), Some(topo)),
1711 None => (None, None),
1712 };
1713 // Sort so the rendered footer is deterministic regardless of
1714 // `read_dir` order.
1715 acc.stats_sidecars.sort();
1716 failed.push(FailedTest {
1717 test_name,
1718 scheduler,
1719 topology,
1720 failure_dump: acc.failure_dump,
1721 repro_failure_dump: acc.repro_failure_dump,
1722 stats_sidecars: acc.stats_sidecars,
1723 wprof: acc.wprof,
1724 repro_wprof: acc.repro_wprof,
1725 replayable: acc.any_fail,
1726 });
1727 }
1728 Some(RunDirSummary {
1729 dir: dir.to_path_buf(),
1730 failed,
1731 stats_sidecars,
1732 wprof_traces,
1733 })
1734}
1735
1736/// Summarize the artifacts every run directory directly under
1737/// `runs_root` holds, keeping only files written at or after
1738/// `since`. Each [`RunDirSummary`] names its failed tests and the
1739/// concrete artifact path for each, so the `cargo ktstr test`
1740/// footer can point an operator at the exact file for the exact
1741/// test that failed rather than a directory + glob legend.
1742///
1743/// `since` is the wall-clock instant captured before the nextest
1744/// build+run; the mtime gate it drives is what excludes stale
1745/// artifacts left in a reused run directory (see
1746/// [`summarize_one_run_dir`]). Directories are returned sorted by
1747/// path so multi-kernel gauntlet output renders deterministically.
1748pub(crate) fn summarize_run_artifacts(
1749 runs_root: &std::path::Path,
1750 since: std::time::SystemTime,
1751) -> Vec<RunDirSummary> {
1752 let Ok(entries) = std::fs::read_dir(runs_root) else {
1753 return Vec::new();
1754 };
1755 let mut out: Vec<RunDirSummary> = entries
1756 .flatten()
1757 .filter(is_run_directory)
1758 .filter_map(|e| summarize_one_run_dir(&e.path(), since))
1759 .collect();
1760 out.sort_by(|a, b| a.dir.cmp(&b.dir));
1761 out
1762}
1763
1764/// Render the `cargo ktstr test` post-run footer: for each run
1765/// directory written at or after `since`, name every FAILED test
1766/// and the concrete path to each of its artifacts (failure dump,
1767/// auto-repro dump, stats sidecar, wprof trace), plus a per-dir
1768/// count of stats sidecars and wprof traces.
1769///
1770/// Returns the empty string when no run directory under `runs_root`
1771/// holds fresh artifacts — a host-only run (no VM tests) writes no
1772/// sidecars, so there is nothing to point at and the caller emits
1773/// no footer.
1774///
1775/// A test is listed FAILED when it left a failure dump (real or
1776/// placeholder) or an `is_fail` stats sidecar. This is NOT an
1777/// exhaustive failure list: a failure that writes neither — a
1778/// `builder.build()` / `vm.run()` error, a pre-build host error
1779/// (kvm probe, kernel/scheduler resolve, validation), a host panic,
1780/// or an unparseable guest result — leaves no on-disk artifact and
1781/// no entry here. The caller (`cargo_ktstr::run_cargo`) treats
1782/// nextest's own exit status as the authoritative pass/fail signal
1783/// and notes, when nextest reports failures, that any failure
1784/// without an entry left no artifact.
1785///
1786/// This replaces a directory + `*.glob` legend that carried no
1787/// test attribution: a reused run directory mixes artifacts from
1788/// many tests (and, before the mtime gate, prior runs), so a glob
1789/// legend pointed an operator at the directory and left them to
1790/// guess which `*.failure-dump.json` belonged to the test that
1791/// just failed.
1792pub fn format_run_artifact_footer(
1793 runs_root: &std::path::Path,
1794 since: std::time::SystemTime,
1795) -> String {
1796 let summaries = summarize_run_artifacts(runs_root, since);
1797 if summaries.is_empty() {
1798 return String::new();
1799 }
1800 let mut out = String::new();
1801 out.push_str("\ncargo ktstr: test outputs\n");
1802 for s in &summaries {
1803 out.push_str(&format!(" {}\n", s.dir.display()));
1804 for f in &s.failed {
1805 // scheduler/topology are set together (both from one
1806 // failing variant) or both absent (dump-only) — see
1807 // `summarize_one_run_dir`; no mixed arm is reachable.
1808 let variant = match (&f.scheduler, &f.topology) {
1809 (Some(sch), Some(topo)) => format!(" [{sch} {topo}]"),
1810 _ => String::new(),
1811 };
1812 out.push_str(&format!(" FAILED {}{variant}\n", f.test_name));
1813 if let Some(p) = &f.failure_dump {
1814 out.push_str(&format!(" {:<13} {}\n", "failure dump", p.display()));
1815 }
1816 if let Some(p) = &f.repro_failure_dump {
1817 out.push_str(&format!(" {:<13} {}\n", "repro dump", p.display()));
1818 }
1819 for p in &f.stats_sidecars {
1820 out.push_str(&format!(" {:<13} {}\n", "stats", p.display()));
1821 }
1822 if let Some(p) = &f.wprof {
1823 out.push_str(&format!(" {:<13} {}\n", "wprof", p.display()));
1824 }
1825 if let Some(p) = &f.repro_wprof {
1826 out.push_str(&format!(" {:<13} {}\n", "repro wprof", p.display()));
1827 }
1828 if f.replayable {
1829 out.push_str(&format!(
1830 " {:<13} cargo ktstr replay --filter {} --exec\n",
1831 "replay", f.test_name,
1832 ));
1833 }
1834 }
1835 out.push_str(&format!(
1836 " ({} stats sidecar(s), {} wprof trace(s) written this run)\n",
1837 s.stats_sidecars, s.wprof_traces,
1838 ));
1839 }
1840 out
1841}
1842
1843/// Detect the kernel version associated with the current test run.
1844///
1845/// Routes through [`crate::ktstr_kernel_env`] for the raw env value
1846/// and [`crate::kernel_path::KernelId`] for variant dispatch so the
1847/// three [`crate::kernel_path::KernelId`] variants are honoured symmetrically:
1848///
1849/// - `KernelId::Path(dir)`: read `metadata.json` (cache entry
1850/// layout) or `include/config/kernel.release` (source tree
1851/// layout). Unchanged from the previous behaviour.
1852/// - `KernelId::Version(ver)`: the user asked for a specific
1853/// version — return it directly. No cache access needed; a
1854/// version string IS a version string.
1855/// - `KernelId::CacheKey(key)`: look up the cache entry and
1856/// return `entry.metadata.version`. The previous code path
1857/// silently treated the key as a directory name and read
1858/// `<cwd>/<key>/metadata.json`, which never matched — producing
1859/// `None` + `sidecar_dir()` using the `"unknown"` fallback even
1860/// though the cache metadata already carried the version.
1861///
1862/// Returns `None` when the env var is unset, or when the env
1863/// resolves to a variant whose underlying source doesn't yield a
1864/// version string (e.g. a Path whose metadata.json / kernel.release
1865/// are both absent, or a CacheKey with no cache hit).
1866pub(crate) fn detect_kernel_version() -> Option<String> {
1867 use crate::kernel_path::KernelId;
1868 let raw = crate::ktstr_kernel_env()?;
1869 match KernelId::parse(&raw) {
1870 KernelId::Path(_) => {
1871 let p = std::path::Path::new(&raw);
1872 let meta_path = p.join("metadata.json");
1873 if let Ok(data) = std::fs::read_to_string(&meta_path)
1874 && let Ok(meta) = serde_json::from_str::<crate::cache::KernelMetadata>(&data)
1875 {
1876 return meta.version;
1877 }
1878 let ver_path = p.join("include/config/kernel.release");
1879 if let Ok(v) = std::fs::read_to_string(ver_path) {
1880 let v = v.trim();
1881 if !v.is_empty() {
1882 return Some(v.to_string());
1883 }
1884 }
1885 None
1886 }
1887 KernelId::Version(ver) => Some(ver),
1888 KernelId::CacheKey(key) => {
1889 let cache = crate::cache::CacheDir::new().ok()?;
1890 let entry = cache.lookup(&key)?;
1891 entry.metadata.version
1892 }
1893 // Multi-kernel specs in KTSTR_KERNEL never reach this
1894 // function in production — `find_kernel`'s env reader bails
1895 // before sidecar writing happens. This arm is defensive: if
1896 // the env value is somehow a range or git spec, return
1897 // `None` rather than guessing one endpoint, and the sidecar
1898 // record will leave `kernel_version` as null.
1899 KernelId::Range { .. } | KernelId::Git { .. } => None,
1900 }
1901}
1902
1903/// Detect the ktstr project's git HEAD at sidecar-write time.
1904///
1905/// Walks up from the test process's current working directory via
1906/// `gix::discover` to find an enclosing repository, then reads HEAD
1907/// short-hex (7 chars via `oid::to_hex_with_len(7)`) and appends
1908/// `-dirty` when index-vs-HEAD or worktree-vs-index changes are
1909/// observed. Submodules are ignored
1910/// (`Submodule::Given { ignore: All }`).
1911///
1912/// Dirt-detection runs through the shared [`repo_is_dirty`]
1913/// helper (peel HEAD to its tree, diff tree-vs-index, then
1914/// `status()` for worktree-vs-index, submodules skipped); see its
1915/// doc for cascade details. The cascade is similar in spirit to
1916/// [`crate::fetch::local_source`]'s dirt probe but deliberately
1917/// diverges in missing-index handling: the sidecar path silently
1918/// degrades a missing index leg to "treat as clean" so metadata
1919/// probes never gate sidecar writes, whereas `local_source`'s
1920/// cache-key path treats every leg as load-bearing. The HASH
1921/// REPRESENTATION also DIFFERS: `fetch::local_source` DROPS the
1922/// short hash entirely on dirty (returns `None`) because the
1923/// commit no longer describes the build input the cache key
1924/// embeds — publishing a stale hash there would misidentify the
1925/// build. This helper KEEPS the hash with a `-dirty` suffix
1926/// instead because the sidecar's `project_commit` is a debugging
1927/// breadcrumb (operator-readable identity, not a cache-key input);
1928/// the hash plus dirty flag carries strictly more information
1929/// than `None` for the operator's "which ktstr commit did this
1930/// sidecar come from?" question.
1931///
1932/// Returns `None` when:
1933/// - `current_dir()` cannot be resolved (process has no valid
1934/// cwd — extremely rare; happens only for processes whose cwd
1935/// was rmdir'd while alive);
1936/// - cwd is not inside any git repository (`gix::discover` fails);
1937/// - HEAD cannot be read (an unborn HEAD on a fresh `git init`
1938/// with zero commits, or a corrupt repository).
1939///
1940/// Returns `Some(short_hash)` (without the `-dirty` suffix) when
1941/// the HEAD read succeeds but a downstream dirt-detection call
1942/// fails — including a missing index, an unreadable working tree,
1943/// or `head_tree()` failure. Each failed leg degrades to "treat
1944/// as clean" rather than aborting the probe, because metadata
1945/// must not gate sidecar writes.
1946///
1947/// `None` is the documented fallback — sidecar writing must not
1948/// abort because of a metadata probe failure. Stats tooling that
1949/// reads `project_commit` already tolerates `None` rows by
1950/// treating them as wildcards (no `--project-commit` filter narrowing
1951/// applies).
1952///
1953/// `gix::discover` is preferred over `gix::open` because tests can
1954/// be launched from a subdirectory of the repo (e.g.
1955/// `cd src && cargo test`); `discover` walks parents until it
1956/// finds the `.git` marker, while `open` requires the exact root
1957/// path. The walk is cheap — a few stat() calls bounded by the
1958/// depth of the cwd inside the repo.
1959///
1960/// `env!("CARGO_MANIFEST_DIR")` is deliberately NOT used here:
1961/// `env!` resolves at compile time and bakes the build-host's
1962/// absolute manifest path into the binary's read-only data
1963/// segment, leaking the build environment into every published
1964/// artifact. Resolving cwd at runtime instead means the recorded
1965/// commit reflects the project tree the test was launched FROM —
1966/// for a scheduler crate using ktstr as a dev-dependency, this is
1967/// the scheduler crate's commit, not ktstr's. That is the more
1968/// accurate semantic anyway: "what code produced this sidecar"
1969/// depends on the cwd at test launch (which crate is exercising
1970/// ktstr), not the build host.
1971pub(crate) fn detect_project_commit() -> Option<String> {
1972 // Explicit override: an orchestrator (perf-delta) that checked the
1973 // project tree out WITHOUT a `.git` — a plain gix checkout of a baseline
1974 // commit into a temp dir — passes the commit label via
1975 // KTSTR_PROJECT_COMMIT_ENV so the sidecar records it verbatim instead of
1976 // a `gix::discover` that would resolve to the wrong repo (or none). It is
1977 // also set on the HEAD run so the recorded `project_commit` equals the
1978 // exact label perf-delta filters the pool on, closing the -dirty-suffix
1979 // mismatch between the filter (`short_hash`) and this recorder. Empty is
1980 // treated as unset. Mirrors the KTSTR_KERNEL_COMMIT_ENV override.
1981 if let Ok(explicit) = std::env::var(crate::KTSTR_PROJECT_COMMIT_ENV)
1982 && !explicit.is_empty()
1983 {
1984 return Some(explicit);
1985 }
1986 // Per-process memoization of the SUCCESS case only.
1987 //
1988 // The cwd is stable for the lifetime of a test process (no
1989 // caller mutates it), and the project tree's HEAD plus dirty
1990 // state cannot change underneath us without an explicit user
1991 // action that's outside the scope of any individual sidecar
1992 // write. Gauntlet runs invoke this function once per sidecar —
1993 // thousands of times per process — so caching the resolved
1994 // hash collapses every post-first successful call to a
1995 // `Clone`. The probe itself does ~3 syscalls (gix discover +
1996 // head_id + status) which dominate the sidecar-write critical
1997 // path; eliminating that cost on the hot path is the only
1998 // meaningful perf win available here.
1999 //
2000 // FAILURE IS NOT CACHED: a `None` probe outcome (no git repo
2001 // discoverable from cwd, unborn HEAD, transient FS / gix open
2002 // failure) does NOT seed the OnceLock. A FIRST call from a
2003 // momentarily-broken context (e.g. a test that swapped CWD via
2004 // some indirect path before ever calling
2005 // `detect_project_commit`, or a transient I/O hiccup during
2006 // `gix::discover`) would otherwise lock in `None` for the
2007 // rest of the process — every subsequent sidecar would land
2008 // under `target/ktstr/{kernel}-unknown/` even though the
2009 // commit IS resolvable from a healthy cwd. Retrying on failure
2010 // costs the same ~3 syscalls the success case pays once; the
2011 // re-probe only fires while the answer is still unknown.
2012 //
2013 // CACHE DOES NOT INVALIDATE on success: a user who commits /
2014 // amends / resets the project tree mid-run and expects the
2015 // new HEAD to surface in subsequent sidecars will see stale
2016 // values. This is acceptable — the
2017 // project tree is treated as stable-enough for a single suite
2018 // run; callers mutating the tree during a run own the
2019 // consequences.
2020 static PROJECT_COMMIT: std::sync::OnceLock<String> = std::sync::OnceLock::new();
2021 if let Some(cached) = PROJECT_COMMIT.get() {
2022 return Some(cached.clone());
2023 }
2024 let cwd = std::env::current_dir().ok()?;
2025 let probed = detect_commit_at(&cwd)?;
2026 // `set` on a hot OnceLock is a no-op `Err` — safe to ignore.
2027 // First successful caller wins; a second concurrent caller's
2028 // identical hash discards harmlessly.
2029 let _ = PROJECT_COMMIT.set(probed.clone());
2030 Some(probed)
2031}
2032
2033/// Path-taking core of [`detect_project_commit`]. Factored out so
2034/// unit tests can drive the full branch matrix (clean repo, dirty
2035/// repo, non-git directory, unborn HEAD, concurrent calls) against
2036/// `gix::init`-built fixtures in tempdirs without mutating the
2037/// process-wide `current_dir`. The public entry point reads `cwd`
2038/// once and delegates here.
2039///
2040/// `gix::discover` walks parents until it finds a `.git` marker —
2041/// tests can be launched from a subdirectory of the repo (e.g.
2042/// `cd src && cargo test`); the parent walk handles that, where
2043/// `gix::open` would require the exact root. The
2044/// open-vs-discover distinction is the ONLY difference between
2045/// this function and [`detect_kernel_commit`]; the post-open
2046/// "read HEAD, format short hex, append `-dirty` on dirt" body
2047/// lives in the shared [`commit_with_dirty_suffix`] helper.
2048fn detect_commit_at(path: &std::path::Path) -> Option<String> {
2049 let repo = gix::discover(path).ok()?;
2050 commit_with_dirty_suffix(&repo)
2051}
2052
2053/// Shared post-open body for [`detect_commit_at`] and
2054/// [`detect_kernel_commit`]: read `repo.head_id()`, format the
2055/// 7-char short hex, and append `-dirty` when [`repo_is_dirty`]
2056/// returns `Some(true)`.
2057///
2058/// Returns `None` when `head_id()` fails (unborn HEAD on a fresh
2059/// `gix::init` with zero commits, or a corrupt repository) — the
2060/// short-hex cannot be formed.
2061///
2062/// Returns `Some(short_hash)` (without `-dirty`) when the HEAD
2063/// read succeeds but the [`repo_is_dirty`] probe returns `None`
2064/// (HEAD-tree peel failure). This matches the documented "treat
2065/// as clean on probe failure" degradation: metadata probes must
2066/// not gate sidecar writes, so a probe failure flows through as
2067/// "clean" rather than aborting.
2068///
2069/// `to_hex_with_len(7)` produces a `HexDisplay` that formats 7
2070/// hex chars without the 40-char intermediate `format!("{}")`
2071/// allocation. `Id` derefs to `oid` (gix-hash) which owns the
2072/// method.
2073///
2074/// CALL SITES diverge ONLY on the open mode (`gix::discover` for
2075/// the project commit, `gix::open` for the kernel commit). The
2076/// helper takes a `&Repository` so each caller picks the open
2077/// strategy that matches its semantics: project commit walks
2078/// parents (cwd may be inside a subdir of the repo); kernel
2079/// commit demands the explicit root (the kernel directory is
2080/// not walked-up to avoid resolving the parent ktstr repo).
2081fn commit_with_dirty_suffix(repo: &gix::Repository) -> Option<String> {
2082 let head = repo.head_id().ok()?;
2083 let short_hash = head.to_hex_with_len(7).to_string();
2084 if repo_is_dirty(repo).unwrap_or(false) {
2085 Some(format!("{short_hash}-dirty"))
2086 } else {
2087 Some(short_hash)
2088 }
2089}
2090
2091/// Probe whether a gix repository's working tree differs from its
2092/// HEAD commit, ignoring submodules.
2093///
2094/// Returns `Some(true)` when the index differs from the HEAD tree
2095/// or the worktree differs from the index for any tracked file;
2096/// `Some(false)` when neither leg observed a difference; `None`
2097/// when the HEAD-tree peel itself failed (HEAD points at something
2098/// that cannot be read as a tree).
2099///
2100/// Callers in [`detect_commit_at`] / [`detect_kernel_commit`]
2101/// degrade `None` to "treat as clean" via `unwrap_or(false)` so
2102/// metadata probes never gate sidecar writes.
2103///
2104/// PROBE LEGS:
2105/// - tree-vs-index: peel HEAD to its tree, then `tree_index_status`
2106/// diff against the on-disk index. `repo.index()` returning Err
2107/// (missing index — partially-checked-out clones, or fresh
2108/// `git init` before the first commit) silently leaves the
2109/// index-dirty leg false. `index_or_empty()` is deliberately
2110/// NOT used because it would substitute an empty index and the
2111/// diff would flag every tracked file as "deleted from index",
2112/// tripping false-dirty.
2113/// - index-vs-worktree: `repo.status()` configured with
2114/// `Submodule::Given { ignore: All }` so submodule worktree
2115/// state is skipped. Short-circuited when the tree-vs-index leg
2116/// already flipped dirty: the result only needs one positive
2117/// signal, so a known-dirty index makes the worktree walk
2118/// redundant. Matches the equivalent short-circuit in
2119/// [`crate::fetch::local_source`].
2120///
2121/// FAILURE DEGRADATION: any individual leg failure (missing index,
2122/// `repo.status()` failure, `into_index_worktree_iter()` failure)
2123/// silently degrades that leg to "no signal" rather than aborting.
2124/// The function only returns `None` when the HEAD-tree peel
2125/// fails, because at that point neither leg can run at all.
2126///
2127/// `pub` (not `pub(crate)`) because `cargo-ktstr.rs` is a
2128/// separate `[[bin]]` crate that consumes `ktstr` as an
2129/// external dependency and needs this helper to compute the
2130/// `-dirty` suffix in
2131/// the baseline/HEAD commit in `cargo ktstr perf-delta`. Hidden
2132/// from rustdoc via `#[doc(hidden)]` because it is a probe-
2133/// style helper without a stable API contract — external
2134/// consumers should not depend on it.
2135#[doc(hidden)]
2136pub fn repo_is_dirty(repo: &gix::Repository) -> Option<bool> {
2137 let head_tree_id = repo.head_tree().ok()?.id;
2138
2139 let mut index_dirty = false;
2140 if let Ok(index) = repo.index() {
2141 let _ = repo.tree_index_status(
2142 &head_tree_id,
2143 &index,
2144 None,
2145 gix::status::tree_index::TrackRenames::Disabled,
2146 |_, _, _| {
2147 index_dirty = true;
2148 Ok::<_, std::convert::Infallible>(std::ops::ControlFlow::Break(()))
2149 },
2150 );
2151 }
2152
2153 let worktree_dirty = if index_dirty {
2154 false
2155 } else {
2156 repo.status(gix::progress::Discard)
2157 .ok()
2158 .and_then(|s| {
2159 s.index_worktree_rewrites(None)
2160 .index_worktree_submodules(gix::status::Submodule::Given {
2161 ignore: gix::submodule::config::Ignore::All,
2162 check_dirty: false,
2163 })
2164 .index_worktree_options_mut(|opts| {
2165 opts.dirwalk_options = None;
2166 })
2167 .into_index_worktree_iter(Vec::new())
2168 .ok()
2169 .map(|mut iter| iter.next().is_some())
2170 })
2171 .unwrap_or(false)
2172 };
2173
2174 Some(index_dirty || worktree_dirty)
2175}
2176
2177/// Detect the kernel SOURCE TREE's git HEAD at sidecar-write time.
2178///
2179/// `kernel_dir` is the explicit kernel source directory — typically
2180/// resolved from `KTSTR_KERNEL` for `KernelId::Path`, or from the
2181/// cache entry's `KernelSource::Local::source_tree_path` when
2182/// `KTSTR_KERNEL` is a Version / CacheKey whose underlying build
2183/// recorded a local tree. Uses `gix::open(kernel_dir)` (NOT
2184/// `gix::discover`) because the kernel directory is explicit, not
2185/// walked-up: the parent walk that `discover` performs would
2186/// resolve to whichever ancestor `.git` it found first, which
2187/// might be the ktstr project's repo when `kernel_dir` is a
2188/// non-git subdirectory inside it. `open` requires `kernel_dir`
2189/// itself to be the repo root, which is the documented invariant
2190/// for kernel checkouts.
2191///
2192/// Reads HEAD short-hex (7 chars via `oid::to_hex_with_len(7)`)
2193/// and appends `-dirty` when index-vs-HEAD or worktree-vs-index
2194/// changes are observed. Dirt-detection runs through the shared
2195/// [`repo_is_dirty`] helper (submodules skipped via
2196/// `Submodule::Given { ignore: All }`); see its doc for cascade
2197/// details. The cascade matches [`detect_project_commit`] and is
2198/// similar in spirit to [`crate::fetch::local_source`] but
2199/// deliberately diverges in missing-index handling: the sidecar
2200/// path silently degrades a missing index leg to "treat as
2201/// clean" so metadata probes never gate sidecar writes, whereas
2202/// `local_source`'s cache-key path treats every leg as
2203/// load-bearing. Same "treat as clean on probe failure"
2204/// degradation rules apply otherwise: a missing index, an
2205/// unreadable worktree, or `head_tree()` failure each fall
2206/// through as "clean" rather than aborting the probe — metadata
2207/// must not gate sidecar writes.
2208///
2209/// HASH REPRESENTATION matches [`detect_project_commit`]: keeps
2210/// the hash with `-dirty` appended (operator-readable identity).
2211/// Distinct from [`crate::fetch::local_source`], which DROPS the
2212/// hash on dirty because the commit no longer describes the
2213/// build INPUT for cache-key purposes.
2214///
2215/// Returns `None` when:
2216/// - `kernel_dir` is not a git repository (`gix::open` fails);
2217/// - HEAD cannot be read (unborn HEAD on a fresh `git init` with
2218/// zero commits, or a corrupt repository).
2219///
2220/// Returns `Some(short_hash)` (without the `-dirty` suffix) when
2221/// the HEAD read succeeds but a downstream dirt-detection call
2222/// fails — including a missing index, an unreadable working
2223/// tree, or `head_tree()` failure. Each failed leg degrades to
2224/// "treat as clean" rather than aborting the probe, because
2225/// metadata must not gate sidecar writes.
2226///
2227/// `pub` (not `pub(crate)`) + `#[doc(hidden)]` for the same reason as
2228/// `repo_is_dirty`: `cargo-ktstr` is a separate `[[bin]]` crate that
2229/// consumes `ktstr` as a dependency and calls this in `run_cargo` once
2230/// per resolved kernel to pre-compute the `dir=commit` map it exports
2231/// via [`crate::KTSTR_KERNEL_COMMIT_ENV`], letting each per-test process
2232/// skip its own gix dirty-walk. Hidden from rustdoc — a probe helper
2233/// with no stable API contract. The env fast-path that CONSUMES that
2234/// map lives at the sidecar call site (`kernel_commit_for_sidecar`), not
2235/// here, so this stays a pure directory→commit walk safe for the
2236/// orchestrator to call while building the map.
2237#[doc(hidden)]
2238pub fn detect_kernel_commit(kernel_dir: &std::path::Path) -> Option<String> {
2239 // Per-process, path-keyed memoization of the SUCCESS case
2240 // only. Same rationale as `detect_project_commit`: gauntlet
2241 // runs invoke this function once per sidecar — thousands of
2242 // times — and the kernel tree's HEAD plus dirty state cannot
2243 // change underneath us mid-suite without an explicit user
2244 // action outside any sidecar's control. The path key handles
2245 // the fixture-test case where unit tests rotate through
2246 // synthetic `tempfile::TempDir` kernel paths in the same
2247 // process; each distinct path memoizes independently.
2248 //
2249 // `Mutex<HashMap>` rather than `OnceLock` because the input
2250 // is parameterized on `kernel_dir` — a `OnceLock` collapses
2251 // every input to one cached result, which would conflate
2252 // different kernel directories into a single value.
2253 // Contention is bounded: post-warm reads are O(1) hash
2254 // lookups against a near-empty map (in production typically
2255 // ONE kernel per process), and the mutex is held only for
2256 // the duration of the lookup + insert.
2257 //
2258 // FAILURE IS NOT CACHED: a `None` probe outcome (kernel_dir
2259 // is not a git repo, unborn HEAD, transient `gix::open`
2260 // failure) does NOT seed the cache. Caching `None` would lock
2261 // in `unknown` for every subsequent sidecar even after the
2262 // condition resolves (e.g. a kernel directory that becomes a
2263 // valid checkout mid-suite, or a flaky FS that recovers).
2264 // Re-probing on failure costs the same gix-open + dirt-walk
2265 // the success case pays once; the re-probe only fires while
2266 // the answer is still unknown for that path.
2267 //
2268 // Mutex poisoning recovery: a panic mid-probe could poison
2269 // the lock; acquiring via
2270 // [`crate::sync::MutexExt::lock_unpoisoned`] returns the
2271 // guard regardless of poison state so a future caller doesn't
2272 // fail catastrophically. The cached map is just a HashMap of
2273 // owned strings; no invariant beyond "key→value mapping" can
2274 // be broken by an interrupted probe.
2275 use std::collections::HashMap;
2276 use std::path::PathBuf;
2277 use std::sync::{Mutex, OnceLock};
2278 static KERNEL_COMMIT_CACHE: OnceLock<Mutex<HashMap<PathBuf, String>>> = OnceLock::new();
2279 // Canonicalize the cache key so two paths that resolve to the
2280 // same on-disk directory share one entry. Without this, a
2281 // symlinked alias (`./linux` symlinked to `/abs/.../linux`)
2282 // and the resolved target would each populate their own slot,
2283 // re-running the gix-open + dirt-walk on every alias and
2284 // defeating the memoization. `canonicalize` resolves symlinks,
2285 // collapses `..` / `.`, and yields the absolute path the
2286 // kernel actually lives at. Falls back to the raw path on
2287 // canonicalize failure (e.g. caller passed a non-existent
2288 // `kernel_dir`) — gix::open will fail downstream and re-probe
2289 // each call until the path becomes resolvable.
2290 let cache_key = kernel_dir
2291 .canonicalize()
2292 .unwrap_or_else(|_| kernel_dir.to_path_buf());
2293 let cache = KERNEL_COMMIT_CACHE.get_or_init(|| Mutex::new(HashMap::new()));
2294 {
2295 let guard = cache.lock_unpoisoned();
2296 if let Some(cached) = guard.get(&cache_key) {
2297 return Some(cached.clone());
2298 }
2299 }
2300 // `gix::open` (NOT `gix::discover`) — `kernel_dir` must BE the
2301 // repo root. Without this the parent walk could resolve to the
2302 // ktstr project's own `.git` when `kernel_dir` is a non-git
2303 // subdirectory inside the ktstr checkout. The
2304 // open-vs-discover distinction is the ONLY difference between
2305 // this function and [`detect_commit_at`]; the post-open
2306 // "read HEAD, format short hex, append `-dirty` on dirt" body
2307 // lives in the shared [`commit_with_dirty_suffix`] helper.
2308 //
2309 // Open against `kernel_dir` (the caller-supplied path) rather
2310 // than `cache_key`. The two paths point at the same on-disk
2311 // repo by construction (canonicalize resolves to the same
2312 // place), so gix opens the same repository either way; passing
2313 // the original keeps any user-facing diagnostics (gix's
2314 // internal error chain) consistent with the input shape.
2315 let result = gix::open(kernel_dir)
2316 .ok()
2317 .and_then(|repo| commit_with_dirty_suffix(&repo));
2318 if let Some(ref hash) = result {
2319 let mut guard = cache.lock_unpoisoned();
2320 // First successful caller wins; a concurrent caller's
2321 // identical hash would overwrite harmlessly because
2322 // success is deterministic for a given (canonicalized
2323 // path, HEAD, dirty state) tuple.
2324 guard.insert(cache_key, hash.clone());
2325 }
2326 result
2327}
2328
/// Environment variable CI runners set to mark sidecars they produce
/// as `"ci"`-source. Any non-empty value flips the tag; empty string
/// is treated as unset so a defensively-cleared variable does not
/// accidentally classify a developer run as CI.
///
/// Read at sidecar-write time by [`detect_run_source`]; matches the
/// `KTSTR_KERNEL` / `KTSTR_CACHE_DIR` env-name convention so the
/// full set of ktstr-controlled env vars is `KTSTR_*`-prefixed.
pub const KTSTR_CI_ENV: &str = "KTSTR_CI";

/// Tag value written to [`SidecarResult::run_source`] for sidecars
/// produced under [`KTSTR_CI_ENV`].
pub const SIDECAR_RUN_SOURCE_CI: &str = "ci";

/// Tag value written to [`SidecarResult::run_source`] for sidecars
/// produced without [`KTSTR_CI_ENV`] — the developer-machine
/// default.
pub const SIDECAR_RUN_SOURCE_LOCAL: &str = "local";

/// Tag value applied to [`SidecarResult::run_source`] /
/// [`GauntletRow::run_source`](crate::stats::GauntletRow::run_source)
/// at LOAD time when the consumer pulls sidecars from a non-default
/// pool root via `cargo ktstr stats show-host --dir` /
/// `cargo ktstr stats list-values --dir`. NEVER written by
/// [`write_sidecar`] — the writer cannot know the file will later
/// be moved off-host. See [`apply_archive_source_override`].
pub const SIDECAR_RUN_SOURCE_ARCHIVE: &str = "archive";

/// Read [`KTSTR_CI_ENV`] and classify the run as `"ci"` (when the
/// env var is set non-empty) or `"local"` (the default for any
/// developer-driven invocation). Empty-string env values count as
/// unset — see [`KTSTR_CI_ENV`] for rationale.
///
/// The variable is read with `std::env::var_os`, so ANY non-empty
/// value counts, including one that is not valid Unicode.
/// `std::env::var` would report such a value as an error and the
/// run would be misclassified as `"local"`, contradicting the
/// "any non-empty value" contract on [`KTSTR_CI_ENV`].
///
/// Returns `Some(_)` unconditionally because every sidecar producer
/// is, by construction, either local or CI; an `Option` return
/// keeps the field shape symmetric with the other nullable
/// `SidecarResult` fields and reserves room for a future "unknown"
/// arm without a serde-version bump.
pub(crate) fn detect_run_source() -> Option<String> {
    // Only presence and non-emptiness matter; the value's content is
    // never inspected, so no Unicode decoding is required.
    let is_ci = std::env::var_os(KTSTR_CI_ENV).is_some_and(|value| !value.is_empty());
    let tag = if is_ci {
        SIDECAR_RUN_SOURCE_CI
    } else {
        SIDECAR_RUN_SOURCE_LOCAL
    };
    Some(tag.to_string())
}
2373
2374/// Override every sidecar's `run_source` field to
2375/// [`SIDECAR_RUN_SOURCE_ARCHIVE`] when the consumer pulled the pool
2376/// from a non-default root via `--dir`. Called at the boundary
2377/// between [`collect_pool`] and the downstream stats pipeline so
2378/// on-disk values stay untouched while the in-memory pool reflects
2379/// the operator's intent: "these sidecars were copied off another
2380/// host; treat them as archives, not as the local-machine record."
2381///
2382/// Mutation strategy is in-place rewrite of the entire `run_source`
2383/// field — the `"local"` / `"ci"` distinction is meaningful on the
2384/// PRODUCING host but irrelevant once the sidecars have been
2385/// moved off, where the only useful classification is "archived
2386/// elsewhere." Operators who need to retain the producer-side
2387/// distinction inside an archive bucket can keep `--dir`
2388/// untargeted (read from the default root) and let the on-disk
2389/// values pass through.
2390pub(crate) fn apply_archive_source_override(pool: &mut [SidecarResult]) {
2391 for sc in pool {
2392 sc.run_source = Some(SIDECAR_RUN_SOURCE_ARCHIVE.to_string());
2393 }
2394}
2395
2396/// Resolve the kernel source-tree path for [`detect_kernel_commit`]
2397/// from the [`crate::KTSTR_KERNEL_ENV`] env var.
2398///
2399/// Routes through [`crate::ktstr_kernel_env`] for the raw env
2400/// value and [`crate::kernel_path::KernelId`] for variant
2401/// dispatch:
2402///
2403/// - `KernelId::Path(p)`: probes the path's `metadata.json` first
2404/// — `cargo-ktstr`'s `--kernel /path/to/linux` resolver routes
2405/// clean source trees through the cache pipeline (see
2406/// [`crate::cli::resolve_kernel_dir_to_entry`]) and exports the
2407/// CACHE ENTRY directory through `KTSTR_KERNEL`, not the
2408/// literal source tree. When `metadata.json` parses and carries
2409/// a `KernelSource::Local::source_tree_path`, that path is the
2410/// underlying source tree and is returned. When parsing fails
2411/// (the path IS the source tree, the dirty-tree path that
2412/// skipped the cache store), falls back to using the raw env
2413/// value verbatim — that path is itself the source tree.
2414/// - `KernelId::Version(ver)`: looks for a Local cache entry
2415/// whose `metadata.version == ver` carrying a
2416/// `source_tree_path`. The tarball-shaped key (`{ver}-tarball-
2417/// {arch}-kc{suffix}`) is checked first because it is the
2418/// most-common form a Version-shaped env points at; on miss
2419/// (or hit yielding `Tarball` / `Git` source, both of which
2420/// are transient with no on-disk tree to probe), the function
2421/// falls back to scanning every valid cache entry for a Local
2422/// match on version. Without this fallback,
2423/// a cache populated by `kernel build --kernel
2424/// /path/to/linux` (a Local entry with source_tree_path) is
2425/// never found by a sidecar writer that has
2426/// `KTSTR_KERNEL=6.14.2`, even though the local tree is
2427/// exactly what the kernel_commit field needs to probe.
2428/// - `KernelId::CacheKey(k)`: uses `k` verbatim — the cache key
2429/// already carries every detail (source-type prefix, arch,
2430/// kconfig hash). On hit, returns
2431/// `KernelSource::Local::source_tree_path` if set, else
2432/// `None` (Tarball / Git entries are transient and have no
2433/// persisted source tree).
2434/// - `KernelId::Range { .. }` / `KernelId::Git { .. }`:
2435/// multi-kernel specs in `KTSTR_KERNEL` never reach this
2436/// helper in production (find_kernel's env reader bails
2437/// before sidecar writing). Defensive: returns `None`.
2438///
2439/// Returns `None` when the env var is unset, when no source
2440/// tree path is recoverable, or when the cache lookup fails.
2441fn resolve_kernel_source_dir() -> Option<std::path::PathBuf> {
2442 source_dir_for(&crate::ktstr_kernel_env()?)
2443}
2444
2445/// Resolve a `KTSTR_KERNEL` identifier string to the on-disk SOURCE
2446/// tree whose git HEAD is the kernel's commit (or `None` for transient
2447/// Range/Git specs or an unrecoverable cache lookup).
2448///
2449/// `pub` + `#[doc(hidden)]` for the same reason as `detect_kernel_commit`:
2450/// the cargo-ktstr `[[bin]]` calls this in `run_cargo` to pre-compute the
2451/// [`crate::KTSTR_KERNEL_COMMIT_ENV`] map. Computing the map value via THIS
2452/// function (then `detect_kernel_commit`) makes the map identical to the
2453/// sidecar's own fallback (`resolve_kernel_source_dir().and_then(
2454/// detect_kernel_commit)`), so a clean Path kernel — whose resolved dir is
2455/// a cache entry, not a git tree — still gets its real source commit into
2456/// the map instead of re-paying a per-test walk.
2457#[doc(hidden)]
2458pub fn source_dir_for(raw: &str) -> Option<std::path::PathBuf> {
2459 use crate::kernel_path::KernelId;
2460 let id = KernelId::parse(raw);
2461 match id {
2462 KernelId::Path(_) => {
2463 let p = std::path::Path::new(raw);
2464 // Cache-entry layout: `metadata.json` carries the
2465 // `KernelSource::Local::source_tree_path` recorded at
2466 // build time. Source-tree layout (dirty path that
2467 // skipped cache store): no metadata, so the env value
2468 // IS the source tree. The shared helper handles both.
2469 crate::cache::recover_local_source_tree(p)
2470 .or_else(|| Some(std::path::PathBuf::from(raw)))
2471 }
2472 KernelId::Version(_) | KernelId::CacheKey(_) => {
2473 let cache = crate::cache::CacheDir::new().ok()?;
2474 resolve_kernel_source_dir_with_cache(&id, &cache)
2475 }
2476 KernelId::Range { .. } | KernelId::Git { .. } => None,
2477 }
2478}
2479
2480/// Pure helper for [`resolve_kernel_source_dir`] that takes the
2481/// parsed `KernelId` and an opened `CacheDir`, returning the source
2482/// tree path if recoverable.
2483///
2484/// Split out from [`resolve_kernel_source_dir`] so tests can pin a
2485/// `CacheDir` at a tempdir root without mutating env vars (which
2486/// would race other tests reading `KTSTR_KERNEL` /
2487/// `KTSTR_CACHE_DIR`).
2488///
2489/// Lookup order for [`crate::kernel_path::KernelId::Version`]:
2490/// 1. Tarball-shaped cache key (`{ver}-tarball-{arch}-kc{suffix}`),
2491/// direct lookup. Returns `Some` only if the entry is a
2492/// `KernelSource::Local` carrying a `source_tree_path`.
2493/// 2. Fallback scan: every valid cache entry whose
2494/// `metadata.version == ver`. First match with
2495/// `KernelSource::Local::source_tree_path` set wins. Handles
2496/// the case where the user built `--kernel /path/to/linux`
2497/// (a Local cache entry without the tarball cache-key prefix)
2498/// but later set `KTSTR_KERNEL=6.14.2` for the test run —
2499/// without this fallback, the local source tree would be
2500/// invisible to the sidecar writer.
2501///
2502/// `KernelSource::Tarball` and `KernelSource::Git` entries are
2503/// skipped at every step because their source trees are transient
2504/// (deleted by the cache pipeline after build), so probing them
2505/// for a `kernel_commit` would always fail.
2506///
2507/// For [`crate::kernel_path::KernelId::CacheKey`], performs a single direct lookup —
2508/// the cache key already encodes every detail (source-type
2509/// prefix, arch, kconfig hash) so no fallback scan is needed.
2510fn resolve_kernel_source_dir_with_cache(
2511 id: &crate::kernel_path::KernelId,
2512 cache: &crate::cache::CacheDir,
2513) -> Option<std::path::PathBuf> {
2514 use crate::kernel_path::KernelId;
2515 match id {
2516 KernelId::Version(ver) => {
2517 let arch = std::env::consts::ARCH;
2518 let tarball_key = format!("{ver}-tarball-{arch}-kc{}", crate::cache_key_suffix());
2519 if let Some(entry) = cache.lookup(&tarball_key)
2520 && let crate::cache::KernelSource::Local {
2521 source_tree_path: Some(p),
2522 ..
2523 } = &entry.metadata.source
2524 {
2525 return Some(p.clone());
2526 }
2527 let entries = cache.list().ok()?;
2528 for listed in entries {
2529 let crate::cache::ListedEntry::Valid(entry) = listed else {
2530 continue;
2531 };
2532 if entry.metadata.version.as_deref() != Some(ver.as_str()) {
2533 continue;
2534 }
2535 if let crate::cache::KernelSource::Local {
2536 source_tree_path: Some(p),
2537 ..
2538 } = &entry.metadata.source
2539 {
2540 return Some(p.clone());
2541 }
2542 }
2543 None
2544 }
2545 KernelId::CacheKey(k) => {
2546 let entry = cache.lookup(k)?;
2547 match entry.metadata.source {
2548 crate::cache::KernelSource::Local {
2549 source_tree_path: Some(ref p),
2550 ..
2551 } => Some(p.clone()),
2552 _ => None,
2553 }
2554 }
2555 // Path / Range / Git callers do not reach this helper —
2556 // resolve_kernel_source_dir handles them inline. Defensive
2557 // None covers any future caller that adds a new arm.
2558 _ => None,
2559 }
2560}
2561
2562/// The kernel commit recorded in a sidecar: the env fast-path first,
2563/// then the in-process gix walk.
2564///
2565/// cargo-ktstr pre-probes every resolved kernel's HEAD once and exports
2566/// a `dir=commit;...` map in [`crate::KTSTR_KERNEL_COMMIT_ENV`], keyed
2567/// by the dir it also exports as `KTSTR_KERNEL`. This process looks
2568/// itself up by its own [`crate::ktstr_kernel_env`] value — string-equal
2569/// to the map key by construction, since cargo-ktstr built both from the
2570/// same resolved dir. A hit skips `detect_kernel_commit`'s gix HEAD +
2571/// dirty-walk, which is memoized per process but NOT across the per-test
2572/// nextest processes (so without the map each of N processes re-pays
2573/// it).
2574///
2575/// Keying on `ktstr_kernel_env()` (the raw `KTSTR_KERNEL`) rather than
2576/// on `resolve_kernel_source_dir()` is deliberate — that is exactly the
2577/// key cargo-ktstr used. The map's commit VALUE matches this function's
2578/// own fallback because cargo-ktstr computes it via the SAME resolution
2579/// (`source_dir_for` then `detect_kernel_commit`); a kernel with no
2580/// recoverable source tree is simply absent from the map, so the miss
2581/// falls through to the identical resolve-and-walk here.
2582///
2583/// Miss / absent env / empty commit → the walk. Optimization only.
2584fn kernel_commit_for_sidecar() -> Option<String> {
2585 if let Some(self_dir) = crate::ktstr_kernel_env()
2586 && let Ok(raw) = std::env::var(crate::KTSTR_KERNEL_COMMIT_ENV)
2587 {
2588 for seg in raw.split(';') {
2589 if let Some((dir, commit)) = seg.rsplit_once('=')
2590 && dir == self_dir
2591 && !commit.is_empty()
2592 {
2593 return Some(commit.to_string());
2594 }
2595 }
2596 }
2597 resolve_kernel_source_dir().and_then(|d| detect_kernel_commit(&d))
2598}
2599
2600/// Compute a stable 64-bit discriminator over the fields that
2601/// distinguish gauntlet variants of the same test. Used to suffix
2602/// the sidecar filename so concurrent variants do not clobber each
2603/// other's output.
2604///
2605/// Uses [`siphasher::sip::SipHasher13`] with zero keys for the same
2606/// cross-toolchain stability reason as the other zero-keyed
2607/// SipHasher13 sites (`build.rs`, `runtime.rs` `content_hash`) —
2608/// the discriminator
2609/// must be the same across Rust toolchain versions or downstream
2610/// tooling that groups variants by filename breaks.
2611///
2612/// # Host-state collision caveat
2613///
2614/// The hash is over test-identity fields (topology, scheduler,
2615/// payload, work_type, sysctls, kargs) — NOT over
2616/// [`crate::host_context::HostContext`], NOT over `scheduler_commit`, NOT over
2617/// `project_commit`, NOT over `kernel_commit`, NOT over
2618/// `run_source`, NOT over `resolve_source`, and NOT over
2619/// `cpu_budget` / `vcpus`. The
2620/// [`crate::host_context::HostContext`] exclusion is pinned by
2621/// `sidecar_variant_hash_excludes_host_context`; the
2622/// `scheduler_commit` exclusion by
2623/// `sidecar_variant_hash_excludes_scheduler_commit`; the
2624/// `project_commit` exclusion by
2625/// `sidecar_variant_hash_excludes_project_commit`; the
2626/// `kernel_commit` exclusion by
2627/// `sidecar_variant_hash_excludes_kernel_commit`; the
2628/// `run_source` exclusion by
2629/// `sidecar_variant_hash_excludes_run_source`; the
2630/// `resolve_source` exclusion by
2631/// `sidecar_variant_hash_excludes_resolve_source`; the
2632/// `cpu_budget` / `vcpus` exclusion by
2633/// `sidecar_variant_hash_excludes_cpu_budget`. All seven are
2634/// deliberate for the same cross-host grouping reason — a
2635/// gauntlet rebuilt against a different userspace scheduler
2636/// commit, a bumped ktstr checkout, a kernel source tree at a
2637/// different HEAD, a different CI runner / developer machine, a
2638/// run that resolved its scheduler via a different discovery
2639/// path, or a run that confined its vCPUs to a different
2640/// host-CPU budget must still bucket with the same-named variant so
2641/// `compare_partitions` can diff two runs of the "same" test
2642/// without the commit hash, run-source tag, or budget shattering
2643/// them into one-row-per-commit islands. `cpu_budget` / `vcpus`
2644/// are instead surfaced as the [`crate::stats::Dimension::CpuBudget`]
2645/// pairing axis, which separates cross-budget runs at compare time
2646/// rather than at the identity bucket. Callers that want to detect
2647/// a commit drift or compare across run environments inspect
2648/// [`SidecarResult::scheduler_commit`] /
2649/// [`SidecarResult::project_commit`] /
2650/// [`SidecarResult::kernel_commit`] /
2651/// [`SidecarResult::run_source`] /
2652/// [`SidecarResult::resolve_source`] directly (via
2653/// `--project-commit` / `--kernel-commit` / `--run-source` /
2654/// `--resolve-source` on `perf-delta`); the filename stays stable
2655/// across commits and run environments by design.
2656///
2657/// The corollary of the HostContext exclusion: if the host's
2658/// observable state mutates mid-suite — NUMA hotplug, hugepage
2659/// reconfiguration, a `sysctl -w` from a parallel process — two
2660/// runs of the same test will produce the same sidecar filename
2661/// and the later write clobbers the earlier. ktstr treats host
2662/// state as stable-enough for a single suite run; callers
2663/// mutating host state during a run own the ordering themselves
2664/// (e.g. by writing to a different `KTSTR_SIDECAR_DIR` per host
2665/// snapshot).
2666/// The single canonical-JSON + siphash site for the variant hash.
2667///
2668/// [`sidecar_variant_hash`] (from a written [`SidecarResult`]) and
2669/// [`variant_hash_from_parts`] (from a test entry + resolved topology +
2670/// work_type, before any sidecar exists) both route through this so the
2671/// two derivations can never drift. `sysctls`/`kargs` are sorted here
2672/// for order-independence.
2673fn variant_hash_of(
2674 topology: &str,
2675 scheduler: &str,
2676 payload: Option<&str>,
2677 work_type: &str,
2678 sysctls: &[String],
2679 kargs: &[String],
2680) -> u64 {
2681 use siphasher::sip::SipHasher13;
2682 use std::hash::Hasher;
2683 let mut sorted_sysctls = sysctls.to_vec();
2684 sorted_sysctls.sort();
2685 let mut sorted_kargs = kargs.to_vec();
2686 sorted_kargs.sort();
2687 let canonical = serde_json::json!({
2688 "topology": topology,
2689 "scheduler": scheduler,
2690 "payload": payload,
2691 "work_type": work_type,
2692 "sysctls": sorted_sysctls,
2693 "kargs": sorted_kargs,
2694 });
2695 let bytes = serde_json::to_vec(&canonical).expect("json serialization cannot fail for strings");
2696 let mut h = SipHasher13::new_with_keys(0, 0);
2697 h.write(&bytes);
2698 h.finish()
2699}
2700
2701pub(crate) fn sidecar_variant_hash(sidecar: &SidecarResult) -> u64 {
2702 variant_hash_of(
2703 &sidecar.topology,
2704 &sidecar.scheduler,
2705 sidecar.payload.as_deref(),
2706 &sidecar.work_type,
2707 &sidecar.sysctls,
2708 &sidecar.kargs,
2709 )
2710}
2711
2712/// The variant hash for a test entry's run at a given resolved topology
2713/// and `work_type`, computed BEFORE any sidecar exists — the
2714/// failure-dump path (and the Ctx/VmResult `variant_hash` stamp) need
2715/// the identity at VM-build time. Mirrors [`write_sidecar`]'s field
2716/// derivation (topology = the resolved topology, scheduler/sysctls/kargs
2717/// = [`scheduler_fingerprint`], payload = `entry.payload`) so the dump
2718/// filename carries the SAME variant hash the sidecar will. Pinned
2719/// equal to [`sidecar_variant_hash`] by a roundtrip test.
2720pub(crate) fn variant_hash_from_parts(
2721 entry: &KtstrTestEntry,
2722 resolved_topology: &crate::vmm::topology::Topology,
2723 work_type: &str,
2724) -> u64 {
2725 let fp = scheduler_fingerprint(entry);
2726 variant_hash_of(
2727 &resolved_topology.to_string(),
2728 &fp.scheduler,
2729 entry.payload.map(|p| p.name),
2730 work_type,
2731 &fp.sysctls,
2732 &fp.kargs,
2733 )
2734}
2735
/// Entry-derived scheduler metadata that every sidecar carries
/// regardless of pass/fail/skip.
///
/// Produced by [`scheduler_fingerprint`]. Both write paths
/// ([`write_sidecar`] and [`write_skip_sidecar`]) thread the same
/// materialized fields through to their `SidecarResult`
/// constructors. Grouping them in a named struct (rather than a
/// 4-tuple) means a new scheduler-level field shows up as a named
/// field at both writer sites and in every call-site binding,
/// instead of as one more anonymous tuple slot whose position
/// readers have to remember.
///
/// `pub(crate)` rather than `pub`: the intermediate struct is a
/// write-path detail, not a public API surface. No serde — this
/// is not a persisted shape, just a grouped return value.
///
/// Derives:
/// - `Debug` for `assert_eq!` diagnostics,
/// - `Clone` so tests can materialize a fixture once and reuse it
///   across assertions,
/// - `PartialEq`/`Eq` so tests can compare whole fingerprints in one
///   statement rather than destructuring and asserting per field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct SchedulerFingerprint {
    /// Pretty scheduler name (matches `SidecarResult::scheduler`),
    /// e.g. `"eevdf"` or a scheduler-kind payload's declared name.
    /// Feeds the variant hash.
    pub(crate) scheduler: String,
    /// Best-effort userspace scheduler commit; `None` for every
    /// current variant per
    /// [`crate::test_support::SchedulerSpec::scheduler_commit`].
    /// Not part of the variant hash.
    pub(crate) scheduler_commit: Option<String>,
    /// Formatted `sysctl.<key>=<value>` lines derived from the
    /// scheduler's declared `sysctls()`. Feeds the variant hash
    /// (sorted there, so order here is not significant to it).
    pub(crate) sysctls: Vec<String>,
    /// Kernel command-line args declared by the scheduler,
    /// forwarded verbatim. Feeds the variant hash (sorted there).
    pub(crate) kargs: Vec<String>,
}
2773
2774/// Materialize the [`SchedulerFingerprint`] for a test entry.
2775///
2776/// A change to the sidecar schema (e.g. a new scheduler-level
2777/// field) extends this function + [`SchedulerFingerprint`] in
2778/// one place and every writer picks it up automatically.
2779fn scheduler_fingerprint(entry: &KtstrTestEntry) -> SchedulerFingerprint {
2780 let scheduler = entry.scheduler.name.to_string();
2781 // `SchedulerSpec::scheduler_commit()` returns `None` for every
2782 // variant (Eevdf, Discover, Path, KernelBuiltin) — the commit
2783 // string is not carried in the static spec; it comes from the
2784 // sidecar's run-time git probe instead. This call is here only
2785 // to surface the slot in the fingerprint so a future spec
2786 // variant carrying a commit would flow through automatically.
2787 let scheduler_commit = entry
2788 .scheduler
2789 .binary
2790 .scheduler_commit()
2791 .map(|s| s.to_string());
2792 let sysctls: Vec<String> = entry
2793 .scheduler
2794 .sysctls
2795 .iter()
2796 .map(|s| format!("sysctl.{}={}", s.key(), s.value()))
2797 .collect();
2798 let kargs: Vec<String> = entry
2799 .scheduler
2800 .kargs
2801 .iter()
2802 .map(|s| s.to_string())
2803 .collect();
2804 SchedulerFingerprint {
2805 scheduler,
2806 scheduler_commit,
2807 sysctls,
2808 kargs,
2809 }
2810}
2811
2812/// Compute the per-variant sidecar path and serialize + write the
2813/// result to disk.
2814///
2815/// Gauntlet variants of the same test differ by work_type, flags
2816/// (via scheduler args → sysctls/kargs), scheduler, and topology. A
2817/// filename of just `{test_name}.ktstr.json` causes variants to
2818/// overwrite each other, erasing all but the last-written result.
2819/// `sidecar_variant_hash` hashes the discriminating fields into a
2820/// short stable suffix so each variant gets its own sidecar file.
2821///
2822/// On the first call PER UNIQUE DIRECTORY within a process,
2823/// [`pre_clear_run_dir_once`] removes any pre-existing
2824/// `*.ktstr.json` files in the resolved directory so the run is a
2825/// clean snapshot rather than a mosaic of sidecars carried over
2826/// from a prior invocation that shared the same
2827/// `{kernel}-{project_commit}` key (e.g. re-running the suite
2828/// without committing changes).
2829/// Subsequent writes within the same process to the same directory
2830/// append into the cleared directory.
2831///
2832/// Pre-clear is SKIPPED when `KTSTR_SIDECAR_DIR` is set: the
2833/// operator chose that directory and owns its contents — silent
2834/// data loss is not acceptable on an explicit override. When the
2835/// override is unset (the default-path branch),
2836/// `std::fs::create_dir_all` materializes the directory BEFORE
2837/// pre-clear runs so the helper's canonicalize step always sees
2838/// an existing on-disk path; without this ordering, a missing
2839/// dir on the very first call would key the cache against the
2840/// raw path while a later call (after the dir exists) would key
2841/// against the canonicalized absolute path, splitting the cache
2842/// and causing the second call to re-fire pre-clear and wipe the
2843/// first call's sidecars.
2844///
2845/// CROSS-PROCESS SERIALIZATION: on the default path (override
2846/// unset), the call acquires advisory `LOCK_EX` on a per-run-key
2847/// sentinel file (`{runs_root}/.locks/{key}.lock`) before
2848/// pre-clear runs and holds it for the duration of the
2849/// pre-clear + serialize + write cycle. The lock prevents
2850/// process B's `pre_clear_run_dir_once` from interleaving with
2851/// process A's mid-write `std::fs::write` — the kernel-flock
2852/// critical section makes the (read_dir + remove_file) +
2853/// (serialize + write) sequence atomic with respect to peer
2854/// processes targeting the same `{kernel}-{project_commit}`
2855/// directory. The override path skips the lock for the same
2856/// reason it skips pre-clear: operator-chosen directories are
2857/// owned by the operator, so we do not place a `.locks/` sibling
2858/// inside (or above) their custom layout.
2859///
2860/// EX-around-the-whole-cycle (not just pre-clear) is the correct
2861/// choice: it makes the (read_dir + remove_file) + (serialize +
2862/// write) sequence atomic against concurrent peers, so no peer
2863/// observes a half-cleared directory or a mid-write sidecar.
2864///
2865/// A later peer process still RUNS its own pre-clear (its
2866/// `OnceLock` is process-local), but `pre_clear_run_dir_once` skips
2867/// the wipe when the dir's `.ktstr_run_epoch` sentinel already
2868/// records this session's [`crate::KTSTR_RUN_EPOCH_ENV`] token (a
2869/// peer cleared it earlier this session), sparing every
2870/// `{test}-{hash}.ktstr.json` written THIS session. Without that
2871/// sentinel a later peer's pre-clear would delete an earlier peer's
2872/// freshly-written sidecar — silent stats loss; the session token
2873/// closes that window. (Raw `cargo nextest run` sets no token, so
2874/// its peers fall back to wipe-everything and the loss can recur —
2875/// the orchestrated path is the supported one.)
2876///
2877/// PER-FILE ATOMICITY (both branches): the JSON is written to a
2878/// `<final>.tmp.<pid>.<run_id>` sibling and then `rename(2)`'d into
2879/// place. POSIX `rename` is atomic for same-directory destinations,
2880/// so a peer reader (`collect_sidecars`) never observes a partial
2881/// JSON payload — either the old contents stay or the new contents
2882/// replace them in one filesystem step. Two concurrent writers that
2883/// both target the same `{test_name}-{variant_hash}.ktstr.json`
2884/// (override path: two CI jobs sharing one operator-chosen dir;
2885/// default path: a torn-write window inside the flock body that the
2886/// flock would otherwise have to cover) cannot leave a half-written
2887/// JSON behind — last-rename-wins, both files are individually
2888/// well-formed. The `.tmp.<pid>.<run_id>` discriminator on the
2889/// staging name keeps two writers from racing on the same staging
2890/// path even when their final destinations collide. The flock on
2891/// the default path remains load-bearing for the pre-clear leg
2892/// (atomic write only protects the write itself, not the
2893/// `read_dir + remove_file` walk that pre-clear runs).
2894///
2895/// `label` is a caller-supplied noun for the context message ("skip
2896/// sidecar" / "sidecar") so the error chain points at the right call
2897/// site.
2898fn serialize_and_write_sidecar(sidecar: &SidecarResult, label: &str) -> anyhow::Result<()> {
2899 // Read the override ONCE. The two branches below carry the
2900 // result through structurally so neither leg re-reads
2901 // `KTSTR_SIDECAR_DIR` — preventing the override from flipping
2902 // mid-call (which would otherwise let an external mutation
2903 // between the dir resolve and the pre-clear gate either skip
2904 // the wipe on a default-path dir or fire a wipe on an
2905 // operator-chosen one).
2906 let (dir, do_pre_clear) = match sidecar_dir_override() {
2907 Some(path) => (path, false),
2908 None => (resolve_default_sidecar_dir(), true),
2909 };
2910 // Materialize the directory FIRST so `pre_clear_run_dir_once`
2911 // can canonicalize a path that exists on disk. Without this,
2912 // the very first invocation in a process resolves the cache
2913 // key against the raw relative path (canonicalize fails on a
2914 // missing dir, falls back to raw); subsequent invocations
2915 // resolve against the canonicalized absolute path because the
2916 // dir now exists. Two distinct keys for the same logical dir
2917 // → second invocation re-fires pre-clear and wipes the first
2918 // invocation's sidecars. Materializing pre-pre-clear closes
2919 // the relative-vs-absolute split.
2920 std::fs::create_dir_all(&dir)
2921 .with_context(|| format!("create sidecar dir {}", dir.display()))?;
2922 // Acquire the per-run-key cross-process flock for the duration
2923 // of the pre-clear + write cycle. The override branch (operator-
2924 // chosen directory) skips the lock for the same reason it skips
2925 // pre-clear — see the function-level doc. `_run_dir_lock` is
2926 // scoped to this function body so the kernel-side flock releases
2927 // via `OwnedFd::drop` when the function returns (success or
2928 // error path), making the lock RAII-managed without an explicit
2929 // unlock call.
2930 let _run_dir_lock = if do_pre_clear {
2931 Some(acquire_run_dir_flock(&dir)?)
2932 } else {
2933 None
2934 };
2935 if do_pre_clear {
2936 pre_clear_run_dir_once(&dir);
2937 }
2938 let variant_hash = sidecar_variant_hash(sidecar);
2939 let path = dir.join(format!(
2940 "{}-{:016x}.ktstr.json",
2941 sidecar.test_name, variant_hash
2942 ));
2943 let json = serde_json::to_string_pretty(sidecar)
2944 .with_context(|| format!("serialize {label} for '{}'", sidecar.test_name))?;
2945 // Atomic write: stage into a `.tmp.<pid>.<run_id>` sibling and
2946 // rename(2) into the final path. `rename` is atomic for
2947 // same-directory destinations on every filesystem ktstr supports
2948 // (ext4, btrfs, xfs, tmpfs, overlayfs); a peer reader never
2949 // observes a partial payload. The staging name carries the pid
2950 // AND the unique sidecar `run_id` so two writers in the same
2951 // process targeting identical final paths (e.g. two threads in
2952 // the budget-test stdout-capture path) cannot stomp each other's
2953 // staging file before either rename lands. On rename failure the
2954 // staging file is removed so a partial sidecar does not survive
2955 // as garbage in the run dir; rename success consumes the staging
2956 // entry and there is nothing to clean up.
2957 let pid = std::process::id();
2958 let staging = dir.join(format!(
2959 "{}-{:016x}.ktstr.json.tmp.{pid}.{}",
2960 sidecar.test_name, variant_hash, sidecar.run_id,
2961 ));
2962 std::fs::write(&staging, &json)
2963 .with_context(|| format!("write {label} staging {}", staging.display()))?;
2964 if let Err(e) = std::fs::rename(&staging, &path) {
2965 // Best-effort cleanup of the staged payload; ignore the
2966 // unlink error so the rename failure is what surfaces
2967 // (the rename error names the actual problem).
2968 let _ = std::fs::remove_file(&staging);
2969 return Err(anyhow::Error::from(e).context(format!(
2970 "rename {label} staging {} -> {}",
2971 staging.display(),
2972 path.display(),
2973 )));
2974 }
2975 LAST_SIDECAR_PATH.with(|p| *p.borrow_mut() = Some(path.clone()));
2976 Ok(())
2977}
2978
thread_local! {
    /// Path of the most recent sidecar this thread wrote (via
    /// [`serialize_and_write_sidecar`]), or `None` when nothing has
    /// been written since the last take.
    ///
    /// The stored value is the final `{dir}/{test}-{hash}.ktstr.json`
    /// path exactly as the writer joined it; it is not canonicalized,
    /// so it is absolute only when the resolved sidecar directory is.
    ///
    /// The dispatch run loop
    /// ([`crate::test_support::eval::run_ktstr_test_inner`]) reads and
    /// clears it after the run to finalize the persisted verdict to the
    /// test's FINAL (post-inversion) outcome. nextest is process-per-test
    /// so a run writes one sidecar; a value left from an earlier phase is
    /// overwritten by the current write, so the take always yields this
    /// run's sidecar.
    static LAST_SIDECAR_PATH: std::cell::RefCell<Option<PathBuf>> =
        const { std::cell::RefCell::new(None) };
}
2991
2992/// Take (read + clear) the path of the sidecar most recently written on
2993/// this thread, or `None` when no sidecar was written this run (an
2994/// early bail before any write). See [`LAST_SIDECAR_PATH`].
2995///
2996/// MUST be drained exactly once per run — `run_ktstr_test_inner` does
2997/// this after each dispatch. The thread-local persists across calls in
2998/// a process, so a caller that writes a sidecar WITHOUT a following take
2999/// would leave a stale path for the next take to consume; in practice
3000/// only `run_ktstr_test_inner` pairs a write with a take, and a stale
3001/// path points at a dropped tempdir so the finalize read fails benignly.
3002pub(crate) fn take_last_sidecar_path() -> Option<PathBuf> {
3003 LAST_SIDECAR_PATH.with(|p| p.borrow_mut().take())
3004}
3005
3006/// Overwrite a written sidecar's verdict bits with the test's FINAL
3007/// (post-inversion) `(passed, skipped, inconclusive)` outcome — see
3008/// [`crate::test_support::dispatch::Verdict::sidecar_bits`] — and set
3009/// [`SidecarResult::expected_failure`] when an actual scenario
3010/// failure/inconclusive was inverted to a pass/skip. Rewrites the file
3011/// atomically (temp + rename).
3012///
3013/// A no-op when the final verdict already matches what was persisted (an
3014/// ordinary pass/fail/skip — no `expect_err`/`expect_auto_repro`
3015/// inversion). Best-effort: a read/parse/serialize/write error is
3016/// surfaced on stderr and swallowed so the raw sidecar stands (the
3017/// footer then falls back to it) rather than failing the run.
3018pub(crate) fn finalize_sidecar_verdict(
3019 path: &std::path::Path,
3020 passed: bool,
3021 skipped: bool,
3022 inconclusive: bool,
3023) {
3024 let Ok(json) = std::fs::read_to_string(path) else {
3025 return;
3026 };
3027 let Ok(mut sc) = serde_json::from_str::<SidecarResult>(&json) else {
3028 eprintln!(
3029 "ktstr: finalize_sidecar_verdict: unparseable sidecar {}",
3030 path.display()
3031 );
3032 return;
3033 };
3034 // The run's telemetry is failure-mode-dominated when its scenario
3035 // actually failed/was-inconclusive but the final verdict is a
3036 // pass/skip (an inversion) — `perf-delta` excludes such rows.
3037 let raw_failed = sc.is_fail() || sc.is_inconclusive();
3038 let expected_failure = raw_failed && (passed || skipped);
3039 if sc.passed == passed
3040 && sc.skipped == skipped
3041 && sc.inconclusive == inconclusive
3042 && sc.expected_failure == expected_failure
3043 {
3044 return;
3045 }
3046 sc.passed = passed;
3047 sc.skipped = skipped;
3048 sc.inconclusive = inconclusive;
3049 sc.expected_failure = expected_failure;
3050 let Ok(out) = serde_json::to_string_pretty(&sc) else {
3051 return;
3052 };
3053 // Stage with a `.ktstr.json.tmp.…` suffix (append, NOT
3054 // `with_extension`, which would drop `.json`) so a hard-crash orphan
3055 // — write succeeded but rename did not — is reaped by
3056 // `pre_clear_run_dir_once` via `is_sidecar_staging_filename`, the
3057 // same way the primary write's staging file is.
3058 let pid = std::process::id();
3059 let mut staging = path.as_os_str().to_owned();
3060 staging.push(format!(".tmp.finalize.{pid}"));
3061 let staging = std::path::PathBuf::from(staging);
3062 if std::fs::write(&staging, &out).is_ok() && std::fs::rename(&staging, path).is_err() {
3063 let _ = std::fs::remove_file(&staging);
3064 }
3065}
3066
3067/// Remove the failure-dump artifacts
3068/// (`{test}-{variant_hash}.failure-dump.json` and
3069/// `{test}-{variant_hash}.repro.failure-dump.json`) for `test_name` in
3070/// the current sidecar dir.
3071///
3072/// Called when a run's FINAL outcome is a pass/skip but it wrote NO
3073/// sidecar — the run crashed before the guest produced a parseable
3074/// result (e.g. an `expect_err` test with a host-triggered BPF crash),
3075/// so [`finalize_sidecar_verdict`] had nothing to finalize. The freeze
3076/// coordinator wrote the dump unconditionally; without a sidecar to mark
3077/// the pass, the footer's dump-only trigger
3078/// ([`summarize_one_run_dir`] flags a dump with no parsed sidecar) would
3079/// surface this PASSING test as FAILED. Removing the dump keeps the
3080/// footer consistent with nextest's pass. Best-effort: a missing dump
3081/// (the normal clean-pass case) is fine. A genuine pre-sidecar failure
3082/// (final = Fail) does NOT call this, so its dump still flags.
3083pub(crate) fn suppress_failure_dumps(test_name: &str, variant_hash: u64) {
3084 let dir = sidecar_dir();
3085 // Remove THIS variant's dumps by the precise `{test}-{hash}` key, not
3086 // a `{test}-*` glob: a glob would also delete a SIBLING gauntlet
3087 // preset's legitimately-failing dump in the same run dir.
3088 for suffix in [".failure-dump.json", ".repro.failure-dump.json"] {
3089 let _ = std::fs::remove_file(dir.join(format!("{test_name}-{variant_hash:016x}{suffix}")));
3090 }
3091}
3092
3093/// `Some(path)` when `KTSTR_SIDECAR_DIR` is set non-empty,
3094/// returning the override path verbatim; `None` when the env
3095/// var is unset or empty (default-path branch). Single source
3096/// of truth for the override read so [`sidecar_dir`] and
3097/// [`serialize_and_write_sidecar`] (which gates pre-clear on
3098/// the override's presence) share one env-read site rather
3099/// than each calling `std::env::var` independently.
3100///
3101/// The `is_empty()` filter is deliberate: a defensively-cleared
3102/// `KTSTR_SIDECAR_DIR=""` must NOT be treated as an override
3103/// (joining an empty path onto the run-root would silently
3104/// alias the runs-root itself, contaminating the listing).
3105/// Empty-string aliases unset, matching the
3106/// `if let Ok(d) ... && !d.is_empty()` predicate the function
3107/// replaced.
3108///
3109/// `serialize_and_write_sidecar` interprets `Some(_)` as the
3110/// "operator chose this dir, do not pre-clear" gate — silent
3111/// data loss is unacceptable on an explicit override (the
3112/// override is for users who want exact control over where
3113/// sidecars land: test isolation, archival capture, custom CI
3114/// layouts).
3115fn sidecar_dir_override() -> Option<PathBuf> {
3116 std::env::var(crate::KTSTR_SIDECAR_DIR_ENV)
3117 .ok()
3118 .filter(|d| !d.is_empty())
3119 .map(PathBuf::from)
3120}
3121
3122/// Emit a one-shot stderr warning when [`detect_project_commit`]
3123/// resolves to `None` and the run directory therefore lands at
3124/// `{kernel}-unknown`. Operators in this state lose the
3125/// `{project_commit}` discriminator on the run-directory name —
3126/// every non-git invocation at the same kernel collides on a
3127/// single directory, with the latest run pre-clearing the
3128/// previous one's sidecars. The warning surfaces this loss-of-isolation
3129/// risk so the operator can either set `KTSTR_SIDECAR_DIR` to
3130/// disambiguate per-run, or place the project tree under git
3131/// so each run carries its own commit hash.
3132///
3133/// `OnceLock<()>` gates the warning to fire EXACTLY ONCE per
3134/// process: every gauntlet variant resolves a sidecar directory
3135/// independently (via [`sidecar_dir`] and
3136/// [`serialize_and_write_sidecar`]), so without the gate the
3137/// operator would see thousands of duplicate warnings interleaved
3138/// with test output. Called via [`resolve_default_sidecar_dir`] —
3139/// which is the shared default-path body that both [`sidecar_dir`]
3140/// and [`serialize_and_write_sidecar`] funnel through — so the
3141/// warning fires only on the default-path branch. The override
3142/// branch in either caller returns before
3143/// [`resolve_default_sidecar_dir`] is reached, so an operator who
3144/// set `KTSTR_SIDECAR_DIR` to disambiguate non-git runs does not
3145/// see a misleading "commit unknown" warning that does not apply
3146/// to their effective directory layout.
3147///
3148/// Implementation is split into a public-facing wrapper
3149/// (this function) that owns the process-global `OnceLock` and
3150/// targets stderr, and a pure inner helper
3151/// [`warn_unknown_project_commit_inner`] that takes the
3152/// `&OnceLock<()>` gate and the `&mut dyn Write` sink as
3153/// parameters. The split lets tests drive the warning logic
3154/// against a local `OnceLock` and a `Vec<u8>` sink without
3155/// fighting the process-global gate or the global stderr fd —
3156/// the wrapper's behavior is what the inner does, just with
3157/// the static gate and stderr supplied.
3158fn warn_unknown_project_commit_once() {
3159 static WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
3160 let mut sink = std::io::stderr();
3161 warn_unknown_project_commit_inner(&WARNED, &mut sink);
3162}
3163
/// Gate-and-sink core of [`warn_unknown_project_commit_once`]: writes
/// the warning line to `sink` the first time it is called with a given
/// `gate`, and does nothing on later calls with that same gate.
///
/// Both parameters are borrowed so each call site picks the lifetime
/// that matches its gating story:
/// - The production wrapper passes a `'static` `OnceLock<()>` so the
///   gate spans the whole process, and a stderr handle so the warning
///   lands in the operator's terminal.
/// - Tests pass a local `OnceLock<()>` so each test gets a fresh gate
///   (no cross-test contamination via a process-global), and a
///   `Vec<u8>` sink so the emitted bytes can be read back and asserted
///   on.
///
/// A `writeln!` error is discarded via `let _ =`: a metadata probe
/// warning must not gate sidecar writes. This DEPARTS from the previous
/// `eprintln!` semantics, which panic on stderr write failure per the
/// std docs.
fn warn_unknown_project_commit_inner(
    gate: &std::sync::OnceLock<()>,
    sink: &mut dyn std::io::Write,
) {
    // Runs at most once per gate; the gate is marked initialized once
    // this closure has returned.
    let emit_warning = || {
        let _ = writeln!(
            sink,
            "ktstr: WARNING: project commit unavailable (cwd not in a git \
             repo, or HEAD unreadable); runs at this kernel overwrite \
             each other in target/ktstr/{{kernel}}-unknown/. Set \
             KTSTR_SIDECAR_DIR=<unique-path> per run, or run from inside a \
             git repo with at least one commit."
        );
    };
    gate.get_or_init(emit_warning);
}
3198
3199/// Remove PRIOR-SESSION `*.ktstr.json` files (and orphaned staging
3200/// files) in the resolved run directory, exactly once per unique
3201/// directory per process.
3202///
3203/// "Prior-session" is gated on the [`crate::KTSTR_RUN_EPOCH_ENV`]
3204/// session token: when set (the orchestrated `cargo ktstr test`
3205/// path) the first process to clear a dir records the token in the
3206/// `.ktstr_run_epoch` sentinel, and a later peer process whose token
3207/// matches SKIPS the wipe entirely — sparing every sidecar this
3208/// session's peers wrote (nextest is process-per-test). A
3209/// differing/absent sentinel (new session, or raw `cargo nextest
3210/// run` with no token) wipes every `*.ktstr.json` match and records
3211/// the token — see the CONCURRENT WRITERS (cross-process) section.
3212///
3213/// The run-key format is `{kernel}-{project_commit}` (see
3214/// [`sidecar_dir`]), so two `cargo ktstr test` invocations sharing
3215/// the same kernel and project commit (the typical "re-run the
3216/// suite without committing changes" loop) resolve to the same
3217/// directory. Without
3218/// pre-clearing, each subsequent run would land its sidecars next
3219/// to the previous run's, leaving downstream `cargo ktstr stats`
3220/// readers to see a mosaic of two distinct test outcomes for the
3221/// same variant — the variant-hash suffix on each filename
3222/// prevents overwrites within a single run, but ALSO prevents the
3223/// next run from naturally clobbering the previous one's files
3224/// when the test set or pass/fail mix changes. Wiping
3225/// `*.ktstr.json` once at first-write makes each run a clean
3226/// snapshot of (kernel, project commit) — last-SESSION-wins (a new
3227/// session's full sidecar set replaces the prior session's, while
3228/// peers within one session coexist via the epoch gate).
3229///
3230/// PER-DIRECTORY KEYING: the cache is a `Mutex<HashSet<PathBuf>>`
3231/// keyed on the canonicalized `dir` (with raw `dir` as fallback
3232/// when canonicalize fails — e.g. the directory does not yet
3233/// exist). A `OnceLock<()>` would fire once for the FIRST
3234/// directory only, leaving subsequent writes to other directories
3235/// unprotected. The HashSet ensures every distinct directory the
3236/// process writes to gets pre-cleared exactly once, regardless of
3237/// ordering. Canonicalization collapses symlink aliases so two
3238/// path spellings of the same on-disk dir share one entry.
3239///
3240/// In production today only the default-path
3241/// `runs_root().join({kernel}-{project_commit})` is fed into this
3242/// function (the override path skips pre-clear entirely via
3243/// [`sidecar_dir_override`]), so per-process cache size
3244/// stays at exactly 1 entry. The HashSet shape is the
3245/// future-proof keying for direct unit-test fixtures (which
3246/// rotate tempdir paths through this helper) and any future
3247/// production code path that writes default-path sidecars from
3248/// multiple distinct (kernel, commit) pairs in one process.
3249///
3250/// SCOPE: only `*.ktstr.json` sidecars and orphaned `.tmp` staging
3251/// files in the immediate directory are removed. Subdirectories
3252/// (per-job gauntlet layouts written by external orchestrators) and
3253/// non-sidecar files are left untouched — pre-clear is shallow. Note
3254/// that `collect_sidecars` walks one level of subdirectories, so
3255/// stale sidecars left in subdirectories from a prior run will still
3256/// appear in `cargo ktstr stats` output until the operator removes
3257/// them. The function never deletes the directory itself; production
3258/// callers (`serialize_and_write_sidecar`) materialize the directory
3259/// via `create_dir_all` BEFORE invoking this helper. Beyond the
3260/// wipe, the only other side effect is writing the `.ktstr_run_epoch`
3261/// session sentinel (when a token is set — see CONCURRENT WRITERS).
3262///
3263/// CONCURRENT WRITERS (intra-process): the per-process
3264/// `Mutex<HashSet>` guards against multiple writes within a single
3265/// process re-clearing the same directory. The cache mutex is held
3266/// ACROSS the `read_dir` walk and per-file removals — releasing it
3267/// after the cache insert but before the walk would open a TOCTOU
3268/// window where a sibling thread observes the cached entry, skips
3269/// its own pre-clear, writes a sidecar, and then the original
3270/// thread's still-pending walk deletes that sibling's fresh file.
3271/// Holding the lock across the bounded walk closes the window.
3272///
3273/// CONCURRENT WRITERS (cross-process): nextest is process-per-test,
3274/// so distinct `#[ktstr_test]` functions run as separate processes
3275/// sharing one `{kernel}-{project_commit}` dir. Each has its own
3276/// `OnceLock` and runs its own pre-clear. The
3277/// [`crate::KTSTR_RUN_EPOCH_ENV`] session token is what keeps a
3278/// later peer from deleting an earlier peer's fresh sidecar: the
3279/// first process records the token in the `.ktstr_run_epoch`
3280/// sentinel; a peer whose token matches SKIPS its wipe, sparing
3281/// every `{test}-{hash}.ktstr.json` this session wrote.
3282/// `serialize_and_write_sidecar`'s `LOCK_EX` serializes the
3283/// pre-clear+write cycle so the sentinel read/wipe/write is atomic
3284/// against peers — but serialization ALONE does NOT spare A's
3285/// already-written file from B's later wipe (B runs after A released
3286/// the lock); the sentinel does. Without a token (raw `cargo nextest
3287/// run`) peers fall back to wipe-everything and can lose each other's
3288/// sidecars — the orchestrated path is the supported one.
3289///
3290/// FAILURE: `read_dir` errors are silently ignored — defensive
3291/// behavior for direct callers (e.g. unit tests probing the
3292/// missing-dir edge); production callers materialize the
3293/// directory before invoking this helper, so the missing-dir
3294/// branch is unreachable in production today. Metadata probes
3295/// must not gate sidecar writes. Per-file `remove_file`
3296/// errors are also silently ignored — a partial pre-clear leaves
3297/// either an overwrite (when the new run reproduces a stale
3298/// file's exact `{test_name}-{variant_hash}.ktstr.json` name —
3299/// the desired outcome) or a coexistence (when the new run's
3300/// variant set differs from the prior run's, leaving stale
3301/// sidecars next to fresh ones — the undesired outcome that
3302/// pre-clear was meant to prevent). Coexistence is the acceptable
3303/// degradation here: a noisy pre-clear failure should not abort
3304/// the test run.
3305fn pre_clear_run_dir_once(dir: &std::path::Path) {
3306 use std::collections::HashSet;
3307 use std::path::PathBuf;
3308 use std::sync::{Mutex, OnceLock};
3309 static PRE_CLEARED: OnceLock<Mutex<HashSet<PathBuf>>> = OnceLock::new();
3310 // Canonicalize so two spellings of the same on-disk dir share
3311 // one cache entry. Falls back to the raw path when canonicalize
3312 // fails (the directory may not exist yet on the very first
3313 // write, in which case the raw path keys the entry; subsequent
3314 // calls with the same raw path also miss canonicalize the
3315 // same way and share the entry).
3316 let cache_key = dir.canonicalize().unwrap_or_else(|_| dir.to_path_buf());
3317 let cache = PRE_CLEARED.get_or_init(|| Mutex::new(HashSet::new()));
3318 let mut guard = cache.lock_unpoisoned();
3319 if guard.contains(&cache_key) {
3320 return;
3321 }
3322 // First time this directory has been seen — wipe sidecars while
3323 // the cache mutex is still held. Releasing the guard before the
3324 // read_dir walk would open a TOCTOU window: a sibling thread that
3325 // observes the now-cached entry would skip its own pre-clear,
3326 // proceed to write a sidecar, and the original thread's walk
3327 // (running after the drop) would then delete that sibling's
3328 // freshly-written file. The walk is one read_dir + a bounded
3329 // number of `*.ktstr.json` removals, so holding the lock across
3330 // it is brief; concurrent calls against DIFFERENT directories
3331 // serialize through this critical section but each does a small,
3332 // bounded amount of I/O, which is acceptable for a metadata
3333 // probe call pattern. The cache insert happens AFTER the wipe
3334 // completes (rather than before) so a panic mid-wipe does not
3335 // poison the cache with an entry whose wipe never actually ran.
3336 // The mutex itself enforces serialization across threads; the
3337 // entry only records "wipe completed for this dir" and must
3338 // never be observed without the wipe having succeeded. `guard`
3339 // is dropped at end-of-scope so the lock release happens after
3340 // the loop completes.
3341 let session_token = run_session_token();
3342 let sentinel = dir.join(SESSION_SENTINEL);
3343 if let Some(token) = &session_token
3344 && std::fs::read_to_string(&sentinel).is_ok_and(|recorded| recorded == *token)
3345 {
3346 // A peer test process in THIS session already cleared the dir
3347 // (the sentinel records the session token under the flock);
3348 // its and the other peers' current-session sidecars must
3349 // survive, so skip the wipe entirely. See CONCURRENT WRITERS.
3350 guard.insert(cache_key);
3351 return;
3352 }
3353 if let Ok(entries) = std::fs::read_dir(dir) {
3354 for entry in entries.flatten() {
3355 let path = entry.path();
3356 if !path.is_file() {
3357 continue;
3358 }
3359 // Two file shapes are reaped here (current-session peers
3360 // were already spared by the sentinel skip above, so a
3361 // file reaching this point is prior-session or orphaned
3362 // residue):
3363 // - `<test>-<hash>.ktstr.json` — sidecars from a PRIOR
3364 // session sharing this `{kernel}-{project_commit}` key.
3365 // - `<test>-<hash>.ktstr.json.tmp.<pid>.<run_id>` —
3366 // orphaned staging from a writer that died between
3367 // `write` and `rename` in `serialize_and_write_sidecar`
3368 // (`is_sidecar_filename` excludes these — the extension
3369 // is `<run_id>`, not `json` — so the staging sweep is
3370 // what reaps them). The flock makes reaping an in-flight
3371 // stage impossible: a live peer holds the lock we hold.
3372 if is_sidecar_filename(&path) || is_sidecar_staging_filename(&path) {
3373 let _ = std::fs::remove_file(&path);
3374 }
3375 }
3376 }
3377 // Record this session's token so peer processes skip re-wiping.
3378 // Best-effort: if the write fails, a later peer won't see the
3379 // token and re-wipes (the pre-fix behavior) — no worse than the
3380 // unfixed code, just the cross-test loss left unfixed for this
3381 // dir. Written AFTER the wipe so a crash mid-wipe leaves no
3382 // stale sentinel falsely claiming the dir was cleared.
3383 if let Some(token) = &session_token {
3384 let _ = std::fs::write(&sentinel, token);
3385 }
3386 // Record completion AFTER the wipe finishes, not before. If a
3387 // panic interrupts the loop above, the cache remains empty so
3388 // a subsequent call retries the wipe rather than skipping it
3389 // on the assumption that a prior call already cleared the dir.
3390 guard.insert(cache_key);
3391 drop(guard);
3392}
3393
/// Name of the session sentinel file kept inside each run directory.
///
/// Its content is the [`crate::KTSTR_RUN_EPOCH_ENV`] token of the
/// session that most recently cleared the directory. It is a dotfile
/// without a `.json` extension, so sidecar readers skip it:
/// `is_sidecar_filename` requires `.json`, and `classify_run_artifact`
/// matches none of its suffixes. The file lives in the run dir itself
/// (already `create_dir_all`'d by the caller), not in the `.locks/`
/// sibling directory.
const SESSION_SENTINEL: &str = ".ktstr_run_epoch";
3402
3403/// Read the `cargo ktstr test` session token from
3404/// [`crate::KTSTR_RUN_EPOCH_ENV`] — an opaque per-invocation value
3405/// the orchestrator stamps once before nextest spawns, inherited by
3406/// every child test process.
3407///
3408/// `None` when the variable is unset or empty (raw `cargo nextest
3409/// run` — no orchestrator); [`pre_clear_run_dir_once`] then wipes
3410/// every sidecar match (status quo for the unorchestrated path).
3411/// `Some` lets pre-clear record/match the `.ktstr_run_epoch`
3412/// sentinel so a later peer process skips re-wiping a dir this
3413/// session already cleared, sparing the peers' sidecars.
3414fn run_session_token() -> Option<String> {
3415 std::env::var(crate::KTSTR_RUN_EPOCH_ENV)
3416 .ok()
3417 .filter(|v| !v.is_empty())
3418}
3419
3420/// Predicate: is `path` an atomic-write staging file produced by
3421/// [`serialize_and_write_sidecar`]?
3422///
3423/// True iff the filename matches the `<test>-<hash>.ktstr.json.tmp.…`
3424/// shape — `is_sidecar_filename` rejects these because the
3425/// extension is `<run_id>` rather than `json`, so a separate
3426/// predicate is needed for the [`pre_clear_run_dir_once`] sweep
3427/// that reaps orphaned staging files. Filename-component check
3428/// (rather than full-path string) for the same load-bearing reason
3429/// `is_sidecar_filename` uses `Path::file_name()`: a `.ktstr.json.tmp.`
3430/// substring inside an ancestor segment must not match.
3431fn is_sidecar_staging_filename(path: &std::path::Path) -> bool {
3432 path.file_name()
3433 .and_then(|n| n.to_str())
3434 .is_some_and(|n| n.contains(".ktstr.json.tmp."))
3435}
3436
/// How long [`acquire_run_dir_flock`] waits (wall-clock) before
/// giving up with an error.
///
/// 30 s is generous: a peer writer holds the lock for at most one
/// (read_dir + bounded removes) plus one (serialize + write) cycle,
/// all of which take milliseconds. A holder that has not released
/// within 30 s has stalled (for example on a stuck filesystem), and
/// reporting that as an actionable error is better than hanging the
/// test run indefinitely.
///
/// This is deliberately shorter than the cache-store's 300 s
/// (5 minute) timeout: cache-store waits for tens of test runs to
/// drain, while this lock waits for at most one peer write.
const RUN_DIR_LOCK_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
3450
3451/// Compute the per-run-key flock sentinel path for `dir`.
3452///
3453/// Layout: `{dir.parent()}/.locks/{dir.file_name()}.lock`. When
3454/// `dir = {runs_root}/{key}` (the production default-path shape),
3455/// this resolves to `{runs_root}/.locks/{key}.lock`. Sourced from
3456/// [`crate::flock::LOCK_DIR_NAME`] so a relocation of the lock
3457/// subdirectory updates one place across both this surface and
3458/// the cache module.
3459///
3460/// Returns `None` when `dir` has no parent (root) or no
3461/// `file_name` component (current dir, root) — neither case is
3462/// reachable on the production default path
3463/// ([`runs_root`] always returns a non-root multi-component
3464/// path), but the function is total over the input domain so a
3465/// future caller passing an unusual path surfaces a clean `None`
3466/// rather than panicking on `unwrap`.
3467///
3468/// Pure function over the input path — no I/O. The caller is
3469/// responsible for materializing the parent `.locks/`
3470/// subdirectory before opening the lockfile —
3471/// [`crate::flock::acquire_flock_with_timeout`] handles that
3472/// lazily.
3473fn run_dir_lock_path(dir: &std::path::Path) -> Option<PathBuf> {
3474 let parent = dir.parent()?;
3475 let leaf = dir.file_name()?;
3476 let mut filename = std::ffi::OsString::from(leaf);
3477 filename.push(".lock");
3478 Some(parent.join(crate::flock::LOCK_DIR_NAME).join(filename))
3479}
3480
3481/// Acquire `LOCK_EX` on the per-run-key flock sentinel for `dir`.
3482/// Default-timeout wrapper over [`acquire_run_dir_flock_with_timeout`];
3483/// see that helper's doc for the full behavior contract. The
3484/// timeout split exists so tests can exercise the contention /
3485/// timeout path with a sub-second deadline rather than waiting
3486/// 30 s of real time per assertion.
3487fn acquire_run_dir_flock(dir: &std::path::Path) -> anyhow::Result<std::os::fd::OwnedFd> {
3488 acquire_run_dir_flock_with_timeout(dir, RUN_DIR_LOCK_TIMEOUT)
3489}
3490
3491/// Test-parametrizable inner of [`acquire_run_dir_flock`].
3492///
3493/// Resolves the per-run-key lockfile path via [`run_dir_lock_path`]
3494/// then delegates to [`crate::flock::acquire_flock_with_timeout`],
3495/// which handles parent-directory creation, the poll loop, the
3496/// `tracing::debug!` contention log, and the formatted timeout
3497/// error. The `context` argument names the run directory and the
3498/// `remediation` argument supplies the operator-facing recovery
3499/// hint about peer cargo ktstr test processes that the shared
3500/// helper appends to the timeout error.
3501///
3502/// Returns `Err` on:
3503/// - `run_dir_lock_path(dir)` returning `None` (no parent / no
3504/// file_name — production default path always satisfies both,
3505/// so this is a defensive arm),
3506/// - any error from [`crate::flock::acquire_flock_with_timeout`]
3507/// (parent directory creation failure, `try_flock` error, or
3508/// wall-clock `timeout` elapsing).
3509///
3510/// Returns `Ok(OwnedFd)` on successful acquire. Caller drops the
3511/// fd to release the kernel-side flock; the OFD-bound semantics
3512/// of `flock(2)` mean no explicit unlock call is required —
3513/// `OwnedFd::drop` runs `close(2)` which releases the lock when
3514/// no other fd refers to the same OFD (the fresh `try_flock`
3515/// open guarantees uniqueness).
3516fn acquire_run_dir_flock_with_timeout(
3517 dir: &std::path::Path,
3518 timeout: std::time::Duration,
3519) -> anyhow::Result<std::os::fd::OwnedFd> {
3520 let lock_path = run_dir_lock_path(dir).ok_or_else(|| {
3521 anyhow::anyhow!(
3522 "cannot derive run-dir lock path from {} (no parent or no file_name component)",
3523 dir.display(),
3524 )
3525 })?;
3526 let context = format!("run-dir {}", dir.display());
3527 crate::flock::acquire_flock_with_timeout(
3528 &lock_path,
3529 crate::flock::FlockMode::Exclusive,
3530 timeout,
3531 &context,
3532 Some(
3533 "A peer cargo ktstr test process is writing sidecars to the \
3534 same {kernel}-{project_commit} directory; wait for it to \
3535 finish or kill it, then retry.",
3536 ),
3537 )
3538}
3539
3540/// Emit a minimal sidecar for a PRE-VM-BOOT skip path.
3541///
3542/// Stats tooling enumerates sidecars to compute pass/skip/fail
3543/// rates; when a test bails before `run_ktstr_test_inner` reaches
3544/// the VM-run site that calls [`write_sidecar`], the skip is
3545/// invisible to post-run analysis — it shows up as a missing
3546/// result rather than a recorded skip.
3547///
3548/// This helper writes a sidecar flagged `skipped: true, passed: true`
3549/// with empty VM telemetry (no monitor, no stimulus events, no
3550/// verifier stats, no kvm stats, no payload metrics). Stats tooling
3551/// that subtracts skipped runs from the pass count treats the entry
3552/// correctly.
3553///
3554/// # Distinction from in-VM `AssertResult::skip` paths
3555///
3556/// There are TWO classes of skip, each with its own sidecar writer:
3557///
3558/// 1. **Pre-VM-boot skips** route through this helper
3559/// (`write_skip_sidecar`). Examples:
3560/// - `performance_mode` gated off via `KTSTR_NO_PERF_MODE`
3561/// (see `run_ktstr_test_inner`),
3562/// - `ResourceContention` at `builder.build()` or `vm.run()`
3563/// (all-slots-busy / transient host-resource contention — the
3564/// VM never booted).
3565///
3566/// These paths write a MINIMAL sidecar: empty VM telemetry,
3567/// `skipped: true`, and BOTH `payload` and `work_type` resolved
3568/// exactly as a run of this config would (the entry's declared
3569/// payload and [`crate::test_support::args::current_work_type`]) so
3570/// the skip shares the run's variant identity — a later run of the
3571/// same config overwrites this skip's sidecar instead of coexisting
3572/// with it. There is no VmResult to drain because the VM didn't boot.
3573///
3574/// 2. **In-VM `AssertResult::skip` returns** — e.g. the
3575/// empty-cpuset skip in `scenario::run_scenario`
3576/// (`AssertResult::skip("not enough CPUs/LLCs")`), or the
3577/// `need >= 4 CPUs` checks in `scenario::dynamic::*` — route
3578/// through [`write_sidecar`] at `run_ktstr_test_inner`'s end.
3579/// The guest VM fully booted, ran through scenario setup,
3580/// discovered the topology couldn't accommodate the test, and
3581/// returned early. The resulting sidecar carries REAL VM
3582/// telemetry (monitor, kvm_stats, verifier_stats) alongside
3583/// `skipped: true` — not a blind spot, just a richer record
3584/// than what this helper emits.
3585///
3586/// The asymmetry is intentional: pre-VM-boot skips have no
3587/// telemetry to record, while in-VM skips do. Stats tooling that
3588/// wants to uniformly discount skipped runs filters on
3589/// [`SidecarResult::skipped == true`] regardless of which writer
3590/// produced the entry — both set the field identically.
3591///
3592/// Returns `Err` when the sidecar directory cannot be created, the
3593/// JSON cannot be serialized, or the file write fails. Callers that
3594/// ignore the Result accept the risk of stats-tooling blind spots on
3595/// this run.
3596pub(crate) fn write_skip_sidecar(
3597 entry: &KtstrTestEntry,
3598 resolved_topology: &crate::vmm::topology::Topology,
3599) -> anyhow::Result<()> {
3600 let SchedulerFingerprint {
3601 scheduler,
3602 scheduler_commit,
3603 sysctls,
3604 kargs,
3605 } = scheduler_fingerprint(entry);
3606 let sidecar = SidecarResult {
3607 test_name: entry.name.to_string(),
3608 perf_delta_assertions: entry
3609 .perf_delta_assertions
3610 .iter()
3611 .map(|&a| a.into())
3612 .collect(),
3613 // The RESOLVED topology a run of this preset would boot
3614 // (resolve_vm_topology(entry, topo)), NOT the declared
3615 // entry.topology — for a topology gauntlet each preset boots a
3616 // distinct topology, so recording the declared value would make
3617 // every preset share one variant_hash and clobber. For a plain
3618 // test (no override) resolved == declared. The skip and the run
3619 // of one preset thus share a variant_hash (the run path records
3620 // the same resolved topology), so a flaky test that skips on one
3621 // attempt and runs on the retry writes one sidecar.
3622 topology: resolved_topology.to_string(),
3623 scheduler,
3624 scheduler_commit,
3625 // A skip resolves no scheduler binary (no run), so there is no
3626 // discovery path to record.
3627 resolve_source: None,
3628 project_commit: detect_project_commit(),
3629 // A skip never runs the payload. Still record the declared
3630 // payload name so stats tooling can attribute the skip to
3631 // the payload-gauntlet variant rather than losing the
3632 // association.
3633 payload: entry.payload.map(|p| p.name.to_string()),
3634 metrics: Vec::new(),
3635 passed: false,
3636 skipped: true,
3637 inconclusive: false,
3638 expected_failure: false,
3639 stats: Default::default(),
3640 monitor: None,
3641 // A skip never ran the VM, so no periodic captures fired.
3642 periodic_fired: 0,
3643 periodic_target: 0,
3644 // A skip never booted the VM, so it has no measured budget. 0/0
3645 // maps to None on the GauntletRow's cpu_budget dim (skips carry no
3646 // budget identity; the skipped=true flag, not a sentinel field
3647 // value, marks them).
3648 vcpus: 0,
3649 cpu_budget: 0,
3650 stimulus_events: Vec::new(),
3651 // A skip never ran the workload, but it carries the SAME
3652 // work_type a run of this config would (current_work_type reads
3653 // the per-variant --ktstr-work-type arg, identical across nextest
3654 // retry attempts). That keeps the skip's variant_hash equal to
3655 // the run's, so a flaky test that skips on one attempt and runs
3656 // on the retry writes one sidecar (the retry overwrites the skip)
3657 // rather than two coexisting files the footer would both flag.
3658 // Skips stay identified by skipped=true, not by a work_type
3659 // sentinel (see the variant-hash + skipped-bool contract above).
3660 work_type: super::args::current_work_type(),
3661 verifier_stats: Vec::new(),
3662 kvm_stats: None,
3663 sysctls,
3664 kargs,
3665 kernel_version: detect_kernel_version(),
3666 kernel_commit: kernel_commit_for_sidecar(),
3667 timestamp: now_iso8601(),
3668 run_id: generate_run_id(),
3669 host: Some(crate::host_context::collect_host_context()),
3670 // Skip paths never reach `collect_results`, so cleanup
3671 // duration is undefined. Emit `null` per the sidecar's
3672 // symmetric serialize/deserialize contract.
3673 cleanup_duration_ms: None,
3674 run_source: detect_run_source(),
3675 };
3676 serialize_and_write_sidecar(&sidecar, "skip sidecar")
3677}
3678
3679/// Write a sidecar JSON file for post-run analysis.
3680///
3681/// Output goes to the current run's sidecar directory
3682/// (`KTSTR_SIDECAR_DIR` override, or
3683/// `{CARGO_TARGET_DIR or "target"}/ktstr/{kernel}-{project_commit}/`,
3684/// where `{project_commit}` is the project HEAD short hex with
3685/// `-dirty` when the worktree differs).
3686///
3687/// `payload_metrics` is the accumulated per-invocation output from
3688/// `ctx.payload(X).run()` / `.spawn().wait()` calls made in the
3689/// test body. Empty vec when the test body never called
3690/// `Ctx::payload` (scheduler-only tests, host-only probes).
3691///
3692/// Returns `Err` when the sidecar directory cannot be created, the
3693/// JSON cannot be serialized, or the file write fails. Callers that
3694/// ignore the Result accept the risk of stats-tooling blind spots on
3695/// this run.
3696pub(crate) fn write_sidecar(
3697 entry: &KtstrTestEntry,
3698 vm_result: &vmm::VmResult,
3699 stimulus_events: &[StimulusEvent],
3700 check_result: &AssertResult,
3701 work_type: &str,
3702 payload_metrics: &[PayloadMetrics],
3703 resolved_topology: &crate::vmm::topology::Topology,
3704) -> anyhow::Result<()> {
3705 let SchedulerFingerprint {
3706 scheduler,
3707 scheduler_commit,
3708 sysctls,
3709 kargs,
3710 } = scheduler_fingerprint(entry);
3711 let sidecar = SidecarResult {
3712 test_name: entry.name.to_string(),
3713 perf_delta_assertions: entry
3714 .perf_delta_assertions
3715 .iter()
3716 .map(|&a| a.into())
3717 .collect(),
3718 // The RESOLVED topology this run booted (resolve_vm_topology
3719 // result), NOT the declared entry.topology — a topology gauntlet
3720 // boots a distinct topology per preset, so the declared value
3721 // would collapse every preset to one variant_hash. resolved ==
3722 // declared for a plain test (no override).
3723 topology: resolved_topology.to_string(),
3724 scheduler,
3725 scheduler_commit,
3726 // Scheduler-resolution provenance, carried on VmResult from the
3727 // host eval path (run_ktstr_test_inner_impl resolves the binary
3728 // once and stamps the source), mirroring how vcpus / cpu_budget
3729 // ride VmResult to this stamp.
3730 resolve_source: vm_result.resolve_source.clone(),
3731 project_commit: detect_project_commit(),
3732 payload: entry.payload.map(|p| p.name.to_string()),
3733 metrics: payload_metrics.to_vec(),
3734 passed: check_result.is_pass(),
3735 skipped: check_result.is_skip(),
3736 inconclusive: check_result.is_inconclusive(),
3737 // Raw scenario verdict at write time; the dispatch-layer
3738 // finalize (finalize_sidecar_verdict) overwrites these bits with
3739 // the post-inversion outcome and sets expected_failure.
3740 expected_failure: false,
3741 stats: check_result.stats.clone(),
3742 monitor: vm_result.monitor.as_ref().map(|m| m.summary.clone()),
3743 periodic_fired: vm_result.periodic_fired,
3744 periodic_target: vm_result.periodic_target,
3745 vcpus: vm_result.vcpus,
3746 cpu_budget: vm_result.cpu_budget,
3747 stimulus_events: stimulus_events.to_vec(),
3748 work_type: work_type.to_string(),
3749 verifier_stats: vm_result.verifier_stats.clone(),
3750 kvm_stats: vm_result.kvm_stats.clone(),
3751 sysctls,
3752 kargs,
3753 kernel_version: detect_kernel_version(),
3754 kernel_commit: kernel_commit_for_sidecar(),
3755 timestamp: now_iso8601(),
3756 run_id: generate_run_id(),
3757 host: Some(crate::host_context::collect_host_context()),
3758 cleanup_duration_ms: vm_result.cleanup_duration.map(|d| d.as_millis() as u64),
3759 run_source: detect_run_source(),
3760 };
3761 serialize_and_write_sidecar(&sidecar, "sidecar")
3762}
3763
3764#[cfg(test)]
3765mod tests;