ktstr/test_support/sidecar/
mod.rs

1//! Per-run sidecar JSON — the durable record of a ktstr test outcome.
2//!
3//! Every test (pass, fail, or skip) writes a [`SidecarResult`] to a
4//! JSON file under the run's sidecar directory; downstream analysis
5//! (`cargo ktstr stats`, CI dashboards) aggregates those files to
6//! compute pass/fail rates, verifier stats, callback profiles, and
7//! KVM stats across gauntlet variants.
8//!
9//! Responsibilities owned by this module:
10//! - [`SidecarResult`]: the on-disk schema. Writer-side: every field
11//!   is always emitted — `null` for `None`, `[]` for empty `Vec` —
12//!   with no `skip_serializing_if` and no `serde(default)`. Reader-
13//!   side: serde's native `Option<T>` deserialize tolerates absence
14//!   (a missing key parses as `None`); non-`Option` fields (e.g.
15//!   `test_name`, `passed`, `stats`) are hard-required and a missing
16//!   key fails deserialize. The contract is intentionally asymmetric
17//!   so a future producer that drops an `Option` field still parses
18//!   on older readers, while the current writer guarantees full
19//!   round-trip symmetry. Pre-1.0: old sidecar JSON is disposable;
20//!   regenerate by re-running the test rather than relying on the
21//!   reader-side tolerance for migration.
22//! - [`collect_sidecars`]: load every `*.ktstr.json` under a directory
23//!   (one level of subdirectories for per-job gauntlet layouts).
24//! - [`write_sidecar`] / [`write_skip_sidecar`]: serialize one run to
25//!   disk; variant-hash the discriminating fields so gauntlet variants
26//!   don't clobber each other.
27//! - [`sidecar_dir`], [`runs_root`], [`newest_run_dir`]: resolve where
28//!   sidecars live (env override, or
29//!   `{target}/ktstr/{kernel}-{project_commit}` where
30//!   `{project_commit}` is the project tree's HEAD short hex from
31//!   [`detect_project_commit`], suffixed `-dirty` when the
32//!   worktree differs).
33//! - [`format_run_dirname`]: render the
34//!   `{kernel}-{project_commit}` leaf name from the resolved
35//!   kernel + commit slots, substituting the literal `unknown`
36//!   when either probe returned `None` so the dirname stays
37//!   filesystem-safe (see the unknown-commit collision
38//!   semantics in the runs guide).
39//! - [`is_run_directory`]: predicate consumed by run-listing
40//!   walkers ([`newest_run_dir`] here, `sorted_run_entries` in
41//!   `crate::stats`). Filters non-directories and dotfile
42//!   subdirectories (notably the `.locks/` flock-sentinel
43//!   subdirectory) so the lock infrastructure cannot pollute
44//!   `cargo ktstr stats list` output or claim the "most recent
45//!   run" bucket.
46//! - [`pre_clear_run_dir_once`]: shallow-wipe `*.ktstr.json` files
47//!   in the run directory at the FIRST write of each test
48//!   process so a re-run at the same `{kernel}-{project_commit}`
49//!   key produces a last-writer-wins snapshot rather than an
50//!   append-only archive. Subsequent writes in the same process
51//!   are gated by an internal `Mutex<HashSet<PathBuf>>` so only
52//!   the first call per key per process clears.
53//! - [`acquire_run_dir_flock`]: cross-process `LOCK_EX` on the
54//!   per-run-key sentinel
55//!   (`{runs_root}/.locks/{key}.lock`) held for the duration of
56//!   the pre-clear + serialize + write cycle. Two concurrent
57//!   ktstr processes targeting the same key serialize through
58//!   this lock so neither tears the other's mid-write
59//!   sidecars. The override branch (operator-chosen
60//!   `KTSTR_SIDECAR_DIR`) skips the flock for the same reason
61//!   it skips pre-clear: the operator owns the directory's
62//!   contents.
63//! - [`warn_unknown_project_commit_once`]: one-shot stderr warning
64//!   on first sidecar write when `detect_project_commit` returns
65//!   `None` (test process not in a git repo) so concurrent or
66//!   successive non-git runs colliding on `{kernel}-unknown`
67//!   surface the disambiguation hint
68//!   (`KTSTR_SIDECAR_DIR=…` or place the tree under git) at
69//!   first invocation rather than as a silent collision.
70//! - [`format_verifier_stats`], [`format_callback_profile`],
71//!   [`format_kvm_stats`]: human-readable summaries from a
72//!   `Vec<SidecarResult>` for CLI output.
73//! - [`detect_kernel_version`]: read the kernel version from
74//!   `KTSTR_KERNEL` cache metadata for sidecar-dir naming and the
75//!   `kernel_version` field, with fallback to
76//!   `include/config/kernel.release` in the kernel source tree
77//!   when the cache metadata is absent or does not carry a
78//!   version (e.g. a raw source-tree path set in `KTSTR_KERNEL`
79//!   rather than a cache key).
80//! - [`detect_kernel_commit`]: read the kernel SOURCE TREE's git
81//!   HEAD short hex (with `-dirty` suffix when worktree differs
82//!   from the index or HEAD differs from the index) for the
83//!   `kernel_commit` field. Distinct from `kernel_version`
84//!   (release string from `kernel.release`) and `project_commit`
85//!   (ktstr framework HEAD): this records "what kernel commit
86//!   produced this run" so two runs of the same `kernel_version`
87//!   but different WIP source trees compare distinctly.
88
89use std::path::PathBuf;
90
91use anyhow::Context;
92
93use crate::assert::{AssertResult, ScenarioStats};
94use crate::monitor::MonitorSummary;
95use crate::sync::MutexExt;
96use crate::test_support::PayloadMetrics;
97use crate::timeline::StimulusEvent;
98use crate::vmm;
99
100use super::entry::KtstrTestEntry;
101use super::timefmt::{generate_run_id, now_iso8601};
102
103/// Test result sidecar written to KTSTR_SIDECAR_DIR for post-run analysis.
104#[derive(Debug, serde::Serialize, serde::Deserialize)]
105pub struct SidecarResult {
106    /// Fully qualified test name (matches `KtstrTestEntry::name`,
107    /// the bare function name without the `ktstr/` nextest prefix).
108    pub test_name: String,
109    /// Rendered topology label (e.g. `1n2l4c1t`) for the variant this
110    /// sidecar describes.
111    pub topology: String,
112    /// Scheduler name (matches `Scheduler::name`); `"eevdf"` for
113    /// tests run without an scx scheduler.
114    pub scheduler: String,
115    /// Best-effort git commit of the scheduler binary used for this
116    /// run. Currently ALWAYS `None` for every `SchedulerSpec`
117    /// variant — no variant today has a reliable commit source.
118    /// The field is reserved on the schema so stats tooling can
119    /// enrich it once a reliable source exists (e.g. a
120    /// `--version` probe or ELF-note read on the resolved
121    /// scheduler binary). See
122    /// [`crate::test_support::SchedulerSpec::scheduler_commit`]
123    /// for the full per-variant rationale.
124    ///
125    /// Writer always emits (`"scheduler_commit": null` on absence).
126    /// Reader-side: serde's native `Option<T>` deserialize tolerates
127    /// absence (a missing key parses as `None`); see the module-level
128    /// doc for the full asymmetric contract that governs every
129    /// nullable on this struct.
130    pub scheduler_commit: Option<String>,
131    /// How the userspace scheduler binary was resolved for this run —
132    /// the snake_case [`crate::test_support::ResolveSource::as_str`] tag
133    /// (`"path"`, `"env_var"`, `"path_lookup"`, `"sibling_dir"`,
134    /// `"target_debug"`, `"target_release"`, `"auto_built"`,
135    /// `"not_found"`). Provenance, not identity: distinct from
136    /// [`SidecarResult::scheduler_commit`] (the binary's git commit) —
137    /// this records the discovery PATH, so the stats CLI can answer "was
138    /// this run's scheduler auto-built from the workspace HEAD, or
139    /// resolved from a possibly-stale `target/` or `$PATH` binary?".
140    /// `"auto_built"` is the only tag whose source commit is known to
141    /// match the workspace tree; every other tag carries the stale-binary
142    /// hazard documented on the [`crate::test_support::ResolveSource`]
143    /// variant.
144    ///
145    /// Writer always emits (`"resolve_source": null` on absence — the
146    /// skip-sidecar path resolves no binary). Reader-side: serde's native
147    /// `Option<T>` deserialize tolerates absence (a missing key parses as
148    /// `None`); see the module-level doc for the full asymmetric
149    /// contract. Excluded from `sidecar_variant_hash` for the same
150    /// cross-host grouping reason as `scheduler_commit` / `run_source`:
151    /// two runs of the same semantic variant resolved via different
152    /// discovery paths must still bucket together.
153    pub resolve_source: Option<String>,
154    /// Best-effort git HEAD of the ktstr project tree at sidecar-
155    /// write time. Captured by `detect_project_commit` via
156    /// `gix::discover` from the test process's current working
157    /// directory; walks up to find the enclosing repo and reads
158    /// HEAD short-hex, suffixing `-dirty` when index-vs-HEAD or
159    /// worktree-vs-index changes are observed (submodules ignored,
160    /// matching the [`crate::fetch::local_source`] dirty-detection
161    /// pattern). `None` when cwd is not inside any git repo, or
162    /// when the gix probe fails for any reason — this is metadata,
163    /// not a gate, so probe failure must not abort the run.
164    ///
165    /// Distinct from [`SidecarResult::scheduler_commit`]: that
166    /// field tracks the userspace scheduler binary's commit
167    /// (currently always `None` per its own doc); this field
168    /// tracks the ktstr framework / test-runner commit, so the
169    /// stats CLI can answer "which version of the harness produced
170    /// this sidecar?" without inspecting the scheduler.
171    ///
172    /// Writer always emits (`"project_commit": null` on absence).
173    /// Reader-side: serde's native `Option<T>` deserialize tolerates
174    /// absence (a missing key parses as `None`) — see the module-
175    /// level doc for the full asymmetric contract. Excluded from
176    /// `sidecar_variant_hash` for the same cross-host grouping
177    /// reason `scheduler_commit` is excluded: two runs of the same
178    /// semantic variant on different ktstr commits must still bucket
179    /// together so `perf-delta` can diff them; the commit-drift
180    /// detection inspects this field directly via `--project-commit`
181    /// / `--a-project-commit` / `--b-project-commit`.
182    pub project_commit: Option<String>,
183    /// Binary payload name (matches `Payload::name` when
184    /// `entry.payload` is set). `None` when the test declared no
185    /// binary payload. Writer always emits (`"payload": null` on
186    /// absence); reader-side, serde's native `Option<T>` deserialize
187    /// tolerates absence — see the module-level doc for the full
188    /// asymmetric contract.
189    pub payload: Option<String>,
190    /// Per-payload extracted metrics collected from `ctx.payload(X).run()`
191    /// / `.spawn().wait()` call sites during the test body.
192    ///
193    /// One [`PayloadMetrics`] per invocation, in the order the calls
194    /// ran. Empty when no payload calls were made (scheduler-only
195    /// tests, or a binary-only test where the body bailed before
196    /// running the payload). Writer always emits (`"metrics": []` in
197    /// that case); reader-side, this `Vec` field is hard-required —
198    /// non-`Option` fields fail deserialize on absence. See the
199    /// module-level doc for the full contract.
200    pub metrics: Vec<PayloadMetrics>,
201    /// True when the run is a real pass — every assertion that
202    /// ran produced a positive verdict. Mirrors
203    /// [`crate::assert::AssertResult::is_pass`]. Mutually
204    /// exclusive with [`Self::skipped`] and [`Self::inconclusive`]:
205    /// the three bits `(passed, skipped, inconclusive)` form a
206    /// strict 4-state encoding where at most one is set per
207    /// record. The fourth state — Fail — is the all-false case
208    /// (no dedicated bit; [`Self::is_fail`] derives it). A real
209    /// pass requires `!skipped && !inconclusive` AND at least one
210    /// observed assertion (the empty / all-skip case routes
211    /// through [`Self::skipped`] instead).
212    pub passed: bool,
213    /// True when the run was skipped (e.g. topology mismatch,
214    /// missing resource, in-VM `AssertResult::skip` return).
215    /// Mutually exclusive with [`Self::passed`] (Pass requires a
216    /// real assertion; an all-skip stream is Skip, not Pass) and
217    /// with [`Self::inconclusive`]. Stats tooling subtracts
218    /// `skipped` runs from "pass count" so non-executions are not
219    /// reported as passes.
220    pub skipped: bool,
221    /// True when at least one assertion was [`Outcome::Inconclusive`](crate::assert::Outcome::Inconclusive) —
222    /// the run ran but a zero-denominator ratio gate could not be
223    /// evaluated (e.g. zero iterations across all workers under a
224    /// `max_migration_ratio` check). Mutually exclusive with
225    /// [`Self::passed`] and [`Self::skipped`]; in the
226    /// `Fail > Inconclusive > Pass > Skip` lattice, Inconclusive
227    /// dominates Pass/Skip but loses to Fail, so a run with both
228    /// Inconclusive and Fail outcomes records `inconclusive = false,
229    /// passed = false` (Fail wins) — `inconclusive = true` requires
230    /// `!is_fail() && !is_pass() && !is_skip()`.
231    ///
232    /// Distinct from `passed = false` (Fail) and `skipped = true`
233    /// (precondition unmet) so CI gates and stats tooling can
234    /// triage zero-denominator runs as "workload didn't produce
235    /// the signal the assertion needed" rather than misclassifying
236    /// them as silent passes (prior to the [`Outcome::Inconclusive`](crate::assert::Outcome::Inconclusive)
237    /// variant the zero-denominator case fell out as Pass) or as
238    /// hard failures.
239    pub inconclusive: bool,
240    /// True when the persisted verdict (`passed`/`skipped`/
241    /// `inconclusive`) is the POST-inversion FINAL outcome of a run
242    /// whose underlying scenario actually failed — i.e. an
243    /// `expect_err` / `expect_auto_repro` test whose induced failure was
244    /// inverted to a pass. Set by the sidecar finalize
245    /// (`finalize_sidecar_verdict`) after dispatch resolves the
246    /// verdict; `false` for an ordinary pass/skip/fail.
247    ///
248    /// The verdict bits carry the FINAL outcome so the footer, `stats`
249    /// analysis, and `replay` match nextest's exit code. This flag
250    /// preserves the one fact that overwrite loses: that the run's
251    /// telemetry is failure-mode-dominated (a deliberately short /
252    /// stalled run). `perf-delta` ORs it into its exclusion guard so
253    /// an inverted-to-pass row is still kept OUT of the regression math
254    /// (its induced-crash telemetry is not real scheduler behavior).
255    pub expected_failure: bool,
256    /// Aggregate per-cgroup statistics merged across every worker.
257    pub stats: ScenarioStats,
258    /// Monitor summary. `None` means the monitor loop did not run
259    /// (host-only tests, early VM failure) or sample collection
260    /// produced no valid data. Writer always emits (`"monitor": null`
261    /// on absence); reader-side, serde's native `Option<T>`
262    /// deserialize tolerates absence — see the module-level doc.
263    pub monitor: Option<MonitorSummary>,
264    /// Periodic-capture coverage for this run: how many periodic snapshot
265    /// boundaries actually fired (`periodic_fired`) out of the configured
266    /// `num_snapshots` target (`periodic_target`). Carried verbatim from
267    /// [`crate::prelude::VmResult`] so cross-run tooling can read the
268    /// coverage off the persisted sidecar (previously only the in-memory
269    /// result exposed it). `0`/`0` for runs with no periodic captures
270    /// configured. Hard-required `u32` fields — old sidecars predating
271    /// them re-generate on the next run (sidecar data is disposable).
272    pub periodic_fired: u32,
273    /// See [`Self::periodic_fired`].
274    pub periodic_target: u32,
275    /// Guest vCPU count and the effective host-CPU budget the vCPU threads
276    /// ran on, carried verbatim from [`crate::prelude::VmResult`]. Drive
277    /// the `cpu-budget` comparison Dimension (cross-budget runs are not
278    /// paired — confining 32 vCPUs to 4 host CPUs measures something else)
279    /// and the overcommit marker: `cpu_budget < vcpus` means the host
280    /// time-sliced the guest's vCPUs, confounding the timing metrics
281    /// (wake-latency / off-CPU / run-delay — schedstat run_delay tracks
282    /// rq->clock, which follows the guest TSC and is not steal-adjusted,
283    /// so the off-host window inflates it for tasks waiting across it).
284    /// Hard-required `u32`
285    /// (old sidecars re-generate; sidecar data is disposable). EXCLUDED
286    /// from `sidecar_variant_hash`: a budget change is a different
287    /// measurement, separated downstream by the Dimension, not the
288    /// identity bucket.
289    pub vcpus: u32,
290    /// See [`Self::vcpus`].
291    pub cpu_budget: u32,
292    /// Ordered stimulus events published by the guest step executor
293    /// while the scenario ran.
294    pub stimulus_events: Vec<StimulusEvent>,
295    /// WorkSpec type label used for post-hoc filtering and A/B comparison
296    /// (distinct from the `WorkType` enum — this is the text name).
297    pub work_type: String,
298    /// Per-BPF-program verifier statistics captured from the VM's
299    /// scheduler (when one was loaded). Empty when no scheduler
300    /// programs were inspected. Writer always emits as
301    /// `"verifier_stats": []` in that case; reader-side, this `Vec`
302    /// field is hard-required (non-`Option` fields fail deserialize
303    /// on absence). See the module-level doc.
304    pub verifier_stats: Vec<crate::monitor::bpf_prog::ProgVerifierStats>,
305    /// Aggregate per-vCPU KVM stats read after VM exit. `None` when
306    /// the VM did not run (host-only tests) or KVM stats were
307    /// unavailable. Writer always emits (`"kvm_stats": null` on
308    /// absence); reader-side, serde's native `Option<T>` deserialize
309    /// tolerates absence — see the module-level doc.
310    pub kvm_stats: Option<crate::vmm::KvmStatsTotals>,
311    /// Effective sysctls active during this test run, recorded as raw
312    /// `sysctl.key=value` cmdline strings. Writer always emits as
313    /// `"sysctls": []` when none; reader-side, this `Vec` field is
314    /// hard-required (non-`Option` fields fail deserialize on
315    /// absence). See the module-level doc.
316    pub sysctls: Vec<String>,
317    /// Effective kernel command-line args active during this test run.
318    /// Writer always emits as `"kargs": []` when none; reader-side,
319    /// this `Vec` field is hard-required (non-`Option` fields fail
320    /// deserialize on absence). See the module-level doc.
321    pub kargs: Vec<String>,
322    /// Kernel version of the VM under test (from cache metadata,
323    /// e.g. `"6.14.2"`). Populated from the cache entry's
324    /// `metadata.json` version field, with fallback to the kernel
325    /// source tree's `include/config/kernel.release` when
326    /// `KTSTR_KERNEL` points at a raw source path rather than a
327    /// cache key; `None` for host-only tests or when neither
328    /// source yields a version string. The host's running kernel
329    /// release is carried separately in `host.kernel_release`.
330    /// Writer always emits (`"kernel_version": null` on absence);
331    /// reader-side, serde's native `Option<T>` deserialize tolerates
332    /// absence — see the module-level doc for the full asymmetric
333    /// contract.
334    pub kernel_version: Option<String>,
335    /// Kernel SOURCE TREE git HEAD short hex (7 chars via
336    /// `oid::to_hex_with_len(7)`), with `-dirty` suffix appended
337    /// when HEAD-vs-index or index-vs-worktree changes are
338    /// observed. Probes via `gix::open` against the kernel
339    /// directory resolved from `KTSTR_KERNEL` (not `gix::discover`
340    /// — the kernel dir is explicit, not walked-up). Captured by
341    /// `detect_kernel_commit` at sidecar-write time.
342    ///
343    /// Distinct from sibling fields:
344    /// - [`SidecarResult::kernel_version`] — release string read
345    ///   from cache metadata or `include/config/kernel.release`,
346    ///   e.g. `"6.14.2"`. Two runs of `6.14.2` from a clean
347    ///   tree and a `-dirty` worktree at the same HEAD share
348    ///   `kernel_version` but differ on `kernel_commit`.
349    /// - [`SidecarResult::project_commit`] — ktstr framework
350    ///   HEAD captured from the test process's cwd. Tracks
351    ///   "what version of the harness produced this sidecar?"
352    ///   independently of the kernel under test.
353    /// - [`SidecarResult::scheduler_commit`] — userspace
354    ///   scheduler binary's commit (currently always `None`).
355    ///
356    /// `None` when:
357    /// - `KTSTR_KERNEL` is unset or empty;
358    /// - the resolved `KernelId` is `Version` / `CacheKey` whose
359    ///   underlying source is `Tarball` / `Git` (no source tree
360    ///   on disk to probe);
361    /// - the resolved kernel directory is not a git repository
362    ///   (`gix::open` fails);
363    /// - HEAD cannot be read (unborn HEAD on a fresh `git init`
364    ///   with zero commits);
365    /// - any other gix probe failure — metadata, not a gate.
366    ///
367    /// Writer always emits (`"kernel_commit": null` on absence);
368    /// reader-side, serde's native `Option<T>` deserialize tolerates
369    /// absence — see the module-level doc for the full asymmetric
370    /// contract. Excluded from `sidecar_variant_hash` for the same
371    /// cross-host grouping reason `scheduler_commit` and
372    /// `project_commit` are excluded: two runs of the same semantic
373    /// variant on different kernel-source HEADs must still bucket
374    /// together so `perf-delta` can diff them; the commit-drift
375    /// detection inspects this field directly via the
376    /// `--kernel-commit` filter.
377    pub kernel_commit: Option<String>,
378    /// ISO 8601 timestamp of when this test run started.
379    pub timestamp: String,
380    /// Unique identifier for the test run. Composed as
381    /// `{run_id_timestamp}-{counter}` — the `YYYYMMDDTHHMMSSZ`
382    /// process-start stamp followed by a process-local monotonic
383    /// counter. Every sidecar produced in one `cargo ktstr test`
384    /// invocation shares the same timestamp prefix; the counter
385    /// distinguishes concurrent gauntlet variants within that
386    /// invocation. Distinct from the run DIRECTORY name (keyed
387    /// `{kernel}-{project_commit}`, see [`sidecar_dir`]) — the
388    /// directory groups runs by what they tested, the `run_id`
389    /// groups sidecars by which process emitted them.
390    pub run_id: String,
391    /// Host context — static-ish runtime state (CPU model,
392    /// memory size, THP policy, kernel release, host cmdline,
393    /// scheduler tunables). Populated by production sidecar
394    /// writers.
395    ///
396    /// `None` causes:
397    /// - **test-fixture path**: not the production sidecar
398    ///   writer (production writers always populate `host`).
399    /// - **pre-enrichment archive**: sidecar predates the
400    ///   host-context landing — re-run the test to regenerate
401    ///   under the current schema (no migration shim exists
402    ///   per the pre-1.0 disposable-data contract).
403    ///
404    /// Deliberately excluded from the variant hash so
405    /// gauntlet variants on different hosts collapse into the same
406    /// hash bucket.
407    ///
408    /// No serde attributes: writer always emits (`"host": null` when
409    /// `None`); reader-side, serde's native `Option<T>` deserialize
410    /// tolerates absence (a missing key parses as `None`). The
411    /// asymmetric contract is crate-wide — see the module-level doc.
412    /// Pre-1.0, sidecar data is disposable, so regenerate by
413    /// re-running the test rather than carrying a compat shim for
414    /// older JSON; the reader-side tolerance exists so an in-flight
415    /// schema rename of an `Option` field does not break parsing of
416    /// older sidecars during the same producer-version, not as a
417    /// long-term migration story.
418    pub host: Option<crate::host_context::HostContext>,
419    /// Wall-clock milliseconds spent in
420    /// `KtstrVm::collect_results` — the host-side
421    /// teardown window from BSP exit through SHM drain (mirrors
422    /// [`VmResult::cleanup_duration`](crate::vmm::VmResult::cleanup_duration);
423    /// `Duration` is converted to `u64` ms here because every other
424    /// timing field on this struct that lands in a sidecar-comparison
425    /// CLI uses integer ms or seconds, and JSON has no native
426    /// `Duration`). `None` when the run was killed by the watchdog
427    /// before `collect_results` returned, or for the `host_only` /
428    /// host-only-stub paths that never boot a VM. Writer always emits
429    /// (`"cleanup_duration_ms": null` on absence); reader-side,
430    /// serde's native `Option<T>` deserialize tolerates absence — see
431    /// the module-level doc for the full asymmetric contract.
432    pub cleanup_duration_ms: Option<u64>,
433    /// Provenance tag for this sidecar — distinguishes a developer's
434    /// local run from a CI run so cross-environment comparisons in
435    /// `perf-delta` can narrow on (or contrast across) the run
436    /// environment without inferring it from `host`.
437    ///
438    /// Recorded by `detect_run_source` at sidecar-write time:
439    /// - `Some("ci")` when `KTSTR_CI_ENV` is set non-empty (CI runner
440    ///   scripts export it before invoking the test binary; local
441    ///   runs never set it).
442    /// - `Some("local")` otherwise — the default for any sidecar
443    ///   produced by a developer-driven invocation.
444    /// - The third documented value (`"archive"`) is NEVER written
445    ///   here: a sidecar cannot know it will later be archived. The
446    ///   stats CLI applies the `"archive"` tag at LOAD time when its
447    ///   `--dir` flag points at a non-default pool root, overriding
448    ///   whatever was on disk via `apply_archive_source_override`.
449    ///
450    /// `Option<String>` (rather than an enum) keeps the schema
451    /// extensible without a serde-version bump if a future producer
452    /// wants a new tag (e.g. `"benchmark"`); the consumer side
453    /// treats unknown values the same as known ones — they are
454    /// strings the operator can pass via `--run-source` to filter on.
455    /// Writer always emits (`"run_source": null` on absence);
456    /// reader-side, serde's native `Option<T>` deserialize tolerates
457    /// absence — see the module-level doc for the full asymmetric
458    /// contract. Excluded from `sidecar_variant_hash` for the same
459    /// cross-host grouping reason `host` is excluded — two runs of
460    /// the same semantic variant from different environments must
461    /// still bucket together so `perf-delta` can pair them; `--run-source`
462    /// is the explicit knob for source-aware narrowing.
463    ///
464    /// Field name `run_source` (renamed from `source`) disambiguates
465    /// from [`crate::cache::KernelSource`] / `KernelMetadata.source`
466    /// — those describe the kernel build's input (tarball / git /
467    /// local), this describes the run-environment provenance.
468    ///
469    /// **On-disk JSON key changed from `"source"` to `"run_source"`
470    /// in the field rename.** No `#[serde(alias = "source")]` is
471    /// in place: archived sidecars written before the rename carry
472    /// the `"source"` key, which the current schema treats as an
473    /// unknown field. Because `SidecarResult`'s derive does NOT
474    /// set `deny_unknown_fields`, the deserialize does not fail
475    /// outright — instead serde silently DROPS the stale `"source"`
476    /// payload and lands `run_source = None` (since `Option<T>`'s
477    /// "tolerate absence" rule kicks in for the missing
478    /// `"run_source"` field). The data is lost, not preserved. This
479    /// is deliberate per the project's pre-1.0 disposable-data
480    /// contract: re-running tests regenerates sidecars under the
481    /// new key rather than carrying compat shims forward. Consumers
482    /// who need the run-source classification on archived JSON
483    /// must either rename the key in-place before deserialize, or
484    /// re-run the test to regenerate the sidecar with the new
485    /// schema. Tooling that runs against the renamed schema and
486    /// observes a `None` `run_source` cannot distinguish "sidecar
487    /// pre-dates the field" from "sidecar pre-dates the rename and
488    /// lost its tag" — both lower-bound at `None` for filter
489    /// purposes.
490    pub run_source: Option<String>,
491    /// Per-test [`crate::test_support::PerfDeltaAssertion`]s declared on the
492    /// entry, serialized so `cargo ktstr perf-delta --noise-adjust`'s host-side
493    /// compare can enforce them across commits (the entry registry in the parent
494    /// process describes only HEAD's tests, not a baseline/cached sidecar's
495    /// commit, so the declaration must travel WITH the run). Empty when the test
496    /// declared none. Inert here — a normal `cargo ktstr test` writes them but
497    /// never gates on them; only the `--noise-adjust` compare consults them (the
498    /// scalar compare warns that declared gates were skipped).
499    ///
500    /// Writer always emits (`"perf_delta_assertions": []` on absence); reader-
501    /// side this `Vec` field is hard-required (non-`Option` fails deserialize on
502    /// absence) — see the module-level doc for the full contract.
503    pub perf_delta_assertions: Vec<PerfDeltaAssertionRecord>,
504}
505
506/// Owned, serialized mirror of [`crate::test_support::PerfDeltaAssertion`]. The
507/// public declaration type uses `&'static str` (so it stays const/E0493-safe on
508/// the entry) and therefore cannot `Deserialize` into an owned value; this
509/// `String`-backed record is the sidecar carrier the perf-delta compare reads.
510/// `pub` because it is a field of the `pub` [`SidecarResult`] (constructed
511/// across the workspace, including by the `cargo-ktstr` binary crate); the
512/// author-facing declaration type is [`crate::test_support::PerfDeltaAssertion`].
513#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
514pub struct PerfDeltaAssertionRecord {
515    /// Registry metric name this assertion gates (see `stats list-metrics`).
516    pub metric: String,
517    /// Pinned regression direction, or `None` to inherit the registry polarity.
518    pub direction: Option<crate::test_support::Polarity>,
519    /// Relative-regression override (percent), or `None` for `default_rel`.
520    pub max_regression_pct: Option<f64>,
521    /// Absolute-materiality override, or `None` for `default_abs`.
522    pub min_abs: Option<f64>,
523    /// Phase scope (`step_index`), or `None` to gate the aggregate value.
524    pub phase: Option<u16>,
525}
526
527impl From<&crate::test_support::PerfDeltaAssertion> for PerfDeltaAssertionRecord {
528    fn from(a: &crate::test_support::PerfDeltaAssertion) -> Self {
529        Self {
530            metric: a.metric().to_string(),
531            direction: a.direction(),
532            max_regression_pct: a.max_regression_pct(),
533            min_abs: a.min_abs(),
534            phase: a.phase(),
535        }
536    }
537}
538
539impl SidecarResult {
540    /// Convenience accessor mirroring
541    /// [`crate::assert::AssertResult::is_pass`]. SidecarResult is the
542    /// wire-format mirror of an AssertResult; this method exposes the
543    /// same is_pass / is_fail / is_skip / is_inconclusive vocabulary
544    /// so consumers can swap between the two without re-learning
545    /// field names.
546    ///
547    /// Returns true only when the run reached a real Pass — neither
548    /// skipped, inconclusive, nor failed. The triple-conjunct guard
549    /// matches AssertResult's `Fail > Inconclusive > Pass > Skip`
550    /// dominance under the strict 4-state mutex this struct encodes.
551    /// CI gates that want "ship-on-pass" semantics call this method
552    /// and only this method.
553    ///
554    /// Part of the `is_pass` / `is_fail` / `is_inconclusive` /
555    /// `is_skip` vocabulary uniform across the verdict surfaces:
556    /// [`crate::assert::AssertResult::is_pass`] / `Self::is_pass` /
557    /// [`crate::assert::Outcome::is_pass`] / `MonitorVerdict::is_pass`
558    /// (in the `monitor` module, which is `pub(crate)`) /
559    /// `Verdict::is_pass` (re-exported at [`crate::assert::Verdict`]) /
560    /// `GauntletRow::is_pass` (in the `stats` module, which is
561    /// `pub(crate)`).
562    pub fn is_pass(&self) -> bool {
563        self.passed && !self.skipped && !self.inconclusive
564    }
565    /// Convenience accessor mirroring
566    /// [`crate::assert::AssertResult::is_fail`]. The four-state
567    /// encoding uses three stored bits `(passed, skipped,
568    /// inconclusive)` in strict mutual exclusion (at most one
569    /// set); Fail is the all-false derived state, no dedicated
570    /// bit. `is_fail` reads "none of the three bits are set",
571    /// which under `Fail > Inconclusive > Pass > Skip` dominance
572    /// correctly resolves a mixed Fail+Inconclusive stream as
573    /// Fail.
574    pub fn is_fail(&self) -> bool {
575        !self.passed && !self.skipped && !self.inconclusive
576    }
577    /// Convenience accessor mirroring
578    /// [`crate::assert::AssertResult::is_skip`].
579    pub fn is_skip(&self) -> bool {
580        self.skipped
581    }
582    /// Convenience accessor mirroring
583    /// [`crate::assert::AssertResult::is_inconclusive`]. True when
584    /// the run could not be evaluated (zero-denominator ratio gate);
585    /// false on real Pass, real Fail, or Skip. CI gates that gate
586    /// on "did we get a real verdict?" should test
587    /// `r.is_pass() || r.is_fail()` and treat both `is_skip()` and
588    /// `is_inconclusive()` as "couldn't measure".
589    pub fn is_inconclusive(&self) -> bool {
590        self.inconclusive
591    }
592}
593
#[cfg(test)]
impl SidecarResult {
    /// Fully populated [`SidecarResult`] for unit tests.
    ///
    /// Every field carries a default, so a call site names only the fields
    /// it wants to vary and fills the rest with struct-update syntax:
    ///
    /// ```ignore
    /// let sc = SidecarResult {
    ///     test_name: "my_test".to_string(),
    ///     passed: false,
    ///     ..SidecarResult::test_fixture()
    /// };
    /// ```
    ///
    /// The defaults model a passing EEVDF run on a minimal `1n1l1c1t`
    /// topology with no payload and no VM telemetry: `test_name="t"`,
    /// `topology="1n1l1c1t"`, `scheduler="eevdf"`,
    /// `work_type="SpinWait"`, `passed=true`, `skipped=false`,
    /// `inconclusive=false`, `expected_failure=false`, `vcpus=1`,
    /// `cpu_budget=1`, both periodic counters `0`, every [`Option`]
    /// `None`, every [`Vec`] empty, `stats` set to
    /// `ScenarioStats::default()`, and `timestamp` / `run_id` empty
    /// strings.
    ///
    /// **Use this instead of a local `base = || SidecarResult { ... }`
    /// closure.** Such a closure duplicates the default set and drifts as
    /// soon as [`SidecarResult`] gains a field; this fixture is the one
    /// place the defaults live.
    ///
    /// **Hash-stability tests must not lean on these defaults for
    /// hash-participating fields** (`topology`, `scheduler`, `payload`,
    /// `work_type`, `sysctls`, `kargs`). A test that pins a
    /// [`sidecar_variant_hash`] output against a literal constant must
    /// spell out every hash-participating field itself, so a later change
    /// to these defaults cannot silently shift the pinned value.
    pub(crate) fn test_fixture() -> SidecarResult {
        Self {
            // Names and the string/list inputs listed above.
            test_name: String::from("t"),
            topology: String::from("1n1l1c1t"),
            scheduler: String::from("eevdf"),
            work_type: String::from("SpinWait"),
            payload: None,
            sysctls: vec![],
            kargs: vec![],

            // Commit / kernel / host / run bookkeeping: all unset or empty.
            scheduler_commit: None,
            resolve_source: None,
            project_commit: None,
            kernel_version: None,
            kernel_commit: None,
            timestamp: String::new(),
            run_id: String::new(),
            host: None,
            run_source: None,

            // Verdict bits: a plain pass that was not expected to fail.
            passed: true,
            skipped: false,
            inconclusive: false,
            expected_failure: false,

            // Measurements and telemetry: default stats, nothing recorded.
            stats: crate::assert::ScenarioStats::default(),
            metrics: vec![],
            perf_delta_assertions: vec![],
            monitor: None,
            periodic_fired: 0,
            periodic_target: 0,
            stimulus_events: vec![],
            verifier_stats: vec![],
            kvm_stats: None,
            cleanup_duration_ms: None,

            // Single-vCPU, single-CPU-budget shape.
            vcpus: 1,
            cpu_budget: 1,
        }
    }
}
665
666/// Predicate: is `path` a ktstr sidecar JSON filename?
667///
668/// True iff the path's extension is `json` AND the path's
669/// FILENAME COMPONENT (`Path::file_name`) contains `.ktstr.` —
670/// matching the on-disk shape produced by [`write_sidecar`]
671/// (`<test>-<variant_hash>.ktstr.json`). Both gates are required:
672/// bare `*.json` files (cargo cache, stray fixtures) and non-json
673/// files whose name happens to contain `.ktstr.` (e.g. a log)
674/// are excluded.
675///
676/// The filename-component check (rather than full-path string)
677/// is load-bearing: a parent directory like
678/// `target/foo.ktstr.bar/extra.json` would falsely match a
679/// whole-path `contains(".ktstr.")` while NOT being a sidecar.
680/// `Path::file_name()` returns only the trailing component, so
681/// `.ktstr.` in any ancestor segment cannot trigger the predicate.
682///
683/// Single source of truth for "is this file a sidecar?" — used
684/// by [`collect_sidecars_with_errors`]'s parsing walker and by
685/// the explain-sidecar file-count walker
686/// (`crate::cli::stats_cmds::explain_sidecar::count_sidecar_files`). Both
687/// walkers MUST agree on the predicate so `walked` (count) and
688/// `valid + errors` (parse outcomes) reconcile against each
689/// other; a divergence would let a file count toward `walked`
690/// without contributing to either bucket, manifesting as a
691/// silent-drop count that has no source.
692pub(crate) fn is_sidecar_filename(path: &std::path::Path) -> bool {
693    path.extension().and_then(|e| e.to_str()) == Some("json")
694        && path
695            .file_name()
696            .and_then(|n| n.to_str())
697            .is_some_and(|n| n.contains(".ktstr."))
698}
699
700/// Scan a directory for ktstr sidecar JSON files. Recurses one level
701/// into subdirectories to handle per-job gauntlet layouts.
702///
703/// Convenience wrapper over [`collect_sidecars_with_errors`] for
704/// single-directory callers that only need the parsed sidecars and not
705/// the per-file parse-failure list. Emits ONE aggregated
706/// [`warn_skipped_sidecars`] summary for that directory when stale
707/// sidecars were dropped. Multi-directory walkers must NOT use this in a
708/// loop (it would print one summary per directory) — they call
709/// [`collect_sidecars_with_errors`] per directory and aggregate the counts
710/// into a single summary (see `collect_pool`).
711pub(crate) fn collect_sidecars(dir: &std::path::Path) -> Vec<SidecarResult> {
712    let (sidecars, parse_errors, _io_errors) = collect_sidecars_with_errors(dir);
713    warn_skipped_sidecars(dir, parse_errors.len());
714    sidecars
715}
716
/// Print one aggregated stderr line for the sidecars a walk skipped; print
/// nothing when `skipped == 0`.
///
/// Sidecars written before a schema field was added fail to deserialize
/// and are dropped (sidecar data is disposable, since re-running
/// regenerates it). Reporting the count once replaces what would
/// otherwise be one `eprintln!` per file. Per-file detail remains
/// available through the parse-error Vec returned by
/// [`collect_sidecars_with_errors`], which
/// `cargo ktstr stats explain-sidecar` renders.
///
/// `dir` is only used for display in the message. The function is
/// `pub(crate)` so multi-directory walkers outside this module (e.g.
/// `stats::analyze::sorted_run_entries`) can sum `parse_errors.len()`
/// across run directories and emit ONE pool-wide summary instead of one
/// per directory.
pub(crate) fn warn_skipped_sidecars(dir: &std::path::Path, skipped: usize) {
    if skipped == 0 {
        return;
    }
    eprintln!(
        "ktstr_test: skipped {skipped} stale sidecar(s) under {} (older \
         schema — re-run the affected tests to regenerate; \
         `cargo ktstr stats explain-sidecar --run <run>` shows per-file \
         detail)",
        dir.display(),
    );
}
740
/// Per-file parse-failure record returned by
/// [`collect_sidecars_with_errors`] and threaded through
/// `crate::cli::WalkStats::errors` to the renderers.
///
/// A named-field struct is used instead of a
/// `(PathBuf, String, Option<String>)` tuple so call sites read
/// `err.path` / `err.raw_error` / `err.enriched_message` by name. That
/// rules out the tuple-position-swap class of bug, where same-typed
/// positional fields can be destructured in the wrong order without any
/// compiler help.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` (all fields are plain
/// owned std types) so records can be printed with `{:?}`, duplicated, and
/// compared directly in `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct SidecarParseError {
    /// On-disk path of the sidecar JSON that failed to parse.
    pub path: std::path::PathBuf,
    /// Verbatim serde-error string. Kept raw for grep-friendly
    /// parse-error tracking and surfaced through the JSON channel as the
    /// `error` key.
    pub raw_error: String,
    /// Operator-facing remediation prose computed by
    /// [`enriched_parse_error_message`]. `Some(...)` for known
    /// schema-drift cases (currently the `host` missing-field pattern),
    /// `None` otherwise. Surfaced through the JSON channel as
    /// `enriched_message`.
    pub enriched_message: Option<String>,
}
765
/// Per-file IO-failure record returned by
/// [`collect_sidecars_with_errors`] and threaded through
/// `crate::cli::WalkStats::io_errors` to the renderers.
///
/// Covers files whose name matched the sidecar predicate but for which
/// `std::fs::read_to_string` failed before parsing could begin
/// (permission denied, mid-rotate truncation, broken symlink, etc.). This
/// is deliberately distinct from [`SidecarParseError`] ("file read OK but
/// JSON parse failed"), so dashboard consumers can triage filesystem
/// incidents apart from schema drift.
///
/// The struct uses named fields mirroring [`SidecarParseError`], so
/// renderers iterate by field name without tuple-position fragility.
/// There is no `enriched_message` field because there is no remediation
/// catalog for IO failures; causes vary per host (fix permissions, fix the
/// filesystem, retry the test).
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` (both fields are plain
/// owned std types) so records can be printed with `{:?}`, duplicated, and
/// compared directly in `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct SidecarIoError {
    /// On-disk path the predicate matched as a sidecar candidate.
    pub path: std::path::PathBuf,
    /// Verbatim `std::io::Error` Display string. Surfaced through the
    /// JSON channel as the `error` key on `crate::cli::WalkIoError`
    /// entries and through the text channel as the `error: ...` line
    /// under the `io errors` trailing block.
    pub raw_error: String,
}
793
/// Test-only doorway to [`enriched_parse_error_message`].
///
/// Lets `cli::tests` exercise the enrichment-pattern logic directly with
/// synthetic error strings. It forwards both arguments unchanged and
/// returns the helper's result as-is. The helper itself stays private so
/// production code keeps going through [`collect_sidecars_with_errors`].
#[cfg(test)]
pub(crate) fn enriched_parse_error_message_for_test(
    path: &std::path::Path,
    raw_error: &str,
) -> Option<String> {
    let enriched = enriched_parse_error_message(path, raw_error);
    enriched
}
806
/// Produce operator-facing remediation prose for a serde parse-error
/// message, when a known pattern applies.
///
/// The only enriched case today is the `host` missing-field schema-drift
/// diagnostic, recognised when `raw_error` contains both `missing field`
/// and `` `host` ``. Any other message yields `None`, so consumers can
/// branch on "enrichment exists" without re-implementing the match. The
/// returned text embeds `path` (via `Display`) and the raw error verbatim.
///
/// The logic lives outside [`collect_sidecars_with_errors`]'s parse path
/// so the prose is computed in one place and stored in the returned
/// [`SidecarParseError`]'s `enriched_message` field; parse failures are
/// surfaced only through that Vec, not through a separate stderr channel.
///
/// Matching on the Display text is deliberate: serde's typed-error
/// surface for `missing field "X"` is not stable across serde_json
/// versions, but the rendered message is, so a forward-compatible check
/// costs one string search per needle.
fn enriched_parse_error_message(path: &std::path::Path, raw_error: &str) -> Option<String> {
    // Both fragments must be present somewhere in the rendered error.
    let host_field_missing = ["missing field", "`host`"]
        .iter()
        .all(|needle| raw_error.contains(*needle));
    if !host_field_missing {
        return None;
    }
    Some(format!(
        "ktstr_test: skipping {}: {raw_error} — the `host` field \
         was added to SidecarResult; pre-1.0 policy is \
         disposable-sidecar: re-run the test to regenerate this \
         file under the current schema (no migration shim exists)",
        path.display(),
    ))
}
837
838/// Scan a directory for ktstr sidecar JSON files, returning the
839/// parsed sidecars, a [`SidecarParseError`] record (named fields
840/// `path`, `raw_error`, `enriched_message`) for every file that
841/// passed the filename predicate but failed to deserialize, and a
842/// [`SidecarIoError`] record (named fields `path`, `raw_error`)
843/// for every file that passed the predicate but whose
844/// `read_to_string` failed before parsing could begin. Recurses
845/// one level into subdirectories to handle per-job gauntlet
846/// layouts.
847///
848/// Parse failures are captured ONLY in the returned parse-errors vec —
849/// this walker no longer logs per file. Each failure is a
850/// [`SidecarParseError`] record (named fields `path`, `raw_error`,
851/// `enriched_message`) for structured callers (`explain-sidecar`'s walker
852/// output). Both raw and enriched are exposed so dashboard consumers can
853/// pick: raw for parse-error grepping, enriched for human-facing
854/// remediation prose. Callers that only need the sidecars aggregate
855/// `parse_errors.len()` and emit one [`warn_skipped_sidecars`] summary
856/// (see [`collect_sidecars`] / `collect_pool`) rather than one line per
857/// file.
858///
859/// IO failures (third return) get a single eprintln line plus a
860/// structured [`SidecarIoError`] record. Distinguished from
861/// parse failures so dashboard consumers can triage filesystem
862/// incidents (permission denied, mid-rotate truncation, broken
863/// symlink) apart from schema drift. With this third channel,
864/// every predicate-matching file lands in exactly one of the
865/// three returned vecs — the prior implicit
866/// `walked - valid - parse_errors.len()` silent-drop count is
867/// now zero by construction.
868///
869/// Callers that don't need structured errors should use
870/// [`collect_sidecars`].
871pub(crate) fn collect_sidecars_with_errors(
872    dir: &std::path::Path,
873) -> (
874    Vec<SidecarResult>,
875    Vec<SidecarParseError>,
876    Vec<SidecarIoError>,
877) {
878    let mut sidecars = Vec::new();
879    let mut parse_errors: Vec<SidecarParseError> = Vec::new();
880    let mut io_errors: Vec<SidecarIoError> = Vec::new();
881    let entries = match std::fs::read_dir(dir) {
882        Ok(e) => e,
883        Err(e) => {
884            tracing::warn!(
885                dir = %dir.display(),
886                error = %e,
887                "ktstr_test: collect_sidecars_with_errors cannot read root dir",
888            );
889            return (sidecars, parse_errors, io_errors);
890        }
891    };
892    let mut subdirs = Vec::new();
893    let try_load = |path: &std::path::Path,
894                    out: &mut Vec<SidecarResult>,
895                    parse_errs: &mut Vec<SidecarParseError>,
896                    io_errs: &mut Vec<SidecarIoError>| {
897        if !is_sidecar_filename(path) {
898            return;
899        }
900        let data = match std::fs::read_to_string(path) {
901            Ok(d) => d,
902            Err(e) => {
903                let raw = e.to_string();
904                eprintln!("ktstr_test: cannot read {}: {raw}", path.display());
905                io_errs.push(SidecarIoError {
906                    path: path.to_path_buf(),
907                    raw_error: raw,
908                });
909                return;
910            }
911        };
912        match serde_json::from_str::<SidecarResult>(&data) {
913            Ok(sc) => out.push(sc),
914            Err(e) => {
915                let raw = e.to_string();
916                let enriched = enriched_parse_error_message(path, &raw);
917                // Capture (do not log) the per-file skip: callers emit one
918                // aggregated `warn_skipped_sidecars` summary so a directory
919                // of stale sidecars produces a single line, not a flood.
920                // `cargo ktstr stats explain-sidecar --run <run>` renders the
921                // per-file detail (raw + enriched remediation) from this Vec.
922                parse_errs.push(SidecarParseError {
923                    path: path.to_path_buf(),
924                    raw_error: raw,
925                    enriched_message: enriched,
926                });
927            }
928        }
929    };
930    for entry in entries {
931        let entry = match entry {
932            Ok(e) => e,
933            Err(e) => {
934                tracing::warn!(
935                    dir = %dir.display(),
936                    error = %e,
937                    "ktstr_test: skipping unreadable DirEntry while collecting sidecars",
938                );
939                continue;
940            }
941        };
942        let path = entry.path();
943        if path.is_dir() {
944            subdirs.push(path);
945            continue;
946        }
947        try_load(&path, &mut sidecars, &mut parse_errors, &mut io_errors);
948    }
949    for sub in subdirs {
950        let sub_entries = match std::fs::read_dir(&sub) {
951            Ok(e) => e,
952            Err(e) => {
953                tracing::warn!(
954                    subdir = %sub.display(),
955                    error = %e,
956                    "ktstr_test: skipping unreadable subdirectory while collecting sidecars",
957                );
958                continue;
959            }
960        };
961        for entry in sub_entries {
962            let entry = match entry {
963                Ok(e) => e,
964                Err(e) => {
965                    tracing::warn!(
966                        subdir = %sub.display(),
967                        error = %e,
968                        "ktstr_test: skipping unreadable DirEntry in sidecar subdirectory",
969                    );
970                    continue;
971                }
972            };
973            try_load(
974                &entry.path(),
975                &mut sidecars,
976                &mut parse_errors,
977                &mut io_errors,
978            );
979        }
980    }
981    (sidecars, parse_errors, io_errors)
982}
983
984/// Pool every sidecar JSON under every run directory at `root`.
985///
986/// Walks each immediate subdirectory of `root` (one per run, named
987/// `{kernel}-{project_commit}` by [`sidecar_dir`] where
988/// `{project_commit}` is the project tree's HEAD short hex with
989/// `-dirty` suffix when the worktree differs from HEAD) and
990/// concatenates the sidecars each one yields via
991/// `collect_sidecars_with_errors` (per directory, so the per-directory
992/// stale-sidecar skip counts aggregate into one pool-wide summary). The
993/// result is a flat
994/// `Vec<SidecarResult>` covering every recorded run on disk —
995/// `cargo ktstr perf-delta`'s pool-driven sourcing reads it
996/// once, applies the typed `--a-*` / `--b-*` filters in memory,
997/// and partitions the survivors into A/B sides.
998///
999/// `root` is typically [`runs_root`]; pass an alternate path when
1000/// comparing archived sidecar trees copied off a CI host (the
1001/// `--dir` escape hatch on `perf-delta`).
1002///
1003/// Returns an empty Vec when `root` does not exist or contains no
1004/// run directories. Per-run failure (a corrupt sidecar, a partial
1005/// directory) is counted and skipped — pool-collection never aborts
1006/// on a single bad file, and emits ONE aggregated
1007/// `warn_skipped_sidecars` summary for the whole walk rather than a
1008/// per-file line.
1009///
1010/// Performance: this is a full filesystem walk over `root`. On a
1011/// host with many archived runs (dozens to hundreds), each
1012/// invocation re-reads every sidecar JSON. The cost is acceptable
1013/// for the current operator workflow (one comparison per
1014/// session) but is taskifyable if it becomes a hot path — a
1015/// directory-name fast-path could skip runs whose
1016/// `{kernel}-{project_commit}` prefix does not match the active
1017/// `--a-kernel` / `--b-kernel` filter.
1018pub fn collect_pool(root: &std::path::Path) -> Vec<SidecarResult> {
1019    let entries = match std::fs::read_dir(root) {
1020        Ok(e) => e,
1021        Err(e) => {
1022            tracing::warn!(
1023                root = %root.display(),
1024                error = %e,
1025                "ktstr_test: collect_pool cannot read root; returning empty pool",
1026            );
1027            return Vec::new();
1028        }
1029    };
1030    let mut pool = Vec::new();
1031    let mut skipped = 0usize;
1032    for entry in entries {
1033        let entry = match entry {
1034            Ok(e) => e,
1035            Err(e) => {
1036                tracing::warn!(
1037                    root = %root.display(),
1038                    error = %e,
1039                    "ktstr_test: skipping unreadable DirEntry while collecting pool",
1040                );
1041                continue;
1042            }
1043        };
1044        let path = entry.path();
1045        if path.is_dir() {
1046            // `collect_sidecars_with_errors` already handles "one level of
1047            // subdirectories for per-job gauntlet layouts" inside each run
1048            // directory, so the two-level `{root}/{run_dir}/{job_subdir}`
1049            // shape works without a third walker level. Use the
1050            // error-returning variant (not `collect_sidecars`, which emits
1051            // its own per-directory summary) so the skip counts aggregate
1052            // into ONE pool-wide summary below.
1053            let (sidecars, parse_errors, _io_errors) = collect_sidecars_with_errors(&path);
1054            pool.extend(sidecars);
1055            skipped += parse_errors.len();
1056        }
1057    }
1058    warn_skipped_sidecars(root, skipped);
1059    pool
1060}
1061
/// Instruction budget of the BPF verifier (`BPF_COMPLEXITY_LIMIT_INSNS`):
/// one million verified instructions.
const VERIFIER_INSN_LIMIT: u32 = 1_000_000;

/// Share of [`VERIFIER_INSN_LIMIT`], in percent, at or above which a
/// program is flagged as being near the limit.
const VERIFIER_WARN_PCT: f64 = 75.0;
1067
1068/// Aggregate BPF verifier stats across sidecars into a summary table.
1069///
1070/// verified_insns is deterministic for a given binary, so per-program
1071/// values are deduplicated (max across observations). Flags programs
1072/// using >=75% of the 1M verifier complexity limit.
1073pub(crate) fn format_verifier_stats(sidecars: &[SidecarResult]) -> String {
1074    use std::collections::BTreeMap;
1075
1076    let mut by_name: BTreeMap<&str, u32> = BTreeMap::new();
1077    for sc in sidecars {
1078        for info in &sc.verifier_stats {
1079            let entry = by_name.entry(&info.name).or_insert(0);
1080            *entry = (*entry).max(info.verified_insns);
1081        }
1082    }
1083
1084    if by_name.is_empty() {
1085        return String::new();
1086    }
1087
1088    let mut out = String::from("\n=== BPF VERIFIER STATS ===\n\n");
1089    out.push_str(&format!(
1090        "  {:<24} {:>12} {:>8}\n",
1091        "program", "verified", "limit%"
1092    ));
1093    out.push_str(&format!("  {:-<24} {:-<12} {:-<8}\n", "", "", ""));
1094
1095    let mut warnings = Vec::new();
1096    let mut total: u64 = 0;
1097
1098    for (&name, &verified_insns) in &by_name {
1099        let pct = (verified_insns as f64 / VERIFIER_INSN_LIMIT as f64) * 100.0;
1100        let flag = if pct >= VERIFIER_WARN_PCT { " !" } else { "" };
1101        out.push_str(&format!(
1102            "  {:<24} {:>12} {:>7.1}%{flag}\n",
1103            name, verified_insns, pct,
1104        ));
1105        if pct >= VERIFIER_WARN_PCT {
1106            warnings.push(format!(
1107                "  {name}: {pct:.1}% of 1M limit ({verified_insns} verified insns)",
1108            ));
1109        }
1110        total += verified_insns as u64;
1111    }
1112
1113    out.push_str(&format!("\n  total verified insns: {total}\n"));
1114
1115    if !warnings.is_empty() {
1116        out.push_str("\nWARNING: programs near verifier complexity limit:\n");
1117        for w in &warnings {
1118            out.push_str(w);
1119            out.push('\n');
1120        }
1121    }
1122
1123    out
1124}
1125
1126/// Per-test BPF callback profile from monitor prog_stats_deltas.
1127///
1128/// Shows per-program invocation count, total CPU time, and average
1129/// nanoseconds per call. Each test's profile is printed independently.
1130pub(crate) fn format_callback_profile(sidecars: &[SidecarResult]) -> String {
1131    let mut out = String::new();
1132
1133    for sc in sidecars {
1134        let deltas = match sc
1135            .monitor
1136            .as_ref()
1137            .and_then(|m| m.prog_stats_deltas.as_ref())
1138        {
1139            Some(d) if !d.is_empty() => d,
1140            _ => continue,
1141        };
1142
1143        if out.is_empty() {
1144            out.push_str("\n=== BPF CALLBACK PROFILE ===\n");
1145        }
1146        out.push_str(&format!("\n  {} ({}):\n", sc.test_name, sc.topology));
1147        out.push_str(&format!(
1148            "    {:<24} {:>12} {:>14} {:>12}\n",
1149            "program", "cnt", "total_ns", "avg_ns"
1150        ));
1151        out.push_str(&format!(
1152            "    {:-<24} {:-<12} {:-<14} {:-<12}\n",
1153            "", "", "", ""
1154        ));
1155        for d in deltas {
1156            out.push_str(&format!(
1157                "    {:<24} {:>12} {:>14} {:>12.0}\n",
1158                d.name, d.cnt, d.nsecs, d.nsecs_per_call,
1159            ));
1160        }
1161    }
1162
1163    out
1164}
1165
1166/// Aggregate KVM stats across sidecars into a compact summary.
1167///
1168/// Averages each stat across all tests that returned `Some(KvmStatsTotals)`.
1169/// Tests without KVM stats (non-VM tests, old kernels) are excluded
1170/// from the denominator.
1171pub(crate) fn format_kvm_stats(sidecars: &[SidecarResult]) -> String {
1172    let with_stats: Vec<&crate::vmm::KvmStatsTotals> = sidecars
1173        .iter()
1174        .filter_map(|sc| sc.kvm_stats.as_ref())
1175        .collect();
1176
1177    if with_stats.is_empty() {
1178        return String::new();
1179    }
1180
1181    let n_vms = with_stats.len();
1182
1183    // Compute cross-VM averages for each stat.
1184    let vm_avg = |name: &str| -> u64 {
1185        let sum: u64 = with_stats.iter().map(|d| d.avg(name)).sum();
1186        sum / n_vms as u64
1187    };
1188
1189    let exits = vm_avg("exits");
1190    let halt = vm_avg("halt_exits");
1191    let halt_wait_ns = vm_avg("halt_wait_ns");
1192    let preempted = vm_avg("preemption_reported");
1193    let signal = vm_avg("signal_exits");
1194    let hypercalls = vm_avg("hypercalls");
1195
1196    // Halt poll efficiency across all vCPUs and VMs.
1197    let total_poll_ok: u64 = with_stats
1198        .iter()
1199        .map(|d| d.sum("halt_successful_poll"))
1200        .sum();
1201    let total_poll_try: u64 = with_stats
1202        .iter()
1203        .map(|d| d.sum("halt_attempted_poll"))
1204        .sum();
1205
1206    if exits == 0 {
1207        return String::new();
1208    }
1209
1210    let halt_wait_ms = halt_wait_ns as f64 / 1_000_000.0;
1211    let poll_pct = if total_poll_try > 0 {
1212        (total_poll_ok as f64 / total_poll_try as f64) * 100.0
1213    } else {
1214        0.0
1215    };
1216
1217    let mut out = format!("\n=== KVM STATS (avg across {n_vms} VMs) ===\n\n");
1218    out.push_str(&format!(
1219        "  exits/vcpu  {:>7}   halt/vcpu     {:>5}   halt_wait_ms {:>7.1}\n",
1220        exits, halt, halt_wait_ms,
1221    ));
1222    out.push_str(&format!(
1223        "  poll_ok%    {:>6.1}%   preempted/vcpu {:>4}   signal/vcpu  {:>7}\n",
1224        poll_pct, preempted, signal,
1225    ));
1226    if hypercalls > 0 {
1227        out.push_str(&format!("  hypercalls/vcpu {:>4}\n", hypercalls));
1228    }
1229
1230    // Trust warnings.
1231    if preempted > 0 {
1232        let total: u64 = with_stats
1233            .iter()
1234            .map(|d| d.sum("preemption_reported"))
1235            .sum();
1236        out.push_str(&format!(
1237            "\n  WARNING: {total} host preemptions detected \
1238             -- timing results may be unreliable\n",
1239        ));
1240    }
1241
1242    out
1243}
1244
1245/// Resolve the sidecar output directory for the current test process.
1246///
1247/// Override: `KTSTR_SIDECAR_DIR` (used as-is when non-empty). When
1248/// the override is set, `serialize_and_write_sidecar` ALSO skips
1249/// the per-directory pre-clear so any pre-existing sidecars in
1250/// the operator-chosen directory are preserved verbatim — see
1251/// `sidecar_dir_override`.
1252///
1253/// Default: `{CARGO_TARGET_DIR or "target"}/ktstr/{kernel}-{project_commit}/`,
1254/// where `{kernel}` is the version detected from `KTSTR_KERNEL`'s
1255/// metadata (or `"unknown"` when no kernel is set / detection fails)
1256/// and `{project_commit}` is the project-tree HEAD short hex from
1257/// `detect_project_commit` (with `-dirty` suffix when the worktree
1258/// differs from HEAD), or `"unknown"` when the test process is not
1259/// running inside a git repository or the probe fails. Every sidecar
1260/// written from the same `cargo ktstr test` invocation lands in the
1261/// same directory; two runs sharing the same kernel + project commit
1262/// (e.g. re-running the same suite without committing changes) reuse
1263/// the same directory, with the second run pre-clearing any
1264/// `*.ktstr.json` files left by the first via
1265/// `pre_clear_run_dir_once` — the directory is a last-writer-wins
1266/// snapshot keyed on (kernel, project commit), not an append-only
1267/// archive of every invocation.
1268pub fn sidecar_dir() -> PathBuf {
1269    sidecar_dir_override().unwrap_or_else(resolve_default_sidecar_dir)
1270}
1271
1272/// Compute the default-path sidecar directory:
1273/// `{runs_root}/{kernel}-{project_commit}` where `{kernel}` and
1274/// `{project_commit}` come from [`detect_kernel_version`] and
1275/// [`detect_project_commit`] respectively, with `"unknown"`
1276/// substituted via [`format_run_dirname`] when either probe
1277/// returns `None`. Emits the one-shot
1278/// [`warn_unknown_project_commit_once`] stderr warning when the
1279/// project commit probe falls back to `"unknown"` (operators in
1280/// this state lose the per-commit run-directory discriminator).
1281///
1282/// Shared by [`sidecar_dir`] and the default-path branch of
1283/// [`serialize_and_write_sidecar`] so both call sites resolve the
1284/// same kernel/commit/warn/format chain through one place.
1285/// `serialize_and_write_sidecar` cannot call [`sidecar_dir`]
1286/// directly because it needs a single-read of
1287/// [`sidecar_dir_override`] (gated against the env-var flipping
1288/// mid-call between the dir-resolve and the pre-clear gate); the
1289/// helper supplies the default-branch body so the override read
1290/// stays at one site.
1291fn resolve_default_sidecar_dir() -> PathBuf {
1292    let kernel = detect_kernel_version();
1293    let commit = detect_project_commit();
1294    if commit.is_none() {
1295        warn_unknown_project_commit_once();
1296    }
1297    runs_root().join(format_run_dirname(kernel.as_deref(), commit.as_deref()))
1298}
1299
/// Render the run-directory leaf name `{kernel}-{project_commit}`.
///
/// Either component may be `None`, in which case that slot becomes
/// the literal `"unknown"`: a non-git cwd yields `"{kernel}-unknown"`
/// and a missing kernel yields `"unknown-{project_commit}"`. The
/// function is pure (no I/O), so unit tests can pin every shape
/// (clean, dirty, missing-kernel, missing-commit, both missing)
/// without touching the [`detect_kernel_version`] /
/// [`detect_project_commit`] OnceLocks.
///
/// SENTINEL ASYMMETRY: only the on-disk dirname uses `"unknown"`.
/// The in-memory [`SidecarResult::project_commit`] /
/// [`SidecarResult::kernel_version`] fields stay `None` (`null` in
/// JSON), so a `project_commit` filter for a specific commit will
/// NOT match a sidecar whose `project_commit` is `None` — omit the
/// filter to include `None`-commit rows. This is deliberate: the
/// dirname needs a filesystem-safe sentinel, while the JSON field
/// keeps the original probe outcome for downstream tooling that
/// distinguishes "no probe ran" from "probe ran but found nothing."
fn format_run_dirname(kernel: Option<&str>, commit: Option<&str>) -> String {
    const MISSING: &str = "unknown";
    [kernel.unwrap_or(MISSING), commit.unwrap_or(MISSING)].join("-")
}
1324
1325/// Resolve the parent directory that holds all test-run subdirectories.
1326///
1327/// Resolution order:
1328/// 1. [`crate::KTSTR_RUNS_ROOT_ENV`] (absolute) — the `cargo ktstr`
1329///    orchestrator stamps this once at startup so its footer / `stats`
1330///    / `replay` reads AND the child test processes' sidecar writes
1331///    resolve the SAME directory regardless of CWD. This is the
1332///    primary path under `cargo ktstr`.
1333/// 2. `{CARGO_TARGET_DIR}/ktstr` when that env is set non-empty.
1334/// 3. `target/ktstr` (CWD-relative) — the raw `cargo nextest run`
1335///    fallback. CWD-relative is fragile across a Cargo workspace (the
1336///    test binary's CWD is the package dir, which differs from a
1337///    workspace-root invocation), which is exactly why the
1338///    orchestrator pins the absolute override above; raw nextest has
1339///    no footer to mismatch, so the fallback is acceptable there.
1340///
1341/// Used by `cargo ktstr stats` / `replay` and the post-run footer to
1342/// enumerate runs without reconstructing a specific run key.
1343pub fn runs_root() -> PathBuf {
1344    if let Some(root) = std::env::var_os(crate::KTSTR_RUNS_ROOT_ENV).filter(|v| !v.is_empty()) {
1345        return PathBuf::from(root);
1346    }
1347    let target = std::env::var("CARGO_TARGET_DIR")
1348        .ok()
1349        .filter(|d| !d.is_empty())
1350        .map(PathBuf::from)
1351        .unwrap_or_else(|| PathBuf::from("target"));
1352    target.join("ktstr")
1353}
1354
/// Decide whether `entry` is a candidate run directory under
/// [`runs_root`].
///
/// Returns `true` only when the entry's path is a directory and its
/// name does not start with a `.` byte. Rejecting dot-prefixed names
/// keeps the flock sentinel subdirectory
/// ([`crate::flock::LOCK_DIR_NAME`] = `.locks`), along with any other
/// operator-created or filesystem-reserved dot directories, out of
/// the run-listing walkers ([`newest_run_dir`] here,
/// `sorted_run_entries` in `crate::stats`), so lock infrastructure
/// neither shows up in `cargo ktstr stats list` nor claims the "most
/// recent run" bucket.
///
/// The first byte is inspected through `as_encoded_bytes`, which
/// needs no UTF-8 conversion and therefore behaves predictably for
/// non-UTF-8 names that would trip a `to_str().starts_with('.')`
/// chain.
///
/// This is the single source of truth for "is this a run-dir
/// entry?": both run-listing call sites must use it so relocating
/// `.locks/` (or reserving another dot directory) is a one-place
/// change.
pub(crate) fn is_run_directory(entry: &std::fs::DirEntry) -> bool {
    if !entry.path().is_dir() {
        return false;
    }
    let leaf = entry.file_name();
    !matches!(leaf.as_encoded_bytes().first(), Some(b'.'))
}
1384
1385/// Find the most recently modified run directory under [`runs_root`].
1386///
1387/// Used by bare `cargo ktstr stats` (no subcommand) when
1388/// `KTSTR_SIDECAR_DIR` isn't set: the stats command doesn't itself
1389/// run a kernel, so it can't reconstruct the
1390/// `{kernel}-{project_commit}` key that the test process used.
1391/// Picking the newest subdirectory by mtime mirrors "show me the
1392/// report from my last test run."
1393///
1394/// Dotfile-prefixed entries (notably the flock sentinel
1395/// subdirectory `.locks/`) are excluded via `is_run_directory`
1396/// so the lock infrastructure cannot claim the "most recent
1397/// run" bucket — `.locks/`'s mtime tracks per-write flock
1398/// activity and would otherwise eclipse the actual newest run
1399/// dir on every default-path sidecar write.
1400pub fn newest_run_dir() -> Option<PathBuf> {
1401    let root = runs_root();
1402    let entries = std::fs::read_dir(&root).ok()?;
1403    entries
1404        .filter_map(|e| e.ok())
1405        .filter(is_run_directory)
1406        .max_by_key(|e| e.metadata().and_then(|m| m.modified()).ok())
1407        .map(|e| e.path())
1408}
1409
/// The on-disk artifacts one failed test left in a single run
/// directory, as shown by the `cargo ktstr test` post-run footer.
///
/// `scheduler` / `topology` are read from a FAILING variant's
/// `.ktstr.json` sidecar. They are `None` when the test failed BEFORE
/// any sidecar was written — for example a scheduler BPF-load failure
/// that only produced a placeholder `.failure-dump.json` via
/// [`crate::test_support::eval`] and never reached [`write_sidecar`].
/// Every `Option` path is `Some` only if that artifact exists AND was
/// written during the current run (the mtime gate in
/// [`summarize_run_artifacts`]).
pub(crate) struct FailedTest {
    /// Bare test function name; this is the artifact filename prefix.
    pub(crate) test_name: String,
    /// Scheduler under test, taken from a FAILING variant's
    /// `.ktstr.json` sidecar. `None` for a dump-only pre-sidecar
    /// failure, where no variant sidecar recorded a failure.
    pub(crate) scheduler: Option<String>,
    /// Topology label from the same failing variant as `scheduler`,
    /// and `None` under the same condition. When a gauntlet test has
    /// several failing variants this names one representative; the
    /// complete per-variant set is in `stats_sidecars`.
    pub(crate) topology: Option<String>,
    /// A `{test}-{variant_hash}.failure-dump.json` path. The slot is
    /// single: when several variants left a dump, only the one the
    /// run-dir scan kept is recorded. The fail signal is derived from
    /// the per-variant `dump_hashes` set, not from this path.
    pub(crate) failure_dump: Option<PathBuf>,
    /// A `{test}-{variant_hash}.repro.failure-dump.json` path
    /// (auto-repro retry); single-slot like `failure_dump`.
    pub(crate) repro_failure_dump: Option<PathBuf>,
    /// All `{test}-{variant_hash}.ktstr.json` stats sidecars of this
    /// test, sorted — one per gauntlet variant, since distinct
    /// variant hashes coexist. Empty for a dump-only failure.
    pub(crate) stats_sidecars: Vec<PathBuf>,
    /// A `{test}-{variant_hash}.wprof.pb` trace.
    pub(crate) wprof: Option<PathBuf>,
    /// A `{test}-{variant_hash}.repro.wprof.pb` trace (auto-repro
    /// retry).
    pub(crate) repro_wprof: Option<PathBuf>,
    /// `true` when ANY variant sidecar of this test is `is_fail()`,
    /// which is what `cargo ktstr replay --filter <name>` selects
    /// from (see `replay.rs::select_failed_names`), so replay will
    /// re-run it. `false` for dump-only failures: with no sidecar,
    /// replay's pool selection finds nothing.
    pub(crate) replayable: bool,
}
1456
1457/// Per-run-directory artifact summary for the post-run footer.
1458pub(crate) struct RunDirSummary {
1459    /// The `{runs_root}/{kernel}-{project_commit}` run directory.
1460    pub(crate) dir: PathBuf,
1461    /// Failed tests in this dir, ordered by `test_name`.
1462    pub(crate) failed: Vec<FailedTest>,
1463    /// Count of `.ktstr.json` stats sidecars written this run
1464    /// (every executed VM test that reached [`write_sidecar`],
1465    /// pass or fail).
1466    pub(crate) stats_sidecars: usize,
1467    /// Count of `.wprof.pb` traces written this run (excludes the
1468    /// `.repro.wprof.pb` auto-repro variant).
1469    pub(crate) wprof_traces: usize,
1470}
1471
/// The five per-test artifact shapes a run directory holds.
///
/// Each variant corresponds to one filename suffix recognised by
/// `classify_run_artifact`. The enum is fieldless, so it derives the
/// common value traits to make it cheap to copy and directly usable
/// in equality assertions and debug output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RunArtifactKind {
    /// `*.failure-dump.json`
    FailureDump,
    /// `*.repro.failure-dump.json` (auto-repro retry)
    ReproFailureDump,
    /// `*.ktstr.json`
    StatsSidecar,
    /// `*.wprof.pb`
    Wprof,
    /// `*.repro.wprof.pb` (auto-repro retry)
    ReproWprof,
}
1480
/// Split a `{test}-{16-hex variant hash}` stem into `(test, hash)`.
///
/// Test function names are Rust identifiers (never contain `-`), so the
/// LAST `-` is the variant-hash separator. The trailing token counts
/// as a hash only when it is exactly 16 ASCII hex digits. The digit
/// check is explicit because `u64::from_str_radix` also accepts a
/// leading `+` sign, which would otherwise let a 16-character token
/// such as `+0000000deadbeef` pass as a hash.
///
/// Falls back to `(stem, 0)` when the trailing token is not a valid
/// 16-hex hash — so a NON-variant dump (a stale pre-variant-keying
/// file, or a future writer that omits the hash) still classifies by
/// its full prefix instead of vanishing (the "no silent drops" rule).
/// The mtime gate already excludes stale prior-run files; the
/// fallback removes the silent-drop risk entirely.
fn split_variant_stem(stem: &str) -> (&str, u64) {
    stem.rsplit_once('-')
        .filter(|(_, hash)| hash.len() == 16 && hash.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|(test, hash)| {
            // 16 hex digits always fit in a u64; `.ok()` is defensive.
            u64::from_str_radix(hash, 16).ok().map(|h| (test, h))
        })
        .unwrap_or((stem, 0))
}
1500
1501/// Parse a run-directory filename into `(test_name, variant_hash, kind)`.
1502///
1503/// Returns `None` for filenames that are not a recognized per-test
1504/// artifact — `.ktstr.json.tmp.<pid>.<run_id>` atomic-write staging
1505/// residue, stray non-ktstr files, or a `.ktstr.json` whose stem
1506/// lacks the `-{16-hex variant hash}` suffix [`write_sidecar`]
1507/// always appends.
1508///
1509/// The `variant_hash` lets the footer correlate each artifact with the
1510/// SAME-variant sidecar (a gauntlet test's per-preset dumps + sidecars
1511/// carry distinct hashes): a failure dump whose variant has no parsed
1512/// sidecar is a per-variant pre-sidecar failure even when a sibling
1513/// preset passed. failure-dump / wprof names fall back to `(stem, 0)`
1514/// when un-hashed (see [`split_variant_stem`]); a `.ktstr.json` sidecar
1515/// is ALWAYS variant-keyed by [`write_sidecar`], so a non-hashed one is
1516/// malformed and is dropped (`None`).
1517///
1518/// Suffix order is load-bearing: the `.repro.` shapes are checked
1519/// BEFORE their bare counterparts so `{test}-{hash}.repro.failure-dump.json`
1520/// classifies as [`RunArtifactKind::ReproFailureDump`] with
1521/// `test_name = {test}` rather than the bare-`.failure-dump.json`
1522/// branch stripping less and yielding `{test}-{hash}.repro`.
1523fn classify_run_artifact(name: &str) -> Option<(&str, u64, RunArtifactKind)> {
1524    if let Some(stem) = name.strip_suffix(".repro.failure-dump.json") {
1525        let (test, hash) = split_variant_stem(stem);
1526        return Some((test, hash, RunArtifactKind::ReproFailureDump));
1527    }
1528    if let Some(stem) = name.strip_suffix(".failure-dump.json") {
1529        let (test, hash) = split_variant_stem(stem);
1530        return Some((test, hash, RunArtifactKind::FailureDump));
1531    }
1532    if let Some(stem) = name.strip_suffix(".repro.wprof.pb") {
1533        let (test, hash) = split_variant_stem(stem);
1534        return Some((test, hash, RunArtifactKind::ReproWprof));
1535    }
1536    if let Some(stem) = name.strip_suffix(".wprof.pb") {
1537        let (test, hash) = split_variant_stem(stem);
1538        return Some((test, hash, RunArtifactKind::Wprof));
1539    }
1540    if let Some(stem) = name.strip_suffix(".ktstr.json") {
1541        // A sidecar is ALWAYS `{test}-{16-hex}` ({:016x} in
1542        // serialize_and_write_sidecar). A stem without a valid hash
1543        // suffix is a hand-named / malformed file — drop it (unlike the
1544        // dump arms, there's no un-hashed-sidecar writer to be lenient
1545        // for).
1546        let (test, hash) = stem.rsplit_once('-')?;
1547        if hash.len() == 16
1548            && let Ok(h) = u64::from_str_radix(hash, 16)
1549        {
1550            return Some((test, h, RunArtifactKind::StatsSidecar));
1551        }
1552    }
1553    None
1554}
1555
1556/// Summarize the per-test artifacts a single run directory holds,
1557/// counting only files written at or after `since`.
1558///
1559/// The mtime gate is the freshness boundary: a run directory is
1560/// keyed `{kernel}-{project_commit}` (see [`sidecar_dir`]), so
1561/// re-running the same suite reuses the directory, and
1562/// [`pre_clear_run_dir_once`] wipes only `*.ktstr.json` — stale
1563/// `*.failure-dump.json` / `*.wprof.pb` from an earlier run linger.
1564/// Filtering on `mtime >= since` (where `since` is captured before
1565/// the nextest build+run begins, so genuine artifacts — written
1566/// after the build — sort comfortably after it) keeps a stale dump
1567/// from a prior run from surfacing as a current failure.
1568///
1569/// Returns `None` when the directory holds no fresh artifacts (it
1570/// belongs to an earlier run, or cannot be read).
1571fn summarize_one_run_dir(
1572    dir: &std::path::Path,
1573    since: std::time::SystemTime,
1574) -> Option<RunDirSummary> {
1575    use std::collections::{BTreeMap, BTreeSet};
1576    #[derive(Default)]
1577    struct Acc {
1578        // The writer names these per-variant (hashed, `{test}-{hash}.…`);
1579        // each is a single Option collapsed to whichever variant the
1580        // read_dir scan classified last (unsorted order). The fail
1581        // signal below does NOT rely on them — it keys off the
1582        // per-variant `dump_hashes` set.
1583        failure_dump: Option<PathBuf>,
1584        repro_failure_dump: Option<PathBuf>,
1585        wprof: Option<PathBuf>,
1586        repro_wprof: Option<PathBuf>,
1587        // EVERY variant's stats sidecar (distinct variant-hash
1588        // filenames coexist), so a passing variant cannot mask a
1589        // failing sibling.
1590        stats_sidecars: Vec<PathBuf>,
1591        // OR of `is_fail` across all of this name's variant sidecars.
1592        // Post-finalize the sidecar carries the FINAL (post-inversion)
1593        // verdict, so a passing expect_err / expect_auto_repro test
1594        // reads `false` here even though its scenario failed.
1595        any_fail: bool,
1596        // Variant hashes whose stats sidecar PARSED, and variant hashes
1597        // that left a failure dump. The gate is PER-VARIANT: a dump whose
1598        // variant has no parsed sidecar is a pre-sidecar failure
1599        // (scheduler load / VM boot crash) for THAT preset and flags
1600        // FAILED even when a sibling preset's sidecar parsed; a dump whose
1601        // variant DID parse a (final, non-failing) sidecar is an
1602        // expected-failure run whose dump must NOT flag — the sidecar's
1603        // finalized verdict already classified it. (A gauntlet test's
1604        // per-preset dumps + sidecars carry distinct variant hashes.)
1605        parsed_sidecar_hashes: BTreeSet<u64>,
1606        dump_hashes: BTreeSet<u64>,
1607        // (scheduler, topology) of the FIRST failing variant seen,
1608        // for the FAILED block header; `None` when no variant sidecar
1609        // parsed as a failure (a dump-only pre-sidecar failure).
1610        fail_variant: Option<(String, String)>,
1611    }
1612    let entries = std::fs::read_dir(dir).ok()?;
1613    let mut by_test: BTreeMap<String, Acc> = BTreeMap::new();
1614    let mut stats_sidecars = 0usize;
1615    let mut wprof_traces = 0usize;
1616    for entry in entries.flatten() {
1617        let Ok(meta) = entry.metadata() else {
1618            continue;
1619        };
1620        if !meta.is_file() {
1621            continue;
1622        }
1623        match meta.modified() {
1624            Ok(m) if m >= since => {}
1625            _ => continue,
1626        }
1627        let path = entry.path();
1628        let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1629            continue;
1630        };
1631        let Some((test, variant_hash, kind)) = classify_run_artifact(name) else {
1632            continue;
1633        };
1634        let acc = by_test.entry(test.to_string()).or_default();
1635        match kind {
1636            RunArtifactKind::FailureDump => {
1637                acc.dump_hashes.insert(variant_hash);
1638                acc.failure_dump = Some(path);
1639            }
1640            RunArtifactKind::ReproFailureDump => {
1641                acc.dump_hashes.insert(variant_hash);
1642                acc.repro_failure_dump = Some(path);
1643            }
1644            RunArtifactKind::Wprof => {
1645                wprof_traces += 1;
1646                acc.wprof = Some(path);
1647            }
1648            RunArtifactKind::ReproWprof => acc.repro_wprof = Some(path),
1649            RunArtifactKind::StatsSidecar => {
1650                stats_sidecars += 1;
1651                // Accumulate EVERY variant (never overwrite by bare
1652                // name) and OR the fail signal, so one gauntlet
1653                // variant's pass cannot mask a failing sibling.
1654                match std::fs::read_to_string(&path)
1655                    .ok()
1656                    .and_then(|s| serde_json::from_str::<SidecarResult>(&s).ok())
1657                {
1658                    Some(sc) => {
1659                        acc.parsed_sidecar_hashes.insert(variant_hash);
1660                        if sc.is_fail() {
1661                            acc.any_fail = true;
1662                            if acc.fail_variant.is_none() {
1663                                acc.fail_variant = Some((sc.scheduler, sc.topology));
1664                            }
1665                        }
1666                    }
1667                    None => {
1668                        // Counted in `stats_sidecars` but
1669                        // unclassifiable. Warn so the count and the
1670                        // failed list cannot silently disagree (a
1671                        // corrupt `is_fail` sidecar would otherwise be
1672                        // swallowed).
1673                        tracing::warn!(
1674                            path = %path.display(),
1675                            "ktstr footer: unreadable/unparseable stats sidecar — \
1676                             counted but not classified",
1677                        );
1678                    }
1679                }
1680                acc.stats_sidecars.push(path);
1681            }
1682        }
1683    }
1684    if by_test.is_empty() {
1685        return None;
1686    }
1687    let mut failed = Vec::new();
1688    for (test_name, mut acc) in by_test {
1689        // A test FAILED this run if ANY of its variant sidecars records
1690        // `is_fail` (the FINAL post-inversion verdict), OR it left a
1691        // failure dump WITHOUT any parsed sidecar. A dump with no sidecar
1692        // is a pre-sidecar failure (scheduler load / VM boot) that never
1693        // reached `write_sidecar` — it must still flag. But a dump
1694        // alongside a parsed, non-failing sidecar is an expected-failure
1695        // run (expect_err / expect_auto_repro) whose induced-crash dump
1696        // must NOT flag: the sidecar's finalized verdict is authoritative.
1697        // The `is_fail` aggregate covers in-VM assertion failures,
1698        // including one failing gauntlet variant among passing siblings.
1699        // Per-variant dump gate: a failure dump whose variant has NO
1700        // parsed sidecar is a pre-sidecar failure for that preset and
1701        // flags — even when a SIBLING preset's sidecar parsed. (If every
1702        // dump-variant has a parsed sidecar, dump_hashes ⊆ parsed and the
1703        // sidecars' finalized verdicts are authoritative.) Closes the
1704        // mixed-gauntlet masking the old test-name-granularity gate had.
1705        let dump_only_failure = !acc.dump_hashes.is_subset(&acc.parsed_sidecar_hashes);
1706        if !acc.any_fail && !dump_only_failure {
1707            continue;
1708        }
1709        let (scheduler, topology) = match acc.fail_variant {
1710            Some((sch, topo)) => (Some(sch), Some(topo)),
1711            None => (None, None),
1712        };
1713        // Sort so the rendered footer is deterministic regardless of
1714        // `read_dir` order.
1715        acc.stats_sidecars.sort();
1716        failed.push(FailedTest {
1717            test_name,
1718            scheduler,
1719            topology,
1720            failure_dump: acc.failure_dump,
1721            repro_failure_dump: acc.repro_failure_dump,
1722            stats_sidecars: acc.stats_sidecars,
1723            wprof: acc.wprof,
1724            repro_wprof: acc.repro_wprof,
1725            replayable: acc.any_fail,
1726        });
1727    }
1728    Some(RunDirSummary {
1729        dir: dir.to_path_buf(),
1730        failed,
1731        stats_sidecars,
1732        wprof_traces,
1733    })
1734}
1735
1736/// Summarize the artifacts every run directory directly under
1737/// `runs_root` holds, keeping only files written at or after
1738/// `since`. Each [`RunDirSummary`] names its failed tests and the
1739/// concrete artifact path for each, so the `cargo ktstr test`
1740/// footer can point an operator at the exact file for the exact
1741/// test that failed rather than a directory + glob legend.
1742///
1743/// `since` is the wall-clock instant captured before the nextest
1744/// build+run; the mtime gate it drives is what excludes stale
1745/// artifacts left in a reused run directory (see
1746/// [`summarize_one_run_dir`]). Directories are returned sorted by
1747/// path so multi-kernel gauntlet output renders deterministically.
1748pub(crate) fn summarize_run_artifacts(
1749    runs_root: &std::path::Path,
1750    since: std::time::SystemTime,
1751) -> Vec<RunDirSummary> {
1752    let Ok(entries) = std::fs::read_dir(runs_root) else {
1753        return Vec::new();
1754    };
1755    let mut out: Vec<RunDirSummary> = entries
1756        .flatten()
1757        .filter(is_run_directory)
1758        .filter_map(|e| summarize_one_run_dir(&e.path(), since))
1759        .collect();
1760    out.sort_by(|a, b| a.dir.cmp(&b.dir));
1761    out
1762}
1763
1764/// Render the `cargo ktstr test` post-run footer: for each run
1765/// directory written at or after `since`, name every FAILED test
1766/// and the concrete path to each of its artifacts (failure dump,
1767/// auto-repro dump, stats sidecar, wprof trace), plus a per-dir
1768/// count of stats sidecars and wprof traces.
1769///
1770/// Returns the empty string when no run directory under `runs_root`
1771/// holds fresh artifacts — a host-only run (no VM tests) writes no
1772/// sidecars, so there is nothing to point at and the caller emits
1773/// no footer.
1774///
1775/// A test is listed FAILED when it left a failure dump (real or
1776/// placeholder) or an `is_fail` stats sidecar. This is NOT an
1777/// exhaustive failure list: a failure that writes neither — a
1778/// `builder.build()` / `vm.run()` error, a pre-build host error
1779/// (kvm probe, kernel/scheduler resolve, validation), a host panic,
1780/// or an unparseable guest result — leaves no on-disk artifact and
1781/// no entry here. The caller (`cargo_ktstr::run_cargo`) treats
1782/// nextest's own exit status as the authoritative pass/fail signal
1783/// and notes, when nextest reports failures, that any failure
1784/// without an entry left no artifact.
1785///
1786/// This replaces a directory + `*.glob` legend that carried no
1787/// test attribution: a reused run directory mixes artifacts from
1788/// many tests (and, before the mtime gate, prior runs), so a glob
1789/// legend pointed an operator at the directory and left them to
1790/// guess which `*.failure-dump.json` belonged to the test that
1791/// just failed.
1792pub fn format_run_artifact_footer(
1793    runs_root: &std::path::Path,
1794    since: std::time::SystemTime,
1795) -> String {
1796    let summaries = summarize_run_artifacts(runs_root, since);
1797    if summaries.is_empty() {
1798        return String::new();
1799    }
1800    let mut out = String::new();
1801    out.push_str("\ncargo ktstr: test outputs\n");
1802    for s in &summaries {
1803        out.push_str(&format!("  {}\n", s.dir.display()));
1804        for f in &s.failed {
1805            // scheduler/topology are set together (both from one
1806            // failing variant) or both absent (dump-only) — see
1807            // `summarize_one_run_dir`; no mixed arm is reachable.
1808            let variant = match (&f.scheduler, &f.topology) {
1809                (Some(sch), Some(topo)) => format!("  [{sch} {topo}]"),
1810                _ => String::new(),
1811            };
1812            out.push_str(&format!("    FAILED  {}{variant}\n", f.test_name));
1813            if let Some(p) = &f.failure_dump {
1814                out.push_str(&format!("      {:<13} {}\n", "failure dump", p.display()));
1815            }
1816            if let Some(p) = &f.repro_failure_dump {
1817                out.push_str(&format!("      {:<13} {}\n", "repro dump", p.display()));
1818            }
1819            for p in &f.stats_sidecars {
1820                out.push_str(&format!("      {:<13} {}\n", "stats", p.display()));
1821            }
1822            if let Some(p) = &f.wprof {
1823                out.push_str(&format!("      {:<13} {}\n", "wprof", p.display()));
1824            }
1825            if let Some(p) = &f.repro_wprof {
1826                out.push_str(&format!("      {:<13} {}\n", "repro wprof", p.display()));
1827            }
1828            if f.replayable {
1829                out.push_str(&format!(
1830                    "      {:<13} cargo ktstr replay --filter {} --exec\n",
1831                    "replay", f.test_name,
1832                ));
1833            }
1834        }
1835        out.push_str(&format!(
1836            "    ({} stats sidecar(s), {} wprof trace(s) written this run)\n",
1837            s.stats_sidecars, s.wprof_traces,
1838        ));
1839    }
1840    out
1841}
1842
1843/// Detect the kernel version associated with the current test run.
1844///
1845/// Routes through [`crate::ktstr_kernel_env`] for the raw env value
1846/// and [`crate::kernel_path::KernelId`] for variant dispatch so the
1847/// three [`crate::kernel_path::KernelId`] variants are honoured symmetrically:
1848///
1849/// - `KernelId::Path(dir)`: read `metadata.json` (cache entry
1850///   layout) or `include/config/kernel.release` (source tree
1851///   layout). Unchanged from the previous behaviour.
1852/// - `KernelId::Version(ver)`: the user asked for a specific
1853///   version — return it directly. No cache access needed; a
1854///   version string IS a version string.
1855/// - `KernelId::CacheKey(key)`: look up the cache entry and
1856///   return `entry.metadata.version`. The previous code path
1857///   silently treated the key as a directory name and read
1858///   `<cwd>/<key>/metadata.json`, which never matched — producing
1859///   `None` + `sidecar_dir()` using the `"unknown"` fallback even
1860///   though the cache metadata already carried the version.
1861///
1862/// Returns `None` when the env var is unset, or when the env
1863/// resolves to a variant whose underlying source doesn't yield a
1864/// version string (e.g. a Path whose metadata.json / kernel.release
1865/// are both absent, or a CacheKey with no cache hit).
1866pub(crate) fn detect_kernel_version() -> Option<String> {
1867    use crate::kernel_path::KernelId;
1868    let raw = crate::ktstr_kernel_env()?;
1869    match KernelId::parse(&raw) {
1870        KernelId::Path(_) => {
1871            let p = std::path::Path::new(&raw);
1872            let meta_path = p.join("metadata.json");
1873            if let Ok(data) = std::fs::read_to_string(&meta_path)
1874                && let Ok(meta) = serde_json::from_str::<crate::cache::KernelMetadata>(&data)
1875            {
1876                return meta.version;
1877            }
1878            let ver_path = p.join("include/config/kernel.release");
1879            if let Ok(v) = std::fs::read_to_string(ver_path) {
1880                let v = v.trim();
1881                if !v.is_empty() {
1882                    return Some(v.to_string());
1883                }
1884            }
1885            None
1886        }
1887        KernelId::Version(ver) => Some(ver),
1888        KernelId::CacheKey(key) => {
1889            let cache = crate::cache::CacheDir::new().ok()?;
1890            let entry = cache.lookup(&key)?;
1891            entry.metadata.version
1892        }
1893        // Multi-kernel specs in KTSTR_KERNEL never reach this
1894        // function in production — `find_kernel`'s env reader bails
1895        // before sidecar writing happens. This arm is defensive: if
1896        // the env value is somehow a range or git spec, return
1897        // `None` rather than guessing one endpoint, and the sidecar
1898        // record will leave `kernel_version` as null.
1899        KernelId::Range { .. } | KernelId::Git { .. } => None,
1900    }
1901}
1902
1903/// Detect the ktstr project's git HEAD at sidecar-write time.
1904///
1905/// Walks up from the test process's current working directory via
1906/// `gix::discover` to find an enclosing repository, then reads HEAD
1907/// short-hex (7 chars via `oid::to_hex_with_len(7)`) and appends
1908/// `-dirty` when index-vs-HEAD or worktree-vs-index changes are
1909/// observed. Submodules are ignored
1910/// (`Submodule::Given { ignore: All }`).
1911///
1912/// Dirt-detection runs through the shared [`repo_is_dirty`]
1913/// helper (peel HEAD to its tree, diff tree-vs-index, then
1914/// `status()` for worktree-vs-index, submodules skipped); see its
1915/// doc for cascade details. The cascade is similar in spirit to
1916/// [`crate::fetch::local_source`]'s dirt probe but deliberately
1917/// diverges in missing-index handling: the sidecar path silently
1918/// degrades a missing index leg to "treat as clean" so metadata
1919/// probes never gate sidecar writes, whereas `local_source`'s
1920/// cache-key path treats every leg as load-bearing. The HASH
1921/// REPRESENTATION also DIFFERS: `fetch::local_source` DROPS the
1922/// short hash entirely on dirty (returns `None`) because the
1923/// commit no longer describes the build input the cache key
1924/// embeds — publishing a stale hash there would misidentify the
1925/// build. This helper KEEPS the hash with a `-dirty` suffix
1926/// instead because the sidecar's `project_commit` is a debugging
1927/// breadcrumb (operator-readable identity, not a cache-key input);
1928/// the hash plus dirty flag carries strictly more information
1929/// than `None` for the operator's "which ktstr commit did this
1930/// sidecar come from?" question.
1931///
1932/// Returns `None` when:
1933/// - `current_dir()` cannot be resolved (process has no valid
1934///   cwd — extremely rare; happens only for processes whose cwd
1935///   was rmdir'd while alive);
1936/// - cwd is not inside any git repository (`gix::discover` fails);
1937/// - HEAD cannot be read (an unborn HEAD on a fresh `git init`
1938///   with zero commits, or a corrupt repository).
1939///
1940/// Returns `Some(short_hash)` (without the `-dirty` suffix) when
1941/// the HEAD read succeeds but a downstream dirt-detection call
1942/// fails — including a missing index, an unreadable working tree,
1943/// or `head_tree()` failure. Each failed leg degrades to "treat
1944/// as clean" rather than aborting the probe, because metadata
1945/// must not gate sidecar writes.
1946///
1947/// `None` is the documented fallback — sidecar writing must not
1948/// abort because of a metadata probe failure. Stats tooling that
1949/// reads `project_commit` already tolerates `None` rows by
1950/// treating them as wildcards (no `--project-commit` filter narrowing
1951/// applies).
1952///
1953/// `gix::discover` is preferred over `gix::open` because tests can
1954/// be launched from a subdirectory of the repo (e.g.
1955/// `cd src && cargo test`); `discover` walks parents until it
1956/// finds the `.git` marker, while `open` requires the exact root
1957/// path. The walk is cheap — a few stat() calls bounded by the
1958/// depth of the cwd inside the repo.
1959///
1960/// `env!("CARGO_MANIFEST_DIR")` is deliberately NOT used here:
1961/// `env!` resolves at compile time and bakes the build-host's
1962/// absolute manifest path into the binary's read-only data
1963/// segment, leaking the build environment into every published
1964/// artifact. Resolving cwd at runtime instead means the recorded
1965/// commit reflects the project tree the test was launched FROM —
1966/// for a scheduler crate using ktstr as a dev-dependency, this is
1967/// the scheduler crate's commit, not ktstr's. That is the more
1968/// accurate semantic anyway: "what code produced this sidecar"
1969/// depends on the cwd at test launch (which crate is exercising
1970/// ktstr), not the build host.
1971pub(crate) fn detect_project_commit() -> Option<String> {
1972    // Explicit override: an orchestrator (perf-delta) that checked the
1973    // project tree out WITHOUT a `.git` — a plain gix checkout of a baseline
1974    // commit into a temp dir — passes the commit label via
1975    // KTSTR_PROJECT_COMMIT_ENV so the sidecar records it verbatim instead of
1976    // a `gix::discover` that would resolve to the wrong repo (or none). It is
1977    // also set on the HEAD run so the recorded `project_commit` equals the
1978    // exact label perf-delta filters the pool on, closing the -dirty-suffix
1979    // mismatch between the filter (`short_hash`) and this recorder. Empty is
1980    // treated as unset. Mirrors the KTSTR_KERNEL_COMMIT_ENV override.
1981    if let Ok(explicit) = std::env::var(crate::KTSTR_PROJECT_COMMIT_ENV)
1982        && !explicit.is_empty()
1983    {
1984        return Some(explicit);
1985    }
1986    // Per-process memoization of the SUCCESS case only.
1987    //
1988    // The cwd is stable for the lifetime of a test process (no
1989    // caller mutates it), and the project tree's HEAD plus dirty
1990    // state cannot change underneath us without an explicit user
1991    // action that's outside the scope of any individual sidecar
1992    // write. Gauntlet runs invoke this function once per sidecar —
1993    // thousands of times per process — so caching the resolved
1994    // hash collapses every post-first successful call to a
1995    // `Clone`. The probe itself does ~3 syscalls (gix discover +
1996    // head_id + status) which dominate the sidecar-write critical
1997    // path; eliminating that cost on the hot path is the only
1998    // meaningful perf win available here.
1999    //
2000    // FAILURE IS NOT CACHED: a `None` probe outcome (no git repo
2001    // discoverable from cwd, unborn HEAD, transient FS / gix open
2002    // failure) does NOT seed the OnceLock. A FIRST call from a
2003    // momentarily-broken context (e.g. a test that swapped CWD via
2004    // some indirect path before ever calling
2005    // `detect_project_commit`, or a transient I/O hiccup during
2006    // `gix::discover`) would otherwise lock in `None` for the
2007    // rest of the process — every subsequent sidecar would land
2008    // under `target/ktstr/{kernel}-unknown/` even though the
2009    // commit IS resolvable from a healthy cwd. Retrying on failure
2010    // costs the same ~3 syscalls the success case pays once; the
2011    // re-probe only fires while the answer is still unknown.
2012    //
2013    // CACHE DOES NOT INVALIDATE on success: a user who commits /
2014    // amends / resets the project tree mid-run and expects the
2015    // new HEAD to surface in subsequent sidecars will see stale
2016    // values. This is acceptable — the
2017    // project tree is treated as stable-enough for a single suite
2018    // run; callers mutating the tree during a run own the
2019    // consequences.
2020    static PROJECT_COMMIT: std::sync::OnceLock<String> = std::sync::OnceLock::new();
2021    if let Some(cached) = PROJECT_COMMIT.get() {
2022        return Some(cached.clone());
2023    }
2024    let cwd = std::env::current_dir().ok()?;
2025    let probed = detect_commit_at(&cwd)?;
2026    // `set` on a hot OnceLock is a no-op `Err` — safe to ignore.
2027    // First successful caller wins; a second concurrent caller's
2028    // identical hash discards harmlessly.
2029    let _ = PROJECT_COMMIT.set(probed.clone());
2030    Some(probed)
2031}
2032
2033/// Path-taking core of [`detect_project_commit`]. Factored out so
2034/// unit tests can drive the full branch matrix (clean repo, dirty
2035/// repo, non-git directory, unborn HEAD, concurrent calls) against
2036/// `gix::init`-built fixtures in tempdirs without mutating the
2037/// process-wide `current_dir`. The public entry point reads `cwd`
2038/// once and delegates here.
2039///
2040/// `gix::discover` walks parents until it finds a `.git` marker —
2041/// tests can be launched from a subdirectory of the repo (e.g.
2042/// `cd src && cargo test`); the parent walk handles that, where
2043/// `gix::open` would require the exact root. The
2044/// open-vs-discover distinction is the ONLY difference between
2045/// this function and [`detect_kernel_commit`]; the post-open
2046/// "read HEAD, format short hex, append `-dirty` on dirt" body
2047/// lives in the shared [`commit_with_dirty_suffix`] helper.
2048fn detect_commit_at(path: &std::path::Path) -> Option<String> {
2049    let repo = gix::discover(path).ok()?;
2050    commit_with_dirty_suffix(&repo)
2051}
2052
2053/// Shared post-open body for [`detect_commit_at`] and
2054/// [`detect_kernel_commit`]: read `repo.head_id()`, format the
2055/// 7-char short hex, and append `-dirty` when [`repo_is_dirty`]
2056/// returns `Some(true)`.
2057///
2058/// Returns `None` when `head_id()` fails (unborn HEAD on a fresh
2059/// `gix::init` with zero commits, or a corrupt repository) — the
2060/// short-hex cannot be formed.
2061///
2062/// Returns `Some(short_hash)` (without `-dirty`) when the HEAD
2063/// read succeeds but the [`repo_is_dirty`] probe returns `None`
2064/// (HEAD-tree peel failure). This matches the documented "treat
2065/// as clean on probe failure" degradation: metadata probes must
2066/// not gate sidecar writes, so a probe failure flows through as
2067/// "clean" rather than aborting.
2068///
2069/// `to_hex_with_len(7)` produces a `HexDisplay` that formats 7
2070/// hex chars without the 40-char intermediate `format!("{}")`
2071/// allocation. `Id` derefs to `oid` (gix-hash) which owns the
2072/// method.
2073///
2074/// CALL SITES diverge ONLY on the open mode (`gix::discover` for
2075/// the project commit, `gix::open` for the kernel commit). The
2076/// helper takes a `&Repository` so each caller picks the open
2077/// strategy that matches its semantics: project commit walks
2078/// parents (cwd may be inside a subdir of the repo); kernel
2079/// commit demands the explicit root (the kernel directory is
2080/// not walked-up to avoid resolving the parent ktstr repo).
2081fn commit_with_dirty_suffix(repo: &gix::Repository) -> Option<String> {
2082    let head = repo.head_id().ok()?;
2083    let short_hash = head.to_hex_with_len(7).to_string();
2084    if repo_is_dirty(repo).unwrap_or(false) {
2085        Some(format!("{short_hash}-dirty"))
2086    } else {
2087        Some(short_hash)
2088    }
2089}
2090
2091/// Probe whether a gix repository's working tree differs from its
2092/// HEAD commit, ignoring submodules.
2093///
2094/// Returns `Some(true)` when the index differs from the HEAD tree
2095/// or the worktree differs from the index for any tracked file;
2096/// `Some(false)` when neither leg observed a difference; `None`
2097/// when the HEAD-tree peel itself failed (HEAD points at something
2098/// that cannot be read as a tree).
2099///
2100/// Callers in [`detect_commit_at`] / [`detect_kernel_commit`]
2101/// degrade `None` to "treat as clean" via `unwrap_or(false)` so
2102/// metadata probes never gate sidecar writes.
2103///
2104/// PROBE LEGS:
2105/// - tree-vs-index: peel HEAD to its tree, then `tree_index_status`
2106///   diff against the on-disk index. `repo.index()` returning Err
2107///   (missing index — partially-checked-out clones, or fresh
2108///   `git init` before the first commit) silently leaves the
2109///   index-dirty leg false. `index_or_empty()` is deliberately
2110///   NOT used because it would substitute an empty index and the
2111///   diff would flag every tracked file as "deleted from index",
2112///   tripping false-dirty.
2113/// - index-vs-worktree: `repo.status()` configured with
2114///   `Submodule::Given { ignore: All }` so submodule worktree
2115///   state is skipped. Short-circuited when the tree-vs-index leg
2116///   already flipped dirty: the result only needs one positive
2117///   signal, so a known-dirty index makes the worktree walk
2118///   redundant. Matches the equivalent short-circuit in
2119///   [`crate::fetch::local_source`].
2120///
2121/// FAILURE DEGRADATION: any individual leg failure (missing index,
2122/// `repo.status()` failure, `into_index_worktree_iter()` failure)
2123/// silently degrades that leg to "no signal" rather than aborting.
2124/// The function only returns `None` when the HEAD-tree peel
2125/// fails, because at that point neither leg can run at all.
2126///
2127/// `pub` (not `pub(crate)`) because `cargo-ktstr.rs` is a
2128/// separate `[[bin]]` crate that consumes `ktstr` as an
2129/// external dependency and needs this helper to compute the
2130/// `-dirty` suffix in
2131/// the baseline/HEAD commit in `cargo ktstr perf-delta`. Hidden
2132/// from rustdoc via `#[doc(hidden)]` because it is a probe-
2133/// style helper without a stable API contract — external
2134/// consumers should not depend on it.
2135#[doc(hidden)]
2136pub fn repo_is_dirty(repo: &gix::Repository) -> Option<bool> {
2137    let head_tree_id = repo.head_tree().ok()?.id;
2138
2139    let mut index_dirty = false;
2140    if let Ok(index) = repo.index() {
2141        let _ = repo.tree_index_status(
2142            &head_tree_id,
2143            &index,
2144            None,
2145            gix::status::tree_index::TrackRenames::Disabled,
2146            |_, _, _| {
2147                index_dirty = true;
2148                Ok::<_, std::convert::Infallible>(std::ops::ControlFlow::Break(()))
2149            },
2150        );
2151    }
2152
2153    let worktree_dirty = if index_dirty {
2154        false
2155    } else {
2156        repo.status(gix::progress::Discard)
2157            .ok()
2158            .and_then(|s| {
2159                s.index_worktree_rewrites(None)
2160                    .index_worktree_submodules(gix::status::Submodule::Given {
2161                        ignore: gix::submodule::config::Ignore::All,
2162                        check_dirty: false,
2163                    })
2164                    .index_worktree_options_mut(|opts| {
2165                        opts.dirwalk_options = None;
2166                    })
2167                    .into_index_worktree_iter(Vec::new())
2168                    .ok()
2169                    .map(|mut iter| iter.next().is_some())
2170            })
2171            .unwrap_or(false)
2172    };
2173
2174    Some(index_dirty || worktree_dirty)
2175}
2176
2177/// Detect the kernel SOURCE TREE's git HEAD at sidecar-write time.
2178///
2179/// `kernel_dir` is the explicit kernel source directory — typically
2180/// resolved from `KTSTR_KERNEL` for `KernelId::Path`, or from the
2181/// cache entry's `KernelSource::Local::source_tree_path` when
2182/// `KTSTR_KERNEL` is a Version / CacheKey whose underlying build
2183/// recorded a local tree. Uses `gix::open(kernel_dir)` (NOT
2184/// `gix::discover`) because the kernel directory is explicit, not
2185/// walked-up: the parent walk that `discover` performs would
2186/// resolve to whichever ancestor `.git` it found first, which
2187/// might be the ktstr project's repo when `kernel_dir` is a
2188/// non-git subdirectory inside it. `open` requires `kernel_dir`
2189/// itself to be the repo root, which is the documented invariant
2190/// for kernel checkouts.
2191///
2192/// Reads HEAD short-hex (7 chars via `oid::to_hex_with_len(7)`)
2193/// and appends `-dirty` when index-vs-HEAD or worktree-vs-index
2194/// changes are observed. Dirt-detection runs through the shared
2195/// [`repo_is_dirty`] helper (submodules skipped via
2196/// `Submodule::Given { ignore: All }`); see its doc for cascade
2197/// details. The cascade matches [`detect_project_commit`] and is
2198/// similar in spirit to [`crate::fetch::local_source`] but
2199/// deliberately diverges in missing-index handling: the sidecar
2200/// path silently degrades a missing index leg to "treat as
2201/// clean" so metadata probes never gate sidecar writes, whereas
2202/// `local_source`'s cache-key path treats every leg as
2203/// load-bearing. Same "treat as clean on probe failure"
2204/// degradation rules apply otherwise: a missing index, an
2205/// unreadable worktree, or `head_tree()` failure each fall
2206/// through as "clean" rather than aborting the probe — metadata
2207/// must not gate sidecar writes.
2208///
2209/// HASH REPRESENTATION matches [`detect_project_commit`]: keeps
2210/// the hash with `-dirty` appended (operator-readable identity).
2211/// Distinct from [`crate::fetch::local_source`], which DROPS the
2212/// hash on dirty because the commit no longer describes the
2213/// build INPUT for cache-key purposes.
2214///
2215/// Returns `None` when:
2216/// - `kernel_dir` is not a git repository (`gix::open` fails);
2217/// - HEAD cannot be read (unborn HEAD on a fresh `git init` with
2218///   zero commits, or a corrupt repository).
2219///
2220/// Returns `Some(short_hash)` (without the `-dirty` suffix) when
2221/// the HEAD read succeeds but a downstream dirt-detection call
2222/// fails — including a missing index, an unreadable working
2223/// tree, or `head_tree()` failure. Each failed leg degrades to
2224/// "treat as clean" rather than aborting the probe, because
2225/// metadata must not gate sidecar writes.
2226///
2227/// `pub` (not `pub(crate)`) + `#[doc(hidden)]` for the same reason as
2228/// `repo_is_dirty`: `cargo-ktstr` is a separate `[[bin]]` crate that
2229/// consumes `ktstr` as a dependency and calls this in `run_cargo` once
2230/// per resolved kernel to pre-compute the `dir=commit` map it exports
2231/// via [`crate::KTSTR_KERNEL_COMMIT_ENV`], letting each per-test process
2232/// skip its own gix dirty-walk. Hidden from rustdoc — a probe helper
2233/// with no stable API contract. The env fast-path that CONSUMES that
2234/// map lives at the sidecar call site (`kernel_commit_for_sidecar`), not
2235/// here, so this stays a pure directory→commit walk safe for the
2236/// orchestrator to call while building the map.
2237#[doc(hidden)]
2238pub fn detect_kernel_commit(kernel_dir: &std::path::Path) -> Option<String> {
2239    // Per-process, path-keyed memoization of the SUCCESS case
2240    // only. Same rationale as `detect_project_commit`: gauntlet
2241    // runs invoke this function once per sidecar — thousands of
2242    // times — and the kernel tree's HEAD plus dirty state cannot
2243    // change underneath us mid-suite without an explicit user
2244    // action outside any sidecar's control. The path key handles
2245    // the fixture-test case where unit tests rotate through
2246    // synthetic `tempfile::TempDir` kernel paths in the same
2247    // process; each distinct path memoizes independently.
2248    //
2249    // `Mutex<HashMap>` rather than `OnceLock` because the input
2250    // is parameterized on `kernel_dir` — a `OnceLock` collapses
2251    // every input to one cached result, which would conflate
2252    // different kernel directories into a single value.
2253    // Contention is bounded: post-warm reads are O(1) hash
2254    // lookups against a near-empty map (in production typically
2255    // ONE kernel per process), and the mutex is held only for
2256    // the duration of the lookup + insert.
2257    //
2258    // FAILURE IS NOT CACHED: a `None` probe outcome (kernel_dir
2259    // is not a git repo, unborn HEAD, transient `gix::open`
2260    // failure) does NOT seed the cache. Caching `None` would lock
2261    // in `unknown` for every subsequent sidecar even after the
2262    // condition resolves (e.g. a kernel directory that becomes a
2263    // valid checkout mid-suite, or a flaky FS that recovers).
2264    // Re-probing on failure costs the same gix-open + dirt-walk
2265    // the success case pays once; the re-probe only fires while
2266    // the answer is still unknown for that path.
2267    //
2268    // Mutex poisoning recovery: a panic mid-probe could poison
2269    // the lock; acquiring via
2270    // [`crate::sync::MutexExt::lock_unpoisoned`] returns the
2271    // guard regardless of poison state so a future caller doesn't
2272    // fail catastrophically. The cached map is just a HashMap of
2273    // owned strings; no invariant beyond "key→value mapping" can
2274    // be broken by an interrupted probe.
2275    use std::collections::HashMap;
2276    use std::path::PathBuf;
2277    use std::sync::{Mutex, OnceLock};
2278    static KERNEL_COMMIT_CACHE: OnceLock<Mutex<HashMap<PathBuf, String>>> = OnceLock::new();
2279    // Canonicalize the cache key so two paths that resolve to the
2280    // same on-disk directory share one entry. Without this, a
2281    // symlinked alias (`./linux` symlinked to `/abs/.../linux`)
2282    // and the resolved target would each populate their own slot,
2283    // re-running the gix-open + dirt-walk on every alias and
2284    // defeating the memoization. `canonicalize` resolves symlinks,
2285    // collapses `..` / `.`, and yields the absolute path the
2286    // kernel actually lives at. Falls back to the raw path on
2287    // canonicalize failure (e.g. caller passed a non-existent
2288    // `kernel_dir`) — gix::open will fail downstream and re-probe
2289    // each call until the path becomes resolvable.
2290    let cache_key = kernel_dir
2291        .canonicalize()
2292        .unwrap_or_else(|_| kernel_dir.to_path_buf());
2293    let cache = KERNEL_COMMIT_CACHE.get_or_init(|| Mutex::new(HashMap::new()));
2294    {
2295        let guard = cache.lock_unpoisoned();
2296        if let Some(cached) = guard.get(&cache_key) {
2297            return Some(cached.clone());
2298        }
2299    }
2300    // `gix::open` (NOT `gix::discover`) — `kernel_dir` must BE the
2301    // repo root. Without this the parent walk could resolve to the
2302    // ktstr project's own `.git` when `kernel_dir` is a non-git
2303    // subdirectory inside the ktstr checkout. The
2304    // open-vs-discover distinction is the ONLY difference between
2305    // this function and [`detect_commit_at`]; the post-open
2306    // "read HEAD, format short hex, append `-dirty` on dirt" body
2307    // lives in the shared [`commit_with_dirty_suffix`] helper.
2308    //
2309    // Open against `kernel_dir` (the caller-supplied path) rather
2310    // than `cache_key`. The two paths point at the same on-disk
2311    // repo by construction (canonicalize resolves to the same
2312    // place), so gix opens the same repository either way; passing
2313    // the original keeps any user-facing diagnostics (gix's
2314    // internal error chain) consistent with the input shape.
2315    let result = gix::open(kernel_dir)
2316        .ok()
2317        .and_then(|repo| commit_with_dirty_suffix(&repo));
2318    if let Some(ref hash) = result {
2319        let mut guard = cache.lock_unpoisoned();
2320        // First successful caller wins; a concurrent caller's
2321        // identical hash would overwrite harmlessly because
2322        // success is deterministic for a given (canonicalized
2323        // path, HEAD, dirty state) tuple.
2324        guard.insert(cache_key, hash.clone());
2325    }
2326    result
2327}
2328
/// Name of the environment variable CI runners set to mark the
/// sidecars they produce as `"ci"`-source. Any non-empty value
/// flips the tag; an empty string is treated as unset so a
/// defensively-cleared variable does not accidentally classify a
/// developer run as CI.
///
/// Read at sidecar-write time by [`detect_run_source`]. The name
/// follows the `KTSTR_KERNEL` / `KTSTR_CACHE_DIR` convention so the
/// full set of ktstr-controlled env vars is `KTSTR_*`-prefixed.
pub const KTSTR_CI_ENV: &str = "KTSTR_CI";

/// Tag value written to [`SidecarResult::run_source`] for sidecars
/// produced while [`KTSTR_CI_ENV`] is set to a non-empty value.
pub const SIDECAR_RUN_SOURCE_CI: &str = "ci";

/// Tag value written to [`SidecarResult::run_source`] for sidecars
/// produced without [`KTSTR_CI_ENV`] (unset or empty) — the
/// developer-machine default.
pub const SIDECAR_RUN_SOURCE_LOCAL: &str = "local";

/// Tag value applied to [`SidecarResult::run_source`] /
/// [`GauntletRow::run_source`](crate::stats::GauntletRow::run_source)
/// at LOAD time when the consumer pulls sidecars from a non-default
/// pool root via `cargo ktstr stats show-host --dir` /
/// `cargo ktstr stats list-values --dir`. It is NEVER written by
/// [`write_sidecar`] — the writer cannot know the file will later
/// be moved off-host. See [`apply_archive_source_override`], which
/// performs the in-memory rewrite.
pub const SIDECAR_RUN_SOURCE_ARCHIVE: &str = "archive";
2356
2357/// Read [`KTSTR_CI_ENV`] and classify the run as `"ci"` (when the
2358/// env var is set non-empty) or `"local"` (the default for any
2359/// developer-driven invocation). Empty-string env values count as
2360/// unset — see [`KTSTR_CI_ENV`] for rationale.
2361///
2362/// Returns `Some(_)` unconditionally because every sidecar producer
2363/// is, by construction, either local or CI; an `Option` return
2364/// keeps the field shape symmetric with the other nullable
2365/// `SidecarResult` fields and reserves room for a future "unknown"
2366/// arm without a serde-version bump.
2367pub(crate) fn detect_run_source() -> Option<String> {
2368    match std::env::var(KTSTR_CI_ENV) {
2369        Ok(v) if !v.is_empty() => Some(SIDECAR_RUN_SOURCE_CI.to_string()),
2370        _ => Some(SIDECAR_RUN_SOURCE_LOCAL.to_string()),
2371    }
2372}
2373
2374/// Override every sidecar's `run_source` field to
2375/// [`SIDECAR_RUN_SOURCE_ARCHIVE`] when the consumer pulled the pool
2376/// from a non-default root via `--dir`. Called at the boundary
2377/// between [`collect_pool`] and the downstream stats pipeline so
2378/// on-disk values stay untouched while the in-memory pool reflects
2379/// the operator's intent: "these sidecars were copied off another
2380/// host; treat them as archives, not as the local-machine record."
2381///
2382/// Mutation strategy is in-place rewrite of the entire `run_source`
2383/// field — the `"local"` / `"ci"` distinction is meaningful on the
2384/// PRODUCING host but irrelevant once the sidecars have been
2385/// moved off, where the only useful classification is "archived
2386/// elsewhere." Operators who need to retain the producer-side
2387/// distinction inside an archive bucket can keep `--dir`
2388/// untargeted (read from the default root) and let the on-disk
2389/// values pass through.
2390pub(crate) fn apply_archive_source_override(pool: &mut [SidecarResult]) {
2391    for sc in pool {
2392        sc.run_source = Some(SIDECAR_RUN_SOURCE_ARCHIVE.to_string());
2393    }
2394}
2395
2396/// Resolve the kernel source-tree path for [`detect_kernel_commit`]
2397/// from the [`crate::KTSTR_KERNEL_ENV`] env var.
2398///
2399/// Routes through [`crate::ktstr_kernel_env`] for the raw env
2400/// value and [`crate::kernel_path::KernelId`] for variant
2401/// dispatch:
2402///
2403/// - `KernelId::Path(p)`: probes the path's `metadata.json` first
2404///   — `cargo-ktstr`'s `--kernel /path/to/linux` resolver routes
2405///   clean source trees through the cache pipeline (see
2406///   [`crate::cli::resolve_kernel_dir_to_entry`]) and exports the
2407///   CACHE ENTRY directory through `KTSTR_KERNEL`, not the
2408///   literal source tree. When `metadata.json` parses and carries
2409///   a `KernelSource::Local::source_tree_path`, that path is the
2410///   underlying source tree and is returned. When parsing fails
2411///   (the path IS the source tree, the dirty-tree path that
2412///   skipped the cache store), falls back to using the raw env
2413///   value verbatim — that path is itself the source tree.
2414/// - `KernelId::Version(ver)`: looks for a Local cache entry
2415///   whose `metadata.version == ver` carrying a
2416///   `source_tree_path`. The tarball-shaped key (`{ver}-tarball-
2417///   {arch}-kc{suffix}`) is checked first because it is the
2418///   most-common form a Version-shaped env points at; on miss
2419///   (or hit yielding `Tarball` / `Git` source, both of which
2420///   are transient with no on-disk tree to probe), the function
2421///   falls back to scanning every valid cache entry for a Local
2422///   match on version. Without this fallback,
2423///   a cache populated by `kernel build --kernel
2424///   /path/to/linux` (a Local entry with source_tree_path) is
2425///   never found by a sidecar writer that has
2426///   `KTSTR_KERNEL=6.14.2`, even though the local tree is
2427///   exactly what the kernel_commit field needs to probe.
2428/// - `KernelId::CacheKey(k)`: uses `k` verbatim — the cache key
2429///   already carries every detail (source-type prefix, arch,
2430///   kconfig hash). On hit, returns
2431///   `KernelSource::Local::source_tree_path` if set, else
2432///   `None` (Tarball / Git entries are transient and have no
2433///   persisted source tree).
2434/// - `KernelId::Range { .. }` / `KernelId::Git { .. }`:
2435///   multi-kernel specs in `KTSTR_KERNEL` never reach this
2436///   helper in production (find_kernel's env reader bails
2437///   before sidecar writing). Defensive: returns `None`.
2438///
2439/// Returns `None` when the env var is unset, when no source
2440/// tree path is recoverable, or when the cache lookup fails.
2441fn resolve_kernel_source_dir() -> Option<std::path::PathBuf> {
2442    source_dir_for(&crate::ktstr_kernel_env()?)
2443}
2444
2445/// Resolve a `KTSTR_KERNEL` identifier string to the on-disk SOURCE
2446/// tree whose git HEAD is the kernel's commit (or `None` for transient
2447/// Range/Git specs or an unrecoverable cache lookup).
2448///
2449/// `pub` + `#[doc(hidden)]` for the same reason as `detect_kernel_commit`:
2450/// the cargo-ktstr `[[bin]]` calls this in `run_cargo` to pre-compute the
2451/// [`crate::KTSTR_KERNEL_COMMIT_ENV`] map. Computing the map value via THIS
2452/// function (then `detect_kernel_commit`) makes the map identical to the
2453/// sidecar's own fallback (`resolve_kernel_source_dir().and_then(
2454/// detect_kernel_commit)`), so a clean Path kernel — whose resolved dir is
2455/// a cache entry, not a git tree — still gets its real source commit into
2456/// the map instead of re-paying a per-test walk.
2457#[doc(hidden)]
2458pub fn source_dir_for(raw: &str) -> Option<std::path::PathBuf> {
2459    use crate::kernel_path::KernelId;
2460    let id = KernelId::parse(raw);
2461    match id {
2462        KernelId::Path(_) => {
2463            let p = std::path::Path::new(raw);
2464            // Cache-entry layout: `metadata.json` carries the
2465            // `KernelSource::Local::source_tree_path` recorded at
2466            // build time. Source-tree layout (dirty path that
2467            // skipped cache store): no metadata, so the env value
2468            // IS the source tree. The shared helper handles both.
2469            crate::cache::recover_local_source_tree(p)
2470                .or_else(|| Some(std::path::PathBuf::from(raw)))
2471        }
2472        KernelId::Version(_) | KernelId::CacheKey(_) => {
2473            let cache = crate::cache::CacheDir::new().ok()?;
2474            resolve_kernel_source_dir_with_cache(&id, &cache)
2475        }
2476        KernelId::Range { .. } | KernelId::Git { .. } => None,
2477    }
2478}
2479
2480/// Pure helper for [`resolve_kernel_source_dir`] that takes the
2481/// parsed `KernelId` and an opened `CacheDir`, returning the source
2482/// tree path if recoverable.
2483///
2484/// Split out from [`resolve_kernel_source_dir`] so tests can pin a
2485/// `CacheDir` at a tempdir root without mutating env vars (which
2486/// would race other tests reading `KTSTR_KERNEL` /
2487/// `KTSTR_CACHE_DIR`).
2488///
2489/// Lookup order for [`crate::kernel_path::KernelId::Version`]:
2490/// 1. Tarball-shaped cache key (`{ver}-tarball-{arch}-kc{suffix}`),
2491///    direct lookup. Returns `Some` only if the entry is a
2492///    `KernelSource::Local` carrying a `source_tree_path`.
2493/// 2. Fallback scan: every valid cache entry whose
2494///    `metadata.version == ver`. First match with
2495///    `KernelSource::Local::source_tree_path` set wins. Handles
2496///    the case where the user built `--kernel /path/to/linux`
2497///    (a Local cache entry without the tarball cache-key prefix)
2498///    but later set `KTSTR_KERNEL=6.14.2` for the test run —
2499///    without this fallback, the local source tree would be
2500///    invisible to the sidecar writer.
2501///
2502/// `KernelSource::Tarball` and `KernelSource::Git` entries are
2503/// skipped at every step because their source trees are transient
2504/// (deleted by the cache pipeline after build), so probing them
2505/// for a `kernel_commit` would always fail.
2506///
2507/// For [`crate::kernel_path::KernelId::CacheKey`], performs a single direct lookup —
2508/// the cache key already encodes every detail (source-type
2509/// prefix, arch, kconfig hash) so no fallback scan is needed.
2510fn resolve_kernel_source_dir_with_cache(
2511    id: &crate::kernel_path::KernelId,
2512    cache: &crate::cache::CacheDir,
2513) -> Option<std::path::PathBuf> {
2514    use crate::kernel_path::KernelId;
2515    match id {
2516        KernelId::Version(ver) => {
2517            let arch = std::env::consts::ARCH;
2518            let tarball_key = format!("{ver}-tarball-{arch}-kc{}", crate::cache_key_suffix());
2519            if let Some(entry) = cache.lookup(&tarball_key)
2520                && let crate::cache::KernelSource::Local {
2521                    source_tree_path: Some(p),
2522                    ..
2523                } = &entry.metadata.source
2524            {
2525                return Some(p.clone());
2526            }
2527            let entries = cache.list().ok()?;
2528            for listed in entries {
2529                let crate::cache::ListedEntry::Valid(entry) = listed else {
2530                    continue;
2531                };
2532                if entry.metadata.version.as_deref() != Some(ver.as_str()) {
2533                    continue;
2534                }
2535                if let crate::cache::KernelSource::Local {
2536                    source_tree_path: Some(p),
2537                    ..
2538                } = &entry.metadata.source
2539                {
2540                    return Some(p.clone());
2541                }
2542            }
2543            None
2544        }
2545        KernelId::CacheKey(k) => {
2546            let entry = cache.lookup(k)?;
2547            match entry.metadata.source {
2548                crate::cache::KernelSource::Local {
2549                    source_tree_path: Some(ref p),
2550                    ..
2551                } => Some(p.clone()),
2552                _ => None,
2553            }
2554        }
2555        // Path / Range / Git callers do not reach this helper —
2556        // resolve_kernel_source_dir handles them inline. Defensive
2557        // None covers any future caller that adds a new arm.
2558        _ => None,
2559    }
2560}
2561
2562/// The kernel commit recorded in a sidecar: the env fast-path first,
2563/// then the in-process gix walk.
2564///
2565/// cargo-ktstr pre-probes every resolved kernel's HEAD once and exports
2566/// a `dir=commit;...` map in [`crate::KTSTR_KERNEL_COMMIT_ENV`], keyed
2567/// by the dir it also exports as `KTSTR_KERNEL`. This process looks
2568/// itself up by its own [`crate::ktstr_kernel_env`] value — string-equal
2569/// to the map key by construction, since cargo-ktstr built both from the
2570/// same resolved dir. A hit skips `detect_kernel_commit`'s gix HEAD +
2571/// dirty-walk, which is memoized per process but NOT across the per-test
2572/// nextest processes (so without the map each of N processes re-pays
2573/// it).
2574///
2575/// Keying on `ktstr_kernel_env()` (the raw `KTSTR_KERNEL`) rather than
2576/// on `resolve_kernel_source_dir()` is deliberate — that is exactly the
2577/// key cargo-ktstr used. The map's commit VALUE matches this function's
2578/// own fallback because cargo-ktstr computes it via the SAME resolution
2579/// (`source_dir_for` then `detect_kernel_commit`); a kernel with no
2580/// recoverable source tree is simply absent from the map, so the miss
2581/// falls through to the identical resolve-and-walk here.
2582///
2583/// Miss / absent env / empty commit → the walk. Optimization only.
2584fn kernel_commit_for_sidecar() -> Option<String> {
2585    if let Some(self_dir) = crate::ktstr_kernel_env()
2586        && let Ok(raw) = std::env::var(crate::KTSTR_KERNEL_COMMIT_ENV)
2587    {
2588        for seg in raw.split(';') {
2589            if let Some((dir, commit)) = seg.rsplit_once('=')
2590                && dir == self_dir
2591                && !commit.is_empty()
2592            {
2593                return Some(commit.to_string());
2594            }
2595        }
2596    }
2597    resolve_kernel_source_dir().and_then(|d| detect_kernel_commit(&d))
2598}
2599
2600/// Compute a stable 64-bit discriminator over the fields that
2601/// distinguish gauntlet variants of the same test. Used to suffix
2602/// the sidecar filename so concurrent variants do not clobber each
2603/// other's output.
2604///
2605/// Uses [`siphasher::sip::SipHasher13`] with zero keys for the same
2606/// cross-toolchain stability reason as the other zero-keyed
2607/// SipHasher13 sites (`build.rs`, `runtime.rs` `content_hash`) —
2608/// the discriminator
2609/// must be the same across Rust toolchain versions or downstream
2610/// tooling that groups variants by filename breaks.
2611///
2612/// # Host-state collision caveat
2613///
2614/// The hash is over test-identity fields (topology, scheduler,
2615/// payload, work_type, sysctls, kargs) — NOT over
2616/// [`crate::host_context::HostContext`], NOT over `scheduler_commit`, NOT over
2617/// `project_commit`, NOT over `kernel_commit`, NOT over
2618/// `run_source`, NOT over `resolve_source`, and NOT over
2619/// `cpu_budget` / `vcpus`. The
2620/// [`crate::host_context::HostContext`] exclusion is pinned by
2621/// `sidecar_variant_hash_excludes_host_context`; the
2622/// `scheduler_commit` exclusion by
2623/// `sidecar_variant_hash_excludes_scheduler_commit`; the
2624/// `project_commit` exclusion by
2625/// `sidecar_variant_hash_excludes_project_commit`; the
2626/// `kernel_commit` exclusion by
2627/// `sidecar_variant_hash_excludes_kernel_commit`; the
2628/// `run_source` exclusion by
2629/// `sidecar_variant_hash_excludes_run_source`; the
2630/// `resolve_source` exclusion by
2631/// `sidecar_variant_hash_excludes_resolve_source`; the
2632/// `cpu_budget` / `vcpus` exclusion by
2633/// `sidecar_variant_hash_excludes_cpu_budget`. All seven are
2634/// deliberate for the same cross-host grouping reason — a
2635/// gauntlet rebuilt against a different userspace scheduler
2636/// commit, a bumped ktstr checkout, a kernel source tree at a
2637/// different HEAD, a different CI runner / developer machine, a
2638/// run that resolved its scheduler via a different discovery
2639/// path, or a run that confined its vCPUs to a different
2640/// host-CPU budget must still bucket with the same-named variant so
2641/// `compare_partitions` can diff two runs of the "same" test
2642/// without the commit hash, run-source tag, or budget shattering
2643/// them into one-row-per-commit islands. `cpu_budget` / `vcpus`
2644/// are instead surfaced as the [`crate::stats::Dimension::CpuBudget`]
2645/// pairing axis, which separates cross-budget runs at compare time
2646/// rather than at the identity bucket. Callers that want to detect
2647/// a commit drift or compare across run environments inspect
2648/// [`SidecarResult::scheduler_commit`] /
2649/// [`SidecarResult::project_commit`] /
2650/// [`SidecarResult::kernel_commit`] /
2651/// [`SidecarResult::run_source`] /
2652/// [`SidecarResult::resolve_source`] directly (via
2653/// `--project-commit` / `--kernel-commit` / `--run-source` /
2654/// `--resolve-source` on `perf-delta`); the filename stays stable
2655/// across commits and run environments by design.
2656///
2657/// The corollary of the HostContext exclusion: if the host's
2658/// observable state mutates mid-suite — NUMA hotplug, hugepage
2659/// reconfiguration, a `sysctl -w` from a parallel process — two
2660/// runs of the same test will produce the same sidecar filename
2661/// and the later write clobbers the earlier. ktstr treats host
2662/// state as stable-enough for a single suite run; callers
2663/// mutating host state during a run own the ordering themselves
2664/// (e.g. by writing to a different `KTSTR_SIDECAR_DIR` per host
/// snapshot).
///
/// The single canonical-JSON + siphash site for the variant hash.
2667///
2668/// [`sidecar_variant_hash`] (from a written [`SidecarResult`]) and
2669/// [`variant_hash_from_parts`] (from a test entry + resolved topology +
2670/// work_type, before any sidecar exists) both route through this so the
2671/// two derivations can never drift. `sysctls`/`kargs` are sorted here
2672/// for order-independence.
2673fn variant_hash_of(
2674    topology: &str,
2675    scheduler: &str,
2676    payload: Option<&str>,
2677    work_type: &str,
2678    sysctls: &[String],
2679    kargs: &[String],
2680) -> u64 {
2681    use siphasher::sip::SipHasher13;
2682    use std::hash::Hasher;
2683    let mut sorted_sysctls = sysctls.to_vec();
2684    sorted_sysctls.sort();
2685    let mut sorted_kargs = kargs.to_vec();
2686    sorted_kargs.sort();
2687    let canonical = serde_json::json!({
2688        "topology": topology,
2689        "scheduler": scheduler,
2690        "payload": payload,
2691        "work_type": work_type,
2692        "sysctls": sorted_sysctls,
2693        "kargs": sorted_kargs,
2694    });
2695    let bytes = serde_json::to_vec(&canonical).expect("json serialization cannot fail for strings");
2696    let mut h = SipHasher13::new_with_keys(0, 0);
2697    h.write(&bytes);
2698    h.finish()
2699}
2700
2701pub(crate) fn sidecar_variant_hash(sidecar: &SidecarResult) -> u64 {
2702    variant_hash_of(
2703        &sidecar.topology,
2704        &sidecar.scheduler,
2705        sidecar.payload.as_deref(),
2706        &sidecar.work_type,
2707        &sidecar.sysctls,
2708        &sidecar.kargs,
2709    )
2710}
2711
2712/// The variant hash for a test entry's run at a given resolved topology
2713/// and `work_type`, computed BEFORE any sidecar exists — the
2714/// failure-dump path (and the Ctx/VmResult `variant_hash` stamp) need
2715/// the identity at VM-build time. Mirrors [`write_sidecar`]'s field
2716/// derivation (topology = the resolved topology, scheduler/sysctls/kargs
2717/// = [`scheduler_fingerprint`], payload = `entry.payload`) so the dump
2718/// filename carries the SAME variant hash the sidecar will. Pinned
2719/// equal to [`sidecar_variant_hash`] by a roundtrip test.
2720pub(crate) fn variant_hash_from_parts(
2721    entry: &KtstrTestEntry,
2722    resolved_topology: &crate::vmm::topology::Topology,
2723    work_type: &str,
2724) -> u64 {
2725    let fp = scheduler_fingerprint(entry);
2726    variant_hash_of(
2727        &resolved_topology.to_string(),
2728        &fp.scheduler,
2729        entry.payload.map(|p| p.name),
2730        work_type,
2731        &fp.sysctls,
2732        &fp.kargs,
2733    )
2734}
2735
/// Entry-derived scheduler metadata that every sidecar carries
/// regardless of pass/fail/skip.
///
/// Both write paths ([`write_sidecar`] and [`write_skip_sidecar`])
/// thread the same materialized fields through to their
/// `SidecarResult` constructors; keeping the derivation in a
/// named struct (rather than a 4-tuple) means a new
/// scheduler-level field shows up as a named field at both
/// writer sites and in every call-site binding, instead of as
/// an additional anonymous tuple slot that readers have to
/// remember the ordering of.
///
/// [`variant_hash_from_parts`] also consumes this struct: it feeds
/// `scheduler`, `sysctls` and `kargs` into the variant hash and
/// ignores `scheduler_commit`.
///
/// `pub(crate)` rather than `pub`: the intermediate struct is a
/// write-path detail, not a public API surface. No serde — this
/// is not a persisted shape, just a grouped return value.
///
/// Derives `Debug` for `assert_eq!` diagnostics, `Clone` so tests
/// can materialize a fixture once and reuse it across assertions,
/// and `PartialEq`/`Eq` so tests can compare whole fingerprints
/// in one statement rather than destructuring and asserting on
/// each field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct SchedulerFingerprint {
    /// Pretty scheduler name (matches `SidecarResult::scheduler`),
    /// e.g. `"eevdf"` or a scheduler-kind payload's declared name.
    /// Part of the variant hash.
    pub(crate) scheduler: String,
    /// Best-effort userspace scheduler commit; `None` for every
    /// current variant per
    /// [`crate::test_support::SchedulerSpec::scheduler_commit`].
    /// NOT part of the variant hash.
    pub(crate) scheduler_commit: Option<String>,
    /// Formatted `sysctl.<key>=<value>` lines derived from the
    /// scheduler's declared `sysctls()`. Part of the variant hash
    /// (sorted there, so declaration order does not matter).
    pub(crate) sysctls: Vec<String>,
    /// Kernel command-line args declared by the scheduler,
    /// forwarded verbatim. Part of the variant hash (sorted there,
    /// so declaration order does not matter).
    pub(crate) kargs: Vec<String>,
}
2773
2774/// Materialize the [`SchedulerFingerprint`] for a test entry.
2775///
2776/// A change to the sidecar schema (e.g. a new scheduler-level
2777/// field) extends this function + [`SchedulerFingerprint`] in
2778/// one place and every writer picks it up automatically.
2779fn scheduler_fingerprint(entry: &KtstrTestEntry) -> SchedulerFingerprint {
2780    let scheduler = entry.scheduler.name.to_string();
2781    // `SchedulerSpec::scheduler_commit()` returns `None` for every
2782    // variant (Eevdf, Discover, Path, KernelBuiltin) — the commit
2783    // string is not carried in the static spec; it comes from the
2784    // sidecar's run-time git probe instead. This call is here only
2785    // to surface the slot in the fingerprint so a future spec
2786    // variant carrying a commit would flow through automatically.
2787    let scheduler_commit = entry
2788        .scheduler
2789        .binary
2790        .scheduler_commit()
2791        .map(|s| s.to_string());
2792    let sysctls: Vec<String> = entry
2793        .scheduler
2794        .sysctls
2795        .iter()
2796        .map(|s| format!("sysctl.{}={}", s.key(), s.value()))
2797        .collect();
2798    let kargs: Vec<String> = entry
2799        .scheduler
2800        .kargs
2801        .iter()
2802        .map(|s| s.to_string())
2803        .collect();
2804    SchedulerFingerprint {
2805        scheduler,
2806        scheduler_commit,
2807        sysctls,
2808        kargs,
2809    }
2810}
2811
2812/// Compute the per-variant sidecar path and serialize + write the
2813/// result to disk.
2814///
2815/// Gauntlet variants of the same test differ by work_type, flags
2816/// (via scheduler args → sysctls/kargs), scheduler, and topology. A
2817/// filename of just `{test_name}.ktstr.json` causes variants to
2818/// overwrite each other, erasing all but the last-written result.
2819/// `sidecar_variant_hash` hashes the discriminating fields into a
2820/// short stable suffix so each variant gets its own sidecar file.
2821///
2822/// On the first call PER UNIQUE DIRECTORY within a process,
2823/// [`pre_clear_run_dir_once`] removes any pre-existing
2824/// `*.ktstr.json` files in the resolved directory so the run is a
2825/// clean snapshot rather than a mosaic of sidecars carried over
2826/// from a prior invocation that shared the same
2827/// `{kernel}-{project_commit}` key (e.g. re-running the suite
2828/// without committing changes).
2829/// Subsequent writes within the same process to the same directory
2830/// append into the cleared directory.
2831///
2832/// Pre-clear is SKIPPED when `KTSTR_SIDECAR_DIR` is set: the
2833/// operator chose that directory and owns its contents — silent
2834/// data loss is not acceptable on an explicit override. When the
2835/// override is unset (the default-path branch),
2836/// `std::fs::create_dir_all` materializes the directory BEFORE
2837/// pre-clear runs so the helper's canonicalize step always sees
2838/// an existing on-disk path; without this ordering, a missing
2839/// dir on the very first call would key the cache against the
2840/// raw path while a later call (after the dir exists) would key
2841/// against the canonicalized absolute path, splitting the cache
2842/// and causing the second call to re-fire pre-clear and wipe the
2843/// first call's sidecars.
2844///
2845/// CROSS-PROCESS SERIALIZATION: on the default path (override
2846/// unset), the call acquires advisory `LOCK_EX` on a per-run-key
2847/// sentinel file (`{runs_root}/.locks/{key}.lock`) before
2848/// pre-clear runs and holds it for the duration of the
2849/// pre-clear + serialize + write cycle. The lock prevents
2850/// process B's `pre_clear_run_dir_once` from interleaving with
2851/// process A's mid-write `std::fs::write` — the kernel-flock
2852/// critical section makes the (read_dir + remove_file) +
2853/// (serialize + write) sequence atomic with respect to peer
2854/// processes targeting the same `{kernel}-{project_commit}`
2855/// directory. The override path skips the lock for the same
2856/// reason it skips pre-clear: operator-chosen directories are
2857/// owned by the operator, so we do not place a `.locks/` sibling
2858/// inside (or above) their custom layout.
2859///
2860/// EX-around-the-whole-cycle (not just pre-clear) is the correct
2861/// choice: it makes the (read_dir + remove_file) + (serialize +
2862/// write) sequence atomic against concurrent peers, so no peer
2863/// observes a half-cleared directory or a mid-write sidecar.
2864///
2865/// A later peer process still RUNS its own pre-clear (its
2866/// `OnceLock` is process-local), but `pre_clear_run_dir_once` skips
2867/// the wipe when the dir's `.ktstr_run_epoch` sentinel already
2868/// records this session's [`crate::KTSTR_RUN_EPOCH_ENV`] token (a
2869/// peer cleared it earlier this session), sparing every
2870/// `{test}-{hash}.ktstr.json` written THIS session. Without that
2871/// sentinel a later peer's pre-clear would delete an earlier peer's
2872/// freshly-written sidecar — silent stats loss; the session token
2873/// closes that window. (Raw `cargo nextest run` sets no token, so
2874/// its peers fall back to wipe-everything and the loss can recur —
2875/// the orchestrated path is the supported one.)
2876///
2877/// PER-FILE ATOMICITY (both branches): the JSON is written to a
2878/// `<final>.tmp.<pid>.<run_id>` sibling and then `rename(2)`'d into
2879/// place. POSIX `rename` is atomic for same-directory destinations,
2880/// so a peer reader (`collect_sidecars`) never observes a partial
2881/// JSON payload — either the old contents stay or the new contents
2882/// replace them in one filesystem step. Two concurrent writers that
2883/// both target the same `{test_name}-{variant_hash}.ktstr.json`
2884/// (override path: two CI jobs sharing one operator-chosen dir;
2885/// default path: a torn-write window inside the flock body that the
2886/// flock would otherwise have to cover) cannot leave a half-written
2887/// JSON behind — last-rename-wins, both files are individually
2888/// well-formed. The `.tmp.<pid>.<run_id>` discriminator on the
2889/// staging name keeps two writers from racing on the same staging
2890/// path even when their final destinations collide. The flock on
2891/// the default path remains load-bearing for the pre-clear leg
2892/// (atomic write only protects the write itself, not the
2893/// `read_dir + remove_file` walk that pre-clear runs).
2894///
2895/// `label` is a caller-supplied noun for the context message ("skip
2896/// sidecar" / "sidecar") so the error chain points at the right call
2897/// site.
2898fn serialize_and_write_sidecar(sidecar: &SidecarResult, label: &str) -> anyhow::Result<()> {
2899    // Read the override ONCE. The two branches below carry the
2900    // result through structurally so neither leg re-reads
2901    // `KTSTR_SIDECAR_DIR` — preventing the override from flipping
2902    // mid-call (which would otherwise let an external mutation
2903    // between the dir resolve and the pre-clear gate either skip
2904    // the wipe on a default-path dir or fire a wipe on an
2905    // operator-chosen one).
2906    let (dir, do_pre_clear) = match sidecar_dir_override() {
2907        Some(path) => (path, false),
2908        None => (resolve_default_sidecar_dir(), true),
2909    };
2910    // Materialize the directory FIRST so `pre_clear_run_dir_once`
2911    // can canonicalize a path that exists on disk. Without this,
2912    // the very first invocation in a process resolves the cache
2913    // key against the raw relative path (canonicalize fails on a
2914    // missing dir, falls back to raw); subsequent invocations
2915    // resolve against the canonicalized absolute path because the
2916    // dir now exists. Two distinct keys for the same logical dir
2917    // → second invocation re-fires pre-clear and wipes the first
2918    // invocation's sidecars. Materializing pre-pre-clear closes
2919    // the relative-vs-absolute split.
2920    std::fs::create_dir_all(&dir)
2921        .with_context(|| format!("create sidecar dir {}", dir.display()))?;
2922    // Acquire the per-run-key cross-process flock for the duration
2923    // of the pre-clear + write cycle. The override branch (operator-
2924    // chosen directory) skips the lock for the same reason it skips
2925    // pre-clear — see the function-level doc. `_run_dir_lock` is
2926    // scoped to this function body so the kernel-side flock releases
2927    // via `OwnedFd::drop` when the function returns (success or
2928    // error path), making the lock RAII-managed without an explicit
2929    // unlock call.
2930    let _run_dir_lock = if do_pre_clear {
2931        Some(acquire_run_dir_flock(&dir)?)
2932    } else {
2933        None
2934    };
2935    if do_pre_clear {
2936        pre_clear_run_dir_once(&dir);
2937    }
2938    let variant_hash = sidecar_variant_hash(sidecar);
2939    let path = dir.join(format!(
2940        "{}-{:016x}.ktstr.json",
2941        sidecar.test_name, variant_hash
2942    ));
2943    let json = serde_json::to_string_pretty(sidecar)
2944        .with_context(|| format!("serialize {label} for '{}'", sidecar.test_name))?;
2945    // Atomic write: stage into a `.tmp.<pid>.<run_id>` sibling and
2946    // rename(2) into the final path. `rename` is atomic for
2947    // same-directory destinations on every filesystem ktstr supports
2948    // (ext4, btrfs, xfs, tmpfs, overlayfs); a peer reader never
2949    // observes a partial payload. The staging name carries the pid
2950    // AND the unique sidecar `run_id` so two writers in the same
2951    // process targeting identical final paths (e.g. two threads in
2952    // the budget-test stdout-capture path) cannot stomp each other's
2953    // staging file before either rename lands. On rename failure the
2954    // staging file is removed so a partial sidecar does not survive
2955    // as garbage in the run dir; rename success consumes the staging
2956    // entry and there is nothing to clean up.
2957    let pid = std::process::id();
2958    let staging = dir.join(format!(
2959        "{}-{:016x}.ktstr.json.tmp.{pid}.{}",
2960        sidecar.test_name, variant_hash, sidecar.run_id,
2961    ));
2962    std::fs::write(&staging, &json)
2963        .with_context(|| format!("write {label} staging {}", staging.display()))?;
2964    if let Err(e) = std::fs::rename(&staging, &path) {
2965        // Best-effort cleanup of the staged payload; ignore the
2966        // unlink error so the rename failure is what surfaces
2967        // (the rename error names the actual problem).
2968        let _ = std::fs::remove_file(&staging);
2969        return Err(anyhow::Error::from(e).context(format!(
2970            "rename {label} staging {} -> {}",
2971            staging.display(),
2972            path.display(),
2973        )));
2974    }
2975    LAST_SIDECAR_PATH.with(|p| *p.borrow_mut() = Some(path.clone()));
2976    Ok(())
2977}
2978
thread_local! {
    /// Absolute path of the most recent sidecar this thread wrote (via
    /// [`serialize_and_write_sidecar`]). The dispatch run loop
    /// ([`crate::test_support::eval::run_ktstr_test_inner`]) reads and
    /// clears it after the run to finalize the persisted verdict to the
    /// test's FINAL (post-inversion) outcome. nextest is process-per-test
    /// so a run writes one sidecar; a value left from an earlier phase is
    /// overwritten by the current write, so the take always yields this
    /// run's sidecar.
    ///
    /// Starts as `None` (no sidecar written yet on this thread). The
    /// slot is per-thread, so a take only sees writes made on the same
    /// thread. Drained through [`take_last_sidecar_path`].
    static LAST_SIDECAR_PATH: std::cell::RefCell<Option<PathBuf>> =
        const { std::cell::RefCell::new(None) };
}
2991
2992/// Take (read + clear) the path of the sidecar most recently written on
2993/// this thread, or `None` when no sidecar was written this run (an
2994/// early bail before any write). See [`LAST_SIDECAR_PATH`].
2995///
2996/// MUST be drained exactly once per run — `run_ktstr_test_inner` does
2997/// this after each dispatch. The thread-local persists across calls in
2998/// a process, so a caller that writes a sidecar WITHOUT a following take
2999/// would leave a stale path for the next take to consume; in practice
3000/// only `run_ktstr_test_inner` pairs a write with a take, and a stale
3001/// path points at a dropped tempdir so the finalize read fails benignly.
3002pub(crate) fn take_last_sidecar_path() -> Option<PathBuf> {
3003    LAST_SIDECAR_PATH.with(|p| p.borrow_mut().take())
3004}
3005
3006/// Overwrite a written sidecar's verdict bits with the test's FINAL
3007/// (post-inversion) `(passed, skipped, inconclusive)` outcome — see
3008/// [`crate::test_support::dispatch::Verdict::sidecar_bits`] — and set
3009/// [`SidecarResult::expected_failure`] when an actual scenario
3010/// failure/inconclusive was inverted to a pass/skip. Rewrites the file
3011/// atomically (temp + rename).
3012///
3013/// A no-op when the final verdict already matches what was persisted (an
3014/// ordinary pass/fail/skip — no `expect_err`/`expect_auto_repro`
3015/// inversion). Best-effort: a read/parse/serialize/write error is
3016/// surfaced on stderr and swallowed so the raw sidecar stands (the
3017/// footer then falls back to it) rather than failing the run.
3018pub(crate) fn finalize_sidecar_verdict(
3019    path: &std::path::Path,
3020    passed: bool,
3021    skipped: bool,
3022    inconclusive: bool,
3023) {
3024    let Ok(json) = std::fs::read_to_string(path) else {
3025        return;
3026    };
3027    let Ok(mut sc) = serde_json::from_str::<SidecarResult>(&json) else {
3028        eprintln!(
3029            "ktstr: finalize_sidecar_verdict: unparseable sidecar {}",
3030            path.display()
3031        );
3032        return;
3033    };
3034    // The run's telemetry is failure-mode-dominated when its scenario
3035    // actually failed/was-inconclusive but the final verdict is a
3036    // pass/skip (an inversion) — `perf-delta` excludes such rows.
3037    let raw_failed = sc.is_fail() || sc.is_inconclusive();
3038    let expected_failure = raw_failed && (passed || skipped);
3039    if sc.passed == passed
3040        && sc.skipped == skipped
3041        && sc.inconclusive == inconclusive
3042        && sc.expected_failure == expected_failure
3043    {
3044        return;
3045    }
3046    sc.passed = passed;
3047    sc.skipped = skipped;
3048    sc.inconclusive = inconclusive;
3049    sc.expected_failure = expected_failure;
3050    let Ok(out) = serde_json::to_string_pretty(&sc) else {
3051        return;
3052    };
3053    // Stage with a `.ktstr.json.tmp.…` suffix (append, NOT
3054    // `with_extension`, which would drop `.json`) so a hard-crash orphan
3055    // — write succeeded but rename did not — is reaped by
3056    // `pre_clear_run_dir_once` via `is_sidecar_staging_filename`, the
3057    // same way the primary write's staging file is.
3058    let pid = std::process::id();
3059    let mut staging = path.as_os_str().to_owned();
3060    staging.push(format!(".tmp.finalize.{pid}"));
3061    let staging = std::path::PathBuf::from(staging);
3062    if std::fs::write(&staging, &out).is_ok() && std::fs::rename(&staging, path).is_err() {
3063        let _ = std::fs::remove_file(&staging);
3064    }
3065}
3066
3067/// Remove the failure-dump artifacts
3068/// (`{test}-{variant_hash}.failure-dump.json` and
3069/// `{test}-{variant_hash}.repro.failure-dump.json`) for `test_name` in
3070/// the current sidecar dir.
3071///
3072/// Called when a run's FINAL outcome is a pass/skip but it wrote NO
3073/// sidecar — the run crashed before the guest produced a parseable
3074/// result (e.g. an `expect_err` test with a host-triggered BPF crash),
3075/// so [`finalize_sidecar_verdict`] had nothing to finalize. The freeze
3076/// coordinator wrote the dump unconditionally; without a sidecar to mark
3077/// the pass, the footer's dump-only trigger
3078/// ([`summarize_one_run_dir`] flags a dump with no parsed sidecar) would
3079/// surface this PASSING test as FAILED. Removing the dump keeps the
3080/// footer consistent with nextest's pass. Best-effort: a missing dump
3081/// (the normal clean-pass case) is fine. A genuine pre-sidecar failure
3082/// (final = Fail) does NOT call this, so its dump still flags.
3083pub(crate) fn suppress_failure_dumps(test_name: &str, variant_hash: u64) {
3084    let dir = sidecar_dir();
3085    // Remove THIS variant's dumps by the precise `{test}-{hash}` key, not
3086    // a `{test}-*` glob: a glob would also delete a SIBLING gauntlet
3087    // preset's legitimately-failing dump in the same run dir.
3088    for suffix in [".failure-dump.json", ".repro.failure-dump.json"] {
3089        let _ = std::fs::remove_file(dir.join(format!("{test_name}-{variant_hash:016x}{suffix}")));
3090    }
3091}
3092
3093/// `Some(path)` when `KTSTR_SIDECAR_DIR` is set non-empty,
3094/// returning the override path verbatim; `None` when the env
3095/// var is unset or empty (default-path branch). Single source
3096/// of truth for the override read so [`sidecar_dir`] and
3097/// [`serialize_and_write_sidecar`] (which gates pre-clear on
3098/// the override's presence) share one env-read site rather
3099/// than each calling `std::env::var` independently.
3100///
3101/// The `is_empty()` filter is deliberate: a defensively-cleared
3102/// `KTSTR_SIDECAR_DIR=""` must NOT be treated as an override
3103/// (joining an empty path onto the run-root would silently
3104/// alias the runs-root itself, contaminating the listing).
3105/// Empty-string aliases unset, matching the
3106/// `if let Ok(d) ... && !d.is_empty()` predicate the function
3107/// replaced.
3108///
3109/// `serialize_and_write_sidecar` interprets `Some(_)` as the
3110/// "operator chose this dir, do not pre-clear" gate — silent
3111/// data loss is unacceptable on an explicit override (the
3112/// override is for users who want exact control over where
3113/// sidecars land: test isolation, archival capture, custom CI
3114/// layouts).
3115fn sidecar_dir_override() -> Option<PathBuf> {
3116    std::env::var(crate::KTSTR_SIDECAR_DIR_ENV)
3117        .ok()
3118        .filter(|d| !d.is_empty())
3119        .map(PathBuf::from)
3120}
3121
3122/// Emit a one-shot stderr warning when [`detect_project_commit`]
3123/// resolves to `None` and the run directory therefore lands at
3124/// `{kernel}-unknown`. Operators in this state lose the
3125/// `{project_commit}` discriminator on the run-directory name —
3126/// every non-git invocation at the same kernel collides on a
3127/// single directory, with the latest run pre-clearing the
3128/// previous one's sidecars. The warning surfaces this loss-of-isolation
3129/// risk so the operator can either set `KTSTR_SIDECAR_DIR` to
3130/// disambiguate per-run, or place the project tree under git
3131/// so each run carries its own commit hash.
3132///
3133/// `OnceLock<()>` gates the warning to fire EXACTLY ONCE per
3134/// process: every gauntlet variant resolves a sidecar directory
3135/// independently (via [`sidecar_dir`] and
3136/// [`serialize_and_write_sidecar`]), so without the gate the
3137/// operator would see thousands of duplicate warnings interleaved
3138/// with test output. Called via [`resolve_default_sidecar_dir`] —
3139/// which is the shared default-path body that both [`sidecar_dir`]
3140/// and [`serialize_and_write_sidecar`] funnel through — so the
3141/// warning fires only on the default-path branch. The override
3142/// branch in either caller returns before
3143/// [`resolve_default_sidecar_dir`] is reached, so an operator who
3144/// set `KTSTR_SIDECAR_DIR` to disambiguate non-git runs does not
3145/// see a misleading "commit unknown" warning that does not apply
3146/// to their effective directory layout.
3147///
3148/// Implementation is split into a public-facing wrapper
3149/// (this function) that owns the process-global `OnceLock` and
3150/// targets stderr, and a pure inner helper
3151/// [`warn_unknown_project_commit_inner`] that takes the
3152/// `&OnceLock<()>` gate and the `&mut dyn Write` sink as
3153/// parameters. The split lets tests drive the warning logic
3154/// against a local `OnceLock` and a `Vec<u8>` sink without
3155/// fighting the process-global gate or the global stderr fd —
3156/// the wrapper's behavior is what the inner does, just with
3157/// the static gate and stderr supplied.
3158fn warn_unknown_project_commit_once() {
3159    static WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
3160    let mut sink = std::io::stderr();
3161    warn_unknown_project_commit_inner(&WARNED, &mut sink);
3162}
3163
3164/// Pure helper for [`warn_unknown_project_commit_once`]: gate the
3165/// warning on `gate` and write the warning text to `sink` exactly
3166/// once across the gate's lifetime. Both parameters are taken by
3167/// reference so call sites supply ownership semantics that match
3168/// their gating story:
3169/// - The production wrapper passes a `'static` `OnceLock<()>` so
3170///   the gate spans the whole process and a stderr handle so the
3171///   warning lands in the operator's terminal.
3172/// - Tests pass a local `OnceLock<()>` so each test gets a fresh
3173///   gate (no cross-test contamination via a process-global)
3174///   and a `Vec<u8>` sink so the test can read back the emitted
3175///   bytes and assert on the warning text.
3176///
/// Errors from `writeln!` are ignored via `let _ =`: a metadata
/// probe warning must not gate sidecar writes. This DEPARTS from
/// the previous `eprintln!` semantics (which panic on stderr
/// write failure per the std docs) — here the write error is
/// dropped silently instead.
fn warn_unknown_project_commit_inner(
    gate: &std::sync::OnceLock<()>,
    sink: &mut dyn std::io::Write,
) {
    // The closure runs only for the call that initializes `gate`;
    // every later call sharing the same gate writes nothing.
    let emit_warning = || {
        // Best-effort: a failed write to the sink is deliberately dropped.
        let _ = writeln!(
            sink,
            "ktstr: WARNING: project commit unavailable (cwd not in a git \
             repo, or HEAD unreadable); runs at this kernel overwrite \
             each other in target/ktstr/{{kernel}}-unknown/. Set \
             KTSTR_SIDECAR_DIR=<unique-path> per run, or run from inside a \
             git repo with at least one commit."
        );
    };
    gate.get_or_init(emit_warning);
}
3198
3199/// Remove PRIOR-SESSION `*.ktstr.json` files (and orphaned staging
3200/// files) in the resolved run directory, exactly once per unique
3201/// directory per process.
3202///
3203/// "Prior-session" is gated on the [`crate::KTSTR_RUN_EPOCH_ENV`]
3204/// session token: when set (the orchestrated `cargo ktstr test`
3205/// path) the first process to clear a dir records the token in the
3206/// `.ktstr_run_epoch` sentinel, and a later peer process whose token
3207/// matches SKIPS the wipe entirely — sparing every sidecar this
3208/// session's peers wrote (nextest is process-per-test). A
3209/// differing/absent sentinel (new session, or raw `cargo nextest
3210/// run` with no token) wipes every `*.ktstr.json` match and records
3211/// the token — see the CONCURRENT WRITERS (cross-process) section.
3212///
3213/// The run-key format is `{kernel}-{project_commit}` (see
3214/// [`sidecar_dir`]), so two `cargo ktstr test` invocations sharing
3215/// the same kernel and project commit (the typical "re-run the
3216/// suite without committing changes" loop) resolve to the same
3217/// directory. Without
3218/// pre-clearing, each subsequent run would land its sidecars next
3219/// to the previous run's, leaving downstream `cargo ktstr stats`
3220/// readers to see a mosaic of two distinct test outcomes for the
3221/// same variant — the variant-hash suffix on each filename
3222/// prevents overwrites within a single run, but ALSO prevents the
3223/// next run from naturally clobbering the previous one's files
3224/// when the test set or pass/fail mix changes. Wiping
3225/// `*.ktstr.json` once at first-write makes each run a clean
3226/// snapshot of (kernel, project commit) — last-SESSION-wins (a new
3227/// session's full sidecar set replaces the prior session's, while
3228/// peers within one session coexist via the epoch gate).
3229///
3230/// PER-DIRECTORY KEYING: the cache is a `Mutex<HashSet<PathBuf>>`
3231/// keyed on the canonicalized `dir` (with raw `dir` as fallback
3232/// when canonicalize fails — e.g. the directory does not yet
3233/// exist). A `OnceLock<()>` would fire once for the FIRST
3234/// directory only, leaving subsequent writes to other directories
3235/// unprotected. The HashSet ensures every distinct directory the
3236/// process writes to gets pre-cleared exactly once, regardless of
3237/// ordering. Canonicalization collapses symlink aliases so two
3238/// path spellings of the same on-disk dir share one entry.
3239///
3240/// In production today only the default-path
3241/// `runs_root().join({kernel}-{project_commit})` is fed into this
3242/// function (the override path skips pre-clear entirely via
3243/// [`sidecar_dir_override`]), so per-process cache size
3244/// stays at exactly 1 entry. The HashSet shape is the
3245/// future-proof keying for direct unit-test fixtures (which
3246/// rotate tempdir paths through this helper) and any future
3247/// production code path that writes default-path sidecars from
3248/// multiple distinct (kernel, commit) pairs in one process.
3249///
3250/// SCOPE: only `*.ktstr.json` sidecars and orphaned `.tmp` staging
3251/// files in the immediate directory are removed. Subdirectories
3252/// (per-job gauntlet layouts written by external orchestrators) and
3253/// non-sidecar files are left untouched — pre-clear is shallow. Note
3254/// that `collect_sidecars` walks one level of subdirectories, so
3255/// stale sidecars left in subdirectories from a prior run will still
3256/// appear in `cargo ktstr stats` output until the operator removes
3257/// them. The function never deletes the directory itself; production
3258/// callers (`serialize_and_write_sidecar`) materialize the directory
3259/// via `create_dir_all` BEFORE invoking this helper. Beyond the
3260/// wipe, the only other side effect is writing the `.ktstr_run_epoch`
3261/// session sentinel (when a token is set — see CONCURRENT WRITERS).
3262///
3263/// CONCURRENT WRITERS (intra-process): the per-process
3264/// `Mutex<HashSet>` guards against multiple writes within a single
3265/// process re-clearing the same directory. The cache mutex is held
3266/// ACROSS the `read_dir` walk and per-file removals — releasing it
3267/// after the cache insert but before the walk would open a TOCTOU
3268/// window where a sibling thread observes the cached entry, skips
3269/// its own pre-clear, writes a sidecar, and then the original
3270/// thread's still-pending walk deletes that sibling's fresh file.
3271/// Holding the lock across the bounded walk closes the window.
3272///
3273/// CONCURRENT WRITERS (cross-process): nextest is process-per-test,
3274/// so distinct `#[ktstr_test]` functions run as separate processes
3275/// sharing one `{kernel}-{project_commit}` dir. Each has its own
3276/// `OnceLock` and runs its own pre-clear. The
3277/// [`crate::KTSTR_RUN_EPOCH_ENV`] session token is what keeps a
3278/// later peer from deleting an earlier peer's fresh sidecar: the
3279/// first process records the token in the `.ktstr_run_epoch`
3280/// sentinel; a peer whose token matches SKIPS its wipe, sparing
3281/// every `{test}-{hash}.ktstr.json` this session wrote.
3282/// `serialize_and_write_sidecar`'s `LOCK_EX` serializes the
3283/// pre-clear+write cycle so the sentinel read/wipe/write is atomic
3284/// against peers — but serialization ALONE does NOT spare A's
3285/// already-written file from B's later wipe (B runs after A released
3286/// the lock); the sentinel does. Without a token (raw `cargo nextest
3287/// run`) peers fall back to wipe-everything and can lose each other's
3288/// sidecars — the orchestrated path is the supported one.
3289///
3290/// FAILURE: `read_dir` errors are silently ignored — defensive
3291/// behavior for direct callers (e.g. unit tests probing the
3292/// missing-dir edge); production callers materialize the
3293/// directory before invoking this helper, so the missing-dir
3294/// branch is unreachable in production today. Metadata probes
3295/// must not gate sidecar writes. Per-file `remove_file`
3296/// errors are also silently ignored — a partial pre-clear leaves
3297/// either an overwrite (when the new run reproduces a stale
3298/// file's exact `{test_name}-{variant_hash}.ktstr.json` name —
3299/// the desired outcome) or a coexistence (when the new run's
3300/// variant set differs from the prior run's, leaving stale
3301/// sidecars next to fresh ones — the undesired outcome that
3302/// pre-clear was meant to prevent). Coexistence is the acceptable
3303/// degradation here: a noisy pre-clear failure should not abort
3304/// the test run.
3305fn pre_clear_run_dir_once(dir: &std::path::Path) {
3306    use std::collections::HashSet;
3307    use std::path::PathBuf;
3308    use std::sync::{Mutex, OnceLock};
3309    static PRE_CLEARED: OnceLock<Mutex<HashSet<PathBuf>>> = OnceLock::new();
3310    // Canonicalize so two spellings of the same on-disk dir share
3311    // one cache entry. Falls back to the raw path when canonicalize
3312    // fails (the directory may not exist yet on the very first
3313    // write, in which case the raw path keys the entry; subsequent
3314    // calls with the same raw path also miss canonicalize the
3315    // same way and share the entry).
3316    let cache_key = dir.canonicalize().unwrap_or_else(|_| dir.to_path_buf());
3317    let cache = PRE_CLEARED.get_or_init(|| Mutex::new(HashSet::new()));
3318    let mut guard = cache.lock_unpoisoned();
3319    if guard.contains(&cache_key) {
3320        return;
3321    }
3322    // First time this directory has been seen — wipe sidecars while
3323    // the cache mutex is still held. Releasing the guard before the
3324    // read_dir walk would open a TOCTOU window: a sibling thread that
3325    // observes the now-cached entry would skip its own pre-clear,
3326    // proceed to write a sidecar, and the original thread's walk
3327    // (running after the drop) would then delete that sibling's
3328    // freshly-written file. The walk is one read_dir + a bounded
3329    // number of `*.ktstr.json` removals, so holding the lock across
3330    // it is brief; concurrent calls against DIFFERENT directories
3331    // serialize through this critical section but each does a small,
3332    // bounded amount of I/O, which is acceptable for a metadata
3333    // probe call pattern. The cache insert happens AFTER the wipe
3334    // completes (rather than before) so a panic mid-wipe does not
3335    // poison the cache with an entry whose wipe never actually ran.
3336    // The mutex itself enforces serialization across threads; the
3337    // entry only records "wipe completed for this dir" and must
3338    // never be observed without the wipe having succeeded. `guard`
3339    // is dropped at end-of-scope so the lock release happens after
3340    // the loop completes.
3341    let session_token = run_session_token();
3342    let sentinel = dir.join(SESSION_SENTINEL);
3343    if let Some(token) = &session_token
3344        && std::fs::read_to_string(&sentinel).is_ok_and(|recorded| recorded == *token)
3345    {
3346        // A peer test process in THIS session already cleared the dir
3347        // (the sentinel records the session token under the flock);
3348        // its and the other peers' current-session sidecars must
3349        // survive, so skip the wipe entirely. See CONCURRENT WRITERS.
3350        guard.insert(cache_key);
3351        return;
3352    }
3353    if let Ok(entries) = std::fs::read_dir(dir) {
3354        for entry in entries.flatten() {
3355            let path = entry.path();
3356            if !path.is_file() {
3357                continue;
3358            }
3359            // Two file shapes are reaped here (current-session peers
3360            // were already spared by the sentinel skip above, so a
3361            // file reaching this point is prior-session or orphaned
3362            // residue):
3363            // - `<test>-<hash>.ktstr.json` — sidecars from a PRIOR
3364            //   session sharing this `{kernel}-{project_commit}` key.
3365            // - `<test>-<hash>.ktstr.json.tmp.<pid>.<run_id>` —
3366            //   orphaned staging from a writer that died between
3367            //   `write` and `rename` in `serialize_and_write_sidecar`
3368            //   (`is_sidecar_filename` excludes these — the extension
3369            //   is `<run_id>`, not `json` — so the staging sweep is
3370            //   what reaps them). The flock makes reaping an in-flight
3371            //   stage impossible: a live peer holds the lock we hold.
3372            if is_sidecar_filename(&path) || is_sidecar_staging_filename(&path) {
3373                let _ = std::fs::remove_file(&path);
3374            }
3375        }
3376    }
3377    // Record this session's token so peer processes skip re-wiping.
3378    // Best-effort: if the write fails, a later peer won't see the
3379    // token and re-wipes (the pre-fix behavior) — no worse than the
3380    // unfixed code, just the cross-test loss left unfixed for this
3381    // dir. Written AFTER the wipe so a crash mid-wipe leaves no
3382    // stale sentinel falsely claiming the dir was cleared.
3383    if let Some(token) = &session_token {
3384        let _ = std::fs::write(&sentinel, token);
3385    }
3386    // Record completion AFTER the wipe finishes, not before. If a
3387    // panic interrupts the loop above, the cache remains empty so
3388    // a subsequent call retries the wipe rather than skipping it
3389    // on the assumption that a prior call already cleared the dir.
3390    guard.insert(cache_key);
3391    drop(guard);
3392}
3393
3394/// Filename of the per-run-directory session sentinel that records
3395/// the [`crate::KTSTR_RUN_EPOCH_ENV`] token of the session that last
3396/// cleared the dir. A dotfile so every sidecar reader ignores it
3397/// (`is_sidecar_filename` requires a `.json` extension and
3398/// `classify_run_artifact` matches none of its suffixes), and it
3399/// lives in the run dir itself (which the caller already
3400/// `create_dir_all`'d) rather than the `.locks/` sibling.
3401const SESSION_SENTINEL: &str = ".ktstr_run_epoch";
3402
3403/// Read the `cargo ktstr test` session token from
3404/// [`crate::KTSTR_RUN_EPOCH_ENV`] — an opaque per-invocation value
3405/// the orchestrator stamps once before nextest spawns, inherited by
3406/// every child test process.
3407///
3408/// `None` when the variable is unset or empty (raw `cargo nextest
3409/// run` — no orchestrator); [`pre_clear_run_dir_once`] then wipes
3410/// every sidecar match (status quo for the unorchestrated path).
3411/// `Some` lets pre-clear record/match the `.ktstr_run_epoch`
3412/// sentinel so a later peer process skips re-wiping a dir this
3413/// session already cleared, sparing the peers' sidecars.
3414fn run_session_token() -> Option<String> {
3415    std::env::var(crate::KTSTR_RUN_EPOCH_ENV)
3416        .ok()
3417        .filter(|v| !v.is_empty())
3418}
3419
3420/// Predicate: is `path` an atomic-write staging file produced by
3421/// [`serialize_and_write_sidecar`]?
3422///
3423/// True iff the filename matches the `<test>-<hash>.ktstr.json.tmp.…`
3424/// shape — `is_sidecar_filename` rejects these because the
3425/// extension is `<run_id>` rather than `json`, so a separate
3426/// predicate is needed for the [`pre_clear_run_dir_once`] sweep
3427/// that reaps orphaned staging files. Filename-component check
3428/// (rather than full-path string) for the same load-bearing reason
3429/// `is_sidecar_filename` uses `Path::file_name()`: a `.ktstr.json.tmp.`
3430/// substring inside an ancestor segment must not match.
fn is_sidecar_staging_filename(path: &std::path::Path) -> bool {
    // Marker that appears in an atomic-write staging file's name.
    const STAGING_MARKER: &str = ".ktstr.json.tmp.";
    // Inspect only the final path component, so the marker appearing
    // in an ancestor directory name never matches. A path with no
    // final component, or a non-UTF-8 name, is not a staging file.
    match path.file_name().and_then(std::ffi::OsStr::to_str) {
        Some(name) => name.contains(STAGING_MARKER),
        None => false,
    }
}
3436
3437/// Wall-clock timeout for [`acquire_run_dir_flock`] before it gives
3438/// up and returns an error. 30 s is generous for the per-write
3439/// critical section: each peer writer holds the lock for at most
3440/// one (read_dir + bounded removes) + one (serialize + write)
3441/// cycle, all measured in milliseconds. A holder that does not
3442/// release within 30 s has stalled (a stuck filesystem, a panic
3443/// inside the locked section that somehow survived the RAII
3444/// drop, etc.) and surfacing that as an actionable error beats
3445/// hanging the test run indefinitely. The timeout is asymmetric
3446/// with the cache-store 300 s (5 minute) timeout because
3447/// cache-store waits for tens of test runs to drain whereas this
3448/// lock waits for at most one peer write.
3449const RUN_DIR_LOCK_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
3450
3451/// Compute the per-run-key flock sentinel path for `dir`.
3452///
3453/// Layout: `{dir.parent()}/.locks/{dir.file_name()}.lock`. When
3454/// `dir = {runs_root}/{key}` (the production default-path shape),
3455/// this resolves to `{runs_root}/.locks/{key}.lock`. Sourced from
3456/// [`crate::flock::LOCK_DIR_NAME`] so a relocation of the lock
3457/// subdirectory updates one place across both this surface and
3458/// the cache module.
3459///
3460/// Returns `None` when `dir` has no parent (root) or no
3461/// `file_name` component (current dir, root) — neither case is
3462/// reachable on the production default path
3463/// ([`runs_root`] always returns a non-root multi-component
3464/// path), but the function is total over the input domain so a
3465/// future caller passing an unusual path surfaces a clean `None`
3466/// rather than panicking on `unwrap`.
3467///
3468/// Pure function over the input path — no I/O. The caller is
3469/// responsible for materializing the parent `.locks/`
3470/// subdirectory before opening the lockfile —
3471/// [`crate::flock::acquire_flock_with_timeout`] handles that
3472/// lazily.
3473fn run_dir_lock_path(dir: &std::path::Path) -> Option<PathBuf> {
3474    let parent = dir.parent()?;
3475    let leaf = dir.file_name()?;
3476    let mut filename = std::ffi::OsString::from(leaf);
3477    filename.push(".lock");
3478    Some(parent.join(crate::flock::LOCK_DIR_NAME).join(filename))
3479}
3480
3481/// Acquire `LOCK_EX` on the per-run-key flock sentinel for `dir`.
3482/// Default-timeout wrapper over [`acquire_run_dir_flock_with_timeout`];
3483/// see that helper's doc for the full behavior contract. The
3484/// timeout split exists so tests can exercise the contention /
3485/// timeout path with a sub-second deadline rather than waiting
3486/// 30 s of real time per assertion.
3487fn acquire_run_dir_flock(dir: &std::path::Path) -> anyhow::Result<std::os::fd::OwnedFd> {
3488    acquire_run_dir_flock_with_timeout(dir, RUN_DIR_LOCK_TIMEOUT)
3489}
3490
3491/// Test-parametrizable inner of [`acquire_run_dir_flock`].
3492///
3493/// Resolves the per-run-key lockfile path via [`run_dir_lock_path`]
3494/// then delegates to [`crate::flock::acquire_flock_with_timeout`],
3495/// which handles parent-directory creation, the poll loop, the
3496/// `tracing::debug!` contention log, and the formatted timeout
3497/// error. The `context` argument names the run directory and the
3498/// `remediation` argument supplies the operator-facing recovery
3499/// hint about peer cargo ktstr test processes that the shared
3500/// helper appends to the timeout error.
3501///
3502/// Returns `Err` on:
3503/// - `run_dir_lock_path(dir)` returning `None` (no parent / no
3504///   file_name — production default path always satisfies both,
3505///   so this is a defensive arm),
3506/// - any error from [`crate::flock::acquire_flock_with_timeout`]
3507///   (parent directory creation failure, `try_flock` error, or
3508///   wall-clock `timeout` elapsing).
3509///
3510/// Returns `Ok(OwnedFd)` on successful acquire. Caller drops the
3511/// fd to release the kernel-side flock; the OFD-bound semantics
3512/// of `flock(2)` mean no explicit unlock call is required —
3513/// `OwnedFd::drop` runs `close(2)` which releases the lock when
3514/// no other fd refers to the same OFD (the fresh `try_flock`
3515/// open guarantees uniqueness).
3516fn acquire_run_dir_flock_with_timeout(
3517    dir: &std::path::Path,
3518    timeout: std::time::Duration,
3519) -> anyhow::Result<std::os::fd::OwnedFd> {
3520    let lock_path = run_dir_lock_path(dir).ok_or_else(|| {
3521        anyhow::anyhow!(
3522            "cannot derive run-dir lock path from {} (no parent or no file_name component)",
3523            dir.display(),
3524        )
3525    })?;
3526    let context = format!("run-dir {}", dir.display());
3527    crate::flock::acquire_flock_with_timeout(
3528        &lock_path,
3529        crate::flock::FlockMode::Exclusive,
3530        timeout,
3531        &context,
3532        Some(
3533            "A peer cargo ktstr test process is writing sidecars to the \
3534             same {kernel}-{project_commit} directory; wait for it to \
3535             finish or kill it, then retry.",
3536        ),
3537    )
3538}
3539
3540/// Emit a minimal sidecar for a PRE-VM-BOOT skip path.
3541///
3542/// Stats tooling enumerates sidecars to compute pass/skip/fail
3543/// rates; when a test bails before `run_ktstr_test_inner` reaches
3544/// the VM-run site that calls [`write_sidecar`], the skip is
3545/// invisible to post-run analysis — it shows up as a missing
3546/// result rather than a recorded skip.
3547///
3548/// This helper writes a sidecar flagged `skipped: true, passed: true`
3549/// with empty VM telemetry (no monitor, no stimulus events, no
3550/// verifier stats, no kvm stats, no payload metrics). Stats tooling
3551/// that subtracts skipped runs from the pass count treats the entry
3552/// correctly.
3553///
3554/// # Distinction from in-VM `AssertResult::skip` paths
3555///
3556/// There are TWO classes of skip, each with its own sidecar writer:
3557///
3558/// 1. **Pre-VM-boot skips** route through this helper
3559///    (`write_skip_sidecar`). Examples:
3560///    - `performance_mode` gated off via `KTSTR_NO_PERF_MODE`
3561///      (see `run_ktstr_test_inner`),
3562///    - `ResourceContention` at `builder.build()` or `vm.run()`
3563///      (all-slots-busy / transient host-resource contention — the
3564///      VM never booted).
3565///
3566///    These paths write a MINIMAL sidecar: empty VM telemetry,
3567///    `skipped: true`, and BOTH `payload` and `work_type` resolved
3568///    exactly as a run of this config would (the entry's declared
3569///    payload and [`crate::test_support::args::current_work_type`]) so
3570///    the skip shares the run's variant identity — a later run of the
3571///    same config overwrites this skip's sidecar instead of coexisting
3572///    with it. There is no VmResult to drain because the VM didn't boot.
3573///
3574/// 2. **In-VM `AssertResult::skip` returns** — e.g. the
3575///    empty-cpuset skip in `scenario::run_scenario`
3576///    (`AssertResult::skip("not enough CPUs/LLCs")`), or the
3577///    `need >= 4 CPUs` checks in `scenario::dynamic::*` — route
3578///    through [`write_sidecar`] at `run_ktstr_test_inner`'s end.
3579///    The guest VM fully booted, ran through scenario setup,
3580///    discovered the topology couldn't accommodate the test, and
3581///    returned early. The resulting sidecar carries REAL VM
3582///    telemetry (monitor, kvm_stats, verifier_stats) alongside
3583///    `skipped: true` — not a blind spot, just a richer record
3584///    than what this helper emits.
3585///
3586/// The asymmetry is intentional: pre-VM-boot skips have no
3587/// telemetry to record, while in-VM skips do. Stats tooling that
/// wants to uniformly discount skipped runs filters on
/// [`SidecarResult::skipped`] being `true`, regardless of which
/// writer produced the entry — both set the field identically.
3591///
3592/// Returns `Err` when the sidecar directory cannot be created, the
3593/// JSON cannot be serialized, or the file write fails. Callers that
3594/// ignore the Result accept the risk of stats-tooling blind spots on
3595/// this run.
3596pub(crate) fn write_skip_sidecar(
3597    entry: &KtstrTestEntry,
3598    resolved_topology: &crate::vmm::topology::Topology,
3599) -> anyhow::Result<()> {
3600    let SchedulerFingerprint {
3601        scheduler,
3602        scheduler_commit,
3603        sysctls,
3604        kargs,
3605    } = scheduler_fingerprint(entry);
3606    let sidecar = SidecarResult {
3607        test_name: entry.name.to_string(),
3608        perf_delta_assertions: entry
3609            .perf_delta_assertions
3610            .iter()
3611            .map(|&a| a.into())
3612            .collect(),
3613        // The RESOLVED topology a run of this preset would boot
3614        // (resolve_vm_topology(entry, topo)), NOT the declared
3615        // entry.topology — for a topology gauntlet each preset boots a
3616        // distinct topology, so recording the declared value would make
3617        // every preset share one variant_hash and clobber. For a plain
3618        // test (no override) resolved == declared. The skip and the run
3619        // of one preset thus share a variant_hash (the run path records
3620        // the same resolved topology), so a flaky test that skips on one
3621        // attempt and runs on the retry writes one sidecar.
3622        topology: resolved_topology.to_string(),
3623        scheduler,
3624        scheduler_commit,
3625        // A skip resolves no scheduler binary (no run), so there is no
3626        // discovery path to record.
3627        resolve_source: None,
3628        project_commit: detect_project_commit(),
3629        // A skip never runs the payload. Still record the declared
3630        // payload name so stats tooling can attribute the skip to
3631        // the payload-gauntlet variant rather than losing the
3632        // association.
3633        payload: entry.payload.map(|p| p.name.to_string()),
3634        metrics: Vec::new(),
3635        passed: false,
3636        skipped: true,
3637        inconclusive: false,
3638        expected_failure: false,
3639        stats: Default::default(),
3640        monitor: None,
3641        // A skip never ran the VM, so no periodic captures fired.
3642        periodic_fired: 0,
3643        periodic_target: 0,
3644        // A skip never booted the VM, so it has no measured budget. 0/0
3645        // maps to None on the GauntletRow's cpu_budget dim (skips carry no
3646        // budget identity; the skipped=true flag, not a sentinel field
3647        // value, marks them).
3648        vcpus: 0,
3649        cpu_budget: 0,
3650        stimulus_events: Vec::new(),
3651        // A skip never ran the workload, but it carries the SAME
3652        // work_type a run of this config would (current_work_type reads
3653        // the per-variant --ktstr-work-type arg, identical across nextest
3654        // retry attempts). That keeps the skip's variant_hash equal to
3655        // the run's, so a flaky test that skips on one attempt and runs
3656        // on the retry writes one sidecar (the retry overwrites the skip)
3657        // rather than two coexisting files the footer would both flag.
3658        // Skips stay identified by skipped=true, not by a work_type
3659        // sentinel (see the variant-hash + skipped-bool contract above).
3660        work_type: super::args::current_work_type(),
3661        verifier_stats: Vec::new(),
3662        kvm_stats: None,
3663        sysctls,
3664        kargs,
3665        kernel_version: detect_kernel_version(),
3666        kernel_commit: kernel_commit_for_sidecar(),
3667        timestamp: now_iso8601(),
3668        run_id: generate_run_id(),
3669        host: Some(crate::host_context::collect_host_context()),
3670        // Skip paths never reach `collect_results`, so cleanup
3671        // duration is undefined. Emit `null` per the sidecar's
3672        // symmetric serialize/deserialize contract.
3673        cleanup_duration_ms: None,
3674        run_source: detect_run_source(),
3675    };
3676    serialize_and_write_sidecar(&sidecar, "skip sidecar")
3677}
3678
3679/// Write a sidecar JSON file for post-run analysis.
3680///
3681/// Output goes to the current run's sidecar directory
3682/// (`KTSTR_SIDECAR_DIR` override, or
3683/// `{CARGO_TARGET_DIR or "target"}/ktstr/{kernel}-{project_commit}/`,
3684/// where `{project_commit}` is the project HEAD short hex with
3685/// `-dirty` when the worktree differs).
3686///
3687/// `payload_metrics` is the accumulated per-invocation output from
3688/// `ctx.payload(X).run()` / `.spawn().wait()` calls made in the
3689/// test body. Empty vec when the test body never called
3690/// `Ctx::payload` (scheduler-only tests, host-only probes).
3691///
3692/// Returns `Err` when the sidecar directory cannot be created, the
3693/// JSON cannot be serialized, or the file write fails. Callers that
3694/// ignore the Result accept the risk of stats-tooling blind spots on
3695/// this run.
3696pub(crate) fn write_sidecar(
3697    entry: &KtstrTestEntry,
3698    vm_result: &vmm::VmResult,
3699    stimulus_events: &[StimulusEvent],
3700    check_result: &AssertResult,
3701    work_type: &str,
3702    payload_metrics: &[PayloadMetrics],
3703    resolved_topology: &crate::vmm::topology::Topology,
3704) -> anyhow::Result<()> {
3705    let SchedulerFingerprint {
3706        scheduler,
3707        scheduler_commit,
3708        sysctls,
3709        kargs,
3710    } = scheduler_fingerprint(entry);
3711    let sidecar = SidecarResult {
3712        test_name: entry.name.to_string(),
3713        perf_delta_assertions: entry
3714            .perf_delta_assertions
3715            .iter()
3716            .map(|&a| a.into())
3717            .collect(),
3718        // The RESOLVED topology this run booted (resolve_vm_topology
3719        // result), NOT the declared entry.topology — a topology gauntlet
3720        // boots a distinct topology per preset, so the declared value
3721        // would collapse every preset to one variant_hash. resolved ==
3722        // declared for a plain test (no override).
3723        topology: resolved_topology.to_string(),
3724        scheduler,
3725        scheduler_commit,
3726        // Scheduler-resolution provenance, carried on VmResult from the
3727        // host eval path (run_ktstr_test_inner_impl resolves the binary
3728        // once and stamps the source), mirroring how vcpus / cpu_budget
3729        // ride VmResult to this stamp.
3730        resolve_source: vm_result.resolve_source.clone(),
3731        project_commit: detect_project_commit(),
3732        payload: entry.payload.map(|p| p.name.to_string()),
3733        metrics: payload_metrics.to_vec(),
3734        passed: check_result.is_pass(),
3735        skipped: check_result.is_skip(),
3736        inconclusive: check_result.is_inconclusive(),
3737        // Raw scenario verdict at write time; the dispatch-layer
3738        // finalize (finalize_sidecar_verdict) overwrites these bits with
3739        // the post-inversion outcome and sets expected_failure.
3740        expected_failure: false,
3741        stats: check_result.stats.clone(),
3742        monitor: vm_result.monitor.as_ref().map(|m| m.summary.clone()),
3743        periodic_fired: vm_result.periodic_fired,
3744        periodic_target: vm_result.periodic_target,
3745        vcpus: vm_result.vcpus,
3746        cpu_budget: vm_result.cpu_budget,
3747        stimulus_events: stimulus_events.to_vec(),
3748        work_type: work_type.to_string(),
3749        verifier_stats: vm_result.verifier_stats.clone(),
3750        kvm_stats: vm_result.kvm_stats.clone(),
3751        sysctls,
3752        kargs,
3753        kernel_version: detect_kernel_version(),
3754        kernel_commit: kernel_commit_for_sidecar(),
3755        timestamp: now_iso8601(),
3756        run_id: generate_run_id(),
3757        host: Some(crate::host_context::collect_host_context()),
3758        cleanup_duration_ms: vm_result.cleanup_duration.map(|d| d.as_millis() as u64),
3759        run_source: detect_run_source(),
3760    };
3761    serialize_and_write_sidecar(&sidecar, "sidecar")
3762}
3763
3764#[cfg(test)]
3765mod tests;
ktstr/test_support/sidecar/mod.rs

ktstr/test_support/sidecar/
mod.rs