ktstr/stats/
compare.rs

1use super::*;
2
/// One significant per-metric finding produced by [`compare_rows_by`].
///
/// `pairing_key` carries the dynamic identity the row pair joined
/// on — `scenario` plus every NON-slicing dimension's value. The
/// table renderer in [`compare_partitions`] decodes the key against
/// the slicing-dim list to produce a label like
/// `scenario/topology/work_type` (when topology + work_type are
/// pairing dims) or just `scenario` (when every other dim slices).
///
/// The `scenario` / `topology` / `work_type` fields carry the
/// matched row's values verbatim for legacy-shape consumers and
/// test fixtures that pre-date the dimensional-slicing refactor.
/// New code should read [`Finding::pairing_key`] directly so the
/// slicing-dim variation stays visible.
///
/// `metric` is the registry entry the comparison ran against;
/// consumers read polarity, display unit, and name through it
/// directly without re-looking up [`metric_def`].
#[derive(Debug, Clone, serde::Serialize)]
pub(crate) struct Finding {
    /// Dynamic join identity of the matched row pair (scenario plus
    /// every pairing-dimension value).
    pub pairing_key: PairingKey,
    /// Matched row's scenario, kept verbatim for legacy-shape consumers.
    pub scenario: String,
    /// Matched row's topology, kept verbatim for legacy-shape consumers.
    pub topology: String,
    /// Matched row's work type, kept verbatim for legacy-shape consumers.
    pub work_type: String,
    /// Registry entry the comparison ran against (polarity, unit, name).
    pub metric: &'static MetricDef,
    /// The metric's value on the A (baseline) side.
    pub val_a: f64,
    /// The metric's value on the B side.
    pub val_b: f64,
    /// Signed change between the sides — presumably `val_b - val_a`, the
    /// delta the dual gate is evaluated on; the construction site is
    /// outside this view, so confirm before relying on the sign.
    pub delta: f64,
    /// How the significant delta was classified; see [`FindingKind`].
    pub kind: FindingKind,
}
33
/// How a significant (past-dual-gate) delta is classified. A metric
/// becomes a [`Finding`] only after clearing the
/// dual gate; this says which kind. `Informational` is for a
/// [`Polarity::Informational`](crate::test_support::Polarity::Informational)
/// metric (`MetricDef::classify_direction` => `None`): the change is
/// shown but is NEVER a regression or improvement and never affects the
/// exit code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
pub(crate) enum FindingKind {
    /// A directional metric moved in the worse direction for its
    /// polarity (an increase when higher is worse, a decrease when
    /// lower is worse).
    Regression,
    /// A directional metric's significant delta that is not a
    /// regression — presumably a move in the better direction for its
    /// polarity; confirm against the classifier.
    Improvement,
    /// A significant change in a metric with no direction
    /// (`classify_direction` returned `None`); displayed only.
    Informational,
}
47
/// A metric present on exactly ONE side of a paired (scenario,
/// topology, work_type) row — a coverage difference, not a perf delta.
///
/// `MetricDef::read` returns `None` for a metric absent on a row and
/// `Some(v)` (including `Some(0.0)`) when present, so an absent metric
/// is distinguishable from a genuine zero. A metric present on one side
/// and absent on the other is NOT a regression/improvement: it never
/// had a comparable baseline. Recording it here (never gated, never
/// counted in `regressions`/`improvements`/`informational`) surfaces
/// the appear/disappear-between-runs case instead of either silently
/// dropping it or — as the pre-fix `read().unwrap_or(0.0)` did —
/// mis-flagging it as a directional verdict against a coerced-zero
/// side: an unbounded relative change when the absent side is the
/// baseline (rel-gate INFINITY), a bounded one otherwise (e.g. 5 -> 0
/// gives rel ~1.0) — either clears the gate and yields a phantom
/// regression or improvement (the direction follows the metric's
/// polarity, so it inverts between LowerBetter and HigherBetter).
#[derive(Debug, Clone, serde::Serialize)]
pub(crate) struct CoverageDiff {
    /// Dynamic join identity of the row pair the metric was read from.
    pub pairing_key: PairingKey,
    /// Scenario of the pair. The scalar pass copies this from the
    /// B-side row regardless of which side has the metric.
    pub scenario: String,
    /// Topology of the pair, copied from the B-side row by the scalar
    /// pass.
    pub topology: String,
    /// Work type of the pair, copied from the B-side row by the scalar
    /// pass.
    pub work_type: String,
    /// Registry entry of the metric that is present on only one side.
    pub metric: &'static MetricDef,
    /// The side that HAS the metric; the other side is absent.
    pub present_side: ComparePartition,
    /// The present side's value (the absent side has none).
    pub value: f64,
}
77
/// Aggregate result of comparing two row sets via [`compare_rows_by`].
///
/// `regressions` and `improvements` count significant entries in
/// `findings`; `unchanged` counts metrics that fell below the dual
/// gate; `excluded_pairs` counts paired (scenario, topology, work_type)
/// row pairs where either side is excluded from regression math —
/// `fail`, `inconclusive`, `skip`, or an inverted `expected_failure`
/// run (which passes but carries failure-mode-dominated telemetry) all
/// route here. The field name captures "excluded from regression math"
/// rather than encoding any of the four excluded states, because the
/// per-side disposition (which side, which state) is recoverable from
/// the individual `GauntletRow::is_*` / `expected_failure` accessors
/// when the operator drills in.
///
/// `new_in_b` counts B-side rows whose key has no match on the A side;
/// the converse is `removed_from_a`. The filter (when set) applies to
/// every counter, so excluded rows do not contribute.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub(crate) struct CompareReport {
    /// Number of significant findings classified as regressions.
    pub regressions: u32,
    /// Number of significant findings classified as improvements.
    pub improvements: u32,
    /// Significant changes in `Polarity::Informational` metrics — shown
    /// but never gated (excluded from `regressions`/`improvements` and
    /// the exit code).
    pub informational: u32,
    /// Number of compared metrics whose delta fell below the dual gate.
    pub unchanged: u32,
    /// Number of matched row pairs dropped from regression math because
    /// either side was a fail, inconclusive, skip, or expected-failure
    /// run.
    pub excluded_pairs: u32,
    /// Number of (filter-passing) B-side rows with no A-side match.
    pub new_in_b: u32,
    /// Number of (filter-passing) A-side rows with no B-side match.
    pub removed_from_a: u32,
    /// One entry per significant per-metric delta.
    pub findings: Vec<Finding>,
    /// Metrics present on exactly one side of a paired row (a metric
    /// appeared or disappeared between runs A and B). Never gated — not
    /// counted in `regressions`/`improvements`/`informational` and no
    /// effect on the exit code; surfaced so a coverage change is
    /// visible rather than silently dropped or mis-flagged as a
    /// regression from a zero baseline. See [`CoverageDiff`].
    pub coverage_diffs: Vec<CoverageDiff>,
}
116
/// Which side of an A/B comparison a row belongs to. Typed surface
/// for the per-phase rows so new code does not propagate the
/// `"A"` / `"B"` string-literal pattern the scalar-finding path
/// uses (kept as-is at the existing `"A"` / `"B"` call sites in this
/// module — `render_side_label`, `zero_match_diagnostic`).
#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize)]
pub(crate) enum ComparePartition {
    /// The A side — the baseline the B side is compared against.
    A,
    /// The B side — the run being compared against A.
    B,
}
127
128impl ComparePartition {
129    /// Render the side as the same one-letter label
130    /// `render_side_label` produces for the scalar table headers,
131    /// so the noise per-phase coverage rows and the scalar findings
132    /// table share the same operator-facing side identifier.
133    pub fn as_str(self) -> &'static str {
134        match self {
135            Self::A => "A",
136            Self::B => "B",
137        }
138    }
139}
140
/// Per-metric threshold policy driving `compare_rows` /
/// `compare_partitions`.
///
/// Resolution priority for a given metric's relative significance
/// threshold, highest first:
///
/// 1. `per_metric_percent[metric_name]` — explicit override for
///    this metric.
/// 2. `default_percent` — uniform override across every metric
///    not listed in the map (equivalent to the old `--threshold N`
///    CLI flag).
/// 3. The metric's built-in `default_rel` from the `METRICS`
///    registry — the "no policy" fallback.
///
/// Values in the struct are stored as PERCENT (e.g. `10.0` meaning
/// 10%), NOT fractions. [`Self::rel_threshold`] does the `/100.0`
/// conversion so every caller inside `compare_rows` reads a
/// fraction without re-deriving the division.
///
/// Note on the registry-fallback branch: the `default_rel` field
/// on `MetricDef` is already a FRACTION (e.g. `0.25` for 25%),
/// not a percent. `rel_threshold` returns it verbatim — it
/// does NOT divide by 100. Only the override branches
/// (per-metric map, `default_percent`) do the percent-to-fraction
/// conversion because their inputs are percents. This asymmetry
/// is deliberate so callers supplying CLI/file-based overrides
/// work in human-intuitive percent units while the registry
/// defaults (which already ship in fraction form) pass through
/// unchanged.
///
/// The struct is `serde::Serialize` / `serde::Deserialize` so
/// `cargo ktstr perf-delta --policy <path>` can load a
/// JSON-persisted policy file. Default construction produces an
/// empty policy that uses every registry default; [`Self::uniform`]
/// reproduces the old `--threshold N` behaviour without any
/// per-metric override plumbing at the call site.
///
/// Deserialization notes: `#[serde(default)]` lets a policy file omit
/// either field (the omitted field takes its `Default` value), and
/// `deny_unknown_fields` turns a misspelled top-level key into a parse
/// error instead of silently ignoring it. Value-level invariants are
/// checked separately by [`Self::validate`].
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
#[serde(default, deny_unknown_fields)]
pub struct ComparisonPolicy {
    /// Uniform override: when `Some(p)`, every metric whose name is
    /// NOT in [`Self::per_metric_percent`] uses `p / 100.0` as its
    /// relative threshold. `None` falls through to the registry
    /// `default_rel`. Stored as percent (e.g. `10.0` for 10%).
    pub default_percent: Option<f64>,
    /// Per-metric overrides keyed by metric name. Each value is a
    /// percent (e.g. `15.0` → 15%). An entry here takes precedence
    /// over both [`Self::default_percent`] and the registry
    /// `default_rel`.
    pub per_metric_percent: BTreeMap<String, f64>,
}
191
/// CLI-controlled rendering of the per-phase spread block in
/// `cargo ktstr perf-delta --noise-adjust`. Bundled as a struct
/// so the 5-flag clap surface threads through
/// `compare_partitions_noise` as a single positional rather than
/// five. Default value renders every phase / every metric / every
/// paired row — equivalent to passing no phase flags. All 5 flags
/// require `--noise-adjust` (per-phase output exists only there).
///
/// The flags compose via AND on independent axes (block-level
/// suppression × phase-id × row-significance), with three
/// mutex constraints enforced at CLI parse time:
///
/// - `--no-phases` excludes every other phase flag (the whole
///   block is suppressed; refining what to render is a
///   contradiction).
/// - `--phases-only` excludes `--no-phases` (same reason).
/// - `--steps-only` excludes `--phase` (one of them collapses
///   to a single bucket; the other suppresses BASELINE — both
///   together are confused phrasing).
///
/// The 5 flags trigger renderer behaviour ONLY — the
/// `--noise-adjust` per-phase pass always computes the full set
/// of `NoisePhaseFinding`s and coverage entries so programmatic
/// consumers see the unfiltered surface. Filtering is render-time
/// projection.
#[derive(Debug, Default, Clone)]
pub struct PhaseDisplayOptions {
    /// `--no-phases`: suppress the `--noise-adjust` per-phase
    /// spread block entirely. The aggregate spread table and
    /// footer render unchanged; the only effect is hiding the
    /// per-phase block (and its footer hint). Mutually exclusive
    /// with every other phase flag at CLI parse time.
    pub no_phases: bool,
    /// `--phases-only`: suppress the aggregate spread table and
    /// the host-context delta; render ONLY the per-phase spread
    /// block. Useful for narrowing investigation to a phase
    /// regression when the aggregate rollup is noise. Composes
    /// with `--steps-only`, `--phase`, and `--phase-threshold`.
    pub phases_only: bool,
    /// `--steps-only`: within the per-phase block, suppress
    /// the BASELINE bucket (`step_index == 0`); render only
    /// scenario Step buckets. Useful when the BASELINE settle
    /// window is dominated by scheduler startup transients.
    /// Mutually exclusive with `--phase`.
    pub steps_only: bool,
    /// `--phase <N>`: within the per-phase block, render only
    /// rows whose `step_index == N`. `0` selects BASELINE;
    /// `1..=N` selects scenario Step ordinals (1 → Step\[0\],
    /// 2 → Step\[1\], ...). Integer chosen over label so a label
    /// rename (`"Step[0]"` → `"Step:0"`) doesn't break operator
    /// CI invocations. Mutually exclusive with `--steps-only`.
    pub phase: Option<u16>,
    /// `--phase-threshold <PCT>`: render-side relative-spread
    /// gate for the `--noise-adjust` per-phase pass. Suppresses
    /// paired rows where `|delta-mean| / |a.mean| < PCT / 100.0`;
    /// a value from a ~zero baseline (`|a.mean| < ZERO_MEAN_EPS`)
    /// is an unbounded relative change and clears any finite
    /// threshold. `0.0` shows every paired row.
    ///
    /// When absent, the two consumers differ: [`Self::rel_threshold`]
    /// defers to the [`ComparisonPolicy`] resolution (per-metric
    /// override → `default_percent` → registry `default_rel`), while
    /// `passes_noise_spread_threshold` applies no gate at all and
    /// lets every row through.
    ///
    /// When set, it is independent from `--threshold` — the
    /// aggregate and per-phase passes have separate filters so an
    /// operator can widen the per-phase view without widening the
    /// aggregate view.
    pub phase_threshold: Option<f64>,
}
257
258impl PhaseDisplayOptions {
259    /// Resolve the per-phase relative threshold for a given
260    /// metric. Returns the override fraction when
261    /// `phase_threshold` is set, else falls through to the
262    /// `ComparisonPolicy` resolution the scalar pass uses. The
263    /// `metric_name` + `default_rel` shape mirrors
264    /// [`ComparisonPolicy::rel_threshold`] so the two surfaces
265    /// stay symmetric.
266    pub fn rel_threshold(
267        &self,
268        policy: &ComparisonPolicy,
269        metric_name: &str,
270        default_rel: f64,
271    ) -> f64 {
272        match self.phase_threshold {
273            Some(pct) => pct / 100.0,
274            None => policy.rel_threshold(metric_name, default_rel),
275        }
276    }
277
278    /// True when a phase row at the given `step_index` should
279    /// render under the current display flags. Combines the two
280    /// step-axis predicates (`--phase <N>` filter and
281    /// `--steps-only` BASELINE-suppressor) into a single
282    /// row-level decision the renderer applies uniformly across
283    /// the `--noise-adjust` per-phase findings and coverage rows.
284    /// Returns `true` when no relevant flag is set (default
285    /// path: every step renders).
286    pub fn matches_phase(&self, step_index: u16) -> bool {
287        if let Some(want) = self.phase
288            && step_index != want
289        {
290            return false;
291        }
292        if self.steps_only && step_index == 0 {
293            return false;
294        }
295        true
296    }
297
298    /// The `--phase-threshold` relative-spread gate for a
299    /// [`NoisePhaseFinding`]'s verdict: `|b.mean - a.mean| / |a.mean| >=
300    /// phase_threshold / 100`. A move from a ~zero baseline (`|a.mean| <
301    /// ZERO_MEAN_EPS`) is unbounded → shown; both ~zero carries no signal →
302    /// filtered by any positive threshold. Returns `true` when no flag is set.
303    /// The noise row carries per-side means (a [`NoiseVerdict`]), so the gate
304    /// works on the mean delta rather than a single-run row delta.
305    pub(crate) fn passes_noise_spread_threshold(&self, verdict: &NoiseVerdict) -> bool {
306        let Some(pct) = self.phase_threshold else {
307            return true;
308        };
309        let a = verdict.a.mean.abs();
310        let delta = (verdict.b.mean - verdict.a.mean).abs();
311        let rel = if a > ZERO_MEAN_EPS {
312            delta / a
313        } else if delta > ZERO_MEAN_EPS {
314            f64::INFINITY
315        } else {
316            0.0
317        };
318        rel >= pct / 100.0
319    }
320}
321
322impl ComparisonPolicy {
323    /// Empty policy — every metric uses its `METRICS` registry
324    /// default. Equivalent to the old `--threshold None` CLI path.
325    pub fn new() -> Self {
326        Self::default()
327    }
328
329    /// Uniform override: every metric uses `percent / 100.0`.
330    /// Mirrors the old `--threshold N` CLI behaviour; the CLI
331    /// dispatch at `cargo ktstr perf-delta --threshold N`
332    /// constructs a policy via this constructor.
333    pub fn uniform(percent: f64) -> Self {
334        Self {
335            default_percent: Some(percent),
336            per_metric_percent: BTreeMap::new(),
337        }
338    }
339
340    /// Load a JSON-persisted policy from a file. Errors propagate
341    /// the read / parse reason as an `anyhow::Error` with the file
342    /// path in the context chain so a malformed `--policy path.json`
343    /// surfaces an actionable message rather than a generic
344    /// "invalid JSON."
345    ///
346    /// Validates after parsing via [`Self::validate`]: rejects
347    /// negative thresholds (a misconfigured 10 vs -10 would
348    /// invert the dual-gate logic at the `.abs() >= rel_thresh`
349    /// check and silently classify every metric as significant)
350    /// and rejects per-metric keys not registered in `METRICS`
351    /// (a typo like `"wrost_spread"` would otherwise be silently
352    /// ignored — the key simply never matches during resolution
353    /// and the metric falls through to `default_percent`).
354    pub fn load_json(path: &std::path::Path) -> anyhow::Result<Self> {
355        use anyhow::Context;
356        let data = std::fs::read_to_string(path)
357            .with_context(|| format!("read comparison policy from {}", path.display()))?;
358        let policy: ComparisonPolicy = serde_json::from_str(&data)
359            .with_context(|| format!("parse comparison policy from {}", path.display()))?;
360        policy
361            .validate()
362            .with_context(|| format!("validate comparison policy from {}", path.display()))?;
363        Ok(policy)
364    }
365
366    /// Structural validation separate from parsing so both the
367    /// `load_json` path and programmatic constructors (after
368    /// [`Self::uniform`] with a user-supplied percent) can share
369    /// one set of invariants without re-implementing checks at
370    /// each call site. Called automatically by [`Self::load_json`];
371    /// CLI dispatch should call it after constructing via
372    /// [`Self::uniform`] to catch `--threshold -10` at the
373    /// entry point rather than deep inside `compare_rows` where
374    /// the dual-gate math silently misbehaves.
375    ///
376    /// Rejects:
377    /// - Negative `default_percent` (nonsensical — thresholds are
378    ///   absolute-value comparisons).
379    /// - Negative entries in `per_metric_percent`.
380    /// - Per-metric keys not in the `METRICS` registry (silent
381    ///   typos would otherwise fall through to `default_percent`
382    ///   unnoticed).
383    pub fn validate(&self) -> anyhow::Result<()> {
384        if let Some(p) = self.default_percent
385            && p < 0.0
386        {
387            anyhow::bail!(
388                "ComparisonPolicy: default_percent must be non-negative; got {p}. \
389                 Thresholds are absolute-value comparisons — a negative value \
390                 would invert the dual-gate logic and silently classify every \
391                 delta as significant."
392            );
393        }
394        for (name, p) in &self.per_metric_percent {
395            if !METRICS.iter().any(|m| m.name == name) {
396                let known: Vec<&str> = METRICS.iter().map(|m| m.name).collect();
397                anyhow::bail!(
398                    "ComparisonPolicy: per_metric_percent contains unknown \
399                     metric `{name}`. A typo in the key would silently fall \
400                     through to default_percent. Registered metrics: {}",
401                    known.join(", "),
402                );
403            }
404            if *p < 0.0 {
405                anyhow::bail!(
406                    "ComparisonPolicy: per_metric_percent[{name:?}] must be \
407                     non-negative; got {p}",
408                );
409            }
410        }
411        Ok(())
412    }
413
414    /// Resolve the mutually-exclusive `--threshold` / `--policy` CLI
415    /// pair into a policy: `--threshold N` is sugar for a uniform N%
416    /// default (validated for sign); `--policy PATH` loads a
417    /// per-metric JSON policy; neither falls through to the registry
418    /// defaults. Shared by every subcommand that accepts the pair
419    /// (`perf-delta`) so the resolution rules — and
420    /// the "exactly one of the two" contract — live in one place.
421    ///
422    /// Both flags set is rejected with an error. At the CLI call
423    /// sites clap `conflicts_with` makes that unreachable, but this is
424    /// a library entry point and must not panic on its inputs; the
425    /// error is the defence-in-depth backstop.
426    pub fn from_cli_flags(
427        threshold: Option<f64>,
428        policy: Option<&std::path::Path>,
429    ) -> anyhow::Result<Self> {
430        match (threshold, policy) {
431            (Some(t), None) => {
432                let p = Self::uniform(t);
433                p.validate()?;
434                Ok(p)
435            }
436            (None, Some(path)) => Self::load_json(path),
437            (None, None) => Ok(Self::default()),
438            (Some(_), Some(_)) => anyhow::bail!(
439                "--threshold and --policy are mutually exclusive; use --policy \
440                 for per-metric overrides"
441            ),
442        }
443    }
444
445    /// Resolve the relative threshold (as a fraction, e.g. `0.10`
446    /// for 10%) for `metric_name` with `default_rel` as the
447    /// registry-level fallback. Handles the percent→fraction
448    /// conversion so `compare_rows_by` does not need to re-derive
449    /// `p / 100.0` at every call site.
450    pub fn rel_threshold(&self, metric_name: &str, default_rel: f64) -> f64 {
451        if let Some(p) = self.per_metric_percent.get(metric_name) {
452            p / 100.0
453        } else if let Some(p) = self.default_percent {
454            p / 100.0
455        } else {
456            default_rel
457        }
458    }
459}
460
461/// Compare two row sets metric-by-metric, parametrised on
462/// `pairing_dims`.
463///
464/// Pure function: no I/O, no globals. Two rows pair iff their
465/// [`PairingKey`] (scenario + every value for each dimension in
466/// `pairing_dims`) is equal — this is the dimensional-slicing
467/// pipeline's join primitive, with slicing dims EXCLUDED from
468/// `pairing_dims` so rows on the A/B sides that differ on those
469/// dims still pair as long as they agree on every non-slicing
470/// dim. When `filter` is `Some(s)`, a row is included only if
471/// `s` appears as a substring of the joined `"scenario topology
472/// scheduler work_type"` string. The scheduler is
473/// searchable via the substring filter but is not part of the
474/// pairing key by default (only when `Dimension::Scheduler` is
475/// in `pairing_dims`), so the same scenario+topology+work_type
476/// pair compares correctly across different scheduler binaries
477/// when the filter does not constrain it.
478///
479/// Row-pair accounting:
480/// - B-side rows with no A-side match are counted in `new_in_b`.
481/// - A-side rows with no B-side match are counted in `removed_from_a`
482///   (a separate pass over `rows_a`).
483/// - Paired rows where either side has `passed=false` are dropped
484///   from the regression math and counted in `excluded_pairs`: a
485///   failed scenario's metrics reflect the failure mode (short run,
486///   stalled workload, missing samples), not the scheduler's
487///   behavior.
488///
489/// The filter (when set) applies to every counter -- excluded rows
490/// never reach the matching, pass, or metric stages.
491///
492/// `policy` carries the comparison thresholds. See
493/// [`ComparisonPolicy`] for the resolution rules — per-metric
494/// override → `default_percent` → registry `default_rel`. The
495/// absolute gate always uses the metric's `default_abs`. A delta
496/// must clear both gates to count as significant.
497pub(crate) fn compare_rows_by(
498    rows_a: &[GauntletRow],
499    rows_b: &[GauntletRow],
500    pairing_dims: &[Dimension],
501    filter: Option<&str>,
502    policy: &ComparisonPolicy,
503) -> CompareReport {
504    let mut report = CompareReport::default();
505
506    // Build a HashMap<PairingKey, &GauntletRow> from rows_a once so
507    // each row_b lookup is O(1) instead of O(rows_a). `or_insert_with`
508    // preserves first-match semantics from the prior `rows_a.iter().find()`
509    // call: on the rare path where two A-side rows share a key (the
510    // averaging path folds same-key rows into one mean, so a
511    // shared key is not normally reachable), the
512    // earlier-iterated row wins.
513    let mut a_by_key: HashMap<PairingKey, &GauntletRow> = HashMap::with_capacity(rows_a.len());
514    for row_a in rows_a {
515        let key = PairingKey::from_row(row_a, pairing_dims);
516        a_by_key.entry(key).or_insert(row_a);
517    }
518
519    // Hoist the per-metric relative threshold out of the row×metric
520    // loop. `policy.rel_threshold(m.name, m.default_rel)` is a pure
521    // function of the metric — recomputing it for every row pair was
522    // O(rows_b × METRICS) BTreeMap probes for nothing.
523    let rel_thresholds: Vec<f64> = METRICS
524        .iter()
525        .map(|m| policy.rel_threshold(m.name, m.default_rel))
526        .collect();
527    // Same hoist for the render-suppression predicate: it is a pure
528    // function of the metric (a small fixed-slice membership scan), so
529    // probing it per (row_b x metric) re-ran the scan for nothing.
530    let suppressed: Vec<bool> = METRICS
531        .iter()
532        .map(|m| is_render_suppressed_component(m.name))
533        .collect();
534
535    for row_b in rows_b {
536        // Dynamic pairing key: scenario + every NON-slicing
537        // dimension's value. Two rows pair iff their dynamic keys
538        // match.
539        let key_b = PairingKey::from_row(row_b, pairing_dims);
540        if let Some(f) = filter {
541            // Substring filter joins all identity-bearing fields —
542            // including the SLICING dim values — so an operator
543            // can narrow by any visible field via `-E`.
544            let joined = format!(
545                "{} {} {} {}",
546                row_b.scenario, row_b.topology, row_b.scheduler, row_b.work_type,
547            );
548            if !joined.contains(f) {
549                continue;
550            }
551        }
552        let Some(&row_a) = a_by_key.get(&key_b) else {
553            report.new_in_b += 1;
554            continue;
555        };
556
557        // Drop from regression math when either side is a skip,
558        // inconclusive, failure, or an inverted expected_failure run.
559        // Skips carry no executed metrics
560        // (the run didn't happen); inconclusive runs ran but lacked
561        // signal to evaluate (zero-denominator ratio gate); failures
562        // carry telemetry dominated by the failure mode (short run,
563        // stalled workload), not the scheduler's behavior. An
564        // expected_failure run has `passed == true` but its telemetry
565        // is likewise failure-mode-dominated (short / stalled run), so
566        // it is excluded despite passing —
567        // comparing any of these against a real run produces
568        // meaningless deltas.
569        if row_a.is_fail()
570            || row_b.is_fail()
571            || row_a.is_inconclusive()
572            || row_b.is_inconclusive()
573            || row_a.is_skip()
574            || row_b.is_skip()
575            || row_a.expected_failure
576            || row_b.expected_failure
577        {
578            report.excluded_pairs += 1;
579            continue;
580        }
581
582        push_scalar_findings(
583            &mut report,
584            row_a,
585            row_b,
586            &key_b,
587            &rel_thresholds,
588            &suppressed,
589        );
590    }
591
592    // Second pass: A-side rows whose key has no match on the B side.
593    // Filter applies here too, so rows excluded by the filter never
594    // count as removed. Build a HashSet<PairingKey> from rows_b once
595    // so the existence check is O(1) per row_a; rows_b are inserted
596    // unfiltered to preserve prior behaviour where a row_b that fails
597    // the substring filter still suppresses a same-key row_a's
598    // removed_from_a increment (the substring filter compares against
599    // identity-bearing fields including slicing dims, so two rows
600    // sharing a pairing key can disagree on filter membership).
601    let b_keys: HashSet<PairingKey> = rows_b
602        .iter()
603        .map(|r| PairingKey::from_row(r, pairing_dims))
604        .collect();
605    for row_a in rows_a {
606        let key_a = PairingKey::from_row(row_a, pairing_dims);
607        if let Some(f) = filter {
608            let joined = format!(
609                "{} {} {} {}",
610                row_a.scenario, row_a.topology, row_a.scheduler, row_a.work_type,
611            );
612            if !joined.contains(f) {
613                continue;
614            }
615        }
616        if !b_keys.contains(&key_a) {
617            report.removed_from_a += 1;
618        }
619    }
620
621    report
622}
623
624/// Append the scalar per-metric findings for one matched `(row_a,
625/// row_b)` pair to `report`. Indexed by the `METRICS` enumerate
626/// position: `rel_thresholds[i]` is the hoisted relative threshold
627/// and `suppressed[i]` the hoisted render-suppression flag for the
628/// i-th metric (both built once by [`compare_rows_by`] over the same
629/// `METRICS` order). Bumps `report.unchanged` for sub-dual-gate
630/// deltas and `report.regressions` / `report.improvements` per
631/// metric polarity for the rest, pushing a [`Finding`] for each
632/// significant delta.
633fn push_scalar_findings(
634    report: &mut CompareReport,
635    row_a: &GauntletRow,
636    row_b: &GauntletRow,
637    key_b: &PairingKey,
638    rel_thresholds: &[f64],
639    suppressed: &[bool],
640) {
641    for (i, m) in METRICS.iter().enumerate() {
642        // Rate components are internal plumbing — suppressed from compare
643        // output (they remain in storage for the cross-run re-pool).
644        if suppressed[i] {
645            continue;
646        }
647        // `read` returns `None` for a metric absent on a row and `Some(v)`
648        // (including `Some(0.0)`) when present, so absent is distinguishable
649        // from a genuine zero. A metric present on exactly one side is a
650        // coverage difference, NOT a delta: record it (never gated) and skip
651        // the directional verdict. The pre-fix `unwrap_or(0.0)` coerced an
652        // absent side to 0.0, producing a phantom directional verdict: when
653        // the absent side was the baseline (val_a==0) the rel-gate's INFINITY
654        // branch fired (unbounded), and otherwise rel_delta was bounded (e.g.
655        // 5 -> 0 gives |(-5)/5|=1.0); either cleared both gates and yielded a
656        // phantom regression or improvement (direction per the metric's
657        // polarity) for a metric simply not captured on one side.
658        let (val_a, val_b) = match (m.read(row_a), m.read(row_b)) {
659            (Some(a), Some(b)) => (a, b),
660            (None, None) => continue,
661            (Some(a), None) => {
662                report.coverage_diffs.push(CoverageDiff {
663                    pairing_key: key_b.clone(),
664                    scenario: row_b.scenario.clone(),
665                    topology: row_b.topology.clone(),
666                    work_type: row_b.work_type.clone(),
667                    metric: m,
668                    present_side: ComparePartition::A,
669                    value: a,
670                });
671                continue;
672            }
673            (None, Some(b)) => {
674                report.coverage_diffs.push(CoverageDiff {
675                    pairing_key: key_b.clone(),
676                    scenario: row_b.scenario.clone(),
677                    topology: row_b.topology.clone(),
678                    work_type: row_b.work_type.clone(),
679                    metric: m,
680                    present_side: ComparePartition::B,
681                    value: b,
682                });
683                continue;
684            }
685        };
686        // Both sides negligible (under ZERO_MEAN_EPS, the domain zero epsilon —
687        // not f64::EPSILON, the machine ulp near 1.0): no signal, skip without
688        // counting.
689        if val_a.abs() < ZERO_MEAN_EPS && val_b.abs() < ZERO_MEAN_EPS {
690            continue;
691        }
692
693        let rel_thresh = rel_thresholds[i];
694
695        let delta = val_b - val_a;
696        let rel_delta = if val_a.abs() > ZERO_MEAN_EPS {
697            (delta / val_a).abs()
698        } else {
699            // A non-negligible value (val_b — the both-zero case is skipped
700            // above) appearing from a ~zero baseline is an unbounded relative
701            // change, not "unchanged". INFINITY clears the rel gate so the
702            // absolute gate alone decides whether this delta is significant.
703            f64::INFINITY
704        };
705
706        if delta.abs() < m.default_abs || rel_delta < rel_thresh {
707            report.unchanged += 1;
708            continue;
709        }
710
711        // Verdict: dual-gate already passed above (significant). An
712        // Informational metric (classify_direction => None) is recorded
713        // and displayed but NEVER counted as regression/improvement and
714        // NEVER affects the exit code; a directional metric splits on its
715        // polarity.
716        let kind = match m.classify_direction() {
717            None => {
718                report.informational += 1;
719                FindingKind::Informational
720            }
721            Some(higher_is_worse) => {
722                let is_regression = if higher_is_worse {
723                    delta > 0.0
724                } else {
725                    delta < 0.0
726                };
727                if is_regression {
728                    report.regressions += 1;
729                    FindingKind::Regression
730                } else {
731                    report.improvements += 1;
732                    FindingKind::Improvement
733                }
734            }
735        };
736        report.findings.push(Finding {
737            pairing_key: key_b.clone(),
738            scenario: row_b.scenario.clone(),
739            topology: row_b.topology.clone(),
740            work_type: row_b.work_type.clone(),
741            metric: m,
742            val_a,
743            val_b,
744            delta,
745            kind,
746        });
747    }
748}
749
750/// Emit a stderr warning naming any `-dirty` commit values present
751/// in the partitioned rows so the operator knows the comparison
752/// includes builds whose source tree may not match the recorded
753/// HEAD.
754///
755/// Scans `commit` (project HEAD) and `kernel_commit` (kernel source
756/// tree HEAD) on both sides' rows, dedupes the surviving values,
757/// and emits one warning block listing each distinct dirty value
758/// per dimension. Emits at most one block — silent when no row
759/// carries a `-dirty` suffix on either dimension.
760///
761/// Dirty runs reuse the same sidecar filename as their clean HEAD
762/// (the variant hash excludes `commit` / `kernel_commit` per
763/// `crate::test_support::sidecar`), so re-running the same test
764/// from a dirty tree overwrites the previous record. The warning
765/// surfaces this so an operator can decide whether to commit the
766/// working tree before re-running for a reproducible comparison.
767///
768/// Splits collection from emission via [`render_dirty_warning`] so
769/// unit tests can pin the rendered text without trapping `stderr`.
770fn warn_on_dirty_builds(rows_a: &[GauntletRow], rows_b: &[GauntletRow]) {
771    if let Some(text) = render_dirty_warning(rows_a, rows_b) {
772        eprint!("{text}");
773    }
774}
775
776/// Emit the CPU-budget hazard warning for a comparison, if any.
777/// Pure-render half is [`render_overcommit_warning`]; this only
778/// `eprint!`s it, mirroring [`warn_on_dirty_builds`].
779fn warn_on_overcommit(rows_a: &[GauntletRow], rows_b: &[GauntletRow], pairing_dims: &[Dimension]) {
780    if let Some(text) = render_overcommit_warning(rows_a, rows_b, pairing_dims) {
781        eprint!("{text}");
782    }
783}
784
785/// Build the CPU-budget hazard warning from the filtered compare
786/// sides, or `None` when neither hazard is present.
787///
788/// Two independent hazards, both read from [`GauntletRow::cpu_budget`]
789/// / [`GauntletRow::vcpus`] — the consumers that make those fields
790/// load-bearing on the compare path:
791///
792/// - OVERCOMMIT (`cpu_budget < vcpus`): the host time-sliced that
793///   run's vCPU threads, so its wake-latency / off-CPU / run-delay
794///   timing metrics are host-contention artifacts, not scheduler
795///   signal (see [`crate::vmm::host_topology::overcommit_warning`]).
796///   Always flagged when present on either side: comparing raw timing
797///   from an overcommitted run is the silent-wrong-answer the budget
798///   stamp exists to surface.
799/// - MIXED BUDGET: a single pairing group on a side holds more than
800///   one distinct non-skip budget. [`group_and_average_by`] folds rows
801///   that share a full [`PairingKey`], so this is exactly the set the
802///   averaging fold combines across budgets. It only arises
803///   when [`Dimension::CpuBudget`] is NOT a pairing dim (the operator
804///   sliced on cpu-budget, dropping it from the key); when it IS a
805///   pairing dim, each budget keys its own group and is never folded.
806///   Detection is per pairing group, NOT side-wide: two rows of
807///   different scenarios (or any differing pairing dim) carry different
808///   keys and never average, so a side merely spanning budgets across
809///   distinct groups is not flagged.
810///
811/// Skip rows (budget 0 -> `None` in [`sidecar_to_row`]) carry no
812/// budget identity and are ignored by both checks. Split from
813/// emission so a unit test pins the text and the `None`-when-clean
814/// polarity without trapping stderr, mirroring [`render_dirty_warning`].
815pub(crate) fn render_overcommit_warning(
816    rows_a: &[GauntletRow],
817    rows_b: &[GauntletRow],
818    pairing_dims: &[Dimension],
819) -> Option<String> {
820    use std::collections::BTreeSet;
821    use std::fmt::Write;
822
823    // Side-wide: the distinct overcommitted (budget, vcpus) pairs.
824    let overcommitted = |rows: &[GauntletRow]| -> BTreeSet<(u32, u32)> {
825        let mut over = BTreeSet::new();
826        for r in rows {
827            if let (Some(b), Some(v)) = (r.cpu_budget, r.vcpus)
828                && b < v
829            {
830                over.insert((b, v));
831            }
832        }
833        over
834    };
835
836    // Per pairing group: the union of budgets across groups that hold
837    // >1 distinct budget — exactly the budgets the averaging fold
838    // combines into one mean. Empty when CpuBudget is a pairing dim (each budget keys
839    // its own group, so no group ever holds two).
840    let cpu_budget_is_pairing = pairing_dims.contains(&Dimension::CpuBudget);
841    let mixed_folded = |rows: &[GauntletRow]| -> BTreeSet<u32> {
842        let mut folded = BTreeSet::new();
843        if cpu_budget_is_pairing {
844            return folded;
845        }
846        let mut by_key: std::collections::HashMap<PairingKey, BTreeSet<u32>> =
847            std::collections::HashMap::new();
848        for r in rows {
849            if let Some(b) = r.cpu_budget {
850                by_key
851                    .entry(PairingKey::from_row(r, pairing_dims))
852                    .or_default()
853                    .insert(b);
854            }
855        }
856        for budgets in by_key.values() {
857            if budgets.len() > 1 {
858                folded.extend(budgets.iter().copied());
859            }
860        }
861        folded
862    };
863
864    let over_a = overcommitted(rows_a);
865    let over_b = overcommitted(rows_b);
866    let mixed_a = mixed_folded(rows_a);
867    let mixed_b = mixed_folded(rows_b);
868
869    if over_a.is_empty() && over_b.is_empty() && mixed_a.is_empty() && mixed_b.is_empty() {
870        return None;
871    }
872
873    let any_overcommit = !over_a.is_empty() || !over_b.is_empty();
874    let mut out = String::new();
875    if any_overcommit {
876        // Host time-slicing actually occurred -> raw timing is confounded.
877        let _ = writeln!(
878            out,
879            "ktstr: WARNING: CPU-budget hazard in this comparison — a run was \
880             host-overcommitted, so its guest-scheduler timing metrics \
881             (wake-latency / off-CPU / run-delay) are host-contention-confounded. \
882             Compare the overcommit-invariant worst_iterations_per_cpu_sec metric \
883             instead of raw \
884             timing."
885        );
886    } else {
887        // Mixed budgets with NO overcommit: no host contention, the hazard is
888        // collapsing two different measurement conditions into one number.
889        let _ = writeln!(
890            out,
891            "ktstr: WARNING: CPU-budget hazard in this comparison — runs of \
892             different CPU budgets share a pairing group, mixing two measurement \
893             conditions. Slice with --cpu-budget, or compare the budget-invariant \
894             worst_iterations_per_cpu_sec metric."
895        );
896    }
897    let mut emit_side = |label: &str, over: &BTreeSet<(u32, u32)>, mixed: &BTreeSet<u32>| {
898        if !over.is_empty() {
899            let list = over
900                .iter()
901                .map(|(b, v)| format!("{b}/{v}"))
902                .collect::<Vec<_>>()
903                .join(", ");
904            let _ = writeln!(
905                out,
906                "  side {label}: host-overcommitted run(s) [budget/vcpus]: {list}"
907            );
908        }
909        if !mixed.is_empty() {
910            let list = mixed
911                .iter()
912                .map(|b| b.to_string())
913                .collect::<Vec<_>>()
914                .join(", ");
915            let _ = writeln!(
916                out,
917                "  side {label}: CPU budgets [{list}] share a pairing group — \
918                 the average fold collapses them into one mean; slice with --cpu-budget so cross-budget runs are \
919                 not compared under one key"
920            );
921        }
922    };
923    emit_side("A", &over_a, &mixed_a);
924    emit_side("B", &over_b, &mixed_b);
925    Some(out)
926}
927
928/// Build the dirty-builds warning block from row data.
929///
930/// Returns `None` when no row on either side carries a `-dirty`
931/// suffix on either `commit` or `kernel_commit`. Otherwise returns
932/// the full multi-line warning text — the body emitted to stderr by
933/// [`warn_on_dirty_builds`] — terminated with a trailing newline so
934/// the caller can `eprint!` it without further formatting.
935///
936/// Dimensions render in fixed order ("kernel source" before
937/// "project") so the same dirty hashes always produce byte-identical
938/// output across runs; values within each dimension are
939/// `BTreeSet`-deduped so multiple rows sharing one dirty hash list
940/// it once, and multiple distinct dirty hashes on one dimension list
941/// in lex order.
942pub(crate) fn render_dirty_warning(
943    rows_a: &[GauntletRow],
944    rows_b: &[GauntletRow],
945) -> Option<String> {
946    use std::collections::BTreeSet;
947    use std::fmt::Write;
948
949    let mut dirty_kernel: BTreeSet<&str> = BTreeSet::new();
950    let mut dirty_project: BTreeSet<&str> = BTreeSet::new();
951    for row in rows_a.iter().chain(rows_b.iter()) {
952        // `ends_with` matches the producer contract: `detect_kernel_commit`
953        // and `detect_project_commit` (src/test_support/sidecar/mod.rs) append
954        // `-dirty` as a SUFFIX to the 7-char hex via
955        // `format!("{short_hash}-dirty")`, so the dirty marker is
956        // always tail-positioned. `contains` would also match a
957        // hex hash that legitimately contains the substring `-dirty`
958        // somewhere in the middle (impossible for the current
959        // 7-char hex prefix, but a future commit-ish format change
960        // would let a non-dirty value flag itself dirty under
961        // `contains`).
962        if let Some(c) = row.kernel_commit.as_deref()
963            && c.ends_with("-dirty")
964        {
965            dirty_kernel.insert(c);
966        }
967        if let Some(c) = row.commit.as_deref()
968            && c.ends_with("-dirty")
969        {
970            dirty_project.insert(c);
971        }
972    }
973
974    if dirty_kernel.is_empty() && dirty_project.is_empty() {
975        return None;
976    }
977
978    let mut out = String::new();
979    writeln!(out, "warning: comparison includes dirty builds:").unwrap();
980    for v in &dirty_kernel {
981        writeln!(
982            out,
983            "  - kernel source: {v} (working tree may have changed since this run)"
984        )
985        .unwrap();
986    }
987    for v in &dirty_project {
988        writeln!(
989            out,
990            "  - project: {v} (working tree may have changed since this run)"
991        )
992        .unwrap();
993    }
994    writeln!(
995        out,
996        "  Dirty runs overwrite previous results with the same HEAD."
997    )
998    .unwrap();
999    writeln!(out, "  Commit changes for reproducible-ish comparisons.").unwrap();
1000    Some(out)
1001}
1002
1003/// Render the actionable bail message emitted when one side's filter
1004/// matches zero sidecars in the pool.
1005///
1006/// Beyond the generic "check filters / run `cargo ktstr stats list`"
1007/// redirect, this helper inspects WHY the filter matched nothing and
1008/// adds three operator-actionable hints when applicable:
1009///
1010/// 1. **Dirty-form hint**: when the user passed
1011///    `--project-commit X` (or per-side / kernel-commit equivalent)
1012///    and the pool contains a row whose `commit` (or `kernel_commit`)
1013///    is `X-dirty`, append "Did you mean `--project-commit X-dirty`?".
1014///    A clean-vs-dirty mismatch is the single most common cause of a
1015///    false-zero on the commit dims — `detect_project_commit` /
1016///    `detect_kernel_commit` append `-dirty` whenever HEAD-vs-index
1017///    or index-vs-worktree changes are observed, so an operator who
1018///    expected `abcdef1` but the recorded value is `abcdef1-dirty`
1019///    sees no rows match without realizing why.
1020///
1021/// 2. **Unknown run-source hint**: when the user passed
1022///    `--run-source X` (or per-side equivalent) and `X` is NOT
1023///    among the distinct `run_source` values present in the pool,
1024///    append a hint listing the actual values seen. The schema is
1025///    deliberately extensible (`"benchmark"` and other future tags
1026///    are valid), so this is a hint rather than a hard validator —
1027///    but a typo (`--run-source loca` for `local`, or `--run-source CI`
1028///    for `ci` since the values are case-sensitive) is the most
1029///    common cause of a false-zero on the source dim, and listing
1030///    the distinct values present is more actionable than asking
1031///    the operator to consult the schema doc.
1032///
1033/// 3. **list-values redirect for commit dims**: when the user
1034///    populated any commit dimension (`project_commits` /
1035///    `kernel_commits`), suggest `cargo ktstr stats list-values`
1036///    specifically — that command emits the exact distinct values
1037///    present per dimension, which is more actionable than the
1038///    generic `stats list` which only shows top-level run keys.
1039///
1040/// `side` is `"A"` or `"B"` for diagnostic context. `filter` is the
1041/// per-side `RowFilter`. `rows` is the sidecar-derived row vec
1042/// (post-`sidecar_to_row` mapping, pre-filtering). `pool_len` is
1043/// the raw pool count for the "(N pooled)" diagnostic context.
1044pub(crate) fn zero_match_diagnostic(
1045    side: &str,
1046    filter: &RowFilter,
1047    rows: &[GauntletRow],
1048    pool_len: usize,
1049) -> String {
1050    let mut msg = format!(
1051        "perf-delta: {side} side filter matched 0 sidecars in \
1052         pool ({pool_len} pooled). Check the per-side filters or \
1053         confirm the runs exist with `cargo ktstr stats list`."
1054    );
1055
1056    // Dirty-form hint per commit dimension. Only fires when a
1057    // populated filter value's `-dirty` form is in the pool.
1058    let mut dirty_hints: Vec<String> = Vec::new();
1059    for want in &filter.project_commits {
1060        let dirty = format!("{want}-dirty");
1061        let found = rows
1062            .iter()
1063            .any(|r| r.commit.as_deref() == Some(dirty.as_str()));
1064        if found {
1065            dirty_hints.push(format!(
1066                "no rows match `--project-commit {want}` but `{dirty}` exists in the pool — \
1067                 did you mean `--project-commit {dirty}`?"
1068            ));
1069        }
1070    }
1071    for want in &filter.kernel_commits {
1072        let dirty = format!("{want}-dirty");
1073        let found = rows
1074            .iter()
1075            .any(|r| r.kernel_commit.as_deref() == Some(dirty.as_str()));
1076        if found {
1077            dirty_hints.push(format!(
1078                "no rows match `--kernel-commit {want}` but `{dirty}` exists in the pool — \
1079                 did you mean `--kernel-commit {dirty}`?"
1080            ));
1081        }
1082    }
1083    for hint in dirty_hints {
1084        msg.push_str("\nhint: ");
1085        msg.push_str(&hint);
1086    }
1087
1088    // Unknown-run-source hint. Fires when a `--run-source X` value
1089    // is not present in the pool — typo / wrong casing is the most
1090    // common cause. Schema is intentionally extensible (operators
1091    // can write `"benchmark"` etc.), so this is a hint not a hard
1092    // validator: the bail still fires, the operator still sees the
1093    // distinct values present, and the producer side is free to
1094    // emit any tag.
1095    if !filter.run_sources.is_empty() {
1096        let pool_run_sources: std::collections::BTreeSet<&str> = rows
1097            .iter()
1098            .filter_map(|r| r.run_source.as_deref())
1099            .collect();
1100        let unknowns: Vec<&str> = filter
1101            .run_sources
1102            .iter()
1103            .map(String::as_str)
1104            .filter(|want| !pool_run_sources.contains(*want))
1105            .collect();
1106        if !unknowns.is_empty() {
1107            let mut present: Vec<&str> = pool_run_sources.iter().copied().collect();
1108            present.sort_unstable();
1109            let unknown_list = unknowns
1110                .iter()
1111                .map(|s| format!("`{s}`"))
1112                .collect::<Vec<_>>()
1113                .join(", ");
1114            let present_list = if present.is_empty() {
1115                "(none — every row has `run_source: null`)".to_string()
1116            } else {
1117                present
1118                    .iter()
1119                    .map(|s| format!("`{s}`"))
1120                    .collect::<Vec<_>>()
1121                    .join(", ")
1122            };
1123            msg.push_str(&format!(
1124                "\nhint: --run-source {unknown_list} not found in pool; \
1125                 distinct values present: {present_list}. Values are \
1126                 case-sensitive (`ci` ≠ `CI`)."
1127            ));
1128        }
1129    }
1130
1131    // Unknown-resolve-source hint. Mirrors the run_sources hint for the
1132    // scheduler-resolution-path dimension: fires when a `--resolve-source`
1133    // value is not among the resolve_sources present in the pool.
1134    if !filter.resolve_sources.is_empty() {
1135        let pool_resolve_sources: std::collections::BTreeSet<&str> = rows
1136            .iter()
1137            .filter_map(|r| r.resolve_source.as_deref())
1138            .collect();
1139        let unknowns: Vec<&str> = filter
1140            .resolve_sources
1141            .iter()
1142            .map(String::as_str)
1143            .filter(|want| !pool_resolve_sources.contains(*want))
1144            .collect();
1145        if !unknowns.is_empty() {
1146            let mut present: Vec<&str> = pool_resolve_sources.iter().copied().collect();
1147            present.sort_unstable();
1148            let unknown_list = unknowns
1149                .iter()
1150                .map(|s| format!("`{s}`"))
1151                .collect::<Vec<_>>()
1152                .join(", ");
1153            let present_list = if present.is_empty() {
1154                "(none — every row has `resolve_source: null`)".to_string()
1155            } else {
1156                present
1157                    .iter()
1158                    .map(|s| format!("`{s}`"))
1159                    .collect::<Vec<_>>()
1160                    .join(", ")
1161            };
1162            msg.push_str(&format!(
1163                "\nhint: --resolve-source {unknown_list} not found in pool; \
1164                 distinct values present: {present_list}. Values are \
1165                 case-sensitive (`auto_built` \u{2260} `Auto_Built`)."
1166            ));
1167        }
1168    }
1169
1170    // Unknown-cpu-budget hint. Mirrors the run_sources hint for the
1171    // numeric budget dimension: fires when a `--cpu-budget` value is
1172    // not among the budgets present in the pool (the budgets render
1173    // canonically as decimal via `cpu_budget.to_string()`, so a
1174    // non-canonical input like `032` lists as not-found against the
1175    // canonical present set). Skip rows (`cpu_budget == None`) carry no
1176    // budget and are excluded.
1177    if !filter.cpu_budgets.is_empty() {
1178        let pool_budgets: std::collections::BTreeSet<u32> =
1179            rows.iter().filter_map(|r| r.cpu_budget).collect();
1180        let present_strs: std::collections::BTreeSet<String> =
1181            pool_budgets.iter().map(|b| b.to_string()).collect();
1182        let unknowns: Vec<&str> = filter
1183            .cpu_budgets
1184            .iter()
1185            .map(String::as_str)
1186            .filter(|want| !present_strs.contains(*want))
1187            .collect();
1188        if !unknowns.is_empty() {
1189            let unknown_list = unknowns
1190                .iter()
1191                .map(|s| format!("`{s}`"))
1192                .collect::<Vec<_>>()
1193                .join(", ");
1194            let present_list = if pool_budgets.is_empty() {
1195                "(none — every row is a skip with no recorded budget)".to_string()
1196            } else {
1197                pool_budgets
1198                    .iter()
1199                    .map(|b| format!("`{b}`"))
1200                    .collect::<Vec<_>>()
1201                    .join(", ")
1202            };
1203            msg.push_str(&format!(
1204                "\nhint: --cpu-budget {unknown_list} not found in pool; \
1205                 distinct budgets present: {present_list}."
1206            ));
1207        }
1208    }
1209
1210    // list-values redirect: only fires when the operator narrowed
1211    // on a commit dimension. Generic case (no commit filter) keeps
1212    // the existing `stats list` redirect at the top of the message
1213    // — `list-values` would emit a long per-dimension dump that
1214    // isn't more actionable than `stats list` for a kernel/scheduler
1215    // /topology miss.
1216    let touched_commit_dim =
1217        !filter.project_commits.is_empty() || !filter.kernel_commits.is_empty();
1218    if touched_commit_dim {
1219        msg.push_str(
1220            "\nhint: run `cargo ktstr stats list-values` to see every \
1221             distinct commit value present in the pool — the specific \
1222             value the filter expected may not have a sidecar yet, or \
1223             may differ from what was recorded by \
1224             `detect_project_commit` / `detect_kernel_commit`.",
1225        );
1226    }
1227    msg
1228}
1229
1230/// Resolved inputs for the `perf-delta` render phase.
1231///
1232/// Produced by [`prepare_partitioned_comparison`] — the validation,
1233/// pooling, partitioning, and averaging steps of [`compare_partitions`]
1234/// extracted into an owned bundle so the render half reads from one
1235/// destructure rather than a long flat prelude. Every field carries
1236/// the exact value the prior in-function prelude bound; the render
1237/// half computes labels and headers from these, then runs the four
1238/// print helpers.
1239struct PartitionedComparison {
1240    /// Dimensions on which `filter_a` differs from `filter_b` — the
1241    /// A/B contrast axes. Guaranteed non-empty (the empty case bails).
1242    slicing_dims: Vec<Dimension>,
1243    /// Dimensions NOT in `slicing_dims`, in canonical
1244    /// [`Dimension::ALL`] order — the join axes for pairing.
1245    pairing_dims: Vec<Dimension>,
1246    /// Every sidecar under the runs root (or `--dir` override).
1247    /// Guaranteed non-empty (the empty pool bails).
1248    pool: Vec<crate::test_support::SidecarResult>,
1249    /// `pool` converted to rows, same length and iteration order.
1250    rows: Vec<GauntletRow>,
1251    /// A-side rows fed to [`compare_rows_by`]: averaged mean rows
1252    /// under [`RowPrep::Averaged`], the raw filtered rows under
1253    /// [`RowPrep::PerRunPooled`].
1254    rows_a_for_compare: Vec<GauntletRow>,
1255    /// B-side counterpart of `rows_a_for_compare`.
1256    rows_b_for_compare: Vec<GauntletRow>,
1257    /// A-side averaged groups under [`RowPrep::Averaged`]; `None` under
1258    /// [`RowPrep::PerRunPooled`]. Drives the per-group pass-count block.
1259    avg_a: Option<Vec<AveragedGroup>>,
1260    /// B-side counterpart of `avg_a`.
1261    avg_b: Option<Vec<AveragedGroup>>,
1262    /// Post-typed-filter A-side contributor row count (pre-aggregation)
1263    /// — the "averaged across N runs" header numerator.
1264    pre_agg_a: usize,
1265    /// B-side counterpart of `pre_agg_a`.
1266    pre_agg_b: usize,
1267}
1268
/// Selects how [`prepare_partitioned_comparison`] shapes each side's
/// rows before handing them to the compare half.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RowPrep {
    /// The default, averaging behavior: rows on a side that share a
    /// pairing key are folded into a single arithmetic-mean row.
    Averaged,
    /// Pass every row through untouched, duplicate pairing keys
    /// INCLUDED. Used by the `perf-delta --noise-adjust` path, where
    /// [`noise_findings`] groups the N runs per key per side into a
    /// spread — so N rows per key is the intended input, not an error.
    PerRunPooled,
}
1282
1283/// Validate, pool, partition, and average the inputs for
1284/// [`compare_partitions`]. Returns the owned [`PartitionedComparison`]
1285/// bundle the render half destructures, or bails with the same
1286/// diagnostics in the same order as the original in-function prelude:
1287/// identical-rows gate, empty-pool gate, then the two zero-match
1288/// gates. The multi-dim slicing warning and the dirty-build /
1289/// overcommit warnings are emitted here so they precede the render
1290/// half's header lines, preserving output order.
1291fn prepare_partitioned_comparison(
1292    filter_a: &RowFilter,
1293    filter_b: &RowFilter,
1294    dir: Option<&std::path::Path>,
1295    prep: RowPrep,
1296) -> anyhow::Result<PartitionedComparison> {
1297    // Validation gate 1: there must be at least one dimension
1298    // on which filter_a differs from filter_b — otherwise the
1299    // operator hasn't expressed a contrast and the function has
1300    // nothing to compare. Empty slicing dims OR identical filters
1301    // are both rejected here with actionable diagnostics so the
1302    // user knows which knob to turn.
1303    let slicing_dims = derive_slicing_dims(filter_a, filter_b);
1304    if slicing_dims.is_empty() {
1305        anyhow::bail!(
1306            "perf-delta: A and B select identical rows. \
1307             Specify at least one per-side filter (e.g. \
1308             --a-kernel 6.14 --b-kernel 6.15) to define what \
1309             dimension separates the two sides."
1310        );
1311    }
1312
1313    // Validation gate 2: warn (not error) when slicing on
1314    // multiple dimensions. The result is still well-defined —
1315    // the comparison joins on remaining pairing dims and
1316    // collapses the slicing-dim cross-product into a single
1317    // A/B contrast — but the operator is asking for a multi-axis
1318    // delta which is harder to interpret. The warning surfaces
1319    // the dim list so they can confirm the cohort shape.
1320    if slicing_dims.len() > 1 {
1321        let dim_names: Vec<&str> = slicing_dims.iter().map(|d| d.name()).collect();
1322        eprintln!(
1323            "warning: perf-delta: slicing on {n} dimensions [{dims}]; \
1324             results compress multiple axes into a single A/B contrast.",
1325            n = slicing_dims.len(),
1326            dims = dim_names.join(", "),
1327        );
1328    }
1329
1330    // Pairing dims = every dimension NOT in the slicing-dim set,
1331    // in canonical [`Dimension::ALL`] order. The dynamic key
1332    // shape `(scenario, *pairing_dims)` matches whatever
1333    // dimensions are currently NOT being contrasted across A
1334    // and B.
1335    let pairing_dims = Dimension::pairing_dims(&slicing_dims);
1336
1337    // Pool every sidecar under the runs root (or the operator's
1338    // --dir override) and convert to rows. The full-scan cost
1339    // is acceptable for the single-comparison-per-session
1340    // workflow.
1341    //
1342    // `--dir`-loaded sidecars get their `source` field rewritten
1343    // to `"archive"` via `apply_archive_source_override` before
1344    // row conversion. The producer-side `"local"` / `"ci"`
1345    // distinction is meaningful on the host that wrote the
1346    // sidecars; once the files have been copied off, the only
1347    // useful classification is "this came from elsewhere", which
1348    // is what `--run-source archive` queries for. Operators who need
1349    // to retain the producer-side distinction read from the
1350    // default root (no `--dir`) so values pass through untouched.
1351    let (root, override_archive) = match dir {
1352        Some(d) => (d.to_path_buf(), true),
1353        None => (crate::test_support::runs_root(), false),
1354    };
1355    let mut pool = crate::test_support::collect_pool(&root);
1356    if override_archive {
1357        crate::test_support::apply_archive_source_override(&mut pool);
1358    }
1359    if pool.is_empty() {
1360        anyhow::bail!(
1361            "perf-delta: no sidecar data found under {}. \
1362             Run `cargo ktstr test` to generate runs, or pass \
1363             --dir to point at an archived sidecar tree.",
1364            root.display(),
1365        );
1366    }
1367    let rows: Vec<GauntletRow> = pool.iter().map(sidecar_to_row).collect();
1368
1369    // Partition: apply each side's filter to the same pool. A
1370    // row may match both sides (e.g. when project_commit is the
1371    // slicing dim, a row whose `project_commit` is in
1372    // `filter_a.project_commits` matches A but NOT B unless
1373    // `filter_b.project_commits` also contains it).
1374    let rows_a = apply_row_filters(&rows, filter_a);
1375    let rows_b = apply_row_filters(&rows, filter_b);
1376    if rows_a.is_empty() {
1377        anyhow::bail!(
1378            "{}",
1379            zero_match_diagnostic("A", filter_a, &rows, pool.len()),
1380        );
1381    }
1382    if rows_b.is_empty() {
1383        anyhow::bail!(
1384            "{}",
1385            zero_match_diagnostic("B", filter_b, &rows, pool.len()),
1386        );
1387    }
1388
1389    warn_on_dirty_builds(&rows_a, &rows_b);
1390    warn_on_overcommit(&rows_a, &rows_b, &pairing_dims);
1391
1392    let pre_agg_a = rows_a.len();
1393    let pre_agg_b = rows_b.len();
1394
1395    // Fold each side's rows per the caller's [`RowPrep`]. `Averaged`
1396    // collapses same-pairing-key rows into one mean row (the default);
1397    // PerRunPooled keeps every row distinct, including duplicate
1398    // pairing keys (N-per-key is the intended noise-adjust input).
1399    let (rows_a_for_compare, rows_b_for_compare, avg_a, avg_b) = match prep {
1400        RowPrep::Averaged => {
1401            let avg_a = group_and_average_by(&rows_a, &pairing_dims);
1402            let avg_b = group_and_average_by(&rows_b, &pairing_dims);
1403            let a_rows: Vec<GauntletRow> = avg_a.iter().map(|r| r.row.clone()).collect();
1404            let b_rows: Vec<GauntletRow> = avg_b.iter().map(|r| r.row.clone()).collect();
1405            (a_rows, b_rows, Some(avg_a), Some(avg_b))
1406        }
1407        RowPrep::PerRunPooled => {
1408            // The noise-spread compare groups the N runs per pairing key
1409            // per side (see `noise_findings`), so N-per-key is the
1410            // intended input. Keep every row including duplicate
1411            // pairing keys — N-per-key is expected here, not an error.
1412            (rows_a, rows_b, None, None)
1413        }
1414    };
1415
1416    Ok(PartitionedComparison {
1417        slicing_dims,
1418        pairing_dims,
1419        pool,
1420        rows,
1421        rows_a_for_compare,
1422        rows_b_for_compare,
1423        avg_a,
1424        avg_b,
1425        pre_agg_a,
1426        pre_agg_b,
1427    })
1428}
1429
1430/// Warning for the SCALAR compare path when compared tests declare
1431/// `perf_delta_assertions` gates it does not evaluate. Declared gates are a
1432/// CI-gating perf assertion; gating on single-run scalar data would flip CI on
1433/// noise, so they are honored ONLY under `perf-delta --noise-adjust` (multi-run,
1434/// Welch + separation). Returning the message instead of a bare no-op keeps the
1435/// gate from silently disappearing on the default `perf-delta` invocation.
1436/// `None` when no compared test declares a gate. Pure (no I/O) so the
1437/// count/message is unit-testable. Counts DISTINCT scenarios carrying a gate.
1438pub(crate) fn scalar_declared_gate_warning(rows_b: &[GauntletRow]) -> Option<String> {
1439    let n = rows_b
1440        .iter()
1441        .filter(|r| !r.perf_delta_assertions.is_empty())
1442        .map(|r| r.scenario.as_str())
1443        .collect::<std::collections::BTreeSet<_>>()
1444        .len();
1445    (n > 0).then(|| {
1446        format!(
1447            "NOTE — {n} compared test(s) declare perf_delta_assertions gate(s) that this \
1448             scalar comparison does NOT evaluate. Declared gates are enforced only under \
1449             `perf-delta --noise-adjust N` (single-run scalar gating would flip CI on \
1450             noise); re-run with --noise-adjust to gate on them."
1451        )
1452    })
1453}
1454
1455/// Compare two filter-defined partitions of the sidecar pool and
1456/// report regressions across slicing dimensions.
1457///
1458/// `filter_a` and `filter_b` are the per-side row filters that
1459/// define the A/B contrast. The dimensions on which the two
1460/// filters DIFFER are the SLICING dimensions; the dimensions on
1461/// which they AGREE (or on which both are unconstrained) are the
1462/// PAIRING dimensions. Two rows pair across the A/B sides iff
1463/// their dynamic [`PairingKey`] (scenario plus every pairing-dim
1464/// value) is equal — so the comparison naturally ignores
1465/// differences on the slicing axes (those ARE the contrast) and
1466/// joins on everything else.
1467///
1468/// `dir` overrides the default `runs_root()` for pool collection.
1469/// Pass `Some(path)` to compare archived sidecar trees copied off
1470/// a CI host; pass `None` to walk `target/ktstr/` (or
1471/// `CARGO_TARGET_DIR/ktstr/`).
1472///
1473/// Validation:
1474/// - Empty slicing-dim set (every dimension is identical between
1475///   A and B): bail with "specify at least one --a-X / --b-X to
1476///   define what to compare". This includes the no-flags-at-all
1477///   case (both filters are the empty default).
1478/// - Identical effective filters with at least one slicing dim is
1479///   a contradiction caught by clap-level construction; the
1480///   downstream check is "every value in filter_a appears in
1481///   filter_b on the same dim and vice versa." We catch that as
1482///   "A and B select identical rows" — symmetric to the empty
1483///   case.
1484/// - More than one slicing dimension prints a warning to stderr
1485///   ("warning: slicing on N dimensions; results compress
1486///   multiple axes into a single A/B contrast") but does NOT
1487///   bail — multi-dim slicing is a deliberate feature for
1488///   comparing e.g. (kernel A + scheduler A) against (kernel B +
1489///   scheduler B).
1490///
1491/// Groups every matching sidecar within each side by pairing key
1492/// and averages the metrics across the group.
1493///
1494/// Returns 1 when the confident regressions fail the operator gate — their
1495/// count reaches `gate.fail_threshold` (default 5) or a `gate.must_fail`
1496/// metric regressed; 0 otherwise. See [`gate_fails`] / [`GateOptions`].
1497pub fn compare_partitions(
1498    filter_a: &RowFilter,
1499    filter_b: &RowFilter,
1500    filter: Option<&str>,
1501    policy: &ComparisonPolicy,
1502    dir: Option<&std::path::Path>,
1503    gate: &GateOptions,
1504) -> anyhow::Result<i32> {
1505    let prepared = prepare_partitioned_comparison(filter_a, filter_b, dir, RowPrep::Averaged)?;
1506    let PartitionedComparison {
1507        slicing_dims,
1508        pairing_dims,
1509        pool,
1510        rows,
1511        rows_a_for_compare,
1512        rows_b_for_compare,
1513        avg_a,
1514        avg_b,
1515        pre_agg_a,
1516        pre_agg_b,
1517    } = &prepared;
1518
1519    let report = compare_rows_by(
1520        rows_a_for_compare,
1521        rows_b_for_compare,
1522        pairing_dims,
1523        filter,
1524        policy,
1525    );
1526
1527    // Side labels derive from the slicing dims' filter values.
1528    // Single slicing dim: e.g. "6.14.2" / "6.15.0". Multi: e.g.
1529    // "6.14.2:scx_rusty" / "6.15.0:scx_alpha". >3 values per dim:
1530    // collapse to "A"/"B" to keep column headers readable.
1531    let label_a = render_side_label(filter_a, slicing_dims, "A");
1532    let label_b = render_side_label(filter_b, slicing_dims, "B");
1533
1534    // Header lines: name the slicing and pairing axes so the
1535    // operator can confirm the comparison shape at a glance.
1536    let slice_names: Vec<&str> = slicing_dims.iter().map(|d| d.name()).collect();
1537    let pair_names: Vec<&str> = pairing_dims.iter().map(|d| d.name()).collect();
1538    println!("slicing dimensions: {}", slice_names.join(", "));
1539    println!(
1540        "pairing on: scenario{}{}",
1541        if pair_names.is_empty() { "" } else { ", " },
1542        pair_names.join(", "),
1543    );
1544    // Declared gates are not evaluated on the scalar path — warn rather than
1545    // silently ignore (they are honored only under `perf-delta --noise-adjust`).
1546    if let Some(warning) = scalar_declared_gate_warning(rows_b_for_compare) {
1547        println!("{warning}");
1548    }
1549
1550    println!(
1551        "{}",
1552        format_average_header(*pre_agg_a, *pre_agg_b, &label_a, &label_b)
1553    );
1554
1555    // Scalar findings table.
1556    print_scalar_findings_table(&report, &label_a, &label_b);
1557
1558    // Scalar summary block — regressions / improvements /
1559    // unchanged + skipped-failed + per-group pass counts +
1560    // new_in_b / removed_from_a.
1561    print_summary_block(&report, avg_a, avg_b, &label_a, &label_b);
1562
1563    // Host-context delta. Same first-Some(host) baseline
1564    // `compare_partitions` uses — picking representative hosts
1565    // off the partitioned sidecars rather than the full pool so
1566    // the delta reflects what actually fed the comparison.
1567    print_host_context_delta(pool, rows, filter_a, filter_b, &label_a, &label_b);
1568
1569    // Operator gate: the significance policy above decided WHICH deltas are
1570    // confident regressions; the gate decides HOW MANY / WHICH-NAMED of them
1571    // fail the run. (--all-metrics is a no-op on this path — the scalar table
1572    // already lists every changed metric and the unchanged COUNT prints in
1573    // the summary; it reveals stable/noisy rows on the --noise-adjust table.)
1574    let regressing: Vec<&str> = report
1575        .findings
1576        .iter()
1577        .filter(|f| f.kind == FindingKind::Regression)
1578        .map(|f| f.metric.name)
1579        .collect();
1580    Ok(if gate_fails(&regressing, gate) { 1 } else { 0 })
1581}
1582
/// Operator-level perf-delta failure gate + render options, layered on top
/// of the per-metric significance policy (which decides WHICH deltas are
/// confident regressions). These decide HOW MANY / WHICH-NAMED confident
/// regressions fail the run, and whether stable/noisy rows render.
#[derive(Debug, Clone, Default)]
pub struct GateOptions {
    /// Fail iff at least this many confident regressions occur. `None`
    /// means [`Self::DEFAULT_FAIL_THRESHOLD`] (5) — a handful of regressions
    /// is tolerated as run-to-run noise and the run fails only once several
    /// metrics regress; pass `--fail-threshold 1` for fail-on-any. `Some(0)`
    /// disables the count gate entirely — only [`Self::must_fail`] can then
    /// fail the run.
    pub fail_threshold: Option<usize>,
    /// Metric registry names that fail the run if ANY of them regresses,
    /// regardless of the count gate (ORed on top). Caller-validated against
    /// the metric registry.
    pub must_fail: Vec<String>,
    /// Render every compared metric row (stable + noisy included) on the
    /// `--noise-adjust` table instead of only the meaningful ones.
    /// Display-only — never affects the gate.
    pub show_all: bool,
}

impl GateOptions {
    /// Regression count that fails the run when [`Self::fail_threshold`] is
    /// `None`. Named so the default lives in exactly one place.
    pub const DEFAULT_FAIL_THRESHOLD: usize = 5;
}

/// Decide whether a perf-delta run FAILS, given the registry names of the
/// confident regressions it found.
///
/// Fails iff either condition holds:
/// - the number of entries in `regressing_metrics` reaches
///   [`GateOptions::fail_threshold`] (default
///   [`GateOptions::DEFAULT_FAIL_THRESHOLD`]; `Some(0)` disables the count
///   gate), or
/// - any entry of `regressing_metrics` equals (exact string match) a name in
///   [`GateOptions::must_fail`] — ORed on top of the count gate.
///
/// [`GateOptions::show_all`] is never read here. The caller passes the
/// CLASSIFIED regressions, so display-hidden (suppressed) rows still feed the
/// gate.
pub(crate) fn gate_fails(regressing_metrics: &[&str], gate: &GateOptions) -> bool {
    let threshold = gate
        .fail_threshold
        .unwrap_or(GateOptions::DEFAULT_FAIL_THRESHOLD);
    // A zero threshold means "count gate off", not "always fail".
    let fail_on_count = threshold > 0 && regressing_metrics.len() >= threshold;
    // An empty `must_fail` list makes the inner `any` false for every metric,
    // so no separate emptiness check is needed.
    let fail_on_must = regressing_metrics.iter().any(|metric| {
        gate.must_fail
            .iter()
            .any(|required| required.as_str() == *metric)
    });
    fail_on_count || fail_on_must
}
1620
/// Unit tests for [`gate_fails`] over the [`GateOptions`] knobs: the default
/// count threshold, explicit thresholds, the disabled count gate, and the
/// `must_fail` override.
#[cfg(test)]
mod gate_option_tests {
    use super::*;

    #[test]
    fn default_gate_fails_only_at_five_regressions() {
        // Default (None) threshold is 5: a handful of regressions is
        // tolerated as run-to-run noise; the run fails once >= 5 regress.
        let g = GateOptions::default();
        assert!(!gate_fails(&[], &g), "0 regressions passes");
        assert!(
            !gate_fails(&["a", "b", "c", "d"], &g),
            "4 < 5 passes under the default gate"
        );
        assert!(
            gate_fails(&["a", "b", "c", "d", "e"], &g),
            "5 >= 5 fails under the default gate"
        );
    }

    #[test]
    fn count_threshold_requires_n() {
        let g = GateOptions {
            fail_threshold: Some(3),
            ..Default::default()
        };
        assert!(!gate_fails(&["a", "b"], &g), "2 < 3 passes");
        assert!(gate_fails(&["a", "b", "c"], &g), "3 >= 3 fails");
    }

    #[test]
    fn threshold_one_fails_on_any_regression() {
        // `--fail-threshold 1` is the documented fail-on-any setting.
        let g = GateOptions {
            fail_threshold: Some(1),
            ..Default::default()
        };
        assert!(!gate_fails(&[], &g), "no regressions still passes");
        assert!(gate_fails(&["a"], &g), "a single regression fails");
    }

    #[test]
    fn zero_threshold_disables_count_gate() {
        let g = GateOptions {
            fail_threshold: Some(0),
            ..Default::default()
        };
        assert!(
            !gate_fails(&["a", "b", "c"], &g),
            "N=0 never fails on the count"
        );
    }

    #[test]
    fn must_fail_fails_regardless_of_count() {
        let g = GateOptions {
            fail_threshold: Some(0),
            must_fail: vec!["worst_p99_wake_latency_us".to_string()],
            ..Default::default()
        };
        assert!(
            gate_fails(&["worst_p99_wake_latency_us"], &g),
            "a must-fail metric regressing fails even with the count gate off"
        );
        assert!(
            !gate_fails(&["some_other_metric"], &g),
            "a non-must-fail regression does not fail with the count gate off"
        );
    }

    #[test]
    fn must_fail_is_ored_above_the_count() {
        let g = GateOptions {
            fail_threshold: Some(10),
            must_fail: vec!["avg_dsq_depth".to_string()],
            ..Default::default()
        };
        assert!(
            gate_fails(&["avg_dsq_depth"], &g),
            "must-fail fires even below the count threshold"
        );
    }

    #[test]
    fn must_fail_needs_an_actual_regression() {
        // Listing a metric in `must_fail` is not a failure by itself: the
        // metric has to appear among the regressions.
        let g = GateOptions {
            must_fail: vec!["avg_dsq_depth".to_string()],
            ..Default::default()
        };
        assert!(!gate_fails(&[], &g), "no regressions passes");
    }

    #[test]
    fn must_fail_matches_whole_names_only() {
        let g = GateOptions {
            fail_threshold: Some(0),
            must_fail: vec!["avg_dsq_depth".to_string()],
            ..Default::default()
        };
        assert!(
            !gate_fails(&["avg_dsq_depth_max"], &g),
            "a name that merely starts with a must-fail name does not match"
        );
    }

    #[test]
    fn show_all_never_affects_the_gate() {
        let g = GateOptions {
            show_all: true,
            ..Default::default()
        };
        assert!(!gate_fails(&["a", "b", "c", "d"], &g), "4 < 5 still passes");
        assert!(
            gate_fails(&["a", "b", "c", "d", "e"], &g),
            "5 >= 5 still fails"
        );
    }
}
1693
/// How a metric's noise-adjusted verdict classifies for the gate.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum NoiseKind {
    /// SEPARATED (Welch or disjoint bands) AND MATERIAL in the worsening
    /// direction (per polarity) — the only kind that fails the gate.
    Regression,
    /// Separated AND material in the improving direction.
    Improvement,
    /// A side realized fewer than 2 samples, so variance / Welch are
    /// undefined — the verdict is untrustworthy; flagged but does NOT fail
    /// the gate. High spread alone does not land here: `high_spread` is
    /// ADVISORY and annotates a reported verdict (e.g.
    /// `REGRESSION (noisy spread)`) rather than suppressing it. (Suppressing
    /// on spread was the old behavior and inverted the signal.)
    Noisy,
    /// Separated + material change in a directionless
    /// (`Polarity::Informational`) metric — shown but NEVER fails the gate
    /// (no good/bad direction to regress).
    Informational,
    /// Not separated, or separated but immaterial (below the registry
    /// dual-gate), with both sides having >= 2 samples. Shown in the full
    /// metrics table so the operator sees every metric's A/B stats, but never
    /// gates. Emitted only when the caller passes `include_stable = true`
    /// (the render path); the gate-only path omits it.
    Stable,
}
1718
1719/// One metric's noise-adjusted finding for a paired scenario.
1720pub(crate) struct NoiseFinding {
1721    /// The pairing-key label ("scenario" plus the pairing dims like
1722    /// topology/work_type, joined with "/"), so groups that share a scenario
1723    /// name but differ in a pairing dim render distinctly.
1724    pub pairing_label: String,
1725    pub metric: &'static MetricDef,
1726    pub verdict: NoiseVerdict,
1727    pub kind: NoiseKind,
1728    /// A whole-run [`crate::test_support::PerfDeltaAssertionRecord`] is declared
1729    /// on this (test, metric) — its `min_abs` / `max_regression_pct` /
1730    /// `direction` override the registry defaults in [`classify_noise`] (a
1731    /// declared value that is out of range on a corrupt/stale sidecar is
1732    /// rejected there and falls back to the registry default, but the row still
1733    /// CARRIES the declared gate). Rendered as a `(declared gate)` verdict
1734    /// annotation so the operator sees the author declared a gate here. `false`
1735    /// = no declared assertion; the registry defaults classified this row.
1736    pub gated_by_assertion: bool,
1737}
1738
1739/// One metric's noise-adjusted finding for a matched PHASE of a paired
1740/// scenario — the per-phase (`--noise-adjust` + per-phase) analog of
1741/// [`NoiseFinding`], carrying the `step_index`/`label` the render prints.
1742/// Emitted only for a `(step_index, metric)` present on BOTH sides across
1743/// the runs. Per-phase SPREAD findings are render-only EXCEPT one carrying
1744/// an author-declared phase-scoped gate (`gated_by_assertion`), which DOES
1745/// contribute to the exit via [`NoiseReport::declared_phase_regressions`].
1746pub(crate) struct NoisePhaseFinding {
1747    /// The pairing-key label (see [`NoiseFinding::pairing_label`]).
1748    pub pairing_label: String,
1749    /// `0` = BASELINE, `1..=N` = scenario Step ordinals (framework convention).
1750    pub step_index: u16,
1751    /// Mirrors [`crate::assert::PhaseBucket::label`] (`"BASELINE"` / `"Step[k]"`).
1752    pub label: String,
1753    pub metric: &'static MetricDef,
1754    pub verdict: NoiseVerdict,
1755    pub kind: NoiseKind,
1756    /// A phase-scoped declared assertion (`phase == Some(step_index)`) drove the
1757    /// gate for this per-phase row — see [`NoiseFinding::gated_by_assertion`].
1758    pub gated_by_assertion: bool,
1759}
1760
1761/// A per-phase metric present on only ONE side of the noise comparison —
1762/// either a metric absent in the other side's matched-`step_index` buckets,
1763/// or a whole one-sided `step_index`. Both collapse into one row because
1764/// a one-sided metric has no band on the absent side either way. Never gated;
1765/// surfaced so a coverage asymmetry is not silently dropped.
1766pub(crate) struct NoisePhaseCoverage {
1767    /// The pairing-key label (see [`NoiseFinding::pairing_label`]).
1768    pub pairing_label: String,
1769    pub step_index: u16,
1770    pub label: String,
1771    /// The one-sided metric, or `None` for a whole one-sided phase that
1772    /// carried no readable (non-suppressed) metric — surfaced (rendered with
1773    /// `—`) so the "phase fired but produced no data on one side" shape is not
1774    /// silently dropped.
1775    pub metric: Option<&'static MetricDef>,
1776    /// The side that carries the metric/phase; the other has none.
1777    pub present_side: ComparePartition,
1778    /// The present side's per-side mean across its runs, or `None` for the
1779    /// metric-less empty-phase-shape row.
1780    pub value: Option<f64>,
1781}
1782
1783/// A declared [`crate::test_support::PerfDeltaAssertionRecord`] that was NEVER
1784/// evaluated — its metric resolves in the registry (guaranteed by
1785/// `KtstrTestEntry::validate`) but produced no comparable value in THIS
1786/// comparison, so [`classify_noise`] never saw it. The runtime analog of the
1787/// validate-time typo check: the author declared a perf gate that silently did
1788/// not fire (metric absent from the captured data, a Rate with a zero pooled
1789/// denominator, or — for a phase-scoped assertion — a step that no matched run
1790/// carried). Surfaced (never gated) so a silently-inert declared gate is not
1791/// mistaken for a passing one.
1792pub(crate) struct NoiseAssertionCoverage {
1793    /// The pairing-key label the unmatched gate was declared under (see
1794    /// [`NoiseFinding::pairing_label`]).
1795    pub pairing_label: String,
1796    /// The declared gate that never evaluated — carries its metric, phase
1797    /// scope, and overridden thresholds for the warning message.
1798    pub assertion: crate::test_support::PerfDeltaAssertionRecord,
1799}
1800
1801/// Result of [`noise_findings`]: the per-(scenario, metric) aggregate findings,
1802/// the per-(scenario, phase, metric) findings + one-sided coverage rows, the
1803/// declared gates that never evaluated, and the number of scenarios paired
1804/// across both sides (for the footer).
1805pub(crate) struct NoiseReport {
1806    pub findings: Vec<NoiseFinding>,
1807    pub phase_findings: Vec<NoisePhaseFinding>,
1808    pub phase_coverage: Vec<NoisePhaseCoverage>,
1809    pub assertion_coverage: Vec<NoiseAssertionCoverage>,
1810    pub paired_scenarios: usize,
1811}
1812
1813impl NoiseReport {
1814    /// Confident AGGREGATE regressions — the gate's exit basis. Per-phase
1815    /// findings are render-only and deliberately excluded (see
1816    /// [`Self::phase_regressions`]).
1817    pub fn regressions(&self) -> usize {
1818        self.findings
1819            .iter()
1820            .filter(|f| f.kind == NoiseKind::Regression)
1821            .count()
1822    }
1823    /// Aggregate (whole-run) regressions on a metric the test author explicitly
1824    /// DECLARED a whole-run [`crate::test_support::PerfDeltaAssertion`] for
1825    /// (`phase: None`, `gated_by_assertion`). Like a declared PHASE gate
1826    /// ([`Self::declared_phase_regressions`]), a declared whole-run gate is an
1827    /// author opt-in that ALWAYS contributes to the exit, orthogonal to the
1828    /// operator's count / must-fail gate — an UNdeclared aggregate regression
1829    /// stays subject to that count gate (`gate_fails`).
1830    pub fn declared_regressions(&self) -> usize {
1831        self.findings
1832            .iter()
1833            .filter(|f| f.kind == NoiseKind::Regression && f.gated_by_assertion)
1834            .count()
1835    }
1836    /// Aggregate metrics flagged untrustworthy — a side had < 2 usable runs.
1837    pub fn noisy(&self) -> usize {
1838        self.findings
1839            .iter()
1840            .filter(|f| f.kind == NoiseKind::Noisy)
1841            .count()
1842    }
1843    /// Confident AGGREGATE improvements — a metric that moved MATERIALLY in the
1844    /// better direction and cleared the significance test. Render-only (an
1845    /// improvement never gates); cited in the summary footer alongside
1846    /// [`Self::regressions`] so the reader sees the composite verdict (what
1847    /// regressed AND what improved), with [`Self::stable`] as the residual.
1848    pub fn improvements(&self) -> usize {
1849        self.findings
1850            .iter()
1851            .filter(|f| f.kind == NoiseKind::Improvement)
1852            .count()
1853    }
1854    /// Aggregate metrics that did NOT move confidently+materially — the expected
1855    /// common outcome, since the noise gate (Welch / disjoint-bands + material
1856    /// dual-gate) is conservative. Cited as a residual COUNT in the footer (the
1857    /// low-value majority), never enumerated.
1858    pub fn stable(&self) -> usize {
1859        self.findings
1860            .iter()
1861            .filter(|f| f.kind == NoiseKind::Stable)
1862            .count()
1863    }
1864    /// Aggregate metrics that changed but carry no better/worse polarity
1865    /// (registry `Informational`) — cited in the footer only when present.
1866    pub fn informational(&self) -> usize {
1867        self.findings
1868            .iter()
1869            .filter(|f| f.kind == NoiseKind::Informational)
1870            .count()
1871    }
1872    /// Confident per-phase regressions — for the footer + tests ONLY, never
1873    /// the exit basis (per-phase SPREAD is render-only). A per-phase regression the author explicitly DECLARED via a
1874    /// phase-scoped [`crate::test_support::PerfDeltaAssertion`] DOES gate — see
1875    /// [`Self::declared_phase_regressions`].
1876    pub fn phase_regressions(&self) -> usize {
1877        self.phase_findings
1878            .iter()
1879            .filter(|f| f.kind == NoiseKind::Regression)
1880            .count()
1881    }
1882    /// Per-phase regressions on a metric the test author explicitly DECLARED a
1883    /// phase-scoped gate for. Unlike the render-only per-phase spread pass, a
1884    /// declared phase gate is an opt-in: the author accepted the narrower
1885    /// phase-window noise, so it contributes to the perf-delta EXIT alongside
1886    /// the aggregate [`Self::regressions`]. A per-phase regression WITHOUT a
1887    /// declared gate stays render-only (a narrow-window flake must not flip CI
1888    /// red on its own).
1889    pub fn declared_phase_regressions(&self) -> usize {
1890        self.phase_findings
1891            .iter()
1892            .filter(|f| f.kind == NoiseKind::Regression && f.gated_by_assertion)
1893            .count()
1894    }
1895    /// Per-phase metrics flagged untrustworthy — a side had < 2 usable runs.
1896    pub fn phase_noisy(&self) -> usize {
1897        self.phase_findings
1898            .iter()
1899            .filter(|f| f.kind == NoiseKind::Noisy)
1900            .count()
1901    }
1902}
1903
1904/// Row-level noise-adjusted compare — the testable core of
1905/// [`compare_partitions_noise`] (which wraps it with sidecar pooling + render),
1906/// mirroring how [`compare_rows_by`] underlies [`compare_partitions`]. Groups
1907/// each side's per-run rows by pairing key — EXCLUDING non-pass runs
1908/// (fail / skip / inconclusive / expected_failure), whose failure-mode
1909/// telemetry would corrupt the spread, exactly as [`compare_rows_by`] excludes
1910/// them — then per metric summarizes the spread and classifies via
1911/// [`noise_verdict`]. Returns one [`NoiseFinding`] per
1912/// (scenario, metric) that is changed, noisy, or — when `include_stable` is
1913/// true — unchanged-and-clean ([`NoiseKind::Stable`], shown in the full metrics
1914/// table but never gating). With `include_stable = false` only changed/noisy
1915/// metrics are returned (the gate-only path). Metrics with no signal on either
1916/// side (both means under [`ZERO_MEAN_EPS`]) and render-suppressed rate
1917/// components are omitted regardless of `include_stable`.
1918pub(crate) fn noise_findings(
1919    rows_a: &[GauntletRow],
1920    rows_b: &[GauntletRow],
1921    pairing_dims: &[Dimension],
1922    spread_threshold_pct: f64,
1923    include_stable: bool,
1924) -> NoiseReport {
1925    use std::collections::{BTreeMap, BTreeSet};
1926    let group = |rows: &[GauntletRow]| {
1927        let mut by_key: BTreeMap<PairingKey, Vec<GauntletRow>> = BTreeMap::new();
1928        for r in rows {
1929            // Exclude non-pass runs from the spread pool, mirroring the
1930            // scalar `compare_rows_by` and the averaged `group_and_average_by`:
1931            // a failed / inconclusive / skipped / expected_failure run's
1932            // telemetry is failure-mode-dominated (zeroed or an outlier), so
1933            // pooling it would corrupt the per-side [min, max] band and mean
1934            // and could produce a confident FALSE gate — the exact
1935            // silent-wrong-verdict class the noise mode exists to prevent.
1936            // A side reduced below 2 real samples by this filter is then
1937            // caught by `noise_verdict`'s n<2 insufficient_samples guard.
1938            if r.is_fail() || r.is_inconclusive() || r.is_skip() || r.expected_failure {
1939                continue;
1940            }
1941            // Derive this run's Rate metrics (e.g. the schedstat rates whose
1942            // components only materialize in ext_metrics at sidecar_to_row
1943            // time) so the spread reads them — the Averaged path derives via
1944            // group_and_average_by -> derive_rate_metrics; the per-run paths
1945            // must derive per row or every schedstat Rate is silently absent
1946            // from the verdict. Per-run derivation (each run's own num/den) is
1947            // the correct band semantics for run-to-run spread.
1948            let mut row = r.clone();
1949            crate::stats::metric::derive_rate_metrics(&mut row.ext_metrics);
1950            by_key
1951                .entry(PairingKey::from_row(&row, pairing_dims))
1952                .or_default()
1953                .push(row);
1954        }
1955        by_key
1956    };
1957    let a_by_key = group(rows_a);
1958    let b_by_key = group(rows_b);
1959
1960    let mut findings = Vec::new();
1961    let mut phase_findings = Vec::new();
1962    let mut phase_coverage = Vec::new();
1963    let mut assertion_coverage = Vec::new();
1964    let mut paired_scenarios = 0usize;
1965    for (key, a_rows) in &a_by_key {
1966        let Some(b_rows) = b_by_key.get(key) else {
1967            continue;
1968        };
1969        paired_scenarios += 1;
1970        // Metrics whose gate WAS evaluated this group, split by scope: whole-run
1971        // (aggregate pass) and phase-scoped `(step_index, metric)` (per-phase
1972        // pass). Diffed against the declared assertions after both passes to
1973        // surface any declared gate that never fired (metric absent from the
1974        // captured data).
1975        let mut consulted: BTreeSet<&'static str> = BTreeSet::new();
1976        let mut consulted_phase: BTreeSet<(u16, &'static str)> = BTreeSet::new();
1977        // Label by the FULL pairing key (scenario + pairing dims like
1978        // topology/work_type), joined via pairing_key.0.join. noise_findings groups by the full pairing key,
1979        // so a scenario run on multiple topologies/work_types forms distinct
1980        // groups; labeling by scenario alone would render them indistinguishably.
1981        let pairing_label = key.0.join("/");
1982        for m in METRICS {
1983            if is_render_suppressed_component(m.name) {
1984                continue;
1985            }
1986            let a_vals: Vec<f64> = a_rows.iter().filter_map(|r| m.read(r)).collect();
1987            let b_vals: Vec<f64> = b_rows.iter().filter_map(|r| m.read(r)).collect();
1988            if a_vals.is_empty() || b_vals.is_empty() {
1989                continue;
1990            }
1991            // Rate metrics: the per-run ratios (a_vals/b_vals) give the
1992            // run-to-run band, but the compared centroid is the pooled
1993            // Σnum/Σden (duration-weighted) — the cross-run Rate value the
1994            // registry documents (metric.rs), so --noise-adjust and the scalar
1995            // averaging compare agree on a Rate's central value while the band still measures
1996            // per-run variability. Non-Rate metrics summarize their samples.
1997            let verdict = match m.kind {
1998                MetricKind::Rate {
1999                    numerator,
2000                    denominator,
2001                } => {
2002                    let pooled = |rows: &[GauntletRow]| -> Option<f64> {
2003                        // Sum num/den only over runs that carry BOTH components,
2004                        // so a run missing one cannot skew the pooled ratio (the
2005                        // per-run rate is undefined for it anyway).
2006                        let (num, den) = rows.iter().fold((0.0, 0.0), |(sn, sd), r| {
2007                            match (r.ext_metrics.get(numerator), r.ext_metrics.get(denominator)) {
2008                                (Some(n), Some(d)) => (sn + n, sd + d),
2009                                _ => (sn, sd),
2010                            }
2011                        });
2012                        (den != 0.0).then(|| num / den)
2013                    };
2014                    // A zero pooled denominator means no rate to compare on that
2015                    // side — skip (matches the per-run derivation guard).
2016                    let (Some(a_pooled), Some(b_pooled)) = (pooled(a_rows), pooled(b_rows)) else {
2017                        continue;
2018                    };
2019                    noise_verdict_from(
2020                        SideSummary::of(&a_vals).with_pooled_mean(a_pooled),
2021                        SideSummary::of(&b_vals).with_pooled_mean(b_pooled),
2022                        spread_threshold_pct,
2023                    )
2024                }
2025                _ => noise_verdict(&a_vals, &b_vals, spread_threshold_pct),
2026            };
2027            // This metric produced a comparable verdict, so a declared gate on
2028            // it WAS evaluated — record it so the post-loop diff does not flag it
2029            // as never-evaluated. Recorded here (not at the empty-values / zero-
2030            // denominator `continue`s above) so those absent-data cases DO
2031            // surface as unmatched declared gates.
2032            consulted.insert(m.name);
2033            // Aggregate (whole-run) declared assertion for this test+metric
2034            // (phase: None). HEAD (B) side is authoritative on the declaration.
2035            let assertion = b_rows.first().and_then(|r| {
2036                r.perf_delta_assertions
2037                    .iter()
2038                    .find(|x| x.metric == m.name && x.phase.is_none())
2039            });
2040            let gated_by_assertion = assertion.is_some();
2041            let Some(kind) = classify_noise(&verdict, m, assertion, include_stable) else {
2042                continue;
2043            };
2044            findings.push(NoiseFinding {
2045                pairing_label: pairing_label.clone(),
2046                metric: m,
2047                verdict,
2048                kind,
2049                gated_by_assertion,
2050            });
2051        }
2052        // Per-phase sub-pass: mirror the aggregate spread for each matched
2053        // (step_index, metric), surfacing one-sided phases/metrics as coverage
2054        // rows. Render-only — never contributes to the gate exit.
2055        noise_phase_findings(
2056            a_rows,
2057            b_rows,
2058            &pairing_label,
2059            spread_threshold_pct,
2060            include_stable,
2061            &mut phase_findings,
2062            &mut phase_coverage,
2063            &mut consulted_phase,
2064        );
2065        // Diff the declared gates (HEAD/B authoritative) against what actually
2066        // evaluated this group. A whole-run gate (`phase: None`) matches when its
2067        // metric produced an aggregate verdict; a phase-scoped gate matches when
2068        // its `(phase, metric)` produced a per-phase verdict. Anything left is a
2069        // gate the author declared that silently never fired — surfaced, never
2070        // gated.
2071        if let Some(first) = b_rows.first() {
2072            for a in &first.perf_delta_assertions {
2073                let matched = match a.phase {
2074                    None => consulted.contains(a.metric.as_str()),
2075                    Some(step) => consulted_phase.contains(&(step, a.metric.as_str())),
2076                };
2077                if !matched {
2078                    assertion_coverage.push(NoiseAssertionCoverage {
2079                        pairing_label: pairing_label.clone(),
2080                        assertion: a.clone(),
2081                    });
2082                }
2083            }
2084        }
2085    }
2086    NoiseReport {
2087        findings,
2088        phase_findings,
2089        phase_coverage,
2090        assertion_coverage,
2091        paired_scenarios,
2092    }
2093}
2094
2095/// Classify one metric's [`NoiseVerdict`] into a [`NoiseKind`], shared by the
2096/// aggregate and per-phase noise passes. Returns `None` to omit the row: both
2097/// sides ~zero (no signal), or unchanged/immaterial-and-clean when
2098/// `include_stable` is false (the gate-only path).
2099///
2100/// A `< 2`-sample side (`insufficient_samples`) is a HARD gate that precedes
2101/// significance (Noisy, never a confident regression); the ADVISORY
2102/// `high_spread` flag does NOT gate (that suppression was the signal-inverting
2103/// bug). A row is a confident regression only when SEPARATED (Welch or disjoint
2104/// bands) AND MATERIAL (the registry dual-gate) in the worsening polarity.
2105fn classify_noise(
2106    verdict: &NoiseVerdict,
2107    m: &MetricDef,
2108    assertion: Option<&crate::test_support::PerfDeltaAssertionRecord>,
2109    include_stable: bool,
2110) -> Option<NoiseKind> {
2111    // No signal on either side (both ~zero): skip. Same zero epsilon as the
2112    // spread ratio for one consistent "is this zero".
2113    if verdict.a.mean.abs() < ZERO_MEAN_EPS && verdict.b.mean.abs() < ZERO_MEAN_EPS {
2114        return None;
2115    }
2116    // HARD gate: a side realized < 2 samples, so variance / Welch are undefined.
2117    // Precedes significance — never a confident regression. (The ADVISORY
2118    // high_spread flag, by contrast, does NOT gate.)
2119    if verdict.insufficient_samples {
2120        return Some(NoiseKind::Noisy);
2121    }
2122    // MATERIALITY: mirror the scalar dual-gate (push_scalar_findings ~L899) so
2123    // noise and default modes agree on "is this delta large enough". A
2124    // statistically-separated but trivially-small move stays Stable. --noise-adjust
2125    // conflicts with --threshold/--policy (cli.rs), so the registry defaults ARE
2126    // the resolved thresholds (an empty ComparisonPolicy::rel_threshold returns
2127    // default_rel); reading them from `m` directly is equivalent, no policy needed.
2128    // For a Rate metric, `mean` is the pooled Σnum/Σden centroid (the
2129    // registry-authoritative cross-run value), while the Welch separation arm
2130    // reads `sample_mean` (mean-of-ratios, coherent with `var`). Materiality AND
2131    // the direction label below authoritatively follow `mean`; the two statistics
2132    // can order the sides oppositely only under orders-of-magnitude denominator
2133    // skew between runs of one config (physically implausible), where the pooled
2134    // direction is the correct label anyway.
2135    let a = verdict.a.mean;
2136    let b = verdict.b.mean;
2137    let delta = b - a;
2138    let rel_delta = if a.abs() > ZERO_MEAN_EPS {
2139        (delta / a).abs()
2140    } else if delta.abs() > ZERO_MEAN_EPS {
2141        // Non-negligible move from a ~zero baseline: unbounded relative change,
2142        // so the absolute gate alone decides (mirrors the scalar path).
2143        f64::INFINITY
2144    } else {
2145        0.0
2146    };
2147    // A declared PerfDeltaAssertion OVERRIDES the registry gate for THIS
2148    // (test, metric): an absolute floor (`min_abs`), a relative threshold
2149    // (`max_regression_pct`), and/or a pinned direction. Absent (None) =>
2150    // registry defaults, so the no-assertion path is byte-identical to the
2151    // default gate. A tighter declared threshold turns a default-Stable move
2152    // into a Regression; a pinned direction can assert a registry-Informational
2153    // metric.
2154    //
2155    // `min_abs` / `max_regression_pct` come from a pub serde
2156    // `PerfDeltaAssertionRecord`. `KtstrTestEntry::validate` rejects a negative
2157    // or NaN threshold on the entry-construction path, but a hand-edited or
2158    // stale sidecar could deserialize one — and `delta.abs()` / `rel_delta` are
2159    // non-negative, so a NEGATIVE gate makes `material` unconditionally true (a
2160    // phantom confident regression that flips the exit) while a NaN gate makes
2161    // every `>=` false (silently disabled). Reject out-of-range values here and
2162    // fall back to the registry default — symmetric with the `TargetValue`
2163    // direction guard below, defending the same untrusted deserialization path.
2164    let abs_gate = assertion
2165        .and_then(|x| x.min_abs)
2166        .filter(|v| v.is_finite() && *v >= 0.0)
2167        .unwrap_or(m.default_abs);
2168    let rel_gate = assertion
2169        .and_then(|x| x.max_regression_pct)
2170        .filter(|v| v.is_finite() && *v >= 0.0)
2171        .map(|pct| pct / 100.0)
2172        .unwrap_or(m.default_rel);
2173    let material = delta.abs() >= abs_gate && rel_delta >= rel_gate;
2174    // A declared direction override, else the registry polarity. `TargetValue`
2175    // is rejected at the `PerfDeltaAssertion::with_direction` builder, so a
2176    // validated entry never carries it — but `PerfDeltaAssertionRecord` is a
2177    // pub serde type, so a hand-edited or stale sidecar could deserialize a
2178    // `direction: TargetValue`. Symmetric target-distance gating is
2179    // unimplemented (the polarity path would misread it as increase-is-worse),
2180    // so ignore it here and inherit the registry polarity — matching the
2181    // entry-path guarantee on the sidecar-deserialization path.
2182    let classify = match assertion.and_then(|x| x.direction) {
2183        Some(crate::test_support::Polarity::TargetValue(_)) | None => m.classify_direction(),
2184        Some(p) => p.classify_direction(),
2185    };
2186
2187    Some(if verdict.separated && material {
2188        match classify {
2189            // Directionless metric: a separated + material move with no good/bad
2190            // direction — shown, never fails the gate.
2191            None => NoiseKind::Informational,
2192            Some(higher_is_worse) => {
2193                // Polarity split by the SIGN of the mean delta (b vs a), NOT a
2194                // band position: separation can come from the Welch arm even when
2195                // b.mean sits inside a's [min, max] band.
2196                let worsened = if higher_is_worse { b > a } else { b < a };
2197                if worsened {
2198                    NoiseKind::Regression
2199                } else {
2200                    NoiseKind::Improvement
2201                }
2202            }
2203        }
2204    } else if include_stable {
2205        NoiseKind::Stable
2206    } else {
2207        return None; // unchanged/immaterial + clean: omit (gate-only path)
2208    })
2209}
2210
2211/// Push one one-sided per-phase metric to `coverage` — a metric present in a
2212/// matched-`step_index` bucket on one side only. `value` is the present
2213/// side's per-side mean across its runs (the absent side has none).
2214fn push_noise_phase_coverage(
2215    coverage: &mut Vec<NoisePhaseCoverage>,
2216    pairing_label: &str,
2217    step_index: u16,
2218    label: &str,
2219    metric: &'static MetricDef,
2220    present_side: ComparePartition,
2221    vals: &[f64],
2222) {
2223    if vals.is_empty() {
2224        return;
2225    }
2226    let value = vals.iter().sum::<f64>() / vals.len() as f64;
2227    coverage.push(NoisePhaseCoverage {
2228        pairing_label: pairing_label.to_string(),
2229        step_index,
2230        label: label.to_string(),
2231        metric: Some(metric),
2232        present_side,
2233        value: Some(value),
2234    });
2235}
2236
2237/// Surface every non-suppressed metric of a whole one-sided `step_index` (a
2238/// phase present on only one side's runs) as [`NoisePhaseCoverage`] rows, so a
2239/// scenario-shape asymmetry is not silently dropped.
2240fn push_noise_unpaired_step(
2241    coverage: &mut Vec<NoisePhaseCoverage>,
2242    pairing_label: &str,
2243    step_index: u16,
2244    side: ComparePartition,
2245    buckets: &[&crate::assert::PhaseBucket],
2246) {
2247    let label = buckets[0].label.clone();
2248    let names: std::collections::BTreeSet<&str> = buckets
2249        .iter()
2250        .flat_map(|p| p.metrics.keys())
2251        .map(String::as_str)
2252        .collect();
2253    let before = coverage.len();
2254    for name in names {
2255        if is_render_suppressed_component(name) {
2256            continue;
2257        }
2258        let Some(m) = metric_def(name) else {
2259            continue;
2260        };
2261        let vals: Vec<f64> = buckets
2262            .iter()
2263            .filter_map(|p| p.metrics.get(name).copied())
2264            .collect();
2265        push_noise_phase_coverage(coverage, pairing_label, step_index, &label, m, side, &vals);
2266    }
2267    if coverage.len() == before {
2268        // A one-sided phase with no readable (non-suppressed) metric — a
2269        // synthesized capture-free step can carry an empty metrics map. Surface
2270        // the empty shape (metric/value None -> rendered with `—`) so it is not
2271        // silently dropped.
2272        coverage.push(NoisePhaseCoverage {
2273            pairing_label: pairing_label.to_string(),
2274            step_index,
2275            label,
2276            metric: None,
2277            present_side: side,
2278            value: None,
2279        });
2280    }
2281}
2282
2283/// Per-phase noise sub-pass for one matched `(a_rows, b_rows)` pair, over the
2284/// N per-run [`crate::assert::PhaseBucket`]s per side. For each matched
2285/// `step_index` it walks the union of metric names
2286/// and emits a [`NoisePhaseFinding`] per `(step, metric)` present on BOTH sides
2287/// (spread verdict via the same machinery as the aggregate pass, incl. the
2288/// pooled-`Σnum/Σden` Rate centroid — here summed WITHIN the phase from
2289/// `bucket.metrics`), or a [`NoisePhaseCoverage`] for a one-sided metric or a
2290/// whole one-sided `step_index`. Skips the pair unless BOTH sides have at least
2291/// one run carrying phases (single-phase scenarios). Render-only: nothing here
2292/// contributes to the gate exit.
2293#[allow(clippy::too_many_arguments)]
2294fn noise_phase_findings(
2295    a_rows: &[GauntletRow],
2296    b_rows: &[GauntletRow],
2297    pairing_label: &str,
2298    spread_threshold_pct: f64,
2299    include_stable: bool,
2300    findings: &mut Vec<NoisePhaseFinding>,
2301    coverage: &mut Vec<NoisePhaseCoverage>,
2302    consulted_phase: &mut std::collections::BTreeSet<(u16, &'static str)>,
2303) {
2304    use std::collections::BTreeSet;
2305    // Single-phase scenarios carry empty phases on every run; skip the per-phase
2306    // view if either side has no run with phases.
2307    let has_phases = |rows: &[GauntletRow]| rows.iter().any(|r| !r.phases.is_empty());
2308    if !has_phases(a_rows) || !has_phases(b_rows) {
2309        return;
2310    }
2311    // A nested fn (not a closure) so the elided output lifetime ties the
2312    // borrowed &PhaseBucket to `rows` — a closure can't express that linkage.
2313    fn by_step(
2314        rows: &[GauntletRow],
2315    ) -> std::collections::BTreeMap<u16, Vec<&crate::assert::PhaseBucket>> {
2316        let mut m: std::collections::BTreeMap<u16, Vec<&crate::assert::PhaseBucket>> =
2317            std::collections::BTreeMap::new();
2318        for r in rows {
2319            for p in &r.phases {
2320                m.entry(p.step_index).or_default().push(p);
2321            }
2322        }
2323        m
2324    }
2325    let a_by_step = by_step(a_rows);
2326    let b_by_step = by_step(b_rows);
2327    let steps: BTreeSet<u16> = a_by_step.keys().chain(b_by_step.keys()).copied().collect();
2328    for step_index in steps {
2329        match (a_by_step.get(&step_index), b_by_step.get(&step_index)) {
2330            (Some(a_buckets), Some(b_buckets)) => {
2331                let label = a_buckets[0].label.clone();
2332                let names: BTreeSet<&str> = a_buckets
2333                    .iter()
2334                    .chain(b_buckets.iter())
2335                    .flat_map(|p| p.metrics.keys())
2336                    .map(String::as_str)
2337                    .collect();
2338                for name in names {
2339                    if is_render_suppressed_component(name) {
2340                        continue;
2341                    }
2342                    let Some(m) = metric_def(name) else {
2343                        continue;
2344                    };
2345                    let a_vals: Vec<f64> = a_buckets
2346                        .iter()
2347                        .filter_map(|p| p.metrics.get(name).copied())
2348                        .collect();
2349                    let b_vals: Vec<f64> = b_buckets
2350                        .iter()
2351                        .filter_map(|p| p.metrics.get(name).copied())
2352                        .collect();
2353                    match (a_vals.is_empty(), b_vals.is_empty()) {
2354                        (true, true) => continue,
2355                        // Present on only one matched side: no band on the other
2356                        // — coverage, not a delta.
2357                        (false, true) => {
2358                            push_noise_phase_coverage(
2359                                coverage,
2360                                pairing_label,
2361                                step_index,
2362                                &label,
2363                                m,
2364                                ComparePartition::A,
2365                                &a_vals,
2366                            );
2367                            continue;
2368                        }
2369                        (true, false) => {
2370                            push_noise_phase_coverage(
2371                                coverage,
2372                                pairing_label,
2373                                step_index,
2374                                &label,
2375                                m,
2376                                ComparePartition::B,
2377                                &b_vals,
2378                            );
2379                            continue;
2380                        }
2381                        (false, false) => {}
2382                    }
2383                    let verdict = match m.kind {
2384                        MetricKind::Rate {
2385                            numerator,
2386                            denominator,
2387                        } => {
2388                            // Per-phase Rate: pooled Σnum/Σden WITHIN the phase
2389                            // from bucket.metrics (phase-derivable rates carry
2390                            // both components per phase), band from per-run ratios.
2391                            let pooled = |buckets: &[&crate::assert::PhaseBucket]| -> Option<f64> {
2392                                let (num, den) = buckets.iter().fold((0.0, 0.0), |(sn, sd), p| {
2393                                    match (p.metrics.get(numerator), p.metrics.get(denominator)) {
2394                                        (Some(n), Some(d)) => (sn + n, sd + d),
2395                                        _ => (sn, sd),
2396                                    }
2397                                });
2398                                (den != 0.0).then(|| num / den)
2399                            };
2400                            let (Some(a_pooled), Some(b_pooled)) =
2401                                (pooled(a_buckets), pooled(b_buckets))
2402                            else {
2403                                continue;
2404                            };
2405                            noise_verdict_from(
2406                                SideSummary::of(&a_vals).with_pooled_mean(a_pooled),
2407                                SideSummary::of(&b_vals).with_pooled_mean(b_pooled),
2408                                spread_threshold_pct,
2409                            )
2410                        }
2411                        _ => noise_verdict(&a_vals, &b_vals, spread_threshold_pct),
2412                    };
2413                    // This (step, metric) produced a comparable verdict, so a
2414                    // phase-scoped gate on it WAS evaluated (see the aggregate
2415                    // pass's `consulted`).
2416                    consulted_phase.insert((step_index, m.name));
2417                    // Phase-scoped declared assertion for this test+metric+step.
2418                    let assertion = b_rows.first().and_then(|r| {
2419                        r.perf_delta_assertions
2420                            .iter()
2421                            .find(|x| x.metric == m.name && x.phase == Some(step_index))
2422                    });
2423                    let gated_by_assertion = assertion.is_some();
2424                    let Some(kind) = classify_noise(&verdict, m, assertion, include_stable) else {
2425                        continue;
2426                    };
2427                    findings.push(NoisePhaseFinding {
2428                        pairing_label: pairing_label.to_string(),
2429                        step_index,
2430                        label: label.clone(),
2431                        metric: m,
2432                        verdict,
2433                        kind,
2434                        gated_by_assertion,
2435                    });
2436                }
2437            }
2438            (Some(a_buckets), None) => {
2439                push_noise_unpaired_step(
2440                    coverage,
2441                    pairing_label,
2442                    step_index,
2443                    ComparePartition::A,
2444                    a_buckets,
2445                );
2446            }
2447            (None, Some(b_buckets)) => {
2448                push_noise_unpaired_step(
2449                    coverage,
2450                    pairing_label,
2451                    step_index,
2452                    ComparePartition::B,
2453                    b_buckets,
2454                );
2455            }
2456            (None, None) => {}
2457        }
2458    }
2459}
2460
2461/// Summarize a side's loaded runs by why each is or isn't comparable, using the
2462/// SAME exclusions [`noise_findings`] applies (skip / fail / inconclusive /
2463/// expected-failure). Returns `(comparable_count, human_summary)` so an empty
2464/// comparison can explain WHY — e.g. every run was skipped — instead of the bare
2465/// "nothing to compare". The `comparable_count` equals the number of rows
2466/// `noise_findings` would keep (a row is comparable iff none of the four
2467/// exclusions hold), so a zero here is exactly why the side produced no findings.
2468pub(crate) fn summarize_side_runs(rows: &[GauntletRow]) -> (usize, String) {
2469    let (mut skipped, mut failed, mut inconclusive, mut xfail, mut comparable) =
2470        (0usize, 0usize, 0usize, 0usize, 0usize);
2471    for r in rows {
2472        if r.is_skip() {
2473            skipped += 1;
2474        } else if r.is_fail() {
2475            failed += 1;
2476        } else if r.is_inconclusive() {
2477            inconclusive += 1;
2478        } else if r.expected_failure {
2479            xfail += 1;
2480        } else {
2481            comparable += 1;
2482        }
2483    }
2484    let mut excluded = Vec::new();
2485    if skipped > 0 {
2486        excluded.push(format!("{skipped} skipped"));
2487    }
2488    if failed > 0 {
2489        excluded.push(format!("{failed} failed"));
2490    }
2491    if inconclusive > 0 {
2492        excluded.push(format!("{inconclusive} inconclusive"));
2493    }
2494    if xfail > 0 {
2495        excluded.push(format!("{xfail} expected-failure"));
2496    }
2497    let breakdown = if excluded.is_empty() {
2498        "none excluded".to_string()
2499    } else {
2500        excluded.join(", ")
2501    };
2502    (
2503        comparable,
2504        format!(
2505            "{} run(s): {comparable} comparable ({breakdown})",
2506            rows.len()
2507        ),
2508    )
2509}
2510
2511/// Noise-adjusted variant of [`compare_partitions`]: instead of averaging each
2512/// side's runs into one mean and gating on a fixed threshold, it keeps every run
2513/// ([`RowPrep::PerRunPooled`]), summarizes each side per metric, and decides
2514/// whether the two sides are distinguishable given their run-to-run variability
2515/// (see [`noise_findings`] for the row-level core + [`noise_verdict`] for the
2516/// per-metric decision). Used by `perf-delta --noise-adjust N`, which produces N
2517/// runs per side. A metric is a CONFIDENT REGRESSION (fed to the operator
2518/// failure gate below) when it is SEPARATED (a two-sided Welch t-test rejects equal means at
2519/// `NOISE_ALPHA`, OR the `[min, max]` bands are fully disjoint) AND MATERIAL (the
2520/// mean delta clears both the registry `default_abs` and `default_rel`, the same
2521/// dual-gate as the scalar path) in the worsening direction (per polarity). A
2522/// side that realized fewer than 2 runs is flagged `NOISY` and never gates; a
2523/// per-side relative spread over `spread_threshold_pct` is an ADVISORY
2524/// `(noisy spread)` annotation that NEVER suppresses a verdict. By default prints
2525/// only the MEANINGFUL rows (confident regression / improvement / informational),
2526/// each with its side's `mean [min-max] spread%` and verdict; stable (unchanged +
2527/// clean) and noisy (<2-run) rows are hidden unless `--all-metrics`, and a
2528/// one-line summary prints when every row is suppressed. The footer leads with an
2529/// overall verdict word ([`overall_verdict`]: `STABLE` unless the regressed or
2530/// improved count clears the significance cutoff — sub-cutoff moves are flagged
2531/// but likely noise) followed by the composite counts (regressed / improved /
2532/// stable / under-sampled, plus informational when present); and, when every
2533/// changed (non-stable) metric had a side with <2 usable runs, an explicit
2534/// inconclusive note.
2535///
2536/// When the scenarios carry phases, a per-phase spread block follows the
2537/// aggregate table (via [`format_noise_phase_findings_lines`]): the same
2538/// spread verdict per matched `(step_index, metric)`, plus a coverage table for
2539/// one-sided phases/metrics. Like the aggregate table, the per-phase block shows
2540/// only MEANINGFUL rows by default (regression / improvement / informational) and
2541/// hides stable / noisy rows unless `--all-metrics` (`gate.show_all`); whenever the
2542/// spread rows are all suppressed it surfaces a one-line hint naming
2543/// `--all-metrics` (even alongside a coverage table), so the suppressed rows are
2544/// never silently gone. The coverage (one-sided) table is itself never
2545/// suppressed. `phase_opts` controls the per-phase
2546/// render only (`--no-phases` / `--phases-only` / `--steps-only` / `--phase` /
2547/// `--phase-threshold`); under `--phases-only` the aggregate table is
2548/// suppressed. Per-phase SPREAD findings are RENDER-ONLY — classified and
2549/// colored but not gating — EXCEPT a
2550/// per-phase regression the author explicitly declared a phase-scoped gate for,
2551/// which DOES contribute to the exit alongside the operator gate on the aggregate
2552/// confident regressions ([`gate_fails`]: their count reaches `fail_threshold`
2553/// [default 5], or a `must_fail` metric regressed).
2554/// The footer appends per-phase counts only when per-phase data exists and no
2555/// phase filter is active.
2556///
2557/// When no scenario pairs, the aggregate note breaks down each side's loaded
2558/// runs via [`summarize_side_runs`] — naming skipped / failed / inconclusive
2559/// runs — so an all-skipped comparison (e.g. a non-`performance_mode` test under
2560/// perf-delta's `KTSTR_PERF_ONLY`) is explained, not silently reported as
2561/// "nothing to compare".
2562pub fn compare_partitions_noise(
2563    filter_a: &RowFilter,
2564    filter_b: &RowFilter,
2565    dir: Option<&std::path::Path>,
2566    spread_threshold_pct: f64,
2567    phase_opts: &PhaseDisplayOptions,
2568    gate: &GateOptions,
2569) -> anyhow::Result<i32> {
2570    // Keep every per-run row INCLUDING duplicate pairing keys so the
2571    // run-to-run spread is observable: noise_findings groups the N runs
2572    // per key per side. PerRunPooled keeps every per-run row including
2573    // duplicate pairing keys — the N-per-key spread is the intended input.
2574    let prepared = prepare_partitioned_comparison(filter_a, filter_b, dir, RowPrep::PerRunPooled)?;
2575    let label_a = render_side_label(filter_a, &prepared.slicing_dims, "A");
2576    let label_b = render_side_label(filter_b, &prepared.slicing_dims, "B");
2577
2578    let report = noise_findings(
2579        &prepared.rows_a_for_compare,
2580        &prepared.rows_b_for_compare,
2581        &prepared.pairing_dims,
2582        spread_threshold_pct,
2583        // Include Stable (unchanged + clean) metrics so the rendered table
2584        // shows the full comparison, not just changed/noisy rows.
2585        true,
2586    );
2587
2588    println!(
2589        "perf-delta --noise-adjust: {label_b} vs {label_a} (advisory noisy-spread threshold {spread_threshold_pct:.2}%)"
2590    );
2591    // Aggregate spread table — suppressed under --phases-only (renders ONLY
2592    // the per-phase block).
2593    if !phase_opts.phases_only {
2594        if report.findings.is_empty() {
2595            // Distinguish "nothing paired" from "paired but every metric omitted"
2596            // so the message never contradicts the paired-scenario footer below.
2597            // A metric is omitted when both sides' means are ~zero, it read on
2598            // only one side, or it is a render-suppressed rate component.
2599            if report.paired_scenarios == 0 {
2600                // Explain WHY nothing paired instead of a bare "nothing to
2601                // compare": prepare_partitioned_comparison already bailed if a
2602                // side loaded ZERO rows, so both sides have >=1 run here — the
2603                // pairing failure is either all-excluded runs (skip/fail/etc.)
2604                // or a genuine scenario mismatch. Break down each side so a
2605                // skipped-on-both-sides run (the common perf-delta case: a
2606                // non-performance_mode test under KTSTR_PERF_ONLY) is named.
2607                let (a_ok, a_desc) = summarize_side_runs(&prepared.rows_a_for_compare);
2608                let (b_ok, b_desc) = summarize_side_runs(&prepared.rows_b_for_compare);
2609                println!(
2610                    "perf-delta --noise-adjust: no comparable runs to pair across the two runs."
2611                );
2612                println!("  {label_a} — {a_desc}");
2613                println!("  {label_b} — {b_desc}");
2614                if a_ok == 0 || b_ok == 0 {
2615                    println!(
2616                        "  A skipped run carries no metrics: perf-delta runs with \
2617                         KTSTR_PERF_ONLY, which skips any test not marked \
2618                         #[ktstr_test(performance_mode = true)]; host-gated skips land here \
2619                         too. A failed / inconclusive run is excluded from the spread math."
2620                    );
2621                } else {
2622                    println!(
2623                        "  Both sides produced comparable runs but share no scenario / topology \
2624                         / work_type — the two selections have no common test to contrast."
2625                    );
2626                }
2627            } else {
2628                println!(
2629                    "perf-delta --noise-adjust: no metric to display — every compared metric \
2630                     was unchanged at zero, present on only one side, or render-suppressed"
2631                );
2632            }
2633        } else {
2634            print!(
2635                "{}",
2636                format_noise_findings_table(&report.findings, &label_a, &label_b, gate.show_all)
2637            );
2638        }
2639    }
2640    // Per-phase spread block — render-only (never gates), honoring --no-phases /
2641    // --phase / --steps-only / --phase-threshold. Under --phases-only it is the
2642    // ONLY table, so emit an explicit note rather than a silent blank.
2643    let phase_lines = format_noise_phase_findings_lines(
2644        &report.phase_findings,
2645        &report.phase_coverage,
2646        phase_opts,
2647        &label_a,
2648        &label_b,
2649        gate.show_all,
2650    );
2651    if phase_lines.is_empty() && phase_opts.phases_only {
2652        println!(
2653            "perf-delta --noise-adjust: no per-phase noise data to show (no matched \
2654             multi-phase scenario at the selected step, or every per-phase row was \
2655             filtered by --phase / --steps-only / --phase-threshold)"
2656        );
2657    }
2658    for line in phase_lines {
2659        println!("{line}");
2660    }
2661    // Declared gates that never evaluated (metric absent from the compared
2662    // data). Rendered regardless of --phases-only — a silently-inert declared
2663    // gate is important whether or not the aggregate table is shown. Never
2664    // gates the exit.
2665    for line in format_noise_assertion_coverage_lines(&report.assertion_coverage) {
2666        println!("{line}");
2667    }
2668    let regressions = report.regressions();
2669    // Per-phase regressions the author explicitly DECLARED a phase-scoped gate
2670    // for — these gate the exit alongside the aggregate regressions (spread-only
2671    // per-phase findings stay render-only). Computed unconditionally so the exit
2672    // is identical whether or not the phase footer is shown.
2673    let declared_phase_regressions = report.declared_phase_regressions();
2674    // The aggregate summary footer describes the (hidden) aggregate spread, so
2675    // suppress it under --phases-only — which renders ONLY the per-phase
2676    // block.
2677    // The exit still gates on gate_fails(aggregate regressions) +
2678    // `declared_phase_regressions` (computed above) when the footer is hidden.
2679    if !phase_opts.phases_only {
2680        let noisy = report.noisy();
2681        // Composite verdict counts: regressed / improved LEAD (the table shows
2682        // each with its magnitude), stable is the expected residual cited as a
2683        // count only, and informational (changed but no polarity) is cited only
2684        // when present. All render-only — the exit reads gate_fails + declared
2685        // gates, never these.
2686        let improvements = report.improvements();
2687        let stable = report.stable();
2688        let informational = report.informational();
2689        let info_clause = if informational > 0 {
2690            format!(", {informational} informational")
2691        } else {
2692            String::new()
2693        };
2694        // Overall verdict word: STABLE unless a direction clears the cutoff (see
2695        // overall_verdict). Sub-cutoff moves are flagged in the counts above but
2696        // do not shift the verdict off STABLE.
2697        let verdict = overall_verdict(&report, gate);
2698        let stable_note = if verdict == "STABLE" && regressions + improvements > 0 {
2699            " (moves are below the significance cutoff, more likely noise than signal)"
2700        } else {
2701            ""
2702        };
2703        // Per-phase footer counts are shown ONLY when there IS per-phase data
2704        // AND no phase filter is active (no --no-phases / --phase / --steps-only
2705        // / --phase-threshold) — so the counts always match the fully-rendered
2706        // per-phase table, and a single-phase run (no per-phase view) shows no
2707        // confusing 0/0 per-phase clause. Uses the same !any-flag-set
2708        // discipline as the aggregate paired-scenario hint.
2709        let has_phase_data = !report.phase_findings.is_empty() || !report.phase_coverage.is_empty();
2710        let show_phase_footer = has_phase_data
2711            && !phase_opts.no_phases
2712            && phase_opts.phase.is_none()
2713            && !phase_opts.steps_only
2714            && phase_opts.phase_threshold.is_none();
2715        // A REGRESSED verdict with ZERO aggregate regressions can only come from a
2716        // declared PER-PHASE gate (a declared whole-run regression and --must-fail
2717        // both require an aggregate Regression, so regressions >= 1 there). When
2718        // the phase footer below is hidden (a phase filter is active) name the
2719        // source so `overall REGRESSED: 0 regressed` is not self-contradictory;
2720        // when the phase footer shows, its "(declared-gated, exit-affecting)"
2721        // clause already explains it, so skip the redundant note.
2722        let verdict_source =
2723            if verdict.starts_with("REGRESSED") && regressions == 0 && !show_phase_footer {
2724                " (regression is a declared per-phase gate)"
2725            } else {
2726                ""
2727            };
2728        let phase_footer = if show_phase_footer {
2729            // Per-phase regressions are render-only EXCEPT the declared-gated
2730            // ones, which gate the exit — call that out so the count is not read
2731            // as fully render-only when a declared phase gate fired.
2732            let declared_note = if declared_phase_regressions > 0 {
2733                format!(" ({declared_phase_regressions} declared-gated, exit-affecting)")
2734            } else {
2735                " (render-only)".to_string()
2736            };
2737            format!(
2738                "; {} per-phase regression(s){declared_note}, {} per-phase under-sampled (<2 runs)",
2739                report.phase_regressions(),
2740                report.phase_noisy(),
2741            )
2742        } else {
2743            String::new()
2744        };
2745        // Declared gates that never evaluated — a single-line summary of the
2746        // warning block above (shown whenever any gate went unevaluated).
2747        let unevaluated_gates = report.assertion_coverage.len();
2748        let gate_footer = if unevaluated_gates > 0 {
2749            format!("; {unevaluated_gates} declared gate(s) not evaluated")
2750        } else {
2751            String::new()
2752        };
2753        println!(
2754            "perf-delta --noise-adjust: {} paired scenario(s); overall {verdict}: \
2755             {regressions} regressed, {improvements} improved, {stable} stable{info_clause}, \
2756             {noisy} under-sampled (<2 runs){stable_note}{verdict_source}{phase_footer}{gate_footer}",
2757            report.paired_scenarios,
2758        );
2759    }
2760    // Inconclusive: every CHANGED aggregate metric (excluding Stable rows, which
2761    // never gate) had a side with < 2 usable runs — no trustworthy signal either
2762    // way. Surfaced prominently for CI logs, but noise FLAGS, not FAILS, so the
2763    // exit stays 0 unless a confident AGGREGATE regression fired. Suppressed
2764    // under --phases-only (the aggregate table is hidden there).
2765    if !phase_opts.phases_only {
2766        let changed: Vec<&NoiseFinding> = report
2767            .findings
2768            .iter()
2769            .filter(|f| f.kind != NoiseKind::Stable)
2770            .collect();
2771        if !changed.is_empty() && changed.iter().all(|f| f.kind == NoiseKind::Noisy) {
2772            println!(
2773                "perf-delta --noise-adjust: NOTE -- every changed metric had a side with <2 usable \
2774                 runs; raise --noise-adjust N (or investigate why per-side runs failed) for a \
2775                 trustworthy verdict"
2776            );
2777        }
2778    }
2779    // Exit gates on AGGREGATE regressions plus DECLARED phase regressions.
2780    // Spread-only per-phase findings stay render-only (parity with the scalar
2781    // per-phase pass — a narrow-window phase flake must not flip CI red), but a
2782    // phase-scoped gate the author explicitly declared is an opt-in and DOES
2783    // gate (matches the `PerfDeltaAssertion::phase` doc).
2784    // Operator gate: the count / must-fail gate applies to the UNdeclared
2785    // aggregate confident regressions; an author-DECLARED gate — whole-run OR
2786    // phase-scoped — always fails (its own opt-in, orthogonal to the operator's
2787    // count gate).
2788    Ok(noise_exit_code(&report, gate))
2789}
2790
2791/// Exit code for the noise-adjusted compare. Fails (`1`) when EITHER the
2792/// operator gate ([`gate_fails`]) trips on the aggregate confident regressions,
2793/// OR any author-DECLARED regression is present — whole-run
2794/// ([`NoiseReport::declared_regressions`]) or phase-scoped
2795/// ([`NoiseReport::declared_phase_regressions`]). A declared assertion is a
2796/// per-test opt-in that ALWAYS gates on its metric, independent of the
2797/// operator's count / must-fail gate, so a single declared regression fails
2798/// even below `--fail-threshold`. Extracted so the exit decision is
2799/// unit-testable.
2800pub(crate) fn noise_exit_code(report: &NoiseReport, gate: &GateOptions) -> i32 {
2801    let regressing: Vec<&str> = report
2802        .findings
2803        .iter()
2804        .filter(|f| f.kind == NoiseKind::Regression)
2805        .map(|f| f.metric.name)
2806        .collect();
2807    if gate_fails(&regressing, gate)
2808        || report.declared_regressions() > 0
2809        || report.declared_phase_regressions() > 0
2810    {
2811        1
2812    } else {
2813        0
2814    }
2815}
2816
2817/// Overall run verdict word for the `--noise-adjust` summary, derived from the
2818/// confident move counts against the significance cutoff (`--fail-threshold`,
2819/// default 5). Sub-cutoff moves in EITHER direction are FLAGGED (shown in the
2820/// table, counted in the footer) but do NOT shift the verdict off `STABLE` —
2821/// below the cutoff a move is more likely noise than signal, which is the whole
2822/// point of the conservative noise gate. A direction only reads into the verdict
2823/// once it clears the cutoff, and both can hold at once (`REGRESSED + IMPROVED`).
2824///
2825/// The regressed side reuses the exit decision ([`noise_exit_code`]) so a
2826/// `--must-fail` or declared gate that fails the run also reads `REGRESSED` even
2827/// below the count cutoff; the improved side has no gate, so it uses the count
2828/// cutoff alone. Display-only — never the exit basis.
2829pub(crate) fn overall_verdict(report: &NoiseReport, gate: &GateOptions) -> &'static str {
2830    verdict_label(
2831        noise_exit_code(report, gate) == 1,
2832        report.improvements(),
2833        gate.fail_threshold,
2834    )
2835}
2836
/// Maps the two verdict inputs onto the overall verdict word.
///
/// * `regressed` — whether the run counts as regressed (decided by the
///   caller).
/// * `improvements` — number of confident improvements.
/// * `fail_threshold` — significance cutoff for the improvement count;
///   `None` means `5`, and a cutoff of `0` means the count can never read
///   as improved.
///
/// Returns one of `"REGRESSED + IMPROVED"`, `"REGRESSED"`, `"IMPROVED"` or
/// `"STABLE"`. Improvements below the cutoff leave the label untouched.
/// Separate from [`overall_verdict`] so the policy can be tested without a
/// full report.
pub(crate) fn verdict_label(
    regressed: bool,
    improvements: usize,
    fail_threshold: Option<usize>,
) -> &'static str {
    let cutoff = fail_threshold.unwrap_or(5);
    // A zero cutoff disables count significance entirely.
    let improved = cutoff != 0 && improvements >= cutoff;
    if regressed {
        if improved {
            "REGRESSED + IMPROVED"
        } else {
            "REGRESSED"
        }
    } else if improved {
        "IMPROVED"
    } else {
        "STABLE"
    }
}
2856
2857/// Compose a noise verdict cell's text: the base label plus any parenthesized,
2858/// comma-joined annotations. Shared by the aggregate and per-phase tables so
2859/// both annotate identically. `noisy spread` (advisory `high_spread`, suppressed
2860/// on a Noisy <2-runs row where it is redundant) and `declared gate` (the row
2861/// carries a declared [`crate::test_support::PerfDeltaAssertion`] — its
2862/// overrides drive the gate, or fall back to the registry defaults if rejected
2863/// as out-of-range on a corrupt sidecar) can co-occur → `REGRESSION (noisy
2864/// spread, declared gate)`.
2865fn compose_noise_verdict_text(
2866    base: &str,
2867    high_spread: bool,
2868    kind: NoiseKind,
2869    gated_by_assertion: bool,
2870) -> String {
2871    let mut annotations: Vec<&str> = Vec::new();
2872    if high_spread && kind != NoiseKind::Noisy {
2873        annotations.push("noisy spread");
2874    }
2875    if gated_by_assertion {
2876        annotations.push("declared gate");
2877    }
2878    if annotations.is_empty() {
2879        base.to_string()
2880    } else {
2881        format!("{base} ({})", annotations.join(", "))
2882    }
2883}
2884
2885/// Render the per-metric noise-adjusted findings as a table for
2886/// `perf-delta --noise-adjust`: one row per (scenario, metric) with each
2887/// side's `mean [min-max] spread%` and the colored verdict. Includes
2888/// [`NoiseKind::Stable`] rows so the operator sees every compared metric,
2889/// not only the changed ones. Pure (returns the rendered string with a
2890/// trailing newline) so the row/verdict mapping is unit-testable without
2891/// capturing stdout.
2892pub(crate) fn format_noise_findings_table(
2893    findings: &[NoiseFinding],
2894    label_a: &str,
2895    label_b: &str,
2896    show_all: bool,
2897) -> String {
2898    use comfy_table::{Cell, Color};
2899    // Default view shows only MEANINGFUL rows (confident regression /
2900    // improvement / informational). Stable (unchanged / immaterial) and
2901    // Noisy (<2 usable runs) rows are hidden unless `show_all`; their COUNTS
2902    // still print in the caller's footer, and the gate reads the full
2903    // classified set, so this suppression is display-only.
2904    let visible: Vec<&NoiseFinding> = findings
2905        .iter()
2906        .filter(|f| show_all || !matches!(f.kind, NoiseKind::Stable | NoiseKind::Noisy))
2907        .collect();
2908    if visible.is_empty() {
2909        if findings.is_empty() {
2910            return String::new();
2911        }
2912        let noisy = findings
2913            .iter()
2914            .filter(|f| f.kind == NoiseKind::Noisy)
2915            .count();
2916        return format!(
2917            "perf-delta --noise-adjust: {} metric(s) compared, none meaningfully changed \
2918             ({noisy} under-sampled); re-run with --all-metrics to see them\n",
2919            findings.len(),
2920        );
2921    }
2922    let mut table = crate::cli::new_table();
2923    table.set_header(vec![
2924        "TEST / METRIC".to_string(),
2925        format!("{label_a} (A: mean [min-max] spread%)"),
2926        format!("{label_b} (B: mean [min-max] spread%)"),
2927        "VERDICT".to_string(),
2928    ]);
2929    for f in visible {
2930        let (base, color) = match f.kind {
2931            NoiseKind::Regression => ("REGRESSION", Color::Red),
2932            NoiseKind::Improvement => ("improvement", Color::Green),
2933            NoiseKind::Noisy => ("NOISY (<2 runs)", Color::Yellow),
2934            NoiseKind::Informational => ("informational", Color::Blue),
2935            NoiseKind::Stable => ("stable", Color::Grey),
2936        };
2937        // Verdict annotations (parenthesized, comma-joined). `noisy spread` is
2938        // ADVISORY (high_spread) — flags a noisy side without changing the
2939        // classification, redundant on a Noisy <2-runs row so omitted there.
2940        // `declared gate` marks a row that CARRIES a declared PerfDeltaAssertion
2941        // (its overrides drive the gate, or fall back to registry defaults if
2942        // rejected as out-of-range on a corrupt sidecar), so the operator can
2943        // tell an author-declared gate from a pure registry-default one.
2944        let verdict_text =
2945            compose_noise_verdict_text(base, f.verdict.high_spread, f.kind, f.gated_by_assertion);
2946        let v = &f.verdict;
2947        table.add_row(vec![
2948            Cell::new(format!("{} / {}", f.pairing_label, f.metric.name)),
2949            Cell::new(format!(
2950                "{:.1} [{:.1}-{:.1}] {:.2}%",
2951                v.a.mean, v.a.min, v.a.max, v.a.spread_pct
2952            )),
2953            Cell::new(format!(
2954                "{:.1} [{:.1}-{:.1}] {:.2}%",
2955                v.b.mean, v.b.min, v.b.max, v.b.spread_pct
2956            )),
2957            Cell::new(verdict_text).fg(color),
2958        ]);
2959    }
2960    format!("{table}\n")
2961}
2962
2963/// One-line description of a declared gate's overrides for the
2964/// not-evaluated warning: the thresholds it WOULD have applied. All-`None`
2965/// (a bare `PerfDeltaAssertion::new(metric)`) renders `registry defaults` —
2966/// a presence-checked gate that inherits the registry `default_abs`/
2967/// `default_rel`/polarity.
2968fn describe_declared_gate(a: &crate::test_support::PerfDeltaAssertionRecord) -> String {
2969    let mut parts: Vec<String> = Vec::new();
2970    if let Some(pct) = a.max_regression_pct {
2971        parts.push(format!("max_regression_pct={pct}"));
2972    }
2973    if let Some(abs) = a.min_abs {
2974        parts.push(format!("min_abs={abs}"));
2975    }
2976    if let Some(dir) = a.direction {
2977        parts.push(format!("direction={dir:?}"));
2978    }
2979    if parts.is_empty() {
2980        "registry defaults".to_string()
2981    } else {
2982        parts.join(", ")
2983    }
2984}
2985
2986/// Render the declared perf gates that never evaluated (a metric absent from
2987/// the compared data) as warning lines for `perf-delta --noise-adjust`: a
2988/// TEST | METRIC | PHASE | DECLARED GATE table, so an author whose declared
2989/// [`crate::test_support::PerfDeltaAssertion`] silently did not fire sees it
2990/// rather than mistaking a not-evaluated gate for a passing one. Never gates
2991/// the exit (a gate that could not evaluate is not a regression). Pure —
2992/// returns the lines (empty when there are none) so the mapping is
2993/// unit-testable without capturing stdout.
2994pub(crate) fn format_noise_assertion_coverage_lines(
2995    coverage: &[NoiseAssertionCoverage],
2996) -> Vec<String> {
2997    use comfy_table::{Cell, Color};
2998    let mut lines = Vec::new();
2999    if coverage.is_empty() {
3000        return lines;
3001    }
3002    let mut rows: Vec<&NoiseAssertionCoverage> = coverage.iter().collect();
3003    rows.sort_by(|a, b| {
3004        a.pairing_label
3005            .cmp(&b.pairing_label)
3006            .then_with(|| a.assertion.metric.cmp(&b.assertion.metric))
3007            .then_with(|| a.assertion.phase.cmp(&b.assertion.phase))
3008    });
3009    lines.push(String::new());
3010    lines.push(
3011        "declared perf gate(s) NOT evaluated — the metric was absent from the compared \
3012         data (workload no longer emits it, a one-sided/failed run, or a Rate with no \
3013         samples), so the declared gate silently did not fire:"
3014            .to_string(),
3015    );
3016    let mut table = crate::cli::new_table();
3017    table.set_header(vec![
3018        "TEST".to_string(),
3019        "METRIC".to_string(),
3020        "PHASE".to_string(),
3021        "DECLARED GATE".to_string(),
3022    ]);
3023    for c in rows {
3024        let phase = match c.assertion.phase {
3025            None => "aggregate".to_string(),
3026            Some(k) => k.to_string(),
3027        };
3028        table.add_row(vec![
3029            Cell::new(&c.pairing_label),
3030            Cell::new(&c.assertion.metric).fg(Color::Yellow),
3031            Cell::new(phase),
3032            Cell::new(describe_declared_gate(&c.assertion)),
3033        ]);
3034    }
3035    lines.push(format!("{table}"));
3036    lines
3037}
3038
3039/// Render the per-phase noise-adjusted findings as lines for
3040/// `perf-delta --noise-adjust`: a per-phase spread table (PHASE | TEST /
3041/// METRIC | A mean[min-max] spread% | B ... | VERDICT) plus a one-sided
3042/// coverage table (SIDE | TEST | PHASE | METRIC | VALUE). The TEST column
3043/// carries the full pairing-key label (scenario plus every pairing dim),
3044/// matching the scalar compare path — a scenario shared across topologies
3045/// renders as distinct rows. Mirrors
3046/// [`format_noise_findings_table`], honoring
3047/// [`PhaseDisplayOptions`] (no_phases / phase / steps_only / phase_threshold).
3048/// Render-only: these rows never gate. Pure — returns the lines (empty when
3049/// suppressed / no per-phase data) so the row/verdict mapping is unit-testable
3050/// without capturing stdout.
3051pub(crate) fn format_noise_phase_findings_lines(
3052    phase_findings: &[NoisePhaseFinding],
3053    phase_coverage: &[NoisePhaseCoverage],
3054    phase_opts: &PhaseDisplayOptions,
3055    label_a: &str,
3056    label_b: &str,
3057    show_all: bool,
3058) -> Vec<String> {
3059    use comfy_table::{Cell, Color};
3060    let mut lines = Vec::new();
3061    if phase_opts.no_phases {
3062        return lines;
3063    }
3064    // Rows passing the phase-axis filters (`--phase` / `--steps-only`) and the
3065    // `--phase-threshold` spread gate, BEFORE the meaningful-only display filter.
3066    // Retained so the collapse summary below can report how many rows the default
3067    // view suppressed.
3068    let phase_filtered: Vec<&NoisePhaseFinding> = phase_findings
3069        .iter()
3070        .filter(|f| phase_opts.matches_phase(f.step_index))
3071        .filter(|f| phase_opts.passes_noise_spread_threshold(&f.verdict))
3072        .collect();
3073    // Default view shows only MEANINGFUL rows (regression / improvement /
3074    // informational); Stable + Noisy rows are hidden unless `show_all`
3075    // (`--all-metrics`), mirroring the aggregate table
3076    // ([`format_noise_findings_table`]). Display-only: the footer still reports
3077    // the per-phase regression / under-sampled COUNTS from the unfiltered report,
3078    // and the exit gate reads the unfiltered findings — so suppressing rows here
3079    // changes neither the counts nor the pass/fail.
3080    let mut findings: Vec<&NoisePhaseFinding> = phase_filtered
3081        .iter()
3082        .copied()
3083        .filter(|f| show_all || !matches!(f.kind, NoiseKind::Stable | NoiseKind::Noisy))
3084        .collect();
3085    let mut coverage: Vec<&NoisePhaseCoverage> = phase_coverage
3086        .iter()
3087        .filter(|c| phase_opts.matches_phase(c.step_index))
3088        .collect();
3089    let had_findings = !findings.is_empty();
3090    // Spread rows existed but were all suppressed as stable/noisy in the default
3091    // view: a one-line hint (naming `--all-metrics`, wording matched to the
3092    // aggregate collapse) keeps the suppression discoverable — surfaced whether
3093    // the block is otherwise empty OR a coverage table follows, so the suppressed
3094    // rows are never silently gone. Only in the default view: under `show_all`
3095    // nothing is suppressed, so an empty result there means there was genuinely no
3096    // per-phase spread data.
3097    let suppressed_hint: Option<String> =
3098        if findings.is_empty() && !phase_filtered.is_empty() && !show_all {
3099            let noisy = phase_filtered
3100                .iter()
3101                .filter(|f| f.kind == NoiseKind::Noisy)
3102                .count();
3103            Some(format!(
3104                "perf-delta --noise-adjust: {} per-phase metric(s) compared, none meaningfully \
3105                 changed ({noisy} under-sampled); re-run with --all-metrics to see them",
3106                phase_filtered.len(),
3107            ))
3108        } else {
3109            None
3110        };
3111    if findings.is_empty() && coverage.is_empty() {
3112        // Nothing else to render — the hint (if any) is the whole block.
3113        lines.extend(suppressed_hint);
3114        return lines;
3115    }
3116    lines.push(String::new());
3117    if had_findings {
3118        // "per-phase spread:" heads the findings table ONLY — a coverage-only
3119        // section (no spread rows) gets its own header below, so the label
3120        // never mislabels a table.
3121        lines.push("per-phase spread:".to_string());
3122        // step_index-first (BASELINE..Step[N] time order), then pairing label,
3123        // then metric — a stable, top-down-by-phase-boundary order.
3124        findings.sort_by(|a, b| {
3125            a.step_index
3126                .cmp(&b.step_index)
3127                .then_with(|| a.pairing_label.cmp(&b.pairing_label))
3128                .then_with(|| a.metric.name.cmp(b.metric.name))
3129        });
3130        let mut table = crate::cli::new_table();
3131        table.set_header(vec![
3132            "PHASE".to_string(),
3133            "TEST / METRIC".to_string(),
3134            format!("{label_a} (A: mean [min-max] spread%)"),
3135            format!("{label_b} (B: mean [min-max] spread%)"),
3136            "VERDICT".to_string(),
3137        ]);
3138        for f in findings {
3139            let (base, color) = match f.kind {
3140                NoiseKind::Regression => ("REGRESSION", Color::Red),
3141                NoiseKind::Improvement => ("improvement", Color::Green),
3142                NoiseKind::Noisy => ("NOISY (<2 runs)", Color::Yellow),
3143                NoiseKind::Informational => ("informational", Color::Blue),
3144                NoiseKind::Stable => ("stable", Color::Grey),
3145            };
3146            // Same annotation composition as the aggregate table (advisory
3147            // `noisy spread` + `declared gate`).
3148            let verdict_text = compose_noise_verdict_text(
3149                base,
3150                f.verdict.high_spread,
3151                f.kind,
3152                f.gated_by_assertion,
3153            );
3154            let v = &f.verdict;
3155            table.add_row(vec![
3156                Cell::new(format!("{}: {}", f.step_index, f.label)),
3157                Cell::new(format!("{} / {}", f.pairing_label, f.metric.name)),
3158                Cell::new(format!(
3159                    "{:.1} [{:.1}-{:.1}] {:.2}%",
3160                    v.a.mean, v.a.min, v.a.max, v.a.spread_pct
3161                )),
3162                Cell::new(format!(
3163                    "{:.1} [{:.1}-{:.1}] {:.2}%",
3164                    v.b.mean, v.b.min, v.b.max, v.b.spread_pct
3165                )),
3166                Cell::new(verdict_text).fg(color),
3167            ]);
3168        }
3169        lines.push(table.to_string());
3170    } else {
3171        // Spread rows were all suppressed but a coverage table follows: surface
3172        // the suppression hint before it so `--all-metrics` stays discoverable.
3173        lines.extend(suppressed_hint);
3174    }
3175    if !coverage.is_empty() {
3176        // Separate from the spread table above only when one was rendered; the
3177        // section-leading blank already precedes a coverage-only block.
3178        if had_findings {
3179            lines.push(String::new());
3180        }
3181        lines.push("per-phase coverage asymmetry (one-sided metrics):".to_string());
3182        coverage.sort_by(|a, b| {
3183            a.step_index
3184                .cmp(&b.step_index)
3185                .then_with(|| a.present_side.as_str().cmp(b.present_side.as_str()))
3186                .then_with(|| a.pairing_label.cmp(&b.pairing_label))
3187                .then_with(|| a.metric.map(|m| m.name).cmp(&b.metric.map(|m| m.name)))
3188        });
3189        let mut table = crate::cli::new_table();
3190        table.set_header(vec!["SIDE", "TEST", "PHASE", "METRIC", "VALUE"]);
3191        for c in coverage {
3192            // A whole one-sided phase with no readable metric renders `—` in the
3193            // METRIC + VALUE columns.
3194            let metric_cell = c.metric.map(|m| m.name).unwrap_or("—");
3195            // Bare {:.2} with NO display_unit — matching the noise aggregate
3196            // findings table,
3197            // so a unit-carrying metric renders consistently across all of them.
3198            let value_cell = match c.value {
3199                Some(v) => format!("{v:.2}"),
3200                None => "—".to_string(),
3201            };
3202            table.add_row(vec![
3203                Cell::new(c.present_side.as_str()),
3204                Cell::new(c.pairing_label.as_str()),
3205                Cell::new(format!("{}: {}", c.step_index, c.label)),
3206                Cell::new(metric_cell),
3207                Cell::new(value_cell),
3208            ]);
3209        }
3210        lines.push(table.to_string());
3211    }
3212    lines
3213}
3214
3215/// Render the scalar findings table for `perf-delta`.
3216///
3217/// Extracted from [`compare_partitions`] verbatim; the
3218/// `--phases-only` gate stays at the call site so this prints
3219/// unconditionally when invoked.
3220fn print_scalar_findings_table(report: &CompareReport, label_a: &str, label_b: &str) {
3221    use comfy_table::{Cell, Color};
3222    let mut table = crate::cli::new_table();
3223    table.set_header(vec!["TEST", "METRIC", label_a, label_b, "DELTA", "VERDICT"]);
3224    for f in &report.findings {
3225        let (verdict_text, verdict_color) = match f.kind {
3226            FindingKind::Regression => ("REGRESSION", Color::Red),
3227            FindingKind::Improvement => ("improvement", Color::Green),
3228            // Directionless metric: shown, never gated. Neutral color.
3229            FindingKind::Informational => ("informational", Color::Blue),
3230        };
3231        // PairingKey's first slot is scenario; subsequent slots
3232        // are the pairing-dim values in canonical order. Joining
3233        // with `/` produces a label whose shape mirrors the
3234        // pairing-dim count — so a comparison that pairs on
3235        // (topology, work_type) renders a `scenario/topology/work_type`
3236        // label, while a comparison that slices on most dims
3237        // renders a shorter identifier. The operator can always
3238        // cross-reference the "pairing on:" header line above to
3239        // see what each segment means.
3240        let label = f.pairing_key.0.join("/");
3241        table.add_row(vec![
3242            Cell::new(label),
3243            Cell::new(f.metric.name),
3244            Cell::new(format!("{:.2}", f.val_a)),
3245            Cell::new(format!("{:.2}", f.val_b)),
3246            Cell::new(format!("{:+.2}{}", f.delta, f.metric.display_unit)),
3247            Cell::new(verdict_text).fg(verdict_color),
3248        ]);
3249    }
3250    println!("{table}");
3251}
3252
3253/// Render the scalar summary block for `perf-delta` —
3254/// regressions / improvements / unchanged + skipped-failed +
3255/// per-group pass counts + new_in_b / removed_from_a. All lines
3256/// describe the scalar findings table; the `--phases-only` gate
3257/// stays at the call site so this prints unconditionally when
3258/// invoked.
3259fn print_summary_block(
3260    report: &CompareReport,
3261    avg_a: &Option<Vec<AveragedGroup>>,
3262    avg_b: &Option<Vec<AveragedGroup>>,
3263    label_a: &str,
3264    label_b: &str,
3265) {
3266    println!();
3267    println!(
3268        "summary: {} regressions, {} improvements, {} informational, {} unchanged",
3269        report.regressions, report.improvements, report.informational, report.unchanged,
3270    );
3271    if report.excluded_pairs > 0 {
3272        println!(
3273            "  {} pairing-key row pair(s) excluded from regression math because one \
3274             or both sides was excluded (failed, inconclusive, skipped, or an inverted expected-failure run)",
3275            report.excluded_pairs,
3276        );
3277    }
3278    if let (Some(avg_a), Some(avg_b)) = (avg_a, avg_b) {
3279        let block = format_per_group_pass_counts(avg_a, avg_b, label_a, label_b);
3280        if !block.is_empty() {
3281            print!("{block}");
3282        }
3283    }
3284    if report.new_in_b > 0 {
3285        println!(
3286            "  {} row(s) new in '{}' (no matching key in '{}')",
3287            report.new_in_b, label_b, label_a,
3288        );
3289    }
3290    if report.removed_from_a > 0 {
3291        println!(
3292            "  {} row(s) removed from '{}' (no matching key in '{}')",
3293            report.removed_from_a, label_a, label_b,
3294        );
3295    }
3296    for line in format_coverage_diff_lines(report, label_a, label_b) {
3297        println!("{line}");
3298    }
3299}
3300
3301/// Render the coverage-diff lines (metrics present on exactly one side of a
3302/// paired row) for [`print_summary_block`]. Pure (returns the lines, empty
3303/// when there are no coverage diffs) so the present/absent label mapping by
3304/// [`ComparePartition`] is unit-testable without capturing stdout.
3305pub(crate) fn format_coverage_diff_lines(
3306    report: &CompareReport,
3307    label_a: &str,
3308    label_b: &str,
3309) -> Vec<String> {
3310    if report.coverage_diffs.is_empty() {
3311        return Vec::new();
3312    }
3313    let mut lines = vec![format!(
3314        "  {} metric(s) present on only one side (coverage difference, \
3315         not a regression):",
3316        report.coverage_diffs.len(),
3317    )];
3318    for cd in &report.coverage_diffs {
3319        // present_side names the side that HAS the metric; the other is absent.
3320        let (present, absent) = match cd.present_side {
3321            ComparePartition::A => (label_a, label_b),
3322            ComparePartition::B => (label_b, label_a),
3323        };
3324        lines.push(format!(
3325            "    {} / {} = {:.2} in '{}', absent in '{}'",
3326            cd.pairing_key.0.join("/"),
3327            cd.metric.name,
3328            cd.value,
3329            present,
3330            absent,
3331        ));
3332    }
3333    lines
3334}
3335
3336/// Print the host-context delta for `perf-delta`. Same
3337/// first-Some(host) baseline `compare_partitions` uses — picking
3338/// representative hosts off the partitioned sidecars rather than
3339/// the full pool so the delta reflects what actually fed the
3340/// comparison.
3341fn print_host_context_delta(
3342    pool: &[crate::test_support::SidecarResult],
3343    rows: &[GauntletRow],
3344    filter_a: &RowFilter,
3345    filter_b: &RowFilter,
3346    label_a: &str,
3347    label_b: &str,
3348) {
3349    // Zip the pool with the pre-computed `rows` (built once above
3350    // via `pool.iter().map(sidecar_to_row).collect()`) so the
3351    // per-side filter reuses the existing row instead of calling
3352    // `sidecar_to_row` a second and third time. `pool` and `rows`
3353    // are the same length and same iteration order by construction.
3354    let sidecars_a: Vec<&crate::test_support::SidecarResult> = pool
3355        .iter()
3356        .zip(rows.iter())
3357        .filter(|(_, r)| filter_a.matches(r))
3358        .map(|(s, _)| s)
3359        .collect();
3360    let sidecars_b: Vec<&crate::test_support::SidecarResult> = pool
3361        .iter()
3362        .zip(rows.iter())
3363        .filter(|(_, r)| filter_b.matches(r))
3364        .map(|(s, _)| s)
3365        .collect();
3366    let host_a = sidecars_a.iter().find_map(|s| s.host.as_ref());
3367    let host_b = sidecars_b.iter().find_map(|s| s.host.as_ref());
3368    print!("{}", format_host_delta(host_a, host_b, label_a, label_b));
3369}
3370
// Contract of `format_host_delta` (defined later in this file). This
// is a plain comment rather than a doc comment so rustdoc does not
// attach it to `format_average_header`, the next item below.
//
// `format_host_delta` renders the host-context delta section of
// `perf-delta` as a block of text ready to `print!`. It was extracted
// as a pure function of `(Option<&HostContext>, Option<&HostContext>,
// &str, &str)` so the five match arms can be unit-tested without
// fixturing a real run directory.
//
// The returned string is either empty (when both sides have no host
// data — nothing to print) or ends with a newline so callers can
// chain further output. Single-side cases print a clear "captured in
// X only, delta unavailable" message rather than silently suppressing
// the section — a mixed-tooling-version run comparison should surface
// the asymmetry.
/// Build the one-line averaging-mode header shown above the
/// comparison table.
///
/// The result has the exact shape
/// `averaged across {pre_agg_a} runs ({a}) and {pre_agg_b} runs ({b})`
/// with no trailing newline. It depends on nothing but its four
/// arguments, so the operator-visible wording can be pinned by a unit
/// test instead of by capturing stdout.
///
/// `pre_agg_a` / `pre_agg_b` are meant to be the contributor row
/// counts — how many rows were folded into the averages on each side
/// — rather than the number of unique keys left after aggregation.
/// The function prints whatever it is given; passing the right count
/// is the caller's responsibility.
pub(crate) fn format_average_header(
    pre_agg_a: usize,
    pre_agg_b: usize,
    a: &str,
    b: &str,
) -> String {
    format!(
        "averaged across {} runs ({}) and {} runs ({})",
        pre_agg_a, a, pre_agg_b, b,
    )
}
3406
3407/// Format the per-group `passes_observed/total_observed` block
3408/// that prints below the summary line.
3409///
3410/// Pure function of (`avg_a`, `avg_b`, `a`, `b`) so the rendered
3411/// surface — one line per (scenario, topology, work_type) group
3412/// present on either side, with `N/M` per side and `-` for any
3413/// side that lacks the group — can be unit-tested without
3414/// capturing stdout. Returns the trailing-newline-terminated
3415/// block, or empty string when neither side has groups.
3416///
3417/// Line shape:
3418/// `  scenario/topology/work_type: {a}=N/M {b}=N/M`
3419///
3420/// The leading two-space indent matches the sibling
3421/// `summary:` block's continuation lines (e.g.
3422/// `"  N (scenario, topology, work_type) row pair(s) skipped..."`)
3423/// so the per-group block reads as a continuation of the same
3424/// summary section. A blank line separates this block from the
3425/// preceding `summary:` line for readability.
3426///
3427/// Groups present on only one side render `-` for the missing
3428/// side (also counted in `compare_rows`' `new_in_b` /
3429/// `removed_from_a` upstream — the per-group block surfaces the
3430/// asymmetry by name so the operator can see *which* groups went
3431/// missing without cross-referencing the summary counters).
3432pub(crate) fn format_per_group_pass_counts(
3433    avg_a: &[AveragedGroup],
3434    avg_b: &[AveragedGroup],
3435    a: &str,
3436    b: &str,
3437) -> String {
3438    type SummaryKey<'a> = (&'a str, &'a str, &'a str);
3439    type SummaryValue<'a> = (Option<&'a AveragedGroup>, Option<&'a AveragedGroup>);
3440    let mut keys: BTreeMap<SummaryKey<'_>, SummaryValue<'_>> = BTreeMap::new();
3441    for ar in avg_a {
3442        let k = (
3443            ar.row.scenario.as_str(),
3444            ar.row.topology.as_str(),
3445            ar.row.work_type.as_str(),
3446        );
3447        keys.entry(k).or_insert((None, None)).0 = Some(ar);
3448    }
3449    for br in avg_b {
3450        let k = (
3451            br.row.scenario.as_str(),
3452            br.row.topology.as_str(),
3453            br.row.work_type.as_str(),
3454        );
3455        keys.entry(k).or_insert((None, None)).1 = Some(br);
3456    }
3457    if keys.is_empty() {
3458        return String::new();
3459    }
3460    let mut out = String::new();
3461    out.push('\n');
3462    out.push_str(
3463        "per-group pass counts (passes/total + skip/inconc/fail breakdown when non-zero):\n",
3464    );
3465    for ((scn, topo, wt), (ka, kb)) in keys.into_iter() {
3466        let fmt_side = |r: Option<&AveragedGroup>| -> String {
3467            let Some(x) = r else {
3468                return "-".to_string();
3469            };
3470            // Mirror format_dimension_summary's 4-state breakdown —
3471            // operators reading per-group lines must be able to
3472            // distinguish skip / inconclusive / fail buckets, not
3473            // see them collapsed into the (total - pass) denominator
3474            // gap. Skip silently rendering buckets that are zero so
3475            // the common-case "all passed" line stays terse.
3476            let mut s = format!("{}/{}", x.passes_observed, x.total_observed);
3477            let mut extras: Vec<String> = Vec::with_capacity(3);
3478            if x.skips_observed > 0 {
3479                extras.push(format!("{} skip", x.skips_observed));
3480            }
3481            if x.inconclusives_observed > 0 {
3482                extras.push(format!("{} inc", x.inconclusives_observed));
3483            }
3484            if x.failures_observed > 0 {
3485                extras.push(format!("{} fail", x.failures_observed));
3486            }
3487            if !extras.is_empty() {
3488                s.push_str(&format!(" ({})", extras.join(", ")));
3489            }
3490            s
3491        };
3492        out.push_str(&format!(
3493            "  {scn}/{topo}/{wt}: {a}={pa} {b}={pb}\n",
3494            pa = fmt_side(ka),
3495            pb = fmt_side(kb),
3496        ));
3497    }
3498    out
3499}
3500
3501pub(crate) fn format_host_delta(
3502    host_a: Option<&crate::host_context::HostContext>,
3503    host_b: Option<&crate::host_context::HostContext>,
3504    a: &str,
3505    b: &str,
3506) -> String {
3507    match (host_a, host_b) {
3508        (Some(ha), Some(hb)) => {
3509            let delta = ha.diff(hb);
3510            if delta.is_empty() {
3511                // Identical hosts: surface arch when both sides
3512                // carry it so the operator sees WHAT is identical
3513                // (the two runs share x86_64 vs both being aarch64
3514                // is the operator's question). When
3515                // either side leaves arch as `None` (pre-host-
3516                // context-landing archive, or arch probe failed
3517                // on at least one side), fall through to the
3518                // bare "identical" message — emitting a partial
3519                // hint would mislead the reader into thinking
3520                // the silent side disagreed.
3521                match (ha.arch.as_deref(), hb.arch.as_deref()) {
3522                    (Some(arch_a), Some(arch_b)) if arch_a == arch_b => {
3523                        format!("\nhost: identical between '{a}' and '{b}' (arch: {arch_a})\n",)
3524                    }
3525                    _ => format!("\nhost: identical between '{a}' and '{b}'\n"),
3526                }
3527            } else {
3528                format!("\nhost delta ('{a}' → '{b}'):\n{delta}")
3529            }
3530        }
3531        (Some(_), None) => {
3532            format!("\nhost: captured in '{a}' only, delta unavailable\n")
3533        }
3534        (None, Some(_)) => {
3535            format!("\nhost: captured in '{b}' only, delta unavailable\n")
3536        }
3537        (None, None) => String::new(),
3538    }
3539}