ktstr/stats/compare.rs
1use super::*;
2
3/// One significant per-metric finding produced by [`compare_rows_by`].
4///
5/// `pairing_key` carries the dynamic identity the row pair joined
6/// on — `scenario` plus every NON-slicing dimension's value. The
7/// table renderer in [`compare_partitions`] decodes the key against
8/// the slicing-dim list to produce a label like
9/// `scenario/topology/work_type` (when topology + work_type are
10/// pairing dims) or just `scenario` (when every other dim slices).
11///
12/// The `scenario` / `topology` / `work_type` fields carry the
13/// matched row's values verbatim for legacy-shape consumers and
14/// test fixtures that pre-date the dimensional-slicing refactor.
15/// New code should read [`Finding::pairing_key`] directly so the
16/// slicing-dim variation stays visible.
17///
18/// `metric` is the registry entry the comparison ran against;
19/// consumers read polarity, display unit, and name through it
20/// directly without re-looking up [`metric_def`].
21#[derive(Debug, Clone, serde::Serialize)]
22pub(crate) struct Finding {
23 pub pairing_key: PairingKey,
24 pub scenario: String,
25 pub topology: String,
26 pub work_type: String,
27 pub metric: &'static MetricDef,
28 pub val_a: f64,
29 pub val_b: f64,
30 pub delta: f64,
31 pub kind: FindingKind,
32}
33
34/// How a significant (past-dual-gate) delta is classified. A metric
35/// becomes a [`Finding`] only after clearing the
36/// dual gate; this says which kind. `Informational` is for a
37/// [`Polarity::Informational`](crate::test_support::Polarity::Informational)
38/// metric (`MetricDef::classify_direction` => `None`): the change is
39/// shown but is NEVER a regression or improvement and never affects the
40/// exit code.
41#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
42pub(crate) enum FindingKind {
43 Regression,
44 Improvement,
45 Informational,
46}
47
48/// A metric present on exactly ONE side of a paired (scenario,
49/// topology, work_type) row — a coverage difference, not a perf delta.
50///
51/// `MetricDef::read` returns `None` for a metric absent on a row and
52/// `Some(v)` (including `Some(0.0)`) when present, so an absent metric
53/// is distinguishable from a genuine zero. A metric present on one side
54/// and absent on the other is NOT a regression/improvement: it never
55/// had a comparable baseline. Recording it here (never gated, never
56/// counted in `regressions`/`improvements`/`informational`) surfaces
57/// the appear/disappear-between-runs case instead of either silently
58/// dropping it or — as the pre-fix `read().unwrap_or(0.0)` did —
59/// mis-flagging it as a directional verdict against a coerced-zero
60/// side: an unbounded relative change when the absent side is the
61/// baseline (rel-gate INFINITY), a bounded one otherwise (e.g. 5 -> 0
62/// gives rel ~1.0) — either clears the gate and yields a phantom
63/// regression or improvement (the direction follows the metric's
64/// polarity, so it inverts between LowerBetter and HigherBetter).
65#[derive(Debug, Clone, serde::Serialize)]
66pub(crate) struct CoverageDiff {
67 pub pairing_key: PairingKey,
68 pub scenario: String,
69 pub topology: String,
70 pub work_type: String,
71 pub metric: &'static MetricDef,
72 /// The side that HAS the metric; the other side is absent.
73 pub present_side: ComparePartition,
74 /// The present side's value (the absent side has none).
75 pub value: f64,
76}
77
78/// Aggregate result of comparing two row sets via [`compare_rows_by`].
79///
80/// `regressions` and `improvements` count significant entries in
81/// `findings`; `unchanged` counts metrics that fell below the dual
82/// gate; `excluded_pairs` counts paired (scenario, topology, work_type)
83/// row pairs where either side is excluded from regression math —
84/// `fail`, `inconclusive`, `skip`, or an inverted `expected_failure`
85/// run (which passes but carries failure-mode-dominated telemetry) all
86/// route here. The field name captures "excluded from regression math"
87/// rather than encoding any of the four excluded states, because the
88/// per-side disposition (which side, which state) is recoverable from
89/// the individual `GauntletRow::is_*` / `expected_failure` accessors
90/// when the operator drills in.
91/// `new_in_b`
92/// counts B-side rows whose key has no match on the A side; the
93/// converse is `removed_from_a`. The filter (when set) applies to
94/// every counter, so excluded rows do not contribute.
95#[derive(Debug, Clone, Default, serde::Serialize)]
96pub(crate) struct CompareReport {
97 pub regressions: u32,
98 pub improvements: u32,
99 /// Significant changes in `Polarity::Informational` metrics — shown
100 /// but never gated (excluded from `regressions`/`improvements` and
101 /// the exit code).
102 pub informational: u32,
103 pub unchanged: u32,
104 pub excluded_pairs: u32,
105 pub new_in_b: u32,
106 pub removed_from_a: u32,
107 pub findings: Vec<Finding>,
108 /// Metrics present on exactly one side of a paired row (a metric
109 /// appeared or disappeared between runs A and B). Never gated — not
110 /// counted in `regressions`/`improvements`/`informational` and no
111 /// effect on the exit code; surfaced so a coverage change is
112 /// visible rather than silently dropped or mis-flagged as a
113 /// regression from a zero baseline. See [`CoverageDiff`].
114 pub coverage_diffs: Vec<CoverageDiff>,
115}
116
117/// Which side of an A/B comparison a row belongs to. Typed surface
118/// for the per-phase rows so new code does not propagate the
119/// `"A"` / `"B"` string-literal pattern the scalar-finding path
120/// uses (kept as-is at the existing `"A"` / `"B"` call sites in this
121/// module — `render_side_label`, `zero_match_diagnostic`).
122#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize)]
123pub(crate) enum ComparePartition {
124 A,
125 B,
126}
127
128impl ComparePartition {
129 /// Render the side as the same one-letter label
130 /// `render_side_label` produces for the scalar table headers,
131 /// so the noise per-phase coverage rows and the scalar findings
132 /// table share the same operator-facing side identifier.
133 pub fn as_str(self) -> &'static str {
134 match self {
135 Self::A => "A",
136 Self::B => "B",
137 }
138 }
139}
140
141/// Per-metric threshold policy driving `compare_rows` /
142/// `compare_partitions`.
143///
144/// Resolution priority for a given metric's relative significance
145/// threshold, highest first:
146///
147/// 1. `per_metric_percent[metric_name]` — explicit override for
148/// this metric.
149/// 2. `default_percent` — uniform override across every metric
150/// not listed in the map (equivalent to the old `--threshold N`
151/// CLI flag).
152/// 3. The metric's built-in `default_rel` from the `METRICS`
153/// registry — the "no policy" fallback.
154///
155/// Values in the struct are stored as PERCENT (e.g. `10.0` meaning
156/// 10%), NOT fractions. [`Self::rel_threshold`] does the `/100.0`
157/// conversion so every caller inside `compare_rows` reads a
158/// fraction without re-deriving the division.
159///
160/// Note on the registry-fallback branch: the `default_rel` field
161/// on `MetricDef` is already a FRACTION (e.g. `0.25` for 25%),
162/// not a percent. `rel_threshold` returns it verbatim — it
163/// does NOT divide by 100. Only the override branches
164/// (per-metric map, `default_percent`) do the percent-to-fraction
165/// conversion because their inputs are percents. This asymmetry
166/// is deliberate so callers supplying CLI/file-based overrides
167/// work in human-intuitive percent units while the registry
168/// defaults (which already ship in fraction form) pass through
169/// unchanged.
170///
171/// The struct is `serde::Serialize` / `serde::Deserialize` so
172/// `cargo ktstr perf-delta --policy <path>` can load a
173/// JSON-persisted policy file. Default construction produces an
174/// empty policy that uses every registry default; [`Self::uniform`]
175/// reproduces the old `--threshold N` behaviour without any
176/// per-metric override plumbing at the call site.
177#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
178#[serde(default, deny_unknown_fields)]
179pub struct ComparisonPolicy {
180 /// Uniform override: when `Some(p)`, every metric whose name is
181 /// NOT in [`Self::per_metric_percent`] uses `p / 100.0` as its
182 /// relative threshold. `None` falls through to the registry
183 /// `default_rel`. Stored as percent (e.g. `10.0` for 10%).
184 pub default_percent: Option<f64>,
185 /// Per-metric overrides keyed by metric name. Each value is a
186 /// percent (e.g. `15.0` → 15%). An entry here takes precedence
187 /// over both [`Self::default_percent`] and the registry
188 /// `default_rel`.
189 pub per_metric_percent: BTreeMap<String, f64>,
190}
191
/// Render options for the per-phase spread block of
/// `cargo ktstr perf-delta --noise-adjust`.
///
/// The five CLI flags are bundled into one struct so that
/// `compare_partitions_noise` takes a single positional argument instead of
/// five. The default value renders every phase, every metric and every
/// paired row, which equals passing no phase flag at all. All five flags
/// require `--noise-adjust`, since per-phase output exists only there.
///
/// The flags are AND-composed across independent axes (block-level
/// suppression × phase id × row significance). Three mutual exclusions are
/// enforced when the CLI is parsed:
///
/// - `--no-phases` excludes every other phase flag: with the whole block
///   suppressed, refining what to render is a contradiction.
/// - `--phases-only` excludes `--no-phases`, for the same reason.
/// - `--steps-only` excludes `--phase`: one collapses to a single bucket and
///   the other suppresses BASELINE, so combining them is confused phrasing.
///
/// The flags affect RENDERING only. The `--noise-adjust` per-phase pass
/// always computes the complete set of `NoisePhaseFinding`s and coverage
/// entries, so programmatic consumers see the unfiltered surface; filtering
/// is a projection applied at render time.
#[derive(Debug, Default, Clone)]
pub struct PhaseDisplayOptions {
    /// `--no-phases`: hide the `--noise-adjust` per-phase spread block
    /// altogether, together with its footer hint. The aggregate spread table
    /// and footer are rendered unchanged. Mutually exclusive with every
    /// other phase flag at CLI parse time.
    pub no_phases: bool,
    /// `--phases-only`: hide the aggregate spread table and the host-context
    /// delta, leaving ONLY the per-phase spread block. Handy for focusing on
    /// a phase regression when the aggregate rollup is noise. Composes with
    /// `--steps-only`, `--phase` and `--phase-threshold`.
    pub phases_only: bool,
    /// `--steps-only`: inside the per-phase block, drop the BASELINE bucket
    /// (`step_index == 0`) and keep only scenario Step buckets. Handy when
    /// scheduler startup transients dominate the BASELINE settle window.
    /// Mutually exclusive with `--phase`.
    pub steps_only: bool,
    /// `--phase <N>`: inside the per-phase block, keep only rows with
    /// `step_index == N`. `0` picks BASELINE; `1..=N` pick scenario Step
    /// ordinals (1 → `Step[0]`, 2 → `Step[1]`, ...). An integer was chosen
    /// over a label so that renaming the label (`"Step[0]"` → `"Step:0"`)
    /// cannot break operator CI invocations. Mutually exclusive with
    /// `--steps-only`.
    pub phase: Option<u16>,
    /// `--phase-threshold <PCT>`: render-side relative-spread gate for the
    /// `--noise-adjust` per-phase pass. Paired rows with
    /// `|delta-mean| / |a.mean| < PCT / 100.0` are hidden. A move away from
    /// a ~zero baseline (`|a.mean| < ZERO_MEAN_EPS`) counts as an unbounded
    /// relative change and clears any finite threshold. `0.0` shows every
    /// paired row. When unset, [`Self::rel_threshold`] defers to the
    /// [`ComparisonPolicy`] resolution and
    /// [`Self::passes_noise_spread_threshold`] lets every row through.
    /// Independent of `--threshold`: the aggregate pass and the per-phase
    /// pass filter separately, so the per-phase view can be widened without
    /// widening the aggregate one.
    pub phase_threshold: Option<f64>,
}
257
258impl PhaseDisplayOptions {
259 /// Resolve the per-phase relative threshold for a given
260 /// metric. Returns the override fraction when
261 /// `phase_threshold` is set, else falls through to the
262 /// `ComparisonPolicy` resolution the scalar pass uses. The
263 /// `metric_name` + `default_rel` shape mirrors
264 /// [`ComparisonPolicy::rel_threshold`] so the two surfaces
265 /// stay symmetric.
266 pub fn rel_threshold(
267 &self,
268 policy: &ComparisonPolicy,
269 metric_name: &str,
270 default_rel: f64,
271 ) -> f64 {
272 match self.phase_threshold {
273 Some(pct) => pct / 100.0,
274 None => policy.rel_threshold(metric_name, default_rel),
275 }
276 }
277
278 /// True when a phase row at the given `step_index` should
279 /// render under the current display flags. Combines the two
280 /// step-axis predicates (`--phase <N>` filter and
281 /// `--steps-only` BASELINE-suppressor) into a single
282 /// row-level decision the renderer applies uniformly across
283 /// the `--noise-adjust` per-phase findings and coverage rows.
284 /// Returns `true` when no relevant flag is set (default
285 /// path: every step renders).
286 pub fn matches_phase(&self, step_index: u16) -> bool {
287 if let Some(want) = self.phase
288 && step_index != want
289 {
290 return false;
291 }
292 if self.steps_only && step_index == 0 {
293 return false;
294 }
295 true
296 }
297
298 /// The `--phase-threshold` relative-spread gate for a
299 /// [`NoisePhaseFinding`]'s verdict: `|b.mean - a.mean| / |a.mean| >=
300 /// phase_threshold / 100`. A move from a ~zero baseline (`|a.mean| <
301 /// ZERO_MEAN_EPS`) is unbounded → shown; both ~zero carries no signal →
302 /// filtered by any positive threshold. Returns `true` when no flag is set.
303 /// The noise row carries per-side means (a [`NoiseVerdict`]), so the gate
304 /// works on the mean delta rather than a single-run row delta.
305 pub(crate) fn passes_noise_spread_threshold(&self, verdict: &NoiseVerdict) -> bool {
306 let Some(pct) = self.phase_threshold else {
307 return true;
308 };
309 let a = verdict.a.mean.abs();
310 let delta = (verdict.b.mean - verdict.a.mean).abs();
311 let rel = if a > ZERO_MEAN_EPS {
312 delta / a
313 } else if delta > ZERO_MEAN_EPS {
314 f64::INFINITY
315 } else {
316 0.0
317 };
318 rel >= pct / 100.0
319 }
320}
321
322impl ComparisonPolicy {
323 /// Empty policy — every metric uses its `METRICS` registry
324 /// default. Equivalent to the old `--threshold None` CLI path.
325 pub fn new() -> Self {
326 Self::default()
327 }
328
329 /// Uniform override: every metric uses `percent / 100.0`.
330 /// Mirrors the old `--threshold N` CLI behaviour; the CLI
331 /// dispatch at `cargo ktstr perf-delta --threshold N`
332 /// constructs a policy via this constructor.
333 pub fn uniform(percent: f64) -> Self {
334 Self {
335 default_percent: Some(percent),
336 per_metric_percent: BTreeMap::new(),
337 }
338 }
339
340 /// Load a JSON-persisted policy from a file. Errors propagate
341 /// the read / parse reason as an `anyhow::Error` with the file
342 /// path in the context chain so a malformed `--policy path.json`
343 /// surfaces an actionable message rather than a generic
344 /// "invalid JSON."
345 ///
346 /// Validates after parsing via [`Self::validate`]: rejects
347 /// negative thresholds (a misconfigured 10 vs -10 would
348 /// invert the dual-gate logic at the `.abs() >= rel_thresh`
349 /// check and silently classify every metric as significant)
350 /// and rejects per-metric keys not registered in `METRICS`
351 /// (a typo like `"wrost_spread"` would otherwise be silently
352 /// ignored — the key simply never matches during resolution
353 /// and the metric falls through to `default_percent`).
354 pub fn load_json(path: &std::path::Path) -> anyhow::Result<Self> {
355 use anyhow::Context;
356 let data = std::fs::read_to_string(path)
357 .with_context(|| format!("read comparison policy from {}", path.display()))?;
358 let policy: ComparisonPolicy = serde_json::from_str(&data)
359 .with_context(|| format!("parse comparison policy from {}", path.display()))?;
360 policy
361 .validate()
362 .with_context(|| format!("validate comparison policy from {}", path.display()))?;
363 Ok(policy)
364 }
365
366 /// Structural validation separate from parsing so both the
367 /// `load_json` path and programmatic constructors (after
368 /// [`Self::uniform`] with a user-supplied percent) can share
369 /// one set of invariants without re-implementing checks at
370 /// each call site. Called automatically by [`Self::load_json`];
371 /// CLI dispatch should call it after constructing via
372 /// [`Self::uniform`] to catch `--threshold -10` at the
373 /// entry point rather than deep inside `compare_rows` where
374 /// the dual-gate math silently misbehaves.
375 ///
376 /// Rejects:
377 /// - Negative `default_percent` (nonsensical — thresholds are
378 /// absolute-value comparisons).
379 /// - Negative entries in `per_metric_percent`.
380 /// - Per-metric keys not in the `METRICS` registry (silent
381 /// typos would otherwise fall through to `default_percent`
382 /// unnoticed).
383 pub fn validate(&self) -> anyhow::Result<()> {
384 if let Some(p) = self.default_percent
385 && p < 0.0
386 {
387 anyhow::bail!(
388 "ComparisonPolicy: default_percent must be non-negative; got {p}. \
389 Thresholds are absolute-value comparisons — a negative value \
390 would invert the dual-gate logic and silently classify every \
391 delta as significant."
392 );
393 }
394 for (name, p) in &self.per_metric_percent {
395 if !METRICS.iter().any(|m| m.name == name) {
396 let known: Vec<&str> = METRICS.iter().map(|m| m.name).collect();
397 anyhow::bail!(
398 "ComparisonPolicy: per_metric_percent contains unknown \
399 metric `{name}`. A typo in the key would silently fall \
400 through to default_percent. Registered metrics: {}",
401 known.join(", "),
402 );
403 }
404 if *p < 0.0 {
405 anyhow::bail!(
406 "ComparisonPolicy: per_metric_percent[{name:?}] must be \
407 non-negative; got {p}",
408 );
409 }
410 }
411 Ok(())
412 }
413
414 /// Resolve the mutually-exclusive `--threshold` / `--policy` CLI
415 /// pair into a policy: `--threshold N` is sugar for a uniform N%
416 /// default (validated for sign); `--policy PATH` loads a
417 /// per-metric JSON policy; neither falls through to the registry
418 /// defaults. Shared by every subcommand that accepts the pair
419 /// (`perf-delta`) so the resolution rules — and
420 /// the "exactly one of the two" contract — live in one place.
421 ///
422 /// Both flags set is rejected with an error. At the CLI call
423 /// sites clap `conflicts_with` makes that unreachable, but this is
424 /// a library entry point and must not panic on its inputs; the
425 /// error is the defence-in-depth backstop.
426 pub fn from_cli_flags(
427 threshold: Option<f64>,
428 policy: Option<&std::path::Path>,
429 ) -> anyhow::Result<Self> {
430 match (threshold, policy) {
431 (Some(t), None) => {
432 let p = Self::uniform(t);
433 p.validate()?;
434 Ok(p)
435 }
436 (None, Some(path)) => Self::load_json(path),
437 (None, None) => Ok(Self::default()),
438 (Some(_), Some(_)) => anyhow::bail!(
439 "--threshold and --policy are mutually exclusive; use --policy \
440 for per-metric overrides"
441 ),
442 }
443 }
444
445 /// Resolve the relative threshold (as a fraction, e.g. `0.10`
446 /// for 10%) for `metric_name` with `default_rel` as the
447 /// registry-level fallback. Handles the percent→fraction
448 /// conversion so `compare_rows_by` does not need to re-derive
449 /// `p / 100.0` at every call site.
450 pub fn rel_threshold(&self, metric_name: &str, default_rel: f64) -> f64 {
451 if let Some(p) = self.per_metric_percent.get(metric_name) {
452 p / 100.0
453 } else if let Some(p) = self.default_percent {
454 p / 100.0
455 } else {
456 default_rel
457 }
458 }
459}
460
461/// Compare two row sets metric-by-metric, parametrised on
462/// `pairing_dims`.
463///
464/// Pure function: no I/O, no globals. Two rows pair iff their
465/// [`PairingKey`] (scenario + every value for each dimension in
466/// `pairing_dims`) is equal — this is the dimensional-slicing
467/// pipeline's join primitive, with slicing dims EXCLUDED from
468/// `pairing_dims` so rows on the A/B sides that differ on those
469/// dims still pair as long as they agree on every non-slicing
470/// dim. When `filter` is `Some(s)`, a row is included only if
471/// `s` appears as a substring of the joined `"scenario topology
472/// scheduler work_type"` string. The scheduler is
473/// searchable via the substring filter but is not part of the
474/// pairing key by default (only when `Dimension::Scheduler` is
475/// in `pairing_dims`), so the same scenario+topology+work_type
476/// pair compares correctly across different scheduler binaries
477/// when the filter does not constrain it.
478///
479/// Row-pair accounting:
480/// - B-side rows with no A-side match are counted in `new_in_b`.
481/// - A-side rows with no B-side match are counted in `removed_from_a`
482/// (a separate pass over `rows_a`).
483/// - Paired rows where either side has `passed=false` are dropped
484/// from the regression math and counted in `excluded_pairs`: a
485/// failed scenario's metrics reflect the failure mode (short run,
486/// stalled workload, missing samples), not the scheduler's
487/// behavior.
488///
489/// The filter (when set) applies to every counter -- excluded rows
490/// never reach the matching, pass, or metric stages.
491///
492/// `policy` carries the comparison thresholds. See
493/// [`ComparisonPolicy`] for the resolution rules — per-metric
494/// override → `default_percent` → registry `default_rel`. The
495/// absolute gate always uses the metric's `default_abs`. A delta
496/// must clear both gates to count as significant.
497pub(crate) fn compare_rows_by(
498 rows_a: &[GauntletRow],
499 rows_b: &[GauntletRow],
500 pairing_dims: &[Dimension],
501 filter: Option<&str>,
502 policy: &ComparisonPolicy,
503) -> CompareReport {
504 let mut report = CompareReport::default();
505
506 // Build a HashMap<PairingKey, &GauntletRow> from rows_a once so
507 // each row_b lookup is O(1) instead of O(rows_a). `or_insert_with`
508 // preserves first-match semantics from the prior `rows_a.iter().find()`
509 // call: on the rare path where two A-side rows share a key (the
510 // averaging path folds same-key rows into one mean, so a
511 // shared key is not normally reachable), the
512 // earlier-iterated row wins.
513 let mut a_by_key: HashMap<PairingKey, &GauntletRow> = HashMap::with_capacity(rows_a.len());
514 for row_a in rows_a {
515 let key = PairingKey::from_row(row_a, pairing_dims);
516 a_by_key.entry(key).or_insert(row_a);
517 }
518
519 // Hoist the per-metric relative threshold out of the row×metric
520 // loop. `policy.rel_threshold(m.name, m.default_rel)` is a pure
521 // function of the metric — recomputing it for every row pair was
522 // O(rows_b × METRICS) BTreeMap probes for nothing.
523 let rel_thresholds: Vec<f64> = METRICS
524 .iter()
525 .map(|m| policy.rel_threshold(m.name, m.default_rel))
526 .collect();
527 // Same hoist for the render-suppression predicate: it is a pure
528 // function of the metric (a small fixed-slice membership scan), so
529 // probing it per (row_b x metric) re-ran the scan for nothing.
530 let suppressed: Vec<bool> = METRICS
531 .iter()
532 .map(|m| is_render_suppressed_component(m.name))
533 .collect();
534
535 for row_b in rows_b {
536 // Dynamic pairing key: scenario + every NON-slicing
537 // dimension's value. Two rows pair iff their dynamic keys
538 // match.
539 let key_b = PairingKey::from_row(row_b, pairing_dims);
540 if let Some(f) = filter {
541 // Substring filter joins all identity-bearing fields —
542 // including the SLICING dim values — so an operator
543 // can narrow by any visible field via `-E`.
544 let joined = format!(
545 "{} {} {} {}",
546 row_b.scenario, row_b.topology, row_b.scheduler, row_b.work_type,
547 );
548 if !joined.contains(f) {
549 continue;
550 }
551 }
552 let Some(&row_a) = a_by_key.get(&key_b) else {
553 report.new_in_b += 1;
554 continue;
555 };
556
557 // Drop from regression math when either side is a skip,
558 // inconclusive, failure, or an inverted expected_failure run.
559 // Skips carry no executed metrics
560 // (the run didn't happen); inconclusive runs ran but lacked
561 // signal to evaluate (zero-denominator ratio gate); failures
562 // carry telemetry dominated by the failure mode (short run,
563 // stalled workload), not the scheduler's behavior. An
564 // expected_failure run has `passed == true` but its telemetry
565 // is likewise failure-mode-dominated (short / stalled run), so
566 // it is excluded despite passing —
567 // comparing any of these against a real run produces
568 // meaningless deltas.
569 if row_a.is_fail()
570 || row_b.is_fail()
571 || row_a.is_inconclusive()
572 || row_b.is_inconclusive()
573 || row_a.is_skip()
574 || row_b.is_skip()
575 || row_a.expected_failure
576 || row_b.expected_failure
577 {
578 report.excluded_pairs += 1;
579 continue;
580 }
581
582 push_scalar_findings(
583 &mut report,
584 row_a,
585 row_b,
586 &key_b,
587 &rel_thresholds,
588 &suppressed,
589 );
590 }
591
592 // Second pass: A-side rows whose key has no match on the B side.
593 // Filter applies here too, so rows excluded by the filter never
594 // count as removed. Build a HashSet<PairingKey> from rows_b once
595 // so the existence check is O(1) per row_a; rows_b are inserted
596 // unfiltered to preserve prior behaviour where a row_b that fails
597 // the substring filter still suppresses a same-key row_a's
598 // removed_from_a increment (the substring filter compares against
599 // identity-bearing fields including slicing dims, so two rows
600 // sharing a pairing key can disagree on filter membership).
601 let b_keys: HashSet<PairingKey> = rows_b
602 .iter()
603 .map(|r| PairingKey::from_row(r, pairing_dims))
604 .collect();
605 for row_a in rows_a {
606 let key_a = PairingKey::from_row(row_a, pairing_dims);
607 if let Some(f) = filter {
608 let joined = format!(
609 "{} {} {} {}",
610 row_a.scenario, row_a.topology, row_a.scheduler, row_a.work_type,
611 );
612 if !joined.contains(f) {
613 continue;
614 }
615 }
616 if !b_keys.contains(&key_a) {
617 report.removed_from_a += 1;
618 }
619 }
620
621 report
622}
623
624/// Append the scalar per-metric findings for one matched `(row_a,
625/// row_b)` pair to `report`. Indexed by the `METRICS` enumerate
626/// position: `rel_thresholds[i]` is the hoisted relative threshold
627/// and `suppressed[i]` the hoisted render-suppression flag for the
628/// i-th metric (both built once by [`compare_rows_by`] over the same
629/// `METRICS` order). Bumps `report.unchanged` for sub-dual-gate
630/// deltas and `report.regressions` / `report.improvements` per
631/// metric polarity for the rest, pushing a [`Finding`] for each
632/// significant delta.
633fn push_scalar_findings(
634 report: &mut CompareReport,
635 row_a: &GauntletRow,
636 row_b: &GauntletRow,
637 key_b: &PairingKey,
638 rel_thresholds: &[f64],
639 suppressed: &[bool],
640) {
641 for (i, m) in METRICS.iter().enumerate() {
642 // Rate components are internal plumbing — suppressed from compare
643 // output (they remain in storage for the cross-run re-pool).
644 if suppressed[i] {
645 continue;
646 }
647 // `read` returns `None` for a metric absent on a row and `Some(v)`
648 // (including `Some(0.0)`) when present, so absent is distinguishable
649 // from a genuine zero. A metric present on exactly one side is a
650 // coverage difference, NOT a delta: record it (never gated) and skip
651 // the directional verdict. The pre-fix `unwrap_or(0.0)` coerced an
652 // absent side to 0.0, producing a phantom directional verdict: when
653 // the absent side was the baseline (val_a==0) the rel-gate's INFINITY
654 // branch fired (unbounded), and otherwise rel_delta was bounded (e.g.
655 // 5 -> 0 gives |(-5)/5|=1.0); either cleared both gates and yielded a
656 // phantom regression or improvement (direction per the metric's
657 // polarity) for a metric simply not captured on one side.
658 let (val_a, val_b) = match (m.read(row_a), m.read(row_b)) {
659 (Some(a), Some(b)) => (a, b),
660 (None, None) => continue,
661 (Some(a), None) => {
662 report.coverage_diffs.push(CoverageDiff {
663 pairing_key: key_b.clone(),
664 scenario: row_b.scenario.clone(),
665 topology: row_b.topology.clone(),
666 work_type: row_b.work_type.clone(),
667 metric: m,
668 present_side: ComparePartition::A,
669 value: a,
670 });
671 continue;
672 }
673 (None, Some(b)) => {
674 report.coverage_diffs.push(CoverageDiff {
675 pairing_key: key_b.clone(),
676 scenario: row_b.scenario.clone(),
677 topology: row_b.topology.clone(),
678 work_type: row_b.work_type.clone(),
679 metric: m,
680 present_side: ComparePartition::B,
681 value: b,
682 });
683 continue;
684 }
685 };
686 // Both sides negligible (under ZERO_MEAN_EPS, the domain zero epsilon —
687 // not f64::EPSILON, the machine ulp near 1.0): no signal, skip without
688 // counting.
689 if val_a.abs() < ZERO_MEAN_EPS && val_b.abs() < ZERO_MEAN_EPS {
690 continue;
691 }
692
693 let rel_thresh = rel_thresholds[i];
694
695 let delta = val_b - val_a;
696 let rel_delta = if val_a.abs() > ZERO_MEAN_EPS {
697 (delta / val_a).abs()
698 } else {
699 // A non-negligible value (val_b — the both-zero case is skipped
700 // above) appearing from a ~zero baseline is an unbounded relative
701 // change, not "unchanged". INFINITY clears the rel gate so the
702 // absolute gate alone decides whether this delta is significant.
703 f64::INFINITY
704 };
705
706 if delta.abs() < m.default_abs || rel_delta < rel_thresh {
707 report.unchanged += 1;
708 continue;
709 }
710
711 // Verdict: dual-gate already passed above (significant). An
712 // Informational metric (classify_direction => None) is recorded
713 // and displayed but NEVER counted as regression/improvement and
714 // NEVER affects the exit code; a directional metric splits on its
715 // polarity.
716 let kind = match m.classify_direction() {
717 None => {
718 report.informational += 1;
719 FindingKind::Informational
720 }
721 Some(higher_is_worse) => {
722 let is_regression = if higher_is_worse {
723 delta > 0.0
724 } else {
725 delta < 0.0
726 };
727 if is_regression {
728 report.regressions += 1;
729 FindingKind::Regression
730 } else {
731 report.improvements += 1;
732 FindingKind::Improvement
733 }
734 }
735 };
736 report.findings.push(Finding {
737 pairing_key: key_b.clone(),
738 scenario: row_b.scenario.clone(),
739 topology: row_b.topology.clone(),
740 work_type: row_b.work_type.clone(),
741 metric: m,
742 val_a,
743 val_b,
744 delta,
745 kind,
746 });
747 }
748}
749
750/// Emit a stderr warning naming any `-dirty` commit values present
751/// in the partitioned rows so the operator knows the comparison
752/// includes builds whose source tree may not match the recorded
753/// HEAD.
754///
755/// Scans `commit` (project HEAD) and `kernel_commit` (kernel source
756/// tree HEAD) on both sides' rows, dedupes the surviving values,
757/// and emits one warning block listing each distinct dirty value
758/// per dimension. Emits at most one block — silent when no row
759/// carries a `-dirty` suffix on either dimension.
760///
761/// Dirty runs reuse the same sidecar filename as their clean HEAD
762/// (the variant hash excludes `commit` / `kernel_commit` per
763/// `crate::test_support::sidecar`), so re-running the same test
764/// from a dirty tree overwrites the previous record. The warning
765/// surfaces this so an operator can decide whether to commit the
766/// working tree before re-running for a reproducible comparison.
767///
768/// Splits collection from emission via [`render_dirty_warning`] so
769/// unit tests can pin the rendered text without trapping `stderr`.
770fn warn_on_dirty_builds(rows_a: &[GauntletRow], rows_b: &[GauntletRow]) {
771 if let Some(text) = render_dirty_warning(rows_a, rows_b) {
772 eprint!("{text}");
773 }
774}
775
776/// Emit the CPU-budget hazard warning for a comparison, if any.
777/// Pure-render half is [`render_overcommit_warning`]; this only
778/// `eprint!`s it, mirroring [`warn_on_dirty_builds`].
779fn warn_on_overcommit(rows_a: &[GauntletRow], rows_b: &[GauntletRow], pairing_dims: &[Dimension]) {
780 if let Some(text) = render_overcommit_warning(rows_a, rows_b, pairing_dims) {
781 eprint!("{text}");
782 }
783}
784
785/// Build the CPU-budget hazard warning from the filtered compare
786/// sides, or `None` when neither hazard is present.
787///
788/// Two independent hazards, both read from [`GauntletRow::cpu_budget`]
789/// / [`GauntletRow::vcpus`] — the consumers that make those fields
790/// load-bearing on the compare path:
791///
792/// - OVERCOMMIT (`cpu_budget < vcpus`): the host time-sliced that
793/// run's vCPU threads, so its wake-latency / off-CPU / run-delay
794/// timing metrics are host-contention artifacts, not scheduler
795/// signal (see [`crate::vmm::host_topology::overcommit_warning`]).
796/// Always flagged when present on either side: comparing raw timing
797/// from an overcommitted run is the silent-wrong-answer the budget
798/// stamp exists to surface.
799/// - MIXED BUDGET: a single pairing group on a side holds more than
800/// one distinct non-skip budget. [`group_and_average_by`] folds rows
801/// that share a full [`PairingKey`], so this is exactly the set the
802/// averaging fold combines across budgets. It only arises
803/// when [`Dimension::CpuBudget`] is NOT a pairing dim (the operator
804/// sliced on cpu-budget, dropping it from the key); when it IS a
805/// pairing dim, each budget keys its own group and is never folded.
806/// Detection is per pairing group, NOT side-wide: two rows of
807/// different scenarios (or any differing pairing dim) carry different
808/// keys and never average, so a side merely spanning budgets across
809/// distinct groups is not flagged.
810///
811/// Skip rows (budget 0 -> `None` in [`sidecar_to_row`]) carry no
812/// budget identity and are ignored by both checks. Split from
813/// emission so a unit test pins the text and the `None`-when-clean
814/// polarity without trapping stderr, mirroring [`render_dirty_warning`].
815pub(crate) fn render_overcommit_warning(
816 rows_a: &[GauntletRow],
817 rows_b: &[GauntletRow],
818 pairing_dims: &[Dimension],
819) -> Option<String> {
820 use std::collections::BTreeSet;
821 use std::fmt::Write;
822
823 // Side-wide: the distinct overcommitted (budget, vcpus) pairs.
824 let overcommitted = |rows: &[GauntletRow]| -> BTreeSet<(u32, u32)> {
825 let mut over = BTreeSet::new();
826 for r in rows {
827 if let (Some(b), Some(v)) = (r.cpu_budget, r.vcpus)
828 && b < v
829 {
830 over.insert((b, v));
831 }
832 }
833 over
834 };
835
836 // Per pairing group: the union of budgets across groups that hold
837 // >1 distinct budget — exactly the budgets the averaging fold
838 // combines into one mean. Empty when CpuBudget is a pairing dim (each budget keys
839 // its own group, so no group ever holds two).
840 let cpu_budget_is_pairing = pairing_dims.contains(&Dimension::CpuBudget);
841 let mixed_folded = |rows: &[GauntletRow]| -> BTreeSet<u32> {
842 let mut folded = BTreeSet::new();
843 if cpu_budget_is_pairing {
844 return folded;
845 }
846 let mut by_key: std::collections::HashMap<PairingKey, BTreeSet<u32>> =
847 std::collections::HashMap::new();
848 for r in rows {
849 if let Some(b) = r.cpu_budget {
850 by_key
851 .entry(PairingKey::from_row(r, pairing_dims))
852 .or_default()
853 .insert(b);
854 }
855 }
856 for budgets in by_key.values() {
857 if budgets.len() > 1 {
858 folded.extend(budgets.iter().copied());
859 }
860 }
861 folded
862 };
863
864 let over_a = overcommitted(rows_a);
865 let over_b = overcommitted(rows_b);
866 let mixed_a = mixed_folded(rows_a);
867 let mixed_b = mixed_folded(rows_b);
868
869 if over_a.is_empty() && over_b.is_empty() && mixed_a.is_empty() && mixed_b.is_empty() {
870 return None;
871 }
872
873 let any_overcommit = !over_a.is_empty() || !over_b.is_empty();
874 let mut out = String::new();
875 if any_overcommit {
876 // Host time-slicing actually occurred -> raw timing is confounded.
877 let _ = writeln!(
878 out,
879 "ktstr: WARNING: CPU-budget hazard in this comparison — a run was \
880 host-overcommitted, so its guest-scheduler timing metrics \
881 (wake-latency / off-CPU / run-delay) are host-contention-confounded. \
882 Compare the overcommit-invariant worst_iterations_per_cpu_sec metric \
883 instead of raw \
884 timing."
885 );
886 } else {
887 // Mixed budgets with NO overcommit: no host contention, the hazard is
888 // collapsing two different measurement conditions into one number.
889 let _ = writeln!(
890 out,
891 "ktstr: WARNING: CPU-budget hazard in this comparison — runs of \
892 different CPU budgets share a pairing group, mixing two measurement \
893 conditions. Slice with --cpu-budget, or compare the budget-invariant \
894 worst_iterations_per_cpu_sec metric."
895 );
896 }
897 let mut emit_side = |label: &str, over: &BTreeSet<(u32, u32)>, mixed: &BTreeSet<u32>| {
898 if !over.is_empty() {
899 let list = over
900 .iter()
901 .map(|(b, v)| format!("{b}/{v}"))
902 .collect::<Vec<_>>()
903 .join(", ");
904 let _ = writeln!(
905 out,
906 " side {label}: host-overcommitted run(s) [budget/vcpus]: {list}"
907 );
908 }
909 if !mixed.is_empty() {
910 let list = mixed
911 .iter()
912 .map(|b| b.to_string())
913 .collect::<Vec<_>>()
914 .join(", ");
915 let _ = writeln!(
916 out,
917 " side {label}: CPU budgets [{list}] share a pairing group — \
918 the average fold collapses them into one mean; slice with --cpu-budget so cross-budget runs are \
919 not compared under one key"
920 );
921 }
922 };
923 emit_side("A", &over_a, &mixed_a);
924 emit_side("B", &over_b, &mixed_b);
925 Some(out)
926}
927
928/// Build the dirty-builds warning block from row data.
929///
930/// Returns `None` when no row on either side carries a `-dirty`
931/// suffix on either `commit` or `kernel_commit`. Otherwise returns
932/// the full multi-line warning text — the body emitted to stderr by
933/// [`warn_on_dirty_builds`] — terminated with a trailing newline so
934/// the caller can `eprint!` it without further formatting.
935///
936/// Dimensions render in fixed order ("kernel source" before
937/// "project") so the same dirty hashes always produce byte-identical
938/// output across runs; values within each dimension are
939/// `BTreeSet`-deduped so multiple rows sharing one dirty hash list
940/// it once, and multiple distinct dirty hashes on one dimension list
941/// in lex order.
942pub(crate) fn render_dirty_warning(
943 rows_a: &[GauntletRow],
944 rows_b: &[GauntletRow],
945) -> Option<String> {
946 use std::collections::BTreeSet;
947 use std::fmt::Write;
948
949 let mut dirty_kernel: BTreeSet<&str> = BTreeSet::new();
950 let mut dirty_project: BTreeSet<&str> = BTreeSet::new();
951 for row in rows_a.iter().chain(rows_b.iter()) {
952 // `ends_with` matches the producer contract: `detect_kernel_commit`
953 // and `detect_project_commit` (src/test_support/sidecar/mod.rs) append
954 // `-dirty` as a SUFFIX to the 7-char hex via
955 // `format!("{short_hash}-dirty")`, so the dirty marker is
956 // always tail-positioned. `contains` would also match a
957 // hex hash that legitimately contains the substring `-dirty`
958 // somewhere in the middle (impossible for the current
959 // 7-char hex prefix, but a future commit-ish format change
960 // would let a non-dirty value flag itself dirty under
961 // `contains`).
962 if let Some(c) = row.kernel_commit.as_deref()
963 && c.ends_with("-dirty")
964 {
965 dirty_kernel.insert(c);
966 }
967 if let Some(c) = row.commit.as_deref()
968 && c.ends_with("-dirty")
969 {
970 dirty_project.insert(c);
971 }
972 }
973
974 if dirty_kernel.is_empty() && dirty_project.is_empty() {
975 return None;
976 }
977
978 let mut out = String::new();
979 writeln!(out, "warning: comparison includes dirty builds:").unwrap();
980 for v in &dirty_kernel {
981 writeln!(
982 out,
983 " - kernel source: {v} (working tree may have changed since this run)"
984 )
985 .unwrap();
986 }
987 for v in &dirty_project {
988 writeln!(
989 out,
990 " - project: {v} (working tree may have changed since this run)"
991 )
992 .unwrap();
993 }
994 writeln!(
995 out,
996 " Dirty runs overwrite previous results with the same HEAD."
997 )
998 .unwrap();
999 writeln!(out, " Commit changes for reproducible-ish comparisons.").unwrap();
1000 Some(out)
1001}
1002
1003/// Render the actionable bail message emitted when one side's filter
1004/// matches zero sidecars in the pool.
1005///
1006/// Beyond the generic "check filters / run `cargo ktstr stats list`"
1007/// redirect, this helper inspects WHY the filter matched nothing and
1008/// adds three operator-actionable hints when applicable:
1009///
1010/// 1. **Dirty-form hint**: when the user passed
1011/// `--project-commit X` (or per-side / kernel-commit equivalent)
1012/// and the pool contains a row whose `commit` (or `kernel_commit`)
1013/// is `X-dirty`, append "Did you mean `--project-commit X-dirty`?".
1014/// A clean-vs-dirty mismatch is the single most common cause of a
1015/// false-zero on the commit dims — `detect_project_commit` /
1016/// `detect_kernel_commit` append `-dirty` whenever HEAD-vs-index
1017/// or index-vs-worktree changes are observed, so an operator who
1018/// expected `abcdef1` but the recorded value is `abcdef1-dirty`
1019/// sees no rows match without realizing why.
1020///
1021/// 2. **Unknown run-source hint**: when the user passed
1022/// `--run-source X` (or per-side equivalent) and `X` is NOT
1023/// among the distinct `run_source` values present in the pool,
1024/// append a hint listing the actual values seen. The schema is
1025/// deliberately extensible (`"benchmark"` and other future tags
1026/// are valid), so this is a hint rather than a hard validator —
1027/// but a typo (`--run-source loca` for `local`, or `--run-source CI`
1028/// for `ci` since the values are case-sensitive) is the most
1029/// common cause of a false-zero on the source dim, and listing
1030/// the distinct values present is more actionable than asking
1031/// the operator to consult the schema doc.
1032///
1033/// 3. **list-values redirect for commit dims**: when the user
1034/// populated any commit dimension (`project_commits` /
1035/// `kernel_commits`), suggest `cargo ktstr stats list-values`
1036/// specifically — that command emits the exact distinct values
1037/// present per dimension, which is more actionable than the
1038/// generic `stats list` which only shows top-level run keys.
1039///
1040/// `side` is `"A"` or `"B"` for diagnostic context. `filter` is the
1041/// per-side `RowFilter`. `rows` is the sidecar-derived row vec
1042/// (post-`sidecar_to_row` mapping, pre-filtering). `pool_len` is
1043/// the raw pool count for the "(N pooled)" diagnostic context.
1044pub(crate) fn zero_match_diagnostic(
1045 side: &str,
1046 filter: &RowFilter,
1047 rows: &[GauntletRow],
1048 pool_len: usize,
1049) -> String {
1050 let mut msg = format!(
1051 "perf-delta: {side} side filter matched 0 sidecars in \
1052 pool ({pool_len} pooled). Check the per-side filters or \
1053 confirm the runs exist with `cargo ktstr stats list`."
1054 );
1055
1056 // Dirty-form hint per commit dimension. Only fires when a
1057 // populated filter value's `-dirty` form is in the pool.
1058 let mut dirty_hints: Vec<String> = Vec::new();
1059 for want in &filter.project_commits {
1060 let dirty = format!("{want}-dirty");
1061 let found = rows
1062 .iter()
1063 .any(|r| r.commit.as_deref() == Some(dirty.as_str()));
1064 if found {
1065 dirty_hints.push(format!(
1066 "no rows match `--project-commit {want}` but `{dirty}` exists in the pool — \
1067 did you mean `--project-commit {dirty}`?"
1068 ));
1069 }
1070 }
1071 for want in &filter.kernel_commits {
1072 let dirty = format!("{want}-dirty");
1073 let found = rows
1074 .iter()
1075 .any(|r| r.kernel_commit.as_deref() == Some(dirty.as_str()));
1076 if found {
1077 dirty_hints.push(format!(
1078 "no rows match `--kernel-commit {want}` but `{dirty}` exists in the pool — \
1079 did you mean `--kernel-commit {dirty}`?"
1080 ));
1081 }
1082 }
1083 for hint in dirty_hints {
1084 msg.push_str("\nhint: ");
1085 msg.push_str(&hint);
1086 }
1087
1088 // Unknown-run-source hint. Fires when a `--run-source X` value
1089 // is not present in the pool — typo / wrong casing is the most
1090 // common cause. Schema is intentionally extensible (operators
1091 // can write `"benchmark"` etc.), so this is a hint not a hard
1092 // validator: the bail still fires, the operator still sees the
1093 // distinct values present, and the producer side is free to
1094 // emit any tag.
1095 if !filter.run_sources.is_empty() {
1096 let pool_run_sources: std::collections::BTreeSet<&str> = rows
1097 .iter()
1098 .filter_map(|r| r.run_source.as_deref())
1099 .collect();
1100 let unknowns: Vec<&str> = filter
1101 .run_sources
1102 .iter()
1103 .map(String::as_str)
1104 .filter(|want| !pool_run_sources.contains(*want))
1105 .collect();
1106 if !unknowns.is_empty() {
1107 let mut present: Vec<&str> = pool_run_sources.iter().copied().collect();
1108 present.sort_unstable();
1109 let unknown_list = unknowns
1110 .iter()
1111 .map(|s| format!("`{s}`"))
1112 .collect::<Vec<_>>()
1113 .join(", ");
1114 let present_list = if present.is_empty() {
1115 "(none — every row has `run_source: null`)".to_string()
1116 } else {
1117 present
1118 .iter()
1119 .map(|s| format!("`{s}`"))
1120 .collect::<Vec<_>>()
1121 .join(", ")
1122 };
1123 msg.push_str(&format!(
1124 "\nhint: --run-source {unknown_list} not found in pool; \
1125 distinct values present: {present_list}. Values are \
1126 case-sensitive (`ci` ≠ `CI`)."
1127 ));
1128 }
1129 }
1130
1131 // Unknown-resolve-source hint. Mirrors the run_sources hint for the
1132 // scheduler-resolution-path dimension: fires when a `--resolve-source`
1133 // value is not among the resolve_sources present in the pool.
1134 if !filter.resolve_sources.is_empty() {
1135 let pool_resolve_sources: std::collections::BTreeSet<&str> = rows
1136 .iter()
1137 .filter_map(|r| r.resolve_source.as_deref())
1138 .collect();
1139 let unknowns: Vec<&str> = filter
1140 .resolve_sources
1141 .iter()
1142 .map(String::as_str)
1143 .filter(|want| !pool_resolve_sources.contains(*want))
1144 .collect();
1145 if !unknowns.is_empty() {
1146 let mut present: Vec<&str> = pool_resolve_sources.iter().copied().collect();
1147 present.sort_unstable();
1148 let unknown_list = unknowns
1149 .iter()
1150 .map(|s| format!("`{s}`"))
1151 .collect::<Vec<_>>()
1152 .join(", ");
1153 let present_list = if present.is_empty() {
1154 "(none — every row has `resolve_source: null`)".to_string()
1155 } else {
1156 present
1157 .iter()
1158 .map(|s| format!("`{s}`"))
1159 .collect::<Vec<_>>()
1160 .join(", ")
1161 };
1162 msg.push_str(&format!(
1163 "\nhint: --resolve-source {unknown_list} not found in pool; \
1164 distinct values present: {present_list}. Values are \
1165 case-sensitive (`auto_built` \u{2260} `Auto_Built`)."
1166 ));
1167 }
1168 }
1169
1170 // Unknown-cpu-budget hint. Mirrors the run_sources hint for the
1171 // numeric budget dimension: fires when a `--cpu-budget` value is
1172 // not among the budgets present in the pool (the budgets render
1173 // canonically as decimal via `cpu_budget.to_string()`, so a
1174 // non-canonical input like `032` lists as not-found against the
1175 // canonical present set). Skip rows (`cpu_budget == None`) carry no
1176 // budget and are excluded.
1177 if !filter.cpu_budgets.is_empty() {
1178 let pool_budgets: std::collections::BTreeSet<u32> =
1179 rows.iter().filter_map(|r| r.cpu_budget).collect();
1180 let present_strs: std::collections::BTreeSet<String> =
1181 pool_budgets.iter().map(|b| b.to_string()).collect();
1182 let unknowns: Vec<&str> = filter
1183 .cpu_budgets
1184 .iter()
1185 .map(String::as_str)
1186 .filter(|want| !present_strs.contains(*want))
1187 .collect();
1188 if !unknowns.is_empty() {
1189 let unknown_list = unknowns
1190 .iter()
1191 .map(|s| format!("`{s}`"))
1192 .collect::<Vec<_>>()
1193 .join(", ");
1194 let present_list = if pool_budgets.is_empty() {
1195 "(none — every row is a skip with no recorded budget)".to_string()
1196 } else {
1197 pool_budgets
1198 .iter()
1199 .map(|b| format!("`{b}`"))
1200 .collect::<Vec<_>>()
1201 .join(", ")
1202 };
1203 msg.push_str(&format!(
1204 "\nhint: --cpu-budget {unknown_list} not found in pool; \
1205 distinct budgets present: {present_list}."
1206 ));
1207 }
1208 }
1209
1210 // list-values redirect: only fires when the operator narrowed
1211 // on a commit dimension. Generic case (no commit filter) keeps
1212 // the existing `stats list` redirect at the top of the message
1213 // — `list-values` would emit a long per-dimension dump that
1214 // isn't more actionable than `stats list` for a kernel/scheduler
1215 // /topology miss.
1216 let touched_commit_dim =
1217 !filter.project_commits.is_empty() || !filter.kernel_commits.is_empty();
1218 if touched_commit_dim {
1219 msg.push_str(
1220 "\nhint: run `cargo ktstr stats list-values` to see every \
1221 distinct commit value present in the pool — the specific \
1222 value the filter expected may not have a sidecar yet, or \
1223 may differ from what was recorded by \
1224 `detect_project_commit` / `detect_kernel_commit`.",
1225 );
1226 }
1227 msg
1228}
1229
1230/// Resolved inputs for the `perf-delta` render phase.
1231///
1232/// Produced by [`prepare_partitioned_comparison`] — the validation,
1233/// pooling, partitioning, and averaging steps of [`compare_partitions`]
1234/// extracted into an owned bundle so the render half reads from one
1235/// destructure rather than a long flat prelude. Every field carries
1236/// the exact value the prior in-function prelude bound; the render
1237/// half computes labels and headers from these, then runs the four
1238/// print helpers.
1239struct PartitionedComparison {
1240 /// Dimensions on which `filter_a` differs from `filter_b` — the
1241 /// A/B contrast axes. Guaranteed non-empty (the empty case bails).
1242 slicing_dims: Vec<Dimension>,
1243 /// Dimensions NOT in `slicing_dims`, in canonical
1244 /// [`Dimension::ALL`] order — the join axes for pairing.
1245 pairing_dims: Vec<Dimension>,
1246 /// Every sidecar under the runs root (or `--dir` override).
1247 /// Guaranteed non-empty (the empty pool bails).
1248 pool: Vec<crate::test_support::SidecarResult>,
1249 /// `pool` converted to rows, same length and iteration order.
1250 rows: Vec<GauntletRow>,
1251 /// A-side rows fed to [`compare_rows_by`]: averaged mean rows
1252 /// under [`RowPrep::Averaged`], the raw filtered rows under
1253 /// [`RowPrep::PerRunPooled`].
1254 rows_a_for_compare: Vec<GauntletRow>,
1255 /// B-side counterpart of `rows_a_for_compare`.
1256 rows_b_for_compare: Vec<GauntletRow>,
1257 /// A-side averaged groups under [`RowPrep::Averaged`]; `None` under
1258 /// [`RowPrep::PerRunPooled`]. Drives the per-group pass-count block.
1259 avg_a: Option<Vec<AveragedGroup>>,
1260 /// B-side counterpart of `avg_a`.
1261 avg_b: Option<Vec<AveragedGroup>>,
1262 /// Post-typed-filter A-side contributor row count (pre-aggregation)
1263 /// — the "averaged across N runs" header numerator.
1264 pre_agg_a: usize,
1265 /// B-side counterpart of `pre_agg_a`.
1266 pre_agg_b: usize,
1267}
1268
/// Selects how [`prepare_partitioned_comparison`] shapes each side's
/// filtered rows before handing them to the compare half.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RowPrep {
    /// Default averaging behavior: rows on a side that share a pairing
    /// key are folded into a single arithmetic-mean row.
    Averaged,
    /// No folding: every filtered row is kept, duplicates of a pairing
    /// key included. Used by the `perf-delta --noise-adjust` path,
    /// where [`noise_findings`] groups the N runs per key per side into
    /// a spread — so several rows per key is the intended input, not an
    /// error.
    PerRunPooled,
}
1282
1283/// Validate, pool, partition, and average the inputs for
1284/// [`compare_partitions`]. Returns the owned [`PartitionedComparison`]
1285/// bundle the render half destructures, or bails with the same
1286/// diagnostics in the same order as the original in-function prelude:
1287/// identical-rows gate, empty-pool gate, then the two zero-match
1288/// gates. The multi-dim slicing warning and the dirty-build /
1289/// overcommit warnings are emitted here so they precede the render
1290/// half's header lines, preserving output order.
1291fn prepare_partitioned_comparison(
1292 filter_a: &RowFilter,
1293 filter_b: &RowFilter,
1294 dir: Option<&std::path::Path>,
1295 prep: RowPrep,
1296) -> anyhow::Result<PartitionedComparison> {
1297 // Validation gate 1: there must be at least one dimension
1298 // on which filter_a differs from filter_b — otherwise the
1299 // operator hasn't expressed a contrast and the function has
1300 // nothing to compare. Empty slicing dims OR identical filters
1301 // are both rejected here with actionable diagnostics so the
1302 // user knows which knob to turn.
1303 let slicing_dims = derive_slicing_dims(filter_a, filter_b);
1304 if slicing_dims.is_empty() {
1305 anyhow::bail!(
1306 "perf-delta: A and B select identical rows. \
1307 Specify at least one per-side filter (e.g. \
1308 --a-kernel 6.14 --b-kernel 6.15) to define what \
1309 dimension separates the two sides."
1310 );
1311 }
1312
1313 // Validation gate 2: warn (not error) when slicing on
1314 // multiple dimensions. The result is still well-defined —
1315 // the comparison joins on remaining pairing dims and
1316 // collapses the slicing-dim cross-product into a single
1317 // A/B contrast — but the operator is asking for a multi-axis
1318 // delta which is harder to interpret. The warning surfaces
1319 // the dim list so they can confirm the cohort shape.
1320 if slicing_dims.len() > 1 {
1321 let dim_names: Vec<&str> = slicing_dims.iter().map(|d| d.name()).collect();
1322 eprintln!(
1323 "warning: perf-delta: slicing on {n} dimensions [{dims}]; \
1324 results compress multiple axes into a single A/B contrast.",
1325 n = slicing_dims.len(),
1326 dims = dim_names.join(", "),
1327 );
1328 }
1329
1330 // Pairing dims = every dimension NOT in the slicing-dim set,
1331 // in canonical [`Dimension::ALL`] order. The dynamic key
1332 // shape `(scenario, *pairing_dims)` matches whatever
1333 // dimensions are currently NOT being contrasted across A
1334 // and B.
1335 let pairing_dims = Dimension::pairing_dims(&slicing_dims);
1336
1337 // Pool every sidecar under the runs root (or the operator's
1338 // --dir override) and convert to rows. The full-scan cost
1339 // is acceptable for the single-comparison-per-session
1340 // workflow.
1341 //
1342 // `--dir`-loaded sidecars get their `source` field rewritten
1343 // to `"archive"` via `apply_archive_source_override` before
1344 // row conversion. The producer-side `"local"` / `"ci"`
1345 // distinction is meaningful on the host that wrote the
1346 // sidecars; once the files have been copied off, the only
1347 // useful classification is "this came from elsewhere", which
1348 // is what `--run-source archive` queries for. Operators who need
1349 // to retain the producer-side distinction read from the
1350 // default root (no `--dir`) so values pass through untouched.
1351 let (root, override_archive) = match dir {
1352 Some(d) => (d.to_path_buf(), true),
1353 None => (crate::test_support::runs_root(), false),
1354 };
1355 let mut pool = crate::test_support::collect_pool(&root);
1356 if override_archive {
1357 crate::test_support::apply_archive_source_override(&mut pool);
1358 }
1359 if pool.is_empty() {
1360 anyhow::bail!(
1361 "perf-delta: no sidecar data found under {}. \
1362 Run `cargo ktstr test` to generate runs, or pass \
1363 --dir to point at an archived sidecar tree.",
1364 root.display(),
1365 );
1366 }
1367 let rows: Vec<GauntletRow> = pool.iter().map(sidecar_to_row).collect();
1368
1369 // Partition: apply each side's filter to the same pool. A
1370 // row may match both sides (e.g. when project_commit is the
1371 // slicing dim, a row whose `project_commit` is in
1372 // `filter_a.project_commits` matches A but NOT B unless
1373 // `filter_b.project_commits` also contains it).
1374 let rows_a = apply_row_filters(&rows, filter_a);
1375 let rows_b = apply_row_filters(&rows, filter_b);
1376 if rows_a.is_empty() {
1377 anyhow::bail!(
1378 "{}",
1379 zero_match_diagnostic("A", filter_a, &rows, pool.len()),
1380 );
1381 }
1382 if rows_b.is_empty() {
1383 anyhow::bail!(
1384 "{}",
1385 zero_match_diagnostic("B", filter_b, &rows, pool.len()),
1386 );
1387 }
1388
1389 warn_on_dirty_builds(&rows_a, &rows_b);
1390 warn_on_overcommit(&rows_a, &rows_b, &pairing_dims);
1391
1392 let pre_agg_a = rows_a.len();
1393 let pre_agg_b = rows_b.len();
1394
1395 // Fold each side's rows per the caller's [`RowPrep`]. `Averaged`
1396 // collapses same-pairing-key rows into one mean row (the default);
1397 // PerRunPooled keeps every row distinct, including duplicate
1398 // pairing keys (N-per-key is the intended noise-adjust input).
1399 let (rows_a_for_compare, rows_b_for_compare, avg_a, avg_b) = match prep {
1400 RowPrep::Averaged => {
1401 let avg_a = group_and_average_by(&rows_a, &pairing_dims);
1402 let avg_b = group_and_average_by(&rows_b, &pairing_dims);
1403 let a_rows: Vec<GauntletRow> = avg_a.iter().map(|r| r.row.clone()).collect();
1404 let b_rows: Vec<GauntletRow> = avg_b.iter().map(|r| r.row.clone()).collect();
1405 (a_rows, b_rows, Some(avg_a), Some(avg_b))
1406 }
1407 RowPrep::PerRunPooled => {
1408 // The noise-spread compare groups the N runs per pairing key
1409 // per side (see `noise_findings`), so N-per-key is the
1410 // intended input. Keep every row including duplicate
1411 // pairing keys — N-per-key is expected here, not an error.
1412 (rows_a, rows_b, None, None)
1413 }
1414 };
1415
1416 Ok(PartitionedComparison {
1417 slicing_dims,
1418 pairing_dims,
1419 pool,
1420 rows,
1421 rows_a_for_compare,
1422 rows_b_for_compare,
1423 avg_a,
1424 avg_b,
1425 pre_agg_a,
1426 pre_agg_b,
1427 })
1428}
1429
1430/// Warning for the SCALAR compare path when compared tests declare
1431/// `perf_delta_assertions` gates it does not evaluate. Declared gates are a
1432/// CI-gating perf assertion; gating on single-run scalar data would flip CI on
1433/// noise, so they are honored ONLY under `perf-delta --noise-adjust` (multi-run,
1434/// Welch + separation). Returning the message instead of a bare no-op keeps the
1435/// gate from silently disappearing on the default `perf-delta` invocation.
1436/// `None` when no compared test declares a gate. Pure (no I/O) so the
1437/// count/message is unit-testable. Counts DISTINCT scenarios carrying a gate.
1438pub(crate) fn scalar_declared_gate_warning(rows_b: &[GauntletRow]) -> Option<String> {
1439 let n = rows_b
1440 .iter()
1441 .filter(|r| !r.perf_delta_assertions.is_empty())
1442 .map(|r| r.scenario.as_str())
1443 .collect::<std::collections::BTreeSet<_>>()
1444 .len();
1445 (n > 0).then(|| {
1446 format!(
1447 "NOTE — {n} compared test(s) declare perf_delta_assertions gate(s) that this \
1448 scalar comparison does NOT evaluate. Declared gates are enforced only under \
1449 `perf-delta --noise-adjust N` (single-run scalar gating would flip CI on \
1450 noise); re-run with --noise-adjust to gate on them."
1451 )
1452 })
1453}
1454
1455/// Compare two filter-defined partitions of the sidecar pool and
1456/// report regressions across slicing dimensions.
1457///
1458/// `filter_a` and `filter_b` are the per-side row filters that
1459/// define the A/B contrast. The dimensions on which the two
1460/// filters DIFFER are the SLICING dimensions; the dimensions on
1461/// which they AGREE (or on which both are unconstrained) are the
1462/// PAIRING dimensions. Two rows pair across the A/B sides iff
1463/// their dynamic [`PairingKey`] (scenario plus every pairing-dim
1464/// value) is equal — so the comparison naturally ignores
1465/// differences on the slicing axes (those ARE the contrast) and
1466/// joins on everything else.
1467///
1468/// `dir` overrides the default `runs_root()` for pool collection.
1469/// Pass `Some(path)` to compare archived sidecar trees copied off
1470/// a CI host; pass `None` to walk `target/ktstr/` (or
1471/// `CARGO_TARGET_DIR/ktstr/`).
1472///
1473/// Validation:
1474/// - Empty slicing-dim set (every dimension is identical between
1475/// A and B): bail with "specify at least one --a-X / --b-X to
1476/// define what to compare". This includes the no-flags-at-all
1477/// case (both filters are the empty default).
1478/// - Identical effective filters with at least one slicing dim is
1479/// a contradiction caught by clap-level construction; the
1480/// downstream check is "every value in filter_a appears in
1481/// filter_b on the same dim and vice versa." We catch that as
1482/// "A and B select identical rows" — symmetric to the empty
1483/// case.
1484/// - More than one slicing dimension prints a warning to stderr
1485/// ("warning: slicing on N dimensions; results compress
1486/// multiple axes into a single A/B contrast") but does NOT
1487/// bail — multi-dim slicing is a deliberate feature for
1488/// comparing e.g. (kernel A + scheduler A) against (kernel B +
1489/// scheduler B).
1490///
1491/// Groups every matching sidecar within each side by pairing key
1492/// and averages the metrics across the group.
1493///
1494/// Returns 1 when the confident regressions fail the operator gate — their
1495/// count reaches `gate.fail_threshold` (default 5) or a `gate.must_fail`
1496/// metric regressed; 0 otherwise. See [`gate_fails`] / [`GateOptions`].
1497pub fn compare_partitions(
1498 filter_a: &RowFilter,
1499 filter_b: &RowFilter,
1500 filter: Option<&str>,
1501 policy: &ComparisonPolicy,
1502 dir: Option<&std::path::Path>,
1503 gate: &GateOptions,
1504) -> anyhow::Result<i32> {
1505 let prepared = prepare_partitioned_comparison(filter_a, filter_b, dir, RowPrep::Averaged)?;
1506 let PartitionedComparison {
1507 slicing_dims,
1508 pairing_dims,
1509 pool,
1510 rows,
1511 rows_a_for_compare,
1512 rows_b_for_compare,
1513 avg_a,
1514 avg_b,
1515 pre_agg_a,
1516 pre_agg_b,
1517 } = &prepared;
1518
1519 let report = compare_rows_by(
1520 rows_a_for_compare,
1521 rows_b_for_compare,
1522 pairing_dims,
1523 filter,
1524 policy,
1525 );
1526
1527 // Side labels derive from the slicing dims' filter values.
1528 // Single slicing dim: e.g. "6.14.2" / "6.15.0". Multi: e.g.
1529 // "6.14.2:scx_rusty" / "6.15.0:scx_alpha". >3 values per dim:
1530 // collapse to "A"/"B" to keep column headers readable.
1531 let label_a = render_side_label(filter_a, slicing_dims, "A");
1532 let label_b = render_side_label(filter_b, slicing_dims, "B");
1533
1534 // Header lines: name the slicing and pairing axes so the
1535 // operator can confirm the comparison shape at a glance.
1536 let slice_names: Vec<&str> = slicing_dims.iter().map(|d| d.name()).collect();
1537 let pair_names: Vec<&str> = pairing_dims.iter().map(|d| d.name()).collect();
1538 println!("slicing dimensions: {}", slice_names.join(", "));
1539 println!(
1540 "pairing on: scenario{}{}",
1541 if pair_names.is_empty() { "" } else { ", " },
1542 pair_names.join(", "),
1543 );
1544 // Declared gates are not evaluated on the scalar path — warn rather than
1545 // silently ignore (they are honored only under `perf-delta --noise-adjust`).
1546 if let Some(warning) = scalar_declared_gate_warning(rows_b_for_compare) {
1547 println!("{warning}");
1548 }
1549
1550 println!(
1551 "{}",
1552 format_average_header(*pre_agg_a, *pre_agg_b, &label_a, &label_b)
1553 );
1554
1555 // Scalar findings table.
1556 print_scalar_findings_table(&report, &label_a, &label_b);
1557
1558 // Scalar summary block — regressions / improvements /
1559 // unchanged + skipped-failed + per-group pass counts +
1560 // new_in_b / removed_from_a.
1561 print_summary_block(&report, avg_a, avg_b, &label_a, &label_b);
1562
1563 // Host-context delta. Same first-Some(host) baseline
1564 // `compare_partitions` uses — picking representative hosts
1565 // off the partitioned sidecars rather than the full pool so
1566 // the delta reflects what actually fed the comparison.
1567 print_host_context_delta(pool, rows, filter_a, filter_b, &label_a, &label_b);
1568
1569 // Operator gate: the significance policy above decided WHICH deltas are
1570 // confident regressions; the gate decides HOW MANY / WHICH-NAMED of them
1571 // fail the run. (--all-metrics is a no-op on this path — the scalar table
1572 // already lists every changed metric and the unchanged COUNT prints in
1573 // the summary; it reveals stable/noisy rows on the --noise-adjust table.)
1574 let regressing: Vec<&str> = report
1575 .findings
1576 .iter()
1577 .filter(|f| f.kind == FindingKind::Regression)
1578 .map(|f| f.metric.name)
1579 .collect();
1580 Ok(if gate_fails(®ressing, gate) { 1 } else { 0 })
1581}
1582
/// Operator-facing failure gate and render switches for a perf-delta run.
///
/// The per-metric significance policy determines WHICH deltas count as
/// confident regressions. These options sit on top of that verdict and
/// determine HOW MANY (or WHICH-NAMED) confident regressions turn the run
/// into a failure, plus whether stable/noisy rows are rendered at all.
/// Consumed by [`gate_fails`].
#[derive(Debug, Clone, Default)]
pub struct GateOptions {
    /// Count gate: the run fails once at least this many confident
    /// regressions are present.
    ///
    /// * `None` — use the default of 5, so a handful of regressions is
    ///   tolerated as run-to-run noise.
    /// * `Some(1)` — fail on any regression (`--fail-threshold 1`).
    /// * `Some(0)` — count gate switched off; only [`Self::must_fail`] can
    ///   fail the run.
    pub fail_threshold: Option<usize>,
    /// Registry names of metrics whose regression fails the run on its own,
    /// independent of the count gate (the two conditions are ORed). The
    /// caller is expected to have validated the names against the metric
    /// registry.
    pub must_fail: Vec<String>,
    /// When set, the `--noise-adjust` table renders every compared metric
    /// row (stable and noisy ones included) rather than only the meaningful
    /// ones. Purely a display switch — it never influences the gate.
    pub show_all: bool,
}
1604
1605/// Decide whether a perf-delta run FAILS, given the registry names of the
1606/// confident regressions it found. Fails iff the count meets
1607/// [`GateOptions::fail_threshold`] (default 5; `Some(0)` disables the count
1608/// gate), OR any regressing metric is in [`GateOptions::must_fail`] (ORed
1609/// on top). The caller passes the CLASSIFIED regressions, so display-hidden
1610/// (suppressed) rows still feed the gate.
1611pub(crate) fn gate_fails(regressing_metrics: &[&str], gate: &GateOptions) -> bool {
1612 let n = gate.fail_threshold.unwrap_or(5);
1613 let fail_on_count = n >= 1 && regressing_metrics.len() >= n;
1614 let fail_on_must = !gate.must_fail.is_empty()
1615 && regressing_metrics
1616 .iter()
1617 .any(|m| gate.must_fail.iter().any(|w| w.as_str() == *m));
1618 fail_on_count || fail_on_must
1619}
1620
#[cfg(test)]
mod gate_option_tests {
    use super::*;

    /// Build a gate with an explicit count threshold and the given must-fail
    /// metric names; every other option keeps its default.
    fn gate_with(threshold: usize, must_fail: &[&str]) -> GateOptions {
        GateOptions {
            fail_threshold: Some(threshold),
            must_fail: must_fail.iter().map(|name| name.to_string()).collect(),
            ..Default::default()
        }
    }

    /// With no explicit threshold the count gate trips at exactly five
    /// regressions; fewer are tolerated as run-to-run noise.
    #[test]
    fn default_gate_fails_only_at_five_regressions() {
        let gate = GateOptions::default();
        let none: [&str; 0] = [];
        let four = ["a", "b", "c", "d"];
        let five = ["a", "b", "c", "d", "e"];

        assert!(!gate_fails(&none, &gate), "0 regressions passes");
        assert!(
            !gate_fails(&four, &gate),
            "4 < 5 passes under the default gate"
        );
        assert!(
            gate_fails(&five, &gate),
            "5 >= 5 fails under the default gate"
        );
    }

    /// An explicit threshold N fails only once N regressions are reached.
    #[test]
    fn count_threshold_requires_n() {
        let gate = gate_with(3, &[]);
        let two = ["a", "b"];
        let three = ["a", "b", "c"];

        assert!(!gate_fails(&two, &gate), "2 < 3 passes");
        assert!(gate_fails(&three, &gate), "3 >= 3 fails");
    }

    /// A threshold of zero turns the count gate off altogether.
    #[test]
    fn zero_threshold_disables_count_gate() {
        let gate = gate_with(0, &[]);
        let three = ["a", "b", "c"];

        assert!(!gate_fails(&three, &gate), "N=0 never fails on the count");
    }

    /// A must-fail metric fails the run even with the count gate disabled,
    /// while an unrelated regression does not.
    #[test]
    fn must_fail_fails_regardless_of_count() {
        let gate = gate_with(0, &["worst_p99_wake_latency_us"]);
        let pinned = ["worst_p99_wake_latency_us"];
        let unrelated = ["some_other_metric"];

        assert!(
            gate_fails(&pinned, &gate),
            "a must-fail metric regressing fails even with the count gate off"
        );
        assert!(
            !gate_fails(&unrelated, &gate),
            "a non-must-fail regression does not fail with the count gate off"
        );
    }

    /// The must-fail list is ORed on top of the count gate: one pinned
    /// regression fails the run although the count is far below threshold.
    #[test]
    fn must_fail_is_ored_above_the_count() {
        let gate = gate_with(10, &["avg_dsq_depth"]);
        let pinned = ["avg_dsq_depth"];

        assert!(
            gate_fails(&pinned, &gate),
            "must-fail fires even below the count threshold"
        );
    }
}
1693
/// Gate-facing classification of one metric's noise-adjusted verdict.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum NoiseKind {
    /// The sides are SEPARATED (Welch or disjoint bands) and the move is
    /// MATERIAL in the worsening direction for the metric's polarity. This is
    /// the only kind that can fail the gate.
    Regression,
    /// Separated and material, but in the improving direction.
    Improvement,
    /// At least one side realized fewer than 2 samples, so variance / Welch
    /// are undefined and the verdict cannot be trusted. Flagged, but does NOT
    /// fail the gate. A side that merely has high spread does not land here
    /// any more: `high_spread` is ADVISORY and annotates a reported verdict
    /// (e.g. `REGRESSION (noisy spread)`) instead of suppressing it — the fix
    /// for the old spread-gate signal inversion.
    Noisy,
    /// A separated + material change in a directionless
    /// (`Polarity::Informational`) metric. Shown, but it can NEVER fail the
    /// gate because there is no good/bad direction to regress in.
    Informational,
    /// Not separated, or separated but immaterial (below the registry
    /// dual-gate), with both sides holding >= 2 samples. Listed in the full
    /// metrics table so the operator sees every metric's A/B stats, but never
    /// gates. Produced only when the caller passes `include_stable = true`
    /// (the render path); the gate-only path omits it.
    Stable,
}
1718
1719/// One metric's noise-adjusted finding for a paired scenario.
1720pub(crate) struct NoiseFinding {
1721 /// The pairing-key label ("scenario" plus the pairing dims like
1722 /// topology/work_type, joined with "/"), so groups that share a scenario
1723 /// name but differ in a pairing dim render distinctly.
1724 pub pairing_label: String,
1725 pub metric: &'static MetricDef,
1726 pub verdict: NoiseVerdict,
1727 pub kind: NoiseKind,
1728 /// A whole-run [`crate::test_support::PerfDeltaAssertionRecord`] is declared
1729 /// on this (test, metric) — its `min_abs` / `max_regression_pct` /
1730 /// `direction` override the registry defaults in [`classify_noise`] (a
1731 /// declared value that is out of range on a corrupt/stale sidecar is
1732 /// rejected there and falls back to the registry default, but the row still
1733 /// CARRIES the declared gate). Rendered as a `(declared gate)` verdict
1734 /// annotation so the operator sees the author declared a gate here. `false`
1735 /// = no declared assertion; the registry defaults classified this row.
1736 pub gated_by_assertion: bool,
1737}
1738
1739/// One metric's noise-adjusted finding for a matched PHASE of a paired
1740/// scenario — the per-phase (`--noise-adjust` + per-phase) analog of
1741/// [`NoiseFinding`], carrying the `step_index`/`label` the render prints.
1742/// Emitted only for a `(step_index, metric)` present on BOTH sides across
1743/// the runs. Per-phase SPREAD findings are render-only EXCEPT one carrying
1744/// an author-declared phase-scoped gate (`gated_by_assertion`), which DOES
1745/// contribute to the exit via [`NoiseReport::declared_phase_regressions`].
1746pub(crate) struct NoisePhaseFinding {
1747 /// The pairing-key label (see [`NoiseFinding::pairing_label`]).
1748 pub pairing_label: String,
1749 /// `0` = BASELINE, `1..=N` = scenario Step ordinals (framework convention).
1750 pub step_index: u16,
1751 /// Mirrors [`crate::assert::PhaseBucket::label`] (`"BASELINE"` / `"Step[k]"`).
1752 pub label: String,
1753 pub metric: &'static MetricDef,
1754 pub verdict: NoiseVerdict,
1755 pub kind: NoiseKind,
1756 /// A phase-scoped declared assertion (`phase == Some(step_index)`) drove the
1757 /// gate for this per-phase row — see [`NoiseFinding::gated_by_assertion`].
1758 pub gated_by_assertion: bool,
1759}
1760
1761/// A per-phase metric present on only ONE side of the noise comparison —
1762/// either a metric absent in the other side's matched-`step_index` buckets,
1763/// or a whole one-sided `step_index`. Both collapse into one row because
1764/// a one-sided metric has no band on the absent side either way. Never gated;
1765/// surfaced so a coverage asymmetry is not silently dropped.
1766pub(crate) struct NoisePhaseCoverage {
1767 /// The pairing-key label (see [`NoiseFinding::pairing_label`]).
1768 pub pairing_label: String,
1769 pub step_index: u16,
1770 pub label: String,
1771 /// The one-sided metric, or `None` for a whole one-sided phase that
1772 /// carried no readable (non-suppressed) metric — surfaced (rendered with
1773 /// `—`) so the "phase fired but produced no data on one side" shape is not
1774 /// silently dropped.
1775 pub metric: Option<&'static MetricDef>,
1776 /// The side that carries the metric/phase; the other has none.
1777 pub present_side: ComparePartition,
1778 /// The present side's per-side mean across its runs, or `None` for the
1779 /// metric-less empty-phase-shape row.
1780 pub value: Option<f64>,
1781}
1782
1783/// A declared [`crate::test_support::PerfDeltaAssertionRecord`] that was NEVER
1784/// evaluated — its metric resolves in the registry (guaranteed by
1785/// `KtstrTestEntry::validate`) but produced no comparable value in THIS
1786/// comparison, so [`classify_noise`] never saw it. The runtime analog of the
1787/// validate-time typo check: the author declared a perf gate that silently did
1788/// not fire (metric absent from the captured data, a Rate with a zero pooled
1789/// denominator, or — for a phase-scoped assertion — a step that no matched run
1790/// carried). Surfaced (never gated) so a silently-inert declared gate is not
1791/// mistaken for a passing one.
1792pub(crate) struct NoiseAssertionCoverage {
1793 /// The pairing-key label the unmatched gate was declared under (see
1794 /// [`NoiseFinding::pairing_label`]).
1795 pub pairing_label: String,
1796 /// The declared gate that never evaluated — carries its metric, phase
1797 /// scope, and overridden thresholds for the warning message.
1798 pub assertion: crate::test_support::PerfDeltaAssertionRecord,
1799}
1800
1801/// Result of [`noise_findings`]: the per-(scenario, metric) aggregate findings,
1802/// the per-(scenario, phase, metric) findings + one-sided coverage rows, the
1803/// declared gates that never evaluated, and the number of scenarios paired
1804/// across both sides (for the footer).
1805pub(crate) struct NoiseReport {
1806 pub findings: Vec<NoiseFinding>,
1807 pub phase_findings: Vec<NoisePhaseFinding>,
1808 pub phase_coverage: Vec<NoisePhaseCoverage>,
1809 pub assertion_coverage: Vec<NoiseAssertionCoverage>,
1810 pub paired_scenarios: usize,
1811}
1812
1813impl NoiseReport {
1814 /// Confident AGGREGATE regressions — the gate's exit basis. Per-phase
1815 /// findings are render-only and deliberately excluded (see
1816 /// [`Self::phase_regressions`]).
1817 pub fn regressions(&self) -> usize {
1818 self.findings
1819 .iter()
1820 .filter(|f| f.kind == NoiseKind::Regression)
1821 .count()
1822 }
1823 /// Aggregate (whole-run) regressions on a metric the test author explicitly
1824 /// DECLARED a whole-run [`crate::test_support::PerfDeltaAssertion`] for
1825 /// (`phase: None`, `gated_by_assertion`). Like a declared PHASE gate
1826 /// ([`Self::declared_phase_regressions`]), a declared whole-run gate is an
1827 /// author opt-in that ALWAYS contributes to the exit, orthogonal to the
1828 /// operator's count / must-fail gate — an UNdeclared aggregate regression
1829 /// stays subject to that count gate (`gate_fails`).
1830 pub fn declared_regressions(&self) -> usize {
1831 self.findings
1832 .iter()
1833 .filter(|f| f.kind == NoiseKind::Regression && f.gated_by_assertion)
1834 .count()
1835 }
1836 /// Aggregate metrics flagged untrustworthy — a side had < 2 usable runs.
1837 pub fn noisy(&self) -> usize {
1838 self.findings
1839 .iter()
1840 .filter(|f| f.kind == NoiseKind::Noisy)
1841 .count()
1842 }
1843 /// Confident AGGREGATE improvements — a metric that moved MATERIALLY in the
1844 /// better direction and cleared the significance test. Render-only (an
1845 /// improvement never gates); cited in the summary footer alongside
1846 /// [`Self::regressions`] so the reader sees the composite verdict (what
1847 /// regressed AND what improved), with [`Self::stable`] as the residual.
1848 pub fn improvements(&self) -> usize {
1849 self.findings
1850 .iter()
1851 .filter(|f| f.kind == NoiseKind::Improvement)
1852 .count()
1853 }
1854 /// Aggregate metrics that did NOT move confidently+materially — the expected
1855 /// common outcome, since the noise gate (Welch / disjoint-bands + material
1856 /// dual-gate) is conservative. Cited as a residual COUNT in the footer (the
1857 /// low-value majority), never enumerated.
1858 pub fn stable(&self) -> usize {
1859 self.findings
1860 .iter()
1861 .filter(|f| f.kind == NoiseKind::Stable)
1862 .count()
1863 }
1864 /// Aggregate metrics that changed but carry no better/worse polarity
1865 /// (registry `Informational`) — cited in the footer only when present.
1866 pub fn informational(&self) -> usize {
1867 self.findings
1868 .iter()
1869 .filter(|f| f.kind == NoiseKind::Informational)
1870 .count()
1871 }
1872 /// Confident per-phase regressions — for the footer + tests ONLY, never
1873 /// the exit basis (per-phase SPREAD is render-only). A per-phase regression the author explicitly DECLARED via a
1874 /// phase-scoped [`crate::test_support::PerfDeltaAssertion`] DOES gate — see
1875 /// [`Self::declared_phase_regressions`].
1876 pub fn phase_regressions(&self) -> usize {
1877 self.phase_findings
1878 .iter()
1879 .filter(|f| f.kind == NoiseKind::Regression)
1880 .count()
1881 }
1882 /// Per-phase regressions on a metric the test author explicitly DECLARED a
1883 /// phase-scoped gate for. Unlike the render-only per-phase spread pass, a
1884 /// declared phase gate is an opt-in: the author accepted the narrower
1885 /// phase-window noise, so it contributes to the perf-delta EXIT alongside
1886 /// the aggregate [`Self::regressions`]. A per-phase regression WITHOUT a
1887 /// declared gate stays render-only (a narrow-window flake must not flip CI
1888 /// red on its own).
1889 pub fn declared_phase_regressions(&self) -> usize {
1890 self.phase_findings
1891 .iter()
1892 .filter(|f| f.kind == NoiseKind::Regression && f.gated_by_assertion)
1893 .count()
1894 }
1895 /// Per-phase metrics flagged untrustworthy — a side had < 2 usable runs.
1896 pub fn phase_noisy(&self) -> usize {
1897 self.phase_findings
1898 .iter()
1899 .filter(|f| f.kind == NoiseKind::Noisy)
1900 .count()
1901 }
1902}
1903
1904/// Row-level noise-adjusted compare — the testable core of
1905/// [`compare_partitions_noise`] (which wraps it with sidecar pooling + render),
1906/// mirroring how [`compare_rows_by`] underlies [`compare_partitions`]. Groups
1907/// each side's per-run rows by pairing key — EXCLUDING non-pass runs
1908/// (fail / skip / inconclusive / expected_failure), whose failure-mode
1909/// telemetry would corrupt the spread, exactly as [`compare_rows_by`] excludes
1910/// them — then per metric summarizes the spread and classifies via
1911/// [`noise_verdict`]. Returns one [`NoiseFinding`] per
1912/// (scenario, metric) that is changed, noisy, or — when `include_stable` is
1913/// true — unchanged-and-clean ([`NoiseKind::Stable`], shown in the full metrics
1914/// table but never gating). With `include_stable = false` only changed/noisy
1915/// metrics are returned (the gate-only path). Metrics with no signal on either
1916/// side (both means under [`ZERO_MEAN_EPS`]) and render-suppressed rate
1917/// components are omitted regardless of `include_stable`.
1918pub(crate) fn noise_findings(
1919 rows_a: &[GauntletRow],
1920 rows_b: &[GauntletRow],
1921 pairing_dims: &[Dimension],
1922 spread_threshold_pct: f64,
1923 include_stable: bool,
1924) -> NoiseReport {
1925 use std::collections::{BTreeMap, BTreeSet};
1926 let group = |rows: &[GauntletRow]| {
1927 let mut by_key: BTreeMap<PairingKey, Vec<GauntletRow>> = BTreeMap::new();
1928 for r in rows {
1929 // Exclude non-pass runs from the spread pool, mirroring the
1930 // scalar `compare_rows_by` and the averaged `group_and_average_by`:
1931 // a failed / inconclusive / skipped / expected_failure run's
1932 // telemetry is failure-mode-dominated (zeroed or an outlier), so
1933 // pooling it would corrupt the per-side [min, max] band and mean
1934 // and could produce a confident FALSE gate — the exact
1935 // silent-wrong-verdict class the noise mode exists to prevent.
1936 // A side reduced below 2 real samples by this filter is then
1937 // caught by `noise_verdict`'s n<2 insufficient_samples guard.
1938 if r.is_fail() || r.is_inconclusive() || r.is_skip() || r.expected_failure {
1939 continue;
1940 }
1941 // Derive this run's Rate metrics (e.g. the schedstat rates whose
1942 // components only materialize in ext_metrics at sidecar_to_row
1943 // time) so the spread reads them — the Averaged path derives via
1944 // group_and_average_by -> derive_rate_metrics; the per-run paths
1945 // must derive per row or every schedstat Rate is silently absent
1946 // from the verdict. Per-run derivation (each run's own num/den) is
1947 // the correct band semantics for run-to-run spread.
1948 let mut row = r.clone();
1949 crate::stats::metric::derive_rate_metrics(&mut row.ext_metrics);
1950 by_key
1951 .entry(PairingKey::from_row(&row, pairing_dims))
1952 .or_default()
1953 .push(row);
1954 }
1955 by_key
1956 };
1957 let a_by_key = group(rows_a);
1958 let b_by_key = group(rows_b);
1959
1960 let mut findings = Vec::new();
1961 let mut phase_findings = Vec::new();
1962 let mut phase_coverage = Vec::new();
1963 let mut assertion_coverage = Vec::new();
1964 let mut paired_scenarios = 0usize;
1965 for (key, a_rows) in &a_by_key {
1966 let Some(b_rows) = b_by_key.get(key) else {
1967 continue;
1968 };
1969 paired_scenarios += 1;
1970 // Metrics whose gate WAS evaluated this group, split by scope: whole-run
1971 // (aggregate pass) and phase-scoped `(step_index, metric)` (per-phase
1972 // pass). Diffed against the declared assertions after both passes to
1973 // surface any declared gate that never fired (metric absent from the
1974 // captured data).
1975 let mut consulted: BTreeSet<&'static str> = BTreeSet::new();
1976 let mut consulted_phase: BTreeSet<(u16, &'static str)> = BTreeSet::new();
1977 // Label by the FULL pairing key (scenario + pairing dims like
1978 // topology/work_type), joined via pairing_key.0.join. noise_findings groups by the full pairing key,
1979 // so a scenario run on multiple topologies/work_types forms distinct
1980 // groups; labeling by scenario alone would render them indistinguishably.
1981 let pairing_label = key.0.join("/");
1982 for m in METRICS {
1983 if is_render_suppressed_component(m.name) {
1984 continue;
1985 }
1986 let a_vals: Vec<f64> = a_rows.iter().filter_map(|r| m.read(r)).collect();
1987 let b_vals: Vec<f64> = b_rows.iter().filter_map(|r| m.read(r)).collect();
1988 if a_vals.is_empty() || b_vals.is_empty() {
1989 continue;
1990 }
1991 // Rate metrics: the per-run ratios (a_vals/b_vals) give the
1992 // run-to-run band, but the compared centroid is the pooled
1993 // Σnum/Σden (duration-weighted) — the cross-run Rate value the
1994 // registry documents (metric.rs), so --noise-adjust and the scalar
1995 // averaging compare agree on a Rate's central value while the band still measures
1996 // per-run variability. Non-Rate metrics summarize their samples.
1997 let verdict = match m.kind {
1998 MetricKind::Rate {
1999 numerator,
2000 denominator,
2001 } => {
2002 let pooled = |rows: &[GauntletRow]| -> Option<f64> {
2003 // Sum num/den only over runs that carry BOTH components,
2004 // so a run missing one cannot skew the pooled ratio (the
2005 // per-run rate is undefined for it anyway).
2006 let (num, den) = rows.iter().fold((0.0, 0.0), |(sn, sd), r| {
2007 match (r.ext_metrics.get(numerator), r.ext_metrics.get(denominator)) {
2008 (Some(n), Some(d)) => (sn + n, sd + d),
2009 _ => (sn, sd),
2010 }
2011 });
2012 (den != 0.0).then(|| num / den)
2013 };
2014 // A zero pooled denominator means no rate to compare on that
2015 // side — skip (matches the per-run derivation guard).
2016 let (Some(a_pooled), Some(b_pooled)) = (pooled(a_rows), pooled(b_rows)) else {
2017 continue;
2018 };
2019 noise_verdict_from(
2020 SideSummary::of(&a_vals).with_pooled_mean(a_pooled),
2021 SideSummary::of(&b_vals).with_pooled_mean(b_pooled),
2022 spread_threshold_pct,
2023 )
2024 }
2025 _ => noise_verdict(&a_vals, &b_vals, spread_threshold_pct),
2026 };
2027 // This metric produced a comparable verdict, so a declared gate on
2028 // it WAS evaluated — record it so the post-loop diff does not flag it
2029 // as never-evaluated. Recorded here (not at the empty-values / zero-
2030 // denominator `continue`s above) so those absent-data cases DO
2031 // surface as unmatched declared gates.
2032 consulted.insert(m.name);
2033 // Aggregate (whole-run) declared assertion for this test+metric
2034 // (phase: None). HEAD (B) side is authoritative on the declaration.
2035 let assertion = b_rows.first().and_then(|r| {
2036 r.perf_delta_assertions
2037 .iter()
2038 .find(|x| x.metric == m.name && x.phase.is_none())
2039 });
2040 let gated_by_assertion = assertion.is_some();
2041 let Some(kind) = classify_noise(&verdict, m, assertion, include_stable) else {
2042 continue;
2043 };
2044 findings.push(NoiseFinding {
2045 pairing_label: pairing_label.clone(),
2046 metric: m,
2047 verdict,
2048 kind,
2049 gated_by_assertion,
2050 });
2051 }
2052 // Per-phase sub-pass: mirror the aggregate spread for each matched
2053 // (step_index, metric), surfacing one-sided phases/metrics as coverage
2054 // rows. Render-only — never contributes to the gate exit.
2055 noise_phase_findings(
2056 a_rows,
2057 b_rows,
2058 &pairing_label,
2059 spread_threshold_pct,
2060 include_stable,
2061 &mut phase_findings,
2062 &mut phase_coverage,
2063 &mut consulted_phase,
2064 );
2065 // Diff the declared gates (HEAD/B authoritative) against what actually
2066 // evaluated this group. A whole-run gate (`phase: None`) matches when its
2067 // metric produced an aggregate verdict; a phase-scoped gate matches when
2068 // its `(phase, metric)` produced a per-phase verdict. Anything left is a
2069 // gate the author declared that silently never fired — surfaced, never
2070 // gated.
2071 if let Some(first) = b_rows.first() {
2072 for a in &first.perf_delta_assertions {
2073 let matched = match a.phase {
2074 None => consulted.contains(a.metric.as_str()),
2075 Some(step) => consulted_phase.contains(&(step, a.metric.as_str())),
2076 };
2077 if !matched {
2078 assertion_coverage.push(NoiseAssertionCoverage {
2079 pairing_label: pairing_label.clone(),
2080 assertion: a.clone(),
2081 });
2082 }
2083 }
2084 }
2085 }
2086 NoiseReport {
2087 findings,
2088 phase_findings,
2089 phase_coverage,
2090 assertion_coverage,
2091 paired_scenarios,
2092 }
2093}
2094
2095/// Classify one metric's [`NoiseVerdict`] into a [`NoiseKind`], shared by the
2096/// aggregate and per-phase noise passes. Returns `None` to omit the row: both
2097/// sides ~zero (no signal), or unchanged/immaterial-and-clean when
2098/// `include_stable` is false (the gate-only path).
2099///
2100/// A `< 2`-sample side (`insufficient_samples`) is a HARD gate that precedes
2101/// significance (Noisy, never a confident regression); the ADVISORY
2102/// `high_spread` flag does NOT gate (that suppression was the signal-inverting
2103/// bug). A row is a confident regression only when SEPARATED (Welch or disjoint
2104/// bands) AND MATERIAL (the registry dual-gate) in the worsening polarity.
2105fn classify_noise(
2106 verdict: &NoiseVerdict,
2107 m: &MetricDef,
2108 assertion: Option<&crate::test_support::PerfDeltaAssertionRecord>,
2109 include_stable: bool,
2110) -> Option<NoiseKind> {
2111 // No signal on either side (both ~zero): skip. Same zero epsilon as the
2112 // spread ratio for one consistent "is this zero".
2113 if verdict.a.mean.abs() < ZERO_MEAN_EPS && verdict.b.mean.abs() < ZERO_MEAN_EPS {
2114 return None;
2115 }
2116 // HARD gate: a side realized < 2 samples, so variance / Welch are undefined.
2117 // Precedes significance — never a confident regression. (The ADVISORY
2118 // high_spread flag, by contrast, does NOT gate.)
2119 if verdict.insufficient_samples {
2120 return Some(NoiseKind::Noisy);
2121 }
2122 // MATERIALITY: mirror the scalar dual-gate (push_scalar_findings ~L899) so
2123 // noise and default modes agree on "is this delta large enough". A
2124 // statistically-separated but trivially-small move stays Stable. --noise-adjust
2125 // conflicts with --threshold/--policy (cli.rs), so the registry defaults ARE
2126 // the resolved thresholds (an empty ComparisonPolicy::rel_threshold returns
2127 // default_rel); reading them from `m` directly is equivalent, no policy needed.
2128 // For a Rate metric, `mean` is the pooled Σnum/Σden centroid (the
2129 // registry-authoritative cross-run value), while the Welch separation arm
2130 // reads `sample_mean` (mean-of-ratios, coherent with `var`). Materiality AND
2131 // the direction label below authoritatively follow `mean`; the two statistics
2132 // can order the sides oppositely only under orders-of-magnitude denominator
2133 // skew between runs of one config (physically implausible), where the pooled
2134 // direction is the correct label anyway.
2135 let a = verdict.a.mean;
2136 let b = verdict.b.mean;
2137 let delta = b - a;
2138 let rel_delta = if a.abs() > ZERO_MEAN_EPS {
2139 (delta / a).abs()
2140 } else if delta.abs() > ZERO_MEAN_EPS {
2141 // Non-negligible move from a ~zero baseline: unbounded relative change,
2142 // so the absolute gate alone decides (mirrors the scalar path).
2143 f64::INFINITY
2144 } else {
2145 0.0
2146 };
2147 // A declared PerfDeltaAssertion OVERRIDES the registry gate for THIS
2148 // (test, metric): an absolute floor (`min_abs`), a relative threshold
2149 // (`max_regression_pct`), and/or a pinned direction. Absent (None) =>
2150 // registry defaults, so the no-assertion path is byte-identical to the
2151 // default gate. A tighter declared threshold turns a default-Stable move
2152 // into a Regression; a pinned direction can assert a registry-Informational
2153 // metric.
2154 //
2155 // `min_abs` / `max_regression_pct` come from a pub serde
2156 // `PerfDeltaAssertionRecord`. `KtstrTestEntry::validate` rejects a negative
2157 // or NaN threshold on the entry-construction path, but a hand-edited or
2158 // stale sidecar could deserialize one — and `delta.abs()` / `rel_delta` are
2159 // non-negative, so a NEGATIVE gate makes `material` unconditionally true (a
2160 // phantom confident regression that flips the exit) while a NaN gate makes
2161 // every `>=` false (silently disabled). Reject out-of-range values here and
2162 // fall back to the registry default — symmetric with the `TargetValue`
2163 // direction guard below, defending the same untrusted deserialization path.
2164 let abs_gate = assertion
2165 .and_then(|x| x.min_abs)
2166 .filter(|v| v.is_finite() && *v >= 0.0)
2167 .unwrap_or(m.default_abs);
2168 let rel_gate = assertion
2169 .and_then(|x| x.max_regression_pct)
2170 .filter(|v| v.is_finite() && *v >= 0.0)
2171 .map(|pct| pct / 100.0)
2172 .unwrap_or(m.default_rel);
2173 let material = delta.abs() >= abs_gate && rel_delta >= rel_gate;
2174 // A declared direction override, else the registry polarity. `TargetValue`
2175 // is rejected at the `PerfDeltaAssertion::with_direction` builder, so a
2176 // validated entry never carries it — but `PerfDeltaAssertionRecord` is a
2177 // pub serde type, so a hand-edited or stale sidecar could deserialize a
2178 // `direction: TargetValue`. Symmetric target-distance gating is
2179 // unimplemented (the polarity path would misread it as increase-is-worse),
2180 // so ignore it here and inherit the registry polarity — matching the
2181 // entry-path guarantee on the sidecar-deserialization path.
2182 let classify = match assertion.and_then(|x| x.direction) {
2183 Some(crate::test_support::Polarity::TargetValue(_)) | None => m.classify_direction(),
2184 Some(p) => p.classify_direction(),
2185 };
2186
2187 Some(if verdict.separated && material {
2188 match classify {
2189 // Directionless metric: a separated + material move with no good/bad
2190 // direction — shown, never fails the gate.
2191 None => NoiseKind::Informational,
2192 Some(higher_is_worse) => {
2193 // Polarity split by the SIGN of the mean delta (b vs a), NOT a
2194 // band position: separation can come from the Welch arm even when
2195 // b.mean sits inside a's [min, max] band.
2196 let worsened = if higher_is_worse { b > a } else { b < a };
2197 if worsened {
2198 NoiseKind::Regression
2199 } else {
2200 NoiseKind::Improvement
2201 }
2202 }
2203 }
2204 } else if include_stable {
2205 NoiseKind::Stable
2206 } else {
2207 return None; // unchanged/immaterial + clean: omit (gate-only path)
2208 })
2209}
2210
2211/// Push one one-sided per-phase metric to `coverage` — a metric present in a
2212/// matched-`step_index` bucket on one side only. `value` is the present
2213/// side's per-side mean across its runs (the absent side has none).
2214fn push_noise_phase_coverage(
2215 coverage: &mut Vec<NoisePhaseCoverage>,
2216 pairing_label: &str,
2217 step_index: u16,
2218 label: &str,
2219 metric: &'static MetricDef,
2220 present_side: ComparePartition,
2221 vals: &[f64],
2222) {
2223 if vals.is_empty() {
2224 return;
2225 }
2226 let value = vals.iter().sum::<f64>() / vals.len() as f64;
2227 coverage.push(NoisePhaseCoverage {
2228 pairing_label: pairing_label.to_string(),
2229 step_index,
2230 label: label.to_string(),
2231 metric: Some(metric),
2232 present_side,
2233 value: Some(value),
2234 });
2235}
2236
2237/// Surface every non-suppressed metric of a whole one-sided `step_index` (a
2238/// phase present on only one side's runs) as [`NoisePhaseCoverage`] rows, so a
2239/// scenario-shape asymmetry is not silently dropped.
2240fn push_noise_unpaired_step(
2241 coverage: &mut Vec<NoisePhaseCoverage>,
2242 pairing_label: &str,
2243 step_index: u16,
2244 side: ComparePartition,
2245 buckets: &[&crate::assert::PhaseBucket],
2246) {
2247 let label = buckets[0].label.clone();
2248 let names: std::collections::BTreeSet<&str> = buckets
2249 .iter()
2250 .flat_map(|p| p.metrics.keys())
2251 .map(String::as_str)
2252 .collect();
2253 let before = coverage.len();
2254 for name in names {
2255 if is_render_suppressed_component(name) {
2256 continue;
2257 }
2258 let Some(m) = metric_def(name) else {
2259 continue;
2260 };
2261 let vals: Vec<f64> = buckets
2262 .iter()
2263 .filter_map(|p| p.metrics.get(name).copied())
2264 .collect();
2265 push_noise_phase_coverage(coverage, pairing_label, step_index, &label, m, side, &vals);
2266 }
2267 if coverage.len() == before {
2268 // A one-sided phase with no readable (non-suppressed) metric — a
2269 // synthesized capture-free step can carry an empty metrics map. Surface
2270 // the empty shape (metric/value None -> rendered with `—`) so it is not
2271 // silently dropped.
2272 coverage.push(NoisePhaseCoverage {
2273 pairing_label: pairing_label.to_string(),
2274 step_index,
2275 label,
2276 metric: None,
2277 present_side: side,
2278 value: None,
2279 });
2280 }
2281}
2282
2283/// Per-phase noise sub-pass for one matched `(a_rows, b_rows)` pair, over the
2284/// N per-run [`crate::assert::PhaseBucket`]s per side. For each matched
2285/// `step_index` it walks the union of metric names
2286/// and emits a [`NoisePhaseFinding`] per `(step, metric)` present on BOTH sides
2287/// (spread verdict via the same machinery as the aggregate pass, incl. the
2288/// pooled-`Σnum/Σden` Rate centroid — here summed WITHIN the phase from
2289/// `bucket.metrics`), or a [`NoisePhaseCoverage`] for a one-sided metric or a
2290/// whole one-sided `step_index`. Skips the pair unless BOTH sides have at least
2291/// one run carrying phases (single-phase scenarios). Render-only: nothing here
2292/// contributes to the gate exit.
2293#[allow(clippy::too_many_arguments)]
2294fn noise_phase_findings(
2295 a_rows: &[GauntletRow],
2296 b_rows: &[GauntletRow],
2297 pairing_label: &str,
2298 spread_threshold_pct: f64,
2299 include_stable: bool,
2300 findings: &mut Vec<NoisePhaseFinding>,
2301 coverage: &mut Vec<NoisePhaseCoverage>,
2302 consulted_phase: &mut std::collections::BTreeSet<(u16, &'static str)>,
2303) {
2304 use std::collections::BTreeSet;
2305 // Single-phase scenarios carry empty phases on every run; skip the per-phase
2306 // view if either side has no run with phases.
2307 let has_phases = |rows: &[GauntletRow]| rows.iter().any(|r| !r.phases.is_empty());
2308 if !has_phases(a_rows) || !has_phases(b_rows) {
2309 return;
2310 }
2311 // A nested fn (not a closure) so the elided output lifetime ties the
2312 // borrowed &PhaseBucket to `rows` — a closure can't express that linkage.
2313 fn by_step(
2314 rows: &[GauntletRow],
2315 ) -> std::collections::BTreeMap<u16, Vec<&crate::assert::PhaseBucket>> {
2316 let mut m: std::collections::BTreeMap<u16, Vec<&crate::assert::PhaseBucket>> =
2317 std::collections::BTreeMap::new();
2318 for r in rows {
2319 for p in &r.phases {
2320 m.entry(p.step_index).or_default().push(p);
2321 }
2322 }
2323 m
2324 }
2325 let a_by_step = by_step(a_rows);
2326 let b_by_step = by_step(b_rows);
2327 let steps: BTreeSet<u16> = a_by_step.keys().chain(b_by_step.keys()).copied().collect();
2328 for step_index in steps {
2329 match (a_by_step.get(&step_index), b_by_step.get(&step_index)) {
2330 (Some(a_buckets), Some(b_buckets)) => {
2331 let label = a_buckets[0].label.clone();
2332 let names: BTreeSet<&str> = a_buckets
2333 .iter()
2334 .chain(b_buckets.iter())
2335 .flat_map(|p| p.metrics.keys())
2336 .map(String::as_str)
2337 .collect();
2338 for name in names {
2339 if is_render_suppressed_component(name) {
2340 continue;
2341 }
2342 let Some(m) = metric_def(name) else {
2343 continue;
2344 };
2345 let a_vals: Vec<f64> = a_buckets
2346 .iter()
2347 .filter_map(|p| p.metrics.get(name).copied())
2348 .collect();
2349 let b_vals: Vec<f64> = b_buckets
2350 .iter()
2351 .filter_map(|p| p.metrics.get(name).copied())
2352 .collect();
2353 match (a_vals.is_empty(), b_vals.is_empty()) {
2354 (true, true) => continue,
2355 // Present on only one matched side: no band on the other
2356 // — coverage, not a delta.
2357 (false, true) => {
2358 push_noise_phase_coverage(
2359 coverage,
2360 pairing_label,
2361 step_index,
2362 &label,
2363 m,
2364 ComparePartition::A,
2365 &a_vals,
2366 );
2367 continue;
2368 }
2369 (true, false) => {
2370 push_noise_phase_coverage(
2371 coverage,
2372 pairing_label,
2373 step_index,
2374 &label,
2375 m,
2376 ComparePartition::B,
2377 &b_vals,
2378 );
2379 continue;
2380 }
2381 (false, false) => {}
2382 }
2383 let verdict = match m.kind {
2384 MetricKind::Rate {
2385 numerator,
2386 denominator,
2387 } => {
2388 // Per-phase Rate: pooled Σnum/Σden WITHIN the phase
2389 // from bucket.metrics (phase-derivable rates carry
2390 // both components per phase), band from per-run ratios.
2391 let pooled = |buckets: &[&crate::assert::PhaseBucket]| -> Option<f64> {
2392 let (num, den) = buckets.iter().fold((0.0, 0.0), |(sn, sd), p| {
2393 match (p.metrics.get(numerator), p.metrics.get(denominator)) {
2394 (Some(n), Some(d)) => (sn + n, sd + d),
2395 _ => (sn, sd),
2396 }
2397 });
2398 (den != 0.0).then(|| num / den)
2399 };
2400 let (Some(a_pooled), Some(b_pooled)) =
2401 (pooled(a_buckets), pooled(b_buckets))
2402 else {
2403 continue;
2404 };
2405 noise_verdict_from(
2406 SideSummary::of(&a_vals).with_pooled_mean(a_pooled),
2407 SideSummary::of(&b_vals).with_pooled_mean(b_pooled),
2408 spread_threshold_pct,
2409 )
2410 }
2411 _ => noise_verdict(&a_vals, &b_vals, spread_threshold_pct),
2412 };
2413 // This (step, metric) produced a comparable verdict, so a
2414 // phase-scoped gate on it WAS evaluated (see the aggregate
2415 // pass's `consulted`).
2416 consulted_phase.insert((step_index, m.name));
2417 // Phase-scoped declared assertion for this test+metric+step.
2418 let assertion = b_rows.first().and_then(|r| {
2419 r.perf_delta_assertions
2420 .iter()
2421 .find(|x| x.metric == m.name && x.phase == Some(step_index))
2422 });
2423 let gated_by_assertion = assertion.is_some();
2424 let Some(kind) = classify_noise(&verdict, m, assertion, include_stable) else {
2425 continue;
2426 };
2427 findings.push(NoisePhaseFinding {
2428 pairing_label: pairing_label.to_string(),
2429 step_index,
2430 label: label.clone(),
2431 metric: m,
2432 verdict,
2433 kind,
2434 gated_by_assertion,
2435 });
2436 }
2437 }
2438 (Some(a_buckets), None) => {
2439 push_noise_unpaired_step(
2440 coverage,
2441 pairing_label,
2442 step_index,
2443 ComparePartition::A,
2444 a_buckets,
2445 );
2446 }
2447 (None, Some(b_buckets)) => {
2448 push_noise_unpaired_step(
2449 coverage,
2450 pairing_label,
2451 step_index,
2452 ComparePartition::B,
2453 b_buckets,
2454 );
2455 }
2456 (None, None) => {}
2457 }
2458 }
2459}
2460
2461/// Summarize a side's loaded runs by why each is or isn't comparable, using the
2462/// SAME exclusions [`noise_findings`] applies (skip / fail / inconclusive /
2463/// expected-failure). Returns `(comparable_count, human_summary)` so an empty
2464/// comparison can explain WHY — e.g. every run was skipped — instead of the bare
2465/// "nothing to compare". The `comparable_count` equals the number of rows
2466/// `noise_findings` would keep (a row is comparable iff none of the four
2467/// exclusions hold), so a zero here is exactly why the side produced no findings.
2468pub(crate) fn summarize_side_runs(rows: &[GauntletRow]) -> (usize, String) {
2469 let (mut skipped, mut failed, mut inconclusive, mut xfail, mut comparable) =
2470 (0usize, 0usize, 0usize, 0usize, 0usize);
2471 for r in rows {
2472 if r.is_skip() {
2473 skipped += 1;
2474 } else if r.is_fail() {
2475 failed += 1;
2476 } else if r.is_inconclusive() {
2477 inconclusive += 1;
2478 } else if r.expected_failure {
2479 xfail += 1;
2480 } else {
2481 comparable += 1;
2482 }
2483 }
2484 let mut excluded = Vec::new();
2485 if skipped > 0 {
2486 excluded.push(format!("{skipped} skipped"));
2487 }
2488 if failed > 0 {
2489 excluded.push(format!("{failed} failed"));
2490 }
2491 if inconclusive > 0 {
2492 excluded.push(format!("{inconclusive} inconclusive"));
2493 }
2494 if xfail > 0 {
2495 excluded.push(format!("{xfail} expected-failure"));
2496 }
2497 let breakdown = if excluded.is_empty() {
2498 "none excluded".to_string()
2499 } else {
2500 excluded.join(", ")
2501 };
2502 (
2503 comparable,
2504 format!(
2505 "{} run(s): {comparable} comparable ({breakdown})",
2506 rows.len()
2507 ),
2508 )
2509}
2510
2511/// Noise-adjusted variant of [`compare_partitions`]: instead of averaging each
2512/// side's runs into one mean and gating on a fixed threshold, it keeps every run
2513/// ([`RowPrep::PerRunPooled`]), summarizes each side per metric, and decides
2514/// whether the two sides are distinguishable given their run-to-run variability
2515/// (see [`noise_findings`] for the row-level core + [`noise_verdict`] for the
2516/// per-metric decision). Used by `perf-delta --noise-adjust N`, which produces N
2517/// runs per side. A metric is a CONFIDENT REGRESSION (fed to the operator
2518/// failure gate below) when it is SEPARATED (a two-sided Welch t-test rejects equal means at
2519/// `NOISE_ALPHA`, OR the `[min, max]` bands are fully disjoint) AND MATERIAL (the
2520/// mean delta clears both the registry `default_abs` and `default_rel`, the same
2521/// dual-gate as the scalar path) in the worsening direction (per polarity). A
2522/// side that realized fewer than 2 runs is flagged `NOISY` and never gates; a
2523/// per-side relative spread over `spread_threshold_pct` is an ADVISORY
2524/// `(noisy spread)` annotation that NEVER suppresses a verdict. By default prints
2525/// only the MEANINGFUL rows (confident regression / improvement / informational),
2526/// each with its side's `mean [min-max] spread%` and verdict; stable (unchanged +
2527/// clean) and noisy (<2-run) rows are hidden unless `--all-metrics`, and a
2528/// one-line summary prints when every row is suppressed. The footer leads with an
2529/// overall verdict word ([`overall_verdict`]: `STABLE` unless the regressed or
2530/// improved count clears the significance cutoff — sub-cutoff moves are flagged
2531/// but likely noise) followed by the composite counts (regressed / improved /
2532/// stable / under-sampled, plus informational when present); and, when every
2533/// changed (non-stable) metric had a side with <2 usable runs, an explicit
2534/// inconclusive note.
2535///
2536/// When the scenarios carry phases, a per-phase spread block follows the
2537/// aggregate table (via [`format_noise_phase_findings_lines`]): the same
2538/// spread verdict per matched `(step_index, metric)`, plus a coverage table for
2539/// one-sided phases/metrics. Like the aggregate table, the per-phase block shows
2540/// only MEANINGFUL rows by default (regression / improvement / informational) and
2541/// hides stable / noisy rows unless `--all-metrics` (`gate.show_all`); whenever the
2542/// spread rows are all suppressed it surfaces a one-line hint naming
2543/// `--all-metrics` (even alongside a coverage table), so the suppressed rows are
2544/// never silently gone. The coverage (one-sided) table is itself never
2545/// suppressed. `phase_opts` controls the per-phase
2546/// render only (`--no-phases` / `--phases-only` / `--steps-only` / `--phase` /
2547/// `--phase-threshold`); under `--phases-only` the aggregate table is
2548/// suppressed. Per-phase SPREAD findings are RENDER-ONLY — classified and
2549/// colored but not gating — EXCEPT a
2550/// per-phase regression the author explicitly declared a phase-scoped gate for,
2551/// which DOES contribute to the exit alongside the operator gate on the aggregate
2552/// confident regressions ([`gate_fails`]: their count reaches `fail_threshold`
2553/// [default 5], or a `must_fail` metric regressed).
2554/// The footer appends per-phase counts only when per-phase data exists and no
2555/// phase filter is active.
2556///
2557/// When no scenario pairs, the aggregate note breaks down each side's loaded
2558/// runs via [`summarize_side_runs`] — naming skipped / failed / inconclusive
2559/// runs — so an all-skipped comparison (e.g. a non-`performance_mode` test under
2560/// perf-delta's `KTSTR_PERF_ONLY`) is explained, not silently reported as
2561/// "nothing to compare".
2562pub fn compare_partitions_noise(
2563 filter_a: &RowFilter,
2564 filter_b: &RowFilter,
2565 dir: Option<&std::path::Path>,
2566 spread_threshold_pct: f64,
2567 phase_opts: &PhaseDisplayOptions,
2568 gate: &GateOptions,
2569) -> anyhow::Result<i32> {
2570 // Keep every per-run row INCLUDING duplicate pairing keys so the
2571 // run-to-run spread is observable: noise_findings groups the N runs
2572 // per key per side. PerRunPooled keeps every per-run row including
2573 // duplicate pairing keys — the N-per-key spread is the intended input.
2574 let prepared = prepare_partitioned_comparison(filter_a, filter_b, dir, RowPrep::PerRunPooled)?;
2575 let label_a = render_side_label(filter_a, &prepared.slicing_dims, "A");
2576 let label_b = render_side_label(filter_b, &prepared.slicing_dims, "B");
2577
2578 let report = noise_findings(
2579 &prepared.rows_a_for_compare,
2580 &prepared.rows_b_for_compare,
2581 &prepared.pairing_dims,
2582 spread_threshold_pct,
2583 // Include Stable (unchanged + clean) metrics so the rendered table
2584 // shows the full comparison, not just changed/noisy rows.
2585 true,
2586 );
2587
2588 println!(
2589 "perf-delta --noise-adjust: {label_b} vs {label_a} (advisory noisy-spread threshold {spread_threshold_pct:.2}%)"
2590 );
2591 // Aggregate spread table — suppressed under --phases-only (renders ONLY
2592 // the per-phase block).
2593 if !phase_opts.phases_only {
2594 if report.findings.is_empty() {
2595 // Distinguish "nothing paired" from "paired but every metric omitted"
2596 // so the message never contradicts the paired-scenario footer below.
2597 // A metric is omitted when both sides' means are ~zero, it read on
2598 // only one side, or it is a render-suppressed rate component.
2599 if report.paired_scenarios == 0 {
2600 // Explain WHY nothing paired instead of a bare "nothing to
2601 // compare": prepare_partitioned_comparison already bailed if a
2602 // side loaded ZERO rows, so both sides have >=1 run here — the
2603 // pairing failure is either all-excluded runs (skip/fail/etc.)
2604 // or a genuine scenario mismatch. Break down each side so a
2605 // skipped-on-both-sides run (the common perf-delta case: a
2606 // non-performance_mode test under KTSTR_PERF_ONLY) is named.
2607 let (a_ok, a_desc) = summarize_side_runs(&prepared.rows_a_for_compare);
2608 let (b_ok, b_desc) = summarize_side_runs(&prepared.rows_b_for_compare);
2609 println!(
2610 "perf-delta --noise-adjust: no comparable runs to pair across the two runs."
2611 );
2612 println!(" {label_a} — {a_desc}");
2613 println!(" {label_b} — {b_desc}");
2614 if a_ok == 0 || b_ok == 0 {
2615 println!(
2616 " A skipped run carries no metrics: perf-delta runs with \
2617 KTSTR_PERF_ONLY, which skips any test not marked \
2618 #[ktstr_test(performance_mode = true)]; host-gated skips land here \
2619 too. A failed / inconclusive run is excluded from the spread math."
2620 );
2621 } else {
2622 println!(
2623 " Both sides produced comparable runs but share no scenario / topology \
2624 / work_type — the two selections have no common test to contrast."
2625 );
2626 }
2627 } else {
2628 println!(
2629 "perf-delta --noise-adjust: no metric to display — every compared metric \
2630 was unchanged at zero, present on only one side, or render-suppressed"
2631 );
2632 }
2633 } else {
2634 print!(
2635 "{}",
2636 format_noise_findings_table(&report.findings, &label_a, &label_b, gate.show_all)
2637 );
2638 }
2639 }
2640 // Per-phase spread block — render-only (never gates), honoring --no-phases /
2641 // --phase / --steps-only / --phase-threshold. Under --phases-only it is the
2642 // ONLY table, so emit an explicit note rather than a silent blank.
2643 let phase_lines = format_noise_phase_findings_lines(
2644 &report.phase_findings,
2645 &report.phase_coverage,
2646 phase_opts,
2647 &label_a,
2648 &label_b,
2649 gate.show_all,
2650 );
2651 if phase_lines.is_empty() && phase_opts.phases_only {
2652 println!(
2653 "perf-delta --noise-adjust: no per-phase noise data to show (no matched \
2654 multi-phase scenario at the selected step, or every per-phase row was \
2655 filtered by --phase / --steps-only / --phase-threshold)"
2656 );
2657 }
2658 for line in phase_lines {
2659 println!("{line}");
2660 }
2661 // Declared gates that never evaluated (metric absent from the compared
2662 // data). Rendered regardless of --phases-only — a silently-inert declared
2663 // gate is important whether or not the aggregate table is shown. Never
2664 // gates the exit.
2665 for line in format_noise_assertion_coverage_lines(&report.assertion_coverage) {
2666 println!("{line}");
2667 }
2668 let regressions = report.regressions();
2669 // Per-phase regressions the author explicitly DECLARED a phase-scoped gate
2670 // for — these gate the exit alongside the aggregate regressions (spread-only
2671 // per-phase findings stay render-only). Computed unconditionally so the exit
2672 // is identical whether or not the phase footer is shown.
2673 let declared_phase_regressions = report.declared_phase_regressions();
2674 // The aggregate summary footer describes the (hidden) aggregate spread, so
2675 // suppress it under --phases-only — which renders ONLY the per-phase
2676 // block.
2677 // The exit still gates on gate_fails(aggregate regressions) +
2678 // `declared_phase_regressions` (computed above) when the footer is hidden.
2679 if !phase_opts.phases_only {
2680 let noisy = report.noisy();
2681 // Composite verdict counts: regressed / improved LEAD (the table shows
2682 // each with its magnitude), stable is the expected residual cited as a
2683 // count only, and informational (changed but no polarity) is cited only
2684 // when present. All render-only — the exit reads gate_fails + declared
2685 // gates, never these.
2686 let improvements = report.improvements();
2687 let stable = report.stable();
2688 let informational = report.informational();
2689 let info_clause = if informational > 0 {
2690 format!(", {informational} informational")
2691 } else {
2692 String::new()
2693 };
2694 // Overall verdict word: STABLE unless a direction clears the cutoff (see
2695 // overall_verdict). Sub-cutoff moves are flagged in the counts above but
2696 // do not shift the verdict off STABLE.
2697 let verdict = overall_verdict(&report, gate);
2698 let stable_note = if verdict == "STABLE" && regressions + improvements > 0 {
2699 " (moves are below the significance cutoff, more likely noise than signal)"
2700 } else {
2701 ""
2702 };
2703 // Per-phase footer counts are shown ONLY when there IS per-phase data
2704 // AND no phase filter is active (no --no-phases / --phase / --steps-only
2705 // / --phase-threshold) — so the counts always match the fully-rendered
2706 // per-phase table, and a single-phase run (no per-phase view) shows no
2707 // confusing 0/0 per-phase clause. Uses the same !any-flag-set
2708 // discipline as the aggregate paired-scenario hint.
2709 let has_phase_data = !report.phase_findings.is_empty() || !report.phase_coverage.is_empty();
2710 let show_phase_footer = has_phase_data
2711 && !phase_opts.no_phases
2712 && phase_opts.phase.is_none()
2713 && !phase_opts.steps_only
2714 && phase_opts.phase_threshold.is_none();
2715 // A REGRESSED verdict with ZERO aggregate regressions can only come from a
2716 // declared PER-PHASE gate (a declared whole-run regression and --must-fail
2717 // both require an aggregate Regression, so regressions >= 1 there). When
2718 // the phase footer below is hidden (a phase filter is active) name the
2719 // source so `overall REGRESSED: 0 regressed` is not self-contradictory;
2720 // when the phase footer shows, its "(declared-gated, exit-affecting)"
2721 // clause already explains it, so skip the redundant note.
2722 let verdict_source =
2723 if verdict.starts_with("REGRESSED") && regressions == 0 && !show_phase_footer {
2724 " (regression is a declared per-phase gate)"
2725 } else {
2726 ""
2727 };
2728 let phase_footer = if show_phase_footer {
2729 // Per-phase regressions are render-only EXCEPT the declared-gated
2730 // ones, which gate the exit — call that out so the count is not read
2731 // as fully render-only when a declared phase gate fired.
2732 let declared_note = if declared_phase_regressions > 0 {
2733 format!(" ({declared_phase_regressions} declared-gated, exit-affecting)")
2734 } else {
2735 " (render-only)".to_string()
2736 };
2737 format!(
2738 "; {} per-phase regression(s){declared_note}, {} per-phase under-sampled (<2 runs)",
2739 report.phase_regressions(),
2740 report.phase_noisy(),
2741 )
2742 } else {
2743 String::new()
2744 };
2745 // Declared gates that never evaluated — a single-line summary of the
2746 // warning block above (shown whenever any gate went unevaluated).
2747 let unevaluated_gates = report.assertion_coverage.len();
2748 let gate_footer = if unevaluated_gates > 0 {
2749 format!("; {unevaluated_gates} declared gate(s) not evaluated")
2750 } else {
2751 String::new()
2752 };
2753 println!(
2754 "perf-delta --noise-adjust: {} paired scenario(s); overall {verdict}: \
2755 {regressions} regressed, {improvements} improved, {stable} stable{info_clause}, \
2756 {noisy} under-sampled (<2 runs){stable_note}{verdict_source}{phase_footer}{gate_footer}",
2757 report.paired_scenarios,
2758 );
2759 }
2760 // Inconclusive: every CHANGED aggregate metric (excluding Stable rows, which
2761 // never gate) had a side with < 2 usable runs — no trustworthy signal either
2762 // way. Surfaced prominently for CI logs, but noise FLAGS, not FAILS, so the
2763 // exit stays 0 unless a confident AGGREGATE regression fired. Suppressed
2764 // under --phases-only (the aggregate table is hidden there).
2765 if !phase_opts.phases_only {
2766 let changed: Vec<&NoiseFinding> = report
2767 .findings
2768 .iter()
2769 .filter(|f| f.kind != NoiseKind::Stable)
2770 .collect();
2771 if !changed.is_empty() && changed.iter().all(|f| f.kind == NoiseKind::Noisy) {
2772 println!(
2773 "perf-delta --noise-adjust: NOTE -- every changed metric had a side with <2 usable \
2774 runs; raise --noise-adjust N (or investigate why per-side runs failed) for a \
2775 trustworthy verdict"
2776 );
2777 }
2778 }
2779 // Exit gates on AGGREGATE regressions plus DECLARED phase regressions.
2780 // Spread-only per-phase findings stay render-only (parity with the scalar
2781 // per-phase pass — a narrow-window phase flake must not flip CI red), but a
2782 // phase-scoped gate the author explicitly declared is an opt-in and DOES
2783 // gate (matches the `PerfDeltaAssertion::phase` doc).
2784 // Operator gate: the count / must-fail gate applies to the UNdeclared
2785 // aggregate confident regressions; an author-DECLARED gate — whole-run OR
2786 // phase-scoped — always fails (its own opt-in, orthogonal to the operator's
2787 // count gate).
2788 Ok(noise_exit_code(&report, gate))
2789}
2790
2791/// Exit code for the noise-adjusted compare. Fails (`1`) when EITHER the
2792/// operator gate ([`gate_fails`]) trips on the aggregate confident regressions,
2793/// OR any author-DECLARED regression is present — whole-run
2794/// ([`NoiseReport::declared_regressions`]) or phase-scoped
2795/// ([`NoiseReport::declared_phase_regressions`]). A declared assertion is a
2796/// per-test opt-in that ALWAYS gates on its metric, independent of the
2797/// operator's count / must-fail gate, so a single declared regression fails
2798/// even below `--fail-threshold`. Extracted so the exit decision is
2799/// unit-testable.
2800pub(crate) fn noise_exit_code(report: &NoiseReport, gate: &GateOptions) -> i32 {
2801 let regressing: Vec<&str> = report
2802 .findings
2803 .iter()
2804 .filter(|f| f.kind == NoiseKind::Regression)
2805 .map(|f| f.metric.name)
2806 .collect();
2807 if gate_fails(®ressing, gate)
2808 || report.declared_regressions() > 0
2809 || report.declared_phase_regressions() > 0
2810 {
2811 1
2812 } else {
2813 0
2814 }
2815}
2816
2817/// Overall run verdict word for the `--noise-adjust` summary, derived from the
2818/// confident move counts against the significance cutoff (`--fail-threshold`,
2819/// default 5). Sub-cutoff moves in EITHER direction are FLAGGED (shown in the
2820/// table, counted in the footer) but do NOT shift the verdict off `STABLE` —
2821/// below the cutoff a move is more likely noise than signal, which is the whole
2822/// point of the conservative noise gate. A direction only reads into the verdict
2823/// once it clears the cutoff, and both can hold at once (`REGRESSED + IMPROVED`).
2824///
2825/// The regressed side reuses the exit decision ([`noise_exit_code`]) so a
2826/// `--must-fail` or declared gate that fails the run also reads `REGRESSED` even
2827/// below the count cutoff; the improved side has no gate, so it uses the count
2828/// cutoff alone. Display-only — never the exit basis.
2829pub(crate) fn overall_verdict(report: &NoiseReport, gate: &GateOptions) -> &'static str {
2830 verdict_label(
2831 noise_exit_code(report, gate) == 1,
2832 report.improvements(),
2833 gate.fail_threshold,
2834 )
2835}
2836
/// Pure verdict classifier behind [`overall_verdict`].
///
/// - `regressed`: the run's regression-fail decision, taken as-is.
/// - `improvements`: the confident-improvement count.
/// - `fail_threshold`: the significance cutoff; `None` means 5, and `Some(0)`
///   disables count significance (improvements can then never read as
///   `IMPROVED`).
///
/// Returns one of `"REGRESSED + IMPROVED"`, `"REGRESSED"`, `"IMPROVED"` or
/// `"STABLE"`. Kept separate so the STABLE-below-cutoff policy is
/// unit-testable without building a full report.
pub(crate) fn verdict_label(
    regressed: bool,
    improvements: usize,
    fail_threshold: Option<usize>,
) -> &'static str {
    let cutoff = fail_threshold.unwrap_or(5);
    // A zero cutoff switches the count test off rather than making every
    // count "significant".
    let improved = cutoff != 0 && improvements >= cutoff;
    if regressed && improved {
        "REGRESSED + IMPROVED"
    } else if regressed {
        "REGRESSED"
    } else if improved {
        "IMPROVED"
    } else {
        "STABLE"
    }
}
2856
2857/// Compose a noise verdict cell's text: the base label plus any parenthesized,
2858/// comma-joined annotations. Shared by the aggregate and per-phase tables so
2859/// both annotate identically. `noisy spread` (advisory `high_spread`, suppressed
2860/// on a Noisy <2-runs row where it is redundant) and `declared gate` (the row
2861/// carries a declared [`crate::test_support::PerfDeltaAssertion`] — its
2862/// overrides drive the gate, or fall back to the registry defaults if rejected
2863/// as out-of-range on a corrupt sidecar) can co-occur → `REGRESSION (noisy
2864/// spread, declared gate)`.
2865fn compose_noise_verdict_text(
2866 base: &str,
2867 high_spread: bool,
2868 kind: NoiseKind,
2869 gated_by_assertion: bool,
2870) -> String {
2871 let mut annotations: Vec<&str> = Vec::new();
2872 if high_spread && kind != NoiseKind::Noisy {
2873 annotations.push("noisy spread");
2874 }
2875 if gated_by_assertion {
2876 annotations.push("declared gate");
2877 }
2878 if annotations.is_empty() {
2879 base.to_string()
2880 } else {
2881 format!("{base} ({})", annotations.join(", "))
2882 }
2883}
2884
2885/// Render the per-metric noise-adjusted findings as a table for
2886/// `perf-delta --noise-adjust`: one row per (scenario, metric) with each
2887/// side's `mean [min-max] spread%` and the colored verdict. Includes
2888/// [`NoiseKind::Stable`] rows so the operator sees every compared metric,
2889/// not only the changed ones. Pure (returns the rendered string with a
2890/// trailing newline) so the row/verdict mapping is unit-testable without
2891/// capturing stdout.
2892pub(crate) fn format_noise_findings_table(
2893 findings: &[NoiseFinding],
2894 label_a: &str,
2895 label_b: &str,
2896 show_all: bool,
2897) -> String {
2898 use comfy_table::{Cell, Color};
2899 // Default view shows only MEANINGFUL rows (confident regression /
2900 // improvement / informational). Stable (unchanged / immaterial) and
2901 // Noisy (<2 usable runs) rows are hidden unless `show_all`; their COUNTS
2902 // still print in the caller's footer, and the gate reads the full
2903 // classified set, so this suppression is display-only.
2904 let visible: Vec<&NoiseFinding> = findings
2905 .iter()
2906 .filter(|f| show_all || !matches!(f.kind, NoiseKind::Stable | NoiseKind::Noisy))
2907 .collect();
2908 if visible.is_empty() {
2909 if findings.is_empty() {
2910 return String::new();
2911 }
2912 let noisy = findings
2913 .iter()
2914 .filter(|f| f.kind == NoiseKind::Noisy)
2915 .count();
2916 return format!(
2917 "perf-delta --noise-adjust: {} metric(s) compared, none meaningfully changed \
2918 ({noisy} under-sampled); re-run with --all-metrics to see them\n",
2919 findings.len(),
2920 );
2921 }
2922 let mut table = crate::cli::new_table();
2923 table.set_header(vec![
2924 "TEST / METRIC".to_string(),
2925 format!("{label_a} (A: mean [min-max] spread%)"),
2926 format!("{label_b} (B: mean [min-max] spread%)"),
2927 "VERDICT".to_string(),
2928 ]);
2929 for f in visible {
2930 let (base, color) = match f.kind {
2931 NoiseKind::Regression => ("REGRESSION", Color::Red),
2932 NoiseKind::Improvement => ("improvement", Color::Green),
2933 NoiseKind::Noisy => ("NOISY (<2 runs)", Color::Yellow),
2934 NoiseKind::Informational => ("informational", Color::Blue),
2935 NoiseKind::Stable => ("stable", Color::Grey),
2936 };
2937 // Verdict annotations (parenthesized, comma-joined). `noisy spread` is
2938 // ADVISORY (high_spread) — flags a noisy side without changing the
2939 // classification, redundant on a Noisy <2-runs row so omitted there.
2940 // `declared gate` marks a row that CARRIES a declared PerfDeltaAssertion
2941 // (its overrides drive the gate, or fall back to registry defaults if
2942 // rejected as out-of-range on a corrupt sidecar), so the operator can
2943 // tell an author-declared gate from a pure registry-default one.
2944 let verdict_text =
2945 compose_noise_verdict_text(base, f.verdict.high_spread, f.kind, f.gated_by_assertion);
2946 let v = &f.verdict;
2947 table.add_row(vec![
2948 Cell::new(format!("{} / {}", f.pairing_label, f.metric.name)),
2949 Cell::new(format!(
2950 "{:.1} [{:.1}-{:.1}] {:.2}%",
2951 v.a.mean, v.a.min, v.a.max, v.a.spread_pct
2952 )),
2953 Cell::new(format!(
2954 "{:.1} [{:.1}-{:.1}] {:.2}%",
2955 v.b.mean, v.b.min, v.b.max, v.b.spread_pct
2956 )),
2957 Cell::new(verdict_text).fg(color),
2958 ]);
2959 }
2960 format!("{table}\n")
2961}
2962
2963/// One-line description of a declared gate's overrides for the
2964/// not-evaluated warning: the thresholds it WOULD have applied. All-`None`
2965/// (a bare `PerfDeltaAssertion::new(metric)`) renders `registry defaults` —
2966/// a presence-checked gate that inherits the registry `default_abs`/
2967/// `default_rel`/polarity.
2968fn describe_declared_gate(a: &crate::test_support::PerfDeltaAssertionRecord) -> String {
2969 let mut parts: Vec<String> = Vec::new();
2970 if let Some(pct) = a.max_regression_pct {
2971 parts.push(format!("max_regression_pct={pct}"));
2972 }
2973 if let Some(abs) = a.min_abs {
2974 parts.push(format!("min_abs={abs}"));
2975 }
2976 if let Some(dir) = a.direction {
2977 parts.push(format!("direction={dir:?}"));
2978 }
2979 if parts.is_empty() {
2980 "registry defaults".to_string()
2981 } else {
2982 parts.join(", ")
2983 }
2984}
2985
2986/// Render the declared perf gates that never evaluated (a metric absent from
2987/// the compared data) as warning lines for `perf-delta --noise-adjust`: a
2988/// TEST | METRIC | PHASE | DECLARED GATE table, so an author whose declared
2989/// [`crate::test_support::PerfDeltaAssertion`] silently did not fire sees it
2990/// rather than mistaking a not-evaluated gate for a passing one. Never gates
2991/// the exit (a gate that could not evaluate is not a regression). Pure —
2992/// returns the lines (empty when there are none) so the mapping is
2993/// unit-testable without capturing stdout.
2994pub(crate) fn format_noise_assertion_coverage_lines(
2995 coverage: &[NoiseAssertionCoverage],
2996) -> Vec<String> {
2997 use comfy_table::{Cell, Color};
2998 let mut lines = Vec::new();
2999 if coverage.is_empty() {
3000 return lines;
3001 }
3002 let mut rows: Vec<&NoiseAssertionCoverage> = coverage.iter().collect();
3003 rows.sort_by(|a, b| {
3004 a.pairing_label
3005 .cmp(&b.pairing_label)
3006 .then_with(|| a.assertion.metric.cmp(&b.assertion.metric))
3007 .then_with(|| a.assertion.phase.cmp(&b.assertion.phase))
3008 });
3009 lines.push(String::new());
3010 lines.push(
3011 "declared perf gate(s) NOT evaluated — the metric was absent from the compared \
3012 data (workload no longer emits it, a one-sided/failed run, or a Rate with no \
3013 samples), so the declared gate silently did not fire:"
3014 .to_string(),
3015 );
3016 let mut table = crate::cli::new_table();
3017 table.set_header(vec![
3018 "TEST".to_string(),
3019 "METRIC".to_string(),
3020 "PHASE".to_string(),
3021 "DECLARED GATE".to_string(),
3022 ]);
3023 for c in rows {
3024 let phase = match c.assertion.phase {
3025 None => "aggregate".to_string(),
3026 Some(k) => k.to_string(),
3027 };
3028 table.add_row(vec![
3029 Cell::new(&c.pairing_label),
3030 Cell::new(&c.assertion.metric).fg(Color::Yellow),
3031 Cell::new(phase),
3032 Cell::new(describe_declared_gate(&c.assertion)),
3033 ]);
3034 }
3035 lines.push(format!("{table}"));
3036 lines
3037}
3038
3039/// Render the per-phase noise-adjusted findings as lines for
3040/// `perf-delta --noise-adjust`: a per-phase spread table (PHASE | TEST /
3041/// METRIC | A mean[min-max] spread% | B ... | VERDICT) plus a one-sided
3042/// coverage table (SIDE | TEST | PHASE | METRIC | VALUE). The TEST column
3043/// carries the full pairing-key label (scenario plus every pairing dim),
3044/// matching the scalar compare path — a scenario shared across topologies
3045/// renders as distinct rows. Mirrors
3046/// [`format_noise_findings_table`], honoring
3047/// [`PhaseDisplayOptions`] (no_phases / phase / steps_only / phase_threshold).
3048/// Render-only: these rows never gate. Pure — returns the lines (empty when
3049/// suppressed / no per-phase data) so the row/verdict mapping is unit-testable
3050/// without capturing stdout.
3051pub(crate) fn format_noise_phase_findings_lines(
3052 phase_findings: &[NoisePhaseFinding],
3053 phase_coverage: &[NoisePhaseCoverage],
3054 phase_opts: &PhaseDisplayOptions,
3055 label_a: &str,
3056 label_b: &str,
3057 show_all: bool,
3058) -> Vec<String> {
3059 use comfy_table::{Cell, Color};
3060 let mut lines = Vec::new();
3061 if phase_opts.no_phases {
3062 return lines;
3063 }
3064 // Rows passing the phase-axis filters (`--phase` / `--steps-only`) and the
3065 // `--phase-threshold` spread gate, BEFORE the meaningful-only display filter.
3066 // Retained so the collapse summary below can report how many rows the default
3067 // view suppressed.
3068 let phase_filtered: Vec<&NoisePhaseFinding> = phase_findings
3069 .iter()
3070 .filter(|f| phase_opts.matches_phase(f.step_index))
3071 .filter(|f| phase_opts.passes_noise_spread_threshold(&f.verdict))
3072 .collect();
3073 // Default view shows only MEANINGFUL rows (regression / improvement /
3074 // informational); Stable + Noisy rows are hidden unless `show_all`
3075 // (`--all-metrics`), mirroring the aggregate table
3076 // ([`format_noise_findings_table`]). Display-only: the footer still reports
3077 // the per-phase regression / under-sampled COUNTS from the unfiltered report,
3078 // and the exit gate reads the unfiltered findings — so suppressing rows here
3079 // changes neither the counts nor the pass/fail.
3080 let mut findings: Vec<&NoisePhaseFinding> = phase_filtered
3081 .iter()
3082 .copied()
3083 .filter(|f| show_all || !matches!(f.kind, NoiseKind::Stable | NoiseKind::Noisy))
3084 .collect();
3085 let mut coverage: Vec<&NoisePhaseCoverage> = phase_coverage
3086 .iter()
3087 .filter(|c| phase_opts.matches_phase(c.step_index))
3088 .collect();
3089 let had_findings = !findings.is_empty();
3090 // Spread rows existed but were all suppressed as stable/noisy in the default
3091 // view: a one-line hint (naming `--all-metrics`, wording matched to the
3092 // aggregate collapse) keeps the suppression discoverable — surfaced whether
3093 // the block is otherwise empty OR a coverage table follows, so the suppressed
3094 // rows are never silently gone. Only in the default view: under `show_all`
3095 // nothing is suppressed, so an empty result there means there was genuinely no
3096 // per-phase spread data.
3097 let suppressed_hint: Option<String> =
3098 if findings.is_empty() && !phase_filtered.is_empty() && !show_all {
3099 let noisy = phase_filtered
3100 .iter()
3101 .filter(|f| f.kind == NoiseKind::Noisy)
3102 .count();
3103 Some(format!(
3104 "perf-delta --noise-adjust: {} per-phase metric(s) compared, none meaningfully \
3105 changed ({noisy} under-sampled); re-run with --all-metrics to see them",
3106 phase_filtered.len(),
3107 ))
3108 } else {
3109 None
3110 };
3111 if findings.is_empty() && coverage.is_empty() {
3112 // Nothing else to render — the hint (if any) is the whole block.
3113 lines.extend(suppressed_hint);
3114 return lines;
3115 }
3116 lines.push(String::new());
3117 if had_findings {
3118 // "per-phase spread:" heads the findings table ONLY — a coverage-only
3119 // section (no spread rows) gets its own header below, so the label
3120 // never mislabels a table.
3121 lines.push("per-phase spread:".to_string());
3122 // step_index-first (BASELINE..Step[N] time order), then pairing label,
3123 // then metric — a stable, top-down-by-phase-boundary order.
3124 findings.sort_by(|a, b| {
3125 a.step_index
3126 .cmp(&b.step_index)
3127 .then_with(|| a.pairing_label.cmp(&b.pairing_label))
3128 .then_with(|| a.metric.name.cmp(b.metric.name))
3129 });
3130 let mut table = crate::cli::new_table();
3131 table.set_header(vec![
3132 "PHASE".to_string(),
3133 "TEST / METRIC".to_string(),
3134 format!("{label_a} (A: mean [min-max] spread%)"),
3135 format!("{label_b} (B: mean [min-max] spread%)"),
3136 "VERDICT".to_string(),
3137 ]);
3138 for f in findings {
3139 let (base, color) = match f.kind {
3140 NoiseKind::Regression => ("REGRESSION", Color::Red),
3141 NoiseKind::Improvement => ("improvement", Color::Green),
3142 NoiseKind::Noisy => ("NOISY (<2 runs)", Color::Yellow),
3143 NoiseKind::Informational => ("informational", Color::Blue),
3144 NoiseKind::Stable => ("stable", Color::Grey),
3145 };
3146 // Same annotation composition as the aggregate table (advisory
3147 // `noisy spread` + `declared gate`).
3148 let verdict_text = compose_noise_verdict_text(
3149 base,
3150 f.verdict.high_spread,
3151 f.kind,
3152 f.gated_by_assertion,
3153 );
3154 let v = &f.verdict;
3155 table.add_row(vec![
3156 Cell::new(format!("{}: {}", f.step_index, f.label)),
3157 Cell::new(format!("{} / {}", f.pairing_label, f.metric.name)),
3158 Cell::new(format!(
3159 "{:.1} [{:.1}-{:.1}] {:.2}%",
3160 v.a.mean, v.a.min, v.a.max, v.a.spread_pct
3161 )),
3162 Cell::new(format!(
3163 "{:.1} [{:.1}-{:.1}] {:.2}%",
3164 v.b.mean, v.b.min, v.b.max, v.b.spread_pct
3165 )),
3166 Cell::new(verdict_text).fg(color),
3167 ]);
3168 }
3169 lines.push(table.to_string());
3170 } else {
3171 // Spread rows were all suppressed but a coverage table follows: surface
3172 // the suppression hint before it so `--all-metrics` stays discoverable.
3173 lines.extend(suppressed_hint);
3174 }
3175 if !coverage.is_empty() {
3176 // Separate from the spread table above only when one was rendered; the
3177 // section-leading blank already precedes a coverage-only block.
3178 if had_findings {
3179 lines.push(String::new());
3180 }
3181 lines.push("per-phase coverage asymmetry (one-sided metrics):".to_string());
3182 coverage.sort_by(|a, b| {
3183 a.step_index
3184 .cmp(&b.step_index)
3185 .then_with(|| a.present_side.as_str().cmp(b.present_side.as_str()))
3186 .then_with(|| a.pairing_label.cmp(&b.pairing_label))
3187 .then_with(|| a.metric.map(|m| m.name).cmp(&b.metric.map(|m| m.name)))
3188 });
3189 let mut table = crate::cli::new_table();
3190 table.set_header(vec!["SIDE", "TEST", "PHASE", "METRIC", "VALUE"]);
3191 for c in coverage {
3192 // A whole one-sided phase with no readable metric renders `—` in the
3193 // METRIC + VALUE columns.
3194 let metric_cell = c.metric.map(|m| m.name).unwrap_or("—");
3195 // Bare {:.2} with NO display_unit — matching the noise aggregate
3196 // findings table,
3197 // so a unit-carrying metric renders consistently across all of them.
3198 let value_cell = match c.value {
3199 Some(v) => format!("{v:.2}"),
3200 None => "—".to_string(),
3201 };
3202 table.add_row(vec![
3203 Cell::new(c.present_side.as_str()),
3204 Cell::new(c.pairing_label.as_str()),
3205 Cell::new(format!("{}: {}", c.step_index, c.label)),
3206 Cell::new(metric_cell),
3207 Cell::new(value_cell),
3208 ]);
3209 }
3210 lines.push(table.to_string());
3211 }
3212 lines
3213}
3214
3215/// Render the scalar findings table for `perf-delta`.
3216///
3217/// Extracted from [`compare_partitions`] verbatim; the
3218/// `--phases-only` gate stays at the call site so this prints
3219/// unconditionally when invoked.
3220fn print_scalar_findings_table(report: &CompareReport, label_a: &str, label_b: &str) {
3221 use comfy_table::{Cell, Color};
3222 let mut table = crate::cli::new_table();
3223 table.set_header(vec!["TEST", "METRIC", label_a, label_b, "DELTA", "VERDICT"]);
3224 for f in &report.findings {
3225 let (verdict_text, verdict_color) = match f.kind {
3226 FindingKind::Regression => ("REGRESSION", Color::Red),
3227 FindingKind::Improvement => ("improvement", Color::Green),
3228 // Directionless metric: shown, never gated. Neutral color.
3229 FindingKind::Informational => ("informational", Color::Blue),
3230 };
3231 // PairingKey's first slot is scenario; subsequent slots
3232 // are the pairing-dim values in canonical order. Joining
3233 // with `/` produces a label whose shape mirrors the
3234 // pairing-dim count — so a comparison that pairs on
3235 // (topology, work_type) renders a `scenario/topology/work_type`
3236 // label, while a comparison that slices on most dims
3237 // renders a shorter identifier. The operator can always
3238 // cross-reference the "pairing on:" header line above to
3239 // see what each segment means.
3240 let label = f.pairing_key.0.join("/");
3241 table.add_row(vec![
3242 Cell::new(label),
3243 Cell::new(f.metric.name),
3244 Cell::new(format!("{:.2}", f.val_a)),
3245 Cell::new(format!("{:.2}", f.val_b)),
3246 Cell::new(format!("{:+.2}{}", f.delta, f.metric.display_unit)),
3247 Cell::new(verdict_text).fg(verdict_color),
3248 ]);
3249 }
3250 println!("{table}");
3251}
3252
3253/// Render the scalar summary block for `perf-delta` —
3254/// regressions / improvements / unchanged + skipped-failed +
3255/// per-group pass counts + new_in_b / removed_from_a. All lines
3256/// describe the scalar findings table; the `--phases-only` gate
3257/// stays at the call site so this prints unconditionally when
3258/// invoked.
3259fn print_summary_block(
3260 report: &CompareReport,
3261 avg_a: &Option<Vec<AveragedGroup>>,
3262 avg_b: &Option<Vec<AveragedGroup>>,
3263 label_a: &str,
3264 label_b: &str,
3265) {
3266 println!();
3267 println!(
3268 "summary: {} regressions, {} improvements, {} informational, {} unchanged",
3269 report.regressions, report.improvements, report.informational, report.unchanged,
3270 );
3271 if report.excluded_pairs > 0 {
3272 println!(
3273 " {} pairing-key row pair(s) excluded from regression math because one \
3274 or both sides was excluded (failed, inconclusive, skipped, or an inverted expected-failure run)",
3275 report.excluded_pairs,
3276 );
3277 }
3278 if let (Some(avg_a), Some(avg_b)) = (avg_a, avg_b) {
3279 let block = format_per_group_pass_counts(avg_a, avg_b, label_a, label_b);
3280 if !block.is_empty() {
3281 print!("{block}");
3282 }
3283 }
3284 if report.new_in_b > 0 {
3285 println!(
3286 " {} row(s) new in '{}' (no matching key in '{}')",
3287 report.new_in_b, label_b, label_a,
3288 );
3289 }
3290 if report.removed_from_a > 0 {
3291 println!(
3292 " {} row(s) removed from '{}' (no matching key in '{}')",
3293 report.removed_from_a, label_a, label_b,
3294 );
3295 }
3296 for line in format_coverage_diff_lines(report, label_a, label_b) {
3297 println!("{line}");
3298 }
3299}
3300
3301/// Render the coverage-diff lines (metrics present on exactly one side of a
3302/// paired row) for [`print_summary_block`]. Pure (returns the lines, empty
3303/// when there are no coverage diffs) so the present/absent label mapping by
3304/// [`ComparePartition`] is unit-testable without capturing stdout.
3305pub(crate) fn format_coverage_diff_lines(
3306 report: &CompareReport,
3307 label_a: &str,
3308 label_b: &str,
3309) -> Vec<String> {
3310 if report.coverage_diffs.is_empty() {
3311 return Vec::new();
3312 }
3313 let mut lines = vec![format!(
3314 " {} metric(s) present on only one side (coverage difference, \
3315 not a regression):",
3316 report.coverage_diffs.len(),
3317 )];
3318 for cd in &report.coverage_diffs {
3319 // present_side names the side that HAS the metric; the other is absent.
3320 let (present, absent) = match cd.present_side {
3321 ComparePartition::A => (label_a, label_b),
3322 ComparePartition::B => (label_b, label_a),
3323 };
3324 lines.push(format!(
3325 " {} / {} = {:.2} in '{}', absent in '{}'",
3326 cd.pairing_key.0.join("/"),
3327 cd.metric.name,
3328 cd.value,
3329 present,
3330 absent,
3331 ));
3332 }
3333 lines
3334}
3335
3336/// Print the host-context delta for `perf-delta`. Same
3337/// first-Some(host) baseline `compare_partitions` uses — picking
3338/// representative hosts off the partitioned sidecars rather than
3339/// the full pool so the delta reflects what actually fed the
3340/// comparison.
3341fn print_host_context_delta(
3342 pool: &[crate::test_support::SidecarResult],
3343 rows: &[GauntletRow],
3344 filter_a: &RowFilter,
3345 filter_b: &RowFilter,
3346 label_a: &str,
3347 label_b: &str,
3348) {
3349 // Zip the pool with the pre-computed `rows` (built once above
3350 // via `pool.iter().map(sidecar_to_row).collect()`) so the
3351 // per-side filter reuses the existing row instead of calling
3352 // `sidecar_to_row` a second and third time. `pool` and `rows`
3353 // are the same length and same iteration order by construction.
3354 let sidecars_a: Vec<&crate::test_support::SidecarResult> = pool
3355 .iter()
3356 .zip(rows.iter())
3357 .filter(|(_, r)| filter_a.matches(r))
3358 .map(|(s, _)| s)
3359 .collect();
3360 let sidecars_b: Vec<&crate::test_support::SidecarResult> = pool
3361 .iter()
3362 .zip(rows.iter())
3363 .filter(|(_, r)| filter_b.matches(r))
3364 .map(|(s, _)| s)
3365 .collect();
3366 let host_a = sidecars_a.iter().find_map(|s| s.host.as_ref());
3367 let host_b = sidecars_b.iter().find_map(|s| s.host.as_ref());
3368 print!("{}", format_host_delta(host_a, host_b, label_a, label_b));
3369}
3370
// NOTE: the paragraphs below describe `format_host_delta` (defined later in
// this file), not `format_average_header`. They are kept as a plain comment
// so rustdoc does not attach them to the wrong item.
//
// Render the host-context delta section of `perf-delta` as a block of text
// ready to `print!`. Extracted as a pure function of
// `(Option<&HostContext>, Option<&HostContext>, &str, &str)` so the match
// arms can be unit-tested without fixturing a real run directory.
//
// The returned string is either empty (when both sides have no host data —
// nothing to print) or ends with a newline so callers can chain further
// output. Single-side cases print a clear "captured in X only, delta
// unavailable" message rather than silently suppressing the section — a
// mixed-tooling-version run comparison should surface the asymmetry.
/// Format the one-line averaging-mode header printed above the comparison
/// table.
///
/// The result has the shape
/// `averaged across N runs (A) and M runs (B)`, where `N` / `M` are
/// `pre_agg_a` / `pre_agg_b` and `A` / `B` are the side labels `a` / `b`.
///
/// The counts are rendered verbatim. Callers are expected to pass the number
/// of contributor rows that were folded into the average on each side (not
/// the number of unique keys left after aggregation), since the header is
/// meant to answer "how many trials got folded?".
///
/// Kept as a pure function so the exact operator-visible string can be
/// unit-tested without capturing stdout.
pub(crate) fn format_average_header(
    pre_agg_a: usize,
    pre_agg_b: usize,
    a: &str,
    b: &str,
) -> String {
    // Describe each side on its own, then stitch the two halves together.
    let side_a = format!("{pre_agg_a} runs ({a})");
    let side_b = format!("{pre_agg_b} runs ({b})");
    format!("averaged across {side_a} and {side_b}")
}
3406
3407/// Format the per-group `passes_observed/total_observed` block
3408/// that prints below the summary line.
3409///
3410/// Pure function of (`avg_a`, `avg_b`, `a`, `b`) so the rendered
3411/// surface — one line per (scenario, topology, work_type) group
3412/// present on either side, with `N/M` per side and `-` for any
3413/// side that lacks the group — can be unit-tested without
3414/// capturing stdout. Returns the trailing-newline-terminated
3415/// block, or empty string when neither side has groups.
3416///
3417/// Line shape:
3418/// ` scenario/topology/work_type: {a}=N/M {b}=N/M`
3419///
3420/// The leading two-space indent matches the sibling
3421/// `summary:` block's continuation lines (e.g.
3422/// `" N (scenario, topology, work_type) row pair(s) skipped..."`)
3423/// so the per-group block reads as a continuation of the same
3424/// summary section. A blank line separates this block from the
3425/// preceding `summary:` line for readability.
3426///
3427/// Groups present on only one side render `-` for the missing
3428/// side (also counted in `compare_rows`' `new_in_b` /
3429/// `removed_from_a` upstream — the per-group block surfaces the
3430/// asymmetry by name so the operator can see *which* groups went
3431/// missing without cross-referencing the summary counters).
3432pub(crate) fn format_per_group_pass_counts(
3433 avg_a: &[AveragedGroup],
3434 avg_b: &[AveragedGroup],
3435 a: &str,
3436 b: &str,
3437) -> String {
3438 type SummaryKey<'a> = (&'a str, &'a str, &'a str);
3439 type SummaryValue<'a> = (Option<&'a AveragedGroup>, Option<&'a AveragedGroup>);
3440 let mut keys: BTreeMap<SummaryKey<'_>, SummaryValue<'_>> = BTreeMap::new();
3441 for ar in avg_a {
3442 let k = (
3443 ar.row.scenario.as_str(),
3444 ar.row.topology.as_str(),
3445 ar.row.work_type.as_str(),
3446 );
3447 keys.entry(k).or_insert((None, None)).0 = Some(ar);
3448 }
3449 for br in avg_b {
3450 let k = (
3451 br.row.scenario.as_str(),
3452 br.row.topology.as_str(),
3453 br.row.work_type.as_str(),
3454 );
3455 keys.entry(k).or_insert((None, None)).1 = Some(br);
3456 }
3457 if keys.is_empty() {
3458 return String::new();
3459 }
3460 let mut out = String::new();
3461 out.push('\n');
3462 out.push_str(
3463 "per-group pass counts (passes/total + skip/inconc/fail breakdown when non-zero):\n",
3464 );
3465 for ((scn, topo, wt), (ka, kb)) in keys.into_iter() {
3466 let fmt_side = |r: Option<&AveragedGroup>| -> String {
3467 let Some(x) = r else {
3468 return "-".to_string();
3469 };
3470 // Mirror format_dimension_summary's 4-state breakdown —
3471 // operators reading per-group lines must be able to
3472 // distinguish skip / inconclusive / fail buckets, not
3473 // see them collapsed into the (total - pass) denominator
3474 // gap. Skip silently rendering buckets that are zero so
3475 // the common-case "all passed" line stays terse.
3476 let mut s = format!("{}/{}", x.passes_observed, x.total_observed);
3477 let mut extras: Vec<String> = Vec::with_capacity(3);
3478 if x.skips_observed > 0 {
3479 extras.push(format!("{} skip", x.skips_observed));
3480 }
3481 if x.inconclusives_observed > 0 {
3482 extras.push(format!("{} inc", x.inconclusives_observed));
3483 }
3484 if x.failures_observed > 0 {
3485 extras.push(format!("{} fail", x.failures_observed));
3486 }
3487 if !extras.is_empty() {
3488 s.push_str(&format!(" ({})", extras.join(", ")));
3489 }
3490 s
3491 };
3492 out.push_str(&format!(
3493 " {scn}/{topo}/{wt}: {a}={pa} {b}={pb}\n",
3494 pa = fmt_side(ka),
3495 pb = fmt_side(kb),
3496 ));
3497 }
3498 out
3499}
3500
3501pub(crate) fn format_host_delta(
3502 host_a: Option<&crate::host_context::HostContext>,
3503 host_b: Option<&crate::host_context::HostContext>,
3504 a: &str,
3505 b: &str,
3506) -> String {
3507 match (host_a, host_b) {
3508 (Some(ha), Some(hb)) => {
3509 let delta = ha.diff(hb);
3510 if delta.is_empty() {
3511 // Identical hosts: surface arch when both sides
3512 // carry it so the operator sees WHAT is identical
3513 // (the two runs share x86_64 vs both being aarch64
3514 // is the operator's question). When
3515 // either side leaves arch as `None` (pre-host-
3516 // context-landing archive, or arch probe failed
3517 // on at least one side), fall through to the
3518 // bare "identical" message — emitting a partial
3519 // hint would mislead the reader into thinking
3520 // the silent side disagreed.
3521 match (ha.arch.as_deref(), hb.arch.as_deref()) {
3522 (Some(arch_a), Some(arch_b)) if arch_a == arch_b => {
3523 format!("\nhost: identical between '{a}' and '{b}' (arch: {arch_a})\n",)
3524 }
3525 _ => format!("\nhost: identical between '{a}' and '{b}'\n"),
3526 }
3527 } else {
3528 format!("\nhost delta ('{a}' → '{b}'):\n{delta}")
3529 }
3530 }
3531 (Some(_), None) => {
3532 format!("\nhost: captured in '{a}' only, delta unavailable\n")
3533 }
3534 (None, Some(_)) => {
3535 format!("\nhost: captured in '{b}' only, delta unavailable\n")
3536 }
3537 (None, None) => String::new(),
3538 }
3539}