ktstr/stats/group.rs
1use super::*;
2
/// A single axis of a `GauntletRow`'s identity inside the comparison
/// pipeline. There are nine axes: `kernel`, `scheduler`, `topology`,
/// `work-type`, `project-commit`, `kernel-commit`, `run-source`,
/// `resolve-source` and `cpu-budget`.
///
/// Modelling the axes as an enum lets the comparison code derive both
/// the slicing set and the dynamic pairing key from data instead of
/// repeating the axis list at every call site. Variant names mirror the
/// CLI flag suffix (`Dimension::ProjectCommit` ↔ `--project-commit`,
/// `Dimension::RunSource` ↔ `--run-source`,
/// `Dimension::CpuBudget` ↔ `--cpu-budget`), so no translation table is
/// needed to go from operator surface to internal enum.
///
/// `scenario` is deliberately NOT a dimension: it is the test name and
/// always takes part in the pairing key, because comparing scenario A
/// against scenario B would compare unrelated tests.
///
/// The variant declaration order is the CLI-flag order (`--kernel` /
/// `--scheduler` / `--topology` / `--work-type` / `--project-commit` /
/// `--kernel-commit` / `--run-source` / `--resolve-source` /
/// `--cpu-budget`). [`Dimension::ALL`] lists the variants in that same
/// order, so generated labels and error messages present dimensions in
/// a stable, predictable sequence. The derived `PartialOrd` / `Ord`
/// follow the declaration order as well.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum Dimension {
    Kernel,
    Scheduler,
    Topology,
    WorkType,
    ProjectCommit,
    KernelCommit,
    RunSource,
    ResolveSource,
    CpuBudget,
}

impl Dimension {
    /// All nine dimensions, in CLI-flag order. [`Self::pairing_dims`]
    /// walks this table to build the pairing complement (every
    /// dimension that is not a slicing dimension). The subset that
    /// [`derive_slicing_dims`] may contrast on is [`Self::SLICEABLE`].
    pub const ALL: &'static [Self] = &[
        Self::Kernel,
        Self::Scheduler,
        Self::Topology,
        Self::WorkType,
        Self::ProjectCommit,
        Self::KernelCommit,
        Self::RunSource,
        Self::ResolveSource,
        Self::CpuBudget,
    ];

    /// Dimensions allowed to form an A/B CONTRAST (a slice). Only the
    /// version axes qualify: a project commit, a kernel version, or a
    /// kernel commit — each a deliberate "did this change regress"
    /// question. Every remaining dimension (scheduler / topology /
    /// work_type / run_source / resolve_source / cpu_budget) is FILTER
    /// + PAIRING only: it narrows the cohort and joins A to B, but a
    /// contrast across it would bulk-compare heterogeneous runs
    /// (different configs / hosts / conditions) that the significance
    /// math cannot soundly attribute. Cross-config questions are
    /// answered in-test via the Verdict DSL (`better_across_phases`),
    /// not here.
    pub const SLICEABLE: &'static [Self] = &[Self::Kernel, Self::ProjectCommit, Self::KernelCommit];

    /// Return every dimension of [`Dimension::ALL`] that does NOT
    /// appear in `slicing`, preserving canonical order.
    ///
    /// Slicing dimensions define the contrast (they differ between A
    /// and B); the returned pairing dimensions define the join (they
    /// must match across A and B). A dimension outside
    /// [`Self::SLICEABLE`] is never expected in `slicing`, so it always
    /// ends up in the result.
    pub fn pairing_dims(slicing: &[Dimension]) -> Vec<Dimension> {
        let mut pairing = Vec::new();
        for dim in Self::ALL {
            if !slicing.contains(dim) {
                pairing.push(*dim);
            }
        }
        pairing
    }

    /// Operator-facing name of the dimension, identical to the CLI flag
    /// suffix (`--kernel` → `"kernel"`, `--work-type` → `"work-type"`).
    /// Intended for diagnostic and table output such as the
    /// "slicing dimensions: ..." / "pairing on: ..." header lines and
    /// the "A and B select identical rows" error.
    pub fn name(self) -> &'static str {
        match self {
            Self::Kernel => "kernel",
            Self::Scheduler => "scheduler",
            Self::Topology => "topology",
            Self::WorkType => "work-type",
            Self::ProjectCommit => "project-commit",
            Self::KernelCommit => "kernel-commit",
            Self::RunSource => "run-source",
            Self::ResolveSource => "resolve-source",
            Self::CpuBudget => "cpu-budget",
        }
    }
}
108
/// Legacy pairing-dimension set kept for tests written before the
/// dimensional-slicing refactor. It corresponds to the historical
/// hardcoded tuple `(scenario, topology, work_type)`: `scenario` is
/// always prepended by [`PairingKey::from_row`], so only the other two
/// dimensions are listed here.
///
/// Production callers (`compare_partitions`) obtain their pairing
/// dimensions from [`Dimension::pairing_dims`] after deriving the
/// slicing set; only test fixtures use this constant directly, which is
/// why it is compiled under `#[cfg(test)]` only.
#[cfg(test)]
pub(crate) const LEGACY_PAIRING_DIMS: &[Dimension] = &[Dimension::Topology, Dimension::WorkType];
120
121/// Derive the set of dimensions on which `filter_a` and
122/// `filter_b` differ. These are the SLICING dimensions —
123/// dimensions on which the two sides select disjoint cohorts and
124/// therefore form the A/B contrast. The complement (every other
125/// dimension) is the PAIRING-key dimension set used by
126/// `compare_rows` to join A-side rows against B-side rows.
127///
128/// Comparison shape per dimension: every dim uses the same
129/// SORTED-DEDUPED `Vec<&str>` comparison — order and multiplicity
130/// don't matter (`--a-kernel 6.14 --a-kernel 6.15` and
131/// `--b-kernel 6.15 --b-kernel 6.14` are NOT a slice). All nine
132/// dimensions are repeatable Vec filters; the previously
133/// `Option<String>`-typed `scheduler` / `topology` / `work_type`
134/// dims were promoted to `Vec<String>` so the operator-visible
135/// shape is uniform across every dimension.
136///
137/// Returns dimensions in canonical ([`Dimension::ALL`]) order so callers
138/// (header lines, error messages, side labels) get a stable presentation.
139/// Only a [`Dimension::SLICEABLE`] dimension can be a slicing dim; the
140/// non-sliceable dims are filter + pairing only and are only ever set via a
141/// single shared `--<x>` filter (applied to BOTH sides), so they can never
142/// differ A↔B — the walk skips them.
143pub fn derive_slicing_dims(filter_a: &RowFilter, filter_b: &RowFilter) -> Vec<Dimension> {
144 let mut out = Vec::new();
145 for &dim in Dimension::SLICEABLE {
146 let differs = match dim {
147 Dimension::Kernel => sorted_dedup(&filter_a.kernels) != sorted_dedup(&filter_b.kernels),
148 Dimension::ProjectCommit => {
149 sorted_dedup(&filter_a.project_commits) != sorted_dedup(&filter_b.project_commits)
150 }
151 Dimension::KernelCommit => {
152 sorted_dedup(&filter_a.kernel_commits) != sorted_dedup(&filter_b.kernel_commits)
153 }
154 // Non-sliceable dims are filter + pairing only (see
155 // [`Dimension::SLICEABLE`]); the walk never reaches them.
156 Dimension::Scheduler
157 | Dimension::Topology
158 | Dimension::WorkType
159 | Dimension::RunSource
160 | Dimension::ResolveSource
161 | Dimension::CpuBudget => {
162 unreachable!("non-sliceable dimension {dim:?} in SLICEABLE walk")
163 }
164 };
165 if differs {
166 out.push(dim);
167 }
168 }
169 out
170}
171
/// Borrow the strings in `v` as a set: the returned `Vec<&str>` holds
/// each distinct value exactly once, in ascending order. Used to compare
/// filter values without regard to input order or repetition.
fn sorted_dedup(v: &[String]) -> Vec<&str> {
    // A BTreeSet both orders and de-duplicates in one pass; draining it
    // yields the values in ascending `str` order.
    v.iter()
        .map(String::as_str)
        .collect::<std::collections::BTreeSet<&str>>()
        .into_iter()
        .collect()
}
178
179/// Render a side's filter values into a column-header label for
180/// the comparison table. `dims` is the slicing-dimension set —
181/// the only dims whose values vary between A and B. The label
182/// concatenates each dim's per-side filter value(s) with `:`
183/// between dim values (e.g. `"6.14.2:scx_rusty"` when both
184/// `kernel` and `scheduler` slice). For multi-value Vec filters
185/// (kernels, commits) the values join with `|` when there
186/// are ≤3; longer lists collapse to `"A"` or `"B"` (the bare
187/// side label) to keep the column header readable.
188///
189/// `bare_label` is `"A"` / `"B"`, used as the fallback when a
190/// slicing dim's filter has more than 3 values OR the slicing
191/// dim's filter is empty on this side (the slice exists because
192/// the OTHER side populated the filter — the empty-side label is
193/// the bare letter).
194pub(crate) fn render_side_label(
195 filter: &RowFilter,
196 dims: &[Dimension],
197 bare_label: &str,
198) -> String {
199 if dims.is_empty() {
200 return bare_label.to_string();
201 }
202 let mut parts: Vec<String> = Vec::new();
203 for &dim in dims {
204 let part = match dim {
205 Dimension::Kernel => render_vec_dim(&filter.kernels, bare_label),
206 Dimension::Scheduler => render_vec_dim(&filter.schedulers, bare_label),
207 Dimension::Topology => render_vec_dim(&filter.topologies, bare_label),
208 Dimension::WorkType => render_vec_dim(&filter.work_types, bare_label),
209 Dimension::ProjectCommit => render_vec_dim(&filter.project_commits, bare_label),
210 Dimension::KernelCommit => render_vec_dim(&filter.kernel_commits, bare_label),
211 Dimension::RunSource => render_vec_dim(&filter.run_sources, bare_label),
212 Dimension::ResolveSource => render_vec_dim(&filter.resolve_sources, bare_label),
213 Dimension::CpuBudget => render_vec_dim(&filter.cpu_budgets, bare_label),
214 };
215 parts.push(part);
216 }
217 parts.join(":")
218}
219
/// Render one dimension's filter values as a column-header segment.
///
/// The values are treated as a SET — sorted and de-duplicated first —
/// which matches how the slicing derivation compares filters (order and
/// multiplicity do not matter). Then:
///
/// * no values: returns `bare_label` (the slice exists because the
///   OTHER side populated this dimension);
/// * 1 to 3 distinct values: returns them joined with `|`;
/// * more than 3 distinct values: returns `bare_label`, to keep the
///   column header readable.
///
/// De-duplicating before counting means a repeated flag value neither
/// shows up twice in the header (`6.14|6.14`) nor pushes a short list
/// over the collapse threshold.
fn render_vec_dim(values: &[String], bare_label: &str) -> String {
    /// Largest number of distinct values still spelled out in a header.
    const MAX_RENDERED_VALUES: usize = 3;

    let mut distinct: Vec<&str> = values.iter().map(String::as_str).collect();
    distinct.sort_unstable();
    distinct.dedup();

    if distinct.is_empty() || distinct.len() > MAX_RENDERED_VALUES {
        bare_label.to_string()
    } else {
        distinct.join("|")
    }
}
232
/// Dynamic pairing key for [`compare_rows_by`]: the always-pinned
/// `scenario` followed by the row's value on every NON-slicing
/// dimension. Two rows pair iff their keys are equal.
///
/// The key is a `Vec<String>` so that one struct shape serves any
/// `pairing_dims` slice. The alternative — a fixed tuple with one
/// `Option<&str>` per dimension — would force every consumer to know
/// the dimension list at compile time, defeating the point of
/// dimension-set parametrisation.
///
/// Layout: element 0 is always `scenario`; the remaining elements follow
/// the order of the `pairing_dims` slice passed to
/// [`PairingKey::from_row`] (which, when produced by
/// [`Dimension::pairing_dims`], is [`Dimension::ALL`] order minus the
/// slicing dimensions).
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, serde::Serialize)]
pub(crate) struct PairingKey(pub Vec<String>);
248
249impl PairingKey {
250 /// Extract the pairing key for `row` given the list of
251 /// dimensions to include. The scenario is ALWAYS the first
252 /// component; the `pairing_dims` list controls the rest.
253 /// Each non-scenario dim contributes a single string slot:
254 /// `Option<String>` fields render `None` as the empty
255 /// string, `Vec<String>` fields render as a sorted-deduped
256 /// `|`-joined string so the same set produces the same key
257 /// regardless of input order.
258 ///
259 /// Commit dimensions (`ProjectCommit`, `KernelCommit`) strip the
260 /// trailing `-dirty` suffix before contributing to the key.
261 /// Without the strip, a clean run at HEAD `abc1234` and a
262 /// dirty run at the same HEAD (`abc1234-dirty`) would shatter
263 /// into two separate pairing buckets, defeating
264 /// [`group_and_average_by`]'s `+mixed` cohort detection — that
265 /// helper can only surface "this aggregate has both clean and
266 /// dirty contributors" when the two contributors actually land
267 /// in the same group. Stripping at the key level pairs them by
268 /// canonical hex; the per-row `-dirty` distinction is preserved
269 /// downstream in the aggregate's `commit` / `kernel_commit`
270 /// field via the `+mixed` marker in
271 /// `group_and_average_by`'s `render_mixed_dirty` helper.
272 pub fn from_row(row: &GauntletRow, pairing_dims: &[Dimension]) -> Self {
273 let mut parts = Vec::with_capacity(1 + pairing_dims.len());
274 parts.push(row.scenario.clone());
275 for &dim in pairing_dims {
276 parts.push(match dim {
277 Dimension::Kernel => row.kernel_version.clone().unwrap_or_default(),
278 Dimension::Scheduler => row.scheduler.clone(),
279 Dimension::Topology => row.topology.clone(),
280 Dimension::WorkType => row.work_type.clone(),
281 Dimension::ProjectCommit => commit_pairing_key_part(&row.commit),
282 Dimension::KernelCommit => commit_pairing_key_part(&row.kernel_commit),
283 Dimension::RunSource => row.run_source.clone().unwrap_or_default(),
284 Dimension::ResolveSource => row.resolve_source.clone().unwrap_or_default(),
285 // Cross-budget rows never pair: a row's budget value
286 // becomes part of its pairing key (None -> empty, distinct
287 // from any real budget). A skip (None) only pairs with
288 // another skip.
289 Dimension::CpuBudget => row.cpu_budget.map(|n| n.to_string()).unwrap_or_default(),
290 });
291 }
292 PairingKey(parts)
293 }
294}
295
/// Canonicalise a commit value for use inside a [`PairingKey`].
///
/// * `None` → `""`
/// * `Some("abc1234")` → `"abc1234"` (already clean, unchanged)
/// * `Some("abc1234-dirty")` → `"abc1234"` (one trailing `-dirty`
///   removed, so the dirty run pairs with its clean sibling)
///
/// Called by [`PairingKey::from_row`] for both the `ProjectCommit` and
/// the `KernelCommit` dimension. Only the pairing key is canonicalised
/// here; the row's own commit fields are not touched.
fn commit_pairing_key_part(value: &Option<String>) -> String {
    value
        .as_deref()
        .map(|commit| commit.strip_suffix("-dirty").unwrap_or(commit))
        .unwrap_or_default()
        .to_string()
}
313
/// One aggregated `GauntletRow` produced by `group_and_average_by`,
/// plus the pass-bookkeeping needed to render the per-group summary
/// block (`N/M passed` + the `(S skip, I inc, F fail)` breakdown).
///
/// `row` carries arithmetic-mean metric values across every real
/// Pass contributor in the group; the (`scenario`, `topology`,
/// `work_type`, `scheduler`, `kernel_version`) identity is taken
/// verbatim from the first contributor in iteration order — every
/// contributor in the group shares the identity tuple by
/// construction (`scenario`, `topology`, and `work_type` ARE the
/// group key, and `scheduler` / `kernel_version` are
/// typed-filter-narrowed at the call site so they can only vary if
/// the operator passed no `--scheduler` / `--kernel` filter).
///
/// The verdict bits on `row` (`passed`, `skipped`, `inconclusive`)
/// fold under the strict 4-state
/// `Fail > Inconclusive > Pass > Skip` lattice: any failing
/// contributor sets the aggregate to Fail (`passed=false`,
/// `inconclusive=false`, `skipped=false`); else any inconclusive
/// contributor sets `inconclusive=true`; else any skipped
/// contributor sets `skipped=true`; only an all-pass cohort yields
/// `passed=true`. The lattice mechanics match
/// `GauntletRow::is_pass`'s triple-conjunct, so the aggregated
/// row's accessor reads honestly. Aggregate rows that are not real
/// Pass route the pair through `compare_rows_by`'s
/// `excluded_pairs` gate.
///
/// `passes_observed`, `skips_observed`, `inconclusives_observed`,
/// `failures_observed` and `total_observed` count contributors per
/// the strict 4-state mutex: the four bucket counters sum to
/// `total_observed` because every contributor falls into exactly
/// one bucket. Only real Pass contributors feed the per-row sums —
/// failing, inconclusive, and skipped contributors all carry no
/// comparable per-run signal (failure-mode telemetry; "couldn't
/// evaluate" non-signal; "didn't run" non-signal). When no
/// contributor passed cleanly the running sum is zero and the
/// aggregate `row` carries default-zero metric values plus
/// `passed = false` — the downstream `excluded_pairs` gate then
/// drops the pair from the regression math.
///
/// The struct is `#[non_exhaustive]`, so code outside this crate cannot
/// build it with a struct literal or match it exhaustively; fields may
/// be added later without breaking such callers.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct AveragedGroup {
    /// Aggregated row carrying arithmetic-mean metric values plus
    /// the lattice-folded `(passed, skipped, inconclusive)` bits
    /// matching the `Fail > Inconclusive > Pass > Skip`
    /// dominance. `passed` is true only when every contributor was
    /// a real pass; `inconclusive` fires when at least one
    /// contributor was Inconclusive and none failed; `skipped`
    /// fires when at least one contributor was Skip and none
    /// failed or was Inconclusive. Fed directly into
    /// `compare_rows` (averaging is the fixed compare_partitions fold).
    pub row: GauntletRow,
    /// Number of contributors that were a real pass
    /// (`is_pass() == true`). Renders as the numerator of the
    /// per-group `N/M` summary.
    pub passes_observed: u32,
    /// Number of contributors that were Skip (`is_skip() == true`).
    /// Surfaced in the per-group rendering as the "S skipped"
    /// breakdown so an operator can distinguish "scenario didn't
    /// run" from real failures.
    pub skips_observed: u32,
    /// Number of contributors that were Inconclusive
    /// (`is_inconclusive() == true`). Surfaced in the per-group
    /// rendering as the "I inconclusive" breakdown so an operator
    /// can distinguish "couldn't evaluate" from real failures —
    /// same defense-in-depth pattern as
    /// `format_dimension_summary`'s inconc bucket.
    pub inconclusives_observed: u32,
    /// Number of contributors that were a real Fail
    /// (`is_fail() == true`). Surfaced in the per-group rendering
    /// as the "F failed" breakdown.
    pub failures_observed: u32,
    /// Total contributors in the group (`= group.len()`). Renders
    /// as the denominator of the per-group `N/M` summary.
    /// Mechanically:
    /// `total_observed == passes_observed + skips_observed +
    /// inconclusives_observed + failures_observed`
    /// under the strict 4-state mutex.
    pub total_observed: u32,
}
394
/// Record one contributor's commit value in a group's dirty-tracking
/// state. Called by [`Accumulator::observe`] once per commit dimension
/// for every contributor, whatever its verdict, because a mixed working
/// tree describes the cohort rather than which runs succeeded.
///
/// * `value` is `None`: nothing changes — neither flag is set and
///   `first_base` is not seeded.
/// * `value` ends in `-dirty`: `*any_dirty` becomes `true`.
/// * otherwise: `*any_clean` becomes `true`.
///
/// In both `Some` cases `first_base` is filled, if still empty, with the
/// value minus one trailing `-dirty`. It therefore always holds the
/// un-suffixed form of the first commit seen, even when that first
/// contributor was dirty; later values never overwrite it.
fn update_dirty_tracking(
    value: &Option<String>,
    any_clean: &mut bool,
    any_dirty: &mut bool,
    first_base: &mut Option<String>,
) {
    if let Some(raw) = value.as_deref() {
        let canonical = match raw.strip_suffix("-dirty") {
            Some(stem) => {
                *any_dirty = true;
                stem
            }
            None => {
                *any_clean = true;
                raw
            }
        };
        // First-seen wins: an already-populated base is left alone.
        first_base.get_or_insert_with(|| canonical.to_string());
    }
}
430
/// Choose the commit string an aggregate row shows for one commit
/// dimension (project commit or kernel commit), given the cohort-wide
/// state collected by [`update_dirty_tracking`].
///
/// * Cohort is mixed (`any_clean` AND `any_dirty`) and a base was
///   recorded: returns `Some("{first_base}+mixed")`. Because
///   `first_base` is the un-suffixed form, the result reads
///   `abc1234+mixed` whether the first contributor was clean or dirty.
/// * Anything else (all clean, all dirty, all `None`, or no base):
///   returns a clone of `first_commit`, i.e. the first contributor's
///   value unchanged.
fn render_mixed_dirty(
    any_clean: bool,
    any_dirty: bool,
    first_base: &Option<String>,
    first_commit: &Option<String>,
) -> Option<String> {
    let cohort_is_mixed = any_clean && any_dirty;
    match first_base {
        Some(base) if cohort_is_mixed => Some(format!("{base}+mixed")),
        _ => first_commit.clone(),
    }
}
459
/// Per-pairing-group fold accumulator for [`group_and_average_by`].
/// Built via [`Accumulator::new`] from the group's first contributor,
/// fed one contributor at a time via [`Accumulator::observe`], and
/// folded into the emitted [`AveragedGroup`] via
/// [`Accumulator::into_averaged_group`]. Split out of
/// `group_and_average_by` only to satisfy the source-function size
/// guard — the field set and fold math are unchanged from the
/// in-function definition.
struct Accumulator<'a> {
    // First contributor of the group; borrowed for the accumulator's
    // lifetime and used as the source of first-seen values at emit time.
    first: &'a GauntletRow,
    // Contributor counts. `observe` bumps `total_observed` for every
    // row and exactly one of the four bucket counters per row.
    total_observed: u32,
    passes_observed: u32,
    skips_observed: u32,
    inconclusives_observed: u32,
    failures_observed: u32,
    // Sticky "at least one contributor had this state" flags, set by
    // `observe` and never cleared.
    any_skipped: bool,
    any_failed: bool,
    any_inconclusive: bool,
    any_expected_failure: bool,
    // Tracks whether contributors disagree on the `-dirty`
    // suffix for the project_commit / kernel_commit dimensions.
    // `any_*_clean` is true if any contributor's value is the
    // un-suffixed form; `any_*_dirty` is true if any contributor
    // ends in `-dirty`. When BOTH are true the aggregate is
    // mixed-dirty and the rendered `commit` / `kernel_commit`
    // gets a `+mixed` marker so downstream readers don't see a
    // single arbitrary contributor's status. Tracked across
    // EVERY contributor (passing, failing, skipped) — a mixed
    // working-tree state is metadata about the cohort, not
    // about the metric mean. `None` values are ignored
    // and do not flip either flag.
    any_project_clean: bool,
    any_project_dirty: bool,
    any_kernel_clean: bool,
    any_kernel_dirty: bool,
    // First-seen un-suffixed (clean-form) project / kernel
    // commit string. Held separately from `first` because
    // `first.commit` may be `Some("abc1234-dirty")` when the
    // first contributor was dirty but later contributors carry
    // the clean form — the rendered `+mixed` marker should
    // still attach to the canonical un-suffixed hex so the
    // operator sees `abc1234+mixed` not `abc1234-dirty+mixed`.
    first_project_base: Option<String>,
    first_kernel_base: Option<String>,
    // Per-row sums for mean-fold fields (Counter / Gauge(Last) /
    // Gauge(Avg) — though no typed Gauge(Avg) field exists
    // today). Only real-pass contributors are added (`observe`
    // returns before this point for skip / fail / inconclusive
    // rows). Arithmetic mean across runs is the operator-
    // facing cohort-comparison default; per-RUN totals are
    // averaged to produce a comparable per-run quantity
    // across cohorts of different run counts.
    sum_spread: f64,
    sum_migrations: u64,
    sum_migration_ratio: f64,
    sum_stuck_count: f64,
    sum_fallback_count: i64,
    sum_keep_last_count: i64,
    sum_total_iterations: u64,
    // sum_page_locality + sum_cross_node_mig removed: both NUMA roll-ups are now
    // ext-sourced (worst_page_locality = WorstLowest, worst_cross_node_migration_ratio
    // = WorstCrossNodeRatio), re-pooled from the per-phase carriers and
    // cross-run-MEAN-folded via the ext fold like the other migrated worst_*
    // selectors — not typed GauntletRow columns, so no per-row group-average
    // accumulators.
    // Per-row MAX-fold for Peak-kind fields. Per
    // `MetricKind::Peak` contract, cross-RUN aggregation
    // surfaces the worst-instant observed across the cohort —
    // averaging Peak across runs dilutes the high-water signal
    // (a 1-run spike at 100 averaged with 4 runs at 0 reports
    // 20, hiding the actual peak). MAX preserves "did this
    // peak ever fire in this cohort".
    max_gap_ms: u64,
    max_imbalance_ratio: f64,
    max_max_dsq_depth: u32,
    // Per-ext-metric (value, weight) pairs, accumulated across
    // contributors; a key absent from a contributor simply gets
    // no pair for that contributor. At emit time the kind-aware
    // fold dispatches each key through `aggregate_samples` with
    // `Some(&weights)` so Gauge(Avg) metrics get a weighted mean
    // (per the F-C fix on aggregate_samples) and other kinds fold
    // by their own semantics. Unregistered metric names (no
    // MetricDef) fall back to arithmetic mean — same legacy
    // semantic the previous (sum, u32) shape produced.
    ext_pairs: BTreeMap<String, Vec<(f64, usize)>>,
    // Union of the Dynamic monotonic-counter ext keys across contributors
    // (`GauntletRow::ext_counter_keys`). `fold_ext_metrics` SUM-folds these
    // instead of averaging — they are not in the static `METRICS` registry, so
    // `metric_def` can't classify them. Carried onto the aggregated row so a
    // second-level cross-RUN fold keeps SUM-folding them.
    ext_counter_keys: BTreeSet<String>,
    // Sum of `run_sample_count` across contributors. Carries
    // through to the aggregated row's `run_sample_count` so a
    // downstream cross-RUN consumer that further folds these
    // already-aggregated rows can apply the same weighted
    // semantic. Currently no typed Gauge(Avg) field exists
    // (imbalance_ratio is registered as `max_imbalance_ratio`
    // kind=Peak, NOT Gauge(Avg) — the Gauge(Avg) sibling
    // `avg_imbalance_ratio` lands in ext_metrics where the
    // weighted-mean dispatch already fires); the sum is
    // preserved here for future typed-field Gauge(Avg)
    // additions and for downstream cohort-of-cohort
    // aggregation that wants a meaningful weight.
    sum_run_sample_count: usize,
}
565
566impl<'a> Accumulator<'a> {
567 /// Seed an accumulator from the group's first contributor.
568 /// Identity is taken from `first`; every counter / sum / max
569 /// starts at its zero value. `observe` (called once per
570 /// contributor, including `first`) performs the per-row fold.
571 fn new(first: &'a GauntletRow) -> Self {
572 Accumulator {
573 first,
574 total_observed: 0,
575 passes_observed: 0,
576 skips_observed: 0,
577 inconclusives_observed: 0,
578 failures_observed: 0,
579 any_skipped: false,
580 any_failed: false,
581 any_inconclusive: false,
582 any_expected_failure: false,
583 any_project_clean: false,
584 any_project_dirty: false,
585 any_kernel_clean: false,
586 any_kernel_dirty: false,
587 first_project_base: None,
588 first_kernel_base: None,
589 sum_spread: 0.0,
590 sum_migrations: 0,
591 sum_migration_ratio: 0.0,
592 sum_stuck_count: 0.0,
593 sum_fallback_count: 0,
594 sum_keep_last_count: 0,
595 sum_total_iterations: 0,
596 max_gap_ms: 0,
597 max_imbalance_ratio: 0.0,
598 max_max_dsq_depth: 0,
599 ext_pairs: BTreeMap::new(),
600 ext_counter_keys: BTreeSet::new(),
601 sum_run_sample_count: 0,
602 }
603 }
604
605 /// Fold one contributor into the accumulator. Called once per
606 /// row in the group (including the group's first contributor).
607 /// Skip / fail / inconclusive contributors flip their verdict
608 /// bits and return early without feeding the metric sums; only
609 /// real passes contribute to the per-row sums and maxes.
610 fn observe(&mut self, row: &GauntletRow) {
611 self.total_observed += 1;
612 // Dirty-status tracking spans ALL contributors. Same hex
613 // with mixed dirty/clean across the cohort is the case the
614 // `+mixed` marker exists to surface — the per-row scope
615 // (passing, failing, skipped) is irrelevant since the
616 // marker describes WIP-vs-committed disagreement among the
617 // contributors, not their metric outcomes.
618 update_dirty_tracking(
619 &row.commit,
620 &mut self.any_project_clean,
621 &mut self.any_project_dirty,
622 &mut self.first_project_base,
623 );
624 update_dirty_tracking(
625 &row.kernel_commit,
626 &mut self.any_kernel_clean,
627 &mut self.any_kernel_dirty,
628 &mut self.first_kernel_base,
629 );
630 if row.expected_failure {
631 // An expect_err / expect_auto_repro run inverted to a pass:
632 // OR the flag so the aggregated row stays OUT of the
633 // ab-compare regression math (its telemetry is
634 // failure-mode-dominated). Its metrics may still fold into
635 // the cohort sums below, but compare_rows_by excludes any
636 // expected_failure row, so the aggregate is never read.
637 self.any_expected_failure = true;
638 }
639 if row.is_skip() {
640 self.any_skipped = true;
641 self.skips_observed += 1;
642 return;
643 }
644 if row.is_fail() {
645 self.any_failed = true;
646 self.failures_observed += 1;
647 return;
648 }
649 if row.is_inconclusive() {
650 // Inconclusive contributors are not passes (the gate
651 // could not be evaluated) and carry no measured signal
652 // worth folding into the cohort means. Track the bit
653 // for the aggregated verdict's `inconclusive` field
654 // (so the aggregate row reads Inconclusive in the
655 // `Fail > Inconclusive > Pass > Skip` lattice when no
656 // contributor failed) and skip the per-row sums.
657 self.any_inconclusive = true;
658 self.inconclusives_observed += 1;
659 return;
660 }
661 self.passes_observed += 1;
662 self.sum_spread += row.spread;
663 self.sum_migrations = self.sum_migrations.saturating_add(row.migrations);
664 self.sum_migration_ratio += row.migration_ratio;
665 self.sum_stuck_count += row.stuck_count;
666 self.sum_fallback_count = self.sum_fallback_count.saturating_add(row.fallback_count);
667 self.sum_keep_last_count = self.sum_keep_last_count.saturating_add(row.keep_last_count);
668 self.sum_total_iterations = self
669 .sum_total_iterations
670 .saturating_add(row.total_iterations);
671 // Peak-kind typed fields: cross-RUN aggregation surfaces
672 // the worst-instant observed across the cohort, NOT the
673 // arithmetic mean (which dilutes a single peak across
674 // many quiet runs and hides the high-water signal).
675 self.max_gap_ms = self.max_gap_ms.max(row.gap_ms);
676 if row.imbalance_ratio > self.max_imbalance_ratio {
677 self.max_imbalance_ratio = row.imbalance_ratio;
678 }
679 self.max_max_dsq_depth = self.max_max_dsq_depth.max(row.max_dsq_depth);
680 self.sum_run_sample_count = self
681 .sum_run_sample_count
682 .saturating_add(row.run_sample_count);
683 // Floor the cross-RUN weight at 1: a passing run that emitted this ext
684 // key contributes one observation to a Gauge(Avg) weighted mean, never
685 // zero-weighted out of a mixed cohort. A run with run_sample_count==0
686 // (e.g. snapshot-bridge-sourced metrics with no monitor samples) would
687 // otherwise be silently dropped from the mean. Matches the .max(1) floors
688 // at run_metrics.rs (populate_run_ext_metrics_from_phases) and
689 // stats_types.rs (merge_metric_values).
690 for (k, v) in &row.ext_metrics {
691 self.ext_pairs
692 .entry(k.clone())
693 .or_default()
694 .push((*v, row.run_sample_count.max(1)));
695 }
696 // Union the Dynamic monotonic-counter key tags across contributors.
697 // Load-bearing, NOT merely defensive: per-run bpf-field resolution (and
698 // which topology levels are present) can vary within a pairing group, so
699 // a key tagged in only some rows must still be recognized as a counter
700 // for the whole group. fold_ext_metrics SUM-folds the union, not means.
701 self.ext_counter_keys
702 .extend(row.ext_counter_keys.iter().cloned());
703 }
704
705 /// Emit the folded [`AveragedGroup`] for this group. Identity
706 /// fields are first-seen; metric fields are the kind-correct
707 /// cross-RUN fold (mean for Counter / mean-fold, MAX for Peak,
708 /// rounded mean for integer-typed fields); the verdict bits
709 /// fold under the `Fail > Inconclusive > Pass > Skip` lattice.
710 fn into_averaged_group(self) -> AveragedGroup {
711 let acc = self;
712 let n = acc.passes_observed;
713 let denom = if n == 0 { 1.0 } else { f64::from(n) };
714 // Rounded mean for integer-typed Counter / mean-fold
715 // fields. When n == 0 the sums are all zero, so dividing
716 // by 1.0 still yields 0 — the aggregate's passed=false
717 // routes the pair through excluded_pairs downstream and
718 // the metrics are never consulted. Peak-kind integer
719 // fields (max_dsq_depth) take the MAX-fold path directly
720 // and don't need a rounding helper.
721 let round_u64 = |sum: u64| -> u64 { (sum as f64 / denom).round() as u64 };
722 let round_i64 = |sum: i64| -> i64 { (sum as f64 / denom).round() as i64 };
723
724 // Mixed-dirty markers. When the cohort contains both a
725 // clean-form and dirty-form contributor for the same hex
726 // (e.g. some sidecars from a clean tree, others from a
727 // -dirty WIP), the rendered commit field carries `+mixed`
728 // appended to the canonical un-suffixed hex. The
729 // alternative — taking `acc.first.commit` verbatim — would
730 // hide WIP-vs-committed disagreement, presenting `abc1234`
731 // when half the contributors actually came from a dirty
732 // tree (or `abc1234-dirty` when half came from a clean
733 // tree). Operators reading averaged stats need to know the
734 // cohort spanned a working-tree state change, since that
735 // changes the meaning of the metric mean. `+mixed` is the
736 // chosen separator (not `-mixed`) so it cannot be confused
737 // with the existing `-dirty` suffix grammar — `dirty` is a
738 // per-record property, `mixed` is a cohort-level property.
739 let project_commit_rendered = render_mixed_dirty(
740 acc.any_project_clean,
741 acc.any_project_dirty,
742 &acc.first_project_base,
743 &acc.first.commit,
744 );
745 let kernel_commit_rendered = render_mixed_dirty(
746 acc.any_kernel_clean,
747 acc.any_kernel_dirty,
748 &acc.first_kernel_base,
749 &acc.first.kernel_commit,
750 );
751 // ext_metrics is built BEFORE the struct so Rate keys can be
752 // re-derived from the folded components as a post-pass. Rate and PerPhase
753 // are skipped here: Rate's components survive cross-RUN as their own ext
754 // keys so it re-derives Σnum/Σdenom (folding two ready-made ratios would
755 // lose the re-pool, and routing a Rate through
756 // aggregate_samples_weighted would hit the aggregate_finite guard);
757 // PerPhase is a per-phase-only scalar with no cross-RUN aggregate.
758 // Distribution / WorstLowest / WakeLatencyTailRatio / WorstCrossNodeRatio
759 // are NOT skipped — their raw components do
760 // NOT survive cross-RUN (phases are dropped), so there is no pooled set
761 // to re-derive; they fall through to aggregate_samples_weighted and
762 // fold by kind (MEAN for the percentile / CV / mean reductions and
763 // every WorstLowest, MAX for SampleReduction::Worst — the
764 // aggregate_finite arms). Dispatch by registered MetricKind so
765 // Gauge(Avg) gets the weighted-mean fold (matches the per-phase merge
766 // contract); unregistered names (no metric_def) fall back to
767 // arithmetic mean, the legacy (sum, count) semantic. Skip a key whose
768 // reduction is None (every value NaN — defensive post sidecar_to_row
769 // sanitize).
770 let ext_metrics = fold_ext_metrics(acc.ext_pairs, &acc.ext_counter_keys);
771 let aggregated = GauntletRow {
772 scenario: acc.first.scenario.clone(),
773 // Per-test gate declarations are identical across a test's grouped
774 // runs (same entry), so the first row's carry the group's.
775 perf_delta_assertions: acc.first.perf_delta_assertions.clone(),
776 topology: acc.first.topology.clone(),
777 work_type: acc.first.work_type.clone(),
778 scheduler: acc.first.scheduler.clone(),
779 kernel_version: acc.first.kernel_version.clone(),
780 commit: project_commit_rendered,
781 kernel_commit: kernel_commit_rendered,
782 run_source: acc.first.run_source.clone(),
783 resolve_source: acc.first.resolve_source.clone(),
784 // First-seen budget metadata, like scheduler/kernel_version
785 // above. CpuBudget is a PAIRING dim (not sliceable — see
786 // Dimension::SLICEABLE), so it is part of the group key and every
787 // contributor shares one budget; the first row's value is the
788 // group's. vcpus is likewise first-seen metadata — and is NOT a
789 // Dimension. No post-aggregation consumer reads the aggregated
790 // vcpus (`render_overcommit_warning` and the other overcommit
791 // checks run pre-aggregation on the raw rows), so the first-seen
792 // value is metadata only.
793 cpu_budget: acc.first.cpu_budget,
794 vcpus: acc.first.vcpus,
795 // ALL must pass: any failed, inconclusive, or skipped
796 // contributor flips the aggregate. A group with zero
797 // passes_observed (every contributor failed, was
798 // inconclusive, or was skipped) collapses to
799 // passed=false here. The four-bit verdict is
800 // strict 4-state (exactly one of pass/skip/inconc/fail
801 // set per row); the lattice
802 // `Fail > Inconclusive > Pass > Skip` determines which
803 // bit dominates when a cohort has mixed contributors.
804 // Skip is the lowest-precedence bit — it fires only
805 // when no contributor failed AND no contributor was
806 // inconclusive AND at least one was skipped. Fail
807 // (all-false) dominates Inconclusive dominates Skip;
808 // exactly one of the four states is encoded per row.
809 passed: !acc.any_failed && !acc.any_inconclusive && !acc.any_skipped && n > 0,
810 skipped: !acc.any_failed && !acc.any_inconclusive && acc.any_skipped,
811 inconclusive: !acc.any_failed && acc.any_inconclusive,
812 expected_failure: acc.any_expected_failure,
813 // Sum across contributors so the aggregated row's
814 // weight is the cohort's total sample population. A
815 // downstream consumer that further folds these
816 // aggregated rows can apply the same weighted semantic
817 // (a 5-RUN cohort of 50-sample runs weighs 250 vs a
818 // 1-RUN cohort of 10 samples weighting 10).
819 run_sample_count: acc.sum_run_sample_count,
820 spread: acc.sum_spread / denom,
821 // Peak-kind typed fields: MAX across runs (kind-correct
822 // cross-RUN fold; arithmetic mean dilutes the
823 // worst-instant signal).
824 gap_ms: acc.max_gap_ms,
825 imbalance_ratio: acc.max_imbalance_ratio,
826 max_dsq_depth: acc.max_max_dsq_depth,
827 migrations: round_u64(acc.sum_migrations),
828 migration_ratio: acc.sum_migration_ratio / denom,
829 stuck_count: acc.sum_stuck_count / denom,
830 fallback_count: round_i64(acc.sum_fallback_count),
831 keep_last_count: round_i64(acc.sum_keep_last_count),
832 total_iterations: round_u64(acc.sum_total_iterations),
833 ext_metrics,
834 // Carry the Dynamic counter-key tags forward so a second-level
835 // cross-RUN fold of these already-aggregated rows keeps SUM-folding
836 // them (the SUM-of-SUMs stays a SUM).
837 ext_counter_keys: acc.ext_counter_keys,
838 // Phase buckets do not aggregate cleanly across an
839 // averaged group: two contributors might run different
840 // scenarios with different phase counts, and per-phase
841 // averaging across mismatched step_index sets would
842 // invent rows neither side carried. Surface the empty
843 // slice so downstream consumers fall back to the flat
844 // bucket. Averaged groups carry no per-phase data; the
845 // per-step_index intersection + one-sided-step surfacing
846 // semantic lives in the per-run noise path
847 // (noise_phase_findings), not the averaging path.
848 phases: Vec::new(),
849 };
850 AveragedGroup {
851 row: aggregated,
852 passes_observed: acc.passes_observed,
853 skips_observed: acc.skips_observed,
854 inconclusives_observed: acc.inconclusives_observed,
855 failures_observed: acc.failures_observed,
856 total_observed: acc.total_observed,
857 }
858 }
859}
860
861/// Fold one group's accumulated per-ext-metric (value, weight) pairs
862/// into the aggregated row's `ext_metrics` map. Rate, PerPhase, and
863/// PerRunDistribution are skipped in the kind dispatch: Rate's components
864/// survive cross-RUN as their own ext keys so it re-derives Σnum/Σdenom
865/// (folding two ready-made ratios would lose the re-pool, and routing a Rate
866/// through aggregate_samples_weighted would hit the aggregate_finite guard);
867/// PerPhase is a per-phase-only scalar with no cross-RUN aggregate;
868/// PerRunDistribution is a whole-run percentile/min/max that CANNOT be
869/// cross-RUN folded (a percentile of a union is not a mean of per-run
870/// percentiles, and the per-phase histograms are dropped cross-RUN so there is
871/// no pooled set to re-derive) — its only cross-RUN consumer is the per-run
872/// noise-compare (`noise_findings`), so it must never be averaged here.
873/// Distribution / WorstLowest / WakeLatencyTailRatio / WorstCrossNodeRatio
874/// are NOT skipped — their raw components do
875/// NOT survive cross-RUN (phases are dropped), so there is no pooled set
876/// to re-derive; they fall through to aggregate_samples_weighted and
877/// fold by kind (MEAN for the percentile / CV / mean reductions and
878/// every WorstLowest, MAX for SampleReduction::Worst — the
879/// aggregate_finite arms). Dispatch by registered MetricKind so
880/// Gauge(Avg) gets the weighted-mean fold (matches the per-phase merge
881/// contract); an unregistered name (no metric_def) folds by `counter_keys`: a
882/// Dynamic monotonic-counter key (lb_*/alb_* schedstat delta, ScalarCounter bpf
883/// field) SUM-folds (matching the registered-Counter convention), every other
884/// unregistered name falls back to arithmetic mean, the legacy (sum, count)
885/// semantic. Skip a key whose
886/// reduction is None (every value NaN — defensive post sidecar_to_row
887/// sanitize). Rate metrics are then re-derived from the folded
888/// components (Σnum/Σdenom) as a post-pass.
889fn fold_ext_metrics(
890 ext_pairs: BTreeMap<String, Vec<(f64, usize)>>,
891 counter_keys: &BTreeSet<String>,
892) -> BTreeMap<String, f64> {
893 let mut ext_metrics: std::collections::BTreeMap<String, f64> = ext_pairs
894 .into_iter()
895 .filter_map(|(k, pairs)| {
896 if let Some(def) = metric_def(&k) {
897 // Rate re-derives from its folded components (post-pass below);
898 // PerPhase is a per-phase-only scalar with no meaningful
899 // cross-RUN aggregate — both are skipped here. The PerPhase skip
900 // is also load-bearing: a PerPhase key reaching
901 // aggregate_samples_weighted would hit aggregate_finite's
902 // unreachable!() arm. (PerPhase keys should never reach here —
903 // populate_run_ext_metrics_from_phases skips is_derived keys —
904 // so this is defensive belt-and-suspenders.)
905 if matches!(
906 def.kind,
907 MetricKind::Rate { .. } | MetricKind::PerPhase | MetricKind::PerRunDistribution
908 ) {
909 return None;
910 }
911 aggregate_samples_weighted(&pairs, def.kind).map(|v| (k, v))
912 } else {
913 let n = pairs.len();
914 if n == 0 {
915 None
916 } else {
917 let sum: f64 = pairs.iter().map(|(v, _)| *v).sum();
918 // A Dynamic monotonic-counter key (lb_*/alb_* schedstat delta
919 // or a ScalarCounter bpf field) SUM-folds across runs,
920 // matching the registered-Counter convention (aggregate_finite's
921 // Counter arm). Untagged keys (gauges, per-CPU _avg/_max) keep
922 // the legacy arithmetic-mean fold.
923 let v = if counter_keys.contains(&k) {
924 sum
925 } else {
926 sum / n as f64
927 };
928 Some((k, v))
929 }
930 }
931 })
932 .collect();
933 // Re-derive Rate metrics from the folded components (Σnum/Σdenom).
934 derive_rate_metrics(&mut ext_metrics);
935 ext_metrics
936}
937
938/// Group `rows` by the dynamic pairing key (`scenario` plus every
939/// dimension in `pairing_dims`) and arithmetic-mean their metric
940/// fields, returning one [`AveragedGroup`] per distinct key.
941/// Slicing dims are EXCLUDED from `pairing_dims` (rows on the A/B
942/// sides differ on them by design); pairing dims are INCLUDED.
943///
944/// Group key matches [`compare_rows_by`]' pairing key so the post-
945/// aggregation row vec joins cleanly across A/B sides under the
946/// same identity contract.
947///
948/// Aggregation rules:
949/// - The verdict bits `(passed, skipped, inconclusive)` aggregate
950/// under the strict 4-state mutex per the
951/// `Fail > Inconclusive > Pass > Skip` lattice. Fail (all-false)
952/// dominates: any failed contributor flips the aggregate's
953/// `passed` to `false` and leaves `skipped`/`inconclusive` clear,
954/// yielding Fail at the aggregate level. Otherwise Inconclusive
955/// dominates: any inconclusive contributor sets the aggregate's
956/// `inconclusive = true`. Otherwise Skip dominates: any skipped
957/// contributor sets `skipped = true`. Only when every contributor
958/// was a real Pass does the aggregate carry `passed = true`. This
959/// matches [`GauntletRow::is_pass`]'s triple-conjunct semantics
960/// so the aggregate's accessor reads honestly.
961/// - Metrics (`f64` / `u64` / `i64` fields, plus `ext_metrics`
962/// entries) are summed only across contributors where
963/// `passed && !skipped`, then divided by that count to yield an
964/// arithmetic mean. Failing/skipped contributors carry telemetry
965/// dominated by the failure mode, NOT scheduler behaviour, and
966/// are therefore excluded from the mean. When no contributor
967/// passed cleanly, every metric defaults to zero and the
968/// aggregate's `passed = false` routes the pair to
969/// [`compare_rows_by`]' `excluded_pairs` gate.
970/// - `u64` / `i64` fields take the rounded mean
971/// (`(sum / count).round() as u64`). The up-to-0.5-unit per-side
972/// rounding error (up to 1.0 across an A/B pair) stays below each
973/// such field's `default_abs` gate: the smallest is
974/// `total_iterations` / `total_migrations` at 2.0, held `>= 2.0` by
975/// the scale-varying #28 recalibration precisely so a rounding-only
976/// delta (`<= 1.0`) never clears the gate and fabricates a unit
977/// regression.
978/// - `stuck_count` is the exception: it is `f64` and carries the
979/// EXACT mean (`sum / count`, no rounding). Its `default_abs` is
980/// 1.0 — tight enough that a rounded mean's up-to-1.0 per-A/B-pair
981/// error would fabricate single-stall regressions from sub-integer
982/// differences (an A-side mean of 1.4 vs a B-side 1.6 rounds to
983/// 1 vs 2, a spurious delta of 1).
984/// - `ext_metrics` keys are unioned across passing contributors;
985/// each key's mean is computed only across contributors that
986/// carried it. A key present in some passing rows and absent
987/// from others uses the present-only count as its denominator —
988/// absent-and-zero are not equivalent (the `BTreeMap<String,
989/// f64>` shape cannot represent "absent" with a stored zero).
990/// - Identity fields (`scenario`, `topology`, `work_type`,
991/// `scheduler`, `kernel_version`) come from the first contributor
992/// in iteration order. Every contributor in the group shares the
993/// first three by construction (group key); `scheduler` and
994/// `kernel_version` may vary across the group if the operator did
995/// not narrow via typed filters first, but the aggregated row
996/// carries the first contributor's value in any case — the join
997/// downstream uses the three-tuple, so scheduler/version on the
998/// aggregate is metadata, not a join key.
999/// - Commit dimensions (`commit`, `kernel_commit`) follow a
1000/// first-seen rule with one exception: when contributors disagree
1001/// on the `-dirty` suffix for the same canonical hex (some clean,
1002/// some dirty), the rendered form becomes `{hex}+mixed` so the
1003/// working-tree disagreement is surfaced rather than hidden by
1004/// first-seen. `+mixed` (not `-mixed`) is intentional —
1005/// `-dirty` is a per-record property of one sidecar, `+mixed`
1006/// is a cohort-level property of the average. Mixed-dirty
1007/// tracking spans EVERY contributor (passing, failing, skipped)
1008/// because the cohort's WIP state is metadata, not a metric.
1009///
1010/// Group iteration order matches the order of FIRST appearance of
1011/// each key in `rows`; `BTreeMap` ordering is by key (not iteration
1012/// order) so we maintain a parallel `Vec<key>` to preserve
1013/// first-seen ordering. Stable order keeps test fixtures
1014/// deterministic across runs.
1015pub fn group_and_average_by(
1016 rows: &[GauntletRow],
1017 pairing_dims: &[Dimension],
1018) -> Vec<AveragedGroup> {
1019 // Dynamic pairing key — scenario + every NON-slicing
1020 // dimension's value, in [`Dimension::ALL`] order. The
1021 // `PairingKey` newtype is owned (`Vec<String>`) so the
1022 // BTreeMap can hold keys without lifetime gymnastics; the
1023 // alternative — borrowing slices into `rows` — would force
1024 // every consumer to keep `rows` alive for the duration of
1025 // the map.
1026 type Key = PairingKey;
1027
1028 let mut order: Vec<Key> = Vec::new();
1029 let mut groups: BTreeMap<Key, Accumulator<'_>> = BTreeMap::new();
1030
1031 for row in rows {
1032 let key = PairingKey::from_row(row, pairing_dims);
1033 let acc = groups.entry(key.clone()).or_insert_with(|| {
1034 order.push(key);
1035 Accumulator::new(row)
1036 });
1037 acc.observe(row);
1038 }
1039
1040 let mut out = Vec::with_capacity(order.len());
1041 for key in order {
1042 let acc = groups
1043 .remove(&key)
1044 .expect("first-seen key must still be in groups map");
1045 out.push(acc.into_averaged_group());
1046 }
1047 out
1048}
1049
1050/// Convert a SidecarResult to a GauntletRow for run-to-run comparison.
1051///
1052/// Non-finite f64 values (NaN, ±Infinity) are sanitized to 0.0 with a
1053/// warn before they reach the row. `serde_json::to_string` rejects
1054/// non-finite, so a single poisoned metric would otherwise halt every
1055/// downstream JSON write. Sanitizing at the ingress boundary keeps the
1056/// serializer happy without silencing the upstream data quality issue.
1057///
1058/// # NaN → 0.0 ambiguity for zero-meaningful metrics
1059///
1060/// The 0.0 substitution is indistinguishable from a legitimate 0.0
1061/// measurement for metrics whose natural zero carries its own signal.
1062/// One direct f64 field is especially affected — note the in-tree producer
1063/// already guards the typical divide-by-zero path
1064/// (`assert::reductions::migration_ratio_of` emits `0.0` for
1065/// migration_ratio when `total_iterations == 0`), so a NaN reaching
1066/// this boundary indicates an upstream producer outside that guard (e.g. an
1067/// external `ext_metrics` contributor, or a schedstat arithmetic
1068/// edge that slipped past a guard):
1069///
1070/// - `migration_ratio`: lower-better. A real 0.0 means "no task was
1071/// migrated" (ideal locality). A sanitized NaN collapses to the
1072/// same value and reads as *falsely good* — a downstream regression
1073/// gate sees "perfect locality" where the truth is "no data".
1074/// (`page_locality` is NO LONGER a finite_or_zero typed field: it is the
1075/// ext-sourced `worst_page_locality` WorstLowest metric, re-pooled from the
1076/// per-phase NUMA carriers, so a non-finite value is DROPPED via the ext path —
1077/// absence preserved — not coerced to a falsely-bad 0.0 here.)
1078///
1079/// The reclassified wake-latency / run-delay distributions (e.g.
1080/// `worst_wake_latency_cv`) are NO LONGER direct f64 fields — they flow
1081/// through `ext_metrics`, where a non-finite value is DROPPED (the entry is
1082/// absent), NOT substituted with 0.0. That is the opposite, no-false-zero
1083/// contract: an absent key reads as no-data, distinct from a measured 0.0.
1084///
1085/// The accompanying `tracing::warn!` is the only signal that
1086/// separates a sanitized NaN from a real 0.0; downstream aggregation
1087/// by value alone cannot distinguish them.
1088pub fn sidecar_to_row(sc: &crate::test_support::SidecarResult) -> GauntletRow {
1089 // Local closure so the warn can carry the scenario name as
1090 // context — keyed by field so the operator can pinpoint which
1091 // metric produced the bad value.
1092 let finite_or_zero = |field: &str, v: f64| -> f64 {
1093 if v.is_finite() {
1094 v
1095 } else {
1096 tracing::warn!(
1097 test = %sc.test_name,
1098 field,
1099 value = v,
1100 "non-finite f64 in GauntletRow field; substituting 0.0",
1101 );
1102 0.0
1103 }
1104 };
1105
1106 // Build ext_metrics from the in-guest payload map (dropping the
1107 // walk-truncation sentinel + non-finite values), then layer in the
1108 // host-side monitor schedstat aggregates below.
1109 let mut ext_metrics: BTreeMap<String, f64> = sc
1110 .stats
1111 .ext_metrics
1112 .iter()
1113 .filter_map(|(k, &v)| {
1114 if crate::test_support::is_truncation_sentinel_name(k) {
1115 return None;
1116 }
1117 if v.is_finite() {
1118 Some((k.clone(), v))
1119 } else {
1120 tracing::warn!(
1121 test = %sc.test_name,
1122 metric = %k,
1123 value = v,
1124 "dropping non-finite ext_metric; serde_json rejects NaN/Infinity",
1125 );
1126 None
1127 }
1128 })
1129 .collect();
1130 // System-wide schedstat aggregates, read host-side from guest memory
1131 // at freeze (zero observer effect; `MonitorSummary::schedstat_deltas`,
1132 // summed across CPUs over the run). Keys ABSENT when CONFIG_SCHEDSTATS
1133 // is off (schedstat_deltas == None): absent != 0 for a no-data run, and
1134 // a 0 would pollute the cross-run Counter SUM and the Rate denominators
1135 // (`total_pcount`, `total_ttwu_count`). All seven
1136 // insert under one `if let` so each Rate's numerator/denominator pair is
1137 // always co-present (derive_rate_metrics needs both). `u64 -> f64` is
1138 // exact below 2^53 and inherently finite, so these skip the finite
1139 // filter the payload keys go through. The registry entries are
1140 // `Polarity::Informational` Counter raw components that feed nine
1141 // `MetricKind::Rate` derivations (per-schedule: total_run_delay_ns_per_sched,
1142 // ttwu_local_fraction, sched_goidle_fraction; per-second: run_delay_per_sec,
1143 // pcount_per_sec, sched_count_per_sec, yld_count_per_sec, ttwu_count_per_sec,
1144 // sched_goidle_per_sec); see [`crate::stats::METRICS`].
1145 if let Some(sd) = sc
1146 .monitor
1147 .as_ref()
1148 .and_then(|m| m.schedstat_deltas.as_ref())
1149 {
1150 ext_metrics.insert("total_run_delay".to_string(), sd.total_run_delay as f64);
1151 ext_metrics.insert("total_pcount".to_string(), sd.total_pcount as f64);
1152 ext_metrics.insert("total_sched_count".to_string(), sd.total_sched_count as f64);
1153 ext_metrics.insert("total_yld_count".to_string(), sd.total_yld_count as f64);
1154 ext_metrics.insert(
1155 "total_sched_goidle".to_string(),
1156 sd.total_sched_goidle as f64,
1157 );
1158 ext_metrics.insert("total_ttwu_count".to_string(), sd.total_ttwu_count as f64);
1159 ext_metrics.insert("total_ttwu_local".to_string(), sd.total_ttwu_local as f64);
1160 // Per-second Rate denominator: the schedstat-window span, co-inserted
1161 // both-or-neither with the total_* numerators above so every *_per_sec
1162 // schedstat Rate has its matching-window denominator present (the
1163 // derive_rate_metrics num+den co-presence invariant; the same window the
1164 // total_* deltas span, so num/den share a time base).
1165 ext_metrics.insert(
1166 "total_schedstat_wall_sec".to_string(),
1167 sd.total_schedstat_wall_sec,
1168 );
1169 }
1170 // Run-level ext-only monitor metrics (avg_nr_running + the PELT IRQ load
1171 // pair + the PSI-irq pair), folded from the run's MonitorSummary. Inserted
1172 // only when the run has monitor samples (a 0-sample run carries no
1173 // occupancy / IRQ signal — absent, not a false 0.0); the IRQ fields insert
1174 // only on Some (loud-absent on a kernel without the source). Shared with
1175 // VmResult::run_metric via fold_run_level_ext so the key list + loud-absent
1176 // guard can't drift between the sidecar row and the in-test accessor.
1177 // Dynamic monotonic-counter ext keys (lb_*/alb_* schedstat deltas + any
1178 // ScalarCounter bpf field) collected alongside the values so the cross-run
1179 // fold SUM-folds them (they are not in the static METRICS registry, so
1180 // metric_def can't classify them — see fold_ext_metrics).
1181 let mut ext_counter_keys = BTreeSet::new();
1182 if let Some(m) = sc.monitor.as_ref() {
1183 m.fold_run_level_ext_with_counter_keys(&mut ext_metrics, &mut ext_counter_keys);
1184 }
1185
1186 GauntletRow {
1187 scenario: sc.test_name.clone(),
1188 perf_delta_assertions: sc.perf_delta_assertions.clone(),
1189 topology: sc.topology.clone(),
1190 work_type: sc.work_type.clone(),
1191 scheduler: sc.scheduler.clone(),
1192 kernel_version: sc.kernel_version.clone(),
1193 commit: sc.project_commit.clone(),
1194 kernel_commit: sc.kernel_commit.clone(),
1195 run_source: sc.run_source.clone(),
1196 resolve_source: sc.resolve_source.clone(),
1197 // 0 = skip rows (never booted) -> None: skips carry no budget
1198 // identity, so they don't pair into a "budget 0" bucket.
1199 cpu_budget: (sc.cpu_budget != 0).then_some(sc.cpu_budget),
1200 vcpus: (sc.vcpus != 0).then_some(sc.vcpus),
1201 passed: sc.is_pass(),
1202 skipped: sc.is_skip(),
1203 inconclusive: sc.is_inconclusive(),
1204 expected_failure: sc.expected_failure,
1205 run_sample_count: sc.monitor.as_ref().map(|m| m.total_samples).unwrap_or(0),
1206 spread: finite_or_zero("spread", sc.stats.worst_spread),
1207 gap_ms: sc.stats.worst_gap_ms,
1208 migrations: sc.stats.total_migrations,
1209 migration_ratio: finite_or_zero("migration_ratio", sc.stats.worst_migration_ratio),
1210 imbalance_ratio: finite_or_zero(
1211 "imbalance_ratio",
1212 sc.monitor
1213 .as_ref()
1214 .map(|m| m.max_imbalance_ratio)
1215 .unwrap_or(0.0),
1216 ),
1217 max_dsq_depth: sc
1218 .monitor
1219 .as_ref()
1220 .map(|m| m.max_local_dsq_depth)
1221 .unwrap_or(0),
1222 stuck_count: sc.monitor.as_ref().map(|m| m.stuck_count).unwrap_or(0) as f64,
1223 fallback_count: sc
1224 .monitor
1225 .as_ref()
1226 .and_then(|m| m.event_deltas.as_ref())
1227 .map(|e| e.total_fallback)
1228 .unwrap_or(0),
1229 keep_last_count: sc
1230 .monitor
1231 .as_ref()
1232 .and_then(|m| m.event_deltas.as_ref())
1233 .map(|e| e.total_dispatch_keep_last)
1234 .unwrap_or(0),
1235 total_iterations: sc.stats.total_iterations,
1236 // Built above: in-guest payload ext keys (non-finite values and
1237 // the walk-truncation sentinel dropped — a dropped non-finite must
1238 // not be confused with a real 0.0, and the sentinel is JSON-walker
1239 // diagnostic metadata, not a scenario metric) plus the host-side
1240 // monitor schedstat aggregates.
1241 ext_metrics,
1242 // Which of the Dynamic ext keys are monotonic counters (SUM-fold
1243 // cross-run); empty when there is no monitor / no counter keys.
1244 ext_counter_keys,
1245 // Carry per-phase buckets verbatim from the source
1246 // ScenarioStats. The bucket structure has already been
1247 // reduced by the host-side phase aggregator (Counter via
1248 // `phase_counter_delta`, Gauge/Peak/Timestamp via
1249 // `aggregate_samples`), so the sidecar -> row step just
1250 // forwards the prebuilt slice. An empty `phases` slot on
1251 // the source sidecar (single-phase scenario or legacy
1252 // file) flows through as an empty slice.
1253 phases: sc.stats.phases.clone(),
1254 }
1255}