ktstr/stats/
group.rs

1use super::*;
2
/// A single axis of a `GauntletRow`'s identity as seen by the
/// comparison pipeline. There are nine axes: `kernel`,
/// `scheduler`, `topology`, `work-type`, `project-commit`,
/// `kernel-commit`, `run-source`, `resolve-source` and
/// `cpu-budget`. Modelling them as an enum lets
/// `compare_partitions` derive both the slicing dims and the
/// dynamic pairing key from data instead of repeating the list at
/// each call site.
///
/// Variant names mirror the CLI flag suffix
/// (`Dimension::ProjectCommit` ↔ `--project-commit`,
/// `Dimension::RunSource` ↔ `--run-source`,
/// `Dimension::CpuBudget` ↔ `--cpu-budget`), so the operator
/// surface maps onto the enum without a lookup table; see
/// [`Dimension::name`] for the exact strings.
///
/// `scenario` is deliberately absent: it is the test name and is
/// always pinned in the pairing key, because pairing scenario A
/// with scenario B would compare unrelated tests.
///
/// Declaration order is meaningful. It is the CLI-flag order
/// (`--kernel` / `--scheduler` / `--topology` / `--work-type` /
/// `--project-commit` / `--kernel-commit` / `--run-source` /
/// `--resolve-source` / `--cpu-budget`), it is the order of
/// [`Dimension::ALL`], and it is the order the derived
/// `PartialOrd` / `Ord` impls use.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum Dimension {
    Kernel,
    Scheduler,
    Topology,
    WorkType,
    ProjectCommit,
    KernelCommit,
    RunSource,
    ResolveSource,
    CpuBudget,
}

impl Dimension {
    /// All nine dimensions, listed in CLI-flag order.
    /// [`Self::pairing_dims`] walks this slice to build the
    /// complement of a slicing-dim set. The subset that
    /// [`derive_slicing_dims`] is allowed to contrast on lives in
    /// [`Self::SLICEABLE`].
    pub const ALL: &'static [Dimension] = &[
        Self::Kernel,
        Self::Scheduler,
        Self::Topology,
        Self::WorkType,
        Self::ProjectCommit,
        Self::KernelCommit,
        Self::RunSource,
        Self::ResolveSource,
        Self::CpuBudget,
    ];

    /// Dimensions that are allowed to form an A/B contrast (a
    /// slice). Only the version axes qualify — kernel version,
    /// project commit and kernel commit — because contrasting
    /// across them asks the purposeful question "did this change
    /// regress?". The remaining six
    /// (scheduler/topology/work_type/run_source/resolve_source/cpu_budget)
    /// are filter + pairing only: they narrow the cohort and join A
    /// to B, but a contrast across them would bulk-compare
    /// heterogeneous runs that the significance math cannot soundly
    /// attribute. Cross-config questions belong in-test via the
    /// Verdict DSL (`better_across_phases`).
    pub const SLICEABLE: &'static [Dimension] = &[
        Self::Kernel,
        Self::ProjectCommit,
        Self::KernelCommit,
    ];

    /// Return the pairing dimensions for a given slicing set:
    /// every entry of [`Dimension::ALL`] that does not appear in
    /// `slicing`, preserving canonical order. Slicing dims define
    /// the contrast (they differ between A and B); pairing dims
    /// define the join (they match between A and B). A dimension
    /// outside [`Self::SLICEABLE`] is never expected in `slicing`,
    /// so it always ends up in the returned list.
    pub fn pairing_dims(slicing: &[Dimension]) -> Vec<Dimension> {
        // Start from the full canonical list and drop the slicing
        // dims in place; `retain` keeps the survivors in order.
        let mut pairing = Self::ALL.to_vec();
        pairing.retain(|dim| !slicing.contains(dim));
        pairing
    }

    /// Operator-facing name of the dimension, identical to the CLI
    /// flag suffix (`--kernel` → `"kernel"`, `--work-type` →
    /// `"work-type"`). Used in the "slicing dimensions: ..." /
    /// "pairing on: ..." header lines and in the "A and B select
    /// identical rows" error.
    pub fn name(self) -> &'static str {
        match self {
            Self::Kernel => "kernel",
            Self::Scheduler => "scheduler",
            Self::Topology => "topology",
            Self::WorkType => "work-type",
            Self::ProjectCommit => "project-commit",
            Self::KernelCommit => "kernel-commit",
            Self::RunSource => "run-source",
            Self::ResolveSource => "resolve-source",
            Self::CpuBudget => "cpu-budget",
        }
    }
}
108
/// Legacy pairing-dim set used by tests that pre-date the
/// dimensional-slicing refactor. Equivalent to the historical
/// hardcoded tuple `(scenario, topology, work_type)` — scenario
/// is always implicit in [`PairingKey::from_row`] (it is pushed as
/// the first key component regardless of the dim list) and the
/// remaining two dimensions are listed here, so a key built from
/// this constant has exactly three slots.
///
/// Production callers (`compare_partitions`) compute pairing dims
/// via [`Dimension::pairing_dims`] from the slicing-dim derivation;
/// only test fixtures use this constant directly, so it is gated
/// behind `#[cfg(test)]`.
#[cfg(test)]
pub(crate) const LEGACY_PAIRING_DIMS: &[Dimension] = &[Dimension::Topology, Dimension::WorkType];
120
121/// Derive the set of dimensions on which `filter_a` and
122/// `filter_b` differ. These are the SLICING dimensions —
123/// dimensions on which the two sides select disjoint cohorts and
124/// therefore form the A/B contrast. The complement (every other
125/// dimension) is the PAIRING-key dimension set used by
126/// `compare_rows` to join A-side rows against B-side rows.
127///
128/// Comparison shape per dimension: every dim uses the same
129/// SORTED-DEDUPED `Vec<&str>` comparison — order and multiplicity
130/// don't matter (`--a-kernel 6.14 --a-kernel 6.15` and
131/// `--b-kernel 6.15 --b-kernel 6.14` are NOT a slice). All nine
132/// dimensions are repeatable Vec filters; the previously
133/// `Option<String>`-typed `scheduler` / `topology` / `work_type`
134/// dims were promoted to `Vec<String>` so the operator-visible
135/// shape is uniform across every dimension.
136///
137/// Returns dimensions in canonical ([`Dimension::ALL`]) order so callers
138/// (header lines, error messages, side labels) get a stable presentation.
139/// Only a [`Dimension::SLICEABLE`] dimension can be a slicing dim; the
140/// non-sliceable dims are filter + pairing only and are only ever set via a
141/// single shared `--<x>` filter (applied to BOTH sides), so they can never
142/// differ A↔B — the walk skips them.
143pub fn derive_slicing_dims(filter_a: &RowFilter, filter_b: &RowFilter) -> Vec<Dimension> {
144    let mut out = Vec::new();
145    for &dim in Dimension::SLICEABLE {
146        let differs = match dim {
147            Dimension::Kernel => sorted_dedup(&filter_a.kernels) != sorted_dedup(&filter_b.kernels),
148            Dimension::ProjectCommit => {
149                sorted_dedup(&filter_a.project_commits) != sorted_dedup(&filter_b.project_commits)
150            }
151            Dimension::KernelCommit => {
152                sorted_dedup(&filter_a.kernel_commits) != sorted_dedup(&filter_b.kernel_commits)
153            }
154            // Non-sliceable dims are filter + pairing only (see
155            // [`Dimension::SLICEABLE`]); the walk never reaches them.
156            Dimension::Scheduler
157            | Dimension::Topology
158            | Dimension::WorkType
159            | Dimension::RunSource
160            | Dimension::ResolveSource
161            | Dimension::CpuBudget => {
162                unreachable!("non-sliceable dimension {dim:?} in SLICEABLE walk")
163            }
164        };
165        if differs {
166            out.push(dim);
167        }
168    }
169    out
170}
171
/// Borrow `v` as a sorted list of its distinct values, so two
/// filter lists can be compared as sets (order and multiplicity
/// ignored).
fn sorted_dedup(v: &[String]) -> Vec<&str> {
    // A BTreeSet both removes duplicates and iterates in ascending
    // `str` order, which is the same order a sort would produce.
    let distinct: std::collections::BTreeSet<&str> = v.iter().map(String::as_str).collect();
    distinct.into_iter().collect()
}
178
179/// Render a side's filter values into a column-header label for
180/// the comparison table. `dims` is the slicing-dimension set —
181/// the only dims whose values vary between A and B. The label
182/// concatenates each dim's per-side filter value(s) with `:`
183/// between dim values (e.g. `"6.14.2:scx_rusty"` when both
184/// `kernel` and `scheduler` slice). For multi-value Vec filters
185/// (kernels, commits) the values join with `|` when there
186/// are ≤3; longer lists collapse to `"A"` or `"B"` (the bare
187/// side label) to keep the column header readable.
188///
189/// `bare_label` is `"A"` / `"B"`, used as the fallback when a
190/// slicing dim's filter has more than 3 values OR the slicing
191/// dim's filter is empty on this side (the slice exists because
192/// the OTHER side populated the filter — the empty-side label is
193/// the bare letter).
194pub(crate) fn render_side_label(
195    filter: &RowFilter,
196    dims: &[Dimension],
197    bare_label: &str,
198) -> String {
199    if dims.is_empty() {
200        return bare_label.to_string();
201    }
202    let mut parts: Vec<String> = Vec::new();
203    for &dim in dims {
204        let part = match dim {
205            Dimension::Kernel => render_vec_dim(&filter.kernels, bare_label),
206            Dimension::Scheduler => render_vec_dim(&filter.schedulers, bare_label),
207            Dimension::Topology => render_vec_dim(&filter.topologies, bare_label),
208            Dimension::WorkType => render_vec_dim(&filter.work_types, bare_label),
209            Dimension::ProjectCommit => render_vec_dim(&filter.project_commits, bare_label),
210            Dimension::KernelCommit => render_vec_dim(&filter.kernel_commits, bare_label),
211            Dimension::RunSource => render_vec_dim(&filter.run_sources, bare_label),
212            Dimension::ResolveSource => render_vec_dim(&filter.resolve_sources, bare_label),
213            Dimension::CpuBudget => render_vec_dim(&filter.cpu_budgets, bare_label),
214        };
215        parts.push(part);
216    }
217    parts.join(":")
218}
219
/// Render one slicing dimension's filter values as a label
/// segment.
///
/// The values are treated as a set: they are sorted and
/// deduplicated before anything else, matching the set comparison
/// `derive_slicing_dims` uses to decide that the dimension slices
/// in the first place. A repeated flag therefore renders once
/// (`6.14`, not `6.14|6.14`) and repeats do not count toward the
/// length limit.
///
/// * 1 to 3 distinct values: joined with `|` in sorted order.
/// * more than 3 distinct values: collapses to `bare_label` to keep
///   the column header readable.
/// * no values: also `bare_label` (the slice exists because the
///   OTHER side populated the same dim).
fn render_vec_dim(values: &[String], bare_label: &str) -> String {
    let mut distinct: Vec<&str> = values.iter().map(String::as_str).collect();
    distinct.sort_unstable();
    // `dedup` only removes adjacent repeats, so it must follow the sort.
    distinct.dedup();
    if distinct.is_empty() || distinct.len() > 3 {
        bare_label.to_string()
    } else {
        distinct.join("|")
    }
}
232
/// Dynamic pairing key for [`compare_rows_by`] — the tuple of
/// values on every NON-slicing dimension, plus the always-pinned
/// `scenario`. Two rows pair iff their dynamic keys match.
///
/// Stored as a `Vec<String>` so the same struct shape works for
/// any `pairing_dims` slice (the alternative — a tuple of
/// `Option<&str>` per dim — would force every consumer to know
/// the dim list at compile time, defeating the point of
/// dimension-set parametrisation).
///
/// First element is always `scenario`; subsequent elements
/// follow `pairing_dims` order (which is itself
/// [`Dimension::ALL`] order minus the slicing dims). Keys are only
/// meaningfully comparable when they were built from the same
/// `pairing_dims` list, since slot positions carry the dimension
/// identity.
///
/// The derived `PartialOrd` / `Ord` compare the inner `Vec`
/// lexicographically, i.e. by scenario first and then slot by slot.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, serde::Serialize)]
pub(crate) struct PairingKey(pub Vec<String>);
248
249impl PairingKey {
250    /// Extract the pairing key for `row` given the list of
251    /// dimensions to include. The scenario is ALWAYS the first
252    /// component; the `pairing_dims` list controls the rest.
253    /// Each non-scenario dim contributes a single string slot:
254    /// `Option<String>` fields render `None` as the empty
255    /// string, `Vec<String>` fields render as a sorted-deduped
256    /// `|`-joined string so the same set produces the same key
257    /// regardless of input order.
258    ///
259    /// Commit dimensions (`ProjectCommit`, `KernelCommit`) strip the
260    /// trailing `-dirty` suffix before contributing to the key.
261    /// Without the strip, a clean run at HEAD `abc1234` and a
262    /// dirty run at the same HEAD (`abc1234-dirty`) would shatter
263    /// into two separate pairing buckets, defeating
264    /// [`group_and_average_by`]'s `+mixed` cohort detection — that
265    /// helper can only surface "this aggregate has both clean and
266    /// dirty contributors" when the two contributors actually land
267    /// in the same group. Stripping at the key level pairs them by
268    /// canonical hex; the per-row `-dirty` distinction is preserved
269    /// downstream in the aggregate's `commit` / `kernel_commit`
270    /// field via the `+mixed` marker in
271    /// `group_and_average_by`'s `render_mixed_dirty` helper.
272    pub fn from_row(row: &GauntletRow, pairing_dims: &[Dimension]) -> Self {
273        let mut parts = Vec::with_capacity(1 + pairing_dims.len());
274        parts.push(row.scenario.clone());
275        for &dim in pairing_dims {
276            parts.push(match dim {
277                Dimension::Kernel => row.kernel_version.clone().unwrap_or_default(),
278                Dimension::Scheduler => row.scheduler.clone(),
279                Dimension::Topology => row.topology.clone(),
280                Dimension::WorkType => row.work_type.clone(),
281                Dimension::ProjectCommit => commit_pairing_key_part(&row.commit),
282                Dimension::KernelCommit => commit_pairing_key_part(&row.kernel_commit),
283                Dimension::RunSource => row.run_source.clone().unwrap_or_default(),
284                Dimension::ResolveSource => row.resolve_source.clone().unwrap_or_default(),
285                // Cross-budget rows never pair: a row's budget value
286                // becomes part of its pairing key (None -> empty, distinct
287                // from any real budget). A skip (None) only pairs with
288                // another skip.
289                Dimension::CpuBudget => row.cpu_budget.map(|n| n.to_string()).unwrap_or_default(),
290            });
291        }
292        PairingKey(parts)
293    }
294}
295
/// Canonicalize a commit dimension's value for use in a
/// [`PairingKey`] slot.
///
/// * `None` → empty string.
/// * `Some("abc1234")` → `"abc1234"` (already clean, unchanged).
/// * `Some("abc1234-dirty")` → `"abc1234"`, so a dirty run pairs
///   with its clean sibling. Only one trailing `-dirty` is removed.
///
/// Called by [`PairingKey::from_row`] for both the `ProjectCommit`
/// and `KernelCommit` arms. The clean/dirty distinction dropped
/// here is tracked separately by [`group_and_average_by`] through
/// its dirty-tracking accumulator and the `+mixed` marker.
fn commit_pairing_key_part(value: &Option<String>) -> String {
    value
        .as_deref()
        .map(|raw| raw.strip_suffix("-dirty").unwrap_or(raw))
        .unwrap_or_default()
        .to_string()
}
313
/// One aggregated `GauntletRow` produced by `group_and_average_by`,
/// plus the pass-bookkeeping needed to render the per-group summary
/// block (`N/M passed` + the `(S skip, I inc, F fail)` breakdown).
///
/// `row` carries arithmetic-mean metric values across every real
/// Pass contributor in the group; the (`scenario`, `topology`,
/// `work_type`, `scheduler`, `kernel_version`) identity is taken
/// verbatim from the first contributor in iteration order — every
/// contributor in the group shares the identity tuple by
/// construction (`scenario`, `topology`, and `work_type` ARE the
/// group key, and `scheduler` / `kernel_version` are
/// typed-filter-narrowed at the call site so they can only vary if
/// the operator passed no `--scheduler` / `--kernel` filter).
///
/// NOTE(review): the paragraph above reads like the legacy fixed
/// `(scenario, topology, work_type)` key; with the dynamic
/// [`PairingKey`] the group key is presumably `scenario` plus the
/// pairing dims — confirm against `group_and_average_by`.
///
/// The verdict bits on `row` (`passed`, `skipped`, `inconclusive`)
/// fold under the strict 4-state
/// `Fail > Inconclusive > Pass > Skip` lattice: any failing
/// contributor sets the aggregate to Fail (`passed=false`,
/// `inconclusive=false`, `skipped=false`); else any inconclusive
/// contributor sets `inconclusive=true`; else any skipped
/// contributor sets `skipped=true`; only an all-pass cohort yields
/// `passed=true`. The lattice mechanics match
/// `GauntletRow::is_pass`'s triple-conjunct, so the aggregated
/// row's accessor reads honestly. Aggregate rows that are not real
/// Pass route the pair through `compare_rows_by`'s
/// `excluded_pairs` gate.
///
/// `passes_observed`, `skips_observed`, `inconclusives_observed`,
/// `failures_observed` and `total_observed` count contributors per
/// the strict 4-state mutex: the four bucket counters sum to
/// `total_observed` because every contributor falls into exactly
/// one bucket. Only real Pass contributors feed the per-row sums —
/// failing, inconclusive, and skipped contributors all carry no
/// comparable per-run signal (failure-mode telemetry; "couldn't
/// evaluate" non-signal; "didn't run" non-signal). When no
/// contributor passed cleanly the running sum is zero and the
/// aggregate `row` carries default-zero metric values plus
/// `passed = false` — the downstream `excluded_pairs` gate then
/// drops the pair from the regression math.
///
/// The struct is `#[non_exhaustive]`: code outside this crate
/// cannot build it with a struct literal or destructure it without
/// a `..` rest pattern, which leaves room to add bookkeeping fields
/// later.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct AveragedGroup {
    /// Aggregated row carrying arithmetic-mean metric values plus
    /// the lattice-folded `(passed, skipped, inconclusive)` bits
    /// matching the `Fail > Inconclusive > Pass > Skip`
    /// dominance. `passed` is true only when every contributor was
    /// a real pass; `inconclusive` fires when at least one
    /// contributor was Inconclusive and none failed; `skipped`
    /// fires when at least one contributor was Skip and none
    /// failed or was Inconclusive. Fed directly into
    /// `compare_rows` (averaging is the fixed compare_partitions fold).
    pub row: GauntletRow,
    /// Number of contributors that were a real pass
    /// (`is_pass() == true`). Renders as the numerator of the
    /// per-group `N/M` summary.
    pub passes_observed: u32,
    /// Number of contributors that were Skip (`is_skip() == true`).
    /// Surfaced in the per-group rendering as the "S skipped"
    /// breakdown so an operator can distinguish "scenario didn't
    /// run" from real failures.
    pub skips_observed: u32,
    /// Number of contributors that were Inconclusive
    /// (`is_inconclusive() == true`). Surfaced in the per-group
    /// rendering as the "I inconclusive" breakdown so an operator
    /// can distinguish "couldn't evaluate" from real failures —
    /// same defense-in-depth pattern as
    /// `format_dimension_summary`'s inconc bucket.
    pub inconclusives_observed: u32,
    /// Number of contributors that were a real Fail
    /// (`is_fail() == true`). Surfaced in the per-group rendering
    /// as the "F failed" breakdown.
    pub failures_observed: u32,
    /// Total contributors in the group (`= group.len()`). Renders
    /// as the denominator of the per-group `N/M` summary.
    /// Mechanically:
    /// `total_observed == passes_observed + skips_observed +
    /// inconclusives_observed + failures_observed`
    /// under the strict 4-state mutex.
    pub total_observed: u32,
}
394
/// Fold one contributor's commit value into a group's
/// clean/dirty tracking state. [`group_and_average_by`] uses this
/// to notice when contributors of one group disagree on the
/// `-dirty` suffix of a commit dimension.
///
/// `value` is `Some(hex)`, `Some(hex-dirty)` or `None`:
///
/// * a value ending in `-dirty` sets `any_dirty`;
/// * any other `Some` value sets `any_clean`;
/// * `None` changes nothing and returns immediately.
///
/// `first_base` is filled once, by the first non-`None` value, with
/// that value's un-suffixed form. Keeping it separately lets the
/// `+mixed` marker attach to the canonical hex even when the
/// group's first contributor happened to be the dirty one.
///
/// This is meant to be called for EVERY contributor — passing,
/// failing or skipped. Mixed-dirty describes the cohort's
/// working-tree state rather than which contributors succeeded;
/// limiting it to passes would hide WIP-vs-committed disagreement
/// the operator needs to see.
fn update_dirty_tracking(
    value: &Option<String>,
    any_clean: &mut bool,
    any_dirty: &mut bool,
    first_base: &mut Option<String>,
) {
    let Some(raw) = value.as_deref() else { return };
    let stripped = raw.strip_suffix("-dirty");
    match stripped {
        Some(_) => *any_dirty = true,
        None => *any_clean = true,
    }
    // Only the first non-`None` contributor seeds the base.
    first_base.get_or_insert_with(|| stripped.unwrap_or(raw).to_string());
}
430
/// Choose the commit string shown on an aggregate row for one
/// commit dimension (project_commit or kernel_commit), given the
/// cohort-wide tracking state gathered by
/// [`update_dirty_tracking`].
///
/// * Both `any_clean` and `any_dirty` are set and a `first_base` is
///   available: the cohort mixes clean and dirty contributors, so
///   the result is `Some("{first_base}+mixed")`.
/// * Anything else (all clean, all dirty, all `None`): a clone of
///   `first_commit`, i.e. the first contributor's value verbatim.
///
/// `first_base` is the un-suffixed hex, so the marker reads
/// `abc1234+mixed` whether the first contributor was clean or
/// dirty — never `abc1234-dirty+mixed`.
fn render_mixed_dirty(
    any_clean: bool,
    any_dirty: bool,
    first_base: &Option<String>,
    first_commit: &Option<String>,
) -> Option<String> {
    match first_base {
        Some(base) if any_clean && any_dirty => Some(format!("{base}+mixed")),
        _ => first_commit.clone(),
    }
}
459
/// Per-pairing-group fold accumulator for [`group_and_average_by`].
/// Built via [`Accumulator::new`] from the group's first contributor,
/// fed one contributor at a time via [`Accumulator::observe`], and
/// folded into the emitted [`AveragedGroup`] via
/// [`Accumulator::into_averaged_group`]. Split out of
/// `group_and_average_by` only to satisfy the source-function size
/// guard — the field set and fold math are unchanged from the
/// in-function definition.
///
/// The lifetime `'a` ties the accumulator to the borrowed first
/// contributor held in `first`.
struct Accumulator<'a> {
    // The group's first contributor, borrowed; `new` stores it
    // without folding it, so `observe` must still be called for it.
    first: &'a GauntletRow,
    // Contributor counters. `observe` bumps `total_observed` for
    // every row and exactly one of the four bucket counters
    // (skip / fail / inconclusive / pass) per row.
    total_observed: u32,
    passes_observed: u32,
    skips_observed: u32,
    inconclusives_observed: u32,
    failures_observed: u32,
    // Sticky verdict flags: set by `observe` when at least one
    // contributor was a skip / fail / inconclusive / had
    // `expected_failure` set.
    any_skipped: bool,
    any_failed: bool,
    any_inconclusive: bool,
    any_expected_failure: bool,
    // Tracks whether contributors disagree on the `-dirty`
    // suffix for the project_commit / kernel_commit dimensions.
    // `any_*_clean` is true if any contributor's value is the
    // un-suffixed form; `any_*_dirty` is true if any contributor
    // ends in `-dirty`. When BOTH are true the aggregate is
    // mixed-dirty and the rendered `commit` / `kernel_commit`
    // gets a `+mixed` marker so downstream readers don't see a
    // single arbitrary contributor's status. Tracked across
    // EVERY contributor (passing, failing, skipped) — a mixed
    // working-tree state is metadata about the cohort, not
    // about the metric mean. `None` values are ignored
    // and do not flip either flag.
    any_project_clean: bool,
    any_project_dirty: bool,
    any_kernel_clean: bool,
    any_kernel_dirty: bool,
    // First-seen un-suffixed (clean-form) project / kernel
    // commit string. Held separately from `first` because
    // `first.commit` may be `Some("abc1234-dirty")` when the
    // first contributor was dirty but later contributors carry
    // the clean form — the rendered `+mixed` marker should
    // still attach to the canonical un-suffixed hex so the
    // operator sees `abc1234+mixed` not `abc1234-dirty+mixed`.
    first_project_base: Option<String>,
    first_kernel_base: Option<String>,
    // Sums across real Pass contributors only: `observe` returns
    // early for skip, fail and inconclusive rows before any of the
    // sums or maxes below are touched.
    // Counts are tracked per ext_metric key separately because
    // a key may be absent from some contributors.
    // Per-row sum for mean-fold fields (Counter / Gauge(Last) /
    // Gauge(Avg) — though no typed Gauge(Avg) field exists
    // today). Arithmetic mean across runs is the operator-
    // facing cohort-comparison default; per-RUN totals are
    // averaged to produce a comparable per-run quantity
    // across cohorts of different run counts.
    // The integer sums are accumulated with `saturating_add` in
    // `observe`, so they clamp instead of wrapping on overflow.
    sum_spread: f64,
    sum_migrations: u64,
    sum_migration_ratio: f64,
    sum_stuck_count: f64,
    sum_fallback_count: i64,
    sum_keep_last_count: i64,
    sum_total_iterations: u64,
    // sum_page_locality + sum_cross_node_mig removed: both NUMA roll-ups are now
    // ext-sourced (worst_page_locality = WorstLowest, worst_cross_node_migration_ratio
    // = WorstCrossNodeRatio), re-pooled from the per-phase carriers and
    // cross-run-MEAN-folded via the ext fold like the other migrated worst_*
    // selectors — not typed GauntletRow columns, so no per-row group-average
    // accumulators.
    // Per-row MAX-fold for Peak-kind fields. Per
    // `MetricKind::Peak` contract, cross-RUN aggregation
    // surfaces the worst-instant observed across the cohort —
    // averaging Peak across runs dilutes the high-water signal
    // (a 1-run spike at 100 averaged with 4 runs at 0 reports
    // 20, hiding the actual peak). MAX preserves "did this
    // peak ever fire in this cohort".
    max_gap_ms: u64,
    max_imbalance_ratio: f64,
    max_max_dsq_depth: u32,
    // Per-ext-metric (value, weight) pairs, accumulated across
    // contributors. At emit time the kind-aware fold dispatches
    // each key through `aggregate_samples` with `Some(&weights)`
    // so Gauge(Avg) metrics get a weighted mean (per the F-C
    // fix on aggregate_samples) and other kinds fold by their
    // own semantics. Unregistered metric names (no MetricDef)
    // fall back to arithmetic mean — same legacy semantic the
    // previous (sum, u32) shape produced.
    ext_pairs: BTreeMap<String, Vec<(f64, usize)>>,
    // Union of the Dynamic monotonic-counter ext keys across contributors
    // (`GauntletRow::ext_counter_keys`). `fold_ext_metrics` SUM-folds these
    // instead of averaging — they are not in the static `METRICS` registry, so
    // `metric_def` can't classify them. Carried onto the aggregated row so a
    // second-level cross-RUN fold keeps SUM-folding them.
    ext_counter_keys: BTreeSet<String>,
    // Sum of `run_sample_count` across contributors. Carries
    // through to the aggregated row's `run_sample_count` so a
    // downstream cross-RUN consumer that further folds these
    // already-aggregated rows can apply the same weighted
    // semantic. Currently no typed Gauge(Avg) field exists
    // (imbalance_ratio is registered as `max_imbalance_ratio`
    // kind=Peak, NOT Gauge(Avg) — the Gauge(Avg) sibling
    // `avg_imbalance_ratio` lands in ext_metrics where the
    // weighted-mean dispatch already fires); the sum is
    // preserved here for future typed-field Gauge(Avg)
    // additions and for downstream cohort-of-cohort
    // aggregation that wants a meaningful weight.
    sum_run_sample_count: usize,
}
565
566impl<'a> Accumulator<'a> {
567    /// Seed an accumulator from the group's first contributor.
568    /// Identity is taken from `first`; every counter / sum / max
569    /// starts at its zero value. `observe` (called once per
570    /// contributor, including `first`) performs the per-row fold.
571    fn new(first: &'a GauntletRow) -> Self {
572        Accumulator {
573            first,
574            total_observed: 0,
575            passes_observed: 0,
576            skips_observed: 0,
577            inconclusives_observed: 0,
578            failures_observed: 0,
579            any_skipped: false,
580            any_failed: false,
581            any_inconclusive: false,
582            any_expected_failure: false,
583            any_project_clean: false,
584            any_project_dirty: false,
585            any_kernel_clean: false,
586            any_kernel_dirty: false,
587            first_project_base: None,
588            first_kernel_base: None,
589            sum_spread: 0.0,
590            sum_migrations: 0,
591            sum_migration_ratio: 0.0,
592            sum_stuck_count: 0.0,
593            sum_fallback_count: 0,
594            sum_keep_last_count: 0,
595            sum_total_iterations: 0,
596            max_gap_ms: 0,
597            max_imbalance_ratio: 0.0,
598            max_max_dsq_depth: 0,
599            ext_pairs: BTreeMap::new(),
600            ext_counter_keys: BTreeSet::new(),
601            sum_run_sample_count: 0,
602        }
603    }
604
605    /// Fold one contributor into the accumulator. Called once per
606    /// row in the group (including the group's first contributor).
607    /// Skip / fail / inconclusive contributors flip their verdict
608    /// bits and return early without feeding the metric sums; only
609    /// real passes contribute to the per-row sums and maxes.
610    fn observe(&mut self, row: &GauntletRow) {
611        self.total_observed += 1;
612        // Dirty-status tracking spans ALL contributors. Same hex
613        // with mixed dirty/clean across the cohort is the case the
614        // `+mixed` marker exists to surface — the per-row scope
615        // (passing, failing, skipped) is irrelevant since the
616        // marker describes WIP-vs-committed disagreement among the
617        // contributors, not their metric outcomes.
618        update_dirty_tracking(
619            &row.commit,
620            &mut self.any_project_clean,
621            &mut self.any_project_dirty,
622            &mut self.first_project_base,
623        );
624        update_dirty_tracking(
625            &row.kernel_commit,
626            &mut self.any_kernel_clean,
627            &mut self.any_kernel_dirty,
628            &mut self.first_kernel_base,
629        );
630        if row.expected_failure {
631            // An expect_err / expect_auto_repro run inverted to a pass:
632            // OR the flag so the aggregated row stays OUT of the
633            // ab-compare regression math (its telemetry is
634            // failure-mode-dominated). Its metrics may still fold into
635            // the cohort sums below, but compare_rows_by excludes any
636            // expected_failure row, so the aggregate is never read.
637            self.any_expected_failure = true;
638        }
639        if row.is_skip() {
640            self.any_skipped = true;
641            self.skips_observed += 1;
642            return;
643        }
644        if row.is_fail() {
645            self.any_failed = true;
646            self.failures_observed += 1;
647            return;
648        }
649        if row.is_inconclusive() {
650            // Inconclusive contributors are not passes (the gate
651            // could not be evaluated) and carry no measured signal
652            // worth folding into the cohort means. Track the bit
653            // for the aggregated verdict's `inconclusive` field
654            // (so the aggregate row reads Inconclusive in the
655            // `Fail > Inconclusive > Pass > Skip` lattice when no
656            // contributor failed) and skip the per-row sums.
657            self.any_inconclusive = true;
658            self.inconclusives_observed += 1;
659            return;
660        }
661        self.passes_observed += 1;
662        self.sum_spread += row.spread;
663        self.sum_migrations = self.sum_migrations.saturating_add(row.migrations);
664        self.sum_migration_ratio += row.migration_ratio;
665        self.sum_stuck_count += row.stuck_count;
666        self.sum_fallback_count = self.sum_fallback_count.saturating_add(row.fallback_count);
667        self.sum_keep_last_count = self.sum_keep_last_count.saturating_add(row.keep_last_count);
668        self.sum_total_iterations = self
669            .sum_total_iterations
670            .saturating_add(row.total_iterations);
671        // Peak-kind typed fields: cross-RUN aggregation surfaces
672        // the worst-instant observed across the cohort, NOT the
673        // arithmetic mean (which dilutes a single peak across
674        // many quiet runs and hides the high-water signal).
675        self.max_gap_ms = self.max_gap_ms.max(row.gap_ms);
676        if row.imbalance_ratio > self.max_imbalance_ratio {
677            self.max_imbalance_ratio = row.imbalance_ratio;
678        }
679        self.max_max_dsq_depth = self.max_max_dsq_depth.max(row.max_dsq_depth);
680        self.sum_run_sample_count = self
681            .sum_run_sample_count
682            .saturating_add(row.run_sample_count);
683        // Floor the cross-RUN weight at 1: a passing run that emitted this ext
684        // key contributes one observation to a Gauge(Avg) weighted mean, never
685        // zero-weighted out of a mixed cohort. A run with run_sample_count==0
686        // (e.g. snapshot-bridge-sourced metrics with no monitor samples) would
687        // otherwise be silently dropped from the mean. Matches the .max(1) floors
688        // at run_metrics.rs (populate_run_ext_metrics_from_phases) and
689        // stats_types.rs (merge_metric_values).
690        for (k, v) in &row.ext_metrics {
691            self.ext_pairs
692                .entry(k.clone())
693                .or_default()
694                .push((*v, row.run_sample_count.max(1)));
695        }
696        // Union the Dynamic monotonic-counter key tags across contributors.
697        // Load-bearing, NOT merely defensive: per-run bpf-field resolution (and
698        // which topology levels are present) can vary within a pairing group, so
699        // a key tagged in only some rows must still be recognized as a counter
700        // for the whole group. fold_ext_metrics SUM-folds the union, not means.
701        self.ext_counter_keys
702            .extend(row.ext_counter_keys.iter().cloned());
703    }
704
705    /// Emit the folded [`AveragedGroup`] for this group. Identity
706    /// fields are first-seen; metric fields are the kind-correct
707    /// cross-RUN fold (mean for Counter / mean-fold, MAX for Peak,
708    /// rounded mean for integer-typed fields); the verdict bits
709    /// fold under the `Fail > Inconclusive > Pass > Skip` lattice.
710    fn into_averaged_group(self) -> AveragedGroup {
711        let acc = self;
712        let n = acc.passes_observed;
713        let denom = if n == 0 { 1.0 } else { f64::from(n) };
714        // Rounded mean for integer-typed Counter / mean-fold
715        // fields. When n == 0 the sums are all zero, so dividing
716        // by 1.0 still yields 0 — the aggregate's passed=false
717        // routes the pair through excluded_pairs downstream and
718        // the metrics are never consulted. Peak-kind integer
719        // fields (max_dsq_depth) take the MAX-fold path directly
720        // and don't need a rounding helper.
721        let round_u64 = |sum: u64| -> u64 { (sum as f64 / denom).round() as u64 };
722        let round_i64 = |sum: i64| -> i64 { (sum as f64 / denom).round() as i64 };
723
724        // Mixed-dirty markers. When the cohort contains both a
725        // clean-form and dirty-form contributor for the same hex
726        // (e.g. some sidecars from a clean tree, others from a
727        // -dirty WIP), the rendered commit field carries `+mixed`
728        // appended to the canonical un-suffixed hex. The
729        // alternative — taking `acc.first.commit` verbatim — would
730        // hide WIP-vs-committed disagreement, presenting `abc1234`
731        // when half the contributors actually came from a dirty
732        // tree (or `abc1234-dirty` when half came from a clean
733        // tree). Operators reading averaged stats need to know the
734        // cohort spanned a working-tree state change, since that
735        // changes the meaning of the metric mean. `+mixed` is the
736        // chosen separator (not `-mixed`) so it cannot be confused
737        // with the existing `-dirty` suffix grammar — `dirty` is a
738        // per-record property, `mixed` is a cohort-level property.
739        let project_commit_rendered = render_mixed_dirty(
740            acc.any_project_clean,
741            acc.any_project_dirty,
742            &acc.first_project_base,
743            &acc.first.commit,
744        );
745        let kernel_commit_rendered = render_mixed_dirty(
746            acc.any_kernel_clean,
747            acc.any_kernel_dirty,
748            &acc.first_kernel_base,
749            &acc.first.kernel_commit,
750        );
751        // ext_metrics is built BEFORE the struct so Rate keys can be
752        // re-derived from the folded components as a post-pass. Rate and PerPhase
753        // are skipped here: Rate's components survive cross-RUN as their own ext
754        // keys so it re-derives Σnum/Σdenom (folding two ready-made ratios would
755        // lose the re-pool, and routing a Rate through
756        // aggregate_samples_weighted would hit the aggregate_finite guard);
757        // PerPhase is a per-phase-only scalar with no cross-RUN aggregate.
758        // Distribution / WorstLowest / WakeLatencyTailRatio / WorstCrossNodeRatio
759        // are NOT skipped — their raw components do
760        // NOT survive cross-RUN (phases are dropped), so there is no pooled set
761        // to re-derive; they fall through to aggregate_samples_weighted and
762        // fold by kind (MEAN for the percentile / CV / mean reductions and
763        // every WorstLowest, MAX for SampleReduction::Worst — the
764        // aggregate_finite arms). Dispatch by registered MetricKind so
765        // Gauge(Avg) gets the weighted-mean fold (matches the per-phase merge
766        // contract); unregistered names (no metric_def) fall back to
767        // arithmetic mean, the legacy (sum, count) semantic. Skip a key whose
768        // reduction is None (every value NaN — defensive post sidecar_to_row
769        // sanitize).
770        let ext_metrics = fold_ext_metrics(acc.ext_pairs, &acc.ext_counter_keys);
771        let aggregated = GauntletRow {
772            scenario: acc.first.scenario.clone(),
773            // Per-test gate declarations are identical across a test's grouped
774            // runs (same entry), so the first row's carry the group's.
775            perf_delta_assertions: acc.first.perf_delta_assertions.clone(),
776            topology: acc.first.topology.clone(),
777            work_type: acc.first.work_type.clone(),
778            scheduler: acc.first.scheduler.clone(),
779            kernel_version: acc.first.kernel_version.clone(),
780            commit: project_commit_rendered,
781            kernel_commit: kernel_commit_rendered,
782            run_source: acc.first.run_source.clone(),
783            resolve_source: acc.first.resolve_source.clone(),
784            // First-seen budget metadata, like scheduler/kernel_version
785            // above. CpuBudget is a PAIRING dim (not sliceable — see
786            // Dimension::SLICEABLE), so it is part of the group key and every
787            // contributor shares one budget; the first row's value is the
788            // group's. vcpus is likewise first-seen metadata — and is NOT a
789            // Dimension. No post-aggregation consumer reads the aggregated
790            // vcpus (`render_overcommit_warning` and the other overcommit
791            // checks run pre-aggregation on the raw rows), so the first-seen
792            // value is metadata only.
793            cpu_budget: acc.first.cpu_budget,
794            vcpus: acc.first.vcpus,
795            // ALL must pass: any failed, inconclusive, or skipped
796            // contributor flips the aggregate. A group with zero
797            // passes_observed (every contributor failed, was
798            // inconclusive, or was skipped) collapses to
799            // passed=false here. The four-bit verdict is
800            // strict 4-state (exactly one of pass/skip/inconc/fail
801            // set per row); the lattice
802            // `Fail > Inconclusive > Pass > Skip` determines which
803            // bit dominates when a cohort has mixed contributors.
804            // Skip is the lowest-precedence bit — it fires only
805            // when no contributor failed AND no contributor was
806            // inconclusive AND at least one was skipped. Fail
807            // (all-false) dominates Inconclusive dominates Skip;
808            // exactly one of the four states is encoded per row.
809            passed: !acc.any_failed && !acc.any_inconclusive && !acc.any_skipped && n > 0,
810            skipped: !acc.any_failed && !acc.any_inconclusive && acc.any_skipped,
811            inconclusive: !acc.any_failed && acc.any_inconclusive,
812            expected_failure: acc.any_expected_failure,
813            // Sum across contributors so the aggregated row's
814            // weight is the cohort's total sample population. A
815            // downstream consumer that further folds these
816            // aggregated rows can apply the same weighted semantic
817            // (a 5-RUN cohort of 50-sample runs weighs 250 vs a
818            // 1-RUN cohort of 10 samples weighting 10).
819            run_sample_count: acc.sum_run_sample_count,
820            spread: acc.sum_spread / denom,
821            // Peak-kind typed fields: MAX across runs (kind-correct
822            // cross-RUN fold; arithmetic mean dilutes the
823            // worst-instant signal).
824            gap_ms: acc.max_gap_ms,
825            imbalance_ratio: acc.max_imbalance_ratio,
826            max_dsq_depth: acc.max_max_dsq_depth,
827            migrations: round_u64(acc.sum_migrations),
828            migration_ratio: acc.sum_migration_ratio / denom,
829            stuck_count: acc.sum_stuck_count / denom,
830            fallback_count: round_i64(acc.sum_fallback_count),
831            keep_last_count: round_i64(acc.sum_keep_last_count),
832            total_iterations: round_u64(acc.sum_total_iterations),
833            ext_metrics,
834            // Carry the Dynamic counter-key tags forward so a second-level
835            // cross-RUN fold of these already-aggregated rows keeps SUM-folding
836            // them (the SUM-of-SUMs stays a SUM).
837            ext_counter_keys: acc.ext_counter_keys,
838            // Phase buckets do not aggregate cleanly across an
839            // averaged group: two contributors might run different
840            // scenarios with different phase counts, and per-phase
841            // averaging across mismatched step_index sets would
842            // invent rows neither side carried. Surface the empty
843            // slice so downstream consumers fall back to the flat
844            // bucket. Averaged groups carry no per-phase data; the
845            // per-step_index intersection + one-sided-step surfacing
846            // semantic lives in the per-run noise path
847            // (noise_phase_findings), not the averaging path.
848            phases: Vec::new(),
849        };
850        AveragedGroup {
851            row: aggregated,
852            passes_observed: acc.passes_observed,
853            skips_observed: acc.skips_observed,
854            inconclusives_observed: acc.inconclusives_observed,
855            failures_observed: acc.failures_observed,
856            total_observed: acc.total_observed,
857        }
858    }
859}
860
861/// Fold one group's accumulated per-ext-metric (value, weight) pairs
862/// into the aggregated row's `ext_metrics` map. Rate, PerPhase, and
863/// PerRunDistribution are skipped in the kind dispatch: Rate's components
864/// survive cross-RUN as their own ext keys so it re-derives Σnum/Σdenom
865/// (folding two ready-made ratios would lose the re-pool, and routing a Rate
866/// through aggregate_samples_weighted would hit the aggregate_finite guard);
867/// PerPhase is a per-phase-only scalar with no cross-RUN aggregate;
868/// PerRunDistribution is a whole-run percentile/min/max that CANNOT be
869/// cross-RUN folded (a percentile of a union is not a mean of per-run
870/// percentiles, and the per-phase histograms are dropped cross-RUN so there is
871/// no pooled set to re-derive) — its only cross-RUN consumer is the per-run
872/// noise-compare (`noise_findings`), so it must never be averaged here.
873/// Distribution / WorstLowest / WakeLatencyTailRatio / WorstCrossNodeRatio
874/// are NOT skipped — their raw components do
875/// NOT survive cross-RUN (phases are dropped), so there is no pooled set
876/// to re-derive; they fall through to aggregate_samples_weighted and
877/// fold by kind (MEAN for the percentile / CV / mean reductions and
878/// every WorstLowest, MAX for SampleReduction::Worst — the
879/// aggregate_finite arms). Dispatch by registered MetricKind so
880/// Gauge(Avg) gets the weighted-mean fold (matches the per-phase merge
881/// contract); an unregistered name (no metric_def) folds by `counter_keys`: a
882/// Dynamic monotonic-counter key (lb_*/alb_* schedstat delta, ScalarCounter bpf
883/// field) SUM-folds (matching the registered-Counter convention), every other
884/// unregistered name falls back to arithmetic mean, the legacy (sum, count)
885/// semantic. Skip a key whose
886/// reduction is None (every value NaN — defensive post sidecar_to_row
887/// sanitize). Rate metrics are then re-derived from the folded
888/// components (Σnum/Σdenom) as a post-pass.
889fn fold_ext_metrics(
890    ext_pairs: BTreeMap<String, Vec<(f64, usize)>>,
891    counter_keys: &BTreeSet<String>,
892) -> BTreeMap<String, f64> {
893    let mut ext_metrics: std::collections::BTreeMap<String, f64> = ext_pairs
894        .into_iter()
895        .filter_map(|(k, pairs)| {
896            if let Some(def) = metric_def(&k) {
897                // Rate re-derives from its folded components (post-pass below);
898                // PerPhase is a per-phase-only scalar with no meaningful
899                // cross-RUN aggregate — both are skipped here. The PerPhase skip
900                // is also load-bearing: a PerPhase key reaching
901                // aggregate_samples_weighted would hit aggregate_finite's
902                // unreachable!() arm. (PerPhase keys should never reach here —
903                // populate_run_ext_metrics_from_phases skips is_derived keys —
904                // so this is defensive belt-and-suspenders.)
905                if matches!(
906                    def.kind,
907                    MetricKind::Rate { .. } | MetricKind::PerPhase | MetricKind::PerRunDistribution
908                ) {
909                    return None;
910                }
911                aggregate_samples_weighted(&pairs, def.kind).map(|v| (k, v))
912            } else {
913                let n = pairs.len();
914                if n == 0 {
915                    None
916                } else {
917                    let sum: f64 = pairs.iter().map(|(v, _)| *v).sum();
918                    // A Dynamic monotonic-counter key (lb_*/alb_* schedstat delta
919                    // or a ScalarCounter bpf field) SUM-folds across runs,
920                    // matching the registered-Counter convention (aggregate_finite's
921                    // Counter arm). Untagged keys (gauges, per-CPU _avg/_max) keep
922                    // the legacy arithmetic-mean fold.
923                    let v = if counter_keys.contains(&k) {
924                        sum
925                    } else {
926                        sum / n as f64
927                    };
928                    Some((k, v))
929                }
930            }
931        })
932        .collect();
933    // Re-derive Rate metrics from the folded components (Σnum/Σdenom).
934    derive_rate_metrics(&mut ext_metrics);
935    ext_metrics
936}
937
938/// Group `rows` by the dynamic pairing key (`scenario` plus every
939/// dimension in `pairing_dims`) and arithmetic-mean their metric
940/// fields, returning one [`AveragedGroup`] per distinct key.
941/// Slicing dims are EXCLUDED from `pairing_dims` (rows on the A/B
942/// sides differ on them by design); pairing dims are INCLUDED.
943///
944/// Group key matches [`compare_rows_by`]' pairing key so the post-
945/// aggregation row vec joins cleanly across A/B sides under the
946/// same identity contract.
947///
948/// Aggregation rules:
949/// - The verdict bits `(passed, skipped, inconclusive)` aggregate
950///   under the strict 4-state mutex per the
951///   `Fail > Inconclusive > Pass > Skip` lattice. Fail (all-false)
952///   dominates: any failed contributor flips the aggregate's
953///   `passed` to `false` and leaves `skipped`/`inconclusive` clear,
954///   yielding Fail at the aggregate level. Otherwise Inconclusive
955///   dominates: any inconclusive contributor sets the aggregate's
956///   `inconclusive = true`. Otherwise Skip dominates: any skipped
957///   contributor sets `skipped = true`. Only when every contributor
958///   was a real Pass does the aggregate carry `passed = true`. This
959///   matches [`GauntletRow::is_pass`]'s triple-conjunct semantics
960///   so the aggregate's accessor reads honestly.
961/// - Metrics (`f64` / `u64` / `i64` fields, plus `ext_metrics`
962///   entries) are summed only across contributors where
963///   `passed && !skipped`, then divided by that count to yield an
964///   arithmetic mean. Failing/skipped contributors carry telemetry
965///   dominated by the failure mode, NOT scheduler behaviour, and
966///   are therefore excluded from the mean. When no contributor
967///   passed cleanly, every metric defaults to zero and the
968///   aggregate's `passed = false` routes the pair to
969///   [`compare_rows_by`]' `excluded_pairs` gate.
970/// - `u64` / `i64` fields take the rounded mean
971///   (`(sum / count).round() as u64`). The up-to-0.5-unit per-side
972///   rounding error (up to 1.0 across an A/B pair) stays below each
973///   such field's `default_abs` gate: the smallest is
974///   `total_iterations` / `total_migrations` at 2.0, held `>= 2.0` by
975///   the scale-varying #28 recalibration precisely so a rounding-only
976///   delta (`<= 1.0`) never clears the gate and fabricates a unit
977///   regression.
978/// - `stuck_count` is the exception: it is `f64` and carries the
979///   EXACT mean (`sum / count`, no rounding). Its `default_abs` is
980///   1.0 — tight enough that a rounded mean's up-to-1.0 per-A/B-pair
981///   error would fabricate single-stall regressions from sub-integer
982///   differences (an A-side mean of 1.4 vs a B-side 1.6 rounds to
983///   1 vs 2, a spurious delta of 1).
984/// - `ext_metrics` keys are unioned across passing contributors;
985///   each key's mean is computed only across contributors that
986///   carried it. A key present in some passing rows and absent
987///   from others uses the present-only count as its denominator —
988///   absent-and-zero are not equivalent (the `BTreeMap<String,
989///   f64>` shape cannot represent "absent" with a stored zero).
990/// - Identity fields (`scenario`, `topology`, `work_type`,
991///   `scheduler`, `kernel_version`) come from the first contributor
992///   in iteration order. Every contributor in the group shares the
993///   first three by construction (group key); `scheduler` and
994///   `kernel_version` may vary across the group if the operator did
995///   not narrow via typed filters first, but the aggregated row
996///   carries the first contributor's value in any case — the join
997///   downstream uses the three-tuple, so scheduler/version on the
998///   aggregate is metadata, not a join key.
999/// - Commit dimensions (`commit`, `kernel_commit`) follow a
1000///   first-seen rule with one exception: when contributors disagree
1001///   on the `-dirty` suffix for the same canonical hex (some clean,
1002///   some dirty), the rendered form becomes `{hex}+mixed` so the
1003///   working-tree disagreement is surfaced rather than hidden by
1004///   first-seen. `+mixed` (not `-mixed`) is intentional —
1005///   `-dirty` is a per-record property of one sidecar, `+mixed`
1006///   is a cohort-level property of the average. Mixed-dirty
1007///   tracking spans EVERY contributor (passing, failing, skipped)
1008///   because the cohort's WIP state is metadata, not a metric.
1009///
1010/// Group iteration order matches the order of FIRST appearance of
1011/// each key in `rows`; `BTreeMap` ordering is by key (not iteration
1012/// order) so we maintain a parallel `Vec<key>` to preserve
1013/// first-seen ordering. Stable order keeps test fixtures
1014/// deterministic across runs.
1015pub fn group_and_average_by(
1016    rows: &[GauntletRow],
1017    pairing_dims: &[Dimension],
1018) -> Vec<AveragedGroup> {
1019    // Dynamic pairing key — scenario + every NON-slicing
1020    // dimension's value, in [`Dimension::ALL`] order. The
1021    // `PairingKey` newtype is owned (`Vec<String>`) so the
1022    // BTreeMap can hold keys without lifetime gymnastics; the
1023    // alternative — borrowing slices into `rows` — would force
1024    // every consumer to keep `rows` alive for the duration of
1025    // the map.
1026    type Key = PairingKey;
1027
1028    let mut order: Vec<Key> = Vec::new();
1029    let mut groups: BTreeMap<Key, Accumulator<'_>> = BTreeMap::new();
1030
1031    for row in rows {
1032        let key = PairingKey::from_row(row, pairing_dims);
1033        let acc = groups.entry(key.clone()).or_insert_with(|| {
1034            order.push(key);
1035            Accumulator::new(row)
1036        });
1037        acc.observe(row);
1038    }
1039
1040    let mut out = Vec::with_capacity(order.len());
1041    for key in order {
1042        let acc = groups
1043            .remove(&key)
1044            .expect("first-seen key must still be in groups map");
1045        out.push(acc.into_averaged_group());
1046    }
1047    out
1048}
1049
1050/// Convert a SidecarResult to a GauntletRow for run-to-run comparison.
1051///
1052/// Non-finite f64 values (NaN, ±Infinity) are sanitized to 0.0 with a
1053/// warn before they reach the row. `serde_json::to_string` rejects
1054/// non-finite, so a single poisoned metric would otherwise halt every
1055/// downstream JSON write. Sanitizing at the ingress boundary keeps the
1056/// serializer happy without silencing the upstream data quality issue.
1057///
1058/// # NaN → 0.0 ambiguity for zero-meaningful metrics
1059///
1060/// The 0.0 substitution is indistinguishable from a legitimate 0.0
1061/// measurement for metrics whose natural zero carries its own signal.
1062/// One direct f64 field is especially affected — note the in-tree producer
1063/// already guards the typical divide-by-zero path
1064/// (`assert::reductions::migration_ratio_of` emits `0.0` for
1065/// migration_ratio when `total_iterations == 0`), so a NaN reaching
1066/// this boundary indicates an upstream producer outside that guard (e.g. an
1067/// external `ext_metrics` contributor, or a schedstat arithmetic
1068/// edge that slipped past a guard):
1069///
1070/// - `migration_ratio`: lower-better. A real 0.0 means "no task was
1071///   migrated" (ideal locality). A sanitized NaN collapses to the
1072///   same value and reads as *falsely good* — a downstream regression
1073///   gate sees "perfect locality" where the truth is "no data".
1074///   (`page_locality` is NO LONGER a finite_or_zero typed field: it is the
1075///   ext-sourced `worst_page_locality` WorstLowest metric, re-pooled from the
1076///   per-phase NUMA carriers, so a non-finite value is DROPPED via the ext path —
1077///   absence preserved — not coerced to a falsely-bad 0.0 here.)
1078///
1079/// The reclassified wake-latency / run-delay distributions (e.g.
1080/// `worst_wake_latency_cv`) are NO LONGER direct f64 fields — they flow
1081/// through `ext_metrics`, where a non-finite value is DROPPED (the entry is
1082/// absent), NOT substituted with 0.0. That is the opposite, no-false-zero
1083/// contract: an absent key reads as no-data, distinct from a measured 0.0.
1084///
1085/// The accompanying `tracing::warn!` is the only signal that
1086/// separates a sanitized NaN from a real 0.0; downstream aggregation
1087/// by value alone cannot distinguish them.
1088pub fn sidecar_to_row(sc: &crate::test_support::SidecarResult) -> GauntletRow {
1089    // Local closure so the warn can carry the scenario name as
1090    // context — keyed by field so the operator can pinpoint which
1091    // metric produced the bad value.
1092    let finite_or_zero = |field: &str, v: f64| -> f64 {
1093        if v.is_finite() {
1094            v
1095        } else {
1096            tracing::warn!(
1097                test = %sc.test_name,
1098                field,
1099                value = v,
1100                "non-finite f64 in GauntletRow field; substituting 0.0",
1101            );
1102            0.0
1103        }
1104    };
1105
1106    // Build ext_metrics from the in-guest payload map (dropping the
1107    // walk-truncation sentinel + non-finite values), then layer in the
1108    // host-side monitor schedstat aggregates below.
1109    let mut ext_metrics: BTreeMap<String, f64> = sc
1110        .stats
1111        .ext_metrics
1112        .iter()
1113        .filter_map(|(k, &v)| {
1114            if crate::test_support::is_truncation_sentinel_name(k) {
1115                return None;
1116            }
1117            if v.is_finite() {
1118                Some((k.clone(), v))
1119            } else {
1120                tracing::warn!(
1121                    test = %sc.test_name,
1122                    metric = %k,
1123                    value = v,
1124                    "dropping non-finite ext_metric; serde_json rejects NaN/Infinity",
1125                );
1126                None
1127            }
1128        })
1129        .collect();
1130    // System-wide schedstat aggregates, read host-side from guest memory
1131    // at freeze (zero observer effect; `MonitorSummary::schedstat_deltas`,
1132    // summed across CPUs over the run). Keys ABSENT when CONFIG_SCHEDSTATS
1133    // is off (schedstat_deltas == None): absent != 0 for a no-data run, and
1134    // a 0 would pollute the cross-run Counter SUM and the Rate denominators
1135    // (`total_pcount`, `total_ttwu_count`). All seven
1136    // insert under one `if let` so each Rate's numerator/denominator pair is
1137    // always co-present (derive_rate_metrics needs both). `u64 -> f64` is
1138    // exact below 2^53 and inherently finite, so these skip the finite
1139    // filter the payload keys go through. The registry entries are
1140    // `Polarity::Informational` Counter raw components that feed nine
1141    // `MetricKind::Rate` derivations (per-schedule: total_run_delay_ns_per_sched,
1142    // ttwu_local_fraction, sched_goidle_fraction; per-second: run_delay_per_sec,
1143    // pcount_per_sec, sched_count_per_sec, yld_count_per_sec, ttwu_count_per_sec,
1144    // sched_goidle_per_sec); see [`crate::stats::METRICS`].
1145    if let Some(sd) = sc
1146        .monitor
1147        .as_ref()
1148        .and_then(|m| m.schedstat_deltas.as_ref())
1149    {
1150        ext_metrics.insert("total_run_delay".to_string(), sd.total_run_delay as f64);
1151        ext_metrics.insert("total_pcount".to_string(), sd.total_pcount as f64);
1152        ext_metrics.insert("total_sched_count".to_string(), sd.total_sched_count as f64);
1153        ext_metrics.insert("total_yld_count".to_string(), sd.total_yld_count as f64);
1154        ext_metrics.insert(
1155            "total_sched_goidle".to_string(),
1156            sd.total_sched_goidle as f64,
1157        );
1158        ext_metrics.insert("total_ttwu_count".to_string(), sd.total_ttwu_count as f64);
1159        ext_metrics.insert("total_ttwu_local".to_string(), sd.total_ttwu_local as f64);
1160        // Per-second Rate denominator: the schedstat-window span, co-inserted
1161        // both-or-neither with the total_* numerators above so every *_per_sec
1162        // schedstat Rate has its matching-window denominator present (the
1163        // derive_rate_metrics num+den co-presence invariant; the same window the
1164        // total_* deltas span, so num/den share a time base).
1165        ext_metrics.insert(
1166            "total_schedstat_wall_sec".to_string(),
1167            sd.total_schedstat_wall_sec,
1168        );
1169    }
1170    // Run-level ext-only monitor metrics (avg_nr_running + the PELT IRQ load
1171    // pair + the PSI-irq pair), folded from the run's MonitorSummary. Inserted
1172    // only when the run has monitor samples (a 0-sample run carries no
1173    // occupancy / IRQ signal — absent, not a false 0.0); the IRQ fields insert
1174    // only on Some (loud-absent on a kernel without the source). Shared with
1175    // VmResult::run_metric via fold_run_level_ext so the key list + loud-absent
1176    // guard can't drift between the sidecar row and the in-test accessor.
1177    // Dynamic monotonic-counter ext keys (lb_*/alb_* schedstat deltas + any
1178    // ScalarCounter bpf field) collected alongside the values so the cross-run
1179    // fold SUM-folds them (they are not in the static METRICS registry, so
1180    // metric_def can't classify them — see fold_ext_metrics).
1181    let mut ext_counter_keys = BTreeSet::new();
1182    if let Some(m) = sc.monitor.as_ref() {
1183        m.fold_run_level_ext_with_counter_keys(&mut ext_metrics, &mut ext_counter_keys);
1184    }
1185
1186    GauntletRow {
1187        scenario: sc.test_name.clone(),
1188        perf_delta_assertions: sc.perf_delta_assertions.clone(),
1189        topology: sc.topology.clone(),
1190        work_type: sc.work_type.clone(),
1191        scheduler: sc.scheduler.clone(),
1192        kernel_version: sc.kernel_version.clone(),
1193        commit: sc.project_commit.clone(),
1194        kernel_commit: sc.kernel_commit.clone(),
1195        run_source: sc.run_source.clone(),
1196        resolve_source: sc.resolve_source.clone(),
1197        // 0 = skip rows (never booted) -> None: skips carry no budget
1198        // identity, so they don't pair into a "budget 0" bucket.
1199        cpu_budget: (sc.cpu_budget != 0).then_some(sc.cpu_budget),
1200        vcpus: (sc.vcpus != 0).then_some(sc.vcpus),
1201        passed: sc.is_pass(),
1202        skipped: sc.is_skip(),
1203        inconclusive: sc.is_inconclusive(),
1204        expected_failure: sc.expected_failure,
1205        run_sample_count: sc.monitor.as_ref().map(|m| m.total_samples).unwrap_or(0),
1206        spread: finite_or_zero("spread", sc.stats.worst_spread),
1207        gap_ms: sc.stats.worst_gap_ms,
1208        migrations: sc.stats.total_migrations,
1209        migration_ratio: finite_or_zero("migration_ratio", sc.stats.worst_migration_ratio),
1210        imbalance_ratio: finite_or_zero(
1211            "imbalance_ratio",
1212            sc.monitor
1213                .as_ref()
1214                .map(|m| m.max_imbalance_ratio)
1215                .unwrap_or(0.0),
1216        ),
1217        max_dsq_depth: sc
1218            .monitor
1219            .as_ref()
1220            .map(|m| m.max_local_dsq_depth)
1221            .unwrap_or(0),
1222        stuck_count: sc.monitor.as_ref().map(|m| m.stuck_count).unwrap_or(0) as f64,
1223        fallback_count: sc
1224            .monitor
1225            .as_ref()
1226            .and_then(|m| m.event_deltas.as_ref())
1227            .map(|e| e.total_fallback)
1228            .unwrap_or(0),
1229        keep_last_count: sc
1230            .monitor
1231            .as_ref()
1232            .and_then(|m| m.event_deltas.as_ref())
1233            .map(|e| e.total_dispatch_keep_last)
1234            .unwrap_or(0),
1235        total_iterations: sc.stats.total_iterations,
1236        // Built above: in-guest payload ext keys (non-finite values and
1237        // the walk-truncation sentinel dropped — a dropped non-finite must
1238        // not be confused with a real 0.0, and the sentinel is JSON-walker
1239        // diagnostic metadata, not a scenario metric) plus the host-side
1240        // monitor schedstat aggregates.
1241        ext_metrics,
1242        // Which of the Dynamic ext keys are monotonic counters (SUM-fold
1243        // cross-run); empty when there is no monitor / no counter keys.
1244        ext_counter_keys,
1245        // Carry per-phase buckets verbatim from the source
1246        // ScenarioStats. The bucket structure has already been
1247        // reduced by the host-side phase aggregator (Counter via
1248        // `phase_counter_delta`, Gauge/Peak/Timestamp via
1249        // `aggregate_samples`), so the sidecar -> row step just
1250        // forwards the prebuilt slice. An empty `phases` slot on
1251        // the source sidecar (single-phase scenario or legacy
1252        // file) flows through as an empty slice.
1253        phases: sc.stats.phases.clone(),
1254    }
1255}
ktstr/stats/group.rs

ktstr/stats/
group.rs