ktstr/stats/
metric.rs

1use super::*;
2
3/// Definition of a metric for the comparison pipeline.
4///
5/// Each entry describes polarity (`polarity`), aggregation kind
6/// (`kind`), dual-gate significance thresholds (`default_abs`,
7/// `default_rel`), a display unit string for formatted output,
8/// and a row accessor (`accessor`) that returns the metric's value
9/// from a [`GauntletRow`] without a hand-maintained name→field match.
10///
11/// The `accessor` field is skipped in serde output because `fn`
12/// pointers are not serializable. A future `Deserialize` impl
13/// would need callers to re-hydrate the accessor by looking up
14/// `name` via [`metric_def`] — the static [`METRICS`] table is
15/// the authoritative source of the function identity. No such
16/// impl exists today; the note is a forward-conditional so that
17/// if one is added, the migration path is spelled out rather
18/// than reinvented per site.
19///
20/// # Registered vs unregistered metrics
21///
22/// The static [`METRICS`] registry is the "core metric" set with
23/// hand-authored accessors, hand-tuned dual-gate thresholds
24/// (`default_abs` / `default_rel`), and display units. Each
25/// registered `MetricDef.accessor` reads a typed field on
26/// `GauntletRow` directly (e.g. `r.spread`, `r.gap_ms`).
27///
28/// Metrics that fall OUTSIDE this registry are carried on
29/// `GauntletRow.ext_metrics: BTreeMap<String, f64>`. Registered
30/// metrics never flow through `ext_metrics`; unregistered metrics
31/// never flow through the typed fields. [`MetricDef::read`] and
32/// `read_metric` check the registered-field accessor first and
33/// fall back to an `ext_metrics.get(name)` lookup — a name that
34/// matches neither returns `None`. Consumers that want to
35/// distinguish "registered-but-null" from "unregistered-and-
36/// absent" must inspect the registry directly rather than rely
37/// on the fallback.
38///
39/// # `#[non_exhaustive]` migration note
40///
41/// Downstream code that pattern-matches an instance of `MetricDef`
42/// must end the match with `..` so a future field addition does
43/// not become a breaking change. Prefer reading values through
44/// the static [`METRICS`] registry and [`metric_def`] lookup
45/// rather than constructing `MetricDef` values by hand.
46#[derive(Debug, Clone, serde::Serialize)]
47#[non_exhaustive]
48pub struct MetricDef {
    /// Registry name of the metric — the key used to look an entry up
    /// via [`metric_def`].
49    pub name: &'static str,
50    /// Regression direction for this metric. A metric that
51    /// previously used `higher_is_worse: true` maps to
52    /// [`Polarity::LowerBetter`](crate::test_support::Polarity::LowerBetter)
53    /// (bigger values are regressions, so smaller is better);
54    /// `false` maps to
55    /// [`Polarity::HigherBetter`](crate::test_support::Polarity::HigherBetter).
56    /// The sense is INVERSE: the old bool answered "does growing
57    /// this value mean worse?" while the enum answers "what
58    /// direction do we want this to move?".
59    pub polarity: crate::test_support::Polarity,
60    /// Temporal aggregation kind. Drives how
61    /// [`aggregate_samples`] collapses N readings of the same
62    /// metric across multiple capture samples (e.g. periodic
63    /// monitor ticks within one run, or runs pooled for a
64    /// `cargo ktstr perf-delta` comparison) into one comparable
65    /// value. Distinct from [`Self::polarity`], which is the
66    /// "good direction" of the FINAL value: kind tells us HOW to
67    /// reduce a vec of samples; polarity tells us how to interpret
68    /// the reduced number.
69    ///
70    /// Default `Counter` matches the most common shape — every
71    /// kernel monotonic counter (SCX_EV_*, ttwu_count, run_delay,
72    /// cpustat[]) collapses by sum-of-deltas. ~80% of ktstr fields
73    /// are counters; the field exists so the remaining peaks and
74    /// gauges can opt out of sum-aggregation explicitly.
75    pub kind: MetricKind,
76    /// Absolute-materiality gate: a move smaller than this (in the metric's
77    /// [`Self::display_unit`]) is never a confident change, ANDed with
78    /// [`Self::default_rel`]. Its role depends on the metric's dynamic range
79    /// across workloads:
80    ///
81    /// - **Scale-bounded** metrics (a ratio of co-scaling counters, or a
82    ///   naturally-bounded unit — `%` spread, `ms`/`µs` latency, `x` ratios,
83    ///   `[0,1]` fractions): `default_abs` is a fixed unit-scale measurement-
84    ///   noise floor. A sub-unit move is immaterial regardless of its relative
85    ///   size, so a fixed floor is correct.
86    /// - **Scale-varying** metrics (a raw per-event count, or a rate normalized
87    ///   only by time — `*_per_sec`, `ops/s`, `req/s`, `ns/s`): the baseline
88    ///   spans orders of magnitude across workloads, so a fixed floor calibrated
89    ///   for high throughput would MASK a large relative regression on a low-
90    ///   throughput workload. Here `default_abs` is only a NEAR-IDLE activity
91    ///   guard (it keeps a big relative swing on a near-idle baseline from
92    ///   firing) and [`Self::default_rel`] carries materiality. Three tests
93    ///   enforce that every scale-varying metric keeps a near-idle floor:
94    ///   `throughput_rate_floors_are_near_idle` (per-time rates + throughput
95    ///   carriers), `scale_varying_count_floors_are_near_idle` (raw counts and
96    ///   ns/µs accumulations), and `mixed_class_scale_varying_floors_pinned`
97    ///   (the mixed-class Peak/WorstLowest/PerPhase metrics, by allowlist).
98    pub default_abs: f64,
    /// Relative-change gate, ANDed with [`Self::default_abs`] to form the
    /// dual-gate significance threshold. For scale-varying metrics this is
    /// the gate that carries materiality (see [`Self::default_abs`]).
    // NOTE(review): the exact scale (fraction vs percent of baseline) is not
    // visible in this file — confirm against the comparison code.
99    pub default_rel: f64,
    /// Unit string used for formatted output; also the unit in which
    /// [`Self::default_abs`] is expressed.
100    pub display_unit: &'static str,
    /// Row accessor: returns this metric's value from a [`GauntletRow`], or
    /// `None` when the row carries no value for it. Skipped in serde output
    /// because `fn` pointers are not serializable.
101    #[serde(skip)]
102    pub accessor: fn(&GauntletRow) -> Option<f64>,
103}
104
105/// Temporal aggregation classification for a metric.
106///
107/// Kernel-source-grounded per the metric-semantics taxonomy.
108/// Drives [`aggregate_samples`] — the function that collapses a
109/// slice of per-sample readings of the SAME metric into one
110/// representative value for downstream regression / display.
111///
112/// Reduction semantics by variant:
113///   - [`MetricKind::Counter`] — kernel monotonic counter; the
114///     temporal aggregate is the SUM of consecutive deltas across
115///     the sample window. For pre-deltaed inputs (each sample
116///     carries its own window's count) this is `samples.iter().sum()`.
117///   - [`MetricKind::Gauge`] — instantaneous value; the
118///     [`GaugeAgg`] subkind picks Avg / Last / Max.
119///   - [`MetricKind::Peak`] — kernel-side max-of-window (e.g.
120///     `max_run_delay`, `max_newidle_lb_cost`); temporal aggregate
121///     is max-of-max so a window-wise high-water never gets
122///     diluted.
123///   - [`MetricKind::Timestamp`] — wall/rq clock; the temporal
124///     aggregate is the LAST sample's value (a snapshot of "where
125///     the clock is now"). Diffing two captures gives elapsed
126///     time, but a single window's reduction picks the latest
127///     reading — averaging timestamps is meaningless.
///
/// The remaining variants ([`MetricKind::DeltaSum`],
/// [`MetricKind::PerPhaseDeltaSum`], and the derived kinds from
/// [`MetricKind::Rate`] through [`MetricKind::PerRunDistribution`])
/// document their reduction on the variant itself.
128// Serialize only: MetricKind is serialized as part of MetricDef (which is
129// Serialize-only) but is never deserialized. A `Deserialize` derive here
130// would narrow to `Deserialize<'static>` because the Rate variant carries
131// `&'static str` fields (serde treats `&str` as borrowed), so it would not
132// satisfy `DeserializeOwned` and would break any future container that
133// deserializes an embedded MetricKind. The derive is therefore omitted
134// rather than carried as a fragile, unused impl.
135#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
136#[non_exhaustive]
137pub enum MetricKind {
138    /// Monotonic counter (SCX_EV_* event counters,
139    /// `cpustat[CPUTIME_*]`, `bpf_prog_stats.cnt`, `ttwu_count`,
140    /// `nr_migrations`, …). Aggregate by sum.
141    Counter,
142    /// Instantaneous value (`nr_running`, `local_dsq.nr`, current
143    /// `policy`, current `comm`). The [`GaugeAgg`] tag picks the
144    /// reduction: Avg for typical-load, Last for "what's happening
145    /// now", Max for worst-instant.
146    Gauge(GaugeAgg),
147    /// Kernel max-of-window (`max_run_delay`,
148    /// `max_newidle_lb_cost`, the per-CPU preempt-off peak).
149    /// Aggregate by max — a peak that ever fired must survive the
150    /// reduction.
151    Peak,
152    /// Clock or wall-time reading (`rq.clock`,
153    /// CLOCK_REALTIME-stamped capture timestamps). Aggregate by
154    /// Last — averaging timestamps loses meaning.
155    Timestamp,
156    /// PRE-DELTAED counter: each sample is already a delta-since-the-
157    /// previous-read, not a cumulative-since-boot total. Schedulers
158    /// that delta their scx_stats Metrics server-side per reader
159    /// request (e.g. scx_mitosis) produce this — one ktstr snapshot =
160    /// one reader request = one delta. The per-phase reduction is the
161    /// SUM of the in-phase deltas (NOT the `Counter` last-minus-first,
162    /// which would difference two deltas into nonsense); the flat-run
163    /// reduction is likewise the sum. Boundary: the first in-phase
164    /// delta straddles the phase boundary (it spans from the last
165    /// pre-phase read to the first in-phase read, so it includes a
166    /// little pre-phase activity); it is attributed to the phase its
167    /// read lands in — a slight left-edge over-attribution, the
168    /// deliberate semantic since a per-read delta cannot be split.
169    DeltaSum,
170    /// An INJECTED per-phase delta that SUMS across phases within a run but
171    /// folds by UNWEIGHTED MEAN across runs. Each per-phase value is a
172    /// thread-group CPU-time delta (`system_time_ns` / `user_time_ns`,
173    /// written directly into `PhaseBucket.metrics` by `phase_group_cpu_delta`
174    /// — there is no `read_sample` arm). The cross-PHASE fold is the SUM of the
175    /// disjoint per-phase deltas (like `Counter`), NOT a sample-count-weighted
176    /// mean: a per-phase delta is already proportional to the phase's
177    /// wall-clock duration, and the freeze `sample_count` is too, so a weighted
178    /// mean would double-count duration and collapse the run to a meaningless
179    /// duration-weighted average per-phase delta. The sum is the run's total
180    /// OBSERVED CPU time — first-to-last freeze WITHIN each phase — a lower
181    /// bound that excludes pre-first-freeze, post-last-freeze, and
182    /// inter-phase-gap windows (and single-freeze phases, which contribute
183    /// nothing), per `phase_group_cpu_delta`'s observed-window semantic; that
184    /// is sufficient for an A/B regression signal, where the same
185    /// observed-window sum is compared on both sides. Across RUNS each run
186    /// contributes exactly one total, folded by the UNWEIGHTED arithmetic mean
187    /// (`Σ / contributors`) over the runs that emitted the key — NOT weighted
188    /// by `run_sample_count` (the monitor capture count, an unrelated
189    /// population that would also silently zero-weight a monitor-off run). This
190    /// SUM-cross-phase / MEAN-cross-run pair is exactly why these deltas are
191    /// NOT `Gauge(Avg)` (weighted mean at BOTH levels — the bug this kind
192    /// fixes) and NOT `DeltaSum` (sum at both levels, which would inflate the
193    /// cross-run value by the run count). The value is run-wide POOLED (one
194    /// scalar per phase across all tgids), NOT per-cgroup; the cross-cgroup
195    /// same-step merge is `Commutative` (`a + b`, like `Counter`) but is the
196    /// defensive same-step-index path only — dead for these keys, which are
197    /// injected exactly once into the pooled host `PhaseBucket`. NOT
198    /// [`Self::is_derived`] — it carries a real per-phase value, unlike Rate /
199    /// Distribution.
200    PerPhaseDeltaSum,
201    /// Derived ratio of two component metrics — a RATE that must be
202    /// recomputed from its components at every in-map aggregation level, never
203    /// averaged as a ready-made ratio. The variant carries the registry
204    /// names of its `numerator` and `denominator` component metrics, each
205    /// itself registered with its own kind (e.g. a `Counter` numerator).
206    ///
207    /// A Rate has NO samples of its own. Its value is DERIVED from the
208    /// already-reduced component values as `map[numerator] /
209    /// map[denominator]` by the [`derive_rate_metrics`] post-pass. An
210    /// aggregation level that pools the components FIRST (each by its own
211    /// kind — a `Counter` numerator sums, a `Gauge(Avg)` averages) and
212    /// then re-derives the rate RE-POOLS correctly: for the common
213    /// `Counter / Counter` case the result is `Σnumerator / Σdenominator`,
214    /// NOT a mean of two phases' ready-made ratios `(r₁ + r₂) / 2` (which
215    /// is WRONG whenever the phases carry different denominator weight,
216    /// e.g. iterations-per-cpu-second across phases of unequal CPU time).
217    /// The numerator and denominator must already be expressed in units
218    /// whose quotient is the intended rate unit (the component
219    /// registration owns the unit choice; this variant does not scale).
220    ///
221    /// `derive_rate_metrics` runs as a post-pass at the nine aggregation
222    /// sites where the components co-locate in one map: the two per-phase
223    /// builds (`buckets_from_grouped`, `build_phase_buckets_with_stimulus`),
224    /// the cross-phase bucket merge (`merge_matched_phase_buckets`), the
225    /// three cross-RUN ext-metrics reducers (`populate_run_ext_metrics`,
226    /// `populate_run_ext_metrics_from_phases`, and `group_and_average_by`),
227    /// and the cross-CGROUP pooled re-pools
228    /// (`crate::assert::populate_run_pooled_iterations_per_cpu_sec`,
229    /// `crate::assert::populate_run_pooled_taobench`,
230    /// `crate::assert::populate_run_pooled_schbench`).
231    /// The cross-CGROUP `AssertResult::merge` ext-metrics fold itself uses
232    /// worst-case polarity (min/max) and is NOT a re-pool site; the pooled
233    /// re-pool runs separately after it, at the eval layer, reading
234    /// `stats.cgroups` directly. `iteration_rate` does not exercise the merge
235    /// fold either: it and its components are host-injected by
236    /// `populate_run_ext_metrics_from_phases` AFTER the cross-cgroup `merge`,
237    /// so the fold never sees them. The pooled `iterations_per_cpu_sec` is the
238    /// rate whose components ARE per-cgroup, and
239    /// `populate_run_pooled_iterations_per_cpu_sec` re-pools it post-merge.
240    ///
241    /// Because a single sample slice cannot express the re-pool, a Rate is
242    /// FORBIDDEN from the single-slice reducers ([`aggregate_finite`]
243    /// panics on it); the post-pass is its only producer.
244    Rate {
245        /// Registry name of the numerator component metric.
246        numerator: &'static str,
247        /// Registry name of the denominator component metric.
248        denominator: &'static str,
249    },
250    /// Derived DISTRIBUTIONAL aggregate re-pooled from a raw per-cgroup
251    /// sample set, never folded from ready-made per-cgroup reductions. The
252    /// variant names the [`SampleSource`] (which
253    /// [`crate::assert::PhaseCgroupStats`] sample vector feeds it) and the
254    /// [`SampleReduction`] (which statistic to compute over the pooled set).
255    ///
256    /// Like [`MetricKind::Rate`], a Distribution has NO value of its own at
257    /// the WITHIN-RUN levels: its run-level value is DERIVED post-merge by
258    /// `crate::assert::populate_run_distribution_metrics`, which pools the
259    /// raw samples from `stats.phases[].per_cgroup` across every phase and
260    /// cgroup and recomputes the statistic over the COMBINED set — the
261    /// percentile / CV / mean / extreme of the pooled distribution, NOT a
262    /// max or mean of per-cgroup reductions (the percentile of a union is
263    /// not the max of per-source percentiles). It is therefore FORBIDDEN
264    /// from the per-phase single-slice reducers
265    /// ([`aggregate_samples_for_phase`] returns None via
266    /// [`MetricKind::is_derived`]); the post-pass is its only within-run
267    /// producer. When the size-limited bulk frame strips the sample pools
268    /// (`crate::assert::strip_phase_cgroup_samples`), the producer falls
269    /// back to a worst-wins fold over the surviving per-cgroup `CgroupStats`
270    /// reductions so the metric degrades rather than vanishing.
271    ///
272    /// CROSS-RUN it is a HYBRID, unlike Rate: a run's components (the raw
273    /// sample vectors) do not survive into the cross-RUN ext-metrics map
274    /// (phases are dropped at the cross-RUN fold), so there is no combined
275    /// sample SET to re-pool across runs. The cross-RUN value is instead a
276    /// plain fold of the per-run derived values — an UNWEIGHTED mean (over the
277    /// runs that emitted the key, `sum / finite.len()`) for the percentile /
278    /// CV / mean reductions and a MAX for [`SampleReduction::Worst`] (the
279    /// peak run-delay) — applied by [`aggregate_finite`] over the per-run ext
280    /// values. So `is_derived` skips it at the within-run sites,
281    /// but the cross-RUN ext fold does NOT skip it (only Rate,
282    /// whose components DO survive cross-RUN, is skipped
283    /// there).
284    Distribution {
285        /// Which raw sample vector on
286        /// [`crate::assert::PhaseCgroupStats`] feeds this aggregate.
287        source: SampleSource,
288        /// Which statistic to recompute over the pooled sample set.
289        reduction: SampleReduction,
290    },
291    /// Derived LOWEST-WINS per-cgroup selector — the worst (lowest) cgroup's
292    /// `numerator / denominator` ratio across the run, re-pooled from per-cgroup
293    /// carriers rather than folded from ready-made ratios. None-aware lowest-wins
294    /// (the semantic the deleted `fold_lowest_some` carried in
295    /// [`crate::assert::AssertResult::merge`], now in
296    /// `crate::assert::populate_run_distribution_metrics`): a measured
297    /// `Some(0.0)` — a cgroup that ran zero iterations (real starvation) for the
298    /// efficiency selectors, or one with all pages off-node for
299    /// `worst_page_locality` — wins the worst bucket; a not-measured `None` (no
300    /// workers / no on-CPU time / no NUMA pages) is skipped; and an all-`None`
301    /// cohort produces no key (absence preserved as a missing ext entry, never a
302    /// `0.0`).
303    ///
304    /// Derived post-merge by
305    /// `crate::assert::populate_run_distribution_metrics`. The SOURCE depends on
306    /// the numerator: the iteration-efficiency selectors (`Iterations`) re-pool
307    /// from the `stats.cgroups[]` counters (which survive bulk-frame stripping,
308    /// so they need no degraded fallback); `worst_page_locality` (`NumaLocal`)
309    /// re-pools from the per-phase `stats.phases[].per_cgroup` NUMA carriers (the
310    /// reports-only `CgroupStats` hardcodes `page_locality` 0.0, so it cannot
311    /// source from `stats.cgroups[]`). Like Distribution it is `is_derived`
312    /// (skipped at the within-run reducers) and CROSS-RUN it MEAN-folds the
313    /// per-run derived values through [`aggregate_finite`].
314    WorstLowest {
315        /// The per-cgroup numerator (`Iterations` or `NumaLocal`).
316        numerator: WorstLowestNumerator,
317        /// The per-cgroup denominator the numerator is divided by.
318        denominator: WorstLowestDenominator,
319    },
320    /// Derived WORST-CGROUP wake-latency tail-amplification selector — the
321    /// highest per-cgroup `p99 / median` wake-latency ratio across the run.
322    /// Higher-is-worse (a stretched long tail), so "worst" is the MAX over
323    /// cgroups — the polarity-opposite of [`MetricKind::WorstLowest`]'s
324    /// lowest-wins. Re-selected post-merge by
325    /// `crate::assert::populate_run_distribution_metrics` from the
326    /// `stats.cgroups[]` entries via `CgroupStats::wake_latency_tail_ratio`
327    /// (deliberately NOT `pooled_p99 / pooled_median` of the cross-cgroup
328    /// union — that is the distinct `worst_p99_wake_latency_us` /
329    /// `worst_median_wake_latency_us` Distribution pair). Like Distribution /
330    /// WorstLowest it is [`MetricKind::is_derived`] (skipped at the within-run
331    /// reducers); the producer emits NO key when the run is below the
332    /// [`WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS`] noise floor or no cgroup
333    /// carried a measurable tail (absence preserved as a missing ext entry,
334    /// never a `0.0` sentinel — the no-false-zero contract the deleted typed
335    /// field could not express).
336    ///
337    /// CROSS-RUN it folds, like every WorstLowest selector, by the UNWEIGHTED
338    /// exclude-missing MEAN through [`aggregate_finite`] (`sum / finite.len()`
339    /// over the runs that emitted the key) — the cohort's TYPICAL worst-cgroup
340    /// tail amplification, deliberately NOT a MAX: peak-of-peaks is reserved
341    /// for [`SampleReduction::Worst`] (a peak detector answering "did this ever
342    /// fire"), whereas this answers "what is this cohort's characteristic
343    /// worst-cgroup tail". A run below the floor never enters the mean, so no
344    /// sub-threshold run dilutes the cohort (the bug the ext relocation fixed:
345    /// the deleted typed cross-RUN fold summed every passing run's raw ratio
346    /// over `passes_observed`, folding noisy low-N runs in as real values).
347    WakeLatencyTailRatio,
348    /// Derived WORST-CGROUP cross-node migration-churn selector — the highest
349    /// per-cgroup `cross_node_migrated / numa_pages_total` ratio across the run.
350    /// LowerBetter (more cross-node migration is worse), so "worst" is the MAX
351    /// over cgroups — the polarity twin of `worst_page_locality`
352    /// ([`MetricKind::WorstLowest`] `NumaLocal`/`NumaTotal`, lowest-wins), sharing
353    /// the same per-phase NUMA carriers and the `numa_agg_per_cgroup` helper.
354    /// Re-pooled post-merge by
355    /// `crate::assert::populate_run_distribution_metrics` from
356    /// `stats.phases[].per_cgroup`: the cross-phase fold SUMs the per-phase
357    /// migration-counter deltas over the LATEST residency total
358    /// (`cross_node_migration_ratio_of(summed_migrated, latest_total)`), then
359    /// MAXes across cgroups that measured NUMA residency (`numa_pages_total > 0`);
360    /// a never-measured cohort yields no key (absence preserved as a missing ext
361    /// entry, never a `0.0`). A CHURN ratio (cumulative migration EVENTS over a
362    /// residency SNAPSHOT) — can legitimately exceed 1.0, NOT a bounded `[0,1]`
363    /// fraction. A dedicated max-selector like [`MetricKind::WakeLatencyTailRatio`]
364    /// (no generic numerator/denominator), since cross_node is the sole max-wins
365    /// phase-carrier ratio. Like the other derived kinds it is
366    /// [`MetricKind::is_derived`] (skipped at the within-run reducers) and
367    /// CROSS-RUN MEAN-folds through [`aggregate_finite`] over the runs that
368    /// EMITTED the key — the cohort's typical worst-cgroup churn. The deleted
369    /// typed `Gauge(Last)` field instead averaged its value over
370    /// `passes_observed`, folding a NUMA-less passing run's `0.0` sentinel into
371    /// the mean (dilution); the ext re-pool writes no key for a never-measured
372    /// run (absence preserved), so it never enters the divisor.
373    WorstCrossNodeRatio,
374    /// Per-phase-only scalar derived ONCE per phase from a phase-scoped
375    /// carrier, NOT from monitor samples and NOT re-pooled run-level. The sole
376    /// producer is [`crate::assert::derive_phase_metrics`], which derives two
377    /// families per phase:
378    /// - the schbench scalars: it pools each phase's
379    ///   `crate::assert::PhaseCgroupStats` schbench carriers and re-derives a
380    ///   percentile (from the merged latency histogram), a sample-weighted mean
381    ///   (run-delay), or a count (loop_count), writing them into BOTH each
382    ///   carrier's `PhaseCgroupStats::metrics` (per-cgroup, read via
383    ///   `phase_cgroup_metric`) AND the pooled
384    ///   `crate::assert::PhaseBucket::metrics` (read via `phase_metric`).
385    /// - the NON-schbench carrier scalars (wake/run-delay/off-cpu distributions
386    ///   plus migration/iterations/locality ratios, via `write_carrier_scalars`):
387    ///   written ONLY into each `PhaseCgroupStats::metrics` (per-cgroup, read
388    ///   via `phase_cgroup_metric`) — these have NO pooled
389    ///   `crate::assert::PhaseBucket::metrics` entry; their run-level aggregate
390    ///   is the `worst_*` ext-metrics key.
391    /// It is [`MetricKind::is_derived`] (so the within-run reducers —
392    /// [`aggregate_samples_for_phase`] and the phase-bucket merge loop — skip
393    /// it) and has NO run-level producer; it is ADDITIONALLY gated out of the
394    /// cross-RUN ext fold (`fold_ext_metrics`), since a per-phase scalar has no
395    /// meaningful cross-run aggregate. A unit marker (no payload): the
396    /// derivation owns the metric-name→computation mapping, so the kind need not
397    /// carry a percentile selector (which would leak `plat::Pct` through this
398    /// public enum).
399    // doc_lazy_continuation: pre-existing list-item wording surfaced by the clippy
400    // 1.94 bump; renders fine. Suppress rather than reflow the prose.
401    #[allow(clippy::doc_lazy_continuation)]
402    PerPhase,
403    /// A WHOLE-RUN distributional value (a percentile / min / max) re-pooled
404    /// run-level by UNIONING the per-phase per-cgroup raw distribution carriers
405    /// and re-deriving the statistic over the union — the schbench engine's
406    /// `*_whole` wakeup / request / rps keys, written by
407    /// `crate::assert::populate_run_pooled_schbench_distribution` from the
408    /// `PhaseCgroupStats::schbench` `PlatStats` histograms (`PlatStats::combine`
409    /// is an associative bucket-count add, so the merged histogram is the faithful
410    /// union and the re-derived percentile is the percentile OF the pooled sample
411    /// set, NOT a mean of per-source percentiles). UNLIKE
412    /// [`MetricKind::Distribution`] it is NOT cross-RUN folded: a percentile of a
413    /// union is not a mean of per-run percentiles, and the per-phase histograms
414    /// are dropped at the cross-run boundary (no pooled set survives to
415    /// re-derive), so the only honest cross-run treatment is the per-run
416    /// noise-compare — `crate::stats::noise_findings` reads each run's own
417    /// `*_whole` scalar and compares the spread, never averaging them. So it is
418    /// [`MetricKind::is_derived`] (the within-run reducers skip it; the value is
419    /// produced solely by the run-level union populate) AND gated out of the
420    /// cross-RUN ext fold (`fold_ext_metrics`); `noise_findings` is its only
421    /// consumer. ext-only (`accessor |_| None`); the `*_whole` names are distinct
422    /// from the per-phase [`MetricKind::PerPhase`] percentile keys (one registry
423    /// name = one kind) — the established per-phase-vs-whole-run coexistence (as
424    /// the taobench and schbench loop/run-delay whole-run keys also do).
425    PerRunDistribution,
426}
427
428/// Sub-classification for [`MetricKind::Gauge`] picking the
429/// per-window reduction. Most ktstr gauges are Avg ("typical-load
430/// over the window"); Last fits "current state" snapshots like
431/// `comm` / `policy`; Max fits worst-instant queue-depth probes.
432// Serialize-only, matching its container MetricKind (which is Serialize-only)
433// and the sibling MetricKind sub-enums (SampleSource / SampleReduction /
434// WorstLowestNumerator / WorstLowestDenominator). Nothing deserializes a
435// MetricKind / GaugeAgg, so no Deserialize derive is carried.
436#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
437#[non_exhaustive]
438pub enum GaugeAgg {
439    /// Reduce by arithmetic mean. Default for `nr_running`-style
440    /// gauges where the question is "what was the typical load".
441    Avg,
442    /// Take the latest sample. Default for `comm` / `policy` /
443    /// `cgroup_path`-style snapshots where the value is "what is
444    /// it RIGHT NOW".
445    Last,
446    /// Take the max sample. Useful when a gauge is being used to
447    /// detect a worst-case regression (e.g. queue-depth probe
448    /// where any spike is the signal of interest).
449    Max,
450}
451
452/// The raw per-cgroup sample vector on
453/// [`crate::assert::PhaseCgroupStats`] that a [`MetricKind::Distribution`]
454/// re-pools over. Each variant maps to exactly one un-reduced sample
455/// vector the per-phase per-cgroup carrier holds (stored RAW in
456/// nanoseconds; the [`SampleReduction`] applies the ns→µs scale once).
///
/// Every variant name ends in `Ns` to keep that raw-nanosecond unit
/// visible at each use site.
457#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
458#[non_exhaustive]
459// The `Ns` suffix on every variant documents the unit (all sources are RAW
460// nanoseconds) at each use site, not just in the enum doc. Clippy's
461// enum_variant_names is a style heuristic that misfires on a meaningful shared
462// unit suffix: renaming would drop the unit, so the suffix is kept.
463#[allow(clippy::enum_variant_names)]
464pub enum SampleSource {
465    /// Per-wakeup latency samples in ns
466    /// (`crate::assert::PhaseCgroupStats::wake_latencies_ns`). One sample per
467    /// observed wakeup (reservoir-capped per cgroup), so the pooled set is the
468    /// cross-cgroup union of those capped per-wakeup samples.
469    WakeLatencyNs,
470    /// Per-worker schedstat run-delay samples in ns
471    /// (`crate::assert::PhaseCgroupStats::run_delays_ns`). One sample per worker
472    /// — each is that worker's `sched_info.run_delay` delta over the carrier's
473    /// window (whole-run last-minus-first for the step-local carrier; the
474    /// per-phase delta for the backdrop slice carrier), so the pool size is the
475    /// worker count, NOT a per-wakeup stream like `WakeLatencyNs`.
476    RunDelayNs,
477    /// Per-timer-cycle latency samples in ns
478    /// (`crate::assert::PhaseCgroupStats::timer_latencies_ns`). One sample per
479    /// `clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME)` wake — the observed
480    /// wake time minus the absolute deadline, floored at 0 — recorded by
481    /// [`crate::workload::WorkType::TimerLatency`] (reservoir-capped per cgroup
482    /// like `WakeLatencyNs`). Distinct from `WakeLatencyNs` so cyclictest-style
483    /// timer latency does not blur with the blocking variants' wake latency in a
484    /// shared sidecar.
485    TimerLatencyNs,
486}
487
488/// The statistic a [`MetricKind::Distribution`] computes over its pooled
489/// [`SampleSource`] set. Each maps to the matching reduction
490/// `crate::assert::cgroup_stats` computes per cgroup, so the run-level
491/// re-pool reproduces that reduction over the COMBINED cross-cgroup set
492/// rather than folding ready-made per-cgroup reductions.
493#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
494#[non_exhaustive]
495pub enum SampleReduction {
496    /// 99th percentile (nearest-rank), ns→µs.
497    P99,
498    /// 99.9th percentile (nearest-rank), ns→µs — the deep tail an RT /
499    /// cyclictest-style latency probe turns on (a single max is one sample;
500    /// p99.9 is the robust deep-tail percentile between p99 and max).
501    P999,
502    /// Median (50th percentile, nearest-rank), ns→µs.
503    Median,
504    /// Coefficient of variation (stddev / mean) over the pooled set,
505    /// `n = pool.len()`. Unitless — a ratio, so independent of the ns→µs scale.
506    Cv,
507    /// Arithmetic mean over the pooled set, ns→µs.
508    Mean,
509    /// Maximum (worst) sample over the pooled set, ns→µs. CROSS-RUN this is
510    /// the one reduction [`aggregate_finite`] folds by MAX (peak survives),
511    /// not MEAN — see [`MetricKind::Distribution`].
512    Worst,
513}
514
515/// The per-cgroup numerator of a [`MetricKind::WorstLowest`] lowest-wins
516/// selector. `#[non_exhaustive]`, mirroring [`MetricKind::Rate`]'s `numerator`.
517/// The producer branches on the numerator to pick the per-cgroup SOURCE:
518/// `Iterations` reads the `crate::assert::CgroupStats` counters; `NumaLocal`
519/// reads the per-phase NUMA carriers (`page_locality` is structurally 0.0 in the
520/// reports-only `CgroupStats`, so it must come from the phase carriers).
521#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
522#[non_exhaustive]
523pub enum WorstLowestNumerator {
524    /// Per-cgroup total iteration count
525    /// (`crate::assert::CgroupStats::total_iterations`).
526    Iterations,
527    /// Per-cgroup pages resident on the expected NUMA node(s) — the
528    /// page-locality numerator, the LATEST per-phase residency snapshot summed
529    /// across the cgroup's workers (`crate::assert::PhaseCgroupStats::numa_pages_local`),
530    /// NOT a `CgroupStats` counter.
531    NumaLocal,
532}
533
534/// The per-cgroup denominator a [`MetricKind::WorstLowest`] numerator is divided
535/// by to form the lowest-wins ratio.
536#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
537#[non_exhaustive]
538pub enum WorstLowestDenominator {
539    /// Worker count (`crate::assert::CgroupStats::num_workers`) — yields
540    /// iterations-per-worker (raw throughput, scales with the CPU budget).
541    NumWorkers,
542    /// On-CPU nanoseconds (`crate::assert::CgroupStats::total_cpu_time_ns`),
543    /// converted ns→s ONCE on the summed counter — yields the
544    /// overcommit-invariant iterations-per-CPU-second efficiency.
545    CpuTimeNs,
546    /// Total resident pages — the page-locality denominator, the LATEST
547    /// per-phase residency snapshot (`crate::assert::PhaseCgroupStats::numa_pages_total`),
548    /// shared with the cross-node ratio. Paired with `NumaLocal` to yield the
549    /// page-locality fraction; the per-cgroup ratio is absent (None) when the
550    /// cgroup measured no NUMA pages.
551    NumaTotal,
552}
553
/// How a per-phase metric reduction merges across two
/// [`crate::assert::AssertResult`]s that both carry a
/// [`crate::assert::PhaseBucket`] at the same `step_index`.
///
/// Driven by [`MetricKind::merge_kind`] so a future
/// [`MetricKind`] addition is forced to declare its merge
/// semantic explicitly (the match is `#[non_exhaustive]`-aware
/// via the helper rather than a bare `match` in every caller).
///
/// The split mirrors the rolling-aggregation contract in
/// [`AssertResult::merge`](crate::assert::AssertResult::merge): the
/// per-phase fold must commute so the accumulator pattern
/// `AssertResult::pass().merge(real_a).merge(real_b)` yields the
/// same result whether merges arrive in `a→b` or `b→a` order
/// — EXCEPT for kinds whose reduction is intrinsically the LAST
/// sample (`Gauge(Last)`, `Timestamp`), where the merge must
/// resolve to the bucket whose `end_ms` is later.
///
/// Counter, DeltaSum, PerPhaseDeltaSum, Peak, and Gauge(Max/Avg) are
/// commutative because their reductions are sum / sum / sum / max /
/// max-or-weighted-mean respectively — all associative, commutative folds
/// over reduced values. Gauge(Last) and Timestamp are NOT commutative under
/// a per-merge cumulative fold (the "later" sample wins) so the merge uses
/// `end_ms` as the tiebreaker rather than the operand order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum MergeKind {
    /// The reduction commutes: `merge(a, b) == merge(b, a)`. The
    /// merge folds the two reduced values via the kind's natural
    /// commutative operation (Counter / DeltaSum / PerPhaseDeltaSum → sum,
    /// Peak / Gauge(Max) → max, Gauge(Avg) → weighted mean by
    /// `sample_count`).
    Commutative,
    /// The reduction is "the LATEST sample's value" (Gauge(Last),
    /// Timestamp). The merge resolves to the value from whichever
    /// bucket has the later `end_ms`; ties keep `self`.
    NonCommutative,
    /// The value is DERIVED post-merge from pooled components, never folded
    /// from two already-reduced values. Covers every
    /// [`MetricKind::is_derived`] kind:
    /// - [`MetricKind::Rate`]: re-derived as `Σnumerator / Σdenominator` from
    ///   its component keys by [`derive_rate_metrics`];
    /// - [`MetricKind::Distribution`] / [`MetricKind::WorstLowest`] /
    ///   [`MetricKind::WakeLatencyTailRatio`] / [`MetricKind::WorstCrossNodeRatio`]:
    ///   re-pooled post-merge by
    ///   `crate::assert::populate_run_distribution_metrics`;
    /// - [`MetricKind::PerPhase`]: re-derived per phase by
    ///   `crate::assert::derive_phase_metrics`;
    /// - [`MetricKind::PerRunDistribution`]: produced run-level by
    ///   `populate_run_pooled_schbench_distribution` /
    ///   `populate_run_pooled_taobench_distribution` from the union of the
    ///   per-phase histograms.
    ///
    /// The per-metric merge loop skips these derived keys entirely and the
    /// post-pass produces them, so this variant is classification metadata: no
    /// merge dispatches on it.
    Recompute,
}
607
608impl MetricKind {
609    /// Map each [`MetricKind`] variant to the corresponding
610    /// [`MergeKind`] used by per-phase
611    /// [`AssertResult::merge`](crate::assert::AssertResult::merge).
612    /// Centralising the mapping here means a future kind
613    /// addition fails the build until the new variant is wired
614    /// (the inner `match` is exhaustive even though `MetricKind`
615    /// is `#[non_exhaustive]` because this fn lives in the same
616    /// crate).
617    pub fn merge_kind(self) -> MergeKind {
618        match self {
619            MetricKind::Counter => MergeKind::Commutative,
620            MetricKind::Peak => MergeKind::Commutative,
621            MetricKind::Gauge(GaugeAgg::Avg) => MergeKind::Commutative,
622            MetricKind::Gauge(GaugeAgg::Max) => MergeKind::Commutative,
623            MetricKind::Gauge(GaugeAgg::Last) => MergeKind::NonCommutative,
624            MetricKind::Timestamp => MergeKind::NonCommutative,
625            // Per-phase reduction is a sum of in-phase deltas — an
626            // associative, commutative fold, so cross-AssertResult merge
627            // sums the two reduced values (same as Counter).
628            MetricKind::DeltaSum => MergeKind::Commutative,
629            // PerPhaseDeltaSum: the cross-cgroup same-step merge is a
630            // commutative sum (`a + b`, like Counter), but for these run-wide
631            // POOLED keys (one scalar per phase, injected once into the host
632            // bucket) it is the defensive same-step-index path only — two real
633            // values are never summed. (The SUM-cross-phase / MEAN-cross-run
634            // split is handled at the fold sites, not in this merge.)
635            MetricKind::PerPhaseDeltaSum => MergeKind::Commutative,
636            // A Rate is re-derived from its pooled components, never
637            // folded from two ready-made ratios.
638            MetricKind::Rate { .. } => MergeKind::Recompute,
639            // Distribution and WorstLowest are derived post-merge by
640            // `populate_run_distribution_metrics` (re-pooled from the
641            // per-cgroup raw samples / counters), so the per-phase merge
642            // loop skips them and re-derives — classification-only, like
643            // Rate. See [`MetricKind::is_derived`].
644            MetricKind::Distribution { .. } => MergeKind::Recompute,
645            MetricKind::WorstLowest { .. } => MergeKind::Recompute,
646            // Worst-cgroup wake-latency tail ratio: derived post-merge by
647            // `populate_run_distribution_metrics` (max over the merged
648            // `stats.cgroups` per-cgroup ratios), so the per-phase merge loop
649            // skips and re-derives it — classification-only, like the other
650            // derived kinds.
651            MetricKind::WakeLatencyTailRatio => MergeKind::Recompute,
652            // Worst-cgroup cross-node migration churn: derived post-merge by
653            // `populate_run_distribution_metrics` (max over the per-phase NUMA
654            // carriers' per-cgroup churn ratio), so the per-phase merge loop skips
655            // and re-derives it — classification-only, like the other derived kinds.
656            MetricKind::WorstCrossNodeRatio => MergeKind::Recompute,
657            // PerPhase is derived post-merge by `derive_phase_metrics` (schbench
658            // scalars into PhaseBucket.metrics + PhaseCgroupStats::metrics;
659            // non-schbench carrier scalars into PhaseCgroupStats::metrics only);
660            // the per-phase merge loop skips it (is_derived) and never re-derives
661            // via a kind — classification-only, like the other derived kinds.
662            MetricKind::PerPhase => MergeKind::Recompute,
663            // PerRunDistribution is derived run-level by
664            // `populate_run_pooled_schbench_distribution` (union of the per-phase
665            // per-cgroup PlatStats histograms, percentile re-derived over the
666            // union); the per-phase merge loop skips it (is_derived) and never
667            // re-derives via a kind — classification-only, like the other derived
668            // kinds.
669            MetricKind::PerRunDistribution => MergeKind::Recompute,
670        }
671    }
672
673    /// Whether this kind is DERIVED post-merge from other data rather than
674    /// reduced from its own per-phase sample slice: [`MetricKind::Rate`]
675    /// (from numerator/denominator components), [`MetricKind::Distribution`]
676    /// (re-pooled from the per-cgroup raw sample sets), [`MetricKind::WorstLowest`]
677    /// (lowest-wins over per-cgroup counters),
678    /// [`MetricKind::WakeLatencyTailRatio`] (max over the per-cgroup p99/median
679    /// wake-latency ratios, floor-gated), and [`MetricKind::WorstCrossNodeRatio`]
680    /// (max over the per-cgroup cross-node migration-churn ratios).
681    ///
682    /// Drives the WITHIN-RUN skip-sites that must not reduce a derived kind
683    /// from a slice: [`aggregate_samples_for_phase`] returns None, and the
684    /// per-phase build, the cross-phase
685    /// `crate::assert::merge_matched_phase_buckets` key-loop, and
686    /// [`crate::assert::populate_run_ext_metrics_from_phases`] all skip the
687    /// key then re-derive.
688    ///
689    /// NOT a uniform cross-RUN skip: at the cross-RUN ext fold
690    /// ([`group_and_average_by`], via `fold_ext_metrics`) [`MetricKind::Rate`],
691    /// [`MetricKind::PerPhase`], AND [`MetricKind::PerRunDistribution`] are
692    /// skipped — Rate's components survive cross-RUN so it re-derives there;
693    /// PerPhase is a per-phase-only scalar with no meaningful cross-RUN aggregate
694    /// (its skip also keeps [`aggregate_finite`]'s `PerPhase => unreachable!`
695    /// unreachable); and PerRunDistribution is a percentile-of-union whose
696    /// per-phase histograms are not shipped cross-RUN (a percentile of a union is
697    /// not a mean of per-run percentiles), so it is noise-compared per-run, never
698    /// folded — while Distribution / WorstLowest / WakeLatencyTailRatio /
699    /// WorstCrossNodeRatio, whose components do NOT survive cross-RUN, fall through
700    /// to be plainly folded (MEAN, or MAX for [`SampleReduction::Worst`]) by
701    /// [`aggregate_finite`]. So callers gate on `is_derived` for the within-run
702    /// sites and on `matches!(.., Rate { .. } | PerPhase | PerRunDistribution)`
703    /// for the cross-RUN ext fold.
704    pub fn is_derived(self) -> bool {
705        matches!(
706            self,
707            MetricKind::Rate { .. }
708                | MetricKind::Distribution { .. }
709                | MetricKind::WorstLowest { .. }
710                | MetricKind::WakeLatencyTailRatio
711                | MetricKind::WorstCrossNodeRatio
712                | MetricKind::PerPhase
713                | MetricKind::PerRunDistribution
714        )
715    }
716}
717
718/// Reduce a slice of per-sample readings of the same metric into
719/// one representative value, dispatching on [`MetricKind`]. Used
720/// by sample-windowed comparison paths (e.g. multi-tick monitor
721/// captures, perf-delta across multiple snapshot
722/// subdirectories) to collapse a sample vec into the value the
723/// existing scalar-comparison pipeline already understands.
724///
725/// Returns `None` when `samples` is empty — the caller decides
726/// whether absence is a missing-data condition or a benign
727/// "no samples in window" result. NaN samples are dropped from
728/// the reduction (same semantics as the existing percentile()
729/// helper); a final all-NaN input also returns `None`.
730///
731/// Semantics by kind:
732///   - `Counter` → sum of finite samples — the flat-run reduction
733///     for cross-RUN aggregation. NOT the right semantic for
734///     per-phase reduction of a cumulative-since-boot Counter
735///     (which would over-count). Callers wanting per-phase
736///     Counter reduction use [`aggregate_samples_for_phase`],
737///     which routes Counter through a dedicated last-minus-first
738///     branch instead of dispatching through here.
739///   - `Gauge(Avg)` → arithmetic mean of finite samples.
740///   - `Gauge(Last)` → last finite sample.
741///   - `Gauge(Max)` → max of finite samples.
742///   - `Peak` → max of finite samples.
743///   - `Timestamp` → last finite sample.
744///
745/// Live caller: [`aggregate_samples_for_phase`] dispatches every
746/// non-Counter kind through this entry point so the per-phase
747/// reduction inherits the flat-run semantic for Gauge / Peak /
748/// Timestamp without restating it. That fn is itself folded by
749/// [`crate::assert::build_phase_buckets`] whose live caller is
750/// the host-side `evaluate_vm_result` AssertResult-population
751/// site at `src/test_support/eval/mod.rs`.
752pub fn aggregate_samples(samples: &[f64], kind: MetricKind) -> Option<f64> {
753    let finite: Vec<f64> = samples.iter().copied().filter(|x| x.is_finite()).collect();
754    aggregate_finite(&finite, |_| 1, kind)
755}
756
757/// Weighted variant of [`aggregate_samples`]. Takes a slice of
758/// `(value, weight)` pairs so the lock-step shape is enforced by
759/// the type — there is no length-mismatch class for the caller to
760/// trigger. Weight is consulted for [`MetricKind::Gauge`] with
761/// [`GaugeAgg::Avg`] (weighted mean); other kinds fold by their
762/// natural reduction and ignore weight.
763///
764/// NaN-valued pairs drop along with their weight (filter operates
765/// on the value field — no risk of weights misaligning to other
766/// samples after filtering, unlike the previous parallel-slice
767/// shape).
768///
769/// Zero total weight degenerates to the unweighted mean per the
770/// `merge_metric_values` precedent. Weight sum uses `checked_add`
771/// with fallback to unweighted on overflow so a pathological
772/// caller can't crash the aggregator.
773pub fn aggregate_samples_weighted(pairs: &[(f64, usize)], kind: MetricKind) -> Option<f64> {
774    let finite: Vec<(f64, usize)> = pairs
775        .iter()
776        .copied()
777        .filter(|(x, _)| x.is_finite())
778        .collect();
779    if finite.is_empty() {
780        return None;
781    }
782    let values: Vec<f64> = finite.iter().map(|(x, _)| *x).collect();
783    aggregate_finite(&values, |i| finite[i].1, kind)
784}
785
786/// Inner fold shared by [`aggregate_samples`] (uniform weights)
787/// and [`aggregate_samples_weighted`] (caller-supplied weights).
788/// `weight_for(i)` returns the weight for the i-th element of
789/// `finite`; callers either pass `|_| 1` (unweighted) or a
790/// closure that reads from their pair vec (weighted). Pre-filtered
791/// `finite` carries only NaN-free values so the closure indexes
792/// into a known-good vec without risking shape drift.
793fn aggregate_finite(
794    finite: &[f64],
795    weight_for: impl Fn(usize) -> usize,
796    kind: MetricKind,
797) -> Option<f64> {
798    if finite.is_empty() {
799        return None;
800    }
801    Some(match kind {
802        // Counter (cumulative-since-boot, cross-RUN flat sum) and
803        // DeltaSum (each sample already a per-read delta) both reduce to
804        // a plain sum of the finite samples here; they differ only in
805        // the PER-PHASE path (Counter last-minus-first vs DeltaSum sum —
806        // see aggregate_samples_for_phase).
807        MetricKind::Counter | MetricKind::DeltaSum => finite.iter().sum(),
808        // PerPhaseDeltaSum at the CROSS-RUN fold: each contributor is one run's
809        // already-summed per-phase total, so runs fold by the UNWEIGHTED
810        // arithmetic mean over the runs that emitted the key — `Σ / len`, the
811        // same shape as the Distribution / WorstLowest cross-run mean below and
812        // deliberately NOT weighted by `run_sample_count`. The CROSS-PHASE sum
813        // that builds each run's total is done directly in
814        // `crate::assert::populate_run_ext_metrics_from_phases`, which does NOT
815        // route this kind through `aggregate_finite`, so this arm only ever runs
816        // at the cross-RUN ext fold.
817        MetricKind::PerPhaseDeltaSum => finite.iter().sum::<f64>() / (finite.len() as f64),
818        // Distribution Worst (peak run-delay): the cross-RUN fold is MAX
819        // so the high-water peak survives, distinct from the MEAN-folded
820        // percentile / CV / mean reductions below. (WITHIN-RUN no
821        // derived kind reaches here — `is_derived` skips every one at
822        // the per-phase reducers; this arm only fires at the cross-RUN ext
823        // fold in `group_and_average_by`.) Matched before the general
824        // `Distribution { .. }` mean arm so Worst takes MAX, not MEAN.
825        MetricKind::Distribution {
826            reduction: SampleReduction::Worst,
827            ..
828        } => finite.iter().copied().fold(f64::NEG_INFINITY, f64::max),
829        // Cross-RUN MEAN fold of the remaining Distribution reductions (p99 /
830        // median / CV / mean run-delay) and every WorstLowest selector: each
831        // per-run value is itself a within-run pooled reduction or a
832        // lowest-wins selector, NOT a monitor-sampled gauge, so the cross-RUN
833        // fold is an UNWEIGHTED arithmetic mean — `sum / finite.len()`, i.e.
834        // over the runs that EMITTED a finite value for the key. This matches
835        // the unweighted-mean SHAPE of the surviving typed siblings
836        // (spread, migration_ratio), but its
837        // divisor is the present-finite-contributor count, NOT the typed path's
838        // `sum / passes_observed`: a passing run that omitted the key (absent /
839        // dropped-non-finite ext entry) is EXCLUDED from the mean rather than
840        // folded in as 0.0 — the deliberate no-false-zero improvement the ext
841        // relocation buys (the old typed field defaulted a no-data run to 0.0).
842        // Weighting by `run_sample_count` (the MONITOR capture count) would
843        // weight by an unrelated population AND silently zero-weight a
844        // monitor-off run, so it is deliberately NOT used here. (WITHIN-RUN
845        // these never reach here — `is_derived` skips them at the per-phase
846        // reducers; this arm only fires at the cross-RUN ext fold in
847        // `group_and_average_by`.)
848        //
849        // EXTREMUM ASYMMETRY (on the record, ratified): every WorstLowest
850        // selector is a within-run lowest-wins ("worst cgroup") value yet folds
851        // cross-RUN by this MEAN, NOT by an extremum — UNLIKE worst_run_delay_us
852        // (SampleReduction::Worst), whose dedicated MAX arm above preserves the
853        // peak-of-peaks. Both reproduce the deleted typed cross-RUN folds
854        // exactly: run-delay is a peak detector (MAX), the iteration
855        // efficiencies are a starvation-floor cohort statistic (MEAN). Aligning
856        // WorstLowest to an extremum (a MIN arm gated on HigherBetter) would be
857        // a future product decision, tracked separately, not a Stage-1 fix.
858        //
859        // HYBRID caveat (sharpest for CV): a cross-RUN value here is a
860        // mean-of-per-run-reductions, NOT a reduction recomputed over the
861        // combined raw set — the raw samples do not survive cross-RUN (phases
862        // are dropped), so there is no union to re-pool. For p99 / median /
863        // mean run-delay this mean-of-summaries is a defensible cohort
864        // statistic; for worst_wake_latency_cv it is a mean-of-ratios (the
865        // fold-of-ready-made-ratios shape the Rate kind exists to avoid), not a
866        // pooled CV — accepted here only because no combined set exists to
867        // recompute over, and it reproduces the deleted typed path's shape
868        // exactly. See [`MetricKind::Distribution`].
869        MetricKind::Distribution { .. }
870        | MetricKind::WorstLowest { .. }
871        | MetricKind::WakeLatencyTailRatio
872        | MetricKind::WorstCrossNodeRatio => finite.iter().sum::<f64>() / (finite.len() as f64),
873        MetricKind::Gauge(GaugeAgg::Avg) => {
874            // Weighted mean: sum(v * w) / sum(w). Uniform-weight
875            // callers (aggregate_samples) reduce to arithmetic
876            // mean per weight_for == |_| 1. Zero total weight
877            // degenerates to the unweighted mean rather than
878            // dividing by zero; mirrors `merge_metric_values` at
879            // `crate::assert::merge_matched_phase_buckets` per
880            // single-source-of-truth.
881            //
882            // `checked_add` on the running weight sum so a
883            // pathological caller (huge per-RUN sample counts
884            // across many runs) saturates to MAX rather than
885            // wrapping silently in release. On overflow we
886            // collapse to the unweighted-mean fallback so the
887            // returned value stays plausible.
888            let total_weight: usize = finite
889                .iter()
890                .enumerate()
891                .try_fold(0usize, |acc, (i, _)| acc.checked_add(weight_for(i)))
892                .unwrap_or(0);
893            if total_weight == 0 {
894                finite.iter().sum::<f64>() / (finite.len() as f64)
895            } else {
896                finite
897                    .iter()
898                    .enumerate()
899                    .map(|(i, x)| *x * (weight_for(i) as f64))
900                    .sum::<f64>()
901                    / (total_weight as f64)
902            }
903        }
904        MetricKind::Gauge(GaugeAgg::Last) | MetricKind::Timestamp => {
905            *finite.last().expect("non-empty by check above")
906        }
907        MetricKind::Gauge(GaugeAgg::Max) | MetricKind::Peak => {
908            finite.iter().copied().fold(f64::NEG_INFINITY, f64::max)
909        }
910        // A Rate is derived from its components by `derive_rate_metrics`,
911        // never reduced from a single sample slice (one slice cannot
912        // express Σnum/Σdenom). EVERY aggregation path skips Rate before
913        // reaching the reducers: `aggregate_samples_for_phase` returns
914        // None, and the per-phase build, the cross-phase merge, and both
915        // cross-RUN reducers skip Rate keys then re-derive via
916        // `derive_rate_metrics`. So reaching here is a routing bug.
917        MetricKind::Rate { .. } => unreachable!(
918            "MetricKind::Rate must be derived via derive_rate_metrics, \
919             not reduced from a sample slice"
920        ),
921        // PerPhase is derived post-merge by derive_phase_metrics (into
922        // PhaseBucket.metrics and/or PhaseCgroupStats::metrics) and is gated out
923        // of the cross-RUN ext fold (fold_ext_metrics) + the within-run reducers
924        // (is_derived), so it never reaches a sample-slice reduction. Reaching
925        // here is a routing bug (the gate or is_derived was bypassed).
926        MetricKind::PerPhase => unreachable!(
927            "MetricKind::PerPhase is derived by derive_phase_metrics, \
928             not reduced from a sample slice"
929        ),
930        // PerRunDistribution is produced run-level by
931        // populate_run_pooled_schbench_distribution / populate_run_pooled_taobench_distribution
932        // (union of the per-phase PlatStats histograms) and is gated out of BOTH
933        // the within-run reducers (is_derived) AND the cross-RUN ext fold
934        // (fold_ext_metrics skip) — its only consumer is noise_findings reading the
935        // per-run scalar. So it never reaches a sample-slice reduction; reaching
936        // here is a routing bug.
937        MetricKind::PerRunDistribution => unreachable!(
938            "MetricKind::PerRunDistribution is produced by \
939             populate_run_pooled_schbench_distribution / populate_run_pooled_taobench_distribution \
940             and noise-compared per-run, not reduced from a sample slice"
941        ),
942    })
943}
944
945/// Per-phase metric reduction with the correct semantic per
946/// [`MetricKind`].
947///
948/// Counter kinds bypass [`aggregate_samples`]'s flat-run `sum`
949/// (which is correct for cross-RUN aggregation, but wrong for
950/// cumulative-since-boot per-phase data — summing 10 samples at
951/// `[100, 150, 175, ...]` yields ~425 instead of the per-phase
952/// delta `175 - 100 = 75`) and route through
953/// [`phase_counter_delta`] instead. All other kinds use
954/// [`aggregate_samples`] verbatim, which is correct for them
955/// (Gauge avg/last/max, Peak max, Timestamp last, and DeltaSum — whose
956/// samples are ALREADY per-read deltas, so the per-phase reduction is
957/// the sum of the in-phase deltas, NOT a last-minus-first that would
958/// difference two deltas into nonsense).
959///
960/// `samples` are the per-Sample readings of `metric` collected
961/// over one phase's window of
962/// [`crate::scenario::sample::Sample`]s via `MetricDef::read_sample`
963/// once that helper is wired through.
964/// Returns `None` when every reading was `None` / `NaN`.
965///
966/// Live caller: [`crate::assert::build_phase_buckets`] folds
967/// per-phase sample slices through this entry point and the
968/// result lands on [`crate::assert::PhaseBucket::metrics`]; the
969/// host-side `evaluate_vm_result` at `src/test_support/eval/mod.rs`
970/// is the consumer that drives the call.
971pub fn aggregate_samples_for_phase(metric: &MetricDef, samples: &[f64]) -> Option<f64> {
972    match metric.kind {
973        MetricKind::Counter => phase_counter_delta(samples),
974        // Derived kinds (every `is_derived()`: Rate / Distribution / WorstLowest /
975        // WakeLatencyTailRatio / WorstCrossNodeRatio / PerPhase / PerRunDistribution)
976        // have no samples
977        // of their own: their value is produced by a post-pass
978        // (`derive_rate_metrics` / `crate::assert::populate_run_distribution_metrics`)
979        // from pooled components, not reduced from a per-phase slice. Return
980        // None so the build loop inserts no key here.
981        k if k.is_derived() => None,
982        _ => aggregate_samples(samples, metric.kind),
983    }
984}
985
986/// Per-phase reduction for [`MetricKind::Counter`]: compute the
987/// last finite sample minus the first finite sample, clamping
988/// negative results (counter reset across a scheduler restart)
989/// to 0 and emitting a `tracing::warn!` so the reset is visible
990/// in stderr. Mirrors the existing
991/// `crate::monitor`-side counter-delta clamp pattern used
992/// when reducing cumulative kernel counters across boundaries
993/// for the same reset-detection reason.
994///
995/// Edge cases (sentinel-free absent-vs-measured-zero):
996///   - 0 or 1 finite samples -> `None`. A delta is UNMEASURABLE from
997///     fewer than two points; absence here is distinct from a measured
998///     zero. The renderer's has-data signal is `PhaseBucket::sample_count`
999///     (see `expect_metric`), NOT this value, so absence loses no
1000///     diagnostic. (Previously a 1-sample phase returned a phantom
1001///     `Some(0.0)` that made a per-phase Counter claim read 0 even when
1002///     the phase fired plenty — only one freeze landed.)
1003///   - 2+ finite samples -> `Some(max(0.0, last - first))` (equal
1004///     endpoints give a REAL `Some(0.0)`: the counter did not advance).
1005///
1006/// Live caller: [`aggregate_samples_for_phase`] dispatches the
1007/// Counter variant through this entry point.
1008pub fn phase_counter_delta(samples: &[f64]) -> Option<f64> {
1009    let finite: Vec<f64> = samples.iter().copied().filter(|x| x.is_finite()).collect();
1010    match finite.as_slice() {
1011        // 0 or 1 finite samples: a delta is unmeasurable from fewer than two
1012        // points -> None (sentinel-free contract). The 2+-equal case below
1013        // still yields a real Some(0.0).
1014        [] | [_] => None,
1015        [first, .., last] => {
1016            let delta = *last - *first;
1017            if delta < 0.0 {
1018                tracing::warn!(
1019                    first = *first,
1020                    last = *last,
1021                    "phase_counter_delta: counter reset detected (last < first); clamping to 0"
1022                );
1023                Some(0.0)
1024            } else {
1025                Some(delta)
1026            }
1027        }
1028    }
1029}
1030
1031/// Derive every registered [`MetricKind::Rate`] metric in `metrics`
1032/// from its already-present numerator / denominator component values:
1033/// `metrics[rate] = metrics[numerator] / metrics[denominator]`.
1034///
1035/// This is the SOLE producer of a Rate metric's value. It runs as a
1036/// post-pass at nine aggregation sites where the components co-locate in
1037/// one map: the two per-phase builds, the cross-phase bucket merge, the
1038/// three cross-RUN ext-metrics reducers (`populate_run_ext_metrics`,
1039/// `populate_run_ext_metrics_from_phases`, `group_and_average_by`), and the
1040/// cross-CGROUP pooled re-pools
1041/// (`crate::assert::populate_run_pooled_iterations_per_cpu_sec`, run
1042/// post-`merge` at the eval layer to re-pool `iterations_per_cpu_sec` across a
1043/// run's cgroups, plus `crate::assert::populate_run_pooled_taobench` and
1044/// `crate::assert::populate_run_pooled_schbench` for the taobench/schbench
1045/// whole-run Rates). At each, the components are
1046/// pooled FIRST by their own kinds (a `Counter` numerator summed), then
1047/// the rate is re-derived — so for `Counter / Counter` the result is
1048/// `Σnumerator / Σdenominator`, the correct re-pool rather than a mean of
1049/// ready-made ratios. (The cross-CGROUP `AssertResult::merge` ext-metrics
1050/// fold itself uses worst-case polarity and is NOT a derive site — the
1051/// pooled re-pool above runs separately after it; see [`MetricKind::Rate`].)
1052///
1053/// A rate is skipped (its key left absent) when either component key is
1054/// missing, the denominator is zero, or either component is non-finite —
1055/// keeping an absent rate distinct from a real `0.0`.
1056///
1057/// INVARIANT: the producers must co-insert both components from the same
1058/// observation (both-or-neither per map) — e.g.
1059/// `build_phase_buckets_with_stimulus` inserts `total_phase_iterations` and
1060/// `total_phase_duration_sec` together under one `rate_components` guard. A
1061/// partial pair (numerator from one source, denominator from another) is
1062/// never produced today but would derive a cross-paired rate; any second
1063/// Rate must keep the co-insertion contract.
1064pub(crate) fn derive_rate_metrics(metrics: &mut std::collections::BTreeMap<String, f64>) {
1065    derive_rate_metrics_from(
1066        metrics,
1067        METRICS.iter().filter_map(|m| match m.kind {
1068            MetricKind::Rate {
1069                numerator,
1070                denominator,
1071            } => Some((m.name, numerator, denominator)),
1072            _ => None,
1073        }),
1074    );
1075}
1076
/// Derivation core behind [`derive_rate_metrics`]. The rate specs arrive
/// explicitly as `(name, numerator, denominator)` key triples, so the
/// math is unit-testable without a registered Rate metric in [`METRICS`].
///
/// Specs are processed in iteration order. For each one, `name` is set to
/// `metrics[numerator] / metrics[denominator]`. A spec is skipped — the
/// map is not touched for that `name` — when either component key is
/// missing, either component is non-finite, the denominator is zero, or
/// the quotient itself is non-finite.
pub(crate) fn derive_rate_metrics_from<'a>(
    metrics: &mut std::collections::BTreeMap<String, f64>,
    rates: impl Iterator<Item = (&'a str, &'a str, &'a str)>,
) {
    for (name, numerator, denominator) in rates {
        // Both components must be present; copy them out so the map is
        // free to be mutated below.
        let Some((dividend, divisor)) = metrics
            .get(numerator)
            .copied()
            .zip(metrics.get(denominator).copied())
        else {
            continue;
        };
        // A non-finite component or a zero denominator yields no rate.
        if !dividend.is_finite() || !divisor.is_finite() || divisor == 0.0 {
            continue;
        }
        // Two finite operands can still overflow to +/-inf (large
        // numerator over a tiny denominator), so the quotient is checked
        // as well: only a finite rate is ever stored, keeping "absent"
        // distinct from a real value.
        let quotient = dividend / divisor;
        if quotient.is_finite() {
            metrics.insert(name.to_owned(), quotient);
        }
    }
}
1102
1103impl MetricDef {
1104    /// Read this metric's value from `row`. Consults the
1105    /// accessor first (for built-in `GauntletRow` fields) and
1106    /// falls back to `row.ext_metrics[self.name]` when the
1107    /// accessor returns `None`.
1108    pub fn read(&self, row: &GauntletRow) -> Option<f64> {
1109        (self.accessor)(row).or_else(|| row.ext_metrics.get(self.name).copied())
1110    }
1111
1112    /// Read this metric's value from a single
1113    /// [`crate::scenario::sample::Sample`] — the per-sample
1114    /// analogue of [`Self::read`] used by the per-phase
1115    /// aggregator to fold a window of samples into one
1116    /// [`crate::assert::PhaseBucket`] value per metric.
1117    ///
1118    /// Returns `None` for metrics that cannot be derived from a
1119    /// single-sample shape: most ktstr metrics are computed host-side
1120    /// (cross-CPU / cross-cgroup folds, run-level distributional
1121    /// re-pools, or monitor-axis windowing), not from one sample —
1122    /// `worst_spread`, `worst_gap_ms`, `worst_migration_ratio`,
1123    /// `max_imbalance_ratio`, the `worst_*_wake_latency_*` /
1124    /// `worst_mean_run_delay_us` / `worst_run_delay_us` distributions,
1125    /// `worst_iterations_per_worker` / `worst_iterations_per_cpu_sec`,
1126    /// `worst_page_locality`, `worst_cross_node_migration_ratio`,
1127    /// `worst_wake_latency_tail_ratio` — and have no single-sample
1128    /// reading.
1129    ///
1130    /// Wired per-sample arms (return `Some`): `max_dsq_depth` /
1131    /// `avg_dsq_depth` from `sample.snapshot`'s DSQ-walker,
1132    /// `total_fallback` / `total_keep_last` from its SCX events
1133    /// region, and the IRQ/steal cross-CPU sums `total_hardirqs`,
1134    /// `total_softirq_net_rx` / `total_softirq_net_tx` /
1135    /// `total_softirq_timer` / `total_softirq_sched`,
1136    /// `total_irq_time_ns`, `total_softirq_time_ns`, and
1137    /// `total_steal_time_ns` from its `per_cpu_time`. Every other
1138    /// registered metric falls to `_ => None`
1139    /// here, for one of three reasons: (1) it is a MONITOR-axis
1140    /// signal with no guest-`Snapshot` shape (`stuck_count`,
1141    /// `max_imbalance_ratio`, `avg_imbalance_ratio`) — folded
1142    /// per-phase from `MonitorSample` windowing in
1143    /// [`crate::assert::build_phase_buckets`], NOT from read_sample;
1144    /// (2) it has no per-sample source yet (`total_migrations`,
1145    /// `total_iterations` — per-task guest counters not captured per
1146    /// tick); or (3) it is a run-level metric with no single-sample
1147    /// reading (the `worst_*` family above).
1148    /// [`crate::stats::aggregate_samples_for_phase`] surfaces an
1149    /// all-None reduction as a `None` bucket entry — distinct from
1150    /// `Some(0.0)` (a real zero) — so the bucket renderer can paint
1151    /// "no data" vs "real zero" distinctly without losing information.
1152    ///
1153    /// Live caller: [`crate::assert::build_phase_buckets`] calls
1154    /// `read_sample` once per [`crate::stats::METRICS`] entry per
1155    /// sample to collect the per-sample readings the per-phase
1156    /// aggregator folds. The host-side `evaluate_vm_result` at
1157    /// `src/test_support/eval/mod.rs` drives the chain.
1158    pub fn read_sample(&self, sample: &crate::scenario::sample::Sample<'_>) -> Option<f64> {
1159        // Per-metric dispatch by registry name. Only the metrics
1160        // whose value is genuinely a per-sample reading are wired;
1161        // every other entry in the METRICS registry is
1162        // cross-cgroup folds or run-level distributional re-pools
1163        // computed host-side at `evaluate_vm_result` time
1164        // (worst-spread / worst-gap-ms fold; the
1165        // `worst_*_wake_latency_*` distributions + worst-iterations-per-
1166        // worker efficiencies re-pool) and have no single-sample
1167        // equivalent —
1168        // they fall through to None below and the phase
1169        // aggregator paints them as absent bucket entries
1170        // (distinct from a real zero — sentinel-free contract).
1171        match self.name {
1172            // BPF dsq-state walker captures per-DSQ depth at the
1173            // freeze instant. `local_dsq_depth` is the per-CPU
1174            // local DSQ; take max across CPUs because the metric
1175            // is Peak-kind ("worst depth this instant"). DsqState
1176            // sets `origin = "local cpu N"` for local DSQs (see
1177            // src/monitor/scx_walker.rs `DsqState::origin`); the
1178            // filter pins the metric to the local-DSQ class so
1179            // global / bypass / user DSQs do not pollute the
1180            // reading.
1181            "max_dsq_depth" => sample
1182                .snapshot
1183                .dsq_states()
1184                .iter()
1185                .filter(|d| d.origin.starts_with("local cpu "))
1186                .map(|d| u64::from(d.nr))
1187                .max()
1188                .map(|v| v as f64),
1189            // Per-sample arithmetic mean of the same local-CPU
1190            // DSQ depth readings `max_dsq_depth` walks. Returns
1191            // `None` when no local DSQs are present so the bucket
1192            // renderer can distinguish "no data" from "real zero"
1193            // (sentinel-free contract); a zero-population set
1194            // never enters the mean.
1195            "avg_dsq_depth" => {
1196                let locals: Vec<f64> = sample
1197                    .snapshot
1198                    .dsq_states()
1199                    .iter()
1200                    .filter(|d| d.origin.starts_with("local cpu "))
1201                    .map(|d| u64::from(d.nr) as f64)
1202                    .collect();
1203                if locals.is_empty() {
1204                    None
1205                } else {
1206                    Some(locals.iter().sum::<f64>() / locals.len() as f64)
1207                }
1208            }
1209            // Cumulative `select_cpu_fallback` counter at the
1210            // freeze instant. The host's event-counter walker
1211            // builds a per-tick timeline of CPU-summed counters
1212            // (`EventCounterSample` at src/monitor/dump/mod.rs:477);
1213            // `.last()` gives the cumulative reading at the most
1214            // recent tick within this freeze's capture window.
1215            // Counter-kind reduction folds `last - first` across
1216            // the phase's sample window, yielding the per-phase
1217            // delta (the genuine "how many fallbacks fired during
1218            // THIS phase").
1219            "total_fallback" => sample
1220                .snapshot
1221                .event_counter_timeline()
1222                .last()
1223                .map(|e| e.select_cpu_fallback as f64),
1224            // Cumulative `dispatch_keep_last` counter; same
1225            // per-tick timeline source as `total_fallback`. Same
1226            // Counter-kind reduction semantic; per-phase delta
1227            // surfaces the keep-last count for THIS phase.
1228            "total_keep_last" => sample
1229                .snapshot
1230                .event_counter_timeline()
1231                .last()
1232                .map(|e| e.dispatch_keep_last as f64),
1233            // IRQ observability: cross-CPU SUM of the cumulative per-CPU
1234            // counter at this freeze. Counter kind takes the per-phase
1235            // last-minus-first (phase_counter_delta) over these per-freeze
1236            // totals. The per-CPU set is fixed across freezes (every CPU is
1237            // present every freeze), so the cross-CPU sum has NO task-set-change
1238            // inflation — the exact reason system_time_ns below is NOT a
1239            // read_sample arm but these are. Empty per_cpu_time -> None
1240            // (loud-absent, never a false zero). Softirq vectors index against
1241            // the compile-pinned named consts, never bare literals.
1242            "total_hardirqs" => {
1243                // saturating fold (applies to every IRQ arm below): overflow-safe
1244                // cross-CPU spatial sum of the guest per-CPU counter — a corrupt /
1245                // hostile per-CPU u64::MAX must clamp, not wrap, this per-freeze
1246                // total (the Counter delta reads it). Exact for every in-range value.
1247                let cpus = sample.snapshot.per_cpu_time();
1248                (!cpus.is_empty()).then(|| {
1249                    cpus.iter()
1250                        .map(|c| c.irqs_sum)
1251                        .fold(0u64, u64::saturating_add) as f64
1252                })
1253            }
1254            "total_softirq_net_rx" => {
1255                let cpus = sample.snapshot.per_cpu_time();
1256                (!cpus.is_empty()).then(|| {
1257                    cpus.iter()
1258                        .map(|c| c.softirqs[crate::monitor::btf_offsets::SOFTIRQ_NET_RX])
1259                        .fold(0u64, u64::saturating_add) as f64
1260                })
1261            }
1262            "total_softirq_net_tx" => {
1263                let cpus = sample.snapshot.per_cpu_time();
1264                (!cpus.is_empty()).then(|| {
1265                    cpus.iter()
1266                        .map(|c| c.softirqs[crate::monitor::btf_offsets::SOFTIRQ_NET_TX])
1267                        .fold(0u64, u64::saturating_add) as f64
1268                })
1269            }
1270            "total_softirq_timer" => {
1271                let cpus = sample.snapshot.per_cpu_time();
1272                (!cpus.is_empty()).then(|| {
1273                    cpus.iter()
1274                        .map(|c| c.softirqs[crate::monitor::btf_offsets::SOFTIRQ_TIMER])
1275                        .fold(0u64, u64::saturating_add) as f64
1276                })
1277            }
1278            "total_softirq_sched" => {
1279                let cpus = sample.snapshot.per_cpu_time();
1280                (!cpus.is_empty()).then(|| {
1281                    cpus.iter()
1282                        .map(|c| c.softirqs[crate::monitor::btf_offsets::SOFTIRQ_SCHED])
1283                        .fold(0u64, u64::saturating_add) as f64
1284                })
1285            }
1286            "total_irq_time_ns" => {
1287                let cpus = sample.snapshot.per_cpu_time();
1288                (!cpus.is_empty()).then(|| {
1289                    cpus.iter()
1290                        .map(|c| c.cpustat_irq_ns)
1291                        .fold(0u64, u64::saturating_add) as f64
1292                })
1293            }
1294            "total_softirq_time_ns" => {
1295                let cpus = sample.snapshot.per_cpu_time();
1296                (!cpus.is_empty()).then(|| {
1297                    cpus.iter()
1298                        .map(|c| c.cpustat_softirq_ns)
1299                        .fold(0u64, u64::saturating_add) as f64
1300                })
1301            }
1302            "total_steal_time_ns" => {
1303                let cpus = sample.snapshot.per_cpu_time();
1304                (!cpus.is_empty()).then(|| {
1305                    cpus.iter()
1306                        .map(|c| c.cpustat_steal_ns)
1307                        .fold(0u64, u64::saturating_add) as f64
1308                })
1309            }
1310            // `system_time_ns` / `user_time_ns` are deliberately absent
1311            // here: they are NOT read per-sample. A per-sample
1312            // cross-thread SUM followed by a Counter `last - first`
1313            // inflates whenever the captured task set changes between
1314            // freezes — a task carrying a large cumulative counter that
1315            // appears only in a LATER sample dumps its entire pre-phase
1316            // history into the delta. They are injected post-hoc as a
1317            // per-thread-GROUP delta (each tgid's first-seen-to-last-seen
1318            // `thread_group_cputime`) by
1319            // [`crate::assert::phase_group_cpu_delta`], which subtracts
1320            // each group's own first-seen total and so bounds the result
1321            // by wall-clock × cores. Still observer-free — that injector
1322            // reads the same frozen `task_struct` enrichments.
1323            //
1324            // Every other metric stays None. The 16 host-only
1325            // names (full list in the doc comment above) compute
1326            // cross-cgroup folds at `evaluate_vm_result` time and
1327            // have no per-sample equivalent until a per-cgroup
1328            // per-sample capture path lands; surfacing them via a
1329            // synthetic single-sample value would falsify the
1330            // per-phase trajectory the bucket renderer paints.
1331            _ => None,
1332        }
1333    }
1334
1335    /// Returns `true` when a metric INCREASING is the bad direction:
1336    /// [`LowerBetter`](crate::test_support::Polarity::LowerBetter),
1337    /// [`TargetValue`](crate::test_support::Polarity::TargetValue), and the
1338    /// conservative [`Unknown`](crate::test_support::Polarity::Unknown)
1339    /// default (an unclassified metric is treated as higher-is-worse so a
1340    /// real regression in it is still caught). `false` for
1341    /// [`HigherBetter`](crate::test_support::Polarity::HigherBetter) and
1342    /// [`Informational`](crate::test_support::Polarity::Informational).
1343    ///
1344    /// This is the cross-cgroup FOLD direction (max-vs-min when merging
1345    /// per-cgroup `ext_metrics`, see [`crate::assert::AssertResult::merge`])
1346    /// and the timeline-narrative direction. The PERF-DELTA VERDICT path
1347    /// uses [`classify_direction`](Self::classify_direction) instead, which
1348    /// returns `None` for `Informational` so it never gates. `Informational`
1349    /// folding as `false` (min) here is harmless: the system-wide monitor
1350    /// counters that carry it are row-level and never hit the per-cgroup
1351    /// merge.
1352    pub const fn higher_is_worse(&self) -> bool {
1353        use crate::test_support::Polarity;
1354        matches!(
1355            self.polarity,
1356            Polarity::LowerBetter | Polarity::TargetValue(_) | Polarity::Unknown
1357        )
1358    }
1359
1360    /// Verdict direction for the perf-delta comparison:
1361    /// - `Some(true)`  — an INCREASE is a regression (`LowerBetter` /
1362    ///   `TargetValue` / the conservative `Unknown` default),
1363    /// - `Some(false)` — a DECREASE is a regression (`HigherBetter`),
1364    /// - `None`        — [`Informational`](crate::test_support::Polarity::Informational):
1365    ///   directionless; the comparison records and displays it but NEVER
1366    ///   classifies it as regression/improvement and it NEVER affects the
1367    ///   exit code.
1368    ///
1369    /// The verdict sites in [`compare_partitions`] branch on the `Option`:
1370    /// `None` => informational, `Some(hiw)` => the dual-gated
1371    /// regression/improvement split. This matches
1372    /// [`higher_is_worse`](Self::higher_is_worse) for every variant EXCEPT
1373    /// `Informational` (there `false`/min-fold, here `None`/never-gated) —
1374    /// the deliberate split between "fold needs a direction" and "verdict
1375    /// must stay neutral".
    pub const fn classify_direction(&self) -> Option<bool> {
        // Pure delegation: the verdict direction is decided entirely by
        // the polarity's own `classify_direction`; no other field of this
        // `MetricDef` is consulted.
        self.polarity.classify_direction()
    }
1379
1380    /// Whether this metric's value can reach the AGGREGATE findings the
1381    /// perf-delta failure gate reads — the cross-run scalar compare
1382    /// ([`compare_partitions`], `noise_adjust == false`) or the per-run noise
1383    /// compare ([`compare_partitions_noise`], `noise_adjust == true`). `false`
1384    /// for names whose value never lands on a compared row in that mode, so a
1385    /// `--must-fail` gate on one could never fire (a silent no-op):
1386    /// - [`MetricKind::PerPhase`]: `accessor` is `None`, it has no run-level
1387    ///   producer, and it is gated out of the cross-run ext fold — its value
1388    ///   lives only in per-phase carriers, never on a `GauntletRow`, so it
1389    ///   reaches NEITHER compare. Always `false`.
1390    /// - [`MetricKind::PerRunDistribution`]: `accessor` is `None` and it is
1391    ///   gated out of the cross-run ext fold, so it is absent on the scalar
1392    ///   compare's cross-run-folded rows — but each run carries its own
1393    ///   `*_whole` scalar that [`compare_partitions_noise`] reads, so it CAN
1394    ///   gate under `--noise-adjust`. `noise_adjust`-only.
1395    ///
1396    /// Every other kind reaches both compares; whether it then produces a
1397    /// *regression* (vs an informational finding) is a separate
1398    /// direction question — see [`classify_direction`](Self::classify_direction).
1399    pub const fn gates_aggregate(&self, noise_adjust: bool) -> bool {
1400        match self.kind {
1401            MetricKind::PerPhase => false,
1402            MetricKind::PerRunDistribution => noise_adjust,
1403            _ => true,
1404        }
1405    }
1406}
1407
1408/// Unified metric registry covering all built-in and extensible metrics.
1409///
1410/// The comparison pipeline uses `higher_is_worse` to determine regression
1411/// direction, `default_abs`/`default_rel` for dual-gate significance
1412/// thresholds, and `display_unit` for formatted output. Per-test
1413/// assertion overrides can still use their own thresholds; this registry
1414/// is the source of truth for polarity and display.
1415///
1416/// `AssertResult::merge` consults `higher_is_worse` via [`metric_def`]
1417/// when folding per-cgroup `ext_metrics` into the scenario-level worst
1418/// case: `true` takes max, `false` takes min. Unknown names (not in
1419/// this registry) default to max; register a `MetricDef` here before
1420/// relying on min-polarity merge. The comparison system
1421/// ([`compare_partitions`]) uses `higher_is_worse` for delta direction.
1422///
1423/// # Metric-name triples (registry / field / DataFrame column)
1424///
1425/// Each metric is referenced by three names across the pipeline.
1426/// The registry name is the stable surface — sidecars, CI gates,
1427/// and `cargo ktstr perf-delta` output all quote it verbatim —
1428/// and cannot be renamed without silently invalidating downstream
1429/// consumers. The field name on [`GauntletRow`] and the polars
1430/// DataFrame column name are internal; they are kept terse and
1431/// match each other, but diverge from the registry name where
1432/// the domain-level wording adds context (`worst_*`, `total_*`,
1433/// `max_*`) that would be noise on an already-qualified field.
1434/// Nine divergent triples:
1435///
1436/// | Registry (`MetricDef.name`) | `GauntletRow` field | DataFrame column |
1437/// |---|---|---|
1438/// | `worst_spread` | `spread` | `spread` |
1439/// | `worst_gap_ms` | `gap_ms` | `gap_ms` |
1440/// | `total_migrations` | `migrations` | `migrations` |
1441/// | `worst_migration_ratio` | `migration_ratio` | `migration_ratio` |
1442/// | `max_imbalance_ratio` | `imbalance_ratio` | `imbalance` |
1443/// | `max_dsq_depth` | `max_dsq_depth` | `dsq_depth` |
1444/// | `stuck_count` | `stuck_count` | `stuck` |
1445/// | `total_fallback` | `fallback_count` | `fallback` |
1446/// | `total_keep_last` | `keep_last_count` | `keep_last` |
1447///
1448/// One of the remaining metrics in [`METRICS`] has matching
1449/// registry / field / DataFrame column names backed by a typed
1450/// `GauntletRow` field (`total_iterations`) and is not listed — no
1451/// translation to document.
1452///
1453/// The ten wake-latency / run-delay / iteration-efficiency / NUMA roll-ups
1454/// (`worst_p99_wake_latency_us`, `worst_median_wake_latency_us`,
1455/// `worst_wake_latency_cv`, `worst_mean_run_delay_us`,
1456/// `worst_run_delay_us`, `worst_iterations_per_worker`,
1457/// `worst_iterations_per_cpu_sec`, `worst_wake_latency_tail_ratio`,
1458/// `worst_page_locality`, `worst_cross_node_migration_ratio`) are
1459/// DERIVED kinds ([`MetricKind::Distribution`] / [`MetricKind::WorstLowest`]
1460/// / [`MetricKind::WakeLatencyTailRatio`] / [`MetricKind::WorstCrossNodeRatio`])
1461/// with NO typed `GauntletRow`
1462/// field: their accessors are `|_| None` and
1463/// `crate::assert::populate_run_distribution_metrics` re-pools their value
1464/// into `ext_metrics` post-merge, so [`MetricDef::read`] reads them through
1465/// the ext fallback.
1466///
1467/// `worst_` naming convention: it is the codebase-wide prefix for a
1468/// cross-cgroup roll-up, independent of polarity and of HOW the roll-up is
1469/// formed. Polarity-directional selectors (`worst_spread`, and the derived
1470/// `worst_cross_node_migration_ratio`, both LowerBetter → max) and
1471/// [`MetricKind::WorstLowest`] (`worst_page_locality` +
1472/// `worst_iterations_per_*`, None-aware lowest-wins where a measured 0.0
1473/// wins) both surface the most problematic cgroup; whereas
1474/// [`MetricKind::Distribution`] (`worst_p99_wake_latency_us` etc.) is the
1475/// POOLED cross-cgroup distribution over the combined sample set, NOT a
1476/// per-cgroup selection — here `worst_` is retained for sidecar /
1477/// DataFrame / CI-gate name stability rather than literal accuracy. A
1478/// `lowest_*` rename of the HigherBetter selectors was weighed and
1479/// rejected as a high-churn rename across sidecars / DataFrames / CI gates
1480/// for no readability gain.
1481///
/// Quoting the matching names (the `total_iterations` paragraph
/// above) instead of a bare count avoids silent drift on rename:
/// a metric whose registry / field / column names diverge belongs
/// in the table above, while a matching triple belongs in that
/// matching-names paragraph; a future rename that forgets to
/// migrate the metric across the boundary surfaces there as a
/// stale list rather than a wrong count.
1488///
1489/// Consumers that cross the registry / DataFrame boundary should
1490/// go through [`MetricDef::read`] / the accessor closure rather
1491/// than hand-translating by string. The four-name mapping for
1492/// `worst_spread` specifically is documented in detail on the
1493/// [`GauntletRow::spread`] field (adds the
1494/// [`ScenarioStats::worst_spread`](crate::assert::ScenarioStats::worst_spread)
1495/// upstream source as a fourth name).
1496/// Registry names for the schbench per-phase metrics ([`MetricKind::PerPhase`]).
1497/// Shared by the [`METRICS`] entries below and the schbench per-phase derivation
1498/// (`crate::assert::derive_phase_metrics`) so the registered name and
1499/// the key the derivation writes into `crate::assert::PhaseBucket::metrics` are
1500/// one source of truth. Latency keys are µs (the unit `plat` buckets in);
1501/// sched-delay keys are µs (converted from ns at derivation); loop_count is a
1502/// bare count.
// Wakeup-latency keys (µs).
pub(crate) const SCHBENCH_WAKEUP_P50_US: &str = "wakeup_p50_latency_us";
pub(crate) const SCHBENCH_WAKEUP_P90_US: &str = "wakeup_p90_latency_us";
pub(crate) const SCHBENCH_WAKEUP_P99_US: &str = "wakeup_p99_latency_us";
pub(crate) const SCHBENCH_WAKEUP_P999_US: &str = "wakeup_p999_latency_us";
// Request-latency keys (µs).
pub(crate) const SCHBENCH_REQUEST_P50_US: &str = "request_p50_latency_us";
pub(crate) const SCHBENCH_REQUEST_P90_US: &str = "request_p90_latency_us";
pub(crate) const SCHBENCH_REQUEST_P99_US: &str = "request_p99_latency_us";
pub(crate) const SCHBENCH_REQUEST_P999_US: &str = "request_p999_latency_us";
// Sched-delay keys (µs, converted from ns at derivation), one per thread
// role: message and worker.
pub(crate) const SCHBENCH_SCHED_DELAY_MSG_US: &str = "sched_delay_msg_us";
pub(crate) const SCHBENCH_SCHED_DELAY_WORKER_US: &str = "sched_delay_worker_us";
// Per-phase loop count (a bare count, no unit).
pub(crate) const SCHBENCH_LOOP_COUNT: &str = "schbench_loop_count";
// schbench WHOLE-RUN Class-3 keys (loop count + role-separate run-delay gate
// Rates) for perf-delta --noise-adjust. Re-pooled run-level by
// `populate_run_pooled_schbench` from the per-phase per-cgroup
// `SchbenchPhaseStats` raw pairs (Σ over all phases+cgroups). The four
// `total_schbench_*` run-delay/pcount Counters are the rate components
// (RENDER_SUPPRESSED) for the two sample-weighted Σrun_delay/Σpcount gate Rates
// (workload-scoped siblings of the system-wide `total_run_delay_ns_per_sched`);
// the message and worker thread ROLES pool separately (different per-schedule
// wait populations). The per-phase `sched_delay_msg/worker_us` is the SAME
// Σrun_delay_ns/Σpcount per-schedule mean at phase scope (NOT schbench's native
// mean-of-per-thread-means, a separate whole-run SchbenchResult stat) and stays
// PerPhase display-only — only these Rates gate, no double-count.
// `total_schbench_loops` is the whole-run loop Counter (distinct from the
// per-phase `schbench_loop_count`).
//
// Rate components: the four per-role run-delay / pcount Counters.
pub(crate) const TOTAL_SCHBENCH_MSG_RUN_DELAY_NS: &str = "total_schbench_msg_run_delay_ns";
pub(crate) const TOTAL_SCHBENCH_MSG_PCOUNT: &str = "total_schbench_msg_pcount";
pub(crate) const TOTAL_SCHBENCH_WORKER_RUN_DELAY_NS: &str = "total_schbench_worker_run_delay_ns";
pub(crate) const TOTAL_SCHBENCH_WORKER_PCOUNT: &str = "total_schbench_worker_pcount";
// Whole-run loop Counter.
pub(crate) const TOTAL_SCHBENCH_LOOPS: &str = "total_schbench_loops";
// The two gate Rates (Σrun_delay/Σpcount), one per thread role.
pub(crate) const SCHBENCH_MSG_RUN_DELAY_NS_PER_SCHED: &str = "schbench_msg_run_delay_ns_per_sched";
pub(crate) const SCHBENCH_WORKER_RUN_DELAY_NS_PER_SCHED: &str =
    "schbench_worker_run_delay_ns_per_sched";
// schbench WHOLE-RUN distributional keys for perf-delta --noise-adjust:
// each per-phase percentile/min/max re-pooled run-level by
// `populate_run_pooled_schbench_distribution` (union of the per-phase per-cgroup
// PlatStats histograms, percentile re-derived over the union). MetricKind::
// PerRunDistribution: noise-compared per-run, NEVER cross-run folded. `*_whole`
// names keep them registry-distinct from the per-phase PerPhase keys above (one
// name = one kind). Latency LowerBetter; rps HigherBetter.
//
// Whole-run wakeup latency: percentiles, then min / max.
pub(crate) const SCHBENCH_WAKEUP_P50_US_WHOLE: &str = "wakeup_p50_latency_us_whole";
pub(crate) const SCHBENCH_WAKEUP_P90_US_WHOLE: &str = "wakeup_p90_latency_us_whole";
pub(crate) const SCHBENCH_WAKEUP_P99_US_WHOLE: &str = "wakeup_p99_latency_us_whole";
pub(crate) const SCHBENCH_WAKEUP_P999_US_WHOLE: &str = "wakeup_p999_latency_us_whole";
pub(crate) const SCHBENCH_WAKEUP_MIN_US_WHOLE: &str = "wakeup_min_latency_us_whole";
pub(crate) const SCHBENCH_WAKEUP_MAX_US_WHOLE: &str = "wakeup_max_latency_us_whole";
// Whole-run request latency: percentiles, then min / max.
pub(crate) const SCHBENCH_REQUEST_P50_US_WHOLE: &str = "request_p50_latency_us_whole";
pub(crate) const SCHBENCH_REQUEST_P90_US_WHOLE: &str = "request_p90_latency_us_whole";
pub(crate) const SCHBENCH_REQUEST_P99_US_WHOLE: &str = "request_p99_latency_us_whole";
pub(crate) const SCHBENCH_REQUEST_P999_US_WHOLE: &str = "request_p999_latency_us_whole";
pub(crate) const SCHBENCH_REQUEST_MIN_US_WHOLE: &str = "request_min_latency_us_whole";
pub(crate) const SCHBENCH_REQUEST_MAX_US_WHOLE: &str = "request_max_latency_us_whole";
// Whole-run rps (HigherBetter): percentiles, then min / max.
pub(crate) const SCHBENCH_RPS_P20_WHOLE: &str = "rps_p20_whole";
pub(crate) const SCHBENCH_RPS_P50_WHOLE: &str = "rps_p50_whole";
pub(crate) const SCHBENCH_RPS_P90_WHOLE: &str = "rps_p90_whole";
pub(crate) const SCHBENCH_RPS_MIN_WHOLE: &str = "rps_min_whole";
pub(crate) const SCHBENCH_RPS_MAX_WHOLE: &str = "rps_max_whole";
// taobench per-phase metric keys (the WorkType::Taobench engine's qps + hit
// ratios, derived per-phase by write_taobench_scalars; MetricKind::PerPhase).
// total/fast qps are HigherBetter; slow_qps + hit_ratio + hit_rate are
// Informational (slow_qps is a component, not a direction; the hit numbers are
// run-validity signals, not regression directions). The two hit keys are distinct
// axes: `taobench_hit_ratio` is RESPONSE-time (fast_ops / (fast_ops + slow_ops),
// the whole-run analog is `taobench_hit_fraction`) and `taobench_hit_rate` is
// COMMAND-time (1 - get_misses / get_cmds, the whole-run analog is
// `taobench_command_hit_rate`). Under open-loop arrival the two diverge
// (request-time vs response-time).
//
// Per-phase qps keys: total and fast are HigherBetter, slow is Informational.
pub(crate) const TAOBENCH_TOTAL_QPS: &str = "taobench_total_qps";
pub(crate) const TAOBENCH_FAST_QPS: &str = "taobench_fast_qps";
pub(crate) const TAOBENCH_SLOW_QPS: &str = "taobench_slow_qps";
/// Response-time per-phase hit ratio: fast_ops / (fast_ops + slow_ops).
pub(crate) const TAOBENCH_HIT_RATIO: &str = "taobench_hit_ratio";
/// Command-time per-phase hit rate: 1 - get_misses / get_cmds.
pub(crate) const TAOBENCH_HIT_RATE: &str = "taobench_hit_rate";
// taobench per-phase open-loop SERVE-LATENCY percentiles (µs): the
// coordinated-omission serve latency distribution per phase (PerPhase,
// LowerBetter), pooled cross-cgroup + re-derived by `write_taobench_scalars`.
// Absent in closed loop (no serve samples).
//
// Percentile keys first, then the min / max keys.
pub(crate) const TAOBENCH_SERVE_P50_US: &str = "taobench_serve_p50_us";
pub(crate) const TAOBENCH_SERVE_P90_US: &str = "taobench_serve_p90_us";
pub(crate) const TAOBENCH_SERVE_P99_US: &str = "taobench_serve_p99_us";
pub(crate) const TAOBENCH_SERVE_P999_US: &str = "taobench_serve_p999_us";
pub(crate) const TAOBENCH_SERVE_MIN_US: &str = "taobench_serve_min_us";
pub(crate) const TAOBENCH_SERVE_MAX_US: &str = "taobench_serve_max_us";
// taobench WHOLE-RUN Rate component + Rate keys (the run-level qps + hit
// fraction, pooled cross-cgroup by `populate_run_pooled_taobench` and derived by
// `derive_rate_metrics`). Distinct from the per-phase `taobench_*_qps` above
// (`MetricKind::PerPhase`, invisible to the whole-run cross-run fold): these are
// registered `Rate`/`Counter` METRICS so they reach perf-delta `--noise-adjust`
// spread. The four `total_taobench_*` Counters are the rate components (their
// `total_` prefix satisfies the Counter naming gate); they are
// `RENDER_SUPPRESSED_COMPONENTS` so the compare output shows the rates, not the
// raw counts.
/// Whole-run Counter rate-component key: total ops.
/// NOTE(review): presumably the numerator of `taobench_total_ops_per_sec` —
/// confirm against that Rate's registry entry.
pub(crate) const TOTAL_TAOBENCH_OPS: &str = "total_taobench_ops";
/// Whole-run Counter rate-component key: fast ops.
/// NOTE(review): presumably the numerator of `taobench_fast_ops_per_sec` —
/// confirm against that Rate's registry entry.
pub(crate) const TOTAL_TAOBENCH_FAST_OPS: &str = "total_taobench_fast_ops";
/// Whole-run Counter rate-component key: slow ops.
/// NOTE(review): presumably the numerator of `taobench_slow_ops_per_sec` —
/// confirm against that Rate's registry entry.
pub(crate) const TOTAL_TAOBENCH_SLOW_OPS: &str = "total_taobench_slow_ops";
/// Whole-run Counter rate-component key: wall-clock seconds.
/// NOTE(review): presumably the shared denominator of the
/// `taobench_*_per_sec` Rates — confirm against their registry entries.
pub(crate) const TOTAL_TAOBENCH_WALL_SEC: &str = "total_taobench_wall_sec";
/// Whole-run Rate key: total ops per second.
pub(crate) const TAOBENCH_TOTAL_OPS_PER_SEC: &str = "taobench_total_ops_per_sec";
/// Whole-run Rate key: fast ops per second.
pub(crate) const TAOBENCH_FAST_OPS_PER_SEC: &str = "taobench_fast_ops_per_sec";
/// Whole-run Rate key: slow ops per second.
pub(crate) const TAOBENCH_SLOW_OPS_PER_SEC: &str = "taobench_slow_ops_per_sec";
/// Whole-run Rate key: the run-level (response-time) hit fraction.
pub(crate) const TAOBENCH_HIT_FRACTION: &str = "taobench_hit_fraction";
// taobench WHOLE-RUN open-loop serve-latency keys (µs): percentiles plus
// min/max over the union of the per-phase per-cgroup serve histograms,
// re-derived run-level (`MetricKind::PerRunDistribution` — noise-compared
// per-run, never cross-run folded), pooled by
// `populate_run_pooled_taobench_distribution`. `*_whole` names, distinct from
// the per-phase `taobench_serve_*_us` keys above.
/// Whole-run serve-latency p50 (median) key, µs.
pub(crate) const TAOBENCH_SERVE_P50_US_WHOLE: &str = "taobench_serve_p50_us_whole";
/// Whole-run serve-latency p90 key, µs.
pub(crate) const TAOBENCH_SERVE_P90_US_WHOLE: &str = "taobench_serve_p90_us_whole";
/// Whole-run serve-latency p99 key, µs.
pub(crate) const TAOBENCH_SERVE_P99_US_WHOLE: &str = "taobench_serve_p99_us_whole";
/// Whole-run serve-latency p99.9 key, µs.
pub(crate) const TAOBENCH_SERVE_P999_US_WHOLE: &str = "taobench_serve_p999_us_whole";
/// Whole-run serve-latency minimum key, µs.
pub(crate) const TAOBENCH_SERVE_MIN_US_WHOLE: &str = "taobench_serve_min_us_whole";
/// Whole-run serve-latency maximum key, µs.
pub(crate) const TAOBENCH_SERVE_MAX_US_WHOLE: &str = "taobench_serve_max_us_whole";
// taobench WHOLE-RUN command-time hit: the request-time hit rate (distinct from
// the response-time `taobench_hit_fraction`; the two diverge under open-loop
// arrival). hits = cmds − misses, pooled cross-cgroup by
// `populate_run_pooled_taobench`; `taobench_command_hit_rate` = Σhits/Σcmds
// (`total_` Counter components satisfy the naming gate; the Rate ends in `_rate`).
// Whole-run Rates use their rate-form name, never the `_whole` suffix (which is
// the PerRunDistribution marker) — the same convention as `taobench_*_per_sec`.
/// Whole-run Counter component key: pooled GET command count (Σcmds, the
/// denominator of `taobench_command_hit_rate`).
pub(crate) const TOTAL_TAOBENCH_GET_CMDS: &str = "total_taobench_get_cmds";
/// Whole-run Counter component key: pooled GET hit count (hits = cmds − misses;
/// Σhits, the numerator of `taobench_command_hit_rate`).
pub(crate) const TOTAL_TAOBENCH_GET_HITS: &str = "total_taobench_get_hits";
/// Whole-run Rate key: command-time hit rate, Σhits / Σcmds.
pub(crate) const TAOBENCH_COMMAND_HIT_RATE: &str = "taobench_command_hit_rate";
// Per-phase latency min/max (schbench's `min=`/`max=` table footer,
// `schbench.c:579`): the per-phase PlatStats already carries them, so these are
// emitted from `q.min`/`q.max`. LowerBetter (a higher min/max latency is worse).
/// Per-phase minimum wakeup latency key, µs.
pub(crate) const SCHBENCH_WAKEUP_MIN_US: &str = "wakeup_min_latency_us";
/// Per-phase maximum wakeup latency key, µs.
pub(crate) const SCHBENCH_WAKEUP_MAX_US: &str = "wakeup_max_latency_us";
/// Per-phase minimum request latency key, µs.
pub(crate) const SCHBENCH_REQUEST_MIN_US: &str = "request_min_latency_us";
/// Per-phase maximum request latency key, µs.
pub(crate) const SCHBENCH_REQUEST_MAX_US: &str = "request_max_latency_us";
// Per-phase achieved-RPS distribution (schbench's RPS table, PLIST_FOR_RPS =
// 20/50/90, `schbench.c:130`) + its min/max. HigherBetter (more requests/sec =
// more throughput); the min/max INVERT the latency polarity (a higher worst-
// second rate is better). A per-second RATE, so no `_us` suffix.
/// Per-phase achieved-RPS 20th-percentile key.
pub(crate) const SCHBENCH_RPS_P20: &str = "rps_p20";
/// Per-phase achieved-RPS 50th-percentile (median) key.
pub(crate) const SCHBENCH_RPS_P50: &str = "rps_p50";
/// Per-phase achieved-RPS 90th-percentile key.
pub(crate) const SCHBENCH_RPS_P90: &str = "rps_p90";
/// Per-phase achieved-RPS minimum key.
pub(crate) const SCHBENCH_RPS_MIN: &str = "rps_min";
/// Per-phase achieved-RPS maximum key.
pub(crate) const SCHBENCH_RPS_MAX: &str = "rps_max";
1641
1642pub static METRICS: &[MetricDef] = &[
1643    MetricDef {
1644        // `"worst_spread"` is the wire/surface name — emitted in
1645        // sidecars, referenced by CI gates, and printed by
1646        // `cargo ktstr perf-delta`. Internally the field on
1647        // `GauntletRow` is named `spread` and the polars DataFrame
1648        // column keeps that shorter name; see the doc on
1649        // `GauntletRow.spread` for the rationale (rename-of-
1650        // registry-name is not safe because existing gate configs
1651        // match this string by value).
1652        name: "worst_spread",
1653        polarity: crate::test_support::Polarity::LowerBetter,
1654        kind: MetricKind::Gauge(GaugeAgg::Last),
1655        default_abs: 5.0,
1656        default_rel: 0.25,
1657        display_unit: "%",
1658        accessor: |r| Some(r.spread),
1659    },
1660    MetricDef {
1661        name: "worst_gap_ms",
1662        polarity: crate::test_support::Polarity::LowerBetter,
1663        kind: MetricKind::Peak,
1664        default_abs: 500.0,
1665        default_rel: 0.50,
1666        display_unit: "ms",
1667        accessor: |r| Some(r.gap_ms as f64),
1668    },
1669    MetricDef {
1670        name: "total_migrations",
1671        polarity: crate::test_support::Polarity::LowerBetter,
1672        kind: MetricKind::Counter,
1673        default_abs: 2.0,
1674        default_rel: 0.30,
1675        display_unit: "",
1676        accessor: |r| Some(r.migrations as f64),
1677    },
1678    MetricDef {
1679        name: "worst_migration_ratio",
1680        polarity: crate::test_support::Polarity::LowerBetter,
1681        kind: MetricKind::Gauge(GaugeAgg::Last),
1682        default_abs: 0.05,
1683        default_rel: 0.20,
1684        display_unit: "",
1685        accessor: |r| Some(r.migration_ratio),
1686    },
1687    MetricDef {
1688        name: "max_imbalance_ratio",
1689        polarity: crate::test_support::Polarity::LowerBetter,
1690        kind: MetricKind::Peak,
1691        default_abs: 1.0,
1692        default_rel: 0.25,
1693        display_unit: "x",
1694        accessor: |r| Some(r.imbalance_ratio),
1695    },
1696    MetricDef {
1697        // Per-phase mean of per-tick imbalance_ratio observations
1698        // (max(nr_running) / max(1, min(nr_running)) per CPU; full-
1699        // class count). Sourced from MonitorSample (not Snapshot)
1700        // because Snapshot exposes only scx_rq.nr_running (SCX-
1701        // only) while imbalance is meaningful only across the
1702        // full per-CPU runqueue. Populated by build_phase_buckets
1703        // via per-phase MonitorSample windowing — bypasses
1704        // MetricDef::read_sample (which dispatches off
1705        // sample.snapshot only) per the data-axis split. Kind
1706        // Gauge(Avg) folds across cgroups via weighted-mean per
1707        // sample_count; Polarity::LowerBetter mirrors the Peak
1708        // sibling.
1709        name: "avg_imbalance_ratio",
1710        polarity: crate::test_support::Polarity::LowerBetter,
1711        kind: MetricKind::Gauge(GaugeAgg::Avg),
1712        default_abs: 0.5,
1713        default_rel: 0.25,
1714        display_unit: "x",
1715        accessor: |_| None,
1716    },
1717    MetricDef {
1718        name: "max_dsq_depth",
1719        polarity: crate::test_support::Polarity::LowerBetter,
1720        kind: MetricKind::Peak,
1721        default_abs: 10.0,
1722        default_rel: 0.50,
1723        display_unit: "",
1724        accessor: |r| Some(r.max_dsq_depth as f64),
1725    },
1726    MetricDef {
1727        // Per-sample mean of local-CPU DSQ depths sourced from
1728        // the BPF DSQ walker (Snapshot::dsq_states() filtered by
1729        // `origin.starts_with("local cpu ")`), reduced per phase
1730        // via the Gauge(Avg) path. The DSQ-walker axis is the
1731        // authoritative source — it reads the actual scheduler
1732        // dispatch queues. The legacy Timeline::build path
1733        // computed avg_dsq_depth from MonitorSample.CpuSnapshot.
1734        // local_dsq_depth (a per-CPU rq-level metric); the new
1735        // DSQ-walker axis is more accurate for an scx scheduler
1736        // because it observes the dispatch queue directly rather
1737        // than the rq-level reflection.
1738        //
1739        // Truncation caveat: when scx_walker hits MAX_NODES_PER_LIST
1740        // (per src/monitor/scx_walker.rs), the captured DSQs are a
1741        // prefix of the full set. The mean then shifts toward the
1742        // captured prefix's central tendency; a 64-CPU box capturing
1743        // only 20 DSQs reports the mean of those 20, not the mean
1744        // over 64. max_dsq_depth (the Peak sibling) is robust to
1745        // this (max-of-captured surfaces the deepest queue ever
1746        // captured); avg_dsq_depth has no such monotonicity. If
1747        // walker truncation becomes routine, add a denom-aware
1748        // version that sums-then-divides by the topology's
1749        // expected local-CPU count.
1750        //
1751        // Accessor falls back to ext_metrics (no typed GauntletRow
1752        // field; promoting to typed is gated on cross-RUN
1753        // aggregation needs surfacing).
1754        name: "avg_dsq_depth",
1755        polarity: crate::test_support::Polarity::LowerBetter,
1756        kind: MetricKind::Gauge(GaugeAgg::Avg),
1757        default_abs: 5.0,
1758        default_rel: 0.50,
1759        display_unit: "",
1760        accessor: |_| None,
1761    },
1762    MetricDef {
1763        name: "stuck_count",
1764        polarity: crate::test_support::Polarity::LowerBetter,
1765        kind: MetricKind::Counter,
1766        // abs=1.0 (vs 5.0 for the event counters below): one additional
1767        // scheduler stall is high-signal, so a delta of a single whole
1768        // stall — gated by the 0.50 rel threshold — is worth flagging.
1769        default_abs: 1.0,
1770        default_rel: 0.50,
1771        display_unit: "",
1772        accessor: |r| Some(r.stuck_count),
1773    },
1774    MetricDef {
1775        name: "total_fallback",
1776        polarity: crate::test_support::Polarity::LowerBetter,
1777        kind: MetricKind::Counter,
1778        default_abs: 5.0,
1779        default_rel: 0.30,
1780        // Integer event count, not a rate — the source field on
1781        // `MonitorSummary::event_deltas.total_fallback` is a cumulative
1782        // delta across the run, not per-second. Empty unit matches the
1783        // other counter metrics (`stuck_count`, `total_iterations`,
1784        // `total_migrations`).
1785        display_unit: "",
1786        accessor: |r| Some(r.fallback_count as f64),
1787    },
1788    MetricDef {
1789        name: "total_keep_last",
1790        polarity: crate::test_support::Polarity::LowerBetter,
1791        kind: MetricKind::Counter,
1792        default_abs: 5.0,
1793        default_rel: 0.30,
1794        // Integer event count, not a rate — see `total_fallback`
1795        // rationale above. Source field is
1796        // `MonitorSummary::event_deltas.total_dispatch_keep_last`.
1797        display_unit: "",
1798        accessor: |r| Some(r.keep_last_count as f64),
1799    },
1800    // -- System-wide schedstat aggregates. Read host-side from guest memory at
1801    // -- freeze (zero observer effect) via `MonitorSummary::schedstat_deltas`
1802    // -- (per-rq `struct rq` schedstat fields summed across CPUs over the run);
1803    // -- `sidecar_to_row` inserts them into `GauntletRow::ext_metrics` so the
1804    // -- `|_| None` accessors surface them through the ext fallback. The seven
1805    // -- raw counters are `Polarity::Informational` — directionless (more
1806    // -- wakeups / context-switches / yields is neither inherently better nor
1807    // -- worse), so they are SHOWN but NEVER gated. They are also
1808    // -- WINDOW-DURATION- and LOAD-CONFOUNDED raw sums (a longer monitor window
1809    // -- or more offered runnable work inflates them independent of the
1810    // -- scheduler) — a second reason they are Informational, not LowerBetter:
1811    // -- a large raw delta is not a regression. The duration- and load-robust
1812    // -- GATED signals are the per-schedule mean (`total_run_delay_ns_per_sched`)
1813    // -- and the locality ratio (`ttwu_local_fraction`) derived below; four of
1814    // -- the raw counters double as those Rates' Counter components.
1815    MetricDef {
1816        // Numerator of `total_run_delay_ns_per_sched`. Cumulative runqueue-wait
1817        // delay (ns) across all tasks + all CPUs (`rq.rq_sched_info.run_delay`).
1818        // `total_` prefix satisfies the Counter naming gate.
1819        name: "total_run_delay",
1820        polarity: crate::test_support::Polarity::Informational,
1821        kind: MetricKind::Counter,
1822        default_abs: 1000.0,
1823        default_rel: 0.10,
1824        display_unit: "ns",
1825        accessor: |_| None,
1826    },
1827    MetricDef {
1828        // Denominator of `total_run_delay_ns_per_sched`. Count of non-idle task
1829        // arrivals (`rq.rq_sched_info.pcount`) — the number of schedules the
1830        // run-delay accrued over.
1831        name: "total_pcount",
1832        polarity: crate::test_support::Polarity::Informational,
1833        kind: MetricKind::Counter,
1834        default_abs: 1.0,
1835        default_rel: 0.10,
1836        display_unit: "",
1837        accessor: |_| None,
1838    },
1839    MetricDef {
1840        // schedule() invocation count (`rq.sched_count` — incremented once per
1841        // __schedule() call, a superset of context switches since re-picking the
1842        // same task still counts). Informational: more scheduler entries can
1843        // mean responsiveness OR thrashing — no direction.
1844        name: "total_sched_count",
1845        polarity: crate::test_support::Polarity::Informational,
1846        kind: MetricKind::Counter,
1847        default_abs: 1.0,
1848        default_rel: 0.10,
1849        display_unit: "",
1850        accessor: |_| None,
1851    },
1852    MetricDef {
1853        // `sched_yield()` call count (`rq.yld_count`). Informational (workload
1854        // behavior, not a scheduler-quality signal).
1855        name: "total_yld_count",
1856        polarity: crate::test_support::Polarity::Informational,
1857        kind: MetricKind::Counter,
1858        default_abs: 1.0,
1859        default_rel: 0.10,
1860        display_unit: "",
1861        accessor: |_| None,
1862    },
1863    MetricDef {
1864        // Go-idle count (`rq.sched_goidle`): times a CPU picked the idle task.
1865        // Informational (good utilization vs wasted idle — ambiguous).
1866        name: "total_sched_goidle",
1867        polarity: crate::test_support::Polarity::Informational,
1868        kind: MetricKind::Counter,
1869        default_abs: 1.0,
1870        default_rel: 0.10,
1871        display_unit: "",
1872        accessor: |_| None,
1873    },
1874    MetricDef {
1875        // Denominator of `ttwu_local_fraction`. Total wakeups (`rq.ttwu_count`)
1876        // — workload activity. Informational.
1877        name: "total_ttwu_count",
1878        polarity: crate::test_support::Polarity::Informational,
1879        kind: MetricKind::Counter,
1880        default_abs: 1.0,
1881        default_rel: 0.10,
1882        display_unit: "",
1883        accessor: |_| None,
1884    },
1885    MetricDef {
1886        // Numerator of `ttwu_local_fraction`. Wakeups kept on the waking CPU
1887        // (`rq.ttwu_local`). Informational on its own; the locality RATIO below
1888        // carries the direction.
1889        name: "total_ttwu_local",
1890        polarity: crate::test_support::Polarity::Informational,
1891        kind: MetricKind::Counter,
1892        default_abs: 1.0,
1893        default_rel: 0.10,
1894        display_unit: "",
1895        accessor: |_| None,
1896    },
1897    MetricDef {
1898        // GATED. System-wide per-schedule MEAN runqueue-wait delay =
1899        // Σrun_delay / Σpcount, re-derived across CPUs/runs by
1900        // `derive_rate_metrics` (the `Rate` kind's `MergeKind::Recompute` pools
1901        // the components — never a mean-of-ratios). Duration- and load-robust
1902        // (per-EVENT, not per-time): the system-wide analog of the
1903        // workload-scoped per-task `mean_run_delay_us` (schbench's
1904        // `mean_sched_delay = run_delay/pcount`). LowerBetter. Absent when
1905        // `total_pcount` is 0 (no schedules) or CONFIG_SCHEDSTATS is off
1906        // (components absent).
1907        name: "total_run_delay_ns_per_sched",
1908        polarity: crate::test_support::Polarity::LowerBetter,
1909        kind: MetricKind::Rate {
1910            numerator: "total_run_delay",
1911            denominator: "total_pcount",
1912        },
1913        default_abs: 100.0,
1914        default_rel: 0.15,
1915        display_unit: "ns",
1916        accessor: |_| None,
1917    },
1918    MetricDef {
1919        // GATED. Wakeup LOCALITY = Σttwu_local / Σttwu_count, re-derived by
1920        // `derive_rate_metrics`. A fraction in [0, 1]: the share of wakeups kept
1921        // on the waking CPU (better cache locality, fewer cross-CPU hops on
1922        // wakeup). HigherBetter. Absent when `total_ttwu_count` is 0 or
1923        // CONFIG_SCHEDSTATS is off.
1924        name: "ttwu_local_fraction",
1925        polarity: crate::test_support::Polarity::HigherBetter,
1926        kind: MetricKind::Rate {
1927            numerator: "total_ttwu_local",
1928            denominator: "total_ttwu_count",
1929        },
1930        default_abs: 0.05,
1931        default_rel: 0.10,
1932        display_unit: "",
1933        accessor: |_| None,
1934    },
1935    MetricDef {
        // SHOWN, NOT gated (Informational — rationale at the end of this comment).
        // Go-idle FRACTION = Σsched_goidle / Σsched_count, re-derived by
1937        // `derive_rate_metrics`. A fraction in [0, 1]: the share of `schedule()`
1938        // calls that picked the idle task (the CPU found nothing runnable).
1939        // Load-normalized (per-schedule, not per-time), so it is duration- AND
1940        // arrival-rate-stable — the genuinely-useful-for-spread schedstat rate
1941        // (a bare per-second rate carries the same spread as the raw total when
1942        // cohort runs share a duration, so it adds nothing at equal duration).
1943        // Informational: a high idle fraction is ambiguous — efficient when no
1944        // runnable work exists, but a starvation symptom when runnable work is
1945        // not dispatched — so it surfaces in `--noise-adjust` spread but does
1946        // not gate a regression verdict. Absent when `total_sched_count` is 0
1947        // (no schedules) or CONFIG_SCHEDSTATS is off (components absent).
1948        name: "sched_goidle_fraction",
1949        polarity: crate::test_support::Polarity::Informational,
1950        kind: MetricKind::Rate {
1951            numerator: "total_sched_goidle",
1952            denominator: "total_sched_count",
1953        },
1954        default_abs: 0.05,
1955        default_rel: 0.10,
1956        display_unit: "",
1957        accessor: |_| None,
1958    },
1959    // Per-second schedstat rates: each total_* schedstat Counter divided by
1960    // total_schedstat_wall_sec (the monitor-window span). Unlike the
1961    // per-schedule ratios above (total_run_delay_ns_per_sched / *_fraction,
1962    // load-normalized per-EVENT), these are per-TIME — duration-normalized so
1963    // --noise-adjust can compare cohorts whose runs differ in wall duration
1964    // (raw counts are not comparable across differing durations; per-second
1965    // rates are). At EQUAL duration a per-second rate ranks identically to the
1966    // raw count, so it adds nothing then — its value is the differing-duration
1967    // case. Rate kind => cross-run Σnumerator/Σdenominator (duration-weighted),
1968    // NOT a mean of per-run rates. All Informational (raw activity rates carry
1969    // no universal better-direction) except run_delay_per_sec (latency,
1970    // LowerBetter). Absent when CONFIG_SCHEDSTATS is off or the window is
1971    // degenerate (denominator absent/0).
1972    MetricDef {
1973        // Hidden rate-denominator component (NOT user-facing): the schedstat
1974        // monitor-window span in seconds, co-inserted both-or-neither with the
1975        // total_* schedstat counters in sidecar_to_row. Counter so it survives
1976        // the cross-RUN Sum-fold (Σcount / Σsec re-derives). Distinct from
1977        // total_phase_wall_sec (the per-phase IRQ-capture window) — schedstat's
1978        // window is the monitor-sample span, a different measurement.
1979        name: "total_schedstat_wall_sec",
1980        polarity: crate::test_support::Polarity::Informational,
1981        kind: MetricKind::Counter,
1982        default_abs: 0.1,
1983        default_rel: 0.30,
1984        display_unit: "s",
1985        accessor: |_| None,
1986    },
1987    MetricDef {
1988        // Σrun_delay / Σwindow-seconds — total scheduling-wait delay accrued per
1989        // second (ns/s). LowerBetter (less accrued wait = better). Distinct from
1990        // total_run_delay_ns_per_sched (ns PER SCHEDULE): _per_sec is per-time,
1991        // _ns_per_sched is per-event.
1992        name: "run_delay_per_sec",
1993        polarity: crate::test_support::Polarity::LowerBetter,
1994        kind: MetricKind::Rate {
1995            numerator: "total_run_delay",
1996            denominator: "total_schedstat_wall_sec",
1997        },
1998        default_abs: 1000.0,
1999        default_rel: 0.30,
2000        display_unit: "ns/s",
2001        accessor: |_| None,
2002    },
2003    MetricDef {
2004        // Σpcount / Σwindow-seconds — task-arrival (non-idle schedule) rate per
2005        // second. Informational (scheduling-activity throughput tracks offered
2006        // load + scheduler behavior together, no universal direction).
2007        name: "pcount_per_sec",
2008        polarity: crate::test_support::Polarity::Informational,
2009        kind: MetricKind::Rate {
2010            numerator: "total_pcount",
2011            denominator: "total_schedstat_wall_sec",
2012        },
2013        default_abs: 1.0,
2014        default_rel: 0.30,
2015        display_unit: "/s",
2016        accessor: |_| None,
2017    },
2018    MetricDef {
2019        // Σsched_count / Σwindow-seconds — schedule() invocations per second
2020        // (rq.sched_count increments once per __schedule() call, a superset of
2021        // context switches since re-picking the same task still counts).
2022        // Informational. The per-second sibling of the precomputed struct rate
2023        // that was retired; cross-run-foldable here (Σnum/Σden), the struct
2024        // field was not.
2025        name: "sched_count_per_sec",
2026        polarity: crate::test_support::Polarity::Informational,
2027        kind: MetricKind::Rate {
2028            numerator: "total_sched_count",
2029            denominator: "total_schedstat_wall_sec",
2030        },
2031        default_abs: 1.0,
2032        default_rel: 0.30,
2033        display_unit: "/s",
2034        accessor: |_| None,
2035    },
2036    MetricDef {
2037        // Σyld_count / Σwindow-seconds — sched_yield() calls per second.
2038        // Informational; high-signal only under a yield-storm pathology.
2039        name: "yld_count_per_sec",
2040        polarity: crate::test_support::Polarity::Informational,
2041        kind: MetricKind::Rate {
2042            numerator: "total_yld_count",
2043            denominator: "total_schedstat_wall_sec",
2044        },
2045        default_abs: 1.0,
2046        default_rel: 0.30,
2047        display_unit: "/s",
2048        accessor: |_| None,
2049    },
2050    MetricDef {
2051        // Σttwu_count / Σwindow-seconds — wakeups per second. Informational
2052        // (wakeup volume; the locality DIRECTION is ttwu_local_fraction, not a
2053        // per-second magnitude — so ttwu_local has no _per_sec rate).
2054        name: "ttwu_count_per_sec",
2055        polarity: crate::test_support::Polarity::Informational,
2056        kind: MetricKind::Rate {
2057            numerator: "total_ttwu_count",
2058            denominator: "total_schedstat_wall_sec",
2059        },
2060        default_abs: 1.0,
2061        default_rel: 0.30,
2062        display_unit: "/s",
2063        accessor: |_| None,
2064    },
2065    MetricDef {
2066        // Σsched_goidle / Σwindow-seconds — go-idle transitions per second.
2067        // Informational; the per-TIME companion to sched_goidle_fraction (the
2068        // per-schedule share) — a high goidle/sec can signal wakeup-thrash.
2069        name: "sched_goidle_per_sec",
2070        polarity: crate::test_support::Polarity::Informational,
2071        kind: MetricKind::Rate {
2072            numerator: "total_sched_goidle",
2073            denominator: "total_schedstat_wall_sec",
2074        },
2075        default_abs: 1.0,
2076        default_rel: 0.30,
2077        display_unit: "/s",
2078        accessor: |_| None,
2079    },
2080    MetricDef {
2081        // Whole-run mean per-CPU runqueue depth (`rq.nr_running`, ALL scheduling
2082        // classes), read host-side from guest memory via
2083        // `MonitorSummary::avg_nr_running`. The occupancy LEVEL — distinct from
2084        // `avg_dsq_depth` (scx DSQ only) and `avg_imbalance_ratio` (cross-CPU
2085        // SKEW, not level). `MetricKind::Gauge(Avg)`: the cross-run fold is the
2086        // sample-weighted pooled mean (Σ avg×samples / Σ samples via
2087        // `aggregate_samples_weighted`, weight = `run_sample_count`). The weight
2088        // is sample count, not samples×CPUs — EXACT under same-topology pairing
2089        // (CPU count is a pairing dim, so cross-folded runs share it; the same
2090        // basis `avg_imbalance_ratio` uses). LowerBetter — higher mean depth =
2091        // more tasks waiting, but load-confounded (more offered runnable tasks
2092        // raises it independent of the scheduler), the same caveat
2093        // `avg_dsq_depth` carries; meaningful for same-offered-load A/B.
2094        // ext_metrics-only (accessor `|_| None`, surfaced via the ext fallback);
2095        // absent when the run has no monitor samples.
2096        name: "avg_nr_running",
2097        polarity: crate::test_support::Polarity::LowerBetter,
2098        kind: MetricKind::Gauge(GaugeAgg::Avg),
2099        default_abs: 0.5,
2100        default_rel: 0.20,
2101        display_unit: "",
2102        accessor: |_| None,
2103    },
2104    MetricDef {
2105        // Wake-latency p99, re-pooled over the COMBINED wake-latency sample
2106        // set across every cgroup (and phase), NOT a max of per-cgroup p99s.
2107        // Distribution kind: derived post-merge by
2108        // `crate::assert::populate_run_distribution_metrics`; accessor is
2109        // |_| None so `MetricDef::read` takes the ext_metrics value the
2110        // re-pool writes. (The `worst_` name is retained for sidecar /
2111        // DataFrame / CI-gate stability — see the `worst_` naming
2112        // convention on [`METRICS`].)
2113        name: "worst_p99_wake_latency_us",
2114        polarity: crate::test_support::Polarity::LowerBetter,
2115        kind: MetricKind::Distribution {
2116            source: SampleSource::WakeLatencyNs,
2117            reduction: SampleReduction::P99,
2118        },
2119        default_abs: 50.0,
2120        default_rel: 0.25,
2121        display_unit: "\u{00b5}s",
2122        accessor: |_| None,
2123    },
2124    MetricDef {
2125        // Wake-latency median (50th pct), re-pooled over the combined wake
2126        // set — see `worst_p99_wake_latency_us`.
2127        name: "worst_median_wake_latency_us",
2128        polarity: crate::test_support::Polarity::LowerBetter,
2129        kind: MetricKind::Distribution {
2130            source: SampleSource::WakeLatencyNs,
2131            reduction: SampleReduction::Median,
2132        },
2133        default_abs: 20.0,
2134        default_rel: 0.25,
2135        display_unit: "\u{00b5}s",
2136        accessor: |_| None,
2137    },
2138    MetricDef {
2139        // Wake-latency coefficient of variation (stddev/mean), re-pooled
2140        // over the combined wake set with a population-WEIGHTED variance and
2141        // mean (denominator = Σ per-sample population weights, i.e. the
2142        // reconstructed true wakeup population; == `pool.len()` only below the
2143        // reservoir cap, where every weight is 1.0) — see
2144        // `worst_p99_wake_latency_us`.
2145        name: "worst_wake_latency_cv",
2146        polarity: crate::test_support::Polarity::LowerBetter,
2147        kind: MetricKind::Distribution {
2148            source: SampleSource::WakeLatencyNs,
2149            reduction: SampleReduction::Cv,
2150        },
2151        default_abs: 0.10,
2152        default_rel: 0.25,
2153        display_unit: "",
2154        accessor: |_| None,
2155    },
2156    MetricDef {
2157        // Run-level timer-latency p99 (WorkType::TimerLatency cyclictest probe),
2158        // re-pooled over the combined timer-latency sample set across every
2159        // cgroup and phase (NOT a max of per-cgroup p99s). Distribution: derived
2160        // post-merge by populate_run_distribution_metrics; accessor |_| None
2161        // reads the ext_metrics value the re-pool writes.
2162        name: "worst_p99_timer_latency_us",
2163        polarity: crate::test_support::Polarity::LowerBetter,
2164        kind: MetricKind::Distribution {
2165            source: SampleSource::TimerLatencyNs,
2166            reduction: SampleReduction::P99,
2167        },
2168        default_abs: 50.0,
2169        default_rel: 0.25,
2170        display_unit: "\u{00b5}s",
2171        accessor: |_| None,
2172    },
2173    MetricDef {
2174        // Run-level timer-latency median — see worst_p99_timer_latency_us.
2175        name: "worst_median_timer_latency_us",
2176        polarity: crate::test_support::Polarity::LowerBetter,
2177        kind: MetricKind::Distribution {
2178            source: SampleSource::TimerLatencyNs,
2179            reduction: SampleReduction::Median,
2180        },
2181        default_abs: 20.0,
2182        default_rel: 0.25,
2183        display_unit: "\u{00b5}s",
2184        accessor: |_| None,
2185    },
2186    MetricDef {
2187        // Run-level timer-latency p99.9 (the deep RT tail) — see
2188        // worst_p99_timer_latency_us.
2189        name: "worst_p999_timer_latency_us",
2190        polarity: crate::test_support::Polarity::LowerBetter,
2191        kind: MetricKind::Distribution {
2192            source: SampleSource::TimerLatencyNs,
2193            reduction: SampleReduction::P999,
2194        },
2195        default_abs: 100.0,
2196        default_rel: 0.25,
2197        display_unit: "\u{00b5}s",
2198        accessor: |_| None,
2199    },
2200    MetricDef {
2201        // Run-level WORST (max) timer-latency — the cyclictest headline.
2202        // MAX-folds cross-RUN (SampleReduction::Worst, the peak survives) via
2203        // aggregate_finite, distinct from the MEAN-folded percentiles above.
2204        // Named worst_* with no pNN exactly like worst_run_delay_us
2205        // (Distribution{RunDelayNs, Worst}).
2206        name: "worst_timer_latency_us",
2207        polarity: crate::test_support::Polarity::LowerBetter,
2208        kind: MetricKind::Distribution {
2209            source: SampleSource::TimerLatencyNs,
2210            reduction: SampleReduction::Worst,
2211        },
2212        default_abs: 200.0,
2213        default_rel: 0.25,
2214        display_unit: "\u{00b5}s",
2215        accessor: |_| None,
2216    },
2217    MetricDef {
2218        // Per-phase worker iterations per second. MetricKind::Rate with
2219        // Counter components total_phase_iterations / total_phase_duration_sec:
2220        // build_phase_buckets_with_stimulus emits those two components (the
2221        // iteration delta + the window seconds) from adjacent stimulus events'
2222        // total_iterations / elapsed_ms deltas — NOT a ready ratio — and
2223        // derive_rate_metrics re-derives iteration_rate = Σiterations /
2224        // Σseconds, so it re-pools correctly across phases/runs rather than
2225        // averaging per-phase ratios. Higher-is-better (more throughput). The
2226        // registry entry exists so MetricDef::read on a
2227        // GauntletRow.ext_metrics fallback surfaces it through cargo ktstr
2228        // perf-delta like any other metric, and so
2229        // Timeline::from_phase_buckets reads it by the canonical name from
2230        // PhaseBucket.metrics. No typed GauntletRow field; accessor is the
2231        // ext_metrics fallback.
2232        name: "iteration_rate",
2233        polarity: crate::test_support::Polarity::HigherBetter,
2234        kind: MetricKind::Rate {
2235            numerator: "total_phase_iterations",
2236            denominator: "total_phase_duration_sec",
2237        },
2238        default_abs: 1.0,
2239        default_rel: 0.30,
2240        display_unit: "iter/s",
2241        accessor: |_| None,
2242    },
2243    MetricDef {
2244        name: "total_iterations",
2245        polarity: crate::test_support::Polarity::HigherBetter,
2246        kind: MetricKind::Counter,
2247        default_abs: 2.0,
2248        default_rel: 0.10,
2249        display_unit: "",
2250        accessor: |r| Some(r.total_iterations as f64),
2251    },
2252    MetricDef {
2253        // Per-phase iteration delta — the NUMERATOR component of the
2254        // `iteration_rate` Rate. ext_metrics-only (no GauntletRow field):
2255        // inserted per phase as the last-minus-first delta of the cumulative
2256        // iteration counter, alongside `total_phase_duration_sec`, so
2257        // `derive_rate_metrics` yields `iteration_rate` = Σ(iter delta) /
2258        // Σ(phase seconds). `total_` prefix satisfies the Counter naming gate.
2259        name: "total_phase_iterations",
2260        polarity: crate::test_support::Polarity::HigherBetter,
2261        kind: MetricKind::Counter,
2262        default_abs: 1.0,
2263        default_rel: 0.10,
2264        display_unit: "",
2265        accessor: |_| None,
2266    },
2267    MetricDef {
2268        // Per-phase WALL-clock duration in SECONDS — the DENOMINATOR
2269        // component of the `iteration_rate` Rate. ext_metrics-only. The
2270        // ms→s conversion is applied at the component-insertion site (NOT in
2271        // `derive_rate_metrics`, which does a bare num/den with no scaling),
2272        // so the stored value is already seconds and the derived rate is
2273        // iterations/second. `total_` prefix satisfies the Counter naming gate.
2274        name: "total_phase_duration_sec",
2275        polarity: crate::test_support::Polarity::HigherBetter,
2276        kind: MetricKind::Counter,
2277        default_abs: 1.0,
2278        default_rel: 0.30,
2279        display_unit: "s",
2280        accessor: |_| None,
2281    },
2282    MetricDef {
2283        // Run-level POOLED CPU-seconds — the DENOMINATOR component of the
2284        // pooled `iterations_per_cpu_sec` Rate. ext_metrics-only (accessor
2285        // |_| None): populate_run_pooled_iterations_per_cpu_sec sums the
2286        // MEASURED cgroups' CgroupStats.total_cpu_time_ns (total_cpu_time_ns >
2287        // 0) and inserts the ns→s value (= Σns / 1e9) at the post-merge eval
2288        // site. The measured-only filter leaves this denominator unchanged
2289        // (excluded cgroups contribute 0 ns) — it matters for the numerator,
2290        // whose excluded cgroups carry nonzero iterations. The /1e9 lives
2291        // there (NOT in derive_rate_metrics, which does a bare num/den),
2292        // applied ONCE on the summed ns. `total_` prefix satisfies the Counter
2293        // gate.
2294        name: "total_cpu_time_sec",
2295        polarity: crate::test_support::Polarity::HigherBetter,
2296        kind: MetricKind::Counter,
2297        default_abs: 1.0,
2298        default_rel: 0.30,
2299        display_unit: "s",
2300        accessor: |_| None,
2301    },
2302    MetricDef {
2303        // Run-level POOLED iteration count — the NUMERATOR component of the
2304        // pooled `iterations_per_cpu_sec` Rate, summed over cgroups with
2305        // MEASURED cpu-time (total_cpu_time_ns > 0). ext_metrics-only,
2306        // DISTINCT from the typed `total_iterations` Counter on purpose: the
2307        // typed field is skipped from ext_metrics (TYPED_FIELD_NAMES) and folds
2308        // cross-RUN as a MEAN (group_and_average_by's round_u64 divides the
2309        // accumulated sum by the contributor count — a display average), while
2310        // a Rate numerator must fold cross-RUN as a SUM (aggregate_finite
2311        // Counter arm, no divide) so Σnum/Σdenom re-pools. One shared key
2312        // cannot carry both folds, so the numerator gets its own ext key. It
2313        // also sums only MEASURED cgroups, where the typed field's per-RUN
2314        // cross-cgroup merge sums ALL cgroups — so it equals the merge-summed
2315        // typed total_iterations unless an excluded (zero-cpu-time) cgroup
2316        // carried iterations>0, in which case it is LESS.
2317        // `total_` prefix satisfies the Counter naming gate.
2318        name: "total_iterations_pooled",
2319        polarity: crate::test_support::Polarity::HigherBetter,
2320        kind: MetricKind::Counter,
2321        default_abs: 1.0,
2322        default_rel: 0.10,
2323        display_unit: "",
2324        accessor: |_| None,
2325    },
2326    MetricDef {
2327        // Run-level cohort CPU-time EFFICIENCY pooled across cgroups (and
2328        // re-pooled across runs): Σiterations / Σcpu-seconds. MetricKind::Rate
2329        // over the two Counter components above; derive_rate_metrics re-derives
2330        // it = Σtotal_iterations_pooled / Σtotal_cpu_time_sec at every level.
2331        // Distinct from the per-cgroup `worst_iterations_per_cpu_sec`
2332        // WorstLowest metric (the lowest-wins min-fold starvation selector):
2333        // this is the POOLED cohort rate, overcommit-invariant. The _per_cpu_sec name +
2334        // Rate kind passes the reverse naming gate; ext_metrics-only (accessor |_| None).
2335        //
2336        // SAME physical quantity as worst_iterations_per_cpu_sec (iter/CPU-s
2337        // efficiency), so it shares that sibling's compare thresholds:
2338        // default_rel=0.10 (a 10% efficiency change is the regression signal)
2339        // and default_abs=10.0 (near-zero anti-jitter floor — a real busy
2340        // workload's rate is orders of magnitude larger). NOT the looser
2341        // iteration_rate throughput gate (rel=0.30), which would silently
2342        // swallow a 10-29% efficiency regression the per-cgroup row flags.
2343        name: "iterations_per_cpu_sec",
2344        polarity: crate::test_support::Polarity::HigherBetter,
2345        kind: MetricKind::Rate {
2346            numerator: "total_iterations_pooled",
2347            denominator: "total_cpu_time_sec",
2348        },
2349        default_abs: 10.0,
2350        default_rel: 0.10,
2351        display_unit: "iter/cpu-s",
2352        accessor: |_| None,
2353    },
2354    MetricDef {
2355        // Per-phase SYSTEM (in-kernel) CPU time in nanoseconds. Read
2356        // host-side from frozen task_struct.stime + the thread-group
2357        // signal_struct.stime accumulator (zero guest work). Injected
2358        // post-hoc — NOT a read_sample metric — as a per-thread-GROUP
2359        // delta over the phase: `crate::assert::phase_group_cpu_delta`
2360        // sums each tgid's `thread_group_cputime` (signal + live-thread
2361        // stime) at its first and last appearance among the phase's
2362        // freeze samples and takes `last - first` = system CPU time the
2363        // group spent during the phase. PerPhaseDeltaSum: the per-phase value
2364        // is already a delta, so the disjoint per-phase deltas SUM across the
2365        // run (the run's total OBSERVED system CPU time — a lower bound
2366        // excluding head / tail / inter-phase-gap windows; see the kind doc),
2367        // and the per-run totals fold by UNWEIGHTED MEAN cross-RUN (NOT
2368        // sample-count-weighted), like user_time_ns. LowerBetter — the DSQ-spinlock
2369        // regression surfaces as rising system time (CPUs spinning in
2370        // the kernel). No typed GauntletRow field; the ext_metrics
2371        // fallback carries it through cargo ktstr perf-delta.
2372        name: "system_time_ns",
2373        polarity: crate::test_support::Polarity::LowerBetter,
2374        kind: MetricKind::PerPhaseDeltaSum,
2375        default_abs: 1000.0,
2376        default_rel: 0.30,
2377        display_unit: "ns",
2378        accessor: |_| None,
2379    },
2380    MetricDef {
2381        // Per-phase USER-mode CPU time in nanoseconds. Same host-side /
2382        // injected / PerPhaseDeltaSum shape as `system_time_ns` (task_struct
2383        // .utime + the thread-group signal_struct.utime accumulator,
2384        // per-tgid delta via `crate::assert::phase_group_cpu_delta`; SUM
2385        // cross-phase, unweighted MEAN cross-run).
2386        // Pairs with it so a test can distinguish "system time rose,
2387        // user work flat" (the lock-contention signature) from "both
2388        // rose" (genuine extra work). LowerBetter — less CPU consumed
2389        // for the same work is the efficiency win; utime already
2390        // includes gtime so the two are never summed.
2391        name: "user_time_ns",
2392        polarity: crate::test_support::Polarity::LowerBetter,
2393        kind: MetricKind::PerPhaseDeltaSum,
2394        default_abs: 1000.0,
2395        default_rel: 0.30,
2396        display_unit: "ns",
2397        accessor: |_| None,
2398    },
2399    // ---- IRQ observability ----
2400    // Host-side observer-free IRQ signals from PerCpuTimeStats (freeze
2401    // Snapshot, src/monitor/dump/mod.rs), cross-CPU folded at
2402    // read_sample and carried through ext_metrics (accessor |_| None) like
2403    // system_time_ns. The time signals require CONFIG_IRQ_TIME_ACCOUNTING;
2404    // loud-absent (None), never false-zero, when off. Per-phase
2405    // reduction is the Counter last-minus-first over the bucket's freeze
2406    // captures (needs num_snapshots >= 2). The per-CPU SPATIAL axis (max_cpu_hardirqs
2407    // + max_cpu_hardirq_concentration, the busiest-CPU dimension vs this cross-CPU SUM)
2408    // and its NET_RX softirq sibling (max_cpu_softirq_net_rx + its concentration ratio)
2409    // are registered below; spatial-max for the other softirqs is a follow-up.
2410    MetricDef {
2411        // Sum of kernel_stat.irqs_sum across CPUs — total hardirqs fired
2412        // (per-CPU monotonic count, __kstat_incr_irqs_this_cpu,
2413        // kernel/irq/internals.h). NOT gated on irqtime (always populates).
2414        name: "total_hardirqs",
2415        polarity: crate::test_support::Polarity::Informational,
2416        kind: MetricKind::Counter,
2417        default_abs: 10.0,
2418        default_rel: 0.50,
2419        display_unit: "",
2420        accessor: |_| None,
2421    },
2422    MetricDef {
2423        // Sum of kernel_stat.softirqs[NET_RX] across CPUs (index via
2424        // SOFTIRQ_NAMES; kstat_incr_softirqs_this_cpu, kernel/softirq.c). The
2425        // load-bearing softirq for NetTraffic RX.
2426        name: "total_softirq_net_rx",
2427        polarity: crate::test_support::Polarity::Informational,
2428        kind: MetricKind::Counter,
2429        default_abs: 10.0,
2430        default_rel: 0.50,
2431        display_unit: "",
2432        accessor: |_| None,
2433    },
2434    MetricDef {
2435        // Sum of kernel_stat.softirqs[NET_TX] across CPUs.
2436        name: "total_softirq_net_tx",
2437        polarity: crate::test_support::Polarity::Informational,
2438        kind: MetricKind::Counter,
2439        default_abs: 10.0,
2440        default_rel: 0.50,
2441        display_unit: "",
2442        accessor: |_| None,
2443    },
2444    MetricDef {
2445        // Sum of kernel_stat.softirqs[TIMER] across CPUs.
2446        name: "total_softirq_timer",
2447        polarity: crate::test_support::Polarity::Informational,
2448        kind: MetricKind::Counter,
2449        default_abs: 10.0,
2450        default_rel: 0.50,
2451        display_unit: "",
2452        accessor: |_| None,
2453    },
2454    MetricDef {
2455        // Sum of kernel_stat.softirqs[SCHED] across CPUs.
2456        name: "total_softirq_sched",
2457        polarity: crate::test_support::Polarity::Informational,
2458        kind: MetricKind::Counter,
2459        default_abs: 10.0,
2460        default_rel: 0.50,
2461        display_unit: "",
2462        accessor: |_| None,
2463    },
2464    MetricDef {
2465        // Sum of kernel_cpustat.cpustat[CPUTIME_IRQ] across CPUs — raw ns in
2466        // hardirq (irqtime_account_delta, kernel/sched/cputime.c). Read from
2467        // guest memory as ns (NOT /proc/stat jiffies — no nsec_to_clock_t).
2468        // Requires CONFIG_IRQ_TIME_ACCOUNTING; Counter kind, ns unit like system_time_ns.
2469        name: "total_irq_time_ns",
2470        polarity: crate::test_support::Polarity::Informational,
2471        kind: MetricKind::Counter,
2472        default_abs: 1000.0,
2473        default_rel: 0.50,
2474        display_unit: "ns",
2475        accessor: |_| None,
2476    },
2477    MetricDef {
2478        // Sum of kernel_cpustat.cpustat[CPUTIME_SOFTIRQ] across CPUs — raw ns
2479        // in softirq. Requires CONFIG_IRQ_TIME_ACCOUNTING.
2480        name: "total_softirq_time_ns",
2481        polarity: crate::test_support::Polarity::Informational,
2482        kind: MetricKind::Counter,
2483        default_abs: 1000.0,
2484        default_rel: 0.50,
2485        display_unit: "ns",
2486        accessor: |_| None,
2487    },
2488    MetricDef {
2489        // Sum of kernel_cpustat.cpustat[CPUTIME_STEAL] across CPUs — raw ns the
2490        // hypervisor stole (account_steal_time; needs CONFIG_PARAVIRT_TIME_
2491        // ACCOUNTING + kvm-clock steal-time). CPUTIME_STEAL is an unconditional
2492        // enum member (enum cpu_usage_stat, include/linux/kernel_stat.h), so
2493        // steal-accounting-off reads a constant 0 — a measured Some(0.0), NOT
2494        // loud-absent like the BTF-gated avg_irq gauge.
2495        name: "total_steal_time_ns",
2496        polarity: crate::test_support::Polarity::Informational,
2497        kind: MetricKind::Counter,
2498        default_abs: 1000.0,
2499        default_rel: 0.50,
2500        display_unit: "ns",
2501        accessor: |_| None,
2502    },
2503    MetricDef {
2504        // Mean across CPUs of rq->avg_irq.util_avg — the PELT IRQ load average
2505        // (struct sched_avg, kernel/sched/sched.h; range [0, 1024] =
2506        // SCHED_CAPACITY_SCALE). INSTANTANEOUS gauge (decaying PELT), NEVER
2507        // deltaed. Requires CONFIG_HAVE_SCHED_AVG_IRQ (def_bool y when
2508        // (IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING) && SMP — init/Kconfig).
2509        // Distinct from taskstats avg_irq_delay_ns (irq-DELAY accounting); this
2510        // is PELT util.
2511        name: "avg_irq_util",
2512        polarity: crate::test_support::Polarity::LowerBetter,
2513        kind: MetricKind::Gauge(GaugeAgg::Avg),
2514        default_abs: 20.0,
2515        default_rel: 0.30,
2516        display_unit: "",
2517        accessor: |_| None,
2518    },
2519    MetricDef {
2520        // Host spatial-max ACROSS CPUs of the INSTANTANEOUS rq->avg_irq.util_avg
2521        // gauge (worst-CPU IRQ load at the freeze) — NOT a kernel max-of-window.
2522        // Peak because both the spatial and temporal reduces are max over
2523        // instantaneous values (no cumulative-delta hazard, unlike a counter's
2524        // spatial-max; that per-CPU axis is max_cpu_hardirqs below). Range [0, 1024].
2525        name: "max_avg_irq_util",
2526        polarity: crate::test_support::Polarity::LowerBetter,
2527        kind: MetricKind::Peak,
2528        default_abs: 50.0,
2529        default_rel: 0.30,
2530        display_unit: "",
2531        accessor: |_| None,
2532    },
2533    MetricDef {
2534        // DERIVED rate: total_hardirqs / total_phase_wall_sec — hardirqs per
2535        // second over the CAPTURE WINDOW (first->last freeze span, NOT the full
2536        // phase; see total_phase_wall_sec). For A/B compare the cadence cancels.
2537        name: "hardirq_rate",
2538        polarity: crate::test_support::Polarity::Informational,
2539        kind: MetricKind::Rate {
2540            numerator: "total_hardirqs",
2541            denominator: "total_phase_wall_sec",
2542        },
2543        default_abs: 1.0,
2544        default_rel: 0.30,
2545        display_unit: "irq/s",
2546        accessor: |_| None,
2547    },
2548    MetricDef {
2549        // DERIVED rate: total_softirq_net_rx / total_phase_wall_sec — NET_RX
2550        // softirqs per second over the capture window. The NetTraffic
2551        // softirq-pressure signal.
2552        name: "net_rx_softirq_rate",
2553        polarity: crate::test_support::Polarity::Informational,
2554        kind: MetricKind::Rate {
2555            numerator: "total_softirq_net_rx",
2556            denominator: "total_phase_wall_sec",
2557        },
2558        default_abs: 1.0,
2559        default_rel: 0.30,
2560        display_unit: "softirq/s",
2561        accessor: |_| None,
2562    },
2563    MetricDef {
2564        // DERIVED rate: total_irq_time_ns / total_phase_wall_ns — the dimensionless fraction of
2565        // the capture window spent in hardirq. ns/ns (both over the SAME first->last freeze span)
2566        // so the span-vs-phase gap cancels. The exact-integral companion to avg_irq_util's smoothed
2567        // PELT gauge. NOTE(review): documented as [0,1], but the numerator is a cross-CPU SUM over
2568        // one wall-clock span, so it can reach num_cpus unless normalized elsewhere — confirm.
2569        name: "irq_time_fraction",
2570        polarity: crate::test_support::Polarity::LowerBetter,
2571        kind: MetricKind::Rate {
2572            numerator: "total_irq_time_ns",
2573            denominator: "total_phase_wall_ns",
2574        },
2575        default_abs: 0.02,
2576        default_rel: 0.30,
2577        display_unit: "",
2578        accessor: |_| None,
2579    },
2580    MetricDef {
2581        // Hidden rate-denominator component (NOT user-facing): the CAPTURE-
2582        // WINDOW duration in seconds = (bucket end_ms - start_ms)/1000, co-
2583        // inserted in buckets_from_grouped both-or-neither with the IRQ
2584        // counters (the /1000 lives at the insertion site; derive_rate_metrics
2585        // does bare num/den). Backs hardirq_rate / net_rx_softirq_rate. Counter
2586        // so it survives the cross-RUN Sum-fold (Sum count / Sum sec re-derives).
2587        name: "total_phase_wall_sec",
2588        polarity: crate::test_support::Polarity::Informational,
2589        kind: MetricKind::Counter,
2590        default_abs: 0.1,
2591        default_rel: 0.30,
2592        display_unit: "s",
2593        accessor: |_| None,
2594    },
2595    MetricDef {
2596        // Hidden rate-denominator component (NOT user-facing): the capture-
2597        // window duration in NANOSECONDS = (bucket end_ms - start_ms) * 1e6,
2598        // co-inserted with the IRQ counters. Backs irq_time_fraction (ns/ns).
2599        name: "total_phase_wall_ns",
2600        polarity: crate::test_support::Polarity::Informational,
2601        kind: MetricKind::Counter,
2602        default_abs: 1000.0,
2603        default_rel: 0.30,
2604        display_unit: "ns",
2605        accessor: |_| None,
2606    },
2607    MetricDef {
2608        // Per-CPU IRQ spatial axis: the BUSIEST CPU's hardirq delta over the
2609        // phase — max over CPUs of each CPU's (last - first freeze) irqs_sum,
2610        // correlated by the per_cpu_time cpu field (NOT the cross-CPU sum, which
2611        // is total_hardirqs). Custom per-CPU-delta fold in assert::phase_build,
2612        // NOT a read_sample arm (read_sample yields one f64 per freeze, no
2613        // per-CPU vector). Peak = spatial-max of a per-CPU cumulative-counter
2614        // delta. Informational: a high busiest-CPU count is ambiguous (high
2615        // traffic vs concentration) — the concentration ratio below is the
2616        // balance signal, mirroring the raw-counts-Informational split.
2617        name: "max_cpu_hardirqs",
2618        polarity: crate::test_support::Polarity::Informational,
2619        kind: MetricKind::Peak,
2620        default_abs: 10.0,
2621        default_rel: 0.50,
2622        display_unit: "",
2623        accessor: |_| None,
2624    },
2625    MetricDef {
2626        // IRQ-concentration ratio: max_cpu_hardirqs / mean per-CPU hardirq delta
2627        // over the SAME reporting-CPU set — the busiest CPU's share of the
2628        // average. Range [1, num_cpus]: 1.0 = perfectly even, higher = IRQs
2629        // concentrated on one CPU. Peak (worst per-phase concentration),
2630        // LowerBetter. Computed in the same per-CPU-delta fold, NOT a Rate (a
2631        // Peak numerator fails every_rate_metric_has_registered_counter_components,
2632        // and max/mean is not Σ-poolable). DELIBERATELY max/MEAN, distinct from
2633        // the sibling max_imbalance_ratio's max/MIN: max/min explodes when any
2634        // CPU takes ~0 IRQs, whereas max/mean measures disproportionate SHARE
2635        // (the IRQ-steering question). Absent (None) when < 2 reporting CPUs or
2636        // mean == 0.
2637        name: "max_cpu_hardirq_concentration",
2638        polarity: crate::test_support::Polarity::LowerBetter,
2639        kind: MetricKind::Peak,
2640        default_abs: 1.0,
2641        default_rel: 0.25,
2642        display_unit: "x",
2643        accessor: |_| None,
2644    },
2645    MetricDef {
2646        // Per-CPU NET_RX softirq spatial axis: the BUSIEST CPU's NET_RX softirq
2647        // delta over the phase — max over CPUs of each CPU's (last - first freeze)
2648        // kstat.softirqs[NET_RX] delta, correlated by the per_cpu_time cpu field
2649        // (NOT the cross-CPU sum, which is total_softirq_net_rx). Counts softirq
2650        // RUNS/invocations (handle_softirqs increments once per pending NET_RX bit
2651        // per dispatch via kstat_incr_softirqs_this_cpu), NOT packets — a
2652        // softirq-frequency / affinity-concentration signal. Custom per-CPU-delta
2653        // fold in assert::phase_build (fold_per_cpu_spatial_max), NOT a read_sample
2654        // arm. Peak = spatial-max of a per-CPU cumulative-counter delta. The
2655        // softirq sibling of max_cpu_hardirqs; Informational for the same reason —
2656        // a high busiest-CPU count is ambiguous (high RX traffic vs concentration),
2657        // the concentration ratio below is the balance signal.
2658        name: "max_cpu_softirq_net_rx",
2659        polarity: crate::test_support::Polarity::Informational,
2660        kind: MetricKind::Peak,
2661        default_abs: 10.0,
2662        default_rel: 0.50,
2663        display_unit: "",
2664        accessor: |_| None,
2665    },
2666    MetricDef {
2667        // NET_RX-softirq-concentration ratio: max_cpu_softirq_net_rx / mean
2668        // per-CPU NET_RX softirq delta over the SAME reporting-CPU set — the
2669        // busiest CPU's share of the average. Range [1, num_cpus]: 1.0 = even,
2670        // higher = NET_RX softirqs concentrated on one CPU (the single-queue-NIC
2671        // vs RPS/RSS-spread signal). Peak, LowerBetter, max/mean — the softirq
2672        // sibling of max_cpu_hardirq_concentration (same NOT-a-Rate, max/MEAN-not-
2673        // max/MIN, >=2-reporting-CPU + mean>0 discipline). Absent (None) when
2674        // < 2 reporting CPUs or mean == 0.
2675        name: "max_cpu_softirq_net_rx_concentration",
2676        polarity: crate::test_support::Polarity::LowerBetter,
2677        kind: MetricKind::Peak,
2678        default_abs: 1.0,
2679        default_rel: 0.25,
2680        display_unit: "x",
2681        accessor: |_| None,
2682    },
2683    MetricDef {
2684        // Mean ACROSS CPUs of the scx_layered util-compensation SCALE over the
2685        // capture window — the factor by which a CPU's useful-work capacity is
2686        // scaled up to compensate for IRQ / softirq / stolen time. Per CPU over
2687        // the first->last per_cpu_time freeze: scale = delta_total / available,
2688        // where delta_total = Σ of ALL 8 kernel_cpustat[] ns deltas
2689        // (user+nice+system+idle+iowait+irq+softirq+steal) and available =
2690        // delta_total - (irq+softirq+steal); clamped to [1.0, 20.0], and
2691        // available == 0 yields the 1.0 floor. Byte-faithful to scx_layered's
2692        // util_compensation compute — the ns-vs-µs unit cancels in the ratio
2693        // (scx_layered reads /proc microseconds; we read kernel_cpustat ns, the
2694        // same slots /proc/stat formats from). 1.0 = no IRQ/steal interference;
2695        // higher = more capacity stolen, so LowerBetter. An idle ktstr VM reads
2696        // exactly 1.0 — the MEASURED clamp floor (a real Some), NOT loud-absent;
2697        // a compensation > 1.0 requires an IRQ/steal-generating workload.
2698        // Gauge(Avg): cross-phase folds weighted-mean to run-level, cross-run
2699        // means — the typical compensation magnitude. Custom per-CPU-delta fold
2700        // in assert::phase_build (fold_util_comp_scale), NOT a read_sample arm: a
2701        // per-CPU clamp-then-mean is not expressible as a scalar Counter/Rate.
2702        // System-axis mean: scx_layered clamps per-CPU then applies per-LAYER;
2703        // ktstr has no layers, so the run-level signal is the mean of the
2704        // per-CPU scale distribution. cpustat[CPUTIME_SOFTIRQ] excludes
2705        // softirq deferred to ksoftirqd (irqtime_account_irq's
2706        // curr != this_cpu_ksoftirqd() guard, kernel/sched/cputime.c) — the same
2707        // undercount scx_layered inherits from /proc, so faithful to it; the
2708        // scale is a lower bound on true IRQ+softirq pressure.
2709        name: "avg_cpu_util_comp_scale",
2710        polarity: crate::test_support::Polarity::LowerBetter,
2711        kind: MetricKind::Gauge(GaugeAgg::Avg),
2712        default_abs: 0.5,
2713        default_rel: 0.30,
2714        display_unit: "x",
2715        accessor: |_| None,
2716    },
2717    MetricDef {
2718        // Mean across (freeze, live task) of scx_lavd's per-task
2719        // normalized_lat_cri (task_ctx.normalized_lat_cri, [0,1024]) — the
2720        // scheduler's latency-criticality score, host-read from the sdt_alloc
2721        // arena (BPF_MAP_TYPE_ARENA) each freeze and BTF-rendered, NOT a kernel
2722        // counter and NOT a BPF .bss field. A GAUGE (an instantaneous per-task
2723        // value lavd recomputes each schedule, scx_lavd lat_cri.bpf.c: lat_cri is
2724        // squared then waker/wakee-propagated, normalized to [0,1024]), so folded
2725        // as a mean over every (freeze, task) observation. Informational: a
2726        // scheduler-internal decision signal with no good/bad direction.
2727        // normalized (not raw lat_cri) for cross-run comparability — raw lat_cri
2728        // is squared + propagated + load-dependent. Custom per-task fold in
2729        // assert::phase_build (fold_lat_cri), NOT a read_sample arm. Distinct from
2730        // lavd's own .bss sys_stat.avg_lat_cri EWMA over SCHEDULED tasks (surfaced
2731        // via watch_bpf_map as the scx_lavd_avg_lat_cri key) — this is an
2732        // instantaneous host walk over ALL live task_ctx, INCLUDING not-yet-scored
2733        // slots that render 0, so the mean is population-sensitive to task-alloc
2734        // churn. Loud-absent for non-lavd schedulers (the rendered payload has no
2735        // such member).
2736        name: "avg_task_lat_cri",
2737        polarity: crate::test_support::Polarity::Informational,
2738        kind: MetricKind::Gauge(GaugeAgg::Avg),
2739        default_abs: 50.0,
2740        default_rel: 0.30,
2741        display_unit: "",
2742        accessor: |_| None,
2743    },
2744    MetricDef {
2745        // Max across (freeze, live task) of scx_lavd's per-task
2746        // normalized_lat_cri ([0,1024]) — the worst-case latency-criticality
2747        // observed over the phase. Peak (spatial+temporal max of an instantaneous
2748        // gauge, no delta). Informational. Same host sdt_alloc-arena source +
2749        // per-task fold (fold_lat_cri) + normalized rationale + loud-absent as
2750        // avg_task_lat_cri.
2751        name: "max_task_lat_cri",
2752        polarity: crate::test_support::Polarity::Informational,
2753        kind: MetricKind::Peak,
2754        default_abs: 100.0,
2755        default_rel: 0.30,
2756        display_unit: "",
2757        accessor: |_| None,
2758    },
2759    MetricDef {
2760        // Per-cgroup IRQ-pressure spatial axis: the busiest workload-leaf cgroup's
2761        // PSI-irq `full` stall DELTA over the phase (decoded µs) — max over the
2762        // workload-root leaf cgroups of each leaf's (last - first freeze)
2763        // cgroup->psi total[PSI_AVGS][PSI_IRQ_FULL], correlated across freezes by
2764        // (cgroup_kva, serial_nr) — the serial disambiguates a freed slab KVA
2765        // reused by a new cgroup. The per-cgroup analog of max_cpu_hardirqs (the
2766        // busiest-CELL dimension); it attributes IRQ-servicing stall to the
2767        // workload cell that induced it, which the system-wide
2768        // total_irq_pressure_us cannot. Custom per-cgroup
2769        // delta fold (assert::phase_build fold_per_cgroup_psi), NOT a read_sample
2770        // arm. Peak = spatial-max of a per-cgroup cumulative-counter delta.
2771        // Informational: an absolute per-cell stall is workload-confounded (more
2772        // work → more stall) — the concentration ratio below is the isolation
2773        // signal, mirroring the max_cpu_hardirqs raw-counts split.
2774        name: "max_cgroup_irq_pressure",
2775        polarity: crate::test_support::Polarity::Informational,
2776        kind: MetricKind::Peak,
2777        default_abs: 1.0,
2778        default_rel: 0.50,
2779        display_unit: "µs",
2780        accessor: |_| None,
2781    },
2782    MetricDef {
2783        // Per-cgroup IRQ-pressure concentration: max_cgroup_irq_pressure / the
2784        // mean per-leaf IRQ-full stall delta over the SAME reporting-leaf set —
2785        // the busiest cell's share of the average. Range [1, num_leaves]: 1.0 =
2786        // evenly spread, higher = IRQ-servicing stall concentrated on one workload
2787        // cell (the cgroup-isolation / cell-steering signal). Peak, LowerBetter,
2788        // max/MEAN — the per-cgroup sibling of max_cpu_hardirq_concentration (same
2789        // NOT-a-Rate, max/MEAN-not-max/MIN, >=2-reporting-leaf + mean>0 discipline;
2790        // disjoint leaves — cgroup2's no-internal-process rule — so no
2791        // double-count). Absent (None) when < 2 reporting leaves or mean == 0.
2792        name: "max_cgroup_irq_pressure_concentration",
2793        polarity: crate::test_support::Polarity::LowerBetter,
2794        kind: MetricKind::Peak,
2795        default_abs: 1.0,
2796        default_rel: 0.25,
2797        display_unit: "x",
2798        accessor: |_| None,
2799    },
2800    MetricDef {
2801        // Per-cgroup IRQ-pressure GAUGE: the worst workload-leaf cgroup's PSI-irq
2802        // `full` avg10 (decoded 10s-EWMA percent, 0..=100) — per freeze the max
2803        // across the leaves, then the max across the phase's freezes. The
2804        // instantaneous-pressure companion to max_cgroup_irq_pressure (a gauge, so
2805        // a spatial-max with no delta — the max_avg_irq_util shape on the cgroup
2806        // axis). Peak; LowerBetter (less IRQ pressure on the worst cell is better).
2807        // Custom fold (fold_per_cgroup_psi), NOT a read_sample arm. Loud-absent
2808        // when no leaf reported PSI (psi_cgroups off / absent workload root).
2809        name: "max_cgroup_psi_irq_avg10",
2810        polarity: crate::test_support::Polarity::LowerBetter,
2811        kind: MetricKind::Peak,
2812        default_abs: 5.0,
2813        default_rel: 0.30,
2814        display_unit: "%",
2815        accessor: |_| None,
2816    },
2817    MetricDef {
2818        // System-wide PSI-irq `full` avg10: the mean over monitor samples of the
2819        // decoded 10s-EWMA full IRQ pressure (percent, 0..=100), host-walked from
2820        // the global `psi_system` (NOT a guest /proc read). Gauge(Avg) like
2821        // `avg_irq_util` — an instantaneous smoothed gauge, never deltaed; the
2822        // cross-run fold sample-weights it. LowerBetter (less IRQ pressure is
2823        // better). ext-only (accessor |_| None), folded from MonitorSummary in
2824        // group::sidecar_to_row. Loud-absent (None) when CONFIG_PSI /
2825        // CONFIG_IRQ_TIME_ACCOUNTING is off (no PSI_IRQ_FULL in BTF), never 0.0.
2826        name: "psi_irq_full_avg10",
2827        polarity: crate::test_support::Polarity::LowerBetter,
2828        kind: MetricKind::Gauge(GaugeAgg::Avg),
2829        default_abs: 5.0,
2830        default_rel: 0.30,
2831        display_unit: "%",
2832        accessor: |_| None,
2833    },
2834    MetricDef {
2835        // Cumulative system-wide PSI-irq `full` stall over the monitoring window
2836        // (µs): the end-start delta of `total[PSI_AVGS][PSI_IRQ_FULL]` (decoded
2837        // ns→µs), host-walked from `psi_system`. Counter (a monotonic cumulative
2838        // total, end-start deltaed; saturating on reset) like `total_irq_time_ns`,
2839        // so the cross-run fold Σ-pools it. Informational: an absolute stall time
2840        // is workload-confounded (longer run → more stall) — the avg10 gauge is
2841        // the magnitude-normalized signal; mirrors the total_irq_time_ns split.
2842        // ext-only, same loud-absent gate as `psi_irq_full_avg10`.
2843        name: "total_irq_pressure_us",
2844        polarity: crate::test_support::Polarity::Informational,
2845        kind: MetricKind::Counter,
2846        default_abs: 1.0,
2847        default_rel: 0.50,
2848        display_unit: "µs",
2849        accessor: |_| None,
2850    },
2851    MetricDef {
2852        // Mean schedstat run-delay, re-pooled as the mean over the COMBINED
2853        // run-delay sample set across every cgroup (and phase), RAW ns→µs
2854        // once — see `worst_p99_wake_latency_us`. Each sample is one per-WORKER
2855        // cumulative sched_info.run_delay total (NOT per-dispatch), so the pool
2856        // size is the worker count — see
2857        // [`crate::assert::PhaseCgroupStats::run_delays_ns`]. Distribution kind;
2858        // accessor |_| None (ext_metrics-sourced from the re-pool).
2859        name: "worst_mean_run_delay_us",
2860        polarity: crate::test_support::Polarity::LowerBetter,
2861        kind: MetricKind::Distribution {
2862            source: SampleSource::RunDelayNs,
2863            reduction: SampleReduction::Mean,
2864        },
2865        default_abs: 50.0,
2866        default_rel: 0.25,
2867        display_unit: "\u{00b5}s",
2868        accessor: |_| None,
2869    },
2870    MetricDef {
2871        // Worst (max) schedstat run-delay over the combined run-delay sample
2872        // set, RAW ns→µs once. Distribution kind with the Worst reduction:
2873        // the one Distribution reduction whose cross-RUN fold is MAX (the
2874        // peak survives), not MEAN — see [`crate::stats::SampleReduction::Worst`].
2875        name: "worst_run_delay_us",
2876        polarity: crate::test_support::Polarity::LowerBetter,
2877        kind: MetricKind::Distribution {
2878            source: SampleSource::RunDelayNs,
2879            reduction: SampleReduction::Worst,
2880        },
2881        default_abs: 100.0,
2882        default_rel: 0.50,
2883        display_unit: "\u{00b5}s",
2884        accessor: |_| None,
2885    },
2886    MetricDef {
2887        // Ratio of p99 / median wake latency, worst-case across
2888        // cgroups. `LowerBetter` because a higher ratio signals a
2889        // stretched long tail. Unitless; baseline is 1.0 (p99 == median
2890        // is the perfect-uniform floor set by order-statistic
2891        // ordering). `default_abs = 0.5` guards against trivially
2892        // small deltas that percent-only gates would flag; `default_rel
2893        // = 0.25` matches the wake-latency metrics' percent gate.
2894        //
2895        // BASIS: the per-cgroup worst — the MAX over each cgroup's own
2896        // p99/median ratio (`CgroupStats::wake_latency_tail_ratio`), selected
2897        // post-merge over `stats.cgroups`. Deliberately NOT
2898        // `pooled_p99 / pooled_median` of the `worst_p99_wake_latency_us` /
2899        // `worst_median_wake_latency_us` Distributions (those re-pool the
2900        // cross-cgroup union), so the two do not satisfy
2901        // `tail_ratio == pooled_p99/pooled_median`.
2902        //
2903        // CROSS-RUN FOLD = unweighted exclude-missing MEAN (NOT MAX), by
2904        // design. `MetricKind::WakeLatencyTailRatio` is a WITHIN-RUN
2905        // worst-across-cgroups selector; cross-RUN `aggregate_finite`
2906        // MEAN-folds the per-run worst values over ONLY the runs that cleared
2907        // the floor (divisor = present-finite-contributor count), so a cohort
2908        // of repeated runs reports its TYPICAL worst-cgroup tail amplification
2909        // — the operator-facing cohort-comparison default shared with every
2910        // WorstLowest selector. It deliberately does NOT fold by MAX: MAX
2911        // (peak-of-peaks) is reserved for `SampleReduction::Worst`
2912        // (worst_run_delay_us), a peak DETECTOR; this answers "what is this
        // cohort's characteristic worst-cgroup tail". Aligning worst-across
        // selectors to a cross-RUN extremum is a separately tracked product decision
        // (see the EXTREMUM ASYMMETRY note in `aggregate_finite`); this entry keeps MEAN.
2916        //
2917        // Samples-required noise gate, enforced at the PRODUCER (not an
2918        // accessor): `crate::assert::populate_run_distribution_metrics` emits
2919        // NO ext key when the run completed fewer than
2920        // [`WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS`] iterations (with few
2921        // samples the p99 estimate is effectively the observed maximum and the
2922        // ratio is dominated by a single outlier, not a distributional signal),
2923        // and none when no cgroup carried a measurable tail. An absent key is
2924        // EXCLUDED from the cross-RUN mean (no sub-threshold run dilutes the
2925        // cohort) and read as `None` by `compare_rows`, where the `(None,
2926        // None)` arm skips the pair entirely (no verdict, no coverage diff).
2927        // This REPLACES the deleted
2928        // typed field's accessor gate, which (a) summed every passing run's raw
2929        // ratio over `passes_observed` cross-RUN — folding noisy low-N runs in
2930        // as real values — and (b) re-gated the AGGREGATED row against a MEANED
2931        // iteration count. See [`WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS`] for
2932        // the threshold-value rationale.
2933        //
2934        // accessor |_| None: ext_metrics-sourced from the post-merge producer.
2935        name: "worst_wake_latency_tail_ratio",
2936        polarity: crate::test_support::Polarity::LowerBetter,
2937        kind: MetricKind::WakeLatencyTailRatio,
2938        default_abs: 0.5,
2939        default_rel: 0.25,
2940        display_unit: "x",
2941        accessor: |_| None,
2942    },
2943    MetricDef {
2944        // Per-worker iteration throughput, worst (lowest) cgroup.
2945        // `HigherBetter` mirrors [`total_iterations`]: a cgroup that
2946        // fell behind regresses this downward, and a cross-variant
2947        // improvement raises it. `default_abs = 10.0` is the absolute
2948        // iteration-count floor below which deltas are noise;
2949        // `default_rel = 0.10` mirrors the `total_iterations` gate.
2950        //
2951        // Derivation of `abs = 10`: this metric is PER-WORKER. In-tree
2952        // fixtures span `workers_per_cgroup` from 1 through 8 (see
2953        // the KtstrTestEntry declarations under src/scenario/*.rs and
2954        // tests/*.rs); `CtxBuilder`'s `workers_per_cgroup`
2955        // defaults to 1, with scenario-level overrides raising it. A
2956        // per-worker floor of 10 therefore corresponds to
2957        // aggregate regressions of 10-80 total iterations across the
2958        // supported worker counts — high enough that a lightly-
2959        // loaded scheduler's jitter does not flag a regression, low
2960        // enough that a genuine drop (e.g. a cgroup that fell behind
2961        // by 10 iterations at workers=1, or 80 at workers=8) still
2962        // trips the gate. Going below 10 would flag normal cross-run
2963        // jitter on single-worker configs; going above 10 would mask
2964        // regressions on low-worker-count tests. The `rel=0.10`
2965        // companion gate handles larger throughputs proportionally,
2966        // so the `abs=10` floor only binds in the small-count regime
2967        // where rel-only would let single-digit losses slip through.
2968        //
2969        // WorstLowest kind: the lowest (worst) cgroup's
2970        // total_iterations / num_workers, re-pooled post-merge by
2971        // `crate::assert::populate_run_distribution_metrics` from the
2972        // per-cgroup counters via the None-aware lowest-wins fold (a
2973        // measured Some(0.0) wins; a no-workers None is skipped). Accessor
2974        // |_| None — ext_metrics-sourced; an all-None cohort writes no key.
2975        name: "worst_iterations_per_worker",
2976        polarity: crate::test_support::Polarity::HigherBetter,
2977        kind: MetricKind::WorstLowest {
2978            numerator: WorstLowestNumerator::Iterations,
2979            denominator: WorstLowestDenominator::NumWorkers,
2980        },
2981        default_abs: 10.0,
2982        default_rel: 0.10,
2983        display_unit: "",
2984        accessor: |_| None,
2985    },
2986    MetricDef {
2987        // Overcommit-INVARIANT per-cgroup efficiency (iterations per
2988        // CPU-second). `HigherBetter`: a cgroup that lost efficiency
2989        // regresses this downward. Unlike worst_iterations_per_worker
2990        // (raw work, scales with the host-CPU budget), this is the metric
2991        // to compare across `cpu_budget` settings — the overcommit marker
2992        // and compare-path warning point operators here.
2993        //
2994        // `default_rel = 0.10` is the binding proportional gate (a 10%
2995        // efficiency change is the regression signal), mirroring the
2996        // per-worker sibling. `default_abs = 10.0` (iterations/CPU-second)
2997        // is a near-zero noise floor: for any real busy workload the rate
2998        // is orders of magnitude larger, so the floor only binds for a
2999        // near-idle cgroup, where it stops a large rel% on a tiny rate
3000        // from flagging jitter. Distinct from the per-worker metric's
3001        // floor (which scales with worker count) — this is a per-second
3002        // rate, so the floor is a flat anti-noise guard, not a per-worker
3003        // derivation.
3004        //
3005        // WorstLowest kind: the lowest (worst) cgroup's
3006        // total_iterations / (total_cpu_time_ns / 1e9), re-pooled post-merge
3007        // by `crate::assert::populate_run_distribution_metrics` (None when a
3008        // cgroup has no workers or no on-CPU time; lowest measured wins).
3009        // Accessor |_| None — ext_metrics-sourced.
3010        name: "worst_iterations_per_cpu_sec",
3011        polarity: crate::test_support::Polarity::HigherBetter,
3012        kind: MetricKind::WorstLowest {
3013            numerator: WorstLowestNumerator::Iterations,
3014            denominator: WorstLowestDenominator::CpuTimeNs,
3015        },
3016        default_abs: 10.0,
3017        default_rel: 0.10,
3018        // Same physical quantity as the pooled iterations_per_cpu_sec Rate;
3019        // share its unit string rather than leaving this one under-specified.
3020        display_unit: "iter/cpu-s",
3021        accessor: |_| None,
3022    },
3023    MetricDef {
3024        // The WORST (lowest) per-cgroup page-locality fraction across the run.
3025        // HigherBetter, so lowest-wins = worst — a WorstLowest selector
3026        // (None-aware: a measured 0.0, all pages off-node, WINS the lowest; a
3027        // cgroup that measured no NUMA pages is skipped, not a 0.0 sentinel).
3028        // Re-pooled post-merge from the per-phase NUMA carriers
3029        // (assert::populate_run_distribution_metrics, numa_agg_per_cgroup) — NOT
3030        // a typed field: the reports-only CgroupStats hardcodes page_locality 0.0
3031        // (no expected-node set), and the prior typed Gauge field folded via
3032        // fold_lowest_nonzero, which SKIPPED a measured 0.0 and reported a
3033        // better-than-worst cross-run value. accessor None: ext-sourced.
3034        name: "worst_page_locality",
3035        polarity: crate::test_support::Polarity::HigherBetter,
3036        kind: MetricKind::WorstLowest {
3037            numerator: WorstLowestNumerator::NumaLocal,
3038            denominator: WorstLowestDenominator::NumaTotal,
3039        },
3040        default_abs: 0.05,
3041        default_rel: 0.10,
3042        display_unit: "",
3043        accessor: |_| None,
3044    },
3045    MetricDef {
3046        // The WORST (highest) per-cgroup cross-node migration-churn ratio across
3047        // the run. LowerBetter, so highest-wins = worst — a WorstCrossNodeRatio
3048        // max-selector re-pooled post-merge from the per-phase NUMA carriers
3049        // (assert::populate_run_distribution_metrics, numa_agg_per_cgroup) — NOT a
3050        // typed field: the prior typed Gauge(Last) field/GauntletRow column was
3051        // merge-max-folded within-run but cross-run averaged each run's value over
3052        // passes_observed (folding a NUMA-less run's 0.0 sentinel in), AND diverged
3053        // from run_metric (which already re-derived from the per-phase carriers), so
3054        // the sidecar and the in-test read gave different values on multi-phase
3055        // runs. accessor None: ext-sourced.
3056        name: "worst_cross_node_migration_ratio",
3057        polarity: crate::test_support::Polarity::LowerBetter,
3058        kind: MetricKind::WorstCrossNodeRatio,
3059        default_abs: 0.05,
3060        default_rel: 0.20,
3061        display_unit: "",
3062        accessor: |_| None,
3063    },
    // -- schbench per-phase metrics (MetricKind::PerPhase) --
    // Derived ONCE per phase by `crate::assert::derive_phase_metrics`
    // from the phase's pooled schbench histograms / run-delay raw pairs, and written
    // directly into `PhaseBucket::metrics`. They are is_derived (skipped by the
    // within-run reducers + the phase-bucket merge) with no run-level producer; a
    // per-phase A/B claim reads them via `phase_metric`. `accessor: |_| None` — they
    // never live on a `GauntletRow`. Latency p50/p90 mirror
    // worst_median_wake_latency_us (abs 20); p99/p999 + sched-delay mirror
    // worst_p99/mean (abs 50); every latency and sched-delay entry uses rel 0.25.
    // The loop-count throughput entry sets its own gate (abs 1, rel 0.30).
3072    MetricDef {
3073        name: SCHBENCH_WAKEUP_P50_US,
3074        polarity: crate::test_support::Polarity::LowerBetter,
3075        kind: MetricKind::PerPhase,
3076        default_abs: 20.0,
3077        default_rel: 0.25,
3078        display_unit: "\u{00b5}s",
3079        accessor: |_| None,
3080    },
3081    MetricDef {
3082        name: SCHBENCH_WAKEUP_P90_US,
3083        polarity: crate::test_support::Polarity::LowerBetter,
3084        kind: MetricKind::PerPhase,
3085        default_abs: 20.0,
3086        default_rel: 0.25,
3087        display_unit: "\u{00b5}s",
3088        accessor: |_| None,
3089    },
3090    MetricDef {
3091        name: SCHBENCH_WAKEUP_P99_US,
3092        polarity: crate::test_support::Polarity::LowerBetter,
3093        kind: MetricKind::PerPhase,
3094        default_abs: 50.0,
3095        default_rel: 0.25,
3096        display_unit: "\u{00b5}s",
3097        accessor: |_| None,
3098    },
3099    MetricDef {
3100        name: SCHBENCH_WAKEUP_P999_US,
3101        polarity: crate::test_support::Polarity::LowerBetter,
3102        kind: MetricKind::PerPhase,
3103        default_abs: 50.0,
3104        default_rel: 0.25,
3105        display_unit: "\u{00b5}s",
3106        accessor: |_| None,
3107    },
3108    MetricDef {
3109        name: SCHBENCH_REQUEST_P50_US,
3110        polarity: crate::test_support::Polarity::LowerBetter,
3111        kind: MetricKind::PerPhase,
3112        default_abs: 20.0,
3113        default_rel: 0.25,
3114        display_unit: "\u{00b5}s",
3115        accessor: |_| None,
3116    },
3117    MetricDef {
3118        name: SCHBENCH_REQUEST_P90_US,
3119        polarity: crate::test_support::Polarity::LowerBetter,
3120        kind: MetricKind::PerPhase,
3121        default_abs: 20.0,
3122        default_rel: 0.25,
3123        display_unit: "\u{00b5}s",
3124        accessor: |_| None,
3125    },
3126    MetricDef {
3127        name: SCHBENCH_REQUEST_P99_US,
3128        polarity: crate::test_support::Polarity::LowerBetter,
3129        kind: MetricKind::PerPhase,
3130        default_abs: 50.0,
3131        default_rel: 0.25,
3132        display_unit: "\u{00b5}s",
3133        accessor: |_| None,
3134    },
3135    MetricDef {
3136        name: SCHBENCH_REQUEST_P999_US,
3137        polarity: crate::test_support::Polarity::LowerBetter,
3138        kind: MetricKind::PerPhase,
3139        default_abs: 50.0,
3140        default_rel: 0.25,
3141        display_unit: "\u{00b5}s",
3142        accessor: |_| None,
3143    },
3144    MetricDef {
3145        name: SCHBENCH_SCHED_DELAY_MSG_US,
3146        polarity: crate::test_support::Polarity::LowerBetter,
3147        kind: MetricKind::PerPhase,
3148        default_abs: 50.0,
3149        default_rel: 0.25,
3150        display_unit: "\u{00b5}s",
3151        accessor: |_| None,
3152    },
3153    MetricDef {
3154        name: SCHBENCH_SCHED_DELAY_WORKER_US,
3155        polarity: crate::test_support::Polarity::LowerBetter,
3156        kind: MetricKind::PerPhase,
3157        default_abs: 50.0,
3158        default_rel: 0.25,
3159        display_unit: "\u{00b5}s",
3160        accessor: |_| None,
3161    },
3162    MetricDef {
3163        // Completed work cycles in the phase — more is better (throughput).
3164        name: SCHBENCH_LOOP_COUNT,
3165        polarity: crate::test_support::Polarity::HigherBetter,
3166        kind: MetricKind::PerPhase,
3167        default_abs: 1.0,
3168        default_rel: 0.30,
3169        display_unit: "",
3170        accessor: |_| None,
3171    },
    // taobench per-phase qps + hit ratios (WorkType::Taobench engine, derived by
    // write_taobench_scalars). total/fast qps are HigherBetter (throughput).
    // slow_qps (a component of total throughput) and hit_ratio / hit_rate
    // (run-validity signals) are Informational: never a regression direction —
    // see classify_direction.
3176    MetricDef {
3177        name: TAOBENCH_TOTAL_QPS,
3178        polarity: crate::test_support::Polarity::HigherBetter,
3179        kind: MetricKind::PerPhase,
3180        default_abs: 10.0,
3181        default_rel: 0.10,
3182        display_unit: "ops/s",
3183        accessor: |_| None,
3184    },
3185    MetricDef {
3186        name: TAOBENCH_FAST_QPS,
3187        polarity: crate::test_support::Polarity::HigherBetter,
3188        kind: MetricKind::PerPhase,
3189        default_abs: 10.0,
3190        default_rel: 0.10,
3191        display_unit: "ops/s",
3192        accessor: |_| None,
3193    },
3194    MetricDef {
3195        name: TAOBENCH_SLOW_QPS,
3196        polarity: crate::test_support::Polarity::Informational,
3197        kind: MetricKind::PerPhase,
3198        default_abs: 10.0,
3199        default_rel: 0.10,
3200        display_unit: "ops/s",
3201        accessor: |_| None,
3202    },
3203    MetricDef {
3204        name: TAOBENCH_HIT_RATIO,
3205        polarity: crate::test_support::Polarity::Informational,
3206        kind: MetricKind::PerPhase,
3207        default_abs: 0.02,
3208        default_rel: 0.05,
3209        display_unit: "",
3210        accessor: |_| None,
3211    },
3212    MetricDef {
3213        name: TAOBENCH_HIT_RATE,
3214        polarity: crate::test_support::Polarity::Informational,
3215        kind: MetricKind::PerPhase,
3216        default_abs: 0.02,
3217        default_rel: 0.05,
3218        display_unit: "",
3219        accessor: |_| None,
3220    },
3221    // taobench per-phase open-loop SERVE-LATENCY percentiles (µs, LowerBetter,
3222    // PerPhase): the coordinated-omission serve distribution per phase. Thresholds
3223    // mirror the schbench per-phase latency siblings (p50/p90/min abs 20; p99/p999
3224    // abs 50; max abs 50 / rel 0.50 for the noisier tail). Absent in closed loop.
3225    MetricDef {
3226        name: TAOBENCH_SERVE_P50_US,
3227        polarity: crate::test_support::Polarity::LowerBetter,
3228        kind: MetricKind::PerPhase,
3229        default_abs: 20.0,
3230        default_rel: 0.25,
3231        display_unit: "\u{00b5}s",
3232        accessor: |_| None,
3233    },
3234    MetricDef {
3235        name: TAOBENCH_SERVE_P90_US,
3236        polarity: crate::test_support::Polarity::LowerBetter,
3237        kind: MetricKind::PerPhase,
3238        default_abs: 20.0,
3239        default_rel: 0.25,
3240        display_unit: "\u{00b5}s",
3241        accessor: |_| None,
3242    },
3243    MetricDef {
3244        name: TAOBENCH_SERVE_P99_US,
3245        polarity: crate::test_support::Polarity::LowerBetter,
3246        kind: MetricKind::PerPhase,
3247        default_abs: 50.0,
3248        default_rel: 0.25,
3249        display_unit: "\u{00b5}s",
3250        accessor: |_| None,
3251    },
3252    MetricDef {
3253        name: TAOBENCH_SERVE_P999_US,
3254        polarity: crate::test_support::Polarity::LowerBetter,
3255        kind: MetricKind::PerPhase,
3256        default_abs: 50.0,
3257        default_rel: 0.25,
3258        display_unit: "\u{00b5}s",
3259        accessor: |_| None,
3260    },
3261    MetricDef {
3262        name: TAOBENCH_SERVE_MIN_US,
3263        polarity: crate::test_support::Polarity::LowerBetter,
3264        kind: MetricKind::PerPhase,
3265        default_abs: 20.0,
3266        default_rel: 0.25,
3267        display_unit: "\u{00b5}s",
3268        accessor: |_| None,
3269    },
3270    MetricDef {
3271        name: TAOBENCH_SERVE_MAX_US,
3272        polarity: crate::test_support::Polarity::LowerBetter,
3273        kind: MetricKind::PerPhase,
3274        default_abs: 50.0,
3275        default_rel: 0.50,
3276        display_unit: "\u{00b5}s",
3277        accessor: |_| None,
3278    },
3279    // taobench WHOLE-RUN qps + hit Rates and their Counter components, pooled
3280    // cross-cgroup by `crate::assert::populate_run_pooled_taobench` and derived
3281    // by `derive_rate_metrics`. The four `total_taobench_*` Counters are the rate
3282    // components (ext_metrics-only, accessor |_| None; `total_` prefix satisfies
3283    // the Counter naming gate) and are `RENDER_SUPPRESSED_COMPONENTS`, so their
3284    // default_abs/default_rel are inert at the compare layer — the entries exist
3285    // for the re-pool (`name` is the component key, `kind` drives the Counter
3286    // SUM-fold). Cross-RUN each component SUMs, so the Rates re-pool as
3287    // Σnumerator / Σdenominator (aggregate throughput, not a mean of per-run qps).
3288    // HIT is exposed whole-run BOTH ways: the RESPONSE-time taobench_hit_fraction
3289    // (Σfast/Σcompleted) AND the COMMAND-time taobench_command_hit_rate (Σhits/Σcmds,
3290    // hits = cmds − misses — the whole-run analog of the per-phase
3291    // taobench_hit_rate). Under closed-loop every issued lookup completes so the
3292    // two converge; under OPEN-LOOP arrival they diverge (a slow/overloaded run
3293    // issues lookups that have not yet completed), which is why both carry distinct
3294    // --noise-adjust spread and both are registered.
3295    MetricDef {
3296        name: TOTAL_TAOBENCH_OPS,
3297        polarity: crate::test_support::Polarity::HigherBetter,
3298        kind: MetricKind::Counter,
3299        default_abs: 10.0,
3300        default_rel: 0.10,
3301        display_unit: "",
3302        accessor: |_| None,
3303    },
3304    MetricDef {
3305        name: TOTAL_TAOBENCH_FAST_OPS,
3306        polarity: crate::test_support::Polarity::HigherBetter,
3307        kind: MetricKind::Counter,
3308        default_abs: 10.0,
3309        default_rel: 0.10,
3310        display_unit: "",
3311        accessor: |_| None,
3312    },
3313    MetricDef {
3314        name: TOTAL_TAOBENCH_SLOW_OPS,
3315        polarity: crate::test_support::Polarity::Informational,
3316        kind: MetricKind::Counter,
3317        default_abs: 10.0,
3318        default_rel: 0.10,
3319        display_unit: "",
3320        accessor: |_| None,
3321    },
3322    MetricDef {
3323        // Whole-run wall window (ns→s applied once at the producer), the qps
3324        // DENOMINATOR. Counter — cross-RUN SUM, mirroring `total_cpu_time_sec`,
3325        // so Σops/Σwall re-pools the cohort throughput. `total_` prefix satisfies
3326        // the Counter naming gate.
3327        name: TOTAL_TAOBENCH_WALL_SEC,
3328        polarity: crate::test_support::Polarity::HigherBetter,
3329        kind: MetricKind::Counter,
3330        default_abs: 1.0,
3331        default_rel: 0.30,
3332        display_unit: "s",
3333        accessor: |_| None,
3334    },
3335    MetricDef {
3336        // Whole-run total throughput = Σcompleted ops / Σwall-seconds.
3337        // HigherBetter (throughput). Shares the per-phase `taobench_total_qps`
3338        // thresholds. Absent when no Taobench cgroup ran or the wall window was
3339        // unmeasured (components absent).
3340        name: TAOBENCH_TOTAL_OPS_PER_SEC,
3341        polarity: crate::test_support::Polarity::HigherBetter,
3342        kind: MetricKind::Rate {
3343            numerator: TOTAL_TAOBENCH_OPS,
3344            denominator: TOTAL_TAOBENCH_WALL_SEC,
3345        },
3346        default_abs: 10.0,
3347        default_rel: 0.10,
3348        display_unit: "ops/s",
3349        accessor: |_| None,
3350    },
3351    MetricDef {
3352        // Whole-run hit (fast-path) throughput = Σfast ops / Σwall-seconds.
3353        // HigherBetter.
3354        name: TAOBENCH_FAST_OPS_PER_SEC,
3355        polarity: crate::test_support::Polarity::HigherBetter,
3356        kind: MetricKind::Rate {
3357            numerator: TOTAL_TAOBENCH_FAST_OPS,
3358            denominator: TOTAL_TAOBENCH_WALL_SEC,
3359        },
3360        default_abs: 10.0,
3361        default_rel: 0.10,
3362        display_unit: "ops/s",
3363        accessor: |_| None,
3364    },
3365    MetricDef {
3366        // Whole-run slow-path throughput = Σslow ops / Σwall-seconds.
3367        // Informational — the slow path is a component of total throughput, not a
3368        // standalone regression direction (mirrors the per-phase
3369        // `taobench_slow_qps`).
3370        name: TAOBENCH_SLOW_OPS_PER_SEC,
3371        polarity: crate::test_support::Polarity::Informational,
3372        kind: MetricKind::Rate {
3373            numerator: TOTAL_TAOBENCH_SLOW_OPS,
3374            denominator: TOTAL_TAOBENCH_WALL_SEC,
3375        },
3376        default_abs: 10.0,
3377        default_rel: 0.10,
3378        display_unit: "ops/s",
3379        accessor: |_| None,
3380    },
3381    MetricDef {
3382        // Whole-run cache hit FRACTION = Σfast ops / Σcompleted ops — the SAME
3383        // response-time hit measurement as the per-phase `taobench_hit_ratio`, at
3384        // whole-run scope (its run-level Σ/Σ pool). The name differs only by axis:
3385        // per-phase `_ratio` vs whole-run `_fraction` — a distinct registry key is
3386        // required because a Rate cannot share a name with the per-phase PerPhase
3387        // entry, and `_fraction` reads as the pooled [0, 1] ratio-of-counters (the
3388        // qps siblings diverge the same way: per-phase `_qps` vs whole-run
3389        // `_ops_per_sec`). DISTINCT from the command-time `taobench_hit_rate`
3390        // (1 - misses/cmds), which is request-time, not response-time — see the
3391        // block comment above. A fraction in [0, 1]. Informational — a hit-rate
3392        // change is a workload-shape signal, not a scheduler regression direction.
3393        // Absent when no ops completed (`total_taobench_ops` is 0 →
3394        // `derive_rate_metrics` skips the zero denominator).
3395        name: TAOBENCH_HIT_FRACTION,
3396        polarity: crate::test_support::Polarity::Informational,
3397        kind: MetricKind::Rate {
3398            numerator: TOTAL_TAOBENCH_FAST_OPS,
3399            denominator: TOTAL_TAOBENCH_OPS,
3400        },
3401        default_abs: 0.02,
3402        default_rel: 0.05,
3403        display_unit: "",
3404        accessor: |_| None,
3405    },
3406    // taobench WHOLE-RUN open-loop serve-latency percentiles (µs, LowerBetter,
3407    // PerRunDistribution): the union of the per-phase per-cgroup serve histograms,
3408    // percentile re-derived over the union by
3409    // `crate::assert::populate_run_pooled_taobench_distribution`. Noise-compared
3410    // per-run, never cross-run folded (is_derived). Thresholds mirror the
3411    // per-phase serve siblings. Absent in closed loop (no serve samples).
3412    MetricDef {
3413        name: TAOBENCH_SERVE_P50_US_WHOLE,
3414        polarity: crate::test_support::Polarity::LowerBetter,
3415        kind: MetricKind::PerRunDistribution,
3416        default_abs: 20.0,
3417        default_rel: 0.25,
3418        display_unit: "\u{00b5}s",
3419        accessor: |_| None,
3420    },
3421    MetricDef {
3422        name: TAOBENCH_SERVE_P90_US_WHOLE,
3423        polarity: crate::test_support::Polarity::LowerBetter,
3424        kind: MetricKind::PerRunDistribution,
3425        default_abs: 20.0,
3426        default_rel: 0.25,
3427        display_unit: "\u{00b5}s",
3428        accessor: |_| None,
3429    },
3430    MetricDef {
3431        name: TAOBENCH_SERVE_P99_US_WHOLE,
3432        polarity: crate::test_support::Polarity::LowerBetter,
3433        kind: MetricKind::PerRunDistribution,
3434        default_abs: 50.0,
3435        default_rel: 0.25,
3436        display_unit: "\u{00b5}s",
3437        accessor: |_| None,
3438    },
3439    MetricDef {
3440        name: TAOBENCH_SERVE_P999_US_WHOLE,
3441        polarity: crate::test_support::Polarity::LowerBetter,
3442        kind: MetricKind::PerRunDistribution,
3443        default_abs: 50.0,
3444        default_rel: 0.25,
3445        display_unit: "\u{00b5}s",
3446        accessor: |_| None,
3447    },
3448    MetricDef {
3449        name: TAOBENCH_SERVE_MIN_US_WHOLE,
3450        polarity: crate::test_support::Polarity::LowerBetter,
3451        kind: MetricKind::PerRunDistribution,
3452        default_abs: 20.0,
3453        default_rel: 0.25,
3454        display_unit: "\u{00b5}s",
3455        accessor: |_| None,
3456    },
3457    MetricDef {
3458        name: TAOBENCH_SERVE_MAX_US_WHOLE,
3459        polarity: crate::test_support::Polarity::LowerBetter,
3460        kind: MetricKind::PerRunDistribution,
3461        default_abs: 50.0,
3462        default_rel: 0.50,
3463        display_unit: "\u{00b5}s",
3464        accessor: |_| None,
3465    },
3466    // taobench WHOLE-RUN command-time hit: get_cmds + get_hits (= cmds − misses)
3467    // Counter components (ext-only, RENDER_SUPPRESSED, `total_` gate) →
3468    // taobench_command_hit_rate = Σhits/Σcmds (the request-time hit, which diverges
3469    // from the response-time taobench_hit_fraction under open-loop). Pooled by
3470    // `crate::assert::populate_run_pooled_taobench`.
3471    MetricDef {
3472        name: TOTAL_TAOBENCH_GET_CMDS,
3473        polarity: crate::test_support::Polarity::HigherBetter,
3474        kind: MetricKind::Counter,
3475        default_abs: 10.0,
3476        default_rel: 0.10,
3477        display_unit: "",
3478        accessor: |_| None,
3479    },
3480    MetricDef {
3481        name: TOTAL_TAOBENCH_GET_HITS,
3482        polarity: crate::test_support::Polarity::HigherBetter,
3483        kind: MetricKind::Counter,
3484        default_abs: 10.0,
3485        default_rel: 0.10,
3486        display_unit: "",
3487        accessor: |_| None,
3488    },
3489    MetricDef {
3490        name: TAOBENCH_COMMAND_HIT_RATE,
3491        polarity: crate::test_support::Polarity::Informational,
3492        kind: MetricKind::Rate {
3493            numerator: TOTAL_TAOBENCH_GET_HITS,
3494            denominator: TOTAL_TAOBENCH_GET_CMDS,
3495        },
3496        default_abs: 0.02,
3497        default_rel: 0.05,
3498        display_unit: "",
3499        accessor: |_| None,
3500    },
3501    // schbench WHOLE-RUN Class-3: role-separate run-delay gate Rates + their
3502    // Counter components + the whole-run loop Counter, re-pooled run-level by
3503    // `crate::assert::populate_run_pooled_schbench` from the per-phase per-cgroup
3504    // SchbenchPhaseStats raw pairs (Σ over phases+cgroups). The four run-delay /
3505    // pcount Counters are ext-only rate components (accessor |_| None;
3506    // RENDER_SUPPRESSED; `total_` prefix → Counter gate). The two Rates are the
3507    // sample-weighted Σrun_delay/Σpcount per-schedule means (the workload-scoped
3508    // siblings of the system-wide `total_run_delay_ns_per_sched`); message and
3509    // worker roles pool separately. The per-phase `sched_delay_msg/worker_us`
3510    // (PerPhase, display-only) is the SAME Σrun_delay_ns/Σpcount per-schedule
3511    // mean at phase scope -- NOT schbench's native mean-of-per-thread-means
3512    // (that is a separate whole-run stat on `SchbenchResult`, see
3513    // workload/schbench). Only these Rates gate, so no double-count.
3514    MetricDef {
3515        name: TOTAL_SCHBENCH_MSG_RUN_DELAY_NS,
3516        polarity: crate::test_support::Polarity::LowerBetter,
3517        kind: MetricKind::Counter,
3518        default_abs: 1000.0,
3519        default_rel: 0.10,
3520        display_unit: "ns",
3521        accessor: |_| None,
3522    },
3523    MetricDef {
3524        name: TOTAL_SCHBENCH_MSG_PCOUNT,
3525        polarity: crate::test_support::Polarity::Informational,
3526        kind: MetricKind::Counter,
3527        default_abs: 1.0,
3528        default_rel: 0.10,
3529        display_unit: "",
3530        accessor: |_| None,
3531    },
3532    MetricDef {
3533        name: TOTAL_SCHBENCH_WORKER_RUN_DELAY_NS,
3534        polarity: crate::test_support::Polarity::LowerBetter,
3535        kind: MetricKind::Counter,
3536        default_abs: 1000.0,
3537        default_rel: 0.10,
3538        display_unit: "ns",
3539        accessor: |_| None,
3540    },
3541    MetricDef {
3542        name: TOTAL_SCHBENCH_WORKER_PCOUNT,
3543        polarity: crate::test_support::Polarity::Informational,
3544        kind: MetricKind::Counter,
3545        default_abs: 1.0,
3546        default_rel: 0.10,
3547        display_unit: "",
3548        accessor: |_| None,
3549    },
3550    MetricDef {
3551        // Whole-run completed work cycles (Σ over phases+cgroups). HigherBetter
3552        // (throughput). NOT a rate component, so NOT suppressed. Uses the tighter
3553        // rel 0.10 throughput-Counter band shared by its structural peers
3554        // total_iterations / total_phase_iterations (HigherBetter completed-work
3555        // Counters): the whole-run Σ pools every cycle, so a 10-29% drop is a real
3556        // regression, not noise. The per-phase twin `schbench_loop_count`
3557        // (PerPhase) DELIBERATELY keeps the looser rel 0.30 for a
3558        // SMALL-SAMPLE-WINDOW reason, not an accounting one: a single phase pools
3559        // far fewer completed cycles than the whole-run Σ, so its run-to-run
3560        // relative variance (CV) is higher and needs a wider band. It is NOT
3561        // phase-edge jitter -- the per-phase counts partition EXACTLY to the
3562        // whole-run total and cycles are whole, never fractional (schbench/run.rs
3563        // increments once per completed cycle, drains a whole count at the phase
3564        // boundary). The nearest per-phase RAW-COUNT peer, total_phase_iterations,
3565        // itself gates at 0.10; loop_count's 0.30 is the small-window-CV
3566        // exception, not a like-for-like registry precedent. default_abs is the
3567        // near-idle activity floor; default_rel carries materiality (see
3568        // MetricDef::default_abs).
3569        name: TOTAL_SCHBENCH_LOOPS,
3570        polarity: crate::test_support::Polarity::HigherBetter,
3571        kind: MetricKind::Counter,
3572        default_abs: 1.0,
3573        default_rel: 0.10,
3574        display_unit: "",
3575        accessor: |_| None,
3576    },
3577    MetricDef {
3578        // Message-thread per-schedule run-delay mean = Σrun_delay_ns / Σpcount
3579        // (sample-weighted, NOT mean-of-per-run-means). LowerBetter (higher
3580        // scheduling wait is worse). Absent when no message thread was scheduled
3581        // (Σpcount == 0).
3582        name: SCHBENCH_MSG_RUN_DELAY_NS_PER_SCHED,
3583        polarity: crate::test_support::Polarity::LowerBetter,
3584        kind: MetricKind::Rate {
3585            numerator: TOTAL_SCHBENCH_MSG_RUN_DELAY_NS,
3586            denominator: TOTAL_SCHBENCH_MSG_PCOUNT,
3587        },
3588        default_abs: 100.0,
3589        default_rel: 0.10,
3590        display_unit: "ns",
3591        accessor: |_| None,
3592    },
3593    MetricDef {
3594        // Worker per-schedule run-delay mean = Σrun_delay_ns / Σpcount. LowerBetter.
3595        // Pooled SEPARATELY from the message role (different per-schedule wait
3596        // populations). Absent when no worker was scheduled (Σpcount == 0).
3597        name: SCHBENCH_WORKER_RUN_DELAY_NS_PER_SCHED,
3598        polarity: crate::test_support::Polarity::LowerBetter,
3599        kind: MetricKind::Rate {
3600            numerator: TOTAL_SCHBENCH_WORKER_RUN_DELAY_NS,
3601            denominator: TOTAL_SCHBENCH_WORKER_PCOUNT,
3602        },
3603        default_abs: 100.0,
3604        default_rel: 0.10,
3605        display_unit: "ns",
3606        accessor: |_| None,
3607    },
3608    // Per-phase latency min/max. LowerBetter (a higher min/max latency is worse).
3609    // min is a low-tail value → p50/p90 abs tier (20). max is a PEAK (a single
3610    // extreme sample, the flakiest latency stat) → the peak rel tolerance (0.50,
3611    // matching worst_gap_ms) so one outlier spike does not fabricate a regression.
3612    MetricDef {
3613        name: SCHBENCH_WAKEUP_MIN_US,
3614        polarity: crate::test_support::Polarity::LowerBetter,
3615        kind: MetricKind::PerPhase,
3616        default_abs: 20.0,
3617        default_rel: 0.25,
3618        display_unit: "\u{00b5}s",
3619        accessor: |_| None,
3620    },
3621    MetricDef {
3622        name: SCHBENCH_WAKEUP_MAX_US,
3623        polarity: crate::test_support::Polarity::LowerBetter,
3624        kind: MetricKind::PerPhase,
3625        default_abs: 50.0,
3626        default_rel: 0.50,
3627        display_unit: "\u{00b5}s",
3628        accessor: |_| None,
3629    },
3630    MetricDef {
3631        name: SCHBENCH_REQUEST_MIN_US,
3632        polarity: crate::test_support::Polarity::LowerBetter,
3633        kind: MetricKind::PerPhase,
3634        default_abs: 20.0,
3635        default_rel: 0.25,
3636        display_unit: "\u{00b5}s",
3637        accessor: |_| None,
3638    },
3639    MetricDef {
3640        name: SCHBENCH_REQUEST_MAX_US,
3641        polarity: crate::test_support::Polarity::LowerBetter,
3642        kind: MetricKind::PerPhase,
3643        default_abs: 50.0,
3644        default_rel: 0.50,
3645        display_unit: "\u{00b5}s",
3646        accessor: |_| None,
3647    },
3648    // Per-phase achieved-RPS distribution (PLIST_FOR_RPS = 20/50/90, schbench.c:130)
3649    // + min/max (schbench.c:579 stderr footer + :713-714/:1963 JSON — parity with
3650    // what schbench emits, not an extension). HigherBetter (more requests/sec = more
3651    // throughput) — note min/max INVERT the latency polarity (a higher worst-second
3652    // rate is better). A per-second RATE spanning tens..tens-of-thousands, so
3653    // rel-dominant (rel 0.10) with a near-idle abs floor (10) — NOT loop_count's
3654    // count-style abs 1/rel 0.30.
3655    //
3656    // rps min/max keep the percentile-tier rel (0.10), NOT the loosened latency-max
3657    // tier (0.50): each rps sample is a 1-second-AVERAGED rate (cycles completed that
3658    // second), not a single event like a latency sample, so an rps extreme is the
3659    // worst/best SECOND — far less flaky than a latency per-request peak, and the
3660    // worst-second is a meaningful scheduler-tail signal worth a tight gate. (The
3661    // latency-max 0.50 loosening guards single-request spikes that do not exist in the
3662    // 1s-averaged rps series.)
3663    //
3664    // rps_min is UNRELIABLE when any 0-rps (starvation) second occurs: TWO independent
3665    // paths drop a real 0 from the min. (1) Within a histogram, add_lat's min==0
3666    // sentinel (plat.rs `if min==0 || us<min`) treats 0 as "unset" — a 0 sets min=0 but
3667    // the next sample replaces it (e.g. [100,0,200] -> min=200), so min reads 0 only
3668    // when a 0 is the last min-lowering sample. (2) Across cgroups, PlatStats::combine's
3669    // `other.min != 0` guard (plat.rs:230 — correct for latency, where 0 means empty)
3670    // skips a starved cgroup's min=0 when pooling, so a 0-rps cgroup pooled with a
3671    // nonzero one leaves rps_min nonzero. rps_min is thus a trustworthy worst-second
3672    // floor only absent 0-seconds. Sustained starvation (0-seconds >= 20% of the
3673    // window) still shows in rps_p20, which reads the pooled histogram's bucket 0
3674    // (folded unconditionally, plat.rs:223-224) in both cases; a single 0-second in a longer
3675    // window is below p20 and lost from rps_min — invisible to both. Faithful to
3676    // schbench's add_lat min sentinel.
3677    MetricDef {
3678        name: SCHBENCH_RPS_P20,
3679        polarity: crate::test_support::Polarity::HigherBetter,
3680        kind: MetricKind::PerPhase,
3681        default_abs: 10.0,
3682        default_rel: 0.10,
3683        display_unit: "req/s",
3684        accessor: |_| None,
3685    },
3686    MetricDef {
3687        name: SCHBENCH_RPS_P50,
3688        polarity: crate::test_support::Polarity::HigherBetter,
3689        kind: MetricKind::PerPhase,
3690        default_abs: 10.0,
3691        default_rel: 0.10,
3692        display_unit: "req/s",
3693        accessor: |_| None,
3694    },
3695    MetricDef {
3696        name: SCHBENCH_RPS_P90,
3697        polarity: crate::test_support::Polarity::HigherBetter,
3698        kind: MetricKind::PerPhase,
3699        default_abs: 10.0,
3700        default_rel: 0.10,
3701        display_unit: "req/s",
3702        accessor: |_| None,
3703    },
3704    MetricDef {
3705        name: SCHBENCH_RPS_MIN,
3706        polarity: crate::test_support::Polarity::HigherBetter,
3707        kind: MetricKind::PerPhase,
3708        default_abs: 10.0,
3709        default_rel: 0.10,
3710        display_unit: "req/s",
3711        accessor: |_| None,
3712    },
3713    MetricDef {
3714        name: SCHBENCH_RPS_MAX,
3715        polarity: crate::test_support::Polarity::HigherBetter,
3716        kind: MetricKind::PerPhase,
3717        default_abs: 10.0,
3718        default_rel: 0.10,
3719        display_unit: "req/s",
3720        accessor: |_| None,
3721    },
3722    // schbench WHOLE-RUN distributional metrics (MetricKind::
3723    // PerRunDistribution): each per-phase percentile/min/max above, re-pooled
3724    // run-level by populate_run_pooled_schbench_distribution (union of the
3725    // per-phase per-cgroup PlatStats histograms, percentile re-derived over the
3726    // union — the faithful percentile-of-union). Noise-compared per-run (never
3727    // cross-run folded). Thresholds + polarity + unit mirror the per-phase
3728    // sibling. accessor |_| None (ext-only, written by the union populate).
3729    MetricDef {
3730        name: SCHBENCH_WAKEUP_P50_US_WHOLE,
3731        polarity: crate::test_support::Polarity::LowerBetter,
3732        kind: MetricKind::PerRunDistribution,
3733        default_abs: 20.0,
3734        default_rel: 0.25,
3735        display_unit: "\u{00b5}s",
3736        accessor: |_| None,
3737    },
3738    MetricDef {
3739        name: SCHBENCH_WAKEUP_P90_US_WHOLE,
3740        polarity: crate::test_support::Polarity::LowerBetter,
3741        kind: MetricKind::PerRunDistribution,
3742        default_abs: 20.0,
3743        default_rel: 0.25,
3744        display_unit: "\u{00b5}s",
3745        accessor: |_| None,
3746    },
3747    MetricDef {
3748        name: SCHBENCH_WAKEUP_P99_US_WHOLE,
3749        polarity: crate::test_support::Polarity::LowerBetter,
3750        kind: MetricKind::PerRunDistribution,
3751        default_abs: 50.0,
3752        default_rel: 0.25,
3753        display_unit: "\u{00b5}s",
3754        accessor: |_| None,
3755    },
3756    MetricDef {
3757        name: SCHBENCH_WAKEUP_P999_US_WHOLE,
3758        polarity: crate::test_support::Polarity::LowerBetter,
3759        kind: MetricKind::PerRunDistribution,
3760        default_abs: 50.0,
3761        default_rel: 0.25,
3762        display_unit: "\u{00b5}s",
3763        accessor: |_| None,
3764    },
3765    MetricDef {
3766        name: SCHBENCH_WAKEUP_MIN_US_WHOLE,
3767        polarity: crate::test_support::Polarity::LowerBetter,
3768        kind: MetricKind::PerRunDistribution,
3769        default_abs: 20.0,
3770        default_rel: 0.25,
3771        display_unit: "\u{00b5}s",
3772        accessor: |_| None,
3773    },
3774    MetricDef {
3775        name: SCHBENCH_WAKEUP_MAX_US_WHOLE,
3776        polarity: crate::test_support::Polarity::LowerBetter,
3777        kind: MetricKind::PerRunDistribution,
3778        default_abs: 50.0,
3779        default_rel: 0.50,
3780        display_unit: "\u{00b5}s",
3781        accessor: |_| None,
3782    },
3783    MetricDef {
3784        name: SCHBENCH_REQUEST_P50_US_WHOLE,
3785        polarity: crate::test_support::Polarity::LowerBetter,
3786        kind: MetricKind::PerRunDistribution,
3787        default_abs: 20.0,
3788        default_rel: 0.25,
3789        display_unit: "\u{00b5}s",
3790        accessor: |_| None,
3791    },
3792    MetricDef {
3793        name: SCHBENCH_REQUEST_P90_US_WHOLE,
3794        polarity: crate::test_support::Polarity::LowerBetter,
3795        kind: MetricKind::PerRunDistribution,
3796        default_abs: 20.0,
3797        default_rel: 0.25,
3798        display_unit: "\u{00b5}s",
3799        accessor: |_| None,
3800    },
3801    MetricDef {
3802        name: SCHBENCH_REQUEST_P99_US_WHOLE,
3803        polarity: crate::test_support::Polarity::LowerBetter,
3804        kind: MetricKind::PerRunDistribution,
3805        default_abs: 50.0,
3806        default_rel: 0.25,
3807        display_unit: "\u{00b5}s",
3808        accessor: |_| None,
3809    },
3810    MetricDef {
3811        name: SCHBENCH_REQUEST_P999_US_WHOLE,
3812        polarity: crate::test_support::Polarity::LowerBetter,
3813        kind: MetricKind::PerRunDistribution,
3814        default_abs: 50.0,
3815        default_rel: 0.25,
3816        display_unit: "\u{00b5}s",
3817        accessor: |_| None,
3818    },
3819    MetricDef {
3820        name: SCHBENCH_REQUEST_MIN_US_WHOLE,
3821        polarity: crate::test_support::Polarity::LowerBetter,
3822        kind: MetricKind::PerRunDistribution,
3823        default_abs: 20.0,
3824        default_rel: 0.25,
3825        display_unit: "\u{00b5}s",
3826        accessor: |_| None,
3827    },
3828    MetricDef {
3829        name: SCHBENCH_REQUEST_MAX_US_WHOLE,
3830        polarity: crate::test_support::Polarity::LowerBetter,
3831        kind: MetricKind::PerRunDistribution,
3832        default_abs: 50.0,
3833        default_rel: 0.50,
3834        display_unit: "\u{00b5}s",
3835        accessor: |_| None,
3836    },
3837    MetricDef {
3838        name: SCHBENCH_RPS_P20_WHOLE,
3839        polarity: crate::test_support::Polarity::HigherBetter,
3840        kind: MetricKind::PerRunDistribution,
3841        default_abs: 10.0,
3842        default_rel: 0.10,
3843        display_unit: "req/s",
3844        accessor: |_| None,
3845    },
3846    MetricDef {
3847        name: SCHBENCH_RPS_P50_WHOLE,
3848        polarity: crate::test_support::Polarity::HigherBetter,
3849        kind: MetricKind::PerRunDistribution,
3850        default_abs: 10.0,
3851        default_rel: 0.10,
3852        display_unit: "req/s",
3853        accessor: |_| None,
3854    },
3855    MetricDef {
3856        name: SCHBENCH_RPS_P90_WHOLE,
3857        polarity: crate::test_support::Polarity::HigherBetter,
3858        kind: MetricKind::PerRunDistribution,
3859        default_abs: 10.0,
3860        default_rel: 0.10,
3861        display_unit: "req/s",
3862        accessor: |_| None,
3863    },
3864    MetricDef {
3865        name: SCHBENCH_RPS_MIN_WHOLE,
3866        polarity: crate::test_support::Polarity::HigherBetter,
3867        kind: MetricKind::PerRunDistribution,
3868        default_abs: 10.0,
3869        default_rel: 0.10,
3870        display_unit: "req/s",
3871        accessor: |_| None,
3872    },
3873    MetricDef {
3874        name: SCHBENCH_RPS_MAX_WHOLE,
3875        polarity: crate::test_support::Polarity::HigherBetter,
3876        kind: MetricKind::PerRunDistribution,
3877        default_abs: 10.0,
3878        default_rel: 0.10,
3879        display_unit: "req/s",
3880        accessor: |_| None,
3881    },
3882    // -- Per-cgroup per-phase NON-schbench families. PerPhase, `accessor:
3883    // |_| None`: read from `PhaseCgroupStats::metrics` by name (written by
3884    // `write_carrier_scalars`), never from a `GauntletRow`. BARE per-cgroup
3885    // names (NOT the run-level `worst_*`): a single cgroup's value is not a
3886    // "worst across cgroups", and reusing `worst_*` would collide
3887    // `metric_def` with the run-level selector. Thresholds mirror the
3888    // analogous `worst_*` entries. (`iterations_per_cpu_sec` is intentionally
3889    // absent — it is already a Rate entry above; the per-cgroup value resolves
3890    // through that name without a second registration.)
3891    MetricDef {
3892        name: "p99_wake_latency_us",
3893        polarity: crate::test_support::Polarity::LowerBetter,
3894        kind: MetricKind::PerPhase,
3895        default_abs: 50.0,
3896        default_rel: 0.25,
3897        display_unit: "\u{00b5}s",
3898        accessor: |_| None,
3899    },
3900    MetricDef {
3901        name: "median_wake_latency_us",
3902        polarity: crate::test_support::Polarity::LowerBetter,
3903        kind: MetricKind::PerPhase,
3904        default_abs: 20.0,
3905        default_rel: 0.25,
3906        display_unit: "\u{00b5}s",
3907        accessor: |_| None,
3908    },
3909    MetricDef {
3910        name: "wake_latency_cv",
3911        polarity: crate::test_support::Polarity::LowerBetter,
3912        kind: MetricKind::PerPhase,
3913        default_abs: 0.10,
3914        default_rel: 0.25,
3915        display_unit: "",
3916        accessor: |_| None,
3917    },
3918    MetricDef {
3919        // Per-cgroup per-phase timer-latency (WorkType::TimerLatency). PerPhase,
3920        // accessor |_| None: read from PhaseCgroupStats::metrics by name
3921        // (written by write_carrier_scalars). Bare name (not worst_*) — a single
3922        // cgroup's value, not a worst-across-cgroups.
3923        name: "p99_timer_latency_us",
3924        polarity: crate::test_support::Polarity::LowerBetter,
3925        kind: MetricKind::PerPhase,
3926        default_abs: 50.0,
3927        default_rel: 0.25,
3928        display_unit: "\u{00b5}s",
3929        accessor: |_| None,
3930    },
3931    MetricDef {
3932        name: "median_timer_latency_us",
3933        polarity: crate::test_support::Polarity::LowerBetter,
3934        kind: MetricKind::PerPhase,
3935        default_abs: 20.0,
3936        default_rel: 0.25,
3937        display_unit: "\u{00b5}s",
3938        accessor: |_| None,
3939    },
3940    MetricDef {
3941        name: "p999_timer_latency_us",
3942        polarity: crate::test_support::Polarity::LowerBetter,
3943        kind: MetricKind::PerPhase,
3944        default_abs: 100.0,
3945        default_rel: 0.25,
3946        display_unit: "\u{00b5}s",
3947        accessor: |_| None,
3948    },
3949    MetricDef {
3950        name: "mean_run_delay_us",
3951        polarity: crate::test_support::Polarity::LowerBetter,
3952        kind: MetricKind::PerPhase,
3953        default_abs: 50.0,
3954        default_rel: 0.25,
3955        display_unit: "\u{00b5}s",
3956        accessor: |_| None,
3957    },
3958    MetricDef {
3959        name: "max_run_delay_us",
3960        polarity: crate::test_support::Polarity::LowerBetter,
3961        kind: MetricKind::PerPhase,
3962        default_abs: 100.0,
3963        default_rel: 0.50,
3964        display_unit: "\u{00b5}s",
3965        accessor: |_| None,
3966    },
3967    MetricDef {
3968        name: "avg_off_cpu_pct",
3969        polarity: crate::test_support::Polarity::LowerBetter,
3970        kind: MetricKind::PerPhase,
3971        default_abs: 5.0,
3972        default_rel: 0.25,
3973        display_unit: "%",
3974        accessor: |_| None,
3975    },
3976    MetricDef {
3977        name: "min_off_cpu_pct",
3978        polarity: crate::test_support::Polarity::LowerBetter,
3979        kind: MetricKind::PerPhase,
3980        default_abs: 5.0,
3981        default_rel: 0.25,
3982        display_unit: "%",
3983        accessor: |_| None,
3984    },
3985    MetricDef {
3986        name: "max_off_cpu_pct",
3987        polarity: crate::test_support::Polarity::LowerBetter,
3988        kind: MetricKind::PerPhase,
3989        default_abs: 5.0,
3990        default_rel: 0.25,
3991        display_unit: "%",
3992        accessor: |_| None,
3993    },
3994    MetricDef {
3995        name: "off_cpu_spread_pct",
3996        polarity: crate::test_support::Polarity::LowerBetter,
3997        kind: MetricKind::PerPhase,
3998        default_abs: 5.0,
3999        default_rel: 0.25,
4000        display_unit: "%",
4001        accessor: |_| None,
4002    },
4003    MetricDef {
4004        name: "migration_ratio",
4005        polarity: crate::test_support::Polarity::LowerBetter,
4006        kind: MetricKind::PerPhase,
4007        default_abs: 0.05,
4008        default_rel: 0.20,
4009        display_unit: "",
4010        accessor: |_| None,
4011    },
4012    MetricDef {
4013        name: "iterations_per_worker",
4014        polarity: crate::test_support::Polarity::HigherBetter,
4015        kind: MetricKind::PerPhase,
4016        default_abs: 10.0,
4017        default_rel: 0.10,
4018        display_unit: "",
4019        accessor: |_| None,
4020    },
4021    MetricDef {
4022        name: "page_locality",
4023        polarity: crate::test_support::Polarity::HigherBetter,
4024        kind: MetricKind::PerPhase,
4025        default_abs: 0.05,
4026        default_rel: 0.10,
4027        display_unit: "",
4028        accessor: |_| None,
4029    },
4030    MetricDef {
4031        name: "cross_node_migration_ratio",
4032        polarity: crate::test_support::Polarity::LowerBetter,
4033        kind: MetricKind::PerPhase,
4034        default_abs: 0.05,
4035        default_rel: 0.20,
4036        display_unit: "",
4037        accessor: |_| None,
4038    },
4039    // Per-cgroup carrier counter (read via `cgroup_counter` /
4040    // `cgroup_counter_total`), Counter kind, `accessor: |_| None` (no
4041    // GauntletRow field). Mirrors `total_iterations`' HigherBetter polarity.
4042    MetricDef {
4043        name: "total_cpu_time_ns",
4044        polarity: crate::test_support::Polarity::HigherBetter,
4045        kind: MetricKind::Counter,
4046        default_abs: 100.0,
4047        default_rel: 0.10,
4048        display_unit: "ns",
4049        accessor: |_| None,
4050    },
4051];
4052
/// Floor on a run's total iteration count below which the
/// `worst_wake_latency_tail_ratio` metric is kept out of regression math.
///
/// # Why low-N runs are excluded
///
/// The metric is a p99 / median ratio. With fewer than 100 samples the
/// empirical p99 is effectively `samples.max()`: the p99 rank lands on the
/// last element of the sorted set, so with N = 10 every "p99" is simply the
/// maximum. The resulting `max / median` ratio swings by an order of
/// magnitude between runs that differ only in which worker happened to hit
/// a scheduling stall. `compare_rows` would report those swings as
/// regressions or improvements, burying real signal under low-N noise.
///
/// # Why 100
///
/// A percentile estimate starts to carry information once the sample count
/// reaches `1 / (1 - target_p)` — 100 samples for a p99 — because that is
/// the point where the expected number of samples in the 99th-percentile
/// tail reaches one. At exactly N = 100 the estimator still reads the
/// observed maximum (`samples[99]`), and just above 100 the tail is still
/// sparse; only past the floor does the ratio begin to reflect actual tail
/// behavior rather than a single extreme sample.
///
/// # Coarseness of the gate
///
/// The gate reads `total_iterations` — the scenario-wide sum across every
/// cgroup in the run — as a coarse floor, not an exact per-cgroup sample
/// count. That sum OVERESTIMATES the per-cgroup iteration count whenever
/// several cgroups share the load, so a scenario whose total just clears
/// the floor may still contain individual cgroups with fewer than 100
/// iterations and therefore noisy per-cgroup tail ratios. Treat the floor
/// as a minimum-viable filter against the lowest-N degeneracy, not as a
/// guarantee that every cgroup in a passing row has a stable p99.
///
/// # Where the gate is applied
///
/// At the PRODUCER, not in an accessor:
/// `crate::assert::populate_run_distribution_metrics` emits no
/// `worst_wake_latency_tail_ratio` ext key for a run with
/// `total_iterations < WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS`. The absent
/// key is excluded from the cross-RUN mean and is read as `None` by
/// `compare_rows`, whose `(None, None)` arm skips the pair entirely (no
/// verdict, no coverage diff) when the key is absent on both sides.
pub const WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS: u64 = 100;
4094
4095/// Look up a metric definition by name.
4096pub fn metric_def(name: &str) -> Option<&'static MetricDef> {
4097    METRICS.iter().find(|m| m.name == name)
4098}
4099
4100/// Rate-COMPONENT metric names suppressed from compare OUTPUT (scalar findings and the
4101/// noise per-phase spread + coverage rows). These are the internal
4102/// numerator/denominator Counters of the derived rates — `iteration_rate`
4103/// (`total_phase_iterations` / `total_phase_duration_sec`) and the pooled
4104/// `iterations_per_cpu_sec` (`total_iterations_pooled` / `total_cpu_time_sec`) —
4105/// and emitting them alongside their rate is redundant: three rows for one
4106/// user-facing concept.
4107///
4108/// They are suppressed ONLY at the compare-render layer. They REMAIN in the
4109/// persisted sidecar, in `GauntletRow::ext_metrics`, and in
4110/// `PhaseBucket::metrics`, because the cross-RUN re-pool
4111/// ([`group_and_average_by`]) re-derives the rates as `Σnum / Σdenom` from these
4112/// components read out of the rows — stripping them from storage would break
4113/// rate aggregation. The two user-facing rates and the typed `total_iterations`
4114/// are NOT suppressed. (Their `default_abs`/`default_rel` thresholds are inert
4115/// while suppressed — the compare significance gate never reads them — but the
4116/// entries keep their registry slot: `name` is the re-pool component key and
4117/// `kind` drives the fold dispatch.)
4118const RENDER_SUPPRESSED_COMPONENTS: &[&str] = &[
4119    "total_phase_iterations",
4120    "total_phase_duration_sec",
4121    "total_iterations_pooled",
4122    "total_cpu_time_sec",
4123    // taobench whole-run qps / hit_fraction Rate components (the raw op counts +
4124    // wall window). Suppressed so compare shows the four `taobench_*` Rates, not
4125    // the redundant raw counts. Remain in the sidecar / row for the cross-RUN
4126    // Σnum/Σdenom re-pool, like the iterations components above.
4127    TOTAL_TAOBENCH_OPS,
4128    TOTAL_TAOBENCH_FAST_OPS,
4129    TOTAL_TAOBENCH_SLOW_OPS,
4130    TOTAL_TAOBENCH_WALL_SEC,
4131    // taobench command-time hit Rate components (get_cmds + get_hits): suppressed
4132    // so compare shows `taobench_command_hit_rate`, not the raw get counts; remain
4133    // in the row for the cross-RUN Σhits/Σcmds re-pool.
4134    TOTAL_TAOBENCH_GET_CMDS,
4135    TOTAL_TAOBENCH_GET_HITS,
4136    // schbench role-separate run-delay gate-Rate components (raw run_delay_ns +
4137    // pcount per role). Suppressed so compare shows the two
4138    // `schbench_*_run_delay_ns_per_sched` Rates, not the raw Σ pairs. Remain in
4139    // the row for the cross-RUN Σ/Σ re-pool. (total_schbench_loops is NOT here —
4140    // it is a standalone throughput Counter, not a rate component.)
4141    TOTAL_SCHBENCH_MSG_RUN_DELAY_NS,
4142    TOTAL_SCHBENCH_MSG_PCOUNT,
4143    TOTAL_SCHBENCH_WORKER_RUN_DELAY_NS,
4144    TOTAL_SCHBENCH_WORKER_PCOUNT,
4145];
4146
4147/// True when `name` is a Rate component suppressed from compare output (see
4148/// the private `RENDER_SUPPRESSED_COMPONENTS` list).
4149pub fn is_render_suppressed_component(name: &str) -> bool {
4150    RENDER_SUPPRESSED_COMPONENTS.contains(&name)
4151}
4152
/// Infer the regression polarity (`higher_is_worse`) of a metric
/// not present in [`METRICS`].
///
/// Used by [`crate::assert::AssertResult::merge`] when it folds an
/// `ext_metrics` value whose name is not registered. Returning the
/// wrong polarity here surfaces as a silent merge bug: a
/// throughput-shaped metric (`*_iops`, `*_throughput`) folded with
/// `max` keeps the BETTER value across cgroups instead of the
/// worst, masking the cgroup that fell behind. The previous
/// fallback (`unwrap_or(true)` — always max) had this exact bug
/// for any payload-author metric whose name was not pre-registered
/// in the static `METRICS` table.
///
/// # Matching rules
///
/// The inference is name-substring based, in the style of the
/// `Polarity::Unknown` fallback used by `MetricHint`. Matching is
/// ASCII-case-insensitive: `name` is lowercased before the
/// substring tests, so `Read.IOPS` classifies exactly like
/// `read.iops` instead of silently falling through to the default.
/// The token list mirrors the polarity choices in [`METRICS`] for
/// the metrics already registered there:
///
/// - Tokens that signal HigherBetter (returned `false`):
///   `iops`, `throughput`, `bandwidth`, `iterations`, `ops_per_sec`,
///   `locality`, `_score`, `goodput`. The scheduler-test fixture's
///   `total_iterations` and `worst_iterations_per_worker` already
///   carry this polarity in the registry; a payload-author metric
///   like `jobs.0.read.iops` (the key shape produced by flattening
///   fio's JSON output) should fold the same way.
/// - Tokens that signal LowerBetter (returned `true`):
///   `latency`, `delay`, `_gap`, `stall`, `stuck`, `_cv`, `error`,
///   `fail`, `drop`, `spread`, `_us`, `_ms`, `_ns`, `migration_ratio`,
///   `imbalance`, `_depth`, `dsq`. These are the polarity signals
///   from the existing registered LowerBetter entries
///   (`worst_p99_wake_latency_us`, `worst_run_delay_us`,
///   `worst_gap_ms`, `stuck_count`, `worst_wake_latency_cv`,
///   `worst_spread`, `worst_migration_ratio`,
///   `max_imbalance_ratio`). `stall` covers payload-author metrics
///   that surface the sched_ext watchdog stall
///   (`SCX_EXIT_ERROR_STALL`) while `stuck` covers `stuck_count`
///   (CPU's `rq_clock` not advancing) — distinct conditions but
///   both higher-is-worse.
///
/// # Fallback
///
/// When a name matches no token (e.g. `bogo_ops`, `read_kb`,
/// `jobs.0.runtime`), returns `true` (LowerBetter). The fallback
/// is conservative for regression detection: a payload that emits
/// a not-yet-classifiable metric and then folds an unexpectedly
/// high value across cgroups is more useful surfaced than silently
/// kept at the minimum (which would mask the high reading
/// entirely). Authors who need a different default should register
/// a [`MetricDef`] in [`METRICS`] or tag the metric via
/// [`crate::test_support::MetricHint`].
///
/// # Precedence
///
/// Token order matters when names contain both signals (e.g. the
/// hypothetical `low_iops_latency_ms` would match `latency` first
/// and be classified as higher-is-worse). The token lists above
/// are tested by `infer_higher_is_worse_*` in this module's tests.
pub fn infer_higher_is_worse(name: &str) -> bool {
    // Explicit "higher value is the regression" signals (latency,
    // delay, error, etc.). Consulted first so a name carrying both
    // kinds of token (rare; e.g. `*_iops_latency_us`) resolves to
    // the latency interpretation, which matches the semantics of
    // compound counters/timers.
    const HIGHER_IS_WORSE_TOKENS: &[&str] = &[
        "latency",
        "delay",
        "_gap",
        "stall",
        "stuck",
        "_cv",
        "error",
        "fail",
        "drop",
        "spread",
        "_us",
        "_ms",
        "_ns",
        "migration_ratio",
        "imbalance",
        // DSQ depth is "lower is better" — a shallower queue
        // means the scheduler is keeping up. `_depth` and `dsq`
        // are independently meaningful: `_depth` catches names
        // like `max_dsq_depth` / `avg_dsq_depth` whose source is
        // the scheduler's local dispatch queue; `dsq` is the
        // defensive fallback for future DSQ-related metric names
        // that don't carry `_depth` (e.g. `dsq_overflow_count`).
        // Without these tokens, a future refactor that drops a
        // DSQ metric from the METRICS registry would fall through
        // to the conservative `true` default — correct by luck
        // for DSQ depth (higher = worse) but not by reasoning;
        // these tokens make the inference grounded.
        //
        // False-positive caveat: a future metric named
        // `dsq_throughput` / `dsq_iops` / `cache_depth` /
        // `tree_depth` would be classified higher-is-worse
        // here when the truth is the opposite. The fallback
        // path matters only when METRICS doesn't register the
        // name explicitly — register every new dsq-or-depth
        // metric so the token-based inference never runs.
        "_depth",
        "dsq",
    ];
    // "Higher value is the improvement" signals (throughput, iops,
    // etc.). A match here yields `false`, i.e. min is the
    // worst-case fold.
    const HIGHER_IS_BETTER_TOKENS: &[&str] = &[
        "iops",
        "throughput",
        "bandwidth",
        "iterations",
        "ops_per_sec",
        "locality",
        "_score",
        "goodput",
    ];

    // Every token is lowercase ASCII, so lowercasing the candidate
    // once makes both passes case-insensitive without touching the
    // outcome for names that were already lowercase.
    let lowered = name.to_ascii_lowercase();
    let contains_any = |tokens: &[&str]| tokens.iter().any(|&token| lowered.contains(token));

    if contains_any(HIGHER_IS_WORSE_TOKENS) {
        return true;
    }
    // With no "worse" token present, only a "better" token can flip
    // the answer to `false`. Anything else takes the conservative
    // fallback: treat as higher-is-worse so a folded value is the
    // maximum across cgroups. Surfacing a maximum is safer than
    // masking it; payload authors who disagree should register the
    // metric.
    !contains_any(HIGHER_IS_BETTER_TOKENS)
}
4275
4276/// Render the [`METRICS`] registry for `cargo ktstr stats list-metrics`.
4277///
4278/// `json=false` renders a comfy-table with one row per registered
4279/// metric and columns NAME / POLARITY / DEFAULT_ABS / DEFAULT_REL
4280/// / UNIT. `json=true` emits `serde_json::to_string_pretty`
4281/// on the whole [`METRICS`] slice — the `accessor` fn-pointer is
4282/// `#[serde(skip)]` so the array carries only wire-stable fields.
4283///
4284/// Iteration order equals [`METRICS`] declaration order (the
4285/// canonical surface order for sidecar / CI-gate consumers).
4286///
4287/// The return is owned `String` rather than a print-direct helper so
4288/// callers can pin output via `assert_eq!` in tests; the cargo-ktstr
4289/// dispatch arm at `run_stats` writes it to stdout verbatim.
4290pub fn list_metrics(json: bool) -> anyhow::Result<String> {
4291    if json {
4292        return serde_json::to_string_pretty(METRICS)
4293            .map_err(|e| anyhow::anyhow!("serialize METRICS to JSON: {e}"));
4294    }
4295
4296    let mut table = crate::cli::new_table();
4297    table.set_header(vec![
4298        "NAME",
4299        "POLARITY",
4300        "DEFAULT_ABS",
4301        "DEFAULT_REL",
4302        "UNIT",
4303    ]);
4304    for m in METRICS {
4305        table.add_row(vec![
4306            m.name.to_string(),
4307            polarity_label(m.polarity),
4308            format!("{}", m.default_abs),
4309            format!("{}", m.default_rel),
4310            m.display_unit.to_string(),
4311        ]);
4312    }
4313    Ok(format!("{table}\n"))
4314}
4315
4316/// Short human label for a [`Polarity`](crate::test_support::Polarity)
4317/// variant in the list-metrics table.
4318///
4319/// `HigherBetter` → `higher`, `LowerBetter` → `lower`,
4320/// `TargetValue(t)` → `target(t)`, `Unknown` → `unknown`. Match is
4321/// total; adding a new `Polarity` variant without extending this
4322/// rendering surfaces as a compile error.
4323fn polarity_label(p: crate::test_support::Polarity) -> String {
4324    use crate::test_support::Polarity;
4325    match p {
4326        Polarity::HigherBetter => "higher".to_string(),
4327        Polarity::LowerBetter => "lower".to_string(),
4328        Polarity::TargetValue(t) => format!("target({t})"),
4329        Polarity::Unknown => "unknown".to_string(),
4330        Polarity::Informational => "informational".to_string(),
4331    }
4332}